001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    package de.spieleck.app.cngram;
020    
021    import java.io.Reader;
022    import java.io.BufferedReader;
023    import java.io.InputStreamReader;
024    import java.io.IOException;
025    import java.io.InputStream;
026    import java.util.List;
027    import java.util.ArrayList;
028    import java.util.Iterator;
029    import java.util.HashSet;
030    import java.util.Set;
031    import java.util.Arrays;
032    import java.util.BitSet;
033    import java.text.DecimalFormat;
034    
035    /**
036     * Manage a set of profiles and determine "most similar" ones
037     * to a given profile. Allows access to the complete results of
038     * previous last ranking. Note this uses a competetive ranking
039     * approach, which is memory efficient, time efficient for not
040     * too many languages and provides contextual scoring of ngrams.
041     *
042     * @author frank nestel
043     * @author $Author: nestefan $
044     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
045     */
046    public class NGramProfiles
047    {
048      public final static String NOLANGNAME = "--";
049    
050      public final static char END_CHAR = (char) 0;
051    
052      public final static DecimalFormat DF = new DecimalFormat("+0.00;-0.00");
053    
054      public final static double LOWSTATSAFETY = 0.5; // was 0.5;
055    
056      private List profiles = null;
057    
058      private HashSet allNGrams = new HashSet(10000);
059    
060      private int firstNGrams;
061    
062      private int maxLen = -1;
063    
064      private Trie myTrie = null;
065    
066      private float[][] vals;
067    
068      private int mode;
069    
070      public NGramProfiles()
071        throws IOException
072      {
073        this(1);
074      }
075    
076      public NGramProfiles(int mode)
077        throws IOException
078      {
079        InputStream ip = getClass().getResourceAsStream("profiles.lst");
080        BufferedReader br = new BufferedReader(new InputStreamReader(ip));
081        this.mode = mode;
082        init(br);
083      }
084    
085      public NGramProfiles(BufferedReader br)
086        throws IOException
087      {
088        init(br);
089      }
090    
091      protected void init(BufferedReader br)
092        throws IOException
093      {
094        profiles = new ArrayList();
095        firstNGrams = 0;
096        String line;
097        while ( ( line = br.readLine() ) != null )
098        {
099          if ( line.charAt(0) == '#' )
100            continue;
101          InputStream is = getClass().getResourceAsStream(line+"."+
102              NGramProfile.NGRAM_PROFILE_EXTENSION);
103          NGramProfileImpl np = new NGramProfileImpl(line);
104          np.load(is);
105          profiles.add(np);
106          Iterator iter = np.getSorted();
107          while ( iter.hasNext() )
108          {
109            NGram ng = (NGram) iter.next();
110            if ( ng.length() > maxLen ) maxLen = ng.length();
111            firstNGrams++;
112            allNGrams.add(ng);
113          }
114        }
115        myTrie = null;
116      }
117    
118      public void info()
119      {
120        System.err.println("#profiles="+profiles.size()+", firstNGrams="+firstNGrams+", secondNGrams="+allNGrams.size()+", maxLen="+maxLen+", mode="+mode);
121        CosMetric cm = new CosMetric();
122        Iterator i1 = profiles.iterator();
123        while ( i1.hasNext() )
124        {
125            NGramProfile p1 = (NGramProfile) i1.next();
126            System.out.print("# "+p1.getName());
127            Iterator i2 = profiles.iterator();
128            while ( i2.hasNext() )
129            {
130                NGramProfile p2 = (NGramProfile) i2.next();
131                System.out.print(" ");
132                System.out.print(DF.format(1.0 - cm.diff(p1,p2)));
133            }
134            System.out.println();
135        }
136        Ranker r = getRanker();
137        //
138        check(r, "Das ist cool men!");
139        check(r, "Les Ordinateurs sont appeles a jouer un role");
140        check(r, "Sein oder Nichtsein, das ist hier die Frage!");
141        check(r, "Zu Ihren Aufgaben zählen u. a. die Einführung und Erprobung neuer Technologien, die Erarbeitung von Rationalisierungslösungen für die Fertigung sowie die Erarbeitung und Durchführung von Technologieversuchen bei der Mustererstellung mit Hinblick auf die Serienfertigung. Desweiteren sind Sie für die Betreuung und Durchführung von Entwicklungsprojekten sowie die Erarbeitung und konstruktive Auslegung von Werkzeugen, Vorrichtungen und Automatisierungseinrichtungen zuständig. Sie sind Ansprechpartner für die Zusammenarbeit und Betreuung von externen Lieferanten bei der Konstruktion und der Herstellung von Maschinen und Vorrichtungen. Sie leiten qualitätsverbessernde Maßnahmen im Prozess und in der Fertigung ein und sind für die kostenbewusste und zukunftsweisende Planung des Bereiches verantwortlich. Zudem führen Sie Ihre Mitarbeiter zielgerichtet entsprechend der INA-Führungsleitlinien.");
142        check(r, "Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question!");
143        check(r, "Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question! Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question!");
144        check(r, "Marcel Andre Casasola Merkle");
145        check(r, "question");
146        check(r, "methodist");
147        /*
148        */
149      }
150    
151      public static void check(Ranker r, CharSequence seq)
152      {
153        System.out.println("## \""+seq+"\": ");
154        r.reset();
155        long t1 = System.currentTimeMillis();
156        r.account(seq);
157        long t2 = System.currentTimeMillis();
158        RankResult rs = r.getRankResult();
159        double sum = 0.0;
160        for(int i = 0; i < rs.getLength(); i++)
161        {
162          if ( rs.getScore(i) > 0.0 )
163          {
164            System.out.println("  "+rs.getName(i)
165                  +" "+DF.format(rs.getScore(i))
166                  +" "+(i+1)
167              );
168            sum += rs.getScore(i);
169          }
170        }
171        System.out.println("#===== took="+(t2-t1)+"ms: "+DF.format(1.0*(t2-t1)/seq.length())+"ms/ch. other="+DF.format(1.0-sum));
172      }
173    
174      public Ranker getRanker()
175      {
176        int[] otherCount = new int[maxLen+1];
177        if ( myTrie == null )
178        {
179          synchronized(profiles)
180          {
181            if ( myTrie == null )
182            {
183              // Create a reverse reference of all strings
184              // which makes it easy to create reverse Trie's
185              String[] ngs = new String[allNGrams.size()];
186              Iterator it = allNGrams.iterator();
187              int j = 0;
188              while( it.hasNext() )
189              {
190                NGram ng = (NGram) it.next();
191                ngs[j++] = reverse(ng);
192              }
193              Arrays.sort(ngs);
194              // Create Strings in correct order but sorted from reverse end.
195              String[] ng1 = new String[allNGrams.size()];
196              for(int i = 0; i < ngs.length; i++)
197              {
198                ng1[i] = reverse(ngs[i]);
199              }
200              myTrie = createTrie(ngs, 0, 0, ngs.length);
201              vals = new float[ngs.length][profiles.size()];
202              int[] lengthes = new int[ngs.length];
203              for(int k = 0; k < profiles.size(); k++)
204              {
205                NGramProfile ngp = ((NGramProfile)profiles.get(k));
206                double norm[] = new double[maxLen+1];
207                int count[] = new int[maxLen+1];
208                for(int i = 0; i < ngs.length; i++)
209                {
210                  NGram ng = ngp.get(ng1[i]);
211                  if ( ng != null && ng.getCount() > LOWSTATSAFETY )
212                  {
213                    int ngl = ng.length();
214                    lengthes[i] = ngl; // write at least once, read once :-|
215                    double raw1 = ng.getCount() - LOWSTATSAFETY;
216                    count[ngl]++;
217                    norm[ngl] += raw1;
218                    vals[i][k] = (float) raw1;
219                  }
220                }
221                for(int i = 1; i <= maxLen; i++)
222                {
223                  norm[i] *= (1.0+count[i])/count[i];
224                  norm[i] += 1.0;
225                }
226                for(int i = 0; i < ngs.length; i++)
227                {
228                  NGram ng = ngp.get(ng1[i]);
229                  if ( ng != null && ng.getCount() > 0 )
230                  {
231                    int ngl = ng.length();
232                    double trans = vals[i][k] / norm[ngl];
233                    vals[i][k] = (float)trans;
234                  }
235                }
236              }
237              // Horizontal additive zero sum + nonlinear weighting
238              for(int i = 0; i < ngs.length; i++)
239              {
240                double sum = 0.0;
241                for(int k = 0; k < profiles.size(); k++)
242                {
243                  double h = vals[i][k];
244                  sum += h;
245                }
246                double av = sum / profiles.size();
247                /**
248                 * Assumed minimum amount of score for significance.
249                 * XXX Heuristics for the following constant:
250                 * Higher means faster and less noise
251                 * Lower means better adaption to mixed language text
252                 */
253                double n = modeTrans(av, ng1[i].length()) / av / 100.0 * (-Math.log(av));
254                for(int k = 0; k < profiles.size(); k++)
255                  vals[i][k] = (float) ((vals[i][k] - av) * n);
256              }
257            }
258          }
259        }
260        return new Ranker()
261          {
262              private double score[] = new double[profiles.size()+1];
263              private double rscore[] = new double[profiles.size()+1];
264              private boolean flushed = false;
265    
266              {
267                reset();
268              }
269    
270              public RankResult getRankResult()
271              {
272                flush();
273                double pscore[] = new double[profiles.size()];
274                double sum = 0.0;
275                for(int i = 0; i <= profiles.size(); i++)
276                {
277                  sum += rscore[i];
278                }
279                for(int i = 0; i < profiles.size(); i++)
280                {
281                  pscore[i] = rscore[i] / sum;
282                }
283                return new SimpleRankResult(pscore, true);
284              }
285    
286              public void reset()
287              {
288                for(int i = 0; i < score.length; i++)
289                {
290                  rscore[i] = score[i] = 0.0;
291                }
292                score[score.length-1] =  0.0;
293                rscore[score.length-1] = 0.5; // 0.2 is too low;
294              }
295    
296              public void flush()
297              {
298                if ( !flushed )
299                {
300                  flushed = true;
301                  double maxValue = -1.0;
302                  for(int i = 0; i < score.length; i++)
303                  {
304                    maxValue = Math.max(maxValue, score[i]);
305                  }
306                  double limit = maxValue / 2.0;
307                  double f = 1.0/(maxValue - limit);
308                  for(int i = 0; i < score.length; i++)
309                  {
310                    double delta = score[i] - limit;
311                    if ( delta > 0.0 )
312                      rscore[i] += delta * f;
313                    // We do not reset to zero, this makes classification contextual
314                    score[i] /= 2.0;
315                  }
316                }
317              }
318    
319              public void account(CharSequence seq, int pos)
320              {
321    // System.out.println("--");            
322                Trie currentNode = myTrie;
323                int p2 = pos;
324                while(currentNode != null)
325                {
326                  char ch;
327                  if ( p2 == -1 )
328                  {
329                    ch = ' ';
330                  }
331                  else 
332                  {
333                    ch = Character.toLowerCase(seq.charAt(p2));
334                    if ( isSeparator(ch) )
335                      ch = ' ';
336                  }
337                  Trie t2 = currentNode.subtree(ch);
338                  if ( t2 == null )
339                    break;
340    // System.out.println("- "+(pos-p2)+"|"+ch+"|"+t2+"| t2.split="+t2.split+" t2.id="+t2.id+" ("+p2+")");
341                  if ( t2.id >= 0 )
342                  {
343                    flushed = false;
344    // double max = 0.0;                
345                    for(int i = 0; i < profiles.size(); i++)
346                    {
347    // max = Math.max(max, vals[t2.id][i]);
348                      score[i] += vals[t2.id][i];
349                    }
350    /*
351    if (p2 >= 0 )System.out.print("<"+seq.subSequence(p2,pos+1)+">:");else System.out.print("< "+seq.subSequence(0,pos+1)+">:");
352    int llh = pos - p2;
353    System.out.print("     ".subSequence(0,5 - pos + p2));
354    for(int i = 0; i < profiles.size(); i++)
355    {
356      System.out.print(" "+getProfileName(i)
357        +(vals[t2.id][i] == max ? '*':':')
358        +DF.format(vals[t2.id][i])
359        ); 
360    }
361    System.out.println();                
362    */
363                  }
364                  if ( p2-- == -1 )
365                    break;
366                  currentNode = t2.center;
367                }
368                char startChar = seq.charAt(pos);
369                boolean startSep = isSeparator(startChar);
370                double max = 0.0;
371                for(int i = 0; i < score.length; i++)
372                {
373                  max = Math.max(max, score[i]);
374                }
375                if ( startSep && max > 1.0 )
376                {
377    /*
378    System.out.println(" - "+DF.format(max)
379                        +" "+DF.format(score[score.length-1])+" "+pos
380                        +" "+seq.charAt(pos-2)+seq.charAt(pos-1)+seq.charAt(pos)
381                        ); 
382    */
383                  flush();
384                }
385              }
386    
387              public void account(CharSequence seq)
388              {
389                for(int i = 0; i < seq.length(); i++)
390                  account(seq, i);
391              }
392    
393              public void account(Reader reader)
394                throws IOException
395              {
396                BufferedReader br;
397                if ( reader instanceof BufferedReader )
398                  br = (BufferedReader) reader;
399                else
400                  br = new BufferedReader(reader);
401                String line;
402                while( ( line = br.readLine() ) != null )
403                {
404                  account(line);
405                }
406              }
407          };
408      }
409    
410      public double modeTrans(double x, int l)
411      {
412        double f;
413        switch(mode)
414        {
415          case 1:
416          case 10: 
417              if ( l == 1 )
418                return x;
419              f = 1.0 / (l+1);
420              return Math.pow(x/f, f);
421          case 9: 
422              f = 1.0 / (l+1);
423              return Math.pow(x, f) / Math.sqrt(f);
424          case 8: 
425              f = 1.0 / (l+1);
426              return Math.pow(x, f) / Math.sqrt(f);
427          case 7: 
428              f = 1.0 / (l+1);
429              return Math.pow(x, f) / f;
430          case 6: 
431              f = 1.0 / l;
432              return Math.pow(x, f) / Math.sqrt(f); 
433          case 5: 
434              f = 1.0 / l;
435              return Math.pow(x, f) / f; 
436          case 3: 
437              f = 1.0 / l;
438              return Math.pow(x, f);
439          case 2: 
440              f = 1.0 / l;
441              return Math.pow(x/f, f);
442          case 4: 
443              f = 1.0 / l;
444              return Math.pow(x*f, f);
445        }
446        return x;
447      }
448    
449      public String getProfileName(int i)
450      {
451        if ( i < 0 || i >= profiles.size() )
452          return NOLANGNAME;
453        return ((NGramProfile)profiles.get(i)).getName();
454      }
455    
456      public static boolean isSeparator(char ch)
457      {
458        return ( ch <= ' '
459                  || Character.isWhitespace(ch)
460                  || Character.isDigit(ch)
461                  || ".!?:,;".indexOf(ch) >= 0
462                );
463      }
464    
465      public static String reverse(CharSequence seq)
466      {
467        StringBuilder sb = new StringBuilder(seq.length());
468        for(int i = 0; i < seq.length(); i++)
469          sb.insert(0, seq.charAt(i));
470        return sb.toString();
471      }
472    
473      private static void printTrie(String indent, int w, Trie trie, String seq)
474      {
475        if ( --w <= 0 )
476          return;
477        if ( trie == null )
478          return;
479        if ( trie.split == END_CHAR )
480        {
481          System.out.println(indent+"<"+trie.split+">["+seq+"]");
482        }
483        else
484        {
485          System.out.println(indent+"<"+trie.split+">");
486          printTrie(indent+" c", w, trie.center,trie.split + seq);
487          printTrie(indent+"l", w, trie.left,seq);
488          printTrie(indent+"r", w, trie.right,seq);
489        }
490      }
491    
492      private static Trie createTrie(String[] array, int pos, int start, int end)
493      {
494        if(start >= end)
495          return null;
496        /*
497        if ( start == end )
498        {
499          // XXX Some special Trie-Node and moving to getters for links
500          // could save memory here!
501          Trie leaf = new Trie();
502          leaf.split = array[start].charAt(pos);
503          leaf.id = start;
504          return leaf;
505        }
506        */
507        int mid = (start + end)/2;
508        Trie nt = new Trie();
509    // System.out.println("=! "+start+" "+mid+"["+array[mid]+"] "+end);    
510        nt.split = array[mid].charAt(pos);
511        int goRight = mid;
512        while ( goRight < end && charAt(array[goRight],pos) == nt.split )
513          goRight++;
514        int goLeft = mid;
515        while ( goLeft > start && charAt(array[goLeft-1],pos) == nt.split )
516          goLeft--;
517        // Try to move "end" nodes direktly into the id field!
518        int goLeft2 = goLeft;
519        if ( array[goLeft].length() == pos+1 )
520        {
521    // System.out.println("=# "+goLeft+" <"+nt.split+"> <"+array[goLeft]+">");      
522          nt.id = goLeft;
523          goLeft2++;
524        }
525    // System.out.println("== "+start+" "+goLeft+"["+array[goLeft]+"]|"+(goLeft2 < array.length ?"["+array[goLeft2]+"]":"[]")+goLeft2+" ("+mid+","+nt.split+") "+(goRight < array.length ? "["+array[goRight]+"]":"[]")+goRight+"-"+end);
526        nt.center = createTrie(array, pos+1, goLeft2, goRight);
527        nt.left   = createTrie(array, pos, start, goLeft);
528        nt.right  = createTrie(array, pos, goRight, end);
529        return nt;
530      }
531    
532      public final static char charAt(CharSequence cs, int pos)
533      {
534        if ( pos < cs.length() )
535          return cs.charAt(pos);
536        return END_CHAR;
537      }
538    
539      private final static class Trie
540      {
541        static int count = 0;
542        char split;
543        Trie left;
544        Trie right;
545        Trie center;
546        int id = -1;
547    
548        public Trie()
549        {
550          count++;
551        }
552    
553        public Trie subtree(char c)
554        {
555          Trie current = this;
556          do
557          {
558            if ( c == current.split )
559              return current;
560            else if ( c > current.split )
561              current = current.right;
562            else
563              current = current.left;
564          }
565          while ( current != null );
566          return null;
567        }
568      }
569    
570      public static char sc(char c)
571      {
572        return c == END_CHAR ? '?' : c;
573      }
574    
575      /**
576       * Note this class returns a complete match result, for the
577       * sake of thread safety!
578       */
579      public RankResult rank(NGramMetric metric, NGramProfile profile)
580      {
581        Iterator it = profiles.iterator();
582        String res = null;
583        double[] scores = new double[profiles.size()];
584        for(int i = 0; i < profiles.size(); i++)
585          scores[i] = metric.diff(profile, (NGramProfile)(profiles.get(i)));
586        return new SimpleRankResult(scores, false);
587      }
588    
589      private class SimpleRankResult
590        implements RankResult
591      {
592        private double scores[];
593        private NGramProfile[] profs;
594        private double remain;
595    
596        public SimpleRankResult(double[] scorex, boolean inverse)
597        {
598          scores = new double[scorex.length];
599          System.arraycopy(scorex, 0, scores,0, scorex.length);
600          profs = new NGramProfile[scores.length];
601          remain = 1.0;
602          for(int i = 0; i < scores.length; i++)
603          {
604            NGramProfile prof = ((NGramProfile)profiles.get(i));
605            double m = scores[i];
606            remain -= m;
607            int j = i;
608            while ( --j >= 0 && (inverse ^ (m < scores[j])) )
609            {
610              scores[j+1] = scores[j];
611              profs[j+1] = profs[j];
612            }
613            scores[j+1] = m;
614            profs[j+1] = prof;
615          }
616        }
617    
618        public NGramProfiles getProfiles()
619        {
620            return NGramProfiles.this;
621        }
622    
623        public double getScore(int pos)
624        {
625          if ( pos == getLength() )
626            return remain;
627          if ( pos < 0 )
628            pos += getLength();
629          return scores[pos];
630        }
631    
632        public String getName(int pos)
633        {
634          if ( pos == getLength() )
635            return NOLANGNAME;
636          else if ( pos < 0 )
637            pos += getLength();
638          return profs[pos].getName();
639        }
640    
641        public int getLength()
642        {
643          return profs.length;
644        }
645      }
646    
647      public int getProfileCount()
648      {
649        return profiles.size();
650      }
651    
652      public Set getAllNGrams()
653      {
654          // XXX make this read only or is this slowing down too much?
655          return allNGrams;
656      }
657    
658      public interface RankResult
659      {
660        public NGramProfiles getProfiles();
661        public int getLength();
662        public double getScore(int pos);
663        public String getName(int pos);
664      }
665    
666      public interface Ranker
667      {
668        public RankResult getRankResult();
669        public void reset();
670        public void flush();
671        public void account(CharSequence seq, int pos);
672        public void account(CharSequence seq);
673        public void account(Reader reader)  
674          throws IOException;
675      }
676    }
677