001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019 package de.spieleck.app.cngram;
020
021 import java.io.Reader;
022 import java.io.BufferedReader;
023 import java.io.InputStreamReader;
024 import java.io.IOException;
025 import java.io.InputStream;
026 import java.util.List;
027 import java.util.ArrayList;
028 import java.util.Iterator;
029 import java.util.HashSet;
030 import java.util.Set;
031 import java.util.Arrays;
032 import java.util.BitSet;
033 import java.text.DecimalFormat;
034
035 /**
036 * Manage a set of profiles and determine "most similar" ones
037 * to a given profile. Allows access to the complete results of
038 * previous last ranking. Note this uses a competetive ranking
039 * approach, which is memory efficient, time efficient for not
040 * too many languages and provides contextual scoring of ngrams.
041 *
042 * @author frank nestel
043 * @author $Author: nestefan $
044 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
045 */
046 public class NGramProfiles
047 {
048 public final static String NOLANGNAME = "--";
049
050 public final static char END_CHAR = (char) 0;
051
052 public final static DecimalFormat DF = new DecimalFormat("+0.00;-0.00");
053
054 public final static double LOWSTATSAFETY = 0.5; // was 0.5;
055
056 private List profiles = null;
057
058 private HashSet allNGrams = new HashSet(10000);
059
060 private int firstNGrams;
061
062 private int maxLen = -1;
063
064 private Trie myTrie = null;
065
066 private float[][] vals;
067
068 private int mode;
069
070 public NGramProfiles()
071 throws IOException
072 {
073 this(1);
074 }
075
076 public NGramProfiles(int mode)
077 throws IOException
078 {
079 InputStream ip = getClass().getResourceAsStream("profiles.lst");
080 BufferedReader br = new BufferedReader(new InputStreamReader(ip));
081 this.mode = mode;
082 init(br);
083 }
084
085 public NGramProfiles(BufferedReader br)
086 throws IOException
087 {
088 init(br);
089 }
090
091 protected void init(BufferedReader br)
092 throws IOException
093 {
094 profiles = new ArrayList();
095 firstNGrams = 0;
096 String line;
097 while ( ( line = br.readLine() ) != null )
098 {
099 if ( line.charAt(0) == '#' )
100 continue;
101 InputStream is = getClass().getResourceAsStream(line+"."+
102 NGramProfile.NGRAM_PROFILE_EXTENSION);
103 NGramProfileImpl np = new NGramProfileImpl(line);
104 np.load(is);
105 profiles.add(np);
106 Iterator iter = np.getSorted();
107 while ( iter.hasNext() )
108 {
109 NGram ng = (NGram) iter.next();
110 if ( ng.length() > maxLen ) maxLen = ng.length();
111 firstNGrams++;
112 allNGrams.add(ng);
113 }
114 }
115 myTrie = null;
116 }
117
118 public void info()
119 {
120 System.err.println("#profiles="+profiles.size()+", firstNGrams="+firstNGrams+", secondNGrams="+allNGrams.size()+", maxLen="+maxLen+", mode="+mode);
121 CosMetric cm = new CosMetric();
122 Iterator i1 = profiles.iterator();
123 while ( i1.hasNext() )
124 {
125 NGramProfile p1 = (NGramProfile) i1.next();
126 System.out.print("# "+p1.getName());
127 Iterator i2 = profiles.iterator();
128 while ( i2.hasNext() )
129 {
130 NGramProfile p2 = (NGramProfile) i2.next();
131 System.out.print(" ");
132 System.out.print(DF.format(1.0 - cm.diff(p1,p2)));
133 }
134 System.out.println();
135 }
136 Ranker r = getRanker();
137 //
138 check(r, "Das ist cool men!");
139 check(r, "Les Ordinateurs sont appeles a jouer un role");
140 check(r, "Sein oder Nichtsein, das ist hier die Frage!");
141 check(r, "Zu Ihren Aufgaben zählen u. a. die Einführung und Erprobung neuer Technologien, die Erarbeitung von Rationalisierungslösungen für die Fertigung sowie die Erarbeitung und Durchführung von Technologieversuchen bei der Mustererstellung mit Hinblick auf die Serienfertigung. Desweiteren sind Sie für die Betreuung und Durchführung von Entwicklungsprojekten sowie die Erarbeitung und konstruktive Auslegung von Werkzeugen, Vorrichtungen und Automatisierungseinrichtungen zuständig. Sie sind Ansprechpartner für die Zusammenarbeit und Betreuung von externen Lieferanten bei der Konstruktion und der Herstellung von Maschinen und Vorrichtungen. Sie leiten qualitätsverbessernde Maßnahmen im Prozess und in der Fertigung ein und sind für die kostenbewusste und zukunftsweisende Planung des Bereiches verantwortlich. Zudem führen Sie Ihre Mitarbeiter zielgerichtet entsprechend der INA-Führungsleitlinien.");
142 check(r, "Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question!");
143 check(r, "Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question! Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question!");
144 check(r, "Marcel Andre Casasola Merkle");
145 check(r, "question");
146 check(r, "methodist");
147 /*
148 */
149 }
150
151 public static void check(Ranker r, CharSequence seq)
152 {
153 System.out.println("## \""+seq+"\": ");
154 r.reset();
155 long t1 = System.currentTimeMillis();
156 r.account(seq);
157 long t2 = System.currentTimeMillis();
158 RankResult rs = r.getRankResult();
159 double sum = 0.0;
160 for(int i = 0; i < rs.getLength(); i++)
161 {
162 if ( rs.getScore(i) > 0.0 )
163 {
164 System.out.println(" "+rs.getName(i)
165 +" "+DF.format(rs.getScore(i))
166 +" "+(i+1)
167 );
168 sum += rs.getScore(i);
169 }
170 }
171 System.out.println("#===== took="+(t2-t1)+"ms: "+DF.format(1.0*(t2-t1)/seq.length())+"ms/ch. other="+DF.format(1.0-sum));
172 }
173
174 public Ranker getRanker()
175 {
176 int[] otherCount = new int[maxLen+1];
177 if ( myTrie == null )
178 {
179 synchronized(profiles)
180 {
181 if ( myTrie == null )
182 {
183 // Create a reverse reference of all strings
184 // which makes it easy to create reverse Trie's
185 String[] ngs = new String[allNGrams.size()];
186 Iterator it = allNGrams.iterator();
187 int j = 0;
188 while( it.hasNext() )
189 {
190 NGram ng = (NGram) it.next();
191 ngs[j++] = reverse(ng);
192 }
193 Arrays.sort(ngs);
194 // Create Strings in correct order but sorted from reverse end.
195 String[] ng1 = new String[allNGrams.size()];
196 for(int i = 0; i < ngs.length; i++)
197 {
198 ng1[i] = reverse(ngs[i]);
199 }
200 myTrie = createTrie(ngs, 0, 0, ngs.length);
201 vals = new float[ngs.length][profiles.size()];
202 int[] lengthes = new int[ngs.length];
203 for(int k = 0; k < profiles.size(); k++)
204 {
205 NGramProfile ngp = ((NGramProfile)profiles.get(k));
206 double norm[] = new double[maxLen+1];
207 int count[] = new int[maxLen+1];
208 for(int i = 0; i < ngs.length; i++)
209 {
210 NGram ng = ngp.get(ng1[i]);
211 if ( ng != null && ng.getCount() > LOWSTATSAFETY )
212 {
213 int ngl = ng.length();
214 lengthes[i] = ngl; // write at least once, read once :-|
215 double raw1 = ng.getCount() - LOWSTATSAFETY;
216 count[ngl]++;
217 norm[ngl] += raw1;
218 vals[i][k] = (float) raw1;
219 }
220 }
221 for(int i = 1; i <= maxLen; i++)
222 {
223 norm[i] *= (1.0+count[i])/count[i];
224 norm[i] += 1.0;
225 }
226 for(int i = 0; i < ngs.length; i++)
227 {
228 NGram ng = ngp.get(ng1[i]);
229 if ( ng != null && ng.getCount() > 0 )
230 {
231 int ngl = ng.length();
232 double trans = vals[i][k] / norm[ngl];
233 vals[i][k] = (float)trans;
234 }
235 }
236 }
237 // Horizontal additive zero sum + nonlinear weighting
238 for(int i = 0; i < ngs.length; i++)
239 {
240 double sum = 0.0;
241 for(int k = 0; k < profiles.size(); k++)
242 {
243 double h = vals[i][k];
244 sum += h;
245 }
246 double av = sum / profiles.size();
247 /**
248 * Assumed minimum amount of score for significance.
249 * XXX Heuristics for the following constant:
250 * Higher means faster and less noise
251 * Lower means better adaption to mixed language text
252 */
253 double n = modeTrans(av, ng1[i].length()) / av / 100.0 * (-Math.log(av));
254 for(int k = 0; k < profiles.size(); k++)
255 vals[i][k] = (float) ((vals[i][k] - av) * n);
256 }
257 }
258 }
259 }
260 return new Ranker()
261 {
262 private double score[] = new double[profiles.size()+1];
263 private double rscore[] = new double[profiles.size()+1];
264 private boolean flushed = false;
265
266 {
267 reset();
268 }
269
270 public RankResult getRankResult()
271 {
272 flush();
273 double pscore[] = new double[profiles.size()];
274 double sum = 0.0;
275 for(int i = 0; i <= profiles.size(); i++)
276 {
277 sum += rscore[i];
278 }
279 for(int i = 0; i < profiles.size(); i++)
280 {
281 pscore[i] = rscore[i] / sum;
282 }
283 return new SimpleRankResult(pscore, true);
284 }
285
286 public void reset()
287 {
288 for(int i = 0; i < score.length; i++)
289 {
290 rscore[i] = score[i] = 0.0;
291 }
292 score[score.length-1] = 0.0;
293 rscore[score.length-1] = 0.5; // 0.2 is too low;
294 }
295
296 public void flush()
297 {
298 if ( !flushed )
299 {
300 flushed = true;
301 double maxValue = -1.0;
302 for(int i = 0; i < score.length; i++)
303 {
304 maxValue = Math.max(maxValue, score[i]);
305 }
306 double limit = maxValue / 2.0;
307 double f = 1.0/(maxValue - limit);
308 for(int i = 0; i < score.length; i++)
309 {
310 double delta = score[i] - limit;
311 if ( delta > 0.0 )
312 rscore[i] += delta * f;
313 // We do not reset to zero, this makes classification contextual
314 score[i] /= 2.0;
315 }
316 }
317 }
318
319 public void account(CharSequence seq, int pos)
320 {
321 // System.out.println("--");
322 Trie currentNode = myTrie;
323 int p2 = pos;
324 while(currentNode != null)
325 {
326 char ch;
327 if ( p2 == -1 )
328 {
329 ch = ' ';
330 }
331 else
332 {
333 ch = Character.toLowerCase(seq.charAt(p2));
334 if ( isSeparator(ch) )
335 ch = ' ';
336 }
337 Trie t2 = currentNode.subtree(ch);
338 if ( t2 == null )
339 break;
340 // System.out.println("- "+(pos-p2)+"|"+ch+"|"+t2+"| t2.split="+t2.split+" t2.id="+t2.id+" ("+p2+")");
341 if ( t2.id >= 0 )
342 {
343 flushed = false;
344 // double max = 0.0;
345 for(int i = 0; i < profiles.size(); i++)
346 {
347 // max = Math.max(max, vals[t2.id][i]);
348 score[i] += vals[t2.id][i];
349 }
350 /*
351 if (p2 >= 0 )System.out.print("<"+seq.subSequence(p2,pos+1)+">:");else System.out.print("< "+seq.subSequence(0,pos+1)+">:");
352 int llh = pos - p2;
353 System.out.print(" ".subSequence(0,5 - pos + p2));
354 for(int i = 0; i < profiles.size(); i++)
355 {
356 System.out.print(" "+getProfileName(i)
357 +(vals[t2.id][i] == max ? '*':':')
358 +DF.format(vals[t2.id][i])
359 );
360 }
361 System.out.println();
362 */
363 }
364 if ( p2-- == -1 )
365 break;
366 currentNode = t2.center;
367 }
368 char startChar = seq.charAt(pos);
369 boolean startSep = isSeparator(startChar);
370 double max = 0.0;
371 for(int i = 0; i < score.length; i++)
372 {
373 max = Math.max(max, score[i]);
374 }
375 if ( startSep && max > 1.0 )
376 {
377 /*
378 System.out.println(" - "+DF.format(max)
379 +" "+DF.format(score[score.length-1])+" "+pos
380 +" "+seq.charAt(pos-2)+seq.charAt(pos-1)+seq.charAt(pos)
381 );
382 */
383 flush();
384 }
385 }
386
387 public void account(CharSequence seq)
388 {
389 for(int i = 0; i < seq.length(); i++)
390 account(seq, i);
391 }
392
393 public void account(Reader reader)
394 throws IOException
395 {
396 BufferedReader br;
397 if ( reader instanceof BufferedReader )
398 br = (BufferedReader) reader;
399 else
400 br = new BufferedReader(reader);
401 String line;
402 while( ( line = br.readLine() ) != null )
403 {
404 account(line);
405 }
406 }
407 };
408 }
409
410 public double modeTrans(double x, int l)
411 {
412 double f;
413 switch(mode)
414 {
415 case 1:
416 case 10:
417 if ( l == 1 )
418 return x;
419 f = 1.0 / (l+1);
420 return Math.pow(x/f, f);
421 case 9:
422 f = 1.0 / (l+1);
423 return Math.pow(x, f) / Math.sqrt(f);
424 case 8:
425 f = 1.0 / (l+1);
426 return Math.pow(x, f) / Math.sqrt(f);
427 case 7:
428 f = 1.0 / (l+1);
429 return Math.pow(x, f) / f;
430 case 6:
431 f = 1.0 / l;
432 return Math.pow(x, f) / Math.sqrt(f);
433 case 5:
434 f = 1.0 / l;
435 return Math.pow(x, f) / f;
436 case 3:
437 f = 1.0 / l;
438 return Math.pow(x, f);
439 case 2:
440 f = 1.0 / l;
441 return Math.pow(x/f, f);
442 case 4:
443 f = 1.0 / l;
444 return Math.pow(x*f, f);
445 }
446 return x;
447 }
448
449 public String getProfileName(int i)
450 {
451 if ( i < 0 || i >= profiles.size() )
452 return NOLANGNAME;
453 return ((NGramProfile)profiles.get(i)).getName();
454 }
455
456 public static boolean isSeparator(char ch)
457 {
458 return ( ch <= ' '
459 || Character.isWhitespace(ch)
460 || Character.isDigit(ch)
461 || ".!?:,;".indexOf(ch) >= 0
462 );
463 }
464
465 public static String reverse(CharSequence seq)
466 {
467 StringBuilder sb = new StringBuilder(seq.length());
468 for(int i = 0; i < seq.length(); i++)
469 sb.insert(0, seq.charAt(i));
470 return sb.toString();
471 }
472
473 private static void printTrie(String indent, int w, Trie trie, String seq)
474 {
475 if ( --w <= 0 )
476 return;
477 if ( trie == null )
478 return;
479 if ( trie.split == END_CHAR )
480 {
481 System.out.println(indent+"<"+trie.split+">["+seq+"]");
482 }
483 else
484 {
485 System.out.println(indent+"<"+trie.split+">");
486 printTrie(indent+" c", w, trie.center,trie.split + seq);
487 printTrie(indent+"l", w, trie.left,seq);
488 printTrie(indent+"r", w, trie.right,seq);
489 }
490 }
491
492 private static Trie createTrie(String[] array, int pos, int start, int end)
493 {
494 if(start >= end)
495 return null;
496 /*
497 if ( start == end )
498 {
499 // XXX Some special Trie-Node and moving to getters for links
500 // could save memory here!
501 Trie leaf = new Trie();
502 leaf.split = array[start].charAt(pos);
503 leaf.id = start;
504 return leaf;
505 }
506 */
507 int mid = (start + end)/2;
508 Trie nt = new Trie();
509 // System.out.println("=! "+start+" "+mid+"["+array[mid]+"] "+end);
510 nt.split = array[mid].charAt(pos);
511 int goRight = mid;
512 while ( goRight < end && charAt(array[goRight],pos) == nt.split )
513 goRight++;
514 int goLeft = mid;
515 while ( goLeft > start && charAt(array[goLeft-1],pos) == nt.split )
516 goLeft--;
517 // Try to move "end" nodes direktly into the id field!
518 int goLeft2 = goLeft;
519 if ( array[goLeft].length() == pos+1 )
520 {
521 // System.out.println("=# "+goLeft+" <"+nt.split+"> <"+array[goLeft]+">");
522 nt.id = goLeft;
523 goLeft2++;
524 }
525 // System.out.println("== "+start+" "+goLeft+"["+array[goLeft]+"]|"+(goLeft2 < array.length ?"["+array[goLeft2]+"]":"[]")+goLeft2+" ("+mid+","+nt.split+") "+(goRight < array.length ? "["+array[goRight]+"]":"[]")+goRight+"-"+end);
526 nt.center = createTrie(array, pos+1, goLeft2, goRight);
527 nt.left = createTrie(array, pos, start, goLeft);
528 nt.right = createTrie(array, pos, goRight, end);
529 return nt;
530 }
531
532 public final static char charAt(CharSequence cs, int pos)
533 {
534 if ( pos < cs.length() )
535 return cs.charAt(pos);
536 return END_CHAR;
537 }
538
539 private final static class Trie
540 {
541 static int count = 0;
542 char split;
543 Trie left;
544 Trie right;
545 Trie center;
546 int id = -1;
547
548 public Trie()
549 {
550 count++;
551 }
552
553 public Trie subtree(char c)
554 {
555 Trie current = this;
556 do
557 {
558 if ( c == current.split )
559 return current;
560 else if ( c > current.split )
561 current = current.right;
562 else
563 current = current.left;
564 }
565 while ( current != null );
566 return null;
567 }
568 }
569
570 public static char sc(char c)
571 {
572 return c == END_CHAR ? '?' : c;
573 }
574
575 /**
576 * Note this class returns a complete match result, for the
577 * sake of thread safety!
578 */
579 public RankResult rank(NGramMetric metric, NGramProfile profile)
580 {
581 Iterator it = profiles.iterator();
582 String res = null;
583 double[] scores = new double[profiles.size()];
584 for(int i = 0; i < profiles.size(); i++)
585 scores[i] = metric.diff(profile, (NGramProfile)(profiles.get(i)));
586 return new SimpleRankResult(scores, false);
587 }
588
589 private class SimpleRankResult
590 implements RankResult
591 {
592 private double scores[];
593 private NGramProfile[] profs;
594 private double remain;
595
596 public SimpleRankResult(double[] scorex, boolean inverse)
597 {
598 scores = new double[scorex.length];
599 System.arraycopy(scorex, 0, scores,0, scorex.length);
600 profs = new NGramProfile[scores.length];
601 remain = 1.0;
602 for(int i = 0; i < scores.length; i++)
603 {
604 NGramProfile prof = ((NGramProfile)profiles.get(i));
605 double m = scores[i];
606 remain -= m;
607 int j = i;
608 while ( --j >= 0 && (inverse ^ (m < scores[j])) )
609 {
610 scores[j+1] = scores[j];
611 profs[j+1] = profs[j];
612 }
613 scores[j+1] = m;
614 profs[j+1] = prof;
615 }
616 }
617
618 public NGramProfiles getProfiles()
619 {
620 return NGramProfiles.this;
621 }
622
623 public double getScore(int pos)
624 {
625 if ( pos == getLength() )
626 return remain;
627 if ( pos < 0 )
628 pos += getLength();
629 return scores[pos];
630 }
631
632 public String getName(int pos)
633 {
634 if ( pos == getLength() )
635 return NOLANGNAME;
636 else if ( pos < 0 )
637 pos += getLength();
638 return profs[pos].getName();
639 }
640
641 public int getLength()
642 {
643 return profs.length;
644 }
645 }
646
647 public int getProfileCount()
648 {
649 return profiles.size();
650 }
651
652 public Set getAllNGrams()
653 {
654 // XXX make this read only or is this slowing down too much?
655 return allNGrams;
656 }
657
658 public interface RankResult
659 {
660 public NGramProfiles getProfiles();
661 public int getLength();
662 public double getScore(int pos);
663 public String getName(int pos);
664 }
665
666 public interface Ranker
667 {
668 public RankResult getRankResult();
669 public void reset();
670 public void flush();
671 public void account(CharSequence seq, int pos);
672 public void account(CharSequence seq);
673 public void account(Reader reader)
674 throws IOException;
675 }
676 }
677