001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 package de.spieleck.app.cngram; 020 021 import java.io.Reader; 022 import java.io.BufferedReader; 023 import java.io.InputStreamReader; 024 import java.io.IOException; 025 import java.io.InputStream; 026 import java.util.List; 027 import java.util.ArrayList; 028 import java.util.Iterator; 029 import java.util.HashSet; 030 import java.util.Set; 031 import java.util.Arrays; 032 import java.util.BitSet; 033 import java.text.DecimalFormat; 034 035 /** 036 * Manage a set of profiles and determine "most similar" ones 037 * to a given profile. Allows access to the complete results of 038 * previous last ranking. Note this uses a competetive ranking 039 * approach, which is memory efficient, time efficient for not 040 * too many languages and provides contextual scoring of ngrams. 041 * 042 * @author frank nestel 043 * @author $Author: nestefan $ 044 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 045 */ 046 public class NGramProfiles 047 { 048 public final static String NOLANGNAME = "--"; 049 050 public final static char END_CHAR = (char) 0; 051 052 public final static DecimalFormat DF = new DecimalFormat("+0.00;-0.00"); 053 054 public final static double LOWSTATSAFETY = 0.5; // was 0.5; 055 056 private List profiles = null; 057 058 private HashSet allNGrams = new HashSet(10000); 059 060 private int firstNGrams; 061 062 private int maxLen = -1; 063 064 private Trie myTrie = null; 065 066 private float[][] vals; 067 068 private int mode; 069 070 public NGramProfiles() 071 throws IOException 072 { 073 this(1); 074 } 075 076 public NGramProfiles(int mode) 077 throws IOException 078 { 079 InputStream ip = getClass().getResourceAsStream("profiles.lst"); 080 BufferedReader br = new BufferedReader(new InputStreamReader(ip)); 081 this.mode = mode; 082 init(br); 083 } 084 085 public NGramProfiles(BufferedReader br) 086 throws IOException 087 { 088 init(br); 089 } 090 091 protected void init(BufferedReader br) 092 throws IOException 093 { 094 profiles = new ArrayList(); 095 firstNGrams = 0; 096 String line; 097 while ( ( line = br.readLine() ) != null ) 098 { 099 if ( line.charAt(0) == '#' ) 100 continue; 101 InputStream is = getClass().getResourceAsStream(line+"."+ 102 NGramProfile.NGRAM_PROFILE_EXTENSION); 103 NGramProfileImpl np = new NGramProfileImpl(line); 104 np.load(is); 105 profiles.add(np); 106 Iterator iter = np.getSorted(); 107 while ( iter.hasNext() ) 108 { 109 NGram ng = (NGram) iter.next(); 110 if ( ng.length() > maxLen ) maxLen = ng.length(); 111 firstNGrams++; 112 allNGrams.add(ng); 113 } 114 } 115 myTrie = null; 116 } 117 118 public void info() 119 { 120 System.err.println("#profiles="+profiles.size()+", firstNGrams="+firstNGrams+", secondNGrams="+allNGrams.size()+", maxLen="+maxLen+", mode="+mode); 121 CosMetric cm = new CosMetric(); 122 Iterator i1 = profiles.iterator(); 123 while ( i1.hasNext() ) 124 { 125 NGramProfile p1 = (NGramProfile) i1.next(); 126 System.out.print("# "+p1.getName()); 127 Iterator i2 = profiles.iterator(); 128 while ( i2.hasNext() ) 129 { 130 NGramProfile p2 = (NGramProfile) i2.next(); 131 System.out.print(" "); 132 System.out.print(DF.format(1.0 - cm.diff(p1,p2))); 133 } 134 System.out.println(); 135 } 136 Ranker r = getRanker(); 137 // 138 check(r, "Das ist cool men!"); 139 check(r, "Les Ordinateurs sont appeles a jouer un role"); 140 check(r, "Sein oder Nichtsein, das ist hier die Frage!"); 141 check(r, "Zu Ihren Aufgaben zählen u. a. die Einführung und Erprobung neuer Technologien, die Erarbeitung von Rationalisierungslösungen für die Fertigung sowie die Erarbeitung und Durchführung von Technologieversuchen bei der Mustererstellung mit Hinblick auf die Serienfertigung. Desweiteren sind Sie für die Betreuung und Durchführung von Entwicklungsprojekten sowie die Erarbeitung und konstruktive Auslegung von Werkzeugen, Vorrichtungen und Automatisierungseinrichtungen zuständig. Sie sind Ansprechpartner für die Zusammenarbeit und Betreuung von externen Lieferanten bei der Konstruktion und der Herstellung von Maschinen und Vorrichtungen. Sie leiten qualitätsverbessernde Maßnahmen im Prozess und in der Fertigung ein und sind für die kostenbewusste und zukunftsweisende Planung des Bereiches verantwortlich. Zudem führen Sie Ihre Mitarbeiter zielgerichtet entsprechend der INA-Führungsleitlinien."); 142 check(r, "Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question!"); 143 check(r, "Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question! Sein oder Nichtsein, das ist hier die Frage! To be or not to be, that is the question!"); 144 check(r, "Marcel Andre Casasola Merkle"); 145 check(r, "question"); 146 check(r, "methodist"); 147 /* 148 */ 149 } 150 151 public static void check(Ranker r, CharSequence seq) 152 { 153 System.out.println("## \""+seq+"\": "); 154 r.reset(); 155 long t1 = System.currentTimeMillis(); 156 r.account(seq); 157 long t2 = System.currentTimeMillis(); 158 RankResult rs = r.getRankResult(); 159 double sum = 0.0; 160 for(int i = 0; i < rs.getLength(); i++) 161 { 162 if ( rs.getScore(i) > 0.0 ) 163 { 164 System.out.println(" "+rs.getName(i) 165 +" "+DF.format(rs.getScore(i)) 166 +" "+(i+1) 167 ); 168 sum += rs.getScore(i); 169 } 170 } 171 System.out.println("#===== took="+(t2-t1)+"ms: "+DF.format(1.0*(t2-t1)/seq.length())+"ms/ch. other="+DF.format(1.0-sum)); 172 } 173 174 public Ranker getRanker() 175 { 176 int[] otherCount = new int[maxLen+1]; 177 if ( myTrie == null ) 178 { 179 synchronized(profiles) 180 { 181 if ( myTrie == null ) 182 { 183 // Create a reverse reference of all strings 184 // which makes it easy to create reverse Trie's 185 String[] ngs = new String[allNGrams.size()]; 186 Iterator it = allNGrams.iterator(); 187 int j = 0; 188 while( it.hasNext() ) 189 { 190 NGram ng = (NGram) it.next(); 191 ngs[j++] = reverse(ng); 192 } 193 Arrays.sort(ngs); 194 // Create Strings in correct order but sorted from reverse end. 195 String[] ng1 = new String[allNGrams.size()]; 196 for(int i = 0; i < ngs.length; i++) 197 { 198 ng1[i] = reverse(ngs[i]); 199 } 200 myTrie = createTrie(ngs, 0, 0, ngs.length); 201 vals = new float[ngs.length][profiles.size()]; 202 int[] lengthes = new int[ngs.length]; 203 for(int k = 0; k < profiles.size(); k++) 204 { 205 NGramProfile ngp = ((NGramProfile)profiles.get(k)); 206 double norm[] = new double[maxLen+1]; 207 int count[] = new int[maxLen+1]; 208 for(int i = 0; i < ngs.length; i++) 209 { 210 NGram ng = ngp.get(ng1[i]); 211 if ( ng != null && ng.getCount() > LOWSTATSAFETY ) 212 { 213 int ngl = ng.length(); 214 lengthes[i] = ngl; // write at least once, read once :-| 215 double raw1 = ng.getCount() - LOWSTATSAFETY; 216 count[ngl]++; 217 norm[ngl] += raw1; 218 vals[i][k] = (float) raw1; 219 } 220 } 221 for(int i = 1; i <= maxLen; i++) 222 { 223 norm[i] *= (1.0+count[i])/count[i]; 224 norm[i] += 1.0; 225 } 226 for(int i = 0; i < ngs.length; i++) 227 { 228 NGram ng = ngp.get(ng1[i]); 229 if ( ng != null && ng.getCount() > 0 ) 230 { 231 int ngl = ng.length(); 232 double trans = vals[i][k] / norm[ngl]; 233 vals[i][k] = (float)trans; 234 } 235 } 236 } 237 // Horizontal additive zero sum + nonlinear weighting 238 for(int i = 0; i < ngs.length; i++) 239 { 240 double sum = 0.0; 241 for(int k = 0; k < profiles.size(); k++) 242 { 243 double h = vals[i][k]; 244 sum += h; 245 } 246 double av = sum / profiles.size(); 247 /** 248 * Assumed minimum amount of score for significance. 249 * XXX Heuristics for the following constant: 250 * Higher means faster and less noise 251 * Lower means better adaption to mixed language text 252 */ 253 double n = modeTrans(av, ng1[i].length()) / av / 100.0 * (-Math.log(av)); 254 for(int k = 0; k < profiles.size(); k++) 255 vals[i][k] = (float) ((vals[i][k] - av) * n); 256 } 257 } 258 } 259 } 260 return new Ranker() 261 { 262 private double score[] = new double[profiles.size()+1]; 263 private double rscore[] = new double[profiles.size()+1]; 264 private boolean flushed = false; 265 266 { 267 reset(); 268 } 269 270 public RankResult getRankResult() 271 { 272 flush(); 273 double pscore[] = new double[profiles.size()]; 274 double sum = 0.0; 275 for(int i = 0; i <= profiles.size(); i++) 276 { 277 sum += rscore[i]; 278 } 279 for(int i = 0; i < profiles.size(); i++) 280 { 281 pscore[i] = rscore[i] / sum; 282 } 283 return new SimpleRankResult(pscore, true); 284 } 285 286 public void reset() 287 { 288 for(int i = 0; i < score.length; i++) 289 { 290 rscore[i] = score[i] = 0.0; 291 } 292 score[score.length-1] = 0.0; 293 rscore[score.length-1] = 0.5; // 0.2 is too low; 294 } 295 296 public void flush() 297 { 298 if ( !flushed ) 299 { 300 flushed = true; 301 double maxValue = -1.0; 302 for(int i = 0; i < score.length; i++) 303 { 304 maxValue = Math.max(maxValue, score[i]); 305 } 306 double limit = maxValue / 2.0; 307 double f = 1.0/(maxValue - limit); 308 for(int i = 0; i < score.length; i++) 309 { 310 double delta = score[i] - limit; 311 if ( delta > 0.0 ) 312 rscore[i] += delta * f; 313 // We do not reset to zero, this makes classification contextual 314 score[i] /= 2.0; 315 } 316 } 317 } 318 319 public void account(CharSequence seq, int pos) 320 { 321 // System.out.println("--"); 322 Trie currentNode = myTrie; 323 int p2 = pos; 324 while(currentNode != null) 325 { 326 char ch; 327 if ( p2 == -1 ) 328 { 329 ch = ' '; 330 } 331 else 332 { 333 ch = Character.toLowerCase(seq.charAt(p2)); 334 if ( isSeparator(ch) ) 335 ch = ' '; 336 } 337 Trie t2 = currentNode.subtree(ch); 338 if ( t2 == null ) 339 break; 340 // System.out.println("- "+(pos-p2)+"|"+ch+"|"+t2+"| t2.split="+t2.split+" t2.id="+t2.id+" ("+p2+")"); 341 if ( t2.id >= 0 ) 342 { 343 flushed = false; 344 // double max = 0.0; 345 for(int i = 0; i < profiles.size(); i++) 346 { 347 // max = Math.max(max, vals[t2.id][i]); 348 score[i] += vals[t2.id][i]; 349 } 350 /* 351 if (p2 >= 0 )System.out.print("<"+seq.subSequence(p2,pos+1)+">:");else System.out.print("< "+seq.subSequence(0,pos+1)+">:"); 352 int llh = pos - p2; 353 System.out.print(" ".subSequence(0,5 - pos + p2)); 354 for(int i = 0; i < profiles.size(); i++) 355 { 356 System.out.print(" "+getProfileName(i) 357 +(vals[t2.id][i] == max ? '*':':') 358 +DF.format(vals[t2.id][i]) 359 ); 360 } 361 System.out.println(); 362 */ 363 } 364 if ( p2-- == -1 ) 365 break; 366 currentNode = t2.center; 367 } 368 char startChar = seq.charAt(pos); 369 boolean startSep = isSeparator(startChar); 370 double max = 0.0; 371 for(int i = 0; i < score.length; i++) 372 { 373 max = Math.max(max, score[i]); 374 } 375 if ( startSep && max > 1.0 ) 376 { 377 /* 378 System.out.println(" - "+DF.format(max) 379 +" "+DF.format(score[score.length-1])+" "+pos 380 +" "+seq.charAt(pos-2)+seq.charAt(pos-1)+seq.charAt(pos) 381 ); 382 */ 383 flush(); 384 } 385 } 386 387 public void account(CharSequence seq) 388 { 389 for(int i = 0; i < seq.length(); i++) 390 account(seq, i); 391 } 392 393 public void account(Reader reader) 394 throws IOException 395 { 396 BufferedReader br; 397 if ( reader instanceof BufferedReader ) 398 br = (BufferedReader) reader; 399 else 400 br = new BufferedReader(reader); 401 String line; 402 while( ( line = br.readLine() ) != null ) 403 { 404 account(line); 405 } 406 } 407 }; 408 } 409 410 public double modeTrans(double x, int l) 411 { 412 double f; 413 switch(mode) 414 { 415 case 1: 416 case 10: 417 if ( l == 1 ) 418 return x; 419 f = 1.0 / (l+1); 420 return Math.pow(x/f, f); 421 case 9: 422 f = 1.0 / (l+1); 423 return Math.pow(x, f) / Math.sqrt(f); 424 case 8: 425 f = 1.0 / (l+1); 426 return Math.pow(x, f) / Math.sqrt(f); 427 case 7: 428 f = 1.0 / (l+1); 429 return Math.pow(x, f) / f; 430 case 6: 431 f = 1.0 / l; 432 return Math.pow(x, f) / Math.sqrt(f); 433 case 5: 434 f = 1.0 / l; 435 return Math.pow(x, f) / f; 436 case 3: 437 f = 1.0 / l; 438 return Math.pow(x, f); 439 case 2: 440 f = 1.0 / l; 441 return Math.pow(x/f, f); 442 case 4: 443 f = 1.0 / l; 444 return Math.pow(x*f, f); 445 } 446 return x; 447 } 448 449 public String getProfileName(int i) 450 { 451 if ( i < 0 || i >= profiles.size() ) 452 return NOLANGNAME; 453 return ((NGramProfile)profiles.get(i)).getName(); 454 } 455 456 public static boolean isSeparator(char ch) 457 { 458 return ( ch <= ' ' 459 || Character.isWhitespace(ch) 460 || Character.isDigit(ch) 461 || ".!?:,;".indexOf(ch) >= 0 462 ); 463 } 464 465 public static String reverse(CharSequence seq) 466 { 467 StringBuilder sb = new StringBuilder(seq.length()); 468 for(int i = 0; i < seq.length(); i++) 469 sb.insert(0, seq.charAt(i)); 470 return sb.toString(); 471 } 472 473 private static void printTrie(String indent, int w, Trie trie, String seq) 474 { 475 if ( --w <= 0 ) 476 return; 477 if ( trie == null ) 478 return; 479 if ( trie.split == END_CHAR ) 480 { 481 System.out.println(indent+"<"+trie.split+">["+seq+"]"); 482 } 483 else 484 { 485 System.out.println(indent+"<"+trie.split+">"); 486 printTrie(indent+" c", w, trie.center,trie.split + seq); 487 printTrie(indent+"l", w, trie.left,seq); 488 printTrie(indent+"r", w, trie.right,seq); 489 } 490 } 491 492 private static Trie createTrie(String[] array, int pos, int start, int end) 493 { 494 if(start >= end) 495 return null; 496 /* 497 if ( start == end ) 498 { 499 // XXX Some special Trie-Node and moving to getters for links 500 // could save memory here! 501 Trie leaf = new Trie(); 502 leaf.split = array[start].charAt(pos); 503 leaf.id = start; 504 return leaf; 505 } 506 */ 507 int mid = (start + end)/2; 508 Trie nt = new Trie(); 509 // System.out.println("=! "+start+" "+mid+"["+array[mid]+"] "+end); 510 nt.split = array[mid].charAt(pos); 511 int goRight = mid; 512 while ( goRight < end && charAt(array[goRight],pos) == nt.split ) 513 goRight++; 514 int goLeft = mid; 515 while ( goLeft > start && charAt(array[goLeft-1],pos) == nt.split ) 516 goLeft--; 517 // Try to move "end" nodes direktly into the id field! 518 int goLeft2 = goLeft; 519 if ( array[goLeft].length() == pos+1 ) 520 { 521 // System.out.println("=# "+goLeft+" <"+nt.split+"> <"+array[goLeft]+">"); 522 nt.id = goLeft; 523 goLeft2++; 524 } 525 // System.out.println("== "+start+" "+goLeft+"["+array[goLeft]+"]|"+(goLeft2 < array.length ?"["+array[goLeft2]+"]":"[]")+goLeft2+" ("+mid+","+nt.split+") "+(goRight < array.length ? "["+array[goRight]+"]":"[]")+goRight+"-"+end); 526 nt.center = createTrie(array, pos+1, goLeft2, goRight); 527 nt.left = createTrie(array, pos, start, goLeft); 528 nt.right = createTrie(array, pos, goRight, end); 529 return nt; 530 } 531 532 public final static char charAt(CharSequence cs, int pos) 533 { 534 if ( pos < cs.length() ) 535 return cs.charAt(pos); 536 return END_CHAR; 537 } 538 539 private final static class Trie 540 { 541 static int count = 0; 542 char split; 543 Trie left; 544 Trie right; 545 Trie center; 546 int id = -1; 547 548 public Trie() 549 { 550 count++; 551 } 552 553 public Trie subtree(char c) 554 { 555 Trie current = this; 556 do 557 { 558 if ( c == current.split ) 559 return current; 560 else if ( c > current.split ) 561 current = current.right; 562 else 563 current = current.left; 564 } 565 while ( current != null ); 566 return null; 567 } 568 } 569 570 public static char sc(char c) 571 { 572 return c == END_CHAR ? '?' : c; 573 } 574 575 /** 576 * Note this class returns a complete match result, for the 577 * sake of thread safety! 578 */ 579 public RankResult rank(NGramMetric metric, NGramProfile profile) 580 { 581 Iterator it = profiles.iterator(); 582 String res = null; 583 double[] scores = new double[profiles.size()]; 584 for(int i = 0; i < profiles.size(); i++) 585 scores[i] = metric.diff(profile, (NGramProfile)(profiles.get(i))); 586 return new SimpleRankResult(scores, false); 587 } 588 589 private class SimpleRankResult 590 implements RankResult 591 { 592 private double scores[]; 593 private NGramProfile[] profs; 594 private double remain; 595 596 public SimpleRankResult(double[] scorex, boolean inverse) 597 { 598 scores = new double[scorex.length]; 599 System.arraycopy(scorex, 0, scores,0, scorex.length); 600 profs = new NGramProfile[scores.length]; 601 remain = 1.0; 602 for(int i = 0; i < scores.length; i++) 603 { 604 NGramProfile prof = ((NGramProfile)profiles.get(i)); 605 double m = scores[i]; 606 remain -= m; 607 int j = i; 608 while ( --j >= 0 && (inverse ^ (m < scores[j])) ) 609 { 610 scores[j+1] = scores[j]; 611 profs[j+1] = profs[j]; 612 } 613 scores[j+1] = m; 614 profs[j+1] = prof; 615 } 616 } 617 618 public NGramProfiles getProfiles() 619 { 620 return NGramProfiles.this; 621 } 622 623 public double getScore(int pos) 624 { 625 if ( pos == getLength() ) 626 return remain; 627 if ( pos < 0 ) 628 pos += getLength(); 629 return scores[pos]; 630 } 631 632 public String getName(int pos) 633 { 634 if ( pos == getLength() ) 635 return NOLANGNAME; 636 else if ( pos < 0 ) 637 pos += getLength(); 638 return profs[pos].getName(); 639 } 640 641 public int getLength() 642 { 643 return profs.length; 644 } 645 } 646 647 public int getProfileCount() 648 { 649 return profiles.size(); 650 } 651 652 public Set getAllNGrams() 653 { 654 // XXX make this read only or is this slowing down too much? 655 return allNGrams; 656 } 657 658 public interface RankResult 659 { 660 public NGramProfiles getProfiles(); 661 public int getLength(); 662 public double getScore(int pos); 663 public String getName(int pos); 664 } 665 666 public interface Ranker 667 { 668 public RankResult getRankResult(); 669 public void reset(); 670 public void flush(); 671 public void account(CharSequence seq, int pos); 672 public void account(CharSequence seq); 673 public void account(Reader reader) 674 throws IOException; 675 } 676 } 677