001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 package de.spieleck.app.cngram; 020 021 import java.io.BufferedInputStream; 022 import java.io.BufferedReader; 023 import java.io.IOException; 024 import java.io.InputStream; 025 import java.io.InputStreamReader; 026 import java.io.OutputStream; 027 import java.util.Date; 028 import java.util.Collections; 029 import java.util.HashMap; 030 import java.util.Iterator; 031 import java.util.Arrays; 032 import java.util.Set; 033 034 /** 035 * Actual implementation of a NGramProfile 036 * 037 * Methods are provided to build new NGramProfiles profiles. 038 * @author frank nestel 039 * @author $Author: nestefan $ 040 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 041 */ 042 public class NGramProfileImpl 043 implements NGramProfile 044 { 045 /** separator char */ 046 public static final char SEPARATOR = '_'; 047 048 /** default min length of ngram. */ 049 public static final int DEFAULT_MIN_NGRAM_LENGTH = 1; 050 051 /** default max length of ngram */ 052 public static final int DEFAULT_MAX_NGRAM_LENGTH = 5; 053 054 /** Name this Profile. */ 055 private String name; 056 057 /** Internal to provide sorted access. */ 058 private volatile NGram[] sorted = null; 059 060 /** Internal to provide ordered access. */ 061 private volatile NGram[] ordered = null; 062 063 /** Minimum length for an ngram. */ 064 private int minNGramLength; 065 066 /** Maximum length for an ngram. */ 067 private int maxNGramLength; 068 069 /** Norm factor for this profile. */ 070 private int normalization = 0; 071 072 /** Map to store char sequences and ngrams */ 073 private HashMap ngrams = null; 074 075 /** */ 076 private Set restricted = null; 077 078 /** 079 * Create a new ngram profile with default lengths. 080 * 081 * @param name Name of profile 082 */ 083 public NGramProfileImpl(String name) { 084 this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH); 085 } 086 087 /** 088 * Create a new ngram profile 089 * 090 * @param name Name of profile 091 * @param minlen min length of ngram sequences 092 * @param maxlen max length of ngram sequences 093 */ 094 public NGramProfileImpl(String name, int minlen, int maxlen) 095 { 096 ngrams = new HashMap(); 097 this.maxNGramLength = maxlen; 098 this.minNGramLength = minlen; 099 setName(name); 100 } 101 102 public void setRestricted(Set restricted) 103 { 104 this.restricted = restricted; 105 } 106 107 /** 108 * Analyze a piece of text 109 * 110 * @param text the text to be analyzed 111 */ 112 public void analyze(CharSequence text) 113 { 114 StringBuffer word = new StringBuffer(30).append(SEPARATOR); 115 for (int i = 0; i < text.length(); i++) 116 { 117 char c = Character.toLowerCase(text.charAt(i)); 118 if (Character.isLetter(c)) 119 { 120 word.append(c); 121 } 122 else 123 { 124 addAnalyze(word); 125 word.setLength(1); 126 } 127 } 128 addAnalyze(word); 129 } 130 131 private void addAnalyze(StringBuffer word) 132 { 133 if (word.length() > 1) 134 { 135 word.append(SEPARATOR); 136 addNGrams(word); 137 } 138 } 139 140 public void clear() 141 { 142 if ( ngrams != null ) 143 { 144 ngrams.clear(); 145 } 146 normalization = 0; 147 ordered = sorted = null; 148 } 149 150 public int getCount() 151 { 152 return ngrams.size(); 153 } 154 155 public int getNormalization() 156 { 157 return normalization; 158 } 159 160 /** 161 * Add ngrams from a single word to this profile 162 * 163 * @param word 164 */ 165 public void addNGrams(CharSequence word) 166 { 167 for (int i = minNGramLength; i <= maxNGramLength && i < word.length(); i++) 168 { 169 addNGrams(word, i); 170 } 171 } 172 173 /** 174 * @param word 175 * @param n sequence length 176 */ 177 private void addNGrams(CharSequence word, int n) 178 { 179 for (int i = 0, end = word.length() - n; i <= end; i++) 180 { 181 CharSequence cs = word.subSequence(i, i + n); 182 NGram nge = (NGram) ngrams.get(cs); 183 if ( nge == null ) 184 { 185 nge = new NGramImpl(cs); 186 if ( restricted != null && !restricted.contains(nge) ) 187 continue; 188 ngrams.put(cs, nge); 189 ordered = null; // A new element invalidates the ordered access 190 } 191 nge.inc(); 192 normalization++; 193 sorted = null; 194 } 195 } 196 197 public Iterator getSorted() 198 { 199 if (sorted == null) 200 { 201 sorted = (NGram[]) ngrams.values().toArray(NO_NGRAM); 202 Arrays.sort(sorted); 203 } 204 return Arrays.asList(sorted).iterator(); 205 } 206 207 public NGram get(CharSequence seq) 208 { 209 if ( ordered == null ) 210 { 211 ordered = (NGram[]) ngrams.values().toArray(NO_NGRAM); 212 Arrays.sort(ordered, CHAR_SEQ_COMPARATOR); 213 } 214 int i = Arrays.binarySearch(ordered, seq, CHAR_SEQ_COMPARATOR); 215 if ( i < 0 ) 216 return null; 217 return ordered[i]; 218 } 219 220 /** 221 * Return ngramprofile as text 222 * 223 * @return ngramprofile as text 224 */ 225 public String toString() 226 { 227 StringBuffer s = new StringBuffer(2000); 228 229 Iterator i = getSorted(); 230 231 s.append("NGramProfile: ").append(name).append('\n'); 232 while (i.hasNext()) 233 { 234 NGram entry = (NGram) i.next(); 235 s.append(entry).append(' ').append(entry.getCount()).append('\n'); 236 } 237 return s.toString(); 238 } 239 240 /** 241 * Loads a ngram profile from InputStream (assumes UTF-8 encoded content) 242 */ 243 public void load(InputStream is) 244 throws IOException 245 { 246 BufferedReader bis = new BufferedReader(new InputStreamReader(is,"UTF-8")); 247 String line; 248 ngrams.clear(); 249 int storeCount = -1; 250 String eliminators = ""; //XXX ad hoc correction of reference 251 int discards = 0; 252 while ((line = bis.readLine()) != null) 253 { 254 line = line.trim(); 255 if (line.length() < 2 ) 256 continue; 257 // # starts a comment line 258 // - starts a correction line 259 if (line.charAt(0) == '-') 260 { 261 eliminators += line.charAt(1); 262 } 263 else if (line.startsWith(FINISHREAD_STR)) 264 { 265 break; 266 } 267 else if (line.charAt(0) != '#') 268 { 269 int spacepos = line.indexOf(' '); 270 String ngramsequence = line.substring(0, spacepos).trim().replace('_',' '); 271 if ( " ".equals(ngramsequence) ) 272 { 273 // Single spaces are so paar as n-grams (1-grams), that 274 // we throw them away!! 275 continue; 276 } 277 int count = Integer.parseInt(line.substring(spacepos + 1).trim()); 278 if (line.startsWith(NORMALIZATION_STR)) 279 { 280 storeCount = count; 281 } 282 else if ( ngramsequence.length() >= minNGramLength 283 && ngramsequence.length() <= maxNGramLength ) 284 { 285 // XXX Check for eliminations! 286 int l; 287 for(l = 0; l < eliminators.length(); l++) 288 { 289 if ( ngramsequence.indexOf(eliminators.charAt(l)) >= 0 ) 290 break; 291 } 292 if ( l < eliminators.length() ) 293 { 294 discards++; 295 // System.out.println(" "+discards+".DISCARD --> <"+ngramsequence+"> <"+eliminators.charAt(l)+">"); 296 } 297 else 298 { 299 // System.out.println("<"+ngramsequence+"> "+" "+((int)ngramsequence.charAt(ngramsequence.length()-1))+" "+count); 300 NGram en = new NGramImpl(ngramsequence, count); 301 ngrams.put(ngramsequence, en); 302 normalization += count; 303 } 304 } 305 } 306 } 307 if ( storeCount != -1 ) 308 { 309 if ( storeCount != normalization ) 310 System.err.println(" WARNING "+storeCount+" != "+normalization); 311 //XXX Which one is better :-) normalization = storeCount; 312 } 313 // if ( discards > 0 ) System.err.println(" "+getName()+" has "+discards+" discards."); 314 } 315 316 /** 317 * Create a new Language profile from (preferably quite large) text file 318 * 319 * @param name name of profile 320 * @param is 321 * @param encoding encoding of stream 322 */ 323 public static NGramProfileImpl createProfile(String name, InputStream is, 324 String encoding) 325 throws IOException 326 { 327 NGramProfileImpl newProfile = new NGramProfileImpl(name); 328 BufferedReader bis = new BufferedReader(new InputStreamReader(is,encoding)); 329 String line; 330 while ( ( line = bis.readLine() ) != null ) 331 { 332 newProfile.analyze(line); 333 } 334 return newProfile; 335 } 336 337 /** 338 * Writes NGramProfile content into OutputStream, content is outputted with 339 * UTF-8 encoding 340 * 341 * @param os Stream to output to 342 * @throws IOException 343 */ 344 345 public void save(OutputStream os) throws IOException 346 { 347 Iterator i = getSorted(); 348 os.write(("# NgramProfile generated at " + new Date() + " for Language Identification\n") 349 .getBytes()); 350 os.write((NORMALIZATION_STR+" " + normalization + "\n").getBytes()); 351 while (i.hasNext()) { 352 NGram e = (NGram) i.next(); 353 String line = e + " " + e.getCount() + "\n"; 354 os.write(line.getBytes("UTF-8")); 355 } 356 os.flush(); 357 } 358 359 /** 360 * @return Returns the name. 361 */ 362 public String getName() { 363 return name; 364 } 365 366 /** 367 * @param name 368 * The name to set. 369 */ 370 public void setName(String name) { 371 this.name = name; 372 } 373 }