001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019 package de.spieleck.app.cngram;
020
021 import java.io.BufferedInputStream;
022 import java.io.BufferedReader;
023 import java.io.IOException;
024 import java.io.InputStream;
025 import java.io.InputStreamReader;
026 import java.io.OutputStream;
027 import java.util.Date;
028 import java.util.Collections;
029 import java.util.HashMap;
030 import java.util.Iterator;
031 import java.util.Arrays;
032 import java.util.Set;
033
034 /**
035 * Actual implementation of a NGramProfile
036 *
037 * Methods are provided to build new NGramProfiles profiles.
038 * @author frank nestel
039 * @author $Author: nestefan $
040 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
041 */
042 public class NGramProfileImpl
043 implements NGramProfile
044 {
045 /** separator char */
046 public static final char SEPARATOR = '_';
047
048 /** default min length of ngram. */
049 public static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
050
051 /** default max length of ngram */
052 public static final int DEFAULT_MAX_NGRAM_LENGTH = 5;
053
054 /** Name this Profile. */
055 private String name;
056
057 /** Internal to provide sorted access. */
058 private volatile NGram[] sorted = null;
059
060 /** Internal to provide ordered access. */
061 private volatile NGram[] ordered = null;
062
063 /** Minimum length for an ngram. */
064 private int minNGramLength;
065
066 /** Maximum length for an ngram. */
067 private int maxNGramLength;
068
069 /** Norm factor for this profile. */
070 private int normalization = 0;
071
072 /** Map to store char sequences and ngrams */
073 private HashMap ngrams = null;
074
075 /** */
076 private Set restricted = null;
077
078 /**
079 * Create a new ngram profile with default lengths.
080 *
081 * @param name Name of profile
082 */
083 public NGramProfileImpl(String name) {
084 this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
085 }
086
087 /**
088 * Create a new ngram profile
089 *
090 * @param name Name of profile
091 * @param minlen min length of ngram sequences
092 * @param maxlen max length of ngram sequences
093 */
094 public NGramProfileImpl(String name, int minlen, int maxlen)
095 {
096 ngrams = new HashMap();
097 this.maxNGramLength = maxlen;
098 this.minNGramLength = minlen;
099 setName(name);
100 }
101
102 public void setRestricted(Set restricted)
103 {
104 this.restricted = restricted;
105 }
106
107 /**
108 * Analyze a piece of text
109 *
110 * @param text the text to be analyzed
111 */
112 public void analyze(CharSequence text)
113 {
114 StringBuffer word = new StringBuffer(30).append(SEPARATOR);
115 for (int i = 0; i < text.length(); i++)
116 {
117 char c = Character.toLowerCase(text.charAt(i));
118 if (Character.isLetter(c))
119 {
120 word.append(c);
121 }
122 else
123 {
124 addAnalyze(word);
125 word.setLength(1);
126 }
127 }
128 addAnalyze(word);
129 }
130
131 private void addAnalyze(StringBuffer word)
132 {
133 if (word.length() > 1)
134 {
135 word.append(SEPARATOR);
136 addNGrams(word);
137 }
138 }
139
140 public void clear()
141 {
142 if ( ngrams != null )
143 {
144 ngrams.clear();
145 }
146 normalization = 0;
147 ordered = sorted = null;
148 }
149
150 public int getCount()
151 {
152 return ngrams.size();
153 }
154
155 public int getNormalization()
156 {
157 return normalization;
158 }
159
160 /**
161 * Add ngrams from a single word to this profile
162 *
163 * @param word
164 */
165 public void addNGrams(CharSequence word)
166 {
167 for (int i = minNGramLength; i <= maxNGramLength && i < word.length(); i++)
168 {
169 addNGrams(word, i);
170 }
171 }
172
173 /**
174 * @param word
175 * @param n sequence length
176 */
177 private void addNGrams(CharSequence word, int n)
178 {
179 for (int i = 0, end = word.length() - n; i <= end; i++)
180 {
181 CharSequence cs = word.subSequence(i, i + n);
182 NGram nge = (NGram) ngrams.get(cs);
183 if ( nge == null )
184 {
185 nge = new NGramImpl(cs);
186 if ( restricted != null && !restricted.contains(nge) )
187 continue;
188 ngrams.put(cs, nge);
189 ordered = null; // A new element invalidates the ordered access
190 }
191 nge.inc();
192 normalization++;
193 sorted = null;
194 }
195 }
196
197 public Iterator getSorted()
198 {
199 if (sorted == null)
200 {
201 sorted = (NGram[]) ngrams.values().toArray(NO_NGRAM);
202 Arrays.sort(sorted);
203 }
204 return Arrays.asList(sorted).iterator();
205 }
206
207 public NGram get(CharSequence seq)
208 {
209 if ( ordered == null )
210 {
211 ordered = (NGram[]) ngrams.values().toArray(NO_NGRAM);
212 Arrays.sort(ordered, CHAR_SEQ_COMPARATOR);
213 }
214 int i = Arrays.binarySearch(ordered, seq, CHAR_SEQ_COMPARATOR);
215 if ( i < 0 )
216 return null;
217 return ordered[i];
218 }
219
220 /**
221 * Return ngramprofile as text
222 *
223 * @return ngramprofile as text
224 */
225 public String toString()
226 {
227 StringBuffer s = new StringBuffer(2000);
228
229 Iterator i = getSorted();
230
231 s.append("NGramProfile: ").append(name).append('\n');
232 while (i.hasNext())
233 {
234 NGram entry = (NGram) i.next();
235 s.append(entry).append(' ').append(entry.getCount()).append('\n');
236 }
237 return s.toString();
238 }
239
240 /**
241 * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
242 */
243 public void load(InputStream is)
244 throws IOException
245 {
246 BufferedReader bis = new BufferedReader(new InputStreamReader(is,"UTF-8"));
247 String line;
248 ngrams.clear();
249 int storeCount = -1;
250 String eliminators = ""; //XXX ad hoc correction of reference
251 int discards = 0;
252 while ((line = bis.readLine()) != null)
253 {
254 line = line.trim();
255 if (line.length() < 2 )
256 continue;
257 // # starts a comment line
258 // - starts a correction line
259 if (line.charAt(0) == '-')
260 {
261 eliminators += line.charAt(1);
262 }
263 else if (line.startsWith(FINISHREAD_STR))
264 {
265 break;
266 }
267 else if (line.charAt(0) != '#')
268 {
269 int spacepos = line.indexOf(' ');
270 String ngramsequence = line.substring(0, spacepos).trim().replace('_',' ');
271 if ( " ".equals(ngramsequence) )
272 {
273 // Single spaces are so paar as n-grams (1-grams), that
274 // we throw them away!!
275 continue;
276 }
277 int count = Integer.parseInt(line.substring(spacepos + 1).trim());
278 if (line.startsWith(NORMALIZATION_STR))
279 {
280 storeCount = count;
281 }
282 else if ( ngramsequence.length() >= minNGramLength
283 && ngramsequence.length() <= maxNGramLength )
284 {
285 // XXX Check for eliminations!
286 int l;
287 for(l = 0; l < eliminators.length(); l++)
288 {
289 if ( ngramsequence.indexOf(eliminators.charAt(l)) >= 0 )
290 break;
291 }
292 if ( l < eliminators.length() )
293 {
294 discards++;
295 // System.out.println(" "+discards+".DISCARD --> <"+ngramsequence+"> <"+eliminators.charAt(l)+">");
296 }
297 else
298 {
299 // System.out.println("<"+ngramsequence+"> "+" "+((int)ngramsequence.charAt(ngramsequence.length()-1))+" "+count);
300 NGram en = new NGramImpl(ngramsequence, count);
301 ngrams.put(ngramsequence, en);
302 normalization += count;
303 }
304 }
305 }
306 }
307 if ( storeCount != -1 )
308 {
309 if ( storeCount != normalization )
310 System.err.println(" WARNING "+storeCount+" != "+normalization);
311 //XXX Which one is better :-) normalization = storeCount;
312 }
313 // if ( discards > 0 ) System.err.println(" "+getName()+" has "+discards+" discards.");
314 }
315
316 /**
317 * Create a new Language profile from (preferably quite large) text file
318 *
319 * @param name name of profile
320 * @param is
321 * @param encoding encoding of stream
322 */
323 public static NGramProfileImpl createProfile(String name, InputStream is,
324 String encoding)
325 throws IOException
326 {
327 NGramProfileImpl newProfile = new NGramProfileImpl(name);
328 BufferedReader bis = new BufferedReader(new InputStreamReader(is,encoding));
329 String line;
330 while ( ( line = bis.readLine() ) != null )
331 {
332 newProfile.analyze(line);
333 }
334 return newProfile;
335 }
336
337 /**
338 * Writes NGramProfile content into OutputStream, content is outputted with
339 * UTF-8 encoding
340 *
341 * @param os Stream to output to
342 * @throws IOException
343 */
344
345 public void save(OutputStream os) throws IOException
346 {
347 Iterator i = getSorted();
348 os.write(("# NgramProfile generated at " + new Date() + " for Language Identification\n")
349 .getBytes());
350 os.write((NORMALIZATION_STR+" " + normalization + "\n").getBytes());
351 while (i.hasNext()) {
352 NGram e = (NGram) i.next();
353 String line = e + " " + e.getCount() + "\n";
354 os.write(line.getBytes("UTF-8"));
355 }
356 os.flush();
357 }
358
359 /**
360 * @return Returns the name.
361 */
362 public String getName() {
363 return name;
364 }
365
366 /**
367 * @param name
368 * The name to set.
369 */
370 public void setName(String name) {
371 this.name = name;
372 }
373 }