001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    package de.spieleck.app.cngram;
020    
021    import java.io.BufferedInputStream;
022    import java.io.BufferedReader;
023    import java.io.IOException;
024    import java.io.InputStream;
025    import java.io.InputStreamReader;
026    import java.io.OutputStream;
027    import java.util.Date;
028    import java.util.Collections;
029    import java.util.HashMap;
030    import java.util.Iterator;
031    import java.util.Arrays;
032    import java.util.Set;
033    
034    /**
035     * Actual implementation of a NGramProfile
036     * 
037     * Methods are provided to build new NGramProfiles profiles.
038     * @author frank nestel
039     * @author $Author: nestefan $
040     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
041     */
042    public class NGramProfileImpl
043      implements NGramProfile
044    {
045      /** separator char */
046      public static final char SEPARATOR = '_';
047    
048      /** default min length of ngram. */
049      public static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
050    
051      /** default max length of ngram */
052      public static final int DEFAULT_MAX_NGRAM_LENGTH = 5;
053    
054      /** Name this Profile. */
055      private String name;
056      
057      /** Internal to provide sorted access. */
058      private volatile NGram[] sorted = null;
059    
060      /** Internal to provide ordered access. */
061      private volatile NGram[] ordered = null;
062    
063      /** Minimum length for an ngram. */
064      private int minNGramLength;
065    
066      /** Maximum length for an ngram. */
067      private int maxNGramLength;  
068    
069      /** Norm factor for this profile. */
070      private int normalization = 0;
071    
072      /** Map to store char sequences and ngrams */
073      private HashMap ngrams = null;
074    
075      /** */
076      private Set restricted = null;
077    
078      /**
079       * Create a new ngram profile with default lengths.
080       * 
081       * @param name Name of profile
082       */
083      public NGramProfileImpl(String name) {
084        this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
085      }
086    
087      /**
088       * Create a new ngram profile
089       * 
090       * @param name Name of profile
091       * @param minlen min length of ngram sequences
092       * @param maxlen max length of ngram sequences
093       */
094      public NGramProfileImpl(String name, int minlen, int maxlen)
095      {
096        ngrams = new HashMap();
097        this.maxNGramLength = maxlen;
098        this.minNGramLength = minlen;
099        setName(name);
100      }
101    
102      public void setRestricted(Set restricted)
103      {
104          this.restricted = restricted;
105      }
106    
107      /**
108       * Analyze a piece of text
109       * 
110       * @param text the text to be analyzed
111       */
112      public void analyze(CharSequence text)
113      {
114        StringBuffer word = new StringBuffer(30).append(SEPARATOR);
115        for (int i = 0; i < text.length(); i++)
116        {
117          char c = Character.toLowerCase(text.charAt(i));
118          if (Character.isLetter(c))
119          {
120            word.append(c);
121          }
122          else
123          {
124            addAnalyze(word);
125            word.setLength(1);
126          }
127        }
128        addAnalyze(word);
129      }
130    
131      private void addAnalyze(StringBuffer word)
132      {
133        if (word.length() > 1)
134        {
135          word.append(SEPARATOR);
136          addNGrams(word);
137        }
138      }
139    
140      public void clear()
141      {
142        if ( ngrams != null )
143        {
144            ngrams.clear();
145        }
146        normalization = 0;
147        ordered = sorted = null;
148      }
149    
150      public int getCount()
151      {
152        return ngrams.size();
153      }
154    
155      public int getNormalization()
156      {
157        return normalization;
158      }
159    
160      /**
161       * Add ngrams from a single word to this profile
162       * 
163       * @param word
164       */
165      public void addNGrams(CharSequence word)
166      {
167        for (int i = minNGramLength; i <= maxNGramLength && i < word.length(); i++)
168        {
169          addNGrams(word, i);
170        }
171      }
172    
173      /**
174       * @param word
175       * @param n sequence length
176       */
177      private void addNGrams(CharSequence word, int n)
178      {
179        for (int i = 0, end = word.length() - n; i <= end; i++)
180        {
181          CharSequence cs = word.subSequence(i, i + n);
182          NGram nge = (NGram) ngrams.get(cs);
183          if ( nge == null )
184          {
185            nge = new NGramImpl(cs);
186            if ( restricted != null && !restricted.contains(nge) )
187              continue;
188            ngrams.put(cs, nge);
189            ordered = null; // A new element invalidates the ordered access
190          }
191          nge.inc();
192          normalization++;
193          sorted = null;
194        }
195      }
196    
197      public Iterator getSorted()
198      {
199        if (sorted == null)
200        {
201          sorted = (NGram[]) ngrams.values().toArray(NO_NGRAM);
202          Arrays.sort(sorted);
203        }
204        return Arrays.asList(sorted).iterator();
205      }
206    
207      public NGram get(CharSequence seq)
208      {
209        if ( ordered == null )
210        {
211          ordered = (NGram[]) ngrams.values().toArray(NO_NGRAM);
212          Arrays.sort(ordered, CHAR_SEQ_COMPARATOR);
213        }
214        int i = Arrays.binarySearch(ordered, seq, CHAR_SEQ_COMPARATOR);
215        if ( i < 0 )
216            return null;
217        return ordered[i];
218      }
219    
220      /**
221       * Return ngramprofile as text
222       * 
223       * @return ngramprofile as text
224       */
225      public String toString()
226      {
227        StringBuffer s = new StringBuffer(2000);
228    
229        Iterator i = getSorted();
230    
231        s.append("NGramProfile: ").append(name).append('\n');
232        while (i.hasNext())
233        {
234          NGram entry = (NGram) i.next();
235          s.append(entry).append(' ').append(entry.getCount()).append('\n');
236        }
237        return s.toString();
238      }
239    
240      /**
241       * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
242       */
243      public void load(InputStream is)
244        throws IOException
245      {
246        BufferedReader bis = new BufferedReader(new InputStreamReader(is,"UTF-8"));
247        String line;
248        ngrams.clear();
249        int storeCount = -1;
250        String eliminators = ""; //XXX ad hoc correction of reference
251        int discards = 0;
252        while ((line = bis.readLine()) != null)
253        {
254          line = line.trim();
255          if (line.length() < 2 )
256            continue;
257          // # starts a comment line
258          // - starts a correction line
259          if (line.charAt(0) == '-') 
260          {
261            eliminators += line.charAt(1);
262          }
263          else if (line.startsWith(FINISHREAD_STR))
264          {
265            break;
266          }
267          else if (line.charAt(0) != '#')
268          {
269            int spacepos = line.indexOf(' ');
270            String ngramsequence = line.substring(0, spacepos).trim().replace('_',' ');
271            if ( " ".equals(ngramsequence) )
272            {
273              // Single spaces are so paar as n-grams (1-grams), that
274              // we throw them away!!
275              continue;
276            }
277            int count = Integer.parseInt(line.substring(spacepos + 1).trim());
278            if (line.startsWith(NORMALIZATION_STR))
279            {
280              storeCount = count;
281            }
282            else if ( ngramsequence.length() >= minNGramLength
283                    && ngramsequence.length() <= maxNGramLength )
284            {
285              // XXX Check for eliminations!
286              int l;
287              for(l = 0; l < eliminators.length(); l++)
288              {
289                if ( ngramsequence.indexOf(eliminators.charAt(l)) >= 0 )
290                  break;
291              }
292              if ( l < eliminators.length() )
293              {
294                discards++;
295    // System.out.println(" "+discards+".DISCARD --> <"+ngramsequence+"> <"+eliminators.charAt(l)+">");
296              }
297              else
298              {
299    // System.out.println("<"+ngramsequence+"> "+" "+((int)ngramsequence.charAt(ngramsequence.length()-1))+" "+count);            
300                NGram en = new NGramImpl(ngramsequence, count);
301                ngrams.put(ngramsequence, en);
302                normalization += count;
303              }
304            }
305          }
306        }
307        if ( storeCount != -1 )
308        {
309          if ( storeCount != normalization )
310              System.err.println(" WARNING "+storeCount+" != "+normalization);
311          //XXX Which one is better :-) normalization = storeCount;
312        }
313    // if ( discards > 0 ) System.err.println("  "+getName()+" has "+discards+" discards.");
314      }
315    
316      /**
317       * Create a new Language profile from (preferably quite large) text file
318       * 
319       * @param name name of profile
320       * @param is
321       * @param encoding encoding of stream
322       */
323      public static NGramProfileImpl createProfile(String name, InputStream is,
324          String encoding)
325        throws IOException
326      {
327        NGramProfileImpl newProfile = new NGramProfileImpl(name);
328        BufferedReader bis = new BufferedReader(new InputStreamReader(is,encoding));
329        String line;
330        while ( ( line = bis.readLine() ) != null )
331        {
332          newProfile.analyze(line);
333        }
334        return newProfile;
335      }
336    
337      /**
338       * Writes NGramProfile content into OutputStream, content is outputted with
339       * UTF-8 encoding
340       * 
341       * @param os Stream to output to
342       * @throws IOException
343       */
344    
345      public void save(OutputStream os) throws IOException
346      {
347        Iterator i = getSorted();
348        os.write(("# NgramProfile generated at " + new Date() + " for Language Identification\n")
349                .getBytes());
350        os.write((NORMALIZATION_STR+" " + normalization + "\n").getBytes());
351        while (i.hasNext()) {
352          NGram e = (NGram) i.next();
353          String line = e + " " + e.getCount() + "\n";
354          os.write(line.getBytes("UTF-8"));
355        }
356        os.flush();
357      }
358    
359      /**
360       * @return Returns the name.
361       */
362      public String getName() {
363        return name;
364      }
365    
366      /**
367       * @param name
368       *          The name to set.
369       */
370      public void setName(String name) {
371        this.name = name;
372      }
373    }