001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    package de.spieleck.app.cngram;
020    
021    import java.util.Iterator;
022    import java.util.Comparator;
023    
024    /**
025     * A device to keep a bunch of ngram statistics.
026     * @author frank nestel
027     * @author $Author: nestefan $
028     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
029     */
030    public interface NGramProfile
031    {
032      public static final String NGRAM_PROFILE_EXTENSION = "ngp";
033    
034      public static final String NORMALIZATION_STR = "ngram_count";
035    
036      public static final String FINISHREAD_STR = "#END";
037    
038      public static final NGram[] NO_NGRAM = new NGram[0];
039    
040      public final static Comparator CHAR_SEQ_COMPARATOR = new Comparator()
041          {
042              public int compare(Object o1, Object o2)
043              {
044                  CharSequence c1 = (CharSequence) o1;
045                  CharSequence c2 = (CharSequence) o2;
046                  for(int i = 0; i < c1.length() && i < c2.length(); i++)
047                  {
048                    int d = c1.charAt(i) - c2.charAt(i);
049                    if ( d != 0 )
050                      return d;
051                  }
052                  return c2.length() - c1.length();
053              }
054          };
055    
056      /**
057       * Return sorted ngrams
058       * 
059       * @return sorted ngrams
060       */
061      public Iterator getSorted();
062    
063      /**
064       * @return Returns the number of ngrams.
065       */
066      public int getCount();
067    
068      /**
069       * @return Returns the name.
070       */
071      public String getName();
072    
073      /**
074       * Get the normalization of all NGrams contained.
075       */
076      public int getNormalization();
077    
078      /**
079       * @return NGram corresponding to seq, null if not found.
080       */
081      public NGram get(CharSequence seq);
082    
083    }