001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 package de.spieleck.app.cngram; 020 021 import java.util.Iterator; 022 import java.util.Comparator; 023 024 /** 025 * A device to keep a bunch of ngram statistics. 026 * @author frank nestel 027 * @author $Author: nestefan $ 028 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 029 */ 030 public interface NGramProfile 031 { 032 public static final String NGRAM_PROFILE_EXTENSION = "ngp"; 033 034 public static final String NORMALIZATION_STR = "ngram_count"; 035 036 public static final String FINISHREAD_STR = "#END"; 037 038 public static final NGram[] NO_NGRAM = new NGram[0]; 039 040 public final static Comparator CHAR_SEQ_COMPARATOR = new Comparator() 041 { 042 public int compare(Object o1, Object o2) 043 { 044 CharSequence c1 = (CharSequence) o1; 045 CharSequence c2 = (CharSequence) o2; 046 for(int i = 0; i < c1.length() && i < c2.length(); i++) 047 { 048 int d = c1.charAt(i) - c2.charAt(i); 049 if ( d != 0 ) 050 return d; 051 } 052 return c2.length() - c1.length(); 053 } 054 }; 055 056 /** 057 * Return sorted ngrams 058 * 059 * @return sorted ngrams 060 */ 061 public Iterator getSorted(); 062 063 /** 064 * @return Returns the number of ngrams. 065 */ 066 public int getCount(); 067 068 /** 069 * @return Returns the name. 070 */ 071 public String getName(); 072 073 /** 074 * Get the normalization of all NGrams contained. 075 */ 076 public int getNormalization(); 077 078 /** 079 * @return NGram corresponding to seq, null if not found. 080 */ 081 public NGram get(CharSequence seq); 082 083 }