001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019
020 package de.spieleck.app.ngramj.lm;
021
022 import java.io.*;
023
024 import java.util.List;
025 import java.util.ArrayList;
026 import java.util.Iterator;
027 import java.util.Enumeration;
028
029 import de.spieleck.app.ngramj.*;
030
031 /**
032 * One class to classify a profile against a set of profiles.
033 *
034 * Note this has a main() method for testing and tuning purposes.
035 */
036 public class CategorizerImpl
037 implements Categorizer, LMConstants
038 {
039 protected List profiles = new ArrayList();
040
041 /**
042 * Construct an uninitialized Categorizer.
043 */
044 public CategorizerImpl()
045 throws IOException
046 {
047 InputStream ip = getClass().getResourceAsStream("profiles.lst");
048 BufferedReader br = new BufferedReader(new InputStreamReader(ip));
049 ArrayList al = new ArrayList();
050 String line;
051 while ( ( line = br.readLine() ) != null )
052 {
053 InputStream is = getClass().getResourceAsStream(line);
054 IterableProfile prof = new LMDataProfile(line, is);
055 addProfile(prof);
056 }
057 }
058
059 /**
060 * Construct an Categorizer from a whole Directory of resources.
061 */
062 public CategorizerImpl(String dirName)
063 throws NGramException, FileNotFoundException
064 {
065 File fi = new File(dirName);
066 if ( ! fi.isDirectory() )
067 throw new NGramException("Base must be a directory.");
068 String[] names = fi.list(LMFilter);
069 init(fi, names);
070 }
071
072 public static FilenameFilter LMFilter = new FilenameFilter()
073 {
074 public boolean accept(File dir, String name)
075 {
076 return name.endsWith(".lm");
077 }
078 };
079
080 /**
081 * Construct an Categorizer from a List of resource file names.
082 */
083 public CategorizerImpl(String[] fileNames)
084 throws NGramException, FileNotFoundException
085 {
086 init(null, fileNames);
087 }
088
089 /**
090 * Fetch the set of file resources.
091 */
092 protected void init(File fi, String[] names)
093 throws NGramException, FileNotFoundException
094 {
095 if ( names == null || names.length == 0 )
096 throw new NGramException("Need at least one NGram input file.");
097 for (int i = 0; i < names.length; i++ )
098 {
099 File ifi = new File(fi, names[i]);
100 InputStream in = new FileInputStream(ifi);
101 IterableProfile prof = new LMDataProfile(names[i], in);
102 addProfile(prof);
103 }
104 System.err.println("Statistics: "
105 +NGramImpl.getNGramImplCount()+" n-grams, "
106 +names.length+" Profiles."
107 +" q="+(NGramImpl.getNGramImplCount()/names.length)
108 );
109 }
110
111 /**
112 * add an Categorization alternative to the profiles.
113 */
114 public void addProfile(IterableProfile prof)
115 {
116 profiles.add(prof);
117 }
118
119 /**
120 * Match a given profile against the Categorizer
121 */
122 public Profile match(Profile prof)
123 {
124 double error = Double.MAX_VALUE;
125 Profile opt = null;
126 Iterator iter = profiles.iterator();
127 while ( iter.hasNext() )
128 {
129 IterableProfile prof2 = (IterableProfile) iter.next();
130 double newError = deltaRank(prof2, prof);
131 if ( newError < error )
132 {
133 error = newError;
134 opt = prof2;
135 }
136 }
137 return opt;
138 }
139
140 /**
141 * Calculate "the distance" between two profiles
142 */
143 public double deltaRank(IterableProfile prof1, Profile prof2)
144 {
145 double delta = 0.0;
146 Iterator grams = prof1.ngrams();
147 int j = 0;
148 while ( grams.hasNext() )
149 {
150 j++;
151 NGram ngram = (NGram) grams.next();
152 double rank = prof2.getRank(ngram);
153 if ( rank != 0.0 )
154 delta += Math.abs(rank - j );
155 else
156 delta += USEDNGRAMS; // XXX ?!
157 }
158 return delta;
159 }
160
161 /**
162 * Sample application, like the text_cat main mode.
163 */
164 public static void main(String[] args)
165 throws Exception
166 {
167 if ( args.length == 1 )
168 {
169 Categorizer cath = new CategorizerImpl();
170 EntryProfile prof = new EntryProfile(args[0], USEDNGRAMS);
171 Profile res = cath.match(prof);
172 System.err.println("Best match is: "+res);
173 }
174 else
175 {
176 Categorizer cath = new CategorizerImpl(args[0]);
177 for (int i = 1; i < args.length; i++ )
178 {
179 EntryProfile prof = new EntryProfile(args[i], USEDNGRAMS);
180 Profile res = cath.match(prof);
181 System.err.println("Best match is: "+res);
182 }
183 }
184 }
185
186 }