001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019 package de.spieleck.app.cngram;
020
021 import java.io.File;
022 import java.io.FileInputStream;
023 import java.io.FileReader;
024 import java.io.BufferedInputStream;
025 import java.io.BufferedReader;
026 import java.io.Reader;
027 import java.io.FileOutputStream;
028 import java.io.InputStreamReader;
029 import java.io.IOException;
030 import java.io.PrintStream;
031 import java.text.DecimalFormat;
032
033 /**
034 * Command-line interface that runs an n-gram analysis over submitted text;
035 * the results can be used for automatic language identification.
036 *
037 * @author Frank S. Nestel
038 * @author $Author: nestefan $
039 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
040 */
041 public class RunNGram
042 {
043 public static final int CREATE = 1;
044 public static final int SIMILARITY = 2;
045 public static final int SCORE = 3;
046 public static final int LANG = 4;
047 public static final int TEST = 5;
048 public static final int LANG2 = 6;
049 public static final int LANG2B = 7;
050 public static final int CHECK = 8;
051 public static final int PROFILES = 9;
052
053 public final static DecimalFormat DF = new DecimalFormat("0.000");
054 public final static DecimalFormat DFE = new DecimalFormat("0.0E0");
055
056 private static void usage(PrintStream out)
057 {
058 out.println("Usage: RunNGram commandset");
059 out.println(" [-create profilename(out) textfile [encoding]]");
060 out.println(" or [-similarity metricName textfile1 textfile2 [encoding]]");
061 out.println(" or [-score metricName profile-name textfile [encoding]]");
062 out.println(" or [-lang metricName textfile [encoding]]");
063 out.println(" or [-test ]");
064 out.println(" or [-lang2 textfile [encoding]]");
065 out.println(" or [-lang2b textfile [encoding]]");
066 out.println(" or [-check textlistFile]");
067 out.println(" or [-profiles metricName profile1 profile2]");
068 System.exit(42);
069 }
070
071 public static void main(String args[])
072 throws Exception
073 {
074 int command = 0;
075
076
077 if (args.length == 0)
078 usage(System.out);
079
080 for (int i = 0; i < args.length; i++)
081 {
082 String profilename = "";
083 String profilename2 = "";
084 String textfile = "";
085 String filename2 = "";
086 String metricName = null;
087 NGramMetric metric = null;
088 String encoding = "";
089
090 if ("-c".equals(args[i]) || "-create".equals(args[i]) )
091 {
092 command = CREATE;
093 profilename = args[++i];
094 textfile = args[++i];
095 }
096 else if ("-i".equals(args[i]) || "-similarity".equals(args[i]))
097 {
098 command = SIMILARITY;
099 metricName = args[++i];
100 metric = (NGramMetric) Class.forName(metricName).newInstance();
101 textfile = args[++i];
102 filename2 = args[++i];
103 }
104 else if ("-s".equals(args[i]) || args[i].equals("-score"))
105 {
106 command = SCORE;
107 metricName = args[++i];
108 metric = (NGramMetric) Class.forName(metricName).newInstance();
109 profilename = args[++i];
110 textfile = args[++i];
111 }
112 else if ( "-p".equals(args[i]) || "-profiles".equals(args[i]) )
113 {
114 command = PROFILES;
115 metricName = args[++i];
116 metric = (NGramMetric) Class.forName(metricName).newInstance();
117 profilename = args[++i];
118 profilename2 = args[++i];
119 }
120 else if ("-l".equals(args[i]) || "-lang".equals(args[i]) )
121 {
122 command = LANG;
123 metricName = args[++i];
124 metric = (NGramMetric) Class.forName(metricName).newInstance();
125 textfile = args[++i];
126 }
127 else if ("-l2".equals(args[i]) || "-lang2".equals(args[i]) )
128 {
129 command = LANG2;
130 textfile = args[++i];
131 }
132 else if ("-l2b".equals(args[i]) || "-lang2b".equals(args[i]) )
133 {
134 command = LANG2B;
135 textfile = args[++i];
136 }
137 else if ("-x".equals(args[i]) || "-check".equals(args[i]) )
138 {
139 command = CHECK;
140 textfile = args[++i];
141 }
142 else if ( "-t".equals(args[i]) || "-test".equals(args[i]) )
143 {
144 command = TEST;
145 }
146 else
147 {
148 usage(System.err);
149 }
150
151 if ( i+1 < args.length && args[i].charAt(0) != '-' )
152 {
153 encoding = args[++i];
154 }
155 else
156 {
157 encoding = "iso-8859-1";
158 }
159 if ( command == TEST )
160 {
161 NGramProfiles npi = new NGramProfiles();
162 npi.info();
163 }
164 else if ( command == LANG2 || command == LANG2B )
165 {
166 long t1 = System.currentTimeMillis();
167 NGramProfiles nps = new NGramProfiles();
168 NGramProfiles.Ranker ranker = nps.getRanker();
169 ranker.account(createReader(textfile,encoding));
170 NGramProfiles.RankResult res = ranker.getRankResult();
171 long t2 = System.currentTimeMillis();
172 printRankResult("speed", res, t2-t1);
173 if ( command == LANG2B )
174 {
175 t1 = t2;
176 ranker.reset();
177 ranker.account(createReader(textfile,encoding));
178 res = ranker.getRankResult();
179 t2 = System.currentTimeMillis();
180 printRankResult("speed", res, t2-t1);
181 }
182 }
183 else if ( command == CHECK )
184 {
185 NGramProfiles npi = new NGramProfiles();
186 NGramProfiles.Ranker ranker = npi.getRanker();
187 File fi = new File(textfile);
188 BufferedReader br = new BufferedReader(new FileReader(fi));
189 String line;
190 while ( ( line = br.readLine() ) != null )
191 {
192 line = line.trim();
193 if ( line.charAt(0) == '#' )
194 continue;
195 String[] ss = line.split(";");
196 long t1 = System.currentTimeMillis();
197 ranker.reset();
198 ranker.account(createReader(ss[0], ss[1]));
199 long t2 = System.currentTimeMillis();
200 NGramProfiles.RankResult res = ranker.getRankResult();
201 printRankResult(ss[0], res, t2-t1);
202 }
203 }
204 else if ( command == PROFILES )
205 {
206 FileInputStream fis;
207 File f2=new File(profilename);
208 fis = new FileInputStream(f2);
209 NGramProfileImpl comp1 = new NGramProfileImpl(profilename);
210 comp1.load(fis);
211 File f3=new File(profilename2);
212 fis = new FileInputStream(f3);
213 NGramProfileImpl comp2 = new NGramProfileImpl(profilename2);
214 comp2.load(fis);
215 System.out.println("diff("+profilename+":"+profilename2+")=" + DFE.format(metric.diff(comp1, comp2)));
216 }
217 else
218 {
219 long t1 = System.currentTimeMillis();
220 NGramProfileImpl newProf = create(textfile, encoding);
221 long t2 = System.currentTimeMillis();
222
223 switch (command) {
224
225 case CREATE:
226 String fname = profilename+"."+NGramProfile.NGRAM_PROFILE_EXTENSION;
227 File f = new File(fname);
228 FileOutputStream fos = new FileOutputStream(f);
229 newProf.save(fos);
230 System.out.println("new profile '" + fname + "' was created.");
231 break;
232
233 case SIMILARITY:
234 NGramProfile newProf2 = create(filename2, encoding);
235 System.out.println("Difference is "+ DFE.format(metric.diff(newProf, newProf2)));
236 break;
237
238 case SCORE:
239 File f2=new File(profilename+"."+NGramProfile.NGRAM_PROFILE_EXTENSION);
240 FileInputStream fis = new FileInputStream(f2);
241 NGramProfileImpl compare = new NGramProfileImpl(profilename);
242 compare.load(fis);
243 System.out.println("Score ("+profilename+") is " + DFE.format(metric.diff(compare, newProf)));
244
245 break;
246
247 case LANG:
248 NGramProfiles nps = new NGramProfiles();
249 // Set restrict = nps.getAllNGrams();
250 long dt1 = t2 - t1;
251 t1 = System.currentTimeMillis();
252 NGramProfiles.RankResult res = nps.rank(metric, newProf);
253 t2 = System.currentTimeMillis();
254 int ppos = metricName.lastIndexOf(".");
255 printRankResult(metricName.substring(ppos+1)+"("+dt1+")",res,t2-t1);
256 break;
257 }
258 }
259 }
260 }
261
262 public static Reader createReader(String textfile, String encoding)
263 throws IOException
264 {
265 return new InputStreamReader(new BufferedInputStream(new FileInputStream(textfile)),encoding);
266 }
267
268 public static void printRankResult(String msg, NGramProfiles.RankResult res,
269 long dt)
270 {
271 System.out.println(msg
272 +": "+res.getName(0) +":"+DF.format(res.getScore(0))
273 +" "+res.getName(1)+":"+DF.format(res.getScore(1))
274 +" "+res.getName(2)+":"+DF.format(res.getScore(2))
275 +" .. "+res.getName(-1)+":"+DF.format(res.getScore(-1))
276 +" |"+DFE.format(res.getScore(1)/res.getScore(0))
277 +" |"+DFE.format(res.getScore(-1)/res.getScore(0))
278 +" dt="+dt
279 );
280 }
281
282 public static NGramProfileImpl create(String textfile, String encoding)
283 throws IOException
284 {
285 File f = new File(textfile);
286 FileInputStream fis = new FileInputStream(f);
287 NGramProfileImpl prof = NGramProfileImpl.createProfile(textfile,
288 fis, encoding);
289 fis.close();
290 return prof;
291 }
292 }