001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 package de.spieleck.app.cngram; 020 021 import java.io.File; 022 import java.io.FileInputStream; 023 import java.io.FileReader; 024 import java.io.BufferedInputStream; 025 import java.io.BufferedReader; 026 import java.io.Reader; 027 import java.io.FileOutputStream; 028 import java.io.InputStreamReader; 029 import java.io.IOException; 030 import java.io.PrintStream; 031 import java.text.DecimalFormat; 032 033 /** 034 * Commandline interface that runs a ngram analysis over submitted text, 035 * results can be used for automatic language identification. 036 * 037 * @author Frank S. Nestel 038 * @author $Author: nestefan $ 039 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 040 */ 041 public class RunNGram 042 { 043 public static final int CREATE = 1; 044 public static final int SIMILARITY = 2; 045 public static final int SCORE = 3; 046 public static final int LANG = 4; 047 public static final int TEST = 5; 048 public static final int LANG2 = 6; 049 public static final int LANG2B = 7; 050 public static final int CHECK = 8; 051 public static final int PROFILES = 9; 052 053 public final static DecimalFormat DF = new DecimalFormat("0.000"); 054 public final static DecimalFormat DFE = new DecimalFormat("0.0E0"); 055 056 private static void usage(PrintStream out) 057 { 058 out.println("Usage: RunNGram commandset"); 059 out.println(" [-create profilename(out) textfile [encoding]]"); 060 out.println(" or [-similarity metricName textfile1 textfile2 [encoding]]"); 061 out.println(" or [-score metricName profile-name textfile [encoding]]"); 062 out.println(" or [-lang metricName textfile [encoding]]"); 063 out.println(" or [-test ]"); 064 out.println(" or [-lang2 textfile [encoding]]"); 065 out.println(" or [-lang2b textfile [encoding]]"); 066 out.println(" or [-check textlistFile]"); 067 out.println(" or [-profiles metricName profile1 profile2]"); 068 System.exit(42); 069 } 070 071 public static void main(String args[]) 072 throws Exception 073 { 074 int command = 0; 075 076 077 if (args.length == 0) 078 usage(System.out); 079 080 for (int i = 0; i < args.length; i++) 081 { 082 String profilename = ""; 083 String profilename2 = ""; 084 String textfile = ""; 085 String filename2 = ""; 086 String metricName = null; 087 NGramMetric metric = null; 088 String encoding = ""; 089 090 if ("-c".equals(args[i]) || "-create".equals(args[i]) ) 091 { 092 command = CREATE; 093 profilename = args[++i]; 094 textfile = args[++i]; 095 } 096 else if ("-i".equals(args[i]) || "-similarity".equals(args[i])) 097 { 098 command = SIMILARITY; 099 metricName = args[++i]; 100 metric = (NGramMetric) Class.forName(metricName).newInstance(); 101 textfile = args[++i]; 102 filename2 = args[++i]; 103 } 104 else if ("-s".equals(args[i]) || args[i].equals("-score")) 105 { 106 command = SCORE; 107 metricName = args[++i]; 108 metric = (NGramMetric) Class.forName(metricName).newInstance(); 109 profilename = args[++i]; 110 textfile = args[++i]; 111 } 112 else if ( "-p".equals(args[i]) || "-profiles".equals(args[i]) ) 113 { 114 command = PROFILES; 115 metricName = args[++i]; 116 metric = (NGramMetric) Class.forName(metricName).newInstance(); 117 profilename = args[++i]; 118 profilename2 = args[++i]; 119 } 120 else if ("-l".equals(args[i]) || "-lang".equals(args[i]) ) 121 { 122 command = LANG; 123 metricName = args[++i]; 124 metric = (NGramMetric) Class.forName(metricName).newInstance(); 125 textfile = args[++i]; 126 } 127 else if ("-l2".equals(args[i]) || "-lang2".equals(args[i]) ) 128 { 129 command = LANG2; 130 textfile = args[++i]; 131 } 132 else if ("-l2b".equals(args[i]) || "-lang2b".equals(args[i]) ) 133 { 134 command = LANG2B; 135 textfile = args[++i]; 136 } 137 else if ("-x".equals(args[i]) || "-check".equals(args[i]) ) 138 { 139 command = CHECK; 140 textfile = args[++i]; 141 } 142 else if ( "-t".equals(args[i]) || "-test".equals(args[i]) ) 143 { 144 command = TEST; 145 } 146 else 147 { 148 usage(System.err); 149 } 150 151 if ( i+1 < args.length && args[i].charAt(0) != '-' ) 152 { 153 encoding = args[++i]; 154 } 155 else 156 { 157 encoding = "iso-8859-1"; 158 } 159 if ( command == TEST ) 160 { 161 NGramProfiles npi = new NGramProfiles(); 162 npi.info(); 163 } 164 else if ( command == LANG2 || command == LANG2B ) 165 { 166 long t1 = System.currentTimeMillis(); 167 NGramProfiles nps = new NGramProfiles(); 168 NGramProfiles.Ranker ranker = nps.getRanker(); 169 ranker.account(createReader(textfile,encoding)); 170 NGramProfiles.RankResult res = ranker.getRankResult(); 171 long t2 = System.currentTimeMillis(); 172 printRankResult("speed", res, t2-t1); 173 if ( command == LANG2B ) 174 { 175 t1 = t2; 176 ranker.reset(); 177 ranker.account(createReader(textfile,encoding)); 178 res = ranker.getRankResult(); 179 t2 = System.currentTimeMillis(); 180 printRankResult("speed", res, t2-t1); 181 } 182 } 183 else if ( command == CHECK ) 184 { 185 NGramProfiles npi = new NGramProfiles(); 186 NGramProfiles.Ranker ranker = npi.getRanker(); 187 File fi = new File(textfile); 188 BufferedReader br = new BufferedReader(new FileReader(fi)); 189 String line; 190 while ( ( line = br.readLine() ) != null ) 191 { 192 line = line.trim(); 193 if ( line.charAt(0) == '#' ) 194 continue; 195 String[] ss = line.split(";"); 196 long t1 = System.currentTimeMillis(); 197 ranker.reset(); 198 ranker.account(createReader(ss[0], ss[1])); 199 long t2 = System.currentTimeMillis(); 200 NGramProfiles.RankResult res = ranker.getRankResult(); 201 printRankResult(ss[0], res, t2-t1); 202 } 203 } 204 else if ( command == PROFILES ) 205 { 206 FileInputStream fis; 207 File f2=new File(profilename); 208 fis = new FileInputStream(f2); 209 NGramProfileImpl comp1 = new NGramProfileImpl(profilename); 210 comp1.load(fis); 211 File f3=new File(profilename2); 212 fis = new FileInputStream(f3); 213 NGramProfileImpl comp2 = new NGramProfileImpl(profilename2); 214 comp2.load(fis); 215 System.out.println("diff("+profilename+":"+profilename2+")=" + DFE.format(metric.diff(comp1, comp2))); 216 } 217 else 218 { 219 long t1 = System.currentTimeMillis(); 220 NGramProfileImpl newProf = create(textfile, encoding); 221 long t2 = System.currentTimeMillis(); 222 223 switch (command) { 224 225 case CREATE: 226 String fname = profilename+"."+NGramProfile.NGRAM_PROFILE_EXTENSION; 227 File f = new File(fname); 228 FileOutputStream fos = new FileOutputStream(f); 229 newProf.save(fos); 230 System.out.println("new profile '" + fname + "' was created."); 231 break; 232 233 case SIMILARITY: 234 NGramProfile newProf2 = create(filename2, encoding); 235 System.out.println("Difference is "+ DFE.format(metric.diff(newProf, newProf2))); 236 break; 237 238 case SCORE: 239 File f2=new File(profilename+"."+NGramProfile.NGRAM_PROFILE_EXTENSION); 240 FileInputStream fis = new FileInputStream(f2); 241 NGramProfileImpl compare = new NGramProfileImpl(profilename); 242 compare.load(fis); 243 System.out.println("Score ("+profilename+") is " + DFE.format(metric.diff(compare, newProf))); 244 245 break; 246 247 case LANG: 248 NGramProfiles nps = new NGramProfiles(); 249 // Set restrict = nps.getAllNGrams(); 250 long dt1 = t2 - t1; 251 t1 = System.currentTimeMillis(); 252 NGramProfiles.RankResult res = nps.rank(metric, newProf); 253 t2 = System.currentTimeMillis(); 254 int ppos = metricName.lastIndexOf("."); 255 printRankResult(metricName.substring(ppos+1)+"("+dt1+")",res,t2-t1); 256 break; 257 } 258 } 259 } 260 } 261 262 public static Reader createReader(String textfile, String encoding) 263 throws IOException 264 { 265 return new InputStreamReader(new BufferedInputStream(new FileInputStream(textfile)),encoding); 266 } 267 268 public static void printRankResult(String msg, NGramProfiles.RankResult res, 269 long dt) 270 { 271 System.out.println(msg 272 +": "+res.getName(0) +":"+DF.format(res.getScore(0)) 273 +" "+res.getName(1)+":"+DF.format(res.getScore(1)) 274 +" "+res.getName(2)+":"+DF.format(res.getScore(2)) 275 +" .. "+res.getName(-1)+":"+DF.format(res.getScore(-1)) 276 +" |"+DFE.format(res.getScore(1)/res.getScore(0)) 277 +" |"+DFE.format(res.getScore(-1)/res.getScore(0)) 278 +" dt="+dt 279 ); 280 } 281 282 public static NGramProfileImpl create(String textfile, String encoding) 283 throws IOException 284 { 285 File f = new File(textfile); 286 FileInputStream fis = new FileInputStream(f); 287 NGramProfileImpl prof = NGramProfileImpl.createProfile(textfile, 288 fis, encoding); 289 fis.close(); 290 return prof; 291 } 292 }