001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    package de.spieleck.app.cngram;
020    
021    import java.io.File;
022    import java.io.FileInputStream;
023    import java.io.FileReader;
024    import java.io.BufferedInputStream;
025    import java.io.BufferedReader;
026    import java.io.Reader;
027    import java.io.FileOutputStream;
028    import java.io.InputStreamReader;
029    import java.io.IOException;
030    import java.io.PrintStream;
031    import java.text.DecimalFormat;
032    
033    /**
034     * Commandline interface that runs a ngram analysis over submitted text,
035     * results can be used for automatic language identification.
036     *
037     * @author Frank S. Nestel
038     * @author $Author: nestefan $
039     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
040     */
041    public class RunNGram
042    {
043      public static final int CREATE = 1;
044      public static final int SIMILARITY = 2;
045      public static final int SCORE = 3;
046      public static final int LANG = 4;
047      public static final int TEST = 5;
048      public static final int LANG2 = 6;
049      public static final int LANG2B = 7;
050      public static final int CHECK = 8;
051      public static final int PROFILES = 9;
052    
053      public final static DecimalFormat DF = new DecimalFormat("0.000");
054      public final static DecimalFormat DFE = new DecimalFormat("0.0E0");
055    
056      private static void usage(PrintStream out)
057      {
058        out.println("Usage: RunNGram commandset");
059        out.println("          [-create profilename(out) textfile [encoding]]");
060        out.println("   or     [-similarity metricName textfile1 textfile2 [encoding]]");
061        out.println("   or     [-score metricName profile-name textfile [encoding]]");
062        out.println("   or     [-lang metricName textfile [encoding]]");
063        out.println("   or     [-test ]");
064        out.println("   or     [-lang2 textfile [encoding]]");
065        out.println("   or     [-lang2b textfile [encoding]]");
066        out.println("   or     [-check textlistFile]");
067        out.println("   or     [-profiles metricName profile1 profile2]");
068        System.exit(42);
069      }
070    
071      public static void main(String args[])
072        throws Exception
073      {
074        int command = 0;
075    
076    
077        if (args.length == 0)
078            usage(System.out);
079    
080        for (int i = 0; i < args.length; i++)
081        {
082          String profilename = "";
083          String profilename2 = "";
084          String textfile = "";
085          String filename2 = "";
086          String metricName = null;
087          NGramMetric metric = null;
088          String encoding = "";
089    
090          if ("-c".equals(args[i]) || "-create".equals(args[i]) )
091          {
092            command = CREATE;
093            profilename = args[++i];
094            textfile = args[++i];
095          }
096          else if ("-i".equals(args[i]) || "-similarity".equals(args[i])) 
097          { 
098            command = SIMILARITY;
099            metricName = args[++i];
100            metric = (NGramMetric) Class.forName(metricName).newInstance();
101            textfile = args[++i];
102            filename2 = args[++i];
103          }
104          else if ("-s".equals(args[i]) || args[i].equals("-score")) 
105          {
106            command = SCORE;
107            metricName = args[++i];
108            metric = (NGramMetric) Class.forName(metricName).newInstance();
109            profilename = args[++i];
110            textfile = args[++i];
111          }
112          else if ( "-p".equals(args[i]) || "-profiles".equals(args[i]) )
113          {
114              command = PROFILES;
115              metricName = args[++i];
116              metric = (NGramMetric) Class.forName(metricName).newInstance();
117              profilename = args[++i];
118              profilename2 = args[++i];
119          }
120          else if ("-l".equals(args[i]) || "-lang".equals(args[i]) )
121          {
122            command = LANG;
123            metricName = args[++i];
124            metric = (NGramMetric) Class.forName(metricName).newInstance();
125            textfile = args[++i];
126          }
127          else if ("-l2".equals(args[i]) || "-lang2".equals(args[i]) )
128          {
129            command = LANG2;
130            textfile = args[++i];
131          }
132          else if ("-l2b".equals(args[i]) || "-lang2b".equals(args[i]) )
133          {
134            command = LANG2B;
135            textfile = args[++i];
136          }
137          else if ("-x".equals(args[i]) || "-check".equals(args[i]) )
138          {
139            command = CHECK;
140            textfile = args[++i];
141          }
142          else if ( "-t".equals(args[i]) || "-test".equals(args[i]) )
143          {
144            command = TEST;
145          }
146          else
147          {
148              usage(System.err);
149          }
150    
151          if ( i+1 < args.length && args[i].charAt(0) != '-' )
152          {
153            encoding = args[++i];
154          }
155          else
156          {
157            encoding = "iso-8859-1";
158          }
159          if ( command == TEST )
160          {
161              NGramProfiles npi = new NGramProfiles();
162              npi.info();
163          }
164          else if ( command == LANG2 || command == LANG2B )
165          {
166            long t1 = System.currentTimeMillis();
167            NGramProfiles nps = new NGramProfiles();
168            NGramProfiles.Ranker ranker = nps.getRanker();
169            ranker.account(createReader(textfile,encoding));
170            NGramProfiles.RankResult res = ranker.getRankResult();
171            long t2 = System.currentTimeMillis();
172            printRankResult("speed", res, t2-t1);
173            if ( command == LANG2B )
174            {
175              t1 = t2;
176              ranker.reset();
177              ranker.account(createReader(textfile,encoding));
178              res = ranker.getRankResult();
179              t2 = System.currentTimeMillis();
180              printRankResult("speed", res, t2-t1);
181            }
182          }
183          else if ( command == CHECK )
184          {
185            NGramProfiles npi = new NGramProfiles();
186            NGramProfiles.Ranker ranker = npi.getRanker();
187            File fi = new File(textfile);
188            BufferedReader br = new BufferedReader(new FileReader(fi));
189            String line;
190            while ( ( line = br.readLine() ) != null )
191            {
192              line = line.trim();
193              if ( line.charAt(0) == '#' )
194                continue;
195              String[] ss = line.split(";");
196              long t1 = System.currentTimeMillis();
197              ranker.reset();
198              ranker.account(createReader(ss[0], ss[1]));
199              long t2 = System.currentTimeMillis();
200              NGramProfiles.RankResult res = ranker.getRankResult();
201              printRankResult(ss[0], res, t2-t1);
202            }
203          }
204          else if ( command == PROFILES )
205          {
206              FileInputStream fis;
207              File f2=new File(profilename);
208              fis = new FileInputStream(f2);
209              NGramProfileImpl comp1 = new NGramProfileImpl(profilename);
210              comp1.load(fis);
211              File f3=new File(profilename2);
212              fis = new FileInputStream(f3);
213              NGramProfileImpl comp2 = new NGramProfileImpl(profilename2);
214              comp2.load(fis);
215              System.out.println("diff("+profilename+":"+profilename2+")=" + DFE.format(metric.diff(comp1, comp2)));
216          }
217          else
218          {
219            long t1 = System.currentTimeMillis();
220            NGramProfileImpl newProf = create(textfile, encoding);
221            long t2 = System.currentTimeMillis();
222    
223            switch (command) {
224    
225            case CREATE:
226              String fname = profilename+"."+NGramProfile.NGRAM_PROFILE_EXTENSION;
227              File f = new File(fname);
228              FileOutputStream fos = new FileOutputStream(f);
229              newProf.save(fos);
230              System.out.println("new profile '" + fname + "' was created.");
231              break;
232    
233            case SIMILARITY:
234              NGramProfile newProf2 = create(filename2, encoding);
235              System.out.println("Difference is "+ DFE.format(metric.diff(newProf, newProf2)));
236              break;
237    
238            case SCORE:
239              File f2=new File(profilename+"."+NGramProfile.NGRAM_PROFILE_EXTENSION);
240              FileInputStream fis = new FileInputStream(f2);
241              NGramProfileImpl compare = new NGramProfileImpl(profilename);
242              compare.load(fis);
243              System.out.println("Score ("+profilename+") is " + DFE.format(metric.diff(compare, newProf)));
244    
245              break;
246    
247            case LANG:
248              NGramProfiles nps = new NGramProfiles();
249              // Set restrict = nps.getAllNGrams();
250              long dt1 = t2 - t1;
251              t1 = System.currentTimeMillis();
252              NGramProfiles.RankResult res = nps.rank(metric, newProf);
253              t2 = System.currentTimeMillis();
254              int ppos = metricName.lastIndexOf(".");
255              printRankResult(metricName.substring(ppos+1)+"("+dt1+")",res,t2-t1);
256              break;
257            }
258          }
259        }
260      }
261    
262      public static Reader createReader(String textfile, String encoding)
263        throws IOException
264      {
265        return new InputStreamReader(new BufferedInputStream(new FileInputStream(textfile)),encoding);
266      }
267    
268      public static void printRankResult(String msg, NGramProfiles.RankResult res,
269                                            long dt)
270      {
271          System.out.println(msg
272                  +": "+res.getName(0) +":"+DF.format(res.getScore(0))
273                  +" "+res.getName(1)+":"+DF.format(res.getScore(1))
274                  +" "+res.getName(2)+":"+DF.format(res.getScore(2))
275                  +" .. "+res.getName(-1)+":"+DF.format(res.getScore(-1))
276                  +" |"+DFE.format(res.getScore(1)/res.getScore(0))
277                  +" |"+DFE.format(res.getScore(-1)/res.getScore(0))
278                  +" dt="+dt
279                );
280      }
281    
282      public static NGramProfileImpl create(String textfile, String encoding)
283        throws IOException
284      {
285        File f = new File(textfile);
286        FileInputStream fis = new FileInputStream(f);
287        NGramProfileImpl prof = NGramProfileImpl.createProfile(textfile,
288            fis, encoding);
289        fis.close();
290        return prof;
291      }
292    }