001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    
020    package de.spieleck.app.ngramj.lm;
021    
022    import java.io.*;
023    import java.util.ArrayList;
024    
025    import de.spieleck.app.ngramj.*;
026    
027    /** 
028     * Create a text_cat compatible LM file resource.
029     */
030    public class LMWriter
031        implements LMConstants
032    {
033        public static void main(String[] args)
034            throws IOException
035        {
036            if ( args.length != 2 )
037            {
038                System.err.println("LMWriter: Need exactly 2 arguments.");
039                System.exit(1);
040            }
041            InputStream in = new FileInputStream(args[0]);
042            OutputStream out = new FileOutputStream(args[1]);
043            ArrayList order = ProtoReader.read(in);
044            int limit;
045            if ( order.size() < USEDNGRAMS )
046                limit = ((CountedNGram)order.get(order.size()-1)).getCount();
047            else
048                limit = ((CountedNGram)order.get(USEDNGRAMS-1)).getCount();
049            int k, i = 0;
050            while ( i < order.size() 
051                    && ((CountedNGram)order.get(i)).getCount() >= limit )
052            {
053                CountedNGram gram = (CountedNGram)order.get(i);
054                int cnt = gram.getCount();
055                for (k = 0; k < gram.getSize(); k++)
056                {
057                    byte b = (byte)gram.getByte(k);
058                    if ( b == (byte)' ' )
059                        out.write((byte)'_');
060                    else
061                        out.write(b);
062                }
063                // Whether text_cat will recognize this separator???
064                out.write(' '); 
065                out.write('\t'); 
066                String h = Integer.toString(cnt);
067                for (k = 0; k < h.length(); k++)
068                    out.write((byte)h.charAt(k));
069                out.write((byte)13);
070                out.write((byte)10);
071                i++;
072            }
073            out.flush();
074            out.close();
075        }
076    }