001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 020 package de.spieleck.app.ngramj.lm; 021 022 import java.io.*; 023 import java.util.ArrayList; 024 025 import de.spieleck.app.ngramj.*; 026 027 /** 028 * Create a text_cat compatible LM file resource. 029 */ 030 public class LMWriter 031 implements LMConstants 032 { 033 public static void main(String[] args) 034 throws IOException 035 { 036 if ( args.length != 2 ) 037 { 038 System.err.println("LMWriter: Need exactly 2 arguments."); 039 System.exit(1); 040 } 041 InputStream in = new FileInputStream(args[0]); 042 OutputStream out = new FileOutputStream(args[1]); 043 ArrayList order = ProtoReader.read(in); 044 int limit; 045 if ( order.size() < USEDNGRAMS ) 046 limit = ((CountedNGram)order.get(order.size()-1)).getCount(); 047 else 048 limit = ((CountedNGram)order.get(USEDNGRAMS-1)).getCount(); 049 int k, i = 0; 050 while ( i < order.size() 051 && ((CountedNGram)order.get(i)).getCount() >= limit ) 052 { 053 CountedNGram gram = (CountedNGram)order.get(i); 054 int cnt = gram.getCount(); 055 for (k = 0; k < gram.getSize(); k++) 056 { 057 byte b = (byte)gram.getByte(k); 058 if ( b == (byte)' ' ) 059 out.write((byte)'_'); 060 else 061 out.write(b); 062 } 063 // Whether text_cat will recognize this separator??? 064 out.write(' '); 065 out.write('\t'); 066 String h = Integer.toString(cnt); 067 for (k = 0; k < h.length(); k++) 068 out.write((byte)h.charAt(k)); 069 out.write((byte)13); 070 out.write((byte)10); 071 i++; 072 } 073 out.flush(); 074 out.close(); 075 } 076 }