001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 020 package de.spieleck.app.ngramj.lm; 021 022 import java.io.InputStream; 023 import java.io.BufferedInputStream; 024 import java.io.IOException; 025 026 import java.util.List; 027 import java.util.ArrayList; 028 import java.util.Iterator; 029 030 import de.spieleck.app.ngramj.*; 031 032 /** 033 * Profile implementation which reads itself from a text_cat generated 034 * resource. 035 */ 036 public class LMDataProfile 037 implements LMConstants, IterableProfile 038 { 039 040 protected List rankedNGrams; 041 protected String name; 042 043 public LMDataProfile(String name, InputStream stream) 044 { 045 this.name = name; 046 readStream(stream); 047 } 048 049 protected void readStream(InputStream stream) 050 { 051 BufferedInputStream bi = new BufferedInputStream(stream); 052 rankedNGrams = new ArrayList(); 053 int b = 0; 054 byte[] bs = new byte[10]; 055 try 056 { 057 do 058 { 059 int j = 0; 060 middleloop: 061 while ( ( b = bi.read() ) != -1 ) 062 { 063 int i; 064 for (i = 0; i < SKIPABLE.length; i++) 065 if ( b == SKIPABLE[i] ) 066 break middleloop; 067 if ( j < 10 ) 068 bs[j++] = (b != '_') ? (byte) b : (byte)' '; 069 } 070 if ( j > 0 ) 071 { 072 rankedNGrams.add(NGramImpl.newNGram(bs, 0, j-1)); 073 } 074 } 075 while ( b != -1 ); 076 } 077 catch (IOException e) 078 { 079 System.err.println("exception="+e); 080 e.printStackTrace(); 081 }; 082 } 083 084 public double getRank(NGram gram) 085 { 086 //XXX Very inefficient!!! 087 Iterator iter = ngrams(); 088 int i = 0; 089 while ( iter.hasNext() ) 090 { 091 i++; 092 if ( ((NGram)iter.next()).equals(gram) ) 093 return i; 094 } 095 return 0; 096 } 097 098 public int getSize() 099 { 100 return rankedNGrams.size(); 101 } 102 103 public String getName() 104 { 105 return name; 106 } 107 108 public String toString() 109 { 110 // return super.toString()+"["+name+","+rankedGrams.length+"]"; 111 return getName(); 112 } 113 114 public Iterator ngrams() 115 { 116 return rankedNGrams.iterator(); 117 } 118 } 119