001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019
020 package de.spieleck.app.ngramj.lm;
021
022 import java.io.InputStream;
023 import java.io.BufferedInputStream;
024 import java.io.IOException;
025
026 import java.util.List;
027 import java.util.ArrayList;
028 import java.util.Iterator;
029
030 import de.spieleck.app.ngramj.*;
031
032 /**
033 * Profile implementation which reads itself from a text_cat generated
034 * resource.
035 */
036 public class LMDataProfile
037 implements LMConstants, IterableProfile
038 {
039
040 protected List rankedNGrams;
041 protected String name;
042
043 public LMDataProfile(String name, InputStream stream)
044 {
045 this.name = name;
046 readStream(stream);
047 }
048
049 protected void readStream(InputStream stream)
050 {
051 BufferedInputStream bi = new BufferedInputStream(stream);
052 rankedNGrams = new ArrayList();
053 int b = 0;
054 byte[] bs = new byte[10];
055 try
056 {
057 do
058 {
059 int j = 0;
060 middleloop:
061 while ( ( b = bi.read() ) != -1 )
062 {
063 int i;
064 for (i = 0; i < SKIPABLE.length; i++)
065 if ( b == SKIPABLE[i] )
066 break middleloop;
067 if ( j < 10 )
068 bs[j++] = (b != '_') ? (byte) b : (byte)' ';
069 }
070 if ( j > 0 )
071 {
072 rankedNGrams.add(NGramImpl.newNGram(bs, 0, j-1));
073 }
074 }
075 while ( b != -1 );
076 }
077 catch (IOException e)
078 {
079 System.err.println("exception="+e);
080 e.printStackTrace();
081 };
082 }
083
084 public double getRank(NGram gram)
085 {
086 //XXX Very inefficient!!!
087 Iterator iter = ngrams();
088 int i = 0;
089 while ( iter.hasNext() )
090 {
091 i++;
092 if ( ((NGram)iter.next()).equals(gram) )
093 return i;
094 }
095 return 0;
096 }
097
098 public int getSize()
099 {
100 return rankedNGrams.size();
101 }
102
103 public String getName()
104 {
105 return name;
106 }
107
108 public String toString()
109 {
110 // return super.toString()+"["+name+","+rankedGrams.length+"]";
111 return getName();
112 }
113
114 public Iterator ngrams()
115 {
116 return rankedNGrams.iterator();
117 }
118 }
119