001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019
020 package de.spieleck.app.ngramj;
021
022 import java.io.*;
023 import java.util.*;
024
025 /**
026 * A profile to be created from a file.
027 * Use this if you have an abitrary bunch of bytes to be repackaged into
028 * ngrams. Note if you have an byte-Array in memory, you can feed it into
029 * this class via a {@link java.io.ByteArrayInputStream}.
030 *
031 * @author Frank S. Nestel
032 * @author $Author: nestefan $
033 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
034 */
035 public class EntryProfile
036 implements Profile, Constants
037 {
038 protected HashMap grams;
039 protected int theLimit = -1;
040
041 public EntryProfile(String fname)
042 throws IOException, FileNotFoundException
043 {
044 this(fname, -1);
045 }
046
047 public EntryProfile(String fname, int theLimit)
048 throws IOException, FileNotFoundException
049 {
050 this.theLimit = theLimit;
051 FileInputStream fi = new FileInputStream(fname);
052 digestStream(fi);
053 fi.close();
054 }
055
056 public EntryProfile(InputStream stream)
057 throws IOException
058 {
059 this(stream, -1);
060 }
061
062 public EntryProfile(InputStream stream, int theLimit)
063 throws IOException
064 {
065 this.theLimit = theLimit;
066 digestStream(stream);
067 }
068
069 protected void digestStream(InputStream stream)
070 throws IOException
071 {
072 int i;
073 ArrayList order = ProtoReader.read(stream);
074 int limit;
075 if ( theLimit < 0 )
076 {
077 limit = -1;
078 grams = new HashMap(order.size());
079 }
080 else if ( order.size() < theLimit )
081 {
082 limit = ((CountedNGram)order.get(order.size()-1)).getCount();
083 grams = new HashMap(order.size());
084 }
085 else
086 {
087 limit = ((CountedNGram)order.get(theLimit-1)).getCount();
088 grams = new HashMap(theLimit);
089 }
090 i = 0;
091 while ( i < order.size()
092 && ((CountedNGram)order.get(i)).getCount() >= limit )
093 {
094 int cnt = ((CountedNGram)order.get(i)).getCount();
095 int j = i;
096 while ( ++j < order.size()
097 && ((CountedNGram)order.get(j)).getCount() == cnt ) ;
098 double h = (i + j + 1) * 0.5;
099 for (int k = i; k < j; k++ )
100 grams.put(((CountedNGram)order.get(k)).getNGram(),
101 new Double(h) );
102 // XXX Should resolve ties, otherwise behaviour is unpredictable
103 // due to internal behaviour of sort.
104 //System.err.println("---> "+((CountedNGram)order.get(i)).getNGram()+" "+i);
105 //grams.put(((CountedNGram)order.get(i)).getNGram(), new Integer(i) );
106 i = j;
107 }
108 }
109
110 public double getRank(NGram ng)
111 {
112 Double in = (Double)grams.get(ng);
113 if ( in == null )
114 return 0.0;
115 else
116 {
117 return in.doubleValue();
118 }
119 }
120
121 }