001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    
020    package de.spieleck.app.ngramj;
021    
022    import java.io.*;
023    import java.util.*;
024    
025    /**
026     * A profile to be created from a file.
027     * Use this if you have an abitrary bunch of bytes to be repackaged into
028     * ngrams. Note if you have an byte-Array in memory, you can feed it into
029     * this class via a {@link java.io.ByteArrayInputStream}.
030     *
031     * @author Frank S. Nestel
032     * @author $Author: nestefan $
033     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
034     */
035    public class EntryProfile
036        implements Profile, Constants
037    {
038        protected HashMap grams;
039        protected int theLimit = -1;
040    
041        public EntryProfile(String fname)
042            throws IOException, FileNotFoundException
043        {
044            this(fname, -1);
045        }
046    
047        public EntryProfile(String fname, int theLimit)
048            throws IOException, FileNotFoundException
049        {
050            this.theLimit = theLimit;
051            FileInputStream fi = new FileInputStream(fname);
052            digestStream(fi);
053            fi.close();
054        }
055    
056        public EntryProfile(InputStream stream)
057            throws IOException
058        {
059            this(stream, -1);
060        }
061    
062        public EntryProfile(InputStream stream, int theLimit)
063            throws IOException
064        {
065            this.theLimit = theLimit;
066            digestStream(stream);
067        }
068    
069        protected void digestStream(InputStream stream)
070            throws IOException 
071        {
072            int i;
073            ArrayList order = ProtoReader.read(stream);
074            int limit;
075            if ( theLimit < 0  )
076            {
077                limit = -1;
078                grams = new HashMap(order.size());
079            }
080            else if ( order.size() < theLimit )
081            {
082                limit = ((CountedNGram)order.get(order.size()-1)).getCount();
083                grams = new HashMap(order.size());
084            }
085            else
086            {
087                limit = ((CountedNGram)order.get(theLimit-1)).getCount();
088                grams = new HashMap(theLimit);
089            }
090            i = 0;
091            while ( i < order.size() 
092                    && ((CountedNGram)order.get(i)).getCount() >= limit )
093            {
094                int cnt = ((CountedNGram)order.get(i)).getCount();
095                int j = i;
096                while ( ++j < order.size()
097                    && ((CountedNGram)order.get(j)).getCount() == cnt ) ;
098                double h = (i + j + 1) * 0.5;
099                for (int k = i; k < j; k++ )
100                    grams.put(((CountedNGram)order.get(k)).getNGram(), 
101                                new Double(h) );
102    // XXX Should resolve ties, otherwise behaviour is unpredictable
103    // due to internal behaviour of sort.
104    //System.err.println("---> "+((CountedNGram)order.get(i)).getNGram()+" "+i);
105    //grams.put(((CountedNGram)order.get(i)).getNGram(), new Integer(i) );
106                i = j;
107            }
108        }
109    
110        public double getRank(NGram ng)
111        {
112            Double in = (Double)grams.get(ng);
113            if ( in == null )
114                return 0.0;
115            else
116            {
117                return in.doubleValue();
118            }
119        }
120    
121    }