001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    
020    package de.spieleck.app.ngramj.lm;
021    
022    import java.io.InputStream;
023    import java.io.BufferedInputStream;
024    import java.io.IOException;
025    
026    import java.util.List;
027    import java.util.ArrayList;
028    import java.util.Iterator;
029    
030    import de.spieleck.app.ngramj.*;
031    
032    /**
033     * Profile implementation which reads itself from a text_cat generated
034     * resource.
035     */
036    public class LMDataProfile
037        implements LMConstants, IterableProfile
038    {
039    
040        protected List rankedNGrams;
041        protected String name;
042    
043        public LMDataProfile(String name, InputStream stream)
044        {
045            this.name = name;
046            readStream(stream);
047        }
048    
049        protected void readStream(InputStream stream)
050        {
051            BufferedInputStream bi = new BufferedInputStream(stream);
052            rankedNGrams = new ArrayList();
053            int b = 0;
054            byte[] bs = new byte[10];
055            try
056            {
057                do
058                {
059                    int j = 0;
060                middleloop:
061                    while ( ( b = bi.read() ) != -1 )
062                    {
063                        int i;
064                        for (i = 0; i < SKIPABLE.length; i++)
065                            if ( b == SKIPABLE[i] )
066                                break middleloop;
067                        if ( j < 10 )
068                            bs[j++] = (b != '_') ? (byte) b : (byte)' ';
069                    }
070                    if ( j > 0 )
071                    {
072                        rankedNGrams.add(NGramImpl.newNGram(bs, 0, j-1));
073                    }
074                }
075                while ( b != -1 );
076            }
077            catch (IOException e) 
078            { 
079                System.err.println("exception="+e);
080                e.printStackTrace();
081            };
082        }
083    
084        public double getRank(NGram gram)
085        {
086            //XXX Very inefficient!!!
087            Iterator iter = ngrams();
088            int i = 0;
089            while ( iter.hasNext() )
090            {
091                i++;
092                if ( ((NGram)iter.next()).equals(gram) )
093                    return i;
094            }
095            return 0;
096        }
097    
098        public int getSize()
099        {
100            return rankedNGrams.size();
101        }
102    
103        public String getName()
104        {
105            return name;
106        }
107    
108        public String toString()
109        {
110            // return super.toString()+"["+name+","+rankedGrams.length+"]";
111            return getName();
112        }
113    
114        public Iterator ngrams()
115        {
116            return rankedNGrams.iterator();
117        }
118    }
119