001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    
020    package de.spieleck.app.ngramj;
021    
022    import java.io.IOException;
023    import java.io.InputStream;
024    import java.io.BufferedInputStream;
025    import java.util.ArrayList;
026    import java.util.Collections;
027    import java.util.HashMap;
028    
029    /**
030     * Class to hold (static) methods to read in profile data
031     * XXX Probably needs some cleaning up and probably can be optimized.
032     *
033     * @author Christiaan Fluit
034     * @author Frank S. Nestel
035     * @author $Author: nestefan $
036     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
037     */
038    public class ProtoReader
039    {
040        public static ArrayList read(InputStream stream)
041            throws IOException 
042        {
043            // XXX to get the last performance kick a high performance
044            // HashMap replacement should be dropped in here (e.g. trove
045            // stuff or s.th. self brewn.
046            HashMap count = new HashMap(1000);
047            BufferedInputStream bi = new BufferedInputStream(stream);
048            int b;
049            byte ba[] = new byte[5];
050            ba[4] = 42;
051            int i = 0;
052            while ( ( b = bi.read() ) != -1 )
053            {
054                // XXX ???
055                if ( b == 13 || b == 10 || b == 9 )
056                    b = 32;
057                i++;
058                if ( b != 32 || ba[3] != 32 )
059                {
060                    ba[0] = ba[1];
061                    ba[1] = ba[2];
062                    ba[2] = ba[3];
063                    ba[3] = ba[4];
064                    ba[4] = (byte)b;
065                    newNGram(count, ba, 4, 1);
066                    if ( i > 1 )
067                        newNGram(count, ba,3,2);
068                    if ( i > 2 )
069                        newNGram(count, ba,2,3);
070                    if ( i > 3 )
071                        newNGram(count, ba,1,4);
072                    if ( i > 4 )
073                        newNGram(count, ba,0,5);
074                }
075            }
076            ArrayList order = new ArrayList(count.values());
077            Collections.sort(order);
078            return order;
079        }
080    
081        protected static void newNGram(HashMap count, byte[] ba, int start, int len)
082        {
083            NGram ng = NGramImpl.newNGram(ba, start, len, false);
084            CountedNGram cng = (CountedNGram) count.get(ng);
085            if ( cng != null )
086                cng.inc();
087            else
088                count.put(ng, new CountedNGram(ng));
089        }
090    
091    }
092