001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019
020 package de.spieleck.app.ngramj;
021
022 import java.io.IOException;
023 import java.io.InputStream;
024 import java.io.BufferedInputStream;
025 import java.util.ArrayList;
026 import java.util.Collections;
027 import java.util.HashMap;
028
029 /**
030 * Class to hold (static) methods to read in profile data
031 * XXX Probably needs some cleaning up and probably can be optimized.
032 *
033 * @author Christiaan Fluit
034 * @author Frank S. Nestel
035 * @author $Author: nestefan $
036 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
037 */
038 public class ProtoReader
039 {
040 public static ArrayList read(InputStream stream)
041 throws IOException
042 {
043 // XXX to get the last performance kick a high performance
044 // HashMap replacement should be dropped in here (e.g. trove
045 // stuff or s.th. self brewn.
046 HashMap count = new HashMap(1000);
047 BufferedInputStream bi = new BufferedInputStream(stream);
048 int b;
049 byte ba[] = new byte[5];
050 ba[4] = 42;
051 int i = 0;
052 while ( ( b = bi.read() ) != -1 )
053 {
054 // XXX ???
055 if ( b == 13 || b == 10 || b == 9 )
056 b = 32;
057 i++;
058 if ( b != 32 || ba[3] != 32 )
059 {
060 ba[0] = ba[1];
061 ba[1] = ba[2];
062 ba[2] = ba[3];
063 ba[3] = ba[4];
064 ba[4] = (byte)b;
065 newNGram(count, ba, 4, 1);
066 if ( i > 1 )
067 newNGram(count, ba,3,2);
068 if ( i > 2 )
069 newNGram(count, ba,2,3);
070 if ( i > 3 )
071 newNGram(count, ba,1,4);
072 if ( i > 4 )
073 newNGram(count, ba,0,5);
074 }
075 }
076 ArrayList order = new ArrayList(count.values());
077 Collections.sort(order);
078 return order;
079 }
080
081 protected static void newNGram(HashMap count, byte[] ba, int start, int len)
082 {
083 NGram ng = NGramImpl.newNGram(ba, start, len, false);
084 CountedNGram cng = (CountedNGram) count.get(ng);
085 if ( cng != null )
086 cng.inc();
087 else
088 count.put(ng, new CountedNGram(ng));
089 }
090
091 }
092