/*
NGramJ - n-gram based text classification
Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program (lesser.txt); if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

package de.spieleck.app.ngramj;

import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;

/**
 * Class to hold (static) methods to read in profile data
 * XXX Probably needs some cleaning up and probably can be optimized.
 *
 * @author Christiaan Fluit
 * @author Frank S. Nestel
 * @author $Author: nestefan $
 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
 */
public class ProtoReader
{
    /** Length of the longest n-gram that is counted; also the window size. */
    private static final int MAX_LEN = 5;

    /** Byte value that all whitespace is normalized to. */
    private static final int BLANK = 32;

    /**
     * Counts all n-grams of length 1 to 5 in the bytes of a stream.
     * <p>
     * CR, LF and TAB are treated as a blank, and a blank that directly
     * follows another blank is dropped, so every run of whitespace
     * contributes exactly one blank to the n-grams.
     * <p>
     * The stream is read until end of file. It is not closed by this
     * method.
     *
     * @param stream source of the raw bytes to analyse
     * @return list holding one CountedNGram per distinct n-gram found,
     *         sorted with {@link Collections#sort(java.util.List)}, i.e.
     *         in the natural order of CountedNGram
     * @throws IOException if reading from the stream fails
     */
    public static ArrayList read(InputStream stream)
        throws IOException
    {
        // XXX to get the last performance kick a high performance
        // HashMap replacement should be dropped in here (e.g. trove
        // stuff or s.th. self brewn.
        HashMap count = new HashMap(1000);
        BufferedInputStream bi = new BufferedInputStream(stream);
        // Sliding window over the most recent bytes, newest byte last.
        byte ba[] = new byte[MAX_LEN];
        // Non-blank placeholder in the "previous byte" slot, so a blank at
        // the very start of the stream is kept. It is never part of an
        // n-gram because only the 'filled' newest slots are used below.
        ba[MAX_LEN - 1] = 42;
        // Number of window slots holding real input, capped at MAX_LEN.
        // Only bytes that were actually shifted in are counted; counting
        // dropped blanks as well would let placeholder bytes leak into
        // n-grams at the start of the input.
        int filled = 0;
        int b;
        while ( ( b = bi.read() ) != -1 )
        {
            // Normalize CR, LF and TAB to a plain blank.
            if ( b == 13 || b == 10 || b == 9 )
                b = BLANK;
            // Collapse whitespace runs: the previous accepted byte is the
            // newest window slot, because the window is not shifted yet.
            if ( b == BLANK && ba[MAX_LEN - 1] == BLANK )
                continue;
            // Shift the window left by one and append the new byte.
            System.arraycopy(ba, 1, ba, 0, MAX_LEN - 1);
            ba[MAX_LEN - 1] = (byte) b;
            if ( filled < MAX_LEN )
                filled++;
            // Count every n-gram that ends at the new byte, shortest first.
            for ( int len = 1; len <= filled; len++ )
                newNGram(count, ba, MAX_LEN - len, len);
        }
        ArrayList order = new ArrayList(count.values());
        Collections.sort(order);
        return order;
    }

    /**
     * Registers one occurrence of the n-gram ba[start .. start+len-1].
     * <p>
     * If the n-gram is already a key of the map its counter is
     * incremented via CountedNGram.inc(), otherwise a new CountedNGram is
     * stored for it.
     *
     * @param count map from NGram to its CountedNGram
     * @param ba    buffer containing the bytes of the n-gram
     * @param start index of the first byte of the n-gram in ba
     * @param len   number of bytes of the n-gram
     */
    protected static void newNGram(HashMap count, byte[] ba, int start, int len)
    {
        NGram ng = NGramImpl.newNGram(ba, start, len, false);
        CountedNGram cng = (CountedNGram) count.get(ng);
        if ( cng != null )
            cng.inc();
        else
            count.put(ng, new CountedNGram(ng));
    }

}