001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 020 package de.spieleck.app.ngramj; 021 022 import java.io.*; 023 import java.util.*; 024 025 /** 026 * A profile to be created from a file. 027 * Use this if you have an abitrary bunch of bytes to be repackaged into 028 * ngrams. Note if you have an byte-Array in memory, you can feed it into 029 * this class via a {@link java.io.ByteArrayInputStream}. 030 * 031 * @author Frank S. Nestel 032 * @author $Author: nestefan $ 033 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 034 */ 035 public class EntryProfile 036 implements Profile, Constants 037 { 038 protected HashMap grams; 039 protected int theLimit = -1; 040 041 public EntryProfile(String fname) 042 throws IOException, FileNotFoundException 043 { 044 this(fname, -1); 045 } 046 047 public EntryProfile(String fname, int theLimit) 048 throws IOException, FileNotFoundException 049 { 050 this.theLimit = theLimit; 051 FileInputStream fi = new FileInputStream(fname); 052 digestStream(fi); 053 fi.close(); 054 } 055 056 public EntryProfile(InputStream stream) 057 throws IOException 058 { 059 this(stream, -1); 060 } 061 062 public EntryProfile(InputStream stream, int theLimit) 063 throws IOException 064 { 065 this.theLimit = theLimit; 066 digestStream(stream); 067 } 068 069 protected void digestStream(InputStream stream) 070 throws IOException 071 { 072 int i; 073 ArrayList order = ProtoReader.read(stream); 074 int limit; 075 if ( theLimit < 0 ) 076 { 077 limit = -1; 078 grams = new HashMap(order.size()); 079 } 080 else if ( order.size() < theLimit ) 081 { 082 limit = ((CountedNGram)order.get(order.size()-1)).getCount(); 083 grams = new HashMap(order.size()); 084 } 085 else 086 { 087 limit = ((CountedNGram)order.get(theLimit-1)).getCount(); 088 grams = new HashMap(theLimit); 089 } 090 i = 0; 091 while ( i < order.size() 092 && ((CountedNGram)order.get(i)).getCount() >= limit ) 093 { 094 int cnt = ((CountedNGram)order.get(i)).getCount(); 095 int j = i; 096 while ( ++j < order.size() 097 && ((CountedNGram)order.get(j)).getCount() == cnt ) ; 098 double h = (i + j + 1) * 0.5; 099 for (int k = i; k < j; k++ ) 100 grams.put(((CountedNGram)order.get(k)).getNGram(), 101 new Double(h) ); 102 // XXX Should resolve ties, otherwise behaviour is unpredictable 103 // due to internal behaviour of sort. 104 //System.err.println("---> "+((CountedNGram)order.get(i)).getNGram()+" "+i); 105 //grams.put(((CountedNGram)order.get(i)).getNGram(), new Integer(i) ); 106 i = j; 107 } 108 } 109 110 public double getRank(NGram ng) 111 { 112 Double in = (Double)grams.get(ng); 113 if ( in == null ) 114 return 0.0; 115 else 116 { 117 return in.doubleValue(); 118 } 119 } 120 121 }