001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 020 package de.spieleck.app.ngramj; 021 022 /** 023 * Class to modell a concrete and simple NGram. 024 * <P> 025 * To make it slightly more interestion (and efficient), 026 * those NGrams follow a Flyweight pattern! I.e. 027 * of each different NGram there will only be one 028 * instance in the System. This is a bit technical, 029 * but one can safely ignore this and just 030 * deliberately call newNGram(). 031 * </P> 032 * 033 * @author Christiaan Fluit 034 * @author Frank S. Nestel 035 * @author $Author: nestefan $ 036 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 037 */ 038 public class NGramImpl 039 implements NGram 040 { 041 protected byte[] bytes; 042 protected int size; 043 044 // Power of 2 !! 045 protected static NGramImpl[] known = new NGramImpl[512]; 046 protected static int knownCount = 0; 047 protected static int knownStep = 400; 048 049 // XXX Seems to be necessary to allow subclassing: 050 protected NGramImpl() {} 051 052 private NGramImpl(byte[] bytes, int start, int length) 053 { 054 this.size = length; 055 this.bytes = new byte[length]; 056 for(int i = 0; i < length; i++ ) 057 this.bytes[i] = bytes[i+start]; 058 } 059 060 /** 061 * QuasiConstructor. 062 * FlyWeight means that we first have to look if we 063 * allready know the current beasty. 064 */ 065 public static NGram newNGram(byte[] bytes, int start) 066 { 067 return newNGram(bytes, start, bytes.length-start); 068 } 069 070 /** 071 * QuasiConstructor. 072 * FlyWeight means that we first have to look if we 073 * allready know the current beasty. 074 */ 075 public static NGram newNGram(byte[] bytes, int start, int length) 076 { 077 return newNGram(bytes, start, length, true); 078 } 079 080 public static NGram newNGram(byte[] bytes, int start, int length, boolean cacheObject) 081 { 082 int c = code(bytes, start, length); 083 int p = ( c % known.length ); 084 if ( p < 0 ) 085 p += known.length; 086 // XXX Implementation not synchronized !!! 087 while ( known[p] != null ) 088 { 089 if ( known[p].equals(bytes, start, length) ) 090 break; 091 p = ( p + 5 ) % known.length; 092 } 093 094 NGramImpl nn = known[p]; 095 096 // Do we need a new NGram?! 097 if ( nn == null ) 098 { 099 nn = new NGramImpl(bytes, start, length); 100 101 if (cacheObject) { 102 knownCount++; 103 if ( knownCount >= knownStep ) 104 { 105 // Increase cache size 106 NGramImpl[] oldknown = known; 107 int len = known.length; 108 int l2n = len * 2; 109 known = new NGramImpl[l2n]; 110 for (int i = 0; i < len; i++) 111 if ( oldknown[i] != null ) 112 { 113 int h = oldknown[i].hashCode() % l2n; 114 if ( h < 0 ) 115 h += l2n; 116 while ( known[h] != null ) 117 { 118 h = ( h + 5 ) % l2n; 119 } 120 known[h] = oldknown[i]; 121 } 122 knownStep *= 2; 123 p = nn.hashCode() % l2n; 124 if ( p < 0 ) 125 p += l2n; 126 while ( known[p] != null ) 127 p = ( p + 5 ) % l2n; 128 } 129 known[p] = nn; 130 } 131 } 132 133 return nn; 134 } 135 136 /** 137 * Return the size of the ngram. 138 */ 139 public int getSize() 140 { 141 return size; 142 } 143 144 /** 145 * Return a single byte of the NGram. 146 * @throws ArrayIndexOutOfBoundException ... 147 */ 148 public int getByte(int pos) 149 { 150 return bytes[pos]; 151 } 152 153 public static int getKnownCount() 154 { 155 return knownCount; 156 } 157 158 public boolean equals(byte[] bytes, int start, int length) 159 { 160 if ( size != length ) 161 return false; 162 else 163 { 164 for (int i = 0; i < size; i++ ) 165 if ( this.bytes[i] != bytes[i+start] ) 166 return false; 167 } 168 return true; 169 } 170 171 /** 172 * Override the hashCode by s.th. that allows to hash NGrams 173 * against tiny byte sequences. 174 */ 175 public int hashCode() 176 { 177 return code(bytes, 0, bytes.length); 178 } 179 180 /** 181 * Encode a byte sequence. 182 */ 183 public static int code(byte[] bytes, int start, int length) 184 { 185 int h = 0; 186 for (int i = 0; i < length; i++) 187 h = appendCode(h, bytes[i+start]); 188 return h; 189 } 190 191 /** 192 * scrambler for hashcodes... 193 */ 194 public final static int appendCode(int h, byte b) 195 { 196 return ( 0x50501005 * h + 0x0AAA0AAA ) ^ b; 197 } 198 199 public String toString() 200 { 201 StringBuffer sb = new StringBuffer(super.toString()+"{"); 202 for (int i = 0; i < size; i++) 203 { 204 if ( i != 0 ) 205 sb.append(";"); 206 sb.append(bytes[i]); 207 } 208 sb.append("}"); 209 return sb.toString(); 210 } 211 212 public NGramImpl getNGramImpl() 213 { 214 return this; 215 } 216 217 public static int getNGramImplCount() 218 { 219 return knownCount; 220 } 221 222 } 223