001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019
020 package de.spieleck.app.ngramj;
021
022 /**
023 * Class to modell a concrete and simple NGram.
024 * <P>
025 * To make it slightly more interestion (and efficient),
026 * those NGrams follow a Flyweight pattern! I.e.
027 * of each different NGram there will only be one
028 * instance in the System. This is a bit technical,
029 * but one can safely ignore this and just
030 * deliberately call newNGram().
031 * </P>
032 *
033 * @author Christiaan Fluit
034 * @author Frank S. Nestel
035 * @author $Author: nestefan $
036 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
037 */
038 public class NGramImpl
039 implements NGram
040 {
041 protected byte[] bytes;
042 protected int size;
043
044 // Power of 2 !!
045 protected static NGramImpl[] known = new NGramImpl[512];
046 protected static int knownCount = 0;
047 protected static int knownStep = 400;
048
049 // XXX Seems to be necessary to allow subclassing:
050 protected NGramImpl() {}
051
052 private NGramImpl(byte[] bytes, int start, int length)
053 {
054 this.size = length;
055 this.bytes = new byte[length];
056 for(int i = 0; i < length; i++ )
057 this.bytes[i] = bytes[i+start];
058 }
059
060 /**
061 * QuasiConstructor.
062 * FlyWeight means that we first have to look if we
063 * allready know the current beasty.
064 */
065 public static NGram newNGram(byte[] bytes, int start)
066 {
067 return newNGram(bytes, start, bytes.length-start);
068 }
069
070 /**
071 * QuasiConstructor.
072 * FlyWeight means that we first have to look if we
073 * allready know the current beasty.
074 */
075 public static NGram newNGram(byte[] bytes, int start, int length)
076 {
077 return newNGram(bytes, start, length, true);
078 }
079
080 public static NGram newNGram(byte[] bytes, int start, int length, boolean cacheObject)
081 {
082 int c = code(bytes, start, length);
083 int p = ( c % known.length );
084 if ( p < 0 )
085 p += known.length;
086 // XXX Implementation not synchronized !!!
087 while ( known[p] != null )
088 {
089 if ( known[p].equals(bytes, start, length) )
090 break;
091 p = ( p + 5 ) % known.length;
092 }
093
094 NGramImpl nn = known[p];
095
096 // Do we need a new NGram?!
097 if ( nn == null )
098 {
099 nn = new NGramImpl(bytes, start, length);
100
101 if (cacheObject) {
102 knownCount++;
103 if ( knownCount >= knownStep )
104 {
105 // Increase cache size
106 NGramImpl[] oldknown = known;
107 int len = known.length;
108 int l2n = len * 2;
109 known = new NGramImpl[l2n];
110 for (int i = 0; i < len; i++)
111 if ( oldknown[i] != null )
112 {
113 int h = oldknown[i].hashCode() % l2n;
114 if ( h < 0 )
115 h += l2n;
116 while ( known[h] != null )
117 {
118 h = ( h + 5 ) % l2n;
119 }
120 known[h] = oldknown[i];
121 }
122 knownStep *= 2;
123 p = nn.hashCode() % l2n;
124 if ( p < 0 )
125 p += l2n;
126 while ( known[p] != null )
127 p = ( p + 5 ) % l2n;
128 }
129 known[p] = nn;
130 }
131 }
132
133 return nn;
134 }
135
136 /**
137 * Return the size of the ngram.
138 */
139 public int getSize()
140 {
141 return size;
142 }
143
144 /**
145 * Return a single byte of the NGram.
146 * @throws ArrayIndexOutOfBoundException ...
147 */
148 public int getByte(int pos)
149 {
150 return bytes[pos];
151 }
152
153 public static int getKnownCount()
154 {
155 return knownCount;
156 }
157
158 public boolean equals(byte[] bytes, int start, int length)
159 {
160 if ( size != length )
161 return false;
162 else
163 {
164 for (int i = 0; i < size; i++ )
165 if ( this.bytes[i] != bytes[i+start] )
166 return false;
167 }
168 return true;
169 }
170
171 /**
172 * Override the hashCode by s.th. that allows to hash NGrams
173 * against tiny byte sequences.
174 */
175 public int hashCode()
176 {
177 return code(bytes, 0, bytes.length);
178 }
179
180 /**
181 * Encode a byte sequence.
182 */
183 public static int code(byte[] bytes, int start, int length)
184 {
185 int h = 0;
186 for (int i = 0; i < length; i++)
187 h = appendCode(h, bytes[i+start]);
188 return h;
189 }
190
191 /**
192 * scrambler for hashcodes...
193 */
194 public final static int appendCode(int h, byte b)
195 {
196 return ( 0x50501005 * h + 0x0AAA0AAA ) ^ b;
197 }
198
199 public String toString()
200 {
201 StringBuffer sb = new StringBuffer(super.toString()+"{");
202 for (int i = 0; i < size; i++)
203 {
204 if ( i != 0 )
205 sb.append(";");
206 sb.append(bytes[i]);
207 }
208 sb.append("}");
209 return sb.toString();
210 }
211
212 public NGramImpl getNGramImpl()
213 {
214 return this;
215 }
216
217 public static int getNGramImplCount()
218 {
219 return knownCount;
220 }
221
222 }
223