001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    
020    package de.spieleck.app.ngramj;
021    
022    /**
023     * Class to modell a concrete and simple NGram.
024     * <P>
025     * To make it slightly more interestion (and efficient),
026     * those NGrams follow a Flyweight pattern! I.e.
027     * of each different NGram there will only be one
028     * instance in the System. This is a bit technical,
029     * but one can safely ignore this and just 
030     * deliberately call newNGram().
031     * </P>
032     * 
033     * @author Christiaan Fluit
034     * @author Frank S. Nestel
035     * @author $Author: nestefan $
036     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
037     */
038    public class NGramImpl
039        implements NGram
040    {
041        protected byte[] bytes;
042        protected int size;
043    
044        // Power of 2 !!
045        protected static NGramImpl[] known = new NGramImpl[512];
046        protected static int knownCount = 0;
047        protected static int knownStep  = 400;
048    
049        // XXX Seems to be necessary to allow subclassing:
050        protected NGramImpl() {}
051    
052        private NGramImpl(byte[] bytes, int start, int length)
053        {
054            this.size = length;
055            this.bytes = new byte[length];
056            for(int i = 0; i < length; i++ )
057                this.bytes[i] = bytes[i+start];
058        }
059    
060        /**
061         * QuasiConstructor.
062         * FlyWeight means that we first have to look if we
063         * allready know the current beasty.
064         */
065        public static NGram newNGram(byte[] bytes, int start)
066        {
067            return newNGram(bytes, start, bytes.length-start);
068        }
069    
070        /**
071         * QuasiConstructor.
072         * FlyWeight means that we first have to look if we
073         * allready know the current beasty.
074         */
075        public static NGram newNGram(byte[] bytes, int start, int length)
076        {
077                    return newNGram(bytes, start, length, true);
078            }
079    
080        public static NGram newNGram(byte[] bytes, int start, int length, boolean cacheObject)
081            {
082            int c = code(bytes, start, length);
083            int p = ( c % known.length );
084            if ( p < 0 ) 
085                p += known.length;
086            // XXX Implementation not synchronized !!!
087            while ( known[p] != null )
088            {
089                if ( known[p].equals(bytes, start, length) )
090                    break;
091                p = ( p + 5 ) % known.length;
092            }
093    
094                    NGramImpl nn = known[p];
095    
096            // Do we need a new NGram?!
097            if ( nn == null )
098            {
099                nn = new NGramImpl(bytes, start, length);
100    
101                            if (cacheObject) {
102                    knownCount++;
103                    if ( knownCount >= knownStep )
104                    {
105                                            // Increase cache size
106                            NGramImpl[] oldknown = known;
107                            int len = known.length;
108                            int l2n = len * 2;
109                            known = new NGramImpl[l2n];
110                            for (int i = 0; i < len; i++)
111                            if ( oldknown[i] != null )
112                            {
113                                    int h = oldknown[i].hashCode() % l2n;
114                                    if ( h < 0 ) 
115                                    h += l2n;
116                                    while ( known[h] != null )
117                                    {
118                                    h = ( h + 5 ) % l2n;
119                                    }
120                                    known[h] = oldknown[i];
121                            }
122                            knownStep *= 2;
123                            p = nn.hashCode() % l2n;
124                            if ( p < 0 )
125                            p += l2n;
126                            while ( known[p] != null ) 
127                            p = ( p + 5 ) % l2n;
128                    }
129                    known[p] = nn;
130                            }
131            }
132    
133            return nn;
134        }
135    
136        /**
137         * Return the size of the ngram.
138         */
139        public int getSize()
140        {
141            return size;
142        }
143    
144        /** 
145         * Return a single byte of the NGram.
146         * @throws ArrayIndexOutOfBoundException ...
147         */
148        public int getByte(int pos)
149        {
150            return bytes[pos];
151        }
152    
153        public static int getKnownCount()
154        {
155            return knownCount;
156        }
157    
158        public boolean equals(byte[] bytes, int start, int length)
159        {
160            if ( size != length )
161                return false;
162            else
163            {
164                for (int i = 0; i < size; i++ )
165                    if ( this.bytes[i] != bytes[i+start] )
166                        return false;
167            }
168            return true;
169        }
170    
171        /**
172         * Override the hashCode by s.th. that allows to hash NGrams
173         * against tiny byte sequences.
174         */
175        public int hashCode()
176        {
177            return code(bytes, 0, bytes.length);
178        }
179    
180        /**
181         * Encode a byte sequence.
182         */
183        public static int code(byte[] bytes, int start, int length)
184        {
185            int h = 0;
186            for (int i = 0; i < length; i++)
187                h = appendCode(h, bytes[i+start]);
188            return h;
189        }
190    
191        /**
192         * scrambler for hashcodes...
193         */
194        public final static int appendCode(int h, byte b)
195        {
196            return ( 0x50501005 * h + 0x0AAA0AAA ) ^ b;
197        }
198    
199        public String toString()
200        {
201            StringBuffer sb = new StringBuffer(super.toString()+"{");
202            for (int i = 0; i < size; i++)
203            {
204                if ( i != 0 )
205                    sb.append(";");
206                sb.append(bytes[i]);
207            }
208            sb.append("}");
209            return sb.toString();
210        }
211    
212        public NGramImpl getNGramImpl()
213        {
214            return this;
215        }
216    
217        public static int getNGramImplCount()
218        {
219            return knownCount;
220        }
221    
222    }
223