001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019
020 package de.spieleck.app.ngramj.phoner;
021
022 import java.util.Map;
023 import java.util.HashMap;
024 import java.util.ArrayList;
025 import java.util.Collections;
026
027 import de.spieleck.app.ngramj.*;
028 import de.spieleck.util.*;
029
030 /**
031 * Even better enumeration, does not enumerate the phone
032 * numbers, but their corresponding profiles.
033 */
034 public class PhonerProfileEnum
035 extends PhoneKeys
036 {
037 protected int i, length;
038 protected byte[][] charSets;
039 protected byte[] bytes;
040 protected int[] j;
041 protected int gtcount[], eqcount[];
042 protected IntMap grams;
043 protected boolean notFirst = false;
044
045 public PhonerProfileEnum(String pnumber)
046 throws IllegalArgumentException
047 {
048 length = pnumber.length();
049 charSets = new byte[length][];
050 for (i = 0; i < length; i++)
051 {
052 String ch = String.valueOf(pnumber.charAt(i));
053 charSets[i] = (byte[])replacers.get(ch);
054 // Non existing characters are taken verbatim
055 if ( charSets[i] == null )
056 charSets[i] = new byte[] { ch.getBytes()[0] };
057 }
058 bytes = new byte[length+2];
059 // Attach word boundaries at both ends of the number...
060 bytes[0] = bytes[i+1] = (byte)' ';
061 j = new int[length];
062 i = 0;
063 j[0] = -1;
064 grams = new IntMap(3 << (2*length) );
065 // XXX ??? do those dimensions carry thru?
066 // Is it clear that no ngram is there more than length times??
067 gtcount = new int[length+1];
068 eqcount = new int[length+1];
069 }
070
071 public Profile next()
072 {
073 // System.err.println("### "+i);
074 if ( notFirst )
075 {
076 addNGrams(length+1, -1);
077 i--;
078 }
079 notFirst = true;
080 while ( i >= 0 )
081 {
082 j[i]++;
083 int i1 = i+1;
084 if ( j[i] >= charSets[i].length )
085 {
086 addNGrams(i1, -1);
087 i--;
088 }
089 else
090 {
091 if ( j[i] > 0 )
092 addNGrams(i1, -1);
093 // Add the new byte
094 bytes[i1] = charSets[i][j[i]];
095 i = i1;
096 addNGrams(i, 1);
097 if ( i == length )
098 {
099 addNGrams(length+1, 1);
100 if(false)
101 {
102 System.err.println();
103 for(int k = 0; k < length+2; k++) System.err.print(bytes[k]+" ");
104 System.err.println();
105 java.util.Enumeration en = grams.keys();
106 while ( en.hasMoreElements() )
107 {
108 Object o = en.nextElement();
109 int n = grams.get(o);
110 if ( n > 0 )
111 {
112 System.err.println("> "+n+" "+o);
113 }
114 }
115 for(int k = 1; k < length; k++) System.err.println(k+".: "+gtcount[k]+" "+eqcount[k]);
116 }
117 return returnProf;
118 }
119 else
120 {
121 j[i] = -1;
122 }
123 }
124 }
125 return null;
126 }
127
128 protected void addNGrams(int pos, int off)
129 {
130 int len = 0;
131 while ( len < 5 )
132 {
133 int start = pos - len;
134 if ( start < 0 )
135 break;
136 len++; // we allready need len+1 below here
137 NGram ng = NGramImpl.newNGram(bytes, start, len);
138 int cng = grams.get(ng);
139 // System.err.println("~ "+off+" "+ng+" "+cng);
140 if ( off > 0 )
141 {
142 if ( cng != grams.getNullValue() )
143 {
144 gtcount[cng]++;
145 eqcount[cng]--;
146 cng += off;
147 grams.put(ng, cng);
148 eqcount[cng]++;
149 }
150 else
151 {
152 // we don't care about zero counts! gtcount[0]++;
153 eqcount[off]++;
154 grams.put(ng, off);
155 }
156 }
157 else
158 {
159 if ( cng != grams.getNullValue() )
160 {
161 eqcount[cng]--;
162 cng += off;
163 grams.put(ng, cng);
164 gtcount[cng]--;
165 eqcount[cng]++;
166 }
167 }
168 }
169 }
170
171 public byte[] getRes()
172 {
173 return bytes;
174 }
175
176 protected Profile returnProf = new Profile()
177 {
178 public double getRank(NGram ng)
179 {
180 // It is enough to return a Pseudorank
181 // since the evaluation is very coarse:
182 int n = grams.get(ng);
183 if ( n == grams.getNullValue() || n == 0 )
184 {
185 // System.err.println(ng+" null");
186 return 0.0;
187 }
188 else
189 {
190 double h = gtcount[n] + 0.5 * ( eqcount[n] + 1 );
191 // System.err.println(ng+" "+h);
192 return h;
193 }
194 }
195 };
196 }
197