001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 020 package de.spieleck.app.ngramj.phoner; 021 022 import java.util.Map; 023 import java.util.HashMap; 024 import java.util.ArrayList; 025 import java.util.Collections; 026 027 import de.spieleck.app.ngramj.*; 028 import de.spieleck.util.*; 029 030 /** 031 * Even better enumeration, does not enumerate the phone 032 * numbers, but their corresponding profiles. 033 */ 034 public class PhonerProfileEnum 035 extends PhoneKeys 036 { 037 protected int i, length; 038 protected byte[][] charSets; 039 protected byte[] bytes; 040 protected int[] j; 041 protected int gtcount[], eqcount[]; 042 protected IntMap grams; 043 protected boolean notFirst = false; 044 045 public PhonerProfileEnum(String pnumber) 046 throws IllegalArgumentException 047 { 048 length = pnumber.length(); 049 charSets = new byte[length][]; 050 for (i = 0; i < length; i++) 051 { 052 String ch = String.valueOf(pnumber.charAt(i)); 053 charSets[i] = (byte[])replacers.get(ch); 054 // Non existing characters are taken verbatim 055 if ( charSets[i] == null ) 056 charSets[i] = new byte[] { ch.getBytes()[0] }; 057 } 058 bytes = new byte[length+2]; 059 // Attach word boundaries at both ends of the number... 060 bytes[0] = bytes[i+1] = (byte)' '; 061 j = new int[length]; 062 i = 0; 063 j[0] = -1; 064 grams = new IntMap(3 << (2*length) ); 065 // XXX ??? do those dimensions carry thru? 066 // Is it clear that no ngram is there more than length times?? 067 gtcount = new int[length+1]; 068 eqcount = new int[length+1]; 069 } 070 071 public Profile next() 072 { 073 // System.err.println("### "+i); 074 if ( notFirst ) 075 { 076 addNGrams(length+1, -1); 077 i--; 078 } 079 notFirst = true; 080 while ( i >= 0 ) 081 { 082 j[i]++; 083 int i1 = i+1; 084 if ( j[i] >= charSets[i].length ) 085 { 086 addNGrams(i1, -1); 087 i--; 088 } 089 else 090 { 091 if ( j[i] > 0 ) 092 addNGrams(i1, -1); 093 // Add the new byte 094 bytes[i1] = charSets[i][j[i]]; 095 i = i1; 096 addNGrams(i, 1); 097 if ( i == length ) 098 { 099 addNGrams(length+1, 1); 100 if(false) 101 { 102 System.err.println(); 103 for(int k = 0; k < length+2; k++) System.err.print(bytes[k]+" "); 104 System.err.println(); 105 java.util.Enumeration en = grams.keys(); 106 while ( en.hasMoreElements() ) 107 { 108 Object o = en.nextElement(); 109 int n = grams.get(o); 110 if ( n > 0 ) 111 { 112 System.err.println("> "+n+" "+o); 113 } 114 } 115 for(int k = 1; k < length; k++) System.err.println(k+".: "+gtcount[k]+" "+eqcount[k]); 116 } 117 return returnProf; 118 } 119 else 120 { 121 j[i] = -1; 122 } 123 } 124 } 125 return null; 126 } 127 128 protected void addNGrams(int pos, int off) 129 { 130 int len = 0; 131 while ( len < 5 ) 132 { 133 int start = pos - len; 134 if ( start < 0 ) 135 break; 136 len++; // we allready need len+1 below here 137 NGram ng = NGramImpl.newNGram(bytes, start, len); 138 int cng = grams.get(ng); 139 // System.err.println("~ "+off+" "+ng+" "+cng); 140 if ( off > 0 ) 141 { 142 if ( cng != grams.getNullValue() ) 143 { 144 gtcount[cng]++; 145 eqcount[cng]--; 146 cng += off; 147 grams.put(ng, cng); 148 eqcount[cng]++; 149 } 150 else 151 { 152 // we don't care about zero counts! gtcount[0]++; 153 eqcount[off]++; 154 grams.put(ng, off); 155 } 156 } 157 else 158 { 159 if ( cng != grams.getNullValue() ) 160 { 161 eqcount[cng]--; 162 cng += off; 163 grams.put(ng, cng); 164 gtcount[cng]--; 165 eqcount[cng]++; 166 } 167 } 168 } 169 } 170 171 public byte[] getRes() 172 { 173 return bytes; 174 } 175 176 protected Profile returnProf = new Profile() 177 { 178 public double getRank(NGram ng) 179 { 180 // It is enough to return a Pseudorank 181 // since the evaluation is very coarse: 182 int n = grams.get(ng); 183 if ( n == grams.getNullValue() || n == 0 ) 184 { 185 // System.err.println(ng+" null"); 186 return 0.0; 187 } 188 else 189 { 190 double h = gtcount[n] + 0.5 * ( eqcount[n] + 1 ); 191 // System.err.println(ng+" "+h); 192 return h; 193 } 194 } 195 }; 196 } 197