001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 020 package de.spieleck.app.ngramj.lm; 021 022 import java.io.*; 023 024 import java.util.List; 025 import java.util.ArrayList; 026 import java.util.Iterator; 027 import java.util.Enumeration; 028 029 import de.spieleck.app.ngramj.*; 030 031 /** 032 * One class to classify a profile against a set of profiles. 033 * 034 * Note this has a main() method for testing and tuning purposes. 035 */ 036 public class CategorizerImpl 037 implements Categorizer, LMConstants 038 { 039 protected List profiles = new ArrayList(); 040 041 /** 042 * Construct an uninitialized Categorizer. 043 */ 044 public CategorizerImpl() 045 throws IOException 046 { 047 InputStream ip = getClass().getResourceAsStream("profiles.lst"); 048 BufferedReader br = new BufferedReader(new InputStreamReader(ip)); 049 ArrayList al = new ArrayList(); 050 String line; 051 while ( ( line = br.readLine() ) != null ) 052 { 053 InputStream is = getClass().getResourceAsStream(line); 054 IterableProfile prof = new LMDataProfile(line, is); 055 addProfile(prof); 056 } 057 } 058 059 /** 060 * Construct an Categorizer from a whole Directory of resources. 061 */ 062 public CategorizerImpl(String dirName) 063 throws NGramException, FileNotFoundException 064 { 065 File fi = new File(dirName); 066 if ( ! fi.isDirectory() ) 067 throw new NGramException("Base must be a directory."); 068 String[] names = fi.list(LMFilter); 069 init(fi, names); 070 } 071 072 public static FilenameFilter LMFilter = new FilenameFilter() 073 { 074 public boolean accept(File dir, String name) 075 { 076 return name.endsWith(".lm"); 077 } 078 }; 079 080 /** 081 * Construct an Categorizer from a List of resource file names. 082 */ 083 public CategorizerImpl(String[] fileNames) 084 throws NGramException, FileNotFoundException 085 { 086 init(null, fileNames); 087 } 088 089 /** 090 * Fetch the set of file resources. 091 */ 092 protected void init(File fi, String[] names) 093 throws NGramException, FileNotFoundException 094 { 095 if ( names == null || names.length == 0 ) 096 throw new NGramException("Need at least one NGram input file."); 097 for (int i = 0; i < names.length; i++ ) 098 { 099 File ifi = new File(fi, names[i]); 100 InputStream in = new FileInputStream(ifi); 101 IterableProfile prof = new LMDataProfile(names[i], in); 102 addProfile(prof); 103 } 104 System.err.println("Statistics: " 105 +NGramImpl.getNGramImplCount()+" n-grams, " 106 +names.length+" Profiles." 107 +" q="+(NGramImpl.getNGramImplCount()/names.length) 108 ); 109 } 110 111 /** 112 * add an Categorization alternative to the profiles. 113 */ 114 public void addProfile(IterableProfile prof) 115 { 116 profiles.add(prof); 117 } 118 119 /** 120 * Match a given profile against the Categorizer 121 */ 122 public Profile match(Profile prof) 123 { 124 double error = Double.MAX_VALUE; 125 Profile opt = null; 126 Iterator iter = profiles.iterator(); 127 while ( iter.hasNext() ) 128 { 129 IterableProfile prof2 = (IterableProfile) iter.next(); 130 double newError = deltaRank(prof2, prof); 131 if ( newError < error ) 132 { 133 error = newError; 134 opt = prof2; 135 } 136 } 137 return opt; 138 } 139 140 /** 141 * Calculate "the distance" between two profiles 142 */ 143 public double deltaRank(IterableProfile prof1, Profile prof2) 144 { 145 double delta = 0.0; 146 Iterator grams = prof1.ngrams(); 147 int j = 0; 148 while ( grams.hasNext() ) 149 { 150 j++; 151 NGram ngram = (NGram) grams.next(); 152 double rank = prof2.getRank(ngram); 153 if ( rank != 0.0 ) 154 delta += Math.abs(rank - j ); 155 else 156 delta += USEDNGRAMS; // XXX ?! 157 } 158 return delta; 159 } 160 161 /** 162 * Sample application, like the text_cat main mode. 163 */ 164 public static void main(String[] args) 165 throws Exception 166 { 167 if ( args.length == 1 ) 168 { 169 Categorizer cath = new CategorizerImpl(); 170 EntryProfile prof = new EntryProfile(args[0], USEDNGRAMS); 171 Profile res = cath.match(prof); 172 System.err.println("Best match is: "+res); 173 } 174 else 175 { 176 Categorizer cath = new CategorizerImpl(args[0]); 177 for (int i = 1; i < args.length; i++ ) 178 { 179 EntryProfile prof = new EntryProfile(args[i], USEDNGRAMS); 180 Profile res = cath.match(prof); 181 System.err.println("Best match is: "+res); 182 } 183 } 184 } 185 186 }