001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001 Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    
020    package de.spieleck.app.ngramj.lm;
021    
022    import java.io.*;
023    
024    import java.util.List;
025    import java.util.ArrayList;
026    import java.util.Iterator;
027    import java.util.Enumeration;
028    
029    import de.spieleck.app.ngramj.*;
030    
031    /**
032     * One class to classify a profile against a set of profiles.
033     *
034     * Note this has a main() method for testing and tuning purposes.
035     */
036    public class CategorizerImpl
037        implements Categorizer, LMConstants 
038    {
039        protected List profiles = new ArrayList();
040    
041        /**
042         * Construct an uninitialized Categorizer.
043         */
044        public CategorizerImpl()
045            throws IOException
046        {
047            InputStream ip = getClass().getResourceAsStream("profiles.lst");
048            BufferedReader br = new BufferedReader(new InputStreamReader(ip));
049            ArrayList al = new ArrayList();
050            String line;
051            while ( ( line = br.readLine() ) != null )
052            {
053                InputStream is = getClass().getResourceAsStream(line);
054                IterableProfile prof = new LMDataProfile(line, is);
055                addProfile(prof);
056            }
057        }
058    
059        /** 
060         * Construct an Categorizer from a whole Directory of resources.
061         */
062        public CategorizerImpl(String dirName)
063            throws NGramException, FileNotFoundException
064        {
065            File fi = new File(dirName);
066            if ( ! fi.isDirectory() )
067                throw new NGramException("Base must be a directory.");
068            String[] names = fi.list(LMFilter);
069            init(fi, names);
070        }
071    
072        public static FilenameFilter LMFilter = new FilenameFilter()
073            {
074                public boolean accept(File dir, String name)
075                {
076                    return name.endsWith(".lm");
077                }
078            };
079    
080        /** 
081         * Construct an Categorizer from a List of resource file names.
082         */
083        public CategorizerImpl(String[] fileNames)
084            throws NGramException, FileNotFoundException
085        {
086            init(null, fileNames);
087        }
088    
089        /**
090         * Fetch the set of file resources.
091         */
092        protected void init(File fi, String[] names)
093            throws NGramException, FileNotFoundException
094        {
095            if ( names == null || names.length == 0 )
096                throw new NGramException("Need at least one NGram input file.");
097            for (int i = 0; i < names.length; i++ )
098            {
099                File ifi = new File(fi, names[i]);
100                InputStream in = new FileInputStream(ifi);
101                IterableProfile prof = new LMDataProfile(names[i], in);
102                addProfile(prof);
103            }
104            System.err.println("Statistics: "
105                                +NGramImpl.getNGramImplCount()+" n-grams, "
106                                +names.length+" Profiles."
107                                +" q="+(NGramImpl.getNGramImplCount()/names.length)
108                            );
109        }
110    
111        /**
112         * add an Categorization alternative to the profiles.
113         */
114        public void addProfile(IterableProfile prof)
115        {
116            profiles.add(prof);
117        }
118    
119        /**
120         * Match a given profile against the Categorizer
121         */
122        public Profile match(Profile prof)
123        {
124            double error = Double.MAX_VALUE;
125            Profile opt = null;
126            Iterator iter = profiles.iterator();
127            while ( iter.hasNext() )
128            {
129                IterableProfile prof2 = (IterableProfile) iter.next();
130                double newError = deltaRank(prof2, prof);
131                if ( newError < error )
132                {
133                    error = newError;
134                    opt = prof2;
135                }
136            }
137            return opt;
138        }
139    
140        /**
141         * Calculate "the distance" between two profiles
142         */
143        public double deltaRank(IterableProfile prof1, Profile prof2)
144        {
145            double delta = 0.0;
146            Iterator grams = prof1.ngrams();
147            int j = 0;
148            while ( grams.hasNext() )
149            {
150                j++;
151                NGram ngram = (NGram) grams.next();
152                double rank = prof2.getRank(ngram);
153                if ( rank != 0.0 )
154                    delta += Math.abs(rank - j );
155                else
156                    delta += USEDNGRAMS; // XXX ?!
157            }
158            return delta;
159        }
160    
161        /**
162         * Sample application, like the text_cat main mode.
163         */
164        public static void main(String[] args)
165            throws Exception
166        {
167            if ( args.length == 1 )
168            {
169                Categorizer cath = new CategorizerImpl();
170                EntryProfile prof = new EntryProfile(args[0], USEDNGRAMS);
171                Profile res = cath.match(prof);
172                System.err.println("Best match is: "+res);
173            }
174            else
175            {
176                Categorizer cath = new CategorizerImpl(args[0]);
177                for (int i = 1; i < args.length; i++ )
178                {
179                    EntryProfile prof = new EntryProfile(args[i], USEDNGRAMS);
180                    Profile res = cath.match(prof);
181                    System.err.println("Best match is: "+res);
182                }
183            }
184        }
185    
186    }