001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    package de.spieleck.app.cngram;
020    
021    import java.util.Iterator;
022    
023    /**
024     * Chi^2 Metric without rectification of classes
025     * @author frank nestel
026     * @author $Author: nestefan $
027     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
028     */
029    public class C2Metric
030      implements NGramMetric
031    {
032      public double diff(NGramProfile p1, NGramProfile p2)
033      {
034        double sum = 0.0;
035        int norm1 = p1.getNormalization();
036        int norm2 = p2.getNormalization();
037    
038        // Treat all NGrams contained in p1;
039        Iterator i = p1.getSorted();
040        while (i.hasNext())
041        {
042          NGram ng1 = (NGram) i.next();
043          NGram ng2 = p2.get(ng1);
044          int c1 = ng1.getCount();
045          if ( ng2 != null )
046          {
047            int c2 = ng2.getCount();
048            sum += ((double) c1)/norm1 * c1 /(c1+c2);
049            sum += ((double) c2)/norm2 * c2 /(c1+c2);
050          } else {
051            sum += ((double) c1)/norm1;
052          }
053        }
054    
055        // Treat NGrams contained ONLY in p2
056        i = p2.getSorted();
057        while (i.hasNext())
058        {
059          NGram ng2 = (NGram) i.next();
060          if ( p1.get(ng2) == null )
061          {
062            int c2 = ng2.getCount();
063            sum += ((double) c2)/norm2;
064          }
065        }
066        // Would be Chi^2 return (norm1+norm2)*(sum-1.0);
067        return sum-1.0;
068      }
069    }