001    /*
002    NGramJ - n-gram based text classification
003    Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004    
005    This program is free software; you can redistribute it and/or modify
006    it under the terms of the GNU Lesser General Public License as published 
007    by the Free Software Foundation; either version 2.1 of the License, or
008    (at your option) any later version.
009    
010    This program is distributed in the hope that it will be useful,
011    but WITHOUT ANY WARRANTY; without even the implied warranty of
012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013    GNU General Public License for more details.
014    
015    You should have received a copy of the GNU Lesser General Public License
016    along with this program (lesser.txt); if not, write to the Free Software
017    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018    */
019    package de.spieleck.app.cngram;
020    
021    import java.util.Iterator;
022    
023    /**
024     * ALPHA modified Chi^2 Metric without rectification of classes
025     * and with centrification between the two profiles.
026     * @author frank nestel
027     * @author $Author: nestefan $
028     * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
029     */
030    public class C2xMetric
031      implements NGramMetric
032    {
033      public final static double ALPHA = 1.0; // 0.0 is exact
034    
035      public double diff(NGramProfile p1, NGramProfile p2)
036      {
037        double sum = 0.0;
038        int norm1 = p1.getNormalization();
039        int norm2 = p2.getNormalization();
040        int norm = norm1 + norm2;
041        double d1 = ((double)norm1) / norm;
042        double d2 = ((double)norm2) / norm;
043        double u;
044    
045        // Treat all NGrams contained in p1;
046        Iterator i = p1.getSorted();
047        while (i.hasNext())
048        {
049          NGram ng1 = (NGram) i.next();
050          NGram ng2 = p2.get(ng1);
051          int c1 = ng1.getCount();
052          int c2 = 0;
053          if ( ng2 != null )
054            c2 = ng2.getCount();
055          u = d1 * (c1+c2);
056          sum += (c1 - u)*(c1 - u)/(u + ALPHA);
057          u = d2 * (c1+c2);
058          sum += (c2 - u)*(c2 - u)/(u + ALPHA);
059        }
060    
061        // Treat NGrams contained ONLY in p2
062        i = p2.getSorted();
063        while (i.hasNext())
064        {
065          NGram ng2 = (NGram) i.next();
066          if ( p1.get(ng2) == null )
067          {
068            // int c1 = 0;
069            int c2 = ng2.getCount();
070            u = d1 * c2; // = d1 * (c1+c2);
071            sum += u*u / (u + ALPHA); // = (c1 - u)*(c1 - u)/u;
072            u = d2 * c2; // = d2 * (c1+c2);
073            sum += (c2 - u)*(c2 - u)/ (u + ALPHA);
074          }
075        }
076    
077        return sum/norm*(1.0+ALPHA); // "Wrong" but nicer normalization
078      }
079    }