001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019 package de.spieleck.app.cngram;
020
021 import java.util.Iterator;
022
023 /**
024 * ALPHA modified Chi^2 Metric without rectification of classes
025 * and with centrification between the two profiles.
026 * @author frank nestel
027 * @author $Author: nestefan $
028 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
029 */
030 public class C2xMetric
031 implements NGramMetric
032 {
033 public final static double ALPHA = 1.0; // 0.0 is exact
034
035 public double diff(NGramProfile p1, NGramProfile p2)
036 {
037 double sum = 0.0;
038 int norm1 = p1.getNormalization();
039 int norm2 = p2.getNormalization();
040 int norm = norm1 + norm2;
041 double d1 = ((double)norm1) / norm;
042 double d2 = ((double)norm2) / norm;
043 double u;
044
045 // Treat all NGrams contained in p1;
046 Iterator i = p1.getSorted();
047 while (i.hasNext())
048 {
049 NGram ng1 = (NGram) i.next();
050 NGram ng2 = p2.get(ng1);
051 int c1 = ng1.getCount();
052 int c2 = 0;
053 if ( ng2 != null )
054 c2 = ng2.getCount();
055 u = d1 * (c1+c2);
056 sum += (c1 - u)*(c1 - u)/(u + ALPHA);
057 u = d2 * (c1+c2);
058 sum += (c2 - u)*(c2 - u)/(u + ALPHA);
059 }
060
061 // Treat NGrams contained ONLY in p2
062 i = p2.getSorted();
063 while (i.hasNext())
064 {
065 NGram ng2 = (NGram) i.next();
066 if ( p1.get(ng2) == null )
067 {
068 // int c1 = 0;
069 int c2 = ng2.getCount();
070 u = d1 * c2; // = d1 * (c1+c2);
071 sum += u*u / (u + ALPHA); // = (c1 - u)*(c1 - u)/u;
072 u = d2 * c2; // = d2 * (c1+c2);
073 sum += (c2 - u)*(c2 - u)/ (u + ALPHA);
074 }
075 }
076
077 return sum/norm*(1.0+ALPHA); // "Wrong" but nicer normalization
078 }
079 }