001 /* 002 NGramJ - n-gram based text classification 003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de) 004 005 This program is free software; you can redistribute it and/or modify 006 it under the terms of the GNU Lesser General Public License as published 007 by the Free Software Foundation; either version 2.1 of the License, or 008 (at your option) any later version. 009 010 This program is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 GNU General Public License for more details. 014 015 You should have received a copy of the GNU Lesser General Public License 016 along with this program (lesser.txt); if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 018 */ 019 package de.spieleck.app.cngram; 020 021 import java.util.Iterator; 022 023 /** 024 * ALPHA modified Chi^2 Metric without rectification of classes 025 * and with centrification between the two profiles. 026 * @author frank nestel 027 * @author $Author: nestefan $ 028 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $ 029 */ 030 public class C2xMetric 031 implements NGramMetric 032 { 033 public final static double ALPHA = 1.0; // 0.0 is exact 034 035 public double diff(NGramProfile p1, NGramProfile p2) 036 { 037 double sum = 0.0; 038 int norm1 = p1.getNormalization(); 039 int norm2 = p2.getNormalization(); 040 int norm = norm1 + norm2; 041 double d1 = ((double)norm1) / norm; 042 double d2 = ((double)norm2) / norm; 043 double u; 044 045 // Treat all NGrams contained in p1; 046 Iterator i = p1.getSorted(); 047 while (i.hasNext()) 048 { 049 NGram ng1 = (NGram) i.next(); 050 NGram ng2 = p2.get(ng1); 051 int c1 = ng1.getCount(); 052 int c2 = 0; 053 if ( ng2 != null ) 054 c2 = ng2.getCount(); 055 u = d1 * (c1+c2); 056 sum += (c1 - u)*(c1 - u)/(u + ALPHA); 057 u = d2 * (c1+c2); 058 sum += (c2 - u)*(c2 - u)/(u + ALPHA); 059 } 060 061 // Treat NGrams contained ONLY in p2 062 i = p2.getSorted(); 063 while (i.hasNext()) 064 { 065 NGram ng2 = (NGram) i.next(); 066 if ( p1.get(ng2) == null ) 067 { 068 // int c1 = 0; 069 int c2 = ng2.getCount(); 070 u = d1 * c2; // = d1 * (c1+c2); 071 sum += u*u / (u + ALPHA); // = (c1 - u)*(c1 - u)/u; 072 u = d2 * c2; // = d2 * (c1+c2); 073 sum += (c2 - u)*(c2 - u)/ (u + ALPHA); 074 } 075 } 076 077 return sum/norm*(1.0+ALPHA); // "Wrong" but nicer normalization 078 } 079 }