001 /*
002 NGramJ - n-gram based text classification
003 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
004
005 This program is free software; you can redistribute it and/or modify
006 it under the terms of the GNU Lesser General Public License as published
007 by the Free Software Foundation; either version 2.1 of the License, or
008 (at your option) any later version.
009
010 This program is distributed in the hope that it will be useful,
011 but WITHOUT ANY WARRANTY; without even the implied warranty of
012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 GNU General Public License for more details.
014
015 You should have received a copy of the GNU Lesser General Public License
016 along with this program (lesser.txt); if not, write to the Free Software
017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018 */
019 package de.spieleck.app.cngram;
020
021 import java.util.Iterator;
022
023 /**
024 * Chi^2 Metric without rectification of classes
025 * @author frank nestel
026 * @author $Author: nestefan $
027 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
028 */
029 public class C2Metric
030 implements NGramMetric
031 {
032 public double diff(NGramProfile p1, NGramProfile p2)
033 {
034 double sum = 0.0;
035 int norm1 = p1.getNormalization();
036 int norm2 = p2.getNormalization();
037
038 // Treat all NGrams contained in p1;
039 Iterator i = p1.getSorted();
040 while (i.hasNext())
041 {
042 NGram ng1 = (NGram) i.next();
043 NGram ng2 = p2.get(ng1);
044 int c1 = ng1.getCount();
045 if ( ng2 != null )
046 {
047 int c2 = ng2.getCount();
048 sum += ((double) c1)/norm1 * c1 /(c1+c2);
049 sum += ((double) c2)/norm2 * c2 /(c1+c2);
050 } else {
051 sum += ((double) c1)/norm1;
052 }
053 }
054
055 // Treat NGrams contained ONLY in p2
056 i = p2.getSorted();
057 while (i.hasNext())
058 {
059 NGram ng2 = (NGram) i.next();
060 if ( p1.get(ng2) == null )
061 {
062 int c2 = ng2.getCount();
063 sum += ((double) c2)/norm2;
064 }
065 }
066 // Would be Chi^2 return (norm1+norm2)*(sum-1.0);
067 return sum-1.0;
068 }
069 }