/*
NGramJ - n-gram based text classification
Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program (lesser.txt); if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package de.spieleck.app.cngram;

import java.util.Iterator;

/**
 * Cosine metric: one minus the cosine of the angle between the n-gram
 * count vectors of two profiles.
 * For non-negative counts the result is nominally valued between zero
 * (same direction) and one (no n-gram in common).
 *
 * @author frank nestel
 * @author $Author: nestefan $
 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
 */
public class CosMetric
  implements NGramMetric
{
  /**
   * Computes the cosine distance between two n-gram profiles.
   *
   * @param p1 first profile
   * @param p2 second profile
   * @return <code>1 - dot(p1,p2) / sqrt(|p1|^2 * |p2|^2)</code>, where the
   *         vectors hold the per-n-gram counts; <code>1.0</code> when the
   *         denominator is zero (e.g. a profile without any n-grams), so
   *         that the result is never <code>NaN</code> from a 0/0 division
   */
  public double diff(NGramProfile p1, NGramProfile p2)
  {
    double sum = 0.0;   // dot product over n-grams found in both profiles
    double s1 = 0.0;    // squared norm of p1's counts
    double s2 = 0.0;    // squared norm of p2's counts

    // Treat all NGrams contained in p1; shared ones also feed the
    // dot product and p2's squared norm.
    Iterator i = p1.getSorted();
    while (i.hasNext())
    {
      NGram ng1 = (NGram) i.next();
      NGram ng2 = p2.get(ng1);
      double c1 = ng1.getCount();
      s1 += c1 * c1;
      if ( ng2 != null )
      {
        double c2 = ng2.getCount();
        sum += c1 * c2;
        s2 += c2 * c2;
      }
    }

    // Treat NGrams contained ONLY in p2: they were not seen above and
    // still have to contribute to p2's squared norm.
    i = p2.getSorted();
    while (i.hasNext())
    {
      NGram ng2 = (NGram) i.next();
      if ( p1.get(ng2) == null )
      {
        double c2 = ng2.getCount();
        s2 += c2 * c2;
      }
    }

    double norm = Math.sqrt(s1 * s2);
    if ( norm == 0.0 )
    {
      // At least one vector has zero length: the cosine is undefined and
      // the plain formula would yield NaN. Report maximal distance instead.
      return 1.0;
    }
    return 1.0 - sum / norm;
  }
}