余弦相似性算法
发布时间:2020-12-14 01:52:38 所属栏目:大数据 来源:网络整理
导读:余弦相似性算法的具体介绍参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html 下面是我根据上边的介绍进行的java语言的实现: import java.io.IOException; import java.io.StringReader; import java.util.Collections; import java.util.
余弦相似性算法的具体介绍请参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html

下面是我根据上述介绍编写的 Java 语言实现:

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.wjb.util.common.WjbTuple2;
/**
 * Static helpers for estimating how similar two texts are by comparing their
 * term-frequency vectors with the cosine measure.
 *
 * <p>Typical flow: {@link #makeTermFrequency(String)} for each text, optionally
 * {@link #filterByKeyLength(Map, int)} or {@link #sort(Map)} to trim the term set,
 * then {@link #makeVector(Map, Map)} followed by {@link #cosine(WjbTuple2)}.
 */
public class CosineTextSimilarity {

    /**
     * Tokenizes {@code text} with an {@link IKAnalyzer} and counts how often each term occurs.
     *
     * @param text the text to tokenize
     * @return a new map from term to its number of occurrences in {@code text}
     * @throws IOException if the token stream fails while reading or tokenizing
     */
    public static Map<String,Integer> makeTermFrequency(String text) throws IOException
    {
        Map<String,Integer> tf = new HashMap<String,Integer>();
        // NOTE(review): the `true` flag is presumably IK's "smart" segmentation switch —
        // confirm against the IK Analyzer version in use.
        Analyzer analyzer = new IKAnalyzer(true);
        StringReader reader = new StringReader(text);
        try {
            TokenStream ts = analyzer.tokenStream("", reader);
            try {
                // addAttribute returns the existing attribute instance or registers one,
                // so it cannot fail the way getAttribute does when the attribute is absent.
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                // The TokenStream consumer workflow requires reset() before the first
                // incrementToken() and end()/close() afterwards.
                ts.reset();
                while (ts.incrementToken()) {
                    String t = term.toString();
                    Integer count = tf.get(t);
                    tf.put(t, count == null ? 1 : count + 1);
                }
                ts.end();
            } finally {
                ts.close();
            }
        } finally {
            // Release resources even when tokenization throws.
            reader.close();
            analyzer.close();
        }
        return tf;
    }

    /**
     * Filters a map by key length: an entry is kept only when its key, after trimming
     * surrounding whitespace, is at least {@code length} characters long.
     * Entries with a {@code null} key have no length and are dropped.
     *
     * @param map    the map to filter; it is not modified
     * @param length the minimum trimmed key length an entry needs to be kept
     * @return a new map containing only the entries that passed the filter
     * @throws IOException never thrown by this implementation; kept in the signature
     *                     for compatibility with existing callers
     */
    public static Map<String,Integer> filterByKeyLength(Map<String,Integer> map,int length) throws IOException
    {
        Map<String,Integer> kept = new HashMap<String,Integer>();
        for (Entry<String,Integer> entry : map.entrySet()) {
            String key = entry.getKey();
            if (key != null && key.trim().length() >= length) {
                kept.put(key, entry.getValue());
            }
        }
        return kept;
    }

    /**
     * Builds two aligned count vectors over the union of the keys of both maps.
     * Position {@code i} of both arrays refers to the same key; a key missing from
     * one map contributes {@code 0} to that map's vector.
     *
     * @param first  term counts of the first text
     * @param second term counts of the second text
     * @return a tuple whose first element is the vector for {@code first} and whose
     *         second element is the vector for {@code second}
     */
    public static WjbTuple2<int[],int[]> makeVector(Map<String,Integer> first,Map<String,Integer> second){
        Set<String> keys = new HashSet<String>();
        keys.addAll(first.keySet());
        keys.addAll(second.keySet());
        int[] vector1 = new int[keys.size()];
        int[] vector2 = new int[keys.size()];
        int i = 0;
        for (String key : keys) {
            Integer count1 = first.get(key);
            if (count1 != null) {
                vector1[i] = count1;
            }
            Integer count2 = second.get(key);
            if (count2 != null) {
                vector2[i] = count2;
            }
            i++;
        }
        return new WjbTuple2<int[],int[]>(vector1,vector2);
    }

    /**
     * Computes the cosine of the angle between the two vectors held by {@code tuple}:
     * the dot product divided by the square root of the product of the squared norms.
     *
     * @param tuple the two vectors to compare; iteration runs over the length of the first
     * @return the cosine similarity, or {@code 0} when either vector has zero magnitude
     *         (for example when one of the texts produced no terms), instead of NaN
     */
    public static double cosine(WjbTuple2<int[],int[]> tuple)
    {
        int[] vector1 = tuple._1;
        int[] vector2 = tuple._2;
        double dot = 0;
        double squaredNorm1 = 0;
        double squaredNorm2 = 0;
        for (int i = 0; i < vector1.length; i++) {
            // Widen to double before multiplying so large counts cannot overflow int.
            double a = vector1[i];
            double b = vector2[i];
            dot += a * b;
            squaredNorm1 += a * a;
            squaredNorm2 += b * b;
        }
        if (squaredNorm1 == 0 || squaredNorm2 == 0) {
            // A zero vector has no direction; avoid returning 0/0 = NaN.
            return 0;
        }
        return dot / Math.sqrt(squaredNorm1 * squaredNorm2);
    }

    /**
     * Returns the entries of {@code unsortMap} as a list sorted in descending order.
     * When both compared values are {@link Integer}s they are ordered by value
     * (largest first), with ties broken by the key's string form in descending order.
     * Otherwise entries are ordered by the string form of their values, descending.
     *
     * @param unsortMap the map whose entries should be sorted; it is not modified
     * @return a new list holding the map's entries in sorted order
     */
    // Raw types are part of the published signature, so the warnings are suppressed
    // here rather than changing what callers see.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static List<Entry> sort(Map unsortMap) {
        // Copy the entries into a list so they can be sorted.
        List<Map.Entry> list = new LinkedList<Map.Entry>(unsortMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry>() {
            public int compare(Map.Entry o1, Map.Entry o2) {
                Object v1 = o1.getValue();
                Object v2 = o2.getValue();
                if (v1 instanceof Integer && v2 instanceof Integer) {
                    // compareTo instead of subtraction: subtraction can overflow.
                    int byCount = ((Integer) v2).compareTo((Integer) v1);
                    if (byCount != 0) {
                        return byCount;
                    }
                    return o2.getKey().toString().compareTo(o1.getKey().toString());
                }
                return v2.toString().compareTo(v1.toString());
            }
        });
        return list;
    }
}
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.wjb.util.common.WjbFileUtil;
import com.wjb.util.common.WjbTuple2;
/**
 * Command-line demo: reads two text files, keeps the highest-ranked terms of each,
 * and prints their cosine similarity together with the elapsed time.
 */
public class Main {

    /** Maximum number of top-ranked terms kept per text. */
    private static final int TOP_TERMS = 20;

    /**
     * Loads {@code d:/1.txt} and {@code d:/2.txt} (the latter read with
     * {@code WjbFileUtil.GBK}), prints the second text and both sorted term lists,
     * then prints the elapsed milliseconds and the cosine similarity computed from
     * the top terms of each text.
     *
     * @param args unused
     * @throws Exception if reading a file or tokenizing a text fails
     */
    public static void main(String[] args) throws Exception {
        String firstText = WjbFileUtil.fromFile("d:/1.txt");
        String secondText = WjbFileUtil.fromFile("d:/2.txt", WjbFileUtil.GBK);
        System.out.println(secondText);

        long startMillis = System.currentTimeMillis();
        Map<String,Integer> firstFreq = CosineTextSimilarity.makeTermFrequency(firstText);
        Map<String,Integer> secondFreq = CosineTextSimilarity.makeTermFrequency(secondText);
        // Currently disabled: CosineTextSimilarity.filterByKeyLength(..., 2) could be
        // applied to both maps here to drop terms shorter than two characters.

        List<Entry> firstTop = printAndTruncate(CosineTextSimilarity.sort(firstFreq));
        List<Entry> secondTop = printAndTruncate(CosineTextSimilarity.sort(secondFreq));

        firstFreq = list2Map(firstTop);
        secondFreq = list2Map(secondTop);

        WjbTuple2<int[],int[]> vectors = CosineTextSimilarity.makeVector(firstFreq, secondFreq);
        double similarity = CosineTextSimilarity.cosine(vectors);
        long endMillis = System.currentTimeMillis();

        System.out.println(endMillis - startMillis);
        System.out.println(similarity);
    }

    /**
     * Prints the full sorted entry list, then returns a view of at most its first
     * {@link #TOP_TERMS} entries.
     */
    private static List<Entry> printAndTruncate(List<Entry> ranked) {
        System.out.println(ranked);
        return ranked.subList(0, Math.min(ranked.size(), TOP_TERMS));
    }

    /**
     * Converts a list of entries back into a term-to-count map. Each key is
     * converted with {@code toString()} and each value is cast to {@link Integer}.
     *
     * @param list the entries to copy
     * @return a new map holding one mapping per entry
     */
    public static Map<String,Integer> list2Map(List<Entry> list)
    {
        Map<String,Integer> result = new HashMap<String,Integer>();
        for (Entry entry : list) {
            String term = entry.getKey().toString();
            Integer count = (Integer) entry.getValue();
            result.put(term, count);
        }
        return result;
    }
}
(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯了您的权利,请及时联系站长删除相关内容!