简单的文本相似性测量(Python实现)
发布时间:2020-12-14 02:38:12 所属栏目:大数据 来源:网络整理
导读:一、数据集与测试集 数据集: 测试集: imaging databases 二、Python代码 # -*- coding: utf-8 -*- """ Created on Mon Apr 13 09:49:25 2015 @author: Administrator """ import numpy import sys import scipy as sp import os import nltk.stem as stm english_st…
# 1. Data set and test set
#    Data set:
#    Test set: imaging databases
# 2. Python code
# -*- coding: utf-8 -*-
"""
Find the stored post that is most similar to a short query text.

Every file in ``DATA_DIR`` is read as one post.  Posts are turned into
stemmed, stop-word-filtered term-count vectors and compared with the query
by the Euclidean distance between their length-normalised vectors.

Created on Mon Apr 13 09:49:25 2015
@author: Administrator
"""
import os
import sys

import numpy
import scipy as sp
import scipy.linalg  # load the submodule explicitly so ``sp.linalg`` resolves
import nltk.stem as stm
from sklearn.feature_extraction.text import CountVectorizer

# Directory that holds one plain-text post per file.
# NOTE(review): kept byte-for-byte from the original listing.  It may have
# been meant as r"E:\data" (a backslash could have been lost when the article
# was scraped) -- confirm before relying on it.
DATA_DIR = "E:data"

# Text whose nearest neighbour among the posts is looked up.
NEW_POST = "imaging databases"

english_stemmer = stm.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that stems every token with the English Snowball stemmer."""

    def build_analyzer(self):
        """Return an analyzer that stems each token of the parent analyzer."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


def dist_raw(v1, v2):
    """Return the Euclidean distance between two sparse row vectors.

    Both arguments must support subtraction and ``.toarray()``, as the rows
    produced by the vectorizer do.
    """
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())


def _unit_length(vec):
    """Return ``vec`` scaled to Euclidean length 1.

    An all-zero vector is returned unchanged: dividing it by its zero norm
    would fill it with NaN and poison every later comparison.
    """
    length = sp.linalg.norm(vec.toarray())
    if length == 0:
        return vec
    return vec / length


def dist_norm(v1, v2):
    """Return the Euclidean distance between the length-normalised vectors.

    Normalising first makes the distance independent of how often a text
    repeats itself.  A vector without any non-zero entry is compared as the
    zero vector instead of producing NaN.
    """
    delta = _unit_length(v1) - _unit_length(v2)
    return sp.linalg.norm(delta.toarray())


def load_posts(data_dir):
    """Return the contents of every regular file in ``data_dir``.

    File names are sorted so that post indices are the same on every run
    (``os.listdir`` gives no ordering guarantee).  Sub-directories are
    skipped, and each file is closed as soon as it has been read.
    """
    posts = []
    for name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, name)
        if not os.path.isfile(path):
            continue
        with open(path) as handle:
            posts.append(handle.read())
    return posts


def find_best_post(posts, x_train, new_post, new_post_vec):
    """Print the distance of every post to the query and return the closest.

    Posts whose text equals ``new_post`` are skipped.

    Returns:
        ``(index, distance)`` of the nearest post, or ``(None, inf)`` when no
        post was compared.
    """
    # float("inf") replaces sys.maxint, which does not exist on Python 3.
    best_dist = float("inf")
    best_i = None
    for i, post in enumerate(posts):
        if post == new_post:
            continue
        post_vec = x_train.getrow(i)
        # Alternative metric without normalisation:
        # d = dist_raw(post_vec, new_post_vec)
        d = dist_norm(post_vec, new_post_vec)
        print("=== Post %i with dis=%.2f: %s" % (i, d, post))
        if d < best_dist:
            best_dist = d
            best_i = i
    return best_i, best_dist


def main():
    """Vectorise the posts in ``DATA_DIR`` and report the one closest to ``NEW_POST``."""
    posts = load_posts(DATA_DIR)
    if not posts:
        print("No posts found in %s" % DATA_DIR)
        return

    # Plain alternative without stemming:
    # vectorizer = CountVectorizer(min_df=1)
    vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
    x_train = vectorizer.fit_transform(posts)
    # Debug aid:
    # num_samples, num_features = x_train.shape
    # print("#samples: %d,#features: %d" % (num_samples, num_features))

    new_post_vec = vectorizer.transform([NEW_POST])
    best_i, best_dist = find_best_post(posts, x_train, NEW_POST, new_post_vec)
    if best_i is None:
        print("No post could be compared with the query")
    else:
        print("Best post is %i with dist=%.2f" % (best_i, best_dist))


if __name__ == "__main__":
    main()

# 3. Results (output of the script; the listing continues after this block)
# === Post 0 with dis=1.41: This is a toy post about machine learning. Actually,it contains not much interesting stuff
# === Post 1 with dis=0.61: Imaging databases can get huge.
=== Post 2 with dis=0.63: Most imaging databases safe images permanently
=== Post 3 with dis=0.52: Imaging databases store images.
=== Post 4 with dis=0.52: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 3 with dist=0.52
(编辑:李大同)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!