完整的中英文词频统计
发布时间:2020-12-14 04:18:33 所属栏目:大数据 来源:网络整理
导读:# 读取字符串str f = open('zz.txt', 'r', encoding='utf-8') strbig = f.read() f.close() sep = '''.,;:?!-_''' for ch in sep: strbig = strbig.replace(ch, ' ') strbig = strbig.lower() print(strbig) strlist = strbig.split(…
# Word-frequency statistics for an English text file (zz.txt) and a Chinese
# text file (a.txt).
#
# English part: punctuation is replaced by spaces, the text is lower-cased and
# split on whitespace, a few stop words are dropped, and the 20 most frequent
# words are printed.
# Chinese part: the text is segmented with jieba, single-character tokens are
# dropped, and the 15 most frequent words are printed.

from collections import Counter

# Punctuation replaced by spaces before the English text is split into words.
ENGLISH_SEPARATORS = ".,;:?!-_"

# Words left out of the English frequency table.
ENGLISH_STOPWORDS = frozenset({"a", "the", "and", "i", "you"})

# Punctuation replaced by spaces in the Chinese text.  The first literal holds
# the characters as they appear in the published script; the second adds the
# full-width forms, which the web page appears to have converted to ASCII.
# NOTE(review): the full-width additions are an assumption about the original.
CHINESE_PUNCTUATION = ",。;:-!?、“”" "，；：！？"


def read_text(path):
    """Return the full contents of the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


def replace_with_spaces(text, chars):
    """Return *text* with every character listed in *chars* replaced by a space."""
    return text.translate(str.maketrans({ch: " " for ch in chars}))


def normalize_english(text):
    """Return *text* lower-cased, with ENGLISH_SEPARATORS turned into spaces."""
    return replace_with_spaces(text, ENGLISH_SEPARATORS).lower()


def count_english_words(words, exclude=ENGLISH_STOPWORDS):
    """Return a Counter of *words*, skipping any word found in *exclude*."""
    return Counter(word for word in words if word not in exclude)


def count_chinese_words(tokens):
    """Return a Counter of *tokens*, skipping tokens that are one character long."""
    return Counter(token for token in tokens if len(token) != 1)


def top_items(counts, limit):
    """Return at most *limit* ``(word, count)`` pairs, most frequent first.

    Unlike indexing a fixed range, this does not fail when fewer than *limit*
    distinct words are available.
    """
    return counts.most_common()[:limit]


def analyze_english(path="zz.txt", top=20):
    """Print word statistics for the English text at *path*.

    Returns the full list of ``(word, count)`` pairs sorted by descending count.
    """
    text = normalize_english(read_text(path))
    print(text)

    words = text.split()
    print(len(words), words)

    unique_words = set(words) - ENGLISH_STOPWORDS
    print(len(unique_words), unique_words)

    counts = count_english_words(words)
    print(len(counts), dict(counts))
    print(list(counts.items()))

    ranked = counts.most_common()
    print(ranked)
    for entry in top_items(counts, top):
        print(entry)
    return ranked


def analyze_chinese(path="a.txt", top=15):
    """Print word statistics for the Chinese text at *path*.

    Returns the full list of ``(word, count)`` pairs sorted by descending count.
    """
    # Third-party dependency; imported here so the pure helpers above remain
    # usable when jieba is not installed.
    import jieba

    text = read_text(path)
    print(text)

    # Show the output of the three jieba segmentation calls used by the script.
    print(list(jieba.cut(text)))
    print(list(jieba.cut(text, cut_all=True)))
    print(list(jieba.cut_for_search(text)))

    # Turn punctuation into spaces before counting.
    text = replace_with_spaces(text, CHINESE_PUNCTUATION)
    print(text)
    chunks = text.split()
    print(len(chunks), chunks)

    # Segment into words and count them; one-character tokens are dropped.
    tokens = jieba.lcut(text)
    counts = count_chinese_words(tokens)
    print(tokens)

    # Sort by frequency and print the top entries.
    ranked = counts.most_common()
    print(ranked)
    for entry in top_items(counts, top):
        print(entry)
    return ranked


def main():
    """Run the English analysis on zz.txt, then the Chinese analysis on a.txt."""
    analyze_english("zz.txt", top=20)
    analyze_chinese("a.txt", top=15)


if __name__ == "__main__":
    main()

# Page footer retained from the original post: (Editor: Li Datong)
# [Disclaimer] The content of this site comes from the Internet; the views
# expressed are solely those of the authors and do not represent the position
# of this site. If your rights have been inadvertently infringed, please
# contact the webmaster promptly to have the content removed.