R语言做文本挖掘:基于网购评论提炼电热水器的差异化卖点
发布时间:2020-12-14 02:32:47 所属栏目:大数据 来源:网络整理
导读:这是我参加一个数据挖掘竞赛的作品,这是代码部分,论文正文并没有贴出来。 水平一般般,很多还可以改进的地方。 不过辛辛苦苦做出来的东西,多少还是有些地方可以借鉴的,贴出来大家共同探讨下。 #读入数据guomei = read.csv("C:UsershormyDesktop
# This is my entry for a data-mining competition. Only the code is posted
# here; the text of the paper is not included.
#
# Pipeline: read scraped electric-water-heater reviews from five retailers,
# split them into Haier vs. other brands, segment the Chinese text with
# part-of-speech tags, keep frequent noun-like words as candidate product
# features, then score nine hand-picked feature groups by counting positive
# and negative opinion words in the reviews that mention them.

library(Rwordseg)   # word-segmentation package based on CAS ICTCLAS
library(wordcloud)

# ---- Read data ----

# NOTE(review): the published listing lost its path separators
# ("C:UsershormyDesktop..."); this directory is the presumed original
# location -- confirm or adjust before running.
data.dir <- "C:/Users/hormy/Desktop"

# Read one retailer's review export from `data.dir`.
#
# Text columns are always kept as character. The original listing passed
# stringsAsFactors = FALSE for the first file only, so on R < 4.0 the other
# review columns were factors and the later c() produced integer codes
# instead of review text.
readReviews <- function(file) {
  read.csv(file.path(data.dir, file), header = TRUE, stringsAsFactors = FALSE)
}

guomei <- readReviews("电热评论原始数据汇总-国美.csv")
jingdong <- readReviews("电热评论原始数据汇总-京东.csv")
suning <- readReviews("电热评论原始数据汇总-苏宁.csv")
tianmao <- readReviews("电热评论原始数据汇总-天猫淘宝.csv")
yixun <- readReviews("电热评论原始数据汇总-易迅.csv")

# ---- Drop unneeded columns and unify the column names ----

review.cols <- c("品牌", "评论")

guomei <- guomei[, 4:5]
names(guomei) <- review.cols
jingdong <- jingdong[, 5:6]
names(jingdong) <- review.cols
suning <- suning[, c(5, 7)]
names(suning) <- review.cols
tianmao <- tianmao[, 5:6]
names(tianmao) <- review.cols
# NOTE(review): the published listing is garbled here ("yixun[,7)]");
# c(5, 7) is assumed by analogy with suning -- confirm against the CSV.
yixun <- yixun[, c(5, 7)]
names(yixun) <- review.cols

# ---- Remove default positive reviews and missing reviews ----

guomei <- guomei[guomei$评论 != "未及时做出评论,默认好评!", ]
tianmao <- tianmao[complete.cases(tianmao$评论), ]

# ---- Split into Haier reviews and all other brands ----

# NOTE(review): every guomei review is placed in the Haier set without a
# brand filter, and guomei contributes nothing to `others` -- presumably that
# export contains Haier products only; confirm.
haier <- c(
  guomei$评论,
  jingdong[jingdong$品牌 == "海尔", ]$评论,
  suning[suning$品牌 == "海尔", ]$评论,
  tianmao[tianmao$品牌 == "海尔", ]$评论,
  yixun[yixun$品牌 == "海尔", ]$评论
)

others <- c(
  jingdong[jingdong$品牌 != "海尔", ]$评论,
  suning[suning$品牌 != "海尔", ]$评论,
  tianmao[tianmao$品牌 != "海尔", ]$评论,
  yixun[yixun$品牌 != "海尔", ]$评论
)

# Strip digits, ASCII letters and underscores.
haier <- gsub("[a-z0-9A-Z_]", "", haier)
others <- gsub("[a-z0-9A-Z_]", "", others)

# ---- Word segmentation ----

# The author's note says a custom dictionary was installed before this step;
# that step is not part of this listing.
# nature = TRUE is requested so that each token's name carries its
# part-of-speech tag, which the later tag-based filters rely on.
haier <- segmentCN(haier, nature = TRUE)
others <- segmentCN(others, nature = TRUE)

rm(guomei, tianmao, jingdong, suning, yixun)

# ---- Stop-word removal ----

# Custom stop-word list; the file must be UTF-8 encoded.
stopwordsCN <- as.character(readLines("stopwordsCN.txt"))
stopwordsCN <- enc2utf8(stopwordsCN)
# Keeps only entries carrying a declared encoding mark. Pure-ASCII strings
# are never marked, so ASCII-only entries are dropped here as well.
stopwordsCN <- stopwordsCN[Encoding(stopwordsCN) != "unknown"]

# Remove stop words from one segmented review.
#
# x:         character vector of tokens (names, if any, are kept).
# stopwords: character vector of words to drop.
# Returns the elements of `x` that do not occur in `stopwords`, in their
# original order.
removeStopWords <- function(x, stopwords) {
  x[!(x %in% stopwords)]
}

haier <- lapply(haier, removeStopWords, stopwordsCN)
others <- lapply(others, removeStopWords, stopwordsCN)

# ---- Candidate product features: frequent noun-like words ----

# Count candidate feature words in a tagged token vector.
#
# tagged:    character vector of tokens whose names are part-of-speech tags.
# min.count: minimum number of occurrences for a word to be kept.
# Returns a sorted frequency table of tokens that are longer than one
# character and whose tag contains "n" (this covers both the noun and the
# verbal-noun tags the original selected). The original concatenated the
# grep("n") and grep("vn") matches, which counted every "vn" token twice;
# a single pass avoids that.
candidateFeatures <- function(tagged, min.count = 100) {
  words <- tagged[grepl("n", names(tagged), fixed = TRUE)]
  words <- words[nchar(words) > 1]
  counts <- sort(table(words))
  counts[as.vector(counts) >= min.count]
}

haier.vc <- unlist(haier)
others.vc <- unlist(others)

# Initial feature sets: words occurring at least 100 times.
haier.character <- candidateFeatures(haier.vc, min.count = 100)
others.character <- candidateFeatures(others.vc, min.count = 100)

# ---- Word clouds of the candidate features ----

# dev.new() opens a fresh plotting device on any platform; the original
# windows() call exists only on Windows.
dev.new()
wordcloud(
  names(haier.character),
  as.vector(haier.character),
  colors = brewer.pal(8, "Dark2")
)

# Only draw words above a frequency threshold.
# NOTE(review): the 3535/2730 factor presumably rescales the threshold of 500
# by the relative sizes of the two review sets -- confirm.
dev.new()
wordcloud(
  names(others.character),
  as.vector(others.character),
  colors = brewer.pal(8, "Dark2"),
  min.freq = 500 * 3535 / 2730
)

# ---- Hand-curated feature groups (non-feature nouns removed by hand) ----

waiguan <- c("造型", "外表", "外包装", "样式", "外形", "款式", "样子", "设计",
             "外观", "外壳", "包装")
peijian <- c("电源线", "接口", "旋钮", "管线", "阀门", "水龙头", "面板",
             "管道", "架子", "遥控器", "接头", "螺丝", "软管", "电源",
             "放电器", "插头", "双管", "喷头", "混水阀", "管子", "花洒",
             "水管", "插座", "配件", "泄压阀", "螺栓", "配置", "装置",
             "接地线", "安全阀", "内胆", "出水管", "波纹管")
tiyan <- c("水流", "声音", "噪音", "制热", "速热", "体积", "水压", "恒温",
           "出水量", "容量", "水量", "水温")
gongxiao <- c("能效", "能耗", "效率", "性能", "功率", "效果", "功能",
              "耗电量")
jiage <- c("服务费", "价位", "价钱", "价格", "费用", "材料费", "收费", "运费",
           "路费")
anquan <- c("安全")
wuliu <- c("快递", "速度", "物流")
fuwu <- c("服务", "态度", "售后")
zhiliang <- c("质量")

# ---- Sentiment analysis by simple scoring ----

# Positive / negative word lists (the author's note calls them the HowNet
# polarity lexicon; the folder name says NTUSD). The txt files must be
# ANSI-encoded.
# NOTE(review): path separators were lost in the published listing
# ("E:中文情感极性词典 NTUSDpositive.txt"); the layout below is presumed.
sentiment.dir <- file.path("E:", "中文情感极性词典 NTUSD")
pwords <- readLines(file.path(sentiment.dir, "positive.txt"))
nwords <- readLines(file.path(sentiment.dir, "negative.txt"))

# Return the tokens of review `x` if it mentions any of `keywords`,
# otherwise NULL.
#
# The original appended the whole review once per matching keyword, so a
# review mentioning several keywords had its opinion words counted several
# times; the review is now returned exactly once.
getwaiguan <- function(x, keywords = waiguan) {
  tokens <- unlist(x)
  if (any(keywords %in% tokens)) tokens else NULL
}

# Extract the opinion words of a tagged review: tokens whose tag contains "a"
# (this covers both the adjective and the adverbial-adjective tags the
# original selected). The original concatenated the grep("a") and grep("ad")
# matches, which scored every "ad" token twice.
getaad <- function(x) {
  x[grepl("a", names(x), fixed = TRUE)]
}

# Average sentiment score of one feature group over a set of reviews.
#
# reviews:  list of tagged token vectors (one per review).
# keywords: character vector of words that identify the feature.
# pwords:   positive opinion words (+1 each).
# nwords:   negative opinion words (-1 each, unless also in `pwords`).
# Returns the total opinion-word score divided by the number of distinct
# reviews that mention the feature, or NA_real_ when no review mentions it.
# The original derived the denominator as length(unique(...)) - 1, which is
# only right when at least one review does NOT mention the feature.
scoreFeature <- function(reviews, keywords, pwords, nwords) {
  matched <- lapply(reviews, getwaiguan, keywords = keywords)
  matched <- unique(Filter(Negate(is.null), matched))
  if (length(matched) == 0) {
    return(NA_real_)
  }
  opinions <- as.vector(unlist(lapply(matched, getaad)))
  positive <- opinions %in% pwords
  negative <- (!positive) & (opinions %in% nwords)
  (sum(positive) - sum(negative)) / length(matched)
}

features <- list(waiguan, peijian, tiyan, gongxiao, jiage, anquan, wuliu,
                 fuwu, zhiliang)
names(features) <- c("外观", "配件", "体验", "功效", "价格", "安全", "物流",
                     "服务", "质量")

haier.scores <- vapply(
  features,
  function(keywords) scoreFeature(haier, keywords, pwords, nwords),
  numeric(1)
)
others.scores <- vapply(
  features,
  function(keywords) scoreFeature(others, keywords, pwords, nwords),
  numeric(1)
)

# Averages reported in the original article (Haier / other brands), obtained
# with the original procedure, for reference:
#   waiguan  0.7581172 / 0.7616708    peijian  0.4876891 / 0.3960827
#   tiyan    0.5566502 / 0.5568163    gongxiao 0.523503  / 0.4762309
#   jiage    0.6497261 / 0.5644507    anquan   1.60709   / 1.605308
#   wuliu    0.3783224 / 0.371418     fuwu     0.4670337 / 0.4353763
#   zhiliang 0.3855133 / 0.3750291

# Per-feature score variables, kept under their original names.
haier.waiguan.score <- haier.scores[["外观"]]
others.waiguan.score <- others.scores[["外观"]]
haier.peijian.score <- haier.scores[["配件"]]
others.peijian.score <- others.scores[["配件"]]
haier.tiyan.score <- haier.scores[["体验"]]
others.tiyan.score <- others.scores[["体验"]]
haier.gongxiao.score <- haier.scores[["功效"]]
others.gongxiao.score <- others.scores[["功效"]]
haier.jiage.score <- haier.scores[["价格"]]
others.jiage.score <- others.scores[["价格"]]
haier.anquan.score <- haier.scores[["安全"]]
others.anquan.score <- others.scores[["安全"]]
haier.wuliu.score <- haier.scores[["物流"]]
others.wuliu.score <- others.scores[["物流"]]
haier.fuwu.score <- haier.scores[["服务"]]
others.fuwu.score <- others.scores[["服务"]]
haier.zhiliang.score <- haier.scores[["质量"]]
others.zhiliang.score <- others.scores[["质量"]]

# ---- Show the result as a grouped bar chart ----

# One column per feature, row 1 = Haier, row 2 = other brands. The original
# hard-coded nine rounded columns but supplied only seven names, and its
# hard-coded 0.3755 disagreed with the reported 0.3855133; the table is now
# built from the computed scores so the labels always line up.
data <- as.data.frame(rbind(haier = haier.scores, others = others.scores))

barplot(
  as.matrix(data),
  col = c("green", "red"),
  beside = TRUE,
  xlab = "产品特征",
  ylab = "分数",
  main = "海尔(绿色)与其他品牌(红色)的特征得分对比"
)

# With the results visualized, readers can add the detailed analysis
# themselves. (Editor: 李大同)
# [Disclaimer] The content of this site comes from the Internet; the views
# expressed are the author's own and do not represent the position of this
# site. If your rights have been unintentionally infringed, please contact
# the webmaster to have the content removed.