加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

R语言做文本挖掘:基于网购评论提炼电热水器的差异化卖点

发布时间:2020-12-14 02:32:47 所属栏目:大数据 来源:网络整理
导读:这是我参加一个数据挖掘竞赛的作品,这是代码部分,论文正文并没有贴出来。 水平一般般,很多还可以改进的地方。 不过辛辛苦苦做出来的东西,多少还是有些地方可以借鉴的,贴出来大家共同探讨下。 #读入数据guomei = read.csv("C:UsershormyDesktop

这是我参加一个数据挖掘竞赛的作品,这是代码部分,论文正文并没有贴出来。
水平一般,还有很多可以改进的地方。
不过辛辛苦苦做出来的东西,多少还是有些地方可以借鉴的,贴出来大家共同探讨下。

# Load the raw review exports, one CSV file per e-commerce platform.
# NOTE(review): the directory separators of the original paths were lost
# ("C:UsershormyDesktop..."); the folder below is reconstructed from that
# text -- point data_dir at wherever the CSV files actually live.
data_dir <- "C:/Users/hormy/Desktop"

# Read the review export of one platform.
#   platform: platform name as it appears in the file name.
# Returns the file as a data frame.  Text columns are kept as character
# vectors for every platform, so the review texts can later be combined with
# c() without factor columns being turned into integer codes.
read_reviews <- function(platform) {
  read.csv(
    file.path(data_dir, paste0("电热评论原始数据汇总-", platform, ".csv")),
    header = TRUE,
    stringsAsFactors = FALSE
  )
}

guomei <- read_reviews("国美")
jingdong <- read_reviews("京东")
suning <- read_reviews("苏宁")
tianmao <- read_reviews("天猫淘宝")
yixun <- read_reviews("易迅")

# Keep only the brand column and the review-text column of every platform
# table and give them the same names, so the tables can be handled uniformly.
# The column positions differ between the platform exports.
guomei <- guomei[, 4:5]
names(guomei) <- c("品牌", "评论")

jingdong <- jingdong[, 5:6]
names(jingdong) <- c("品牌", "评论")

suning <- suning[, c(5, 7)]
names(suning) <- c("品牌", "评论")

tianmao <- tianmao[, 5:6]
names(tianmao) <- c("品牌", "评论")

# NOTE(review): the index on this line was garbled ("yixun[,7)]") and did not
# parse; c(5, 7) is reconstructed by analogy with the suning line -- confirm
# against the column layout of the yixun export.
yixun <- yixun[, c(5, 7)]
names(yixun) <- c("品牌", "评论")

# Discard reviews that carry no opinion of their own.
# Gome: rows whose review is the placeholder text that stands for
# "no review written in time, default positive rating".
# NOTE(review): the literal has to match the export character for character
# (including the punctuation width) -- confirm against the data.
guomei <- guomei[!(guomei$评论 == "未及时做出评论,默认好评!"), ]
# Tmall/Taobao: rows whose review text is NA.
tianmao <- tianmao[!is.na(tianmao$评论), ]

# Review texts of Haier products, collected from every platform.
# The row selections on this line were garbled (missing "]$评论," parts) and
# are restored here; as.character() keeps the result a character vector even
# when a review column was read as a factor.
# NOTE(review): all Gome reviews are taken without a brand filter --
# presumably that export only contains Haier products; confirm.
haier <- c(
  as.character(guomei$评论),
  as.character(jingdong[jingdong$品牌 == "海尔", ]$评论),
  as.character(suning[suning$品牌 == "海尔", ]$评论),
  as.character(tianmao[tianmao$品牌 == "海尔", ]$评论),
  as.character(yixun[yixun$品牌 == "海尔", ]$评论)
)

# Review texts of every brand other than Haier (Gome contributes none).
others <- c(
  as.character(jingdong[jingdong$品牌 != "海尔", ]$评论),
  as.character(suning[suning$品牌 != "海尔", ]$评论),
  as.character(tianmao[tianmao$品牌 != "海尔", ]$评论),
  as.character(yixun[yixun$品牌 != "海尔", ]$评论)
)

# Strip ASCII letters, digits and underscores from the review texts.
haier <- gsub("[a-z0-9A-Z_]", "", haier)
# The replacement argument was missing in this call, which made gsub() fail.
others <- gsub("[a-z0-9A-Z_]", "", others)


# Word segmentation package based on ICTCLAS (Chinese Academy of Sciences).
library(Rwordseg)

# Segment the Haier reviews.  nature = TRUE is requested because the later
# steps read each token's part-of-speech tag from the element names.
# NOTE(review): the original remark mentions adding a custom dictionary first;
# no such step is visible here, so it has to be installed beforehand.
haier <- segmentCN(haier, nature = TRUE)
# Segment the reviews of the other brands the same way.
others <- segmentCN(others, nature = TRUE)

# The per-platform tables are no longer needed.
rm(guomei, tianmao, jingdong, suning, yixun)

# Stop-word list.
# Reads the custom stop-word table stopwordsCN.txt (one entry per line; the
# file has to be UTF-8 encoded), converts the entries with enc2utf8() and
# keeps only those whose declared encoding is not "unknown" afterwards.
# NOTE(review): R never marks pure-ASCII strings with an encoding, so such
# entries are presumably dropped by the last step -- confirm this is intended.
stopwordsCN <- local({
  entries <- enc2utf8(readLines("stopwordsCN.txt"))
  has_declared_encoding <- Encoding(entries) != "unknown"
  entries[has_declared_encoding]
})

# Remove stop words from one segmented review.
#
# Args:
#   x:         character vector of tokens; the names of the tokens that
#              survive are kept.
#   stopwords: character vector of words to drop.
#
# Returns:
#   The tokens of `x` that are not in `stopwords`, in their original order;
#   a plain character(0) when `x` is empty or nothing is left.
removeStopWords <- function(x, stopwords) {
  if (length(x) == 0) {
    return(character(0))
  }

  # One vectorised membership test replaces the element-by-element loop that
  # grew the result with c().  %in% treats NA as an ordinary value, so an NA
  # entry in the stop-word list no longer makes every token look like a stop
  # word (with `==` the comparison became NA and the token was discarded).
  # Missing tokens are dropped.
  keep <- !is.na(x) & !(x %in% stopwords)
  if (!any(keep)) {
    return(character(0))
  }

  x[keep]
}

# Remove the stop words from every segmented review of both corpora.
haier <- lapply(haier, removeStopWords, stopwordsCN)
# The function argument was missing in this call, so lapply() received the
# stop-word vector where it expects the function to apply.
others <- lapply(others, removeStopWords, stopwordsCN)


# Candidate product features: nouns and verbal nouns that occur often enough.

# Select the noun-like tokens from a vector of tokens whose names are the
# part-of-speech tags.
#   tagged_words: named character vector (names = tags).
# Returns the tokens whose tag contains "n" and that are longer than one
# character, names kept.
selectNounCandidates <- function(tagged_words) {
  # A tag containing "n" already covers "vn".  Matching "n" and "vn"
  # separately and concatenating the two results selected every verbal noun
  # twice, which doubled its occurrence count below.
  is_noun_like <- grepl("n", names(tagged_words), fixed = TRUE)
  candidates <- tagged_words[is_noun_like]
  candidates[nchar(candidates) > 1]
}

# Count the candidates and keep those occurring at least `min_count` times.
# Returns a table sorted by increasing count.
frequentCandidates <- function(candidates, min_count = 100) {
  counts <- sort(table(candidates))
  counts[as.vector(counts) >= min_count]
}

# Haier: flatten the segmented reviews into one tagged token vector, then
# build the initial feature set.
haier.vc <- unlist(haier)
haier.character <- frequentCandidates(selectNounCandidates(haier.vc))

# Other brands: same procedure.
others.vc <- unlist(others)
others.character <- frequentCandidates(selectNounCandidates(others.vc))


# Visualise the candidate feature words of each corpus as a word cloud whose
# word sizes follow the occurrence counts.
library(wordcloud)

# dev.new() opens a fresh plotting device on any platform; windows() exists
# only on Windows.
dev.new()
# NOTE(review): the original remark on this call says only words occurring
# more than 500 times are drawn, but no min.freq is passed, so wordcloud()'s
# default threshold applies -- confirm which one was intended.
wordcloud(
  names(haier.character),
  as.vector(haier.character),
  colors = brewer.pal(8, "Dark2")
)

dev.new()
# This call was garbled (unbalanced parentheses, colour palette call cut off)
# and is restored to mirror the Haier call.  The 500-occurrence threshold is
# scaled by 3535 / 2730.
# NOTE(review): presumably the size ratio of the two corpora -- confirm.
wordcloud(
  names(others.character),
  as.vector(others.character),
  colors = brewer.pal(8, "Dark2"),
  min.freq = 500 * 3535 / 2730
)


# Hand-curated feature set: non-feature nouns were removed manually and the
# remaining words were grouped into nine product features.

# Appearance
waiguan <- c(
  "造型", "外表", "外包装", "样式", "外形", "款式",
  "样子", "设计", "外观", "外壳", "包装"
)
# Accessories / parts
peijian <- c(
  "电源线", "接口", "旋钮", "管线", "阀门", "水龙头", "面板", "管道",
  "架子", "遥控器", "接头", "螺丝", "软管", "电源", "放电器", "插头",
  "双管", "喷头", "混水阀", "管子", "花洒", "水管", "插座", "配件",
  "泄压阀", "螺栓", "配置", "装置", "接地线", "安全阀", "内胆", "出水管",
  "波纹管"
)
# Usage experience
tiyan <- c(
  "水流", "声音", "噪音", "制热", "速热", "体积",
  "水压", "恒温", "出水量", "容量", "水量", "水温"
)
# Performance / efficiency
gongxiao <- c(
  "能效", "能耗", "效率", "性能", "功率", "效果", "功能", "耗电量"
)
# Price
jiage <- c(
  "服务费", "价位", "价钱", "价格", "费用", "材料费", "收费", "运费", "路费"
)
# Safety
anquan <- c("安全")
# Logistics
wuliu <- c("快递", "速度", "物流")
# Service
fuwu <- c("服务", "态度", "售后")
# Quality
zhiliang <- c("质量")


# Sentiment analysis by scoring: opinion words are looked up in a list of
# positive words and a list of negative words.

# Load the polarity word lists, one word per line.  The txt files have to be
# saved in ANSI encoding.
# NOTE(review): the original remark calls this the HowNet lexicon while the
# path names NTUSD -- confirm which lexicon is used.
# NOTE(review): the path separators were lost ("E:中文情感极性词典
# NTUSDpositive.txt"); the folder below is reconstructed from that text --
# adjust lexicon_dir to the real location.
lexicon_dir <- "E:/中文情感极性词典 NTUSD"
pwords <- readLines(file.path(lexicon_dir, "positive.txt"))
nwords <- readLines(file.path(lexicon_dir, "negative.txt"))

########################################
# Feature-level sentiment scores for Haier and for the other brands.
#
# The same four steps run for every feature group and both corpora:
#   1. keep the reviews that mention at least one word of the group,
#   2. extract the adjective-like tokens of those reviews as opinion words,
#   3. add 1 for every opinion word found in `pwords`, otherwise subtract 1
#      if it is found in `nwords`,
#   4. divide the total by the number of distinct reviews kept in step 1.
#
# These steps used to be copy-pasted once per feature and corpus; several of
# the copies were garbled and no longer parsed.  They are expressed once here.
#
# Scores reported by the original run (Haier / other brands):
#   waiguan  (appearance)   0.7581172 / 0.7616708
#   peijian  (accessories)  0.4876891 / 0.3960827
#   tiyan    (experience)   0.5566502 / 0.5568163
#   gongxiao (performance)  0.523503  / 0.4762309
#   jiage    (price)        0.6497261 / 0.5644507
#   anquan   (safety)       1.60709   / 1.605308
#   wuliu    (logistics)    0.3783224 / 0.371418
#   fuwu     (service)      0.4670337 / 0.4353763
#   zhiliang (quality)      0.3855133 / 0.3750291

# Return the tokens of one review if it mentions any of `feature_words`,
# otherwise NULL.
#   x:             tokens of one review (names are preserved).
#   feature_words: character vector of the words that define the feature.
getFeatureReview <- function(x, feature_words) {
  tokens <- unlist(x)
  hits <- sum(feature_words %in% tokens)
  if (hits == 0) {
    return(NULL)
  }
  # The tokens are repeated once per matched feature word, exactly as the
  # former per-word loop appended them, so the opinion words of a review
  # mentioning several words of the group weigh correspondingly more.
  rep(tokens, times = hits)
}

# Extract the opinion words (adjectives and adverbial adjectives) of one
# review, selected by the part-of-speech tags stored in the names of `x`.
getaad <- function(x) {
  # NOTE(review): both patterns are substring matches on the tag, so a token
  # tagged "ad" is selected by each of them and appears twice in the result.
  # Left unchanged because the scores listed above were produced this way.
  c(x[grep("a", names(x))], x[grep("ad", names(x))])
}

# Average sentiment score of one feature within one corpus.
#   reviews:        list of tagged token vectors, one per review.
#   feature_words:  words that define the feature.
#   positive_words, negative_words: polarity word lists.
# Returns the net count of positive minus negative opinion words divided by
# the number of distinct reviews mentioning the feature; NA when no review
# mentions it.
scoreFeature <- function(reviews, feature_words,
                         positive_words = pwords, negative_words = nwords) {
  matched <- unique(lapply(reviews, getFeatureReview, feature_words))
  opinion_words <- as.vector(unlist(lapply(matched, getaad)))

  # A word listed in both lexicons counts as positive only.
  is_positive <- opinion_words %in% positive_words
  is_negative <- !is_positive & opinion_words %in% negative_words
  total <- sum(is_positive) - sum(is_negative)

  # Count the reviews directly instead of using length(matched) - 1: the
  # subtraction was only right when unique() had left a NULL entry for the
  # reviews without the feature.
  n_reviews <- sum(!vapply(matched, is.null, logical(1)))
  if (n_reviews == 0) {
    return(NA_real_)
  }
  total / n_reviews
}

# The nine feature groups, keyed by the names used throughout the script.
feature_groups <- list(
  waiguan = waiguan, peijian = peijian, tiyan = tiyan,
  gongxiao = gongxiao, jiage = jiage, anquan = anquan,
  wuliu = wuliu, fuwu = fuwu, zhiliang = zhiliang
)

# Scores of every feature group for one corpus, as a named numeric vector.
scoreCorpus <- function(reviews) {
  vapply(
    feature_groups,
    function(words) scoreFeature(reviews, words),
    numeric(1)
  )
}

# Rows: corpus; columns: feature group.
feature_scores <- rbind(
  haier = scoreCorpus(haier),
  others = scoreCorpus(others)
)

# Individual score variables under their established names.
haier.waiguan.score <- feature_scores["haier", "waiguan"]
others.waiguan.score <- feature_scores["others", "waiguan"]
haier.peijian.score <- feature_scores["haier", "peijian"]
others.peijian.score <- feature_scores["others", "peijian"]
haier.tiyan.score <- feature_scores["haier", "tiyan"]
others.tiyan.score <- feature_scores["others", "tiyan"]
haier.gongxiao.score <- feature_scores["haier", "gongxiao"]
others.gongxiao.score <- feature_scores["others", "gongxiao"]
haier.jiage.score <- feature_scores["haier", "jiage"]
others.jiage.score <- feature_scores["others", "jiage"]
haier.anquan.score <- feature_scores["haier", "anquan"]
others.anquan.score <- feature_scores["others", "anquan"]
haier.wuliu.score <- feature_scores["haier", "wuliu"]
others.wuliu.score <- feature_scores["others", "wuliu"]
haier.fuwu.score <- feature_scores["haier", "fuwu"]
others.fuwu.score <- feature_scores["others", "fuwu"]
haier.zhiliang.score <- feature_scores["haier", "zhiliang"]
others.zhiliang.score <- feature_scores["others", "zhiliang"]

#############
# Show the result as a grouped bar chart: one group per product feature,
# first bar Haier, second bar the other brands.
# Each column holds c(Haier score, other brands' score), rounded to four
# decimals, in the order appearance, accessories, experience, performance,
# price, safety, logistics, service, quality.
# NOTE(review): the Haier quality value below is 0.3755, while the score
# remark earlier in this file reports 0.3855133 -- confirm which is correct.
data <- data.frame(
  c(0.7581, 0.7617),
  c(0.4877, 0.3961),
  c(0.5567, 0.5568),
  c(0.5235, 0.4762),
  c(0.6497, 0.5645),
  c(1.6071, 1.6053),
  c(0.3783, 0.3714),
  c(0.4670, 0.4354),
  c(0.3755, 0.3750)
)
# One label per column.  Only seven labels were given for the nine columns
# before ("配件" and "价格" were missing), which shifted the labels from the
# second bar group onwards and left the last two groups unlabelled.
names(data) <- c(
  "外观", "配件", "体验", "功效", "价格", "安全", "物流", "服务", "质量"
)
barplot(
  as.matrix(data),
  col = c("green", "red"),
  beside = TRUE,
  xlab = "产品特征",
  ylab = "分数",
  main = "海尔(绿色)与其他品牌(红色)的特征得分对比"
)


结果可视化之后,具体的分析大家可以自行补充。



(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读