Python实现「碟中谍」5W条评论可视化!果然还是烧脑大戏!
发布时间:2020-12-17 00:53:27 所属栏目:Python 来源:网络整理
导读:进群:548377875 即可获取数十套PDF哦! 项目介绍 本篇文章会针对用户在猫眼上对于「碟中谍6」的评论进行一个可视化分析,我们总共采集了44872条用户评论,文章内容包括: 用户评分分布; 产生评论时间分布; 评论用户地理位置分布热点图; 词频统计;
进群:548377875 ? 即可获取数十套PDF哦! 项目介绍 本篇文章会针对用户在猫眼上对于「碟中谍6」的评论进行一个可视化分析,我们总共采集了44872条用户评论,文章内容包括:
数据背景
数据清洗
# --- Data cleaning ---
# Each record in comment.txt is "时间|昵称|城市|评分|内容"
# (time|nickname|city|rating|content). A record that splits into a single
# field is treated as a continuation fragment: a comment body that contained
# the record separator; it becomes the content field of a placeholder row.
import pandas as pd

with open('comment.txt', 'r') as f:
    comment = f.read()

# NOTE(review): the separator below survived a web scrape as a single space —
# confirm against the original dump (it was plausibly a newline).
comment_list = comment.split(' ')
print('>>>累计评论数:%s ' % len(comment_list))

data = []
# 5 slots — one per column. The original had only 3 ([ '', '', '' ]),
# so temp[4] below raised IndexError on the first continuation fragment.
temp = ['', '', '', '', '']
for comment in comment_list:
    comment = comment.split('|')
    if len(comment) == 1:
        # Continuation fragment: only the content field is present.
        temp[4] = comment[0]
        # Append a COPY — appending temp itself aliases every placeholder row,
        # so later fragments would silently overwrite earlier ones.
        data.append(temp[:])
    elif len(comment) != 5:
        pass  # malformed record — skip
    else:
        data.append(comment)

data = pd.DataFrame(data, columns=['时间', '昵称', '城市', '评分', '内容'])
print(data.head())
# 评分分布 (rating distribution — next section)
# Rating distribution: count comments per rating, ignoring blank ratings,
# and render a rose-type pie chart.
rated = data[data['评分'] != '']
temp = rated.groupby('评分')['昵称'].count().reset_index()
temp.columns = ['评分', '数量']

Pie = pyecharts.Pie('「碟中谍」评分分布', '统计时间:2018-9-6')
Pie.add(
    "",
    temp['评分'],
    temp['数量'],
    radius=[30, 75],
    rosetype='radius',
    is_legend_show=False,
    is_label_show=True,
)
Pie
时间分布
# Time distribution: hour-by-day heat map of comment volume for the week
# 2018-08-31 .. 2018-09-06.
data['日期'] = data['时间'].str[0:10]
data['小时'] = data['时间'].str[11:13]

in_week = (data['时间'] >= '2018-08-31 00:00:00') & (data['时间'] <= '2018-09-07 00:00:00')
temp = data[in_week].groupby(['小时', '日期'])['昵称'].count().reset_index()
temp.columns = ['小时', '日期', '数量']

date = ['2018-08-31', '2018-09-01', '2018-09-02', '2018-09-03',
        '2018-09-04', '2018-09-05', '2018-09-06']
temp['小时'] = temp['小时'].astype('int')
# The heat map wants the y axis as an index into `date`, not the date string.
temp['日期'] = temp['日期'].replace({d: i for i, d in enumerate(date)})
temp = temp.values.tolist()

hour = range(24)
HeatMap = pyecharts.HeatMap('评论-时间分布', '统计时间:2018-09-06')
HeatMap.add("评论数量", hour, date, temp,
            is_visualmap=True, visual_range=[0, 700],
            is_legend_show=False, visual_text_color="#000",
            visual_orient='vertical', visual_pos="right")
HeatMap
地理分布
# Geographic distribution: count comments per city, then geocode each city to
# (lng, lat) via the Baidu Map geocoder v2 API.
import requests
from tqdm import tqdm

# Use .loc instead of chained indexing (data['城市'][mask] = ...), which may
# silently assign to a copy. Normalise the one name the API can't resolve.
data.loc[data['城市'] == '伊犁', '城市'] = '伊犁哈萨克自治州'

temp = data[data['城市'] != ''].groupby('城市')['昵称'].count().reset_index()
temp.columns = ['城市', '数量']

# NOTE(review): headers is defined but never sent with the request — pass
# headers=headers to requests.get if the API requires a mobile UA.
headers = {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML,like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
key = '申请的KEY '  # Baidu Map API key (placeholder — fill in your own)

dic = {}  # city name -> [lng, lat]
city_list = list(set(temp['城市']))
for city in tqdm(city_list):
    url = 'http://api.map.baidu.com/geocoder/v2/?address=%s&output=json&ak=%s' % (city, key)
    # Timeout so one unresponsive lookup can't hang the whole loop.
    response = requests.get(url, timeout=10)
    position = response.json()
    if position['status'] == 0:
        # status 0 == success; anything else (bad key, unknown city) is skipped.
        dic[city] = [position['result']['location']['lng'],
                     position['result']['location']['lat']]
# City heat map. The scraped line was truncated ("...temp['城市'],1000],...") and
# would not parse; reconstructed as value-series + visual_range per the
# pyecharts 0.x Geo heatmap API. TODO confirm against the original article.
Geo = pyecharts.Geo("评论城市分布", "来源:Kaggle", title_color="#fff",
                    title_pos="center", width=800, height=600,
                    background_color='#404a59')
Geo.add("", temp['城市'], temp['数量'], type='heatmap',
        is_visualmap=True, visual_range=[0, 1000],
        visual_text_color="#fff", geo_cities_coords=dic)
Geo
词频统计
# Word frequency: segment all comment text with jieba (part-of-speech mode),
# drop punctuation, single characters and stop words, then chart the top 10.
from jieba import posseg as psg
import collections

# '<>' was removed in Python 3 — use '!='.
string = ''.join(data['内容'][data['内容'] != ''])

word_list = []
stop_words = ['就是', '这是', '但是', '虽然', '一部', '觉得', '还是', '没有']
for seg in psg.cut(string):
    if seg.flag == 'x':
        continue  # flag 'x': non-word tokens (punctuation etc.)
    if len(seg.word) == 1:
        continue  # single characters carry little meaning
    # Compare str to str — the original's seg.word.encode('utf-8') yields
    # bytes on Python 3, so membership was always False there.
    if seg.word in stop_words:
        continue
    word_list.append(seg.word)

c = collections.Counter(word_list)
top10 = c.most_common(10)
attr = [word for word, _ in top10]
value = [count for _, count in top10]

Bar = pyecharts.Bar("评论中出现频率最高的10个词", "统计时间:2018-09-06")
Bar.add("出现次数", attr, value, mark_point=['max'], is_legend_show=False)
Bar
词云
# Word cloud: render the segmented words into the shape of the TomCruise.jpg
# mask, recoloured from the source image, then save to comment.png.
import imageio
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

back_color = imageio.imread('TomCruise.jpg')
words = ' '.join(word_list)

wc = WordCloud(
    background_color='white',
    max_words=5000,
    mask=back_color,
    max_font_size=200,
    font_path="/Users/tangwenpan/Documents/fonts/SimHei.ttf",
    random_state=None,
)
wc.generate(words)

# Recolour the cloud to match the mask image's palette.
image_colors = ImageColorGenerator(back_color)
plt.figure(figsize=(15, 8))
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
plt.show()
wc.to_file('comment.png')
# 原图: 词云图 (original image / word-cloud image shown in the article)
(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |