爬取和分析电影《流浪地球》的猫眼评论
最编程
2024-05-07 12:44:19
...
爬取和分析电影《流浪地球》的猫眼评论
一,选题背景
《流浪地球》在猫眼评论区的评价趋于两极分化,很多人将其与“爱国主义情怀”联系在一起,这并没有必要,因为电影毕竟只是电影,更何况这是一部商业片,其根本目的是赚取更多票房。将一种商业模式的产物与“爱国主义”联系在一起,是对“爱国”的误解——热爱祖国,不需要通过夸奖一部电影来表达。同理,对国产电影合理且有依有据的批判,也并不代表着对国产电影的失望。
二,设计方案
1,爬虫名称:爬取和分析电影《流浪地球》的猫眼评论
2,爬虫爬取的内容与数据特征分析
内容:爬取电影《流浪地球》的猫眼评论,把评论保存到mongodb
数据特征分析:
- 共有102580条数据;
- 包含字段:评论内容、性别、评论ID、评论者昵称、回复数量、评分、时间、点赞数量、评论者ID、评论者等级
三,结构特征分析
1,页面的结构与特征分析
四,程序设计
1.#实例化MongoClient,# 连接到maoyan数据库
1 import pandas as pd 2 import numpy as np 3 import matplotlib.pyplot as plt 4 from pymongo import MongoClient 5 from pandas.io.json import json_normalize 6 %matplotlib inline 7 8 conn = MongoClient(host='127.0.0.1', port=27017) # 实例化MongoClient 9 db = conn.get_database('maoyan') # 连接到maoyan数据库 10 maoyan = db.get_collection('maoyan') # 连接到集合maoyan 11 mon_data = maoyan.find() # 查询这个集合下的所有记录 12 13 data = json_normalize([comment for comment in mon_data]) 14 15 data.info()
data.head()
2,#数据清洗
# Data cleaning
data = data.drop(columns='_id')  # drop MongoDB's internal document id
data = data.drop_duplicates(subset='userId')  # keep one comment per user
# `time` is an epoch timestamp in milliseconds; parse it directly instead of
# dividing by 1000 first, which avoids a float round trip.
data['time'] = pd.to_datetime(data['time'], unit='ms')
# Keep only comments posted on or after 2019-02-05 00:00:00.
data = data[data['time'] >= pd.to_datetime('2019-02-05 00:00:00')]
# Index by time (the `time` column is kept) so the data can be resampled later.
data = data.set_index('time', drop=False)
data.head()

data.info()
3,#数据分析
- 共有102580条数据;
- 包含字段:评论内容、性别、评论ID、评论者昵称、回复数量、评分、时间、点赞数量、评论者ID、评论者等级
#3.1. 总体评价
# Mean score over all cleaned comments.
data['score'].mean()

from pyecharts import Bar
from pyecharts import Line
from pyecharts import Overlap

# Number of comments per score value, ordered by score.
score_total = data['score'].value_counts().sort_index()
bar = Bar("《流浪地球》各评分数量", width=700)
line = Line("", width=700)
bar.add("", score_total.index, score_total.values, is_stack=True, is_label_show=True,
        bar_category_gap='40%', label_color=['#130f40'],
        legend_text_size=18, xaxis_label_textsize=18, yaxis_label_textsize=18)
# The line is offset by +1000 so it is drawn above the bar tops.
line.add("", score_total.index, score_total.values + 1000, is_smooth=True)

overlap = Overlap(width=700)
overlap.add(bar)
overlap.add(line)

overlap

# The percentages below select by score VALUE rather than by integer slice:
# `score_total[:5]` is positional on an integer index but label-based on a
# float index, so its meaning depended on the dtype of `score`.
# The thresholds mirror the `score < 5` / `score > 6` filters used in the rest
# of this analysis and equal the original slices ([:5], [7:], [10:]) when every
# integer score 0-10 occurs.
# NOTE(review): confirm the score scale is 0-10 in integer steps.

# Percentage of low scores (below 5)
score_total[score_total.index < 5].sum() / score_total.sum() * 100

# Percentage of high scores (above 6)
score_total[score_total.index > 6].sum() / score_total.sum() * 100

# Percentage of full marks (10)
score_total[score_total.index >= 10].sum() / score_total.sum() * 100
#3.2总体评价的时间走向
# Mean score per hour; relies on `data` being indexed by comment time.
score_by_time = data['score'].resample('H').mean()
line = Line("《流浪地球》平均评分时间走向", width=700)
# The x axis shows only the date part of each hourly bucket; the y axis starts at 8.
line.add("", score_by_time.index.date, score_by_time.values, is_smooth=True,
         legend_text_size=18, xaxis_label_textsize=18, yaxis_label_textsize=18,
         xaxis_rotate=20, yaxis_min=8)
line

# The six hourly buckets with the lowest mean score.
score_by_time.nsmallest(6)
#3.3. 高分的评价理由
import jieba
from collections import Counter
from pyecharts import WordCloud

# Register cast and character names so jieba keeps each one as a single token.
for name in ('屈楚萧', '刘启', '吴京', '刘培强', '李光洁', '王磊',
             '吴孟达', '达叔', '韩子昂', '赵今麦', '韩朵朵'):
    jieba.add_word(name)

# Load the stop-word list, one entry per line; `with` closes the file handle.
# NOTE(review): the file is read with the platform default encoding, as before;
# pass `encoding=` explicitly once the encoding of stopwords.txt is confirmed.
with open('stopwords.txt') as stopword_file:
    swords = [line.strip() for line in stopword_file]
def plot_word_cloud(data, swords):
    """Build a word cloud of the 100 most frequent words in ``data['content']``.

    :param data: DataFrame with a ``content`` column holding comment text.
    :param swords: iterable of stop words to exclude.
    :return: a pyecharts ``WordCloud`` chart.
    """
    # A set makes each stop-word lookup O(1) instead of scanning a list.
    stop_words = set(swords)
    # Join with a newline so the last word of one comment cannot merge with the
    # first word of the next during segmentation; missing comments are skipped.
    text = '\n'.join(data['content'].dropna())
    # Count tokens longer than one character that are not stop words.
    counts = Counter(
        word for word in jieba.cut(text)
        if len(word) > 1 and word not in stop_words
    )
    top_words = counts.most_common(100)
    words = [word for word, _ in top_words]
    freqs = [freq for _, freq in top_words]
    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("", words, freqs, word_size_range=[20, 100])
    return wordcloud


# Word cloud for high-score comments (score above 6)
plot_word_cloud(data=data[data['score'] > 6], swords=swords)
# The ten most-liked comments among high-score (above 6) comments; computed
# once and reused for both the table and the printed text.
top_high = data[data['score'] > 6].nlargest(10, 'upCount')
top_high

# Print the full text of each, separated by a blank line.
for content in top_high['content']:
    print(content + '\n')
#3.4 低分的评价理由
# Word cloud for low-score comments (score below 5)
plot_word_cloud(data=data[data['score'] < 5], swords=swords)

# The ten most-liked comments among low-score comments; computed once and
# reused for both the table and the printed text.
top_low = data[data['score'] < 5].nlargest(10, 'upCount')
top_low

# Print the full text of each, separated by a blank line.
for content in top_low['content']:
    print(content + '\n')
#3.5低分的人群有哪些特征
# Overall gender distribution
gender_total = data['gender'].value_counts()
bar = Bar("《流浪地球》观众性别", width=700)
# NOTE(review): the labels are hard-coded and only correct if value_counts()
# (sorted by descending count) returns the categories in exactly this order;
# confirm against gender_total.index.
bar.add("", ['未知', '男', '女'], gender_total.values, is_stack=True, is_label_show=True,
        bar_category_gap='60%', label_color=['#130f40'],
        legend_text_size=18, xaxis_label_textsize=18, yaxis_label_textsize=18)
bar
# Gender share in percent.
gender_total / gender_total.sum() * 100

# Gender distribution among low-score (below 5) comments
gender_low = data.loc[data['score'] < 5, 'gender'].value_counts()
bar = Bar("《流浪地球》低分评论观众性别", width=700)
# A stray `zh` token after '未知' made this call a SyntaxError; it is removed.
# NOTE(review): same hard-coded label order assumption as above; confirm
# against gender_low.index.
bar.add("", ['未知', '男', '女'], gender_low.values, is_stack=True, is_label_show=True,
        bar_category_gap='60%', label_color=['#130f40'],
        legend_text_size=18, xaxis_label_textsize=18, yaxis_label_textsize=18)
bar
# Gender share in percent among low-score comments.
gender_low / gender_low.sum() * 100
# Column name -> regex alternation of the names/nicknames that count as a mention.
mapping = {'liucixin': '刘慈欣|大刘', 'guofan': '郭帆', 'quchuxiao': '屈楚萧|刘启|户口',
           'wujing': '吴京|刘培强', 'liguangjie': '李光洁|王磊',
           'wumengda': '吴孟达|达叔|韩子昂', 'zhaojinmai': '赵今麦|韩朵朵'}
# Chart label for each column: the first alternative of its pattern.
display_names = {key: pattern.split('|')[0] for key, pattern in mapping.items()}

# One boolean column per person: does the comment mention them?
# na=False treats a missing comment as "no mention" so the column stays boolean.
for key, pattern in mapping.items():
    data[key] = data['content'].str.contains(pattern, na=False)


def _staff_bar(title, series):
    """Bar chart of *series*, labelled from its own index so labels match the sort order."""
    chart = Bar(title, width=700)
    chart.add("", [display_names[key] for key in series.index], series.values,
              is_stack=True, is_label_show=True,
              bar_category_gap='60%', label_color=['#130f40'],
              legend_text_size=18, xaxis_label_textsize=18, yaxis_label_textsize=18)
    return chart


# Total number of comments mentioning each person
staff_count = pd.Series({key: data.loc[data[key], 'score'].count()
                         for key in mapping}).sort_values()
staff_count

bar = _staff_bar("《流浪地球》演职员总体提及次数", staff_count)
bar

# Mean score of the comments mentioning each person
average_score = pd.Series({key: data.loc[data[key], 'score'].mean()
                           for key in mapping}).sort_values()
average_score

bar = _staff_bar("《流浪地球》演职员平均分", average_score.round(2))
bar

# Number of low-score (below 5) comments mentioning each person
staff_count_low = pd.Series({key: data.loc[data[key] & (data['score'] < 5), 'score'].count()
                             for key in mapping}).sort_values()
staff_count_low

# Low-score mentions as a percentage of all mentions (aligned on the index)
staff_count_pct = np.round(staff_count_low / staff_count * 100, 2).sort_values()
staff_count_pct

bar = _staff_bar("《流浪地球》演职员低分评论提及百分比", staff_count_pct)
bar

# The five most-liked low-score comments mentioning 吴孟达
data[data['wumengda'] & (data['score'] < 5)].nlargest(5, 'upCount')

# NOTE(review): the table above filters on `wumengda` but this loop prints the
# `wujing` comments; confirm whether both were meant to use the same person.
for content in data[data['wujing'] & (data['score'] < 5)].nlargest(5, 'upCount')['content']:
    print(content + '\n')
五、总结
总体来说,猫眼的观众对于这部电影的认可度还是很高的,认为导演良心、剧情紧凑、表演过关、特效震撼,中国首部硬核科幻电影当之无愧!同时也还是有一些剧情上和年轻演员上的小问题,导致本片有些尴尬和无聊。不过,我觉得瑕不掩瑜、应该支持,相较于真正低分的毕导的《逐梦演艺圈》,不知道高到哪里去了。希望国产科幻片可以越来越好!
完整代码:
1 import os 2 import time 3 from datetime import datetime 4 import requests 5 from pymongo import MongoClient 6 7 8 class MaoYan(object): 9 """ 10 猫眼评论爬虫,爬取电影《流浪地球》的评论和评分 11 """ 12 13 def __init__(self): 14 """ 15 初始化函数 16 :param 17 headers: 请求头 18 time: 当前时间戳 19 premiere_time: 首映时间的时间戳 20 """ 21 self.headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.' 22 '38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 23 'Connection': 'keep-alive', 24 'Cookie': '_lxsdk_cuid=168d5d128e7c8-033114908a580c-10376654-fa000-168d5d128e7c8;' 25 ' _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; uuid_n_v=v1;' 26 ' iuuid=5D49FF702DB211E9AF1B8D0648275EC02D381B7848144BC1A299A63C05094BF5;' 27 ' webp=true; selectci=true; ci=281%2C%E6%83%A0%E5%B7%9E;' 28 ' __mta=247299643.1549775481575.1549783540088.1549862773375.3;' 29 ' _lxsdk=5D49FF702DB211E9AF1B8D0648275EC02D381B7848144BC1A299A63C05094BF5;' 30 ' _lxsdk_s=168db05185a-332-e0d-bc5%7C%7C157'} 31 self.time = int(time.time()*1000) 32 self.premiere_time = int(time.mktime(time.strptime('2019-02-05 00:00:00', '%Y-%m-%d %H:%M:%S'))*1000) 33 34 # 配置mongodb数据库 35 host = os.environ.get('MONGODB_HOST', '127.0.0.1') # 本地数据库 36 port = os.environ.get('MONGODB_PORT', '27017') # 数据库端口 37 mongo_url = 'mongodb://{}:{}'.format(host, port) 38 mongo_db = os.environ.get('MONGODB_DATABASE', 'maoyan'上一篇: 个性化的 24 款酷炫手机壁纸!
推荐阅读