# 文章目录 (leftover blog table-of-contents header — kept as a comment so the file parses)
import requests#请求服务器获取网页数据
from bs4 import BeautifulSoup as bs#解析数据
import re#正则匹配
import jieba.analyse#分词包,提取关键词
import matplotlib.pyplot as plt#画图函数
# import matplotlib
from wordcloud import WordCloud#词云包,显示词频统计
# Scrape Douban's "now playing" page for Zhengzhou, build a list of
# {'id': subject-id, 'name': title} dicts, show a 1-based menu and read
# the user's choice into `i` (used below to pick the movie).
url = "https://movie.douban.com/cinema/nowplaying/zhengzhou/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 Edg/80.0.361.111"}
res = requests.get(url, headers=headers)  # fetch the listing page
html = res.text  # response body decoded to str
soup = bs(html, "html.parser")  # parse the HTML
# <ul class="lists"> contains one <li class="list-item"> per movie
nowplaying_movie = soup.find("ul", class_="lists")
nowplaying_movie_list = nowplaying_movie.find_all("li", class_="list-item")
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}  # one movie: Douban subject id + display name
    nowplaying_dict['id'] = item['data-subject']
    for img_item in item.find_all("img"):
        nowplaying_dict['name'] = img_item['alt']  # poster alt text carries the title
    nowplaying_list.append(nowplaying_dict)
# Print a 1-based menu (enumerate replaces the original manual counter).
for num, item in enumerate(nowplaying_list, start=1):
    print(str(num) + ":" + item['name'])
i = int(input("请输入电影序号:"))
# Fetch 10 pages (20 short reviews each) for the chosen movie into
# `comment_list`.
# Fixes vs. original: the page offset now advances with the loop (it was
# hard-coded to (10 - 1) * 20, requesting the same offset ten times); the
# loop variable no longer reuses `i`, which holds the user's 1-based movie
# choice; and `comment_list` is created once before the loop instead of
# being reset on every page, so all pages are kept.
movie_id = nowplaying_list[i - 1]['id']
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 Edg/80.0.361.111"}
comment_list = []  # accumulated across all pages
for page_num in range(10):
    start = page_num * 20  # Douban paginates comments 20 at a time
    url1 = r"https://movie.douban.com/subject/" + movie_id + "/comments?start=" + str(start) + "&limit=20"
    req = requests.get(url1, headers=headers)
    html1 = req.text  # one page of comment HTML
    soup1 = bs(html1, "html.parser")
    comments = soup1.find("div", class_="mod-bd")  # container of all comment items
    comment_lists = comments.find_all("div", class_="comment-item")
    for comment_p in comment_lists:
        comment = comment_p.find_all('p')[0].text  # the review text itself
        if comment is not None:
            comment_list.append(comment)
# Flatten all comments into one string and keep only runs of CJK
# characters, stripping newlines, punctuation and Latin noise.
# str.join replaces the original quadratic `com += ...` loop.
com = ''.join(comment_list)
pattern = re.compile(r'[\u4e00-\u9fa5]+')  # runs of Chinese characters
filterdata = re.findall(pattern, com)
cleaned_comments = ''.join(filterdata)  # final corpus for keyword extraction
print(cleaned_comments)
# Extract the 50 highest-weighted keywords via TextRank, then drop any
# that appear in the user-maintained stop-word file.
result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
# textrank returns a list of (word, weight) tuples; fold it into a dict.
keywords = dict(result)
# Load stop words, one per line, into a set for exact-match filtering.
# The original concatenated the whole file into a single string and used
# substring membership, which wrongly removed any keyword occurring inside
# a longer stop word; it also never closed the file handle (fixed by
# `with`). A set is also the type WordCloud's `stopwords` argument expects.
with open(r'D:\pythonProject1\spider_code\jieba_stop_words.txt', encoding="utf-8") as f:
    stopwords = {line.strip() for line in f if line.strip()}
keywords = {word: weight for word, weight in keywords.items() if word not in stopwords}
print(keywords)
# Render the keyword weights as a word cloud with matplotlib.
plt.rcParams['figure.figsize'] = (10.0, 5.0)  # canvas size in inches
cloud = WordCloud(
    font_path='simhei.ttf',        # CJK-capable font so Chinese glyphs render
    background_color='white',
    max_font_size=80,
    stopwords=stopwords,
)
myword = cloud.fit_words(keywords)  # build the cloud from word -> weight pairs
plt.imshow(myword)
plt.axis("off")  # hide the axes around the image
plt.show()