Fetching Comment Data, Word Frequency Statistics, and Word Clouds
# coding: utf-8
# In[2]:
import re
import requests
import time
import random
import json
# Set the request headers: a desktop Chrome User-Agent so the request looks like a normal browser.
# (The original installed these headers on a urllib opener, but the request below is sent with
# requests, which ignores urllib openers, so the headers are passed to requests.get instead.)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# Comment API of the JD.com product page (Huawei P30)
url = 'https://sclub.jd.com/comment/productPageComments.action?callback=&productId=100002749549&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1'
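# Query-string parameters (as observed for this endpoint, not officially documented):
# productId selects the item, score=0 appears to request comments of every rating,
# sortType=5 sorts by recency, pageSize sets comments per page; the page number
# itself is added per request below.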
f = open('E:/comments/華為P30.txt', 'w', encoding='utf-8')
for i in range(0, 20):
    # Use the fractional part of the millisecond timestamp to vary the JSONP callback name
    t = str(time.time()*1000).split('.')
    pagram = {
        'page': i+1,
        'callback': 'fetchJSON_comment98vv4092%s' % (int(t[1])+1)
    }
    # print(pagram)
    # Sleep a random interval so the request pattern looks less like a bot
    time.sleep(random.random())

    # Send the HTTP request
    response = requests.get(url, params=pagram, headers=headers)
    data = response.text
    # Extract the JSON object from the JSONP wrapper (re.S lets "." match newlines too)
    data = re.findall(r'{.*}', data, re.S)[0]
    # Parse into a dict and take the list of comments
    data = json.loads(data)
    data = data['comments']
    for item in data:
        # referenceName = product model, nickname = reviewer, content = comment text
        f.write('Model: ' + item['referenceName'] + '\n'
                + 'Nickname: ' + item['nickname'] + '\n'
                + 'Comment: ' + item['content'] + '\n')
f.close()
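Because a callback name is sent, the server returns JSONP of the form fetchJSON_comment98vv4092...({...});, and the regex above digs the JSON out of that wrapper. A minimal alternative sketch that strips the wrapper by position, assuming exactly that wrapped shape (strip_jsonp is an illustrative helper, not part of the original script):
def strip_jsonp(text):
    # The JSON payload sits between the first "(" and the last ")"
    start = text.find('(')
    end = text.rfind(')')
    return text[start + 1:end]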
# In[12]:
import jieba
# Remove duplicate comment lines
def quchong(infile, outfile):
    infopen = open(infile, 'r', encoding='utf-8')
    outopen = open(outfile, 'w', encoding='utf-8')
    lines = infopen.readlines()
    list_1 = []
    for line in lines:
        # Keep a line only the first time it is seen
        if line not in list_1:
            list_1.append(line)
            outopen.write(line)
    infopen.close()
    outopen.close()
quchong("E:/comments/華為P30.txt", "E:/comments/P30去重.txt")
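# The list-based membership test above costs O(n) per line; a sketch of the same
# dedup with a set (O(1) lookups), useful if the comment file grows large.
# quchong_set is an illustrative alternative, not part of the original script:
def quchong_set(infile, outfile):
    seen = set()
    with open(infile, encoding='utf-8') as fin, open(outfile, 'w', encoding='utf-8') as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)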
# Optional: load a user dictionary of domain-specific terms
# jieba.load_userdict('userdict.txt')
# Build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
# Segment one comment line and drop stopwords
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('E:/comments/cn_stopwords.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
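# Illustrative example (actual tokens depend on jieba's dictionary and the stopword list):
# seg_sentence('手机拍照效果很好,运行流畅') might return roughly '手机 拍照 效果 很好 运行 流畅 '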
inputs = open('E:/comments/P30去重.txt', 'r', encoding='utf-8')
outputs = open('E:/comments/P30分詞.txt', 'w', encoding='utf-8')
for line in inputs:
    line_seg = seg_sentence(line)  # returns a space-separated string of words
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
print('Segmentation finished')
# In[14]:
# Word frequency counting
from collections import Counter
with open('E:/comments/P30分詞.txt', 'r', encoding='utf-8') as fr:
    # The file is already segmented and space-separated, so splitting on whitespace
    # is enough; re-running jieba here would also count the separator spaces
    data = fr.read().split()
data = dict(Counter(data))
with open('E:/comments/P30詞頻.txt', 'w', encoding='utf-8') as fw:  # output path for the word counts
    for k, v in data.items():
        fw.write('%s, %d\n' % (k, v))
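The counts are written in encounter order; to inspect the most frequent words first, Counter.most_common sorts them directly. A small sketch reusing the segmented file from above:
from collections import Counter

with open('E:/comments/P30分詞.txt', encoding='utf-8') as fr:
    counts = Counter(fr.read().split())
# Print the 20 most frequent words and their counts
for word, freq in counts.most_common(20):
    print(word, freq)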
# In[18]:
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Generate the word cloud
with open('E:/comments/P30詞頻.txt', encoding='utf-8') as f:
    # Extract the top 50 keywords by TF-IDF weight
    data = f.read()
    keyword = jieba.analyse.extract_tags(data, topK=50, withWeight=False)
    wl = " ".join(keyword)
    # Configure the word cloud
    wc = WordCloud(
        # Background color
        background_color="white",
        # Maximum number of words shown
        max_words=2000,
        # A CJK font is required or Chinese characters render as boxes;
        # simfang.ttf ships with Windows at this path
        font_path='C:/Windows/Fonts/simfang.ttf',
        height=1200,
        width=1600,
        # Largest font size used
        max_font_size=100,
        # Number of random states, i.e. how many color schemes
        random_state=30,
    )
    myword = wc.generate(wl)  # generate the cloud
    # Display the word cloud
    plt.imshow(myword)
    plt.axis("off")
    plt.show()
    wc.to_file('E:/comments/P30.png')  # save the image
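Since exact counts already exist, the cloud can also be built straight from the frequency file with WordCloud.generate_from_frequencies, skipping the keyword-extraction step. A sketch assuming the "word, count" format written in the previous cell (the output name P30_freq.png is made up for illustration):
from wordcloud import WordCloud

freqs = {}
with open('E:/comments/P30詞頻.txt', encoding='utf-8') as f:
    for line in f:
        # Split "word, count" from the right so the word itself may contain commas
        word, _, count = line.rpartition(', ')
        if word:
            freqs[word] = int(count)
wc2 = WordCloud(background_color='white', font_path='C:/Windows/Fonts/simfang.ttf',
                width=1600, height=1200)
wc2.generate_from_frequencies(freqs)
wc2.to_file('E:/comments/P30_freq.png')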