Fetching Comment Data, Word Frequency Statistics, and Word Clouds
# coding: utf-8
# In[2]:
import re
import requests
import time
import random
import json
# Set the request headers: a desktop Chrome User-Agent so the request looks like a normal browser.
# (The original installed these headers on a urllib opener, but the request below is sent with
# requests, which ignores urllib openers, so the headers are passed to requests.get instead.)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# Comment API of the JD.com product page (Huawei P30)
url = 'https://sclub.jd.com/comment/productPageComments.action?callback=&productId=100002749549&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1'
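# Query-string parameters (as observed for this endpoint, not officially documented):
# productId selects the item, score=0 appears to request comments of every rating,
# sortType=5 sorts by recency, pageSize sets comments per page; the page number
# itself is added per request below.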
f = open('E:/comments/華為P30.txt', 'w', encoding='utf-8')
for i in range(0, 20):
    # Use the fractional part of the millisecond timestamp to vary the JSONP callback name
    t = str(time.time()*1000).split('.')
    pagram = {
        'page': i+1,
        'callback': 'fetchJSON_comment98vv4092%s' % (int(t[1])+1)
    }
    # print(pagram)
    # Sleep a random interval so the request pattern looks less like a bot
    time.sleep(random.random())

    # Send the HTTP request
    response = requests.get(url, params=pagram, headers=headers)
    data = response.text
    # Extract the JSON object from the JSONP wrapper (re.S lets "." match newlines too)
    data = re.findall(r'{.*}', data, re.S)[0]
    # Parse into a dict and take the list of comments
    data = json.loads(data)
    data = data['comments']
    for item in data:
        # referenceName = product model, nickname = reviewer, content = comment text
        f.write('Model: ' + item['referenceName'] + '\n'
                + 'Nickname: ' + item['nickname'] + '\n'
                + 'Comment: ' + item['content'] + '\n')
f.close()
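Because a callback name is sent, the server returns JSONP of the form fetchJSON_comment98vv4092...({...});, and the regex above digs the JSON out of that wrapper. A minimal alternative sketch that strips the wrapper by position, assuming exactly that wrapped shape (strip_jsonp is an illustrative helper, not part of the original script):
def strip_jsonp(text):
    # The JSON payload sits between the first "(" and the last ")"
    start = text.find('(')
    end = text.rfind(')')
    return text[start + 1:end]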
# In[12]:
import jieba
# Remove duplicate comment lines
def quchong(infile, outfile):
    infopen = open(infile, 'r', encoding='utf-8')
    outopen = open(outfile, 'w', encoding='utf-8')
    lines = infopen.readlines()
    list_1 = []
    for line in lines:
        # Keep a line only the first time it is seen
        if line not in list_1:
            list_1.append(line)
            outopen.write(line)
    infopen.close()
    outopen.close()
quchong("E:/comments/華為P30.txt", "E:/comments/P30去重.txt")
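# The list-based membership test above costs O(n) per line; a sketch of the same
# dedup with a set (O(1) lookups), useful if the comment file grows large.
# quchong_set is an illustrative alternative, not part of the original script:
def quchong_set(infile, outfile):
    seen = set()
    with open(infile, encoding='utf-8') as fin, open(outfile, 'w', encoding='utf-8') as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)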
# Optional: load a user dictionary of domain-specific terms
# jieba.load_userdict('userdict.txt')
# Build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
# Segment one comment line and drop stopwords
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('E:/comments/cn_stopwords.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
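# Illustrative example (actual tokens depend on jieba's dictionary and the stopword list):
# seg_sentence('手机拍照效果很好,运行流畅') might return roughly '手机 拍照 效果 很好 运行 流畅 '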
inputs = open('E:/comments/P30去重.txt', 'r', encoding='utf-8')
outputs = open('E:/comments/P30分詞.txt', 'w', encoding='utf-8')
for line in inputs:
    line_seg = seg_sentence(line)  # returns a space-separated string of words
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
print('Segmentation finished')
# In[14]:
# Word frequency counting
from collections import Counter
with open('E:/comments/P30分詞.txt', 'r', encoding='utf-8') as fr:
    # The file is already segmented and space-separated, so splitting on whitespace
    # is enough; re-running jieba here would also count the separator spaces
    data = fr.read().split()
data = dict(Counter(data))
with open('E:/comments/P30詞頻.txt', 'w', encoding='utf-8') as fw:  # output path for the word counts
    for k, v in data.items():
        fw.write('%s, %d\n' % (k, v))
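The counts are written in encounter order; to inspect the most frequent words first, Counter.most_common sorts them directly. A small sketch reusing the segmented file from above:
from collections import Counter

with open('E:/comments/P30分詞.txt', encoding='utf-8') as fr:
    counts = Counter(fr.read().split())
# Print the 20 most frequent words and their counts
for word, freq in counts.most_common(20):
    print(word, freq)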
# In[18]:
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Generate the word cloud
with open('E:/comments/P30詞頻.txt', encoding='utf-8') as f:
    # Extract the top 50 keywords by TF-IDF weight
    data = f.read()
    keyword = jieba.analyse.extract_tags(data, topK=50, withWeight=False)
    wl = " ".join(keyword)
    # Configure the word cloud
    wc = WordCloud(
        # Background color
        background_color="white",
        # Maximum number of words shown
        max_words=2000,
        # A CJK font is required or Chinese characters render as boxes;
        # simfang.ttf ships with Windows at this path
        font_path='C:/Windows/Fonts/simfang.ttf',
        height=1200,
        width=1600,
        # Largest font size used
        max_font_size=100,
        # Number of random states, i.e. how many color schemes
        random_state=30,
    )
    myword = wc.generate(wl)  # generate the cloud
    # Display the word cloud
    plt.imshow(myword)
    plt.axis("off")
    plt.show()
    wc.to_file('E:/comments/P30.png')  # save the image
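Since exact counts already exist, the cloud can also be built straight from the frequency file with WordCloud.generate_from_frequencies, skipping the keyword-extraction step. A sketch assuming the "word, count" format written in the previous cell (the output name P30_freq.png is made up for illustration):
from wordcloud import WordCloud

freqs = {}
with open('E:/comments/P30詞頻.txt', encoding='utf-8') as f:
    for line in f:
        # Split "word, count" from the right so the word itself may contain commas
        word, _, count = line.rpartition(', ')
        if word:
            freqs[word] = int(count)
wc2 = WordCloud(background_color='white', font_path='C:/Windows/Fonts/simfang.ttf',
                width=1600, height=1200)
wc2.generate_from_frequencies(freqs)
wc2.to_file('E:/comments/P30_freq.png')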