python高频词_python几万条微博高频词分析
python幾萬條微博高頻詞分析
看到別人有做影視熱評的分析統計,覺得挺好玩的,就來試試
看看效果
(效果截圖:Screenshot_2018-05-21-11-00-42-879_com.master.wei.png)
思路
抓取想要的微博數據寫入數據庫
分詞並統計各詞彙的出現次數
過濾無意義的干擾詞
存入數據庫
寫接口,然後在 Android 端展示
代碼
數據庫連接(模塊 masterWeiBo.Utils.Sql)
import pymysql
import pymysql.cursors
import threading
class Mydb(object):
    """Hold one PyMySQL connection and a cursor that returns rows as dicts.

    Creating an instance opens the connection immediately, switches it to
    autocommit mode and creates a single cursor, exposed as ``self.cursor``.
    """

    # Class-level table name; it is not referenced inside this class.
    tableName = 'master'

    def __init__(self, host='localhost', port=3306, user='root',
                 passwd='ck123', db='weibo', charset='utf8'):
        """Connect to MySQL and prepare a ``DictCursor``.

        Every parameter defaults to the value that used to be hard-coded,
        so a plain ``Mydb()`` call behaves exactly as before.

        Args:
            host: MySQL server host name.
            port: MySQL server TCP port.
            user: Account used to log in.
            passwd: Password for ``user``.
            db: Default database (schema) for the connection.
            charset: Connection character set.
        """
        # NOTE(review): the default credentials live in source code; consider
        # supplying them from configuration or the environment instead.
        # The lock is created here but never acquired inside this class.
        self.lock = threading.Lock()
        self.client = pymysql.connect(
            host=host,
            charset=charset,
            port=port,
            user=user,
            passwd=passwd,
            db=db,
            cursorclass=pymysql.cursors.DictCursor,
        )
        # Each statement is committed as soon as it is executed.
        self.client.autocommit(True)
        self.cursor = self.client.cursor()
開始
import jieba
from masterWeiBo.Utils.Sql import Mydb as db
# Build the stop-word list
def stopwordslist(filepath):
    """Read a UTF-8 text file and return its lines as a list of stop words.

    Leading and trailing whitespace is stripped from every line. Blank lines
    are kept as empty strings, so the result has one entry per line.

    Args:
        filepath: Path of the stop-word file, one word per line.

    Returns:
        A list of stripped lines in file order.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
from collections import Counter

# DDL for the per-category summary table (executed only if it is missing).
CREATE_CATEGORY_TABLE_SQL = """CREATE TABLE IF NOT EXISTS `weibo`.`masterWeiBo_category` (
`id` INT NOT NULL AUTO_INCREMENT,
`count` INT NOT NULL DEFAULT 0,
`category` VARCHAR(100) NOT NULL,
`wordsTop10` VARCHAR(1000) NULL,
PRIMARY KEY (`id`));"""


def _format_top_words(tokens, excluded, limit=10):
    """Return the most frequent tokens as ``word,count;word,count;...``.

    Tokens contained in ``excluded`` or shorter than two characters are
    skipped. At most ``limit`` entries are returned, most frequent first.
    """
    entries = []
    for word, freq in Counter(tokens).most_common():
        # Drop stop words and single-character tokens (punctuation etc.).
        if word in excluded or len(word) < 2:
            continue
        entries.append(word + "," + str(freq))
        if len(entries) == limit:
            break
    # Joining keeps the format free of a trailing ";" even when fewer than
    # ``limit`` words qualify.
    return ";".join(entries)


database = db()
cursor = database.cursor
dicts = []
try:
    # Create the category table if it does not exist yet
    cursor.execute(CREATE_CATEGORY_TABLE_SQL)
    # Empty the category table before recomputing it
    cursor.execute("DELETE FROM weibo.masterWeiBo_category")
    # Fetch every category ("come") together with its number of posts
    cursor.execute("SELECT count(id) as countd, come FROM weibo.masterWeiBo_master GROUP BY come")
    results = cursor.fetchall()
    print(results)
    # Load the stop words; a set makes each membership test O(1)
    stopwords = stopwordslist("/root/PYServer/myFirstPYServer/words.txt")
    stopword_set = set(stopwords)
    for result in results:
        each = {}
        each['count'] = result['countd']
        each['come'] = result['come']
        print(result['countd'])
        print(result['come'])
        # Parameterized query: the driver escapes the category value, so
        # quotes inside it can neither break nor alter the statement.
        cursor.execute(
            "SELECT content from weibo.masterWeiBo_master where come= %s",
            (result['come'],),
        )
        contents = cursor.fetchall()
        # Concatenate all posts of this category, skipping empty contents
        articals = ",".join(row['content'] for row in contents if row['content'])
        # Segment with jieba, then count and format the ten most frequent words
        each['wordsTop10'] = _format_top_words(jieba.cut(articals), stopword_set)
        dicts.append(each)
    # Write the per-category summaries to the database
    insert_sql = (
        "INSERT INTO weibo.masterWeiBo_category (count,category,wordsTop10) "
        "values (%s, %s, %s)"
    )
    for value in dicts:
        params = (value['count'], value['come'], value['wordsTop10'])
        print(insert_sql, params)
        cursor.execute(insert_sql, params)
finally:
    # Release the cursor and the connection even if a step above failed.
    cursor.close()
    database.client.close()
print(dicts)
大功告成
總結
以上是生活随笔為你收集整理的python高频词_python几万条微博高频词分析的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: php mysql循环语句怎么写_mys
- 下一篇: oracle11gR版本GI中新增,Or