舆情分析项目-重庆公交坠江原因
輿情分析項目
1、分析事件:重慶公交墜江原因
2、分析對象:
(1)網友評論(初級分類-分詞匹配;高級分類-自然語言識別,映射人類情感和意圖,比如:積極、消極、無奈、諷刺、建設、謾罵、理性分析、事后、和事佬等)
(2)評論者的公網IP(依據公網IP識別不同地域的網絡用戶,對本次事件的關注度)
(3)評論者的省份屬性(同上)
3、數據來源:
新浪評論:http://comment5.news.sina.com.cn/comment/skin/default.html?channel=gn&newsid=comos-hnfikve6671738&group=0
4、其他:
準備數據:(直接用:中國省份數據庫,世界國家名稱數據庫)參考本人博客
(1)中國的行政區劃數據,包括全國的省、市、縣(參考csdn、民政部官網)
(2)世界的國家數據(參考csdn)
(一)輿情分析項目之數據準備:采集評論數據
1、采集字段
三個字段:評論、IP、省份
其他字段:收到點贊數等等
2、Python實現數據采集
文件結構
?
(1)python主代碼
01-busremark.py中
"""Collect Sina news comments about the Chongqing bus incident into MySQL.

The comment API is requested one comment per page for pages 1-6000.  Each
response is a JavaScript assignment wrapping a JSON payload; the payload is
parsed and every comment in it is inserted into the ``all_remarks`` table.
A page that fails for any reason is logged and skipped.
"""
import json
import time as timeimport

import pymysql
import requests

from mylog import Logger

# Project logger from mylog.py.
logger1 = Logger(logfile='log1.log', logname="log1", logformat=1).getlog()

# The API wraps its JSON payload as "var <jsvar>=<json>".
JSVAR_NAME = "loader_1541133929419_28637561"
JSVAR_PREFIX = "var " + JSVAR_NAME + "="
URL_TEMPLATE = (
    "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn"
    "&newsid=comos-hnfikve6671738&group=0&compress=0&ie=utf8&oe=utf8"
    "&page={page}&page_size=1&jsvar=" + JSVAR_NAME
)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
INSERT_SQL = (
    "insert into all_remarks(nick, content, agree, area, ip, time, profile_img) "
    "values(%s,%s,%s,%s,%s,%s,%s)"
)
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks the run forever

# Connect to the database and get a cursor.
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='analyze',
    charset='utf8',
)
cursor = connect.cursor()

try:
    for page_num in range(1, 6001):  # pages 1..6000, one comment per page
        if page_num % 50 == 0:  # pause 2 seconds after every 50 requests
            timeimport.sleep(2)
        url = URL_TEMPLATE.format(page=page_num)
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # Decode as plain UTF-8 and let json.loads resolve the \uXXXX
            # escapes.  Decoding with 'unicode_escape' first also rewrites
            # \" and \n inside string values, which corrupts the JSON.
            data_str = response.content.decode('utf-8').lstrip()
            # Remove the JavaScript wrapper as a *prefix*; str.lstrip(chars)
            # would strip any leading run of those characters instead.
            if data_str.startswith(JSVAR_PREFIX):
                data_str = data_str[len(JSVAR_PREFIX):]
            data_dict = json.loads(data_str)
            print(type(data_dict))

            # All comments contained in this response.
            all_remarks = data_dict['result']['cmntlist']
            print(len(all_remarks))
            for i, c in enumerate(all_remarks, start=1):
                print(i, "*" * 100)
                nick = c["nick"]                # nickname
                content = c["content"]          # comment text
                agree = int(c["agree"])         # number of likes received
                area = c["area"]                # region
                ip = c["ip"]                    # source IP
                publish_time = c["time"]        # time the comment was posted
                profile_img = c["profile_img"]  # avatar
                print(nick)
                print(content)
                print(agree)
                print(ip)
                print(publish_time)
                print(profile_img)

                row = (nick, content, agree, area, ip, publish_time, profile_img)
                cursor.execute(INSERT_SQL, row)
                connect.commit()  # make the insert permanent
        except Exception as e:
            # Best-effort collection: record the failing URL and move on.
            logger1.warning("%s ==> %s", e, url)
finally:
    # Release the database resources even if the run is interrupted.
    cursor.close()
    connect.close()
?
(2)python日志
mylog.py中
"""A small logging helper that writes every record to the console and to a file."""
import logging
import os

# Output formats selectable through the ``logformat`` argument of Logger.
format_dict = {
    1: logging.Formatter('%(asctime)s - %(name)s - %(filename)s - %(levelname)s - %(message)s'),
    2: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
    3: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
    4: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
    5: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
}


def _has_file_handler(logger, path):
    """Return True if *logger* already writes to the file at absolute *path*."""
    return any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == path
        for handler in logger.handlers
    )


def _has_console_handler(logger):
    """Return True if *logger* already has a plain (non-file) stream handler."""
    return any(
        isinstance(handler, logging.StreamHandler)
        and not isinstance(handler, logging.FileHandler)
        for handler in logger.handlers
    )


class Logger():
    """Configure a named ``logging`` logger with a file and a console handler."""

    def __init__(self, logfile, logname, logformat):
        """Set up the logger called *logname*.

        Args:
            logfile: Path of the file that log records are appended to.
            logname: Name passed to ``logging.getLogger``.
            logformat: Key into ``format_dict`` (1-5, int or numeric string).

        Raises:
            KeyError: If *logformat* is not a key of ``format_dict``.

        ``logging.getLogger`` returns the same object for the same name, so
        handlers are only attached when an equivalent one is not already
        present.  Otherwise each additional ``Logger(...)`` with the same
        name would make every record appear one more time.  Handlers that
        already exist keep the formatter they were created with.
        """
        self.logger = logging.getLogger(logname)
        self.logger.setLevel(logging.DEBUG)

        formatter = format_dict[int(logformat)]

        # File handler; FileHandler stores its target as an absolute path.
        if not _has_file_handler(self.logger, os.path.abspath(logfile)):
            # UTF-8 so non-ASCII messages do not depend on the platform's
            # default encoding.
            fh = logging.FileHandler(logfile, encoding='utf-8')
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

        # Console handler.
        if not _has_console_handler(self.logger):
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

    def getlog(self):
        """Return the configured ``logging.Logger`` object."""
        return self.logger


if __name__ == '__main__':
    logger1 = Logger(logfile='log1.txt', logname="fox1", logformat=1).getlog()
    logger1.debug('i am debug')
    logger1.info('i am info')
    logger1.warning('i am warning')

    logger2 = Logger(logfile='log2.txt', logname="fox2", logformat=2).getlog()
    logger2.debug('i am debug2')
    logger2.info('i am info2')
    logger2.warning('i am warning2')
3、sql建表語句
?
/*
Navicat MySQL Data Transfer

Source Server         : win7_local
Source Server Version : 50717
Source Host           : localhost:3306
Source Database       : analyze

Target Server Type    : MYSQL
Target Server Version : 50717
File Encoding         : 65001

Date: 2018-11-06 19:33:57
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for all_remarks
-- One row per collected comment.
-- ----------------------------
DROP TABLE IF EXISTS `all_remarks`;
CREATE TABLE `all_remarks` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `nick` varchar(255) DEFAULT NULL,            -- commenter nickname
  `content` text,                              -- comment text
  `agree` int(10) DEFAULT NULL,                -- number of likes received
  `area` varchar(100) DEFAULT NULL,            -- region reported with the comment
  `ip` varchar(20) DEFAULT NULL,               -- source IP
  -- Publish time of the comment. No ON UPDATE CURRENT_TIMESTAMP here:
  -- with it, any later UPDATE of the row (for example filling in
  -- province_brief) would replace the publish time with the update time.
  `time` datetime DEFAULT NULL,
  `profile_img` varchar(255) DEFAULT NULL,     -- avatar URL
  `province_brief` varchar(20) DEFAULT NULL,   -- province (or country) label, filled in after collection
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
?
?
4、效果截圖
?
?
中途又添加了字段
?
?
# 02-mysql_to_province_country.py
"""Label every collected comment with the province (or country) found in its area text.

For each row of ``all_remarks`` whose ``province_brief`` is still empty, the
``area`` text is searched for a province abbreviation from ``tb_provinces``;
if none occurs, it is searched for a country name from ``tb_countries``.
The first match is written to ``province_brief``.
"""
import pymysql

# Connect to the database and get a cursor.
connect = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='analyze',
    charset='utf8',
)
cursor = connect.cursor()

UPDATE_SQL = "update all_remarks set province_brief=%s where id=%s"


def _first_contained(candidates, text):
    """Return the first item of *candidates* that occurs in *text*, or None."""
    return next((item for item in candidates if item in text), None)


def _save_label(remark_id, label):
    """Store *label* as the province_brief of the row with id *remark_id*."""
    cursor.execute(UPDATE_SQL, (label, remark_id))
    connect.commit()  # make the update permanent


try:
    # Two-character abbreviations of all provinces / municipalities.
    cursor.execute("select brief from tb_provinces")
    pro_list = [record[0] for record in cursor.fetchall()]
    print(pro_list)

    # All country names.
    cursor.execute("select country from tb_countries")
    countries_list = [record[0] for record in cursor.fetchall()]
    print(countries_list)
    print(len(countries_list))

    # Fetch every unlabelled row once and visit each exactly one time.
    # Re-selecting "the first unlabelled row" on every iteration would keep
    # returning the same row whenever it matches nothing, so one unmatched
    # row would stall the whole run.
    cursor.execute(
        "select id,area,province_brief from all_remarks "
        "where province_brief is null or province_brief=''"
    )
    pending_rows = cursor.fetchall()

    for row in pending_rows:
        print(row)
        remark_id = row[0]
        area = row[1]
        location = area or ""  # a NULL area simply matches nothing

        province = _first_contained(pro_list, location)
        if province is not None:
            _save_label(remark_id, province)
            continue

        # No province matched: fall back to the country list.
        print("id=%s,不屬于任何省份" % remark_id)
        print("開始判斷屬于哪個國家")
        country = _first_contained(countries_list, location)
        if country is not None:
            _save_label(remark_id, country)
            print("id=%s ,屬于 %s" % (remark_id, country))
        else:
            print("位置異常,沒有匹配到任何省份和國家:%s" % area)
finally:
    # Close the cursor and the connection even if a query fails.
    cursor.close()
    connect.close()
03-matplotlib_provinc_count.py
"""Draw a bar chart of the number of comments per province.

Reads (province_brief, count_id) pairs from ``stst_count_province``, orders
them by count from highest to lowest and plots one bar per province.
"""
import pymysql
import matplotlib.pyplot as plt
import matplotlib  # full matplotlib package, needed for rcParams

# Font family (default is sans-serif) and integer font size (default is 10).
matplotlib.rcParams['font.family'] = 'Microsoft Yahei'
matplotlib.rcParams['font.size'] = 18

# Open the database connection and a cursor.
connect = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='analyze',
    charset='utf8',
)
cursor = connect.cursor()

# Load the per-province counts.
query = "select province_brief,count_id from stst_count_province"
row_count = cursor.execute(query)
fetched = cursor.fetchall()

# Order by count_id, largest first.
ranked = list(fetched)
ranked.sort(key=lambda record: record[1], reverse=True)
print(ranked)

# The data is in memory now; release the cursor and the connection.
cursor.close()
connect.close()

# Split the rows into one list of labels and one list of bar heights.
province_names = [name for name, _ in ranked]
comment_counts = [count for _, count in ranked]
print(province_names)
print(comment_counts)

# Bars sit at positions 0..n-1.
positions = list(range(len(province_names)))

plt.figure(figsize=(20, 10), dpi=80)
plt.bar(positions, comment_counts, width=0.5, color='r')

# X tick labels are "<rank> <province>".
tick_labels = [
    str(rank) + " " + name
    for rank, name in enumerate(province_names, start=1)
]
plt.xticks(positions, tick_labels, rotation=40, fontsize=12)

# Grid lines.
plt.grid()

# Title and axis labels.
plt.title("中國各個省份對《重慶公交墜江事件》關注度統計 數據來源:sina")
plt.xlabel("省/直轄市/特別行政區", color='b')
plt.ylabel("評論數", color='black')

plt.show()
?
最終效果:
轉載于:https://www.cnblogs.com/andy9468/p/9897391.html
總結
以上是生活随笔為你收集整理的舆情分析项目-重庆公交坠江原因的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【贪心】小Y的炮[cannon]题解
- 下一篇: 织梦正则批量替换文章内容内链变成绝对路径