Sentiment Analysis of Hotel Reviews in Python
I. Sentiment Analysis
Sentiment polarity analysis, i.e. sentiment classification, analyzes and categorizes text that carries subjective emotional color. There are two main families of methods: lexicon-based (sentiment-knowledge) methods and machine-learning methods.
Lexicon-based methods compute the polarity of a text (positive or negative) from existing sentiment dictionaries: they count the positive and negative sentiment words that appear in the text, or sum their sentiment scores, and decide the sentiment class from that. A toy sketch of this idea follows.
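To make the idea concrete, here is a minimal, hedged sketch of lexicon-based scoring. The tiny word lists are made up purely for illustration and are not a real sentiment dictionary.

# Toy lexicon-based polarity scoring (illustrative word lists only)
pos_words = {'干净', '方便', '舒适', '热情'}   # hypothetical positive sentiment words
neg_words = {'脏', '吵', '差', '失望'}         # hypothetical negative sentiment words

def lexicon_polarity(words):
    # words: a list of segmented tokens; score > 0 means positive, < 0 negative
    score = sum(w in pos_words for w in words) - sum(w in neg_words for w in words)
    return 'pos' if score > 0 else 'neg' if score < 0 else 'neutral'

print(lexicon_polarity(['房间', '干净', '服务', '热情']))  # -> 'pos'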
Machine-learning methods train a classification model on a dataset labeled with sentiment classes, then use that model to predict the sentiment class of new text.
This article takes the machine-learning approach to classify the sentiment of hotel review data, working step by step through a practical Chinese sentiment polarity analysis.
The process is described in detail below:
1) Data download
a) Stop words:
This article uses the Chinese stop word list released by the Chinese NLP open platform of the Institute of Computing Technology, Chinese Academy of Sciences, which contains more than 1,200 stop words. Download address:
b) Positive/negative corpus:
The text comes from
I have also uploaded the dataset to Baidu Wenku:
c) Unzipping the data:
After downloading the data above, create a folder named 情感分析 (sentiment analysis) on the desktop, enter it and create a data folder inside, unzip the archive into data, and put stopWord.txt alongside the data folder (i.e., directly inside the 情感分析 folder).
In the 情感分析 folder, hold Shift and right-click, choose to open a command window here, run jupyter notebook, and create a new notebook for the hotel review sentiment analysis:
2) Data preprocessing
a) Preprocessing the positive and negative corpora
To simplify later steps, the positive and negative reviews are each consolidated into a single txt file: 2000_pos.txt for the positive corpus and 2000_neg.txt for the negative corpus. Pay attention to the encoding and errors parameters here, otherwise decoding errors will occur.
The 2000 in the file names indicates that the positive and negative corpora together contain 2,000 reviews.
import logging
import os
import sys
import codecs

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

def getContent(fullname):
    # Read one review file; the corpus is GBK-encoded, so ignore undecodable bytes
    f = codecs.open(fullname, 'r', encoding="gbk", errors="ignore")
    lines = []
    for eachline in f:
        eachline = eachline.strip()
        if eachline:  # skip the many empty lines
            lines.append(eachline)
    f.close()
    return lines

inp = 'data/ChnSentiCorp_htl_ba_2000'
folders = ['neg', 'pos']
for foldername in folders:
    logger.info('running ' + foldername + ' files.')
    outp = '2000_' + foldername + '.txt'  # output file
    # write in GBK so the segmentation step below (which reads GBK) can decode it
    output = codecs.open(os.path.join(inp, outp), 'w', encoding='gbk')
    i = 0
    rootdir = os.path.join(inp, foldername)
    for each_txt in os.listdir(rootdir):
        contents = getContent(os.path.join(rootdir, each_txt))
        i = i + 1
        output.write(''.join(contents) + '\n')
    output.close()  # must be called, not just referenced
    logger.info("Saved " + str(i) + " files.")
Now let's look at the merged files (2000_pos.txt and 2000_neg.txt); a quick way to peek at them is sketched below.
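As a quick sanity check (not part of the original pipeline), something like the following can print the first few merged reviews. The path assumes the directory layout described above.

import codecs

with codecs.open('data/ChnSentiCorp_htl_ba_2000/2000_pos.txt', 'r',
                 encoding='gbk', errors='ignore') as f:
    for _, line in zip(range(3), f):   # show the first 3 positive reviews
        print(line.strip()[:80])       # truncate long reviews for display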
b) Chinese word segmentation and stop word removal
Jieba is used to segment the positive and negative corpora. Before segmentation, digits, English letters, and special symbols are removed from the text; Python's built-in re module (regular expressions) handles this, and the string module can help with plain string operations, though the code below only needs re.
The concrete implementation is shown below:
import jieba
import os
import codecs
import re

def prepareData(sourceFile, targetFile):
    f = codecs.open(sourceFile, 'r', encoding='gbk')
    target = codecs.open(targetFile, 'w', encoding='gbk')
    print('open source file: ' + sourceFile)
    print('open target file: ' + targetFile)
    lineNum = 0
    for eachline in f:
        lineNum += 1
        print('---processing ', sourceFile, lineNum, ' article---')
        eachline = clearTxt(eachline)
        seg_line = sent2word(eachline)
        target.write(seg_line + '\n')
    print('---Well Done!!!---' * 4)
    f.close()
    target.close()

# Text cleaning
def clearTxt(line):
    if line != '':
        line = line.strip()
        # remove English letters and digits
        line = re.sub("[a-zA-Z0-9]", "", line)
        # remove Chinese and English punctuation
        line = re.sub("[\s+\.\!\/_,$%^*(+\"\';:“”.]+|[+——!,。??、~@#¥%……&*()]+", "", line)
        return line
    else:
        return 'Empty Line'

# Segment the text and remove stop words
def sent2word(line):
    segList = jieba.cut(line, cut_all=False)
    segSentence = ''
    for word in segList:
        if word != '\t' and (word not in stopwords):
            segSentence += (word + " ")
    return segSentence.strip()

inp = 'data/ChnSentiCorp_htl_ba_2000'
stopwords = [w.strip() for w in codecs.open('stopWord.txt', 'r', encoding='utf-8')]
folders = ['neg', 'pos']
for folder in folders:
    sourceFile = '2000_{}.txt'.format(folder)
    targetFile = '2000_{}_cut.txt'.format(folder)
    prepareData(os.path.join(inp, sourceFile), os.path.join(inp, targetFile))
Now let's look at the segmentation results (2000_pos_cut.txt and 2000_neg_cut.txt):
c) Obtaining feature word vectors
The steps above produce feature-word text for the positive and negative corpora, but the model input must be numeric, so each segmented sentence has to be converted into a numeric vector. Common conversion methods include Bag of Words (BOW), TF-IDF, and Word2Vec; a small BOW/TF-IDF sketch is given below for comparison.
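For comparison only (this route is not used in the article), a minimal sketch of BOW/TF-IDF with scikit-learn might look like this. It assumes, as in the steps above, that each line of 2000_pos_cut.txt is an already-segmented, space-separated review.

import codecs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

with codecs.open('data/ChnSentiCorp_htl_ba_2000/2000_pos_cut.txt', 'r', encoding='gbk') as f:
    docs = [line.strip() for line in f if line.strip()]

bow = CountVectorizer(token_pattern=r'(?u)\S+').fit_transform(docs)     # word-count matrix
tfidf = TfidfVectorizer(token_pattern=r'(?u)\S+').fit_transform(docs)   # TF-IDF-weighted matrix
print(bow.shape, tfidf.shape)  # rows = reviews, columns = vocabulary size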
This article uses the Word2Vec word-vector model to convert the corpus into word vectors.
Since feature word vectors are extracted from an already trained word-vector model, and the wiki Chinese corpus is a widely recognized large Chinese corpus, this article extracts the feature word vectors of its own corpus from word vectors trained on the wiki Chinese corpus.
Training the Word2Vec model on the wiki Chinese corpus was covered in an earlier article; here the feature word vectors are extracted from the resulting wiki.zh.text.vector file and used as the model input.
The main steps for obtaining the feature word vectors are:
1) load the model's word-vector matrix;
2) for each word in a sentence, look up its numeric vector in the model's word-vector matrix; each sentence thus yields a two-dimensional matrix whose rows are the words and whose columns are the model's configured dimensions;
3) take the mean of that matrix as the sentence's feature vector;
4) after all sentences are processed, append the value representing each sentence's class and write everything to a CSV file.
import os
import sys
import logging
import gensim
import codecs
import numpy as np
import pandas as pd

def getWordVecs(wordList, model):
    # Look up the vector for each word; skip words not in the model's vocabulary
    vecs = []
    for word in wordList:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word])
        except KeyError:
            continue
    return np.array(vecs, dtype='float')

def buildVecs(filename, model):
    fileVecs = []
    with codecs.open(filename, 'r', encoding='gbk') as contents:
        for line in contents:
            logger.info('Start line: ' + line)
            wordList = line.strip().split(' ')   # strip the newline, then split into words
            vecs = getWordVecs(wordList, model)  # one row per word, one column per dimension
            if len(vecs) > 0:
                vecsArray = sum(np.array(vecs)) / len(vecs)  # column-wise mean = sentence vector
                fileVecs.append(vecsArray)
    return fileVecs

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

inp = 'data/ChnSentiCorp_htl_ba_2000/wiki.zh.text.vector'
model = gensim.models.KeyedVectors.load_word2vec_format(inp, binary=False)
posInput = buildVecs('data/ChnSentiCorp_htl_ba_2000/2000_pos_cut.txt', model)
negInput = buildVecs('data/ChnSentiCorp_htl_ba_2000/2000_neg_cut.txt', model)

# Labels: 1 for positive, 0 for negative
Y = np.concatenate((np.ones(len(posInput)), np.zeros(len(negInput))))
# np.concatenate could also be used to merge posInput and negInput
X = posInput[:]
for neg in negInput:
    X.append(neg)
X = np.array(X)

df_x = pd.DataFrame(X)
df_y = pd.DataFrame(Y)
data = pd.concat([df_y, df_x], axis=1)
data.to_csv('data/ChnSentiCorp_htl_ba_2000/2000_data.csv')
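An optional quick check of the saved file can confirm its shape: roughly 2,000 rows (minus any reviews whose words were all out of vocabulary) and, after the saved index column, one label column plus 400 feature columns.

import pandas as pd

check = pd.read_csv('data/ChnSentiCorp_htl_ba_2000/2000_data.csv')
print(check.shape)   # expected roughly (2000, 402): saved index, label, then 400 dimensions
print(check.head())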
d) Dimensionality reduction
The Word2Vec model was trained with 400 dimensions, so each word vector is 400-dimensional; this article uses PCA to reduce the dimensionality. The code below first examines how many dimensions are actually needed:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn import metrics

df = pd.read_csv('data/ChnSentiCorp_htl_ba_2000/2000_data.csv')
y = df.iloc[:, 1]   # column 0 is the saved index, column 1 is the label
x = df.iloc[:, 2:]  # columns 2 onward are the 400-dimensional word vectors

# Fit PCA with all 400 components to inspect the explained variance
n_components = 400
pca = PCA(n_components=n_components)
pca.fit(x)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
plt.show()
Example output (the explained-variance plot, plus the first 5 rows of df):
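The plot only shows per-component variance. If it helps, a hedged way to pick the number of components is to look at the cumulative explained-variance ratio, as sketched below (reusing the pca object fitted above); the 100-dimension choice in the next step is simply the original reading of the plot.

import numpy as np

cum_ratio = np.cumsum(pca.explained_variance_ratio_)
# number of components needed to keep, e.g., 95% of the variance
print(int(np.argmax(cum_ratio >= 0.95)) + 1)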
e) Building the classification model
A support vector machine (SVM) is a supervised machine-learning model. This article first uses the classic SVM algorithm as the classifier and validates it by computing prediction accuracy and the ROC curve; in general, the larger the area under the ROC curve (AUC), the better the model.
# Keep 100 dimensions, based on the explained-variance plot above
import warnings
warnings.filterwarnings('ignore')
x_pca = PCA(n_components=100).fit_transform(x)

# SVM (RBF kernel) on the 100-dimensional data
clf = svm.SVC(C=2, probability=True)
clf.fit(x_pca, y)
print('Test Accuracy: %.2f' % clf.score(x_pca, y))  # note: scored on the same data used for fitting

# Create ROC curve
pred_probas = clf.predict_proba(x_pca)[:, 1]  # probability of the positive class
fpr, tpr, _ = metrics.roc_curve(y, pred_probas)
roc_auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
Running the code gives Test Accuracy: 0.88, i.e., 88% accuracy; note that because no train/test split has been made at this point, this score is computed on the same data the model was fitted on. The ROC curve is shown below:
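For a less optimistic estimate (optional, not part of the original workflow), a quick cross-validation sketch over the same 100-dimensional features might look like this:

from sklearn.model_selection import cross_val_score
from sklearn import svm

# 5-fold cross-validated accuracy of the same RBF SVM on the PCA features
scores = cross_val_score(svm.SVC(C=2), x_pca, y, cv=5)
print(scores.mean(), scores.std())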
II. Model Building and Training
The SVM model above did not split the dataset into training and test sets. Below we split the data with test_size set to 0.25. First, a rough guide to interpreting AUC values:
General guidelines for AUC:
0.5 - 0.7: weak, though already quite good for something like predicting stocks
0.7 - 0.85: fair
0.85 - 0.95: good
0.95 - 1: excellent, but usually too good to be true
a) Logistic regression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

def show_roc(model, X, y):
    # Create ROC curve from predicted probabilities
    pred_probas = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y, pred_probas)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()

X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.25, random_state=0)

# Grid search over C and the penalty type; the liblinear solver supports both l1 and l2
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_, grid_search.best_score_)

# Refit with the best parameters and predict the held-out test set
LR = LogisticRegression(solver='liblinear', C=grid_search.best_params_['C'],
                        penalty=grid_search.best_params_['penalty'])
LR.fit(X_train, y_train)
lr_y_predict = LR.predict(X_test)
print(accuracy_score(y_test, lr_y_predict))
print('Classification report for LR:\n')
print(classification_report(y_test, lr_y_predict))
print("AUC:", roc_auc_score(y_test, lr_y_predict))
print('Show The Roc Curve:')
show_roc(LR, X_test, y_test)
Example output:
Here a grid search was used to tune the parameters. One small puzzle: why does the AUC from roc_auc_score differ from the area shown on the plot? Most likely because roc_auc_score above is fed the hard 0/1 predictions from LR.predict, whereas the plotted curve uses the probabilities from predict_proba; AUC computed from hard labels is generally lower than AUC computed from scores.
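A quick way to confirm this, reusing LR, X_test, and y_test from above:

from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, LR.predict(X_test)))              # AUC from hard 0/1 predictions
print(roc_auc_score(y_test, LR.predict_proba(X_test)[:, 1]))  # AUC from probabilities (matches the plot)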
b) XGBoost
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

def modelfit(alg, dtrain_x, dtrain_y, useTrainCV=True, cv_flods=5, early_stopping_rounds=50):
    """
    :param alg: the initial model
    :param dtrain_x: training data X
    :param dtrain_y: training labels y
    :param useTrainCV: whether to use xgb.cv to determine the best n_estimators
    :param cv_flods: number of cross-validation folds
    :param early_stopping_rounds: stop if eval_metric has not improved for this many rounds
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain_x, dtrain_y)
        print(alg.get_params()['n_estimators'])
        cv_result = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                           nfold=cv_flods, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        print('useTrainCV\n', cv_result)
        print('Total estimators:', cv_result.shape[0])
        alg.set_params(n_estimators=cv_result.shape[0])
    # train on the full training data
    alg.fit(dtrain_x, dtrain_y, eval_metric='auc')
    # predict the training data
    train_y_pre = alg.predict(dtrain_x)
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(dtrain_y, train_y_pre))
    return cv_result.shape[0]
# XGBoost parameter tuning
def xgboost_change_param(train_X, train_y):
    print('###### XGBoost parameter tuning ######')

    print('\n step1: fix the learning rate and find the best n_estimators')
    xgb1 = XGBClassifier(learning_rate=0.1, booster='gbtree', n_estimators=1000,
                         max_depth=4, min_child_weight=1,
                         gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective='binary:logistic', scale_pos_weight=1, seed=10)
    # with useTrainCV=True the best n_estimators came out as 23 at learning_rate=0.1
    NEstimators = modelfit(xgb1, train_X, train_y, early_stopping_rounds=45)

    print('\n step2: tune min_child_weight and max_depth')
    param_test1 = {'max_depth': range(3, 8, 1), 'min_child_weight': range(1, 6, 2)}
    # the estimator passed to GridSearchCV carries every parameter except the ones being searched;
    # each estimator needs a scoring parameter or a score method
    gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=NEstimators,
                                                    gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                    objective='binary:logistic', scale_pos_weight=1, seed=10),
                            param_grid=param_test1, scoring='roc_auc', cv=5)
    gsearch1.fit(train_X, train_y)
    # best max_depth=4, min_child_weight=1
    print(gsearch1.best_params_, gsearch1.best_score_)
    MCW = gsearch1.best_params_['min_child_weight']
    MD = gsearch1.best_params_['max_depth']

    print('\n step3: tune gamma')
    param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}
    gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=NEstimators,
                                                    max_depth=MD, min_child_weight=MCW,
                                                    subsample=0.8, colsample_bytree=0.8,
                                                    objective='binary:logistic', scale_pos_weight=1, seed=10),
                            param_grid=param_test2, scoring='roc_auc', cv=5)
    gsearch2.fit(train_X, train_y)
    # best gamma = 0.0
    print(gsearch2.best_params_, gsearch2.best_score_)
    GA = gsearch2.best_params_['gamma']

    print('\n step4: tune subsample and colsample_bytree')
    param_test3 = {'subsample': [i / 10.0 for i in range(6, 10)],
                   'colsample_bytree': [i / 10.0 for i in range(6, 10)]}
    gsearch3 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=NEstimators,
                                                    max_depth=MD, min_child_weight=MCW, gamma=GA,
                                                    objective='binary:logistic', scale_pos_weight=1, seed=10),
                            param_grid=param_test3, scoring='roc_auc', cv=5)
    gsearch3.fit(train_X, train_y)
    # best subsample=0.8, colsample_bytree=0.8
    print(gsearch3.best_params_, gsearch3.best_score_)
    SS = gsearch3.best_params_['subsample']
    CB = gsearch3.best_params_['colsample_bytree']

    print('\n step5: tune the regularization parameters')
    param_test4 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch4 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=NEstimators,
                                                    max_depth=MD, min_child_weight=MCW, gamma=GA,
                                                    subsample=SS, colsample_bytree=CB,
                                                    objective='binary:logistic',
                                                    scale_pos_weight=1, seed=10),
                            param_grid=param_test4, scoring='roc_auc', cv=5)
    gsearch4.fit(train_X, train_y)
    # best reg_alpha = 1e-5
    print(gsearch4.best_params_, gsearch4.best_score_)
    RA = gsearch4.best_params_['reg_alpha']

    param_test5 = {'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch5 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=NEstimators,
                                                    max_depth=MD, min_child_weight=MCW, gamma=GA,
                                                    subsample=SS, colsample_bytree=CB,
                                                    objective='binary:logistic', reg_alpha=RA,
                                                    scale_pos_weight=1, seed=10),
                            param_grid=param_test5, scoring='roc_auc', cv=5)
    gsearch5.fit(train_X, train_y)
    # best reg_lambda = 1
    print(gsearch5.best_params_, gsearch5.best_score_)
    RL = gsearch5.best_params_['reg_lambda']

    return NEstimators, MD, MCW, GA, SS, CB, RA, RL
# Run the XGBoost tuning
X_train = np.array(X_train)
# returns the best parameters
NEstimators, MD, MCW, GA, SS, CB, RA, RL = xgboost_change_param(X_train, y_train)

# final parameters
print('\nNow we use the best params to fit and predict:\n')
print('n_estimators = ', NEstimators)
print('max_depth = ', MD)
print('min_child_weight = ', MCW)
print('gamma = ', GA)
print('subsample = ', SS)
print('colsample_bytree = ', CB)
print('reg_alpha = ', RA)
print('reg_lambda = ', RL)

xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=NEstimators, max_depth=MD, min_child_weight=MCW,
                     gamma=GA, subsample=SS, colsample_bytree=CB, objective='binary:logistic',
                     reg_alpha=RA, reg_lambda=RL, scale_pos_weight=1, seed=10)
xgb1.fit(X_train, y_train)
xgb_test_pred = xgb1.predict(np.array(X_test))
print("The xgboost model Accuracy : %.4g" % accuracy_score(y_pred=xgb_test_pred, y_true=y_test))
print('Classification report for XGBoost:\n')
print("AUC:", roc_auc_score(y_test, xgb_test_pred))
print(classification_report(y_test, xgb_test_pred))
print('Show The Roc Curve:')
show_roc(xgb1, X_test, y_test)
Example output:
Judging from these results, XGBoost performs somewhat better than logistic regression. Readers who are interested can also try a neural network for the sentiment analysis; a minimal sketch is given below.
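As a starting point only (not from the original article), a small feed-forward network on the same PCA features could be sketched with scikit-learn's MLPClassifier; X_train, X_test, y_train, and y_test are assumed to be the split created earlier.

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# A small two-hidden-layer network on the 100-dimensional PCA features
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=0)
mlp.fit(X_train, y_train)
print(accuracy_score(y_test, mlp.predict(X_test)))
print(roc_auc_score(y_test, mlp.predict_proba(X_test)[:, 1]))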