Movie Review Sentiment Analysis on the IMDB Dataset | Naive Bayes and Neural Network Models in Python
Import the packages
import torch  # torch==1.7.1
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

MAX_WORD = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 300     # pad/truncate every sentence to 300 tokens
word_count = {}   # dictionary of word -> number of occurrences, used to build the vocabulary

Data processing
# Read the dataset
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('./data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
print('dataset shape is', data.shape)
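A quick way to confirm the file loaded as expected (my own check, not part of the original post): the Kaggle labeledTrainData.tsv file should contain an id column, a 0/1 sentiment label, and the review text.

print(data.columns.tolist())             # expected: ['id', 'sentiment', 'review']
print(data['sentiment'].value_counts())  # the labels should be roughly balanced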
# Data cleaning

# Strip the HTML markup
from bs4 import BeautifulSoup
example = BeautifulSoup(data['review'][0])
print(example.get_text())

# Remove non-letter characters
import re
letters_only = re.sub('[^A-Za-z]', ' ', example.get_text())
print(letters_only)

# Lowercase the text and split it into words
lower_case = letters_only.lower()
words = lower_case.split()
print(words)

# Get the stopwords
# import nltk
# nltk.download('stopwords')

def get_custom_stopwords(stop_words_file):
    with open(stop_words_file, encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

stop_words_file = 'english.txt'
stopwords = get_custom_stopwords(stop_words_file)
words = [word for word in words if word not in stopwords]
' '.join(words)

# Wrap the steps into a single cleaning function
from bs4 import BeautifulSoup
# regular-expression toolkit (already imported above)
# import re
# from nltk.corpus import stopwords

# review_to_text performs the three preprocessing tasks on a raw review
def review_to_text(review):
    # Task 1: remove the HTML markup
    raw_text = BeautifulSoup(review, 'html').get_text()
    # Task 2: replace non-letter characters with spaces: re.sub(pattern, replacement, string)
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Task 3: lowercase the text, then split it on whitespace into a list of words
    words = letters.lower().split()
    return words
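As a quick sanity check (an example of mine, not from the original), the function strips markup and punctuation and returns lowercase tokens:

sample = "<b>Great</b> movie!! 10/10 would watch again."
print(review_to_text(sample))
# expected output: ['great', 'movie', 'would', 'watch', 'again']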
# Apply the three preprocessing steps above to all reviews and collect the labels

X_data = []
y_data = []
for review in data['review']:
    X_data.append(' '.join(review_to_text(review)))
for sentiment in data['sentiment']:
    y_data.append(sentiment)
# y_data = data['sentiment']
# print(X_data, y_data)

# Split the dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=45)

A classical machine learning approach: Naive Bayes
# Vectorize the text and train a Naive Bayes classifier on it
from sklearn.feature_extraction.text import CountVectorizer

# max_features=5000: keep only the 5,000 most frequent words as the feature dimensions
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
train_data_features = vectorizer.fit_transform(X_train)
t_data_features = vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_data_features, y_train)
print(nb.score(train_data_features, y_train))
print(nb.score(t_data_features, y_test))

# Prediction on a new review
# pre_str = "Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty"
# pre_str_list = [(' '.join(review_to_text(pre_str)))]
# pre_data = vectorizer.transform(pd.Series(pre_str_list))
# result = nb.predict(pre_data)
# print(result)

The Naive Bayes model reaches roughly 0.86 accuracy on the training set and 0.85 on the test set:

# output
0.86145
0.8498
[1]
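To get a feel for what the Naive Bayes model has learned, one optional check (my own sketch, using standard scikit-learn attributes; use get_feature_names() instead on older scikit-learn versions) is to list the words whose per-class log-probabilities differ most:

import numpy as np
feature_names = np.array(vectorizer.get_feature_names_out())
log_odds = nb.feature_log_prob_[1] - nb.feature_log_prob_[0]  # > 0 leans positive, < 0 leans negative
print('most positive words:', feature_names[np.argsort(log_odds)[-10:][::-1]])
print('most negative words:', feature_names[np.argsort(log_odds)[:10]])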
利用神經(jīng)模型LSTM/GRU進行數(shù)據(jù)學習、分類、預測
# Write the cleaned training and test data to new train.txt and test.txt files so they can be read through a Dataset
with open("train.txt","w",encoding="utf-8") as f:for i in range(len(X_train)):# print(type(y_train[i]))# print(y_train[i])# print(type(X_train[i]))# print(type(X_train[i]))f.write(str(y_train[i])+" "+X_train[i]+"\n") f.close()with open("test.txt","w",encoding="utf-8") as f1:for i in range(len(X_test)):# print(type(y_train[i]))# print(y_train[i])# print(type(X_train[i]))# print(type(X_train[i]))f1.write(str(y_test[i])+" "+X_test[i]+"\n") f1.close()#將英文句子切成單詞,并統(tǒng)計詞頻,生成詞典
# Split the English sentences into words, count word frequencies, and build the vocabulary

def tokenizer(sentence):
    return sentence.split()

def data_process(text):
    # count word frequencies over the whole corpus
    for line in text:
        tokens = tokenizer(line)
        for token in tokens:
            word_count[token] = word_count.get(token, 0) + 1
    print("build vocabulary")
    vocab = {"<UNK>": 0, "<PAD>": 1}
    # sort the words by frequency and keep only the MAX_WORD most frequent ones
    word_count_sort = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_number = 1
    for word in word_count_sort:
        if word[0] not in vocab.keys():
            vocab[word[0]] = len(vocab)
            word_number += 1
        if word_number > MAX_WORD:
            break
    return vocab

# Build the vocabulary
vocab = data_process(X_train)
# print(vocab)
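A small check of the resulting vocabulary (my addition; on Python 3.7+ dict insertion order is preserved, so the two special tokens come first):

print(len(vocab))               # expected: MAX_WORD + 2, i.e. the top words plus <UNK> and <PAD>
print(list(vocab.items())[:5])  # e.g. [('<UNK>', 0), ('<PAD>', 1), ...most frequent words...]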
# Model definition. The class is named GRU, but as written it uses nn.LSTM as the encoder; swap nn.LSTM for nn.GRU to get an actual GRU.
class GRU(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(GRU, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch_size, seq_len); the LSTM expects seq_len first,
        # so transpose with permute(1, 0) before looking up the embeddings
        embeddings = self.embedding(inputs.permute(1, 0))
        # only embeddings are passed in, so the encoder returns the last layer's hidden
        # states at every time step; outputs has shape (seq_len, batch_size, num_hiddens)
        outputs, _ = self.encoder(embeddings)
        # take the hidden state at the final time step as input to the fully connected
        # layer; shape (batch_size, num_hiddens)
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # two-class probabilities [a, b]
        return outs
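Before training, an optional shape check (my own sketch, using the same hyperparameters as main() below) makes the data flow concrete: a fake batch of token ids of shape (batch_size, MAX_LEN) should come out as (batch_size, 2) class probabilities.

_model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)
_fake_batch = torch.randint(0, len(vocab), (4, MAX_LEN))  # 4 "sentences" of random token ids
print(_model(_fake_batch).shape)                          # expected: torch.Size([4, 2])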
# Convert text into tensors of word indices
def text_transform(sentence_list, vocab):
    sentence_index_list = []
    for sentence in sentence_list:
        # map each token to its index, falling back to <UNK> for out-of-vocabulary words
        sentence_idx = [vocab[token] if token in vocab.keys() else vocab['<UNK>'] for token in tokenizer(sentence)]
        if len(sentence_idx) < MAX_LEN:
            for i in range(MAX_LEN - len(sentence_idx)):  # pad short sentences with <PAD>
                sentence_idx.append(vocab['<PAD>'])
        sentence_idx = sentence_idx[:MAX_LEN]  # truncate to the first MAX_LEN tokens
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)  # tensor of shape (batch_size, MAX_LEN)
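For example (an illustration of mine), two short strings become a (2, MAX_LEN) LongTensor, with out-of-vocabulary words mapped to <UNK> and the remainder filled with <PAD>:

demo = text_transform(["great fun", "worst movie ever"], vocab)
print(demo.shape)   # torch.Size([2, 300])
print(demo[0][:5])  # the word ids of the first sentence followed by <PAD> ids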
# Model training
def train(model, train_data, vocab, epoch=10):
    print('train model')
    model = model.to(device)
    loss_sigma = 0.0
    correct = 0.0
    # loss function and optimizer; the model outputs softmax probabilities,
    # so NLLLoss is applied to their log
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    for epoch in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # average loss
        avg_acc = 0   # average accuracy
        for idx, (text, label) in enumerate(tqdm(train_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            loss = criterion(pred.log(), train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # at the end of each epoch, report the average loss and accuracy
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)
        print("avg_loss:", avg_loss, " train_avg_acc:", avg_acc)
    # save the model parameters once training has finished
    torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')
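A side remark rather than a change to the code above: softmax followed by log and NLLLoss is mathematically the same as cross-entropy but can be less numerically stable. A common alternative, sketched here under the assumption that the model's final Softmax layer is removed so it returns raw logits, is:

criterion_alt = torch.nn.CrossEntropyLoss()  # applies log-softmax internally
# loss = criterion_alt(logits, train_y)      # `logits` = the raw nn.Linear outputs (hypothetical)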
# Define the dataset format
class MyDataset(Dataset):
    def __init__(self, text_path):
        file = open(text_path, 'r', encoding='utf-8')
        self.text_with_tag = file.readlines()  # lines of "label text"
        file.close()

    def __getitem__(self, index):
        line = self.text_with_tag[index]  # one sample: label plus text
        label = int(line[0])              # the first character is the label
        text = line[2:-1]                 # the rest (minus the trailing newline) is the review
        return text, label

    def __len__(self):
        return len(self.text_with_tag)
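A quick look at one sample (my addition; assumes train.txt was written in the earlier step) confirms the parsing:

_ds = MyDataset(text_path="./train.txt")
_text, _label = _ds[0]
print(len(_ds), _label, _text[:50])  # dataset size, 0/1 label, start of the review text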
# Model testing
def tst(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    for idx, (text, label) in enumerate(tqdm(test_data)):
        train_x = text_transform(text, vocab).to(device)
        train_y = label.to(device)
        pred = model(train_x)
        avg_acc += accuracy(pred, train_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute prediction accuracy
def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]  # predicted class = argmax over the two probabilities
    # for 0/1 labels, |pred - true| is 1 exactly on the wrong predictions,
    # so this counts the correct ones
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))
    return acc.detach().cpu().numpy() / len(y_pred)
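A tiny worked example of the accuracy helper (mine): three of the four predictions below match the labels, so it should return 0.75.

_pred = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4], [0.3, 0.7]])  # argmax: 0, 1, 0, 1
_true = torch.tensor([0, 1, 1, 1])
print(accuracy(_pred, _true))  # 0.75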
# The main function
def main():
    vocab = data_process(X_train)
    np.save('vocab.npy', vocab)  # save the vocabulary locally
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # reload the stored vocabulary
    # build the MyDataset instances
    train_data = MyDataset(text_path="./train.txt")
    test_data = MyDataset(text_path="./test.txt")
    # build the DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)
    # create the model
    model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)
    train(model=model, train_data=train_loader, vocab=vocab, epoch=30)
    # load the trained parameters
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl',
                          map_location=torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')))
    # evaluate on the test set
    acc = tst(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

# Run
if __name__ == '__main__':
    main()

Results
References:
Sentiment analysis on the IMDB dataset
Building an LSTM classifier with PyTorch for IMDB sentiment classification
數(shù)據(jù)集:
文中涉及到的數(shù)據(jù)集和停用詞表
鏈接:https://pan.baidu.com/s/1OTgLDoE1P9_FPDQaLU1VKw
提取碼:mz6p