人工神经网络_验证码破译(数据挖掘入门与实践-实验9)
                                                            生活随笔
收集整理的這篇文章主要介紹了
                                人工神经网络_验证码破译(数据挖掘入门与实践-实验9)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.                        
                                文章目錄
- 一、待優化
- 二、代碼
- 單字母預測
- 1、驗證碼圖像生成
- 2、字符串切割
- 3、數據集創建
- 4、多條數據集創建
- 5、數據集調整
- 6、數據集分割 & 單字母預測模型訓練
- 7、神經網路評估
 
- 單詞預測
- 1、預測函數創建
- 2、測試集導入 & 測試開始
- 3、優化: 詞典查詢
 
 
一、待優化
1、字符串切割步驟中,segment_image 函數只能沿 x、y 軸切出矩形(方塊)區域,字體傾斜時切割效果變差,導致預測結果對 shear 值十分敏感;可另尋切割函數以提高預測正確率。
二、代碼
單字母預測
1、驗證碼圖像生成
import numpy as np from PIL import Image, ImageDraw,ImageFont from skimage import transform as tf#驗證碼生成 def create_captcha(text, shear=0, size=(100, 30),scale=1):im = Image.new("L", size, "black") draw = ImageDraw.Draw(im)font = ImageFont.truetype(r"bretan/Coval-Black.otf",22)draw.text((0,0),text,fill=1,font=font)image = np.array(im)affine_tf = tf.AffineTransform(shear=shear) image = tf.warp(image, affine_tf) image = image / image.max()shape = image.shapeshapex,shapey = (int(shape[0]*scale),int(shape[1]*scale))image = tf.resize(image,(shapex,shapey))return image#驗證碼顯示 %matplotlib inline from matplotlib import pyplot as plt image = create_captcha("GENE", shear=0.2,size=[len(str)*25,30]) plt.imshow(image, cmap='Greys')2、字符串切割
# ---- Image segmentation ----
from skimage.measure import label, regionprops


# Segmentation function
def segment_image(image):
    """Cut an image into one rectangular sub-image per connected region.

    Pixels greater than 0 are grouped into connected regions and each
    region's bounding box is sliced out of ``image``.  Limitation: only
    axis-aligned rectangles are produced, so results degrade when the
    glyphs are slanted.

    Args:
        image: 2-D image array.

    Returns:
        List of sub-images ordered left to right, or ``[image]`` when no
        region is found.
    """
    labeled_image = label(image > 0)
    # bbox is (min_row, min_col, max_row, max_col).  Label numbering does not
    # promise left-to-right order, so sort on min_col to keep the letters of
    # a word in reading order.
    regions = sorted(regionprops(labeled_image), key=lambda region: region.bbox[1])
    subimages = [
        image[min_row:max_row, min_col:max_col]
        for min_row, min_col, max_row, max_col in (region.bbox for region in regions)
    ]
    if not subimages:
        return [image]
    return subimages


# Show the segmented pieces side by side.
subimages = segment_image(image)
# squeeze=False always returns a 2-D grid of axes; without it a single
# sub-image yields a bare Axes object and indexing it fails.
f, axes_grid = plt.subplots(1, len(subimages), figsize=(10, 3), squeeze=False)
axes = axes_grid[0]
for axis, subimage in zip(axes, subimages):
    axis.imshow(subimage, cmap="gray")

# ---- Next article section: 3. Dataset creation ----
# ---- Training-set creation ----
from sklearn.utils import check_random_state

letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
shear_values = np.arange(0, 0.5, 0.05)
random_state = check_random_state(1)


# Sample generator
def generate_sample(random_state=None):
    """Return one random single-letter CAPTCHA and its class index.

    A letter is drawn from ``letters`` and a shear from ``shear_values``
    using the given random state; the letter is rendered on a 25x25 canvas.

    Args:
        random_state: Anything accepted by ``check_random_state``.

    Returns:
        Tuple ``(image, index)`` where ``index`` is the letter's position in
        ``letters`` (A -> 0, ..., Z -> 25).
    """
    rng = check_random_state(random_state)
    chosen_letter = rng.choice(letters)
    chosen_shear = rng.choice(shear_values)
    captcha = create_captcha(chosen_letter, shear=chosen_shear, size=(25, 25))
    return captcha, letters.index(chosen_letter)


# Generate and show a single training sample.
image, target = generate_sample(random_state)
plt.imshow(image, cmap="Greys")
print("The target for this image is: {0}".format(target))

# ---- Next article section: 4. Creating multiple samples ----
# Generate many training samples.
nums_dataset = 3000

# Each entry of `temp` is an (image, target) pair.  The original wrapped the
# generator in zip() and rebuilt np.array(temp) twice per loop iteration:
# that is quadratic, and the array is ragged (image + int), which current
# NumPy rejects with ValueError.  Unpack the pairs directly instead.
temp = [generate_sample(random_state) for _ in range(nums_dataset)]

dataset = np.array([sample_image for sample_image, _ in temp])
targets = np.array([sample_target for _, sample_target in temp])

# ---- Next article section: 5. Dataset adjustment ----
# ---- Training-set adjustment: bring every image to 20x20 pixels ----
from skimage.transform import resize
from sklearn.preprocessing import OneHotEncoder

# Keep the first segmented piece of every sample and resize it to 20x20.
resized_samples = [
    tf.resize(segment_image(sample)[0], (20, 20)) for sample in dataset
]
dataset = np.array(resized_samples).astype(np.float32)
targets = np.array(targets)

# One-hot encode the class indices (fit_transform needs a column vector).
onehot = OneHotEncoder()
target_column = targets.reshape(targets.shape[0], 1)
y = onehot.fit_transform(target_column)

# Sparse matrix -> dense matrix.
Y = y.todense()

# Flatten each 2-D image into one feature row.
sample_count = dataset.shape[0]
pixels_per_image = dataset.shape[1] * dataset.shape[2]
X = dataset.reshape((sample_count, pixels_per_image))

# ---- Next article section: 6. Dataset split & single-letter model training ----
# ---- Dataset split ----
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# 90% of the rows go to training, the rest to testing.
split_parts = train_test_split(X, y, train_size=0.9)
X_train, X_test, y_train, y_test = split_parts

# ---- Model training: one hidden layer of 100 units ----
clf = MLPClassifier(random_state=14, hidden_layer_sizes=(100,))
clf.fit(X_train, y_train)
# clf.coefs_[1]  (left commented out in the original)

# ---- Next article section: 7. Neural network evaluation ----
# ---- Neural network evaluation ----
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Predict the held-out rows and compute the macro-averaged F1 score.
y_pred = clf.predict(X_test)
score = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
# print(score)

# ---- Per-class results (left disabled, as in the original) ----
# print(classification_report(y_pred=y_pred, y_true=y_test))

# ---- Next article section: Word prediction ----
1、預測函數創建
# ---- Word prediction ----
# Prediction function
def predict_captcha(captcha_image, neural_network):
    """Read the word shown in a CAPTCHA image.

    The image is split with ``segment_image``, every piece is resized to
    20x20 and flattened, and the class with the highest ``predict_proba``
    value is mapped back to a letter through ``letters``.

    Args:
        captcha_image: Image array containing the word.
        neural_network: Fitted classifier exposing ``predict_proba``.

    Returns:
        The predicted word as a string, one character per segmented piece.
    """
    # plt.imshow(captcha_image, cmap='Greys')
    pieces = segment_image(captcha_image)
    resized = np.array([tf.resize(piece, (20, 20)) for piece in pieces])
    piece_count = resized.shape[0]
    features = resized.reshape((piece_count, resized.shape[1] * resized.shape[2]))
    # Pick the most probable class for every piece.
    probabilities = neural_network.predict_proba(features)
    best_classes = np.argmax(probabilities, axis=1)
    return ''.join(letters[class_index] for class_index in best_classes)


# Network test helper
def test_prediction(word, net, shear=0.2, scale=1):
    """Render ``word`` as a CAPTCHA, predict it and compare with the truth.

    Args:
        word: Word to render; the canvas is ``len(word) * 25`` by 25.
        net: Classifier handed to ``predict_captcha``.
        shear: Shear forwarded to ``create_captcha``.
        scale: Scale forwarded to ``create_captcha``.

    Returns:
        Tuple ``(is_correct, word, prediction)``.
    """
    canvas_size = (len(word) * 25, 25)
    captcha = create_captcha(word, shear=shear, scale=scale, size=canvas_size)
    prediction = predict_captcha(captcha, net)
    return word == prediction, word, prediction


print(test_prediction("GENERAL", clf, shear=0))

# ---- Next article section: 2. Test-set import & running the test ----
# ---- Test-set import: upper-cased four-letter words from the NLTK corpus ----
from nltk.corpus import words

valid_words = [entry.upper() for entry in words.words() if len(entry) == 4]

# ---- Run the test ----
num_correct = 0
num_incorrect = 0
for word in valid_words:
    correct, word, prediction = test_prediction(word, clf, shear=0.2)
    if not correct:
        num_incorrect = num_incorrect + 1
        continue
    num_correct = num_correct + 1

total_words = num_correct + num_incorrect
print("測試集單詞的總數量:{0}\n識別準確率:{1:.1f}".format(total_words, num_correct / total_words))

# ---- Next article section: 3. Optimisation: dictionary lookup ----
# ---- Using a dictionary to raise accuracy ----
from nltk.metrics import edit_distance
from operator import itemgetter

steps = edit_distance("STEP", "STOP")
print("The number of steps needed is : {0}".format(steps))


# Distance function
def compute_distance(prediction, word):
    """Count the positions of ``prediction`` that ``word`` does not match.

    Characters are compared position by position.  Positions of
    ``prediction`` beyond the end of ``word`` count as mismatches; the
    original indexed ``word`` directly and raised ``IndexError`` there.

    Args:
        prediction: Predicted string.
        word: Candidate string to compare against.

    Returns:
        ``len(prediction)`` minus the number of matching positions.
    """
    matches = sum(
        predicted_char == word_char
        for predicted_char, word_char in zip(prediction, word)
    )
    return len(prediction) - matches


# Improved prediction function
def improved_prediction(word, net, dictionary, shear=0.2, scale=1):
    """Predict ``word`` from its CAPTCHA and snap the result to a dictionary.

    The raw prediction is truncated to four characters.  If it is not in
    ``dictionary`` it is replaced by the first entry with the smallest
    ``compute_distance``.  An empty dictionary leaves the raw prediction
    untouched instead of raising ``IndexError``.

    Args:
        word: Ground-truth word to render.
        net: Classifier handed to ``predict_captcha``.
        dictionary: Collection of candidate words.
        shear: Shear forwarded to ``create_captcha``.
        scale: Scale forwarded to ``create_captcha``.

    Returns:
        Tuple ``(is_correct, word, prediction)``.
    """
    # NOTE(review): no size is passed, so create_captcha's default canvas is
    # used here, unlike test_prediction, which sizes it from len(word).
    captcha = create_captcha(word, shear=shear, scale=scale)
    guess = predict_captcha(captcha, net)[:4]
    if guess not in dictionary:
        # min() returns the first entry with the smallest distance, the same
        # entry the original's stable sort put first, without sorting them all.
        guess = min(
            dictionary,
            key=lambda candidate: compute_distance(guess, candidate),
            default=guess,
        )
    return word == guess, word, guess


# Run the evaluation.
num_correct = 0
num_incorrect = 0
shear = 0
scale = 1
for word in valid_words:
    correct, word, prediction = improved_prediction(word, clf, valid_words, shear, scale)
    if correct:
        num_correct += 1
    else:
        num_incorrect += 1
print("Number correct is {0}".format(num_correct))
print("Number incorrect is {0}".format(num_incorrect))

# ---- Next article section: Summary ----
以上是生活随笔為你收集整理的人工神经网络_验证码破译(数据挖掘入门与实践-实验9)的全部內容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: jupyter notebook_远程终
- 下一篇: linux下screen基本用法
