强智科技教务系统验证码识别
強智科技驗證碼識別
前言
由于近期在寫一個教務系統的爬蟲程序,但是網站的驗證碼讓人很頭疼,所以筆者臨時找了一些資料學習了一下。本人 python 用的很少,而且在機器學習這塊也是新手,從來沒有接觸過,所以寫的不好還請指點出來。
環境
- python3.6
- PIL
- sklearn
準備
使用一個簡單的腳本下載1000張驗證碼,然后做好標記
import os

import requests

# Captcha endpoint of the target site (the host is redacted in the article).
url = "http://****/verifycode.servlet"
# Directory the numbered captcha images are written to.
save_dir = "./code"

# Create the output directory up front so open() cannot fail on a missing folder.
os.makedirs(save_dir, exist_ok=True)

for i in range(1000):
    filename = save_dir + "/" + str(i) + ".png"
    # A timeout keeps one stalled request from hanging the whole download run.
    response = requests.get(url, timeout=10)
    # Fail loudly instead of saving an HTTP error page under a .png name.
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)

圖像處理
實例圖片
1.二值化,這里我們手動利用一個閾值進行二值化,處理完成以后的圖片如下
2.降噪,思路是,當一個點為黑色時,周圍8個點都是白色則認為這是一個噪點(這個數字改小一點還可以完成去除干擾線的功能,我這里就是使用這種方式將干擾點和干擾線去除的)
我們發現還是有一個點沒有去除,沒有關系,下面我們做字符分割的時候還會再進行一次降噪
3.字符分割,思路是找出每個數字的邊緣坐標,如果上下邊緣相差太小的話則認為這里為噪點,直接去除
圖片轉換為數據
上面已經分割好了,我們現在需要將圖片轉換成數據,方便我們下面傳入機器學習,我用的方式是遍歷分割好的方形區域,黑色為1,白色為0,拼接成一個類似于“0011010100000101111111111”的字符串,然后將字符串轉換成int型數值
最后生成的數據是下面這種樣子
圖片數據:["61256415613215646512" , "61256415613215646512" , "61256415613215646512" , "61256415613215646512"]
結果數據:['1' , 'j' , 'k' , 'd']
訓練模型
我使用的是 KNN 分類算法
代碼
1.圖像處理類
from PIL import Image , ImageDraw import cv2'''圖片處理類 ''' class ImageHandler():threshold = 130 #二值化處理閾值im = None #保存當(dāng)前類所處理的圖片spliter = []data = []labels = []def __init__(self , filename):self.filename = filename self.data = []self.labels = []self.spliter = []self.im = None'''圖片文件讀取'''def readFile(self):self.im = Image.open(self.filename)'''圖片二值化'''#def toBinary_img(self , im):# im = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY)# th1 = cv2.adaptiveThreshold(im, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 21, 1)# return th1'''手動(dòng)將圖片二值化順便去掉邊框'''def toBinary_data(self):self.readFile()im = self.imim = im.convert('L')pixdata = im.load()width , height = im.sizefor j in range(height):for i in range(width):if(i == 0 or i == width-1 or j == 0 or j == height-1):pixdata[i , j] = 255continueif(pixdata[i , j] < self.threshold):pixdata[i , j] = 0else:pixdata[i , j] = 255self.im = im'''降噪(點(diǎn)線降噪)'''def dot_noise(self):im = self.imw , h = im.sizepixdata = im.load()#從左至右降噪for y in range(h):for x in range(w):if(pixdata[x , y] == 0):sum = 0 #四周總共有多少個(gè)白點(diǎn)if(pixdata[x-1 , y] == 255):sum+=1if(pixdata[x+1 , y] == 255):sum+=1if(pixdata[x , y+1] == 255):sum+=1if(pixdata[x , y-1] == 255):sum+=1if(pixdata[x-1 , y-1] == 255):sum+=1if(pixdata[x-1 , y+1] == 255):sum+=1if(pixdata[x+1 , y-1] == 255):sum+=1if(pixdata[x+1 , y+1] == 255):sum+=1if(sum >= 7): #這里為7的時(shí)候圖片處理這完整,但是當(dāng)準(zhǔn)確率不及為5的時(shí)候,所以真正應(yīng)用的時(shí)候建議將它改成5,下面那個(gè)也一樣pixdata[x , y] = 255'''#從右至左降噪for y in range(h):for x in range(w-1 , -1 , -1):if(pixdata[x , y] == 0):sum = 0 #四周總共有多少個(gè)白點(diǎn)if(pixdata[x-1 , y] == 255):sum+=1if(pixdata[x+1 , y] == 255):sum+=1if(pixdata[x , y+1] == 255):sum+=1if(pixdata[x , y-1] == 255):sum+=1if(sum >= 3):pixdata[x , y] = 255'''#從下至上降噪for y in range(h-1 , -1 , -1):for x in range(w-1 , -1 , -1):if(pixdata[x , y] == 0):sum = 0 #四周總共有多少個(gè)白點(diǎn)if(pixdata[x-1 , y] == 255):sum+=1if(pixdata[x+1 , y] == 255):sum+=1if(pixdata[x , y+1] == 255):sum+=1if(pixdata[x , 
y-1] == 255):sum+=1if(pixdata[x-1 , y-1] == 255):sum+=1if(pixdata[x-1 , y+1] == 255):sum+=1if(pixdata[x+1 , y-1] == 255):sum+=1if(pixdata[x+1 , y+1] == 255):sum+=1if(sum >= 7):pixdata[x , y] = 255self.im = im'''切割'''def cut_img(self):im = self.imw , h = im.sizepixdata = im.load()#1.找出切割點(diǎn)spliter_y = []spliter_x = []flag = False #表示當(dāng)前遍歷的全部為白色,當(dāng)遇到黑色時(shí)就會(huì)變成True#1.1找縱向切割點(diǎn)for x in range(w):column = False #當(dāng)前行全是白色則為False , 否則為Truefor y in range(h):if(pixdata[x , y] == 0):column = Trueif(flag == False):flag = Truespliter_x.append(x - 1)if(flag == True and column == False):spliter_x.append(x)if(column == False):flag = False#print(self.filename)#判斷是否有字符粘連,如果有就進(jìn)行切割for i in range(0 , len(spliter_x) , 2):#兩個(gè)粘連的問(wèn)題if(spliter_x[i+1] - spliter_x[i] > 21 and spliter_x[i+1] - spliter_x[i] < 43):x = spliter_x[i]y = spliter_x[i+1]spliter_x[i+1] = x+19spliter_x.insert(i+2 , y)spliter_x.insert(i+2 , x+19)if(i == 0):if(spliter_x[i+5] - spliter_x[i+4] > 21):x = spliter_x[i+4]y = spliter_x[i+5]spliter_x[i+5] = x+19spliter_x.insert(i+6 , y)spliter_x.insert(i+6 , x+19)break #三個(gè)粘連的問(wèn)題elif(spliter_x[i+1] - spliter_x[i] >= 43):x = spliter_x[i]y = spliter_x[i+1]spliter_x[i+1] = x+19spliter_x.insert(i+2 , y)spliter_x.insert(i+2 , x+37)spliter_x.insert(i+2 , x+37)spliter_x.insert(i+2 , x+19)break ;#print(spliter_x) #2.2找橫向切割點(diǎn)for i in range(0 , len(spliter_x) , 2):#1.1.1先從上到下找到頂部臨界點(diǎn)flag = Falsefor y in range(h):for x in range(spliter_x[i] , spliter_x[i+1]):if(pixdata[x , y] == 0):if(flag == False):flag = Truespliter_y.append(y)break ;if(flag == True):break#1.1.2從下至上找到底部臨界點(diǎn)flag = Falsefor y in range(h-1 , -1 , -1):for x in range(spliter_x[i] , spliter_x[i+1]):if(pixdata[x , y] == 0):if(flag == False):flag = Truespliter_y.append(y+1)breakif(flag == True):break#再次降噪temp_arr_x = []temp_arr_y = []for i in range(0 , len(spliter_y) , 2):#print(str(spliter_y[i+1]) +"\t"+ str(spliter_y[i]))if(spliter_y[i+1] - spliter_y[i] <= 4):for 
x in range(spliter_x[i] , spliter_x[i+1]):for y in range(spliter_y[i] , spliter_y[i+1]):pixdata[x , y] = 255spliter_x[i] = 0spliter_x[i+1] = 0spliter_y[i] = 0spliter_y[i+1] = 0else:temp_arr_x.append(spliter_x[i])temp_arr_x.append(spliter_x[i+1])temp_arr_y.append(spliter_y[i])temp_arr_y.append(spliter_y[i+1])spliter_x = temp_arr_xspliter_y = temp_arr_y#將分割點(diǎn)進(jìn)行存儲(chǔ)result = [[] , [] , [] , []]#print(len(spliter_x))if(len(spliter_x) == 8):for i in range(len(result)):result[i].append(spliter_x[i*2])result[i].append(spliter_y[i*2])result[i].append(spliter_x[i*2+1])result[i].append(spliter_y[i*2+1])else:return Falseself.spliter = resultdef test(self):'''測(cè)試切割后的結(jié)果'''for index , i in enumerate(self.spliter):box = tuple(i)region = self.im.crop(box)w , h = region.sizew = w - 1h = h - 1draw = ImageDraw.Draw(region)draw.line((0 , 0 , w , 0) , fill=160)draw.line((0 , 0 , 0 , h) , fill=160)draw.line((w , 0 , w , h) , fill=160)draw.line((0 , h , w , h) , fill=160)del drawself.im.paste(region, box)#region.save(str(index)+".png" , "PNG")'''將分割好的圖片轉(zhuǎn)換成機(jī)器學(xué)習(xí)數(shù)據(jù)'''def img_2_train_data(self):f = self.filename.replace(".png" , "")[-4:]for index , i in enumerate(self.spliter):box = tuple(i)region = self.im.crop(box)w , h = region.sizepixdata = region.load()d = "0b"for y in range(h):for x in range(w):if(pixdata[x , y] == 0):d+='1'else:d+='0'self.data.append(int(d , 2))self.labels.append(f[index])2.訓(xùn)練代碼
from numpy import * import numpy as np from sklearn import neighbors import os from sklearn.preprocessing import LabelBinarizer from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report from sklearn.externals import joblib from ImageHandler import ImageHandler from PIL import Imageimg_dir = "./code" files = os.listdir(img_dir)data = [] result = []for i in range(len(files)):path = img_dir+"/"+files[i]image = ImageHandler(path)image.toBinary_data()image.dot_noise()image.cut_img()image.img_2_train_data()data = np.append(data , image.data)result = np.append(result , image.labels)del imageprint("數(shù)據(jù)準(zhǔn)備完畢")x = data.reshape(-1 , 1) y = result.reshape(-1 , 1) x = np.array(x) y = np.array(y)# 拆分訓(xùn)練數(shù)據(jù)與測(cè)試數(shù)據(jù) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1) # 訓(xùn)練KNN分類器 clf = neighbors.KNeighborsClassifier() clf.fit(x, y)print("訓(xùn)練完成")# 保存分類器模型 joblib.dump(clf, './knn/knn.pkl')print("結(jié)束")print("檢驗(yàn)準(zhǔn)確率")# # 測(cè)試結(jié)果打印 pre_y_train = clf.predict(x_train) pre_y_test = clf.predict(x_test) class_name1 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9' , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j' , 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't' , 'u', 'v', 'w', 'x'] class_name2 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9' , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j' , 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't' , 'u', 'v', 'w', 'x'] print (classification_report(y_train, pre_y_train, target_names=class_name1)) print (classification_report(y_test, pre_y_test, target_names=class_name2))以上就是我這次破解驗(yàn)證碼的全部過(guò)程,如果大佬覺(jué)得有更好的方式歡迎留言討論,加我qq也行:1730145232
總結
以上是生活随笔為你收集整理的强智科技教务系统验证码识别的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: carla 和 ros2
- 下一篇: nrf52832 - HID