# -*- coding:utf-8 -*-import requests
from bs4 import BeautifulSoup
import traceback
import re
import xlwt
list=['glibc','Microsoft Office Word','Microsoft Internet Explorer']#想要查詢的相關漏洞
num=[] # 存放每個實體對應的漏洞數目
page=[]# 存放每個實體對應的漏洞的頁數# 根據url爬取網頁defgetHTMLTEXT(url,code="utf-8"):kv = {'user-agent': 'Mozilla/5.0'} # 模擬瀏覽器訪問網站try:r=requests.get(url,headers=kv,timeout=30)r.raise_for_status()r.encoding=codereturn r.textexcept:traceback.print_exc()return""# 初始網頁defparsepage(name,url):type={}# 存放每個實體中的漏洞類型html=getHTMLTEXT(url)soup=BeautifulSoup(html,'html.parser')#每個類型的數量#text=soup.find_all('a',text=re.compile('總條數:'))#num.append(re.findall(r'[0-9,]+',text.__str__()[:][:]))#統計當前實體搜索結果共有多少頁value = soup.find_all('input',id="pagecount")page.append(re.findall(r'[0-9]+',value.__str__()))
#爬取對應漏洞下所有的網頁defall_page(name,url,n):type=[] #漏洞類型level=[] #漏洞危害等級name_info=[]#每個漏洞的名稱link=[] #每個漏洞的鏈接#循環遍歷每個網頁for p in range(1,int(n)+1):html=getHTMLTEXT(url+str(p))soup = BeautifulSoup(html, 'html.parser')# 統計每個實體中具體漏洞的鏈接text = soup.find_all('a',attrs={'class':'a_title2'})for i in text:try:href = i.attrs['href']if(re.findall(r'.?CNNVD.?',href)):link.append(href)except:continue#對于每一個鏈接,去爬取鏈接的頁面for i in link:html=getHTMLTEXT('http://www.cnnvd.org.cn'+i)soup = BeautifulSoup(html, 'html.parser')title=soup.find_all('h2',style="")for m in title[0]:title=m #每個漏洞的具體名稱#尋找類型和等級text=soup.find_all('a',style="color:#4095cc;cursor:pointer;",href="")S=[]if len(text):for t in text[:2]:t=re.findall(r'[\u4E00-\u9FA5]+',str(t)) #匹配漢字try:S.append(t[0])except:S.append('未評定')type.append(S[0])level.append(S[1])name_info.append(title)#將列表信息寫入EXCEL中f = xlwt.Workbook() # 創建EXCEL工作簿sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True) # 創建sheetsheet1.write(0, 0, "漏洞名稱")sheet1.write(0, 1, "類型")sheet1.write(0, 2, "危害等級")for i in range(len(name_info)):sheet1.write(i + 1, 0, name_info[i])sheet1.write(i + 1, 1, type[i])sheet1.write(i + 1, 2, level[i])f.save(name+"_result.xls") #保存文件if __name__=="__main__":url='http://www.cnnvd.org.cn/web/vulnerability/queryLds.tag?qcvCname='for i in list:parsepage(i,url+i)for i in range(len(list)):turl=url+list[i]+'&pageno='all_page(list[i],turl,page[i][0])#創建工作簿f=xlwt.Workbook()#創建EXCEL工作簿sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True) # 創建sheetfor i in range(len(num)):sheet1.write(i,0,list[i])sheet1.write(i,1,num[i])f.save("實體總量.xls")
生成的文件如下所示
2. 統計數據
import xlrd
import xlwt
import traceback
defopen_excel(path,name):type={}level={}try:data=xlrd.open_workbook(path)table=data.sheet_by_name(sheet_name=u'sheet1')t=table.col_values(1)[1:]l=table.col_values(2)[1:]#去掉重復值t1=set(t)l1=set(l)#初始化字典for i in t1:type[i]=0for i in l1:level[i]=0#統計數量for i in t:type[i]=type[i]+1for i in l:level[i]=level[i]+1f = xlwt.Workbook() # 創建EXCEL工作簿tkeys=list(type.keys())sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True) # 創建sheetfor i in range(len(tkeys)):sheet1.write(i, 0,tkeys[i])sheet1.write(i, 1,type[tkeys[i]])sheet2=f.add_sheet(u'sheet2',cell_overwrite_ok=True)lkeys=list(level.keys())for i in range(len(lkeys)):sheet2.write(i,0,lkeys[i])sheet2.write(i,1,level[lkeys[i]])f.save(name+'_sum_up.xls')except :traceback.print_exc()print('Error!')
if __name__=="__main__":path=['glibc_result.xls','Microsoft Internet Explorer_result.xls','Microsoft Office Word_result.xls']name=['glibc','Microsoft Internet Explorer','Microsoft Office Word']for i in range(len(path)):open_excel(path[i],name[i])