[Web Scraper] A Template for Scraping and Filtering Novels
This template works for any novel section on a two-level site (an index page that lists novels, with each novel split across numbered pages), and applies a simple keyword-weighted filter to decide which titles to keep. The empty strings in the code (URLs, regex patterns, the CSS selector) are placeholders the original left blank; fill them in for your target site.
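The template's only real assumption about the target site is its pagination scheme: page 1 of a novel lives at `name.html` and page n at `name_n.html`. The sketch below mirrors the URL-building logic in `parsePage2` in the full template that follows; the example URL is hypothetical.

```python
# Hypothetical example of the pagination scheme the template assumes:
# page 1 of a novel lives at .../book123.html, page n at .../book123_n.html.
url = "https://example.com/story/book123.html"  # hypothetical URL
sl = url.split(".")
prefix = ".".join(sl[:-1])  # everything up to the final ".html"
for n in range(2, 4):
    print(prefix + "_" + str(n) + "." + sl[-1])
# -> https://example.com/story/book123_2.html
# -> https://example.com/story/book123_3.html
```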
```python
import requests
import re
from bs4 import BeautifulSoup

# Fetch a page, with a timeout and basic error handling
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print("Failed to fetch url")

# Parse the first-level (index) page
def parsePage1(text1):
    url1 = re.findall(r'', text1)  # raw matches for every novel on this index page (pattern left blank in the original)
    url1List = []
    for n in url1:
        n = n.split('"')[1]
        url1List.append(n)
    return url1List  # URLs of every novel on this index page

# Fetch each novel and run it through the filter
def parsePage2(url1List):
    urlList = []
    for n in url1List:
        urlList.append("")  # URL preprocessing (site prefix left blank in the original)
    nameNum = len(urlList)
    count = 1
    for url2 in urlList:
        judge1 = 0
        print("\titem {}/{}".format(count, nameNum))
        count += 1
        text2 = getHTMLText(url2)         # fetch the novel's html
        pageN = pageNum(text2)            # number of pages in the novel
        name = re.findall(r'', text2)[0]  # novel title (pattern left blank in the original)
        remove = []                       # titles to skip
        for n in remove:
            if n in name:
                judge1 = 1
        if judge1 == 1:
            continue
        print("\t\t{}".format(name))
        print("\t\tpage 1/{}".format(pageN))
        text = content(text2)  # extract the text of page 1
        if pageN > 1:
            # Build the URLs of the remaining pages: page n lives at <name>_<n>.<ext>
            url = url2
            sl = url.split(".")
            sp = ""
            for i in sl[:-2]:
                sp += i + "."
            for n in range(2, min(pageN + 1, 15)):
                print("\t\tpage {}/{}".format(n, pageN))
                url2 = sp + sl[-2] + "_" + str(n) + "." + sl[-1]
                text2 = getHTMLText(url2)
                text += content(text2)
        # Keep the title if it passes the keyword filter
        if judge(text):
            write(name)

# Append an accepted title to the results file
def write(name):
    f = open('1.txt', 'a', encoding='utf-8')
    f.write(name + "\n")
    f.close()

# Extract the visible text of one page
def content(text2):
    soup = BeautifulSoup(text2, "html.parser")
    text = soup.select('')[0]  # CSS selector left blank in the original
    content = ""
    for string in text.stripped_strings:
        content += string
    return content

# Extract the novel's page count; default to 1 if the pattern finds nothing
def pageNum(text2):
    try:
        pageN = int(re.findall(r'', text2)[0])  # pattern left blank in the original
        return pageN
    except:
        return 1

# Keyword-weighted filter
def judge(text):
    search1 = []  # favorite keywords: +4 each
    search2 = []  # liked keywords: +1 each
    search3 = []  # rejected keywords: -10 each
    length = len(search1) + len(search2)
    if length == 0:
        return False  # avoid division by zero while the keyword lists are still empty
    count = 0
    for kw in search1:
        if kw in text:
            count += 4
    for kw in search2:
        if kw in text:
            count += 1
    for kw in search3:
        if kw in text:
            count -= 10
    rate = count / length
    print("\t\t\trate: {}".format(rate))
    # Acceptance threshold
    return rate >= 1

def main():
    url1 = ""  # section index URL (left blank in the original)
    text1 = getHTMLText(url1)
    indexPageNum = int(re.findall(r'', text1)[0])  # total index pages (pattern left blank in the original)
    url1List = parsePage1(text1)  # URLs of every novel on the first index page
    parsePage2(url1List)          # fetch and filter them
    for indexN in range(1, indexPageNum + 1):
        print("page {}/{}".format(indexN, indexPageNum))
        url1 = "" + str(indexN) + ".html"  # index URL prefix left blank in the original
        text1 = getHTMLText(url1)
        url1List = parsePage1(text1)
        parsePage2(url1List)

main()
```
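The weighting in `judge` is what makes this a filter rather than a plain crawler. Here is a minimal, self-contained sketch of that scoring logic; the keyword lists are hypothetical examples, since the original leaves them empty.

```python
# Standalone sketch of the scoring used by judge(); the keywords are
# hypothetical examples, not part of the original template.
def judge_demo(text):
    search1 = ["time travel"]        # favorite keywords: +4 each
    search2 = ["comedy", "mystery"]  # liked keywords: +1 each
    search3 = ["tragedy"]            # rejected keywords: -10 each
    score = sum(4 for kw in search1 if kw in text)
    score += sum(1 for kw in search2 if kw in text)
    score -= sum(10 for kw in search3 if kw in text)
    rate = score / (len(search1) + len(search2))
    return rate >= 1  # same acceptance threshold as the template

print(judge_demo("a comedy about time travel"))  # (4 + 1) / 3 -> True
print(judge_demo("a light mystery"))             # 1 / 3 -> False
```

Note that because each rejected keyword subtracts 10, with small keyword lists a single `search3` hit effectively vetoes a novel no matter how many preferred keywords match.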
Summary
That is the full novel filtering and scraping template. Swap in the target site's URLs, regex patterns, and CSS selector, fill in the keyword lists, and the same structure should carry over to other two-level sites.