A Python Crawler, Using a Novel Site as an Example
A simple Python crawler works by fetching a page's source code, separating out the useful data, and then analyzing, organizing, and saving it.
Broadly, the process has three steps (a minimal pipeline sketch follows the list):
- Fetch the page source
- Extract the useful parts from it
- Store the information
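Here is a minimal sketch (not from the original post) of how the three steps chain together. It assumes the step-1 helper is saved as `html_tool.py` and that the other functions are defined in the same script, as in the sections that follow; `start_url` is a placeholder, not the real site:

```python
# Minimal pipeline sketch; all functions are defined in the sections below.
import html_tool

start_url = 'https://example-novel-site.com'  # placeholder, not the real site

if __name__ == '__main__':
    html = html_tool.getHTML(start_url)  # step 1: fetch the page source
    getBookTopList(html)                 # step 2: extract the rankings into list_book_top
    insertBookListTop()                  #         enrich with author / synopsis / download URL
    for book in list_book_top:           # step 3: persist everything
        saveFile(book)
        insertDB(book)
```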
1. Fetching the page source with the requests library
```python
'''
Description: HTML utility module
Version: 1.0
Author: 李廣凱
Date: 2022-02-24 20:23:23
LastEditors: 李廣凱
LastEditTime: 2022-04-04 22:48:56
'''
import requests


# Fetch a page's source code
def getHTML(start_url):
    html = requests.get(start_url)
    # Note: the original wrote `html.raise_for_status` without parentheses,
    # which silently does nothing; it must be called to raise on HTTP errors.
    html.raise_for_status()
    # html.encoding = 'utf-8'
    html.encoding = html.apparent_encoding  # guess the real encoding from the content
    return html.text
```
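The helper can be exercised on its own before wiring up the parsers; a quick check might look like this (the URL is a placeholder):

```python
# Quick smoke test of getHTML; the URL below is a placeholder.
page = getHTML('https://example-novel-site.com/top/')
print(page[:200])  # first 200 characters of the page source
```

Setting `encoding` to `apparent_encoding` matters for Chinese sites: many serve GBK/GB2312 pages, and when the server omits a charset header requests typically falls back to ISO-8859-1, which would garble the text.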
2. Parsing the page source

```python
import re
from bs4 import BeautifulSoup
from multiprocessing import Pool  # the original import isn't shown; a thread pool would also work

import html_tool  # the step-1 module, assumed to be saved as html_tool.py

# Assumed module-level globals (their definitions aren't shown in the original):
# start_url     -- the site root, prepended to relative download-page links
# list_book_top -- filled in by getBookTopList below


# Get the ranking tabs: daily, weekly and monthly rankings
def getListTag(html):
    # list of tab labels
    list_tag = []
    soup = BeautifulSoup(html, 'html.parser')
    useful = soup.find(class_='tab')
    li_list = useful.find_all('li')
    for li in li_list:
        list_tag.append(li.string)
    return list_tag


# Get each ranked book's URL, title and cover image
def getBookTopList(html):
    global list_book_top
    list_tag = getListTag(html)
    list_book_url = []
    list_book_name = []
    list_book_img = []
    list_book_top = []
    soup = BeautifulSoup(html, 'html.parser')
    useful = soup.find(class_='tabCon')
    ul_context = useful.find_all('ul')
    for ul in ul_context:
        list_book_img_cache = []
        list_book_name_cache = []
        list_book_url_cache = []
        for li in ul:
            li = str(li).replace('\n', '')
            book_img = re.findall('src="(.*?)"', li)
            if len(book_img):
                list_book_img_cache.append(book_img[0])
            if len(li):
                book_url = re.search('href="(.*?)"', li).group(1)
                list_book_url_cache.append(book_url)
                book_name = re.search('title="(.*?)"', li).group(1)
                list_book_name_cache.append(book_name)
        list_book_img.append(list_book_img_cache)
        list_book_name.append(list_book_name_cache)
        list_book_url.append(list_book_url_cache)
    print(list_book_name)  # debug output
    for i in range(len(list_tag)):
        for k in range(len(list_book_name[i])):
            dic_book_top_cache = {
                '榜單名': str(list_tag[i]),
                '書名': str(list_book_name[i][k]),
                '封面': str(list_book_img[i][k]),
                'url': str(list_book_url[i][k])
            }
            list_book_top.append(dic_book_top_cache)
    print('Fetched the ranking lists!')
    return list_book_top


# Enrich the ranking list (author, synopsis, download URL)
def insertBookListTop():
    url_list = []
    for book in list_book_top:
        url_list.append(book['url'])
    pool = Pool(5)  # fetch five detail pages in parallel
    result = pool.map(getBookSimpleInfo, url_list)
    for i in range(len(result)):
        list_book_top[i]['作者'] = str(result[i][0][3:])  # drop the 3-character label prefix (e.g. '作者:')
        list_book_top[i]['簡介'] = str(result[i][1])
        list_book_top[i]['下載地址'] = str(result[i][2])


# Get the URL of the downloadable novel file
def downloadBookFile(download_before_url):
    html = html_tool.getHTML(download_before_url)
    soup = BeautifulSoup(html, 'html.parser')
    useful = soup.find(class_='down-btn-group')
    a_context = useful.find('a')
    a_context = str(a_context).replace('\n', '')
    download_book_url = re.search('href="(.*?)"', a_context).group(1)
    return download_book_url


# Collect a book's details from its detail page
def getBookSimpleInfo(url):
    html = html_tool.getHTML(url)
    soup = BeautifulSoup(html, 'html.parser')
    useful = soup.find(class_='info2')
    h3_context = useful.find('h3')
    p_context = useful.find('p')
    author = h3_context.string
    info = p_context.string
    # URL of the download page
    a_context = soup.find(class_='btn btn-danger')
    a_context = str(a_context).replace('\n', '')
    download_before_url = re.search('href="(.*?)"', a_context).group(1)
    download_before_url = start_url + download_before_url
    download = downloadBookFile(download_before_url)
    info = str(info).replace('\n', '')
    info = str(info).replace(' ', '')
    return author, info, download
```
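After `getBookTopList` and `insertBookListTop` have run, every entry of `list_book_top` is a flat dict. Illustratively, one entry looks like this (all values below are invented placeholders):

```python
# Shape of one list_book_top entry; the values are made up for illustration.
{
    '榜單名': '日排行',                  # which ranking: daily / weekly / monthly
    '書名': '某小說',                    # book title
    '封面': 'https://example-novel-site.com/cover.jpg',
    'url': 'https://example-novel-site.com/book/123/',
    '作者': '某作者',                    # added by insertBookListTop
    '簡介': '書籍簡介',                  # added by insertBookListTop
    '下載地址': 'https://example-novel-site.com/file/123.txt',
}
```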
3. Saving files and inserting into the database

```python
import time

import requests

import db  # the author's own database helper, exposing insertData(sql); its code isn't shown


# Save the novel file to disk
def saveFile(book_info):
    url = book_info['下載地址']
    path = './book_file/' + book_info['書名'] + '.txt'
    r = requests.get(url)
    with open(path, "wb") as code:
        code.write(r.content)
    # urlretrieve(url, path)
    print(book_info['書名'] + ' downloaded!')


# Insert a book record into the table matching its ranking
def insertDB(book):
    file_path = './book_file/' + book['書名'] + '.txt'
    datetime = time.strftime("%Y-%m-%d", time.localtime())
    # NOTE: SQL built by string concatenation; see the parameterized variant below
    if book['榜單名'] == '日排行':
        sql = ("INSERT INTO day_top(bname,bimg,bauthor,binfo,bfile,bdate) VALUES("
               + '"' + book['書名'] + '","' + book['封面'] + '","' + book['作者'] + '","'
               + book['簡介'] + '","' + file_path + '","' + datetime + '")')
    if book['榜單名'] == '周排行':
        sql = ("INSERT INTO week_top(bname,bimg,bauthor,binfo,bfile,bdate) VALUES("
               + '"' + book['書名'] + '","' + book['封面'] + '","' + book['作者'] + '","'
               + book['簡介'] + '","' + file_path + '","' + datetime + '")')
    if book['榜單名'] == '月排行':
        sql = ("INSERT INTO month_top(bname,bimg,bauthor,binfo,bfile,bdate) VALUES("
               + '"' + book['書名'] + '","' + book['封面'] + '","' + book['作者'] + '","'
               + book['簡介'] + '","' + file_path + '","' + datetime + '")')
    db.insertData(sql)
```

Since the crawler only collects ranking data, a single global list, `list_book_top`, is enough as temporary storage. The code as a whole is fairly straightforward; the fiddly part is parsing the book details out of the pages.
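One caveat worth flagging: `insertDB` builds its SQL by string concatenation, which breaks on titles containing double quotes and is open to SQL injection. Below is a safer sketch using a parameterized query; it assumes a DB-API 2.0 connection (pymysql here), since `db.insertData` is the author's own helper and its internals aren't shown. Table names cannot be bound as query parameters, so they go through an explicit whitelist instead:

```python
import time

import pymysql  # assumption: any DB-API 2.0 driver works the same way

# Whitelist mapping, because table names cannot be bound as parameters
TABLE_BY_TAG = {'日排行': 'day_top', '周排行': 'week_top', '月排行': 'month_top'}


def insertDBSafe(conn, book):
    """Parameterized variant of insertDB: values are bound, never concatenated."""
    table = TABLE_BY_TAG[book['榜單名']]
    file_path = './book_file/' + book['書名'] + '.txt'
    today = time.strftime('%Y-%m-%d', time.localtime())
    sql = ('INSERT INTO ' + table +
           '(bname,bimg,bauthor,binfo,bfile,bdate) VALUES(%s,%s,%s,%s,%s,%s)')
    with conn.cursor() as cur:
        cur.execute(sql, (book['書名'], book['封面'], book['作者'],
                          book['簡介'], file_path, today))
    conn.commit()
```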
Summary
That wraps up this walkthrough of a simple Python crawler, using a novel site as the example. Hopefully it helps you solve any similar problems you run into.