python爬取B站排行榜：Scrapy + Selenium 抓取 Bilibili 漫画应援排行榜（动态加载页面）
目標數據:
爬蟲代碼:
# -*- coding: utf-8 -*-
import scrapy
from bilibili_yy.items import BilibiliYyItem
import re
from selenium import webdriver
import pyperclip
class BiliSpider(scrapy.Spider):
    """Spider for the Bilibili manga "ouen" (應援/support) ranking page.

    The ranking page is rendered dynamically, so a Selenium Chrome driver is
    created on the spider; it is presumably consumed by a downloader
    middleware that returns the fully rendered HTML (not shown in this file).
    """

    name = 'bili'
    # allowed_domains = ['manga.bilibili.com']
    start_urls = ['https://manga.bilibili.com/ranking?from=manga_homepage#/ouenn/']

    # Rank digits are encoded in CSS class names like "... digit-3";
    # hoisted here because the original repeated this XPath up to 3 times.
    _DIGIT_XPATH = ('.//span[starts-with(@class,"digit-item bg-center '
                    'bg-contain bg-no-repeat dp-i-block digit-")]/@class')
    # Award-supporter slot n (n = 2, 3, 4 → top three supporters).
    _AWARD_XPATH = './/div[@class="award-user-ctnr p-absolute w-100"]/div[%d]/@title'

    def __init__(self, *args, **kwargs):
        # Forward args/kwargs so `scrapy crawl bili -a name=value` and the
        # normal Spider construction path keep working.
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Yield one BilibiliYyItem per entry in the ranking list."""
        for data_s in response.xpath('//div[@class="rank-item dp-i-block border-box p-relative"]'):
            # BUGFIX: the item must be created per row. The original built a
            # single item outside the loop and mutated it, so every yielded
            # item was the same object (data bleed in async pipelines).
            item = BilibiliYyItem()

            # Rank number, assembled from the digit-sprite class names;
            # single-digit ranks are zero-padded to match two-digit ones.
            digit_classes = data_s.xpath(self._DIGIT_XPATH).extract()
            if len(digit_classes) == 2:
                item['paiming'] = (re.findall(r"\d", digit_classes[0])[0]
                                   + re.findall(r"\d", digit_classes[1])[0])
            else:
                item['paiming'] = re.findall(r"\d", digit_classes[0])[0].zfill(2)

            # Rank movement is also encoded in a class name (hold/up/down).
            pmqingkuang = data_s.xpath(
                './/div[starts-with(@class,"rank-movement p-absolute bg-center '
                'bg-cover bg-no-repeat")]/@class').extract()[0]
            if 'hold' in pmqingkuang:
                item['pmqingkuang'] = '保持'
            elif 'up' in pmqingkuang:
                item['pmqingkuang'] = '上升'
            else:
                item['pmqingkuang'] = '下降'

            item['pic_link'] = data_s.xpath(
                './/div[starts-with(@class,"manga-cover bg-center bg-cover '
                'bg-no-repeat")]/@data-src').extract()[0]
            item['cartoon_link'] = ('https://manga.bilibili.com'
                                    + data_s.xpath('.//a[starts-with(@class,"dp-block manga-title")]/@href').extract()[0])
            item['name'] = data_s.xpath('.//a[starts-with(@class,"dp-block manga-title")]/text()').extract()[0]
            item['author'] = data_s.xpath('.//p[@class="fans-author-text t-over-hidden t-no-wrap"]/text()').extract()[0]
            # Strip the " 萬 粉絲值" unit suffix, keeping only the number.
            item['fensizhi'] = data_s.xpath('.//p[@class="fans-value"]/text()').extract()[0].replace(' 萬 粉絲值', '')

            # Top three supporters (主公); a missing slot becomes ''.
            for slot, field in enumerate(('zhugong1', 'zhugong2', 'zhugong3'), start=2):
                titles = data_s.xpath(self._AWARD_XPATH % slot).extract()
                item[field] = titles[0] if titles else ''

            yield item

    def closed(self, reason):
        # BUGFIX: Scrapy calls `closed(reason)` on spiders when the crawl
        # ends; `close_spider` is an item-pipeline hook and was never
        # invoked here, leaking the Chrome process.
        print('關閉瀏覽器對象')
        self.driver.quit()

    def close_spider(self, spider):
        # Backward-compatible alias for callers of the original method name.
        self.closed(reason=None)
寫入 MongoDB：
全部文件下載:
總結
以上是生活随笔為你收集整理的《python爬取B站排行榜：Scrapy + Selenium 抓取 Bilibili 漫画应援排行榜（动态加载页面）》的全部內容，希望文章能夠幫你解決所遇到的問題。
- 上一篇: linux中cut -c命令,Linux
- 下一篇: 个性化电脑3D桌面软件选择方案