网络爬虫--25.【selenium实战】实现拉勾网爬虫之--selenium获取数据
生活随笔
收集整理的这篇文章,主要介绍了
网络爬虫--25.【selenium实战】实现拉勾网爬虫之--selenium获取数据
小编觉得挺不错的,现在分享给大家,帮大家做个参考。
代码实现
# encoding: utf-8
"""Lagou (lagou.com) job-listing crawler driven by Selenium.

Opens the Python job search results, walks every results page, opens each
posting in a new browser tab, extracts the posting's fields with lxml, and
accumulates them as dicts in ``self.positions``.
"""
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    # Path to the local chromedriver binary (Windows layout).
    driver_path = r"D:\Program Files\chromedriver_win32\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []  # accumulated job dicts, one per posting

    def run(self):
        """Walk every results page until the "next" pager button is disabled."""
        self.driver.get(self.url)
        while True:
            # BUGFIX: wait for the pager element to render *before*
            # snapshotting page_source. The original captured page_source
            # first, so a half-loaded page could be handed to the parser.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")))
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath(
                    "//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break  # last results page reached
                else:
                    next_btn.click()
            except Exception:
                # BUGFIX: was a bare ``except:`` that also swallowed
                # KeyboardInterrupt/SystemExit. Dump the source for debugging
                # and retry on the next loop iteration.
                print(source)
            time.sleep(1)  # throttle to avoid anti-scraping countermeasures

    def parse_list_page(self, source):
        """Extract every job-detail link from a results page and visit each."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle between detail requests

    def request_detail_page(self, url):
        """Open *url* in a new tab, parse it, then close it and switch back."""
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']")))
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab...
        self.driver.close()
        # ...and return to the job-listing tab.
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one posting's HTML and append its fields to self.positions."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        # Order of the request spans on the page: salary, city, experience,
        # education — presumably stable; verify against the live markup.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)  # drop the decorative " / " separators
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath(".//text()")[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
以上是生活随笔为你收集整理的网络爬虫--25.【selenium实战】实现拉勾网爬虫之--selenium获取数据的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 1.供给与需求分析(交通流理论)
- 下一篇: 临界资源和临界区