爬取智联招聘(面向对象)
生活随笔
收集整理的这篇文章主要介绍了
爬取智联招聘(面向对象)
小编觉得挺不错的,现在分享给大家,帮大家做个参考。
有待完善
import codecs
import os
import re

import requests
import xlwt


class Item(object):
    """Container for the regex matches extracted from one result page.

    Every attribute starts as ``None`` and is replaced by a list of strings
    once ``getPosition.spider`` has parsed a page.  The attribute names are
    pinyin; the English glosses below are translations of those names.
    """

    zhiwei = None           # "position": matches of <b>...</b>
    gongzi = None           # "salary": contents of <td class="zwyx"> cells
    gongzuodidian = None    # "work location": contents of <td class="gzdd"> cells
    gongsimingcheng = None  # "company name": text following target="_blank">


class getPosition(object):
    """Scrape Zhaopin search-result pages and dump the extracted columns.

    Instantiating the class runs the whole pipeline: build the page URLs,
    download and parse every page, then print the rows and save them to a
    text file.
    """

    # Search URL without the trailing page number.
    _URL_HEAD = (
        'http://sou.zhaopin.com/jobs/searchresult.ashx'
        '?jl=%E5%85%A8%E5%9B%BD'
        '&kw=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88'
        '&sm=0&isfilter=0&fl=489&isadv=0'
        '&sg=aae5284f62664af8b14611bda6d68315&p='
    )

    # Compiled once here instead of once per downloaded page.
    _PAT_ZHIWEI = re.compile(r'<b>(.*?)</b>')
    _PAT_GONGZI = re.compile(r'<td class="zwyx">(.*?)</td>')
    _PAT_GONGZUODIDIAN = re.compile(r'<td class="gzdd">(.*?)</td>')
    _PAT_GONGSIMINGCHENG = re.compile(r'target="_blank">(.*?)<')

    def __init__(self):
        """Run the scrape for the first two result pages and save the rows."""
        self.urlBase = self._URL_HEAD + '1'
        self.urls = []
        self.items = []
        self.getUrls(2)
        self.spider(self.urls)
        self.save(self.items)

    def getHTML(self, url):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded response text, using the encoding that ``requests``
            detects from the body.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer in time.
        """
        kv = {'user-agent': 'Mozilla/5.0'}
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text

    def getUrls(self, pages):
        """Append the URLs of result pages 1..*pages* to ``self.urls``.

        Args:
            pages: Number of result pages to generate URLs for.
        """
        self.urls.extend(
            self._URL_HEAD + str(page) for page in range(1, pages + 1)
        )

    def spider(self, urls):
        """Download every URL and store one parsed ``Item`` per page.

        Args:
            urls: Iterable of page URLs to fetch and parse.
        """
        for url in urls:
            html = self.getHTML(url)
            # A fresh Item per page: reusing a single instance would make
            # every entry in self.items alias the last page's results.
            item = Item()
            item.zhiwei = self._PAT_ZHIWEI.findall(html)
            item.gongzi = self._PAT_GONGZI.findall(html)
            item.gongzuodidian = self._PAT_GONGZUODIDIAN.findall(html)
            item.gongsimingcheng = self._PAT_GONGSIMINGCHENG.findall(html)
            self.items.append(item)
            print(len(item.zhiwei))
            print(len(item.gongzi))

    def save(self, items, fileName='1a.txt'):
        """Print every extracted row and write it to *fileName* as UTF-8.

        Args:
            items: Iterable of ``Item`` objects produced by ``spider``.
            fileName: Path of the output file; it is overwritten.
        """
        tplt = "{0:^10}\t{1:<10}\t{2:^10}\t{3:^10}"
        with codecs.open(fileName, 'w', 'utf-8') as f:
            for item in items:
                # zip() stops at the shortest list, so a page with fewer
                # matches than expected no longer raises IndexError.
                # NOTE(review): the four lists are matched independently, so
                # row i is only one coherent record if every pattern matches
                # exactly once per listing -- verify against the live HTML.
                rows = zip(
                    item.zhiwei or [],
                    item.gongzi or [],
                    item.gongzuodidian or [],
                    item.gongsimingcheng or [],
                )
                for row in rows:
                    line = tplt.format(*row)
                    print(line)
                    # codecs.open does no newline translation, so the line
                    # ending is written explicitly.
                    f.write(line + '\r\n')


if __name__ == '__main__':
    p = getPosition()

# Summary  (page heading "总结" that was fused onto the end of the code line)
以上是生活随笔为你收集整理的爬取智联招聘(面向对象)的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 几个Python运算符
- 下一篇: 爬取微博好友所发微博制作词云