Implementing a web crawler in Python
I recently took part in a short Python crash-course competition held at my school and worked through the language once. I used to think Java was the best language out there, but Python turns out to have real strengths of its own: development is fast, the syntax is concise, and lists are pleasant to work with, so small programs really do come together faster than in Java ^^
Below is a record of the crawler I implemented in Python to fetch Baidu Baike entries containing "python" (technologies: Python, MySQL).
1. Crawler architecture and principles
The crawler has three core components, a URL manager, a page downloader, and a page parser, plus a scheduler and a data outputer.
URL manager: keeps track of all URLs, hands un-crawled URLs to the page downloader, and marks each handed-out URL as already crawled.
Page downloader: fetches the page behind a URL from the internet.
Page parser: extracts the data we care about from the page (in this example, the entry's title and summary) and collects further qualifying URLs from the page, storing them in MySQL for the URL manager to pick up.
Scheduler: the equivalent of Java's main method, i.e. the entry point of the crawler. It seeds the first URL and then, in a while loop, drives the URL manager, page downloader, and page parser in turn; a minimal sketch of this loop is shown below.
Data outputer: writes out the collected data.
(The architecture diagram from the original post is not reproduced here.)
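To make the control flow concrete, here is a stripped-down sketch of the scheduler loop. The class and module names match the full source listed later in this post; root_url is the seed URL, and error handling plus the page-count limit from the full code are omitted.

urls = url_manager.UrlManager()            # URL manager (backed by MySQL)
donloader = html_donloader.HtmlDonload()   # page downloader
parse = html_parse.HtmlParse()             # page parser
outputer = html_outputer.HtmlOutputer()    # data outputer

urls.init_url(root_url)                    # seed the frontier with the entry URL
while urls.has_new_url():
    new_url = urls.get_new_url()           # take one un-crawled URL, mark it crawled
    cont = donloader.donload(new_url)      # fetch the page
    new_urls, new_data = parse.parse(new_url, cont)   # extract links + title/summary
    urls.add_new_urls(new_urls)            # push newly found URLs into the frontier
    outputer.collect_data(new_url, new_data)          # store title/summary in MySQL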
2. Code framework
1. The MySQL database
Table: baike_spider; account: root, password: 0203 (the code connects to the database bigData).
CREATE TABLE `baike_spider` (
  `webSite` varchar(255) DEFAULT NULL,
  `isCraw`  int(1)       DEFAULT '0',
  `title`   varchar(255) DEFAULT NULL,
  `cont`    text,
  KEY `webSide` (`webSite`) USING HASH
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
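The isCraw column is what turns this table into a crawl queue: a row with isCraw=0 is a URL still waiting to be crawled, and isCraw=1 means it has already been processed. As a minimal sketch (connection values as above; the database name bigData is taken from the connect calls in the source below), the two statements the URL manager relies on look like this:

import MySQLdb as mdb

db = mdb.connect("localhost", "root", "0203", "bigData", charset="utf8")
cursor = db.cursor()

# take one URL that has not been crawled yet
cursor.execute("select webSite from baike_spider where isCraw=0 limit 1")
row = cursor.fetchone()
if row:
    url = row[0]
    # mark it as crawled so it is not handed out again
    cursor.execute("update baike_spider set isCraw=1 where webSite=%s", (url,))
    db.commit()
cursor.close()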
2. Source code layout
The crawler consists of a scheduler (class SpiderMain) plus four modules: url_manager, html_donloader, html_parse, and html_outputer. The full listings follow the results section below.
3. Crawl results:
Embarrassingly, only two or three of the displayed entries actually mention "python"…
Never mind, the rest must be in the part that didn't get displayed ^_^
The scheduler / entry point (class SpiderMain):

import url_manager, html_donloader, html_parse, html_outputer
import sys

# Python 2 idiom: force utf-8 as the default encoding so Chinese text can be printed and stored
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()            # URL manager
        self.donloader = html_donloader.HtmlDonload()   # page downloader
        self.parse = html_parse.HtmlParse()             # page parser
        self.outputer = html_outputer.HtmlOutputer()    # data outputer

    def craw(self, root_url):
        self.urls.init_url(root_url)
        count = 0
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                cont = self.donloader.donload(new_url)
                urls, new_data = self.parse.parse(new_url, cont)
                self.urls.add_new_urls(urls)
                if new_data is not None:
                    self.outputer.collect_data(new_url, new_data)
                    print "crawled %s, found %d new URLs, data: %s, %s" % (
                        new_url, len(urls), new_data["title"], new_data["summary"])
                print "crawling site #%d" % count
                if count == 1000:   # stop after 1000 pages
                    break
                count += 1
            except Exception, value:
                print "craw error:", value
        print "craw finished"


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/item/Python"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
url_manager.py — the URL manager, backed by the MySQL table above:

import MySQLdb as mdb


class UrlManager(object):
    def __init__(self):
        self.db = mdb.connect("localhost", "root", "0203", "bigData", charset="utf8")
        cursor = self.db.cursor()
        # rebuild the table on every run
        delete_sql = '''drop table if exists baike_spider'''
        create_sql = '''create table if not exists baike_spider(
                            webSite varchar(255),
                            isCraw int(1) default '0',
                            title varchar(255),
                            cont text,
                            KEY `webSide` (`webSite`) USING HASH)'''
        try:
            cursor.execute(delete_sql)
            cursor.execute(create_sql)
            cursor.execute("SET NAMES UTF8")
            self.db.commit()
        except Exception, value:
            print "UrlManager.__init__ error: ", value
            self.db.rollback()
        finally:
            cursor.close()

    def init_url(self, root_url):
        # seed the table with the entry URL
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            insert_sql = '''insert into baike_spider(webSite) values('%s')''' % root_url
            cursor.execute(insert_sql)
            self.db.commit()
        except Exception, value:
            self.db.rollback()
            print "UrlManager.init_url error: ", value
        finally:
            cursor.close()

    def has_new_url(self):
        # returns the number of rows still marked isCraw=0 (0 means nothing left to crawl)
        new = 0
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            select_sql = '''select isCraw from baike_spider where isCraw=0 limit 1'''
            new = cursor.execute(select_sql)
        except Exception, value:
            print "UrlManager.has_new_url error: ", value
        finally:
            cursor.close()
        return new

    def get_new_url(self):
        # take one un-crawled URL and mark it as crawled
        url = ""
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            select_sql = '''select * from baike_spider where isCraw=0 limit 1'''
            cursor.execute(select_sql)
            url = cursor.fetchone()[0]
            update_sql = '''update baike_spider set isCraw=1 where webSite='%s' '''
            cursor.execute(update_sql % url)
            self.db.commit()
        except Exception, value:
            self.db.rollback()
            print "UrlManager.get_new_url error: ", value
        finally:
            cursor.close()
        return url

    def add_new_urls(self, urls):
        # insert newly found URLs, skipping ones that are already in the table
        is_exist = '''select isCraw from baike_spider where webSite='%s' '''
        insert_sql = '''insert into baike_spider(webSite) values('%s')'''
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            for url in urls:
                flag = cursor.execute(is_exist % url)
                if flag:
                    continue
                cursor.execute(insert_sql % url)
                self.db.commit()
        except Exception, value:
            print "UrlManager.add_new_urls error: ", value
            self.db.rollback()
        finally:
            cursor.close()
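One caveat about UrlManager (and the other MySQL code in this post): the SQL is assembled with Python's % string formatting, so a URL containing a quote character would break the statement. MySQLdb can escape values itself when they are passed as a separate argument to execute(); as a sketch, the insert in add_new_urls could instead be written like this:

insert_sql = "insert into baike_spider(webSite) values(%s)"   # %s here is a driver placeholder, not string formatting
cursor.execute(insert_sql, (url,))                            # the driver escapes the value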
html_donloader.py — the page downloader:

import urllib2

class HtmlDonload():
    def __init__(self):
        pass

    def donload(self, url):
        # return the page HTML, or an empty string if the request fails
        cont = ""
        try:
            response = urllib2.urlopen(url)
            if response.getcode() == 200:
                cont = response.read()
        except Exception, value:
            print "HtmlDonload.donload error:", value
        return cont
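The downloader can be tried on its own. A quick sanity check against the root URL used by the scheduler (expect a non-empty string when the server answers with HTTP 200):

donloader = HtmlDonload()
html = donloader.donload("http://baike.baidu.com/item/Python")
print len(html)   # 0 only if the request failed or did not return 200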
html_parse.py — the page parser, built on BeautifulSoup:

import re
import urlparse
from bs4 import BeautifulSoup


class HtmlParse():
    def __init__(self):
        pass

    def _get_new_urls(self, page_url, soup):
        # collect links to other Baike entries (hrefs containing /item/)
        new_urls = set()
        links = soup.find_all(name='a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)  # resolve relative links
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, url, soup):
        # extract the entry's title and its summary paragraph
        res_data = {}
        title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data["title"] = title_node.get_text()
        summary_node = soup.find("div", class_="lemma-summary")
        res_data["summary"] = summary_node.get_text()
        return res_data

    def parse(self, url, cont):
        if cont is None or url is None:
            return
        soup = BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(url, soup)
        new_data = self._get_new_data(url, soup)
        return new_urls, new_data
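The link-extraction rule keeps only <a> tags whose href contains /item/ and resolves them against the current page URL. A small self-contained check (the HTML fragment below is made up purely for illustration):

import re
import urlparse
from bs4 import BeautifulSoup

html = '<a href="/item/Java">Java</a> <a href="/help">help</a>'
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all(name='a', href=re.compile(r"/item/"))
print [urlparse.urljoin("http://baike.baidu.com/item/Python", link['href']) for link in links]
# -> ['http://baike.baidu.com/item/Java']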
html_outputer.py — writes the parsed data back to MySQL and prints it:

import MySQLdb as mdb


class HtmlOutputer():
    def __init__(self):
        self.db = mdb.connect("localhost", "root", "0203", "bigData", charset="utf8")

    def collect_data(self, url, new_data):
        # the URL manager already inserted the row, so update it with title and summary
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            update_sql = '''update baike_spider set title='%s',cont='%s' where webSite='%s' '''
            cursor.execute(update_sql % (new_data["title"], new_data["summary"], url))
            self.db.commit()
        except Exception, value:
            self.db.rollback()
            print "HtmlOutputer.collect_data error: ", value
        finally:
            cursor.close()

    def print_data(self):
        # print the title and content of every crawled entry
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            select_sql = '''select * from baike_spider where isCraw=1 '''
            cursor.execute(select_sql)
            results = cursor.fetchall()
            for result in results:
                print result[2], result[3]
        except Exception, value:
            self.db.rollback()
            print "HtmlOutputer.print_data error: ", value
        finally:
            cursor.close()