2019獨角獸企業重金招聘Python工程師標準>>>
可以把文字內容,圖片鏈接寫到數據庫;
可以把圖片下載至本地的日期文件夾里,并把本地的相對鏈接也寫到數據庫。
只要安裝了mysql,把上面的幾個配置換成自己的就OK了。
可以建一個定時任務,基本上所有內容都能爬下來(不包含評論和用戶信息)。
廢話不說,上代碼:
#!/usr/bin/env python
#encoding: utf-8
#author: zengqiu
import datetime
import errno
import os
import re
import socket
import urllib
import urllib2
import urlparse

import MySQLdb
from bs4 import BeautifulSoup
# MySQL connection settings used by every database helper in this script.
# Replace these with the values for your own server before running.
mysql_host = "localhost"
mysql_port = 3306
mysql_user = "root"
mysql_password = "test"
mysql_db_name = "qiushibaike"
mysql_table_name = "qiushibaike"
# Root directory for downloaded images; run() creates one sub-directory per
# date beneath it.
image_path = "/Users/admin/code/pythonCode/QB/QBImage"
def _build_request(url):
    """Return a urllib2 request for ``url`` carrying browser-like headers."""
    req = urllib2.Request(url)
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
    req.add_header('Cache-Control', 'max-age=0')
    req.add_header('Connection', 'keep-alive')
    req.add_header('Referer', url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1660.0 Safari/537.36')
    return req


def _thumbnail_src(content):
    """Return the image URL attached to a post element, or None if it has none."""
    thumb = content.findNext("div")
    # Posts without a picture are followed by some other <div> (or nothing),
    # so every step of the thumb -> a -> img chain may be missing.
    if thumb is None or thumb.get('class') != [u'thumb']:
        return None
    link = thumb.a
    img = link.img if link is not None else None
    if img is None:
        return None
    return img.get('src')


def _parse_post(content):
    """Convert one ``<div class="content" title=...>`` element into a dict."""
    result = {'content': content.text}
    src = _thumbnail_src(content)
    if src is not None:
        result['image'] = src
    # The caller only selects elements that carry a title attribute.
    result['date'] = content['title']
    return result


def spider(url):
    """Fetch one listing page and extract the posts it contains.

    Args:
        url: Address of the page to fetch; it is also sent as the Referer.

    Returns:
        A list of dicts, one per ``<div class="content">`` element that has a
        ``title`` attribute. Each dict holds ``'content'`` (the element's
        text) and ``'date'`` (its ``title`` attribute), plus ``'image'`` (the
        thumbnail ``src``) when the following ``<div>`` is a ``thumb`` block
        with a linked image. The list is empty when the page has no posts.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails.
    """
    response = urllib2.urlopen(_build_request(url))
    try:
        html = response.read()
    finally:
        # Release the socket even if reading the body fails.
        response.close()
    # NOTE(review): no parser is named here, so bs4 chooses one itself; pin a
    # parser if results must be identical across machines.
    soup = BeautifulSoup(html)
    return [_parse_post(content) for content in soup.findAll("div", "content", title=True)]
def create_database(database):
    """Create the MySQL database ``database`` unless it already exists.

    Connects with the module-level ``mysql_*`` settings (no default database
    selected). A failing statement is rolled back and reported on stdout
    instead of being raised, so callers can treat this as best-effort.

    Args:
        database: Name of the database. It is interpolated into the SQL text
            as-is, so it must be a trusted identifier.

    Raises:
        MySQLdb.Error: If the connection itself cannot be established.
    """
    conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, port=mysql_port)
    try:
        cur = conn.cursor()
        try:
            # IF NOT EXISTS keeps an already-present database from being
            # reported as a failure.
            cur.execute("create database if not exists %s" % (database))
            conn.commit()
        except MySQLdb.Error as exc:
            conn.rollback()
            print("create database %s failed: %s" % (database, exc))
    finally:
        # Close the connection on every path, including unexpected errors.
        conn.close()
def create_table(table):
    """Create the posts table ``table`` in ``mysql_db_name`` unless it exists.

    Columns: ``id`` (auto-increment primary key), ``content``, ``image``,
    ``date`` and ``location``. A failing statement is rolled back and reported
    on stdout instead of being raised.

    Args:
        table: Name of the table. It is interpolated into the SQL text as-is,
            so it must be a trusted identifier.

    Raises:
        MySQLdb.Error: If the connection itself cannot be established.
    """
    conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, db=mysql_db_name, port=mysql_port, charset="utf8")
    try:
        cur = conn.cursor()
        # The unique key on (first 200 characters of content, date) is what
        # lets insert() skip posts that were already stored.
        sql = (
            "CREATE TABLE IF NOT EXISTS %s ("
            "`id` int(11) NOT NULL AUTO_INCREMENT, "
            "`content` varchar(10000) NULL, "
            "`image` varchar(1000) NULL, "
            "`date` datetime NULL, "
            "`location` varchar(1000) NULL, "
            "CONSTRAINT entry UNIQUE (`content`(200), `date`), "
            "PRIMARY KEY (`id`)"
            ") ENGINE=MyISAM DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci"
        ) % (table)
        try:
            cur.execute(sql)
            conn.commit()
        except MySQLdb.Error as exc:
            conn.rollback()
            print("create table %s failed: %s" % (table, exc))
    finally:
        # Close the connection on every path, including unexpected errors.
        conn.close()
def insert(table, date, content, image="", location=""):
    """Store one post, skipping it if an identical row is already present.

    Uses ``insert ignore`` so a row that collides with an existing unique key
    is dropped by the server. Database errors are rolled back and reported on
    stdout instead of being raised.

    Args:
        table: Target table name. It is concatenated into the SQL text as-is,
            so it must be a trusted identifier.
        date: Post timestamp as a ``'%Y-%m-%d %H:%M:%S'`` string.
        content: Text of the post.
        image: Original image URL, or "" when the post has no image.
        location: Path of the downloaded image relative to the image root, or
            "" when nothing was downloaded.

    Raises:
        ValueError: If ``date`` does not match the expected format.
        MySQLdb.Error: If the connection itself cannot be established.
    """
    conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, db=mysql_db_name, port=mysql_port, charset="utf8")
    try:
        cur = conn.cursor()
        sql = "insert ignore into " + table + "(date, content, image, location) values(%s, %s, %s, %s)"
        # Parsing happens inside the try/finally so a malformed date cannot
        # leave the connection open.
        params = (datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S'), content, image, location)
        try:
            # Values are passed separately so the driver escapes them.
            cur.execute(sql, params)
            conn.commit()
        except MySQLdb.Error as exc:
            conn.rollback()
            print("insert into %s failed: %s" % (table, exc))
    finally:
        conn.close()
def download(url, path):
    """Download ``url`` into directory ``path`` unless the file is already there.

    The local file name is the last segment of the URL's path component
    (query string and fragment are ignored).

    Args:
        url: Address of the file to fetch.
        path: Existing directory to store the file in.

    Returns:
        The file name (without directory) under which the file is stored.

    Raises:
        IOError: If the URL path ends in "/" and so names no file, or if the
            transfer or the final rename fails.
    """
    filename = urlparse.urlparse(url).path.rsplit('/', 1)[-1]
    if not filename:
        # Without a name the target would be the directory itself.
        raise IOError("no file name in URL: %s" % url)
    filepath = os.path.join(path, filename)
    if not os.path.isfile(filepath):
        # Fetch into a scratch file first: an interrupted transfer must not
        # leave a truncated file that later calls would mistake for complete.
        partial = filepath + ".part"
        try:
            urllib.urlretrieve(url, partial)
            os.rename(partial, filepath)
        finally:
            # After a successful rename the scratch file is gone; otherwise
            # remove whatever was left behind.
            if os.path.exists(partial):
                os.remove(partial)
    return filename
def makedir(path):
    """Create directory ``path`` and any missing parents; no-op if it exists.

    Args:
        path: Directory to create.

    Raises:
        OSError: If creation fails for any reason other than the path already
            existing.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Attempting the creation and tolerating "already exists" avoids the
        # race between a separate existence check and the makedirs call.
        if exc.errno != errno.EEXIST:
            raise
def run():
    """Crawl listing pages from page 1 upward and store every post found.

    First probes the configured database; if the connection fails, the
    database and table are created. Then each page is fetched with spider()
    and each post inserted. For posts with an image, the image is downloaded
    into ``image_path/<date>/`` and its relative path stored alongside.
    Crawling stops at the first page that yields no posts.
    """
    try:
        conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, db=mysql_db_name, port=mysql_port, charset="utf8")
        conn.close()
    except MySQLdb.Error:
        # The usual cause is that the database does not exist yet.
        create_database(mysql_db_name)
        create_table(mysql_table_name)

    page = 1
    while True:
        print("page is %d" % page)
        url = "http://www.qiushibaike.com/8hr/page/%d" % page
        results = spider(url)
        if not results:
            break
        for result in results:
            if 'image' not in result:
                insert(mysql_table_name, result['date'], result['content'])
                continue
            # Images are grouped by the date part of the post's timestamp.
            subpath = result['date'].split(' ')[0]
            newpath = os.path.join(image_path, subpath)
            makedir(newpath)
            try:
                filename = download(result['image'], newpath)
                location = os.path.join(subpath, filename)
                insert(mysql_table_name, result['date'], result['content'], result['image'], location)
            except Exception:
                # Best-effort: one bad post must not stop the crawl. Report
                # the image URL, because the local file name is not known
                # when the download itself is what failed.
                print(result['image'] + " is not exist")
        page += 1
def main():
    """Print the usage hint, then start the crawl."""
    print('Please use it as ./qiushibaike.py')
    run()
# Start crawling only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
基于代碼https://github.com/zengqiu/spider/blob/master/qiushibaike.py 修改,原代碼在我的Mac上執行有問題,修改了幾個bug才搞定,第一次嘗試修改python代碼,挺折騰的。
剩下的就是想在本地做一個網頁,可以訪問數據庫按日期瀏覽每天的糗百內容,估計要用php做,誰如果有空能做一下,就太感謝了。
參考:http://www.v2ex.com/t/131750
轉載于:https://my.oschina.net/ioslighter/blog/357376
總結
以上是生活随笔為你收集整理的抓糗百数据和图片的Python爬虫的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。