Crawling All Campus News
1. Fetch a single news item's title, link, time, source, body, and click count, and wrap the logic in a function.
import requests
from bs4 import BeautifulSoup

network = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(network)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        title = news.select('.news-list-title')[0].text
        url = news.select('a')[0]['href']
        time = news.select('.news-list-info')[0].contents[0].text
        main = news.select('.news-list-description')[0].text
        source = news.select('.news-list-info')[0].contents[1].text
        print('Link: {}'.format(url))
        print('Title: {}'.format(title))
        print('Body: {}'.format(main))
        print('Time: {}'.format(time))
        print('Source: {}'.format(source))
        # Fetch the article page itself for the full body text
        res1 = requests.get(url)
        res1.encoding = 'utf-8'
        soup1 = BeautifulSoup(res1.text, 'html.parser')
        passage = soup1.select('.show-content')
        # Click count comes from a separate counter API; the id is
        # hardcoded to 8307 here and parameterized in the later versions
        click = int(requests.get('http://oa.gzcc.cn/api.php?op=count&id=8307&modelid=80').text.split('.')[-1].lstrip("html('").rstrip("');"))
        print('Clicks: {}'.format(click))
        break
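The task statement says to wrap this in a function, which the snippet above never actually does. A minimal sketch of one way to package it, reusing the same selectors (the function name getnewsdetail and the dict return shape are assumptions, and the click-count id stays hardcoded exactly as above):

import requests

def getnewsdetail(news):
    # news: one <li> BeautifulSoup element that contains a .news-list-title
    title = news.select('.news-list-title')[0].text
    url = news.select('a')[0]['href']
    time = news.select('.news-list-info')[0].contents[0].text
    source = news.select('.news-list-info')[0].contents[1].text
    main = news.select('.news-list-description')[0].text
    # Click-count id hardcoded to 8307, as in the loop above
    click = int(requests.get('http://oa.gzcc.cn/api.php?op=count&id=8307&modelid=80').text.split('.')[-1].lstrip("html('").rstrip("');"))
    return {'title': title, 'url': url, 'time': time,
            'source': source, 'description': main, 'clicks': click}

It would be called as getnewsdetail(news) inside the same for news in soup.select('li') loop.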
2. Fetch the above details for every news item on one list page, and wrap that in a function.
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

network = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(network)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

def getclick(newsurl):
    # Pull the numeric article id out of the article URL
    newsid = re.match('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_(.*).html', newsurl).groups()[0].split('/')[1]
    # Insert the article's own id into the counter API URL
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    click = int(requests.get(clickurl).text.split('.')[-1].lstrip("html('").rstrip("');"))
    return click

for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        title = news.select('.news-list-title')[0].text
        url = news.select('a')[0]['href']
        time = news.select('.news-list-info')[0].contents[0].text
        timed = datetime.strptime(time, '%Y-%m-%d')
        main = news.select('.news-list-description')[0].text
        source = news.select('.news-list-info')[0].contents[1].text
        print('Link: {}'.format(url))
        print('Title: {}'.format(title))
        print('Body: {}'.format(main))
        print('Time: {}'.format(timed))
        print('Source: {}'.format(source))
        res1 = requests.get(url)
        res1.encoding = 'utf-8'
        soup1 = BeautifulSoup(res1.text, 'html.parser')
        passage = soup1.select('.show-content')
        click = getclick(url)
        print('Clicks: {}'.format(click))
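For reference, here is how that regex pulls the article id out of a URL. The sample URL below is an assumption shaped to match the pattern, not one taken from the site:

import re

sample = 'http://news.gzcc.cn/html/2017/xiaoyuanxinwen_1012/8307.html'  # hypothetical URL
m = re.match('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_(.*).html', sample)
print(m.groups()[0])                 # 1012/8307
print(m.groups()[0].split('/')[1])   # 8307 -- the id sent to the count API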
3. Get the URLs of all the news list pages and call the functions above on each one.
import requests
from bs4 import BeautifulSoup
import re

url_main = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(url_main)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
li = soup.select('li')

def gethits(url_1):
    li_id = re.search('_.*/(.*).html', url_1).group(1)
    hits = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id)).text.split('.')[-1].rstrip("');").lstrip("html('")
    return hits

def getpageinfo(label):
    for title_list in label:
        if len(title_list.select('.news-list-title')) > 0:
            href = title_list.select('a')[0]['href']
            title = title_list.select('.news-list-title')[0].text
            time = title_list.select('span')[0].text
            info = title_list.select('span')[1].text
            res_list = requests.get(href)
            res_list.encoding = 'utf-8'
            soup_list = BeautifulSoup(res_list.text, 'html.parser')
            text_list = soup_list.select('.show-content')[0].text
            hits_list = gethits(href)

getpageinfo(li)

# The pager's total item count (e.g. '1234條') determines the page count,
# at 10 news items per list page
pages = int(soup.select('.a1')[0].text.rstrip('條')) // 10 + 1
for i in range(2, pages + 1):
    url_page = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    res_page = requests.get(url_page)
    res_page.encoding = 'utf-8'
    soup_page = BeautifulSoup(res_page.text, 'html.parser')
    list_page = soup_page.select('li')  # select from the current page, not the first one
    getpageinfo(list_page)
    print(url_page)
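A quick worked example of that page-count formula, with an assumed total (1234 is illustrative, not a figure from the site):

total = 1234             # pretend soup.select('.a1')[0].text.rstrip('條') gave '1234'
pages = total // 10 + 1  # 123 full pages of 10 items, plus the partial page -> 124
print(pages)             # 124

Note that this formula counts one page too many when the total is an exact multiple of 10.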
4. Put it all together and crawl every campus news page.

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def getclick(newsurl):
    newsid = re.match('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_(.*).html', newsurl).groups()[0].split('/')[1]
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    click = int(requests.get(clickurl).text.split('.')[-1].lstrip("html('").rstrip("');"))
    return click

def getonepage(listurl):
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text
            url = news.select('a')[0]['href']
            time = news.select('.news-list-info')[0].contents[0].text
            timed = datetime.strptime(time, '%Y-%m-%d')
            main = news.select('.news-list-description')[0].text
            source = news.select('.news-list-info')[0].contents[1].text
            print('Link: {}'.format(url))
            print('Title: {}'.format(title))
            print('Body: {}'.format(main))
            print('Time: {}'.format(timed))
            print('Source: {}'.format(source))
            res1 = requests.get(url)
            res1.encoding = 'utf-8'
            soup1 = BeautifulSoup(res1.text, 'html.parser')
            click = getclick(url)
            print('Clicks: {}'.format(click))

# First page, then every remaining list page
getonepage('http://news.gzcc.cn/html/xiaoyuanxinwen/index.html')
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
page = int(soup.select('.a1')[0].text.rstrip('條')) // 10 + 1
for i in range(2, page + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getonepage(listurl)
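One caveat with getclick: its regex hardcodes the /html/2017/ path segment, so re.match returns None for articles from any other year and the crawl crashes. A sketch of a more tolerant variant (the relaxed pattern is an assumption about the site's URL scheme, not something verified against it):

import re
import requests

def getclick_any_year(newsurl):
    # Hypothetical variant: take the last numeric path segment as the id
    m = re.search(r'/(\d+)\.html$', newsurl)
    if m is None:
        return 0  # unknown URL shape: report zero rather than crash
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(m.group(1))
    return int(requests.get(clickurl).text.split('.')[-1].lstrip("html('").rstrip("');"))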
Reposted from: https://www.cnblogs.com/sisters/p/7655268.html