Web Scraping with Python (2): Wikipedia
Jumping to random links
We grab the article links from a Wikipedia page and jump to one of them at random. The sidebar and footer may contain other links that we don't want, so we restrict the search to the article body, which sits in the div tag whose id is bodyContent.
import random
import re

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

start_url = '/wiki/Wiki'


def get_links(url):
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # article links look like /wiki/some_words
    link_list = soup.find('div', id='bodyContent').find_all(
        'a', href=re.compile(r'^/wiki/[^/]*$'))
    return link_list


links = get_links(start_url)
while len(links) > 0:
    # pick one link at random
    link = random.choice(links).get('href')
    print(link)
    # the new article replaces the previous one, so the walk keeps going
    links = get_links(link)

Sample output:

/wiki/Personal_wiki
/wiki/Database_management_system
/wiki/Netezza
/wiki/C%2B%2B
/wiki/C%2B%2B#Standardization
/wiki/ISO_9984
/wiki/Georgian_script
...

Next, starting from the main page, we put every article link on that page into a set (which removes duplicates), then walk the set and search recursively from each link in it.
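One detail worth noticing before the next snippet: the pattern r'^/wiki/[^/]*$' used above still lets through section anchors such as /wiki/C%2B%2B#Standardization, and it also accepts namespace pages. The next version tightens it to r'^/wiki/[^:/]*$', which additionally skips any path containing a colon, such as /wiki/Category:Linguistics. A quick check of the difference (my own illustration, not part of the original post):

import re

loose = re.compile(r'^/wiki/[^/]*$')    # pattern used in the snippet above
strict = re.compile(r'^/wiki/[^:/]*$')  # pattern used in the next snippet

samples = ['/wiki/Wiki',                     # ordinary article: matched by both
           '/wiki/Category:Linguistics',     # namespace page: only the loose pattern matches
           '/wiki/C%2B%2B#Standardization',  # section anchor: matched by both
           '/wiki/Main_Page/extra']          # extra path segment: matched by neither

for href in samples:
    print(href, bool(loose.match(href)), bool(strict.match(href)))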
import re

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

pages = set()


def get_links(url):
    global pages
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # article links look like /wiki/some_words
    link_list = soup.find('div', id='bodyContent').find_all(
        'a', href=re.compile(r'^/wiki/[^:/]*$'))
    for link in link_list:
        if link['href'] not in pages:
            new_page = link['href']
            pages.add(new_page)
            print(new_page)
            get_links(new_page)


if __name__ == '__main__':
    # the empty string means the URL is the Wikipedia home page, https://en.wikipedia.org
    get_links('')
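One practical caveat about this recursive crawl (my own note, not something the original post raises): every new link adds another stack frame, and Python's default recursion limit is about 1000, so on a site the size of Wikipedia the crawl will eventually stop with a RecursionError. Below is a minimal sketch of an iterative variant that uses an explicit queue instead of the call stack; the max_pages cap is an assumption added here so the example terminates on its own.

from collections import deque
import re

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}


def crawl_without_recursion(start_url='', max_pages=50):
    # breadth-first walk driven by an explicit queue rather than recursive calls
    pages = set()
    queue = deque([start_url])
    while queue and len(pages) < max_pages:
        url = queue.popleft()
        r = requests.get('https://en.wikipedia.org' + url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        for link in soup.find('div', id='bodyContent').find_all(
                'a', href=re.compile(r'^/wiki/[^:/]*$')):
            href = link['href']
            if href not in pages:
                pages.add(href)
                print(href)
                queue.append(href)


if __name__ == '__main__':
    crawl_without_recursion('')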
Getting an article's title and body

The title is in the h1 tag, and the body text is in the div tag whose id is mw-content-text.
import re

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

pages = set()


def get_links(url):
    global pages
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        print(soup.h1.string)
        # only print the first paragraph
        print(soup.find(id='mw-content-text').find('p').text)
    except AttributeError:
        print('This page is missing some attributes.')

    # article links look like /wiki/some_words
    link_list = soup.find('div', id='bodyContent').find_all(
        'a', href=re.compile(r'^/wiki/[^:/]*$'))
    for link in link_list:
        if link['href'] not in pages:
            new_page = link['href']
            pages.add(new_page)
            print('----------\n' + new_page)
            get_links(new_page)


if __name__ == '__main__':
    # the empty string means the URL is the Wikipedia home page, https://en.wikipedia.org
    get_links('')

Sample output:

Main Page
Noye's Fludde is a one-act opera written largely for young amateur performers, created by the British composer Benjamin Britten. First performed in 1958 at the annual Aldeburgh Festival, it is based on the 15th-century Chester "mystery" play which recounts the biblical story of Noah, the flood and the ark. Britten had written numerous works for mixed ...
----------
/wiki/Wikipedia
Wikipedia
Wikipedia (/ˌwɪkɪˈpiːdiə/ (listen) or /ˌwɪkiˈpiːdiə/ (listen) WIK-i-PEE-dee-ə) is a free online encyclopedia with the aim to allow anyone to edit articles.[3] Wikipedia is the largest and most popular general reference work on the Internet[4][5][6] and is ranked among the ten most popular websites.[7] Wikipedia is owned by the nonprofit Wikimedia Foundation.[8][9][10]
----------
/wiki/Main_Page
...
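A small caveat about the title line (my own note, not from the original post): soup.h1.string returns None whenever the h1 tag wraps more than one child node, so some pages would print None even though the title text is there; get_text() is the more forgiving accessor. A tiny self-contained illustration:

from bs4 import BeautifulSoup

# an h1 with a nested span, as rendered pages sometimes have
doc = BeautifulSoup('<h1><span>Main</span> Page</h1>', 'lxml')
print(doc.h1.string)      # None, because the tag has more than one child
print(doc.h1.get_text())  # 'Main Page'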
Finding external links

Starting from https://www.oreilly.com, we keep looking for external links. If a page has no external links, we step into one of its internal links and then look for external links again from there. This example doesn't feel ideal, because following other sites' external links can eventually lead back to the starting page.
import re
import random

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}


def get_random_external_link(start_page):
    r = requests.get(start_page, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # the first element of the split address is usually the site's home address
    ex_links = get_external_links(soup, split_address(start_page)[0])
    # if this page has no external links, collect its internal links, pick one at random,
    # and recurse until an external link turns up
    if len(ex_links) == 0:
        internal_links = get_internal_links(soup, split_address(start_page)[0])
        return get_random_external_link(random.choice(internal_links))
    else:
        return random.choice(ex_links)


def get_internal_links(bs, include_url):
    internal_links = []
    # links that start with '/' or contain the site address are internal links
    in_links = bs.find_all('a', href=re.compile(r'^/|' + include_url))
    for link in in_links:
        if link['href'] not in internal_links:
            internal_links.append(link['href'])
    return internal_links


def get_external_links(bs, exclude_url):
    external_links = []
    # links that start with http or https and do not contain the internal address are
    # external links; (?!...) means "not containing"
    ex_links = bs.find_all('a', href=re.compile(r'^(https|http)((?!' + exclude_url + ').)*$'))
    for link in ex_links:
        if link['href'] not in external_links:
            external_links.append(link['href'])
    return external_links


def split_address(address):
    address_parts = []
    if address.split(':')[0] == 'http':
        address_parts = address.replace('http://', '').split('/')
    elif address.split(':')[0] == 'https':
        address_parts = address.replace('https://', '').split('/')
    return address_parts


# only follow external links
def follow_external_only(url):
    external_link = get_random_external_link(url)
    print(external_link)
    follow_external_only(external_link)


all_ex_links = set()
all_in_links = set()


# collect all external and internal links, printing the external ones
def get_all_external_links(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    internal_links = get_internal_links(soup, split_address(url)[0])
    external_links = get_external_links(soup, split_address(url)[0])
    for link in external_links:
        if link not in all_ex_links:
            all_ex_links.add(link)
            print(link)
    for link in internal_links:
        if link not in all_in_links:
            all_in_links.add(link)
            # relative internal links such as '/about' need the site root put back
            # before they can be requested, otherwise requests raises MissingSchema
            if link.startswith('/'):
                link = 'https://' + split_address(url)[0] + link
            get_all_external_links(link)


if __name__ == '__main__':
    # follow_external_only('https://www.oreilly.com')
    get_all_external_links('https://www.oreilly.com')

Sample output:

https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
http://shop.oreilly.com/
http://members.oreilly.com
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now
https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170203+homepage+sign+in
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+get+started+now
https://www.safaribooksonline.com/public/free-trial/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+start+free+trial
https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+sign+in
...

The code above fails fairly often, possibly because of the regex matching or because of network problems.
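Two small hardening tweaks that target exactly those failure modes (a sketch of my own, not part of the original code; get_page and external_href_pattern are hypothetical helper names): escape the site address before splicing it into the regex, so characters such as '+' or '?' in a URL cannot break the pattern, and catch request-level exceptions so a single bad link does not abort the whole crawl.

import re

import requests


def get_page(url, headers, timeout=10):
    # fetch a page, returning None instead of raising when the request fails
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.text
    except requests.exceptions.RequestException as exc:
        print('request failed:', url, exc)
        return None


def external_href_pattern(exclude_url):
    # same idea as in get_external_links, but the address is escaped first so that
    # any regex metacharacters inside it are treated literally
    return re.compile(r'^(https|http)((?!' + re.escape(exclude_url) + ').)*$')

The homemade split_address could likewise be replaced with urllib.parse.urlparse(address).netloc, which extracts the host without any string surgery.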
by @sunhaiyu
2017.7.14
Reposted from: https://www.cnblogs.com/sun-haiyu/p/7181771.html