Implementing an Instagram Web Crawler in Python
- Instagram crawler
- Background
- Crawler design approach
- Implementation steps
- Code
- Data display
- Brief data analysis
 
Instagram crawler
Background
Instagram is one of the largest social media platforms in the world: a huge photo-sharing community where users everywhere can share photos and moments of their lives with friends in a fast, polished, and fun way, spreading and interacting with information in real time.
Using Python, we collect each account's basic profile information (bio, post count, following count, follower count) together with its posted image information (image file, posting time, like count, comment count). After filtering, the data is stored in a database and then processed and analyzed.
 
 
Crawler design approach
1. Determine the URL of the page to crawl.
2. Fetch the corresponding HTML page over HTTP/HTTPS.
3. Extract the useful data from the HTML.
4. Save any useful data; if the page contains further URLs, return to step 2 (a minimal sketch of this loop follows the list).
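These four steps amount to a classic fetch-parse-store loop. A minimal illustrative sketch, where the `parse` and `save` callables are placeholders and not part of the crawler below:

```python
from collections import deque

import requests

def crawl(seed_url, parse, save):
    """Generic fetch-parse-store loop: parse() returns (records, next_urls)."""
    queue, seen = deque([seed_url]), {seed_url}
    while queue:
        url = queue.popleft()
        html = requests.get(url).text      # step 2: fetch over HTTP/HTTPS
        records, next_urls = parse(html)   # step 3: extract useful data
        for record in records:             # step 4a: save useful data
            save(record)
        for next_url in next_urls:         # step 4b: enqueue unseen URLs, back to step 2
            if next_url not in seen:
                seen.add(next_url)
                queue.append(next_url)
```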
Implementation steps
1. Set up a proxy service so that Instagram pages can be reached.
2. Use Python to crawl 10,000 followers of the account "Instagram" (a hedged sketch of this request follows the list).
3. For each of these users, crawl their profile information, posted images, and the corresponding like and comment counts.
4. Store the results in the database and run basic statistics on the collected data.
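The code below starts from a prepared source.csv of usernames; the follower-collection step itself is not shown. As a hedged sketch only: the request headers in the code section include a query_hash for the follower list (c76146de99bb02f6415203be841dd25a), and a page of followers could plausibly be requested as follows. The variables layout and response shape are assumptions based on Instagram's web API at the time and may have changed since.

```python
import json
import requests

# Query hash taken from the 'path' request header shown in the code section below.
FOLLOWERS_QUERY_HASH = 'c76146de99bb02f6415203be841dd25a'

def get_follower_page(user_id, first=24, after=None, headers=None, proxies=None):
    """Fetch one page of followers for user_id via the GraphQL endpoint (assumed layout)."""
    variables = {'id': user_id, 'include_reel': True, 'fetch_mutual': True, 'first': first}
    if after:
        variables['after'] = after  # pagination cursor from the previous page
    resp = requests.get('https://www.instagram.com/graphql/query/',
                        params={'query_hash': FOLLOWERS_QUERY_HASH,
                                'variables': json.dumps(variables)},
                        headers=headers, proxies=proxies, timeout=10)
    return resp.json()
```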
 
Code
Import packages
```python
import pandas as pd
import traceback
import random
import os
import re
import sys
import json
import time
import requests
import pymongo
from pyquery import PyQuery as pq
```
Connect to the MongoDB database

```python
client = pymongo.MongoClient(host="localhost", port=27017)
db = client['instagram']
table_user = db['user']
table_post = db['post']

# Profile page and GraphQL pagination URL templates.
url_base = 'https://www.instagram.com/{}/'
uri = ('https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c'
       '&variables=%7B%22id%22%3A%22{user_id}%22%2C%22first%22%3A12%2C%22after%22%3A%22{cursor}%22%7D')
```
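As an optional addition (not in the original script), unique indexes on the keys used by the upserts below keep the update_one lookups fast and guard against duplicate documents:

```python
# Assumption: these indexes are an optional addition, not part of the original script.
table_user.create_index('id', unique=True)
table_post.create_index('post_id', unique=True)
```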
Proxy (VPN) port

```python
proxies = {
    'http': 'http://127.0.0.1:11000',
    'https': 'http://127.0.0.1:11000',
}

headers = {
    'authority': 'www.instagram.com',
    'method': 'GET',
    'path': '/graphql/query/?query_hash=c76146de99bb02f6415203be841dd25a&variables=%7B%22id%22%3A%221507979106%22%2C%22include_reel%22%3Atrue%2C%22fetch_mutual%22%3Atrue%2C%22first%22%3A24%7D',
    'scheme': 'https',
    'cookie': 'mid=XFZ8GwALAAGT7q5EpO0c2fInHptQ; mcd=3; ds_user_id=5908418817; ig_did=85FF1DEF-2EFC-4878-BE84-B1C8BE6CE8BD; ig_cb=1; fbm_124024574287414=base_domain=.instagram.com; csrftoken=xPM4ERPXxmLCIPwHLSqYh5kzSXqqZCkL; sessionid=5908418817%3A7v38AuQX9lztEW%3A4; shbid=10185; shbts=1584248837.7816854; rur=ASH; urlgen="{\"94.190.208.156\": 4760}:1jDLb7:htq8GLniOckdSATmm3SUTWBuN3o"',
    'referer': 'https://www.instagram.com/skuukzky/followers/?hl=zh-cn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'x-csrftoken': 'xPM4ERPXxmLCIPwHLSqYh5kzSXqqZCkL',
    'x-ig-app-id': '936619743392459',
    'x-ig-www-claim': 'hmac.AR2iHSXFdGL67VxTW3jLLPH5WiFoUjzDxhEbyxgHYhXH5Y4x',
    'x-requested-with': 'XMLHttpRequest',
}
```
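Before starting a long crawl it is worth checking that the proxy and the session cookie actually reach Instagram; a quick illustrative check (not in the original script):

```python
# Sanity check: expect 200 if the proxy is up and the session cookie is still valid.
resp = requests.get('https://www.instagram.com/', headers=headers, proxies=proxies, timeout=10)
print(resp.status_code)
```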
Fetch the page HTML

```python
def get_html(url):
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code == 200:
            return response.text
        else:
            print('Error fetching page HTML, status code:', response.status_code)
    except Exception as e:
        print(e)
        return None
```
Fetch a JSON response

```python
def get_json(url):
    try:
        response = requests.get(url, headers=headers, timeout=10, proxies=proxies)
        if response.status_code == 200:
            return response.json()
        else:
            print('Error fetching JSON, status code:', response.status_code)
    except Exception as e:
        print(e)
        # Back off for 60-100 seconds, then retry recursively.
        time.sleep(60 + float(random.randint(1, 4000)) / 100)
        return get_json(url)
```
Fetch binary content

```python
def get_content(url):
    try:
        response = requests.get(url, headers=headers, timeout=10, proxies=proxies)
        if response.status_code == 200:
            return response.content
        else:
            print('Error fetching photo binary stream, status code:', response.status_code)
    except Exception as e:
        print(e)
        return None
```
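Note that get_content is defined but never called in the rest of the script, which stores image URLs only. If you also want the photos on disk, a hypothetical helper (save_image and the images folder are illustrative, not from the original) could reuse it:

```python
def save_image(url, folder='images'):
    """Illustrative helper: download one photo with get_content and write it to disk."""
    os.makedirs(folder, exist_ok=True)
    data = get_content(url)
    if data:
        # Derive a file name from the URL path, dropping the query string.
        name = url.split('/')[-1].split('?')[0] or 'image.jpg'
        with open(os.path.join(folder, name), 'wb') as f:
            f.write(data)
```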
Scrape the profile and posts

```python
def get_urls(html):
    id, username = '', ''
    user_id = re.findall('"profilePage_([0-9]+)"', html, re.S)[0]
    print('user_id:' + user_id)
    doc = pq(html)
    items = doc('script[type="text/javascript"]').items()
    for item in items:
        if item.text().strip().startswith('window._sharedData'):
            # Strip 'window._sharedData = ' and the trailing ';' to get bare JSON.
            js_data = json.loads(item.text()[21:-1])
            user = js_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]
            id = user['id']
            username = user['username']
            fullname = user['full_name']
            intro = user['biography']
            following_count = user['edge_follow']['count']
            followed_count = user['edge_followed_by']['count']
            post_count = user['edge_owner_to_timeline_media']['count']
            # Upsert the profile record.
            table_user.update_one({'id': id}, {'$set': {
                'id': id, 'username': username, 'fullname': fullname, 'intro': intro,
                'following_count': following_count, 'followed_count': followed_count,
                'post_count': post_count}}, True)
            # The first page of posts is embedded in the profile HTML.
            edges = user["edge_owner_to_timeline_media"]["edges"]
            page_info = user["edge_owner_to_timeline_media"]['page_info']
            cursor = page_info['end_cursor']
            flag = page_info['has_next_page']
            for edge in edges:
                post_id = edge['node']['id']
                post_time = edge['node']['taken_at_timestamp']
                post_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(post_time))
                comment_count = edge['node']['edge_media_to_comment']['count']
                like_count = edge['node']['edge_media_preview_like']['count']
                urls = []
                if edge['node']['display_url']:
                    display_url = edge['node']['display_url']
                    print(display_url)
                    urls.append(display_url)
                urls = ';'.join(urls)
                table_post.update_one({'post_id': post_id}, {'$set': {
                    'user_id': id, 'user_name': username, 'post_id': post_id,
                    'time': post_time, 'comment_count': comment_count,
                    'like_count': like_count, 'img_urls': urls}}, True)
                print({'user_id': id, 'user_name': username, 'post_id': post_id,
                       'time': post_time, 'comment_count': comment_count,
                       'like_count': like_count, 'img_urls': urls})
            # Remaining pages are fetched through the GraphQL endpoint.
            while flag:
                url = uri.format(user_id=user_id, cursor=cursor)
                js_data = get_json(url)
                media = js_data['data']['user']['edge_owner_to_timeline_media']
                infos = media['edges']
                cursor = media['page_info']['end_cursor']
                flag = media['page_info']['has_next_page']
                for edge in infos:
                    post_id = edge['node']['id']
                    post_time = edge['node']['taken_at_timestamp']
                    post_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(post_time))
                    comment_count = edge['node']['edge_media_to_comment']['count']
                    like_count = edge['node']['edge_media_preview_like']['count']
                    urls = []
                    if edge['node']['is_video']:
                        video_url = edge['node']['video_url']
                        if video_url:
                            print(video_url)
                    else:
                        if edge['node']['display_url']:
                            display_url = edge['node']['display_url']
                            print(display_url)
                            urls.append(display_url)
                    urls = ';'.join(urls)
                    table_post.update_one({'post_id': post_id}, {'$set': {
                        'user_id': id, 'user_name': username, 'post_id': post_id,
                        'time': post_time, 'comment_count': comment_count,
                        'like_count': like_count, 'img_urls': urls}}, True)
                    print({'user_id': id, 'user_name': username, 'post_id': post_id,
                           'time': post_time, 'comment_count': comment_count,
                           'like_count': like_count, 'img_urls': urls})
```
Main function

```python
def main(user):
    get_urls(get_html(url_base.format(user)))


if __name__ == '__main__':
    df = pd.read_csv('source.csv')
    for index, row in df.iterrows():
        print('{}/{}'.format(index, df.shape[0]), row['username'])
        if row['isFinished'] > 0:
            continue
        user_name = row['username']
        try:
            start = time.time()
            main(user_name)
            end = time.time()
            spent = end - start
            print('spent {} s'.format(spent))
            df.loc[index, 'isFinished'] = 1
            df.to_csv('source.csv', index=False, encoding='utf-8-sig')
            time.sleep(random.randint(2, 3))
        except Exception as e:
            if isinstance(e, IndexError):
                # The profilePage_ regex found nothing: the account was renamed or removed.
                print('{} has been renamed or removed'.format(user_name))
                df.loc[index, 'isFinished'] = 2
                df.to_csv('source.csv', index=False, encoding='utf-8-sig')
                continue
            print(traceback.format_exc())
            break
```
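The main loop expects a source.csv with at least a username column and an isFinished progress flag (0 = pending, 1 = done, 2 = account renamed or missing), rewriting the file after every user so the crawl can resume. A minimal illustrative layout (the second data row is a made-up example):

```
username,isFinished
instagram,0
natgeo,0
```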
Data display
Account profile information
Includes account id, username, bio, following count, follower count, and post count.
Posted image information
Includes account id, username, image file, posting time, like count, and comment count.
Brief data analysis
 
From this we can infer that most of the crawled users are new accounts (some may simply be low-activity users).
The crawled sample is therefore limited in scope.
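As one example of the basic statistics mentioned in the implementation steps, a hypothetical snippet (not from the original post, reusing the table_user collection from the code section) could compute the share of crawled users who have never posted, which is what the "mostly new users" inference rests on:

```python
# Hypothetical analysis snippet: share of crawled users with zero posts.
total = table_user.count_documents({})
no_posts = table_user.count_documents({'post_count': 0})
if total:
    print('{:.1%} of {} crawled users have no posts'.format(no_posts / total, total))
```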