Python crawler tutorial: scraping Sina Weibo data with Python
This program scrapes a user's Sina Weibo posts and writes them to csv and txt files named after the target user's id (i.e. "<user_id>.csv" and "<user_id>.txt"). It can also, optionally, download the original images attached to the posts.
Runtime environment
Language: Python 2 / Python 3
OS: Windows / Linux / macOS
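The script below imports three third-party packages: requests, lxml and tqdm. Assuming pip is available on your system, they can be installed with:

    pip install requests lxml tqdm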
As an example, let's scrape the Weibo account of the actress Dilraba Dilmurat. Her Weibo nickname is "Dear-迪丽热巴" and her id is 1669879400 (how to find a user id is explained below). We choose to scrape only her original (non-retweeted) posts. The program automatically creates a weibo folder, where all scraping results are stored. Inside it, the program creates a folder named "Dear-迪丽热巴" holding all results for this user. That folder contains a csv file, a txt file and an img folder; the img folder stores the downloaded images.
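A quick way to find a user id (inferred from how the script builds its URLs, not spelled out in this post): open the user's profile on the mobile site weibo.cn while logged in. The profile URL looks like https://weibo.cn/u/1669879400, and the digits after /u/ are the user id the script expects.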
The original post shows screenshots of the results: the csv file, the txt file, and the downloaded images in the img folder (screenshots omitted here).
This run downloaded 766 images, 1.15 GB in total, including the images in her original posts and the images attached to the retweet comments of her retweets. Image files are named yyyymmdd plus the weibo id; if a post contains several images, the file name also carries the image's index within that post. One image could not be downloaded because of a timeout; its url was written to not_downloaded_pictures.txt.
Source code:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import codecs
import csv
import os
import random
import re
import sys
import traceback
from collections import OrderedDict
from datetime import datetime, timedelta
from time import sleep

import requests
from lxml import etree
from tqdm import tqdm


class Weibo(object):
    cookie = {'Cookie': 'your cookie'}  # replace 'your cookie' with your own cookie

    def __init__(self, user_id, filter=0, pic_download=0):
        """Initialize the Weibo class"""
        if not isinstance(user_id, int):
            sys.exit(u'user_id值应为一串数字形式,请重新输入')
        if filter != 0 and filter != 1:
            sys.exit(u'filter值应为0或1,请重新输入')
        if pic_download != 0 and pic_download != 1:
            sys.exit(u'pic_download值应为0或1,请重新输入')
        self.user_id = user_id  # the numeric user id, e.g. 1669879400 for "Dear-迪丽热巴"
        self.filter = filter  # 0 (default): scrape all weibo; 1: scrape original weibo only
        self.pic_download = pic_download  # 0 (default): don't download original pictures; 1: download them
        self.nickname = ''  # user nickname, e.g. "Dear-迪丽热巴"
        self.weibo_num = 0  # total number of the user's weibo
        self.got_num = 0  # number of weibo scraped
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibo = []  # all scraped weibo records

    def deal_html(self, url):
        """Fetch a page and parse it into an lxml selector"""
        try:
            html = requests.get(url, cookies=self.cookie).content
            selector = etree.HTML(html)
            return selector
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def deal_garbled(self, info):
        """Extract a node's text and drop characters the console cannot encode"""
        try:
            info = (info.xpath('string(.)').replace(u'\u200b', '').encode(
                sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding))
            return info
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_nickname(self):
        """Get the user's nickname"""
        try:
            url = 'https://weibo.cn/%d/info' % (self.user_id)
            selector = self.deal_html(url)
            nickname = selector.xpath('//title/text()')[0]
            self.nickname = nickname[:-3]
            if self.nickname == u'登录 - 新' or self.nickname == u'新浪':
                sys.exit(u'cookie错误或已过期,请按照README中方法重新获取')
            print(u'用户昵称: ' + self.nickname)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_user_info(self, selector):
        """Get the user's nickname, weibo count, following count and follower count"""
        try:
            self.get_nickname()
            user_info = selector.xpath("//div[@class='tip2']/*/text()")

            self.weibo_num = int(user_info[0][3:-1])
            print(u'微博数: ' + str(self.weibo_num))

            self.following = int(user_info[1][3:-1])
            print(u'关注数: ' + str(self.following))

            self.followers = int(user_info[2][3:-1])
            print(u'粉丝数: ' + str(self.followers))
            print('*' * 100)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_page_num(self, selector):
        """Get the total number of weibo pages"""
        try:
            if selector.xpath("//input[@name='mp']") == []:
                page_num = 1
            else:
                page_num = int(
                    selector.xpath("//input[@name='mp']")[0].attrib['value'])
            return page_num
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_long_weibo(self, weibo_link):
        """Get the full text of a long original weibo"""
        try:
            selector = self.deal_html(weibo_link)
            info = selector.xpath("//div[@class='c']")[1]
            wb_content = self.deal_garbled(info)
            wb_time = info.xpath("//span[@class='ct']/text()")[0]
            weibo_content = wb_content[wb_content.find(':') +
                                       1:wb_content.rfind(wb_time)]
            return weibo_content
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_original_weibo(self, info, weibo_id):
        """Get the content of an original weibo"""
        try:
            weibo_content = self.deal_garbled(info)
            weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
            a_text = info.xpath('div//a/text()')
            if u'全文' in a_text:  # the post is truncated; fetch the full text
                weibo_link = 'https://weibo.cn/comment/' + weibo_id
                wb_content = self.get_long_weibo(weibo_link)
                if wb_content:
                    weibo_content = wb_content
            return weibo_content
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_long_retweet(self, weibo_link):
        """Get the full text of a long retweet"""
        try:
            wb_content = self.get_long_weibo(weibo_link)
            weibo_content = wb_content[:wb_content.rfind(u'原文转发')]
            return weibo_content
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_retweet(self, info, weibo_id):
        """Get the content of a retweet"""
        try:
            original_user = info.xpath("div/span[@class='cmt']/a/text()")
            if not original_user:
                wb_content = u'转发微博已被删除'
                return wb_content
            else:
                original_user = original_user[0]
            wb_content = self.deal_garbled(info)
            # strip both footers: the retweet's and the original post's
            wb_content = wb_content[wb_content.find(':') +
                                    1:wb_content.rfind(u'赞')]
            wb_content = wb_content[:wb_content.rfind(u'赞')]
            a_text = info.xpath('div//a/text()')
            if u'全文' in a_text:
                weibo_link = 'https://weibo.cn/comment/' + weibo_id
                weibo_content = self.get_long_retweet(weibo_link)
                if weibo_content:
                    wb_content = weibo_content
            retweet_reason = self.deal_garbled(info.xpath('div')[-1])
            retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')]
            wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user +
                          '\n' + u'转发内容: ' + wb_content)
            return wb_content
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def is_original(self, info):
        """Check whether a weibo is original (not a retweet)"""
        is_original = info.xpath("div/span[@class='cmt']")
        if len(is_original) > 3:
            return False
        else:
            return True

    def get_weibo_content(self, info, is_original):
        """Get the weibo content"""
        try:
            weibo_id = info.xpath('@id')[0][2:]
            if is_original:
                weibo_content = self.get_original_weibo(info, weibo_id)
            else:
                weibo_content = self.get_retweet(info, weibo_id)
            print(weibo_content)
            return weibo_content
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_publish_place(self, info):
        """Get the publish location of a weibo"""
        try:
            div_first = info.xpath('div')[0]
            a_list = div_first.xpath('a')
            publish_place = u'无'
            for a in a_list:
                if ('place.weibo.com' in a.xpath('@href')[0]
                        and a.xpath('text()')[0] == u'显示地图'):
                    weibo_a = div_first.xpath("span[@class='ctt']/a")
                    if len(weibo_a) >= 1:
                        publish_place = weibo_a[-1]
                        if (u'视频' == div_first.xpath(
                                "span[@class='ctt']/a/text()")[-1][-2:]):
                            if len(weibo_a) >= 2:
                                publish_place = weibo_a[-2]
                            else:
                                publish_place = u'无'
                        if publish_place != u'无':  # guard: deal_garbled expects an element, not the u'无' string
                            publish_place = self.deal_garbled(publish_place)
                        break
            print(u'微博发布位置: ' + publish_place)
            return publish_place
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_publish_time(self, info):
        """Get the publish time of a weibo"""
        try:
            str_time = info.xpath("div/span[@class='ct']")
            str_time = self.deal_garbled(str_time[0])
            publish_time = str_time.split(u'来自')[0]
            if u'刚刚' in publish_time:  # "just now"
                publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
            elif u'分钟' in publish_time:  # "N minutes ago"
                minute = publish_time[:publish_time.find(u'分钟')]
                minute = timedelta(minutes=int(minute))
                publish_time = (datetime.now() -
                                minute).strftime('%Y-%m-%d %H:%M')
            elif u'今天' in publish_time:  # "today HH:MM"
                today = datetime.now().strftime('%Y-%m-%d')
                time = publish_time[3:]
                publish_time = today + ' ' + time
            elif u'月' in publish_time:  # "MM月DD日 HH:MM", i.e. this year
                year = datetime.now().strftime('%Y')
                month = publish_time[0:2]
                day = publish_time[3:5]
                time = publish_time[7:12]
                publish_time = year + '-' + month + '-' + day + ' ' + time
            else:  # full "yyyy-mm-dd HH:MM" timestamp
                publish_time = publish_time[:16]
            print(u'微博发布时间: ' + publish_time)
            return publish_time
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_publish_tool(self, info):
        """Get the client a weibo was posted from"""
        try:
            str_time = info.xpath("div/span[@class='ct']")
            str_time = self.deal_garbled(str_time[0])
            if len(str_time.split(u'来自')) > 1:
                publish_tool = str_time.split(u'来自')[1]
            else:
                publish_tool = u'无'
            print(u'微博发布工具: ' + publish_tool)
            return publish_tool
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_weibo_footer(self, info):
        """Get the like, retweet and comment counts of a weibo"""
        try:
            footer = {}
            pattern = r'\d+'
            str_footer = info.xpath('div')[-1]
            str_footer = self.deal_garbled(str_footer)
            str_footer = str_footer[str_footer.rfind(u'赞'):]
            weibo_footer = re.findall(pattern, str_footer, re.M)

            up_num = int(weibo_footer[0])
            print(u'点赞数: ' + str(up_num))
            footer['up_num'] = up_num

            retweet_num = int(weibo_footer[1])
            print(u'转发数: ' + str(retweet_num))
            footer['retweet_num'] = retweet_num

            comment_num = int(weibo_footer[2])
            print(u'评论数: ' + str(comment_num))
            footer['comment_num'] = comment_num
            return footer
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def extract_picture_urls(self, info, weibo_id):
        """Extract the original picture urls of a weibo"""
        try:
            a_list = info.xpath('div/a/@href')
            first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + '?rl=0'
            all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1'
            if first_pic in a_list:  # the weibo has at least one picture
                if all_pic in a_list:  # several pictures: fetch the "all pictures" page
                    selector = self.deal_html(all_pic)
                    preview_picture_list = selector.xpath('//img/@src')
                    picture_list = [
                        p.replace('/thumb180/', '/large/')
                        for p in preview_picture_list
                    ]
                    picture_urls = ','.join(picture_list)
                else:  # a single picture
                    if info.xpath('.//img/@src'):
                        preview_picture = info.xpath('.//img/@src')[-1]
                        picture_urls = preview_picture.replace(
                            '/wap180/', '/large/')
                    else:
                        sys.exit(
                            u"爬虫微博可能被设置成了'不显示图片',请前往"
                            u"'https://weibo.cn/account/customize/pic',修改为'显示'"
                        )
            else:
                picture_urls = '无'
            return picture_urls
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_picture_urls(self, info, is_original):
        """Get the original picture urls of a weibo (and of the retweeted weibo, if any)"""
        try:
            weibo_id = info.xpath('@id')[0][2:]
            picture_urls = {}
            if is_original:
                original_pictures = self.extract_picture_urls(info, weibo_id)
                picture_urls['original_pictures'] = original_pictures
                if not self.filter:
                    picture_urls['retweet_pictures'] = '无'
            else:
                retweet_url = info.xpath("div/a[@class='cc']/@href")[0]
                retweet_id = retweet_url.split('/')[-1].split('?')[0]
                retweet_pictures = self.extract_picture_urls(info, retweet_id)
                picture_urls['retweet_pictures'] = retweet_pictures
                a_list = info.xpath('div[last()]/a/@href')
                original_picture = '无'
                for a in a_list:
                    if a.endswith(('.gif', '.jpeg', '.jpg', '.png')):
                        original_picture = a
                        break
                picture_urls['original_pictures'] = original_picture
            return picture_urls
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def download_pic(self, url, pic_path):
        """Download a single picture"""
        try:
            p = requests.get(url)
            with open(pic_path, 'wb') as f:
                f.write(p.content)
        except Exception as e:
            # record urls that failed to download
            error_file = self.get_filepath(
                'img') + os.sep + 'not_downloaded_pictures.txt'
            with open(error_file, 'ab') as f:
                url = url + '\n'
                f.write(url.encode(sys.stdout.encoding))
            print('Error: ', e)
            traceback.print_exc()

    def download_pictures(self):
        """Download all weibo pictures"""
        try:
            print(u'即将进行图片下载')
            img_dir = self.get_filepath('img')
            for w in tqdm(self.weibo, desc=u'图片下载进度'):
                if w['original_pictures'] != '无':
                    # file name prefix: yyyymmdd_<weibo id>
                    pic_prefix = w['publish_time'][:10].replace(
                        '-', '') + '_' + w['id']
                    if ',' in w['original_pictures']:  # several pictures
                        w['original_pictures'] = w['original_pictures'].split(
                            ',')
                        for j, url in enumerate(w['original_pictures']):
                            pic_suffix = url[url.rfind('.'):]
                            pic_name = pic_prefix + '_' + str(j + 1) + pic_suffix
                            pic_path = img_dir + os.sep + pic_name
                            self.download_pic(url, pic_path)
                    else:
                        pic_suffix = w['original_pictures'][
                            w['original_pictures'].rfind('.'):]
                        pic_name = pic_prefix + pic_suffix
                        pic_path = img_dir + os.sep + pic_name
                        self.download_pic(w['original_pictures'], pic_path)
            print(u'图片下载完毕,保存路径:')
            print(img_dir)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_one_weibo(self, info):
        """Get all information of one weibo"""
        try:
            weibo = OrderedDict()
            is_original = self.is_original(info)
            if (not self.filter) or is_original:
                weibo['id'] = info.xpath('@id')[0][2:]
                weibo['content'] = self.get_weibo_content(
                    info, is_original)  # weibo content
                picture_urls = self.get_picture_urls(info, is_original)
                weibo['original_pictures'] = picture_urls[
                    'original_pictures']  # original picture urls
                if not self.filter:
                    weibo['retweet_pictures'] = picture_urls[
                        'retweet_pictures']  # retweeted picture urls
                    weibo['original'] = is_original  # whether the weibo is original
                weibo['publish_place'] = self.get_publish_place(info)  # publish location
                weibo['publish_time'] = self.get_publish_time(info)  # publish time
                weibo['publish_tool'] = self.get_publish_tool(info)  # publish tool
                footer = self.get_weibo_footer(info)
                weibo['up_num'] = footer['up_num']  # like count
                weibo['retweet_num'] = footer['retweet_num']  # retweet count
                weibo['comment_num'] = footer['comment_num']  # comment count
            else:
                weibo = None
            return weibo
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_one_page(self, page):
        """Get all weibo on the given page"""
        try:
            url = 'https://weibo.cn/u/%d?page=%d' % (self.user_id, page)
            selector = self.deal_html(url)
            info = selector.xpath("//div[@class='c']")
            is_exist = info[0].xpath("div/span[@class='ctt']")
            if is_exist:
                for i in range(0, len(info) - 2):
                    weibo = self.get_one_weibo(info[i])
                    if weibo:
                        self.weibo.append(weibo)
                        self.got_num += 1
                        print('-' * 100)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_filepath(self, type):
        """Get the path of a result file"""
        try:
            file_dir = os.path.split(os.path.realpath(
                __file__))[0] + os.sep + 'weibo' + os.sep + self.nickname
            if type == 'img':
                file_dir = file_dir + os.sep + 'img'
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            if type == 'img':
                return file_dir
            file_path = file_dir + os.sep + '%d' % self.user_id + '.' + type
            return file_path
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def write_csv(self, wrote_num):
        """Write the scraped weibo to a csv file"""
        try:
            result_headers = [
                '微博id',
                '微博正文',
                '原始图片url',
                '发布位置',
                '发布时间',
                '发布工具',
                '点赞数',
                '转发数',
                '评论数',
            ]
            if not self.filter:
                result_headers.insert(3, '被转发微博原始图片url')
                result_headers.insert(4, '是否为原创微博')
            result_data = [w.values() for w in self.weibo][wrote_num:]
            if sys.version < '3':  # python2.x
                reload(sys)
                sys.setdefaultencoding('utf-8')
                with open(self.get_filepath('csv'), 'ab') as f:
                    if wrote_num == 0:  # write the UTF-8 BOM only once, at the top of the file
                        f.write(codecs.BOM_UTF8)
                    writer = csv.writer(f)
                    if wrote_num == 0:
                        writer.writerows([result_headers])
                    writer.writerows(result_data)
            else:  # python3.x
                with open(self.get_filepath('csv'),
                          'a',
                          encoding='utf-8-sig',
                          newline='') as f:
                    writer = csv.writer(f)
                    if wrote_num == 0:
                        writer.writerows([result_headers])
                    writer.writerows(result_data)
            print(u'%d条微博写入csv文件完毕,保存路径:' % self.got_num)
            print(self.get_filepath('csv'))
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def write_txt(self, wrote_num):
        """Write the scraped weibo to a txt file"""
        try:
            temp_result = []
            if wrote_num == 0:  # first write: prepend the user profile header
                if self.filter:
                    result_header = u'\n\n原创微博内容: \n'
                else:
                    result_header = u'\n\n微博内容: \n'
                result_header = (u'用户信息\n用户昵称:' + self.nickname + u'\n用户id: ' +
                                 str(self.user_id) + u'\n微博数: ' +
                                 str(self.weibo_num) + u'\n关注数: ' +
                                 str(self.following) + u'\n粉丝数: ' +
                                 str(self.followers) + result_header)
                temp_result.append(result_header)
            for i, w in enumerate(self.weibo[wrote_num:]):
                temp_result.append(
                    str(wrote_num + i + 1) + ':' + w['content'] + '\n' +
                    u'微博位置: ' + w['publish_place'] + '\n' + u'发布时间: ' +
                    w['publish_time'] + '\n' + u'点赞数: ' + str(w['up_num']) +
                    u' 转发数: ' + str(w['retweet_num']) + u' 评论数: ' +
                    str(w['comment_num']) + '\n' + u'发布工具: ' +
                    w['publish_tool'] + '\n\n')
            result = ''.join(temp_result)
            with open(self.get_filepath('txt'), 'ab') as f:
                f.write(result.encode(sys.stdout.encoding))
            print(u'%d条微博写入txt文件完毕,保存路径:' % self.got_num)
            print(self.get_filepath('txt'))
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def write_file(self, wrote_num):
        """Write any new records to the result files"""
        if self.got_num > wrote_num:
            self.write_csv(wrote_num)
            self.write_txt(wrote_num)

    def get_weibo_info(self):
        """Scrape the weibo information"""
        try:
            url = 'https://weibo.cn/u/%d' % (self.user_id)
            selector = self.deal_html(url)
            self.get_user_info(selector)  # nickname, weibo/following/follower counts
            page_num = self.get_page_num(selector)  # total number of pages
            wrote_num = 0
            page1 = 0
            random_pages = random.randint(1, 5)
            for page in tqdm(range(1, page_num + 1), desc=u'进度'):
                self.get_one_page(page)  # scrape all weibo on this page

                if page % 20 == 0:  # write to file every 20 pages
                    self.write_file(wrote_num)
                    wrote_num = self.got_num

                # Random waits reduce the risk of being rate-limited. Scraping too
                # fast gets the account temporarily blocked (the block lifts after
                # a while); sleeping like a human lowers that risk. By default the
                # crawler sleeps 6-10 seconds after every 1-5 pages; if you still
                # get blocked, increase the sleep time.
                if page - page1 == random_pages and page < page_num:
                    sleep(random.randint(6, 10))
                    page1 = page
                    random_pages = random.randint(1, 5)

            self.write_file(wrote_num)  # write the remaining (<20 pages of) weibo
            if not self.filter:
                print(u'共爬取' + str(self.got_num) + u'条微博')
            else:
                print(u'共爬取' + str(self.got_num) + u'条原创微博')
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def start(self):
        """Run the crawler"""
        try:
            self.get_weibo_info()
            print(u'信息抓取完毕')
            print('*' * 100)
            if self.pic_download == 1:
                self.download_pictures()
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()


def main():
    try:
        # Example usage: pass in a user id; all results are stored on the wb instance
        user_id = 1669879400  # any valid user id (except the crawler account's own id)
        filter = 1  # 0: scrape all weibo (original + retweets); 1: original weibo only
        pic_download = 1  # 0: don't download original pictures; 1: download them
        wb = Weibo(user_id, filter, pic_download)  # create a Weibo instance
        wb.start()  # start scraping
        print(u'用户昵称: ' + wb.nickname)
        print(u'全部微博数: ' + str(wb.weibo_num))
        print(u'关注数: ' + str(wb.following))
        print(u'粉丝数: ' + str(wb.followers))
        if wb.weibo:
            print(u'最新/置顶 微博为: ' + wb.weibo[0]['content'])
            print(u'最新/置顶 微博位置: ' + wb.weibo[0]['publish_place'])
            print(u'最新/置顶 微博发布时间: ' + wb.weibo[0]['publish_time'])
            print(u'最新/置顶 微博获得赞数: ' + str(wb.weibo[0]['up_num']))
            print(u'最新/置顶 微博获得转发数: ' + str(wb.weibo[0]['retweet_num']))
            print(u'最新/置顶 微博获得评论数: ' + str(wb.weibo[0]['comment_num']))
            print(u'最新/置顶 微博发布工具: ' + wb.weibo[0]['publish_tool'])
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()


if __name__ == '__main__':
    main()
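To try the crawler, save the script under any name (weibo_spider.py below is just an example), paste your cookie into the cookie class attribute, set user_id, filter and pic_download in main(), and run:

    python weibo_spider.py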
Notes
1. user_id must not be the id of the crawler account itself. To scrape Weibo you have to be logged in to some Weibo account, which we will call the crawler account. The page Weibo serves for the crawler account's own profile has a different format from other users' pages, so the program cannot scrape the crawler account's own posts.
2. The cookie has a limited lifetime; once it expires, you have to obtain a fresh one.
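One common way to obtain a cookie (a general recipe, not taken from this post): log in at https://weibo.cn in a browser, open the developer tools, reload the page, and copy the Cookie request header of the request to weibo.cn. Then replace 'your cookie' in the script, e.g.:

    cookie = {'Cookie': 'SUB=...; SUHB=...; _T_WM=...'}  # placeholder values; paste your own cookie string here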
Reposted from: https://www.cnblogs.com/xiaoyiq/p/11306876.html