生活随笔
收集整理的這篇文章主要介紹了
猫眼数据爬取
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
貓眼經典影片的爬取
在進行數據爬取的時候一定要設置好cookie
# Hit the Maoyan home page once to obtain the anti-scraping session cookies
# (_csrf, uuid, uuid_n_v) that must be replayed on every later request.
cookie_url = 'https://maoyan.com/'
response = requests.get(cookie_url)
cookie = response.cookies
_csrf = cookie['_csrf']
uuid = cookie['uuid']
uuid_n_v = cookie['uuid_n_v']
貓眼設置了反爬機制需要構造header
# Maoyan rejects "naked" requests, so mimic a real browser and replay the
# session cookies fetched above.
# BUG FIX: the original cookie template wrote 'uuid;{}', which sends a
# malformed cookie pair; the key/value separator must be '=': 'uuid={}'.
header = {
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Host': 'maoyan.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': 'uuid_n_v={};_csrf={};uuid={}'.format(uuid_n_v, _csrf, uuid),
}
對每個電影詳情頁的url進行獲取
# Collect the detail-page URL of every movie on the listing page.
# Reconstructed from a mangled paste: '.text' had been fused with the next
# statement ('texttree') and 'detail_url' with 'url_list'.
url_list = []
response = requests.get(url, headers=header, allow_redirects=False).text
tree = etree.HTML(response)
div = tree.xpath("//div[@class='movies-list']//dd")
for li in div:
    # xpath() returns a list; guard on emptiness instead of the original
    # "is not None" check, which could never fire after a [0] index.
    hrefs = li.xpath("./div/a/@href")
    if hrefs:
        detail_url = 'https://maoyan.com' + hrefs[0]
        url_list.append(detail_url)
對詳情頁的評分人數、評分、國家等信息進行解析獲取。在爬取的時候遇到過報錯(數據獲取為空),所以對獲取的每個數據都會先進行判斷。
# Fetch one detail page and extract name / type / country / duration /
# release date plus the three font-obfuscated fields.  Every xpath hit is
# checked for emptiness because Maoyan returns blank data when throttling.
response = requests.get(detail_url, headers=header, allow_redirects=False)
tree = etree.HTML(response.text)
name = tree.xpath("//div[@class='movie-brief-container']/h1/text()")
print(name)
if len(name) == 0:
    name = None
else:
    name = name[0]
types = tree.xpath("//div[@class='movie-brief-container']/ul/li[1]/a/text()")
movie = tree.xpath("//div[@class='movie-brief-container']/ul/li[2]/text()")
if len(movie) == 0:
    country = None
    minute = None
else:
    movie = movie[0].replace('\n', '')
    if '/' in movie:
        country = movie.split('/')[0]
        minute = movie.split('/')[1]
    else:
        # The paste had "country = movieminute" fused; the intent is
        # country = movie plus a placeholder duration.
        country = movie
        minute = '暫無'
screen_time = tree.xpath("//div[@class='movie-brief-container']/ul/li[3]/text()")
if len(screen_time) == 0:
    screen_time = '暫無'
else:
    screen_time = screen_time[0]
score = tree.xpath("//div[@class='movie-index-content score normal-score']/span/span/text()")
people_score = tree.xpath("//span[@class='score-num']/span/text()")
box_office = tree.xpath("//div[@class='movie-index-content box']//span/text()")
在這里想必很多人都知道,貓眼對評分人數、累計票房等設置了字體反爬機制,所以需要先下載對應的字體文件,再在代碼中對這些數據進行解析。
from fontTools.ttLib import TTFont

# Dump the obfuscated WOFF font to XML so the glyph code points can be
# mapped to real digits by eye (compared against FontCreator).
fonts = TTFont('f0a30a4dda64b4f8f344858115f54fc92296.woff')
fonts.saveXML('a.xml')
解析成xml后,打開解析的文件,在這里我們需要下載High-Logic FontCreator軟件,用這軟件把f0a30a4dda64b4f8f344858115f54fc92296.woff打開 然后對比打開的a.xml 找到規律
# Glyph-name -> digit table recovered manually from a.xml / FontCreator;
# 'x' is the glyph the font uses for the decimal point.
data_temp = {
    'e137': '8', 'e343': '1', 'e5e2': '4', 'e7a1': '9', 'e8cd': '5',
    'f19b': '2', 'f489': '0', 'f4ef': '6', 'f848': '3', 'f88a': '7',
    'x': '.',
}
對字體進行解密。在這里我還發現,用 xpath 獲取數據時,不能直接寫成:
# (Illustration from the article) indexing the xpath result directly —
# the author reports this comes back empty, hence the manual decoding below.
tree.xpath("//div[@class='movie-index-content score normal-score']/span/span/text()")[0]
這樣獲取出來的是空的值,所以我只能用笨辦法這樣獲取數據:
def font_anaylis(score, people_score, box_office, font_map=None):
    """Decode Maoyan's font-obfuscated rating / vote-count / box-office text.

    The site renders digits with private-use-area glyphs; str() of the xpath
    result list escapes each glyph to a backslash-uXXXX sequence, so after
    stripping the list punctuation and the escape prefix every digit is a
    4-char hex code that the recovered glyph table maps back to '0'-'9'.

    Args:
        score: xpath result list for the rating.
        people_score: xpath result list for the vote count.
        box_office: xpath result list for the box office, usually two spans
            (obfuscated digits, unit text).
        font_map: glyph-code -> digit table; defaults to the module-level
            data_temp (parameterized so alternate font dumps can be decoded).

    Returns:
        Tuple (score, total_score, total_office) of display strings; missing
        data is reported as '暫無' / '無' / '空', matching the page.
    """
    if font_map is None:
        font_map = data_temp

    def clean(raw):
        # Strip list punctuation and the \u escape prefix left by str(list).
        return (str(raw).replace('[', '').replace(']', '')
                .replace('\\u', '').replace("'", ''))

    def decode(codes):
        # Walk the string in 4-char chunks, mapping each glyph code to a digit.
        digits = []
        start = 0
        for end in range(3, len(codes), 4):
            digits.append(font_map[codes[start:end + 1]])
            start += 4
        return ''.join(digits)

    # --- rating -----------------------------------------------------------
    score = clean(score)
    if score != '':  # original used "is not ''" (fragile identity check)
        if '.' in score:
            parts = score.split('.')
            score = font_map[parts[0]] + '.' + font_map[parts[1]]
        elif len(score) > 4:
            score = '無'
        else:
            score = font_map[score]
    else:
        score = '暫無'

    # --- vote count -------------------------------------------------------
    people_score = clean(people_score)
    if people_score != '':
        if '.' in people_score:
            dot = people_score.index('.')
            frac = people_score[dot + 1:]
            total_score = decode(people_score[:dot]) + '.' + decode(frac)
            # An odd-length tail means a literal unit char (e.g. 万) survived
            # the cleanup; keep it on the end, as the original did.
            if len(frac) % 2 != 0:
                total_score += frac[-1]
        else:
            total_score = decode(people_score)
    else:
        total_score = '暫無'

    # --- box office -------------------------------------------------------
    box_office = clean(box_office)
    if box_office == '暫無':
        total_office = box_office
    else:
        spans = box_office.split(',')  # [digits, unit]; unit keeps its space
        box = spans[0]
        if '.' in box:
            dot = box.index('.')
            total_office = (decode(box[:dot]) + '.' +
                            decode(box[dot + 1:]) + spans[1])
        elif len(box) == 0:
            total_office = '空'
        else:
            total_office = decode(box) + spans[1]

    return score, total_score, total_office
這樣基本數據都能獲取,只不過在運行幾次后,就會給你阻斷了,獲取的數據就會為空。 完整代碼如下
import requests
from fontTools
. ttLib
import TTFont
from lxml
import etree
import pandas
as pd
from multiprocessing
. dummy
import Pool
import time
# Dump the obfuscated WOFF font to XML; the mapping below was built by
# comparing this dump with the glyphs shown in FontCreator.
fonts = TTFont('f0a30a4dda64b4f8f344858115f54fc92296.woff')
fonts.saveXML('a.xml')
# Manually recovered glyph-name -> digit table ('x' is the decimal point).
data_temp = {
    'e137': '8', 'e343': '1', 'e5e2': '4', 'e7a1': '9', 'e8cd': '5',
    'f19b': '2', 'f489': '0', 'f4ef': '6', 'f848': '3', 'f88a': '7',
    'x': '.',
}
# Bootstrap a session: one request to the home page yields the cookies
# the site checks on every subsequent page load.
cookie_url = 'https://maoyan.com/'
response = requests.get(cookie_url)
cookie = response.cookies
_csrf = cookie['_csrf']
uuid = cookie['uuid']
uuid_n_v = cookie['uuid_n_v']
# Browser-like headers plus the replayed session cookies.
# BUG FIX: the original cookie template wrote 'uuid;{}', a malformed cookie
# pair; the key/value separator must be '=': 'uuid={}'.
header = {
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Host': 'maoyan.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': 'uuid_n_v={};_csrf={};uuid={}'.format(uuid_n_v, _csrf, uuid),
}
# Shared crawl state: parsed rows, the first listing URL, the paginated
# listing URLs (30 movies per page, 2 pages), and the detail URLs found.
result = []
url = 'https://maoyan.com/films?showType=3&offset=0'
urls_page = ['https://maoyan.com/films?showType=3&offset={}'.format(num * 30)
             for num in range(0, 2)]
url_list = []
def page_num(url):
    """Scrape one listing page and append every movie's detail URL to url_list.

    Reconstructed from a mangled paste ('.text' was fused with 'tree', and
    'detail_url' with 'url_list').
    """
    response = requests.get(url, headers=header, allow_redirects=False).text
    tree = etree.HTML(response)
    div = tree.xpath("//div[@class='movies-list']//dd")
    for li in div:
        # xpath() returns a list; guard on emptiness instead of the original
        # "is not None" check, which could never fire after a [0] index.
        hrefs = li.xpath("./div/a/@href")
        if hrefs:
            url_list.append('https://maoyan.com' + hrefs[0])
def get_url(detail_url):
    """Scrape one movie detail page and forward the parsed fields to detail_txt.

    Every xpath hit is emptiness-checked because Maoyan returns blank data
    when it throttles the crawler.
    """
    response = requests.get(detail_url, headers=header, allow_redirects=False)
    tree = etree.HTML(response.text)

    name = tree.xpath("//div[@class='movie-brief-container']/h1/text()")
    print(name)
    if len(name) == 0:
        name = None
    else:
        name = name[0]

    types = tree.xpath("//div[@class='movie-brief-container']/ul/li[1]/a/text()")

    movie = tree.xpath("//div[@class='movie-brief-container']/ul/li[2]/text()")
    if len(movie) == 0:
        country = None
        minute = None
    else:
        movie = movie[0].replace('\n', '')
        if '/' in movie:
            country = movie.split('/')[0]
            minute = movie.split('/')[1]
        else:
            # Paste had "country = movieminute" fused; intent: country = movie
            # with a placeholder duration.
            country = movie
            minute = '暫無'

    screen_time = tree.xpath("//div[@class='movie-brief-container']/ul/li[3]/text()")
    if len(screen_time) == 0:
        screen_time = '暫無'
    else:
        screen_time = screen_time[0]

    score = tree.xpath("//div[@class='movie-index-content score normal-score']/span/span/text()")
    people_score = tree.xpath("//span[@class='score-num']/span/text()")
    box_office = tree.xpath("//div[@class='movie-index-content box']//span/text()")

    score, people_score, box_office = font_anaylis(score, people_score, box_office)
    detail_txt(score, people_score, box_office, name, types, country, minute, screen_time)
def font_anaylis(score, people_score, box_office, font_map=None):
    """Decode Maoyan's font-obfuscated rating / vote-count / box-office text.

    The site renders digits with private-use-area glyphs; str() of the xpath
    result list escapes each glyph to a backslash-uXXXX sequence, so after
    stripping the list punctuation and the escape prefix every digit is a
    4-char hex code that the recovered glyph table maps back to '0'-'9'.

    Args:
        score: xpath result list for the rating.
        people_score: xpath result list for the vote count.
        box_office: xpath result list for the box office, usually two spans
            (obfuscated digits, unit text).
        font_map: glyph-code -> digit table; defaults to the module-level
            data_temp (parameterized so alternate font dumps can be decoded).

    Returns:
        Tuple (score, total_score, total_office) of display strings; missing
        data is reported as '暫無' / '無' / '空', matching the page.
    """
    if font_map is None:
        font_map = data_temp

    def clean(raw):
        # Strip list punctuation and the \u escape prefix left by str(list).
        return (str(raw).replace('[', '').replace(']', '')
                .replace('\\u', '').replace("'", ''))

    def decode(codes):
        # Walk the string in 4-char chunks, mapping each glyph code to a digit.
        digits = []
        start = 0
        for end in range(3, len(codes), 4):
            digits.append(font_map[codes[start:end + 1]])
            start += 4
        return ''.join(digits)

    # --- rating -----------------------------------------------------------
    score = clean(score)
    if score != '':  # original used "is not ''" (fragile identity check)
        if '.' in score:
            parts = score.split('.')
            score = font_map[parts[0]] + '.' + font_map[parts[1]]
        elif len(score) > 4:
            score = '無'
        else:
            score = font_map[score]
    else:
        score = '暫無'

    # --- vote count -------------------------------------------------------
    people_score = clean(people_score)
    if people_score != '':
        if '.' in people_score:
            dot = people_score.index('.')
            frac = people_score[dot + 1:]
            total_score = decode(people_score[:dot]) + '.' + decode(frac)
            # An odd-length tail means a literal unit char (e.g. 万) survived
            # the cleanup; keep it on the end, as the original did.
            if len(frac) % 2 != 0:
                total_score += frac[-1]
        else:
            total_score = decode(people_score)
    else:
        total_score = '暫無'

    # --- box office -------------------------------------------------------
    box_office = clean(box_office)
    if box_office == '暫無':
        total_office = box_office
    else:
        spans = box_office.split(',')  # [digits, unit]; unit keeps its space
        box = spans[0]
        if '.' in box:
            dot = box.index('.')
            total_office = (decode(box[:dot]) + '.' +
                            decode(box[dot + 1:]) + spans[1])
        elif len(box) == 0:
            total_office = '空'
        else:
            total_office = decode(box) + spans[1]

    return score, total_score, total_office
def detail_txt(score, people_score, box_office, name, types, country, minute, screen_time):
    """Pack one movie's fields into a row and stash it on the shared result list.

    Field order must match the CSV column order used by to_csv().
    """
    row = [score, people_score, box_office, name,
           types, country, minute, screen_time]
    result.append(row)
def to_csv(result):
    """Write the collected rows to maoyan1.csv with Chinese column headers."""
    columns = ['評分', '評分人數', '累計票房', '電影名',
               '電影類型', '國家', '時長', '上映時間']
    frame = pd.DataFrame(result, columns=columns)
    frame.to_csv('maoyan1.csv', index=False)
def main():
    """Crawl listing pages, then detail pages, with a 10-thread pool; dump CSV.

    pool.map() blocks until done, so url_list is fully populated before the
    detail pass, and result before the CSV write.
    """
    pool = Pool(10)
    start = time.time()
    pool.map(page_num, urls_page)
    pool.map(get_url, url_list)
    to_csv(result)
    pool.close()
    pool.join()
    end = time.time()
    times = end - start
    print(times)
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
不知道有沒有更好的辦法
總結
以上是生活随笔 為你收集整理的猫眼数据爬取 的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔 網站內容還不錯,歡迎將生活随笔 推薦給好友。