PHPCrawler抓取酷狗精选集歌单
生活随笔
收集整理的這篇文章主要介紹了
PHPCrawler抓取酷狗精选集歌单
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
一、PHPCrawler的介紹與安裝
先了解一下什麼是抓取?抓取就是網絡爬蟲,也就是人們常說的網絡蜘蛛(spider)。它是搜索引擎的一個重要組成部分,按照一定的邏輯和算法抓取和下載互聯網上的信息和網頁。一般的爬蟲從一個起始 URL(start url)開始,按照一定的策略開始爬取,把爬取到的新的 url 放入爬取隊列中,然後進行新一輪的爬取,直到抓取完畢為止。
PHPCrawler是一個國外開源的爬蟲系統,它的源碼托管在sourceforge里,下載地址為:https://sourceforge.net/projects/phpcrawl/
請根據自己電腦里安裝的PHP版本選擇合適的版本下載。下載完畢之后,解壓到服務器網站根目錄下,復制example.php文件,并重命名。
二、完整源碼
<?php
// Crawling a site can take a while, so raise the script execution time limit.
set_time_limit(10000);

// Include the phpcrawl main class.
include("libs/PHPCrawler.class.php");

/**
 * Crawler that prints a short status line for every requested document and,
 * for Kugou "special/single" song-list pages, extracts the song-list details
 * and stores them in the "songlist" MySQL database.
 *
 * Extends PHPCrawler and overrides its handleDocumentInfo() method.
 */
class MyCrawler extends PHPCrawler
{
    /**
     * Called for every document the crawler requested; page content is parsed from here.
     *
     * Prints the requested URL, HTTP status code, referer and the received
     * byte count, then hands song-list detail pages to parseSonglistDetail().
     *
     * @param object $DocInfo Document info object supplied by the crawler. This
     *                        method reads its url, http_status_code, referer_url,
     *                        received and bytes_received properties.
     */
    function handleDocumentInfo($DocInfo)
    {
        // Line break for output: "\n" in CLI mode, "<br />" otherwise.
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

        // Print the URL and the HTTP status code.
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;

        // Print the referring URL.
        echo "Referer-page: " . $DocInfo->referer_url . $lb;

        // Print whether the content of the document was received.
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Only song-list detail pages carry the data we want to extract.
        $pat = "/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/";
        if (preg_match($pat, $DocInfo->url) > 0) {
            $this->parseSonglistDetail($DocInfo);
        }

        flush();
    }

    /**
     * Extracts title, creator, update time, description and song titles from a
     * song-list detail page, dumps the result and passes it to saveSonglist().
     *
     * A field that cannot be found is stored as an empty string (an empty array
     * for the songs) and an "error:get ... fail" line is printed.
     *
     * @param object $DocInfo Document info object; its url and content
     *                        properties are read.
     */
    public function parseSonglistDetail($DocInfo)
    {
        $content = $DocInfo->content;
        $songlistArr = array('raw_url' => $DocInfo->url);

        // Each value is the text between the labelled <span> and the next tag.
        // "[^<]+" replaces the former "[^(<br)]+", which was a character class
        // that rejected any value containing "(", ")", "b" or "r".
        // NOTE(review): the label text must match the page's markup byte for
        // byte (script variant and colon width) -- confirm against a live page.
        $fieldPatterns = array(
            'title'       => "/<span>名稱:<\/span>([^<]+)<br \/>/",
            'creator'     => "/<span>創建人:<\/span>([^<]+)<br \/>/",
            'create_date' => "/<span>更新時間:<\/span>([^<]+)<br \/>/",
            'info'        => "/<span>簡介:<\/span>([^<]*)<\/p>/",
        );

        foreach ($fieldPatterns as $field => $pat) {
            $matches = array();
            if (preg_match($pat, $content, $matches) > 0) {
                $songlistArr[$field] = $matches[1];
            } else {
                $songlistArr[$field] = "";
                print "error:get " . $field . " fail<br/>";
            }
        }

        // Songs: always provide a 'songs' array so saveSonglist() can iterate
        // it even when nothing matched (the old failure branch set 'song').
        $songlistArr['songs'] = array();
        $matches = array();
        $pat = "/<a title=\"([^\"]+)\" hidefocus=\"/";
        if (preg_match_all($pat, $content, $matches) > 0) {
            foreach ($matches[1] as $song_title) {
                $songlistArr['songs'][] = array('title' => $song_title);
            }
        } else {
            print "error:get song fail<br/>";
        }

        echo "<pre>";
        print_r($songlistArr);
        echo "</pre>";

        $this->saveSonglist($songlistArr);
    }

    /**
     * Stores one song list and its songs in the "songlist" database.
     *
     * Inserts a row into the `songlist` table, then one row per song into the
     * `song` table referencing the new song-list id. Uses mysqli prepared
     * statements, so scraped text is never concatenated into SQL. Database
     * errors are reported with an "error:" line instead of aborting the crawl.
     *
     * @param array $songlistArr Keys read: title, create_date, creator,
     *                           raw_url, info and songs (a list of arrays
     *                           each holding a 'title'). Missing keys are
     *                           treated as empty.
     */
    public function saveSonglist($songlistArr)
    {
        // Normalise the scalar fields; missing keys become empty strings.
        $row = array();
        foreach (array('title', 'create_date', 'creator', 'raw_url', 'info') as $key) {
            $row[$key] = isset($songlistArr[$key]) ? (string) $songlistArr[$key] : "";
        }
        $songs = (isset($songlistArr['songs']) && is_array($songlistArr['songs']))
            ? $songlistArr['songs']
            : array();

        // Make mysqli throw on every failure so that all PHP versions behave
        // the same way. Note that this setting is process-wide.
        mysqli_report(MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT);

        $conn = null;
        try {
            // Connect to the database.
            $conn = new mysqli("localhost", "root", "root", "songlist");
            $conn->set_charset("utf8");

            // NOTE(review): the column name "creat_time" is kept exactly as in
            // the original statement -- confirm against the table schema.
            $stmt = $conn->prepare(
                "insert into songlist set title=?, creat_time=?, creator=?, raw_url=?, info=?"
            );
            $stmt->bind_param(
                "sssss",
                $row['title'],
                $row['create_date'],
                $row['creator'],
                $row['raw_url'],
                $row['info']
            );
            $stmt->execute();
            $songlistId = (int) $conn->insert_id;
            $stmt->close();

            if (count($songs) > 0) {
                // Prepare once; the bound variables are re-read on each execute().
                $songTitle = "";
                $songStmt = $conn->prepare("insert into song set title=?, songlist_id=?");
                $songStmt->bind_param("si", $songTitle, $songlistId);
                foreach ($songs as $song) {
                    $songTitle = isset($song['title']) ? (string) $song['title'] : "";
                    $songStmt->execute();
                }
                $songStmt->close();
            }
        } catch (mysqli_sql_exception $e) {
            print "error:save songlist fail: " . $e->getMessage() . "<br/>";
        }

        if ($conn !== null) {
            $conn->close();
        }
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
// Create the crawler.
$crawler = new MyCrawler();

// URL to crawl: the starting point of the crawl.
$start_url = "www.kugou.com/yy/special/index/1-0-2.html";
$crawler->setURL($start_url);

// Only receive content of files with content-type "text/html".
$crawler->addContentTypeReceiveRule("#text/html#");

// Follow links to the individual song-list pages ("i" = case-insensitive).
$crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/single/\d+\.html# i");

// Follow links to the song-list index pages (pagination / "next page").
// The dot before "html" is escaped so it only matches a literal ".",
// consistent with the rule above.
$crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/index/\d+-0-2\.html# i");

// Ignore links to pictures, don't even request pictures.
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does.
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes; for testing we don't want to
// "suck" the whole site). Per the original note, 0 means unlimited.
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go.
$crawler->go();

// At the end, after the process is finished, print a short report
// (see method getProcessReport() for more information).
$report = $crawler->getProcessReport();

// Line break for output: "\n" in CLI mode, "<br />" otherwise.
$lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
?>
總結
以上是生活随笔為你收集整理的PHPCrawler抓取酷狗精选集歌单的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Decision tree(决策树)算法
- 下一篇: 教你分割视频,用多个视频随机合并,添加音