當前位置：首頁 > 编程语言 > php >内容正文

php

PHPCrawler抓取酷狗精选集歌单

發布時間：2023/12/20 php 29 豆豆

生活随笔收集整理的這篇文章主要介紹了《PHPCrawler抓取酷狗精选集歌单》，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

一、PHPCrawler的介紹與安裝

先了解一下什么是抓取？
抓取就是網絡爬蟲，也就是人們常說的網絡蜘蛛（spider）。是搜索引擎的一個重要組成部分，按照一定的邏輯和算法抓取和下載互聯網上的信息和網頁。一般的爬蟲從一個起始 URL（start URL）開始，按照一定的策略開始爬取，把爬取到的新的 URL 放入爬取隊列中，然后進行新一輪的爬取，直到抓取完畢為止。
PHPCrawler是一個國外開源的爬蟲系統，它的源碼托管在sourceforge里，這是它的下載地址：點擊打開鏈接（https://sourceforge.net/projects/phpcrawl/），根據自己電腦里安裝的PHP版本選擇合適的版本下載。下載完畢之后，解壓到服務器網站根目錄下，復制example.php文件，并重命名。

二、完整源碼

<?php
// It may take a while to crawl a site ...
set_time_limit(10000);

// Include the phpcrawl main class
include("libs/PHPCrawler.class.php");

/**
 * Crawler that extracts Kugou "special" playlist pages and stores them in MySQL.
 *
 * Extends PHPCrawler and overrides handleDocumentInfo(). Every requested
 * document is logged; documents whose URL matches the playlist detail pattern
 * are parsed by parseSonglistDetail() and persisted by saveSonglist().
 */
class MyCrawler extends PHPCrawler
{
    /**
     * Logs one requested document and dispatches playlist pages to the parser.
     *
     * @param object $DocInfo Document info object supplied by PHPCrawler; this
     *                        method reads url, http_status_code, referer_url,
     *                        received and bytes_received.
     */
    public function handleDocumentInfo($DocInfo)
    {
        $lb = $this->outputLineBreak();

        // Print the URL and the HTTP status code
        echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;

        // Print the referring URL
        echo "Referer-page: ".$DocInfo->referer_url.$lb;

        // Print whether the content of the document was received
        if ($DocInfo->received == true) {
            echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
        } else {
            echo "Content not received".$lb;
        }

        // Only playlist detail pages are parsed, and only when a body actually
        // arrived; parsing an empty body would just store an empty playlist.
        $pat = '/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/';
        if ($DocInfo->received == true && preg_match($pat, $DocInfo->url) > 0) {
            $this->parseSonglistDetail($DocInfo);
        }

        flush();
    }

    /**
     * Extracts title, creator, update time, description and song titles from a
     * playlist detail page, prints the result and hands it to saveSonglist().
     *
     * @param object $DocInfo Document info object; url and content are read.
     */
    public function parseSonglistDetail($DocInfo)
    {
        $content = (string) $DocInfo->content;

        // NOTE(review): each label accepts both the Traditional and the
        // Simplified spelling because the served page's script could not be
        // confirmed from this file - verify against the live markup.
        // "[^<]+" captures the text up to the next tag; the previous
        // "[^(<br)]+" was a character class that stopped at any of ( < b r ).
        $songlistArr = array(
            'raw_url'     => $DocInfo->url,
            'title'       => $this->extractSonglistField('/(?:名稱|名称)：<\/span>\s*([^<]+)/', $content, 'title'),
            'creator'     => $this->extractSonglistField('/(?:創建人|创建人)：<\/span>\s*([^<]+)/', $content, 'creator'),
            'create_date' => $this->extractSonglistField('/(?:更新時間|更新时间)：<\/span>\s*([^<]+)/', $content, 'create_date'),
            'info'        => $this->extractSonglistField('/(?:簡介|简介)：<\/span>(.*?)<\/p>/s', $content, 'info'),
            // Always present so that saveSonglist() can iterate it safely.
            'songs'       => array(),
        );

        // Song titles are taken from the title attribute of the song anchors.
        $matches = array();
        $found = preg_match_all('/<a title="([^"]+)" hidefocus="/', $content, $matches);
        if ($found > 0) {
            foreach ($matches[1] as $song_title) {
                $songlistArr['songs'][] = array('title' => $song_title);
            }
        } else {
            print "error:get song fail".$this->outputLineBreak();
        }

        echo "<pre>";
        print_r($songlistArr);
        echo "</pre>";

        $this->saveSonglist($songlistArr);
    }

    /**
     * Inserts one playlist and its songs into the "songlist" database.
     *
     * Uses mysqli prepared statements: the removed mysql_* API is no longer
     * available on PHP 7+, and the values are scraped (untrusted) text.
     * Failures are reported on output instead of being silently ignored.
     *
     * @param array $songlistArr Keys read: title, create_date, creator,
     *                           raw_url, info, songs (list of arrays with a
     *                           'title' key).
     */
    public function saveSonglist($songlistArr)
    {
        $lb = $this->outputLineBreak();

        $title      = isset($songlistArr['title']) ? $songlistArr['title'] : "";
        $createTime = isset($songlistArr['create_date']) ? $songlistArr['create_date'] : "";
        $creator    = isset($songlistArr['creator']) ? $songlistArr['creator'] : "";
        $rawUrl     = isset($songlistArr['raw_url']) ? $songlistArr['raw_url'] : "";
        $info       = isset($songlistArr['info']) ? $songlistArr['info'] : "";
        $songs      = (isset($songlistArr['songs']) && is_array($songlistArr['songs']))
            ? $songlistArr['songs']
            : array();

        $conn = null;
        try {
            // Connect to the database. Depending on the PHP version, mysqli
            // either returns false or throws on failure; both paths end up in
            // the catch block below.
            $conn = mysqli_connect("localhost", "root", "root", "songlist");
            if (!$conn) {
                throw new RuntimeException("connect: ".mysqli_connect_error());
            }
            if (!mysqli_set_charset($conn, "utf8")) {
                throw new RuntimeException("set charset: ".mysqli_error($conn));
            }

            // NOTE(review): column name "creat_time" is kept exactly as in the
            // original statement; confirm it against the real table schema.
            $listStmt = mysqli_prepare(
                $conn,
                "insert into songlist set title=?, creat_time=?, creator=?, raw_url=?, info=?"
            );
            if (!$listStmt) {
                throw new RuntimeException("prepare songlist: ".mysqli_error($conn));
            }
            mysqli_stmt_bind_param($listStmt, "sssss", $title, $createTime, $creator, $rawUrl, $info);
            if (!mysqli_stmt_execute($listStmt)) {
                throw new RuntimeException("insert songlist: ".mysqli_stmt_error($listStmt));
            }
            $songlistId = mysqli_insert_id($conn);
            mysqli_stmt_close($listStmt);

            if (count($songs) > 0) {
                $songStmt = mysqli_prepare($conn, "insert into song set title=?, songlist_id=?");
                if (!$songStmt) {
                    throw new RuntimeException("prepare song: ".mysqli_error($conn));
                }
                // Parameters are bound by reference, so assigning $songTitle
                // inside the loop changes the value sent by each execute().
                $songTitle = "";
                mysqli_stmt_bind_param($songStmt, "si", $songTitle, $songlistId);
                foreach ($songs as $song) {
                    $songTitle = isset($song['title']) ? $song['title'] : "";
                    if (!mysqli_stmt_execute($songStmt)) {
                        throw new RuntimeException("insert song: ".mysqli_stmt_error($songStmt));
                    }
                }
                mysqli_stmt_close($songStmt);
            }
        } catch (Exception $e) {
            print "error:save songlist fail (".$e->getMessage().")".$lb;
        }

        if ($conn) {
            mysqli_close($conn);
        }
    }

    /**
     * Returns the line break for the current output channel: "\n" on the
     * command line, an HTML line break otherwise.
     */
    private function outputLineBreak()
    {
        return (PHP_SAPI == "cli") ? "\n" : "<br />";
    }

    /**
     * Returns the trimmed first capture group of $pattern in $content, or ""
     * (after printing an error naming $field) when the pattern does not match.
     */
    private function extractSonglistField($pattern, $content, $field)
    {
        $matches = array();
        if (preg_match($pattern, $content, $matches) > 0) {
            return trim($matches[1]);
        }

        print "error:get ".$field." fail".$this->outputLineBreak();
        return "";
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
// Create the crawler
$crawler = new MyCrawler();

// URL to start crawling from
$start_url = "www.kugou.com/yy/special/index/1-0-2.html";
$crawler->setURL($start_url);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule('#text/html#');

// Follow links to each playlist detail page (i = case-insensitive)
$crawler->addURLFollowRule('#http://www\.kugou\.com/yy/special/single/\d+\.html#i');

// Follow the "next page" links of the playlist index.
// The dot before "html" is escaped so it only matches a literal dot.
$crawler->addURLFollowRule('#http://www\.kugou\.com/yy/special/index/\d+-0-2\.html#i');

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule('#\.(jpg|jpeg|gif|png)$#i');

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes; 0 means unlimited).
// For testing we don't want to "suck" the whole site.
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

// "\n" on the command line, an HTML line break when served to a browser
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>

總結

以上是生活随笔為你收集整理的PHPCrawler抓取酷狗精选集歌单的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： Decision tree(决策树)算法
下一篇：教你分割视频，用多个视频随机合并，添加音

3atv精品不卡视频,97人人超碰国产精品最新,中文字幕av一区二区三区人妻少妇,久久久精品波多野结衣,日韩一区二区三区精品

php

PHPCrawler抓取酷狗精选集歌单

一、PHPCrawler的介紹與安裝

二、完整源碼

總結