DotnetSpider (一) 架构的理解、应用、搭建
本文鏈接:http://www.cnblogs.com/grom/p/8931650.html
受業務影響,決定將Downloader單獨分層,做出修改。
最近在做爬蟲,之前一直在使用 HttpWebRequest 和 WebClient,很方便快捷,也很適合新手,但隨著抓取任務的增多,多任務、多庫等情況的出現,使用一個優秀的爬蟲框架是十分必要的。于是開始接觸dotnetspider。
借鑒一下框架的設計圖,在引入dotnetspider的NuGet包后,我基本也是按照這個進行了分層
Data.Spider - 存放前臺頁面(Winform、控制臺)和實體爬蟲(EntitySpider)等,相當于發起請求的起點。
Spider.Downloader - 封裝請求等信息,可實現自定義cookie等,非必須。
Spider.Processor - 處理器,實現 IPageProcessor 接口,對抓取內容進行處理
Spider.Pipe - 管道,我將它理解為經過了 Processor 處理后的一個回調,將處理好的數據存儲(文件、數據庫等)
Spider.Entity - 數據實體類,繼承 SpiderEntity
Spider.Command - 一些常用的公用命令,我這目前存放著轉數據格式類,后臺執行JS類,SqlHelper(因架構自帶數據庫管道,暫時沒用)等
這樣的分層也是參考了源碼的示例
隨著這幾天的嘗試,發現這個框架真的非常靈活,以凹凸租車的爬蟲為例,上代碼
實體類:
/// <summary>
/// Entity describing one car entry taken from the Atzuche search response.
/// Each element matched by the JsonPath <c>$.data.content[*]</c> is mapped to one instance,
/// and every property is filled from the JsonPath given in its <c>PropertyDefine</c> attribute.
/// </summary>
/// <remarks>
/// NOTE(review): the <c>EntityTable</c> arguments are presumably the database name, the table name
/// and a per-day table suffix — confirm against the DotnetSpider version in use.
/// Property names mirror the JSON field names of the response, hence the lower camel case.
/// </remarks>
[EntityTable("CarWinsSpider", "AtzucheCar", EntityTable.Today)]
[EntitySelector(Expression = "$.data.content[*]", Type = SelectorType.JsonPath)]
public class AtzucheModel : SpiderEntity
{
/// <summary>
/// Vehicle number.
/// </summary>
[PropertyDefine(Expression = "$.carNo", Type = SelectorType.JsonPath)]
public int carNo { get; set; }
/// <summary>
/// Brand.
/// </summary>
// Disabled formatters that would strip whitespace / quote characters from the value.
//[ReplaceFormatter(NewValue = "", OldValue = "\r")]
//[ReplaceFormatter(NewValue = "", OldValue = "\t")]
//[ReplaceFormatter(NewValue = "", OldValue = " ")]
//[ReplaceFormatter(NewValue = "", OldValue = "\n")]
//[ReplaceFormatter(NewValue = "", OldValue = "\"")]
//[ReplaceFormatter(NewValue = "", OldValue = " ")]
[PropertyDefine(Expression = "$.brand", Type = SelectorType.JsonPath)]
public string brand { get; set; }
/// <summary>
/// Address.
/// </summary>
[PropertyDefine(Expression = "$.carAddr", Type = SelectorType.JsonPath)]
public string carAddr { get; set; }
/// <summary>
/// Car series.
/// </summary>
[PropertyDefine(Expression = "$.type", Type = SelectorType.JsonPath)]
public string type { get; set; }
/// <summary>
/// Engine displacement.
/// </summary>
[PropertyDefine(Expression = "$.sweptVolum", Type = SelectorType.JsonPath)]
public string sweptVolum { get; set; }
/// <summary>
/// Picture (read from the <c>coverPic</c> field).
/// </summary>
[PropertyDefine(Expression = "$.coverPic", Type = SelectorType.JsonPath)]
public string coverPic { get; set; }
/// <summary>
/// Daily rental price.
/// </summary>
[PropertyDefine(Expression = "$.dayPrice", Type = SelectorType.JsonPath)]
public int dayPrice { get; set; }
/// <summary>
/// Number of kilometres (read from the <c>distance</c> field).
/// </summary>
[PropertyDefine(Expression = "$.distance", Type = SelectorType.JsonPath)]
public string distance { get; set; }
/// <summary>
/// Rating score.
/// </summary>
[PropertyDefine(Expression = "$.evalScore", Type = SelectorType.JsonPath)]
public string evalScore { get; set; }
// Raw value of the "gbType" field; its meaning is not documented in this file.
[PropertyDefine(Expression = "$.gbType", Type = SelectorType.JsonPath)]
public string gbType { get; set; }
/// <summary>
/// License plate.
/// </summary>
[PropertyDefine(Expression = "$.plateNum", Type = SelectorType.JsonPath)]
public string plateNum { get; set; }
// Raw value of the "replyTag" field; its meaning is not documented in this file.
[PropertyDefine(Expression = "$.replyTag", Type = SelectorType.JsonPath)]
public string replyTag { get; set; }
// Raw value of the "transCount" field — presumably a transaction count; TODO confirm.
[PropertyDefine(Expression = "$.transCount", Type = SelectorType.JsonPath)]
public string transCount { get; set; }
/// <summary>
/// Model year.
/// </summary>
[PropertyDefine(Expression = "$.year", Type = SelectorType.JsonPath)]
public int year { get; set; }
// The three "is*" fields below are stored as int — presumably 0/1 flags; TODO confirm.
[PropertyDefine(Expression = "$.isPrivilege", Type = SelectorType.JsonPath)]
public int isPrivilege { get; set; }
[PropertyDefine(Expression = "$.isRecommend", Type = SelectorType.JsonPath)]
public int isRecommend { get; set; }
[PropertyDefine(Expression = "$.isUpgrade", Type = SelectorType.JsonPath)]
public int isUpgrade { get; set; }
// "lat" / "lon" are kept as strings — presumably latitude and longitude; TODO confirm.
[PropertyDefine(Expression = "$.lat", Type = SelectorType.JsonPath)]
public string lat { get; set; }
[PropertyDefine(Expression = "$.lon", Type = SelectorType.JsonPath)]
public string lon { get; set; }
// Raw value of the "queryId" field; its meaning is not documented in this file.
[PropertyDefine(Expression = "$.queryId", Type = SelectorType.JsonPath)]
public string queryId { get; set; }
// The remaining int fields are copied verbatim from the response; their value ranges
// are not documented in this file — presumably 0/1 flags; TODO confirm.
[PropertyDefine(Expression = "$.supplyCarService", Type = SelectorType.JsonPath)]
public int supplyCarService { get; set; }
[PropertyDefine(Expression = "$.freeCarService", Type = SelectorType.JsonPath)]
public int freeCarService { get; set; }
[PropertyDefine(Expression = "$.isShenMaCar", Type = SelectorType.JsonPath)]
public int isShenMaCar { get; set; }
[PropertyDefine(Expression = "$.supportGetReturn", Type = SelectorType.JsonPath)]
public int supportGetReturn { get; set; }
[PropertyDefine(Expression = "$.confirmation", Type = SelectorType.JsonPath)]
public int confirmation { get; set; }
}
起始:
/// <summary>
/// Main entry point of the application. Wires a spider up by hand: site settings and
/// request headers, one GET start request, the processor, the pipeline and the custom
/// downloader, then runs it and waits for console input before exiting.
/// </summary>
[STAThread]
static void Main()
{
    var site = new Site
    {
        CycleRetryTimes = 1,
        SleepTime = 200,
        Headers = new Dictionary<string, string>()
        {
            {"Accept","application/json, text/javascript, */*; q=0.01" },
            {"Accept-Encoding","gzip, deflate" },
            // Fixed: this entry used "gzip, deflate" (the Accept-Encoding VALUE) as the header
            // name. The value "zh-CN,zh;q=0.9" is a language preference list, so the header
            // name must be Accept-Language.
            {"Accept-Language","zh-CN,zh;q=0.9" },
            {"X-Requested-With","XMLHttpRequest" },
            { "Referer", "http://www.atzuche.com/hz/car/search"},
            { "Connection","keep-alive" },
            { "Content-Type","application/json;charset=UTF-8" },
            { "Host","www.atzuche.com"},
            { "User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        }
    };

    List<Request> resList = new List<Request>();
    Request res = new Request();
    //res.PostBody = $"id=7&j=%7B%22createMan%22%3A%2218273159100%22%2C%22createTime%22%3A1518433690000%2C%22row%22%3A5%2C%22siteUserActivityListId%22%3A8553%2C%22siteUserPageRowModuleId%22%3A84959%2C%22topids%22%3A%22%22%2C%22wherePhase%22%3A%221%22%2C%22wherePreferential%22%3A%220%22%2C%22whereUsertype%22%3A%220%22%7D&page={i}&shopid=83106681";// reportedly required for POST requests
    res.Url = "http://www.atzuche.com/car/searchListMap/2?cityCode=330100&sceneCode=U002&filterCondition%5Blon%5D=120.219294&filterCondition%5Blat%5D=30.259258&filterCondition%5Bseq%5D=4&pageNum=1&pageSize=0";
    res.Method = System.Net.Http.HttpMethod.Get;
    resList.Add(res);

    var spider = DotnetSpider.Core.Spider.Create(site, new QueueDuplicateRemovedScheduler(), new AtzucheProcessor())
        .AddStartRequests(resList.ToArray()) // start requests; pages are parsed by AtzucheProcessor
        .AddPipeline(new AtzuchePipe());     // callback that receives the processed data
    //----------------------------------
    spider.Monitor = new DotnetSpider.Core.Monitor.NLogMonitor();
    // Custom downloader; the stock alternative is new DotnetSpider.Core.Downloader.HttpClientDownloader().
    spider.Downloader = new AtzucheDownloader();
    // Do not clear the scheduler once the spider has finished.
    spider.ClearSchedulerAfterComplete = false;
    //----------------------------------
    spider.ThreadNum = 1;
    spider.Run();

    Console.WriteLine("Press any key to continue...");
    Console.Read();
}
這里也可將整個抓取方法當做一個Spider實例單獨放置 -> EntitySpider
/// <summary>
/// Main entry point of the application. Runs the crawl through the self-contained
/// <c>AtzucheEntitySpider</c>, attaching the processor and the pipeline to it,
/// then waits for console input before exiting.
/// </summary>
[STAThread]
static void Main()
{
    var entitySpider = new AtzucheEntitySpider();

    // Processor that handles each downloaded page.
    entitySpider.AddPageProcessor(new AtzucheProcessor());
    // Pipeline (callback) that receives the processed results.
    entitySpider.AddPipeline(new AtzuchePipe());
    entitySpider.ThreadNum = 1;

    entitySpider.Run();

    const string prompt = "Press any key to continue...";
    Console.WriteLine(prompt);
    Console.Read();
}
Downloader
對目標的請求全部包含在這里,可以根據需要自行設置,下篇將進行自定義Request的應用
/// <summary>
/// Custom downloader for the Atzuche spider. It currently adds no behaviour of its own and
/// simply forwards every request to an <c>HttpClientDownloader</c>; it exists as the hook
/// where request customisation (cookies, headers, ...) can be added later.
/// </summary>
public class AtzucheDownloader : BaseDownloader
{
    // One inner downloader reused for every request, instead of allocating a new
    // HttpClientDownloader on each call.
    private readonly HttpClientDownloader _inner = new HttpClientDownloader();

    /// <summary>
    /// Downloads the page for <paramref name="request"/> by delegating to the inner downloader.
    /// </summary>
    /// <param name="request">The request to download.</param>
    /// <param name="spider">The spider issuing the request.</param>
    /// <returns>The page returned by the inner <c>HttpClientDownloader</c>.</returns>
    // NOTE: "DowloadContent" (sic) is the name of the base-class member being overridden,
    // so the spelling must stay as it is.
    protected override Page DowloadContent(Request request, ISpider spider)
    {
        return _inner.Download(request, spider);
    }
}
新建爬蟲實體類
/// <summary>
/// Entity spider for the Atzuche car search endpoint. The constructor supplies the spider
/// name and the site settings (retry count, sleep time, request headers); <c>MyInit</c>
/// registers the SQL Server pipeline, the start URL and the entity type.
/// </summary>
public class AtzucheEntitySpider : EntitySpider
{
    /// <summary>
    /// Registers the storage pipeline, the start URL and the entity type to extract.
    /// </summary>
    /// <param name="arguments">Arguments passed in by the framework; not used here.</param>
    protected override void MyInit(params string[] arguments)
    {
        // Author's note: the database name in the connection string must not contain '.',
        // it was tested and raises an error.
        // NOTE(review): credentials are hard-coded in source; move the connection string to
        // configuration before using this outside a local demo.
        AddPipeline(new SqlServerEntityPipeline("Server=.;Database=AuzucheSpider;uid=sa;pwd=123;MultipleActiveResultSets=true"));
        AddStartUrl("http://www.atzuche.com/car/searchListMap/2?cityCode=330100&sceneCode=U002&filterCondition%5Blon%5D=120.219294&filterCondition%5Blat%5D=30.259258&filterCondition%5Bseq%5D=4&pageNum=1&pageSize=0");
        // Author's note: when this entity type is added, the framework matches the response using
        // the attributes on the entity class and inserts matches into the database, so the
        // Processor and Pipe can be omitted. Alternatively drop this line and store the data
        // yourself through a custom processor and pipeline.
        AddEntityType<AtzucheModel>();
    }

    /// <summary>
    /// Creates the spider with its name and the site settings used for every request.
    /// </summary>
    public AtzucheEntitySpider() : base("AuzucheSpider", new Site
    {
        CycleRetryTimes = 1,
        SleepTime = 200,
        Headers = new Dictionary<string, string>()
        {
            {"Accept","application/json, text/javascript, */*; q=0.01" },
            {"Accept-Encoding","gzip, deflate" },
            // Fixed: this entry used "gzip, deflate" (the Accept-Encoding VALUE) as the header
            // name. The value "zh-CN,zh;q=0.9" is a language preference list, so the header
            // name must be Accept-Language.
            {"Accept-Language","zh-CN,zh;q=0.9" },
            {"X-Requested-With","XMLHttpRequest" },
            { "Referer", "http://www.atzuche.com/hz/car/search"},
            { "Connection","keep-alive" },
            { "Content-Type","application/json;charset=UTF-8" },
            { "Host","www.atzuche.com"},
            { "User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        }
    })
    {
    }
}
接下來是處理器:
解析抓取的數據封裝到"AtzucheList"內,可在Pipe內通過此名稱獲取處理好的數據。
/// <summary>
/// Page processor for the Atzuche search response: reads the JSON found at
/// <c>$.data.content</c>, deserializes it into a list of <c>AtzucheModel</c> and publishes
/// that list on the page under the result key "AtzucheList".
/// </summary>
public class AtzucheProcessor : IPageProcessor
{
    /// <summary>
    /// Extracts the car list from <paramref name="page"/> and stores it as a result item.
    /// </summary>
    /// <param name="page">The downloaded page whose content is parsed.</param>
    /// <param name="spider">The spider that downloaded the page; not used here.</param>
    public void Process(Page page, ISpider spider)
    {
        // The selected value is JSON text, not HTML.
        var json = page.Selectable.JsonPath("$.data.content").GetValue();

        List<AtzucheModel> list = null;
        // Only deserialize when there is content: DeserializeObject throws on a null string.
        if (!string.IsNullOrWhiteSpace(json))
        {
            list = JsonConvert.DeserializeObject<List<AtzucheModel>>(json);
        }

        // DeserializeObject yields null for the JSON literal "null"; always hand the
        // pipeline a non-null list so consumers can enumerate it safely.
        if (list == null)
        {
            list = new List<AtzucheModel>();
        }

        page.AddResultItem("AtzucheList", list);
    }
}
最后是回調,可在此加入保存數據的代碼,至此結束。
/// <summary>
/// Pipeline (callback) that receives the results produced by the processor. For every result
/// item it reads the list stored under "AtzucheList" and prints the list size followed by
/// one "carNo:type" line per car. Code that persists the data can be added here.
/// </summary>
public class AtzuchePipe : BasePipeline
{
    /// <summary>
    /// Prints the cars contained in each result item to the console.
    /// </summary>
    /// <param name="resultItems">Result items handed over by the spider.</param>
    /// <param name="spider">The spider that produced the results; not used here.</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        if (resultItems == null)
        {
            return;
        }

        foreach (var resultItem in resultItems)
        {
            // Look the list up once; "as" yields null when the stored value is missing or
            // of another type, which previously caused a NullReferenceException on .Count.
            var cars = resultItem.Results["AtzucheList"] as List<AtzucheModel>;
            if (cars == null)
            {
                continue;
            }

            Console.WriteLine(cars.Count);
            foreach (var item in cars)
            {
                // Persist "item" here if the data should be saved.
                Console.WriteLine($"{item.carNo}:{item.type} ");
            }
        }
    }
}
結果圖:
總體來說,此框架對新手還是很友好的,靈活的寫法可以讓我們有較多的方式去實現爬蟲,因為這個爬蟲比較簡單,就先寫到這里,未來如果可能,會再嘗試使用框架內的多線程、代理等功能,如有心得將繼續分享,希望能對跟我一樣的新手有所幫助,十分感謝。
作者:Grom
總結
以上是生活随笔為你收集整理的DotnetSpider (一) 架构的理解、应用、搭建的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 了却君王天下事(说一说了却君王天下事的简
- 下一篇: 中国传媒大学2022年研究生录取分数线(