网站数据采集程序(爬虫)
生活随笔
收集整理的這篇文章主要介紹了
网站数据采集程序(爬虫)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
采集數據無非就是三步,抓取頁面,分析數據,入庫。
第一步:抓取頁面
抓取頁面也是在網上找的例子,主要是用到了2個方法
1,獲取網站內容;2,清除html標簽。具體看代碼:
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <returns>
/// The response body, or an empty string when the request or the read fails for any reason.
/// </returns>
public string GetContentUrl(string url)
{
    try
    {
        // Pause 500 ms before every request (presumably to avoid hammering the target site).
        System.Threading.Thread.Sleep(500);
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        // Send a browser-like User-Agent so anti-scraping filters do not reject the request.
        req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215; CrazyCoder.cn;www.aligong.com)";
        req.ReadWriteTimeout = 30000;
        req.Timeout = 300000;
        req.Proxy = null;

        // Stacked using blocks release the response (and its connection) even when
        // reading the body throws; previously the response was only closed on success.
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(body))
        {
            return sr.ReadToEnd();
        }
    }
    catch
    {
        // Best effort by design: callers treat an empty string as "page unavailable".
        return string.Empty;
    }
}
View Code
/// <summary>
/// Removes HTML tags from a fragment, then removes every space character and trims the result.
/// </summary>
/// <param name="ContentStr">HTML fragment to clean.</param>
/// <returns>
/// The text with each <c>&lt;...&gt;</c> span removed, or an empty string for null/empty input.
/// A <c>&lt;</c> that is never closed is left in place.
/// </returns>
public string ClearLable(string ContentStr)
{
    if (string.IsNullOrEmpty(ContentStr))
    {
        return string.Empty;
    }

    int begin = ContentStr.IndexOf('<');
    while (begin >= 0)
    {
        // Search for the closing bracket only AFTER the opening one. Taking the first
        // '>' anywhere in the text produced a negative Substring length (and an
        // exception) whenever a stray '>' preceded the first tag.
        int end = ContentStr.IndexOf('>', begin);
        if (end < 0)
        {
            break;
        }

        string tag = ContentStr.Substring(begin, end - begin + 1);
        // Replace drops every occurrence of this exact tag text in one pass.
        ContentStr = ContentStr.Replace(tag, "");
        begin = ContentStr.IndexOf('<');
    }

    ContentStr = ContentStr.Replace(" ", "");
    return ContentStr.Trim();
}
View Code
第二步:分析數據
通過html正則模板獲取到匹配的正則,然后取得正則匹配的集合。放入自己的集合里分析它
/// <summary>
/// Downloads a listing page and returns the list items found in it.
/// </summary>
/// <param name="url">Address of the listing page.</param>
/// <returns>
/// The items produced by <c>DealHtmlContentList</c>; an empty list (never null) when the
/// page could not be downloaded or came back blank, so callers can enumerate the result directly.
/// </returns>
public List<String> GetListURl(string url)
{
    string htmlContent = GetContentUrl(url);
    if (string.IsNullOrWhiteSpace(htmlContent))
    {
        return new List<String>();
    }

    return DealHtmlContentList(htmlContent);
}
/// <summary>
/// Extracts every <c>&lt;li&gt;...&lt;/li&gt;</c> element from the first
/// <c>&lt;ul id="house-lst" class="house-lst"&gt;</c> block of a page.
/// </summary>
/// <param name="htmlContent">Full HTML of the listing page.</param>
/// <returns>
/// The matched <c>&lt;li&gt;</c> elements, tags included, in document order; an empty list
/// when the input is null/empty or the list block (or its closing tag) is not found.
/// </returns>
private List<String> DealHtmlContentList(string htmlContent)
{
    List<string> listStr = new List<string>();
    if (string.IsNullOrEmpty(htmlContent))
    {
        return listStr;
    }

    // The quotes inside the marker must be escaped for the literal to compile.
    const string sLi = "<ul id=\"house-lst\" class=\"house-lst\">";
    const string eLi = "</ul>";

    // Ordinal search: this is markup, not linguistic text. A marker at index 0 is valid.
    int start = htmlContent.IndexOf(sLi, StringComparison.Ordinal);
    if (start < 0)
    {
        return listStr;
    }

    int end = htmlContent.IndexOf(eLi, start, StringComparison.Ordinal);
    if (end < 0)
    {
        return listStr;
    }

    // Slice from the opening marker up to (not including) the closing tag.
    string arryLi = htmlContent.Substring(start, end - start);

    // Singleline lets '.' span line breaks inside an item.
    Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
    foreach (Match mch in regli.Matches(arryLi))
    {
        listStr.Add(mch.Value);
    }

    return listStr;
}
View Code
以上代碼先獲取網頁內容,截取出列表所在的那段html代碼,再用正則匹配出每個列表項并以集合返回。這只是列表頁的數據,詳情頁的處理如下:
/// <summary>
/// Downloads a detail page and turns it into an SQL statement via <c>DealHtmlContentDetail</c>.
/// </summary>
/// <param name="url">Address of the detail page.</param>
/// <returns>
/// The text produced by <c>DealHtmlContentDetail</c>, or null when the page could not be
/// downloaded or came back blank.
/// </returns>
public string GetListDetail(string url)
{
    string pageHtml = GetContentUrl(url);
    return string.IsNullOrWhiteSpace(pageHtml)
        ? null
        : DealHtmlContentDetail(pageHtml);
}
9
/// <summary>
/// Builds an INSERT statement template for table <c>LJHostInfo</c> from the first
/// <c>&lt;ol&gt;</c> block of a detail page.
/// </summary>
/// <remarks>
/// The result keeps the composite-format placeholders <c>{0}</c> (Title) and <c>{1}</c>
/// (AveragePrice) for the caller to fill in with a Format/AppendFormat call. Scraped values
/// are therefore escaped twice over: single quotes are doubled for SQL, and braces are
/// doubled so they survive the later format call as literal characters.
/// </remarks>
/// <param name="htmlContent">Full HTML of the detail page.</param>
/// <returns>
/// The statement template, or an empty string when the input is null/empty or no
/// <c>&lt;ol&gt;...&lt;/ol&gt;</c> block is found.
/// </returns>
private string DealHtmlContentDetail(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    const string sDiv = "<ol>";
    const string eDiv = "</ol>";

    // Ordinal search; a block starting at index 0 is valid.
    int start = htmlContent.IndexOf(sDiv, StringComparison.Ordinal);
    if (start < 0)
    {
        return string.Empty;
    }

    int end = htmlContent.IndexOf(eDiv, start, StringComparison.Ordinal);
    if (end < 0)
    {
        return string.Empty;
    }

    string arryDiv = htmlContent.Substring(start, end - start);

    Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
    Regex reglable = new Regex("<label>(.*?)</label>", RegexOptions.Singleline);
    // The quotes inside the pattern must be escaped for the literal to compile.
    Regex regspan = new Regex("<span class=\"other\">(.*?)</span>", RegexOptions.Singleline);

    string InsertSql = "INSERT INTO LJHostInfo(Title,AveragePrice,";
    string InsertSqlParam = "('{0}','{1}',";

    for (Match mch = regli.Match(arryDiv); mch.Success; mch = mch.NextMatch())
    {
        Match mlable = reglable.Match(mch.Value);
        if (!mlable.Success)
        {
            continue;
        }

        Match mspan = regspan.Match(mch.Value);
        string value = ClearLable(mspan.Value);
        string column = null;
        // Some rows carry a second label/value pair that maps to its own column.
        string secondColumn = null;

        switch (ClearLable(mlable.Value))
        {
            case "建筑年代:":
                column = "BuildYear";
                break;
            case "建筑類型:":
                column = "BuildType";
                break;
            case "物業費用:":
                column = "PropertyPrice";
                break;
            case "物業公司:":
                column = "PropertyCompany";
                break;
            case "開發商:":
                column = "Developers";
                break;
            case "樓棟總數:":
                column = "FloorNum";
                secondColumn = "Rate";
                break;
            case "房屋總數:":
                column = "HousesNum";
                secondColumn = "GreenRates";
                break;
            case "所屬學區:":
                column = "SchoolAddress";
                break;
            case "附近門店:":
                column = "NearbyAddress";
                // This row's value is the whole item text minus its label.
                value = ClearLable(mch.Value).Replace("附近門店:","").Trim().Replace(" ","");
                break;
        }

        if (column == null)
        {
            // Unrecognised label: contributes nothing to the statement.
            continue;
        }

        InsertSql += column + ",";
        InsertSqlParam += "'" + EscapeSqlTemplateValue(value) + "',";

        if (secondColumn != null && mlable.NextMatch().Success)
        {
            InsertSql += secondColumn + ",";
            InsertSqlParam += "'" + EscapeSqlTemplateValue(ClearLable(mspan.NextMatch().Value)) + "',";
        }
    }

    InsertSql = InsertSql.TrimEnd(',') + ") ";
    InsertSqlParam = InsertSqlParam.TrimEnd(',') + ")";
    return InsertSql + "VALUES" + InsertSqlParam;
}

/// <summary>
/// Escapes a scraped value for embedding in a quoted SQL literal inside a composite-format template.
/// </summary>
/// <param name="value">Raw value; null is treated as empty.</param>
/// <returns>The value with <c>'</c>, <c>{</c> and <c>}</c> each doubled.</returns>
private static string EscapeSqlTemplateValue(string value)
{
    if (string.IsNullOrEmpty(value))
    {
        return string.Empty;
    }

    return value.Replace("'", "''").Replace("{", "{{").Replace("}", "}}");
}
View Code
需要注意的就是匹配數據去掉html標簽,加入sql語句。重復的匹配再插入
第三步:多線程任務類
/// <summary>
/// Processes the list items scraped from one listing page: for each item it fetches the
/// detail page, appends the resulting INSERT statement to the owning form's SQL buffer,
/// and reports progress on the form.
/// </summary>
public class ThreadWorker
{
    // Private gate instead of lock(this), so outside code can never contend on the same monitor.
    private readonly object syncRoot = new object();
    private ClumbForm cForm;
    private List<String> list;
    private string siteUrl = "@$#@$#@$#@$@#$#@$#@$#@$";// value deliberately obscured by the author
    private LianJiaCaiJi caiji = new LianJiaCaiJi();

    /// <summary>
    /// Creates a worker for one page of list items.
    /// </summary>
    /// <param name="cf">Form that receives progress messages and accumulates the SQL.</param>
    /// <param name="_list">List-item HTML fragments to process; null is treated as empty.</param>
    public ThreadWorker(ClumbForm cf, List<String> _list)
    {
        cForm = cf;
        list = _list;
    }

    /// <summary>
    /// Processes every list item that contains an <c>&lt;h2&gt;</c> heading.
    /// </summary>
    public void StartWorker()
    {
        Regex regh2 = new Regex("<h2>(.*?)</h2>", RegexOptions.Singleline);
        // The quotes inside the pattern must be escaped for the literal to compile.
        Regex regspan = new Regex("<span class=\"num\">(.*?)</span>", RegexOptions.Singleline);

        if (list != null)
        {
            foreach (var item in list)
            {
                Match m = regh2.Match(item);
                if (!m.Success)
                {
                    continue;
                }

                lock (syncRoot)
                {
                    Match ms = regspan.Match(item);
                    string title = GetQuotationContent(m.Value, "title");
                    string price = ms.Success ? caiji.ClearLable(ms.Value) : "0.00";

                    cForm.TotalCount += 1;

                    // The detail result is a format template with {0}=title and {1}=price.
                    // Skip it when the detail page yielded nothing: AppendFormat throws on a null format.
                    string sqlTemplate = caiji.GetListDetail(siteUrl + GetQuotationContent(m.Value, "href"));
                    if (!string.IsNullOrEmpty(sqlTemplate))
                    {
                        cForm.SBINSERTSQL.AppendFormat(sqlTemplate, EscapeSqlLiteral(title), EscapeSqlLiteral(price));
                    }

                    cForm.ShowMsg("已完成:" + title + "小區,價格:" + price + " 完成時間:" + System.DateTime.Now.ToString());
                    cForm.ShowLableMsg(cForm.TotalCount + "");
                }
            }
        }

        cForm.ShowMsg("已完成第:" + cForm.TotalCount + "頁數據采集, 完成時間:" + System.DateTime.Now.ToString());
    }

    /// <summary>
    /// Returns the double-quoted value that follows the first occurrence of an attribute name.
    /// </summary>
    /// <param name="content">Markup to search, e.g. an anchor element.</param>
    /// <param name="tag">Attribute name, e.g. <c>href</c> or <c>title</c>.</param>
    /// <returns>
    /// The text between the quotes, or an empty string when the name or the closing quote is missing.
    /// </returns>
    private string GetQuotationContent(string content, string tag)
    {
        if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(tag))
        {
            return "";
        }

        // Test the raw index: the old code added 2 first, so "not found" (-1) passed the >= 0 check.
        int nameIndex = content.IndexOf(tag, StringComparison.Ordinal);
        if (nameIndex < 0)
        {
            return "";
        }

        // Skip the name plus the two characters (=") that separate it from the value.
        int valueStart = nameIndex + tag.Length + 2;
        if (valueStart > content.Length)
        {
            return "";
        }

        int valueEnd = content.IndexOf('"', valueStart);
        if (valueEnd < 0)
        {
            return "";
        }

        return content.Substring(valueStart, valueEnd - valueStart);
    }

    /// <summary>
    /// Doubles single quotes so the value can sit inside a quoted SQL literal.
    /// </summary>
    /// <param name="value">Raw value; null is treated as empty.</param>
    /// <returns>The value with each <c>'</c> doubled.</returns>
    private static string EscapeSqlLiteral(string value)
    {
        return string.IsNullOrEmpty(value) ? string.Empty : value.Replace("'", "''");
    }
}
View Code
然后是任務執行
/// <summary>
/// Validates the page range typed by the user and starts the scrape on a thread-pool thread.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnCaiJi_Click(object sender, EventArgs e)
{
    // Reset to the initial state.
    listBoxMessage.Items.Clear();
    IsComplete = false;

    // TryParse covers blank input as well as non-numeric text, which made int.Parse throw.
    int startPage;
    int endPage;
    if (!int.TryParse(txtPageStart.Text, out startPage) || !int.TryParse(txtPageEnd.Text, out endPage))
    {
        MessageBox.Show("請輸入采集頁數!");
        return;
    }

    if (startPage > 100)
    {
        MessageBox.Show("采集頁數只能在100以內!");
        return;
    }

    ShowMsg("開始時間:" + System.DateTime.Now.ToString() + " 處理中請等待....");
    _cts = new CancellationTokenSource();

    // Capture the values here, on the UI thread; the work item must not read controls
    // or the _cts field later from the pool thread.
    CancellationToken token = _cts.Token;
    ThreadPool.QueueUserWorkItem(state => CountTo(startPage, token));
}
21
/// <summary>
/// Scrapes listing pages one after another, from <paramref name="countTo"/> up to the page
/// number entered in the end-page text box, then reports completion.
/// </summary>
/// <param name="countTo">First page to scrape; incremented as the loop advances.</param>
/// <param name="ct">Token that stops the loop before the next page when cancellation is requested.</param>
private void CountTo(int countTo, CancellationToken ct)
{
    // Read the end page once, on the UI thread: this method runs on a pool thread and
    // must not touch controls directly. Unparsable text yields int.MinValue, so the loop
    // body never runs.
    int lastPage = (int)this.Invoke(new Func<int>(() =>
    {
        int parsed;
        return int.TryParse(txtPageEnd.Text, out parsed) ? parsed : int.MinValue;
    }));

    for (; countTo <= lastPage; countTo++)
    {
        // Check before downloading, so a cancelled run does not fetch one more page.
        if (ct.IsCancellationRequested)
        {
            break;
        }

        tw = new ThreadWorker(this, caiji.GetListURl(string.Format(url, countTo)));
        // Invoke runs StartWorker on the thread that created this form's controls.
        // NOTE(review): that also puts StartWorker's detail-page downloads on that thread — confirm this is intended.
        this.Invoke(new Action(tw.StartWorker));
        Thread.Sleep(200);
    }

    IsComplete = true;
    ShowMsg("結束時間:" + System.DateTime.Now.ToString() + " 采集完成,總條數:"+TotalCount);
}
43
/// <summary>
/// Appends a status line to the message list box and scrolls to it, re-dispatching through
/// <c>Invoke</c> when called from a thread other than the list box's own.
/// </summary>
/// <param name="msg">Text to display.</param>
public void ShowMsg(string msg)
{
    try
    {
        if (listBoxMessage.InvokeRequired)
        {
            // Wrong thread: re-enter this method through the form's Invoke.
            this.Invoke(new GetMsgDelegate(ShowMsg), new object[] { msg });
            return;
        }

        listBoxMessage.Items.Add(msg);
        // Selecting the newest entry keeps the list scrolled to the bottom.
        listBoxMessage.SelectedItem = listBoxMessage.Items[listBoxMessage.Items.Count - 1];

        // Both buttons are usable only once the run has completed.
        bool finished = IsComplete;
        btnCaiJi.Enabled = finished;
        btnExceSql.Enabled = finished;
    }
    catch { }
}
View Code
執行時界面
總結
以上是生活随笔為你收集整理的网站数据采集程序(爬虫)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Linux下载——下载文件的命令
- 下一篇: MySQL80数据库报错1045解决方法