JAVA爬取虎嗅网截图_java爬虫爬取网站使用多线程(虎嗅网站)
java爬蟲爬取網站使用多線程(虎嗅網站)
java爬蟲爬取網站使用多線程(虎嗅網站)
圖解虎嗅爬蟲優化方案
pom 如下:
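<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.6.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.41</version>
    </dependency>
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.31</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.1</version>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
</dependencies>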
代碼演示如下:
package cn.itcast.huxiu.query;
import java.util.ArrayList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.google.gson.Gson;
import cn.itcast.huxiu.Article;
import cn.itcast.huxiu.ArticleDao;
import cn.itcast.huxiu.ResponseJson;
/**
 * Entry point of a multi-threaded crawler for huxiu.com.
 *
 * <p>The main thread downloads the home page and the "load more" pages, extracts
 * article ids from them and puts the ids on {@link #blockingQueue}. Worker tasks
 * ({@code ProcessPagingThreadQueue}) running on {@link #threadPool} take ids from
 * the queue, download the matching article page and store the parsed result.
 *
 * <p>NOTE: the workers loop forever, so the JVM keeps running after {@code main}
 * returns and has to be stopped externally.
 */
public class HuXiuTest {

    /** Number of worker tasks; equal to the pool size so that every submitted worker actually runs. */
    private static final int WORKER_COUNT = 10;

    /** Charset used to decode a response body when the server does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Endpoint that returns the next page of the article list as JSON. */
    private static final String ARTICLE_LIST_URL = "https://www.huxiu.com/v2_action/article_list";

    /** DAO shared by all workers to persist parsed articles. */
    public static final ArticleDao articleDao = new ArticleDao();

    /** Bounded queue of article ids waiting to be crawled; producers block when it is full. */
    public static final ArrayBlockingQueue<String> blockingQueue = new ArrayBlockingQueue<String>(1000);

    /** Pool on which the worker tasks run. */
    public static final ExecutorService threadPool = Executors.newFixedThreadPool(WORKER_COUNT);

    /** Article URL = {@code prefix + id + end}. */
    public static final String prefix = "https://www.huxiu.com/article/";
    public static final String end = ".html";

    /**
     * Starts the workers, then feeds the queue with the ids found on the home page
     * and on pages 2..9 of the article list.
     *
     * @param args unused
     * @throws Exception if a request or response decoding fails
     */
    public static void main(String[] args) throws Exception {
        // Each worker never returns, so submitting more workers than pool threads
        // would only leave the surplus tasks queued forever.
        for (int i = 0; i < WORKER_COUNT; i++) {
            threadPool.execute(new ProcessPagingThreadQueue());
        }

        // Home page: collect the article ids it lists.
        String indexHtml = getIndex();
        parseIndexHtml(indexHtml);

        // The "load more" button carries the value that has to be posted to get the next page.
        String lastDateline = getValueAndIndexHtml(indexHtml);
        if (lastDateline == null) {
            System.err.println("No data-last_dateline value found on the home page; skipping paging requests");
            return;
        }

        Gson gson = new Gson();
        // One client for all paging requests; client and responses are closed deterministically.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            for (int page = 2; page < 10; page++) {
                HttpPost httpPost = new HttpPost(ARTICLE_LIST_URL);

                // Form parameters of the "load more" request.
                // NOTE(review): the same last_dateline is sent for every page — confirm whether
                // the endpoint expects the value returned by the previous response instead.
                ArrayList<BasicNameValuePair> params = new ArrayList<BasicNameValuePair>();
                params.add(new BasicNameValuePair("huxiu_hash_code", "647893ceb60219effa36193702fd89a3"));
                params.add(new BasicNameValuePair("page", page + ""));
                params.add(new BasicNameValuePair("last_dateline", lastDateline));
                httpPost.setEntity(new UrlEncodedFormEntity(params));

                httpPost.setHeader("User-Agent",
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");

                // The response body is JSON.
                String jsonDate = null;
                try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                    HttpEntity entity = response.getEntity();
                    if (entity != null) {
                        jsonDate = EntityUtils.toString(entity, DEFAULT_CHARSET);
                    }
                }

                // Map the JSON onto ResponseJson; its "data" field holds an HTML fragment
                // with the article entries of this page.
                ResponseJson fromJson = jsonDate == null ? null : gson.fromJson(jsonDate, ResponseJson.class);
                if (fromJson != null) {
                    getDate(fromJson.getData());
                }
                System.out.println(page+"&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&");
            }
        }
    }

    /**
     * Extracts every {@code data-aid} value from an HTML fragment and queues it.
     *
     * @param data HTML fragment of one article-list page; {@code null} is ignored
     */
    private static void getDate(String data) {
        if (data == null) {
            return;
        }
        Document document = Jsoup.parse(data);
        Elements elements = document.select("div[data-aid]");
        for (Element element : elements) {
            String aid = element.attr("data-aid");
            try {
                blockingQueue.put(aid);
                System.out.println(aid);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop producing.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Reads the {@code data-last_dateline} attribute needed to request the next page.
     *
     * @param indexHtml HTML of the home page, may be {@code null}
     * @return the attribute value of the first matching element, or {@code null} when
     *         the HTML is {@code null} or contains no such element
     */
    private static String getValueAndIndexHtml(String indexHtml) {
        if (indexHtml == null) {
            return null;
        }
        Document document = Jsoup.parse(indexHtml);
        Elements select = document.select("div[data-last_dateline]");
        if (select.isEmpty()) {
            return null;
        }
        return select.get(0).attr("data-last_dateline");
    }

    /**
     * Parses an article detail page into an {@link Article}.
     *
     * @param html HTML of the detail page, may be {@code null}
     * @return the parsed article (id and url are not set here), or {@code null} when
     *         {@code html} is {@code null}
     * @throws IndexOutOfBoundsException if one of the expected elements is missing from the page
     */
    public static Article parseXianQingYeMian(String html) {
        if (html == null) {
            return null;
        }
        Article article = new Article();
        Document document = Jsoup.parse(html);

        // Title
        article.setTitle(document.select(".t-h1").get(0).ownText());

        // Author
        article.setAuthor(document.select(".author-name").get(0).text());

        // Publication time: the page uses one of two markups for it.
        Elements elements = document.select("span[class=article-time pull-left]");
        if (elements.size() == 0) {
            article.setCreateTime(document.select(".article-time").get(0).ownText());
        } else {
            article.setCreateTime(elements.get(0).ownText());
        }

        // Body text
        article.setContent(document.select(".article-content-wrap").get(0).text());

        // Like count
        article.setZan(document.select(".num").get(0).ownText());

        // Comment count
        article.setPl(document.select(".article-pl").get(0).ownText());

        System.out.println(article);
        return article;
    }

    /**
     * Queues the id of every article listed on the home page.
     *
     * @param indexHtml HTML of the home page; {@code null} is ignored
     */
    private static void parseIndexHtml(String indexHtml) {
        if (indexHtml == null) {
            return;
        }
        Document document = Jsoup.parse(indexHtml);
        Elements elements = document.select(".mod-info-flow div[data-aid]");
        for (Element element : elements) {
            String aid = element.attr("data-aid");
            try {
                // Hand the id over to the workers through the shared queue.
                blockingQueue.put(aid);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop producing.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Downloads the home page.
     *
     * @return the home page HTML, or {@code null} when the status code is not 200
     * @throws Exception if the request fails
     */
    private static String getIndex() throws Exception {
        String url = "https://www.huxiu.com";
        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        return getHtml(httpGet);
    }

    /**
     * Executes a GET request and returns the response body.
     *
     * <p>The client and the response are closed before returning, so no connection
     * is leaked regardless of the status code or of an exception.
     *
     * @param httpGet the prepared request
     * @return the body decoded with the charset declared by the server (UTF-8 when none
     *         is declared), or {@code null} when the status code is not 200
     * @throws Exception if the request or the decoding fails
     */
    public static String getHtml(HttpGet httpGet) throws Exception {
        String html = null;
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                html = EntityUtils.toString(entity, DEFAULT_CHARSET);
                System.out.println(html);// full content of the returned page
            }
        }
        return html;
    }
}
**********************************************************************************************************************
代碼創建線程
package cn.itcast.huxiu.query;
import org.apache.http.client.methods.HttpGet;
import cn.itcast.huxiu.Article;
/**
 * Worker that crawls article detail pages.
 *
 * <p>It repeatedly takes an article id from {@code HuXiuTest.blockingQueue},
 * downloads {@code HuXiuTest.prefix + id + HuXiuTest.end}, parses the page and
 * saves the resulting {@link Article} through {@code HuXiuTest.articleDao}.
 * The loop ends only when the running thread is interrupted.
 */
public class ProcessPagingThreadQueue extends Thread {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36";

    @Override
    public void run() {
        // The number of ids is not known in advance and they arrive one at a time,
        // so keep taking from the queue until this thread is interrupted.
        while (!Thread.currentThread().isInterrupted()) {
            try {
                // Blocks until an id is available. The cast keeps this line valid
                // whether or not the queue is declared with a type argument.
                String rawId = (String) HuXiuTest.blockingQueue.take();
                int id = Integer.parseInt(rawId);

                HttpGet httpGet = new HttpGet(HuXiuTest.prefix + id + HuXiuTest.end);
                httpGet.addHeader("user-agent", USER_AGENT);

                String html = HuXiuTest.getHtml(httpGet);
                Article article = HuXiuTest.parseXianQingYeMian(html);
                if (article != null) {
                    article.setId(id);
                    HuXiuTest.articleDao.save(article);
                }
            } catch (InterruptedException e) {
                // Restore the flag and leave the loop so the worker can be shut down.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // A failure on one article must not stop the worker; report it and go on.
                e.printStackTrace();
            }
        }
    }
}
**********************************************************************************************************************
實體類
package cn.itcast.huxiu;
public class Article {
private int id;
private String title;
private String author;
private String createTime;
private String sc;
private String zan;
private String pl;
private String content;
private String url;
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getCreateTime() {
return createTime;
}
public void setCreateTime(String createTime) {
this.createTime = createTime;
}
public String getSc() {
return sc;
}
public void setSc(String sc) {
this.sc = sc;
}
public String getZan() {
return zan;
}
public void setZan(String zan) {
this.zan = zan;
}
public String getPl() {
return pl;
}
public void setPl(String pl) {
this.pl = pl;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
@Override
public String toString() {
return "Article [id=" + id + ", title=" + title + ", author=" + author + ", createTime=" + createTime + ", sc="
+ sc + ", zan=" + zan + ", pl=" + pl + ", content=" + content + ", url=" + url + "]";
}
}
數據庫連接
package cn.itcast.huxiu;
import org.springframework.jdbc.core.JdbcTemplate;
import com.mchange.v2.c3p0.ComboPooledDataSource;
/**
 * DAO that stores {@link Article} rows in the {@code huxiu_article} table.
 *
 * <p>It is a {@link JdbcTemplate} backed by a c3p0 {@link ComboPooledDataSource}
 * that is created in the constructor.
 */
public class ArticleDao extends JdbcTemplate {

    /** Connection settings used by the no-argument constructor. */
    private static final String DEFAULT_JDBC_URL = "jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8";
    private static final String DEFAULT_USER = "root";
    private static final String DEFAULT_PASSWORD = "123";

    private static final String INSERT_SQL =
            "INSERT INTO huxiu_article (id, title, author, createTime, zan, pl, sc, content, url ) VALUES( ?,?,?,?,?,?,?,?,?)";

    /** Creates a DAO connected to the default local database with the default credentials. */
    public ArticleDao() {
        this(DEFAULT_JDBC_URL, DEFAULT_USER, DEFAULT_PASSWORD);
    }

    /**
     * Creates a DAO connected to the given database, so that the connection settings
     * do not have to be edited in the source to target another environment.
     *
     * @param jdbcUrl  JDBC URL of the database
     * @param user     database user
     * @param password password of {@code user}
     */
    public ArticleDao(String jdbcUrl, String user, String password) {
        // No driver class is configured explicitly here; only user, password and URL are set.
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        dataSource.setUser(user);
        dataSource.setPassword(password);
        dataSource.setJdbcUrl(jdbcUrl);
        setDataSource(dataSource);
    }

    /**
     * Inserts one article as a new row; values are bound as statement parameters.
     *
     * @param article the article to insert
     */
    public void save(Article article) {
        update(INSERT_SQL,
                article.getId(),
                article.getTitle(),
                article.getAuthor(),
                article.getCreateTime(),
                article.getZan(),
                article.getPl(),
                article.getSc(),
                article.getContent(),
                article.getUrl());
    }
}
實體類
package cn.itcast.huxiu;
/**
 * Target type for the JSON returned by the article-list request.
 *
 * <p>Field names are kept in snake_case; the caller binds the JSON with Gson
 * ({@code gson.fromJson(json, ResponseJson.class)}).
 */
public class ResponseJson {

    private int result;
    private String msg;
    private String data;
    private double total_page;
    private double last_dateline;

    /** @return the {@code result} value */
    public int getResult() {
        return this.result;
    }

    /** @param result the {@code result} value */
    public void setResult(int result) {
        this.result = result;
    }

    /** @return the {@code msg} value */
    public String getMsg() {
        return this.msg;
    }

    /** @param msg the {@code msg} value */
    public void setMsg(String msg) {
        this.msg = msg;
    }

    /** @return the {@code data} value */
    public String getData() {
        return this.data;
    }

    /** @param data the {@code data} value */
    public void setData(String data) {
        this.data = data;
    }

    /** @return the {@code total_page} value */
    public double getTotal_page() {
        return this.total_page;
    }

    /** @param total_page the {@code total_page} value */
    public void setTotal_page(double total_page) {
        this.total_page = total_page;
    }

    /** @return the {@code last_dateline} value */
    public double getLast_dateline() {
        return this.last_dateline;
    }

    /** @param last_dateline the {@code last_dateline} value */
    public void setLast_dateline(double last_dateline) {
        this.last_dateline = last_dateline;
    }

    /** Renders every field as {@code ResponseJson [name=value, ...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ResponseJson [");
        text.append("result=").append(result);
        text.append(", msg=").append(msg);
        text.append(", data=").append(data);
        text.append(", total_page=").append(total_page);
        text.append(", last_dateline=").append(last_dateline);
        text.append("]");
        return text.toString();
    }
}
java爬蟲爬取網站使用多線程(虎嗅網站)相關教程
總結
以上是生活随笔為你收集整理的JAVA爬取虎嗅网截图_java爬虫爬取网站使用多线程(虎嗅网站)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: gdp数据分析
- 下一篇: 【转】红帽 Red Hat Linux相