用 Java 爬小姐姐图片,这个厉害了。。。
來源:blog.csdn.net/qq_35402412/article/details/113627625
目的
爬取搜狗圖片上千張美女圖片并下載到本地
準備工作
爬取地址:https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3
分析
打開上面的地址,按 F12 打開開發者工具,依次選擇 Network - XHR,頁面往下滑動後,XHR 欄會出現如下請求信息:
Request URL :https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=%E7%BE%8E%E5%A5%B3
分析這段請求 URL 的幾個主要參數:
start=48 表示從第48張圖片開始檢索
xml_len=48 表示從第48張往後獲取48張圖片
query= 搜索關鍵詞(例:美女,這裡瀏覽器自動做了 URL 轉碼,不影響我們使用)
點擊 Response,找個 JSON 格式化工具輔助查看。
JSON格式:https://www.bejson.com/
分析 Response 返回的信息,可以發現我們想要的圖片地址放在 picUrl 裡。
思路
通過以上分析,不難實現下載方法,思路如下:
設置 URL 請求參數
訪問URL請求,獲取圖片地址
圖片地址存入List
遍歷List,使用線程池下載到本地
代碼
SougouImgProcessor.java 爬取圖片類
import?com.alibaba.fastjson.JSONObject; import?us.codecraft.webmagic.utils.HttpClientUtils; import?victor.chang.crawler.pipeline.SougouImgPipeline;import?java.util.ArrayList; import?java.util.List;/***?A?simple?PageProcessor.*?@author?code4crafter@gmail.com?<br>*?@since?0.1.0*/ public?class?SougouImgProcessor?{private?String?url;private?SougouImgPipeline?pipeline;private?List<JSONObject>?dataList;private?List<String>?urlList;private?String?word;public?SougouImgProcessor(String?url,String?word)?{this.url?=?url;this.word?=?word;this.pipeline?=?new?SougouImgPipeline();this.dataList?=?new?ArrayList<>();this.urlList?=?new?ArrayList<>();}public?void?process(int?idx,?int?size)?{String?res?=?HttpClientUtils.get(String.format(this.url,?idx,?size,?this.word));JSONObject?object?=?JSONObject.parseObject(res);List<JSONObject>?items?=?(List<JSONObject>)((JSONObject)object.get("data")).get("items");for(JSONObject?item?:?items){this.urlList.add(item.getString("picUrl"));}this.dataList.addAll(items);}//?下載public?void?pipelineData(){//?多線程pipeline.processSync(this.urlList,?this.word);}public?static?void?main(String[]?args)?{String?url?=?"https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";SougouImgProcessor?processor?=?new?SougouImgProcessor(url,"美女");int?start?=?0,?size?=?50,?limit?=?1000;?//?定義爬取開始索引、每次爬取數(shù)量、總共爬取數(shù)量for(int?i=start;i<start+limit;i+=size)processor.process(i,?size);processor.pipelineData();}}SougouImgPipeline.java ?圖片下載類
import?java.io.File; import?java.io.FileOutputStream; import?java.io.InputStream; import?java.net.URL; import?java.net.URLConnection; import?java.util.List; import?java.util.Objects; import?java.util.concurrent.ExecutorService; import?java.util.concurrent.Executors; import?java.util.concurrent.TimeUnit; import?java.util.concurrent.atomic.AtomicInteger;/***?Store?results?in?files.<br>*?@author?code4crafter@gmail.com?<br>*?@since?0.1.0*/ public?class?SougouImgPipeline?{private?String?extension?=?".jpg";private?String?path;private?volatile?AtomicInteger?suc;private?volatile?AtomicInteger?fails;public?SougouImgPipeline()?{setPath("E:/pipeline/sougou");suc?=?new?AtomicInteger();fails?=?new?AtomicInteger();}public?SougouImgPipeline(String?path)?{setPath(path);suc?=?new?AtomicInteger();fails?=?new?AtomicInteger();}public?SougouImgPipeline(String?path,?String?extension)?{setPath(path);this.extension?=?extension;suc?=?new?AtomicInteger();fails?=?new?AtomicInteger();}public?void?setPath(String?path)?{this.path?=?path;}/***?下載*?@param?url*?@param?cate*?@throws?Exception*/private?void?downloadImg(String?url,?String?cate,?String?name)?throws?Exception?{String?path?=?this.path?+?"/"?+?cate?+?"/";File?dir?=?new?File(path);if?(!dir.exists())?{????//?目錄不存在則創(chuàng)建目錄dir.mkdirs();}String?realExt?=?url.substring(url.lastIndexOf("."));???//?獲取擴展名String?fileName?=?name?+?realExt;fileName?=?fileName.replace("-",?"");String?filePath?=?path?+?fileName;File?img?=?new?File(filePath);if(img.exists()){???//?若文件之前已經(jīng)下載過,則跳過System.out.println(String.format("文件%s已存在本地目錄",fileName));return;}URLConnection?con?=?new?URL(url).openConnection();con.setConnectTimeout(5000);con.setReadTimeout(5000);InputStream?inputStream?=?con.getInputStream();byte[]?bs?=?new?byte[1024];File?file?=?new?File(filePath);FileOutputStream?os?=?new?FileOutputStream(file,?true);//?開始讀取?寫入int?len;while?((len?=?inputStream.read(bs))?!=?-1)?{os.write(bs,?0,?len);}System.out.println("picUrl:?"?+?url);System.out.println(Stri
ng.format("正在下載第%s張圖片",?suc.getAndIncrement()));}/***?單線程處理**?@param?data*?@param?word*/public?void?process(List<String>?data,?String?word)?{long?start?=?System.currentTimeMillis();for?(String?picUrl?:?data)?{if?(picUrl?==?null)continue;try?{downloadImg(picUrl,?word,?picUrl);}?catch?(Exception?e)?{fails.incrementAndGet();}}System.out.println("下載成功:?"?+?suc.get());System.out.println("下載失敗:?"?+?fails.get());long?end?=?System.currentTimeMillis();System.out.println("耗時:"?+?(end?-?start)?/?1000?+?"秒");}/***?多線程處理**?@param?data*?@param?word*/public?void?processSync(List<String>?data,?String?word)?{long?start?=?System.currentTimeMillis();int?count?=?0;ExecutorService?executorService?=?Executors.newCachedThreadPool();?//?創(chuàng)建緩存線程池for?(int?i=0;i<data.size();i++)?{String?picUrl?=?data.get(i);if?(picUrl?==?null)continue;String?name?=?"";if(i<10){name="000"+i;}else?if(i<100){name="00"+i;}else?if(i<1000){name="0"+i;}String?finalName?=?name;executorService.execute(()?->?{try?{downloadImg(picUrl,?word,?finalName);}?catch?(Exception?e)?{fails.incrementAndGet();}});count++;}executorService.shutdown();try?{if?(!executorService.awaitTermination(60,?TimeUnit.SECONDS))?{//?超時的時候向線程池中所有的線程發(fā)出中斷(interrupted)。//?executorService.shutdownNow();}System.out.println("AwaitTermination?Finished");System.out.println("共有URL:?"+data.size());System.out.println("下載成功:?"?+?suc);System.out.println("下載失敗:?"?+?fails);File?dir?=?new?File(this.path?+?"/"?+?word?+?"/");int?len?=?Objects.requireNonNull(dir.list()).length;System.out.println("當前共有文件:?"+len);long?end?=?System.currentTimeMillis();System.out.println("耗時:"?+?(end?-?start)?/?1000.0?+?"秒");}?catch?(InterruptedException?e)?{e.printStackTrace();}}/***?多線程分段處理**?@param?data*?@param?word*?@param?threadNum*/public?void?processSync2(List<String>?data,?final?String?word,?int?threadNum)?{if?(data.size()?<?threadNum)?{process(data,?word);}?else?{ExecutorService?executorService?=?Executors.newCachedThreadPool();int?num?=?data.size()?/?threadNum;????//
每段要處理的數(shù)量for?(int?i?=?0;?i?<?threadNum;?i++)?{int?start?=?i?*?num;int?end?=?(i?+?1)?*?num;if?(i?==?threadNum?-?1)?{end?=?data.size();}final?List<String>?cutList?=?data.subList(start,?end);executorService.execute(()?->?process(cutList,?word));}executorService.shutdown();}}}HttpClientUtils.java ? http請求工具類
import?org.apache.http.Header; import?org.apache.http.HttpEntity; import?org.apache.http.NameValuePair; import?org.apache.http.client.entity.UrlEncodedFormEntity; import?org.apache.http.client.methods.CloseableHttpResponse; import?org.apache.http.client.methods.HttpGet; import?org.apache.http.client.methods.HttpPost; import?org.apache.http.client.methods.HttpUriRequest; import?org.apache.http.conn.ssl.SSLConnectionSocketFactory; import?org.apache.http.conn.ssl.TrustStrategy; import?org.apache.http.entity.StringEntity; import?org.apache.http.impl.client.CloseableHttpClient; import?org.apache.http.impl.client.HttpClients; import?org.apache.http.message.BasicNameValuePair; import?org.apache.http.ssl.SSLContextBuilder; import?org.apache.http.util.EntityUtils; import?org.slf4j.Logger; import?org.slf4j.LoggerFactory;import?javax.net.ssl.HostnameVerifier; import?javax.net.ssl.SSLContext; import?javax.net.ssl.SSLSession; import?java.io.IOException; import?java.io.UnsupportedEncodingException; import?java.security.GeneralSecurityException; import?java.security.cert.CertificateException; import?java.security.cert.X509Certificate; import?java.util.ArrayList; import?java.util.HashMap; import?java.util.List; import?java.util.Map;/***?@author?code4crafter@gmail.com*?Date:?17/3/27*/ 
public?abstract?class?HttpClientUtils?{public?static?Map<String,?List<String>>?convertHeaders(Header[]?headers)?{Map<String,?List<String>>?results?=?new?HashMap<String,?List<String>>();for?(Header?header?:?headers)?{List<String>?list?=?results.get(header.getName());if?(list?==?null)?{list?=?new?ArrayList<String>();results.put(header.getName(),?list);}list.add(header.getValue());}return?results;}/***?http的get請求*?@param?url*/public?static?String?get(String?url)?{return?get(url,?"UTF-8");}public?static?Logger?logger?=?LoggerFactory.getLogger(HttpClientUtils.class);/***?http的get請求*?@param?url*/public?static?String?get(String?url,?String?charset)?{HttpGet?httpGet?=?new?HttpGet(url);return?executeRequest(httpGet,?charset);}/***?http的get請求,增加異步請求頭參數(shù)*?@param?url*/public?static?String?ajaxGet(String?url)?{return?ajaxGet(url,?"UTF-8");}/***?http的get請求,增加異步請求頭參數(shù)**?@param?url*/public?static?String?ajaxGet(String?url,?String?charset)?{HttpGet?httpGet?=?new?HttpGet(url);httpGet.setHeader("X-Requested-With",?"XMLHttpRequest");return?executeRequest(httpGet,?charset);}/***?@param?url*?@return*/public?static?String?ajaxGet(CloseableHttpClient?httpclient,?String?url)?{HttpGet?httpGet?=?new?HttpGet(url);httpGet.setHeader("X-Requested-With",?"XMLHttpRequest");return?executeRequest(httpclient,?httpGet,?"UTF-8");}/***?http的post請求,傳遞map格式參數(shù)*/public?static?String?post(String?url,?Map<String,?String>?dataMap)?{return?post(url,?dataMap,?"UTF-8");}/***?http的post請求,傳遞map格式參數(shù)*/public?static?String?post(String?url,?Map<String,?String>?dataMap,?String?charset)?{HttpPost?httpPost?=?new?HttpPost(url);try?{if?(dataMap?!=?null)?{List<NameValuePair>?nvps?=?new?ArrayList<NameValuePair>();for?(Map.Entry<String,?String>?entry?:?dataMap.entrySet())?{nvps.add(new?BasicNameValuePair(entry.getKey(),?entry.getValue()));}UrlEncodedFormEntity?formEntity?=?new?UrlEncodedFormEntity(nvps,?charset);formEntity.setContentEncoding(charset);httpPost.setEntity(formEntity);}}?catch?(UnsupportedEncoding
Exception?e)?{e.printStackTrace();}return?executeRequest(httpPost,?charset);}/***?http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)*/public?static?String?ajaxPost(String?url,?Map<String,?String>?dataMap)?{return?ajaxPost(url,?dataMap,?"UTF-8");}/***?http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)*/public?static?String?ajaxPost(String?url,?Map<String,?String>?dataMap,?String?charset)?{HttpPost?httpPost?=?new?HttpPost(url);httpPost.setHeader("X-Requested-With",?"XMLHttpRequest");try?{if?(dataMap?!=?null)?{List<NameValuePair>?nvps?=?new?ArrayList<NameValuePair>();for?(Map.Entry<String,?String>?entry?:?dataMap.entrySet())?{nvps.add(new?BasicNameValuePair(entry.getKey(),?entry.getValue()));}UrlEncodedFormEntity?formEntity?=?new?UrlEncodedFormEntity(nvps,?charset);formEntity.setContentEncoding(charset);httpPost.setEntity(formEntity);}}?catch?(UnsupportedEncodingException?e)?{e.printStackTrace();}return?executeRequest(httpPost,?charset);}/***?http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)*/public?static?String?ajaxPostJson(String?url,?String?jsonString)?{return?ajaxPostJson(url,?jsonString,?"UTF-8");}/***?http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)*/public?static?String?ajaxPostJson(String?url,?String?jsonString,?String?charset)?{HttpPost?httpPost?=?new?HttpPost(url);httpPost.setHeader("X-Requested-With",?"XMLHttpRequest");StringEntity?stringEntity?=?new?StringEntity(jsonString,?charset);//?解決中文亂碼問題stringEntity.setContentEncoding(charset);stringEntity.setContentType("application/json");httpPost.setEntity(stringEntity);return?executeRequest(httpPost,?charset);}/***?執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)*/public?static?String?executeRequest(HttpUriRequest?httpRequest)?{return?executeRequest(httpRequest,?"UTF-8");}/***?執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)*/public?static?String?executeRequest(HttpUriRequest?httpRequest,?String?charset)?{CloseableHttpClient?httpclient;if?("https".equals(httpRequest.getURI().getScheme()))?{httpclient?=?createSSLInsecureClient();}?else?{httpclient?=?HttpCli
ents.createDefault();}String?result?=?"";try?{try?{CloseableHttpResponse?response?=?httpclient.execute(httpRequest);HttpEntity?entity?=?null;try?{entity?=?response.getEntity();result?=?EntityUtils.toString(entity,?charset);}?finally?{EntityUtils.consume(entity);response.close();}}?finally?{httpclient.close();}}?catch?(IOException?ex)?{ex.printStackTrace();}return?result;}public?static?String?executeRequest(CloseableHttpClient?httpclient,?HttpUriRequest?httpRequest,?String?charset)?{String?result?=?"";try?{try?{CloseableHttpResponse?response?=?httpclient.execute(httpRequest);HttpEntity?entity?=?null;try?{entity?=?response.getEntity();result?=?EntityUtils.toString(entity,?charset);}?finally?{EntityUtils.consume(entity);response.close();}}?finally?{httpclient.close();}}?catch?(IOException?ex)?{ex.printStackTrace();}return?result;}/***?創(chuàng)建?SSL連接*/public?static?CloseableHttpClient?createSSLInsecureClient()?{try?{SSLContext?sslContext?=?new?SSLContextBuilder().loadTrustMaterial(new?TrustStrategy()?{@Overridepublic?boolean?isTrusted(X509Certificate[]?chain,?String?authType)?throws?CertificateException?{return?true;}}).build();SSLConnectionSocketFactory?sslsf?=?new?SSLConnectionSocketFactory(sslContext,?new?HostnameVerifier()?{@Overridepublic?boolean?verify(String?hostname,?SSLSession?session)?{return?true;}});return?HttpClients.custom().setSSLSocketFactory(sslsf).build();}?catch?(GeneralSecurityException?ex)?{throw?new?RuntimeException(ex);}} }運行
由於網絡等原因,我們發現並不能全部下載成功,不過可以多次運行嘗試,從而實現較高的下載成功率。
666,厲害了。。
如果看到這裡,說明你喜歡這篇文章,請轉發、點贊。同時標星(置頂)本公眾號,可以第一時間接收到博文推送。 推薦一些很不錯的計算機學習教程,包括:數據結構、算法、計算機網絡、操作系統、Java(spring、springmvc、springboot、springcloud等)等等,全部收集於網絡,如有侵權,請聯繫刪除! 下面是部分截圖:獲取方式:點擊下方公眾號,回覆:好好學Java,即可獲取。總結
以上是生活随笔為你收集整理的用 Java 爬小姐姐图片,这个厉害了。。。的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 回馈粉丝,送30本技术书。
- 下一篇: 突然决定,送一台笔记本!