爬虫中如何获取页面编码类型
生活随笔
收集整理的這篇文章主要介紹了
爬虫中如何获取页面编码类型
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
獲取頁面的編碼格式的三種方式:
三種方式可以結合使用,由于inputStream不能夠被復用,但是inputStream沒有clone方法也導致無法克隆
因此需要流轉化,這種方式多重比較需要重復進行流轉化。
依賴包
我的資源
工具類
/*** 獲取頁面的編碼格式1.根據Response中的header獲取編碼格式2.根據頁面標簽中的meta獲取3.根據頁面內容識別自動識別出編碼格式,經過測試準確率比較高三種方式可以結合使用,目前GFCrawer是使用了meta一種,由于inputStream不能夠被復用,但是inputStrem沒有clone方法也導致無法克隆因此需要流轉化,這種方式多重比較需要重復進行流轉化*/ package charset;import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.nio.charset.Charset; import java.util.List; import java.util.Map;import org.apache.commons.io.IOUtils;import info.monitorenter.cpdetector.io.ASCIIDetector; import info.monitorenter.cpdetector.io.ByteOrderMarkDetector; import info.monitorenter.cpdetector.io.CodepageDetectorProxy; import info.monitorenter.cpdetector.io.JChardetFacade; import info.monitorenter.cpdetector.io.ParsingDetector; import info.monitorenter.cpdetector.io.UnicodeDetector;/*** @author weijie 2019年5月8日*/ public class GFCrawerCharset {public static void main(String[] args) {// 判斷亂碼System.out.println("------判斷亂碼-------");//gb2312String url = "http://www.people.com.cn/";//utf-8String url2 = "https://blog.csdn.net/dreamzuora";String errorHtml = GFCrawer.GetContent(url, "utf-8");String succHtml = GFCrawer.GetContent(url2);System.out.println("error: " + errorHtml.substring(100, 300));System.out.println("succ: " + succHtml.substring(1000, 1100));boolean b = isErrorCodes(errorHtml);boolean b2 = isErrorCodes(succHtml);System.out.println(b);System.out.println(b2);// meta獲取編碼System.out.println("-------meta獲取編碼------");System.out.println(url + "\t" + getCharsetByMeta(url));System.out.println(url2 + "\t" + getCharsetByMeta(url2));// responseHeader獲取后端返回的contentType:但是很多網站不會明確返回System.out.println("-------后端頭信息編碼結果------");System.out.println(url + "\t" + getAutoCharsetByHeader(url));System.out.println(url2 + "\t" + getAutoCharsetByHeader(url2));// 
自動判斷編碼類型System.out.println("-------識別文本信息編碼------");cdpInit();System.out.println(url + "\t" + getAutoCharsetByURL(url));System.out.println(url2 + "\t" + getAutoCharsetByURL(url2));}// 解析頁面內容自動識別編碼類型public static CodepageDetectorProxy cdp = null;static {cdpInit();}public static void cdpInit() {cdp = CodepageDetectorProxy.getInstance();cdp.add(JChardetFacade.getInstance());cdp.add(ASCIIDetector.getInstance());cdp.add(UnicodeDetector.getInstance());cdp.add(new ParsingDetector(false));cdp.add(new ByteOrderMarkDetector());}/*** 判斷是否亂碼 加上html的判斷會出錯* * @param html* @return*/public static boolean isErrorCodes(String str) {for (int i = 0, len = str.length(); i < len; i++) {char c = str.charAt(i);// 當從Unicode編碼向某個字符集轉換時,如果在該字符集中沒有對應的編碼,則得到0x3f(即問號字符?)//從其他字符集向Unicode編碼轉換時,如果這個二進制數在該字符集中沒有標識任何的字符,則得到的結果是0xfffd//System.out.println("--- " + (int) c);if ((int) c == 0xfffd) {// 存在亂碼//System.out.println("存在亂碼 " + (int) c);return true;}}return false; }/*** 通過流判斷編碼格式* * @param in* @return*/public static String getAutoCharsetByInputStream(InputStream in) {String code = null;ByteArrayInputStream bais = null;try {bais = new ByteArrayInputStream(IOUtils.toByteArray(in));Charset charset = cdp.detectCodepage(bais, 2147483647);bais.close();code = charset.name();} catch (IOException e) {e.printStackTrace();}return code;}/*** 通過url判斷* * @param href* @return*/public static String getAutoCharsetByURL(String href) {URL url;String code = null;try {url = new URL(href);Charset charset = cdp.detectCodepage(url);code = charset.name();} catch (MalformedURLException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return code;}/*** 從mata獲取編碼格式* * @param url* @return*/public static String getCharsetByMeta(String href) {String charset = null;URL url = null;try {url = new URL(href);} catch (MalformedURLException e) {e.printStackTrace();url = null;} if(url == null){return null;}HttpURLConnection httpConnection = null;try {httpConnection = (HttpURLConnection) url.openConnection();} 
catch (IOException e) {e.printStackTrace();httpConnection = null;}if(httpConnection == null){return null;}httpConnection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");InputStream input = null;try {input = httpConnection.getInputStream();input.read();BufferedReader reader = new BufferedReader(new InputStreamReader(input));while (reader.ready()) {String line = reader.readLine();if (line.contains("http-equiv") && line.contains("charset")) {String tmp = line.split(";")[1];charset = tmp.substring(tmp.indexOf("=") + 1, tmp.indexOf("\""));break;} else {continue;}}reader.close();} catch (Exception e) {e.printStackTrace();input = null;}return charset;}/*** 從mata獲取編碼格式* * @param url* @return*/public static String getCharsetByMeta(InputStream inputStream) {String charset = null;try {BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));while (reader.ready()) {String line = reader.readLine();if (line.contains("http-equiv") && line.contains("charset")) {String tmp = line.split(";")[1];charset = tmp.substring(tmp.indexOf("=") + 1, tmp.indexOf("\""));} else {continue;}}reader.close();return charset;} catch (MalformedURLException e) {e.printStackTrace();return charset;} catch (IOException e) {e.printStackTrace();return charset;}}/*** 從header中獲取頁面編碼* * @param strUrl* @return*/public static String getAutoCharsetByHeader(String href) {String charset = null;URL url = null;try {url = new URL(href);} catch (MalformedURLException e) {e.printStackTrace();url = null;} if(url == null){return null;}HttpURLConnection httpConnection = null;try {httpConnection = (HttpURLConnection) url.openConnection();} catch (IOException e) {e.printStackTrace();httpConnection = null;}if(httpConnection == null){return null;}httpConnection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");// 
獲取鏈接的headerMap<String, List<String>> headerFields = httpConnection.getHeaderFields();// 判斷headers中是否存在Content-Typeif (headerFields.containsKey("Content-Type")) {// 拿到header 中的 Content-Type :[text/html; charset=utf-8]List<String> attrs = headerFields.get("Content-Type");String[] as = attrs.get(0).split(";");for (String att : as) {if (att.contains("charset")) {charset = att.split("=")[1];}}}return charset;}/*** 從header中獲取頁面編碼* * @param strUrl* @return*/public static String getCharsetByHeader(URLConnection urlConn) {String charset = null;// 獲取鏈接的headerMap<String, List<String>> headerFields = urlConn.getHeaderFields();// 判斷headers中是否存在Content-Typeif (headerFields.containsKey("Content-Type")) {// 拿到header 中的 Content-Type :[text/html; charset=utf-8]List<String> attrs = headerFields.get("Content-Type");String[] as = attrs.get(0).split(";");for (String att : as) {if (att.contains("charset")) {charset = att.split("=")[1];}}}return charset;} }總結
以上是生活随笔為你收集整理的爬虫中如何获取页面编码类型的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: java提取文章摘要内容
- 下一篇: 什么是Springmvc以及如何编写第一