HtmlUnit解析动态网页并采集网页列表到Excel
生活随笔
收集整理的這篇文章主要介紹了
HtmlUnit解析动态网页并采集网页列表到Excel
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
HtmlUnit可以解析動態網頁,本文這里選取了一個網址(頁面需js/ajax動態加載),模擬瀏覽器操作頁面各元素,包括點擊獲取列表值,主要是掌握了HtmlUnit解析頁面元素的主要用法,同時使用POI HSSF將頁面解析出來的ul/li標簽內容導入到excel表。
本文代碼僅作為HtmlUnit頁面解析的參考,因為不同的頁面有各自需要處理的具體信息。如果頁面中有級聯元素(如先選地市再選區域),也可以參考本文的處理方式。
package com;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;

/**
 * Reads (city, area, address) rows from the first sheet of an .xls workbook, submits each one to
 * a JavaScript-driven query page through HtmlUnit, and writes the returned list items back into
 * the same row, starting at the fourth column.
 *
 * <p>Results are handed from {@link #spider} to {@link #query} through the shared static
 * {@link #gAddList}, so this class must not be used from several threads at once.
 */
public class CMHtml {

    /** Result texts collected by the most recent {@link #spider} call; cleared by {@link #query}. */
    public static ArrayList<String> gAddList = new ArrayList<String>();

    /** Address of the query page; the value here is a placeholder and must be replaced. */
    public static String gQueryURL = "http://xxx";

    /** Upper bound on page loads per lookup while the captcha keeps appearing. */
    private static final int MAX_CAPTCHA_ATTEMPTS = 10;

    /** How long to wait for background JavaScript after a page load or a submit. */
    private static final long JS_WAIT_MILLIS = 10000L;

    /** Index of the first column that receives result values (columns 0-2 hold the inputs). */
    private static final int FIRST_RESULT_COLUMN = 3;

    /**
     * Looks up one address on the query page and appends every returned list item to
     * {@link #gAddList}.
     *
     * <p>When the page shows its image captcha, the page is reloaded with a fresh browser instead
     * of trying to solve it. Reloading is bounded by {@link #MAX_CAPTCHA_ATTEMPTS}; when the bound
     * is hit the method reports it on stderr and returns without adding results. Any exception
     * is reported on stderr and swallowed so that the caller can continue with the next row.
     *
     * @param strCity text of the city entry to click
     * @param strArea text of the area entry to click after the city has been selected
     * @param strAddr address typed into the search box
     */
    public void spider(String strCity, String strArea, String strAddr) {
        try {
            for (int attempt = 1; attempt <= MAX_CAPTCHA_ATTEMPTS; attempt++) {
                if (spiderOnce(strCity, strArea, strAddr)) {
                    return;
                }
            }
            System.err.println("Captcha still shown after " + MAX_CAPTCHA_ATTEMPTS
                    + " attempts; no results for: " + strAddr);
        } catch (Exception e) {
            // Boundary of the crawl: report and let the caller move on to the next row.
            System.err.println("Exception: " + e);
        }
    }

    /**
     * Performs a single lookup with a fresh browser instance.
     *
     * @return {@code true} when the query was submitted and its results were collected,
     *         {@code false} when the captcha was showing and nothing was done
     * @throws IOException if loading the page or clicking an element fails
     */
    private boolean spiderOnce(String strCity, String strArea, String strAddr) throws IOException {
        // try-with-resources closes the browser on every path, including exceptions.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true);
            // Skips SSL certificate validation; only suitable for sites you already trust.
            webClient.getOptions().setUseInsecureSSL(true);
            // CSS is disabled to avoid the extra requests made to fetch and render stylesheets.
            webClient.getOptions().setCssEnabled(false);

            HtmlPage page = webClient.getPage(gQueryURL);
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);

            if (isCaptchaShown(page)) {
                return false;
            }

            // Cascading selection: the area list is only looked at once the city was clicked.
            HtmlPage afterCity = clickListItem(page, strCity);
            if (afterCity != null) {
                page = afterCity;
                HtmlPage afterArea = clickListItem(page, strArea);
                if (afterArea != null) {
                    page = afterArea;
                }
            }

            // Type the address and submit the query.
            HtmlTextInput keywordInput = (HtmlTextInput) page.getElementByName("kw");
            keywordInput.setValueAttribute(strAddr);
            HtmlElement searchButton = page.getHtmlElementById("srhBtn");
            HtmlPage resultPage = (HtmlPage) searchButton.click();

            // Give the page's scripts time to fill in the result list.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);

            HtmlDivision resultDiv = requireDivision(resultPage, "query_result");
            for (HtmlElement item : resultDiv.getElementsByTagName("li")) {
                gAddList.add(item.asText());
            }
            return true;
        }
    }

    /**
     * Tells whether the image captcha is visible: finds the first {@code <i>} element whose XML
     * contains "c_vcode" and checks that it does not carry "display: none;".
     *
     * <p>A page without such an element is treated as having no captcha.
     */
    private static boolean isCaptchaShown(HtmlPage page) {
        DomNodeList<DomElement> italicTags = page.getElementsByTagName("i");
        for (DomElement tag : italicTags) {
            String xml = tag.asXml();
            if (xml.contains("c_vcode")) {
                return !xml.contains("display: none;");
            }
        }
        return false;
    }

    /**
     * Clicks the first {@code <li>} inside {@code #kd_content} whose text equals {@code label}.
     *
     * @return the page produced by the click, or {@code null} when no entry matched
     * @throws IOException if the click fails
     */
    private static HtmlPage clickListItem(HtmlPage page, String label) throws IOException {
        HtmlDivision container = requireDivision(page, "kd_content");
        for (HtmlElement item : container.getElementsByTagName("li")) {
            if (item.asText().equals(label)) {
                return (HtmlPage) item.click();
            }
        }
        return null;
    }

    /** Returns the {@code <div>} with the given id, failing with a descriptive message if absent. */
    private static HtmlDivision requireDivision(HtmlPage page, String id) {
        HtmlDivision division = (HtmlDivision) page.getElementById(id);
        if (division == null) {
            throw new IllegalStateException("Element #" + id + " not found on page");
        }
        return division;
    }

    /**
     * Processes every data row of the workbook at {@code path}: reads city, area and address from
     * columns 0-2, runs {@link #spider}, and writes the collected values into the following
     * columns of the same row. Row 0 is treated as the header and skipped, as are rows that are
     * absent or lack one of the three input cells.
     *
     * <p>The workbook is written back to {@code path} after each processed row, so results
     * gathered so far are on disk if a later lookup aborts the run.
     *
     * @param path location of the .xls file, used for both reading and writing
     */
    public void query(String path) {
        try (FileInputStream in = new FileInputStream(path);
                POIFSFileSystem fs = new POIFSFileSystem(in);
                HSSFWorkbook wb = new HSSFWorkbook(fs)) {
            HSSFSheet sheet = wb.getSheetAt(0);
            int lastRow = sheet.getLastRowNum();

            for (int i = 1; i <= lastRow; i++) {
                HSSFRow row = sheet.getRow(i);
                if (row == null) {
                    continue; // getRow yields null for rows that were never created
                }
                String strCity = cellText(row, 0);
                String strArea = cellText(row, 1);
                String strAddr = cellText(row, 2);
                if (strCity == null || strArea == null || strAddr == null) {
                    continue;
                }

                spider(strCity, strArea, strAddr);

                for (int j = 0; j < gAddList.size(); j++) {
                    HSSFCell resultCell = row.createCell(FIRST_RESULT_COLUMN + j);
                    resultCell.setCellValue(new HSSFRichTextString(gAddList.get(j)));
                }
                gAddList.clear();

                try (FileOutputStream out = new FileOutputStream(path)) {
                    wb.write(out);
                }
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the workbook is missing.
            e.printStackTrace();
        }
    }

    /**
     * Returns the string value of the cell at {@code column}, or {@code null} if the row has no
     * such cell.
     *
     * <p>NOTE(review): POI's {@code getStringCellValue} is documented to reject numeric cells;
     * confirm that the input sheet only holds text in the first three columns.
     */
    private static String cellText(HSSFRow row, int column) {
        HSSFCell cell = row.getCell(column);
        return cell == null ? null : cell.getStringCellValue();
    }

    /** Runs the crawl against {@code cmbb.xls} in the current working directory. */
    public static void main(String[] args) {
        CMHtml crawler = new CMHtml();
        // file.separator keeps the path valid on both Windows and Linux.
        String path = System.getProperty("user.dir")
                + System.getProperty("file.separator")
                + "cmbb.xls";
        crawler.query(path);
    }
}
// Summary — the article's closing section follows.
以上是生活随笔為你收集整理的HtmlUnit解析动态网页并采集网页列表到Excel的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Hadoop平台日志结构
- 下一篇: Eclipse高版本无法兼容FatJar