HtmlUnit采集页面信息加工并写入excel表
生活随笔
收集整理的這篇文章主要介紹了
HtmlUnit采集页面信息加工并写入excel表
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1、功能:從網頁上采集信息加工,如將商品名字重組(Name2FiveWords函數),并寫入excel表,涉及對htmlunit元素定位以及點擊重定向。
2、參考代碼如下:
package com;import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream;import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.POIFSFileSystem;import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.ProxyConfig; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.DomElement; import com.gargoylesoftware.htmlunit.html.DomNodeList; import com.gargoylesoftware.htmlunit.html.HtmlElement; import com.gargoylesoftware.htmlunit.html.HtmlPage;public class EBayHU {public static void main(String[] args) {String path="D:"+System.getProperty("file.separator")+"tmp"+System.getProperty("file.separator")+"test.xls";try { //webclient設置開始WebClient webClient = new WebClient(BrowserVersion.CHROME); //創建一個webclient webClient.getOptions().setJavaScriptEnabled(true); // 啟動JS webClient.getOptions().setUseInsecureSSL(true);//忽略ssl認證 webClient.getOptions().setCssEnabled(false);//禁用Css,可避免自動二次請求CSS進行渲染 webClient.getOptions().setThrowExceptionOnScriptError(false);//運行錯誤時,不拋出異常 webClient.setAjaxController(new NicelyResynchronizingAjaxController());// 設置Ajax異步 //ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig(); //設置代理//proxyConfig.setProxyHost("ip"); //proxyConfig.setProxyPort(port);//webclinet設置結束//打開excel表,準備采集入表POIFSFileSystem fs=new POIFSFileSystem(new FileInputStream(path));//得到Excel工作簿對象 HSSFWorkbook wb = new HSSFWorkbook(fs); //得到Excel工作表對象 HSSFSheet sheet = wb.getSheetAt(0);//獲取第一張表int lastrow=sheet.getLastRowNum(); for(int i=1;i<=lastrow;i++){//第一行標題,從第二行開始//HSSFRow row = sheet.createRow(lastrow+1);HSSFRow row = sheet.getRow(i);HSSFCell cellURL=row.getCell(0);//獲取第一列URLString url=cellURL.getStringCellValue();HtmlPage page = webClient.getPage(url); //打開網頁webClient.waitForBackgroundJavaScript(10000);//等待1秒//第一步:獲取商品名稱并寫入excel第二列HtmlElement itemTitle =page.getHtmlElementById("itemTitle");HSSFCell cellItemTitle=row.createCell(1);String title=itemTitle.asText();title=title.replaceAll("Details about", "");cellItemTitle.setCellValue(title.trim()); //將商品名稱隨機分成5組 HSSFCell cellItemreTitle=row.createCell(2); cellItemreTitle.setCellValue(Name2FiveWords(title)); //第二步:獲取商品價格并寫入第三列HtmlElement price =page.getHtmlElementById("prcIsum");HSSFCell cellPrice=row.createCell(3);cellPrice.setCellValue(price.asText()); //第三步,獲取商品圖片并寫入excel第四列//if(!page.asText().contains("vi_main_img_fs")) continue;HtmlElement propic =page.getHtmlElementById("vi_main_img_fs");DomNodeList<HtmlElement> picnodes=propic.getElementsByTagName("img");StringBuffer strbuf=new StringBuffer();for(int m=0;m<picnodes.size();m++){HtmlElement pic=picnodes.get(m);page=(HtmlPage)pic.click();webClient.waitForBackgroundJavaScript(10000);HtmlElement bigpic =page.getHtmlElementById("icImg");String picsrc=bigpic.getAttribute("src");strbuf.append(picsrc+"\r\n");//圖片的網址之間用逗號隔開}String strResult=strbuf.toString();strResult=strResult.substring(0,strResult.length()-1);//截取最后一個字符HSSFCell cellpic=row.createCell(4);//寫入excel第列cellpic.setCellValue(strResult);//第四步:賣家信息寫入第5列HtmlElement seller =page.getHtmlElementById("mbgLink");String href=seller.getAttribute("href");HSSFCell cellseller=row.createCell(5);cellseller.setCellValue(seller.asText()+"\r\n"+href);//第五步:獲取商品詳情并寫入第6列DomNodeList<DomElement> nodelist=page.getElementsByTagName("div");for(int j=0;j<nodelist.getLength();j++){DomElement domElement = (DomElement)nodelist.get(j);if(domElement.getAttribute("class").equals("itemAttr")){//寫入excel第四列HSSFCell cellinfo=row.createCell(6);cellinfo.setCellValue(domElement.asText());break;}} //第六步:獲取商品更多介紹并寫入第7列HtmlElement descifr =page.getHtmlElementById("desc_ifr");//切換到iframeString src=descifr.getAttribute("src");HtmlPage ifrpage=webClient.getPage(src);//讀取iframe網頁webClient.waitForBackgroundJavaScript(10000);DomNodeList<DomElement> nodelist2=ifrpage.getElementsByTagName("div");for(int j=0;j<nodelist2.getLength();j++){DomElement domElement = (DomElement)nodelist2.get(j);if(domElement.getAttribute("class").equals("prod_item description")){//寫入excel第四列HSSFCell cellmoreinfo=row.createCell(7);cellmoreinfo.setCellValue(domElement.asText());break;}} //寫入excel表FileOutputStream out=new FileOutputStream(path);out.flush();wb.write(out);out.close();}wb.close(); fs.close(); }catch (Exception e) {System.err.println( "Exception: " + e );}}//將字符串分為5個部分public static String Name2FiveWords(String title){String s[]=title.split(" ");String sr="";int len=s.length/5+1;//分成5組for(int m=0;m<5;m++){String tmp="";for(int n=0;n<len;n++){if((m*len+n)<s.length) tmp=tmp+" "+s[m*len+n];} sr=sr+"|"+tmp.trim();}return sr;} }總結
以上是生活随笔為你收集整理的HtmlUnit采集页面信息加工并写入excel表的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【正一专栏】今夜有一种奇迹叫巴萨
- 下一篇: 模拟浏览器自动化测试工具Selenium