jsoup爬取网站信息之《冰与火之歌》
生活随笔
收集整理的這篇文章主要介紹了
jsoup爬取网站信息之《冰与火之歌》
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
使用jsoup爬取了下某個網站中的《冰與火之歌》信息,并將格式保存成了json格式到文本文件中。
具體執行的代碼如下:
/**
 * Entry point: creates the crawler and runs it once.
 *
 * @param args unused
 * @throws IOException propagated from the page-fetching helper
 */
public static void main(String[] args) throws IOException {
    TestJsoupBingYuHuo tj = new TestJsoupBingYuHuo();
    tj.test();
}

/** Site root; chapter hrefs found on the index page are appended to it. */
static String urlPath = "http://www.bingyuhuozhige.cc";

/** Output root directory (already ends with a separator). */
static String srcPath = "D:\\study\\jsoup\\bingyuhuozhige\\";

/** When false, nothing is written to disk; pages are still fetched and progress printed. */
static boolean writeOnOff = true;

/** 1-based index of the first h3 heading to process; change it to resume from a later volume. */
static int startVolume = 7;

/**
 * Fetches the index page and processes every h3 heading on it.
 *
 * @throws IOException propagated from the page-fetching helper
 */
public void test() throws IOException {
    Document document = JsoupUtils.getRoot(urlPath);
    Elements h3lists = document.select("h3");
    this.analysisH3List(h3lists);
}

/**
 * Processes the volume headings: for each h3 from {@link #startVolume} onwards, creates the
 * volume directory, stores the volume description and then crawls the volume's chapters.
 *
 * @param h3List the h3 elements selected from the index page
 * @return always {@code null}; the visible caller ignores the return value
 * @throws IOException propagated from the page-fetching helper
 * @throws IllegalStateException if a heading has no following description or chapter list
 */
private JSONArray analysisH3List(Elements h3List) throws IOException {
    int n = 0;
    for (Element h3 : h3List) {
        n++;
        if (n < startVolume) {
            continue;
        }
        // The h3 text is the volume name and is used as the directory name.
        String h3Text = h3.text().trim();
        System.out.println(h3Text);
        String volumeDir = srcPath + h3Text;
        if (writeOnOff) {
            FileUtils.createDir(volumeDir);
        }
        // The following <p> holds the volume description; it is written into the volume directory.
        Element p = this.analysisPList(h3, volumeDir);
        // The links inside the following div.row are the first pages of the chapters.
        this.analysisFirstPageLink(p, volumeDir);
    }
    return null;
}

/**
 * Returns the first following sibling of {@code start} with the given tag name and, when
 * {@code className} is non-null, the given CSS class. Prints one "=" per sibling examined,
 * as the original loops did.
 *
 * The original loops restarted from the first sibling whenever the cursor became
 * {@code null}, so they never terminated when no match existed; this walk stops at the
 * end of the sibling list instead.
 *
 * @return the matching sibling, or {@code null} if there is none
 */
private static Element nextSiblingMatching(Element start, String tagName, String className) {
    Element candidate = start.nextElementSibling();
    while (candidate != null) {
        System.out.println("=");
        boolean tagMatches = candidate.tagName().equals(tagName);
        boolean classMatches = className == null || candidate.hasClass(className);
        if (tagMatches && classMatches) {
            return candidate;
        }
        candidate = candidate.nextElementSibling();
    }
    return null;
}

/**
 * Finds the volume description (the first {@code <p>} sibling after the heading), prints it
 * and, when writing is enabled, stores it in the volume directory.
 *
 * @param h3   the volume heading
 * @param path the volume directory
 * @return the description element
 * @throws IllegalStateException if no {@code <p>} sibling follows the heading
 */
private Element analysisPList(Element h3, String path) {
    Element p = nextSiblingMatching(h3, "p", null);
    if (p == null) {
        throw new IllegalStateException(
                "No <p> description found after heading: " + h3.text());
    }
    String pText = p.text().trim();
    System.out.println(pText);
    if (writeOnOff) {
        FileUtils.writeLine(path + File.separator + "簡述", pText);
    }
    return p;
}

/**
 * Finds the chapter list (the first {@code div.row} sibling after the description) and
 * crawls every chapter linked from it into {@code <path>/<title>.txt}.
 *
 * @param p    the volume description element
 * @param path the volume directory
 * @return always {@code null}; the visible caller ignores the return value
 * @throws IOException propagated from the page-fetching helper
 * @throws IllegalStateException if no {@code div.row} sibling follows the description
 */
private JSONArray analysisFirstPageLink(Element p, String path) throws IOException {
    Element divRow = nextSiblingMatching(p, "div", "row");
    if (divRow == null) {
        throw new IllegalStateException("No div.row chapter list found for: " + path);
    }
    Elements links = divRow.select("a[href]");
    for (Element link : links) {
        String url = link.attr("href").trim();
        String title = link.attr("title").trim();
        // One chapter title is replaced by a fixed name before being used as a file name.
        // NOTE(review): presumably the site's title is unsuitable as a file name - confirm.
        title = title.startsWith("第六十一章 獅鷲的重生")
                ? "第六十一章 獅鷲的重生(格里夫·瓊恩·克林頓)"
                : title;
        String chapterFile = path + File.separator + title + ".txt";
        if (writeOnOff) {
            // The chapter file starts with its title.
            FileUtils.writeLine(chapterFile, title);
        }
        System.out.println(title + " = " + url);
        // Next step: walk all pages of this chapter and write their text to the file.
        this.analysisAllPagesLink(url, chapterFile);
    }
    return null;
}

/**
 * Sleeps briefly between requests so the site is not hit back-to-back.
 * If the thread is interrupted the interrupt flag is restored instead of being discarded.
 */
private static void pauseBetweenRequests() {
    try {
        Thread.sleep(50);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}

/**
 * Fetches a chapter's first page, writes its text, then fetches and writes the remaining
 * pages {@code <prefix>_2.html .. <prefix>_<last>.html}. The last page number is read from
 * the pagination link whose text is "尾頁"; when no such link exists it stays at 2.
 * A page without a pagination block, or with a block that has no links, is treated as a
 * single-page chapter.
 *
 * @param firstPageUrl the chapter href, appended to {@link #urlPath}
 * @param path         the chapter output file
 * @throws IOException propagated from the page-fetching helper
 */
private void analysisAllPagesLink(String firstPageUrl, String path) throws IOException {
    pauseBetweenRequests();
    Document document = JsoupUtils.getRoot(urlPath + firstPageUrl);

    Elements paginations = document.select("div.pagination");
    if (paginations.isEmpty()) {
        // The original called get(0) here and failed with IndexOutOfBoundsException.
        this.analysisPageText(document, path);
        return;
    }
    Elements links = paginations.get(0).select("a[href]");
    if (links.isEmpty()) {
        this.analysisPageText(document, path);
        return;
    }

    int lastNum = 2;
    for (Element link : links) {
        String title = link.text().trim();
        if ("尾頁".equals(title)) {
            String url = link.attr("href").trim();
            // Take the number between the underscore nearest to ".html" and ".html" itself,
            // so an underscore earlier in the path cannot break the parse.
            int dot = url.lastIndexOf(".html");
            int underscore = url.lastIndexOf('_', dot);
            lastNum = Integer.parseInt(url.substring(underscore + 1, dot));
        }
    }

    // Write the first page, which is already loaded.
    this.analysisPageText(document, path);

    String urlPrefix = firstPageUrl.substring(0, firstPageUrl.lastIndexOf(".html"));
    for (int i = 2; i <= lastNum; i++) {
        this.analysisPageText(urlPath + urlPrefix + "_" + i + ".html", path);
        System.out.println(i);
    }
}

/**
 * Fetches one page after a short pause and writes its text to the chapter file.
 *
 * @param pageUrl absolute URL of the page
 * @param path    the chapter output file
 * @throws IOException propagated from the page-fetching helper
 */
private void analysisPageText(String pageUrl, String path) throws IOException {
    pauseBetweenRequests();
    Document document = JsoupUtils.getRoot(pageUrl);
    this.analysisPageText(document, path);
}

/**
 * Extracts the chapter text from a loaded page and writes it to the chapter file.
 * The text is the HTML of the first {@code div.span9} (or {@code div.span12} when there is
 * no span9), cut off at the pagination block and started after the last closing
 * {@code </div>} before it; {@code <br>}, {@code <p>} and {@code </p>} become line breaks.
 *
 * @param document the loaded page
 * @param path     the chapter output file
 * @throws IllegalStateException if the page has neither content container
 */
private void analysisPageText(Document document, String path) {
    Elements containers = document.select("div.span9");
    if (containers.isEmpty()) {
        containers = document.select("div.span12");
    }
    if (containers.isEmpty()) {
        throw new IllegalStateException(
                "No div.span9 or div.span12 content container found for: " + path);
    }

    String text = containers.get(0).html();

    // Drop the pagination block and everything after it, when the block is present.
    int paginationStart = text.indexOf("<div class=\"pagination\"");
    if (paginationStart >= 0) {
        text = text.substring(0, paginationStart);
    }

    // Keep what follows the last closing </div>. When there is none, keep everything
    // (the original skipped five characters in that case because -1 + 6 == 5).
    final String closingDiv = "</div>";
    int lastClose = text.lastIndexOf(closingDiv);
    if (lastClose >= 0) {
        text = text.substring(lastClose + closingDiv.length());
    }

    // Literal replacement; the line-break sequence is reproduced exactly as in the original.
    text = text.replace("<br>", "\n\r").replace("<p>", "\n\r").replace("</p>", "\n\r");
    if (writeOnOff) {
        FileUtils.writeLine(path, text);
    }
}

代碼中確實的其它jar包信息和工具類,見我的另一篇文章,鏈接如下:
jsoup爬取網站信息之《本草綱目》
總結
以上是生活随笔為你收集整理的jsoup爬取网站信息之《冰与火之歌》的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Java JNI调用IC卡读卡器
- 下一篇: c语言电流检测模块程序,C语言和MATL