Search Engines in Java: Implementing a Simple Web Crawler
A simple crawler program implemented in Java.
    package SearchCrawler;

    import java.util.*;
    import java.net.*;
    import java.io.*;
    import java.util.regex.*;

    /*
     * SearchCrawler: a web crawler that takes a start URL, a maximum number of
     * URLs to process, and a search string.
     */
    public class SearchCrawler implements Runnable {

        /* disallowListCache caches the URLs that robots rules forbid crawling.
         * The robots protocol places a robots.txt file in a site's root
         * directory specifying which pages are off-limits to crawlers; a
         * crawler should skip those areas. An example robots.txt:
         *
         *   # robots.txt for http://somehost.com/
         *   User-agent: *
         *   Disallow: /cgi-bin/
         *   Disallow: /registration # Disallow robots on registration page
         *   Disallow: /login
         */
        private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
        ArrayList<String> errorList = new ArrayList<String>(); // error messages
        ArrayList<String> result = new ArrayList<String>();    // URLs whose pages matched
        String startUrl;     // the URL the crawl starts from
        int maxUrl;          // maximum number of URLs to process
        String searchString; // the (English) string to search for
        boolean caseSensitive = false; // match case-sensitively?
        boolean limitHost = false;     // restrict the crawl to the starting host?

        public SearchCrawler(String startUrl, int maxUrl, String searchString) {
            this.startUrl = startUrl;
            this.maxUrl = maxUrl;
            this.searchString = searchString;
        }

        public ArrayList<String> getResult() {
            return result;
        }

        // Entry point of the search thread.
        public void run() {
            crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
        }

        // Verify the URL format.
        private URL verifyUrl(String url) {
            // Only handle HTTP URLs.
            if (!url.toLowerCase().startsWith("http://"))
                return null;
            URL verifiedUrl = null;
            try {
                verifiedUrl = new URL(url);
            } catch (Exception e) {
                return null;
            }
            return verifiedUrl;
        }

        // Check whether robots rules allow access to the given URL.
        private boolean isRobotAllowed(URL urlToCheck) {
            String host = urlToCheck.getHost().toLowerCase(); // host of the given URL
            // Fetch the cached disallow list for this host.
            ArrayList<String> disallowList = disallowListCache.get(host);
            // If it is not cached yet, download and cache it.
            if (disallowList == null) {
                disallowList = new ArrayList<String>();
                try {
                    URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(robotsFileUrl.openStream()));
                    // Read the robots file and build the list of disallowed paths.
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                            String disallowPath = line.substring("Disallow:".length()); // the disallowed path
                            // Strip a trailing comment, if any.
                            int commentIndex = disallowPath.indexOf("#");
                            if (commentIndex != -1) {
                                disallowPath = disallowPath.substring(0, commentIndex);
                            }
                            disallowPath = disallowPath.trim();
                            disallowList.add(disallowPath);
                        }
                    }
                    reader.close();
                    // Cache this host's disallowed paths.
                    disallowListCache.put(host, disallowList);
                } catch (Exception e) {
                    return true; // no robots.txt in the site's root directory: access is allowed
                }
            }
            String file = urlToCheck.getFile();
            for (int i = 0; i < disallowList.size(); i++) {
                String disallow = disallowList.get(i);
                if (file.startsWith(disallow)) {
                    return false;
                }
            }
            return true;
        }

        // Download a page and return its contents as a single string.
        private String downloadPage(URL pageUrl) {
            try {
                // Open a connection to the URL for reading.
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(pageUrl.openStream()));
                // Read the page into a buffer.
                String line;
                StringBuffer pageBuffer = new StringBuffer();
                while ((line = reader.readLine()) != null) {
                    pageBuffer.append(line);
                }
                reader.close();
                return pageBuffer.toString();
            } catch (Exception e) {
                // Download failed; fall through and return null.
            }
            return null;
        }

        // Remove "www" from a URL.
        private String removeWwwFromUrl(String url) {
            int index = url.indexOf("://www.");
            if (index != -1) {
                return url.substring(0, index + 3) + url.substring(index + 7);
            }
            return (url);
        }

        // Parse the page and extract its links.
        private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
                HashSet<String> crawledList, boolean limitHost) {
            // Compile the link-matching pattern: the href value of an <a> tag.
            Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                    Pattern.CASE_INSENSITIVE);
            Matcher m = p.matcher(pageContents);
            // getPort() returns -1 when the URL carries no explicit port, so only
            // append the port when one is actually present.
            int port = pageUrl.getPort();
            String portPart = (port == -1) ? "" : ":" + port;
            ArrayList<String> linkList = new ArrayList<String>();
            while (m.find()) {
                String link = m.group(1).trim();
                if (link.length() < 1) {
                    continue;
                }
                // Skip links that point within the same page.
                if (link.charAt(0) == '#') {
                    continue;
                }
                if (link.indexOf("mailto:") != -1) {
                    continue;
                }
                if (link.toLowerCase().indexOf("javascript") != -1) {
                    continue;
                }
                if (link.indexOf("://") == -1) {
                    if (link.charAt(0) == '/') { // absolute path on the same host
                        link = "http://" + pageUrl.getHost() + portPart + link;
                    } else { // relative path
                        String file = pageUrl.getFile();
                        if (file.indexOf('/') == -1) {
                            link = "http://" + pageUrl.getHost() + portPart + "/" + link;
                        } else {
                            String path = file.substring(0, file.lastIndexOf('/') + 1);
                            link = "http://" + pageUrl.getHost() + portPart + path + link;
                        }
                    }
                }
                // Drop any fragment identifier.
                int index = link.indexOf('#');
                if (index != -1) {
                    link = link.substring(0, index);
                }
                link = removeWwwFromUrl(link);
                URL verifiedLink = verifyUrl(link);
                if (verifiedLink == null) {
                    continue;
                }
                /* If the crawl is limited to one host, skip URLs on other hosts. */
                if (limitHost && !pageUrl.getHost().toLowerCase()
                        .equals(verifiedLink.getHost().toLowerCase())) {
                    continue;
                }
                // Skip links that have already been processed.
                if (crawledList.contains(link)) {
                    continue;
                }
                linkList.add(link);
            }
            return (linkList);
        }

        // Check whether the downloaded page contains the search string.
        private boolean searchStringMatches(String pageContents,
                String searchString, boolean caseSensitive) {
            String searchContents = pageContents;
            if (!caseSensitive) { // case-insensitive: compare in lower case
                searchContents = pageContents.toLowerCase();
            }
            // Split the search string into whitespace-separated terms;
            // every term must appear in the page.
            Pattern p = Pattern.compile("[\\s]+");
            String[] terms = p.split(searchString);
            for (int i = 0; i < terms.length; i++) {
                if (caseSensitive) {
                    if (searchContents.indexOf(terms[i]) == -1) {
                        return false;
                    }
                } else {
                    if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                        return false;
                    }
                }
            }
            return true;
        }

        // Perform the actual crawl.
        public ArrayList<String> crawl(String startUrl, int maxUrls,
                String searchString, boolean limithost, boolean caseSensitive) {
            System.out.println("searchString=" + searchString);
            HashSet<String> crawledList = new HashSet<String>();
            LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
            if (maxUrls < 1) {
                errorList.add("Invalid Max URLs value.");
                System.out.println("Invalid Max URLs value.");
            }
            if (searchString.length() < 1) {
                errorList.add("Missing Search String.");
                System.out.println("Missing Search String.");
            }
            if (errorList.size() > 0) {
                System.out.println("err!!!");
                return errorList;
            }
            // Remove "www" from the start URL.
            startUrl = removeWwwFromUrl(startUrl);
            toCrawlList.add(startUrl);
            while (toCrawlList.size() > 0) {
                if (maxUrls != -1) {
                    if (crawledList.size() == maxUrls) {
                        break;
                    }
                }
                // Take the next URL from the head of the to-crawl list (FIFO order).
                String url = toCrawlList.iterator().next();
                // Remove it from the to-crawl list.
                toCrawlList.remove(url);
                // Convert the string to a URL object; skip it if malformed.
                URL verifiedUrl = verifyUrl(url);
                if (verifiedUrl == null) {
                    continue;
                }
                // Skip the URL if robots rules forbid access to it.
                if (!isRobotAllowed(verifiedUrl)) {
                    continue;
                }
                // Record the URL as processed.
                crawledList.add(url);
                String pageContents = downloadPage(verifiedUrl);
                if (pageContents != null && pageContents.length() > 0) {
                    // Extract the valid links from the page.
                    ArrayList<String> links = retrieveLinks(verifiedUrl,
                            pageContents, crawledList, limitHost);
                    toCrawlList.addAll(links);
                    if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                        result.add(url);
                        System.out.println(url);
                    }
                }
            }
            return result;
        }

        // Command-line entry point.
        public static void main(String[] args) {
            if (args.length != 3) {
                System.out.println("Usage: java SearchCrawler.SearchCrawler startUrl maxUrl searchString");
                return;
            }
            int max = Integer.parseInt(args[1]);
            SearchCrawler crawler = new SearchCrawler(args[0], max, args[2]);
            Thread search = new Thread(crawler);
            System.out.println("Start searching...");
            System.out.println("result:");
            search.start();
        }
    }
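From the command line, the crawler is compiled and run with a start URL, a URL limit, and a search term, as the usage message in main() states. Because SearchCrawler implements Runnable and exposes its results through getResult(), it can also be embedded in another program. The following driver is a minimal sketch that is not part of the original article: the class name CrawlerDemo, the start URL, the limit of 20, and the search term "java" are all placeholder values, and Thread.join() is used to wait for the crawl loop to drain or hit the URL limit.

    package SearchCrawler;

    public class CrawlerDemo {
        public static void main(String[] args) throws InterruptedException {
            // Placeholder arguments: start URL, max URLs to visit, search term.
            SearchCrawler crawler = new SearchCrawler("http://example.com/", 20, "java");
            Thread search = new Thread(crawler);
            search.start();
            search.join(); // wait for the crawl to finish
            // Print every URL whose page contained the search term.
            for (String url : crawler.getResult()) {
                System.out.println(url);
            }
        }
    }

Note that verifyUrl() only accepts plain http:// URLs, so an https:// start URL would be rejected and the crawl would terminate immediately.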
This article is reposted from a CSDN blog.