|   因為lucene索引的時候是將String型的信息建立索引的,所以這里必須是將word/pdf/html等文件的內(nèi)容轉(zhuǎn)化問字符型。  lucene的jar包自己去下載。  首先是建立索引的代碼:  public class TextFileIndexer {???  public static void main(String[] args) throws Exception {???  /* 指明要索引文件夾的位置,這里是d盤的s文件夾下 */  ???????? File fileDir = new File("d:\\s");???  /* 這里放索引文件的位置 */  ???????? File indexDir = new File("d:\\index");???  ???????? Analyzer luceneAnalyzer = new StandardAnalyzer();???  ???????? IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,???  true);???  ???????? File[] textFiles = fileDir.listFiles();???  long startTime = new Date().getTime();???  //增加document到索引去  ???????????????? System.out.println("File正在被索引.");??  /*  ????????????????? * 注意要變的就是這里,路徑和讀取文件的方法  ????????????????? * */  ???????????????? String path ="d:\\s\\2.doc";  ???????????????? String temp = ReadFile.readWord(path);  //???????????????? String path ="d:\\s\\index.htm";  //???????????????? String temp = ReadFile.readHtml(path);  ???????????????? Document document = new Document();???  ???????????????? Field FieldPath = new Field("path",path,  ???????????????????????? Field.Store.YES, Field.Index.NO);???  ???????????????? Field FieldBody = new Field("body", temp, Field.Store.YES,???  ???????????????????????? Field.Index.TOKENIZED,???  ???????????????????????? Field.TermVector.WITH_POSITIONS_OFFSETS);???  ???????????????? document.add(FieldPath);???  ???????????????? document.add(FieldBody);???  ???????????????? indexWriter.addDocument(document);???  //optimize()方法是對索引進行優(yōu)化  ???????? indexWriter.optimize();???  ???????? indexWriter.close();???  //測試一下索引的時間  long endTime = new Date().getTime();???  ???????? System.out???  ???????????????? .println("這花費了"??  ??????????????????????? + (endTime - startTime)???  ??????????????????????? + " 毫秒來把文檔增加到索引里面去!"??  ??????????????????????? + fileDir.getPath());???  ???? }??  }  上面已經(jīng)注釋了要換的地方,我們要做的就是換文件的路徑和讀取文件的方法。  下面來具體看下讀取文件的方法  1.首先來看WORD文檔:  我這里用的是poi,相關(guān)jar包自己去下載,然后加到工程中(以下所要用的jar包也是,不再重復(fù)說)。  來看相關(guān)代碼:  public static String readWord(String path) {  ???????? StringBuffer content = new StringBuffer("");// 文檔內(nèi)容  try {  ???????????? HWPFDocument doc = new HWPFDocument(new FileInputStream(path));  ???????????? Range range = doc.getRange();  int paragraphCount = range.numParagraphs();// 段落  for (int i = 0; i < paragraphCount; i++) {// 遍歷段落讀取數(shù)據(jù)  ???????????????? Paragraph pp = range.getParagraph(i);  ???????????????? content.append(pp.text());  ???????????? }  ???????? } catch (Exception e) {  ???????? }  return content.toString().trim();  ???? }  2.PDF文件用的是PDFbox:  public static String readPdf(String path) throws Exception {  ???????? StringBuffer content = new StringBuffer("");// 文檔內(nèi)容  ???????? FileInputStream fis = new FileInputStream(path);  ???????? PDFParser p = new PDFParser(fis);  ???????? p.parse();  ???????? PDFTextStripper ts = new PDFTextStripper();  ???????? content.append(ts.getText(p.getPDDocument()));  ???????? fis.close();  return content.toString().trim();  ???? }  3.html文件:  public static String readHtml(String urlString) {  ???????? StringBuffer content = new StringBuffer("");  ???????? File file = new File(urlString);  ???????? FileInputStream fis = null;  try {  ???????????? fis = new FileInputStream(file);  // 讀取頁面  ???????????? BufferedReader reader = new BufferedReader(new InputStreamReader(  ???????????????????? fis,"utf-8"));//這里的字符編碼要注意,要對上html頭文件的一致,否則會出亂碼  ???????????? String line = null;  while ((line = reader.readLine()) != null) {  ???????????????? content.append(line + "\n");  ???????????? }  ???????????? reader.close();  ???????? } catch (Exception e) {  ???????????? e.printStackTrace();  ???????? }  ???????? String contentString = content.toString();  return contentString;  ???? }  4.txt文件:  public static String readTxt(String path) {  ???????? StringBuffer content = new StringBuffer("");// 文檔內(nèi)容  try {  ???????????? FileReader reader = new FileReader(path);  ???????????? BufferedReader br = new BufferedReader(reader);  ???????????? String s1 = null;  while ((s1 = br.readLine()) != null) {  ???????????????? content.append(s1 + "\r");  ???????????? }  ???????????? br.close();  ???????????? reader.close();  ???????? } catch (IOException e) {  ???????????? e.printStackTrace();  ???????? }  return content.toString().trim();  ???? }  接下來數(shù)搜索代碼:  public class TestQuery {???  public static void main(String[] args) throws IOException, ParseException {???  ???????? Hits hits = null;???  //搜索內(nèi)容自己換  ???????? String queryString = "根據(jù)國務(wù)院的決定";???  ???????? Query query = null;??  ???????? IndexSearcher searcher = new IndexSearcher("d:\\index"); //這里注意索引存放的路徑  ???????? Analyzer analyzer = new StandardAnalyzer();???  try {???  ???????????? QueryParser qp = new QueryParser("body", analyzer);???  /**  ????????????? * 建索引的時候我們指定了body建立為內(nèi)容,我們搜索的時候也是針對body的,所以  ????????????? *??? QueryParser qp = new QueryParser("body", analyzer);  ????????????? *??? 這句和建立索引時候  ???????????????? Field FieldBody = new Field("body", temp, Field.Store.YES,???  ???????????????????????? Field.Index.TOKENIZED,???  ???????????????????????? Field.TermVector.WITH_POSITIONS_OFFSETS);  ????????????? *的這句的"body"是對應(yīng)的。  ???????????? */  ???????????? query = qp.parse(queryString);???  ???????? } catch (ParseException e) {  ???????????? System.out.println("異常");  ???????? }???  if (searcher != null) {???  ???????????? hits = searcher.search(query);???  if (hits.length() > 0) {???  ???????????????? System.out.println("找到:" + hits.length() + " 個結(jié)果!");??  for (int i = 0; i < hits.length(); i++) {//輸出搜索信息  ????????????????????? Document document = hits.doc(i);  ????????????????????? System.out.println("contents:"+document.get("body"));  //同樣原理這里的document.get("body")就是取得建立在索引文件里面的額body的所有內(nèi)容  ???????????????????? //你若想輸出文件路徑就用document.get("path")就可以了  ???????????????? }  ???????????? } else{  ???????????????? System.out.println("0個結(jié)果!");  ???????????? }???  ???????? }??  ???? }   |