Fixing Duplicate Highlighting When Search Keywords Mix Digits and English Letters
The problem: when a keyword that mixes digits and English letters is searched, the highlighted result contains duplicated text.

For example, searching "220" highlights the title correctly:

關于同意220kV佛山變電站#1、#2主變報廢的批復.txt

but searching "220kv" duplicates the digits:

關于同意220220kV佛山變電站#1、#2主變報廢的批復.txt

Highlighting is driven by the term positions recorded at index time, obtained via TermPositionVector termFreqVector = (TermPositionVector) ireader.getTermFreqVector(doc, fieldname);. The main code is as follows:

QueryParser queryParser = new QueryParser(Version.LUCENE_30, fieldname, queryAnalyzer);
Query query = queryParser.parse(keyWordLc);

Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter("<font color=\"red\">", "</font>"),
        new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(50));

TermPositionVector termFreqVector = (TermPositionVector) ireader.getTermFreqVector(doc, fieldname);

/*
 * Note: it is best to pass true here. It costs some performance, but it avoids
 * the following problem. For a document titled 索引測試新建文檔1.txt the tokens are:
 *   [(1,8,9), (1.txt,8,13), (文檔,6,8), (新建,4,6), (測試,2,4), (索引,0,2), (txt,10,13)]
 * and in that order the highlight comes out as <font color="red">索引測試新建文檔</font>1.txt,
 * because the highlighter works from position information: a matched term is only
 * highlighted on its own while its position is below the largest end position seen
 * so far; otherwise everything from the first matched term's start offset to the
 * last matched term's end offset is highlighted as one block.
 */
TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector, true);

String content = hitDoc.get(fieldname);
String result = highlighter.getBestFragments(tokenStream, content, 5, "...");

Stepping through with a debugger shows that the paoding analyzer tokenizes "220kv" into the three overlapping tokens "220", "kv" and "220kv", and the Highlighter shipped in Lucene's lucene-highlighter-3.0.2.jar / lucene-memory-3.0.2.jar marks up each token group by its raw offsets, so the region covered by overlapping tokens is appended twice.
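The overlap is easy to confirm by dumping the analyzer's tokens directly. Below is a minimal sketch, assuming paoding's net.paoding.analysis.analyzer.PaodingAnalyzer is on the classpath with its default dictionary configuration; the class name TokenDump is just for illustration, and the printed offsets are an expectation based on the tokenization above, not verified output:

import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenDump {
    public static void main(String[] args) throws Exception {
        // With paoding this should print the overlapping tokens "220", "kv"
        // and "220kv"; the exact offsets are an assumption.
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("name", new StringReader("220kv"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        ts.close();
    }
}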
The fix is to modify the Highlighter class in lucene-highlighter-3.0.2.jar. The patched getBestTextFragments method follows:
public final TextFragment[] getBestTextFragments(
        TokenStream tokenStream,
        String text,
        boolean mergeContiguousFragments,
        int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException
{
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try
    {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        int lastStartOffset = 0; // records the start offset of the substring currently being extracted (used by the fix below)

        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken();
                next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
                next = tokenStream.incrementToken())
        {
            if ((offsetAtt.endOffset() > text.length())
                    || (offsetAtt.startOffset() > text.length()))
            {
                throw new InvalidTokenOffsetsException("Token " + termAtt.term()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct()))
            {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;

                // The two lines below replace the original
                //     tokenText = text.substring(startOffset, endOffset);
                // They fix duplicate highlighting for "digits+letters" or "letters+digits"
                // keywords, e.g. searching "220KV" used to highlight "220220KV".
                lastStartOffset = Math.max(startOffset, lastEndOffset);
                tokenText = text.substring(lastStartOffset, endOffset);

                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);

                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment())
                {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

//            if (lastEndOffset > maxDocBytesToAnalyze)
//            {
//                break;
//            }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0)
        {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
                // if there is text beyond the last token considered..
                (lastEndOffset < text.length())
                &&
                // and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)
           )
        {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
        {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore)
            {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments)
                { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--)
        {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments)
        {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++)
            {
                if ((frag[i] != null) && (frag[i].getScore() > 0))
                {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;
    }
    finally
    {
        if (tokenStream != null)
        {
            try
            {
                tokenStream.close();
            }
            catch (Exception e)
            {
                // ignore close failures
            }
        }
    }
}
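To see why the one-line clamp removes the duplication, here is a toy trace with assumed offsets for the tokens of "220kV" (in the real code these come from the term position vector; the class name ClampTrace is just for illustration):

public class ClampTrace {
    public static void main(String[] args) {
        String text = "220kV";
        // Assumed state when the overlapping token group covering "220kV" is flushed:
        int lastEndOffset = 3;               // "220" (offsets 0..3) has already been written out
        int startOffset = 0, endOffset = 5;  // the current group spans the whole "220kV"

        // Unpatched: re-extracts the already-emitted "220", so the output shows "220220kV"
        String unpatched = text.substring(startOffset, endOffset);
        // Patched: clamp the start to what has already been written, so only "kV" is appended
        String patched = text.substring(Math.max(startOffset, lastEndOffset), endOffset);
        System.out.println(unpatched + " -> " + patched);  // 220kV -> kV
    }
}

After recompiling, one common way to deploy the patched class is to repackage it into lucene-highlighter-3.0.2.jar, or to place it earlier on the classpath so it shadows the original. Searching "220kv" then highlights the title once instead of duplicating the digits.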
Summary

The duplicated highlight comes from the paoding analyzer emitting overlapping tokens for keywords that mix digits and letters. Clamping each token group's start offset to the last emitted end offset inside Highlighter.getBestTextFragments() removes the duplicated span without affecting normal highlighting.