java关键字匹配算法_简单关键词匹配算法
針對微博的短篇博文,編寫的簡單分詞和匹配算法。相對于一篇文檔的復雜分詞算法,能夠在效率和可用性上得到較好的平衡。
package com.sina.tblog.sentiment;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
import com.sina.tblog.sentiment.constant.Constant;
public class KeyWordFilter {
public static HashSet KeyWordsList = null;
public static HashSet letterKeyWordsList = null;
/**
* 初始化或重新導(dǎo)入關(guān)鍵詞列表
* @throws IOException
*/
static{
try {
initKeyWords(Constant.KeyWordsFiles);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static int deleteNewWord(String word){
if(word.length()>10||word.length()<2)
return -1;
if(!KeyWordsList.contains(word))
return 0;
KeyWordsList.remove(word);
if(Pattern.compile("(?i)[a-z][A-Z]").matcher(word).find())
letterKeyWordsList.remove(word.toUpperCase());
FileOutputStream stream;
OutputStreamWriter writer;
try {
stream = new FileOutputStream(Constant.newWordsFile,true);
writer = new OutputStreamWriter(stream);
writer.write("\n"+word);
writer.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
return -1;
}
return 1;
}
public static int addWord(String word){
if(word.length()>10)
return -1;
if(KeyWordsList.contains(word))
return 0;
KeyWordsList.add(word);
if(Pattern.compile("(?i)[a-z][A-Z]").matcher(word).find())
letterKeyWordsList.add(word.toUpperCase());
FileOutputStream stream;
OutputStreamWriter writer;
try {
stream = new FileOutputStream(Constant.newWordsFile,true);
writer = new OutputStreamWriter(stream);
writer.write("\n"+word);
writer.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
return -1;
}
return 1;
}
private static void initKeyWords(String Files[]) throws IOException {
if(KeyWordsList!=null)
KeyWordsList.clear();
else
KeyWordsList = new HashSet();
if(letterKeyWordsList!=null)
letterKeyWordsList.clear();
else
letterKeyWordsList = new HashSet();
for(int i=0;i
File file = new File(Files[i]);
BufferedReader reader = null;
reader = new BufferedReader(new FileReader(file));
String tmp = reader.readLine();
while(tmp!=null){
KeyWordsList.add(tmp);
if(Pattern.compile("(?i)[a-z][A-Z]").matcher(tmp).find())
letterKeyWordsList.add(tmp.toUpperCase());
tmp = reader.readLine();
}
reader.close();
}
}
private static boolean findWord(String str,boolean ignoreCase){
if(ignoreCase == false)
return KeyWordsList.contains(str);
else{
boolean match = KeyWordsList.contains(str);
if(match == false){
match = letterKeyWordsList.contains(str.toUpperCase());
}
return match;
}
}
public static List segmentStrQuickMatch( String str_line,boolean ignoreCase)
{
String term = "";
boolean term_tag = false;
int str_size=0,left=0,len=0;
List list = new ArrayList();
str_size = str_line.length();
while(left
{
len = Constant.max_len;
while( len>=Constant.min_len )//gkm:每一詞
{
term="";
int right = left+len;
int x = 0;
if(right>str_size){
x = right-str_size;
right = str_size;
}
term=str_line.substring(left,right);
term_tag=findWord(term,ignoreCase);
if(term_tag==true)
break;
if(x>0)
len-=x+1;
else
len-=1;
}
if(term_tag==false)//gkm:詞典中沒有term,后移一個(gè)字符(以一個(gè)字符的速度后移,使得可以分出中英混合的詞,沒有判斷無(wú)效字符,有待改進(jìn)!!! )
{
left+=1;
}
else//gkm:詞典中有term,后移len個(gè)字符,term加入到terms_vct[term_tag]
{
left+=len;
list.add(term);
}
}//while(left
return list;
}
public static List segmentStrFullMatch( String str_line,boolean ignoreCase)
{
String term = "";
boolean term_tag = false;
int str_size=0,left=0,len=0;
List list = new ArrayList();
str_size = str_line.length();
while(left
{
len = Constant.max_len;
while( len>=Constant.min_len )//gkm:每一詞
{
term="";
int right = left+len;
int x = 0;
if(right>str_size){
x = right-str_size;
right = str_size;
}
term=str_line.substring(left,right);
term_tag=findWord(term,ignoreCase);
if(term_tag==true)
list.add(term);
if(x>0)
len-=x+1;
else
len-=1;
}
left+=1;
}//while(left
return list;
}
public static void main(String[] args) throws IOException {
System.out.println(segmentStrFullMatch("中華人民共和國(guó)",true));
}
}
分享到:
2012-12-18 15:17
瀏覽 504
評論
總結
以上是生活随笔為你收集整理的java关键字匹配算法_简单关键词匹配算法的全部內容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: python语言和汇编语言_python
- 下一篇: 人口增长模型 源代码
