From the feature-word selection algorithms of text classification: tracing how mathematical knowledge and theory carry over into real engineering
Please credit the author and source when reposting (author: finallyliuyu; source: cnblogs).
See also: "The Chi-Square Feature Selection Algorithm"
"The DF Feature Selection Algorithm"
1. Mathematical background
Carrying mathematical knowledge, theory, and ways of thinking into practical engineering problems often leads to a cleaner, more complete solution.

But how do we bring mathematics into an engineering problem? First, we need "mathematical thinking", for example understanding what a formula actually expresses. Second, we need modeling ability: looking at the same problem from different angles can produce very different abstractions. Some models have off-the-shelf solutions and others do not, and some models capture the real problem more faithfully than others. If we define the ability to search our minds for mathematical tools that fit the problem at hand as "mathematical thinking", and the ability to abstract the problem into a mathematical model as "mathematical modeling", then only when the two work in concert can a practical problem be solved well. The various feature-word selection methods in text classification offer a glimpse of this interplay.
First, let us look at what the formulas mean. For a term $t$ and a class $c$, the mutual information is

$$\mathrm{MI}(t,c)=\log\frac{P(t,c)}{P(t)\,P(c)}$$

The average mutual information (also called information gain) is

$$I(U;C)=\sum_{e_t\in\{0,1\}}\ \sum_{e_c\in\{0,1\}} P(U=e_t,\,C=e_c)\,\log\frac{P(U=e_t,\,C=e_c)}{P(U=e_t)\,P(C=e_c)}$$

where the indicator $U$ marks whether a document contains $t$, and $C$ marks whether the document belongs to $c$.
Now to how average mutual information and mutual information are applied to feature-word selection in text classification. To avoid confusion, in this setting mutual information is called point-wise MI, and average mutual information is called IG (Information Gain); the latter name reflects the identity $I(U;C)=H(C)-H(C\mid U)$, the reduction in class entropy obtained by observing the term.

Following the definitions on p. 189 of Manning's Introduction to Information Retrieval (Wang Bin's Chinese translation): let $N_{11}$ be the number of documents of class $c$ that contain $t$, $N_{10}$ the documents outside $c$ that contain $t$, $N_{01}$ the documents of $c$ that do not contain $t$, and $N_{00}$ the remainder; write $N=N_{11}+N_{10}+N_{01}+N_{00}$ and the marginals $N_{1.}=N_{11}+N_{10}$, $N_{0.}=N_{01}+N_{00}$, $N_{.1}=N_{11}+N_{01}$, $N_{.0}=N_{10}+N_{00}$.

The IG formula (maximum-likelihood estimate from the counts) is:

$$I(U;C)=\frac{N_{11}}{N}\log\frac{N\,N_{11}}{N_{1.}N_{.1}}+\frac{N_{01}}{N}\log\frac{N\,N_{01}}{N_{0.}N_{.1}}+\frac{N_{10}}{N}\log\frac{N\,N_{10}}{N_{1.}N_{.0}}+\frac{N_{00}}{N}\log\frac{N\,N_{00}}{N_{0.}N_{.0}}$$

The point-wise MI formula is:

$$\mathrm{MI}(t,c)=\log\frac{N\,N_{11}}{N_{1.}\,N_{.1}}$$
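A quick worked example, with made-up counts that are not from the original post: suppose $N=100$, $N_{11}=8$, $N_{10}=2$, $N_{01}=2$, $N_{00}=88$, so $N_{1.}=N_{.1}=10$. Using natural logarithms, as the code below does:

$$\mathrm{MI}(t,c)=\log\frac{100\cdot 8}{10\cdot 10}=\log 8\approx 2.079$$

$$I(U;C)=0.08\log 8+0.02\log\tfrac{200}{900}+0.02\log\tfrac{200}{900}+0.88\log\tfrac{8800}{8100}\approx 0.166-0.030-0.030+0.073\approx 0.179$$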
2. C++ code implementing the two algorithms
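The code below refers to several members of a Preprocess class defined elsewhere in the project (beginIndex, endIndex, getCategorizationNum, the comparator isLarger) and to two container types. From the way the code uses them, DICTIONARY is the bag-of-words index (its iterator type appears explicitly below), and CONTINGENCY presumably maps a (term, class) key to the pair (N11, N01); the exact typedefs here are inferred, not the original definitions:

// Inferred from usage (the CONTINGENCY layout is an assumption; it is not shown in the post):
// DICTIONARY: term -> list of (document id, term frequency) postings,
// so mymap[term].size() is the number of documents containing the term (N1)
typedef map<string, vector<pair<int, int> > > DICTIONARY;
// CONTINGENCY: (term, class) -> (N11, N01), i.e. the documents of the class
// that do / do not contain the term
typedef map<pair<string, string>, pair<int, int> > CONTINGENCY;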
Declarations for point-wise MI:

double CalPointWiseMI(double N11, double N10, double N01, double N00); // compute point-wise MI
vector<pair<string,double> > PointWiseMIFeatureSelectionForPerclass(DICTIONARY& mymap, CONTINGENCY& contingencyTable, string classLabel);
void PointWiseMIFeatureSelection(vector<string> classLabels, DICTIONARY& mymap, CONTINGENCY& contingencyTable, int N, char* address);

/************************************************************************/
/* Compute the point-wise MI value                                      */
/************************************************************************/
double Preprocess::CalPointWiseMI(double N11, double N10, double N01, double N00)
{
    double pointwiseMI = 0;
    if (N11 > 0)
    {
        // log(N * N11 / ((N11+N10) * (N11+N01))), i.e. log(N*N11 / (N1. * N.1));
        // a term that never occurs in the class scores 0
        pointwiseMI = log(N11 + N10 + N01 + N00) + log(N11) - log(N11 + N10) - log(N11 + N01);
    }
    return pointwiseMI;
}
/************************************************************************/
/* Compute every term's point-wise MI for one class                     */
/************************************************************************/
vector<pair<string,double> > Preprocess::PointWiseMIFeatureSelectionForPerclass(DICTIONARY& mymap, CONTINGENCY& contingencyTable, string classLabel)
{
    int N = endIndex - beginIndex + 1; // total number of documents
    vector<string> tempvector;         // all the terms in the bag of words
    vector<pair<string,double> > MIInfo;
    for (map<string, vector<pair<int,int> > >::iterator it = mymap.begin(); it != mymap.end(); ++it)
    {
        tempvector.push_back(it->first);
    }
    // compute each term's point-wise MI
    for (vector<string>::iterator ittmp = tempvector.begin(); ittmp != tempvector.end(); ittmp++)
    {
        int N1 = mymap[*ittmp].size(); // documents containing the term
        pair<string,string> compoundKey = make_pair(*ittmp, classLabel);
        double N11 = double(contingencyTable[compoundKey].first);
        double N01 = double(contingencyTable[compoundKey].second);
        double N10 = double(N1 - N11);
        double N00 = double(N - N1 - N01);
        double miValue = CalPointWiseMI(N11, N10, N01, N00);
        MIInfo.push_back(make_pair(*ittmp, miValue));
    }
    // sort the terms by MI value, largest first
    stable_sort(MIInfo.begin(), MIInfo.end(), isLarger);
    return MIInfo;
}
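The comparator isLarger is used but not defined in this excerpt; for the descending sort above it is presumably along these lines (an assumption, not the original code):

// Presumed comparator: order (term, score) pairs by score, largest first
bool isLarger(const pair<string, double>& a, const pair<string, double>& b)
{
    return a.second > b.second;
}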
/************************************************************************/
/* Point-wise MI feature selection                                      */
/************************************************************************/
void Preprocess::PointWiseMIFeatureSelection(vector<string> classLabels, DICTIONARY& mymap, CONTINGENCY& contingencyTable, int N, char* address)
{
    clock_t start, finish;
    double totaltime;
    int totalTrainingCorpus = endIndex - beginIndex + 1; // total documents in the training corpus
    set<string> finalKeywords; // the finally selected feature words
    vector<pair<string,double> > MIInfo;
    start = clock();
    for (vector<string>::iterator it = classLabels.begin(); it != classLabels.end(); it++)
    {
        // number of training documents in this class
        int N_subClassCnt = getCategorizationNum(*it, "TrainingCorpus");
        // threshold decides how many feature words to pick for this class
        int threshold = N_subClassCnt * N / totalTrainingCorpus;
        MIInfo = PointWiseMIFeatureSelectionForPerclass(mymap, contingencyTable, *it);
        // guard against threshold exceeding the number of ranked terms
        for (vector<pair<string,double> >::size_type j = 0; j < (size_t)threshold && j < MIInfo.size(); j++)
        {
            finalKeywords.insert(MIInfo[j].first);
        }
        MIInfo.clear();
    }
    ofstream outfile(address);
    int finalKeyWordsCount = finalKeywords.size();
    for (set<string>::iterator it = finalKeywords.begin(); it != finalKeywords.end(); it++)
    {
        outfile << *it << endl;
    }
    outfile.close();
    cout << "number of feature words finally selected: " << finalKeyWordsCount << endl;
    finish = clock();
    totaltime = (double)(finish - start) / CLOCKS_PER_SEC;
    cout << "feature selection took " << totaltime << " seconds" << endl;
}
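The per-class quota is proportional: each class receives a share of the N requested feature words equal to its share of the training documents. With made-up numbers: given 1000 training documents of which 250 belong to class c, and N = 2000, the class contributes threshold = 250 * 2000 / 1000 = 500 terms from the top of its ranking. Since the per-class picks are merged into a set, a term selected for several classes is stored only once, so the final total can come out below N.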
Declarations for information gain:

double CalInformationGain(double N11, double N10, double N01, double N00); // compute the IG value
vector<pair<string,double> > InformationGainFeatureSelectionForPerclass(DICTIONARY& mymap, CONTINGENCY& contingencyTable, string classLabel);
void InformationGainFeatureSelection(vector<string> classLabels, DICTIONARY& mymap, CONTINGENCY& contingencyTable, int N, char* address);
/************************************************************************/
/* Compute the IG value                                                 */
/************************************************************************/
double Preprocess::CalInformationGain(double N11, double N10, double N01, double N00)
{
    double IG = 0;
    double Ntotal = N11 + N10 + N01 + N00;
    // marginals: N1_/N0_ = documents with/without the term, N_1/N_0 = documents in/out of the class
    double N1_ = N11 + N10;
    double N0_ = N01 + N00;
    double N_0 = N10 + N00;
    double N_1 = N11 + N01;
    // each of the four cells contributes only when its count is positive (0*log0 = 0)
    if (N11 > 0)
    {
        IG += N11/Ntotal * log(Ntotal*N11/(N1_*N_1));
    }
    if (N10 > 0)
    {
        IG += N10/Ntotal * log(Ntotal*N10/(N1_*N_0));
    }
    if (N01 > 0)
    {
        IG += N01/Ntotal * log(Ntotal*N01/(N0_*N_1));
    }
    if (N00 > 0)
    {
        IG += N00/Ntotal * log(Ntotal*N00/(N0_*N_0));
    }
    return IG;
}
/************************************************************************/
/* Compute each term's IG for one class, and sort                       */
/************************************************************************/
vector<pair<string,double> > Preprocess::InformationGainFeatureSelectionForPerclass(DICTIONARY& mymap, CONTINGENCY& contingencyTable, string classLabel)
{
    int N = endIndex - beginIndex + 1; // total number of documents
    vector<string> tempvector;         // all the terms in the bag of words
    vector<pair<string,double> > IGInfo;
    for (map<string, vector<pair<int,int> > >::iterator it = mymap.begin(); it != mymap.end(); ++it)
    {
        tempvector.push_back(it->first);
    }
    // compute each term's IG
    for (vector<string>::iterator ittmp = tempvector.begin(); ittmp != tempvector.end(); ittmp++)
    {
        int N1 = mymap[*ittmp].size(); // documents containing the term
        pair<string,string> compoundKey = make_pair(*ittmp, classLabel);
        double N11 = double(contingencyTable[compoundKey].first);
        double N01 = double(contingencyTable[compoundKey].second);
        double N10 = double(N1 - N11);
        double N00 = double(N - N1 - N01);
        double igValue = CalInformationGain(N11, N10, N01, N00);
        IGInfo.push_back(make_pair(*ittmp, igValue));
    }
    // sort the terms by IG value, largest first
    stable_sort(IGInfo.begin(), IGInfo.end(), isLarger);
    return IGInfo;
}
/************************************************************************/
/* Information gain feature selection                                   */
/************************************************************************/
void Preprocess::InformationGainFeatureSelection(vector<string> classLabels, DICTIONARY& mymap, CONTINGENCY& contingencyTable, int N, char* address)
{
    clock_t start, finish;
    double totaltime;
    int totalTrainingCorpus = endIndex - beginIndex + 1; // total documents in the training corpus
    set<string> finalKeywords; // the finally selected feature words
    vector<pair<string,double> > IGInfo;
    start = clock();
    for (vector<string>::iterator it = classLabels.begin(); it != classLabels.end(); it++)
    {
        // number of training documents in this class
        int N_subClassCnt = getCategorizationNum(*it, "TrainingCorpus");
        // threshold decides how many feature words to pick for this class
        int threshold = N_subClassCnt * N / totalTrainingCorpus;
        IGInfo = InformationGainFeatureSelectionForPerclass(mymap, contingencyTable, *it);
        // guard against threshold exceeding the number of ranked terms
        for (vector<pair<string,double> >::size_type j = 0; j < (size_t)threshold && j < IGInfo.size(); j++)
        {
            finalKeywords.insert(IGInfo[j].first);
        }
        IGInfo.clear();
    }
    ofstream outfile(address);
    int finalKeyWordsCount = finalKeywords.size();
    for (set<string>::iterator it = finalKeywords.begin(); it != finalKeywords.end(); it++)
    {
        outfile << *it << endl;
    }
    outfile.close();
    cout << "number of feature words finally selected: " << finalKeyWordsCount << endl;
    finish = clock();
    totaltime = (double)(finish - start) / CLOCKS_PER_SEC;
    cout << "feature selection took " << totaltime << " seconds" << endl;
}
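The two driver functions above are identical except for which per-class ranker they call. One way they could be unified is sketched below; the member-function-pointer wrapper FeatureSelectionWith is hypothetical, not part of the original code:

// Hypothetical refactor: one driver parameterized by the per-class ranker
typedef vector<pair<string,double> > (Preprocess::*PerClassRanker)(DICTIONARY&, CONTINGENCY&, string);

void Preprocess::FeatureSelectionWith(PerClassRanker rank, vector<string> classLabels,
                                      DICTIONARY& mymap, CONTINGENCY& contingencyTable,
                                      int N, char* address)
{
    // ... same bookkeeping as above; only the ranking call changes:
    // vector<pair<string,double> > info = (this->*rank)(mymap, contingencyTable, *it);
}

// Call sites would then read:
// FeatureSelectionWith(&Preprocess::PointWiseMIFeatureSelectionForPerclass, ...);
// FeatureSelectionWith(&Preprocess::InformationGainFeatureSelectionForPerclass, ...);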
Appendix: "The DF Feature Selection Algorithm"
Reposted from: https://www.cnblogs.com/finallyliuyu/archive/2010/10/04/1841820.html