汉字正则表达式解决方案
生活随笔
收集整理的這篇文章主要介紹了
汉字正则表达式解决方案
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
原理:將模式串與待匹配串都轉成 Unicode 編碼,再用正則表達式進行匹配。可以用 Python 完成,也可以用 C++ 的 Boost 庫(boost::wregex)完成。
方案一:解析程序為 C 版本,中間調用 Python 函數,在 Python 函數中調用正則表達式進行解析。本地可以運行,但是在 Hadoop 集群上運行不了。
方案二:采用 Boost 的 wregex,需要用 C++ 源碼編譯 Boost 庫。
備注:cpp 文件都采用 UTF-8 編碼。
方案一代碼:
# NOTE(review): this listing has lost its original line breaks and fuses two separate files.
#
# 1) A Python 2 script (it calls `unicode(...)`) that defines add(a, b): it decodes the pattern `a`
#    and the text `b` from UTF-8, compiles the pattern, repeatedly calls pchinese.search() starting
#    at the end of the previous match, collects group(1) of every match re-encoded as UTF-8, and
#    returns the matches joined with tabs. The original indentation is gone, so the exact nesting
#    of the try/except and while blocks cannot be recovered from this text alone.
#
# 2) From `char line[102400]` onward, a fragment of a C++ program that embeds CPython: it reads
#    stdin into `text`, imports the module named "pytest", looks up the callable "add", calls it
#    with (pattern, text) and prints the returned string. The fragment is incomplete: its #include
#    lines, the enclosing function header and the declarations of `t`, `tv1`, `tv2` and `argv`
#    are not shown here.
#
# In this collapsed form every `//` comment swallows the rest of its physical line, so the line
# breaks must be restored before either part can be compiled or run.
#-*-coding:UTF-8-*- import re; import sys; import time; def add(a,b): s="";try:upatternstr=unicode(a,'UTF-8');except:pass;pchinese=re.compile(upatternstr);try:uline = unicode(b,"UTF-8");mylist = [];index = 0;while True:m=pchinese.search(uline,index);if (m!=None):mylist.append(m.group(1).encode("UTF-8"));index =m.end();else:break;s="\t".join(mylist);return s;except:return s;if (__name__=="__main__"):t="<li><span>字義:</span>(.*?)</li>";fid=open("qiming2.txt","r");s=fid.read();fid.close();add(t,s);char line[102400]={0};char text[102400]={0};char pattern[200]={0};strcpy(pattern,t.c_str());while(fgets(line,102400,stdin)){//text.assign(line);//wstring wtext = String2Wstringx(t);//wstring::const_iterator it=wtext.begin();// wstring::const_iterator end=wtext.end();//while(boost::regex_search(it,end,wm,wreg))// {// wstring wtemp=wm[1];// string temp=Wstring2String(wtemp);// results.push_back(temp);// it=wm[1].second;//}strcat(text,line);strcat(text,"\n");}//string t="劉[^劉]*?,";//wstring ws=String2Wstring(s);//cout<<p.size()<<endl;//cout<<ws.size()<<endl;//fprintf(stdout,"輸出正則匹配結果\n");//for(vector<string>::iterator it=results.begin();it!=results.end();it++)//{// printf("%s\n",(*it).c_str());//}Py_Initialize(); // 檢查初始化是否成功 if ( !Py_IsInitialized() ) { return -1; } // 添加當前路徑 //把輸入的字符串作為Python代碼直接運行,返回0 //表示成功,-1表示有錯。大多時候錯誤都是因為字符串 //中有語法錯誤。 PyRun_SimpleString("import sys"); PyRun_SimpleString("sys.path.append('./')"); PyObject *pName,*pModule,*pDict,*pFunc,*pArgs, *ret; // 載入名為pytest的腳本 pName = PyString_FromString("pytest"); pModule = PyImport_Import(pName); if ( !pModule ) { printf("can't find pytest.py"); return -1; } pDict = PyModule_GetDict(pModule); if ( !pDict ) { return -1; } // 找出函數名為add的函數 pFunc = PyDict_GetItemString(pDict, "add"); if ( !pFunc || !PyCallable_Check(pFunc) ) { printf("can't find function [add]"); return -1; } // 參數進棧 *pArgs; pArgs = PyTuple_New(2); // PyObject* Py_BuildValue(char *format, ...) 
// 把C++的變量轉換成一個Python對象。當需要從 // C++傳遞變量到Python時,就會使用這個函數。此函數 // 有點類似C的printf,但格式不同。常用的格式有 // s 表示字符串, // i 表示整型變量, // f 表示浮點數, // O 表示一個Python對象。 PyTuple_SetItem(pArgs, 0, Py_BuildValue("s",pattern)); PyTuple_SetItem(pArgs, 1, Py_BuildValue("s",text)); // 調用Python函數 ret=PyObject_CallObject(pFunc, pArgs); char * str_ret = PyString_AsString(ret);printf("result:%s\n", str_ret);Py_DECREF(pName); Py_DECREF(pArgs); Py_DECREF(pModule); // 關閉Python Py_Finalize(); gettimeofday(&tv2, NULL);fprintf(stderr,"%s has finished congratulations!\n",argv[0]);fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);return 0;
方案二代碼:
// please add your code here! #include <iostream> #include <stdlib.h> #include <math.h> #include<time.h> #include <set> #include <string> #include <sys/time.h> #include<locale.h> #include<boost/regex.hpp> #include <wchar.h> #include <iconv.h> #include <errno.h> using namespace std;/*funcname:spec:parms:[IN][IN][OUT]returnValue:author liuyu, 20120528 */ void PrintUsage() {fprintf( stderr, "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n" ); } int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen) {iconv_t convertor=iconv_open(toCode,fromCode);size_t inputsize;size_t outputsize;size_t oldoutputsize;char *input, *inputold;char *output=NULL;char *outputold=NULL;int flag=0;if(convertor==iconv_t(-1)){fprintf(stderr,"convertor device initailization failed!\n");return 1;}else{inputsize=srclen;input=new char[inputsize+1];memcpy(input,srcstr,inputsize);input[inputsize]='\0';inputold=input;outputsize=inputsize*5;oldoutputsize=outputsize;output=new char[outputsize];output[0]=0;outputold=output;size_t rc = iconv(convertor,&input,&inputsize,&output,&outputsize);if (rc==size_t(-1)){fprintf(stdout, "errno=%d\n",errno);}destlen=oldoutputsize-outputsize;memcpy(deststr,outputold,destlen);deststr[destlen]=0;if(rc!=size_t(-1)){flag=1;}delete []inputold;delete []outputold;}iconv_close(convertor);if(flag==1){return 0;}else{return 1;}} wchar_t * MBs2WCs(const char* pszSrc){ wchar_t* pwcs = NULL; int size = 0; setlocale(LC_ALL, "zh_CN.UTF8"); size = mbstowcs(NULL,pszSrc,0); pwcs = new wchar_t[size+1]; size = mbstowcs(pwcs, pszSrc, size+1); pwcs[size] = 0; return pwcs; }char* WCs2MBs(const wchar_t * wcharStr){ char* str = NULL; int size = 0; setlocale(LC_ALL, "zh_CN.UTF8"); size = wcstombs( NULL, wcharStr, 0); str = new char[size + 1]; wcstombs( str, wcharStr, size); str[size] = '\0'; return str; }int main( int argc, char *argv[] ) {timeval tv1, tv2;gettimeofday(&tv1, NULL); if ( 1 != argc 
){PrintUsage();return 1;}/*char *s="劉禹,劉德華,劉佳佳。。。王大虎。。。劉長春,xixi";char *t="(劉[^劉]*?),";wchar_t *ws =MBs2WCs(s);wchar_t *wt =MBs2WCs(t);wstring wstr1=ws;wstring wstr2=wt;boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl);boost::wsmatch wm;wstring::const_iterator it=wstr1.begin();wstring::const_iterator end=wstr1.end();while(boost::regex_search(it,end,wm,wreg)){wstring wtemp=wm[1];char* temp=WCs2MBs(wtemp.c_str());printf("%s\n",temp);it=wm[0].second;}*/char line[102400]={0};char text[102400]={0};char* t="<li><span>字義:</span>(.*?)</li>";wchar_t *wt =MBs2WCs(t);boost::wsmatch wm;boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl);while(fgets(line,102400,stdin)){strcat(text,line);}wchar_t * ws = MBs2WCs(text);wstring wtext=ws;wstring::const_iterator it=wtext.begin();wstring::const_iterator end=wtext.end();vector<string> results;while(boost::regex_search(it,end,wm,wreg)){wstring wtemp=wm[1];char* temp=WCs2MBs(wtemp.c_str());results.push_back(temp);it=wm[1].second;}for (vector<string>::iterator it = results.begin(); it!=results.end(); it++){fprintf(stdout,"%s\n",(*it).c_str());}gettimeofday(&tv2, NULL);fprintf(stderr,"%s has finished congratulations!\n",argv[0]);fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);return 0; }方法一的編譯方法:
g++ Python.cpp -o Python -I/usr/include/python2.5 -L/usr/lib/python2.5 -lpython2.5
轉載于:https://www.cnblogs.com/finallyliuyu/p/4724404.html
總結
以上是生活随笔為你收集整理的汉字正则表达式解决方案的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 访问phpMyAdmin系统报js错误怎
- 下一篇: 凤凰网广告投放资源以及技巧