PHP基于字典的中英文数字混合分词算法RMM简易实现
生活随笔
收集整理的這篇文章主要介紹了
PHP基于字典的中英文数字混合分词算法RMM简易实现
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
?
<?php

/**
 * Dictionary-based reverse maximum matching (RMM) segmenter for text that
 * mixes Chinese words, Latin letters and digits.
 *
 * The input is upper-cased before matching, so returned tokens are upper-case.
 *
 * Algorithm reference: https://blog.csdn.net/xqhadoop/article/details/60757242
 */
class Seg
{
    /** Character class for a single character that is none of the below. */
    const TYPE_OTHER = 0;

    /** Character class for a single ASCII letter A-Z. */
    const TYPE_LETTER = 1;

    /** Character class for a single digit or one of + - * / . % */
    const TYPE_NUMBER = 2;

    /**
     * Upper-cased dictionary words stored as array keys.
     *
     * Keys give an exact-match lookup. The previous loose in_array() search
     * compared numeric strings by value, so "603" or " 603" matched "0603".
     *
     * @var array
     */
    private $dict = [];

    /**
     * Load the dictionary, replacing any previously loaded one.
     *
     * @param array $vDict List of dictionary words; each is upper-cased.
     * @return void
     */
    public function set_dict($vDict)
    {
        $words = [];
        foreach ($vDict as $word) {
            $words[strtoupper($word)] = true;
        }
        $this->dict = $words;
    }

    /**
     * Segment a string with reverse maximum matching against the dictionary.
     *
     * Text is consumed from the right: the longest suffix found in the
     * dictionary becomes a token, otherwise the last character does. Runs of
     * adjacent single characters of the same class are then merged, and tokens
     * consisting of exactly one space are dropped.
     *
     * @param string $vStr Text to segment.
     * @return array List of upper-cased tokens in reading order, keyed 0..n-1.
     * @throws LogicException When no dictionary has been loaded.
     */
    public function rmmseg($vStr = '')
    {
        if ('' === $vStr) {
            return [];
        }
        if (empty($this->dict)) {
            // Thrown instead of exit() so the caller keeps control of the process.
            throw new LogicException('詞典為空');
        }

        $tokens = $this->split_by_dict(strtoupper($vStr));
        $merged = $this->merge_single_chars($tokens);

        // array_filter() without a callback is not used here: it would also
        // drop the token "0". Building a new array keeps the keys sequential.
        $kept = [];
        foreach ($merged as $token) {
            if (' ' !== $token) {
                $kept[] = $token;
            }
        }

        return $kept;
    }

    /**
     * Cut the string into dictionary words and single characters.
     *
     * @param string $str Upper-cased text.
     * @return array Tokens in reading order.
     */
    private function split_by_dict($str)
    {
        $tokens = [];

        while ('' !== $str) {
            $len = mb_strlen($str);

            if (1 === $len) {
                // Only the first character of the input is left. A whitespace
                // character is skipped unless it is a dictionary word. Either
                // way the loop ends here; previously a leading whitespace
                // character made this loop spin forever.
                if (isset($this->dict[$str]) || '' !== trim($str)) {
                    $tokens[] = $str;
                }
                break;
            }

            // Smallest start offset whose suffix is a dictionary word, i.e.
            // the longest matching suffix; falls back to the last character.
            $pos = 0;
            while ($pos < $len - 1 && !isset($this->dict[mb_substr($str, $pos)])) {
                $pos++;
            }

            $tokens[] = mb_substr($str, $pos);
            if (0 === $pos) {
                // The whole remaining text was a single dictionary word.
                break;
            }
            $str = mb_substr($str, 0, $pos);
        }

        // Tokens were collected right-to-left.
        return array_reverse($tokens);
    }

    /**
     * Classify a single character.
     *
     * Characters that are not A-Z, digits or one of + - * / . % (single CJK
     * characters, spaces, other punctuation) all fall into TYPE_OTHER.
     *
     * @param string $ch A one-character string.
     * @return int One of the TYPE_* constants.
     */
    private function char_type($ch)
    {
        if ($ch >= 'A' && $ch <= 'Z') {
            return self::TYPE_LETTER;
        }
        if (is_numeric($ch) || in_array($ch, ['+', '-', '*', '/', '.', '%'], true)) {
            return self::TYPE_NUMBER;
        }

        return self::TYPE_OTHER;
    }

    /**
     * Merge runs of adjacent single-character tokens of the same class.
     *
     * Multi-character tokens are passed through unchanged and end the
     * current run.
     *
     * @param array $tokens Tokens in reading order.
     * @return array Tokens with single-character runs concatenated.
     */
    private function merge_single_chars(array $tokens)
    {
        $merged = [];
        $run = '';
        $runType = null;

        foreach ($tokens as $token) {
            if (1 !== mb_strlen($token)) {
                if ('' !== $run) {
                    $merged[] = $run;
                    $run = '';
                }
                $merged[] = $token;
                continue;
            }

            $type = $this->char_type($token);
            if ($type === $runType) {
                $run .= $token;
            } else {
                if ('' !== $run) {
                    $merged[] = $run;
                }
                $run = $token;
                $runType = $type;
            }
        }

        if ('' !== $run) {
            $merged[] = $run;
        }

        return $merged;
    }
}

//------------------------------------------------------------------
// Demo
$seg = new Seg();

// Dictionary.
// NOTE(review): some literals below (e.g. the ones containing "(guó)" and
// "(jié)") look like text-extraction artifacts. They are runtime strings and
// were left untouched - confirm against the original article.
$dict = ['中華', '廣大', '人民', '共和國(guó)', '電阻', '電阻值', '貼片', '電壓','精度', 'RC', '功率', 'RES', 'OHM', '0603', '貼片電阻'];

$str = "貼片電阻Res0603889電阻值24.89kohm,電壓 25V 功率1/8w放";
$str .= "RC0603FR-0722kL,4.22k精度0.5%,99 88方式";
$str .= "中華人民共和國(guó)廣大";

$seg->set_dict($dict);
$res = $seg->rmmseg($str);

echo '原字符串=' . $str . '<br>';
echo '<br/>分詞結(jié)果=';
echo "<style>.C_HIGHLIGHT{background:#ff0; border:1px solid orange;padding:1px 3px; margin-left:1px ;margin-top:2px;display:inline-block}</style>";
foreach ($res as $word) {
    echo "<span class='C_HIGHLIGHT'>$word</span> ";
}
?>
參考:https://blog.csdn.net/xqhadoop/article/details/60757242
?
總結
以上是生活随笔為你收集整理的PHP基于字典的中英文数字混合分词算法RMM简易实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 分享一个自己写的取中国农历相关数据的类。
- 下一篇: 双显示器,拆了副屏以后,发现程序依然不在