php中文分词代码一例

发布时间:2020-08-11 编辑:脚本学堂
为大家介绍一段使用 PHP 进行中文分词的代码,有需要的朋友可以参考。

最常见的词语二分法:

<?php
// Demo input: Chinese text mixed with Latin letters, digits and punctuation.
$str = '这是我的网站www.jb200.com!'; 
// If this file is saved as GB2312 rather than UTF-8, enable the conversion
// below so that spStr() receives UTF-8:
//$str = iconv('GB2312','UTF-8',$str); 
// $result is kept in a variable because the follow-up snippet iterates over it.
$result = spStr($str); 
// Dump the token list.
print_r($result); 
   
/**
 * Bigram (two-character) segmentation of UTF-8 Chinese text.
 *
 * Steps:
 *  1. ASCII and full-width punctuation is replaced by spaces.
 *  2. Runs of Latin letters and runs of digits are collected as whole tokens.
 *  3. The remaining text is cut into whitespace-separated segments; every
 *     segment yields its overlapping two-character pairs. A segment made of
 *     a single character is emitted as-is.
 *
 * Characters are split with the PCRE "u" modifier, so 2-, 3- and 4-byte
 * UTF-8 sequences are all handled (no fixed 3-byte assumption).
 *
 * @param string $str UTF-8 encoded text.
 *
 * @return array Tokens in this order: digit runs, Latin-letter runs, Chinese
 *               bigrams. If $str is not valid UTF-8 no bigrams are produced.
 */
function spStr($str)
{
    $str = (string) $str;

    // Separators. The escaped entries (backslash, double quote, tab, newline,
    // carriage return, dollar sign) and the full-width forms in the second
    // group are reconstructed from a listing whose escapes were lost.
    $search = array(
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')', '?', '-',
        "\t", "\n", "'", '<', '>', "\r", '$', '&', '%', '#', '@', '+', '=',
        '{', '}', '[', ']',
        ':', ')', '(', '.', '。', ',', '!', ';', '“', '”', '‘', '’',
        '[', ']', '、', '—', '　', '《', '》', '-', '…', '【', '】',
    );
    $str = str_replace($search, ' ', $str);

    // Latin words and numbers are kept whole (byte-wise match is safe: ASCII only).
    preg_match_all('/[a-zA-Z]+/', $str, $estr);
    preg_match_all('/[0-9]+/', $str, $nstr);

    // Drop them from the text so only the non-ASCII segments remain.
    $str = preg_replace('/[0-9a-zA-Z]+/', ' ', $str);

    // preg_split() returns false when the subject is not valid UTF-8.
    $segments = preg_split('/\s+/u', trim((string) $str), -1, PREG_SPLIT_NO_EMPTY);
    if ($segments === false) {
        $segments = array();
    }

    $cstr = array();
    foreach ($segments as $segment) {
        $chars = preg_split('//u', $segment, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            continue;
        }

        $count = count($chars);
        if ($count === 1) {
            // A lone character cannot form a pair; keep it as its own token.
            $cstr[] = $chars[0];
            continue;
        }

        for ($i = 0; $i + 1 < $count; $i++) {
            // Only pair up with a multi-byte follower, so leftover ASCII
            // symbols never end up inside a bigram.
            if (strlen($chars[$i + 1]) > 1) {
                $cstr[] = $chars[$i] . $chars[$i + 1];
            }
        }
    }

    return array_merge($nstr[0], $estr[0], $cstr);
}
?>

结果是:
Array ( [0] => 200 [1] => www [2] => jb [3] => com [4] => 这是 [5] => 是我 [6] => 我的 [7] => 的网 [8] => 网站 )

如果要将以上结果转换为区位码,可以使用如下代码:

<?php
// Convert every token in $result (UTF-8) to its GB2312 zone/position code
// and print the codes separated by single spaces.
$code = array(); // initialise so an empty $result still gives implode() an array
foreach ($result as $s) {
    $gb = iconv('UTF-8', 'GB2312', $s);
    // iconv() returns false when a character has no GB2312 mapping;
    // in that case the token is kept unchanged instead of being encoded.
    $code[] = ($gb === false) ? $s : gbCode($gb);
}
$code = implode(" ", $code);
echo $code;
   
/**
 * Convert a GB2312-encoded string into its zone/position code (区位码).
 *
 * Each GB2312 character is two bytes in the range 0xA1-0xFE; subtracting
 * 160 (0xA0) from each byte gives the zone and position numbers, which are
 * written as two zero-padded two-digit decimals (four digits per character).
 *
 * @param string $str GB2312-encoded bytes.
 *
 * @return string The concatenated codes, or $str unchanged when it is not
 *                made up entirely of complete GB2312 double-byte characters
 *                (e.g. plain ASCII tokens).
 */
function gbCode($str) {
    $str = (string) $str;

    // Require whole byte pairs within the GB2312 range; \z (unlike $) does
    // not tolerate a trailing newline.
    if (!preg_match('/^(?:[\xa1-\xfe]{2})+\z/', $str)) {
        return $str;
    }

    $return = '';
    $len = strlen($str);
    for ($i = 0; $i < $len; $i += 2) {
        $return .= sprintf("%02d%02d", ord($str[$i]) - 160, ord($str[$i + 1]) - 160);
    }

    return $return;
}
?>