最新下载
热门教程
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
RMM 分词算法代码片段
时间:2010-03-09 编辑:简简单单 来源:一聚教程网
/**
 * Segment a string into words, walking its space-separated fragments from
 * the last one to the first and prepending each piece to $this->ResultString.
 *
 * (RMM presumably stands for "reverse maximum matching"; the per-fragment
 * dictionary matching is delegated to $this->RunRMM(), not visible here.)
 *
 * Handling per fragment:
 *  - fragments for which NotGBK() is true: a purely numeric one
 *    (digits . + -) is glued to the following word when that word starts
 *    with a common unit; every other one is emitted as-is;
 *  - fragments starting with the book-title mark or a two-byte code in
 *    (0xA13F, 0xAA40): emitted as-is;
 *  - other fragments no longer than $this->SplitLen bytes: a trailing
 *    "special" word is split off, and a fragment starting with a common
 *    unit is merged with the fragment before it;
 *  - longer fragments: segmented by $this->RunRMM().
 *
 * NOTE(review): $this->ResultString is only ever prepended to here; it is
 * assumed to be reset elsewhere (e.g. in SetSource()) - confirm.
 *
 * @param string $str Text to segment; when empty, the source already set
 *                    on the object is used.
 * @return string The accumulated $this->ResultString, pieces separated by
 *                $this->SplitChar, or "" when there is no source text.
 */
function SplitRMM($str=""){
    if($str!="") $this->SetSource(trim($str));
    if($this->SourceString=="") return "";

    // Rough pre-segmentation of the text.
    $this->SourceString = $this->ReviseString($this->SourceString);

    // Split the revised text into fragments for individual handling.
    $spwords = explode(" ",$this->SourceString);
    $spLen = count($spwords);
    $spc = $this->SplitChar;

    // ereg()/ereg_replace() were removed in PHP 7, so the patterns are
    // expressed as PCRE. No "u" modifier: matching stays byte-wise, as it
    // was with ereg(). "#" is the delimiter and is escaped inside the
    // property values. The (?:...) group makes the anchor apply to every
    // alternative when the property is an alternation such as "a|b|c"
    // (assumed format - TODO confirm against the property definitions);
    // without it only one alternative would be anchored.
    $unitPattern = "#^(?:".addcslashes($this->CommonUnit,"#").")#";
    // "D" makes "$" match only at the very end of the subject.
    $especialPattern = "#(?:".addcslashes($this->EspecialChar,"#").")$#D";

    for($i=($spLen-1);$i>=0;$i--){
        if(trim($spwords[$i])=="") continue;

        if($this->NotGBK($spwords[$i])){
            if(preg_match("/[^0-9.+-]/",$spwords[$i])){
                // Contains something other than digits . + - : emit unchanged.
                $this->ResultString = $spwords[$i].$spc.$this->ResultString;
            }else{
                // Purely numeric: look at the word already at the head of the
                // result (up to the first space) to see if it begins with a unit.
                // NOTE(review): the separator searched for is a literal " ",
                // not $spc - kept as in the original.
                $spacePos = strpos($this->ResultString," ");
                $nextword = ($spacePos===false) ? "" : substr($this->ResultString,0,$spacePos);
                if(preg_match($unitPattern,(string)$nextword)){
                    // Number followed by a unit: join without a separator.
                    $this->ResultString = $spwords[$i].$this->ResultString;
                }else{
                    $this->ResultString = $spwords[$i].$spc.$this->ResultString;
                }
            }
            continue;
        }

        // First two bytes of the fragment, also read as a 16-bit code.
        // substr() avoids the undefined-offset notice on a 1-byte fragment.
        $c = substr($spwords[$i],0,2);
        $n = hexdec(bin2hex($c));

        if($c=="《"){
            // Book-title mark: emit unchanged.
            $this->ResultString = $spwords[$i].$spc.$this->ResultString;
        }else if($n>0xA13F && $n < 0xAA40){
            // Code in the range the original labels as punctuation: emit unchanged.
            $this->ResultString = $spwords[$i].$spc.$this->ResultString;
        }else if(strlen($spwords[$i]) <= $this->SplitLen){
            // Short phrase. If it ends with a special separator word,
            // split that word off the end.
            if(preg_match($especialPattern,$spwords[$i],$regs)){
                // The match is anchored at the end, so stripping it is a
                // plain length cut; no second regex built from $regs[0].
                $stem = substr($spwords[$i],0,strlen($spwords[$i])-strlen($regs[0]));
                $spwords[$i] = $stem.$spc.$regs[0];
            }
            // Does the fragment start with a common unit?
            if(!preg_match($unitPattern,$spwords[$i]) || $i==0){
                $this->ResultString = $spwords[$i].$spc.$this->ResultString;
            }else{
                // Merge with the preceding fragment and skip over it.
                $this->ResultString = $spwords[$i-1].$spwords[$i].$spc.$this->ResultString;
                $i--;
            }
        }else{
            // Longer phrase: hand it to the RMM segmenter.
            $this->ResultString = $this->RunRMM($spwords[$i]).$spc.$this->ResultString;
        }
    }
    return $this->ResultString;
}
-
上一个: php 文件夹与文件移动类
-
下一个: 把全角数字转为半角数字
相关文章
- PHP导出数据超时的优化建议解读 10-31
- PHP之mysql位运算解析 10-31
- Laravel实现登录跳转功能解析 10-31
- php双向队列解读 10-31
- Laravel异常上下文解决教程 10-24
- php数组查询元素位置方法介绍 10-24