Zend Framework — 中文分词 Analyzer
Tuesday, April 29th, 2008
最近在看 Zend_Search_Lucene, 查阅了一些资料.
这篇文章写得不错, 推荐一下. 估计已经有很多人看过了. 但是, 其中的例子有点小问题, 需要修改一下才能跑起来. 我这里简单修改了一下, 放到附件中, 供以后需要的朋友下载:
analyzer code.
附录代码(请不要直接使用, 请下载上面的zip中的code使用):
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
 * Simple analyzer for mixed ASCII / Chinese (UTF-8) text.
 *
 * Runs of ASCII alphanumeric bytes become one token each. Runs of non-ASCII
 * characters are emitted as overlapping two-character tokens (bigrams), so
 * "ABC" (three multi-byte characters) yields "AB" and "BC". A non-ASCII
 * character that has no non-ASCII successor produces no token of its own.
 *
 * Punctuation and the configured Chinese stop words are replaced by a single
 * space in reset(), so token offsets refer to byte positions in that cleaned
 * input, not in the string originally passed to setInput().
 */
class Simple_Chinese_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current byte offset into $this->_input.
     *
     * @var int
     */
    private $_position;

    /**
     * Strings that reset() blanks out of the input before tokenizing.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the stop words that reset() removes from the input.
     *
     * Must be called before setInput() to affect that input, because the
     * replacement happens in reset().
     *
     * @param array $cnStopWords list of strings to blank out
     * @return void
     */
    public function setCnStopWords($cnStopWords)
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Rewind the token stream and replace separators / stop words by spaces.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input === null) {
            // Nothing to clean yet; nextToken() reports an empty stream.
            return;
        }

        // "\t" and "\n" were plain "t" and "n" before, which blanked those
        // letters out of every word; the neighbouring "\r" shows that escape
        // sequences were intended.
        // NOTE(review): the repeated ASCII entries in the second half of the
        // list look like full-width punctuation that was normalised to ASCII
        // at some point - confirm against the original list.
        $search = array(
            ",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^",
            "(", ")", "?", "-", "\t", "\n", "'", "<", ">", "\r", "\r\n",
            "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]", ":",
            ")", "(", ".", "。", ",", "!", ";", "“", "”", "‘", "’",
            "[", "]", "、", "—", " ", "《", "》", "-", "…", "【", "】");

        $this->_input = str_replace($search, ' ', $this->_input);
        $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
    }

    /**
     * Return the next token of the stream.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null null when the input is exhausted
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $length = strlen($this->_input);

        while ($this->_position < $length) {
            // Skip the spaces left behind by reset().
            while ($this->_position < $length
                && $this->_input[$this->_position] === ' ') {
                $this->_position++;
            }
            if ($this->_position >= $length) {
                // Only separators were left; avoid reading past the end.
                return null;
            }

            $termStartPosition = $this->_position;
            $isCnWord          = false;
            $lastCharLength    = 0;

            if (ord($this->_input[$this->_position]) > 127) {
                // Non-ASCII run: consume at most two characters (a bigram).
                $i = 0;
                while ($this->_position < $length
                    && ord($this->_input[$this->_position]) > 127) {
                    $lastCharLength   = $this->_charLength($this->_position);
                    $this->_position += $lastCharLength;
                    $i++;
                    if ($i == 2) {
                        $isCnWord = true;
                        break;
                    }
                }
                if ($i == 1) {
                    // A lone character (or the tail of a run that the
                    // previous bigram already covered) yields no token.
                    continue;
                }
            } else {
                while ($this->_position < $length
                    && ctype_alnum($this->_input[$this->_position])) {
                    $this->_position++;
                }
                if ($this->_position == $termStartPosition) {
                    // ASCII byte that is neither alphanumeric nor a known
                    // separator (e.g. "_" or "*"): step over it instead of
                    // ending the whole stream.
                    $this->_position++;
                    continue;
                }
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                substr($this->_input,
                       $termStartPosition,
                       $this->_position - $termStartPosition),
                $termStartPosition,
                $this->_position);
            $token = $this->normalize($token);

            if ($isCnWord) {
                // Step back over the second character so that it starts
                // the next bigram.
                $this->_position -= $lastCharLength;
            }

            // normalize() returns null when a filter drops the token.
            if ($token !== null) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Byte length of the UTF-8 sequence starting at the given offset.
     *
     * The length is taken from the lead byte and clamped to the bytes that
     * remain in the input; a byte that is not a valid lead byte counts as a
     * one-byte character so that scanning always advances.
     *
     * @param int $offset byte offset into $this->_input
     * @return int number of bytes (1 to 4)
     */
    private function _charLength($offset)
    {
        $lead = ord($this->_input[$offset]);

        if ($lead >= 0xC0 && $lead <= 0xDF) {
            $charLength = 2;
        } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
            $charLength = 3;
        } elseif ($lead >= 0xF0 && $lead <= 0xF7) {
            $charLength = 4;
        } else {
            $charLength = 1;
        }

        return min($charLength, strlen($this->_input) - $offset);
    }
}
// Demo: tokenize a mixed English / Chinese sentence and dump the tokens.

// English stop words are dropped by a token filter after tokenizing.
$stopWords = array('a', 'an', 'at', 'the', 'and', 'or', 'is', 'am');
$stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWords);

// Chinese stop words are blanked out of the input by the analyzer itself,
// so they have to be set before setInput() is called.
$analyzer = new Simple_Chinese_Analyzer();
$cnStopWords = array('的');
$analyzer->setCnStopWords($cnStopWords);
$analyzer->addFilter($stopWordsFilter);

$value = 'this is (a test)【中文】的测试, 长春市长春药店';
$analyzer->setInput($value, 'utf-8');

// Start from an empty list so print_r() has a defined variable even when the
// analyzer produces no tokens.
$tokens = array();
while (($token = $analyzer->nextToken()) !== null) {
    $tokens[] = $token;
}
print_r($tokens);
?>
