先准备两个文件:
1、prototype.js
http://prototype.conio.net/,用于ajax加载汉字字库
2、汉字字库
http://cn.minidx.com/index.php?option=com_docman&task=doc_download&gid=47
完整代码:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>中英文混合分词-www.jb200.com</title>
<script type="text/javascript" src="prototype.js"></script>
<script type="text/javascript">
/*
 * Mixed Chinese/English word segmentation.
 *
 * English runs (\w+) are emitted as single tokens. For any other character
 * the dictionary is searched for entries beginning with that character and
 * the longest entry that is a prefix of the remaining text is emitted
 * (forward maximum matching). Characters with no dictionary entry are
 * emitted on their own.
 *
 * Dictionary format: one word per line, lines separated by "\r\n", with a
 * line break before the first entry (lookups search for "\r\n" + character).
 */

// Raw dictionary text; stays empty until the Ajax request below succeeds.
var dict = "";
// Not referenced anywhere in this page; retained so existing globals are unchanged.
var lastword = "";

// Load the dictionary via Ajax. onSuccess (rather than onComplete) is used so
// that the body of a failed response is never stored as the dictionary.
new Ajax.Request('js/dict1.txt', {
  onSuccess: function (response) {
    dict = response.responseText + "";
  }
});

// Tokens produced by the most recent divide() run.
var rs = [];

/**
 * Collects every dictionary entry that starts with the given character.
 *
 * @param {string} firstChar Leading character (or surrogate pair) to look up.
 * @returns {Array} Matching entries with all whitespace removed.
 */
function findDictWords(firstChar) {
  var words = [];
  var marker = '\r\n' + firstChar;
  var start = dict.indexOf(marker);
  var end;
  while (start !== -1) {
    // Skip the two characters of the leading "\r\n" before looking for the
    // line break that terminates this entry.
    end = dict.indexOf('\r\n', start + 2);
    if (end === -1) {
      // Last entry has no trailing line break: read to the end of the dictionary.
      end = dict.length;
    }
    words.push(dict.substring(start, end).replace(/\s/g, ""));
    start = dict.indexOf(marker, end);
  }
  return words;
}

/**
 * Splits text into tokens and appends them to the global `rs` array.
 *
 * @param {string} text Text to segment.
 * @returns {boolean} Always true once the whole text has been consumed.
 */
function divide(text) {
  var first, code, next, run, words, best, i;
  // Iterative rather than one recursive call per token, so long input
  // cannot exhaust the call stack.
  while (text.length > 0) {
    first = text.charAt(0);

    // Whitespace only separates tokens; it is never emitted.
    if (/\s/.test(first)) {
      text = text.substring(1);
      continue;
    }

    // English: take the whole run of word characters as one token.
    if (/\w/.test(first)) {
      run = /^\w+/.exec(text)[0];
      rs.push(run);
      text = text.substring(run.length);
      continue;
    }

    // Keep a surrogate pair together so a character outside the BMP is
    // not split into two broken halves.
    code = text.charCodeAt(0);
    if (code >= 0xD800 && code <= 0xDBFF && text.length > 1) {
      next = text.charCodeAt(1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        first = text.substring(0, 2);
      }
    }

    // Pick the longest dictionary word that the text actually starts with.
    // A word that merely occurs somewhere later in the text must not be used.
    words = findDictWords(first);
    best = first; // fallback for characters missing from the dictionary
    for (i = 0; i < words.length; i++) {
      if (words[i].length > best.length && text.indexOf(words[i]) === 0) {
        best = words[i];
      }
    }
    rs.push(best);
    text = text.substring(best.length);
  }
  return true;
}

/**
 * Segments the value of the #word input and shows the comma-separated
 * tokens in #dividresult.
 */
function dodivde() {
  var output = $('dividresult');
  rs = [];
  divide($('word').value);
  // Insert the result as a text node so user-typed markup is displayed
  // literally instead of being parsed as HTML.
  while (output.firstChild) {
    output.removeChild(output.firstChild);
  }
  output.appendChild(document.createTextNode(rs.join(',')));
}
</script>
</head>
<body>
<input type="text" name="word" id="word" value="我Welcome欢迎to Mozilla Firefox Help" onblur="dodivde();" />
<input name="do" type="button" value="DO IT" onclick="dodivde();" />
<span id="dividresult"></span>
</body>
</html>
注意:
如果自己准备字库的话,请注意字库中首尾要有换行,换行必须是 \r\n,编码必须是 UTF-8。
以上的实现,关键是词库,有了好词库,事情就成功了一半。