一聚教程网:一个值得你收藏的教程网站

最新下载

热门教程

[推荐]jsp中文分词程序

时间:2011-11-12 编辑:简简单单 来源:一聚教程网

 代码如下:
publicclass MM2
{
 privatestaticfinal Log log = LogFactory.getLog(MM2.class);
 
 privatestatic HashMap dictionary =null;
 privatestaticfinalint WORD_MAX_LENGTH =9;
 private Reader reader;
 
 static
 {
 loadDictionary();
 }
 
 public MM2(Reader reader)
 {
 this.reader = reader;
 }
 
 //切分出由中文、字母、数字组成的句子
public ArrayList getSentence() throws IOException
 {
 ArrayList list=new ArrayList();
 StringBuffer cb=new StringBuffer();
 int d=reader.read();
 int offset=0;
 boolean b=false;
 while(d>-1)
 {
 int type=Character.getType(d);
 if(type==2|| type==9|| type==5)
 {
 d=toAscii(d);
 cb.append((char)d);
 }
 else
 {
 b=true;
 }
 d=reader.read();
 if(d==-1|| b)
 {
 if(d==-1) offset++;
 b=false;
 char[] ioBuffer =newchar[cb.length()];
 cb.getChars(0, cb.length(), ioBuffer, 0);
 Sentence sen=new Sentence(ioBuffer,offset-cb.length());
 list.add(sen);
 cb.setLength(0);
 }
 offset++;
 }
 return list;
 }
 
 //将句子切分出词
public ArrayList getToken(ArrayList list) throws IOException
 {
 ArrayList tokenlist=new ArrayList();
 for(Sentence sen:list)
 {
 StringBuffer word =new StringBuffer();
 int offset=sen.getStartOffset();
 int bufferIndex =0;
 char c;
 boolean b=false;
 while(bufferIndex  {
 offset++;
 c=sen.getText()[bufferIndex++];
 if(word.length()==0)
 word.append(c);
 else
 {
 String temp = (word.toString() + c).intern();
 if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
 word.append(c);
 elseif(dictionary.containsKey(temp) && bufferIndex  word.append(c);
 else
 {
 bufferIndex--;
 offset--;
 while(word.length()>1&& dictionary.get(word.toString())!=null&& dictionary.get(word.toString())==2)
 {
 word.deleteCharAt(word.length()-1);
 bufferIndex--;
 offset--;
 }
 b=true;
 }
 }
 if(b || bufferIndex==sen.getText().length)
 {
 Token token =new Token(word.toString(),offset-word.length(),offset,"word");
 word.setLength(0);
 tokenlist.add(token);
 b=false;
 }
 }
 }
 return tokenlist;
 }
 
 //将相连的单个英文或数字组合成词
public ArrayList getNewToken(ArrayList list) throws IOException
 {
 ArrayList tokenlist=new ArrayList();
 Token word=null;
 for(int i=0;i  {
 Token t=list.get(i);
 if(t.getWord().length()==1&& Character.getType((int)t.getWord().charAt(0))!=5)
 {
 if(word==null)
 word=t;
 elseif(word.getEnd()==t.getStart())
 {
 word.setEnd(t.getEnd());
 word.setWord(word.getWord()+t.getWord());
 }
 else
 {
 tokenlist.add(word);
 word=t;
 }
 }
 elseif(word!=null)
 {
 tokenlist.add(word);
 word=null;
 tokenlist.add(t);
 }
 else
 tokenlist.add(t);
 }
 if(word!=null)
 tokenlist.add(word);
 return tokenlist;
 }
 
 //双角转单角
publicstaticint toAscii(int codePoint)
 {
 if((codePoint>=65296&& codePoint<=65305) //0-9
|| (codePoint>=65313&& codePoint<=65338) //A-Z
|| (codePoint>=65345&& codePoint<=65370) //a-z
 )
 {
 codePoint -=65248;
 }
 return codePoint;
 }
 
 //加载词典
publicstaticvoid loadDictionary()
 {
 if (dictionary ==null)
 {
 dictionary =new HashMap();
 InputStream is =null;
 BufferedReader br =null;
 try
 {
 is =new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
 br =new BufferedReader(new InputStreamReader(is, "UTF-8"));
 String word =null;
 while ((word = br.readLine()) !=null)
 {
 word=word.toLowerCase();
 if ((word.indexOf("#") ==-1) && (word.length() <= WORD_MAX_LENGTH))
 {
 dictionary.put(word.intern(), 1);
 int i = word.length()-1;
 while(i >=2)
 {
 String temp = word.substring(0, i).intern();
 if (!dictionary.containsKey(temp))
 dictionary.put(temp,2);
 i--;
 }
 }
 }
 }
 catch (Exception e)
 {
 log.info(e);
 }
 finally
 {
 try
 {
 if(br!=null)
 br.close();
 if(is!=null)
 is.close();
 }
 catch (IOException e)
 {
 log.info(e);
 }
 }
 }
 }
 
 publicstatic String[] segWords(Reader input)
 {
 ArrayList list=new ArrayList();
 try
 {
 MM2 f=new MM2(input);
 ArrayList tlist= f.getNewToken(f.getToken(f.getSentence()));
 for(Token t:tlist)
 {
 list.add(t.getWord());
 }
 }
 catch(IOException e)
 {
 log.info(e);
 }
 return (String[])list.toArray(new String[0]);
 }
 
 publicstaticvoid main(String[] args)
 {
 String[] cc=MM2.segWords(new StringReader("ibm商务机t60p".toLowerCase()));
 for(String c:cc)
 {
 System.out.println(c);
 }
 }
}

热门栏目