最新下载
热门教程
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
[推荐]jsp中文分词程序
时间:2011-11-12 编辑:简简单单 来源:一聚教程网
代码如下 | 复制代码 |
publicclass MM2 { privatestaticfinal Log log = LogFactory.getLog(MM2.class); privatestatic HashMap privatestaticfinalint WORD_MAX_LENGTH =9; private Reader reader; static { loadDictionary(); } public MM2(Reader reader) { this.reader = reader; } //切分出由中文、字母、数字组成的句子 public ArrayList { ArrayList StringBuffer cb=new StringBuffer(); int d=reader.read(); int offset=0; boolean b=false; while(d>-1) { int type=Character.getType(d); if(type==2|| type==9|| type==5) { d=toAscii(d); cb.append((char)d); } else { b=true; } d=reader.read(); if(d==-1|| b) { if(d==-1) offset++; b=false; char[] ioBuffer =newchar[cb.length()]; cb.getChars(0, cb.length(), ioBuffer, 0); Sentence sen=new Sentence(ioBuffer,offset-cb.length()); list.add(sen); cb.setLength(0); } offset++; } return list; } //将句子切分出词 public ArrayList { ArrayList for(Sentence sen:list) { StringBuffer word =new StringBuffer(); int offset=sen.getStartOffset(); int bufferIndex =0; char c; boolean b=false; while(bufferIndex offset++; c=sen.getText()[bufferIndex++]; if(word.length()==0) word.append(c); else { String temp = (word.toString() + c).intern(); if(dictionary.containsKey(temp) && dictionary.get(temp)==1) word.append(c); elseif(dictionary.containsKey(temp) && bufferIndex else { bufferIndex--; offset--; while(word.length()>1&& dictionary.get(word.toString())!=null&& dictionary.get(word.toString())==2) { word.deleteCharAt(word.length()-1); bufferIndex--; offset--; } b=true; } } if(b || bufferIndex==sen.getText().length) { Token token =new Token(word.toString(),offset-word.length(),offset,"word"); word.setLength(0); tokenlist.add(token); b=false; } } } return tokenlist; } //将相连的单个英文或数字组合成词 public ArrayList { ArrayList Token word=null; for(int i=0;i Token t=list.get(i); if(t.getWord().length()==1&& Character.getType((int)t.getWord().charAt(0))!=5) { if(word==null) word=t; elseif(word.getEnd()==t.getStart()) { word.setEnd(t.getEnd()); word.setWord(word.getWord()+t.getWord()); } else { tokenlist.add(word); word=t; } } elseif(word!=null) { tokenlist.add(word); word=null; tokenlist.add(t); } else tokenlist.add(t); } if(word!=null) tokenlist.add(word); return tokenlist; } //双角转单角 publicstaticint toAscii(int codePoint) { if((codePoint>=65296&& codePoint<=65305) //0-9 || (codePoint>=65313&& codePoint<=65338) //A-Z || (codePoint>=65345&& codePoint<=65370) //a-z ) { codePoint -=65248; } return codePoint; } //加载词典 publicstaticvoid loadDictionary() { if (dictionary ==null) { dictionary =new HashMap InputStream is =null; BufferedReader br =null; try { is =new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI())); br =new BufferedReader(new InputStreamReader(is, "UTF-8")); String word =null; while ((word = br.readLine()) !=null) { word=word.toLowerCase(); if ((word.indexOf("#") ==-1) && (word.length() <= WORD_MAX_LENGTH)) { dictionary.put(word.intern(), 1); int i = word.length()-1; while(i >=2) { String temp = word.substring(0, i).intern(); if (!dictionary.containsKey(temp)) dictionary.put(temp,2); i--; } } } } catch (Exception e) { log.info(e); } finally { try { if(br!=null) br.close(); if(is!=null) is.close(); } catch (IOException e) { log.info(e); } } } } publicstatic String[] segWords(Reader input) { ArrayList try { MM2 f=new MM2(input); ArrayList for(Token t:tlist) { list.add(t.getWord()); } } catch(IOException e) { log.info(e); } return (String[])list.toArray(new String[0]); } publicstaticvoid main(String[] args) { String[] cc=MM2.segWords(new StringReader("ibm商务机t60p".toLowerCase())); for(String c:cc) { System.out.println(c); } } } |
-
上一个: jsp中利用servlet实现图片上传
-
下一个: java去除字符串空格几种做法
相关文章
- SpringBoot测试配置属性与web启动环境解析 10-24
- vue中将el-switch值true、false改为number类型的1和0解析 10-24
- Vue中的路由配置项meta使用解读 10-24
- SpringBoot自定义bean绑定解析 10-24
- SpringBoot常用计量与bean属性校验和进制数据转换规则解析 10-24
- 工厂方法在Spring框架中的运用介绍 10-24