最新下载
热门教程
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
java获取字符串编码函数
时间:2011-05-03 编辑:简简单单 来源:一聚教程网
encoding.java
package org.loon.test.encoding;/** *//**
*
* title: loonframework
*
*
* description:编码基本类型集合
*
*
* copyright: copyright (c) 2008
*
*
* company: loonframework
*
*
* license: http://www.apache.org/licenses/license-2.0
*
*
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class encoding {

    // Supported character encodings. Each value is also the index of that
    // encoding's entry in javaname / nicename / htmlname.
    public static final int gb2312 = 0;
    public static final int gbk = 1;
    public static final int big5 = 2;
    public static final int utf8 = 3;
    public static final int unicode = 4;
    public static final int euc_kr = 5;
    public static final int sjis = 6;
    public static final int euc_jp = 7;
    public static final int ascii = 8;
    public static final int unknown = 9;

    /** Number of encoding codes above; the length of every name table. */
    public static final int totalt = 10;

    public final static int simp = 0;
    public final static int trad = 1;

    /** Names used when resolving an encoding by name, indexed by encoding code. */
    public static String[] javaname = new String[totalt];

    /** Human-readable names, indexed by encoding code. */
    public static String[] nicename = new String[totalt];

    /** Character-set names as used in HTML, indexed by encoding code. */
    public static String[] htmlname = new String[totalt];

    // The tables are static, so they are filled once at class-load time.
    // Previously they were allocated inside the instance constructor, which
    // left them null until the first instance was created and re-created
    // them on every construction.
    static {
        javaname[gb2312] = "gb2312";
        javaname[gbk] = "gbk";
        javaname[big5] = "big5";
        javaname[utf8] = "utf8";
        javaname[unicode] = "unicode";
        javaname[euc_kr] = "euc_kr";
        javaname[sjis] = "sjis";
        javaname[euc_jp] = "euc_jp";
        javaname[ascii] = "ascii";
        javaname[unknown] = "iso8859_1";

        // HTML charset names
        htmlname[gb2312] = "gb2312";
        htmlname[gbk] = "gbk";
        htmlname[big5] = "big5";
        htmlname[utf8] = "utf-8";
        htmlname[unicode] = "utf-16";
        htmlname[euc_kr] = "euc-kr";
        htmlname[sjis] = "shift_jis";
        htmlname[euc_jp] = "euc-jp";
        htmlname[ascii] = "ascii";
        htmlname[unknown] = "iso8859-1";

        // Readable names
        nicename[gb2312] = "gb-2312";
        nicename[gbk] = "gbk";
        nicename[big5] = "big5";
        nicename[utf8] = "utf-8";
        nicename[unicode] = "unicode";
        nicename[euc_kr] = "euc-kr";
        nicename[sjis] = "shift-jis";
        nicename[euc_jp] = "euc-jp";
        nicename[ascii] = "ascii";
        nicename[unknown] = "unknown";
    }

    /** Creates an instance; the name tables are already populated statically. */
    public encoding() {
    }

    /**
     * Returns the three names of an encoding joined by commas, in the order
     * javaname, nicename, htmlname.
     *
     * @param type one of the encoding codes declared in this class
     * @return the interned string "javaname,nicename,htmlname"
     * @throws ArrayIndexOutOfBoundsException if type is not a valid table index
     */
    public String toencoding(final int type) {
        return (javaname[type] + "," + nicename[type] + "," + htmlname[type])
                .intern();
    }
}
// encode.java (omitted here; see the original source)
parseencoding.java
package org.loon.test.encoding;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

/**
*
* title: loonframework
*
*
* description:
*
*
* copyright: copyright (c) 2008
*
*
* company: loonframework
*
*
* license: http://www.apache.org/licenses/license-2.0
*
*
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class parseencoding extends encode ...{public parseencoding() ...{
super();
gb2312format = new int[94][94];
gbkformat = new int[126][191];
big5format = new int[94][158];
euc_krformat = new int[94][94];
jpformat = new int[94][94];// 初始化编码格式
init();
}public string getencoding(final string path) ...{
return check(getencodevalue(path));
}public string getencoding(final inputstream in) ...{
return check(getencodevalue(in));
}public string getencoding(final byte[] buffer) ...{
return check(getencodevalue(buffer));
}public string getencoding(final url url) ...{
return check(getencodevalue(url));
}private string check(final int result) ...{
if (result == -1) ...{
return nicename[unknown];
}
return nicename[result];
}/** *//**
* 解析指定字符串路径编码所用格式
*
* @param path
* @return
*/
private int getencodevalue(string path) ...{
int express = unknown;
if (path.startswith("http://")) ...{
try ...{
express = getencodevalue(new url(path));
} catch (malformedurlexception e) ...{
express = -1;
}
} else ...{
express = getencodevalue(new file(path));
}
return express;
}/** *//**
*
* 解析指定inputstream所用编码,返回或然率最高的编码类型数值
*
* @param in
* @return
*/
public int getencodevalue(inputstream in) ...{
byte[] rawtext = new byte[8192];
int bytesread = 0, byteoffset = 0;
int express = unknown;
inputstream stream = in;
try ...{
while ((bytesread = stream.read(rawtext, byteoffset, rawtext.length
- byteoffset)) > 0) ...{
byteoffset += bytesread;
}
;
stream.close();
express = getencodevalue(rawtext);
} catch (exception e) ...{
express = -1;
}
return express;
}/** *//**
* 解析指定url下数据所用编码,返回或然率最高的编码类型数值
*
* @param url
* @return
*/
public int getencodevalue(url url) ...{inputstream stream;
try ...{
stream = url.openstream();
} catch (ioexception e) ...{
stream = null;
}return getencodevalue(stream);
}/** *//**
* 解析指定file所用编码,返回或然率最高的编码类型数值
*
* @param file
* @return
*/
public int getencodevalue(file file) ...{
byte[] buffer;
try ...{
buffer = read(new fileinputstream(file));
} catch (filenotfoundexception e) ...{
buffer = null;
}
return getencodevalue(buffer);
}/** *//**
* 将inputstream转为byte[]
*
* @param inputstream
* @return
*/
private final byte[] read(final inputstream inputstream) ...{
byte[] arraybyte = null;
bytearrayoutputstream bytearrayoutputstream = new bytearrayoutputstream();
byte[] bytes = new byte[8192];
try ...{
bytes = new byte[inputstream.available()];
int read;
while ((read = inputstream.read(bytes)) >= 0) ...{
bytearrayoutputstream.write(bytes, 0, read);
}
arraybyte = bytearrayoutputstream.tobytearray();
} catch (ioexception e) ...{
return null;
}
return arraybyte;
}/** *//**
* 解析指定byte[]所用编码,返回或然率最高的数值类型
*
* @param content
* @return
*/
public int getencodevalue(byte[] content) ...{
if (content == null)
return -1;
int[] scores;
int index, maxscore = 0;
int encoding = unknown;
scores = new int[totalt];
// 分配或然率
scores[gb2312] = gb2312probability(content);
scores[gbk] = gbkprobability(content);
scores[big5] = big5probability(content);
scores[utf8] = utf8probability(content);
scores[unicode] = utf16probability(content);
scores[euc_kr] = euc_krprobability(content);
scores[ascii] = asciiprobability(content);
scores[sjis] = sjisprobability(content);
scores[euc_jp] = euc_jpprobability(content);
scores[unknown] = 0;// 概率比较
for (index = 0; index < totalt; index++) ...{
if (scores[index] > maxscore) ...{
// 索引
encoding = index;
// 最大几率
maxscore = scores[index];
}
}
// 返回或然率大于50%的数据
if (maxscore <= 50) ...{
encoding = unknown;
}
return encoding;
}/** *//**
* gb2312数据或然率计算
*
* @param content
* @return
*/
private int gb2312probability(byte[] content) ...{
int i, rawtextlen = 0;int dbchars = 1, gbchars = 1;
long gbformat = 0, totalformat = 1;
float rangeval = 0, formatval = 0;
int row, column;// 检查是否在亚洲汉字范围内
rawtextlen = content.length;
for (i = 0; i < rawtextlen - 1; i++) ...{
if (content[i] >= 0) ...{
} else ...{
dbchars++;
// 汉字gb码由两个字节组成,每个字节的范围是0xa1 ~ 0xfe
if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xf7
&& (byte) 0xa1 <= content[i + 1]
&& content[i + 1] <= (byte) 0xfe) ...{
gbchars++;
totalformat += 500;
row = content[i] + 256 - 0xa1;
column = content[i + 1] + 256 - 0xa1;
if (gb2312format[row][column] != 0) ...{
gbformat += gb2312format[row][column];
} else if (15 <= row && row < 55) ...{
// 在gb编码范围
gbformat += 200;
}}
i++;
}
}
rangeval = 50 * ((float) gbchars / (float) dbchars);
formatval = 50 * ((float) gbformat / (float) totalformat);return (int) (rangeval + formatval);
}/** *//**
* gb2312或然率计算
*
* @param content
* @return
*/
private int gbkprobability(byte[] content) ...{
int i, rawtextlen = 0;int dbchars = 1, gbchars = 1;
long gbformat = 0, totalformat = 1;
float rangeval = 0, formatval = 0;
int row, column;
rawtextlen = content.length;
for (i = 0; i < rawtextlen - 1; i++) ...{
if (content[i] >= 0) ...{
} else ...{
dbchars++;
if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xf7
&& // gb范围
(byte) 0xa1 <= content[i + 1]
&& content[i + 1] <= (byte) 0xfe) ...{
gbchars++;
totalformat += 500;
row = content[i] + 256 - 0xa1;
column = content[i + 1] + 256 - 0xa1;
if (gb2312format[row][column] != 0) ...{
gbformat += gb2312format[row][column];
} else if (15 <= row && row < 55) ...{
gbformat += 200;
}} else if ((byte) 0x81 <= content[i]
&& content[i] <= (byte) 0xfe && // gb扩展区域
(((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xfe) || ((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7e))) ...{
gbchars++;
totalformat += 500;
row = content[i] + 256 - 0x81;
if (0x40 <= content[i + 1] && content[i + 1] <= 0x7e) ...{
column = content[i + 1] - 0x40;
} else ...{
column = content[i + 1] + 256 - 0x40;
}
if (gbkformat[row][column] != 0) ...{
gbformat += gbkformat[row][column];
}
}
i++;
}
}
rangeval = 50 * ((float) gbchars / (float) dbchars);
formatval = 50 * ((float) gbformat / (float) totalformat);
return (int) (rangeval + formatval) - 1;
}/** *//**
* 解析为big5的或然率
*
* @param content
* @return
*/
private int big5probability(byte[] content) ...{
int i, rawtextlen = 0;
int dbchars = 1, bfchars = 1;
float rangeval = 0, formatval = 0;
long bfformat = 0, totalformat = 1;
int row, column;
rawtextlen = content.length;
for (i = 0; i < rawtextlen - 1; i++) ...{
if (content[i] >= 0) ...{
} else ...{
dbchars++;
if ((byte) 0xa1 <= content[i]
&& content[i] <= (byte) 0xf9
&& (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7e) || ((byte) 0xa1 <= content[i + 1] && content[i + 1] <= (byte) 0xfe))) ...{
bfchars++;
totalformat += 500;
row = content[i] + 256 - 0xa1;
if (0x40 <= content[i + 1] && content[i + 1] <= 0x7e) ...{
column = content[i + 1] - 0x40;
} else ...{
column = content[i + 1] + 256 - 0x61;
}
if (big5format[row][column] != 0) ...{
bfformat += big5format[row][column];
} else if (3 <= row && row <= 37) ...{
bfformat += 200;
}
}
i++;
}
}
rangeval = 50 * ((float) bfchars / (float) dbchars);
formatval = 50 * ((float) bfformat / (float) totalformat);return (int) (rangeval + formatval);
}/** *//**
* 在utf-8中的或然率
*
* @param content
* @return
*/
private int utf8probability(byte[] content) ...{
int score = 0;
int i, rawtextlen = 0;
int goodbytes = 0, asciibytes = 0;
// 检查是否为汉字可接受范围
rawtextlen = content.length;
for (i = 0; i < rawtextlen; i++) ...{
if ((content[i] & (byte) 0x7f) == content[i]) ...{
asciibytes++;
} else if (-64 <= content[i] && content[i] <= -33
&& i + 1 < rawtextlen && -128 <= content[i + 1]
&& content[i + 1] <= -65) ...{
goodbytes += 2;
i++;
} else if (-32 <= content[i] && content[i] <= -17
&& i + 2 < rawtextlen && -128 <= content[i + 1]
&& content[i + 1] <= -65 && -128 <= content[i + 2]
&& content[i + 2] <= -65) ...{
goodbytes += 3;
i += 2;
}
}if (asciibytes == rawtextlen) ...{
return 0;
}score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
// 如果不高于98则减少到零
if (score > 98) ...{
return score;
} else if (score > 95 && goodbytes > 30) ...{
return score;
} else ...{
return 0;
}}
/** *//**
* 检查为utf-16的或然率
*
* @param content
* @return
*/
private int utf16probability(byte[] content) ...{if (content.length > 1
&& ((byte) 0xfe == content[0] && (byte) 0xff == content[1])
|| ((byte) 0xff == content[0] && (byte) 0xfe == content[1])) ...{
return 100;
}
return 0;
}/** *//**
* 检查为ascii的或然率
*
* @param content
* @return
*/
private int asciiprobability(byte[] content) ...{
int score = 75;
int i, rawtextlen;rawtextlen = content.length;
for (i = 0; i < rawtextlen; i++) ...{
if (content[i] < 0) ...{
score = score - 5;
} else if (content[i] == (byte) 0x1b) ...{ // esc (used by iso 2022)
score = score - 5;
}
if (score <= 0) ...{
return 0;
}
}
return score;
}/** *//**
* 检查为euc_kr的或然率
*
* @param content
* @return
*/
private int euc_krprobability(byte[] content) ...{
int i, rawtextlen = 0;int dbchars = 1, krchars = 1;
long krformat = 0, totalformat = 1;
float rangeval = 0, formatval = 0;
int row, column;
rawtextlen = content.length;
for (i = 0; i < rawtextlen - 1; i++) ...{
if (content[i] >= 0) ...{
} else ...{
dbchars++;
if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xfe
&& (byte) 0xa1 <= content[i + 1]
&& content[i + 1] <= (byte) 0xfe) ...{
krchars++;
totalformat += 500;
row = content[i] + 256 - 0xa1;
column = content[i + 1] + 256 - 0xa1;
if (euc_krformat[row][column] != 0) ...{
krformat += euc_krformat[row][column];
} else if (15 <= row && row < 55) ...{
krformat += 0;
}}
i++;
}
}
rangeval = 50 * ((float) krchars / (float) dbchars);
formatval = 50 * ((float) krformat / (float) totalformat);return (int) (rangeval + formatval);
}private int euc_jpprobability(byte[] content) ...{
int i, rawtextlen = 0;int dbchars = 1, jpchars = 1;
long jpformat = 0, totalformat = 1;
float rangeval = 0, formatval = 0;
int row, column;rawtextlen = content.length;
for (i = 0; i < rawtextlen - 1; i++) ...{
if (content[i] >= 0) ...{
} else ...{
dbchars++;
if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xfe
&& (byte) 0xa1 <= content[i + 1]
&& content[i + 1] <= (byte) 0xfe) ...{
jpchars++;
totalformat += 500;
row = content[i] + 256 - 0xa1;
column = content[i + 1] + 256 - 0xa1;
if (jpformat[row][column] != 0) ...{
jpformat += jpformat[row][column];
} else if (15 <= row && row < 55) ...{
jpformat += 0;
}}
i++;
}
}
rangeval = 50 * ((float) jpchars / (float) dbchars);
formatval = 50 * ((float) jpformat / (float) totalformat);return (int) (rangeval + formatval);
}private int sjisprobability(byte[] content) ...{
int i, rawtextlen = 0;int dbchars = 1, jpchars = 1;
long jpformat = 0, totalformat = 1;
float rangeval = 0, formatval = 0;
int row, column, adjust;rawtextlen = content.length;
for (i = 0; i < rawtextlen - 1; i++) ...{
if (content[i] >= 0) ...{
} else ...{
dbchars++;
if (i + 1 < content.length
&& (((byte) 0x81 <= content[i] && content[i] <= (byte) 0x9f) || ((byte) 0xe0 <= content[i] && content[i] <= (byte) 0xef))
&& (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7e) || ((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xfc))) ...{
jpchars++;
totalformat += 500;
row = content[i] + 256;
column = content[i + 1] + 256;
if (column < 0x9f) ...{
adjust = 1;
if (column > 0x7f) ...{
column -= 0x20;
} else ...{
column -= 0x19;
}
} else ...{
adjust = 0;
column -= 0x7e;
}
if (row < 0xa0) ...{
row = ((row - 0x70) << 1) - adjust;
} else ...{
row = ((row - 0xb0) << 1) - adjust;
}row -= 0x20;
column = 0x20;
if (row < jpformat.length && column < jpformat[row].length
&& jpformat[row][column] != 0) ...{
jpformat += jpformat[row][column];
}
i++;
} else if ((byte) 0xa1 <= content[i]
&& content[i] <= (byte) 0xdf) ...{
}}
}
rangeval = 50 * ((float) jpchars / (float) dbchars);
formatval = 50 * ((float) jpformat / (float) totalformat);return (int) (rangeval + formatval) - 1;
}}
encodingtest.java
package org.loon.test.encoding;
/** *//**
*title: loonframework
*description:
*copyright: copyright (c) 2008
*company: loonframework
*license: http://www.apache.org/licenses/license-2.0
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class encodingtest ...{
public static void main(string argc[]) ...{
parseencoding parse;parse = new parseencoding();
system.out.println("中国大陆:");
system.out.println("测试字符串,编码格式="+parse.getencoding("百度".getbytes()));
system.out.println("测试站点,编码格式="+parse.getencoding("http://www.111com.net"));
system.out.println();
system.out.println("中国台湾:");
system.out.println("测试字符串,编码格式="+parse.getencoding("".getbytes()));
system.out.println("测试站点,编码格式="+parse.getencoding("http://tw.yahoo.com/"));
system.out.println("测试站点(繁体字,utf编码),编码格式="+parse.getencoding("http://.tw/jute"));
system.out.println();
system.out.println("日本:");
system.out.println("测试字符串,编码格式="+parse.getencoding("".getbytes()));
system.out.println("测试站点,编码格式="+parse.getencoding("http://www.111com.net"));
system.out.println();
system.out.println("自称蚩尤后代那群……:");
system.out.println("测试站点,编码格式="+parse.getencoding("http://www.easyjava.co.kr/"));
}
}
-
上一个: java字符比较与字符类型转换方法
-
下一个: java 面向对象编程之我见解
相关文章
- SpringBoot测试配置属性与web启动环境解析 10-24
- vue中将el-switch值true、false改为number类型的1和0解析 10-24
- Vue中的路由配置项meta使用解读 10-24
- SpringBoot自定义bean绑定解析 10-24
- SpringBoot常用计量与bean属性校验和进制数据转换规则解析 10-24
- 工厂方法在Spring框架中的运用介绍 10-24