中文分词代码(此代码为作者多年经验总结,以前发表过VB,PB版本)
/* * created by yzh 2004.5.12 * 请大家引用时保留这段作者声明,此代码为开源代码;使用不受限制。 * 中文分词代码 *此代码为作者多年经验总结,以前发表过VB,PB版本 */
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Locale; import java.util.TreeMap; import java.util.TreeSet;
/**
 * Dictionary-driven, left-to-right segmenter for Chinese text.
 *
 * <p>The segmenter walks a line one {@code char} at a time and grows the
 * current word for as long as one of these holds: the grown word is in the
 * lexicon, it is a known prefix of a longer lexicon word that the next
 * character continues, it looks like a transliterated foreign name, or it is
 * a run of number characters. Otherwise the current word is emitted and a new
 * one is started. There is no backtracking.
 *
 * <p>Only characters in the {@code CJK_UNIFIED_IDEOGRAPHS} block (plus the
 * characters listed in the number data files) are segmented; every other
 * character is copied to the output unchanged.
 *
 * <p>All data files are read as UTF-8 class-path resources via
 * {@code getClass().getResourceAsStream(...)}. A missing or unreadable
 * resource is reported on {@code System.err} and treated as empty, so the
 * segmenter degrades to one-character-per-word instead of failing.
 *
 * <p>Instances are obtained through {@link #getGBSegmenter()} or
 * {@link #getBig5Segmenter()}; each character form is built once and cached.
 */
public class ChineseSegmenter {

    /** Character form: traditional Chinese. */
    public final static int TRAD = 0;

    /** Character form: simplified Chinese. */
    public final static int SIMP = 1;

    /** Character form: both simplified and traditional. */
    public final static int BOTH = 2;

    /** Lexicon value marking a key that is a complete word. */
    private static final String WORD = "1";

    /** Lexicon value marking a key that is only the beginning of a longer word. */
    private static final String PREFIX = "2";

    /** Longest word (in chars) taken from the word list; longer lines are ignored. */
    private static final int MAX_WORD_LENGTH = 4;

    // One cached instance per character form. A single shared cache would hand
    // a simplified-form segmenter to a caller asking for the traditional one
    // (and vice versa), depending on which factory happened to run first.
    private static ChineseSegmenter gbSegmenter = null;
    private static ChineseSegmenter big5Segmenter = null;

    /** Lexicon: word or word prefix -> {@link #WORD} or {@link #PREFIX}. */
    private TreeMap zhwords;

    /** Single characters used in transliterated foreign names / in numbers. */
    private TreeSet cforeign, cnumbers;

    /**
     * Builds a segmenter and loads its data files.
     *
     * @param charform     {@link #TRAD}, {@link #SIMP} or {@link #BOTH}; any
     *                     other value loads both character sets but no word list
     * @param loadwordfile when {@code false} the lexicon is left empty
     */
    private ChineseSegmenter(int charform, boolean loadwordfile) {
        cforeign = new TreeSet();
        cnumbers = new TreeSet();

        if (charform == SIMP) {
            loadset(cnumbers, "data/snumbers_u8.txt");
            loadset(cforeign, "data/sforeign_u8.txt");
        } else if (charform == TRAD) {
            loadset(cnumbers, "data/tnumbers_u8.txt");
            loadset(cforeign, "data/tforeign_u8.txt");
        } else { // BOTH
            loadset(cnumbers, "data/snumbers_u8.txt");
            loadset(cforeign, "data/sforeign_u8.txt");
            loadset(cnumbers, "data/tnumbers_u8.txt");
            loadset(cforeign, "data/tforeign_u8.txt");
        }

        zhwords = new TreeMap();
        if (loadwordfile) {
            loadwords(charform);
        }
    }

    /** Drops the cached segmenters so the next factory call reloads the data files. */
    public synchronized static void reset() {
        gbSegmenter = null;
        big5Segmenter = null;
    }

    /**
     * Returns the cached simplified-Chinese segmenter, creating it on first use.
     *
     * <p>Side effect: sets the JVM default locale to
     * {@link Locale#SIMPLIFIED_CHINESE} on every call.
     *
     * @return the shared simplified-form segmenter
     */
    public synchronized static ChineseSegmenter getGBSegmenter() {
        Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
        if (gbSegmenter == null) {
            gbSegmenter = new ChineseSegmenter(SIMP, true);
        }
        return gbSegmenter;
    }

    /**
     * Returns the cached traditional-Chinese segmenter, creating it on first use.
     *
     * <p>Side effect: sets the JVM default locale to
     * {@link Locale#TRADITIONAL_CHINESE} on every call.
     *
     * @return the shared traditional-form segmenter
     */
    public synchronized static ChineseSegmenter getBig5Segmenter() {
        Locale.setDefault(Locale.TRADITIONAL_CHINESE);
        if (big5Segmenter == null) {
            big5Segmenter = new ChineseSegmenter(TRAD, true);
        }
        return big5Segmenter;
    }

    /**
     * Opens a class-path resource as a UTF-8 reader.
     *
     * @param resource resource name, resolved by {@code getClass().getResourceAsStream}
     * @return a buffered reader, or {@code null} when the resource does not exist
     * @throws IOException if the reader cannot be created
     */
    private BufferedReader openResource(String resource) throws IOException {
        InputStream stream = getClass().getResourceAsStream(resource);
        if (stream == null) {
            return null;
        }
        return new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    }

    /** Closes a reader, ignoring a failure to close; {@code null} is accepted. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // Nothing was written through this reader, so there is nothing to recover.
        }
    }

    /**
     * Fills the lexicon from the word list that belongs to {@code charform}.
     *
     * <p>Each usable line (non-empty, at most {@link #MAX_WORD_LENGTH} chars,
     * containing no {@code '#'}) is stored as a word. Every proper prefix of
     * two or more chars is stored as a prefix unless the key is already present,
     * which lets the segmenter look ahead through three- and four-char words.
     */
    private void loadwords(int charform) {
        String wordfile;
        if (charform == SIMP) {
            wordfile = "simplexu8.txt";
        } else if (charform == TRAD) {
            wordfile = "tradlexu8.txt";
        } else if (charform == BOTH) {
            wordfile = "bothlexu8.txt";
        } else {
            return; // unknown form: there is no word list to load
        }

        BufferedReader in = null;
        try {
            in = openResource(wordfile);
            if (in == null) {
                System.err.println("Word list resource not found: " + wordfile);
                return;
            }
            String newword;
            while ((newword = in.readLine()) != null) {
                if (newword.length() == 0
                        || newword.length() > MAX_WORD_LENGTH
                        || newword.indexOf('#') != -1) {
                    continue;
                }
                // A real word always wins over an earlier prefix entry.
                zhwords.put(newword, WORD);
                for (int len = 2; len < newword.length(); len++) {
                    String prefix = newword.substring(0, len);
                    if (!zhwords.containsKey(prefix)) {
                        zhwords.put(prefix, PREFIX);
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Exception loading word list " + wordfile + ": " + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Adds every usable line of a resource to {@code targetset}.
     * Empty lines and lines containing {@code '#'} are skipped. Failures are
     * reported on {@code System.err} and otherwise ignored.
     *
     * @param targetset  set receiving the lines
     * @param sourcefile class-path resource to read
     */
    private void loadset(TreeSet targetset, String sourcefile) {
        BufferedReader in = null;
        try {
            in = openResource(sourcefile);
            if (in == null) {
                System.err.println("Data file resource not found: " + sourcefile);
                return;
            }
            String dataline;
            while ((dataline = in.readLine()) != null) {
                if (dataline.length() == 0 || dataline.indexOf('#') != -1) {
                    continue;
                }
                targetset.add(dataline);
            }
        } catch (IOException e) {
            System.err.println("Exception loading data file " + sourcefile + ": " + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /** Returns whether every char of {@code text} is a member of {@code charset}. */
    private static boolean allCharsIn(TreeSet charset, String text) {
        for (int i = 0; i < text.length(); i++) {
            if (!charset.contains(String.valueOf(text.charAt(i)))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests whether a string consists only of number characters.
     *
     * @param testword string to test
     * @return {@code true} if every char is in the number set; also {@code true}
     *         for the empty string
     */
    public boolean isNumber(String testword) {
        return allCharsIn(cnumbers, testword);
    }

    /**
     * Tests whether a string consists only of characters used to transliterate
     * foreign names.
     *
     * @param testword string to test
     * @return {@code true} if every char is in the foreign-character set; also
     *         {@code true} for the empty string
     */
    public boolean isAllForeign(String testword) {
        return allCharsIn(cforeign, testword);
    }

    /**
     * Tests whether a string is free of CJK unified ideographs.
     *
     * @param testword string to test
     * @return {@code false} as soon as one char lies in the
     *         {@code CJK_UNIFIED_IDEOGRAPHS} block, {@code true} otherwise
     */
    public boolean isNotCJK(String testword) {
        for (int i = 0; i < testword.length(); i++) {
            if (Character.UnicodeBlock.of(testword.charAt(i))
                    == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                return false;
            }
        }
        return true;
    }

    /**
     * Decides whether the char at {@code index} continues the word collected so far.
     *
     * @param word   the non-empty word collected so far
     * @param cline  the line being segmented
     * @param index  position in {@code cline} of the char under consideration
     * @return {@code true} if the char should be appended to {@code word}
     */
    private boolean continuesWord(String word, String cline, int index) {
        int clength = cline.length();
        char next = cline.charAt(index);
        String nextchar = String.valueOf(next);
        String candidate = word + next;
        Object status = zhwords.get(candidate);

        // The grown word is itself in the lexicon.
        if (WORD.equals(status)) {
            return true;
        }
        // Possibly a transliteration of a foreign name: keep collecting foreign
        // characters unless this char and its successor start a lexicon entry.
        if (isAllForeign(word)
                && cforeign.contains(nextchar)
                && index + 2 < clength
                && !zhwords.containsKey(cline.substring(index, index + 2))) {
            return true;
        }
        // Put all consecutive number characters together.
        if (isNumber(word) && cnumbers.contains(nextchar)) {
            return true;
        }
        // The grown word only starts a lexicon word: accept it when one more
        // char of look-ahead still matches a lexicon entry.
        return PREFIX.equals(status)
                && index + 1 < clength
                && zhwords.containsKey(candidate + cline.charAt(index + 1));
    }

    /**
     * Segments one line of text.
     *
     * <p>{@code separator} is inserted between adjacent words, and between a
     * word and neighbouring non-Chinese text, except where the neighbouring
     * char is already whitespace. Non-Chinese text is copied through unchanged.
     *
     * @param cline     the line to segment; must not be {@code null}
     * @param separator the text to insert at word boundaries
     * @return the segmented line
     */
    public String segmentLine(String cline, String separator) {
        StringBuffer currentword = new StringBuffer();
        StringBuffer outline = new StringBuffer();
        int clength = cline.length();

        for (int i = 0; i < clength; i++) {
            char currentchar = cline.charAt(i);
            boolean segmentable =
                    Character.UnicodeBlock.of(currentchar)
                            == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || cnumbers.contains(String.valueOf(currentchar));

            if (!segmentable) {
                // Not a Chinese character: flush the pending word, copy the char.
                if (currentword.length() > 0) {
                    outline.append(currentword.toString());
                    if (!Character.isWhitespace(currentchar)) {
                        outline.append(separator);
                    }
                    currentword.setLength(0);
                }
                outline.append(currentchar);
                continue;
            }

            if (currentword.length() == 0) {
                // First char of a new word; separate it from preceding text.
                if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
                    outline.append(separator);
                }
                currentword.append(currentchar);
                continue;
            }

            if (continuesWord(currentword.toString(), cline, i)) {
                currentword.append(currentchar);
            } else {
                // Start anew: emit the finished word and begin the next one.
                outline.append(currentword.toString());
                if (!Character.isWhitespace(currentchar)) {
                    outline.append(separator);
                }
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        }

        outline.append(currentword.toString());
        return outline.toString();
    }

    /** Small demo: segments a fixed sample string and prints the result. */
    public static void main(String[] args) throws Exception {
        ChineseSegmenter seg = ChineseSegmenter.getGBSegmenter();
        System.out.println(seg.segmentLine("Some string in chinese.", " "));
    }
}
本文参与腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。原始发表:2006-03-24,如有侵权请联系 cloudcommunity@tencent.com 删除。标签:public、string、txt、中文分词、append