📄 segmenter.jav.old
字号:
}
}
for (i = 0; i < suffix.length; i++)
{
if (unstemmed.substring(unstemmed.length()-1, unstemmed.length()).equals(suffix[i]) == true &&
(zhwords.get(unstemmed.substring(0, unstemmed.length()-1).intern()) != null ||
unstemmed.length() == 2))
{
logger.info("Stemmed suffix");
try
{
logger.info(new String(unstemmed.toString().getBytes(debugencoding)));
}
catch (Exception a)
{ }
;
unstemmed.deleteCharAt(unstemmed.length()-1);
return unstemmed.toString();
}
}
for (i = 0; i < infix.length; i++)
{
if (unstemmed.length() == 3 && unstemmed.substring(1, 2).equals(infix[i]) == true &&
zhwords.get(new String(unstemmed.substring(0, 1) + unstemmed.substring(2, 3)).intern()) != null)
{
logger.info("Stemmed infix");
unstemmed.deleteCharAt(1);
return unstemmed.toString();
}
}
return unstemmed.toString();
}
/**
 * Segments a line of text by inserting {@code separator} between the words
 * found in a single greedy left-to-right scan.
 *
 * <p>A character takes part in word building when it lies in the CJK Unified
 * Ideographs block or when {@code isNumber} accepts it; every other character
 * is copied to the output unchanged and terminates the word being built.
 * While a word is being built, the next character is attached to it when any
 * of the following holds (checked in this order):
 * <ol>
 *   <li>word + character is a lexicon key whose value is {@code "1"}
 *       (treated as a complete lexicon word);</li>
 *   <li>the word is all "foreign" characters, the character is in the foreign
 *       set, and the character plus its successor is not a lexicon key
 *       (a possible transliterated foreign name);</li>
 *   <li>the word is a number and the character is in the number set;</li>
 *   <li>word + character is a lexicon key whose value is {@code "2"} and
 *       word + character + the following character is also a lexicon key
 *       (treated as the start of a longer lexicon word).</li>
 * </ol>
 * Otherwise the word is flushed to the output and a new word starts with the
 * current character.
 *
 * @param cline     the line to segment; must not be {@code null}
 * @param separator the text inserted at word boundaries that are not already
 *                  delimited by whitespace
 * @return the line with separators inserted between the detected words
 */
public String segmentLine(String cline, String separator)
{
    StringBuffer currentword = new StringBuffer();
    StringBuffer outline = new StringBuffer();
    int clength = cline.length();

    for (int i = 0; i < clength; i++)
    {
        char currentchar = cline.charAt(i);
        boolean segmentable =
            Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
            || isNumber(cline.substring(i, i + 1));

        if (!segmentable)
        {
            // Not a Chinese character: flush any pending word, then copy the
            // character through untouched.
            if (currentword.length() > 0)
            {
                outline.append(currentword.toString());
                if (!Character.isWhitespace(currentchar))
                {
                    outline.append(separator);
                }
                currentword.setLength(0);
            }
            outline.append(currentchar);
            continue;
        }

        if (currentword.length() == 0)
        {
            // Start of a new word; separate it from preceding text unless
            // whitespace already does so.
            if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1)))
            {
                outline.append(separator);
            }
            currentword.append(currentchar);
            debugLogWord(currentword);
            continue;
        }

        // Look the extended word up once. A missing key yields null, which
        // matches neither status below, so no separate containsKey probe
        // (and no cast) is needed.
        String candidate = (currentword.toString() + currentchar).intern();
        Object status = zhwords.get(candidate);

        boolean extendsWord =
            // word is in lexicon
            "1".equals(status)
            // possibly a transliteration of a foreign name
            || (isAllForeign(currentword.toString())
                && cforeign.contains(String.valueOf(currentchar).intern())
                && i + 2 < clength
                && !zhwords.containsKey(cline.substring(i, i + 2).intern()))
            // put all consecutive number characters together
            || (isNumber(currentword.toString())
                && cnumbers.contains(String.valueOf(currentchar).intern()))
            // starts a longer word in the lexicon
            || ("2".equals(status)
                && i + 1 < clength
                && zhwords.containsKey((candidate + cline.charAt(i + 1)).intern()));

        if (extendsWord)
        {
            currentword.append(currentchar);
            debugLogWord(currentword);
        }
        else
        {
            // Start anew: emit the finished word and begin the next one with
            // the current character.
            debugLogWord(currentword);
            outline.append(currentword.toString());
            if (!Character.isWhitespace(currentchar))
            {
                outline.append(separator);
            }
            currentword.setLength(0);
            currentword.append(currentchar);
        }
    }

    // Emit whatever word was still being built when the line ended.
    outline.append(currentword.toString());
    return outline.toString();
}

/**
 * Logs the given word at info level when debugging is enabled, re-encoding it
 * through {@code debugencoding} first.
 *
 * @param word the word currently being built
 */
private void debugLogWord(StringBuffer word)
{
    if (!debug)
    {
        return;
    }
    try
    {
        logger.info(new String(word.toString().getBytes(debugencoding)));
    }
    catch (Exception ignored)
    {
        // Debug output is best-effort: a failure to encode or log the word
        // must never interrupt segmentation.
    }
}
public LinkedList segmentLine(String cline)
{
StringBuffer currentword = new StringBuffer();
StringBuffer outline = new StringBuffer();
int i, clength;
char currentchar;
String separator = "";
LinkedList offsets = new LinkedList();
Map tokenpair = new HashMap();
int currentoffset = 0;
offsets.add(new Integer(1));
clength = cline.length();
for (i = 0; i < clength; i++)
{
currentchar = cline.charAt(i);
if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
isNumber(cline.substring(i, i+1)) == true)
{
// Character in CJK block
if (currentword.length() == 0)
{ // start looking for next word
//logger.info("current word length 0");
if (i > 0 && (Character.isWhitespace(cline.charAt(i-1)) == false))
{
outline.append(separator);
currentoffset += separator.length();
if (separator.length() > 0)
{
offsets.add(new Integer(currentoffset));
}
}
currentword.append(currentchar);
}
else
{
if (zhwords.containsKey(new String(currentword.toString() + currentchar).intern()) == true &&
((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("1") == true)
{
// word is in lexicon
currentword.append(currentchar);
}
else if (isNumber(currentword.toString()) &&
cnumbers.contains(new String(new char[] {currentchar}).intern())
/* && (i + 2 < clength) &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false) */ )
{
// Put all consecutive number characters together
currentword.append(currentchar);
if (debug)
{
try
{
logger.info(new String(currentword.toString().getBytes(debugencoding)));
}
catch (Exception a)
{ }
;
}
}
else if (isAllForeign(currentword.toString()) &&
cforeign.contains(new String(new char[] {currentchar}).intern()) &&
i + 2 < clength &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false))
{
// Possible a transliteration of a foreign name
currentword.append(currentchar);
}
else if ((zhwords.containsKey(new String(currentword.toString() + currentchar).intern())) &&
(((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("2") == true) &&
i + 1 < clength &&
(zhwords.containsKey(new String(currentword.toString() + currentchar + cline.charAt(i+1)).intern()) == true))
{
// Starts a word in the lexicon
currentword.append(currentchar);
}
else
{ // Start anew
outline.append(currentword.toString());
currentoffset += currentword.length();
offsets.add(new Integer(currentoffset));
if (Character.isWhitespace(currentchar) == false)
{
outline.append(separator);
currentoffset += separator.length();
if (separator.length() > 0)
{
offsets.add(new Integer(currentoffset));
}
}
currentword.setLength(0);
currentword.append(currentchar);
}
}
}
else
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -