📄 addcollocation.java
字号:
import java.util.*;
import java.io.*;
/**
 * Marks collocation pairs in a word-segmented corpus.
 *
 * <p>Reads the corpus from {@code 1addpolar.txt} line by line. For every entry of
 * {@code collocation_seg.txt} whose first phrase occurs in the line and whose second
 * phrase occurs later in the same clause, the marker {@code "#^" + tag} is inserted in
 * front of both phrases, and {@link #addFactor} is asked to append the degree letter of
 * any neighbouring modifier listed in {@code factor_seg.txt}. The rewritten lines go to
 * {@code 2addcollo.txt}.
 */
public class AddCollocation
{
    private static final String CORPUS_FILE = "1addpolar.txt";
    private static final String COLLOCATION_FILE = "collocation_seg.txt";
    private static final String FACTOR_FILE = "factor_seg.txt";
    private static final String OUTPUT_FILE = "2addcollo.txt";

    /** Clause terminators: ',', the ideographic full stop (U+3002), '?', '!' and ';'. */
    private static final String[] CLAUSE_ENDS = {",", "\u3002", "?", "!", ";"};

    /** Characters that may follow the '#' of an existing marker (polarity tags and degree letters). */
    private static final String MARKER_CHARS = "+-*~abcdn0";

    /** First characters of the token that ends the second phrase of a collocation entry. */
    private static final String POLARITY_TAGS = "+-*~";

    /** First characters of the token that ends the phrase of a factor entry. */
    private static final String FACTOR_STOPS = "abcdn0A";

    /** The three structural particles (U+7684, U+5730, U+5F97), each tagged "/u". */
    private static final String[] PARTICLES = {"\u7684/u", "\u5730/u", "\u5f97/u"};

    /** Gap between a modifier and a particle: exactly two spaces (escaped so it cannot be collapsed). */
    private static final String WORD_GAP = "\u0020\u0020";

    /**
     * Runs the whole conversion; the file names are fixed and {@code args} is ignored.
     *
     * @param args unused
     * @throws IOException if an input file is missing or the output cannot be written
     */
    public static void main(String[] args) throws IOException
    {
        try (FileWriter fout = new FileWriter(OUTPUT_FILE);
             Scanner corpus = new Scanner(new File(CORPUS_FILE)))
        {
            // Loaded on the first corpus line so that an empty corpus still does not
            // require the collocation file, and then reused for every further line.
            List<String> collocations = null;
            while (corpus.hasNextLine())
            {
                StringBuffer line = new StringBuffer(corpus.nextLine());
                if (collocations == null)
                {
                    collocations = readLines(COLLOCATION_FILE);
                }
                for (String entry : collocations)
                {
                    markCollocation(line, entry);
                }
                fout.write(line.toString() + "\n");
            }
        }
    }

    /**
     * Looks for modifiers ("factors") next to a word and inserts their degree letters
     * into a marker of {@code str1}.
     *
     * <p>Every line of {@code factor_seg.txt} is a phrase followed by a token whose first
     * character is a degree letter (a, b, c, d, n, 0) or 'A' followed by the degree
     * letter. Phrases that do not start with 'A' are searched for immediately before
     * {@code index}, optionally separated from it by a structural particle; the longest
     * hit wins and the search is repeated further to the left until nothing matches.
     * Phrases that start with 'A' lose their first five characters and are searched for
     * once, starting at {@code index + 2 + slen}.
     *
     * <p>NOTE(review): every degree letter is inserted three characters after the FIRST
     * "#^" of {@code str1}, which is not necessarily the marker of the word being
     * examined; confirm that this is intended.
     *
     * @param str1  the line being annotated; modified in place
     * @param index offset in {@code str1} at which a preceding modifier must end
     * @param slen  length of the word, used to locate a following modifier
     * @throws IOException if {@code factor_seg.txt} cannot be opened
     */
    public static void addFactor(StringBuffer str1, int index, int slen) throws IOException
    {
        int markerPos = str1.indexOf("#^");
        List<String> factorLines = readLines(FACTOR_FILE);

        int consumed = 0;
        int step = scanFactors(str1, factorLines, index, slen, markerPos, consumed, true);
        while (step > 0)
        {
            // Move the search window left past the modifier that was just recognised.
            consumed += step;
            step = scanFactors(str1, factorLines, index, slen, markerPos, consumed, false);
        }
    }

    /** Reads all lines of a text file using the platform default charset. */
    private static List<String> readLines(String path) throws IOException
    {
        List<String> lines = new ArrayList<String>();
        try (Scanner in = new Scanner(new File(path)))
        {
            while (in.hasNextLine())
            {
                lines.add(in.nextLine());
            }
        }
        return lines;
    }

    /**
     * Appends the next token to {@code phrase}, then keeps appending tokens (joined by one
     * space) until a token starts with one of {@code stopChars}.
     *
     * <p>NOTE(review): the join uses a single space although the offset arithmetic in this
     * class assumes words separated by two spaces; multi-word phrases may therefore never
     * match the corpus. Confirm against the data files.
     *
     * @return the last token read after the first one (the stop token when one was found),
     *         or {@code null} if the input held a single token
     */
    private static String readPhrase(Scanner tokens, String stopChars, StringBuilder phrase)
    {
        phrase.append(tokens.next());
        String last = null;
        while (tokens.hasNext())
        {
            last = tokens.next();
            if (stopChars.indexOf(last.charAt(0)) >= 0)
            {
                break;
            }
            phrase.append(' ').append(last);
        }
        return last;
    }

    /** Applies one collocation entry to one corpus line, marking every co-occurrence. */
    private static void markCollocation(StringBuffer line, String entry) throws IOException
    {
        char letter = 'E';

        // First phrase: everything up to the token that starts with '?'.
        StringBuilder firstBuf = new StringBuilder();
        try (Scanner tokens = new Scanner(entry).useDelimiter("\\s+"))
        {
            if (!tokens.hasNext())
            {
                return; // blank entry
            }
            String last = readPhrase(tokens, "?", firstBuf);
            if (last != null)
            {
                letter = last.charAt(0);
            }
        }
        String first = firstBuf.toString();
        int idx = line.indexOf(first);
        if (idx == -1)
        {
            return;
        }

        // Second phrase: starts five characters after the '?' and runs up to the polarity tag.
        int secondStart = entry.indexOf("?") + 5;
        if (secondStart > entry.length())
        {
            return; // entry ends right after the wildcard
        }
        StringBuilder secondBuf = new StringBuilder();
        try (Scanner tokens = new Scanner(entry.substring(secondStart)).useDelimiter("\\s+"))
        {
            if (!tokens.hasNext())
            {
                return; // nothing after the wildcard
            }
            String last = readPhrase(tokens, POLARITY_TAGS, secondBuf);
            if (last != null)
            {
                letter = last.charAt(0);
            }
        }
        String second = secondBuf.toString();
        String marker = "#^" + letter;
        int slen1 = first.length();
        int slen2 = second.length();

        while (idx != -1)
        {
            // The second phrase must occur between the first phrase and the end of its clause.
            String clause = line.substring(idx, clauseEnd(line, idx));
            int inde = clause.indexOf(second);
            if (inde != -1)
            {
                // An existing marker in front of either phrase loses its '#'.
                if (dropMarkerHash(line, idx - 1))
                {
                    idx--;
                }
                if (dropMarkerHash(line, idx + inde - 1))
                {
                    inde--;
                }
                line.insert(idx + inde, marker);
                // NOTE(review): addFactor's offsets (index + 2, steps of len + 2 / len + 7)
                // look as if index should be the first of two separating spaces, while
                // lastIndexOf(" ") yields the last space before the word. Confirm against
                // the corpus format.
                addFactor(line, line.substring(0, idx).lastIndexOf(" "), slen1);
                addFactor(line, line.substring(0, idx + inde).lastIndexOf(" "), slen2);
                line.insert(idx, marker);
                System.out.println(second);
            }
            idx = line.indexOf(first, idx + 5);
        }
    }

    /**
     * Returns the offset of the nearest clause terminator at or after {@code from}, or the
     * line length when there is none (a terminator at offset 0 also yields the line length).
     */
    private static int clauseEnd(StringBuffer line, int from)
    {
        int nearest = -1;
        for (String mark : CLAUSE_ENDS)
        {
            int pos = line.indexOf(mark, from);
            if (pos != -1 && (nearest == -1 || pos < nearest))
            {
                nearest = pos;
            }
        }
        return nearest > 0 ? nearest : line.length();
    }

    /**
     * Walks left from {@code pos} over marker characters and deletes the '#' found in front
     * of them, if any. Stops at the start of the line instead of running past it.
     *
     * @return {@code true} if a character was deleted
     */
    private static boolean dropMarkerHash(StringBuffer line, int pos)
    {
        while (pos >= 0 && MARKER_CHARS.indexOf(line.charAt(pos)) >= 0)
        {
            pos--;
        }
        if (pos >= 0 && line.charAt(pos) == '#')
        {
            line.deleteCharAt(pos);
            return true;
        }
        return false;
    }

    /** Returns {@code text.substring(start, end)}, or "" when the range is not valid. */
    private static String window(StringBuffer text, int start, int end)
    {
        if (start < 0 || end > text.length() || start > end)
        {
            return "";
        }
        return text.substring(start, end);
    }

    /**
     * Performs one scan over the factor list.
     *
     * @param consumed  number of characters left of {@code index} already recognised
     * @param firstPass whether following ('A') factors are examined as well
     * @return how far the window moves left for the next scan, or 0 if nothing preceded
     */
    private static int scanFactors(StringBuffer text, List<String> factorLines, int index, int slen,
                                   int markerPos, int consumed, boolean firstPass)
    {
        boolean adjacentHit = false;
        boolean particleHit = false;
        int longest = 0;
        char degreeOfLongest = 'E';

        for (String factorLine : factorLines)
        {
            StringBuilder buf = new StringBuilder();
            char degree = 'E';
            try (Scanner tokens = new Scanner(factorLine).useDelimiter("\\s+"))
            {
                if (!tokens.hasNext())
                {
                    continue; // blank line
                }
                String last = readPhrase(tokens, FACTOR_STOPS, buf);
                if (last != null)
                {
                    degree = last.charAt(0);
                    if (degree == 'A')
                    {
                        degree = last.charAt(1);
                    }
                }
            }
            String factor = buf.toString();

            if (factor.charAt(0) != 'A')
            {
                // Preceding modifier: directly before the window end, or before a particle.
                int end = index - consumed;
                boolean hit = false;
                if (window(text, end - factor.length(), end).equals(factor))
                {
                    adjacentHit = true;
                    hit = true;
                }
                else
                {
                    // Five extra characters: two spaces plus the three-character particle.
                    String wide = window(text, end - 5 - factor.length(), end);
                    for (String particle : PARTICLES)
                    {
                        if (wide.equals(factor + WORD_GAP + particle))
                        {
                            particleHit = true;
                            hit = true;
                            break;
                        }
                    }
                }
                if (hit && factor.length() > longest)
                {
                    longest = factor.length();
                    degreeOfLongest = degree;
                }
            }
            else if (firstPass)
            {
                // Following modifier: the entry carries a five-character prefix.
                String follower = factor.substring(5);
                int start = index + 2 + slen;
                boolean hit = window(text, start, start + follower.length()).equals(follower);
                if (!hit)
                {
                    String wide = window(text, start, start + 5 + follower.length());
                    for (String particle : PARTICLES)
                    {
                        if (wide.equals(particle + WORD_GAP + follower))
                        {
                            hit = true;
                            break;
                        }
                    }
                }
                if (hit)
                {
                    text.insert(markerPos + 3, degree);
                    break; // the first following modifier ends the scan
                }
            }
        }

        if (adjacentHit)
        {
            text.insert(markerPos + 3, degreeOfLongest);
            return 2 + longest;
        }
        if (particleHit)
        {
            text.insert(markerPos + 3, degreeOfLongest);
            return 7 + longest;
        }
        return 0;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -