📄 newsegment.java
字号:
/*
* Created on 2005-1-18
*
* TODO To change the template for this generated file, go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package net.nutch.segment;
import java.io.File;
import java.util.Properties;
import net.nutch.fs.LocalFileSystem;
import net.nutch.fs.NutchFileSystem;
import net.nutch.io.MD5Hash;
import net.nutch.parse.Outlink;
import net.nutch.parse.ParseData;
import net.nutch.parse.ParseText;
import org.apache.log4j.Logger;
import kit.nlp.util.WordsSegment;
/**
 * Creates a fresh segment directory below a parent directory and appends
 * {@link SegmentEntry} records to it through a lazily created
 * {@link SegmentWriter}.
 *
 * <p>Typical use: construct an instance, call {@link #write(SegmentEntry)}
 * once per entry, then call {@link #close()}.
 *
 * @author shqxie
 */
public class NewSegment {

    /** Shared logger, registered under the name "segment". */
    public static final Logger LOG = Logger.getLogger("segment");

    /** File system on which the segment directory is created and written. */
    private NutchFileSystem ndfs;

    /** Created on the first successful {@link #write(SegmentEntry)}; null until then. */
    private SegmentWriter writer = null;

    /** Directory of the segment created by this instance (set by the constructors). */
    public File segmentName;

    /**
     * Creates a segment with a generated name below the relative directory
     * "segments" on the given file system.
     *
     * @param nfs file system to create the segment on
     * @throws Exception if the directories cannot be prepared or the segment already exists
     */
    public NewSegment(NutchFileSystem nfs) throws Exception {
        init(nfs, new File("segments"), null);
    }

    /**
     * Creates a segment with a generated name below {@code srcDir}.
     *
     * @param nfs    file system to create the segment on
     * @param srcDir parent directory of the new segment
     * @throws Exception if the directories cannot be prepared or the segment already exists
     */
    public NewSegment(NutchFileSystem nfs, File srcDir) throws Exception {
        init(nfs, srcDir, null);
    }

    /**
     * Creates a segment with a generated name below {@code dir} using a
     * {@link LocalFileSystem}.
     *
     * @param dir parent directory of the new segment
     * @throws Exception if the directories cannot be prepared or the segment already exists
     */
    public NewSegment(String dir) throws Exception {
        this(dir, null);
    }

    /**
     * Creates a segment below {@code dir} using a {@link LocalFileSystem}.
     *
     * @param dir         parent directory of the new segment (index path)
     * @param segmentName name of the segment directory; when null or empty a
     *                    name is obtained from {@link SegmentWriter#getNewSegmentName()}
     * @throws Exception if the directories cannot be prepared or the segment already exists
     */
    public NewSegment(String dir, String segmentName) throws Exception {
        NutchFileSystem nfs = new LocalFileSystem();
        init(nfs, new File(dir), segmentName);
    }

    /**
     * Prepares the parent directory and creates the segment directory.
     *
     * @param nfs    file system to operate on
     * @param srcDir parent directory; created if missing, and replaced by a
     *               directory if the path currently is not a directory
     * @param sgName requested segment name, or null/empty to generate one
     * @throws Exception if the target segment path already exists, or if a
     *                   file system call fails
     */
    private void init(NutchFileSystem nfs, File srcDir, String sgName) throws Exception {
        this.ndfs = nfs;
        this.segmentName = null;

        if (!ndfs.exists(srcDir)) {
            ndfs.mkdirs(srcDir);
        }
        // The path exists but is not a directory: remove it and
        // create a directory in its place.
        if (!ndfs.isDirectory(srcDir)) {
            ndfs.delete(srcDir);
            ndfs.mkdirs(srcDir);
        }

        String name = sgName;
        if (name == null || name.length() == 0) {
            name = SegmentWriter.getNewSegmentName();
        }
        File segmentDir = new File(srcDir, name);

        // Never reuse an existing segment directory.
        if (ndfs.exists(segmentDir)) {
            LOG.error("file exist:" + segmentDir.getName() + " " + segmentDir.getPath());
            throw new Exception("Segment Name Exist!");
        }
        ndfs.mkdirs(segmentDir);
        this.segmentName = segmentDir;
    }

    /**
     * Appends one entry to the segment. Entries that are null, or whose text
     * is null, are silently skipped.
     *
     * <p>The entry text is passed through {@code WordsSegment.segment(text, ' ')}
     * (skipped for empty text) and stored as the parse text; an MD5 digest of
     * the original, unsegmented text is stored under the "digest" property and
     * the entry URL under the "url" property.
     *
     * <p>NOTE(review): the "url" and "digest" keys are put into the object
     * returned by {@code entry.getProperties()}; if that is the entry's own
     * instance, the entry is modified by this call - confirm this is intended.
     *
     * @param entry entry to append; may be null
     * @throws IllegalArgumentException if the entry has text but a null URL
     * @throws Exception if word segmentation returns null or the writer fails
     */
    public void write(SegmentEntry entry) throws Exception {
        if (entry == null) {
            return;
        }
        String content = entry.getText();
        if (content == null) {
            return;
        }
        // Properties (a Hashtable) rejects null values with a bare
        // NullPointerException; fail up front with a descriptive message.
        if (entry.getUrl() == null) {
            throw new IllegalArgumentException("SegmentEntry url must not be null");
        }

        Properties property = entry.getProperties();
        if (property == null) {
            property = new Properties();
        }

        String segmented = "";
        if (content.length() > 0) {
            segmented = WordsSegment.segment(content, ' ');
            if (segmented == null) {
                throw new Exception("segment content error!");
            }
        }

        ParseText text = new ParseText(segmented);
        MD5Hash md5Hash = MD5Hash.digest(content);
        Outlink[] outlink = new Outlink[] {};
        property.put("url", entry.getUrl());
        property.put("digest", md5Hash.toString());
        ParseData data = new ParseData(entry.getTitle(), outlink, property);

        // The writer is created lazily so that a segment that never receives
        // an entry never opens one.
        if (writer == null) {
            writer = new SegmentWriter(ndfs, segmentName, true, true, true);
        }
        writer.append(text, data);
    }

    /**
     * Returns the last path component of the segment directory.
     *
     * @return the segment directory name
     */
    public String getSegmentName() {
        return segmentName.getName();
    }

    /**
     * Closes the underlying writer, if one was ever created.
     *
     * @throws Exception wrapping (and chaining as cause) any failure reported
     *                   by the writer
     */
    public void close() throws Exception {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (Exception e) {
            // Keep the original message text but chain the cause so the
            // underlying stack trace is not lost.
            throw new Exception("Close Segment Writer Error!" + e.toString(), e);
        }
    }

    /**
     * Manual smoke test: writes a single entry into a new segment below the
     * relative directory "test".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        NewSegment ns = null;
        try {
            NutchFileSystem nfs = new LocalFileSystem();
            ns = new NewSegment(nfs, new File("test"));
            SegmentEntry se = new SegmentEntry("title:标题", "", "");
            se.putProperty("key", "value");
            ns.write(se);
        } catch (Exception e) {
            System.out.println("Error" + e.toString());
        } finally {
            // Close even when write() failed so the writer is not leaked.
            if (ns != null) {
                try {
                    ns.close();
                } catch (Exception e) {
                    System.out.println("Error" + e.toString());
                }
            }
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -