⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 newsegment.java

📁 爬虫数据的改进,并修正了一些bug
💻 JAVA
字号:
/*
 * Created on 2005-1-18
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
package net.nutch.segment;

import java.io.File;
import java.util.Properties;

import net.nutch.fs.LocalFileSystem;
import net.nutch.fs.NutchFileSystem;
import net.nutch.io.MD5Hash;
import net.nutch.parse.Outlink;
import net.nutch.parse.ParseData;
import net.nutch.parse.ParseText;

import org.apache.log4j.Logger;

import kit.nlp.util.WordsSegment;


/**
 * Creates a new segment directory on a {@link NutchFileSystem} and appends
 * entries to it through a {@link SegmentWriter} that is created on first use.
 *
 * Entry text is passed through {@code WordsSegment.segment(text, ' ')} before
 * it is written; the MD5 digest stored with each entry is computed from the
 * original, unsegmented text.
 *
 * @author shqxie
 */
public class NewSegment {
	public static final Logger LOG = Logger.getLogger("segment");

	/** File system on which the segment directory is created and written. */
	private NutchFileSystem ndfs;

	/** Created by the first {@link #write} call that has text; null until then. */
	private SegmentWriter writer = null;

	/** Directory of this segment; assigned by init once the directory has been created. */
	public File segmentName;

	/**
	 * Creates a segment with a generated name under the relative directory
	 * {@code "segments"}.
	 *
	 * @param nfs file system to create the segment on
	 * @throws Exception if the segment directory cannot be set up
	 */
	public NewSegment(NutchFileSystem nfs) throws Exception {
		init(nfs, new File("segments"), null);
	}

	/**
	 * Creates a segment with a generated name under {@code srcDir}.
	 *
	 * @param nfs file system to create the segment on
	 * @param srcDir parent directory of the new segment
	 * @throws Exception if the segment directory cannot be set up
	 */
	public NewSegment(NutchFileSystem nfs, File srcDir) throws Exception {
		init(nfs, srcDir, null);
	}

	/**
	 * Creates a segment with a generated name under {@code dir} on a
	 * {@link LocalFileSystem}.
	 *
	 * @param dir index path
	 * @throws Exception if the segment directory cannot be set up
	 */
	public NewSegment(String dir) throws Exception {
		this(dir, null);
	}

	/**
	 * Creates a segment under {@code dir} on a {@link LocalFileSystem}.
	 *
	 * @param dir index path
	 * @param segmentName segment name; when null or empty a name is generated
	 * @throws Exception if the segment already exists or cannot be set up
	 */
	public NewSegment(String dir, String segmentName) throws Exception {
		NutchFileSystem nfs = new LocalFileSystem();
		init(nfs, new File(dir), segmentName);
	}

	/**
	 * Prepares the parent directory and creates the segment directory.
	 *
	 * @param nfs file system to operate on
	 * @param srcDir parent directory; created when missing
	 * @param sgName segment name, or null/empty to use
	 *        {@code SegmentWriter.getNewSegmentName()}
	 * @throws Exception with message "Segment Name Exist!" when the target
	 *         segment path already exists, or whatever the file system throws
	 */
	private void init(NutchFileSystem nfs, File srcDir, String sgName) throws Exception {
		this.ndfs = nfs;
		this.segmentName = null;

		if (!ndfs.exists(srcDir)) {
			ndfs.mkdirs(srcDir);
		}
		// A non-directory at the parent path is removed and replaced by a directory.
		if (!ndfs.isDirectory(srcDir)) {
			ndfs.delete(srcDir);
			ndfs.mkdirs(srcDir);
		}

		boolean generateName = (sgName == null || sgName.length() == 0);
		String name = generateName ? SegmentWriter.getNewSegmentName() : sgName;
		File target = new File(srcDir, name);

		// Refuse to reuse an existing segment path.
		if (ndfs.exists(target)) {
			LOG.error("file exist:" + target.getName() + " " + target.getPath());
			throw new Exception("Segment Name Exist!");
		}
		ndfs.mkdirs(target);
		this.segmentName = target;
	}

	/**
	 * Appends one entry to the segment.
	 *
	 * Does nothing when {@code entry} is null or its text is null. Empty text
	 * is written as an empty {@link ParseText} without calling the segmenter.
	 * The keys {@code "url"} and {@code "digest"} are put into the properties
	 * object returned by {@code entry.getProperties()} (or into a fresh
	 * {@link Properties} when that is null).
	 *
	 * @param entry entry to write; may be null
	 * @throws Exception with message "segment content error!" when the
	 *         segmenter returns null, or whatever the writer throws
	 */
	public void write(SegmentEntry entry) throws Exception {
		if (entry == null) {
			return;
		}
		Properties property = entry.getProperties();
		if (property == null) {
			property = new Properties();
		}

		String content = entry.getText();
		if (content == null) {
			return;
		}

		String segmented = "";
		if (content.length() > 0) {
			segmented = WordsSegment.segment(content, ' ');
			if (segmented == null) {
				throw new Exception("segment content error!");
			}
		}
		ParseText text = new ParseText(segmented);

		// The digest is taken over the original text, not the segmented form.
		MD5Hash md5Hash = MD5Hash.digest(content);
		Outlink[] outlink = new Outlink[0];

		// NOTE(review): Properties.put rejects null values, so an entry whose
		// getUrl() returns null fails here with NullPointerException.
		property.put("url", entry.getUrl());
		property.put("digest", md5Hash.toString());
		ParseData data = new ParseData(entry.getTitle(), outlink, property);

		// The writer is opened only once there is something to append.
		// NOTE(review): the meaning of the three boolean flags is defined by
		// SegmentWriter and is not visible here.
		if (writer == null) {
			writer = new SegmentWriter(ndfs, segmentName, true, true, true);
		}
		writer.append(text, data);
	}

	/**
	 * @return the last path component of the segment directory
	 */
	public String getSegmentName() {
		return segmentName.getName();
	}

	/**
	 * Closes the underlying writer if one was opened; otherwise does nothing.
	 *
	 * @throws Exception wrapping any failure from the writer; the original
	 *         exception is attached as the cause so its stack trace is kept
	 */
	public void close() throws Exception {
		if (writer == null) {
			return;
		}
		try {
			writer.close();
		} catch (Exception e) {
			throw new Exception("Close Segment Writer Error!" + e.toString(), e);
		}
	}

	/**
	 * Manual smoke test: creates a segment under {@code "test"} on the local
	 * file system and writes a single entry to it.
	 */
	public static void main(String[] args) {
		NewSegment ns = null;
		try {
			NutchFileSystem nfs = new LocalFileSystem();
			ns = new NewSegment(nfs, new File("test"));
			SegmentEntry se = new SegmentEntry("title:标题", "", "");
			se.putProperty("key", "value");
			ns.write(se);
		} catch (Exception e) {
			System.out.println("Error" + e.toString());
		} finally {
			// Close even when write() failed so the writer is not leaked.
			if (ns != null) {
				try {
					ns.close();
				} catch (Exception e) {
					System.out.println("Error" + e.toString());
				}
			}
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -