⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 basicindexer.java

📁 dragontoolkit用于机器学习
💻 JAVA
字号:
package dragon.ir.index;

import dragon.nlp.extract.*;
import dragon.onlinedb.*;
import java.util.ArrayList;

/**
 * <p>BasicIndexer, providing all basic functions for indexing, can be used to index documents directly </p>
 * <p> </p>
 * <p>Copyright: Copyright (c) 2005</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public class BasicIndexer extends AbstractIndexer {
    private BasicIndexWriteController charWriter, cptWriter;
    private TripleExtractor te;
    private ConceptExtractor ce;
    private int minArticleSize;
    private boolean[] remainingSections;

    public BasicIndexer(TripleExtractor te, boolean useConcept, String indexFolder) {
        super(true);
        this.te = te;
        remainingSections=new boolean[4];
        minArticleSize=5;
        if(useConcept)
            cptWriter = new BasicIndexWriteController(indexFolder, relationSupported, true);
        else
            charWriter = new BasicIndexWriteController(indexFolder, relationSupported, false);
    }

    public BasicIndexer(TripleExtractor te, String charIndexFolder, String cptIndexFolder) {
        super(true);
        this.te = te;
        remainingSections=new boolean[4];
        minArticleSize=5;
        if(cptIndexFolder!=null)
            cptWriter = new BasicIndexWriteController(cptIndexFolder, relationSupported, true);
        if(charIndexFolder!=null)
            charWriter = new BasicIndexWriteController(charIndexFolder, relationSupported, false);
    }

    public BasicIndexer(ConceptExtractor ce, boolean useConcept, String indexFolder) {
        super(false);
        this.ce = ce;
        remainingSections=new boolean[4];
        if(useConcept)
            cptWriter = new BasicIndexWriteController(indexFolder, relationSupported, true);
        else
            charWriter = new BasicIndexWriteController(indexFolder, relationSupported, false);
    }

    public BasicIndexer(ConceptExtractor ce, String charIndexFolder, String cptIndexFolder) {
        super(false);
        this.ce = ce;
        remainingSections=new boolean[4];
        if(cptIndexFolder!=null)
            cptWriter = new BasicIndexWriteController(cptIndexFolder, relationSupported, true);
        if(charIndexFolder!=null)
            charWriter = new BasicIndexWriteController(charIndexFolder, relationSupported, false);
    }

    public void close() {
        initialized=false;
        if (charWriter != null) {
            charWriter.close();
            charWriter = null;
        }
        if (cptWriter != null) {
            cptWriter.close();
            cptWriter = null;
        }
    }

    public boolean indexed(String docKey) {
        if (charWriter != null) {
            return charWriter.indexed(docKey);
        }
        else if (cptWriter != null) {
            return cptWriter.indexed(docKey);
        }
        else {
            return true;
        }
    }

    public void setMinArticleSize(int minSize){
        this.minArticleSize=minSize;
    }

    public void setSectionIndexOption(boolean all, boolean title, boolean abt, boolean body, boolean meta) {
        if (all)
            addSection(new IRSection(IRSection.SEC_ALL));

        remainingSections[IRSection.SEC_TITLE-1]=!title;
        if (title)
            addSection(new IRSection(IRSection.SEC_TITLE));

        remainingSections[IRSection.SEC_ABSTRACT-1]=!abt;
        if (abt)
            addSection(new IRSection(IRSection.SEC_ABSTRACT));

        remainingSections[IRSection.SEC_BODY-1]=!body;
        if (body)
            addSection(new IRSection(IRSection.SEC_BODY));

        remainingSections[IRSection.SEC_META-1]=!meta;
        if (meta)
            addSection(new IRSection(IRSection.SEC_META));
    }

    protected void initDocIndexing(){
        if(te!=null)
            te.initDocExtraction();
        if(ce!=null)
            ce.initDocExtraction();
    }

    protected boolean extract(String content, ArrayList conceptList, ArrayList relationList) {
        boolean ret;

        try{
            if (content == null || content.length() <minArticleSize) {
                return true;
            }
            ret = te.extractFromDoc(content);
            if (ret) {
                if (te.getConceptList() != null) {
                    conceptList.addAll(te.getConceptList());
                }
                if (te.getTripleList() != null) {
                    relationList.addAll(te.getTripleList());
                }
            }
            return ret;
        }
        catch(Exception e){
            e.printStackTrace();
            return false;
        }
    }

    protected boolean extract(String content, ArrayList conceptList) {
        try{
            if (content == null || content.length() <minArticleSize) {
                return true;
            }
            if (ce.extractFromDoc(content) != null) {
                conceptList.addAll(ce.getConceptList());
                return true;
            }
            else
                return false;
        }
        catch(Exception e){
            e.printStackTrace();
            return false;
        }
    }

    protected String getSection(Article paper, int sectionID) {
        switch (sectionID) {
            case IRSection.SEC_TITLE:
                return paper.getTitle();
            case IRSection.SEC_ABSTRACT:
                return paper.getAbstract();
            case IRSection.SEC_BODY:
                return paper.getBody();
            case IRSection.SEC_META:
                return paper.getMeta();
        }
        return null;
    }

    protected String getRemainingSections(Article paper) {
        StringBuffer sb;
        String section;
        int i;

        sb = new StringBuffer();
        for (i =0; i < remainingSections.length; i++) {
            if (remainingSections[i] && (section = getSection(paper, i+1)) != null && section.length()>=minArticleSize) {
                if (sb.length() > 0) {
                    sb.append("\n\n");
                }
                sb.append(section);
            }
        }
        return sb.toString();
    }

    protected void write(int sectionID, ArrayList conceptList){
        if(charWriter!=null)
            charWriter.write(sectionID,conceptList);
        if(cptWriter!=null)
            cptWriter.write(sectionID,conceptList);
    }

    protected void write(int sectionID, ArrayList conceptList, ArrayList relationList){
        if(charWriter!=null)
            charWriter.write(sectionID,conceptList, relationList);
        if(cptWriter!=null)
            cptWriter.write(sectionID,conceptList, relationList);
    }

    protected void initIndex(){
        if(charWriter!=null)
            charWriter.initialize();
        if(cptWriter!=null)
            cptWriter.initialize();
    }

    protected void initSectionWrite(IRSection section){
        if (charWriter != null)
            charWriter.addSection(section.copy());
        if (cptWriter != null)
            cptWriter.addSection(section.copy());
    }

    protected boolean setDoc(String docKey){
        boolean ret;

        ret=true;
        if(charWriter!=null)
            ret=charWriter.setDoc(docKey);
        if(ret && cptWriter!=null)
            ret=cptWriter.setDoc(docKey);
        return ret;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -