📄 documentfactory.java
字号:
package it.unimi.dsi.mg4j.document;

/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.lang.FlyweightPrototype;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;

/** A factory that parses raw content and builds documents, all of the same type.
 *
 * <p>Every document produced by a given factory is made of a number of <em>fields</em>,
 * that is, units of information that should be indexed separately. The number of fields
 * is returned by {@link #numberOfFields()}, the type of each field by {@link #fieldType(int)},
 * and the symbolic name of each field by {@link #fieldName(int)}.
 *
 * <p>The parsing logic and the document-level breaking logic live in the factory. For instance,
 * a factory for HTML documents might split the text into a title and a body and expose
 * both as {@link FieldType#TEXT} fields; the last modification date might additionally be
 * exposed as a {@link FieldType#DATE} field, and so on.
 *
 * <p><strong>Warning</strong>: implementations of this interface are not required
 * to be thread-safe, but they provide {@link FlyweightPrototype flyweight copies}.
 * The {@link #copy()} method is strengthened so as to return an instance of this interface.
 */
public interface DocumentFactory extends Serializable, FlyweightPrototype<DocumentFactory> {

    /** The type of a field.
     *
     * <p>A type nested in an interface is implicitly <code>public</code> and
     * <code>static</code>, so no modifiers are spelled out here.
     */
    enum FieldType {
        /** The most basic type: indexable text. */
        TEXT,
        /** A virtual field: an {@link ObjectList} of {@link VirtualDocumentFragment}s. */
        VIRTUAL,
        /** An integer (experimental). */
        INT,
        /** A date (experimental). */
        DATE
    }

    /** Returns the number of fields present in the documents produced by this factory.
     *
     * @return the number of fields present in the documents produced by this factory.
     */
    int numberOfFields();

    /** Returns the symbolic name of a field.
     *
     * @param field the index of a field (between 0 inclusive and {@link #numberOfFields()} exclusive).
     * @return the symbolic name of the <code>field</code>-th field.
     */
    String fieldName( int field );

    /** Returns the index of a field, given its symbolic name.
     *
     * @param fieldName the name of a field of this factory.
     * @return the corresponding index, or -1 if there is no field with name <code>fieldName</code>.
     */
    int fieldIndex( String fieldName );

    /** Returns the type of a field.
     *
     * <p>The possible types are defined in {@link FieldType}.
     *
     * @param field the index of a field (between 0 inclusive and {@link #numberOfFields()} exclusive).
     * @return the type of the <code>field</code>-th field.
     */
    FieldType fieldType( int field );

    /** Returns the document obtained by parsing the given byte stream.
     *
     * <p>The <code>metadata</code> parameter makes up for the lack of a simple keyword-based
     * parameter-passing system in Java. Through it, this method may receive several different
     * kinds of &ldquo;suggestions&rdquo; gathered by the collection: typically the document title,
     * a URI representing the document, its MIME type, its encoding and so on. Some of this
     * information might be set by default (as it happens, for instance, in a
     * {@link PropertyBasedDocumentFactory}). Implementations of this method must consult the
     * metadata provided by the collection, possibly complete them with default factory
     * metadata, and then proceed to build the document.
     *
     * @param rawContent the raw content from which the document should be extracted; it must not be closed, as
     * resource management is a responsibility of the {@linkplain DocumentCollection}.
     * @param metadata a map from enums (e.g., keys taken in {@link PropertyBasedDocumentFactory}) to various kinds of objects.
     * @return the document obtained by parsing the given byte stream.
     * @throws IOException if an I/O error occurs (presumably while reading <code>rawContent</code>).
     */
    Document getDocument( InputStream rawContent, Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException;

    /** Returns a flyweight copy of this factory.
     *
     * <p>Redeclared from {@link FlyweightPrototype} so that the returned object
     * is typed as a {@link DocumentFactory}.
     *
     * @return a copy of this factory.
     */
    DocumentFactory copy();
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -