📄 scoringfilter.java

📁 nutch0.8源码
💻 JAVA
字号:
/** * Copyright 2006 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.nutch.scoring;import java.util.List;import org.apache.hadoop.conf.Configurable;import org.apache.hadoop.io.UTF8;import org.apache.lucene.document.Document;import org.apache.nutch.crawl.CrawlDatum;import org.apache.nutch.crawl.Inlinks;import org.apache.nutch.parse.Parse;import org.apache.nutch.parse.ParseData;import org.apache.nutch.plugin.Pluggable;import org.apache.nutch.protocol.Content;/** * A contract defining behavior of scoring plugins. *  * A scoring filter will manipulate scoring variables in CrawlDatum and * in resulting search indexes. Filters can be chained in a specific order, * to provide multi-stage scoring adjustments. *  * @author Andrzej Bialecki */public interface ScoringFilter extends Configurable, Pluggable {  /** The name of the extension point. */  public final static String X_POINT_ID = ScoringFilter.class.getName();    /**   * Set an initial score for newly injected pages. Note: newly injected pages   * may have no inlinks, so filter implementations may wish to set this    * score to a non-zero value, to give newly injected pages some initial   * credit.   * @param url url of the page   * @param datum new datum. Filters will modify it in-place.   * @throws ScoringFilterException   */  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;    /**   * Set an initial score for newly discovered pages. Note: newly discovered pages   * have at least one inlink with its score contribution, so filter implementations   * may choose to set initial score to zero (unknown value), and then the inlink   * score contribution will set the "real" value of the new page.   * @param url url of the page   * @param datum new datum. Filters will modify it in-place.   * @throws ScoringFilterException   */  public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;    /**   * This method prepares a sort value for the purpose of sorting and   * selecting top N scoring pages during fetchlist generation.   * @param url url of the page   * @param datum page's datum, should not be modified   * @param initSort initial sort value, or a value from previous filters in chain   */  public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException;    /**   * This method takes all relevant score information from the current datum   * (coming from a generated fetchlist) and stores it into   * {@link org.apache.nutch.protocol.Content} metadata.   * This is needed in order to pass this value(s) to the mechanism that distributes it   * to outlinked pages.   * @param url url of the page   * @param datum source datum. NOTE: modifications to this value are not persisted.   * @param content instance of content. Implementations may modify this   * in-place, primarily by setting some metadata properties.   */  public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) throws ScoringFilterException;    /**   * Currently a part of score distribution is performed using only data coming   * from the parsing process. We need this method in order to ensure the   * presence of score data in these steps.   * @param url page url   * @param content original content. NOTE: modifications to this value are not persisted.   * @param parse target instance to copy the score information to. Implementations   * may modify this in-place, primarily by setting some metadata properties.   */  public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) throws ScoringFilterException;    /**   * Distribute score value from the current page to all its outlinked pages.   * @param fromUrl url of the source page   * @param toUrl url of the target page   * @param parseData ParseData instance, which stores relevant score value(s)   * in its metadata. NOTE: filters may modify this in-place, all changes will   * be persisted.   * @param target target CrawlDatum. NOTE: filters can modify this in-place,   * all changes will be persisted.   * @param adjust a CrawlDatum instance, initially null, which implementations   * may use to pass adjustment values to the original CrawlDatum. When creating   * this instance, set its status to {@link CrawlDatum#STATUS_LINKED}.   * @param allCount number of all collected outlinks from the source page   * @param validCount number of valid outlinks from the source page, i.e.   * outlinks that are acceppted by current URLNormalizers and URLFilters.   * @return if needed, implementations may return an instance of CrawlDatum,   * with status {@link CrawlDatum#STATUS_LINKED}, which contains adjustments   * to be applied to the original CrawlDatum score(s) and metadata. This can   * be null if not needed.   * @throws ScoringFilterException   */  public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl,          ParseData parseData, CrawlDatum target, CrawlDatum adjust,          int allCount, int validCount) throws ScoringFilterException;  /**   * This method calculates a new score of CrawlDatum during CrawlDb update, based on the   * initial value of the original CrawlDatum, and also score values contributed by   * inlinked pages.   * @param url url of the page   * @param old original datum, with original score. May be null if this is a newly   * discovered page. If not null, filters should use score values from this parameter   * as the starting values - the {@param datum} parameter may contain values that are   * no longer valid, if other updates occured between generation and this update.   * @param datum the new datum, with the original score saved at the time when   * fetchlist was generated. Filters should update this in-place, and it will be saved in   * the crawldb.   * @param inlinked (partial) list of CrawlDatum-s (with their scores) from   * links pointing to this page, found in the current update batch.   * @throws ScoringFilterException   */  public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException;    /**   * This method calculates a Lucene document boost.   * @param url url of the page   * @param doc Lucene document. NOTE: this already contains all information collected   * by indexing filters. Implementations may modify this instance, in order to store/remove   * some information.   * @param dbDatum current page from CrawlDb. NOTE: changes made to this instance   * are not persisted.   * @param fetchDatum datum from FetcherOutput (containing among others the fetching status)   * @param parse parsing result. NOTE: changes made to this instance are not persisted.   * @param inlinks current inlinks from LinkDb. NOTE: changes made to this instance are   * not persisted.   * @param initScore initial boost value for the Lucene document.   * @return boost value for the Lucene document. This value is passed as an argument   * to the next scoring filter in chain. NOTE: implementations may also express   * other scoring strategies by modifying Lucene document directly.   * @throws ScoringFilterException   */  public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum,          CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -