⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawldatum.java

📁 nutch0.8源码
💻 JAVA
字号:
/** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.nutch.crawl;import java.io.*;import java.util.*;import org.apache.hadoop.io.*;import org.apache.nutch.util.*;/* The crawl state of a url. */public class CrawlDatum implements WritableComparable, Cloneable {  public static final String DB_DIR_NAME = "current";  public static final String GENERATE_DIR_NAME = "crawl_generate";  public static final String FETCH_DIR_NAME = "crawl_fetch";  public static final String PARSE_DIR_NAME = "crawl_parse";  private final static byte CUR_VERSION = 4;  public static final byte STATUS_SIGNATURE = 0;  public static final byte STATUS_DB_UNFETCHED = 1;  public static final byte STATUS_DB_FETCHED = 2;  public static final byte STATUS_DB_GONE = 3;  public static final byte STATUS_LINKED = 4;  public static final byte STATUS_FETCH_SUCCESS = 5;  public static final byte STATUS_FETCH_RETRY = 6;  public static final byte STATUS_FETCH_GONE = 7;    public static final String[] statNames = {    "signature",    "DB_unfetched",    "DB_fetched",    "DB_gone",    "linked",    "fetch_success",    "fetch_retry",    "fetch_gone"  };  private static final float MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000;  private byte status;  private long fetchTime = System.currentTimeMillis();  private byte retries;  private float fetchInterval;  private float score = 1.0f;  private byte[] signature = null;  private long modifiedTime;  private MapWritable metaData;  public CrawlDatum() {}  public CrawlDatum(int status, float fetchInterval) {    this.status = (byte)status;    this.fetchInterval = fetchInterval;  }  public CrawlDatum(int status, float fetchInterval, float score) {    this(status, fetchInterval);    this.score = score;  }  //  // accessor methods  //  public byte getStatus() { return status; }  public void setStatus(int status) { this.status = (byte)status; }  public long getFetchTime() { return fetchTime; }  public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }  public void setNextFetchTime() {    fetchTime += (long)(MILLISECONDS_PER_DAY*fetchInterval);  }  public long getModifiedTime() {    return modifiedTime;  }  public void setModifiedTime(long modifiedTime) {    this.modifiedTime = modifiedTime;  }    public byte getRetriesSinceFetch() { return retries; }  public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}  public float getFetchInterval() { return fetchInterval; }  public void setFetchInterval(float fetchInterval) {    this.fetchInterval = fetchInterval;  }  public float getScore() { return score; }  public void setScore(float score) { this.score = score; }  public byte[] getSignature() {    return signature;  }  public void setSignature(byte[] signature) {    if (signature != null && signature.length > 256)      throw new RuntimeException("Max signature length (256) exceeded: " + signature.length);    this.signature = signature;  }     public void setMetaData(MapWritable mapWritable) {this.metaData = mapWritable; }  /**   * returns a MapWritable if it was set or read in @see readFields(DataInput),    * returns empty map in case CrawlDatum was freshly created (lazily instantiated).   */  public MapWritable getMetaData() {    if (this.metaData == null) this.metaData = new MapWritable();    return this.metaData;  }    //  // writable methods  //  public static CrawlDatum read(DataInput in) throws IOException {    CrawlDatum result = new CrawlDatum();    result.readFields(in);    return result;  }  public void readFields(DataInput in) throws IOException {    byte version = in.readByte();                 // read version    if (version > CUR_VERSION)                   // check version      throw new VersionMismatchException(CUR_VERSION, version);    status = in.readByte();    fetchTime = in.readLong();    retries = in.readByte();    fetchInterval = in.readFloat();    score = in.readFloat();    if (version > 2) {      modifiedTime = in.readLong();      int cnt = in.readByte();      if (cnt > 0) {        signature = new byte[cnt];        in.readFully(signature);      } else signature = null;    }    if (version > 3) {      if (in.readBoolean()) {        if (metaData == null) {          metaData = new MapWritable();         } else {           metaData.clear();        }        metaData.readFields(in);      } else {        if (metaData != null) {          metaData.clear(); // at least clear old meta data        }      }    }  }  /** The number of bytes into a CrawlDatum that the score is stored. */  private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;  private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;  public void write(DataOutput out) throws IOException {    out.writeByte(CUR_VERSION);                   // store current version    out.writeByte(status);    out.writeLong(fetchTime);    out.writeByte(retries);    out.writeFloat(fetchInterval);    out.writeFloat(score);    out.writeLong(modifiedTime);    if (signature == null) {      out.writeByte(0);    } else {      out.writeByte(signature.length);      out.write(signature);    }    if (metaData != null && metaData.size() > 0) {      out.writeBoolean(true);      metaData.write(out);    } else {      out.writeBoolean(false);    }  }  /** Copy the contents of another instance into this instance. */  public void set(CrawlDatum that) {    this.status = that.status;    this.fetchTime = that.fetchTime;    this.retries = that.retries;    this.fetchInterval = that.fetchInterval;    this.score = that.score;    this.modifiedTime = that.modifiedTime;    this.signature = that.signature;    this.metaData = new MapWritable(that.metaData); // make a deep copy  }  //  // compare methods  //    /** Sort by decreasing score. */  public int compareTo(Object o) {    CrawlDatum that = (CrawlDatum)o;     if (that.score != this.score)      return (that.score - this.score) > 0 ? 1 : -1;    if (that.status != this.status)      return this.status - that.status;    if (that.fetchTime != this.fetchTime)      return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;    if (that.retries != this.retries)      return that.retries - this.retries;    if (that.fetchInterval != this.fetchInterval)      return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;    if (that.modifiedTime != this.modifiedTime)      return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;    return SignatureComparator._compare(this, that);  }  /** A Comparator optimized for CrawlDatum. */   public static class Comparator extends WritableComparator {    public Comparator() { super(CrawlDatum.class); }    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {      float score1 = readFloat(b1,s1+SCORE_OFFSET);      float score2 = readFloat(b2,s2+SCORE_OFFSET);      if (score2 != score1) {        return (score2 - score1) > 0 ? 1 : -1;      }      int status1 = b1[s1+1];      int status2 = b2[s2+1];      if (status2 != status1)        return status1 - status2;      long fetchTime1 = readLong(b1, s1+1+1);      long fetchTime2 = readLong(b2, s2+1+1);      if (fetchTime2 != fetchTime1)        return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;      int retries1 = b1[s1+1+1+8];      int retries2 = b2[s2+1+1+8];      if (retries2 != retries1)        return retries2 - retries1;      float fetchInterval1 = readFloat(b1, s1+1+1+8+1);      float fetchInterval2 = readFloat(b2, s2+1+1+8+1);      if (fetchInterval2 != fetchInterval1)        return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;      long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);      long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);      if (modifiedTime2 != modifiedTime1)        return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;      int sigl1 = b1[s1+SIG_OFFSET];      int sigl2 = b2[s2+SIG_OFFSET];      return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, SIG_OFFSET, sigl2);    }  }  static {                                        // register this comparator    WritableComparator.define(CrawlDatum.class, new Comparator());  }  //  // basic methods  //  public String toString() {    StringBuffer buf = new StringBuffer();    buf.append("Version: " + CUR_VERSION + "\n");    buf.append("Status: " + getStatus() + " (" + statNames[getStatus()] + ")\n");    buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");    buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");    buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");    buf.append("Retry interval: " + getFetchInterval() + " days\n");    buf.append("Score: " + getScore() + "\n");    buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");    buf.append("Metadata: " + (metaData != null ? metaData.toString() : "null") + "\n");    return buf.toString();  }  public boolean equals(Object o) {    if (!(o instanceof CrawlDatum))      return false;    CrawlDatum other = (CrawlDatum)o;    boolean res =      (this.status == other.status) &&      (this.fetchTime == other.fetchTime) &&      (this.modifiedTime == other.modifiedTime) &&      (this.retries == other.retries) &&      (this.fetchInterval == other.fetchInterval) &&      (SignatureComparator._compare(this.signature, other.signature) == 0) &&      (this.score == other.score);    if (!res) return res;    // allow zero-sized metadata to be equal to null metadata    if (this.metaData == null) {      if (other.metaData != null && other.metaData.size() > 0) return false;      else return true;    } else {      if (other.metaData == null) {        if (this.metaData.size() == 0) return true;        else return false;      } else {        return this.metaData.equals(other.metaData);      }    }  }  public int hashCode() {    int res = 0;    if (signature != null) {      for (int i = 0; i < signature.length / 4; i += 4) {        res ^= (int)(signature[i] << 24 + signature[i+1] << 16 +                signature[i+2] << 8 + signature[i+3]);      }    }    if (metaData != null) res ^= metaData.hashCode();    return      res ^ status ^      ((int)fetchTime) ^      ((int)modifiedTime) ^      retries ^      Float.floatToIntBits(fetchInterval) ^      Float.floatToIntBits(score);  }  public Object clone() {    try {      return super.clone();    } catch (CloneNotSupportedException e) {      throw new RuntimeException(e);    }  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -