⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 testsequencefileinputfilter.java

📁 hadoop:Nutch集群平台
💻 JAVA
字号:
/** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.hadoop.mapred;import java.io.*;import java.util.*;import junit.framework.TestCase;import org.apache.commons.logging.*;import org.apache.hadoop.fs.*;import org.apache.hadoop.io.*;import org.apache.hadoop.conf.*;public class TestSequenceFileInputFilter extends TestCase {  private static final Log LOG = InputFormatBase.LOG;  private static final int MAX_LENGTH = 15000;  private static final Configuration conf = new Configuration();  private static final JobConf job = new JobConf(conf);  private static final FileSystem fs;  private static final Path inDir = new Path(System.getProperty("test.build.data",".") + "/mapred");  private static final Path inFile = new Path(inDir, "test.seq");  private static final Random random = new Random(1);  private static final Reporter reporter = new Reporter() {      public void setStatus(String status) throws IOException {}      public void progress() throws IOException {}  };    static {      job.setInputPath(inDir);      try {        fs = FileSystem.getNamed( "local", conf);    } catch (IOException e) {        e.printStackTrace();        throw new RuntimeException(e);    }  }  private static void createSequenceFile(int numRecords) throws Exception {      // create a file with length entries      SequenceFile.Writer writer =          new SequenceFile.Writer(fs, inFile,                  Text.class, BytesWritable.class);      try {          for (int i = 1; i <= numRecords; i++) {              Text key = new Text(Integer.toString(i));              byte[] data = new byte[random.nextInt(10)];              random.nextBytes(data);              BytesWritable value = new BytesWritable(data);              writer.append(key, value);          }      } finally {          writer.close();      }  }  private int countRecords(int numSplits) throws IOException {      InputFormat format = new SequenceFileInputFilter();      Text key = new Text();      BytesWritable value = new BytesWritable();      if(numSplits==0) {        numSplits =            random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1;      }      FileSplit[] splits = format.getSplits(fs, job, numSplits);            // check each split      int count = 0;      for (int j = 0; j < splits.length; j++) {          RecordReader reader =              format.getRecordReader(fs, splits[j], job, reporter);          try {              while (reader.next(key, value)) {                  LOG.info("Accept record "+key.toString());                  count++;              }          } finally {              reader.close();          }      }      return count;  }    public void testRegexFilter() throws Exception {    // set the filter class    LOG.info("Testing Regex Filter with patter: \\A10*");    SequenceFileInputFilter.setFilterClass(job,             SequenceFileInputFilter.RegexFilter.class);    SequenceFileInputFilter.RegexFilter.setPattern(job, "\\A10*");        // clean input dir    fs.delete(inDir);      // for a variety of lengths    for (int length = 1; length < MAX_LENGTH;               length+= random.nextInt(MAX_LENGTH/10)+1) {        LOG.info("******Number of records: "+length);        createSequenceFile(length);        int count = countRecords(0);        assertEquals(count, length==0?0:(int)Math.log10(length)+1);    }        // clean up    fs.delete(inDir);  }  public void testPercentFilter() throws Exception {      LOG.info("Testing Percent Filter with frequency: 1000");      // set the filter class      SequenceFileInputFilter.setFilterClass(job,               SequenceFileInputFilter.PercentFilter.class);      SequenceFileInputFilter.PercentFilter.setFrequency(job, 1000);            // clean input dir      fs.delete(inDir);          // for a variety of lengths      for (int length = 0; length < MAX_LENGTH;                 length+= random.nextInt(MAX_LENGTH/10)+1) {          LOG.info("******Number of records: "+length);          createSequenceFile(length);          int count = countRecords(1);          LOG.info("Accepted "+count+" records");          int expectedCount = length/1000;          if(expectedCount*1000!=length)              expectedCount++;          assertEquals(count, expectedCount);      }            // clean up      fs.delete(inDir);  }    public void testMD5Filter() throws Exception {      // set the filter class      LOG.info("Testing MD5 Filter with frequency: 1000");      SequenceFileInputFilter.setFilterClass(job,               SequenceFileInputFilter.MD5Filter.class);      SequenceFileInputFilter.MD5Filter.setFrequency(job, 1000);            // clean input dir      fs.delete(inDir);          // for a variety of lengths      for (int length = 0; length < MAX_LENGTH;                 length+= random.nextInt(MAX_LENGTH/10)+1) {          LOG.info("******Number of records: "+length);          createSequenceFile(length);          LOG.info("Accepted "+countRecords(0)+" records");      }      // clean up      fs.delete(inDir);    }  public static void main(String[] args) throws Exception {    TestSequenceFileInputFilter filter = new TestSequenceFileInputFilter();    filter.testRegexFilter();  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -