📄 trecdocmaker.java
字号:
package org.apache.lucene.benchmark.byTask.feeds;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.BufferedInputStream;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.text.DateFormat;import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.Locale;import java.util.zip.GZIPInputStream;import org.apache.lucene.benchmark.byTask.utils.Config;/** * A DocMaker using the (compressed) Trec collection for its input. */public class TrecDocMaker extends BasicDocMaker { private static final String newline = System.getProperty("line.separator"); private DateFormat dateFormat []; private File dataDir = null; private ArrayList inputFiles = new ArrayList(); private int nextFile = 0; private int iteration=0; private BufferedReader reader; private GZIPInputStream zis; private static final String DATE_FORMATS [] = { "EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT "EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST "EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT "EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT }; /* (non-Javadoc) * @see SimpleDocMaker#setConfig(java.util.Properties) */ public void setConfig(Config config) { super.setConfig(config); String d = config.get("docs.dir","trec"); dataDir = new File(new File("work"),d); collectFiles(dataDir,inputFiles); if (inputFiles.size()==0) { throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); } // date format: 30-MAR-1987 14:22:36.87 dateFormat = new SimpleDateFormat[DATE_FORMATS.length]; for (int i = 0; i < dateFormat.length; i++) { dateFormat[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US); dateFormat[i].setLenient(true); } } private void openNextFile() throws NoMoreDataException, Exception { closeInputs(); int retries = 0; while (true) { File f = null; synchronized (this) { if (nextFile >= inputFiles.size()) { // exhausted files, start a new round, unless forever set to false. if (!forever) { throw new NoMoreDataException(); } nextFile = 0; iteration++; } f = (File) inputFiles.get(nextFile++); } System.out.println("opening: "+f+" length: "+f.length()); try { zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f))); reader = new BufferedReader(new InputStreamReader(zis)); return; } catch (Exception e) { retries++; if (retries<20) { System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries); continue; } else { throw new NoMoreDataException(); } } } } private void closeInputs() { if (zis!=null) { try { zis.close(); } catch (IOException e) { System.out.println("closeInputs(): Ingnoring error: "+e); e.printStackTrace(); } zis = null; } if (reader!=null) { try { reader.close(); } catch (IOException e) { System.out.println("closeInputs(): Ingnoring error: "+e); e.printStackTrace(); } reader = null; } } // read until finding a line that starts with the specified prefix private StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception { sb = (sb==null ? new StringBuffer() : sb); String sep = ""; while (true) { String line = reader.readLine(); if (line==null) { openNextFile(); continue; } if (line.startsWith(prefix)) { if (collectMatchLine) { sb.append(sep+line); sep = newline; } break; } if (collectAll) { sb.append(sep+line); sep = newline; } } //System.out.println("read: "+sb); return sb; } protected DocData getNextDocData() throws NoMoreDataException, Exception { if (reader==null) { openNextFile(); } // 1. skip until doc start read("<DOC>",null,false,false); // 2. name StringBuffer sb = read("<DOCNO>",null,true,false); String name = sb.substring("<DOCNO>".length()); name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration; // 3. skip until doc header read("<DOCHDR>",null,false,false); // 4. date sb = read("Date: ",null,true,false); String dateStr = sb.substring("Date: ".length()); // 5. skip until end of doc header read("</DOCHDR>",null,false,false); // 6. collect until end of doc sb = read("</DOC>",null,false,true); // this is the next document, so parse it Date date = parseDate(dateStr); HTMLParser p = getHtmlParser(); DocData docData = p.parse(name, date, sb, dateFormat[0]); addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). return docData; } private Date parseDate(String dateStr) { Date date = null; for (int i=0; i<dateFormat.length; i++) { try { date = dateFormat[i].parse(dateStr.trim()); return date; } catch (ParseException e) { } } // do not fail test just because a date could not be parsed System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr); date = new Date(); // now return date; } /* * (non-Javadoc) * @see DocMaker#resetIinputs() */ public synchronized void resetInputs() { super.resetInputs(); closeInputs(); nextFile = 0; iteration = 0; } /* * (non-Javadoc) * @see DocMaker#numUniqueTexts() */ public int numUniqueTexts() { return inputFiles.size(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -