⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filelistiterator.java

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 JAVA
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a>
 */

package edu.umass.cs.mallet.base.pipe.iterator;

import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.types.Label;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.Alphabet;
import edu.umass.cs.mallet.base.util.Strings;
import java.util.ArrayList;
import java.util.Iterator;
import java.net.URI;
import java.util.regex.*;
import java.io.*;

/**
 * An iterator that generates instances for a pipe from a list of filenames.
 * Each file becomes one instance whose data is the {@link File} itself and
 * whose target is derived by applying a user-specified regular expression
 * pattern to the path of the file's parent directory.
 *
 *  @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a>
 */
public class FileListIterator extends AbstractPipeInputIterator
{
  /** Optional filter deciding which of the supplied files are kept; may be null. */
  FileFilter fileFilter;
  /** The accepted files (as {@link File} objects), in the order they were supplied. */
  ArrayList fileArray;
  /** Iterator over {@link #fileArray}; drives nextInstance(), nextFile() and hasNext(). */
  Iterator subIterator;
  /** Set target slot to string coming from 1st group of this Pattern. */
  Pattern targetPattern;
  /** Number of leading characters dropped from the parent path when
   *  {@link #STARTING_DIRECTORIES} is used; 0 when no prefix is removed. */
  int commonPrefixIndex;

  // xxx Note that these are specific to UNIX directory delimiter characters!  Fix this.

  /** Special value (compared by identity, never used as a regex) meaning:
   * use as label names the directories of the given files,
   * optionally removing common prefix of all starting directories.
   */
  public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_");
  /** Use as label names the first directory in the filename. */
  public static final Pattern FIRST_DIRECTORY = Pattern.compile ("/?([^/]*)/.+");
  /** Use as label name the last directory in the filename. */
  public static final Pattern LAST_DIRECTORY = Pattern.compile(".*/([^/]+)/[^/]+"); // was ("([^/]*)/[^/]+");
  /** Use as label names all the directory names in the filename. */
  public static final Pattern ALL_DIRECTORIES = Pattern.compile ("^(.*)/[^/]+");

  /* Pass null as targetPattern to get null targets */

  /**
   * Construct an iterator over the given array of Files.
   *
   * The instances constructed from the files are returned in the same order
   * as they appear in the given array.
   *
   * @param files  Array of files from which to construct instances
   * @param fileFilter   class implementing interface FileFilter that will decide which names to accept.
   *                     May be null, in which case every file is accepted.
   * @param targetPattern  regex Pattern whose first parenthesized group on matching is taken
   *                       to be the target value of the generated instance.  The pattern is
   *                       applied with the matcher.find() method to the path of the file's
   *                       parent directory (see {@link #nextInstance()}).  May be null,
   *                       in which case all targets are null.
   * @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES
   *                           pattern, removing the common prefix of all initially specified
   *                           paths, leaving the remainder of each parent path as the target value.
   * @throws IllegalArgumentException if any entry is a directory or does not exist
   */
  public FileListIterator(File[] files, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix)
  {
    this.fileFilter = fileFilter;
    this.fileArray = new ArrayList();
    this.targetPattern = targetPattern;
    fillFileArrayAssignCommonPrefixIndexAndSubIterator(files, removeCommonPrefix);
  }

  /**
   * Same as the File[] constructor, with the files given by name.
   * The names are converted with FileIterator.stringArray2FileArray.
   */
  public FileListIterator(String[] filenames, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix)
  {
    this(FileIterator.stringArray2FileArray(filenames), fileFilter,
         targetPattern, removeCommonPrefix);
  }

  /**
   * Construct a FileListIterator with the file containing the list of files, which
   * contains one filename per line.
   *
   * The instances constructed from the filelist are returned in the same order
   * as listed.  Each line is trimmed; reading stops at end of file or at the
   * first line that is empty after trimming, so anything listed after a blank
   * line is ignored.
   *
   * @throws FileNotFoundException if the list file cannot be opened
   * @throws IOException if reading the list file fails
   * @throws IllegalArgumentException if any listed entry is a directory or does not exist
   */
  public FileListIterator(File filelist, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException
  {
    this.fileFilter = fileFilter;
    this.fileArray = new ArrayList();
    this.targetPattern = targetPattern;

    ArrayList filenames = new ArrayList();
    BufferedReader reader = new BufferedReader(new FileReader(filelist));
    // try/finally so the reader is released even when readLine() fails part-way.
    try {
      String filename = reader.readLine();
      while (filename != null && filename.trim().length() > 0) {
        filenames.add(filename.trim());
        filename = reader.readLine();
      }
    }
    finally {
      reader.close();
    }

    // convert list of filenames to array of files
    File[] fa = new File[filenames.size()];
    for (int i = 0; i < fa.length; i++)
      fa[i] = new File((String) filenames.get(i));

    fillFileArrayAssignCommonPrefixIndexAndSubIterator(fa, removeCommonPrefix);
  }

  /** Same as the File-list constructor, with the list file given by name. */
  public FileListIterator(String filelistName, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException
  {
    this (new File(filelistName), fileFilter, targetPattern, removeCommonPrefix);
  }

  /**
   * Same as the File-list constructor with no file filter and with
   * removeCommonPrefix set to true.
   */
  public FileListIterator(String filelistName, Pattern targetPattern) throws FileNotFoundException, IOException
  {
    this (new File(filelistName), null, targetPattern, true);
  }

  // The PipeInputIterator interface

  /**
   * Advance to the next file and wrap it in an Instance.
   *
   * The target name is derived from the file's parent directory path
   * (File.getParent()), not from the full file path:
   * with STARTING_DIRECTORIES it is the parent path with the first
   * commonPrefixIndex characters removed; with any other non-null pattern
   * it is group 1 of the first find() match, or null if nothing matches.
   * A file with no parent directory gets a null target.
   *
   * NOTE(review): the descriptions of the built-in patterns speak of the
   * "filename" while the string actually matched is the parent path; confirm
   * that the resulting labels are the intended ones.
   *
   * @return an Instance whose data is the File, whose target is the derived
   *         name (possibly null) and whose name is the file's URI
   */
  public Instance nextInstance ()
  {
    File nextFile = (File) subIterator.next();
    String targetName = null;
    // getParent() is null for a path without a directory component; there is
    // then nothing to derive a target from (this used to be a NullPointerException).
    String path = nextFile.getParent();
    if (path != null) {
      if (targetPattern == STARTING_DIRECTORIES) {
        // commonPrefixIndex was computed over the full file paths, which are
        // longer than the parent path used here, so it can exceed this
        // string's length (e.g. when all files share one directory).  Clamp it
        // rather than let substring() throw.
        int start = Math.min(commonPrefixIndex, path.length());
        targetName = path.substring(start);
      }
      else if (targetPattern != null) {
        Matcher m = targetPattern.matcher(path);
        if (m.find ())
          targetName = m.group (1);
      }
    }
    return new Instance (nextFile, targetName, nextFile.toURI(), null);
  }

  /**
   * Advance to the next file and return it without building an Instance.
   * This consumes the same underlying iterator as nextInstance().
   */
  public File nextFile ()
  {
    return (File) subIterator.next();
  }

  /** @return true if there is at least one more file to iterate over */
  public boolean hasNext ()
  {
    return subIterator.hasNext();
  }

  /** @return the internal list of accepted files (the live list, not a copy) */
  public ArrayList getFileArray()
  {
    return fileArray;
  }

  /**
   * Validate the given files, store the accepted ones in fileArray, create
   * subIterator over them and compute commonPrefixIndex.
   *
   * @param files candidate files, processed in array order
   * @param removeCommonPrefix if true, commonPrefixIndex is obtained from
   *        Strings.commonPrefixIndex over the paths of the accepted files;
   *        otherwise it is 0
   * @throws IllegalArgumentException if an entry is a directory or does not exist
   */
  private void fillFileArrayAssignCommonPrefixIndexAndSubIterator(File[] files, boolean removeCommonPrefix)
  {
    ArrayList filenames = new ArrayList();
    for (int i = 0; i < files.length; i++) {
      File candidate = files[i];
      if (candidate.isDirectory())
        throw new IllegalArgumentException(candidate + " is not a file.");
      else if (! candidate.exists())
        throw new IllegalArgumentException(candidate + " does not exist.");

      if (this.fileFilter == null || this.fileFilter.accept(candidate)) {
        this.fileArray.add(candidate);
        if (removeCommonPrefix)
          filenames.add(candidate.getPath());
      }
    }

    this.subIterator = this.fileArray.iterator();

    // Only consult the helper when there is at least one path: its behaviour
    // on an empty array is not visible from this file, and with no files the
    // index is never used anyway.
    if (removeCommonPrefix && !filenames.isEmpty()) {
      String[] fn = (String[]) filenames.toArray(new String[filenames.size()]);
      this.commonPrefixIndex = Strings.commonPrefixIndex(fn);
    }
    else
      this.commonPrefixIndex = 0;
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -