📄 filelistiterator.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. *//** @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a> */package edu.umass.cs.mallet.base.pipe.iterator;import edu.umass.cs.mallet.base.pipe.Pipe;import edu.umass.cs.mallet.base.types.Label;import edu.umass.cs.mallet.base.types.Instance;import edu.umass.cs.mallet.base.types.Alphabet;import edu.umass.cs.mallet.base.util.Strings;import java.util.ArrayList;import java.util.Iterator;import java.net.URI;import java.util.regex.*;import java.io.*;/** * An iterator that generates instances for a pipe from a list of filenames. * Each file is treated as a text file whose target is determined by * a user-specified regular expression pattern applied to the filename * * @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a> */public class FileListIterator extends AbstractPipeInputIterator{ FileFilter fileFilter; ArrayList fileArray; Iterator subIterator; Pattern targetPattern; // Set target slot to string coming from 1st group of this Pattern int commonPrefixIndex; /** Special value that means to use the directories[i].getPath() as the target name */ // xxx Note that these are specific to UNIX directory delimiter characters! Fix this. /** Use as label names the directories of the given files, * optionally removing common prefix of all starting directories */ public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_"); /** Use as label names the first directory in the filename. */ public static final Pattern FIRST_DIRECTORY = Pattern.compile ("/?([^/]*)/.+"); /** Use as label name the last directory in the filename. */ public static final Pattern LAST_DIRECTORY = Pattern.compile(".*/([^/]+)/[^/]+"); // was ("([^/]*)/[^/]+"); /** Use as label names all the directory names in the filename. */ public static final Pattern ALL_DIRECTORIES = Pattern.compile ("^(.*)/[^/]+"); /* Pass null as targetPattern to get null targets */ /** * Construct an iterator over the given arry of Files * * The instances constructed from the files are returned in the same order * as they appear in the given array * * @param files Array of files from which to construct instances * @param fileFilter class implementing interface FileFilter that will decide which names to accept. * May be null. * @param targetPattern regex Pattern applied to the filename whose first parenthesized group * on matching is taken to be the target value of the generated instance. * The pattern is applied to the filename with the matcher.find() method. * @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES * pattern, removing the common prefix of all initially specified * directories, leaving the remainder of each filename as the target value. * */ public FileListIterator(File[] files, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) { this.fileFilter = fileFilter; this.fileArray = new ArrayList(); this.targetPattern = targetPattern; fillFileArrayAssignCommonPrefixIndexAndSubIterator(files, removeCommonPrefix); } public FileListIterator(String[] filenames, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) { this(FileIterator.stringArray2FileArray(filenames), fileFilter, targetPattern, removeCommonPrefix); } /** * Construct a FileListIterator with the file containing the list of files, which * contains one filename per line. * * The instances constructed from the filelist are returned in the same order * as listed */ public FileListIterator(File filelist, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException { this.fileFilter = fileFilter; this.fileArray = new ArrayList(); this.targetPattern = targetPattern; ArrayList filenames = new ArrayList(); BufferedReader reader = new BufferedReader(new FileReader(filelist)); String filename = reader.readLine(); while (filename != null && filename.trim().length() > 0) { filenames.add(filename.trim()); filename = reader.readLine(); } reader.close(); // convert list of filenames to array of files File[] fa = new File[filenames.size()]; for (int i = 0; i < filenames.size(); i++) fa[i] = new File((String) filenames.get(i)); fillFileArrayAssignCommonPrefixIndexAndSubIterator(fa, removeCommonPrefix); } public FileListIterator(String filelistName, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException { this (new File(filelistName), fileFilter, targetPattern, removeCommonPrefix); } public FileListIterator(String filelistName, Pattern targetPattern) throws FileNotFoundException, IOException { this (new File(filelistName), null, targetPattern, true); } // The PipeInputIterator interface public Instance nextInstance () { File nextFile = (File) subIterator.next(); String path = nextFile.getParent(); String targetName = null; if (targetPattern == STARTING_DIRECTORIES) { targetName = path.substring(commonPrefixIndex); } else if (targetPattern != null) { Matcher m = targetPattern.matcher(path); if (m.find ()){ targetName = m.group (1); } } return new Instance (nextFile, targetName, nextFile.toURI(), null); } public File nextFile () { return (File) subIterator.next(); } public boolean hasNext () { return subIterator.hasNext(); } public ArrayList getFileArray() { return fileArray; } private void fillFileArrayAssignCommonPrefixIndexAndSubIterator(File[] files, boolean removeCommonPrefix) { ArrayList filenames = new ArrayList(); for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()) throw new IllegalArgumentException(files[i] + " is not a file."); else if (! files[i].exists()) throw new IllegalArgumentException(files[i] + " does not exist."); if (this.fileFilter == null || this.fileFilter.accept(files[i])) { this.fileArray.add(files[i]); if (removeCommonPrefix) filenames.add(files[i].getPath()); } } this.subIterator = this.fileArray.iterator(); if (removeCommonPrefix) { // find the common prefix index of all filenames String[] fn = new String[filenames.size()]; for (int i = 0; i < fn.length; i++) fn[i] = (String) filenames.get(i); this.commonPrefixIndex = Strings.commonPrefixIndex(fn); } else this.commonPrefixIndex = 0; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -