// filesystemcollection.java
    /**
     * Indexes the file referenced by the given {@link IndexCommand}: directories are recursed into,
     * archives known to the ArchiveHandler are unpacked and indexed, and regular files are parsed
     * and added to the index.
     *
     * @param ic the command describing the file, collection and index writer to use
     * @throws IndexException if a document cannot be added to the index
     */
    private void indexDocs(final IndexCommand ic) throws IndexException {
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        log.debug("indexDocs: document #" + ic.getWriter().docCount() + ": " + ic);
        if (ic.getFile().isDirectory()) {
            if (!FileUtils.isLink(ic.getFile())) {
                indexDirectory(ic);
            } else {
                log.warn("Skipping symbolic link: " + ic.getFile().getAbsolutePath());
            }
        } else {
            // Handle composed documents first: based on the file extension, ask the manager's
            // ArchiveHandler whether this is an archive that can be unpacked.
            // TODO refactor this together with the straight-file case
            String extension = FileUtils.getExtension(ic.getFile());
            if ((manager.getArchiveHandler() != null) && manager.getArchiveHandler().canUnPack(extension)) {
                indexArchive(ic, extension);
            } else {
                // handle straight (plain) files
                if (ic.getFile().isFile()) {
                    if (!FileUtils.isLink(ic.getFile())) {
                        indexStraightFile(ic);
                    } else {
                        log.warn("Skipping symbolic link: " + ic.getFile().getAbsolutePath());
                    }
                } else {
                    log.debug("not a normal file: " + ic.getFile().getName());
                }
            }
        }
    }
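    // indexDocs() is the recursive dispatcher: directories go to indexDirectory(), extensions the
    // ArchiveHandler can unpack go to indexArchive(), and everything else goes to indexStraightFile().
    // The driver that seeds the recursion is not part of this listing; the snippet below is only a
    // sketch of a typical invocation, and the no-arg constructor and setters on the root command are
    // assumptions, not code from this class:
    //
    //   IndexCommand rootCommand = new IndexCommand();        // hypothetical setup
    //   rootCommand.setCollection(collection);                // assumed setter
    //   rootCommand.setWriter(indexWriter);                   // assumed setter
    //   rootCommand.setFile(collection.getContentDir());      // assumed accessor
    //   rootCommand.setStart(true);
    //   indexDocs(rootCommand);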
    /**
     * Indexes a single regular file: computes its MD5 hash, skips duplicates already present in the
     * collection's document cache, and otherwise parses the file and adds the resulting Document to
     * the index.
     *
     * @param ic the command describing the file to index
     * @throws IndexException if the document cannot be added to the index
     */
    private void indexStraightFile(final IndexCommand ic) throws IndexException {
        log.debug(ic.getFile() + " is a straight file");
        if (!ic.isInZip()) {
            ic.setRealName(ic.getFile().getName());
        } else {
            ic.setZipName(ic.getFile().getName());
        }
        // do we support this kind of file?
        if (manager.getFactory().canExtract(ic.getFile()) || manager.getFactory().isDefaultFileinfo()) {
            // get the hash for this file
            String hash = FileUtils.getMD5Hash(ic.getFile());
            // if we can't get a hash, just set it to a non-null value, so that indexing continues
            if (hash == null) {
                hash = "unknown";
            }
            // check whether this file has been added already
            if (!ic.getCollection().getMd5DocumentCache().contains(hash)) {
                // new document, handle it
                ic.setHash(hash);
                Document doc = parse(ic);
                if (doc != null) {
                    if (log.isDebugEnabled()) {
                        log.debug("Indexcommand: " + ic);
                    }
                    // add the document to the index (writer)
                    try {
                        ic.getWriter().addDocument(doc);
                        // add the hash to the cache if it is not "unknown" or empty
                        if (!"unknown".equals(hash) && (hash.length() > 0)) {
                            boolean result = ic.getCollection().getMd5DocumentCache().add(hash);
                            if (result) {
                                log.debug("Hash added for document: " + ic.getFile());
                            } else {
                                log.warn("No Hash added for document: " + ic.getFile());
                            }
                        }
                        log.info("document #" + ic.getWriter().docCount() + ": " + ic.getFile().getName() + " added to index");
                    } catch (IOException e) {
                        throw new IndexException("Error adding document '" + ic.getFile().getName() + "' to Index", e);
                    }
                }
            } else {
                log.info("skipping duplicate document: " + ic.getFile().getName());
                // if this duplicate lives in the cache directory, it may be removed
                if (FileUtils.isIn(ic.getFile(), ic.getCollection().getCacheDirWithManagerDefaults())) {
                    if (ic.getFile().delete()) {
                        log.debug("Removed: " + ic.getFile() + " from cache.");
                    }
                }
            }
        } else {
            log.debug("skipping unsupported document: " + ic.getFile().getName());
        }
    }
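    // FileUtils.getMD5Hash(File) is used above purely for duplicate detection via the collection's
    // MD5 document cache. Its implementation is not part of this listing; as an assumed sketch (not
    // the actual FileUtils code), it boils down to digesting the file's bytes with the JDK's
    // MessageDigest and hex-encoding the result, returning null on an I/O error so that the
    // null check above kicks in:
    //
    //   java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5");
    //   java.io.InputStream in = new java.io.FileInputStream(file);
    //   byte[] buf = new byte[8192];
    //   for (int n = in.read(buf); n != -1; n = in.read(buf)) {
    //       md.update(buf, 0, n);
    //   }
    //   in.close();
    //   byte[] digest = md.digest();
    //   StringBuffer hex = new StringBuffer();
    //   for (int i = 0; i < digest.length; i++) {
    //       hex.append(Integer.toHexString((digest[i] & 0xff) | 0x100).substring(1));
    //   }
    //   String hash = hex.toString();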
    /**
     * Indexes an archive (e.g. a zip): adds a minimal Document with just the archive's name and hash
     * to the index, unpacks the archive to a temporary directory, and recursively indexes its contents.
     *
     * @param ic the command describing the archive to index
     * @param extension the archive's file extension, used to select the unpacking mechanism
     * @throws IndexException if a document cannot be added to the index
     */
    private void indexArchive(final IndexCommand ic, String extension) throws IndexException {
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        // we have an archive
        log.debug(ic.getFile() + " is an archive");
        // Add the archive itself, with just its name and hash, to the collection as well,
        // so that it can be cached for incremental indexing.
        String hash = FileUtils.getMD5Hash(ic.getFile());
        // if we can't get a hash, just set it to a non-null value, so that indexing continues
        if (hash == null) {
            hash = "unknown";
        }
        // check whether this archive has been added already
        if (!ic.getCollection().getMd5DocumentCache().contains(hash)) {
            try {
                // add the document with just its name and hash
                Document doc = new Document();
                doc.add(new Field("hash", hash, Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("name", ic.getRealName(), Field.Store.YES, Field.Index.TOKENIZED));
                ic.getWriter().addDocument(doc);
                log.debug("Archive " + ic.getFile() + " added to collection");
                File dir = null;
                if (!StringUtils.hasText(manager.getArchiveHandler().getUnArchiveCommand(extension))) {
                    // no external unarchive command configured: this is a zip, handle it with Java's zip capabilities
                    log.debug(ic.getFile() + " is a zip file");
                    dir = CollectionManagerImpl.unZip(ic.getFile(), ic.getCollection());
                } else {
                    log.debug(ic.getFile() + " is an external archive file");
                    dir = manager.unPack(ic.getFile(), ic.getCollection());
                }
                IndexCommand localIc = new IndexCommand(ic);
                if (ic.isInZip()) {
                    // ic.setZipPath(ic.getZipPath() + ic.getFile().getName() + "::/");
                    localIc.setZipPath(ic.getZipPath() + dir.getName() + "/");
                    localIc.setStart(true);
                } else {
                    localIc.setRealName(ic.getFile().getName());
                    localIc.setInZip(true);
                    localIc.setStart(false);
                }
                localIc.setFile(dir);
                indexDocs(localIc);
                // remove the unpack directory, since it is temporary
                if (!ic.getCollection().isKeepCacheWithManagerDefaults()) {
                    FileUtils.removeDir(dir);
                }
                // add the hash to the cache if it is not "unknown" or empty
                if (!"unknown".equals(hash) && (hash.length() > 0)) {
                    boolean result = ic.getCollection().getMd5DocumentCache().add(hash);
                    if (result) {
                        log.debug("Hash added for document: " + ic.getFile());
                    } else {
                        log.warn("No Hash added for document: " + ic.getFile());
                    }
                }
            } catch (IOException e) {
                throw new IndexException("Error adding document '" + ic.getFile().getName() + "' to Index", e);
            }
        } else {
            log.info("skipping duplicate archive: " + ic.getFile().getName());
        }
    }
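    // CollectionManagerImpl.unZip() and manager.unPack() are defined elsewhere; both are expected to
    // extract the archive into a temporary directory (typically under the collection's cache dir) and
    // return that directory, which is then fed back into indexDocs(). As an assumed sketch, not the
    // actual implementation, plain zips can be handled with java.util.zip along these lines:
    //
    //   java.util.zip.ZipFile zip = new java.util.zip.ZipFile(archive);
    //   for (java.util.Enumeration e = zip.entries(); e.hasMoreElements();) {
    //       java.util.zip.ZipEntry entry = (java.util.zip.ZipEntry) e.nextElement();
    //       File out = new File(targetDir, entry.getName());
    //       if (entry.isDirectory()) {
    //           out.mkdirs();
    //       } else {
    //           out.getParentFile().mkdirs();
    //           // copy zip.getInputStream(entry) to a FileOutputStream on 'out'
    //       }
    //   }
    //   zip.close();
    //
    // Here 'archive' and 'targetDir' are placeholders for ic.getFile() and the cache directory.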
    /**
     * Indexes a directory by recursing into it: each entry is indexed with a fresh copy of the
     * IndexCommand, and the real (or zip) path is extended with the directory name while descending.
     *
     * @param ic the command describing the directory to index
     * @throws IndexException if a document cannot be added to the index
     */
    private void indexDirectory(final IndexCommand ic) throws IndexException {
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        log.debug(ic.getFile() + " is a directory");
        // recurse
        String[] files = ic.getFile().list();
        // list() has been seen to return null, so be careful; possibly the name or path is too long for the OS
        if (files == null) {
            log.warn("Something funny with '" + ic.getFile() + "'. Name or path too long?");
            log.warn("Could not access '" + ic.getFile() + "' for indexing. Skipping this directory.");
        } else {
            log.debug(ic.getFile() + " is a directory with " + files.length + " docs");
            if (!ic.isInZip()) {
                // is this the first (top-level) directory?
                if (!ic.isStart()) {
                    ic.setRealPath(ic.getRealPath() + ic.getFile().getName() + "/");
                } else {
                    ic.setStart(false);
                }
            } else {
                if (!ic.isStart()) {
                    ic.setZipPath(ic.getZipPath() + ic.getFile().getName() + "/");
                } else {
                    ic.setStart(false);
                }
            }
            // Index the entries using a new IndexCommand that is a copy of the current one, except for
            // the new File. Do not reuse the current command, since its state would be overwritten
            // when backtracking from the recursion.
            for (int i = 0; i < files.length; i++) {
                IndexCommand localIc = new IndexCommand(ic);
                localIc.setFile(new File(ic.getFile(), files[i]));
                indexDocs(localIc);
            }
        }
    }
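    // Path bookkeeping example (illustrative values, not from the original source): for a collection
    // rooted at /data/docs containing books/java/lucene.pdf, the recursion proceeds roughly as:
    //
    //   indexDocs(/data/docs)             -> start=true, own name not appended, start flag cleared
    //   indexDocs(/data/docs/books)       -> realPath "books/"
    //   indexDocs(/data/docs/books/java)  -> realPath "books/java/"
    //   indexDocs(.../lucene.pdf)         -> realName "lucene.pdf", indexed via indexStraightFile()
    //
    // Each child gets its own IndexCommand copy, so a sibling's path changes never leak back into the
    // parent's command when the recursion unwinds.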
    /**
     * Makes a Document for a file by parsing its contents and the metadata provided by the {@link IndexCommand}.
     *
     * @param ic IndexCommand containing all parameters for parsing
     *
     * @return a Document with the parsed content, or null for an unknown format or empty content
     */
    private Document parse(final IndexCommand ic) {
        log.debug("Parsing " + ic.getFile().getName());
        // extract the relevant info from the file by first obtaining the appropriate Extractor
        Extractor ext = manager.getFactory().createExtractor(ic.getFile());
        if (ext == null) {
            log.debug("Skipping " + ic.getFile().getName());
            return null;
        }
        ParsedFileInfo fileInfo = ext.extractInfo(ic.getFile());
        if (fileInfo != null) {
            // make a new, empty document
            if (log.isDebugEnabled()) {
                log.debug("Creating new Document with ParsedFileInfo: " + fileInfo);
            }
            Document doc = new Document();
            // add all collection info
            doc.add(new Field("name", ic.getRealName(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("path", ic.getRealPath(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("zipPath", ic.getZipPath(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("zipName", ic.getZipName(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("collection", ic.getCollection().getName(), Field.Store.YES, Field.Index.TOKENIZED));
            // add all file info
            if (fileInfo.getReader() != null) {
                doc.add(new Field("contents", fileInfo.getReader()));
            }
            doc.add(new Field("summary", fileInfo.getSummary(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("title", fileInfo.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("type", fileInfo.getType(), Field.Store.YES, Field.Index.TOKENIZED));
            if (fileInfo.getISBN() != null) {
                doc.add(new Field("isbn", fileInfo.getISBN(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
            // store the modification date as yyyyMMdd
            DateFormat df = new SimpleDateFormat("yyyyMMdd");
            String dfString = df.format(new Date(fileInfo.getModificationDate()));
            doc.add(new Field("modified", dfString, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("size", Long.toString(fileInfo.getSize()), Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("hash", ic.getHash(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            if (log.isDebugEnabled()) {
                log.debug("Parsed " + doc);
            }
            return doc;
        } else {
            log.warn("Extractor does not return any ParsedFileInfo for: " + ic.getFile().getName());
        }
        return null;
    }
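    // The fields written by parse() ("contents", "name", "collection", "modified", ...) are what the
    // search side queries against. The search code is not part of this listing; the sketch below is an
    // assumption using the Lucene API of the same vintage as the TOKENIZED/UN_TOKENIZED constants above
    // (index path and analyzer choice are placeholders):
    //
    //   IndexSearcher searcher = new IndexSearcher("/path/to/index");
    //   QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
    //   Query query = parser.parse("lucene AND collection:books");
    //   Hits hits = searcher.search(query);
    //   for (int i = 0; i < hits.length(); i++) {
    //       Document hit = hits.doc(i);
    //       System.out.println(hit.get("name") + " (" + hit.get("modified") + ")");
    //   }
    //   searcher.close();
    //
    // Untokenized fields such as "modified" (stored as yyyyMMdd) lend themselves to range queries,
    // e.g. modified:[20070101 TO 20071231].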
}