// filesystemcollection.java
    /**
     * Indexes the file referenced by the given {@link IndexCommand}: directories are recursed into,
     * archives known to the ArchiveHandler are unpacked and indexed, and regular files are parsed
     * and added to the index.
     *
     * @param ic the command describing the file, collection and index writer to use
     * @throws IndexException if a document cannot be added to the index
     */
    private void indexDocs(final IndexCommand ic) throws IndexException {
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        log.debug("indexDocs: document #" + ic.getWriter().docCount() + ": " + ic);
        if (ic.getFile().isDirectory()) {
            if (!FileUtils.isLink(ic.getFile())) {
                indexDirectory(ic);
            } else {
                log.warn("Skipping symbolic link: " + ic.getFile().getAbsolutePath());
            }
        } else {
            // Handle composed documents first: based on the file extension, ask the manager's
            // ArchiveHandler whether this is an archive that can be unpacked.
            // TODO refactor this together with the straight-file case
            String extension = FileUtils.getExtension(ic.getFile());
            if ((manager.getArchiveHandler() != null) && manager.getArchiveHandler().canUnPack(extension)) {
                indexArchive(ic, extension);
            } else {
                // handle straight (plain) files
                if (ic.getFile().isFile()) {
                    if (!FileUtils.isLink(ic.getFile())) {
                        indexStraightFile(ic);
                    } else {
                        log.warn("Skipping symbolic link: " + ic.getFile().getAbsolutePath());
                    }
                } else {
                    log.debug("not a normal file: " + ic.getFile().getName());
                }
            }
        }
    }
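    // indexDocs() is the recursive dispatcher: directories go to indexDirectory(), extensions the
    // ArchiveHandler can unpack go to indexArchive(), and everything else goes to indexStraightFile().
    // The driver that seeds the recursion is not part of this listing; the snippet below is only a
    // sketch of a typical invocation, and the no-arg constructor and setters on the root command are
    // assumptions, not code from this class:
    //
    //   IndexCommand rootCommand = new IndexCommand();        // hypothetical setup
    //   rootCommand.setCollection(collection);                // assumed setter
    //   rootCommand.setWriter(indexWriter);                   // assumed setter
    //   rootCommand.setFile(collection.getContentDir());      // assumed accessor
    //   rootCommand.setStart(true);
    //   indexDocs(rootCommand);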
    /**
     * Indexes a single regular file: computes its MD5 hash, skips duplicates already present in the
     * collection's document cache, and otherwise parses the file and adds the resulting Document to
     * the index.
     *
     * @param ic the command describing the file to index
     * @throws IndexException if the document cannot be added to the index
     */
    private void indexStraightFile(final IndexCommand ic) throws IndexException {
        log.debug(ic.getFile() + " is a straight file");
        if (!ic.isInZip()) {
            ic.setRealName(ic.getFile().getName());
        } else {
            ic.setZipName(ic.getFile().getName());
        }
        // do we support this kind of file?
        if (manager.getFactory().canExtract(ic.getFile()) || manager.getFactory().isDefaultFileinfo()) {
            // get the hash for this file
            String hash = FileUtils.getMD5Hash(ic.getFile());
            // if we can't get a hash, just set it to a non-null value, so that indexing continues
            if (hash == null) {
                hash = "unknown";
            }
            // check whether this file has been added already
            if (!ic.getCollection().getMd5DocumentCache().contains(hash)) {
                // new document, handle it
                ic.setHash(hash);
                Document doc = parse(ic);
                if (doc != null) {
                    if (log.isDebugEnabled()) {
                        log.debug("Indexcommand: " + ic);
                    }
                    // add the document to the index (writer)
                    try {
                        ic.getWriter().addDocument(doc);
                        // add the hash to the cache if it is not "unknown" or empty
                        if (!"unknown".equals(hash) && (hash.length() > 0)) {
                            boolean result = ic.getCollection().getMd5DocumentCache().add(hash);
                            if (result) {
                                log.debug("Hash added for document: " + ic.getFile());
                            } else {
                                log.warn("No Hash added for document: " + ic.getFile());
                            }
                        }
                        log.info("document #" + ic.getWriter().docCount() + ": " + ic.getFile().getName() + " added to index");
                    } catch (IOException e) {
                        throw new IndexException("Error adding document '" + ic.getFile().getName() + "' to Index", e);
                    }
                }
            } else {
                log.info("skipping duplicate document: " + ic.getFile().getName());
                // if this duplicate lives in the cache directory, it may be removed
                if (FileUtils.isIn(ic.getFile(), ic.getCollection().getCacheDirWithManagerDefaults())) {
                    if (ic.getFile().delete()) {
                        log.debug("Removed: " + ic.getFile() + " from cache.");
                    }
                }
            }
        } else {
            log.debug("skipping unsupported document: " + ic.getFile().getName());
        }
    }
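    // FileUtils.getMD5Hash(File) is used above purely for duplicate detection via the collection's
    // MD5 document cache. Its implementation is not part of this listing; as an assumed sketch (not
    // the actual FileUtils code), it boils down to digesting the file's bytes with the JDK's
    // MessageDigest and hex-encoding the result, returning null on an I/O error so that the
    // null check above kicks in:
    //
    //   java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5");
    //   java.io.InputStream in = new java.io.FileInputStream(file);
    //   byte[] buf = new byte[8192];
    //   for (int n = in.read(buf); n != -1; n = in.read(buf)) {
    //       md.update(buf, 0, n);
    //   }
    //   in.close();
    //   byte[] digest = md.digest();
    //   StringBuffer hex = new StringBuffer();
    //   for (int i = 0; i < digest.length; i++) {
    //       hex.append(Integer.toHexString((digest[i] & 0xff) | 0x100).substring(1));
    //   }
    //   String hash = hex.toString();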
    /**
     * Indexes an archive (e.g. a zip): adds a minimal Document with just the archive's name and hash
     * to the index, unpacks the archive to a temporary directory, and recursively indexes its contents.
     *
     * @param ic the command describing the archive to index
     * @param extension the archive's file extension, used to select the unpacking mechanism
     * @throws IndexException if a document cannot be added to the index
     */
    private void indexArchive(final IndexCommand ic, String extension) throws IndexException {
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        // we have an archive
        log.debug(ic.getFile() + " is an archive");
        // Add the archive itself, with just its name and hash, to the collection as well,
        // so that it can be cached for incremental indexing.
        String hash = FileUtils.getMD5Hash(ic.getFile());
        // if we can't get a hash, just set it to a non-null value, so that indexing continues
        if (hash == null) {
            hash = "unknown";
        }
        // check whether this archive has been added already
        if (!ic.getCollection().getMd5DocumentCache().contains(hash)) {
            try {
                // add the document with just its name and hash
                Document doc = new Document();
                doc.add(new Field("hash", hash, Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("name", ic.getRealName(), Field.Store.YES, Field.Index.TOKENIZED));
                ic.getWriter().addDocument(doc);
                log.debug("Archive " + ic.getFile() + " added to collection");
                File dir = null;
                if (!StringUtils.hasText(manager.getArchiveHandler().getUnArchiveCommand(extension))) {
                    // no external unarchive command configured: this is a zip, handle it with Java's zip capabilities
                    log.debug(ic.getFile() + " is a zip file");
                    dir = CollectionManagerImpl.unZip(ic.getFile(), ic.getCollection());
                } else {
                    log.debug(ic.getFile() + " is an external archive file");
                    dir = manager.unPack(ic.getFile(), ic.getCollection());
                }
                IndexCommand localIc = new IndexCommand(ic);
                if (ic.isInZip()) {
                    // ic.setZipPath(ic.getZipPath() + ic.getFile().getName() + "::/");
                    localIc.setZipPath(ic.getZipPath() + dir.getName() + "/");
                    localIc.setStart(true);
                } else {
                    localIc.setRealName(ic.getFile().getName());
                    localIc.setInZip(true);
                    localIc.setStart(false);
                }
                localIc.setFile(dir);
                indexDocs(localIc);
                // remove the unpack directory, since it is temporary
                if (!ic.getCollection().isKeepCacheWithManagerDefaults()) {
                    FileUtils.removeDir(dir);
                }
                // add the hash to the cache if it is not "unknown" or empty
                if (!"unknown".equals(hash) && (hash.length() > 0)) {
                    boolean result = ic.getCollection().getMd5DocumentCache().add(hash);
                    if (result) {
                        log.debug("Hash added for document: " + ic.getFile());
                    } else {
                        log.warn("No Hash added for document: " + ic.getFile());
                    }
                }
            } catch (IOException e) {
                throw new IndexException("Error adding document '" + ic.getFile().getName() + "' to Index", e);
            }
        } else {
            log.info("skipping duplicate archive: " + ic.getFile().getName());
        }
    }
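    // CollectionManagerImpl.unZip() and manager.unPack() are defined elsewhere; both are expected to
    // extract the archive into a temporary directory (typically under the collection's cache dir) and
    // return that directory, which is then fed back into indexDocs(). As an assumed sketch, not the
    // actual implementation, plain zips can be handled with java.util.zip along these lines:
    //
    //   java.util.zip.ZipFile zip = new java.util.zip.ZipFile(archive);
    //   for (java.util.Enumeration e = zip.entries(); e.hasMoreElements();) {
    //       java.util.zip.ZipEntry entry = (java.util.zip.ZipEntry) e.nextElement();
    //       File out = new File(targetDir, entry.getName());
    //       if (entry.isDirectory()) {
    //           out.mkdirs();
    //       } else {
    //           out.getParentFile().mkdirs();
    //           // copy zip.getInputStream(entry) to a FileOutputStream on 'out'
    //       }
    //   }
    //   zip.close();
    //
    // Here 'archive' and 'targetDir' are placeholders for ic.getFile() and the cache directory.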
    /**
     * Indexes a directory by recursing into it: each entry is indexed with a fresh copy of the
     * IndexCommand, and the real (or zip) path is extended with the directory name while descending.
     *
     * @param ic the command describing the directory to index
     * @throws IndexException if a document cannot be added to the index
     */
    private void indexDirectory(final IndexCommand ic) throws IndexException {
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        log.debug(ic.getFile() + " is a directory");
        // recurse
        String[] files = ic.getFile().list();
        // list() has been seen to return null, so be careful; possibly the name or path is too long for the OS
        if (files == null) {
            log.warn("Something funny with '" + ic.getFile() + "'. Name or path too long?");
            log.warn("Could not access '" + ic.getFile() + "' for indexing. Skipping this directory.");
        } else {
            log.debug(ic.getFile() + " is a directory with " + files.length + " docs");
            if (!ic.isInZip()) {
                // is this the first (top-level) directory?
                if (!ic.isStart()) {
                    ic.setRealPath(ic.getRealPath() + ic.getFile().getName() + "/");
                } else {
                    ic.setStart(false);
                }
            } else {
                if (!ic.isStart()) {
                    ic.setZipPath(ic.getZipPath() + ic.getFile().getName() + "/");
                } else {
                    ic.setStart(false);
                }
            }
            // Index the entries using a new IndexCommand that is a copy of the current one, except for
            // the new File. Do not reuse the current command, since its state would be overwritten
            // when backtracking from the recursion.
            for (int i = 0; i < files.length; i++) {
                IndexCommand localIc = new IndexCommand(ic);
                localIc.setFile(new File(ic.getFile(), files[i]));
                indexDocs(localIc);
            }
        }
    }
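    // Path bookkeeping example (illustrative values, not from the original source): for a collection
    // rooted at /data/docs containing books/java/lucene.pdf, the recursion proceeds roughly as:
    //
    //   indexDocs(/data/docs)             -> start=true, own name not appended, start flag cleared
    //   indexDocs(/data/docs/books)       -> realPath "books/"
    //   indexDocs(/data/docs/books/java)  -> realPath "books/java/"
    //   indexDocs(.../lucene.pdf)         -> realName "lucene.pdf", indexed via indexStraightFile()
    //
    // Each child gets its own IndexCommand copy, so a sibling's path changes never leak back into the
    // parent's command when the recursion unwinds.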
    /**
     * Makes a Document for a file by parsing its contents and the metadata provided by the {@link IndexCommand}.
     *
     * @param ic IndexCommand containing all parameters for parsing
     *
     * @return a Document with the parsed content, or null for an unknown format or empty content
     */
    private Document parse(final IndexCommand ic) {
        log.debug("Parsing " + ic.getFile().getName());
        // extract the relevant info from the file by first obtaining the appropriate Extractor
        Extractor ext = manager.getFactory().createExtractor(ic.getFile());
        if (ext == null) {
            log.debug("Skipping " + ic.getFile().getName());
            return null;
        }
        ParsedFileInfo fileInfo = ext.extractInfo(ic.getFile());
        if (fileInfo != null) {
            // make a new, empty document
            if (log.isDebugEnabled()) {
                log.debug("Creating new Document with ParsedFileInfo: " + fileInfo);
            }
            Document doc = new Document();
            // add all collection info
            doc.add(new Field("name", ic.getRealName(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("path", ic.getRealPath(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("zipPath", ic.getZipPath(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("zipName", ic.getZipName(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("collection", ic.getCollection().getName(), Field.Store.YES, Field.Index.TOKENIZED));
            // add all file info
            if (fileInfo.getReader() != null) {
                doc.add(new Field("contents", fileInfo.getReader()));
            }
            doc.add(new Field("summary", fileInfo.getSummary(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("title", fileInfo.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("type", fileInfo.getType(), Field.Store.YES, Field.Index.TOKENIZED));
            if (fileInfo.getISBN() != null) {
                doc.add(new Field("isbn", fileInfo.getISBN(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
            // store the modification date as yyyyMMdd
            DateFormat df = new SimpleDateFormat("yyyyMMdd");
            String dfString = df.format(new Date(fileInfo.getModificationDate()));
            doc.add(new Field("modified", dfString, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("size", Long.toString(fileInfo.getSize()), Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("hash", ic.getHash(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            if (log.isDebugEnabled()) {
                log.debug("Parsed " + doc);
            }
            return doc;
        } else {
            log.warn("Extractor does not return any ParsedFileInfo for: " + ic.getFile().getName());
        }
        return null;
    }
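    // The fields written by parse() ("contents", "name", "collection", "modified", ...) are what the
    // search side queries against. The search code is not part of this listing; the sketch below is an
    // assumption using the Lucene API of the same vintage as the TOKENIZED/UN_TOKENIZED constants above
    // (index path and analyzer choice are placeholders):
    //
    //   IndexSearcher searcher = new IndexSearcher("/path/to/index");
    //   QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
    //   Query query = parser.parse("lucene AND collection:books");
    //   Hits hits = searcher.search(query);
    //   for (int i = 0; i < hits.length(); i++) {
    //       Document hit = hits.doc(i);
    //       System.out.println(hit.get("name") + " (" + hit.get("modified") + ")");
    //   }
    //   searcher.close();
    //
    // Untokenized fields such as "modified" (stored as yyyyMMdd) lend themselves to range queries,
    // e.g. modified:[20070101 TO 20071231].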
}