📄 filescanner.java
字号:
// Done, join with threads as they finish for (Iterator iter = threads.iterator(); iter.hasNext(); ) { try { Thread th = ((Thread)(iter.next())); th.join(); log(1,"joined thread: "+th.getName()); } catch (InterruptedException ex) {} } long endTime = System.currentTimeMillis(); long elapsedTime = endTime - startTime; double indexRate = numberOfIndexedFiles/(elapsedTime/1000.0); double kbytes = numberOfIndexedBytes/1024.0; double kbytesRate = numberOfIndexedBytes/(elapsedTime/1000.0); System.out.println("Total scanning time: "+elapsedTime/1000.0 +" seconds"); System.out.println("Found "+numberOfFiles+" files"); System.out.println("Rejected "+numberOfRejectedFiles+" files"); System.out.println("Indexed "+numberOfIndexedFiles+" files"); System.out.println(" "+indexRate+" files per second"); System.out.println("Indexed "+kbytes+" Kbytes"); System.out.println(" "+kbytesRate+" kbytes per second"); } // Some utility routines. static public String guessDoctype(String name) { int dot = name.lastIndexOf("."); if (dot == -1) return defaultMime; String ext = name.substring(dot); log(2,"guessDoctype: extension is "+ext); if (!doctypes.containsKey(ext)) return defaultMime; return (String)(doctypes.get(ext)); } static public boolean indexableDocType(String type) { return allowedDoctypes.contains(type); } static synchronized final void log(int level,String msg) { if (level <= verbose) { System.out.println(msg); } } // From Acme.Utils.parseInt(). See http://www.acme.com/java/ // Parse an integer, returning a default value on errors. // Avoids try blocks all over the code. 
/**
 * Parses a decimal integer without throwing.
 *
 * @param s             text to parse (may be null)
 * @param default_value value returned when <code>s</code> is not a valid integer
 * @return the parsed value, or <code>default_value</code> on a parse error
 */
static int parseInt(String s, int default_value) {
    try {
        return Integer.parseInt(s);
    } catch (NumberFormatException e) {
        // Integer.parseInt reports every failure, including null input,
        // as NumberFormatException.
        return default_value;
    }
}

/** Prints the command-line options to standard output. */
static void usage() {
    System.out.println("arguments are:");
    System.out.println(" -c col internal name of search collection");
    System.out.println(" -d dir directory to scan (can be repeated)");
    System.out.println(" -m mime default MIME content-type for files");
    System.out.println(" -p port port number of search server (integer)");
    System.out.println(" -s host hostname of search server");
    System.out.println(" -t n number of worker (indexing) threads");
    System.out.println(" -u user admin username for search collection");
    System.out.println(" -v more verbose message (repeat for even more)");
    System.out.println(" -z pass admin password for search collection");
}

FileScanner() {
}

/**
 * Worker loop: takes file names off the shared <code>files</code> queue,
 * reads each file into memory, maps it to an HTTP URL and submits it to
 * <code>indexer</code>, retrying with exponential backoff on I/O errors.
 * Returns once the queue is empty and <code>done</code> is set.
 */
public void run() {
    long startWait = System.currentTimeMillis();
    // NOTE(review): this is a single check, not a loop, so the worker pauses
    // at most once here and the SCANNER_SLEEP timeout can only fire if that
    // one pause overran it. Confirm whether "while (!finding)" was intended.
    if (!finding) {
        try {
            Thread.sleep(100);
        } catch (InterruptedException ex) {}
        log(1,"waiting for work "+Thread.currentThread().getName());
        if (System.currentTimeMillis() - startWait > SCANNER_SLEEP) {
            return;
        }
    }

    // Default values for insert.
    short flags = 0;
    int quality = 0;
    String desc = null;
    String publisher = null;
    String doctype = null;
    Map extra = null;
    Locale locale = null;
    List text = null;
    List terms = null;

    log(3,"starting indexing: "+Thread.currentThread().getName());

    // Termination condition is subtle.
    // If the files queue is empty, but the boss thread
    // is not done, don't quit yet.
    while (!files.isEmpty() || !done) {
        while (files.isEmpty() && !done) {
            // Wait for more files to be found
            try {
                Thread.sleep(100);
            } catch (InterruptedException ex) {}
        }
        if (files.isEmpty()) continue;

        File file;
        try {
            file = new File((String)(files.removeFirst()));
        } catch (NoSuchElementException ex) {
            // Someone else must have grabbed it. Try again.
            // Handled here, inside the loop, so that losing the race for the
            // last queued name does not end this worker while the boss
            // thread may still be adding files.
            continue;
        }

        String path = file.getPath();
        String name = file.getName();
        log(1,"thread "+Thread.currentThread().getName()+" reading "+path);

        // NOTE(review): the numberOf* counters below are updated from every
        // worker without visible synchronization -- confirm against their
        // declarations.
        doctype = guessDoctype(name);
        if (!indexableDocType(doctype)) {
            log (0,doctype+" is not indexable, "+path);
            numberOfRejectedFiles++;
            continue;
        }

        long size = file.length();
        if (size > 10000000) {
            log (0,"file too big ("+size+" bytes) "+path);
            numberOfRejectedFiles++;
            continue;
        }

        // Read file into memory. A single read() call may return fewer bytes
        // than requested, so keep reading until the buffer is full or the
        // stream ends, and always close the stream.
        byte[] doc = new byte[(int)size];
        try {
            FileInputStream in = new FileInputStream(path);
            try {
                int filled = 0;
                while (filled < doc.length) {
                    int count = in.read(doc, filled, doc.length - filled);
                    if (count < 0) break; // file is shorter than length() reported
                    filled += count;
                }
            } finally {
                in.close();
            }
        } catch (FileNotFoundException ex) {
            continue; // Must have been deleted, go to next filename.
        } catch (IOException ex) {
            log(0,"IO exception "+ex+" reading "+path);
            continue; // Some other problem, go to next filename.
        }

        // Map the filename into a URL that can be accessed by search clients
        // (a "file:" URL is usually inappropriate as it is specific to this host)
        // For this example, we're assuming the files will be served to clients
        // by the host "webhost" via HTTP, with starting path "/FileScanner"
        URL url;
        try {
            String canon = file.getCanonicalPath();
            StringBuffer sb = new StringBuffer(canon.length() * 2);
            int offset = 0;
            int found;

            found = canon.indexOf(File.separator, offset);
            // Encode each directory for URL
            while (found != -1) {
                sb.append( "/" );
                sb.append( URLEncoder.encode(canon.substring(offset,found)) ); // for JDK 1.3 or below
                //sb.append( URLEncoder.encode(canon.substring(offset,found),"UTF-8") ); // for JDK 1.4 or above
                offset = found + 1;
                found = canon.indexOf(File.separator, offset);
            }
            sb.append( "/" );
            sb.append( URLEncoder.encode(canon.substring(offset)) );

            url = new URL( "http", "webhost", "/FileScanner" + sb.toString() );
            log(1,"thread "+Thread.currentThread().getName()+" mapping to URL "+url);
        } catch (MalformedURLException ex) {
            log(0,"Malformed URL for "+path);
            continue; // Move on to next file.
        } catch (IOException ex) {
            log(0,"Unable to find Canonical path for "+path);
            continue; // Move on to next file.
        }

        // Insert file, with retries for network errors.
        // Retries use binary exponential backoff: 2, 4, 8, 16, 32, ... seconds
        // This allows for network congestion to improve, or for the
        // server process to restart and recover.
        final int INITIAL_WAIT = 2;
        final int MAX_WAIT = 256; // Combined wait time is 510s (8.5 minutes)
        boolean retry = true;
        int trys = 0;
        for (long waitSecs=INITIAL_WAIT; retry; waitSecs *= 2) {
            try {
                trys++;
                if (trys>1) log(1,"try #"+trys+" for "+url);

                // Remove previous copy of this document, if it was there
                indexer.deleteMatchingDocuments(url);

                // Add the new one
                indexer.insert(url, (int)size, new Date(file.lastModified()),
                               flags, quality, doctype, doc, path, desc,
                               publisher, extra, locale, text, terms);
                numberOfIndexedFiles++;
                numberOfIndexedBytes += size;
                retry = false;
            } catch (IOException ex) {
                if (waitSecs > MAX_WAIT) {
                    log(0,"Indexing failure in "+Thread.currentThread().getName()+
                        " when handling "+path);
                    retry = false; // skip this document
                    continue;
                }
                log(1,"Indexing problem in "+Thread.currentThread().getName()+
                    " sleeping for "+waitSecs+"s before retry, "+
                    ex.getMessage());
                try {
                    Thread.sleep(waitSecs * 1000);
                } catch (InterruptedException ex2) {}
                retry = true;
            }
        }
    }
}

// The Collections framework offers a synchronized List,
// but that doesn't implement the operations specific to
// LinkedList. We could implement an entire synchronized
// LinkedList here, or we could make the client code
// use the List calls in a queue-like manner. But this
// seems more obvious for the above code, and it is
// only four calls.
/**
 * Minimal FIFO wrapper around a LinkedList. Each of the four operations is
 * a synchronized method, so every call locks this Queue instance while it
 * touches the underlying list.
 */
static final class Queue {
    LinkedList q = new LinkedList();

    /** @return true when no elements are queued */
    public synchronized boolean isEmpty() {
        return this.q.isEmpty();
    }

    /** @return the number of queued elements */
    public synchronized int size() {
        return this.q.size();
    }

    /** Appends an element at the tail of the queue. */
    public synchronized void addLast(Object o) {
        this.q.addLast(o);
    }

    /**
     * Removes and returns the element at the head of the queue; delegates to
     * LinkedList.removeFirst(), which throws NoSuchElementException when the
     * list is empty.
     */
    public synchronized Object removeFirst() {
        return this.q.removeFirst();
    }
}
} // closes the enclosing FileScanner class
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -