📄 adddocument.java
字号:
/* -*- mode:java; indent-tabs-mode:nil; c-basic-offset:2 -*- * * $RCSFile$ $Revision: 1.26 $ $Date: 2006/01/25 16:56:17 $ * * Copyright (c) 2000-2004 Autonomy Corp. All Rights Reserved. * Permission to use, copy, modify, and distribute this file is hereby * granted without fee, provided that the above copyright notice appear * in all copies. */import java.io.*;import java.net.*;import java.util.*;import com.ultraseek.xpa.search.*;import com.ultraseek.xpa.server.*;/** * Add documents to an Ultraseek Indexer collection. * <p> * This sample application demonstrates how to: * <ul> * <li>parse and insert a document. * <li>add additional text to the search index for a document. * <li>add additional terms to the search index for a document. * <li>add non-indexed text, which is returned with SearchResults, to the document metadata. * </ul> * The steps to add a document to the Ultraseek search index are: * <blockquote> * <ol> * <li>Fetch the document. * <li>Parse the text from the document. * <li>Customize the information to add to the search index. * <li>Delete any previous version of the document from the search index. * <li>Insert the document's information to the search index. * </ol> * </blockquote> * This sample application does the following: * <blockquote> * <table border=1> * <tr><td>Fetch the Document</td> * <td>The document is fetched using a <code>URLConnection</code>. * It attempts to guess the real document type when the source server * returns <code>"application/octet-stream"</code>. 
* </td></tr> * <tr><td>Parse the text from the document</td> * <td>The document is sent to the Ultraseek server using <code>IndexerAdmin.parse()</code>, * and Ultraseek returns the indexable information it found in the document.</td></tr> * <tr><td>Customize Index information</td> * <td>This sample adds the following information: * <blockquote> * <table border=1><tr><td>Indexable text</td><td>Here is a recipe for Nutria Gumbo.</td></tr> * <tr><td>Indexable terms</td><td>cuisine:cajun</td></tr> * <tr><td>Non indexed Metadata</td><td>ingredients = "Nutria, Okra, Rice, Tomatoes"</td></tr> * </table> * </blockquote> * </td></tr> * <tr><td>Deletes the prior version of the document</td> * <td>Using <code>CollectionAdmin.deleteMatchingDocuments()</code></td></tr> * <tr><td>Insert the new document's information</td><td>Using <code>CollectionAdmin.insert()</code></td></tr> * </table> * </blockquote> * This sample application then displays the <code>SearchResult</code> for the inserted * document, and (optionally) removes the document from the index. * <p> * Afterwards, this sample application prompts for the next URL. * An empty String will end the demonstration. * * @see IndexerAdmin#parse * @see IndexerDocument * @see IndexerAdmin#deleteMatchingDocuments * @see IndexerAdmin#insert * @since XPA1.0 */public class AddDocument { public static void main(String[] args) throws IOException { new AddDocument().runDemo(args); } AddDocument() { } /** * Run the entire demo of how to Add a Document to an Ultraseek collection. 
*/ public void runDemo(String[] args) throws IOException { try { InitSampleApplication(); } catch (IOException e) { System.out.println( "Sample application encountered: " + e ); return; } while (true) { try { if (!GetDocumentURL()) break; FetchTheDocument(); ParseTheDocument(); CustomizeDocumentInformation(); DeletePriorDocumentInformation(); InsertDocumentInformation(); /* See the result of the sample application's actions */ ShowDocumentSearchResult(); MaybeCleanupInsertedDocument(); } catch (IOException e) { System.out.println( "Sample application encountered: " + e ); } } System.out.println( "Done." ); } IndexerAdmin indexerAdmin = null; UltraseekCollection collection = null; URL url = null; InputStreamReader inputStreamReader = new InputStreamReader(System.in); BufferedReader bufferedReader = new BufferedReader(inputStreamReader); /** * Initialize the Sample application. * Asks for Ultraseek server location, admin username, and collection to modify. */ public void InitSampleApplication() throws IOException { System.out.println(); System.out.println("This demo application adds documents"); System.out.println("to an Ultraseek \"direct indexing\" collection."); System.out.println(); UltraseekServer server = null; while (true) { try { server = null; System.out.println("Specify the URL (with port) of your Ultraseek instance."); System.out.print("URL: "); URL ultra = null; String input = bufferedReader.readLine(); try { ultra = new URL(input); } catch (MalformedURLException e) { if (input != null && !input.startsWith("http")) { input = "http://" + input; ultra = new URL(input); } else { throw e; } } server = new UltraseekServer(ultra); System.out.print("Connecting... 
"); String version = server.getVersionString(); System.out.println("\nUltraseek Server at " + ultra + " is version " + version ); break; } catch (MalformedURLException e) { System.out.println("Unable to parse URL: " + e ); } catch (IOException e) { if (server == null) throw e; System.out.println("Connect failure: " + e); } } ServerAdmin serverAdmin = null; while (true) { String username = null; String password = null; try { username = null; password = null; System.out.print("Admin username: "); username = bufferedReader.readLine(); System.out.print("Admin password: "); password = bufferedReader.readLine(); serverAdmin = server.admin(username,password); serverAdmin.login(); break; } catch (IOException e) { if ((username == null) || (password == null)) throw e; System.out.println("Login failure: " + e + "\n"); } } while (true) { System.out.println("Enter the internal name of the collection to insert into."); System.out.print("collection: "); String id = bufferedReader.readLine(); collection = (UltraseekCollection)server.getSearchCollection(id); if (collection==null) { System.out.println("Cannot find a collection named: " + id + "\n"); continue; } CollectionAdmin collectionAdmin = collection.admin(serverAdmin); /* REMIND: this test should be simplier ... */ if (!(collectionAdmin instanceof IndexerAdmin) || (collectionAdmin instanceof SpiderAdmin) || (collectionAdmin instanceof ScannerAdmin) || (collectionAdmin instanceof ExchangeAdmin) || (collectionAdmin instanceof NetnewsAdmin) || (collectionAdmin instanceof DatabaseAdmin)) { String collectionType = collection.getClass().getName(); collectionType = collectionType.substring(collectionType.lastIndexOf(".")+1); System.out.println(collection.getID() + " is a " + collectionType ); System.out.println("Documents cannot be inserted into that kind of collection.\n"); continue; } indexerAdmin = (IndexerAdmin)collectionAdmin; break; } } /** Get a URL for the document to insert. 
* @return true We have a new URL * @return false Input is ended, time to quit. */ public boolean GetDocumentURL() { while (true) { try { System.out.println(); System.out.println("Enter the URL of the document you want added."); System.out.print("URL: "); String input = bufferedReader.readLine(); if ("".equals(input)) return false; url = new URL(input); return true; } catch (MalformedURLException e) { System.out.println("Exception: " + e ); } catch (IOException e) { return false; } } } String contentType = null; Date date = null; int size = 0; byte[] content = null; /** * Fetch the content of the document to insert into the search collection. * <p> * Your application could fetch the document content using HTTP, FTP, * from a Database, etc. Your application should also handle * errors, retries, filtering of documents, etc. * <p> * This sample uses a simple <code>URLConnection</code> to * fetch the document. The document information is stored * in <code>content, contentType, date,</code> and <code>size</code>. * If the document source server returns a non-indexable * <code>Content-Type</code>, this sample tries to guess * a content type based on the filename part of the URL. */ public void FetchTheDocument() throws IOException { MyURLConnection conn = null; try { System.out.print( "\nFetching document: ... " ); long start = System.currentTimeMillis(); conn = new MyURLConnection(url); content = conn.getDocumentContent(); System.out.println( "" + (System.currentTimeMillis() - start) + " ms."); contentType = conn.getDocumentContentType(); date = conn.getDocumentDate(); size = content.length; System.out.println( "Document Content-Type: " + contentType ); System.out.println( "Document Date: " + date ); System.out.println( "Document size in bytes " + size ); } catch (IOException e) { System.out.println( "Unable to fetch document: " + e ); throw e; } } IndexerDocument parsedDoc = null; /** * Parse the document to discover indexable content. 
* <p> * Unless the document is <code>text/plain</code>, it must be parsed * to determine the appropriate text content to index. Your application * can parse the document itself, but it is recommended you use the * Ultraseek server's document parser - and then modify the results from * that parse if you need to customize information about the document. * <p> * This sample uses the Ultraseek server's document parser, the result is * stored in <code>parsedDoc</code>. */ public void ParseTheDocument() throws IOException { /* Second, send the document to the Ultraseek Server for parsing */ try { short flags = 0; System.out.print( "\nParsing document: ... " ); long start = System.currentTimeMillis(); parsedDoc = indexerAdmin.parse(url, size, date, flags, contentType, content); System.out.println( "" + (System.currentTimeMillis() - start) + " ms."); } catch (IOException e) { System.out.println( "Unable to parse document: " + e ); throw e; } System.out.println( "Parsed Title: " + parsedDoc.getTitle() ); System.out.println( "Parsed URL: " + parsedDoc.getURL() ); System.out.println( "Parsed Date: " + parsedDoc.getDate() ); System.out.println( "Parsed Flags: " + parsedDoc.getFlags() ); System.out.println( "Parsed Locale: " + parsedDoc.getLocale() ); System.out.println( "Parsed Publisher: " + parsedDoc.getPublisher() ); System.out.println( "Parsed Size: " + parsedDoc.getSize() ); System.out.println( "Parsed Date: " + parsedDoc.getDate() ); System.out.println( "Number of Terms: " + parsedDoc.getTerms().size() ); System.out.println( "Number of Text: " + parsedDoc.getText().size() ); System.out.println( "Number of Extra: " + parsedDoc.getExtra().size() ); String description = parsedDoc.getDescription(); final int MAX_DESCRIPTION = 800; if (description.length() > MAX_DESCRIPTION) description = (description.substring(0,(MAX_DESCRIPTION/2-1)) + "\n\n ... display truncated ... 
\n\n" + description.substring(description.length()-(MAX_DESCRIPTION/2-1)-1,description.length()-1)); System.out.println( "Parsed Description:\n" + description ); System.out.println( "------------------------------------------------" ); } /** * Customize the information to insert into the search index. * <p> * Your application could add meta-data, additional index terms, remove * index terms, add text to be indexed, change the title, etc. * <p> * This sample adds some text to be indexed, adds some specific * terms to be indexed, and adds some meta-data to be returned * with the SearchResult (and which is not added to the search index).
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -