📄 dispatchingdocumentfactory.java
字号:
for ( k = 0; k < n; k++ ) if ( e.getValue() == documentFactory[ k ].getClass() ) { if ( e.getKey().equals( OTHERWISE_IN_RULE ) ) value2int.defaultReturnValue( k ); else value2int.put( e.getKey(), k ); break; } if ( k == n ) throw new IllegalArgumentException( "Mismatch in the rule mapping " + e.getKey() + " to " + e.getValue() ); } System.out.println( "Building a strategy mapping " + dispatchingKey + " to " + value2int ); strategy = new StringBasedDispatchingStrategy( dispatchingKey, value2int ); } /** Creates a new dispatching factory. * * @param documentFactory the array of subfactories. * @param fieldName the names of this factory's fields. * @param fieldType the types of this factory's fields. * @param rename the way fields of this class are mapped to fields of the subfactories. * @param strategy the strategy to decide which factory should be used. */ public DispatchingDocumentFactory( final DocumentFactory[] documentFactory, final String[] fieldName, final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy ){ init( documentFactory, fieldName, fieldType, rename, strategy ); checkAttributes(); } public DispatchingDocumentFactory copy() { final DocumentFactory[] documentFactory = new DocumentFactory[ this.documentFactory.length ]; for( int i = documentFactory.length; i-- != 0; ) documentFactory[ i ] = this.documentFactory[ i ].copy(); return new DispatchingDocumentFactory( documentFactory, fieldName, fieldType, rename, strategy ); } public DispatchingDocumentFactory( final Properties properties ) throws ConfigurationException { super( properties ); setExtraArguments( properties ); checkAttributes(); } public DispatchingDocumentFactory( final String[] property ) throws ConfigurationException { super( property ); setExtraArguments( property ); checkAttributes(); } public DispatchingDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) { super( defaultMetadata ); checkAttributes(); // Will certainly fail because the configuration is actually missing } public DispatchingDocumentFactory() { super(); checkAttributes(); // Will certainly fail because the configuration is actually missing } @SuppressWarnings("unchecked") @Override protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException { if ( sameKey( MetadataKeys.FIELDNAME, key ) ) { fieldName = values; numberOfFields = fieldName.length; return true; } else if ( sameKey( MetadataKeys.KEY, key ) ) { final String dispatchingKeyName = ensureJustOne( key, values ); final int lastDot = dispatchingKeyName.lastIndexOf( '.' ); try { dispatchingKey = Enum.valueOf( (Class<Enum>)Class.forName( dispatchingKeyName.substring( 0, lastDot ) ), dispatchingKeyName.substring( lastDot + 1) ); } catch ( ClassNotFoundException e ) { throw new IllegalArgumentException( "The class specified in the key " + dispatchingKeyName + " cannot be found" ); } return true; } else if ( sameKey( MetadataKeys.RULE, key ) ) { String[] rules = values; value2factoryClass = new Object2ObjectLinkedOpenHashMap<String,Class<? extends DocumentFactory>>(); int i, m = rules.length; for ( i = 0; i < m; i++ ) { int pos = rules[ i ].indexOf( ':' ); if ( pos <= 0 || pos == rules[ i ].length() - 1 ) throw new ConfigurationException( "Rule " + rules[ i ] + " does not contain a colon or it is malformed" ); if ( rules[ i ].indexOf( ':', pos + 1 ) >= 0 ) throw new ConfigurationException( "Rule " + rules[ i ] + " contains too many colons" ); String factoryName = rules[ i ].substring( pos + 1 ); Class<? extends DocumentFactory> factoryClass = null; try { factoryClass = (Class<? extends DocumentFactory>)Class.forName( factoryName ); if ( ! ( DocumentFactory.class.isAssignableFrom( factoryClass ) ) ) throw new ClassNotFoundException(); } catch ( ClassNotFoundException e ) { throw new ConfigurationException( "ParsingFactory " + factoryName + " is invalid; maybe the package name is missing" ); } value2factoryClass.put( rules[ i ].substring( 0, pos ), factoryClass ); } m = value2factoryClass.values().size(); return true; } else if ( sameKey( MetadataKeys.MAP, key ) ) { String[] pieces = values; int i, m = pieces.length; rename = new int[ m ][]; for ( i = 0; i < m; i++ ) { String[] subpieces = pieces[ i ].split( ":" ); if ( i > 0 && subpieces.length != rename[ 0 ].length ) throw new ConfigurationException( "Length mismatch in the map " + values ); rename[ i ] = new int[ subpieces.length ]; for ( int k = 0; k < subpieces.length; k++ ) { try { rename[ i ][ k ] = Integer.parseInt( subpieces[ k ] ); } catch ( NumberFormatException e ) { throw new ConfigurationException( "Number format exception in the map " + values ); } } } } return super.parseProperty( key, values, metadata ); } public int numberOfFields() { return numberOfFields; } public String fieldName( final int field ) { ensureFieldIndex( field ); return fieldName[ field ]; } public int fieldIndex( final String fieldName ) { for ( int k = 0; k < numberOfFields; k++ ) if ( this.fieldName[ k ].equals( fieldName ) ) return k; return -1; } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); return fieldType[ field ]; } /** A word reader that is returned when a null field should be returned. */ final private WordReader nullReader = new FastBufferedReader(); public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException { final int factoryIndex = strategy.factoryNumber( metadata, this ); System.out.println( "The strategy returned " + factoryIndex ); if ( factoryIndex < 0 || factoryIndex >= n ) throw new IllegalArgumentException(); System.out.println( "Going to parse a document with " + metadata + ", using " + documentFactory[ factoryIndex ].getClass().getName() ); final DocumentFactory factory = documentFactory[ factoryIndex ]; final Document document = factory.getDocument( rawContent, metadata ); return new AbstractDocument() { public CharSequence title() { return document.title(); } public String toString() { return document.toString(); } public CharSequence uri() { return document.uri(); } public Object content( final int field ) throws IOException { ensureFieldIndex( field ); if ( rename[ factoryIndex ][ field ] < 0 ) return NullReader.getInstance(); return document.content( rename[ factoryIndex ][ field ] ); } public WordReader wordReader( final int field ) { ensureFieldIndex( field ); if ( rename[ factoryIndex ][ field ] < 0 ) return nullReader; return document.wordReader( rename[ factoryIndex ][ field ] ); } public void close() throws IOException { super.close(); document.close(); } }; } public static void main( final String[] arg ) throws IOException, ConfigurationException { //PdfDocumentFactory pdfFactory = new PdfDocumentFactory(); //HtmlDocumentFactory htmlFactory = new HtmlDocumentFactory(); //IdentityDocumentFactory idFactory = new IdentityDocumentFactory(); //Object2IntMap map = new Object2IntOpenHashMap( // new String[] { "application/pdf", "text/html" }, // new int[] { 0, 1 } // ); //map.defaultReturnValue( 2 ); //DispatchingStrategy strategy = new StringBasedDispatchingStrategy( MetadataKeys.MIMETYPE, map ); Properties p = new Properties(); p.addProperty( MetadataKeys.FIELDNAME.name().toLowerCase(), "text,title" ); p.addProperty( MetadataKeys.KEY.name().toLowerCase(), PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE.name() ); p.addProperty( MetadataKeys.RULE.name().toLowerCase(), "application/pdf:it.unimi.dsi.mg4j.document.PdfDocumentFactory,text/html:it.unimi.dsi.mg4j.document.HtmlDocumentFactory,?:it.unimi.dsi.mg4j.document.IdentityDocumentFactory" ); p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" ); p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" ); p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" ); p.addProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING.name().toLowerCase(), "iso-8859-1" ); DispatchingDocumentFactory factory = new DispatchingDocumentFactory( p ); DocumentCollection dc = new FileSetDocumentCollection( arg, factory ); BinIO.storeObject( dc, "test.collection" ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -