📄 ternaryintervalsearchtree.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
					s.append( e.path, 0, e.path.length - 1 );					e = e.left;					continue;				}								index -= e.left.numNodes;			}						if ( e.isWord ) {				if ( index == 0 ) return s.append( e.path ).compact();				index--;			}									if ( e.middle != null ) {				if ( index < e.middle.numNodes ) {					s.append( e.path );					e = e.middle;					continue;				}				index -= e.middle.numNodes;			}						s.append( e.path, 0, e.path.length - 1 );			e = e.right;		}	}		public CharSequence get( final int index ) {		return getTerm( index );	}	@Deprecated	public int getIndex( final CharSequence s ) {		return getNumber( s );	}		public int getNumber( final CharSequence s ) {		final int l = s.length();		Node e = root;		int i;		int offset = 0;		int wordsAtLeft = 0;		char c;		char[] path;				while( e != null ) {			path = e.path;			for( i = 0; i < path.length - 1; i++ ) 				if ( offset + i == l || s.charAt( offset + i ) != path[ i ] ) return -1;						offset += i;			if ( offset == l ) return -1;						c = s.charAt( offset );			if ( c < e.path[ i ] ) e = e.left;			else if ( c > e.path[ i ] ) { 				if ( e.left != null ) wordsAtLeft += e.left.numNodes;				if ( e.middle != null ) wordsAtLeft += e.middle.numNodes;				if ( e.isWord ) wordsAtLeft++;				e = e.right;			}			else {				offset++;				if ( e.left != null ) wordsAtLeft += e.left.numNodes;				if ( offset == l ) return e.isWord ? wordsAtLeft : -1;				if ( e.isWord ) wordsAtLeft++;				e = e.middle;			}		}		return -1;	}		/** True if the last {@link #add(CharSequence)} modified the tree. */	private boolean modified;		public boolean add( final CharSequence s ) {		modified = false;		root = addRec( s, 0, s.length(), root );		return modified;	}			public CharSequence getPrefix( final Interval interval ) {		final MutableString s = new MutableString();		return getPrefix ( interval, s );	}			public MutableString getPrefix( final Interval interval, final MutableString prefix ) {		if ( interval == Intervals.EMPTY_INTERVAL || interval.left < 0 || interval.right < 0 ) throw new IllegalArgumentException();		getTerm( interval.left, prefix );		if ( interval.length() == 1 ) return prefix;		final MutableString s = (MutableString)getTerm( interval.right );		final int l = Math.min( prefix.length(), s.length() );		int i;		for( i = 0; i < l; i++ ) if ( s.charAt( i ) != prefix.charAt( i ) ) break;		return prefix.length( i );	} 	/** Inserts the given character sequence, starting at the given position, in the given subtree.	 * 	 * @param s the character sequence containing the characters to be inserted.	 * @param offset the first character to be inserted.	 * @param length the number of characters to be inserted.	 * @param e the subtree in which the characters should be inserted, or <code>null</code> if	 * a new node should be created.	 * @return the new node at the top of the subtree.	 */		private Node addRec( final CharSequence s, final int offset, final int length, final Node e ) {				if ( e == null ) {			// We create a new node containing all the characters and return it.			modified = true;			size++;			return new Node( s, offset, length, true, 1 );		}				/* We start scanning the path contained in the current node, up to		 * the last character excluded. If we find a mismatch, or if we exhaust our		 * characters, we must fork this node. */				char c;		int i;		Node n = null;		final char[] path = e.path;				for ( i = 0; i < path.length - 1; i++ ) {			c = s.charAt( offset + i );			if ( c < path[ i ] ) {				/* We fork on the left, keeping just the first i + 1 characters (this is necessary				 * as at least one character must be present in every node). The new				 * node will cover one word more than e.				 */				n = new Node( path, 0, i + 1, false, e.numNodes + 1 );				n.middle = e;				e.removePathPrefix( i + 1 );				n.left = addRec( s, offset + i, length - i, null );				break;			}			else if ( c > path[ i ] ) {				// As before, but on the right.				n = new Node( path, 0, i + 1, false, e.numNodes + 1 );				n.middle = e;				e.removePathPrefix( i + 1 );								n.right = addRec( s, offset + i, length - i, null );				break;			}			else {				if ( i == length - 1 ) {					/* We exhausted the character sequence. We fork in the middle,					 * keeping length characters and marking the new node as					 * containing one work. Again, the new code will cover one word					 * more than e. */					n = new Node( s, offset, length, true, e.numNodes + 1 );					n.middle = e;					e.removePathPrefix( length );					size++;					modified = true;					break;				}			}		}				if ( i < path.length - 1 ) return n;		/* We are positioned on the last character of the path. In this case our		 * behaviour is different, as if we must fork we must not perform any		 * splitting. Moreover, if we exhaust the characters we either found		 * the new sequence in the tree, or we just have to mark the node. */				c = s.charAt( offset + i );				if ( c < path[ i ] ) {			/** We fork on the left. The number of words under this node will			 * increase only if the structure is modified. */			e.left = addRec( s, offset + i, length - i, e.left );			if ( modified ) e.numNodes++;		}		else if ( c > path[ i ] ) {			e.right = addRec( s, offset + i, length - i, e.right );			if ( modified ) e.numNodes++;		}		else {			if ( i == length - 1 ) {				// This is the node.				if ( modified = !e.isWord ) {					e.numNodes++;					size++;				}				e.isWord = true;			}			else {				// We add a node in the middle, completing the sequence.				e.middle = addRec( s, offset + i + 1, length - i - 1, e.middle );				if ( modified ) e.numNodes++;			}		}		return e;	}	public int size() {		return size;	}		public boolean hasPrefixes() {		return true;	}	public boolean hasTerms() {		return true;	}	public static void main( final String[] arg ) throws IOException, JSAPException, NoSuchMethodException {		final SimpleJSAP jsap = new SimpleJSAP( TernaryIntervalSearchTree.class.getName(), "Builds a ternary interval search tree reading from standard input a newline-separated list of terms.",			new Parameter[] {				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read terms." ),				new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ),				new UnflaggedOption( "tree", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised tree." )		});		JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;		final TernaryIntervalSearchTree tree = new TernaryIntervalSearchTree();				MutableString term = new MutableString();		final ProgressLogger pl = new ProgressLogger();		pl.itemsName = "terms";		final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( System.in, (Charset)jsapResult.getObject( "encoding" ) ), jsapResult.getInt( "bufferSize" ) );						pl.start( "Reading terms..." );		while( terms.readLine( term ) != null ) {			pl.update();			tree.add( term );		}		pl.done();		BinIO.storeObject( tree, jsapResult.getString( "tree" ) );	}}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -