📄 bulletparser.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
		int i, c = 0;
		String tmpEntity;

		if ( length < 2 ) return -1;

		if ( a[ offset + 1 ] == '#' ) {
			if ( length > 2 && a[ offset + 2 ] == 'x' ) {
				for( i = 3; i < length && i < MAX_HEX_ENTITY_LENGTH && Character.digit( a[ i + offset ], HEXADECIMAL ) != -1; i++ );
				tmpEntity =  new String( a, offset + 3, i - 3 );
				if ( i != 3 ) c = Integer.parseInt( tmpEntity, HEXADECIMAL );
			}
			else {
				for( i = 2; i < length && i < MAX_DEC_ENTITY_LENGTH && Character.isDigit( a[ i + offset ] ); i++ );
				tmpEntity = new String( a, offset + 2, i - 2 );
				if ( i != 2 ) c = Integer.parseInt( tmpEntity );
			}

			if ( c > 0 && c < MAX_ENTITY_VALUE ) {
				lastEntity = (char)c;
				if ( i < length && a[ i + offset ] == ';' ) i++;
				return i + offset;
			}
		}
		else {
			if ( Character.isLetter( a[ offset + 1 ] ) ) {
				for( i = 2; i < length && Character.isLetterOrDigit( a[ offset + i ] ); i++ );
				if ( i != 1 && ( loose || ( i < length && ( Character.isWhitespace( a[ offset + i ] ) || a[ offset + i ] == ';' ) ) ) && ( lastEntity = entity2Char( entity.length( 0 ).append( a, offset + 1, i - 1 ) ) ) != 0 ) {
					if ( i < length && a[ i + offset ] == ';' ) i++;
					return i + offset;
				}
			}
		}
		return -1;
	}

	/**
	 * Replaces entities with the corresponding characters.
	 * 
	 * <P>This method will modify the mutable string <code>s</code> so that all legal occurrences
	 * of entities are replaced by the corresponding character.
	 * 
	 * <P>Every <samp>&amp;</samp> found in <code>s</code> is handed to
	 * {@link #scanEntity(char[], int, int, boolean, MutableString)}; when that method
	 * recognises an entity (i.e., it does not return -1), the scanned span is replaced by
	 * the character it left in <code>lastEntity</code>.
	 * 
	 * @param s a mutable string whose entities will be replaced by the corresponding characters.
	 * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}.
	 * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}.
	 */
	protected void replaceEntities( final MutableString s, final MutableString entity, final boolean loose ) {
		final char[] a = s.array();
		int length = s.length();

		/* We examine the string *backwards*, so that i is always a valid index:
		 * a replacement only touches characters at or after i, which were already visited. */
		int i = length, j;
		while( i-- > 0 )
			if ( a[ i ] == '&' && ( j = scanEntity( a, i, length - i, loose, entity ) ) != -1 ) 
				length = s.replace( i, j, lastEntity ).length();
	}

	/** Handles markup.
	 * 
	 * <P>The character <em>following</em> <code>pos</code> selects the kind of markup:
	 * <samp>D</samp>/<samp>d</samp> (DOCTYPE) is skipped up to the first <samp>&gt;</samp>;
	 * <samp>-</samp> (comment) is skipped up to <code>CLOSED_COMMENT</code>; <samp>[CDATA[</samp>
	 * is skipped up to <code>CLOSED_CDATA</code>, reporting its content to the callback if
	 * <code>parseCDATA</code> is set; anything else is skipped up to the first <samp>&gt;</samp>.
	 * Unterminated markup extends to <code>end</code>.
	 * 
	 * @param text the text.
	 * @param pos the index of the <samp>!</samp> following <samp>&lt;</samp> (the method
	 * advances past it before inspecting the markup); the caller must guarantee that
	 * <code>pos + 1</code> is a valid index.
	 * @param end the end of <code>text</code>.
	 * @return the position of the first character after the markup.
	 */
	protected int handleMarkup( final char[] text, int pos, final int end ) {
		// A markup instruction (doctype, comment, etc.).
		switch( text[ ++pos ] ) {
		case 'D':
		case 'd':
			// DOCTYPE
			while(  pos < end && text[ pos++ ] != '>' );
			break;

		case '-':
			// comment
			if ( ( pos = CLOSED_COMMENT.search( text, pos, end ) ) == -1 ) pos = end;
			else pos += CLOSED_COMMENT.length();
			break;

		default:
			// "[CDATA[" is seven characters long, hence the need for pos + 6 to be a valid index.
			if ( pos < end - 6 && 
					text[ pos ] == '[' && text[ pos + 1 ] == 'C' && text[ pos + 2 ] == 'D' && text[ pos + 3 ] == 'A' && text[ pos + 4 ] == 'T' && text[ pos + 5 ] == 'A' && text[ pos + 6 ] == '[' ) {
				// CDATA section
				final int last = CLOSED_CDATA.search( text, pos, end );
				if ( parseCDATA ) callback.cdata( null, text, pos + 7, ( last == -1 ? end : last ) - pos - 7 );
				pos = last == -1 ? end : last + CLOSED_CDATA.length();
			}
			//  Generic markup
			else while( pos < end && text[ pos++ ] != '>' );
			break;
		}
		return pos;
	}

	/** Handles processing instruction, ASP tags etc.
	 * 
	 * <P>NOTE(review): the switch below inspects the character <em>after</em> <code>pos</code>.
	 * Since {@link #parse(char[], int, int)} passes the index of the <samp>%</samp> or
	 * <samp>?</samp> itself, the dedicated terminators are only searched for input starting
	 * with <samp>&lt;%%</samp>, <samp>&lt;??</samp>, <samp>&lt;%[</samp> or <samp>&lt;?[</samp>;
	 * everything else is skipped up to the first <samp>&gt;</samp>. Confirm whether this is intended.
	 * 
	 * @param text the text.
	 * @param pos the index of the <samp>%</samp> or <samp>?</samp> following <samp>&lt;</samp>
	 * (the method advances past it before inspecting the instruction); the caller must
	 * guarantee that <code>pos + 1</code> is a valid index.
	 * @param end the end of <code>text</code>.
	 * @return the position of the first character after the processing instruction.
	 */
	protected int handleProcessingInstruction( final char[] text, int pos, final int end ) {
		switch( text[ ++pos  ] ) {
		case '%':
			if ( ( pos = CLOSED_PERCENT.search( text, pos, end ) ) == -1 ) pos = end;
			else pos += CLOSED_PERCENT.length();
			break;

		case '?':
			if ( ( pos = CLOSED_PIC.search( text, pos, end ) ) == -1 ) pos = end;
			else pos += CLOSED_PIC.length();
			break;

		case '[':
			if ( ( pos = CLOSED_SECTION.search( text, pos, end ) ) == -1 ) pos = end;
			else pos += CLOSED_SECTION.length();
			break;

		default:
			//  Generic markup
			while( pos < end && text[ pos++ ] != '>' );
			break;
		}
		return pos;
	}

	/**
	 * Analyze the text document to extract information.
	 * 
	 * @param text a <code>char</code> array of text to be parsed.
	 */
	public void parse( final char[] text ) {
		parse( text, 0, text.length );
	}

	/**
	 * Analyze the text document to extract information.
	 * 
	 * <P>The text is scanned by a small state machine (text, tag name, inside a start tag,
	 * inside an end tag) that reports what it finds to the callback. If the callback returns
	 * <code>false</code> from <code>startElement()</code> or <code>endElement()</code> during
	 * the scan, parsing stops immediately: no further element or character events are
	 * generated, and only <code>endDocument()</code> is invoked.
	 * 
	 * @param text a <code>char</code> array of text to be parsed.
	 * @param offset the offset in the array from which the parsing will begin.
	 * @param length the number of characters to be parsed.
	 */
	public void parse( final char[] text, final int offset, final int length ) {
		MutableString tagElemTypeName = new MutableString(); 
		MutableString attrName = new MutableString(); 
		MutableString attrValue = new MutableString(); 
		MutableString entity = new MutableString();
		MutableString characters = new MutableString();

		/* During the analysis of attribute we need a separator for values */
		char delim;
		/* The current character */
		char currChar;
		/* The state of the switch */
		int state;
		/* Others integer values used in the parsing process */
		int start, k;
		/* This boolean is set true if we have words to handle */
		boolean flowBroken = false, parseCurrAttr;
		/* Set when a callback asked us to stop parsing. */
		boolean stopped = false;

		/* The current element. */
		Element currentElement;
		/* The current attribute object */
		Attribute currAttr = null; 

		attrMap = new Reference2ObjectArrayMap<Attribute,MutableString>( 16 );	
		callback.startDocument();

		tagElemTypeName.length( 0 ); 
		attrName.length( 0 ); 
		attrValue.length( 0 ); 
		entity.length( 0 ); 
		state = STATE_TEXT;
		currentElement = null;

		final int end = offset + length;
		int pos = offset;

		/* This is the main loop. It is labeled so that a callback refusal can leave it
		 * from inside the switch: an unlabeled break would only leave the switch and the
		 * same callback would be invoked again on the next iteration. */
		main: while ( pos < end ) {

			switch( state ) {
			case STATE_TEXT:
				currChar = text[ pos ];
				if ( currChar == '&' ) {
					// We handle both the case of an entity, and that of a stray '&'.
					if ( ( k = scanEntity( text, pos, end - pos, true, entity ) ) == -1 ) {
						currChar = '&';
						pos++;
					}
					else {
						currChar = lastEntity;
						// Log before moving pos, so that the start of the entity is actually reported.
						if ( DEBUG ) System.err.println( "Entity at: " + pos + " end of entity: " + k + " entity: " + entity + " char: " + currChar );
						pos = k;
					}
					if ( parseText ) characters.append( currChar );
					continue;
				}

				// No tags can happen later than end - 2.
				if ( currChar != '<' || pos >= end - 2 ) {
					if ( parseText ) characters.append( currChar );
					pos++;
					continue;
				}

				switch( text[ ++pos ] ) {
				case '!':
					pos = handleMarkup( text, pos, end );
					break;
				case '%':
				case '?':
					pos = handleProcessingInstruction( text, pos, end );
					break;
				default:
					// Actually a tag. Note that we allow for </> and that we skip false positives
					// due to sloppy HTML writing (e.g., "<-- hello! -->" ).
					if ( Character.isLetter( text[ pos ] ) ) state = STATE_BEFORE_START_TAG_NAME;
					else if ( text[ pos ] == '/' && ( Character.isLetter( text[ pos + 1 ] ) || text[ pos + 1 ] == '>' ) ) {
						state = STATE_BEFORE_END_TAG_NAME;
						pos++;
					}
					else {
						// Not really a tag.
						if ( parseText ) characters.append( '<' );
						continue;
					}
					break;
				}

				// Some kind of markup was found: flush the text accumulated so far.
				if ( parseText && characters.length() != 0 ) {
					callback.characters( characters.array(), 0, characters.length(), flowBroken );
					characters.length( 0 );
				}
				flowBroken = false;

				break;

			case STATE_BEFORE_START_TAG_NAME:
			case STATE_BEFORE_END_TAG_NAME:
				// Let's get the name.
				tagElemTypeName.length( 0 );
				for( start = pos; pos < end && ( Character.isLetterOrDigit( text[ pos ] ) || text[ pos ] == ':' || text[ pos ] == '_' ||text[ pos ] == '-' || text[ pos ] == '.' ); pos++ );

				tagElemTypeName.append( text, start, pos - start );
				tagElemTypeName.toLowerCase();

				currentElement = factory.getElement( tagElemTypeName );
				if ( DEBUG ) System.err.println( ( state == STATE_BEFORE_START_TAG_NAME ? "Opening" : "Closing" ) + " tag for " + tagElemTypeName + " (element: " + currentElement+ ")" );

				if ( currentElement != null && currentElement.breaksFlow ) flowBroken = true;
				while( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++;
				state = state == STATE_BEFORE_START_TAG_NAME ? STATE_IN_START_TAG : STATE_IN_END_TAG;
				break;

			case STATE_IN_START_TAG:
				currChar = text[ pos ];
				if ( currChar != '>' && ( currChar != '/' || pos == end - 1 || text[ pos + 1 ] != '>' ) ) {
					// We got attributes.
					if ( Character.isLetter( currChar ) ) {
						parseCurrAttr = false;
						attrName.length( 0 );
						for( start = pos; pos < end && ( Character.isLetter( text[ pos ] ) || text[ pos ] == '-' ); pos++ );
						if ( currentElement != null && parseAttributes ) { 
							attrName.append( text, start, pos - start );
							attrName.toLowerCase();
							if ( DEBUG ) System.err.println( "Got attribute named \"" + attrName + "\"" );
							currAttr = factory.getAttribute( attrName );
							parseCurrAttr = parsedAttrs.contains( currAttr );
						}
						// Skip whitespace
						while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++;
						// Truncated text: leave the switch; the main loop ends as pos == end.
						if ( pos == end ) break;
						if ( text[ pos ] != '=' ) {
							// We found an attribute without explicit value.
							// TODO: can we avoid another string?
							if ( parseCurrAttr ) attrMap.put( currAttr, new MutableString( currAttr.name ) );
							break;
						}

						pos++;
						while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++;
						if ( pos == end ) break;

						attrValue.length( 0 );
						if ( pos < end && ( ( delim = text[ pos ] ) == '"' || ( delim = text[ pos ] ) == '\'' ) ) {
							// An attribute value with delimiters.
							for( start = ++pos; pos < end && text[ pos ] != delim; pos++ );
							if ( parseCurrAttr ) attrValue.append( text, start, pos - start ).replace( NONSPACE_WHITESPACE, SPACE );
							if ( pos < end ) pos++;
						}
						else {
							// An attribute value without delimiters. Due to very common errors, we 
							// gather characters up to the first occurrence of whitespace or '>'.
							for( start = pos; pos < end && !Character.isWhitespace( text[ pos ] ) && text[ pos ] != '>'; pos++ ); 
							if ( parseCurrAttr ) attrValue.append( text, start, pos - start );
						}

						if ( parseCurrAttr ) {
							replaceEntities( attrValue, entity, false );
							attrMap.put( currAttr, attrValue.copy() );
							if ( DEBUG ) System.err.println( "Attribute value: \"" + attrValue + "\"" );
						}
						// Skip whitespace
						while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++;
					}
					else {
						// It's a mess. Our only reasonable chance is to try to resync on the first
						// whitespace, or alternatively to get to the end of the tag.
						do pos++; while ( pos < end && text[ pos ] != '>' && ! Character.isWhitespace( text[ pos ] ) );
						// Skip whitespace
						while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++;
						continue;
					}
				}
				else {
					// NOTE(review): unlike the end-tag case, currentElement is not checked
					// against null before this call -- confirm callbacks expect that.
					if ( parseTags && ! callback.startElement( currentElement, attrMap ) ) {
						// The callback asked us to stop.
						stopped = true;
						break main;
					}
					if ( attrMap != null ) attrMap.clear();

					if ( currentElement == Element.SCRIPT || currentElement == Element.STYLE ) {
						// The content of these elements is passed verbatim, up to the closing tag.
						final TextPattern pattern = currentElement == Element.SCRIPT ? SCRIPT_CLOSE_TAG_PATTERN : STYLE_CLOSE_TAG_PATTERN; 
						start = pos + 1;
						pos = pattern.search( text, start, end );
						if ( pos == -1 ) pos = end;
						if ( parseText ) callback.cdata( currentElement, text, start, pos - start );
						if ( pos < end ) {
							if ( parseTags ) callback.endElement( currentElement );
							pos += pattern.length();
						}
					}
					else pos += currChar == '/' ? 2 : 1;
					state = STATE_TEXT;
				}
				break;

			case STATE_IN_END_TAG:
				while ( pos < end && text[ pos ] != '>' ) pos++;
				if ( parseTags && currentElement != null && ! callback.endElement( currentElement ) ) {
					// The callback asked us to stop.
					stopped = true;
					break main;
				}
				state = STATE_TEXT;
				pos++;
				break;

			default:
			}

		}

		if ( ! stopped ) {
			// We do what we can to invoke tag handlers in case of a truncated text.
			if ( state == STATE_IN_START_TAG && parseTags && currentElement != null ) callback.startElement( currentElement, attrMap );
			if ( state == STATE_IN_END_TAG && parseTags && currentElement != null ) callback.endElement( currentElement );

			if ( state == STATE_TEXT && parseText && characters.length() > 0 ) 
				callback.characters( characters.array(), 0, characters.length(), flowBroken );
		}

		callback.endDocument();
	}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -