📄 bulletparser.java
字号:
int i, c = 0; String tmpEntity; if ( length < 2 ) return -1; if ( a[ offset + 1 ] == '#' ) { if ( length > 2 && a[ offset + 2 ] == 'x' ) { for( i = 3; i < length && i < MAX_HEX_ENTITY_LENGTH && Character.digit( a[ i + offset ], HEXADECIMAL ) != -1; i++ ); tmpEntity = new String( a, offset + 3, i - 3 ); if ( i != 3 ) c = Integer.parseInt( tmpEntity, HEXADECIMAL ); } else { for( i = 2; i < length && i < MAX_DEC_ENTITY_LENGTH && Character.isDigit( a[ i + offset ] ); i++ ); tmpEntity = new String( a, offset + 2, i - 2 ); if ( i != 2 ) c = Integer.parseInt( tmpEntity ); } if ( c > 0 && c < MAX_ENTITY_VALUE ) { lastEntity = (char)c; if ( i < length && a[ i + offset ] == ';' ) i++; return i + offset; } } else { if ( Character.isLetter( a[ offset + 1 ] ) ) { for( i = 2; i < length && Character.isLetterOrDigit( a[ offset + i ] ); i++ ); if ( i != 1 && ( loose || ( i < length && ( Character.isWhitespace( a[ offset + i ] ) || a[ offset + i ] == ';' ) ) ) && ( lastEntity = entity2Char( entity.length( 0 ).append( a, offset + 1, i - 1 ) ) ) != 0 ) { if ( i < length && a[ i + offset ] == ';' ) i++; return i + offset; } } } return -1; } /** * Replaces entities with the corresponding characters. * * <P>This method will modify the mutable string <code>s</code> so that all legal occurrences * of entities are replaced by the corresponding character. * * @param s a mutable string whose entities will be replaced by the corresponding characters. * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}. * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}. */ protected void replaceEntities( final MutableString s, final MutableString entity, final boolean loose ) { final char[] a = s.array(); int length = s.length(); /* We examine the string *backwards*, so that i is always a valid index. */ int i = length, j; while( i-- > 0 ) if ( a[ i ] == '&' && ( j = scanEntity( a, i, length - i, loose, entity ) ) != -1 ) length = s.replace( i, j, lastEntity ).length(); } /** Handles markup. * * @param text the text. * @param pos the first character in the markup after <samp><!</samp>. * @param end the end of <code>text</code>. * @return the position of the first character after the markup. */ protected int handleMarkup( final char[] text, int pos, final int end ) { // A markup instruction (doctype, comment, etc.). switch( text[ ++pos ] ) { case 'D': case 'd': // DOCTYPE while( pos < end && text[ pos++ ] != '>' ); break; case '-': // comment if ( ( pos = CLOSED_COMMENT.search( text, pos, end ) ) == -1 ) pos = end; else pos += CLOSED_COMMENT.length(); break; default: if ( pos < end - 6 && text[ pos ] == '[' && text[ pos + 1 ] == 'C' && text[ pos + 2 ] == 'D' && text[ pos + 3 ] == 'A' && text[ pos + 4 ] == 'T' && text[ pos + 5 ] == 'A' && text[ pos + 6 ] == '[' ) { // CDATA section final int last = CLOSED_CDATA.search( text, pos, end ); if ( parseCDATA ) callback.cdata( null, text, pos + 7, ( last == -1 ? end : last ) - pos - 7 ); pos = last == -1 ? end : last + CLOSED_CDATA.length(); } // Generic markup else while( pos < end && text[ pos++ ] != '>' ); break; } return pos; } /** Handles processing instruction, ASP tags etc. * * @param text the text. * @param pos the first character in the markup after <samp><%</samp>. * @param end the end of <code>text</code>. * @return the position of the first character after the processing instruction. */ protected int handleProcessingInstruction( final char[] text, int pos, final int end ) { switch( text[ ++pos ] ) { case '%': if ( ( pos = CLOSED_PERCENT.search( text, pos, end ) ) == -1 ) pos = end; else pos += CLOSED_PERCENT.length(); break; case '?': if ( ( pos = CLOSED_PIC.search( text, pos, end ) ) == -1 ) pos = end; else pos += CLOSED_PIC.length(); break; case '[': if ( ( pos = CLOSED_SECTION.search( text, pos, end ) ) == -1 ) pos = end; else pos += CLOSED_SECTION.length(); break; default: // Generic markup while( pos < end && text[ pos++ ] != '>' ); break; } return pos; } /** * Analyze the text document to extract information. * * @param text a <code>char</code> array of text to be parsed. */ public void parse( final char[] text ) { parse( text, 0, text.length ); } /** * Analyze the text document to extract information. * * @param text a <code>char</code> array of text to be parsed. * @param offset the offset in the array from which the parsing will begin. * @param length the number of characters to be parsed. */ public void parse( final char[] text, final int offset, final int length ) { MutableString tagElemTypeName = new MutableString(); MutableString attrName = new MutableString(); MutableString attrValue = new MutableString(); MutableString entity = new MutableString(); MutableString characters = new MutableString(); /* During the analysis of attribute we need a separator for values */ char delim; /* The current character */ char currChar; /* The state of the switch */ int state; /* Others integer values used in the parsing process */ int start, k; /* This boolean is set true if we have words to handle */ boolean flowBroken = false, parseCurrAttr; /* The current element. */ Element currentElement; /* The current attribute object */ Attribute currAttr = null; attrMap = new Reference2ObjectArrayMap<Attribute,MutableString>( 16 ); callback.startDocument(); tagElemTypeName.length( 0 ); attrName.length( 0 ); attrValue.length( 0 ); entity.length( 0 ); state = STATE_TEXT; currentElement = null; final int end = offset + length; int pos = offset; /* This is the main loop. */ while ( pos < end ) { switch( state ) { case STATE_TEXT: currChar = text[ pos ]; if ( currChar == '&' ) { // We handle both the case of an entity, and that of a stray '&'. if ( ( k = scanEntity( text, pos, end - pos, true, entity ) ) == -1 ) { currChar = '&'; pos++; } else { currChar = lastEntity; pos = k; if ( DEBUG ) System.err.println( "Entity at: " + pos + " end of entity: " + k + " entity: " + entity + " char: " + currChar ); } if ( parseText ) characters.append( currChar ); continue; } // No tags can happen later than end - 2. if ( currChar != '<' || pos >= end - 2 ) { if ( parseText ) characters.append( currChar ); pos++; continue; } switch( text[ ++pos ] ) { case '!': pos = handleMarkup( text, pos, end ); break; case '%': case '?': pos = handleProcessingInstruction( text, pos, end ); break; default: // Actually a tag. Note that we allow for </> and that we skip false positives // due to sloppy HTML writing (e.g., "<-- hello! -->" ). if ( Character.isLetter( text[ pos ] ) ) state = STATE_BEFORE_START_TAG_NAME; else if ( text[ pos ] == '/' && ( Character.isLetter( text[ pos + 1 ] ) || text[ pos + 1 ] == '>' ) ) { state = STATE_BEFORE_END_TAG_NAME; pos++; } else { // Not really a tag. if ( parseText ) characters.append( '<' ); continue; } break; } if ( parseText && characters.length() != 0 ) { callback.characters( characters.array(), 0, characters.length(), flowBroken ); characters.length( 0 ); } flowBroken = false; break; case STATE_BEFORE_START_TAG_NAME: case STATE_BEFORE_END_TAG_NAME: // Let's get the name. tagElemTypeName.length( 0 ); for( start = pos; pos < end && ( Character.isLetterOrDigit( text[ pos ] ) || text[ pos ] == ':' || text[ pos ] == '_' ||text[ pos ] == '-' || text[ pos ] == '.' ); pos++ ); tagElemTypeName.append( text, start, pos - start ); tagElemTypeName.toLowerCase(); currentElement = factory.getElement( tagElemTypeName ); if ( DEBUG ) System.err.println( ( state == STATE_BEFORE_START_TAG_NAME ? "Opening" : "Closing" ) + " tag for " + tagElemTypeName + " (element: " + currentElement+ ")" ); if ( currentElement != null && currentElement.breaksFlow ) flowBroken = true; while( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++; state = state == STATE_BEFORE_START_TAG_NAME ? STATE_IN_START_TAG : STATE_IN_END_TAG; break; case STATE_IN_START_TAG: currChar = text[ pos ]; if ( currChar != '>' && ( currChar != '/' || pos == end - 1 || text[ pos + 1 ] != '>' ) ) { // We got attributes. if ( Character.isLetter( currChar ) ) { parseCurrAttr = false; attrName.length( 0 ); for( start = pos; pos < end && ( Character.isLetter( text[ pos ] ) || text[ pos ] == '-' ); pos++ ); if ( currentElement != null && parseAttributes ) { attrName.append( text, start, pos - start ); attrName.toLowerCase(); if ( DEBUG ) System.err.println( "Got attribute named \"" + attrName + "\"" ); currAttr = factory.getAttribute( attrName ); parseCurrAttr = parsedAttrs.contains( currAttr ); } // Skip whitespace while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++; if ( pos == end ) break; if ( text[ pos ] != '=' ) { // We found an attribute without explicit value. // TODO: can we avoid another string? if ( parseCurrAttr ) attrMap.put( currAttr, new MutableString( currAttr.name ) ); break; } pos++; while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++; if ( pos == end ) break; attrValue.length( 0 ); if ( pos < end && ( ( delim = text[ pos ] ) == '"' || ( delim = text[ pos ] ) == '\'' ) ) { // An attribute value with delimiters. for( start = ++pos; pos < end && text[ pos ] != delim; pos++ ); if ( parseCurrAttr ) attrValue.append( text, start, pos - start ).replace( NONSPACE_WHITESPACE, SPACE ); if ( pos < end ) pos++; } else { // An attribute value without delimiters. Due to very common errors, we // gather characters up to the first occurrence of whitespace or '>'. for( start = pos; pos < end && !Character.isWhitespace( text[ pos ] ) && text[ pos ] != '>'; pos++ ); if ( parseCurrAttr ) attrValue.append( text, start, pos - start ); } if ( parseCurrAttr ) { replaceEntities( attrValue, entity, false ); attrMap.put( currAttr, attrValue.copy() ); if ( DEBUG ) System.err.println( "Attribute value: \"" + attrValue + "\"" ); } // Skip whitespace while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++; } else { // It's a mess. Our only reasonable chance is to try to resync on the first // whitespace, or alternatively to get to the end of the tag. do pos++; while ( pos < end && text[ pos ] != '>' && ! Character.isWhitespace( text[ pos ] ) ); // Skip whitespace while ( pos < end && Character.isWhitespace( text[ pos ] ) ) pos++; continue; } } else { if ( parseTags && ! callback.startElement( currentElement, attrMap ) ) break; if ( attrMap != null ) attrMap.clear(); if ( currentElement == Element.SCRIPT || currentElement == Element.STYLE ) { final TextPattern pattern = currentElement == Element.SCRIPT ? SCRIPT_CLOSE_TAG_PATTERN : STYLE_CLOSE_TAG_PATTERN; start = pos + 1; pos = pattern.search( text, start, end ); if ( pos == -1 ) pos = end; if ( parseText ) callback.cdata( currentElement, text, start, pos - start ); if ( pos < end ) { if ( parseTags ) callback.endElement( currentElement ); pos += pattern.length(); } } else pos += currChar == '/' ? 2 : 1; state = STATE_TEXT; } break; case STATE_IN_END_TAG: while ( pos < end && text[ pos ] != '>' ) pos++; if ( parseTags && currentElement != null && ! callback.endElement( currentElement ) ) break; state = STATE_TEXT; pos++; break; default: } } // We do what we can to invoke tag handlers in case of a truncated text. if ( state == STATE_IN_START_TAG && parseTags && currentElement != null ) callback.startElement( currentElement, attrMap ); if ( state == STATE_IN_END_TAG && parseTags && currentElement != null ) callback.endElement( currentElement ); if ( state == STATE_TEXT && parseText && characters.length() > 0 ) callback.characters( characters.array(), 0, characters.length(), flowBroken ); callback.endDocument(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -