📄 uri.java
字号:
* Encodes URI string. * * This is a two mapping, one from original characters to octets, and * subsequently a second from octets to URI characters: * <p><blockquote><pre> * original character sequence->octet sequence->URI character sequence * </pre></blockquote><p> * * An escaped octet is encoded as a character triplet, consisting of the * percent character "%" followed by the two hexadecimal digits * representing the octet code. For example, "%20" is the escaped * encoding for the US-ASCII space character. * <p> * Conversion from the local filesystem character set to UTF-8 will * normally involve a two step process. First convert the local character * set to the UCS; then convert the UCS to UTF-8. * The first step in the process can be performed by maintaining a mapping * table that includes the local character set code and the corresponding * UCS code. * The next step is to convert the UCS character code to the UTF-8 encoding. * <p> * Mapping between vendor codepages can be done in a very similar manner * as described above. * <p> * The only time escape encodings can allowedly be made is when a URI is * being created from its component parts. The escape and validate methods * are internally performed within this method. * * @param original the original character sequence * @param allowed those characters that are allowed within a component * @param charset the protocol charset * @return URI character sequence * @throws URIException null component or unsupported character encoding */ protected static char[] encode(String original, BitSet allowed, String charset) throws URIException { if (original == null) { throw new IllegalArgumentException("Original string may not be null"); } if (allowed == null) { throw new IllegalArgumentException("Allowed bitset may not be null"); } byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset)); return EncodingUtil.getAsciiString(rawdata).toCharArray(); } /** * Decodes URI encoded string. * * This is a two mapping, one from URI characters to octets, and * subsequently a second from octets to original characters: * <p><blockquote><pre> * URI character sequence->octet sequence->original character sequence * </pre></blockquote><p> * * A URI must be separated into its components before the escaped * characters within those components can be allowedly decoded. * <p> * Notice that there is a chance that URI characters that are non UTF-8 * may be parsed as valid UTF-8. A recent non-scientific analysis found * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% * false reading. * <p> * The percent "%" character always has the reserved purpose of being * the escape indicator, it must be escaped as "%25" in order to be used * as data within a URI. * <p> * The unescape method is internally performed within this method. * * @param component the URI character sequence * @param charset the protocol charset * @return original character sequence * @throws URIException incomplete trailing escape pattern or unsupported * character encoding */ protected static String decode(char[] component, String charset) throws URIException { if (component == null) { throw new IllegalArgumentException("Component array of chars may not be null"); } return decode(new String(component), charset); } /** * Decodes URI encoded string. * * This is a two mapping, one from URI characters to octets, and * subsequently a second from octets to original characters: * <p><blockquote><pre> * URI character sequence->octet sequence->original character sequence * </pre></blockquote><p> * * A URI must be separated into its components before the escaped * characters within those components can be allowedly decoded. * <p> * Notice that there is a chance that URI characters that are non UTF-8 * may be parsed as valid UTF-8. A recent non-scientific analysis found * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% * false reading. * <p> * The percent "%" character always has the reserved purpose of being * the escape indicator, it must be escaped as "%25" in order to be used * as data within a URI. * <p> * The unescape method is internally performed within this method. * * @param component the URI character sequence * @param charset the protocol charset * @return original character sequence * @throws URIException incomplete trailing escape pattern or unsupported * character encoding * * @since 3.0 */ protected static String decode(String component, String charset) throws URIException { if (component == null) { throw new IllegalArgumentException("Component array of chars may not be null"); } byte[] rawdata = null; try { rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component)); } catch (DecoderException e) { throw new URIException(e.getMessage()); } return EncodingUtil.getString(rawdata, charset); } /** * Pre-validate the unescaped URI string within a specific component. * * @param component the component string within the component * @param disallowed those characters disallowed within the component * @return if true, it doesn't have the disallowed characters * if false, the component is undefined or an incorrect one */ protected boolean prevalidate(String component, BitSet disallowed) { // prevalidate the given component by disallowed characters if (component == null) { return false; // undefined } char[] target = component.toCharArray(); for (int i = 0; i < target.length; i++) { if (disallowed.get(target[i])) { return false; } } return true; } /** * Validate the URI characters within a specific component. * The component must be performed after escape encoding. Or it doesn't * include escaped characters. * * @param component the characters sequence within the component * @param generous those characters that are allowed within a component * @return if true, it's the correct URI character sequence */ protected boolean validate(char[] component, BitSet generous) { // validate each component by generous characters return validate(component, 0, -1, generous); } /** * Validate the URI characters within a specific component. * The component must be performed after escape encoding. Or it doesn't * include escaped characters. * <p> * It's not that much strict, generous. The strict validation might be * performed before being called this method. * * @param component the characters sequence within the component * @param soffset the starting offset of the given component * @param eoffset the ending offset of the given component * if -1, it means the length of the component * @param generous those characters that are allowed within a component * @return if true, it's the correct URI character sequence */ protected boolean validate(char[] component, int soffset, int eoffset, BitSet generous) { // validate each component by generous characters if (eoffset == -1) { eoffset = component.length - 1; } for (int i = soffset; i <= eoffset; i++) { if (!generous.get(component[i])) { return false; } } return true; } /** * In order to avoid any possilbity of conflict with non-ASCII characters, * Parse a URI reference as a <code>String</code> with the character * encoding of the local system or the document. * <p> * The following line is the regular expression for breaking-down a URI * reference into its components. * <p><blockquote><pre> * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * 12 3 4 5 6 7 8 9 * </pre></blockquote><p> * For example, matching the above expression to * http://jakarta.apache.org/ietf/uri/#Related * results in the following subexpression matches: * <p><blockquote><pre> * $1 = http: * scheme = $2 = http * $3 = //jakarta.apache.org * authority = $4 = jakarta.apache.org * path = $5 = /ietf/uri/ * $6 = <undefined> * query = $7 = <undefined> * $8 = #Related * fragment = $9 = Related * </pre></blockquote><p> * * @param original the original character sequence * @param escaped <code>true</code> if <code>original</code> is escaped * @throws URIException If an error occurs. */ protected void parseUriReference(String original, boolean escaped) throws URIException { // validate and contruct the URI character sequence if (original == null) { throw new URIException("URI-Reference required"); } /* @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? */ String tmp = original.trim(); /* * The length of the string sequence of characters. * It may not be equal to the length of the byte array. */ int length = tmp.length(); /* * Remove the delimiters like angle brackets around an URI. */ if (length > 0) { char[] firstDelimiter = { tmp.charAt(0) }; if (validate(firstDelimiter, delims)) { if (length >= 2) { char[] lastDelimiter = { tmp.charAt(length - 1) }; if (validate(lastDelimiter, delims)) { tmp = tmp.substring(1, length - 1); length = length - 2; } } } } /* * The starting index */ int from = 0; /* * The test flag whether the URI is started from the path component. */ boolean isStartedFromPath = false; int atColon = tmp.indexOf(':'); int atSlash = tmp.indexOf('/'); if ((atColon <= 0 && !tmp.startsWith("//")) || (atSlash >= 0 && atSlash < atColon)) { isStartedFromPath = true; } /* * <p><blockquote><pre> * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from); if (at == -1) { at = 0; } /* * Parse the scheme. * <p><blockquote><pre> * scheme = $2 = http * @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (at > 0 && at < length && tmp.charAt(at) == ':') { char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); if (validate(target, scheme)) { _scheme = target; } else { throw new URIException("incorrect scheme"); } from = ++at; } /* * Parse the authority component. * <p><blockquote><pre> * authority = $4 = jakarta.apache.org * @@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ // Reset flags _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; if (0 <= at && at < length && tmp.charAt(at) == '/') { // Set flag _is_hier_part = true; if (at + 2 < length && tmp.charAt(at + 1) == '/' && !isStartedFromPath) { // the temporary index to start the search from int next = indexFirstOf(tmp, "/?#", at + 2); if (next == -1) { next = (tmp.substring(at + 2).length() == 0) ? at + 2 : tmp.length(); } parseAuthority(tmp.substring(at + 2, next), escaped); from = at = next; // Set flag _is_net_path = true; } if (from == at) { // Set flag _is_abs_path = true; } } /* * Parse the path component. * <p><blockquote><pre> * path = $5 = /ietf/uri/ * @@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (from < length) { // rel_path = rel_segment [ abs_path ] int next = indexFirstOf(tmp, "?#", from); if (next == -1) { next = tmp.length(); } if (!_is_abs_path) { if (!escaped && prevalidate(tmp.substring(from, next), disallowed_rel_path) || escaped && validate(tmp.substring(from, next).toCharArray(), rel_path)) { // Set flag _is_rel_path = true; } else if (!escaped && prevalidate(tmp.substring(from, next), disallowed_opaque_part) || escaped && validate(tmp.substring(from, next).toCharArray(), opaque_part)) { // Set flag _is_opaque_part = true; } else { // the path component may be empty _path = null; } } String s = tmp.substring(from, next); if (escaped) { setRawPath(s.toCharArray()); } else { setPath(s); } at = next; } // set the charset to do escape encoding String charset = getProtocolCharset(); /* * Parse the query component. * <p><blockquote><pre> * query = $7 = <undefined> * @@@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') { int next = tmp.indexOf('#', at + 1); if (next == -1) { next = tmp.length(); } if (escaped) { _query = tmp.substring(at + 1, next).toCharArray(); if (!validate(_query, uric)) { throw new URIException("Invalid query"); } } else { _query = encode(tmp.substring(at + 1, next), allowed_query, charset); } at = next; } /* * Parse the fragment component. * <p><block
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -