📄 uurifactory.java
字号:
/** * Fixup 'authority' portion of URI, by removing any stray * encoded spaces, lowercasing any domain names, and applying * IDN-punycoding to Unicode domains. * * @param uriAuthority the authority string to fix * @return fixed version * @throws URIException */ private String fixupAuthority(String uriAuthority) throws URIException { // Lowercase the host part of the uriAuthority; don't destroy any // userinfo capitalizations. Make sure no illegal characters in // domainlabel substring of the uri authority. if (uriAuthority != null) { // Get rid of any trailing escaped spaces: // http://www.archive.org%20. Rare but happens. // TODO: reevaluate: do IE or firefox do such mid-URI space-removal? // if not, we shouldn't either. while(uriAuthority.endsWith(ESCAPED_SPACE)) { uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3); } // lowercase & IDN-punycode only the domain portion int atIndex = uriAuthority.indexOf(COMMERCIAL_AT); int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex); if(atIndex<0 && portColonIndex<0) { // most common case: neither userinfo nor port return fixupDomainlabel(uriAuthority); } else if (atIndex<0 && portColonIndex>-1) { // next most common: port but no userinfo String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex)); String port = uriAuthority.substring(portColonIndex); return domain + port; } else if (atIndex>-1 && portColonIndex<0) { // uncommon: userinfo, no port String userinfo = uriAuthority.substring(0,atIndex+1); String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1)); return userinfo + domain; } else { // uncommon: userinfo, port String userinfo = uriAuthority.substring(0,atIndex+1); String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex)); String port = uriAuthority.substring(portColonIndex); return userinfo + domain + port; } } return uriAuthority; } /** * Fixup the domain label part of the authority. * * We're more lax than the spec. in that we allow underscores. * * @param label Domain label to fix. * @return Return fixed domain label. * @throws URIException */ private String fixupDomainlabel(String label) throws URIException { // apply IDN-punycoding, as necessary try { // TODO: optimize: only apply when necessary, or // keep cache of recent encodings label = IDNA.toASCII(label); } catch (IDNAException e) { if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) { // domain name has ACE prefix, leading/trailing dash, or // underscore -- but is still a name we wish to tolerate; // simply continue } else { // problematic domain: neither ASCII acceptable characters // nor IDN-punycodable, so throw exception // TODO: change to HeritrixURIException so distinguishable // from URIExceptions in library code URIException ue = new URIException(e+" "+label); ue.initCause(e); throw ue; } } label = label.toLowerCase(); return label; } /** * Ensure that there all characters needing escaping * in the passed-in String are escaped. Stray '%' characters * are *not* escaped, as per browser behavior. * * @param u String to escape * @param charset * @return string with any necessary escaping applied */ private String ensureMinimalEscaping(String u, final String charset) { return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE); } /** * Ensure that there all characters needing escaping * in the passed-in String are escaped. Stray '%' characters * are *not* escaped, as per browser behavior. * * @param u String to escape * @param charset * @param bitset * @return string with any necessary escaping applied */ private String ensureMinimalEscaping(String u, final String charset, final BitSet bitset) { if (u == null) { return null; } for (int i = 0; i < u.length(); i++) { char c = u.charAt(i); if (!bitset.get(c)) { try { u = LaxURLCodec.DEFAULT.encode(bitset, u, charset); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } break; } } return u; } /** * Escape any whitespace found. * * The parent class takes care of the bulk of escaping. But if any * instance of escaping is found in the URI, then we ask for parent * to do NO escaping. Here we escape any whitespace found irrespective * of whether the uri has already been escaped. We do this for * case where uri has been judged already-escaped only, its been * incompletly done and whitespace remains. Spaces, etc., in the URI are * a real pain. Their presence will break log file and ARC parsing. * @param uri URI string to check. * @return uri with spaces escaped if any found. */ protected String escapeWhitespace(String uri) { // Just write a new string anyways. The perl '\s' is not // as inclusive as the Character.isWhitespace so there are // whitespace characters we could miss. So, rather than // write some awkward regex, just go through the string // a character at a time. Only create buffer first time // we find a space. MutableString buffer = null; for (int i = 0; i < uri.length(); i++) { char c = uri.charAt(i); if (Character.isWhitespace(c)) { if (buffer == null) { buffer = new MutableString(uri.length() + 2 /*If space, two extra characters (at least)*/); buffer.append(uri.substring(0, i)); } buffer.append("%"); String hexStr = Integer.toHexString(c); if ((hexStr.length() % 2) > 0) { buffer.append("0"); } buffer.append(hexStr); } else { if (buffer != null) { buffer.append(c); } } } return (buffer != null)? buffer.toString(): uri; } /** * Check port on passed http authority. Make sure the size is not larger * than allowed: See the 'port' definition on this * page, http://www.kerio.com/manual/wrp/en/418.htm. * Also, we've seen port numbers of '0080' whose leading zeros confuse * the parent class. Strip the leading zeros. * * @param uriAuthority * @return Null or an amended port number. * @throws URIException */ private String checkPort(String uriAuthority) throws URIException { Matcher m = PORTREGEX.matcher(uriAuthority); if (m.matches()) { String no = m.group(2); if (no != null && no.length() > 0) { // First check if the port has leading zeros // as in '0080'. Strip them if it has and // then reconstitute the uriAuthority. Be careful // of cases where port is '0' or '000'. while (no.charAt(0) == '0' && no.length() > 1) { no = no.substring(1); } uriAuthority = m.group(1) + no; // Now makesure the number is legit. int portNo = Integer.parseInt(no); if (portNo <= 0 || portNo > 65535) { throw new URIException("Port out of bounds: " + uriAuthority); } } } return uriAuthority; } /** * @param b Buffer to append to. * @param str String to append if not null. * @param substr Suffix or prefix to use if <code>str</code> is not null. * @param suffix True if <code>substr</code> is a suffix. */ private void appendNonNull(MutableString b, String str, String substr, boolean suffix) { if (str != null && str.length() > 0) { if (!suffix) { b.append(substr); } b.append(str); if (suffix) { b.append(substr); } } } /** * @param str String to work on. * @param prefix Prefix to strip if present. * @return <code>str</code> w/o <code>prefix</code>. */ private String stripPrefix(String str, String prefix) { return str.startsWith(prefix)? str.substring(prefix.length(), str.length()): str; } /** * @param str String to work on. * @param tail Tail to strip if present. * @return <code>str</code> w/o <code>tail</code>. */ private static String stripTail(String str, String tail) { return str.endsWith(tail)? str.substring(0, str.length() - tail.length()): str; } /** * @param element to examine. * @return Null if passed null or an empty string otherwise * <code>element</code>. */ private String checkUriElement(String element) { return (element == null || element.length() <= 0)? null: element; } /** * @param element to examine and lowercase if non-null. * @return Null if passed null or an empty string otherwise * <code>element</code> lowercased. */ private String checkUriElementAndLowerCase(String element) { String tmp = checkUriElement(element); return (tmp != null)? tmp.toLowerCase(): tmp; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -