uurifactory.html

来自「网络爬虫开源代码」· HTML 代码 · 共 695 行 · 第 1/5 页

HTML
695
字号
<a name="548" href="#548">548</a>     <em>/**<em>*</em></em><a name="549" href="#549">549</a> <em>     * If http(s) scheme, check scheme specific part begins '//'.</em><a name="550" href="#550">550</a> <em>     * @throws URIException </em><a name="551" href="#551">551</a> <em>     * @see <a href="http://www.faqs.org/rfcs/rfc1738.html" target="alexandria_uri">http://www.faqs.org/rfcs/rfc1738.html</a> Section 3.1. Common Internet</em><a name="552" href="#552">552</a> <em>     * Scheme Syntax</em><a name="553" href="#553">553</a> <em>     */</em><a name="554" href="#554">554</a>     <strong>protected</strong> <strong>void</strong> checkHttpSchemeSpecificPartSlashPrefix(<strong>final</strong> URI base,<a name="555" href="#555">555</a>     		<strong>final</strong> String scheme, <strong>final</strong> String schemeSpecificPart)<a name="556" href="#556">556</a>     throws URIException {<a name="557" href="#557">557</a>     	<em class="comment">// Only apply this check if no base.</em><a name="558" href="#558">558</a>     	<strong>if</strong> (base != <strong>null</strong>) {<a name="559" href="#559">559</a>     		<strong>return</strong>;<a name="560" href="#560">560</a>     	}<a name="561" href="#561">561</a>     	<strong>if</strong> (scheme == <strong>null</strong> || scheme.length() &lt;= 0) {<a name="562" href="#562">562</a>     		<strong>return</strong>;<a name="563" href="#563">563</a>     	}<a name="564" href="#564">564</a>     	<strong>if</strong> (!scheme.equals(<span class="string">"http"</span>) &amp;&amp; !scheme.equals(<span class="string">"https"</span>)) {<a name="565" href="#565">565</a>     		<strong>return</strong>;<a name="566" href="#566">566</a>     	}<a name="567" href="#567">567</a>     	<strong>if</strong> (!schemeSpec<strong>if</strong>icPart.startsWith(<span class="string">"//"</span>)) {<a name="568" href="#568">568</a>     		<strong>throw</strong> <strong>new</strong> URIException(<span class="string">"http scheme specific part must "</span> +<a name="569" href="#569">569</a>     		    <span class="string">"begin '//': "</span> + schemeSpecificPart);<a name="570" href="#570">570</a>     	}<a name="571" href="#571">571</a>     	<strong>if</strong> (schemeSpec<strong>if</strong>icPart.length() &lt;= 2) {<a name="572" href="#572">572</a>     		<strong>throw</strong> <strong>new</strong> URIException(<span class="string">"http scheme specific part is "</span> +<a name="573" href="#573">573</a>         		<span class="string">"too short: "</span> + schemeSpecificPart);<a name="574" href="#574">574</a>     	}<a name="575" href="#575">575</a>     }<a name="576" href="#576">576</a>     <a name="577" href="#577">577</a>     <em>/**<em>*</em></em><a name="578" href="#578">578</a> <em>     * Fixup 'authority' portion of URI, by removing any stray </em><a name="579" href="#579">579</a> <em>     * encoded spaces, lowercasing any domain names, and applying</em><a name="580" href="#580">580</a> <em>     * IDN-punycoding to Unicode domains. </em><a name="581" href="#581">581</a> <em>     * </em><a name="582" href="#582">582</a> <em>     * @param uriAuthority the authority string to fix</em><a name="583" href="#583">583</a> <em>     * @return fixed version</em><a name="584" href="#584">584</a> <em>     * @throws URIException</em><a name="585" href="#585">585</a> <em>     */</em><a name="586" href="#586">586</a>     <strong>private</strong> String fixupAuthority(String uriAuthority) throws URIException {<a name="587" href="#587">587</a>         <em class="comment">// Lowercase the host part of the uriAuthority; don't destroy any</em><a name="588" href="#588">588</a>         <em class="comment">// userinfo capitalizations.  Make sure no illegal characters in</em><a name="589" href="#589">589</a>         <em class="comment">// domainlabel substring of the uri authority.</em><a name="590" href="#590">590</a>         <strong>if</strong> (uriAuthority != <strong>null</strong>) {<a name="591" href="#591">591</a>             <em class="comment">// Get rid of any trailing escaped spaces:</em><a name="592" href="#592">592</a>             <em class="comment">// http://www.archive.org%20.  Rare but happens.</em><a name="593" href="#593">593</a>             <em class="comment">// TODO: reevaluate: do IE or firefox do such mid-URI space-removal?</em><a name="594" href="#594">594</a>             <em class="comment">// if not, we shouldn't either. </em><a name="595" href="#595">595</a>             <strong>while</strong>(uriAuthority.endsWith(ESCAPED_SPACE)) {<a name="596" href="#596">596</a>                 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);<a name="597" href="#597">597</a>             }<a name="598" href="#598">598</a> <a name="599" href="#599">599</a>             <em class="comment">// lowercase &amp; IDN-punycode only the domain portion</em><a name="600" href="#600">600</a>             <strong>int</strong> atIndex = uriAuthority.indexOf(COMMERCIAL_AT);<a name="601" href="#601">601</a>             <strong>int</strong> portColonIndex = uriAuthority.indexOf(COLON,(atIndex&lt;0)?0:atIndex);<a name="602" href="#602">602</a>             <strong>if</strong>(atIndex&lt;0 &amp;&amp; portColonIndex&lt;0) {<a name="603" href="#603">603</a>                 <em class="comment">// most common case: neither userinfo nor port</em><a name="604" href="#604">604</a>                 <strong>return</strong> fixupDomainlabel(uriAuthority);<a name="605" href="#605">605</a>             } <strong>else</strong> <strong>if</strong> (atIndex&lt;0 &amp;&amp; portColonIndex>-1) {<a name="606" href="#606">606</a>                 <em class="comment">// next most common: port but no userinfo</em><a name="607" href="#607">607</a>                 String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));<a name="608" href="#608">608</a>                 String port = uriAuthority.substring(portColonIndex);<a name="609" href="#609">609</a>                 <strong>return</strong> domain + port;<a name="610" href="#610">610</a>             } <strong>else</strong> <strong>if</strong> (atIndex>-1 &amp;&amp; portColonIndex&lt;0) {<a name="611" href="#611">611</a>                 <em class="comment">// uncommon: userinfo, no port</em><a name="612" href="#612">612</a>                 String userinfo = uriAuthority.substring(0,atIndex+1);<a name="613" href="#613">613</a>                 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));<a name="614" href="#614">614</a>                 <strong>return</strong> userinfo + domain;<a name="615" href="#615">615</a>             } <strong>else</strong> {<a name="616" href="#616">616</a>                 <em class="comment">// uncommon: userinfo, port</em><a name="617" href="#617">617</a>                 String userinfo = uriAuthority.substring(0,atIndex+1);<a name="618" href="#618">618</a>                 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));<a name="619" href="#619">619</a>                 String port = uriAuthority.substring(portColonIndex);<a name="620" href="#620">620</a>                 <strong>return</strong> userinfo + domain + port;<a name="621" href="#621">621</a>             }<a name="622" href="#622">622</a>         }<a name="623" href="#623">623</a>         <strong>return</strong> uriAuthority;<a name="624" href="#624">624</a>     }<a name="625" href="#625">625</a>     <a name="626" href="#626">626</a>     <em>/**<em>*</em></em><a name="627" href="#627">627</a> <em>     * Fixup the domain label part of the authority.</em><a name="628" href="#628">628</a> <em>     * </em><a name="629" href="#629">629</a> <em>     * We're more lax than the spec. in that we allow underscores.</em><a name="630" href="#630">630</a> <em>     * </em><a name="631" href="#631">631</a> <em>     * @param label Domain label to fix.</em><a name="632" href="#632">632</a> <em>     * @return Return fixed domain label.</em><a name="633" href="#633">633</a> <em>     * @throws URIException</em><a name="634" href="#634">634</a> <em>     */</em><a name="635" href="#635">635</a>     <strong>private</strong> String fixupDomainlabel(String label)<a name="636" href="#636">636</a>     throws URIException {<a name="637" href="#637">637</a>         <a name="638" href="#638">638</a>         <em class="comment">// apply IDN-punycoding, as necessary</em><a name="639" href="#639">639</a>         <strong>try</strong> {<a name="640" href="#640">640</a>             <em class="comment">// TODO: optimize: only apply when necessary, or</em><a name="641" href="#641">641</a>             <em class="comment">// keep cache of recent encodings</em><a name="642" href="#642">642</a>             label = IDNA.toASCII(label);<a name="643" href="#643">643</a>         } <strong>catch</strong> (IDNAException e) {<a name="644" href="#644">644</a>             <strong>if</strong>(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {<a name="645" href="#645">645</a>                 <em class="comment">// domain name has ACE prefix, leading/trailing dash, or </em><a name="646" href="#646">646</a>                 <em class="comment">// underscore -- but is still a name we wish to tolerate;</em><a name="647" href="#647">647</a>                 <em class="comment">// simply continue</em><a name="648" href="#648">648</a>             } <strong>else</strong> {<a name="649" href="#649">649</a>                 <em class="comment">// problematic domain: neither ASCII acceptable characters</em><a name="650" href="#650">650</a>                 <em class="comment">// nor IDN-punycodable, so throw exception </em><a name="651" href="#651">651</a>                 <em class="comment">// TODO: change to HeritrixURIException so distinguishable</em><a name="652" href="#652">652</a>                 <em class="comment">// from URIExceptions in library code</em><a name="653" href="#653">653</a>                 URIException ue = <strong>new</strong> URIException(e+<span class="string">" "</span>+label);<a name="654" href="#654">654</a>                 ue.initCause(e);<a name="655" href="#655">655</a>                 <strong>throw</strong> ue;<a name="656" href="#656">656</a>             }<a name="657" href="#657">657</a>         }<a name="658" href="#658">658</a>         label = label.toLowerCase();<a name="659" href="#659">659</a>         <strong>return</strong> label;<a name="660" href="#660">660</a>     }<a name="661" href="#661">661</a>     <a name="662" href="#662">662</a>     <em>/**<em>*</em></em><a name="663" href="#663">663</a> <em>     * Ensure that there all characters needing escaping</em><a name="664" href="#664">664</a> <em>     * in the passed-in String are escaped. Stray '%' characters</em><a name="665" href="#665">665</a> <em>     * are *not* escaped, as per browser behavior. </em><a name="666" href="#666">666</a> <em>     * </em><a name="667" href="#667">667</a> <em>     * @param u String to escape</em><a name="668" href="#668">668</a> <em>     * @param charset </em><a name="669" href="#669">669</a> <em>     * @return string with any necessary escaping applied</em><a name="670" href="#670">670</a> <em>     */</em><a name="671" href="#671">671</a>     <strong>private</strong> String ensureMinimalEscaping(String u, <strong>final</strong> String charset) {<a name="672" href="#672">672</a>         <strong>return</strong> ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);<a name="673" href="#673">673</a>     }<a name="674" href="#674">674</a>     <a name="675" href="#675">675</a>     <em>/**<em>*</em></em><a name="676" href="#676">676</a> <em>     * Ensure that there all characters needing escaping</em><a name="677" href="#677">677</a> <em>     * in the passed-in String are escaped. Stray '%' characters</em><a name="678" href="#678">678</a> <em>     * are *not* escaped, as per browser behavior. </em><a name="679" href="#679">679</a> <em>     * </em><a name="680" href="#680">680</a> <em>     * @param u String to escape</em><a name="681" href="#681">681</a> <em>     * @param charset </em><a name="682" href="#682">682</a> <em>     * @param bitset </em><a name="683" href="#683">683</a> <em>     * @return string with any necessary escaping applied</em><a name="684" href="#684">684</a> <em>     */</em><a name="685" href="#685">685</a>     <strong>private</strong> String ensureMinimalEscaping(String u, <strong>final</strong> String charset,<a name="686" href="#686">686</a>             <strong>final</strong> Bit

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?