📄 extractoruniversal.html
字号:
<a name="452" href="#452">452</a> <em>/**<em>*</em></em><a name="453" href="#453">453</a> <em> * This method takes a look at a string and determines if it could be a URL.</em><a name="454" href="#454">454</a> <em> * To qualify the string must either begin with "<a href="http://" target="alexandria_uri">http://</a>" (https would also</em><a name="455" href="#455">455</a> <em> * work) followed by something that looks like an IP address or contain</em><a name="456" href="#456">456</a> <em> * within the string (possible at the end but not at the beginning) a TLD</em><a name="457" href="#457">457</a> <em> * (Top Level Domain) preceded by a dot.</em><a name="458" href="#458">458</a> <em> *</em><a name="459" href="#459">459</a> <em> * @param lookat The string to examine in an effort to determine if it</em><a name="460" href="#460">460</a> <em> * could be a URL</em><a name="461" href="#461">461</a> <em> * @return True if the string matches the above criteria for a URL.</em><a name="462" href="#462">462</a> <em> */</em><a name="463" href="#463">463</a> <strong>private</strong> <strong>boolean</strong> looksLikeAnURL(String lookat) {<a name="464" href="#464">464</a> <strong>if</strong>(lookat.indexOf(<span class="string">"http://"</span>)==0 || lookat.indexOf(<span class="string">"https://"</span>)==0){<a name="465" href="#465">465</a> <em class="comment">//Check if the rest of the string looks like an IP address.</em><a name="466" href="#466">466</a> <em class="comment">//if so return true. Otherwise continue on.</em><a name="467" href="#467">467</a> Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);<a name="468" href="#468">468</a> <strong>boolean</strong> testVal = ip.matches();<a name="469" href="#469">469</a> TextUtils.recycleMatcher(ip);<a name="470" href="#470">470</a> <strong>if</strong>(testVal){<a name="471" href="#471">471</a> <strong>return</strong> <strong>true</strong>;<a name="472" href="#472">472</a> }<a name="473" href="#473">473</a> }<a name="474" href="#474">474</a> <a name="475" href="#475">475</a> <strong>int</strong> dot = lookat.indexOf(<span class="string">"."</span>);<a name="476" href="#476">476</a> <strong>if</strong>(dot!=0){<em class="comment">//An URL can't start with a .tld.</em><a name="477" href="#477">477</a> <strong>while</strong>(dot != -1 && dot < lookat.length()){<a name="478" href="#478">478</a> lookat = lookat.substring(dot+1);<a name="479" href="#479">479</a> <strong>if</strong> (isTLD(lookat.substring(0, lookat.length() <= 6?<a name="480" href="#480">480</a> lookat.length(): 6)))<a name="481" href="#481">481</a> {<a name="482" href="#482">482</a> <strong>return</strong> <strong>true</strong>;<a name="483" href="#483">483</a> }<a name="484" href="#484">484</a> dot = lookat.indexOf(<span class="string">"."</span>);<a name="485" href="#485">485</a> }<a name="486" href="#486">486</a> }<a name="487" href="#487">487</a> <a name="488" href="#488">488</a> <strong>return</strong> false;<a name="489" href="#489">489</a> }<a name="490" href="#490">490</a> <a name="491" href="#491">491</a> <em>/**<em>*</em></em><a name="492" href="#492">492</a> <em> * Checks if a string is equal to known Top Level Domain. The string may</em><a name="493" href="#493">493</a> <em> * contain additional characters <i>after</i> the TLD but not before.</em><a name="494" href="#494">494</a> <em> * @param potentialTLD The string (usually 2-6 chars) to check if it starts</em><a name="495" href="#495">495</a> <em> * with a TLD.</em><a name="496" href="#496">496</a> <em> * @return True if the given string starts with the name of a known TLD</em><a name="497" href="#497">497</a> <em> *</em><a name="498" href="#498">498</a> <em> * @see #TLDs</em><a name="499" href="#499">499</a> <em> */</em><a name="500" href="#500">500</a> <strong>private</strong> <strong>boolean</strong> isTLD(String potentialTLD) {<a name="501" href="#501">501</a> <strong>if</strong>(potentialTLD.length()<2){<a name="502" href="#502">502</a> <strong>return</strong> false;<a name="503" href="#503">503</a> }<a name="504" href="#504">504</a> <a name="505" href="#505">505</a> potentialTLD.toLowerCase();<a name="506" href="#506">506</a> Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD);<a name="507" href="#507">507</a> <strong>boolean</strong> ret = uri.matches();<a name="508" href="#508">508</a> TextUtils.recycleMatcher(uri);<a name="509" href="#509">509</a> <strong>return</strong> ret;<a name="510" href="#510">510</a> }<a name="511" href="#511">511</a> <a name="512" href="#512">512</a> <em>/**<em>*</em></em><a name="513" href="#513">513</a> <em> * Determines if a char (as represented by an int in the range of 0-255) is</em><a name="514" href="#514">514</a> <em> * a character (in the Ansi character set) that can be present in a URL.</em><a name="515" href="#515">515</a> <em> * This method takes a <b>strict</b> approach to what characters can be in</em><a name="516" href="#516">516</a> <em> * a URL.</em><a name="517" href="#517">517</a> <em> * <p></em><a name="518" href="#518">518</a> <em> * The following are considered to be 'URLable'<br></em><a name="519" href="#519">519</a> <em> * <ul></em><a name="520" href="#520">520</a> <em> * <li> <code># $ % & + , - . /</code> values 35-38,43-47</em><a name="521" href="#521">521</a> <em> * <li> <code>[0-9]</code> values 48-57</em><a name="522" href="#522">522</a> <em> * <li> <code>: ; = ? @</code> value 58-59,61,63-64</em><a name="523" href="#523">523</a> <em> * <li> <code>[A-Z]</code> values 65-90</em><a name="524" href="#524">524</a> <em> * <li> <code>_</code> value 95</em><a name="525" href="#525">525</a> <em> * <li> <code>[a-z]</code> values 97-122</em><a name="526" href="#526">526</a> <em> * <li> <code>~</code> value 126</em><a name="527" href="#527">527</a> <em> * </ul></em><a name="528" href="#528">528</a> <em> * <p></em><a name="529" href="#529">529</a> <em> * To summerize, the following ranges are considered URLable:<br></em><a name="530" href="#530">530</a> <em> * 35-38,43-59,61,63-90,95,97-122,126</em><a name="531" href="#531">531</a> <em> *</em><a name="532" href="#532">532</a> <em> * @param ch The character (represented by an int) to test.</em><a name="533" href="#533">533</a> <em> * @return True if it is a URLable character, false otherwise.</em><a name="534" href="#534">534</a> <em> */</em><a name="535" href="#535">535</a> <strong>private</strong> <strong>boolean</strong> isURLableChar(<strong>int</strong> ch) {<a name="536" href="#536">536</a> <strong>return</strong> (ch>=35 && ch<=38)<a name="537" href="#537">537</a> || (ch>=43 && ch<=59)<a name="538" href="#538">538</a> || (ch==61)<a name="539" href="#539">539</a> || (ch>=63 && ch<=90)<a name="540" href="#540">540</a> || (ch==95)<a name="541" href="#541">541</a> || (ch>=97 && ch<=122)<a name="542" href="#542">542</a> || (ch==126);<a name="543" href="#543">543</a> }<a name="544" href="#544">544</a> <a name="545" href="#545">545</a> <em class="comment">/*<em class="comment"> (non-Javadoc)</em></em><a name="546" href="#546">546</a> <em class="comment"> * @see org.archive.crawler.framework.Processor#report()</em><a name="547" href="#547">547</a> <em class="comment"> */</em><a name="548" href="#548">548</a> <strong>public</strong> String report() {<a name="549" href="#549">549</a> StringBuffer ret = <strong>new</strong> StringBuffer();<a name="550" href="#550">550</a> ret.append(<span class="string">"Processor: org.archive.crawler.extractor."</span> +<a name="551" href="#551">551</a> <span class="string">"ExtractorUniversal\n"</span>);<a name="552" href="#552">552</a> ret.append(<span class="string">" Function: Link extraction on unknown file"</span> +<a name="553" href="#553">553</a> <span class="string">" types.\n"</span>);<a name="554" href="#554">554</a> ret.append(<span class="string">" CrawlURIs handled: "</span> + numberOfCURIsHandled + <span class="string">"\n"</span>);<a name="555" href="#555">555</a> ret.append(<span class="string">" Links extracted: "</span> + numberOfLinksExtracted + <span class="string">"\n\n"</span>);<a name="556" href="#556">556</a> <a name="557" href="#557">557</a> <strong>return</strong> ret.toString();<a name="558" href="#558">558</a> }<a name="559" href="#559">559</a> }</pre><hr/><div id="footer">This page was automatically generated by <a href="http://maven.apache.org/">Maven</a></div></body></html>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -