📄 extractorhtml.html
字号:
<a name="544" href="#544">544</a> <em class="comment">// comment match</em><a name="545" href="#545">545</a> <em class="comment">// for now do nothing</em><a name="546" href="#546">546</a> } <strong>else</strong> <strong>if</strong> (tags.start(7) > 0) {<a name="547" href="#547">547</a> <em class="comment">// <meta> match</em><a name="548" href="#548">548</a> <strong>int</strong> start = tags.start(5);<a name="549" href="#549">549</a> <strong>int</strong> end = tags.end(5);<a name="550" href="#550">550</a> assert start >= 0: <span class="string">"Start is: "</span> + start + <span class="string">", "</span> + curi;<a name="551" href="#551">551</a> assert end >= 0: <span class="string">"End is :"</span> + end + <span class="string">", "</span> + curi;<a name="552" href="#552">552</a> <strong>if</strong> (processMeta(curi,<a name="553" href="#553">553</a> cs.subSequence(start, end))) {<a name="554" href="#554">554</a> <a name="555" href="#555">555</a> <em class="comment">// meta tag included NOFOLLOW; abort processing</em><a name="556" href="#556">556</a> <strong>break</strong>;<a name="557" href="#557">557</a> }<a name="558" href="#558">558</a> } <strong>else</strong> <strong>if</strong> (tags.start(5) > 0) {<a name="559" href="#559">559</a> <em class="comment">// generic <whatever> match</em><a name="560" href="#560">560</a> <strong>int</strong> start5 = tags.start(5);<a name="561" href="#561">561</a> <strong>int</strong> end5 = tags.end(5);<a name="562" href="#562">562</a> assert start5 >= 0: <span class="string">"Start is: "</span> + start5 + <span class="string">", "</span> + curi;<a name="563" href="#563">563</a> assert end5 >= 0: <span class="string">"End is :"</span> + end5 + <span class="string">", "</span> + curi;<a name="564" href="#564">564</a> <strong>int</strong> start6 = tags.start(6);<a name="565" href="#565">565</a> <strong>int</strong> end6 = tags.end(6);<a name="566" href="#566">566</a> assert start6 >= 0: <span class="string">"Start is: "</span> + 
start6 + <span class="string">", "</span> + curi;<a name="567" href="#567">567</a> assert end6 >= 0: <span class="string">"End is :"</span> + end6 + <span class="string">", "</span> + curi;<a name="568" href="#568">568</a> processGeneralTag(curi,<a name="569" href="#569">569</a> cs.subSequence(start6, end6),<a name="570" href="#570">570</a> cs.subSequence(start5, end5));<a name="571" href="#571">571</a> <a name="572" href="#572">572</a> } <strong>else</strong> <strong>if</strong> (tags.start(1) > 0) {<a name="573" href="#573">573</a> <em class="comment">// <script> match</em><a name="574" href="#574">574</a> <strong>int</strong> start = tags.start(1);<a name="575" href="#575">575</a> <strong>int</strong> end = tags.end(1);<a name="576" href="#576">576</a> assert start >= 0: <span class="string">"Start is: "</span> + start + <span class="string">", "</span> + curi;<a name="577" href="#577">577</a> assert end >= 0: <span class="string">"End is :"</span> + end + <span class="string">", "</span> + curi;<a name="578" href="#578">578</a> assert tags.end(2) >= 0: <span class="string">"Tags.end(2) illegal "</span> + tags.end(2) +<a name="579" href="#579">579</a> <span class="string">", "</span> + curi;<a name="580" href="#580">580</a> processScript(curi, cs.subSequence(start, end),<a name="581" href="#581">581</a> tags.end(2) - start);<a name="582" href="#582">582</a> <a name="583" href="#583">583</a> } <strong>else</strong> <strong>if</strong> (tags.start(3) > 0){<a name="584" href="#584">584</a> <em class="comment">// <style... 
match</em><a name="585" href="#585">585</a> <strong>int</strong> start = tags.start(3);<a name="586" href="#586">586</a> <strong>int</strong> end = tags.end(3);<a name="587" href="#587">587</a> assert start >= 0: <span class="string">"Start is: "</span> + start + <span class="string">", "</span> + curi;<a name="588" href="#588">588</a> assert end >= 0: <span class="string">"End is :"</span> + end + <span class="string">", "</span> + curi;<a name="589" href="#589">589</a> assert tags.end(4) >= 0: <span class="string">"Tags.end(4) illegal "</span> + tags.end(4) +<a name="590" href="#590">590</a> <span class="string">", "</span> + curi;<a name="591" href="#591">591</a> processStyle(curi, cs.subSequence(start, end),<a name="592" href="#592">592</a> tags.end(4) - start);<a name="593" href="#593">593</a> }<a name="594" href="#594">594</a> }<a name="595" href="#595">595</a> TextUtils.recycleMatcher(tags);<a name="596" href="#596">596</a> }<a name="597" href="#597">597</a> <a name="598" href="#598">598</a>

    /**
     * Case-insensitive regular expression of path extensions (image, audio,
     * video and Flash formats) under which HTML content is not expected.
     * Used by {@link #isHtmlExpectedHere(CrawlURI)}.
     */
    static final String NON_HTML_PATH_EXTENSION =
        "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"
        + "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Decide whether HTML found at this URI should be scanned for links.
     * HTML is treated as unexpected (e.g. served in place of a GIF) only
     * when the URI path ends in a short extension that matches
     * {@link #NON_HTML_PATH_EXTENSION}.
     *
     * @param curi CrawlURI to examine.
     * @return True if HTML is acceptable/expected here; false if the path
     * extension matches one of the non-HTML extensions.
     * @throws URIException propagated from reading the URI's path.
     */
    protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
        final String uriPath = curi.getUURI().getPath();

        // A missing path and a path without any '.' are handled alike:
        // there is no extension that could contradict HTML content.
        final int lastDot = (uriPath == null) ? -1 : uriPath.lastIndexOf('.');
        if (lastDot < 0) {
            return true;
        }

        // Everything after the last '.' is the candidate extension; more
        // than four characters is too long to be one of the known ones.
        final int extensionLength = uriPath.length() - lastDot - 1;
        if (extensionLength > 4) {
            return true;
        }

        final String extension = uriPath.substring(lastDot + 1);
        final boolean looksNonHtml =
            TextUtils.matches(NON_HTML_PATH_EXTENSION, extension);
        return !looksNonHtml;
    }

    /**
     * Handle a matched script element in two steps: first its open tag is
     * processed like any other tag, then the remainder of the sequence is
     * handed to the script-code heuristics.
     *
     * @param curi CrawlURI being processed.
     * @param sequence Character sequence of the script match; the first
     * six characters are passed on as the element argument (presumably
     * the literal element name "script" -- confirm against the tag
     * pattern).
     * @param endOfOpenTag Offset within <code>sequence</code> at which the
     * open tag ends and the script code begins.
     */
    protected void processScript(CrawlURI curi, CharSequence sequence,
            int endOfOpenTag) {
        // Attributes of the script-open tag are extracted as per any
        // other tag.
        final CharSequence element = sequence.subSequence(0, 6);
        final CharSequence openTag = sequence.subSequence(0, endOfOpenTag);
        processGeneralTag(curi, element, openTag);

        // Best-effort string-analysis heuristics are then applied to any
        // code present after the open tag (false positives are OK).
        final CharSequence code =
            sequence.subSequence(endOfOpenTag, sequence.length());
        processScriptCode(curi, code);
    }

<a name="646" href="#646">646</a> <em>/**<em>*</em></em><a name="647" href="#647">647</a> <em> * Process metadata tags.</em><a name="648" href="#648">648</a> <em> * @param curi CrawlURI we're processing.</em><a name="649" href="#649">649</a> <em> * @param cs Sequence from underlying ReplayCharSequence. This</em><a name="650" href="#650">650</a> <em> * is TRANSIENT data. 
Make a copy if you want the data to live outside</em><a name="651" href="#651">651</a> <em> * of this extractors' lifetime.</em><a name="652" href="#652">652</a> <em> * @return True robots exclusion metatag.</em><a name="653" href="#653">653</a> <em> */</em><a name="654" href="#654">654</a> <strong>protected</strong> <strong>boolean</strong> processMeta(<a href="../../../../org/archive/crawler/datamodel/CrawlURI.html">CrawlURI</a> curi, CharSequence cs) {<a name="655" href="#655">655</a> Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);<a name="656" href="#656">656</a> String name = <strong>null</strong>;<a name="657" href="#657">657</a> String httpEquiv = <strong>null</strong>;<a name="658" href="#658">658</a> String content = <strong>null</strong>;<a name="659" href="#659">659</a> <strong>while</strong> (attr.find()) {<a name="660" href="#660">660</a> <strong>int</strong> valueGroup =<a name="661" href="#661">661</a> (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 
14 : 15;<a name="662" href="#662">662</a> CharSequence value =<a name="663" href="#663">663</a> cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));<a name="664" href="#664">664</a> <strong>if</strong> (attr.group(1).equalsIgnoreCase(<span class="string">"name"</span>)) {<a name="665" href="#665">665</a> name = value.toString();<a name="666" href="#666">666</a> } <strong>else</strong> <strong>if</strong> (attr.group(1).equalsIgnoreCase(<span class="string">"http-equiv"</span>)) {<a name="667" href="#667">667</a> httpEquiv = value.toString();<a name="668" href="#668">668</a> } <strong>else</strong> <strong>if</strong> (attr.group(1).equalsIgnoreCase(<span class="string">"content"</span>)) {<a name="669" href="#669">669</a> content = value.toString();<a name="670" href="#670">670</a> }<a name="671" href="#671">671</a> <em class="comment">// TODO: handle other stuff</em><a name="672" href="#672">672</a> }<a name="673" href="#673">673</a> TextUtils.recycleMatcher(attr);<a name="674" href="#674">674</a> <a name="675" href="#675">675</a> <em class="comment">// Look for the 'robots' meta-tag</em><a name="676" href="#676">676</a> <strong>if</strong>(<span class="string">"robots"</span>.equalsIgnoreCase(name) && content != <strong>null</strong> ) {<a name="677" href="#677">677</a> curi.putString(A_META_ROBOTS, content);<a name="678" href="#678">678</a> <a href="../../../../org/archive/crawler/datamodel/RobotsHonoringPolicy.html">RobotsHonoringPolicy</a> policy =<a name="679" href="#679">679</a> getSettingsHandler().getOrder().getRobotsHonoringPolicy();<a name="680" href="#680">680</a
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -