crawlcontroller.html

来自「网络爬虫开源代码」· HTML 代码 · 共 701 行 · 第 1/5 页

HTML
701
字号
<a name="556" href="#556">556</a>         }<a name="557" href="#557">557</a>     }<a name="558" href="#558">558</a> <a name="559" href="#559">559</a>     <em>/**<em>*</em></em><a name="560" href="#560">560</a> <em>     * Allows an external class to raise a CrawlURIDisposition</em><a name="561" href="#561">561</a> <em>     * crawledURINeedRetry event that will be broadcast to all listeners that</em><a name="562" href="#562">562</a> <em>     * have registered with the CrawlController.</em><a name="563" href="#563">563</a> <em>     *</em><a name="564" href="#564">564</a> <em>     * @param curi - The CrawlURI that will be sent with the event notification.</em><a name="565" href="#565">565</a> <em>     *</em><a name="566" href="#566">566</a> <em>     * @see CrawlURIDispositionListener#crawledURINeedRetry(CrawlURI)</em><a name="567" href="#567">567</a> <em>     */</em><a name="568" href="#568">568</a>     <strong>public</strong> <strong>void</strong> fireCrawledURINeedRetryEvent(<a href="../../../../org/archive/crawler/datamodel/CrawlURI.html">CrawlURI</a> curi) {<a name="569" href="#569">569</a>         <strong>if</strong> (registeredCrawlURIDispositionListener != <strong>null</strong>) {<a name="570" href="#570">570</a>             <em class="comment">// Then we'll just use that.</em><a name="571" href="#571">571</a>             registeredCrawlURIDispositionListener.crawledURINeedRetry(curi);<a name="572" href="#572">572</a>             <strong>return</strong>;<a name="573" href="#573">573</a>         }<a name="574" href="#574">574</a>         <a name="575" href="#575">575</a>         <em class="comment">// Go through the list.</em><a name="576" href="#576">576</a>         <strong>if</strong> (registeredCrawlURIDispositionListeners != <strong>null</strong><a name="577" href="#577">577</a>                 &amp;&amp; registeredCrawlURIDispositionListeners.size() > 0) {<a name="578" href="#578">578</a>             <strong>for</strong> (Iterator i = 
registeredCrawlURIDispositionListeners.iterator();<a name="579" href="#579">579</a>                     i.hasNext();) {<a name="580" href="#580">580</a>                 ((<a href="../../../../org/archive/crawler/event/CrawlURIDispositionListener.html">CrawlURIDispositionListener</a>)i.next()).crawledURINeedRetry(curi);<a name="581" href="#581">581</a>             }<a name="582" href="#582">582</a>         }<a name="583" href="#583">583</a>     }<a name="584" href="#584">584</a> <a name="585" href="#585">585</a>     <em>/**<em>*</em></em><a name="586" href="#586">586</a> <em>     * Allows an external class to raise a CrawlURIDisposition</em><a name="587" href="#587">587</a> <em>     * crawledURIDisregard event that will be broadcast to all listeners that</em><a name="588" href="#588">588</a> <em>     * have registered with the CrawlController.</em><a name="589" href="#589">589</a> <em>     * </em><a name="590" href="#590">590</a> <em>     * @param curi -</em><a name="591" href="#591">591</a> <em>     *            The CrawlURI that will be sent with the event notification.</em><a name="592" href="#592">592</a> <em>     * </em><a name="593" href="#593">593</a> <em>     * @see CrawlURIDispositionListener#crawledURIDisregard(CrawlURI)</em><a name="594" href="#594">594</a> <em>     */</em><a name="595" href="#595">595</a>     <strong>public</strong> <strong>void</strong> fireCrawledURIDisregardEvent(<a href="../../../../org/archive/crawler/datamodel/CrawlURI.html">CrawlURI</a> curi) {<a name="596" href="#596">596</a>         <strong>if</strong> (registeredCrawlURIDispositionListener != <strong>null</strong>) {<a name="597" href="#597">597</a>             <em class="comment">// Then we'll just use that.</em><a name="598" href="#598">598</a>             registeredCrawlURIDispositionListener.crawledURIDisregard(curi);<a name="599" href="#599">599</a>         } <strong>else</strong> {<a name="600" href="#600">600</a>             <em class="comment">// Go through the 
list.</em><a name="601" href="#601">601</a>             <strong>if</strong> (registeredCrawlURIDispositionListeners != <strong>null</strong><a name="602" href="#602">602</a>                 &amp;&amp; registeredCrawlURIDispositionListeners.size() > 0) {<a name="603" href="#603">603</a>                 Iterator it = registeredCrawlURIDispositionListeners.iterator();<a name="604" href="#604">604</a>                 <strong>while</strong> (it.hasNext()) {<a name="605" href="#605">605</a>                     (<a name="606" href="#606">606</a>                         (<a href="../../../../org/archive/crawler/event/CrawlURIDispositionListener.html">CrawlURIDispositionListener</a>) it<a name="607" href="#607">607</a>                             .next())<a name="608" href="#608">608</a>                             .crawledURIDisregard(<a name="609" href="#609">609</a>                         curi);<a name="610" href="#610">610</a>                 }<a name="611" href="#611">611</a>             }<a name="612" href="#612">612</a>         }<a name="613" href="#613">613</a>     }<a name="614" href="#614">614</a> <a name="615" href="#615">615</a>     <em>/**<em>*</em></em><a name="616" href="#616">616</a> <em>     * Allows an external class to raise a CrawlURIDisposition crawledURIFailure event</em><a name="617" href="#617">617</a> <em>     * that will be broadcast to all listeners that have registered with the CrawlController.</em><a name="618" href="#618">618</a> <em>     *</em><a name="619" href="#619">619</a> <em>     * @param curi - The CrawlURI that will be sent with the event notification.</em><a name="620" href="#620">620</a> <em>     *</em><a name="621" href="#621">621</a> <em>     * @see CrawlURIDispositionListener#crawledURIFailure(CrawlURI)</em><a name="622" href="#622">622</a> <em>     */</em><a name="623" href="#623">623</a>     <strong>public</strong> <strong>void</strong> fireCrawledURIFailureEvent(<a 
href="../../../../org/archive/crawler/datamodel/CrawlURI.html">CrawlURI</a> curi) {<a name="624" href="#624">624</a>         <strong>if</strong> (registeredCrawlURIDispositionListener != <strong>null</strong>) {<a name="625" href="#625">625</a>             <em class="comment">// Then we'll just use that.</em><a name="626" href="#626">626</a>             registeredCrawlURIDispositionListener.crawledURIFailure(curi);<a name="627" href="#627">627</a>         } <strong>else</strong> {<a name="628" href="#628">628</a>             <em class="comment">// Go through the list.</em><a name="629" href="#629">629</a>             <strong>if</strong> (registeredCrawlURIDispositionListeners != <strong>null</strong><a name="630" href="#630">630</a>                 &amp;&amp; registeredCrawlURIDispositionListeners.size() > 0) {<a name="631" href="#631">631</a>                 Iterator it = registeredCrawlURIDispositionListeners.iterator();<a name="632" href="#632">632</a>                 <strong>while</strong> (it.hasNext()) {<a name="633" href="#633">633</a>                     ((<a href="../../../../org/archive/crawler/event/CrawlURIDispositionListener.html">CrawlURIDispositionListener</a>)it.next())<a name="634" href="#634">634</a>                         .crawledURIFailure(curi);<a name="635" href="#635">635</a>                 }<a name="636" href="#636">636</a>             }<a name="637" href="#637">637</a>         }<a name="638" href="#638">638</a>     }<a name="639" href="#639">639</a> <a name="640" href="#640">640</a>     <strong>private</strong> <strong>void</strong> setupCrawlModules() throws FatalConfigurationException,<a name="641" href="#641">641</a>              AttributeNotFoundException, MBeanException, ReflectionException {<a name="642" href="#642">642</a>         <strong>if</strong> (scope == <strong>null</strong>) {<a name="643" href="#643">643</a>             scope = (CrawlScope) order.getAttribute(CrawlScope.ATTR_NAME);<a name="644" href="#644">644</a>         	
scope.initialize(<strong>this</strong>);<a name="645" href="#645">645</a>         }<a name="646" href="#646">646</a>         <strong>try</strong> {<a name="647" href="#647">647</a>             <strong>this</strong>.serverCache = <strong>new</strong> <a href="../../../../org/archive/crawler/datamodel/ServerCache.html">ServerCache</a>(<strong>this</strong>);<a name="648" href="#648">648</a>         } <strong>catch</strong> (Exception e) {<a name="649" href="#649">649</a>             <strong>throw</strong> <strong>new</strong> <a href="../../../../org/archive/crawler/framework/exceptions/FatalConfigurationException.html">FatalConfigurationException</a>(<span class="string">"Unable to"</span> +<a name="650" href="#650">650</a>                <span class="string">" initialize frontier (Failed setup of ServerCache) "</span> + e);<a name="651" href="#651">651</a>         }<a name="652" href="#652">652</a>         <a name="653" href="#653">653</a>         <strong>if</strong> (<strong>this</strong>.frontier == <strong>null</strong>) {<a name="654" href="#654">654</a>             <strong>this</strong>.frontier = (Frontier)order.getAttribute(Frontier.ATTR_NAME);<a name="655" href="#655">655</a>             <strong>try</strong> {<a name="656" href="#656">656</a>                 frontier.initialize(<strong>this</strong>);<a name="657" href="#657">657</a>                 frontier.pause(); <em class="comment">// Pause until begun</em><a name="658" href="#658">658</a>                 <em class="comment">// Run recovery if recoverPath points to a file (If it points</em><a name="659" href="#659">659</a>                 <em class="comment">// to a directory, it's a checkpoint recovery).</em><a name="660" href="#660">660</a>                 <em class="comment">// TODO: make recover path relative to job root dir.</em><a name="661" href="#661">661</a>                 <strong>if</strong> (!isCheckpointRecover()) {<a name="662" href="#662">662</a>                     
runFrontierRecover((String)order.<a name="663" href="#663">663</a>                         getAttribute(CrawlOrder.ATTR_RECOVER_PATH));<a name="664" href="#664">664</a>                 }<a name="665" href="#665">665</a>             } <strong>catch</strong> (IOException e) {<a name="666" href="#666">666</a>                 <strong>throw</strong> <strong>new</strong> <a href="../../../../org/archive/crawler/framework/exceptions/FatalConfigurationException.html">FatalConfigurationException</a>(<a name="667" href="#667">667</a>                     <span class="string">"unable to initialize frontier: "</span> + e);<a name="668" href="#668">668</a>             }<a name="669" href="#669">669</a>         }<a name="670" href="#670">670</a> <a name="671" href="#671">671</a>         <em class="comment">// Setup processors</em><a name="672" href="#672">672</a>         <strong>if</strong> (processorChains == <strong>null</strong>) {<a name="673" href="#673">673</a>             processorChains = <strong>new</strong> <a href="../../../../org/archive/crawler/framework/ProcessorChainList.html">ProcessorChainList</a>(order);<a name="674" href="#674">674</a>         }<a name="675" href="#675">675</a>     }<a name="676" href="#676">676</a>     <a name="677" href="#677">677</a>     <strong>protected</strong> <strong>void</strong> runFrontierRecover(String recoverPath)<a name="678" href="#678">678</a>             throws AttributeNotFoundException, MBeanException,<a name="679" href="#679">679</a>             ReflectionException, <a href="../../../../org/archive/crawler/framework/exceptions/FatalConfigurationException.html">FatalConfigurationException</a> {<a name="680" href="#680">680</a>         <strong>if</strong> (recoverPath == <strong>null</strong> || recoverPath.length() &lt;= 0) {<a name="681" href="#681">681</a>             <strong>return</strong>;<a name="682" href="#682">682</a>         }<a name="683" href="#683">683</a>         File f = <strong>new</strong> 
File(recoverPath);<a name="684" href="#684">684</a>         <strong>if</strong> (!f.exists()) {<a name="685" href="#685">685</a>             LOGGER.severe(<span class="string">"Recover file does not exist "</span> + recoverPath);<a name="686" href="#686">686</a>             <strong>return</strong>;<a name="687" href="#687">687</a>         }<a name="688" href="#688">688</a>         <strong>if</strong> (!f.isFile()) {<a name="689" href="#689">689</a>             <em class="comment">// Its a directory if supposed to be doing a checkpoint recover.</em><a name="690" href="#690">690</a>             <strong>return</strong>;<a name="691" href="#691">691</a>         }<a name="692" href="#692">692</a>         <strong>boolean<

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?