⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 toethread.html

📁 一个开源的网页爬虫一个开源的网页爬虫一个开源的网页爬虫一个开源的网页爬虫一个开源的网页爬虫一个开源的网页爬虫
💻 HTML
📖 第 1 页 / 共 4 页
字号:
<a name="308" href="#308">308</a>             }<a name="309" href="#309">309</a>             setStep(STEP_DONE_WITH_PROCESSORS);<a name="310" href="#310">310</a>             currentProcessorName = <span class="string">""</span>;<a name="311" href="#311">311</a>         } <strong>catch</strong> (RuntimeExceptionWrapper e) {<a name="312" href="#312">312</a>             <em class="comment">// Workaround to get cause from BDB</em><a name="313" href="#313">313</a>             <strong>if</strong>(e.getCause() == <strong>null</strong>) {<a name="314" href="#314">314</a>                 e.initCause(e.getDetail());<a name="315" href="#315">315</a>             }<a name="316" href="#316">316</a>             recoverableProblem(e);<a name="317" href="#317">317</a>         } <strong>catch</strong> (AssertionError ae) {<a name="318" href="#318">318</a>             <em class="comment">// This risks leaving crawl in fatally inconsistent state, </em><a name="319" href="#319">319</a>             <em class="comment">// but is often reasonable for per-Processor assertion problems </em><a name="320" href="#320">320</a>             recoverableProblem(ae);<a name="321" href="#321">321</a>         } <strong>catch</strong> (RuntimeException e) {<a name="322" href="#322">322</a>             recoverableProblem(e);<a name="323" href="#323">323</a>         } <strong>catch</strong> (StackOverflowError err) {<a name="324" href="#324">324</a>             recoverableProblem(err);<a name="325" href="#325">325</a>         } <strong>catch</strong> (Error err) {<a name="326" href="#326">326</a>             <em class="comment">// OutOfMemory and any others</em><a name="327" href="#327">327</a>             seriousError(err); <a name="328" href="#328">328</a>         }<a name="329" href="#329">329</a>     }<a name="330" href="#330">330</a> <a name="331" href="#331">331</a> <a name="332" href="#332">332</a>     <em>/**<em>*</em></em><a name="333" href="#333">333</a> <em>     * Handling for exceptions and 
errors that are possibly recoverable.</em><a name="334" href="#334">334</a> <em>     * </em><a name="335" href="#335">335</a> <em>     * @param e</em><a name="336" href="#336">336</a> <em>     */</em><a name="337" href="#337">337</a>     <strong>private</strong> <strong>void</strong> recoverableProblem(Throwable e) {<a name="338" href="#338">338</a>         Object previousStep = step;<a name="339" href="#339">339</a>         setStep(STEP_HANDLING_RUNTIME_EXCEPTION);<a name="340" href="#340">340</a>         e.printStackTrace(System.err);<a name="341" href="#341">341</a>         currentCuri.setFetchStatus(S_RUNTIME_EXCEPTION);<a name="342" href="#342">342</a>         <em class="comment">// store exception temporarily for logging</em><a name="343" href="#343">343</a>         currentCuri.addAnnotation(<span class="string">"err="</span>+e.getClass().getName());<a name="344" href="#344">344</a>         currentCuri.putObject(A_RUNTIME_EXCEPTION, e);<a name="345" href="#345">345</a>         String message = <span class="string">"Problem "</span> + e + <a name="346" href="#346">346</a>                 <span class="string">" occured when trying to process '"</span><a name="347" href="#347">347</a>                 + currentCuri.toString()<a name="348" href="#348">348</a>                 + <span class="string">"' at step "</span> + previousStep <a name="349" href="#349">349</a>                 + <span class="string">" in "</span> + currentProcessorName +<span class="string">"\n"</span>;<a name="350" href="#350">350</a>         logger.log(Level.SEVERE, message.toString(), e);<a name="351" href="#351">351</a>     }<a name="352" href="#352">352</a> <a name="353" href="#353">353</a>     <strong>private</strong> <a href="../../../../org/archive/crawler/framework/Processor.html">Processor</a> getProcessor(<a href="../../../../org/archive/crawler/framework/Processor.html">Processor</a> processor) {<a name="354" href="#354">354</a>         <strong>if</strong>(!(processor instanceof <a 
href="../../../../org/archive/crawler/datamodel/InstancePerThread.html">InstancePerThread</a>)) {<a name="355" href="#355">355</a>             <em class="comment">// just use the shared Processor</em><a name="356" href="#356">356</a>              <strong>return</strong> processor;<a name="357" href="#357">357</a>         }<a name="358" href="#358">358</a>         <em class="comment">// must use local copy of processor</em><a name="359" href="#359">359</a>         <a href="../../../../org/archive/crawler/framework/Processor.html">Processor</a> localProcessor = (Processor) localProcessors.get(<a name="360" href="#360">360</a>                     processor.getClass().getName());<a name="361" href="#361">361</a>         <strong>if</strong> (localProcessor == <strong>null</strong>) {<a name="362" href="#362">362</a>             localProcessor = processor.spawn(<strong>this</strong>.getSerialNumber());<a name="363" href="#363">363</a>             localProcessors.put(processor.getClass().getName(),localProcessor);<a name="364" href="#364">364</a>         }<a name="365" href="#365">365</a>         <strong>return</strong> localProcessor;<a name="366" href="#366">366</a>     }<a name="367" href="#367">367</a> <a name="368" href="#368">368</a>     <em>/**<em>*</em></em><a name="369" href="#369">369</a> <em>     * @return Return toe thread serial number.</em><a name="370" href="#370">370</a> <em>     */</em><a name="371" href="#371">371</a>     <strong>public</strong> <strong>int</strong> getSerialNumber() {<a name="372" href="#372">372</a>         <strong>return</strong> <strong>this</strong>.serialNumber;<a name="373" href="#373">373</a>     }<a name="374" href="#374">374</a> <a name="375" href="#375">375</a>     <em>/**<em>*</em></em><a name="376" href="#376">376</a> <em>     * Used to get current thread's HttpRecorder instance.</em><a name="377" href="#377">377</a> <em>     * Implementation of the HttpRecorderMarker interface.</em><a name="378" href="#378">378</a> <em>     
* @return Returns instance of HttpRecorder carried by this thread.</em><a name="379" href="#379">379</a> <em>     * @see org.archive.util.HttpRecorderMarker#getHttpRecorder()</em><a name="380" href="#380">380</a> <em>     */</em><a name="381" href="#381">381</a>     <strong>public</strong> <a href="../../../../org/archive/util/HttpRecorder.html">HttpRecorder</a> getHttpRecorder() {<a name="382" href="#382">382</a>         <strong>return</strong> <strong>this</strong>.httpRecorder;<a name="383" href="#383">383</a>     }<a name="384" href="#384">384</a>     <a name="385" href="#385">385</a>     <em>/**<em>* Get the CrawlController associated with this thread.</em></em><a name="386" href="#386">386</a> <em>     *</em><a name="387" href="#387">387</a> <em>     * @return Returns the CrawlController.</em><a name="388" href="#388">388</a> <em>     */</em><a name="389" href="#389">389</a>     <strong>public</strong> <a href="../../../../org/archive/crawler/framework/CrawlController.html">CrawlController</a> getController() {<a name="390" href="#390">390</a>         <strong>return</strong> controller;<a name="391" href="#391">391</a>     }<a name="392" href="#392">392</a> <a name="393" href="#393">393</a>     <em>/**<em>*</em></em><a name="394" href="#394">394</a> <em>     * Terminates a thread.</em><a name="395" href="#395">395</a> <em>     *</em><a name="396" href="#396">396</a> <em>     * &lt;p> Calling this method will ensure that the current thread will stop</em><a name="397" href="#397">397</a> <em>     * processing as soon as possible (note: this may be never). 
Meant to</em><a name="398" href="#398">398</a> <em>     * 'short circuit' hung threads.</em><a name="399" href="#399">399</a> <em>     *</em><a name="400" href="#400">400</a> <em>     * &lt;p> Current crawl uri will have its fetch status set accordingly and</em><a name="401" href="#401">401</a> <em>     * will be immediately returned to the frontier.</em><a name="402" href="#402">402</a> <em>     *</em><a name="403" href="#403">403</a> <em>     * &lt;p> As noted before, this does not ensure that the thread will stop</em><a name="404" href="#404">404</a> <em>     * running (ever). But once evoked it will not try and communicate with</em><a name="405" href="#405">405</a> <em>     * other parts of crawler and will terminate as soon as control is</em><a name="406" href="#406">406</a> <em>     * established.</em><a name="407" href="#407">407</a> <em>     */</em><a name="408" href="#408">408</a>     <strong>protected</strong> <strong>void</strong> kill(){<a name="409" href="#409">409</a>         <strong>this</strong>.interrupt();<a name="410" href="#410">410</a>         <strong>synchronized</strong>(<strong>this</strong>) {<a name="411" href="#411">411</a>             <strong>if</strong> (currentCuri!=<strong>null</strong>) {<a name="412" href="#412">412</a>                 currentCuri.setFetchStatus(S_PROCESSING_THREAD_KILLED);<a name="413" href="#413">413</a>                 controller.getFrontier().finished(currentCuri);<a name="414" href="#414">414</a>              }<a name="415" href="#415">415</a>         }<a name="416" href="#416">416</a>     }<a name="417" href="#417">417</a> <a name="418" href="#418">418</a> 	<em>/**<em>*</em></em><a name="419" href="#419">419</a> <em>	 * @return Current step (For debugging/reporting, give abstract step</em><a name="420" href="#420">420</a> <em>     * where this thread is).</em><a name="421" href="#421">421</a> <em>	 */</em><a name="422" href="#422">422</a> 	<strong>public</strong> Object getStep() {<a name="423" 
href="#423">423</a> 		<strong>return</strong> step;<a name="424" href="#424">424</a> 	}<a name="425" href="#425">425</a> <a name="426" href="#426">426</a>     <em>/**<em>*</em></em><a name="427" href="#427">427</a> <em>     * Is this thread processing a URI, not paused or waiting for a URI?</em><a name="428" href="#428">428</a> <em>     * @return whether thread is actively processing a URI</em><a name="429" href="#429">429</a> <em>     */</em><a name="430" href="#430">430</a>     <strong>public</strong> <strong>boolean</strong> isActive() {<a name="431" href="#431">431</a>         <em class="comment">// if alive and not waiting in/for frontier.next(), we're 'active'</em><a name="432" href="#432">432</a>         <strong>return</strong> <strong>this</strong>.isAlive() &amp;&amp; (currentCuri != <strong>null</strong>);<a name="433" href="#433">433</a>     }<a name="434" href="#434">434</a>     <a name="435" href="#435">435</a>     <em>/**<em>*</em></em><a name="436" href="#436">436</a> <em>     * Request that this thread retire (exit cleanly) at the earliest</em><a name="437" href="#437">437</a> <em>     * opportunity.</em><a name="438" href="#438">438</a> <em>     */</em><a name="439" href="#439">439</a>     <strong>public</strong> <strong>void</strong> retire() {<a name="440" href="#440">440</a>         shouldRetire = <strong>true</strong>;<a name="441" href="#441">441</a>     }<a name="442" href="#442">442</a> <a name="443" href="#443">443</a>     <em>/**<em>*</em></em><a name="444" href="#444">444</a> <em>     * Whether this thread should cleanly retire at the earliest </em><a name="445" href="#445">445</a> <em>     * opportunity. 
</em><a name="446" href="#446">446</a> <em>     * </em><a name="447" href="#447">447</a> <em>     * @return True if should retire.</em><a name="448" href="#448">448</a> <em>     */</em><a name="449" href="#449">449</a>     <strong>public</strong> <strong>boolean</strong> shouldRetire() {<a name="450" href="#450">450</a>         <strong>return</strong> shouldRetire;<a name="451" href="#451">451</a>     }<a name="452" href="#452">452</a> <a name="453" href="#453">453</a>     <em class="comment">//</em><a name="454" href="#454">454</a>     <em class="comment">// Reporter implementation</em><a name="455" href="#455">455</a>     <em class="comment">// </em><a name="456" href="#456">456</a>     <a name="457" href="#457">457</a>     <em>/**<em>*</em></em><a name="458" href="#458">458</a> <em>     * Compiles and returns a report on its status.</em><a name="459" href="#459">459</a> <em>     * @param name Report name.</em><a name="460" href="#460">460</a> <em>     * @param pw Where to print.</em><a name="461" href="#461">461</a> <em>     */</em><a name="462" href="#462">462</a>     <strong>public</strong> <strong>void</strong> reportTo(String name, PrintWriter pw) {<a name="463" href="#463">463</a>         <em class="comment">// name is ignored for now: only one kind of report</em><a name="464" href="#464">464</a>         <a name="465" href="#465">465</a>         pw.print(<span class="string">"["</span>);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -