statisticstracker.html

来自「网络爬虫开源代码」· HTML 代码 · 共 738 行 · 第 1/5 页

HTML
738
字号
<a name="584" href="#584">584</a> <em>     * @return A count of all uris encountered</em><a name="585" href="#585">585</a> <em>     *</em><a name="586" href="#586">586</a> <em>     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()</em><a name="587" href="#587">587</a> <em>     */</em><a name="588" href="#588">588</a>     <strong>public</strong> <strong>long</strong> discoveredUriCount() {<a name="589" href="#589">589</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="590" href="#590">590</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="591" href="#591">591</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="592" href="#592">592</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong>?<a name="593" href="#593">593</a>             controller.getFrontier().discoveredUriCount() : discoveredUriCount;<a name="594" href="#594">594</a>     }<a name="595" href="#595">595</a> <a name="596" href="#596">596</a>     <em>/**<em>*</em></em><a name="597" href="#597">597</a> <em>     * Number of URIs that have &lt;i>finished&lt;/i> processing.</em><a name="598" href="#598">598</a> <em>     *</em><a name="599" href="#599">599</a> <em>     * @return Number of URIs that have finished processing</em><a name="600" href="#600">600</a> <em>     *</em><a name="601" href="#601">601</a> <em>     * @see org.archive.crawler.framework.Frontier#finishedUriCount()</em><a name="602" href="#602">602</a> <em>     */</em><a name="603" href="#603">603</a>     <strong>public</strong> <strong>long</strong> finishedUriCount() {<a name="604" href="#604">604</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="605" href="#605">605</a>                 
<strong>this</strong>.controller.getFrontier() != <strong>null</strong> ?<a name="606" href="#606">606</a>             controller.getFrontier().finishedUriCount() : finishedUriCount;<a name="607" href="#607">607</a>     }<a name="608" href="#608">608</a> <a name="609" href="#609">609</a>     <em>/**<em>*</em></em><a name="610" href="#610">610</a> <em>     * Get the total number of failed fetch attempts (connection failures -> give up, etc)</em><a name="611" href="#611">611</a> <em>     *</em><a name="612" href="#612">612</a> <em>     * @return The total number of failed fetch attempts</em><a name="613" href="#613">613</a> <em>     */</em><a name="614" href="#614">614</a>     <strong>public</strong> <strong>long</strong> failedFetchAttempts() {<a name="615" href="#615">615</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="616" href="#616">616</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="617" href="#617">617</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="618" href="#618">618</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong> ?<a name="619" href="#619">619</a>             controller.getFrontier().failedFetchCount() : downloadFailures;<a name="620" href="#620">620</a>     }<a name="621" href="#621">621</a> <a name="622" href="#622">622</a>     <em>/**<em>*</em></em><a name="623" href="#623">623</a> <em>     * Get the total number of disregarded fetch attempts (as counted by the frontier's disregardedUriCount)</em><a name="624" href="#624">624</a> <em>     *</em><a name="625" href="#625">625</a> <em>     * @return The total number of disregarded fetch attempts</em><a name="626" href="#626">626</a> <em>     */</em><a name="627" href="#627">627</a>     <strong>public</strong> <strong>long</strong> disregardedFetchAttempts() {<a name="628" href="#628">628</a>  
       <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="629" href="#629">629</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="630" href="#630">630</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="631" href="#631">631</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong>?<a name="632" href="#632">632</a>             controller.getFrontier().disregardedUriCount() : downloadDisregards;<a name="633" href="#633">633</a>     }<a name="634" href="#634">634</a> <a name="635" href="#635">635</a>     <strong>public</strong> <strong>long</strong> successfullyFetchedCount() {<a name="636" href="#636">636</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="637" href="#637">637</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="638" href="#638">638</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="639" href="#639">639</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong>?<a name="640" href="#640">640</a>             controller.getFrontier().succeededFetchCount() : downloadedUriCount;<a name="641" href="#641">641</a>     }<a name="642" href="#642">642</a>     <a name="643" href="#643">643</a>     <strong>public</strong> <strong>long</strong> totalCount() {<a name="644" href="#644">644</a>         <strong>return</strong> queuedUriCount() + activeThreadCount() +<a name="645" href="#645">645</a>             successfullyFetchedCount();<a name="646" href="#646">646</a>     }<a name="647" href="#647">647</a> <a name="648" href="#648">648</a>     <em>/**<em>*</em></em><a name="649" href="#649">649</a> <em>     * Ratio of number of threads that would 
theoretically allow</em><a name="650" href="#650">650</a> <em>     * maximum crawl progress (if each was as productive as current</em><a name="651" href="#651">651</a> <em>     * threads), to current number of threads.</em><a name="652" href="#652">652</a> <em>     * </em><a name="653" href="#653">653</a> <em>     * @return float congestion ratio </em><a name="654" href="#654">654</a> <em>     */</em><a name="655" href="#655">655</a>     <strong>public</strong> <strong>float</strong> congestionRatio() {<a name="656" href="#656">656</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="657" href="#657">657</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="658" href="#658">658</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="659" href="#659">659</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong> ?<a name="660" href="#660">660</a>             controller.getFrontier().congestionRatio() : congestionRatio;<a name="661" href="#661">661</a>     }<a name="662" href="#662">662</a>     <a name="663" href="#663">663</a>     <em>/**<em>*</em></em><a name="664" href="#664">664</a> <em>     * Ordinal position of the 'deepest' URI eligible </em><a name="665" href="#665">665</a> <em>     * for crawling. Essentially, the length of the longest</em><a name="666" href="#666">666</a> <em>     * frontier internal queue. 
</em><a name="667" href="#667">667</a> <em>     * </em><a name="668" href="#668">668</a> <em>     * @return long URI count to deepest URI</em><a name="669" href="#669">669</a> <em>     */</em><a name="670" href="#670">670</a>     <strong>public</strong> <strong>long</strong> deepestUri() {<a name="671" href="#671">671</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="672" href="#672">672</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="673" href="#673">673</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="674" href="#674">674</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong> ?<a name="675" href="#675">675</a>             controller.getFrontier().deepestUri() : deepestUri;<a name="676" href="#676">676</a>     }<a name="677" href="#677">677</a>     <a name="678" href="#678">678</a>     <em>/**<em>*</em></em><a name="679" href="#679">679</a> <em>     * Average depth of the last URI in all eligible queues.</em><a name="680" href="#680">680</a> <em>     * That is, the average length of all eligible queues.</em><a name="681" href="#681">681</a> <em>     * </em><a name="682" href="#682">682</a> <em>     * @return long average depth of last URIs in queues </em><a name="683" href="#683">683</a> <em>     */</em><a name="684" href="#684">684</a>     <strong>public</strong> <strong>long</strong> averageDepth() {<a name="685" href="#685">685</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="686" href="#686">686</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="687" href="#687">687</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="688" href="#688">688</a>        
         <strong>this</strong>.controller.getFrontier() != <strong>null</strong> ?<a name="689" href="#689">689</a>             controller.getFrontier().averageDepth() : averageDepth;<a name="690" href="#690">690</a>     }<a name="691" href="#691">691</a>     <a name="692" href="#692">692</a>     <em>/**<em>*</em></em><a name="693" href="#693">693</a> <em>     * Number of URIs &lt;i>queued&lt;/i> up and waiting for processing.</em><a name="694" href="#694">694</a> <em>     *</em><a name="695" href="#695">695</a> <em>     * &lt;p>If crawl not running (paused or stopped) this will return the value</em><a name="696" href="#696">696</a> <em>     * of the last snapshot.</em><a name="697" href="#697">697</a> <em>     *</em><a name="698" href="#698">698</a> <em>     * @return Number of URIs queued up and waiting for processing.</em><a name="699" href="#699">699</a> <em>     *</em><a name="700" href="#700">700</a> <em>     * @see org.archive.crawler.framework.Frontier#queuedUriCount()</em><a name="701" href="#701">701</a> <em>     */</em><a name="702" href="#702">702</a>     <strong>public</strong> <strong>long</strong> queuedUriCount() {<a name="703" href="#703">703</a>         <em class="comment">// While shouldrun is true we can use info direct from the crawler.</em><a name="704" href="#704">704</a>         <em class="comment">// After that our last snapshot will have to do.</em><a name="705" href="#705">705</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="706" href="#706">706</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong>?<a name="707" href="#707">707</a>             controller.getFrontier().queuedUriCount() : queuedUriCount;<a name="708" href="#708">708</a>     }<a name="709" href="#709">709</a> <a name="710" href="#710">710</a>     <em>/**<em>* @deprecated use totalBytesCrawled */</em> </em><a name="711" href="#711">711</a>     
<strong>public</strong> <strong>long</strong> totalBytesWritten() {<a name="712" href="#712">712</a>         <em class="comment">// return totalBytesCrawled(); </em><a name="713" href="#713">713</a>         <strong>return</strong> shouldrun &amp;&amp; <strong>this</strong>.controller != <strong>null</strong> &amp;&amp;<a name="714" href="#714">714</a>                 <strong>this</strong>.controller.getFrontier() != <strong>null</strong>?<a name="715" href="#715">715</a>             controller.getFrontier().totalBytesWritten() : totalProcessedBytes;<a name="716" href="#716">716</a>     }<a name="717" href="#717">717</a>     <a name="718" href="#718">718</a>     <strong>public</strong> <strong>long</strong> totalBytesCrawled() {<a name="719" href="#719">719</a>         <strong>return</strong> shouldrun ?<a name="720" href="#720">720</a>             crawledBytes.getTotal() : totalProcessedBytes;<a name="721" href="#721">721</a>     }<a name="722" href="#722">722</a>     <a name="723" href="#723">723</a>     <strong>public</strong> String crawledBytesSummary() {<a name="724" href="#724">724</a>         <strong>return</strong> crawledBytes.summary();<a name="725" href="#725">725</a>     }<a name="726" href="#726">726</a> <a name="727" href="#727">727</a>     <em>/**<em>*</em></em><a name="728" href="#728">728</a> <em>     * If the curi is a seed, we update the processedSeeds table.</em><a name="729" href="#729">729</a> <em>     *<

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?