arc2wcdx.html

来自「网络爬虫开源代码」· HTML 代码 · 共 263 行 · 第 1/2 页

HTML
263
字号
<a name="124" href="#124">124</a>                 <em class="comment">// arc name ('g')</em><a name="125" href="#125">125</a>                 appendField(builder,reader.getFileName());<a name="126" href="#126">126</a>                 <em class="comment">// compressed start offset ('V')</em><a name="127" href="#127">127</a>                 appendField(builder,h.getOffset());<a name="128" href="#128">128</a> <a name="129" href="#129">129</a>                 <em class="comment">// compressed end offset (?)</em><a name="130" href="#130">130</a> <em class="comment">//            appendField(builder,</em><a name="131" href="#131">131</a> <em class="comment">//                    reader.getInputStream() instanceof RepositionableStream</em><a name="132" href="#132">132</a> <em class="comment">//                    ? ((GzippedInputStream)reader.getInputStream()).vPosition()</em><a name="133" href="#133">133</a> <em class="comment">//                    : "-");</em><a name="134" href="#134">134</a>                 <em class="comment">// TODO; leave unavail for now</em><a name="135" href="#135">135</a>                 appendField(builder, <span class="string">"-"</span>);<a name="136" href="#136">136</a> <a name="137" href="#137">137</a>                 <em class="comment">// uncompressed (declared in ARC headerline) record length</em><a name="138" href="#138">138</a>                 appendField(builder,h.getLength());<a name="139" href="#139">139</a>                 <em class="comment">// http header content-length</em><a name="140" href="#140">140</a>                 appendField(builder,hg.getFirstHeader(<span class="string">"Content-Length"</span>));<a name="141" href="#141">141</a> <a name="142" href="#142">142</a>                 <em class="comment">// http header mod-date</em><a name="143" href="#143">143</a>                 appendTimeField(builder,hg.getFirstHeader(<span class="string">"Last-Modified"</span>));<a name="144" href="#144">144</a>                 <em class="comment">// http header expires</em><a name="145" href="#145">145</a>                 appendTimeField(builder,hg.getFirstHeader(<span class="string">"Expires"</span>));<a name="146" href="#146">146</a>                 <a name="147" href="#147">147</a>                 <em class="comment">// http header etag</em><a name="148" href="#148">148</a>                 appendField(builder,hg.getFirstHeader(<span class="string">"ETag"</span>));<a name="149" href="#149">149</a>                 <em class="comment">// http header redirect ('Location' header?)</em><a name="150" href="#150">150</a>                 appendField(builder,hg.getFirstHeader(<span class="string">"Location"</span>));<a name="151" href="#151">151</a>                 <em class="comment">// ip ('e')</em><a name="152" href="#152">152</a>                 appendField(builder,h.getIp());<a name="153" href="#153">153</a>                 <em class="comment">// original URI</em><a name="154" href="#154">154</a>                 appendField(builder,h.getUrl());<a name="155" href="#155">155</a>                 <em class="comment">// TODO MAYBE - a title from inside content? </em><a name="156" href="#156">156</a> <a name="157" href="#157">157</a>                 writer.println(builder.toString());<a name="158" href="#158">158</a>                 count++;<a name="159" href="#159">159</a>             }<a name="160" href="#160">160</a>             wcdxFile.renameTo(<strong>new</strong> File(wcdxPath));<a name="161" href="#161">161</a>         } <strong>catch</strong> (IOException e) {<a name="162" href="#162">162</a>             <em class="comment">// soldier on: but leave '.open' wcdx file as indicator of error</em><a name="163" href="#163">163</a>             <strong>if</strong>(!wcdxFile.exists()) {<a name="164" href="#164">164</a>                 <strong>try</strong> {<a name="165" href="#165">165</a>                     wcdxFile.createNewFile();<a name="166" href="#166">166</a>                 } <strong>catch</strong> (IOException e1) {<a name="167" href="#167">167</a>                     <em class="comment">// TODO Auto-generated catch block</em><a name="168" href="#168">168</a>                     <strong>throw</strong> <strong>new</strong> RuntimeException(e1);<a name="169" href="#169">169</a>                 }<a name="170" href="#170">170</a>             }<a name="171" href="#171">171</a>         } <strong>catch</strong> (RuntimeException e) {<a name="172" href="#172">172</a>             <em class="comment">// soldier on: but leave '.open' wcdx file as indicator of error</em><a name="173" href="#173">173</a>             <strong>if</strong>(!wcdxFile.exists()) {<a name="174" href="#174">174</a>                 <strong>try</strong> {<a name="175" href="#175">175</a>                     wcdxFile.createNewFile();<a name="176" href="#176">176</a>                 } <strong>catch</strong> (IOException e1) {<a name="177" href="#177">177</a>                     <em class="comment">// TODO Auto-generated catch block</em><a name="178" href="#178">178</a>                     <strong>throw</strong> <strong>new</strong> RuntimeException(e1);<a name="179" href="#179">179</a>                 }<a name="180" href="#180">180</a>             }<a name="181" href="#181">181</a>         } <strong>finally</strong> {<a name="182" href="#182">182</a>             <strong>if</strong>(writer!=<strong>null</strong>) {<a name="183" href="#183">183</a>                 writer.close();<a name="184" href="#184">184</a>             }<a name="185" href="#185">185</a>         }<a name="186" href="#186">186</a>         <a name="187" href="#187">187</a>         <strong>return</strong> <strong>new</strong> Object[] {wcdxPath, count};<a name="188" href="#188">188</a>     }<a name="189" href="#189">189</a> <a name="190" href="#190">190</a>     <strong>protected</strong> <strong>static</strong> <strong>void</strong> appendField(StringBuilder builder, Object obj) {<a name="191" href="#191">191</a>         <strong>if</strong>(builder.length()>0) {<a name="192" href="#192">192</a>             <em class="comment">// prepend with delimiter</em><a name="193" href="#193">193</a>             builder.append(' ');<a name="194" href="#194">194</a>         }<a name="195" href="#195">195</a>         <strong>if</strong>(obj instanceof Header) {<a name="196" href="#196">196</a>             obj = ((Header)obj).getValue().trim();<a name="197" href="#197">197</a>         }<a name="198" href="#198">198</a> <a name="199" href="#199">199</a>         builder.append((obj==<strong>null</strong>||obj.toString().length()==0)?<span class="string">"-"</span>:obj);<a name="200" href="#200">200</a>     }<a name="201" href="#201">201</a> <a name="202" href="#202">202</a>     <strong>protected</strong> <strong>static</strong> <strong>void</strong> appendTimeField(StringBuilder builder, Object obj) {<a name="203" href="#203">203</a>         <strong>if</strong>(builder.length()>0) {<a name="204" href="#204">204</a>             <em class="comment">// prepend with delimiter</em><a name="205" href="#205">205</a>             builder.append(' ');<a name="206" href="#206">206</a>         }<a name="207" href="#207">207</a>         <strong>if</strong>(obj==<strong>null</strong>) {<a name="208" href="#208">208</a>             builder.append(<span class="string">"-"</span>);<a name="209" href="#209">209</a>             <strong>return</strong>;<a name="210" href="#210">210</a>         }<a name="211" href="#211">211</a>         <strong>if</strong>(obj instanceof Header) {<a name="212" href="#212">212</a>             String s = ((Header)obj).getValue().trim();<a name="213" href="#213">213</a>             <strong>try</strong> {<a name="214" href="#214">214</a>                 Date date = DateUtil.parseDate(s);<a name="215" href="#215">215</a>                 String d = ArchiveUtils.get14DigitDate(date);<a name="216" href="#216">216</a>                 <strong>if</strong>(d.startsWith(<span class="string">"209"</span>)) {<a name="217" href="#217">217</a>                     d = <span class="string">"199"</span>+d.substring(3);<a name="218" href="#218">218</a>                 }<a name="219" href="#219">219</a>                 obj = d;<a name="220" href="#220">220</a>             } <strong>catch</strong> (DateParseException e) {<a name="221" href="#221">221</a>                 builder.append('e');<a name="222" href="#222">222</a>                 <strong>return</strong>;<a name="223" href="#223">223</a>             }<a name="224" href="#224">224</a> <a name="225" href="#225">225</a>         }<a name="226" href="#226">226</a>         builder.append(obj);<a name="227" href="#227">227</a>     }<a name="228" href="#228">228</a> }<a name="229" href="#229">229</a> <a name="230" href="#230">230</a> <em class="comment">//'wide' CDX</em><a name="231" href="#231">231</a> <em class="comment">//a original url</em><a name="232" href="#232">232</a> <em class="comment">//b timestamp</em><a name="233" href="#233">233</a> <em class="comment">//s resp code</em><a name="234" href="#234">234</a> <em class="comment">//m type</em><a name="235" href="#235">235</a> <em class="comment">//? content md5 (full 'k'? 'c'?</em><a name="236" href="#236">236</a> <em class="comment">//g arc name</em><a name="237" href="#237">237</a> <em class="comment">//V compressed start offset</em><a name="238" href="#238">238</a> <em class="comment">//? compressed length</em><a name="239" href="#239">239</a> <em class="comment">//n? uncompressed length</em><a name="240" href="#240">240</a> <em class="comment">//? mod date</em><a name="241" href="#241">241</a> <em class="comment">//? expires</em><a name="242" href="#242">242</a> <em class="comment">//? server 'date' hdr</em><a name="243" href="#243">243</a> <em class="comment">//? etag</em><a name="244" href="#244">244</a> <em class="comment">//r redirect ('Location'?)</em><a name="245" href="#245">245</a> <em class="comment">//e ip</em><a name="246" href="#246">246</a> <em class="comment">//MAYBE: </em><a name="247" href="#247">247</a> <em class="comment">//? TITLE from HTML or other format?</em><a name="248" href="#248">248</a> <a name="249" href="#249">249</a> </pre><hr/><div id="footer">This page was automatically generated by <a href="http://maven.apache.org/">Maven</a></div></body></html>

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?