⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extractorswf.java

📁 高性能分词算法
💻 JAVA
字号:
/*
 * Heritrix
 *
 * $Id: ExtractorSWF.java 6041 2008-11-18 02:42:39Z nlevitt $
 *
 * Created on March 19, 2004
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.Vector;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.util.TextUtils;

import com.anotherbigidea.flash.interfaces.SWFActions;
import com.anotherbigidea.flash.interfaces.SWFTagTypes;
import com.anotherbigidea.flash.interfaces.SWFTags;
import com.anotherbigidea.flash.readers.ActionParser;
import com.anotherbigidea.flash.readers.SWFReader;
import com.anotherbigidea.flash.readers.TagParser;
import com.anotherbigidea.flash.structs.AlphaTransform;
import com.anotherbigidea.flash.structs.Matrix;
import com.anotherbigidea.flash.writers.SWFActionsImpl;
import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
import com.anotherbigidea.io.InStream;

/**
 * Process SWF (flash/shockwave) files for strings that are likely to be
 * crawlable URIs.
 *
 * @author Igor Ranitovic
 */
public class ExtractorSWF extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = 3627359592408010589L;

    private static final Logger logger =
            Logger.getLogger(ExtractorSWF.class.getName());

    /** Number of CrawlURIs for which an SWF reader was created and run. */
    protected long numberOfCURIsHandled = 0;

    /** Running total of links extracted by this extractor, over all URIs. */
    protected long numberOfLinksExtracted = 0;

    // TODO: consider if this should be even smaller, because anything
    // containing URLs wouldn't be this big
    private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB

    /**
     * @param name Name handed to the superclass constructor.
     */
    public ExtractorSWF(String name) {
        super(name, "Flash extractor. Extracts URIs from SWF "
                + "(flash/shockwave) files.");
    }

    /**
     * Extracts links from the recorded content of the given URI when that
     * content looks like an SWF file.
     *
     * The URI is skipped unless its content type contains
     * "x-shockwave-flash" or its string form ends with ".swf". Parse
     * failures are recorded on the CrawlURI as localized errors rather than
     * propagated.
     *
     * @param curi CrawlURI whose recorded content should be examined.
     */
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }

        String contentType = curi.getContentType();
        if (contentType == null) {
            return;
        }
        if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
                && (!curi.toString().toLowerCase().endsWith(".swf"))) {
            return;
        }

        InputStream documentStream = null;
        // Kept outside the try block so the per-URI link count can still be
        // reported when parsing aborts part-way through the file.
        ExtractorSWFActions actions = null;
        try {
            documentStream = curi.getHttpRecorder().getRecordedInput()
                    .getContentReplayInputStream();
            if (documentStream == null) {
                return;
            }

            // Get link extracting SWF reader
            actions = new ExtractorSWFActions(curi, getController());
            SWFReader reader = getSWFReader(actions, documentStream);

            numberOfCURIsHandled++;

            // Parse file for links
            reader.readFile();
        } catch (IOException e) {
            curi.addLocalizedError(getName(), e, "failed reading");
        } catch (NullPointerException e) {
            curi.addLocalizedError(getName(), e, "bad .swf file");
        } catch (NegativeArraySizeException e) {
            curi.addLocalizedError(getName(), e, "bad .swf file");
        } finally {
            IOUtils.closeQuietly(documentStream);
        }

        // Set flag to indicate that link extraction is completed.
        curi.linkExtractorFinished();

        // Report the links found in THIS document; numberOfLinksExtracted is
        // the running total over every URI this extractor has processed.
        long linksInThisUri = (actions == null) ? 0 : actions.getLinkCount();
        logger.fine(curi + " has " + linksInThisUri + " links.");
    }

    /**
     * Get a link extracting SWFReader.
     *
     * A custom SWFReader which parses links from .swf file.
     *
     * @param actions Actions that receive the strings and URLs found while
     *            parsing and add the resulting links to their CrawlURI.
     * @param documentStream Stream positioned at the start of the SWF
     *            content; must not be null.
     * @return An SWFReader.
     */
    private SWFReader getSWFReader(ExtractorSWFActions actions,
            InputStream documentStream) {
        // Overwrite parsing of specific tags that might have URIs.
        ExtractorSWFTags tags = new ExtractorSWFTags(actions);

        // Get a SWFReader instance.
        return new ExtractorSWFReader(getTagParser(tags), documentStream);
    }

    /**
     * SWFReader that skips oversized tags instead of reading them into
     * memory.
     */
    class ExtractorSWFReader extends SWFReader {

        public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
            super(consumer, inputstream);
        }

        public ExtractorSWFReader(SWFTags consumer, InStream instream) {
            super(consumer, instream);
        }

        /**
         * Override because a corrupt SWF file can cause us to try read
         * lengths that are hundreds of megabytes in size causing us to
         * OOME.
         *
         * Below is copied from SWFReader parent class.
         *
         * @return The type of the tag that was read or skipped.
         * @throws IOException If reading from the underlying stream fails.
         */
        public int readOneTag() throws IOException {
            int header = mIn.readUI16();

            int type = header >> 6; // only want the top 10 bits
            int length = header & 0x3F; // only want the bottom 6 bits
            // A short length of 0x3F signals that the real length follows
            // as a 32-bit value.
            boolean longTag = (length == 0x3F);

            if (longTag) {
                // NOTE(review): values above Integer.MAX_VALUE wrap negative
                // here; presumably mIn.read() then fails with the
                // NegativeArraySizeException that extract() reports as a
                // bad .swf file -- confirm against InStream.
                length = (int) mIn.readUI32();
            }

            // Below test added for Heritrix use.
            if (length > MAX_READ_SIZE) {
                // skip to next, rather than throw IOException ending
                // processing
                mIn.skipBytes(length);
                logger.info("oversized SWF tag (type=" + type + ";length="
                        + length + ") skipped");
            } else {
                byte[] contents = mIn.read(length);
                mConsumer.tag(type, longTag, contents);
            }

            return type;
        }
    }

    /**
     * Get a TagParser
     *
     * A custom ExtractorTagParser which ignores all the big binary image/
     * sound/font types which don't carry URLs is used, to avoid the
     * occasionally fatal (OutOfMemoryError) memory bloat caused by the
     * all-in-memory SWF library handling.
     *
     * @param customTags
     *            A custom tag parser.
     * @return A TagParser wrapping the given tag handler.
     */
    private TagParser getTagParser(SWFTagTypes customTags) {
        return new ExtractorTagParser(customTags);
    }

    /**
     * TagParser customized to ignore SWFTags that will never contain
     * extractable URIs.
     */
    protected class ExtractorTagParser extends TagParser {

        protected ExtractorTagParser(SWFTagTypes tagtypes) {
            super(tagtypes);
        }

        protected void parseDefineBits(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineBitsJPEG3(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineBitsLossless(InStream in, int length,
                boolean hasAlpha) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineButtonSound(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in sound
        }

        protected void parseDefineFont(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in font
        }

        protected void parseDefineJPEG2(InStream in, int length)
                throws IOException {
            // DO NOTHING - no URLs to be found in jpeg
        }

        protected void parseDefineJPEGTables(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in jpeg
        }

        protected void parseDefineShape(int type, InStream in)
                throws IOException {
            // DO NOTHING - no URLs to be found in shape
        }

        protected void parseDefineSound(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in sound
        }

        protected void parseFontInfo(InStream in, int length, boolean isFI2)
                throws IOException {
            // DO NOTHING - no URLs to be found in font info
        }

        protected void parseDefineFont2(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        // heritrix: Overridden to use our TagParser and SWFReader. The rest
        // of the code is the same.
        @Override
        protected void parseDefineSprite(InStream in) throws IOException {
            int id = in.readUI16();
            in.readUI16(); // frame count

            SWFTagTypes sstt = mTagtypes.tagDefineSprite(id);

            if (sstt == null) {
                return;
            }

            // heritrix: only these two lines differ from
            // super.parseDefineSprite()
            TagParser parser = new ExtractorTagParser(sstt);
            SWFReader reader = new ExtractorSWFReader(parser, in);

            reader.readTags();
        }

        // Overridden to read 32 bit clip event flags when flash version >= 6.
        // All the rest of the code is copied directly. Fixes HER-1509.
        @Override
        protected void parsePlaceObject2(InStream in) throws IOException {
            // The eight presence flags are read one bit at a time, in this
            // exact order; the optional fields follow in the order below.
            boolean hasClipActions    = in.readUBits(1) != 0;
            boolean hasClipDepth      = in.readUBits(1) != 0;
            boolean hasName           = in.readUBits(1) != 0;
            boolean hasRatio          = in.readUBits(1) != 0;
            boolean hasColorTransform = in.readUBits(1) != 0;
            boolean hasMatrix         = in.readUBits(1) != 0;
            boolean hasCharacter      = in.readUBits(1) != 0;
            boolean isMove            = in.readUBits(1) != 0;

            int depth = in.readUI16();

            int            charId    = hasCharacter      ? in.readUI16()            : 0;
            Matrix         matrix    = hasMatrix         ? new Matrix( in )         : null;
            AlphaTransform cxform    = hasColorTransform ? new AlphaTransform( in ) : null;
            int            ratio     = hasRatio          ? in.readUI16()            : -1;
            String         name      = hasName           ? in.readString( mStringEncoding )  : null;
            int            clipDepth = hasClipDepth      ? in.readUI16()            : 0;

            int clipEventFlags = 0;

            if (hasClipActions) {
                in.readUI16(); // reserved

                // heritrix: flags size changed in swf version 6
                clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
            }

            SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
                    depth, charId, matrix, cxform, ratio, name, clipEventFlags);

            if (hasClipActions && actions != null) {
                int flags = 0;

                // heritrix: flags size changed in swf version 6
                // A zero flags word terminates the list of clip action
                // records.
                while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
                    in.readUI32(); // length

                    actions.start(flags);
                    ActionParser parser = new ActionParser(actions, mFlashVersion);

                    parser.parse(in);
                }

                actions.done();
            }
        }
    }

    /**
     * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
     * parse URI-like strings.
     */
    protected class ExtractorSWFTags extends SWFTagTypesImpl {

        /** Action handler returned for every tag that can carry actions. */
        private final SWFActions actions;

        public ExtractorSWFTags(SWFActions acts) {
            super(null);
            actions = acts;
        }

        public SWFActions tagDefineButton(int id, Vector buttonRecords)
                throws IOException {
            return actions;
        }

        public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
                Vector buttonRecord2s) throws IOException {
            return actions;
        }

        public SWFActions tagDoAction() throws IOException {
            return actions;
        }

        public SWFActions tagDoInActions(int spriteId) throws IOException {
            return actions;
        }

        public SWFTagTypes tagDefineSprite(int id) throws IOException {
            // Tags nested in a sprite are handled by this same instance.
            return this;
        }

        public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
                int depth, int charId, Matrix matrix, AlphaTransform cxform,
                int ratio, String name, int clipActionFlags) throws IOException {
            return actions;
        }
    }

    /**
     * SWFActions that parse URI-like strings. Links discovered using
     * <code>ExtractorJS</code> are marked as speculative links (hop X). All
     * other links are marked as embedded links (hop E).
     */
    protected class ExtractorSWFActions extends SWFActionsImpl {

        private final CrawlURI curi;

        private final CrawlController controller;

        /** Links added through this instance, i.e. for one CrawlURI. */
        private long linkCount = 0;

        static final String JSSTRING = "javascript:";

        /**
         * @param curi
         *            SWF URL to handle
         * @param controller
         *            Crawl controller need for error reporting
         */
        public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
            assert (curi != null) : "CrawlURI should not be null";
            this.curi = curi;
            this.controller = controller;
        }

        /**
         * @return Number of links this instance has added to its CrawlURI.
         */
        public long getLinkCount() {
            return linkCount;
        }

        /**
         * Overwrite handling of discovered URIs.
         *
         * @param url
         *            Discovered URL.
         * @param target
         *            Discovered target (currently not being used.)
         * @throws IOException
         */
        public void getURL(String url, String target) throws IOException {
            processURIString(url);
        }

        /**
         * Considers every entry of a lookup table as a possible URI.
         *
         * @param strings Lookup table entries.
         * @throws IOException If adding a link fails.
         */
        public void lookupTable(String[] strings) throws IOException {
            for (String str : strings) {
                considerStringAsUri(str);
            }
        }

        /**
         * Considers a pushed string value as a possible URI.
         *
         * @param value Pushed string.
         * @throws IOException If adding a link fails.
         */
        public void push(String value) throws IOException {
            considerStringAsUri(value);
        }

        /**
         * Adds the string as a speculative link when it matches
         * <code>ExtractorJS.STRING_URI_DETECTOR</code> in full.
         *
         * @param str Candidate string.
         * @throws IOException If adding the link fails.
         */
        public void considerStringAsUri(String str) throws IOException {
            Matcher uri = TextUtils.getMatcher(ExtractorJS.STRING_URI_DETECTOR,
                    str);
            try {
                if (uri.matches()) {
                    curi.createAndAddLinkRelativeToVia(uri.group(),
                            Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
                    incrementLinkCount(1);
                }
            } finally {
                // Hand the matcher back even when adding the link throws.
                TextUtils.recycleMatcher(uri);
            }
        }

        /**
         * Adds a discovered URL as an embedded link, or, for
         * "javascript:" URLs, passes it to
         * <code>ExtractorJS.considerStrings</code> and counts what that
         * returns.
         *
         * @param url Discovered URL.
         * @throws IOException If adding the link fails.
         */
        public void processURIString(String url) throws IOException {
            if (url.startsWith(JSSTRING)) {
                incrementLinkCount(ExtractorJS.considerStrings(
                        curi, url, controller, false));
            } else {
                curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
                        Link.EMBED_HOP);
                incrementLinkCount(1);
            }
        }

        /**
         * Adds to both the per-URI count and the extractor-wide total.
         */
        private void incrementLinkCount(long count) {
            linkCount += count;
            numberOfLinksExtracted += count;
        }
    }

    /**
     * @return Multi-line text report with the processor name, its function,
     *         and the handled-URI and extracted-link totals.
     */
    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
        ret.append("  Function:          Link extraction on Shockwave Flash "
                + "documents (.swf)\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -