📄 feedprocessor.js
字号:
/** * ElementInfo is a simple container object that describes * some characteristics of a feed element. For example, it * says whether an element can be expected to appear more * than once inside a given entry or feed. */ function ElementInfo(fieldName, containerClass, closeFunc, isArray) { this.fieldName = fieldName; this.containerClass = containerClass; this.closeFunc = closeFunc; this.isArray = isArray; this.isWrapper = false;}/** * FeedElementInfo represents a feed element, usually the root. */function FeedElementInfo(fieldName, feedVersion) { this.isWrapper = false; this.fieldName = fieldName; this.feedVersion = feedVersion;}/** * Some feed formats include vestigial wrapper elements that we don't * want to include in our object model, but we do need to keep track * of during parsing. */function WrapperElementInfo(fieldName) { this.isWrapper = true; this.fieldName = fieldName;}/***** The Processor *****/function FeedProcessor() { this._reader = Cc[SAX_CONTRACTID].createInstance(Ci.nsISAXXMLReader); this._buf = ""; this._feed = Cc[BAG_CONTRACTID].createInstance(Ci.nsIWritablePropertyBag2); this._handlerStack = []; this._xmlBaseStack = []; // sparse array keyed to nesting depth this._depth = 0; this._state = "START"; this._result = null; this._extensionHandler = null; this._xhtmlHandler = null; // The nsIFeedResultListener waiting for the parse results this.listener = null; // These elements can contain (X)HTML or plain text. // We keep a table here that contains their default treatment this._textConstructs = {"atom:title":"text", "atom:summary":"text", "atom:rights":"text", "atom:content":"text", "atom:subtitle":"text", "description":"html", "rss1:description":"html", "dc:description":"html", "content:encoded":"html", "title":"text", "rss1:title":"text", "atom03:title":"text", "atom03:tagline":"text", "atom03:summary":"text", "atom03:content":"text"}; this._stack = []; this._trans = { "START": { //If we hit a root RSS element, treat as RSS2. "rss": new FeedElementInfo("RSS2", "rss2"), // If we hit an RDF element, if could be RSS1, but we can't // verify that until we hit a rss1:channel element. "rdf:RDF": new WrapperElementInfo("RDF"), // If we hit a Atom 1.0 element, treat as Atom 1.0. "atom:feed": new FeedElementInfo("Atom", "atom"), // Treat as Atom 0.3 "atom03:feed": new FeedElementInfo("Atom03", "atom03"), }, /********* RSS2 **********/ "IN_RSS2": { "channel": new WrapperElementInfo("channel") }, "IN_CHANNEL": { "item": new ElementInfo("items", Cc[ENTRY_CONTRACTID], null, true), "managingEditor": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:creator": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], rssAuthor, true), "category": new ElementInfo("categories", null, rssCatTerm, true), "cloud": new ElementInfo("cloud", null, null, false), "image": new ElementInfo("image", null, null, false), "textInput": new ElementInfo("textInput", null, null, false), "skipDays": new ElementInfo("skipDays", null, null, false), "skipHours": new ElementInfo("skipHours", null, null, false), "generator": new ElementInfo("generator", Cc[GENERATOR_CONTRACTID], atomGenerator, false), }, "IN_ITEMS": { "author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:creator": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], rssAuthor, true), "category": new ElementInfo("categories", null, rssCatTerm, true), "enclosure": new ElementInfo("enclosure", null, null, true), "guid": new ElementInfo("guid", null, rssGuid, false) }, "IN_SKIPDAYS": { "day": new ElementInfo("days", null, rssArrayElement, true) }, "IN_SKIPHOURS":{ "hour": new ElementInfo("hours", null, rssArrayElement, true) }, /********* RSS1 **********/ "IN_RDF": { // If we hit a rss1:channel, we can verify that we have RSS1 "rss1:channel": new FeedElementInfo("rdf_channel", "rss1"), "rss1:image": new ElementInfo("image", null, null, false), "rss1:textinput": new ElementInfo("textInput", null, null, false), "rss1:item": new ElementInfo("items", Cc[ENTRY_CONTRACTID], null, true), }, "IN_RDF_CHANNEL": { "admin:generatorAgent": new ElementInfo("generator", Cc[GENERATOR_CONTRACTID], null, false), "dc:creator": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], rssAuthor, true), "dc:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], rssAuthor, true), }, /********* ATOM 1.0 **********/ "IN_ATOM": { "atom:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], null, true), "atom:generator": new ElementInfo("generator", Cc[GENERATOR_CONTRACTID], atomGenerator, false), "atom:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], null, true), "atom:link": new ElementInfo("links", null, null, true), "atom:entry": new ElementInfo("entries", Cc[ENTRY_CONTRACTID], null, true) }, "IN_ENTRIES": { "atom:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], null, true), "atom:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], null, true), "atom:link": new ElementInfo("links", null, null, true), }, /********* ATOM 0.3 **********/ "IN_ATOM03": { "atom03:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], null, true), "atom03:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], null, true), "atom03:link": new ElementInfo("links", null, null, true), "atom03:entry": new ElementInfo("atom03_entries", Cc[ENTRY_CONTRACTID], null, true), "atom03:generator": new ElementInfo("generator", Cc[GENERATOR_CONTRACTID], atomGenerator, false), }, "IN_ATOM03_ENTRIES": { "atom03:author": new ElementInfo("authors", Cc[PERSON_CONTRACTID], null, true), "atom03:contributor": new ElementInfo("contributors", Cc[PERSON_CONTRACTID], null, true), "atom03:link": new ElementInfo("links", null, null, true), "atom03:entry": new ElementInfo("atom03_entries", Cc[ENTRY_CONTRACTID], null, true) } }}// See startElement for a long description of how feeds are processed.FeedProcessor.prototype = { // Set ourselves as the SAX handler, and set the base URI _init: function FP_init(uri) { this._reader.contentHandler = this; this._reader.errorHandler = this; this._result = Cc[FR_CONTRACTID].createInstance(Ci.nsIFeedResult); if (uri) { this._result.uri = uri; this._reader.baseURI = uri; this._xmlBaseStack[0] = uri; } }, // This function is called once we figure out what type of feed // we're dealing with. Some feed types require digging a bit further // than the root. _docVerified: function FP_docVerified(version) { this._result.doc = Cc[FEED_CONTRACTID].createInstance(Ci.nsIFeed); this._result.doc.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1]; this._result.doc.fields = this._feed; this._result.version = version; }, // When we're done with the feed, let the listener know what // happened. _sendResult: function FP_sendResult() { try { // Can be null when a non-feed is fed to us if (this._result.doc) this._result.doc.normalize(); } catch (e) { LOG("FIXME: " + e); } try { if (this.listener != null) this.listener.handleResult(this._result); } finally { this._result = null; this._reader = null; } }, // Parsing functions parseFromStream: function FP_parseFromStream(stream, uri) { this._init(uri); this._reader.parseFromStream(stream, null, stream.available(), "application/xml"); this._reader = null; }, parseFromString: function FP_parseFromString(inputString, uri) { this._init(uri); this._reader.parseFromString(inputString, "application/xml"); this._reader = null; }, parseAsync: function FP_parseAsync(requestObserver, uri) { this._init(uri); this._reader.parseAsync(requestObserver); }, // nsIStreamListener // The XMLReader will throw sensible exceptions if these get called // out of order. onStartRequest: function FP_onStartRequest(request, context) { this._reader.onStartRequest(request, context); }, onStopRequest: function FP_onStopRequest(request, context, statusCode) { this._reader.onStopRequest(request, context, statusCode); }, onDataAvailable: function FP_onDataAvailable(request, context, inputStream, offset, count) { this._reader.onDataAvailable(request, context, inputStream, offset, count); }, // nsISAXErrorHandler // We only care about fatal errors. When this happens, we may have // parsed through the feed metadata and some number of entries. The // listener can still show some of that data if it wants, and we'll // set the bozo bit to indicate we were unable to parse all the way // through. fatalError: function FP_reportError() { this._result.bozo = true; //XXX need to QI to FeedProgressListener this._sendResult(); }, // nsISAXContentHandler startDocument: function FP_startDocument() { //LOG("----------"); }, endDocument: function FP_endDocument() { this._sendResult(); }, // The transitions defined above identify elements that contain more // than just text. For example RSS items contain many fields, and so // do Atom authors. The only commonly used elements that contain // mixed content are Atom Text Constructs of type="xhtml", which we // delegate to another handler for cleaning. That leaves a couple // different types of elements to deal with: those that should occur // only once, such as title elements, and those that can occur // multiple times, such as the RSS category element and the Atom // link element. Most of the RSS1/DC elements can occur multiple // times in theory, but in practice, the only ones that do have // analogues in Atom. // // Some elements are also groups of attributes or sub-elements, // while others are simple text fields. For the most part, we don't // have to pay explicit attention to the simple text elements, // unless we want to post-process the resulting string to transform // it into some richer object like a Date or URI. // // Elements that have more sophisticated content models still end up // being dictionaries, whether they are based on attributes like RSS // cloud, sub-elements like Atom author, or even items and // entries. These elements are treated as "containers". It's // theoretically possible for a container to have an attribute with // the same universal name as a sub-element, but none of the feed // formats allow this by default, and I don't of any extension that // works this way. // startElement: function FP_startElement(uri, localName, qName, attributes) { this._buf = ""; ++this._depth; var elementInfo; //LOG("<" + localName + ">"); // Check for xml:base var base = attributes.getValueFromName(XMLNS, "base"); if (base) { this._xmlBaseStack[this._depth] = strToURI(base, this._xmlBaseStack[this._xmlBaseStack.length - 1]); } // To identify the element we're dealing with, we look up the // namespace URI in our gNamespaces dictionary, which will give us // a "canonical" prefix for a namespace URI. For example, this // allows Dublin Core "creator" elements to be consistently mapped // to "dc:creator", for easy field access by consumer code. This // strategy also happens to shorten up our state table. var key = this._prefixForNS(uri) + localName; // Check to see if we need to hand this off to our XHTML handler. // The elements we're dealing with will look like this: // // <title type="xhtml"> // <div xmlns="http://www.w3.org/1999/xhtml"> // A title with <b>bold</b> and <i>italics</i>. // </div> // </title> // // When it returns in returnFromXHTMLHandler, the handler should // give us back a string like this: // // "A title with <b>bold</b> and <i>italics</i>." // // The Atom spec explicitly says the div is not part of the content, // and explicitly allows whitespace collapsing. // if ((this._result.version == "atom" || this._result.version == "atom03") && this._textConstructs[key] != null) { var type = attributes.getValueFromName("","type"); if (type != null && type.indexOf("xhtml") >= 0) { this._xhtmlHandler = new XHTMLHandler(this, (this._result.version == "atom")); this._reader.contentHandler = this._xhtmlHandler; return; } } // Check our current state, and see if that state has a defined // transition. For example, this._trans["atom:entry"]["atom:author"] // will have one, and it tells us to add an item to our authors array. if (this._trans[this._state] && this._trans[this._state][key]) { elementInfo = this._trans[this._state][key]; } else { // If we don't have a transition, hand off to extension handler this._extensionHandler = new ExtensionHandler(this); this._reader.contentHandler = this._extensionHandler; this._extensionHandler.startElement(uri, localName, qName, attributes); return; } // This distinguishes wrappers like 'channel' from elements // we'd actually like to do something with (which will test true). this._handlerStack[this._depth] = elementInfo; if (elementInfo.isWrapper) { this._state = "IN_" + elementInfo.fieldName.toUpperCase(); this._stack.push([this._feed, this._state]); } else if (elementInfo.feedVersion) { this._state = "IN_" + elementInfo.fieldName.toUpperCase(); // Check for the older RSS2 variants if (elementInfo.feedVersion == "rss2") elementInfo.feedVersion = this._findRSSVersion(attributes); else if (uri == RSS090NS) elementInfo.feedVersion = "rss090"; this._docVerified(elementInfo.feedVersion); this._stack.push([this._feed, this._state]); this._mapAttributes(this._feed, attributes); } else { this._state = this._processComplexElement(elementInfo, attributes); } }, // In the endElement handler, we decrement the stack and look // for cleanup/transition functions to execute. The second part // of the state transition works as above in startElement, but // the state we're looking for is prefixed with an underscore // to distinguish endElement events from startElement events. endElement: function FP_endElement(uri, localName, qName) { var elementInfo = this._handlerStack[this._depth]; //LOG("</" + localName + ">"); if (elementInfo && !elementInfo.isWrapper) this._closeComplexElement(elementInfo); // cut down xml:base context if (this._xmlBaseStack.length == this._depth + 1) this._xmlBaseStack = this._xmlBaseStack.slice(0, this._depth); // our new state is whatever is at the top of the stack now if (this._stack.length > 0) this._state = this._stack[this._stack.length - 1][1]; this._handlerStack = this._handlerStack.slice(0, this._depth); --this._depth; }, // Buffer up character data. The buffer is cleared with every // opening element. characters: function FP_characters(data) { this._buf += data;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -