[Pkg-javascript-commits] [node-htmlparser2_new] 01/06: Import Upstream version 3.9.2
Paolo Greppi
paolog-guest at moszumanska.debian.org
Sat Dec 24 01:12:20 UTC 2016
This is an automated email from the git hooks/post-receive script.
paolog-guest pushed a commit to branch master
in repository node-htmlparser2_new.
commit 3d60433fc7f9bf55e368e88b1bdab4dd2ac2f205
Author: Paolo Greppi <paolo.greppi at libpf.com>
Date: Sat Dec 24 00:57:07 2016 +0000
Import Upstream version 3.9.2
---
.eslintrc | 94 ++
.gitattributes | 2 +
.gitignore | 0
.travis.yml | 16 +
LICENSE | 18 +
README.md | 91 ++
lib/CollectingHandler.js | 55 +
lib/FeedHandler.js | 95 ++
lib/Parser.js | 353 ++++++
lib/ProxyHandler.js | 27 +
lib/Stream.js | 35 +
lib/Tokenizer.js | 906 +++++++++++++++
lib/WritableStream.js | 25 +
lib/index.js | 68 ++
package.json | 56 +
test/01-events.js | 9 +
test/02-stream.js | 23 +
test/03-feed.js | 19 +
test/Documents/Atom_Example.xml | 25 +
test/Documents/Attributes.html | 16 +
test/Documents/Basic.html | 1 +
test/Documents/RDF_Example.xml | 63 +
test/Documents/RSS_Example.xml | 48 +
test/Events/01-simple.json | 44 +
test/Events/02-template.json | 63 +
test/Events/03-lowercase_tags.json | 46 +
test/Events/04-cdata.json | 50 +
test/Events/05-cdata-special.json | 35 +
test/Events/06-leading-lt.json | 16 +
test/Events/07-self-closing.json | 67 ++
test/Events/08-implicit-close-tags.json | 71 ++
test/Events/09-attributes.json | 68 ++
test/Events/10-crazy-attrib.json | 52 +
test/Events/11-script_in_script.json | 54 +
test/Events/12-long-comment-end.json | 20 +
test/Events/13-long-cdata-end.json | 22 +
test/Events/14-implicit-open-tags.json | 27 +
test/Events/15-lt-whitespace.json | 16 +
test/Events/16-double_attribs.json | 45 +
test/Events/17-numeric_entities.json | 16 +
test/Events/18-legacy_entities.json | 16 +
test/Events/19-named_entities.json | 16 +
test/Events/20-xml_entities.json | 16 +
test/Events/21-entity_in_attribute.json | 38 +
test/Events/22-double_brackets.json | 41 +
test/Events/23-legacy_entity_fail.json | 16 +
test/Events/24-special_special.json | 133 +++
test/Events/25-empty_tag_name.json | 13 +
test/Events/26-not-quite-closed.json | 35 +
test/Events/27-entities_in_attributes.json | 62 +
test/Events/28-cdata_in_html.json | 9 +
test/Events/29-comment_edge-cases.json | 18 +
test/Events/30-cdata_edge-cases.json | 22 +
test/Events/31-comment_false-ending.json | 9 +
test/Events/32-script-ending-with-lessthan.json | 35 +
test/Feeds/01-rss.js | 34 +
test/Feeds/02-atom.js | 18 +
test/Feeds/03-rdf.js | 20 +
test/Stream/01-basic.json | 83 ++
test/Stream/02-RSS.json | 1093 ++++++++++++++++++
test/Stream/03-Atom.json | 678 +++++++++++
test/Stream/04-RDF.json | 1399 +++++++++++++++++++++++
test/Stream/05-Attributes.json | 354 ++++++
test/api.js | 103 ++
test/test-helper.js | 83 ++
test/unicode.js | 21 +
66 files changed, 7122 insertions(+)
diff --git a/.eslintrc b/.eslintrc
new file mode 100644
index 0000000..24d1a9e
--- /dev/null
+++ b/.eslintrc
@@ -0,0 +1,94 @@
+{
+ "extends": "eslint:recommended",
+ "env": {
+ "node": true
+ },
+ "globals": {
+ "describe": true,
+ "it": true
+ },
+ "rules": {
+ "eqeqeq": 2,
+ "no-extend-native": 2,
+ "no-use-before-define": [
+ 2,
+ {
+ "functions": false,
+ "classes": false
+ }
+ ],
+ "no-caller": 2,
+ "no-irregular-whitespace": 2,
+ "quotes": [
+ 2,
+ "double"
+ ],
+ "no-undef": 2,
+ "no-unused-vars": 2,
+ "no-eq-null": 2,
+ "no-proto": 2,
+ "curly": [
+ 2,
+ "multi-line"
+ ],
+ "no-mixed-spaces-and-tabs": [
+ 2,
+ "smart-tabs"
+ ],
+ "space-infix-ops": 2,
+ "keyword-spacing": [
+ 2,
+ {
+ "overrides": {
+ "if": {
+ "after": false
+ },
+ "catch": {
+ "after": false
+ },
+ "for": {
+ "after": false
+ },
+ "while": {
+ "after": false
+ }
+ }
+ }
+ ],
+ "new-cap": 2,
+ "comma-style": [
+ 2,
+ "last"
+ ],
+ "dot-notation": 2,
+ "wrap-iife": 2,
+ "no-empty": 2,
+ "space-unary-ops": [
+ 2,
+ {
+ "words": false,
+ "nonwords": false
+ }
+ ],
+ "no-with": 2,
+ "no-multi-str": 2,
+ "no-trailing-spaces": 2,
+ "indent": [
+ 2,
+ "tab",
+ {
+ "SwitchCase": 1,
+ "VariableDeclarator": 0
+ }
+ ],
+ "linebreak-style": [
+ 2,
+ "unix"
+ ],
+ "consistent-this": [
+ 2,
+ "_this"
+ ],
+ "no-extra-semi": 0 // https://github.com/eslint/eslint/issues/6386
+ }
+}
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..4bb50dc
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text eol=lf
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..15628ee
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,16 @@
+language: node_js
+node_js:
+ - stable
+ - unstable
+ - 5.1
+ - 4.2
+ - 0.12
+
+sudo: false
+
+matrix:
+ fast_finish: true
+ allow_failures:
+ - node_js: unstable
+
+script: npm run coveralls
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0a35e02
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,18 @@
+Copyright 2010, 2011, Chris Winberry <chris at winberry.net>. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7590a6a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,91 @@
+# htmlparser2
+
+[![NPM version](http://img.shields.io/npm/v/htmlparser2.svg?style=flat)](https://npmjs.org/package/htmlparser2)
+[![Downloads](https://img.shields.io/npm/dm/htmlparser2.svg?style=flat)](https://npmjs.org/package/htmlparser2)
+[![Build Status](http://img.shields.io/travis/fb55/htmlparser2/master.svg?style=flat)](http://travis-ci.org/fb55/htmlparser2)
+[![Coverage](http://img.shields.io/coveralls/fb55/htmlparser2.svg?style=flat)](https://coveralls.io/r/fb55/htmlparser2)
+
+A forgiving HTML/XML/RSS parser. The parser can handle streams and provides a callback interface.
+
+## Installation
+ npm install htmlparser2
+
+A live demo of htmlparser2 is available [here](http://demos.forbeslindesay.co.uk/htmlparser2/).
+
+## Usage
+
+```javascript
+var htmlparser = require("htmlparser2");
+var parser = new htmlparser.Parser({
+ onopentag: function(name, attribs){
+ if(name === "script" && attribs.type === "text/javascript"){
+ console.log("JS! Hooray!");
+ }
+ },
+ ontext: function(text){
+ console.log("-->", text);
+ },
+ onclosetag: function(tagname){
+ if(tagname === "script"){
+ console.log("That's it?!");
+ }
+ }
+}, {decodeEntities: true});
+parser.write("Xyz <script type='text/javascript'>var foo = '<<bar>>';</ script>");
+parser.end();
+```
+
+Output (simplified):
+
+```
+--> Xyz
+JS! Hooray!
+--> var foo = '<<bar>>';
+That's it?!
+```
+
+## Documentation
+
+Read more about the parser and its options in the [wiki](https://github.com/fb55/htmlparser2/wiki/Parser-options).
+
+## Get a DOM
+The `DomHandler` (known as `DefaultHandler` in the original `htmlparser` module) produces a DOM (document object model) that can be manipulated using the [`DomUtils`](https://github.com/fb55/DomUtils) helper.
+
+The `DomHandler`, while still bundled with this module, was moved to its [own module](https://github.com/fb55/domhandler). Have a look at it for further information.
+
+## Parsing RSS/RDF/Atom Feeds
+
+```javascript
+new htmlparser.FeedHandler(function(<error> error, <object> feed){
+ ...
+});
+```
+
+Note: While the provided feed handler works for most feeds, you might want to use [danmactough/node-feedparser](https://github.com/danmactough/node-feedparser), which is much better tested and actively maintained.
+
+## Performance
+
+After having some artificial benchmarks for some time, __@AndreasMadsen__ published his [`htmlparser-benchmark`](https://github.com/AndreasMadsen/htmlparser-benchmark), which benchmarks HTML parsers based on real-world websites.
+
+At the time of writing, the latest versions of all supported parsers show the following performance characteristics on [Travis CI](https://travis-ci.org/AndreasMadsen/htmlparser-benchmark/builds/10805007) (please note that Travis doesn't guarantee equal conditions for all tests):
+
+```
+gumbo-parser : 34.9208 ms/file ± 21.4238
+html-parser : 24.8224 ms/file ± 15.8703
+html5 : 419.597 ms/file ± 264.265
+htmlparser : 60.0722 ms/file ± 384.844
+htmlparser2-dom: 12.0749 ms/file ± 6.49474
+htmlparser2 : 7.49130 ms/file ± 5.74368
+hubbub : 30.4980 ms/file ± 16.4682
+libxmljs : 14.1338 ms/file ± 18.6541
+parse5 : 22.0439 ms/file ± 15.3743
+sax : 49.6513 ms/file ± 26.6032
+```
+
+## How does this module differ from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)?
+
+This is a fork of the `htmlparser` module. The main difference is that this is intended to be used only with node (it runs on other platforms using [browserify](https://github.com/substack/node-browserify)). `htmlparser2` was rewritten multiple times and, while it maintains an API that's compatible with `htmlparser` in most cases, the projects don't share any code anymore.
+
+The parser now provides a callback interface close to [sax.js](https://github.com/isaacs/sax-js) (originally targeted at [readabilitySAX](https://github.com/fb55/readabilitysax)). As a result, old handlers won't work anymore.
+
+The `DefaultHandler` and the `RssHandler` were renamed to clarify their purpose (to `DomHandler` and `FeedHandler`). The old names are still available when requiring `htmlparser2`, so your code should work as expected.
diff --git a/lib/CollectingHandler.js b/lib/CollectingHandler.js
new file mode 100644
index 0000000..93d97a7
--- /dev/null
+++ b/lib/CollectingHandler.js
@@ -0,0 +1,55 @@
+module.exports = CollectingHandler;
+
+function CollectingHandler(cbs){
+ this._cbs = cbs || {};
+ this.events = [];
+}
+
+//Build one "on<name>" prototype method for every key of EVENTS. The value
+//stored under each key selects how many arguments (0, 1 or 2) the generated
+//method records in `this.events` and forwards to the same-named callback on
+//`this._cbs`, when that callback exists.
+var EVENTS = require("./").EVENTS;
+Object.keys(EVENTS).forEach(function(name){
+ if(EVENTS[name] === 0){
+ //the parameter is reassigned so the closure below sees the prefixed name
+ name = "on" + name;
+ CollectingHandler.prototype[name] = function(){
+ this.events.push([name]);
+ if(this._cbs[name]) this._cbs[name]();
+ };
+ } else if(EVENTS[name] === 1){
+ name = "on" + name;
+ CollectingHandler.prototype[name] = function(a){
+ this.events.push([name, a]);
+ if(this._cbs[name]) this._cbs[name](a);
+ };
+ } else if(EVENTS[name] === 2){
+ name = "on" + name;
+ CollectingHandler.prototype[name] = function(a, b){
+ this.events.push([name, a, b]);
+ if(this._cbs[name]) this._cbs[name](a, b);
+ };
+ } else {
+ //any value other than 0, 1 or 2 is rejected while the module is loading
+ throw Error("wrong number of arguments");
+ }
+});
+
+CollectingHandler.prototype.onreset = function(){
+ this.events = [];
+ if(this._cbs.onreset) this._cbs.onreset();
+};
+
+CollectingHandler.prototype.restart = function(){
+ if(this._cbs.onreset) this._cbs.onreset();
+
+ for(var i = 0, len = this.events.length; i < len; i++){
+ if(this._cbs[this.events[i][0]]){
+
+ var num = this.events[i].length;
+
+ if(num === 1){
+ this._cbs[this.events[i][0]]();
+ } else if(num === 2){
+ this._cbs[this.events[i][0]](this.events[i][1]);
+ } else {
+ this._cbs[this.events[i][0]](this.events[i][1], this.events[i][2]);
+ }
+ }
+ }
+};
diff --git a/lib/FeedHandler.js b/lib/FeedHandler.js
new file mode 100644
index 0000000..329c94a
--- /dev/null
+++ b/lib/FeedHandler.js
@@ -0,0 +1,95 @@
+var index = require("./index.js"),
+ DomHandler = index.DomHandler,
+ DomUtils = index.DomUtils;
+
+//TODO: make this a streamable handler
+//Handler that converts a parsed feed document into a plain feed object once
+//parsing has ended (see FeedHandler.prototype.onend).
+//The constructor only delegates to `init`, which is the DomHandler constructor
+//assigned to the prototype below, so `callback` and `options` are passed to
+//DomHandler unchanged.
+function FeedHandler(callback, options){
+ this.init(callback, options);
+}
+
+require("inherits")(FeedHandler, DomHandler);
+
+//calling the DomHandler constructor as a method initializes `this`
+FeedHandler.prototype.init = DomHandler;
+
+function getElements(what, where){
+ return DomUtils.getElementsByTagName(what, where, true);
+}
+function getOneElement(what, where){
+ return DomUtils.getElementsByTagName(what, where, true, 1)[0];
+}
+function fetch(what, where, recurse){
+ return DomUtils.getText(
+ DomUtils.getElementsByTagName(what, where, recurse, 1)
+ ).trim();
+}
+
+function addConditionally(obj, prop, what, where, recurse){
+ var tmp = fetch(what, where, recurse);
+ if(tmp) obj[prop] = tmp;
+}
+
+var isValidFeed = function(value){
+ return value === "rss" || value === "feed" || value === "rdf:RDF";
+};
+
+FeedHandler.prototype.onend = function(){
+ var feed = {},
+ feedRoot = getOneElement(isValidFeed, this.dom),
+ tmp, childs;
+
+ if(feedRoot){
+ if(feedRoot.name === "feed"){
+ childs = feedRoot.children;
+
+ feed.type = "atom";
+ addConditionally(feed, "id", "id", childs);
+ addConditionally(feed, "title", "title", childs);
+ if((tmp = getOneElement("link", childs)) && (tmp = tmp.attribs) && (tmp = tmp.href)) feed.link = tmp;
+ addConditionally(feed, "description", "subtitle", childs);
+ if((tmp = fetch("updated", childs))) feed.updated = new Date(tmp);
+ addConditionally(feed, "author", "email", childs, true);
+
+ feed.items = getElements("entry", childs).map(function(item){
+ var entry = {}, tmp;
+
+ item = item.children;
+
+ addConditionally(entry, "id", "id", item);
+ addConditionally(entry, "title", "title", item);
+ if((tmp = getOneElement("link", item)) && (tmp = tmp.attribs) && (tmp = tmp.href)) entry.link = tmp;
+ if((tmp = fetch("summary", item) || fetch("content", item))) entry.description = tmp;
+ if((tmp = fetch("updated", item))) entry.pubDate = new Date(tmp);
+ return entry;
+ });
+ } else {
+ childs = getOneElement("channel", feedRoot.children).children;
+
+ feed.type = feedRoot.name.substr(0, 3);
+ feed.id = "";
+ addConditionally(feed, "title", "title", childs);
+ addConditionally(feed, "link", "link", childs);
+ addConditionally(feed, "description", "description", childs);
+ if((tmp = fetch("lastBuildDate", childs))) feed.updated = new Date(tmp);
+ addConditionally(feed, "author", "managingEditor", childs, true);
+
+ feed.items = getElements("item", feedRoot.children).map(function(item){
+ var entry = {}, tmp;
+
+ item = item.children;
+
+ addConditionally(entry, "id", "guid", item);
+ addConditionally(entry, "title", "title", item);
+ addConditionally(entry, "link", "link", item);
+ addConditionally(entry, "description", "description", item);
+ if((tmp = fetch("pubDate", item))) entry.pubDate = new Date(tmp);
+ return entry;
+ });
+ }
+ }
+ this.dom = feed;
+ DomHandler.prototype._handleCallback.call(
+ this, feedRoot ? null : Error("couldn't find root of feed")
+ );
+};
+
+module.exports = FeedHandler;
diff --git a/lib/Parser.js b/lib/Parser.js
new file mode 100644
index 0000000..12db240
--- /dev/null
+++ b/lib/Parser.js
@@ -0,0 +1,353 @@
+var Tokenizer = require("./Tokenizer.js");
+
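+/*
+ Options:
+
+ xmlMode: Disables the special behavior for script/style tags (false by default)
+ lowerCaseAttributeNames: call .toLowerCase for each attribute name (true if xmlMode is `false`)
+ lowerCaseTags: call .toLowerCase for each tag name (true if xmlMode is `false`)
+ recognizeCDATA: report CDATA sections through oncdatastart/ontext/oncdataend even if xmlMode is `false` (they are reported as comments otherwise)
+ recognizeSelfClosing: close a tag written as `<tag/>` immediately even if xmlMode is `false`
+ Tokenizer: constructor to use instead of the bundled Tokenizer
+*/
+
+/*
+ Callbacks:
+
+ onattribute,
+ oncdataend,
+ oncdatastart,
+ onclosetag,
+ oncomment,
+ oncommentend,
+ onend,
+ onerror,
+ onopentag,
+ onopentagname,
+ onparserinit,
+ onprocessinginstruction,
+ onreset,
+ ontext
+*/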
+
+var formTags = {
+ input: true,
+ option: true,
+ optgroup: true,
+ select: true,
+ button: true,
+ datalist: true,
+ textarea: true
+};
+
+var openImpliesClose = {
+ tr : { tr:true, th:true, td:true },
+ th : { th:true },
+ td : { thead:true, th:true, td:true },
+ body : { head:true, link:true, script:true },
+ li : { li:true },
+ p : { p:true },
+ h1 : { p:true },
+ h2 : { p:true },
+ h3 : { p:true },
+ h4 : { p:true },
+ h5 : { p:true },
+ h6 : { p:true },
+ select : formTags,
+ input : formTags,
+ output : formTags,
+ button : formTags,
+ datalist: formTags,
+ textarea: formTags,
+ option : { option:true },
+ optgroup: { optgroup:true }
+};
+
+//Elements that are not pushed on the open-element stack outside xmlMode; their
+//close event is emitted right after the open tag (see onopentagend).
+//`__proto__: null` removes inherited keys, so `name in voidElements` only
+//matches the names listed here.
+var voidElements = {
+ __proto__: null,
+ area: true,
+ base: true,
+ basefont: true,
+ br: true,
+ col: true,
+ command: true,
+ embed: true,
+ frame: true,
+ hr: true,
+ img: true,
+ input: true,
+ isindex: true,
+ keygen: true,
+ link: true,
+ meta: true,
+ param: true,
+ source: true,
+ track: true,
+ wbr: true,
+
+ //common self closing svg elements
+ path: true,
+ circle: true,
+ ellipse: true,
+ line: true,
+ rect: true,
+ use: true,
+ stop: true,
+ polyline: true,
+ polygon: true
+};
+
+//matches the first whitespace character or slash; used by _getInstructionName
+//to cut the name off a declaration or processing instruction
+var re_nameEnd = /\s|\//;
+
+function Parser(cbs, options){
+ this._options = options || {};
+ this._cbs = cbs || {};
+
+ this._tagname = "";
+ this._attribname = "";
+ this._attribvalue = "";
+ this._attribs = null;
+ this._stack = [];
+
+ this.startIndex = 0;
+ this.endIndex = null;
+
+ this._lowerCaseTagNames = "lowerCaseTags" in this._options ?
+ !!this._options.lowerCaseTags :
+ !this._options.xmlMode;
+ this._lowerCaseAttributeNames = "lowerCaseAttributeNames" in this._options ?
+ !!this._options.lowerCaseAttributeNames :
+ !this._options.xmlMode;
+
+ if(this._options.Tokenizer) {
+ Tokenizer = this._options.Tokenizer;
+ }
+ this._tokenizer = new Tokenizer(this._options, this);
+
+ if(this._cbs.onparserinit) this._cbs.onparserinit(this);
+}
+
+require("inherits")(Parser, require("events").EventEmitter);
+
+//Updates startIndex/endIndex for the token that is about to be reported.
+//  initialOffset: subtracted from the tokenizer's _sectionStart to get the
+//                 start of the very first token (clamped to 0)
+//endIndex is always set to the tokenizer's current absolute index.
+Parser.prototype._updatePosition = function(initialOffset){
+ //endIndex is still null as long as no token has been reported
+ if(this.endIndex === null){
+ if(this._tokenizer._sectionStart <= initialOffset){
+ this.startIndex = 0;
+ } else {
+ this.startIndex = this._tokenizer._sectionStart - initialOffset;
+ }
+ }
+ //every later token starts right after the previous one ended
+ else this.startIndex = this.endIndex + 1;
+ this.endIndex = this._tokenizer.getAbsoluteIndex();
+};
+
+//Tokenizer event handlers
+
+//Updates the position and forwards text data to the ontext callback
+Parser.prototype.ontext = function(data){
+ this._updatePosition(1);
+ //NOTE(review): endIndex is pulled back by one here, presumably because the
+ //tokenizer index already points past the text -- confirm against Tokenizer
+ this.endIndex--;
+
+ if(this._cbs.ontext) this._cbs.ontext(data);
+};
+
+Parser.prototype.onopentagname = function(name){
+ if(this._lowerCaseTagNames){
+ name = name.toLowerCase();
+ }
+
+ this._tagname = name;
+
+ if(!this._options.xmlMode && name in openImpliesClose) {
+ for(
+ var el;
+ (el = this._stack[this._stack.length - 1]) in openImpliesClose[name];
+ this.onclosetag(el)
+ );
+ }
+
+ if(this._options.xmlMode || !(name in voidElements)){
+ this._stack.push(name);
+ }
+
+ if(this._cbs.onopentagname) this._cbs.onopentagname(name);
+ if(this._cbs.onopentag) this._attribs = {};
+};
+
+//Called when an opening tag is complete: reports the tag with the attributes
+//collected so far and, outside xmlMode, immediately reports the close of a
+//void element.
+Parser.prototype.onopentagend = function(){
+ this._updatePosition(1);
+
+ //_attribs is only set while an onopentag callback is registered (see onopentagname)
+ if(this._attribs){
+ if(this._cbs.onopentag) this._cbs.onopentag(this._tagname, this._attribs);
+ this._attribs = null;
+ }
+
+ //void elements were never pushed on the stack, so their close event is emitted here
+ if(!this._options.xmlMode && this._cbs.onclosetag && this._tagname in voidElements){
+ this._cbs.onclosetag(this._tagname);
+ }
+
+ this._tagname = "";
+};
+
+//Handles a closing tag. When the name is found on the stack, every element
+//above it is closed as well; closing tags without a matching open element are
+//ignored, except for </p> and </br> outside xmlMode, which produce an empty
+//element.
+Parser.prototype.onclosetag = function(name){
+ this._updatePosition(1);
+
+ if(this._lowerCaseTagNames){
+ name = name.toLowerCase();
+ }
+
+ //void element names are only looked up on the stack in xmlMode
+ if(this._stack.length && (!(name in voidElements) || this._options.xmlMode)){
+ var pos = this._stack.lastIndexOf(name);
+ if(pos !== -1){
+ if(this._cbs.onclosetag){
+ //pos becomes the number of elements to pop: the match and everything above it
+ pos = this._stack.length - pos;
+ while(pos--) this._cbs.onclosetag(this._stack.pop());
+ }
+ //without a callback the stack is simply truncated at the match
+ else this._stack.length = pos;
+ } else if(name === "p" && !this._options.xmlMode){
+ //</p> without an open <p>: report an empty <p></p>
+ this.onopentagname(name);
+ this._closeCurrentTag();
+ }
+ } else if(!this._options.xmlMode && (name === "br" || name === "p")){
+ //reached with an empty stack or a void element name: </br> and </p> are
+ //reported as an opened and immediately closed element
+ this.onopentagname(name);
+ this._closeCurrentTag();
+ }
+};
+
+Parser.prototype.onselfclosingtag = function(){
+ if(this._options.xmlMode || this._options.recognizeSelfClosing){
+ this._closeCurrentTag();
+ } else {
+ this.onopentagend();
+ }
+};
+
+//Finishes the tag that is currently being opened and closes it immediately
+Parser.prototype._closeCurrentTag = function(){
+ //remember the name first: onopentagend() clears this._tagname
+ var name = this._tagname;
+
+ this.onopentagend();
+
+ //self-closing tags will be on the top of the stack
+ //(cheaper check than in onclosetag)
+ if(this._stack[this._stack.length - 1] === name){
+ if(this._cbs.onclosetag){
+ this._cbs.onclosetag(name);
+ }
+ this._stack.pop();
+ }
+};
+
+Parser.prototype.onattribname = function(name){
+ if(this._lowerCaseAttributeNames){
+ name = name.toLowerCase();
+ }
+ this._attribname = name;
+};
+
+Parser.prototype.onattribdata = function(value){
+ this._attribvalue += value;
+};
+
+//Called when an attribute is complete: reports it through onattribute, stores
+//it in the attribute map of the current tag and clears the name/value buffers.
+Parser.prototype.onattribend = function(){
+ if(this._cbs.onattribute) this._cbs.onattribute(this._attribname, this._attribvalue);
+ //the first occurrence of an attribute name wins; later duplicates are not stored
+ if(
+ this._attribs &&
+ !Object.prototype.hasOwnProperty.call(this._attribs, this._attribname)
+ ){
+ this._attribs[this._attribname] = this._attribvalue;
+ }
+ this._attribname = "";
+ this._attribvalue = "";
+};
+
+Parser.prototype._getInstructionName = function(value){
+ var idx = value.search(re_nameEnd),
+ name = idx < 0 ? value : value.substr(0, idx);
+
+ if(this._lowerCaseTagNames){
+ name = name.toLowerCase();
+ }
+
+ return name;
+};
+
+Parser.prototype.ondeclaration = function(value){
+ if(this._cbs.onprocessinginstruction){
+ var name = this._getInstructionName(value);
+ this._cbs.onprocessinginstruction("!" + name, "!" + value);
+ }
+};
+
+Parser.prototype.onprocessinginstruction = function(value){
+ if(this._cbs.onprocessinginstruction){
+ var name = this._getInstructionName(value);
+ this._cbs.onprocessinginstruction("?" + name, "?" + value);
+ }
+};
+
+//Updates the position, then reports a comment followed by the end of that comment
+Parser.prototype.oncomment = function(value){
+ //comments use an initial offset of 4 instead of the 1 used by the other handlers
+ this._updatePosition(4);
+
+ if(this._cbs.oncomment) this._cbs.oncomment(value);
+ if(this._cbs.oncommentend) this._cbs.oncommentend();
+};
+
+Parser.prototype.oncdata = function(value){
+ this._updatePosition(1);
+
+ if(this._options.xmlMode || this._options.recognizeCDATA){
+ if(this._cbs.oncdatastart) this._cbs.oncdatastart();
+ if(this._cbs.ontext) this._cbs.ontext(value);
+ if(this._cbs.oncdataend) this._cbs.oncdataend();
+ } else {
+ this.oncomment("[CDATA[" + value + "]]");
+ }
+};
+
+Parser.prototype.onerror = function(err){
+ if(this._cbs.onerror) this._cbs.onerror(err);
+};
+
+Parser.prototype.onend = function(){
+ if(this._cbs.onclosetag){
+ for(
+ var i = this._stack.length;
+ i > 0;
+ this._cbs.onclosetag(this._stack[--i])
+ );
+ }
+ if(this._cbs.onend) this._cbs.onend();
+};
+
+
+//Resets the parser to a blank state, ready to parse a new HTML document
+Parser.prototype.reset = function(){
+ if(this._cbs.onreset) this._cbs.onreset();
+ this._tokenizer.reset();
+
+ this._tagname = "";
+ this._attribname = "";
+ this._attribs = null;
+ this._stack = [];
+
+ if(this._cbs.onparserinit) this._cbs.onparserinit(this);
+};
+
+//Parses a complete HTML document and pushes it to the handler
+//(resets the parser first, then passes `data` to end() as the final chunk)
+Parser.prototype.parseComplete = function(data){
+ this.reset();
+ this.end(data);
+};
+
+//The four methods below delegate directly to the tokenizer method of the same name.
+
+//Passes a chunk of input to the tokenizer
+Parser.prototype.write = function(chunk){
+ this._tokenizer.write(chunk);
+};
+
+//Ends the input; `chunk` is handed to the tokenizer's end() unchanged
+Parser.prototype.end = function(chunk){
+ this._tokenizer.end(chunk);
+};
+
+Parser.prototype.pause = function(){
+ this._tokenizer.pause();
+};
+
+Parser.prototype.resume = function(){
+ this._tokenizer.resume();
+};
+
+//alias for backwards compat
+Parser.prototype.parseChunk = Parser.prototype.write;
+Parser.prototype.done = Parser.prototype.end;
+
+module.exports = Parser;
diff --git a/lib/ProxyHandler.js b/lib/ProxyHandler.js
new file mode 100644
index 0000000..858e975
--- /dev/null
+++ b/lib/ProxyHandler.js
@@ -0,0 +1,27 @@
+module.exports = ProxyHandler;
+
+function ProxyHandler(cbs){
+ this._cbs = cbs || {};
+}
+
+//Build one "on<name>" prototype method for every key of EVENTS. The value
+//stored under each key selects how many arguments (0, 1 or 2) the generated
+//method forwards to the same-named callback on `this._cbs`, when it exists.
+var EVENTS = require("./").EVENTS;
+Object.keys(EVENTS).forEach(function(name){
+ if(EVENTS[name] === 0){
+ //the parameter is reassigned so the closure below sees the prefixed name
+ name = "on" + name;
+ ProxyHandler.prototype[name] = function(){
+ if(this._cbs[name]) this._cbs[name]();
+ };
+ } else if(EVENTS[name] === 1){
+ name = "on" + name;
+ ProxyHandler.prototype[name] = function(a){
+ if(this._cbs[name]) this._cbs[name](a);
+ };
+ } else if(EVENTS[name] === 2){
+ name = "on" + name;
+ ProxyHandler.prototype[name] = function(a, b){
+ if(this._cbs[name]) this._cbs[name](a, b);
+ };
+ } else {
+ //any value other than 0, 1 or 2 is rejected while the module is loading
+ throw Error("wrong number of arguments");
+ }
+});
\ No newline at end of file
diff --git a/lib/Stream.js b/lib/Stream.js
new file mode 100644
index 0000000..0ac49a6
--- /dev/null
+++ b/lib/Stream.js
@@ -0,0 +1,35 @@
+module.exports = Stream;
+
+var Parser = require("./WritableStream.js");
+
+//Stream whose parser callbacks are a Cbs object bound to the stream itself.
+//Note that `Parser` in this file is the export of ./WritableStream.js, which
+//is also the constructor Stream inherits from.
+function Stream(options){
+ Parser.call(this, new Cbs(this), options);
+}
+
+require("inherits")(Stream, Parser);
+
+Stream.prototype.readable = true;
+
+//Callback object handed to the parser; `scope` is the object whose emit()
+//is called by the generated on* methods
+function Cbs(scope){
+ this.scope = scope;
+}
+
+var EVENTS = require("../").EVENTS;
+
+Object.keys(EVENTS).forEach(function(name){
+ if(EVENTS[name] === 0){
+ Cbs.prototype["on" + name] = function(){
+ this.scope.emit(name);
+ };
+ } else if(EVENTS[name] === 1){
+ Cbs.prototype["on" + name] = function(a){
+ this.scope.emit(name, a);
+ };
+ } else if(EVENTS[name] === 2){
+ Cbs.prototype["on" + name] = function(a, b){
+ this.scope.emit(name, a, b);
+ };
+ } else {
+ throw Error("wrong number of arguments!");
+ }
+});
\ No newline at end of file
diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js
new file mode 100644
index 0000000..d67427c
--- /dev/null
+++ b/lib/Tokenizer.js
@@ -0,0 +1,906 @@
+module.exports = Tokenizer;
+
+//Entity helpers and lookup tables, followed by the tokenizer's state
+//constants. Each state is a distinct integer taken from the counter `i`;
+//the SPECIAL_* values (counter `j`) record whether the tokenizer is
+//currently inside a <script> or <style> element.
+var decodeCodePoint = require("entities/lib/decode_codepoint.js"),
+ entityMap = require("entities/maps/entities.json"),
+ legacyMap = require("entities/maps/legacy.json"),
+ xmlMap = require("entities/maps/xml.json"),
+
+ i = 0,
+
+ TEXT = i++,
+ BEFORE_TAG_NAME = i++, //after <
+ IN_TAG_NAME = i++,
+ IN_SELF_CLOSING_TAG = i++,
+ BEFORE_CLOSING_TAG_NAME = i++,
+ IN_CLOSING_TAG_NAME = i++,
+ AFTER_CLOSING_TAG_NAME = i++,
+
+ //attributes
+ BEFORE_ATTRIBUTE_NAME = i++,
+ IN_ATTRIBUTE_NAME = i++,
+ AFTER_ATTRIBUTE_NAME = i++,
+ BEFORE_ATTRIBUTE_VALUE = i++,
+ IN_ATTRIBUTE_VALUE_DQ = i++, // "
+ IN_ATTRIBUTE_VALUE_SQ = i++, // '
+ IN_ATTRIBUTE_VALUE_NQ = i++,
+
+ //declarations
+ BEFORE_DECLARATION = i++, // !
+ IN_DECLARATION = i++,
+
+ //processing instructions
+ IN_PROCESSING_INSTRUCTION = i++, // ?
+
+ //comments
+ BEFORE_COMMENT = i++,
+ IN_COMMENT = i++,
+ AFTER_COMMENT_1 = i++,
+ AFTER_COMMENT_2 = i++,
+
+ //cdata
+ BEFORE_CDATA_1 = i++, // [
+ BEFORE_CDATA_2 = i++, // C
+ BEFORE_CDATA_3 = i++, // D
+ BEFORE_CDATA_4 = i++, // A
+ BEFORE_CDATA_5 = i++, // T
+ BEFORE_CDATA_6 = i++, // A
+ IN_CDATA = i++, // [
+ AFTER_CDATA_1 = i++, // ]
+ AFTER_CDATA_2 = i++, // ]
+
+ //special tags
+ BEFORE_SPECIAL = i++, //S
+ BEFORE_SPECIAL_END = i++, //S
+
+ BEFORE_SCRIPT_1 = i++, //C
+ BEFORE_SCRIPT_2 = i++, //R
+ BEFORE_SCRIPT_3 = i++, //I
+ BEFORE_SCRIPT_4 = i++, //P
+ BEFORE_SCRIPT_5 = i++, //T
+ AFTER_SCRIPT_1 = i++, //C
+ AFTER_SCRIPT_2 = i++, //R
+ AFTER_SCRIPT_3 = i++, //I
+ AFTER_SCRIPT_4 = i++, //P
+ AFTER_SCRIPT_5 = i++, //T
+
+ BEFORE_STYLE_1 = i++, //T
+ BEFORE_STYLE_2 = i++, //Y
+ BEFORE_STYLE_3 = i++, //L
+ BEFORE_STYLE_4 = i++, //E
+ AFTER_STYLE_1 = i++, //T
+ AFTER_STYLE_2 = i++, //Y
+ AFTER_STYLE_3 = i++, //L
+ AFTER_STYLE_4 = i++, //E
+
+ BEFORE_ENTITY = i++, //&
+ BEFORE_NUMERIC_ENTITY = i++, //#
+ IN_NAMED_ENTITY = i++,
+ IN_NUMERIC_ENTITY = i++,
+ IN_HEX_ENTITY = i++, //X
+
+ j = 0,
+
+ SPECIAL_NONE = j++,
+ SPECIAL_SCRIPT = j++,
+ SPECIAL_STYLE = j++;
+
+function whitespace(c){
+ return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r";
+}
+
+function characterState(char, SUCCESS){
+ return function(c){
+ if(c === char) this._state = SUCCESS;
+ };
+}
+
+function ifElseState(upper, SUCCESS, FAILURE){
+ var lower = upper.toLowerCase();
+
+ if(upper === lower){
+ return function(c){
+ if(c === lower){
+ this._state = SUCCESS;
+ } else {
+ this._state = FAILURE;
+ this._index--;
+ }
+ };
+ } else {
+ return function(c){
+ if(c === lower || c === upper){
+ this._state = SUCCESS;
+ } else {
+ this._state = FAILURE;
+ this._index--;
+ }
+ };
+ }
+}
+
+function consumeSpecialNameChar(upper, NEXT_STATE){
+ var lower = upper.toLowerCase();
+
+ return function(c){
+ if(c === lower || c === upper){
+ this._state = NEXT_STATE;
+ } else {
+ this._state = IN_TAG_NAME;
+ this._index--; //consume the token again
+ }
+ };
+}
+
+function Tokenizer(options, cbs){
+ this._state = TEXT;
+ this._buffer = "";
+ this._sectionStart = 0;
+ this._index = 0;
+ this._bufferOffset = 0; //chars removed from _buffer
+ this._baseState = TEXT;
+ this._special = SPECIAL_NONE;
+ this._cbs = cbs;
+ this._running = true;
+ this._ended = false;
+ this._xmlMode = !!(options && options.xmlMode);
+ this._decodeEntities = !!(options && options.decodeEntities);
+}
+
+Tokenizer.prototype._stateText = function(c){
+ if(c === "<"){
+ if(this._index > this._sectionStart){
+ this._cbs.ontext(this._getSection());
+ }
+ this._state = BEFORE_TAG_NAME;
+ this._sectionStart = this._index;
+ } else if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){
+ if(this._index > this._sectionStart){
+ this._cbs.ontext(this._getSection());
+ }
+ this._baseState = TEXT;
+ this._state = BEFORE_ENTITY;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateBeforeTagName = function(c){
+ if(c === "/"){
+ this._state = BEFORE_CLOSING_TAG_NAME;
+ } else if(c === "<"){
+ this._cbs.ontext(this._getSection());
+ this._sectionStart = this._index;
+ } else if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) {
+ this._state = TEXT;
+ } else if(c === "!"){
+ this._state = BEFORE_DECLARATION;
+ this._sectionStart = this._index + 1;
+ } else if(c === "?"){
+ this._state = IN_PROCESSING_INSTRUCTION;
+ this._sectionStart = this._index + 1;
+ } else {
+ this._state = (!this._xmlMode && (c === "s" || c === "S")) ?
+ BEFORE_SPECIAL : IN_TAG_NAME;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateInTagName = function(c){
+ if(c === "/" || c === ">" || whitespace(c)){
+ this._emitToken("onopentagname");
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
+ if(whitespace(c));
+ else if(c === ">"){
+ this._state = TEXT;
+ } else if(this._special !== SPECIAL_NONE){
+ if(c === "s" || c === "S"){
+ this._state = BEFORE_SPECIAL_END;
+ } else {
+ this._state = TEXT;
+ this._index--;
+ }
+ } else {
+ this._state = IN_CLOSING_TAG_NAME;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateInCloseingTagName = function(c){
+ if(c === ">" || whitespace(c)){
+ this._emitToken("onclosetag");
+ this._state = AFTER_CLOSING_TAG_NAME;
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._stateAfterCloseingTagName = function(c){
+ //skip everything until ">"
+ if(c === ">"){
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ }
+};
+
+Tokenizer.prototype._stateBeforeAttributeName = function(c){
+ if(c === ">"){
+ this._cbs.onopentagend();
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ } else if(c === "/"){
+ this._state = IN_SELF_CLOSING_TAG;
+ } else if(!whitespace(c)){
+ this._state = IN_ATTRIBUTE_NAME;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateInSelfClosingTag = function(c){
+ if(c === ">"){
+ this._cbs.onselfclosingtag();
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ } else if(!whitespace(c)){
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._stateInAttributeName = function(c){
+ if(c === "=" || c === "/" || c === ">" || whitespace(c)){
+ this._cbs.onattribname(this._getSection());
+ this._sectionStart = -1;
+ this._state = AFTER_ATTRIBUTE_NAME;
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._stateAfterAttributeName = function(c){
+ if(c === "="){
+ this._state = BEFORE_ATTRIBUTE_VALUE;
+ } else if(c === "/" || c === ">"){
+ this._cbs.onattribend();
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ this._index--;
+ } else if(!whitespace(c)){
+ this._cbs.onattribend();
+ this._state = IN_ATTRIBUTE_NAME;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateBeforeAttributeValue = function(c){
+ if(c === "\""){
+ this._state = IN_ATTRIBUTE_VALUE_DQ;
+ this._sectionStart = this._index + 1;
+ } else if(c === "'"){
+ this._state = IN_ATTRIBUTE_VALUE_SQ;
+ this._sectionStart = this._index + 1;
+ } else if(!whitespace(c)){
+ this._state = IN_ATTRIBUTE_VALUE_NQ;
+ this._sectionStart = this._index;
+ this._index--; //reconsume token
+ }
+};
+
+Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){
+ if(c === "\""){
+ this._emitToken("onattribdata");
+ this._cbs.onattribend();
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ } else if(this._decodeEntities && c === "&"){
+ this._emitToken("onattribdata");
+ this._baseState = this._state;
+ this._state = BEFORE_ENTITY;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){
+ if(c === "'"){
+ this._emitToken("onattribdata");
+ this._cbs.onattribend();
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ } else if(this._decodeEntities && c === "&"){
+ this._emitToken("onattribdata");
+ this._baseState = this._state;
+ this._state = BEFORE_ENTITY;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){
+ if(whitespace(c) || c === ">"){
+ this._emitToken("onattribdata");
+ this._cbs.onattribend();
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ this._index--;
+ } else if(this._decodeEntities && c === "&"){
+ this._emitToken("onattribdata");
+ this._baseState = this._state;
+ this._state = BEFORE_ENTITY;
+ this._sectionStart = this._index;
+ }
+};
+
+Tokenizer.prototype._stateBeforeDeclaration = function(c){
+ this._state = c === "[" ? BEFORE_CDATA_1 :
+ c === "-" ? BEFORE_COMMENT :
+ IN_DECLARATION;
+};
+
+Tokenizer.prototype._stateInDeclaration = function(c){
+ if(c === ">"){
+ this._cbs.ondeclaration(this._getSection());
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ }
+};
+
+Tokenizer.prototype._stateInProcessingInstruction = function(c){
+ if(c === ">"){
+ this._cbs.onprocessinginstruction(this._getSection());
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ }
+};
+
+Tokenizer.prototype._stateBeforeComment = function(c){
+ if(c === "-"){
+ this._state = IN_COMMENT;
+ this._sectionStart = this._index + 1;
+ } else {
+ this._state = IN_DECLARATION;
+ }
+};
+
+Tokenizer.prototype._stateInComment = function(c){
+ if(c === "-") this._state = AFTER_COMMENT_1;
+};
+
+Tokenizer.prototype._stateAfterComment1 = function(c){
+ if(c === "-"){
+ this._state = AFTER_COMMENT_2;
+ } else {
+ this._state = IN_COMMENT;
+ }
+};
+
+Tokenizer.prototype._stateAfterComment2 = function(c){
+ if(c === ">"){
+ //remove 2 trailing chars
+ this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2));
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ } else if(c !== "-"){
+ this._state = IN_COMMENT;
+ }
+ // else: stay in AFTER_COMMENT_2 (`--->`)
+};
+
+//Match the letters of `CDATA` after `<![`, one state per letter and in
+//either case; on a mismatch the input is handled as a declaration instead.
+Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION);
+Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION);
+Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION);
+Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION);
+Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION);
+
+Tokenizer.prototype._stateBeforeCdata6 = function(c){
+ if(c === "["){
+ this._state = IN_CDATA;
+ this._sectionStart = this._index + 1;
+ } else {
+ this._state = IN_DECLARATION;
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._stateInCdata = function(c){
+ if(c === "]") this._state = AFTER_CDATA_1;
+};
+
+Tokenizer.prototype._stateAfterCdata1 = characterState("]", AFTER_CDATA_2);
+
+Tokenizer.prototype._stateAfterCdata2 = function(c){
+ if(c === ">"){
+ //remove 2 trailing chars
+ this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2));
+ this._state = TEXT;
+ this._sectionStart = this._index + 1;
+ } else if(c !== "]") {
+ this._state = IN_CDATA;
+ }
+ //else: stay in AFTER_CDATA_2 (`]]]>`)
+};
+
+Tokenizer.prototype._stateBeforeSpecial = function(c){
+ if(c === "c" || c === "C"){
+ this._state = BEFORE_SCRIPT_1;
+ } else if(c === "t" || c === "T"){
+ this._state = BEFORE_STYLE_1;
+ } else {
+ this._state = IN_TAG_NAME;
+ this._index--; //consume the token again
+ }
+};
+
+Tokenizer.prototype._stateBeforeSpecialEnd = function(c){
+ if(this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")){
+ this._state = AFTER_SCRIPT_1;
+ } else if(this._special === SPECIAL_STYLE && (c === "t" || c === "T")){
+ this._state = AFTER_STYLE_1;
+ }
+ else this._state = TEXT;
+};
+
+//Match the remaining letters of an opening `script` tag name ("sc" already
+//seen); a mismatch falls back to IN_TAG_NAME.
+Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2);
+Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3);
+Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4);
+Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5);
+
+Tokenizer.prototype._stateBeforeScript5 = function(c){
+ if(c === "/" || c === ">" || whitespace(c)){
+ this._special = SPECIAL_SCRIPT;
+ }
+ this._state = IN_TAG_NAME;
+ this._index--; //consume the token again
+};
+
+//Match the remaining letters of a closing `script` tag name ("</sc" already
+//seen); a mismatch returns to TEXT and reads the character again there.
+Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
+Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
+Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
+Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);
+
+Tokenizer.prototype._stateAfterScript5 = function(c){
+ if(c === ">" || whitespace(c)){
+ this._special = SPECIAL_NONE;
+ this._state = IN_CLOSING_TAG_NAME;
+ this._sectionStart = this._index - 6;
+ this._index--; //reconsume the token
+ }
+ else this._state = TEXT;
+};
+
+//Match the remaining letters of an opening `style` tag name ("st" already
+//seen); a mismatch falls back to IN_TAG_NAME.
+Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2);
+Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3);
+Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4);
+
+Tokenizer.prototype._stateBeforeStyle4 = function(c){
+ if(c === "/" || c === ">" || whitespace(c)){
+ this._special = SPECIAL_STYLE;
+ }
+ this._state = IN_TAG_NAME;
+ this._index--; //consume the token again
+};
+
+//Match the remaining letters of a closing `style` tag name ("</st" already
+//seen); a mismatch returns to TEXT and reads the character again there.
+Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
+Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
+Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);
+
+Tokenizer.prototype._stateAfterStyle4 = function(c){
+ if(c === ">" || whitespace(c)){
+ this._special = SPECIAL_NONE;
+ this._state = IN_CLOSING_TAG_NAME;
+ this._sectionStart = this._index - 5;
+ this._index--; //reconsume the token
+ }
+ else this._state = TEXT;
+};
+
+//After `&`: `#` introduces a numeric entity, anything else is read again as
+//a named entity. After `&#`: `x`/`X` selects hexadecimal, anything else is
+//read again as a decimal entity.
+Tokenizer.prototype._stateBeforeEntity = ifElseState("#", BEFORE_NUMERIC_ENTITY, IN_NAMED_ENTITY);
+Tokenizer.prototype._stateBeforeNumericEntity = ifElseState("X", IN_HEX_ENTITY, IN_NUMERIC_ENTITY);
+
+//for entities terminated with a semicolon
+Tokenizer.prototype._parseNamedEntityStrict = function(){
+ //offset = 1
+ if(this._sectionStart + 1 < this._index){
+ var entity = this._buffer.substring(this._sectionStart + 1, this._index),
+ map = this._xmlMode ? xmlMap : entityMap;
+
+ if(map.hasOwnProperty(entity)){
+ this._emitPartial(map[entity]);
+ this._sectionStart = this._index + 1;
+ }
+ }
+};
+
+
+//parses legacy entities (without trailing semicolon)
+Tokenizer.prototype._parseLegacyEntity = function(){
+ var start = this._sectionStart + 1,
+ limit = this._index - start;
+
+ if(limit > 6) limit = 6; //the max length of legacy entities is 6
+
+ while(limit >= 2){ //the min length of legacy entities is 2
+ var entity = this._buffer.substr(start, limit);
+
+ if(legacyMap.hasOwnProperty(entity)){
+ this._emitPartial(legacyMap[entity]);
+ this._sectionStart += limit + 1;
+ return;
+ } else {
+ limit--;
+ }
+ }
+};
+
+Tokenizer.prototype._stateInNamedEntity = function(c){
+ if(c === ";"){
+ this._parseNamedEntityStrict();
+ if(this._sectionStart + 1 < this._index && !this._xmlMode){
+ this._parseLegacyEntity();
+ }
+ this._state = this._baseState;
+ } else if((c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9")){
+ if(this._xmlMode);
+ else if(this._sectionStart + 1 === this._index);
+ else if(this._baseState !== TEXT){
+ if(c !== "="){
+ this._parseNamedEntityStrict();
+ }
+ } else {
+ this._parseLegacyEntity();
+ }
+
+ this._state = this._baseState;
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._decodeNumericEntity = function(offset, base){
+ var sectionStart = this._sectionStart + offset;
+
+ if(sectionStart !== this._index){
+ //parse entity
+ var entity = this._buffer.substring(sectionStart, this._index);
+ var parsed = parseInt(entity, base);
+
+ this._emitPartial(decodeCodePoint(parsed));
+ this._sectionStart = this._index;
+ } else {
+ this._sectionStart--;
+ }
+
+ this._state = this._baseState;
+};
+
+Tokenizer.prototype._stateInNumericEntity = function(c){
+ if(c === ";"){
+ this._decodeNumericEntity(2, 10);
+ this._sectionStart++;
+ } else if(c < "0" || c > "9"){
+ if(!this._xmlMode){
+ this._decodeNumericEntity(2, 10);
+ } else {
+ this._state = this._baseState;
+ }
+ this._index--;
+ }
+};
+
+Tokenizer.prototype._stateInHexEntity = function(c){
+ if(c === ";"){
+ this._decodeNumericEntity(3, 16);
+ this._sectionStart++;
+ } else if((c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9")){
+ if(!this._xmlMode){
+ this._decodeNumericEntity(3, 16);
+ } else {
+ this._state = this._baseState;
+ }
+ this._index--;
+ }
+};
+
+/**
+ * Trims the part of the buffer that is no longer needed once `_parse` has
+ * stopped. `_bufferOffset` is advanced by the number of characters removed
+ * so that absolute positions stay correct.
+ */
+Tokenizer.prototype._cleanup = function (){
+ if(this._sectionStart < 0){
+ //no open section (a token was just emitted): drop the whole buffer
+ this._buffer = "";
+ this._bufferOffset += this._index;
+ this._index = 0;
+ } else if(this._running){
+ if(this._state === TEXT){
+ //text can be emitted piecewise, so flush what is pending
+ if(this._sectionStart !== this._index){
+ this._cbs.ontext(this._buffer.substr(this._sectionStart));
+ }
+ this._buffer = "";
+ this._bufferOffset += this._index;
+ this._index = 0;
+ } else if(this._sectionStart === this._index){
+ //the section just started
+ this._buffer = "";
+ this._bufferOffset += this._index;
+ this._index = 0;
+ } else {
+ //remove everything unnecessary
+ this._buffer = this._buffer.substr(this._sectionStart);
+ this._index -= this._sectionStart;
+ this._bufferOffset += this._sectionStart;
+ }
+
+ this._sectionStart = 0;
+ }
+ //when paused with an open section the buffer is left untouched
+};
+
+//TODO make events conditional
+Tokenizer.prototype.write = function(chunk){
+ if(this._ended) this._cbs.onerror(Error(".write() after done!"));
+
+ this._buffer += chunk;
+ this._parse();
+};
+
+/**
+ * Main loop: feeds the buffer to the handler of the current state, one
+ * character at a time, while the tokenizer is running. Handlers that need
+ * to re-read a character decrement `_index`, which cancels the increment
+ * at the end of the loop body. `_cleanup` runs once the loop stops.
+ */
+Tokenizer.prototype._parse = function(){
+ while(this._index < this._buffer.length && this._running){
+ var c = this._buffer.charAt(this._index);
+ if(this._state === TEXT) {
+ this._stateText(c);
+ } else if(this._state === BEFORE_TAG_NAME){
+ this._stateBeforeTagName(c);
+ } else if(this._state === IN_TAG_NAME) {
+ this._stateInTagName(c);
+ } else if(this._state === BEFORE_CLOSING_TAG_NAME){
+ this._stateBeforeCloseingTagName(c);
+ } else if(this._state === IN_CLOSING_TAG_NAME){
+ this._stateInCloseingTagName(c);
+ } else if(this._state === AFTER_CLOSING_TAG_NAME){
+ this._stateAfterCloseingTagName(c);
+ } else if(this._state === IN_SELF_CLOSING_TAG){
+ this._stateInSelfClosingTag(c);
+ }
+
+ /*
+ * attributes
+ */
+ else if(this._state === BEFORE_ATTRIBUTE_NAME){
+ this._stateBeforeAttributeName(c);
+ } else if(this._state === IN_ATTRIBUTE_NAME){
+ this._stateInAttributeName(c);
+ } else if(this._state === AFTER_ATTRIBUTE_NAME){
+ this._stateAfterAttributeName(c);
+ } else if(this._state === BEFORE_ATTRIBUTE_VALUE){
+ this._stateBeforeAttributeValue(c);
+ } else if(this._state === IN_ATTRIBUTE_VALUE_DQ){
+ this._stateInAttributeValueDoubleQuotes(c);
+ } else if(this._state === IN_ATTRIBUTE_VALUE_SQ){
+ this._stateInAttributeValueSingleQuotes(c);
+ } else if(this._state === IN_ATTRIBUTE_VALUE_NQ){
+ this._stateInAttributeValueNoQuotes(c);
+ }
+
+ /*
+ * declarations
+ */
+ else if(this._state === BEFORE_DECLARATION){
+ this._stateBeforeDeclaration(c);
+ } else if(this._state === IN_DECLARATION){
+ this._stateInDeclaration(c);
+ }
+
+ /*
+ * processing instructions
+ */
+ else if(this._state === IN_PROCESSING_INSTRUCTION){
+ this._stateInProcessingInstruction(c);
+ }
+
+ /*
+ * comments
+ */
+ else if(this._state === BEFORE_COMMENT){
+ this._stateBeforeComment(c);
+ } else if(this._state === IN_COMMENT){
+ this._stateInComment(c);
+ } else if(this._state === AFTER_COMMENT_1){
+ this._stateAfterComment1(c);
+ } else if(this._state === AFTER_COMMENT_2){
+ this._stateAfterComment2(c);
+ }
+
+ /*
+ * cdata
+ */
+ else if(this._state === BEFORE_CDATA_1){
+ this._stateBeforeCdata1(c);
+ } else if(this._state === BEFORE_CDATA_2){
+ this._stateBeforeCdata2(c);
+ } else if(this._state === BEFORE_CDATA_3){
+ this._stateBeforeCdata3(c);
+ } else if(this._state === BEFORE_CDATA_4){
+ this._stateBeforeCdata4(c);
+ } else if(this._state === BEFORE_CDATA_5){
+ this._stateBeforeCdata5(c);
+ } else if(this._state === BEFORE_CDATA_6){
+ this._stateBeforeCdata6(c);
+ } else if(this._state === IN_CDATA){
+ this._stateInCdata(c);
+ } else if(this._state === AFTER_CDATA_1){
+ this._stateAfterCdata1(c);
+ } else if(this._state === AFTER_CDATA_2){
+ this._stateAfterCdata2(c);
+ }
+
+ /*
+ * special tags
+ */
+ else if(this._state === BEFORE_SPECIAL){
+ this._stateBeforeSpecial(c);
+ } else if(this._state === BEFORE_SPECIAL_END){
+ this._stateBeforeSpecialEnd(c);
+ }
+
+ /*
+ * script
+ */
+ else if(this._state === BEFORE_SCRIPT_1){
+ this._stateBeforeScript1(c);
+ } else if(this._state === BEFORE_SCRIPT_2){
+ this._stateBeforeScript2(c);
+ } else if(this._state === BEFORE_SCRIPT_3){
+ this._stateBeforeScript3(c);
+ } else if(this._state === BEFORE_SCRIPT_4){
+ this._stateBeforeScript4(c);
+ } else if(this._state === BEFORE_SCRIPT_5){
+ this._stateBeforeScript5(c);
+ }
+
+ else if(this._state === AFTER_SCRIPT_1){
+ this._stateAfterScript1(c);
+ } else if(this._state === AFTER_SCRIPT_2){
+ this._stateAfterScript2(c);
+ } else if(this._state === AFTER_SCRIPT_3){
+ this._stateAfterScript3(c);
+ } else if(this._state === AFTER_SCRIPT_4){
+ this._stateAfterScript4(c);
+ } else if(this._state === AFTER_SCRIPT_5){
+ this._stateAfterScript5(c);
+ }
+
+ /*
+ * style
+ */
+ else if(this._state === BEFORE_STYLE_1){
+ this._stateBeforeStyle1(c);
+ } else if(this._state === BEFORE_STYLE_2){
+ this._stateBeforeStyle2(c);
+ } else if(this._state === BEFORE_STYLE_3){
+ this._stateBeforeStyle3(c);
+ } else if(this._state === BEFORE_STYLE_4){
+ this._stateBeforeStyle4(c);
+ }
+
+ else if(this._state === AFTER_STYLE_1){
+ this._stateAfterStyle1(c);
+ } else if(this._state === AFTER_STYLE_2){
+ this._stateAfterStyle2(c);
+ } else if(this._state === AFTER_STYLE_3){
+ this._stateAfterStyle3(c);
+ } else if(this._state === AFTER_STYLE_4){
+ this._stateAfterStyle4(c);
+ }
+
+ /*
+ * entities
+ */
+ else if(this._state === BEFORE_ENTITY){
+ this._stateBeforeEntity(c);
+ } else if(this._state === BEFORE_NUMERIC_ENTITY){
+ this._stateBeforeNumericEntity(c);
+ } else if(this._state === IN_NAMED_ENTITY){
+ this._stateInNamedEntity(c);
+ } else if(this._state === IN_NUMERIC_ENTITY){
+ this._stateInNumericEntity(c);
+ } else if(this._state === IN_HEX_ENTITY){
+ this._stateInHexEntity(c);
+ }
+
+ else {
+ //`_state` holds a value that matches none of the state constants
+ this._cbs.onerror(Error("unknown _state"), this._state);
+ }
+
+ this._index++;
+ }
+
+ this._cleanup();
+};
+
+/**
+ * Stops the parse loop after the character currently being handled; input
+ * written while paused stays in the buffer until `resume()` is called.
+ */
+Tokenizer.prototype.pause = function(){
+ this._running = false;
+};
+Tokenizer.prototype.resume = function(){
+ this._running = true;
+
+ if(this._index < this._buffer.length){
+ this._parse();
+ }
+ if(this._ended){
+ this._finish();
+ }
+};
+
+Tokenizer.prototype.end = function(chunk){
+ if(this._ended) this._cbs.onerror(Error(".end() after done!"));
+ if(chunk) this.write(chunk);
+
+ this._ended = true;
+
+ if(this._running) this._finish();
+};
+
+Tokenizer.prototype._finish = function(){
+ //if there is remaining data, emit it in a reasonable way
+ if(this._sectionStart < this._index){
+ this._handleTrailingData();
+ }
+
+ this._cbs.onend();
+};
+
+/**
+ * Emits the unfinished section at the end of the input according to the
+ * state the tokenizer stopped in: as cdata, as a comment, as a decoded
+ * entity followed by the rest, or as text. Data left inside an unfinished
+ * tag is dropped.
+ */
+Tokenizer.prototype._handleTrailingData = function(){
+ var data = this._buffer.substr(this._sectionStart);
+
+ if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
+ this._cbs.oncdata(data);
+ } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
+ this._cbs.oncomment(data);
+ } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
+ this._parseLegacyEntity();
+ //anything left after the entity is handled again in the base state
+ if(this._sectionStart < this._index){
+ this._state = this._baseState;
+ this._handleTrailingData();
+ }
+ } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
+ this._decodeNumericEntity(2, 10);
+ if(this._sectionStart < this._index){
+ this._state = this._baseState;
+ this._handleTrailingData();
+ }
+ } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
+ this._decodeNumericEntity(3, 16);
+ if(this._sectionStart < this._index){
+ this._state = this._baseState;
+ this._handleTrailingData();
+ }
+ } else if(
+ //every state that is not part of an unfinished tag yields text
+ this._state !== IN_TAG_NAME &&
+ this._state !== BEFORE_ATTRIBUTE_NAME &&
+ this._state !== BEFORE_ATTRIBUTE_VALUE &&
+ this._state !== AFTER_ATTRIBUTE_NAME &&
+ this._state !== IN_ATTRIBUTE_NAME &&
+ this._state !== IN_ATTRIBUTE_VALUE_SQ &&
+ this._state !== IN_ATTRIBUTE_VALUE_DQ &&
+ this._state !== IN_ATTRIBUTE_VALUE_NQ &&
+ this._state !== IN_CLOSING_TAG_NAME
+ ){
+ this._cbs.ontext(data);
+ }
+ //else, ignore remaining data
+ //TODO add a way to remove current tag
+};
+
+Tokenizer.prototype.reset = function(){
+ Tokenizer.call(this, {xmlMode: this._xmlMode, decodeEntities: this._decodeEntities}, this._cbs);
+};
+
+/**
+ * Current position counted from the start of the input: the number of
+ * characters already removed from the buffer plus the index within it.
+ *
+ * @returns {number}
+ */
+Tokenizer.prototype.getAbsoluteIndex = function(){
+ return this._bufferOffset + this._index;
+};
+
+/**
+ * Returns the buffered text from `_sectionStart` up to, but excluding, the
+ * current index.
+ *
+ * @returns {string}
+ */
+Tokenizer.prototype._getSection = function(){
+ return this._buffer.substring(this._sectionStart, this._index);
+};
+
+Tokenizer.prototype._emitToken = function(name){
+ this._cbs[name](this._getSection());
+ this._sectionStart = -1;
+};
+
+Tokenizer.prototype._emitPartial = function(value){
+ if(this._baseState !== TEXT){
+ this._cbs.onattribdata(value); //TODO implement the new event
+ } else {
+ this._cbs.ontext(value);
+ }
+};
diff --git a/lib/WritableStream.js b/lib/WritableStream.js
new file mode 100644
index 0000000..9868f8a
--- /dev/null
+++ b/lib/WritableStream.js
@@ -0,0 +1,25 @@
+module.exports = Stream;
+
+var Parser = require("./Parser.js"),
+ WritableStream = require("stream").Writable || require("readable-stream").Writable,
+ StringDecoder = require("string_decoder").StringDecoder,
+ Buffer = require("buffer").Buffer;
+
+function Stream(cbs, options){
+ var parser = this._parser = new Parser(cbs, options);
+ var decoder = this._decoder = new StringDecoder();
+
+ WritableStream.call(this, {decodeStrings: false});
+
+ this.once("finish", function(){
+ parser.end(decoder.end());
+ });
+}
+
+require("inherits")(Stream, WritableStream);
+
+WritableStream.prototype._write = function(chunk, encoding, cb){
+ if(chunk instanceof Buffer) chunk = this._decoder.write(chunk);
+ this._parser.write(chunk);
+ cb();
+};
\ No newline at end of file
diff --git a/lib/index.js b/lib/index.js
new file mode 100644
index 0000000..880f57e
--- /dev/null
+++ b/lib/index.js
@@ -0,0 +1,68 @@
+var Parser = require("./Parser.js"),
+ DomHandler = require("domhandler");
+
+function defineProp(name, value){
+ delete module.exports[name];
+ module.exports[name] = value;
+ return value;
+}
+
+//Public API. Parser, Tokenizer, ElementType and DomHandler are loaded
+//eagerly; the remaining modules are exposed through getters that require
+//them on first access and then replace themselves via defineProp.
+module.exports = {
+ Parser: Parser,
+ Tokenizer: require("./Tokenizer.js"),
+ ElementType: require("domelementtype"),
+ DomHandler: DomHandler,
+ get FeedHandler(){
+ return defineProp("FeedHandler", require("./FeedHandler.js"));
+ },
+ get Stream(){
+ return defineProp("Stream", require("./Stream.js"));
+ },
+ get WritableStream(){
+ return defineProp("WritableStream", require("./WritableStream.js"));
+ },
+ get ProxyHandler(){
+ return defineProp("ProxyHandler", require("./ProxyHandler.js"));
+ },
+ get DomUtils(){
+ return defineProp("DomUtils", require("domutils"));
+ },
+ get CollectingHandler(){
+ return defineProp("CollectingHandler", require("./CollectingHandler.js"));
+ },
+ // For legacy support
+ DefaultHandler: DomHandler,
+ get RssHandler(){
+ return defineProp("RssHandler", this.FeedHandler);
+ },
+ //helper methods
+ //parse `data` in one go and return the `dom` collected by a DomHandler
+ parseDOM: function(data, options){
+ var handler = new DomHandler(options);
+ new Parser(handler, options).end(data);
+ return handler.dom;
+ },
+ //parse `feed` in one go and return the `dom` collected by a FeedHandler
+ parseFeed: function(feed, options){
+ var handler = new module.exports.FeedHandler(options);
+ new Parser(handler, options).end(feed);
+ return handler.dom;
+ },
+ //return a Parser wired to a DomHandler built from the given arguments
+ createDomStream: function(cb, options, elementCb){
+ var handler = new DomHandler(cb, options, elementCb);
+ return new Parser(handler, options);
+ },
+ // List of all events that the parser emits
+ EVENTS: { /* Format: eventname: number of arguments */
+ attribute: 2,
+ cdatastart: 0,
+ cdataend: 0,
+ text: 1,
+ processinginstruction: 2,
+ comment: 1,
+ commentend: 0,
+ closetag: 1,
+ opentag: 2,
+ opentagname: 1,
+ error: 1,
+ end: 0
+ }
+};
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..95f2d54
--- /dev/null
+++ b/package.json
@@ -0,0 +1,56 @@
+{
+ "name": "htmlparser2",
+ "description": "Fast & forgiving HTML/XML/RSS parser",
+ "version": "3.9.2",
+ "author": "Felix Boehm <me at feedic.com>",
+ "keywords": [
+ "html",
+ "parser",
+ "streams",
+ "xml",
+ "dom",
+ "rss",
+ "feed",
+ "atom"
+ ],
+ "repository": {
+ "type": "git",
+ "url": "git://github.com/fb55/htmlparser2.git"
+ },
+ "bugs": {
+ "mail": "me at feedic.com",
+ "url": "http://github.com/fb55/htmlparser2/issues"
+ },
+ "directories": {
+ "lib": "lib/"
+ },
+ "main": "lib/index.js",
+ "files": [
+ "lib"
+ ],
+ "scripts": {
+ "lcov": "istanbul cover _mocha --report lcovonly -- -R spec",
+ "coveralls": "npm run lint && npm run lcov && (cat coverage/lcov.info | coveralls || exit 0)",
+ "test": "mocha && npm run lint",
+ "lint": "eslint lib test"
+ },
+ "dependencies": {
+ "domelementtype": "^1.3.0",
+ "domhandler": "^2.3.0",
+ "domutils": "^1.5.1",
+ "entities": "^1.1.1",
+ "inherits": "^2.0.1",
+ "readable-stream": "^2.0.2"
+ },
+ "devDependencies": {
+ "coveralls": "^2.11.4",
+ "istanbul": "^0.4.3",
+ "mocha": "^2.2.5",
+ "eslint": "^2.12.0",
+ "mocha-lcov-reporter": "^1.2.0"
+ },
+ "browser": {
+ "readable-stream": false
+ },
+ "license": "MIT"
+}
diff --git a/test/01-events.js b/test/01-events.js
new file mode 100644
index 0000000..a3c7cf3
--- /dev/null
+++ b/test/01-events.js
@@ -0,0 +1,9 @@
+var helper = require("./test-helper.js");
+
+//Registers the "Events" suite with the test helper: each test's html is
+//written to a parser whose events are gathered by an event collector that
+//reports to `cb`.
+helper.mochaTest("Events", __dirname, function(test, cb){
+ helper.writeToParser(
+ helper.getEventCollector(cb),
+ test.options.parser,
+ test.html
+ );
+});
\ No newline at end of file
diff --git a/test/02-stream.js b/test/02-stream.js
new file mode 100644
index 0000000..fe6044d
--- /dev/null
+++ b/test/02-stream.js
@@ -0,0 +1,23 @@
+var helper = require("./test-helper.js"),
+ Stream = require("..").WritableStream,
+ fs = require("fs"),
+ path = require("path");
+
+//Runs every stream test twice: first by piping the document file into a
+//WritableStream, then, once that run has reported, by passing the whole
+//file to a second stream in a single `end()` call.
+helper.mochaTest("Stream", __dirname, function(test, cb){
+ var filePath = path.join(__dirname, "Documents", test.file);
+ fs.createReadStream(filePath).pipe(
+ new Stream(
+ helper.getEventCollector(function(err, events){
+ //report the piped run
+ cb(err, events);
+
+ //second run: same document, delivered in one chunk
+ var handler = helper.getEventCollector(cb),
+ stream = new Stream(handler, test.options);
+
+ fs.readFile(filePath, function(err, data){
+ if(err) throw err;
+ else stream.end(data);
+ });
+ }
+ ), test.options)
+ ).on("error", cb);
+});
\ No newline at end of file
diff --git a/test/03-feed.js b/test/03-feed.js
new file mode 100644
index 0000000..7849953
--- /dev/null
+++ b/test/03-feed.js
@@ -0,0 +1,19 @@
+//Runs tests for feeds
+
+var helper = require("./test-helper.js"),
+ FeedHandler = require("..").RssHandler,
+ fs = require("fs"),
+ path = require("path");
+
+helper.mochaTest("Feeds", __dirname, function(test, cb){
+ fs.readFile(
+ path.join(__dirname, "Documents", test.file),
+ function(err, file){
+ helper.writeToParser(
+ new FeedHandler(cb),
+ { xmlMode: true },
+ file.toString()
+ );
+ }
+ );
+});
\ No newline at end of file
diff --git a/test/Documents/Atom_Example.xml b/test/Documents/Atom_Example.xml
new file mode 100644
index 0000000..f836380
--- /dev/null
+++ b/test/Documents/Atom_Example.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- http://en.wikipedia.org/wiki/Atom_%28standard%29 -->
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <title>Example Feed</title>
+ <subtitle>A subtitle.</subtitle>
+ <link href="http://example.org/feed/" rel="self" />
+ <link href="http://example.org/" />
+ <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
+ <updated>2003-12-13T18:30:02Z</updated>
+ <author>
+ <name>John Doe</name>
+ <email>johndoe at example.com</email>
+ </author>
+
+ <entry>
+ <title>Atom-Powered Robots Run Amok</title>
+ <link href="http://example.org/2003/12/13/atom03" />
+ <link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
+ <link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
+ <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+ <updated>2003-12-13T18:30:02Z</updated>
+ <content type="html"><p>Some content.</p></content>
+ </entry>
+
+</feed>
diff --git a/test/Documents/Attributes.html b/test/Documents/Attributes.html
new file mode 100644
index 0000000..f3bfa09
--- /dev/null
+++ b/test/Documents/Attributes.html
@@ -0,0 +1,16 @@
+<!doctype html>
+<html>
+<head>
+ <title>Attributes test</title>
+</head>
+<body>
+ <!-- Normal attributes -->
+ <button id="test0" class="value0" title="value1">class="value0" title="value1"</button>
+
+ <!-- Attributes with no quotes or value -->
+ <button id="test1" class=value2 disabled>class=value2 disabled</button>
+
+ <!-- Attributes with no space between them. No valid, but accepted by the browser -->
+ <button id="test2" class="value4"title="value5">class="value4"title="value5"</button>
+</body>
+</html>
\ No newline at end of file
diff --git a/test/Documents/Basic.html b/test/Documents/Basic.html
new file mode 100644
index 0000000..65957a2
--- /dev/null
+++ b/test/Documents/Basic.html
@@ -0,0 +1 @@
+<!DOCTYPE html><html><title>The Title</title><body>Hello world</body></html>
\ No newline at end of file
diff --git a/test/Documents/RDF_Example.xml b/test/Documents/RDF_Example.xml
new file mode 100644
index 0000000..b76dc37
--- /dev/null
+++ b/test/Documents/RDF_Example.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:ev="http://purl.org/rss/1.0/modules/event/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:admin="http://webns.net/mvcb/">
+ <channel rdf:about="https://github.com/fb55/htmlparser2/">
+ <title>A title to parse and remember</title>
+ <link>https://github.com/fb55/htmlparser2/</link>
+ <description/>
+ <dc:language>en-us</dc:language>
+ <dc:rights>Copyright 2015 the authors</dc:rights>
+ <dc:publisher>webmaster at thisisafakedoma.in</dc:publisher>
+ <dc:creator>webmaster at thisisafakedoma.in</dc:creator>
+ <dc:source>https://github.com/fb55/htmlparser2/</dc:source>
+ <dc:title>A title to parse and remember</dc:title>
+ <dc:type>Collection</dc:type>
+ <syn:updateBase>2011-11-04T09:39:10-07:00</syn:updateBase>
+ <syn:updateFrequency>4</syn:updateFrequency>
+ <syn:updatePeriod>hourly</syn:updatePeriod>
+ <items>
+ <rdf:Seq>
+ <rdf:li rdf:resource="http://somefakesite/path/to/something.html"/>
+ </rdf:Seq>
+ </items>
+ </channel>
+ <item rdf:about="http://somefakesite/path/to/something.html">
+ <title><![CDATA[ Fast HTML Parsing ]]></title>
+ <link>
+http://somefakesite/path/to/something.html
+</link>
+ <description><![CDATA[
+Great test content<br>A link: <a href="http://github.com">Github</a>
+]]></description>
+ <dc:date>2011-11-04T09:35:17-07:00</dc:date>
+ <dc:language>en-us</dc:language>
+ <dc:rights>Copyright 2015 the authors</dc:rights>
+ <dc:source>
+http://somefakesite/path/to/something.html
+</dc:source>
+ <dc:title><![CDATA[ Fast HTML Parsing ]]></dc:title>
+ <dc:type>text</dc:type>
+ <dcterms:issued>2011-11-04T09:35:17-07:00</dcterms:issued>
+ </item>
+ <item rdf:about="http://somefakesite/path/to/something-else.html">
+ <title><![CDATA[
+This space intentionally left blank
+]]></title>
+ <link>
+http://somefakesite/path/to/something-else.html
+</link>
+ <description><![CDATA[
+The early bird gets the worm
+]]></description>
+ <dc:date>2011-11-04T09:34:54-07:00</dc:date>
+ <dc:language>en-us</dc:language>
+ <dc:rights>Copyright 2015 the authors</dc:rights>
+ <dc:source>
+http://somefakesite/path/to/something-else.html
+</dc:source>
+ <dc:title><![CDATA[
+This space intentionally left blank
+]]></dc:title>
+ <dc:type>text</dc:type>
+ <dcterms:issued>2011-11-04T09:34:54-07:00</dcterms:issued>
+ </item>
+</rdf:RDF>
\ No newline at end of file
diff --git a/test/Documents/RSS_Example.xml b/test/Documents/RSS_Example.xml
new file mode 100644
index 0000000..0d1fde8
--- /dev/null
+++ b/test/Documents/RSS_Example.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<!-- http://cyber.law.harvard.edu/rss/examples/rss2sample.xml -->
+<rss version="2.0">
+ <channel>
+ <title>Liftoff News</title>
+ <link>http://liftoff.msfc.nasa.gov/</link>
+ <description>Liftoff to Space Exploration.</description>
+ <language>en-us</language>
+ <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
+
+ <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
+ <docs>http://blogs.law.harvard.edu/tech/rss</docs>
+ <generator>Weblog Editor 2.0</generator>
+ <managingEditor>editor at example.com</managingEditor>
+ <webMaster>webmaster at example.com</webMaster>
+ <item>
+
+ <title>Star City</title>
+ <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
+ <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.</description>
+ <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
+ <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
+
+ </item>
+ <item>
+ <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st.</description>
+ <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
+ <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
+
+ </item>
+ <item>
+ <title>The Engine That Does More</title>
+ <link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link>
+ <description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.</description>
+ <pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate>
+ <guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid>
+
+ </item>
+ <item>
+ <title>Astronauts' Dirty Laundry</title>
+ <link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link>
+ <description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.</description>
+ <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
+ <guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid>
+
+ </item>
+ </channel>
+</rss>
\ No newline at end of file
diff --git a/test/Events/01-simple.json b/test/Events/01-simple.json
new file mode 100644
index 0000000..ab3076a
--- /dev/null
+++ b/test/Events/01-simple.json
@@ -0,0 +1,44 @@
+{
+ "name": "simple",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<h1 class=test>adsf</h1>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "h1"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "test"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "h1",
+ {
+ "class": "test"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "adsf"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "h1"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/02-template.json b/test/Events/02-template.json
new file mode 100644
index 0000000..df344b6
--- /dev/null
+++ b/test/Events/02-template.json
@@ -0,0 +1,63 @@
+{
+ "name": "Template script tags",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<p><script type=\"text/template\"><h1>Heading1</h1></script></p>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "p"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "p",
+ {}
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "type",
+ "text/template"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "script",
+ {
+ "type": "text/template"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "<h1>Heading1</h1>"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "p"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/03-lowercase_tags.json b/test/Events/03-lowercase_tags.json
new file mode 100644
index 0000000..9b58c59
--- /dev/null
+++ b/test/Events/03-lowercase_tags.json
@@ -0,0 +1,46 @@
+{
+ "name": "Lowercase tags",
+ "options": {
+ "handler": {},
+ "parser": {
+ "lowerCaseTags": true
+ }
+ },
+ "html": "<H1 class=test>adsf</H1>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "h1"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "test"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "h1",
+ {
+ "class": "test"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "adsf"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "h1"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/04-cdata.json b/test/Events/04-cdata.json
new file mode 100644
index 0000000..6032b68
--- /dev/null
+++ b/test/Events/04-cdata.json
@@ -0,0 +1,50 @@
+{
+ "name": "CDATA",
+ "options": {
+ "handler": {},
+ "parser": {"xmlMode": true}
+ },
+ "html": "<tag><![CDATA[ asdf ><asdf></adsf><> fo]]></tag><![CD>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "tag"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "tag",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ " asdf ><asdf></adsf><> fo"
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "tag"
+ ]
+ },
+ {
+ "event": "processinginstruction",
+ "data": [
+ "![CD",
+ "![CD"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/05-cdata-special.json b/test/Events/05-cdata-special.json
new file mode 100644
index 0000000..686cb1a
--- /dev/null
+++ b/test/Events/05-cdata-special.json
@@ -0,0 +1,35 @@
+{
+ "name": "CDATA (inside special)",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<script>/*<![CDATA[*/ asdf ><asdf></adsf><> fo/*]]>*/</script>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "script",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "/*<![CDATA[*/ asdf ><asdf></adsf><> fo/*]]>*/"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "script"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/06-leading-lt.json b/test/Events/06-leading-lt.json
new file mode 100644
index 0000000..fcec852
--- /dev/null
+++ b/test/Events/06-leading-lt.json
@@ -0,0 +1,16 @@
+{
+ "name": "leading lt",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": ">a>",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ ">a>"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/07-self-closing.json b/test/Events/07-self-closing.json
new file mode 100644
index 0000000..49ed93b
--- /dev/null
+++ b/test/Events/07-self-closing.json
@@ -0,0 +1,67 @@
+{
+ "name": "Self-closing tags",
+ "options": {
+ "handler": {
+
+ },
+ "parser": {
+
+ }
+ },
+ "html": "<a href=http://test.com/>Foo</a><hr / >",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "a"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://test.com/"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "a",
+ {
+ "href": "http://test.com/"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Foo"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "a"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "hr"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "hr",
+ {}
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "hr"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/08-implicit-close-tags.json b/test/Events/08-implicit-close-tags.json
new file mode 100644
index 0000000..331e785
--- /dev/null
+++ b/test/Events/08-implicit-close-tags.json
@@ -0,0 +1,71 @@
+{
+ "name": "Implicit close tags",
+ "options": {},
+ "html": "<ol><li class=test><div><table style=width:100%><tr><th>TH<td colspan=2><h3>Heading</h3><tr><td><div>Div</div><td><div>Div2</div></table></div><li><div><h3>Heading 2</h3></div></li></ol><p>Para<h4>Heading 4</h4>",
+ "expected": [
+ { "event": "opentagname", "data": [ "ol" ] },
+ { "event": "opentag", "data": [ "ol", {} ] },
+ { "event": "opentagname", "data": [ "li" ] },
+ { "event": "attribute", "data": [ "class", "test" ] },
+ { "event": "opentag", "data": [ "li", { "class": "test" } ] },
+ { "event": "opentagname", "data": [ "div" ] },
+ { "event": "opentag", "data": [ "div", {} ] },
+ { "event": "opentagname", "data": [ "table" ] },
+ { "event": "attribute", "data": [ "style", "width:100%" ] },
+ { "event": "opentag", "data": [ "table", { "style": "width:100%" } ] },
+ { "event": "opentagname", "data": [ "tr" ] },
+ { "event": "opentag", "data": [ "tr", {} ] },
+ { "event": "opentagname", "data": [ "th" ] },
+ { "event": "opentag", "data": [ "th", {} ] },
+ { "event": "text", "data": [ "TH" ] },
+ { "event": "closetag", "data": [ "th" ] },
+ { "event": "opentagname", "data": [ "td" ] },
+ { "event": "attribute", "data": [ "colspan", "2" ] },
+ { "event": "opentag", "data": [ "td", { "colspan": "2" } ] },
+ { "event": "opentagname", "data": [ "h3" ] },
+ { "event": "opentag", "data": [ "h3", {} ] },
+ { "event": "text", "data": [ "Heading" ] },
+ { "event": "closetag", "data": [ "h3" ] },
+ { "event": "closetag", "data": [ "td" ] },
+ { "event": "closetag", "data": [ "tr" ] },
+ { "event": "opentagname", "data": [ "tr" ] },
+ { "event": "opentag", "data": [ "tr", {} ] },
+ { "event": "opentagname", "data": [ "td" ] },
+ { "event": "opentag", "data": [ "td", {} ] },
+ { "event": "opentagname", "data": [ "div" ] },
+ { "event": "opentag", "data": [ "div", {} ] },
+ { "event": "text", "data": [ "Div" ] },
+ { "event": "closetag", "data": [ "div" ] },
+ { "event": "closetag", "data": [ "td" ] },
+ { "event": "opentagname", "data": [ "td" ] },
+ { "event": "opentag", "data": [ "td", {} ] },
+ { "event": "opentagname", "data": [ "div" ] },
+ { "event": "opentag", "data": [ "div", {} ] },
+ { "event": "text", "data": [ "Div2" ] },
+ { "event": "closetag", "data": [ "div" ] },
+ { "event": "closetag", "data": [ "td" ] },
+ { "event": "closetag", "data": [ "tr" ] },
+ { "event": "closetag", "data": [ "table" ] },
+ { "event": "closetag", "data": [ "div" ] },
+ { "event": "closetag", "data": [ "li" ] },
+ { "event": "opentagname", "data": [ "li" ] },
+ { "event": "opentag", "data": [ "li", {} ] },
+ { "event": "opentagname", "data": [ "div" ] },
+ { "event": "opentag", "data": [ "div", {} ] },
+ { "event": "opentagname", "data": [ "h3" ] },
+ { "event": "opentag", "data": [ "h3", {} ] },
+ { "event": "text", "data": [ "Heading 2" ] },
+ { "event": "closetag", "data": [ "h3" ] },
+ { "event": "closetag", "data": [ "div" ] },
+ { "event": "closetag", "data": [ "li" ] },
+ { "event": "closetag", "data": [ "ol" ] },
+ { "event": "opentagname", "data": [ "p" ] },
+ { "event": "opentag", "data": [ "p", {} ] },
+ { "event": "text", "data": [ "Para" ] },
+ { "event": "closetag", "data": [ "p" ] },
+ { "event": "opentagname", "data": [ "h4" ] },
+ { "event": "opentag", "data": [ "h4", {} ] },
+ { "event": "text", "data": [ "Heading 4" ] },
+ { "event": "closetag", "data": [ "h4" ] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/09-attributes.json b/test/Events/09-attributes.json
new file mode 100644
index 0000000..afa6e4a
--- /dev/null
+++ b/test/Events/09-attributes.json
@@ -0,0 +1,68 @@
+{
+ "name": "attributes (no white space, no value, no quotes)",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<button class=\"test0\"title=\"test1\" disabled value=test2>adsf</button>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "test0"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "title",
+ "test1"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "disabled",
+ ""
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "value",
+ "test2"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "button",
+ {
+ "class": "test0",
+ "title": "test1",
+ "disabled": "",
+ "value": "test2"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "adsf"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "button"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/10-crazy-attrib.json b/test/Events/10-crazy-attrib.json
new file mode 100644
index 0000000..00bad5f
--- /dev/null
+++ b/test/Events/10-crazy-attrib.json
@@ -0,0 +1,52 @@
+{
+ "name": "crazy attribute",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<p < = '' FAIL>stuff</p><a",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "p"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "<",
+ ""
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "fail",
+ ""
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "p",
+ {
+ "<": "",
+ "fail": ""
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "stuff"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "p"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/11-script_in_script.json b/test/Events/11-script_in_script.json
new file mode 100644
index 0000000..ddbb87c
--- /dev/null
+++ b/test/Events/11-script_in_script.json
@@ -0,0 +1,54 @@
+{
+ "name": "Scripts creating other scripts",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<p><script>var str = '<script></'+'script>';</script></p>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "p"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "p",
+ {}
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "script",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "var str = '<script></'+'script>';"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "p"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/12-long-comment-end.json b/test/Events/12-long-comment-end.json
new file mode 100644
index 0000000..e81f307
--- /dev/null
+++ b/test/Events/12-long-comment-end.json
@@ -0,0 +1,20 @@
+{
+ "name": "Long comment ending",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<meta id='before'><!-- text ---><meta id='after'>",
+ "expected": [
+ { "event": "opentagname", "data": [ "meta" ] },
+ { "event": "attribute", "data": [ "id", "before" ] },
+ { "event": "opentag", "data": [ "meta", {"id": "before"} ] },
+ { "event": "closetag", "data": [ "meta" ] },
+ { "event": "comment", "data": [ " text -" ] },
+ { "event": "commentend", "data": [] },
+ { "event": "opentagname", "data": [ "meta" ] },
+ { "event": "attribute", "data": [ "id", "after" ] },
+ { "event": "opentag", "data": [ "meta", {"id": "after"} ] },
+ { "event": "closetag", "data": [ "meta" ] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/13-long-cdata-end.json b/test/Events/13-long-cdata-end.json
new file mode 100644
index 0000000..34b7b41
--- /dev/null
+++ b/test/Events/13-long-cdata-end.json
@@ -0,0 +1,22 @@
+{
+ "name": "Long CDATA ending",
+ "options": {
+ "handler": {},
+ "parser": {"xmlMode": true}
+ },
+ "html": "<before /><tag><![CDATA[ text ]]]></tag><after />",
+ "expected": [
+ { "event": "opentagname", "data": [ "before" ] },
+ { "event": "opentag", "data": [ "before", {} ] },
+ { "event": "closetag", "data": [ "before" ] },
+ { "event": "opentagname", "data": [ "tag" ] },
+ { "event": "opentag", "data": [ "tag", {} ] },
+ { "event": "cdatastart", "data": [] },
+ { "event": "text", "data": [ " text ]" ] },
+ { "event": "cdataend", "data": [] },
+ { "event": "closetag", "data": [ "tag" ] },
+ { "event": "opentagname", "data": [ "after" ] },
+ { "event": "opentag", "data": [ "after", {} ] },
+ { "event": "closetag", "data": [ "after" ] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/14-implicit-open-tags.json b/test/Events/14-implicit-open-tags.json
new file mode 100644
index 0000000..f02b840
--- /dev/null
+++ b/test/Events/14-implicit-open-tags.json
@@ -0,0 +1,27 @@
+{
+ "name": "Implicit open p and br tags",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<div>Hallo</p>World</br></ignore></div></p></br>",
+ "expected": [
+ { "event": "opentagname", "data": [ "div" ] },
+ { "event": "opentag", "data": [ "div", {} ] },
+ { "event": "text", "data": [ "Hallo" ] },
+ { "event": "opentagname", "data": [ "p" ] },
+ { "event": "opentag", "data": [ "p", {} ] },
+ { "event": "closetag", "data": [ "p" ] },
+ { "event": "text", "data": [ "World" ] },
+ { "event": "opentagname", "data": [ "br" ] },
+ { "event": "opentag", "data": [ "br", {} ] },
+ { "event": "closetag", "data": [ "br" ] },
+ { "event": "closetag", "data": [ "div" ] },
+ { "event": "opentagname", "data": [ "p" ] },
+ { "event": "opentag", "data": [ "p", {} ] },
+ { "event": "closetag", "data": [ "p" ] },
+ { "event": "opentagname", "data": [ "br" ] },
+ { "event": "opentag", "data": [ "br", {} ] },
+ { "event": "closetag", "data": [ "br" ] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/15-lt-whitespace.json b/test/Events/15-lt-whitespace.json
new file mode 100644
index 0000000..aae6eb0
--- /dev/null
+++ b/test/Events/15-lt-whitespace.json
@@ -0,0 +1,16 @@
+{
+ "name": "lt followed by whitespace",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "a < b",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "a < b"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/16-double_attribs.json b/test/Events/16-double_attribs.json
new file mode 100644
index 0000000..bed1d8f
--- /dev/null
+++ b/test/Events/16-double_attribs.json
@@ -0,0 +1,45 @@
+{
+ "name": "double attribute",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<h1 class=test class=boo></h1>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "h1"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "test"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "boo"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "h1",
+ {
+ "class": "test"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "h1"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/17-numeric_entities.json b/test/Events/17-numeric_entities.json
new file mode 100644
index 0000000..23e0b26
--- /dev/null
+++ b/test/Events/17-numeric_entities.json
@@ -0,0 +1,16 @@
+{
+ "name": "numeric entities",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true}
+ },
+ "html": "&#x61;&#x62&#99;&#100&#x66&#x67&#x;&#x68",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "abcdfg&#x;h"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/18-legacy_entities.json b/test/Events/18-legacy_entities.json
new file mode 100644
index 0000000..5f34e5b
--- /dev/null
+++ b/test/Events/18-legacy_entities.json
@@ -0,0 +1,16 @@
+{
+ "name": "legacy entities",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true}
+ },
+ "html": "&AMPel&iacutee&eer;s&lter",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "&el\u00EDe&eer;s<er"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/19-named_entities.json b/test/Events/19-named_entities.json
new file mode 100644
index 0000000..d9068d5
--- /dev/null
+++ b/test/Events/19-named_entities.json
@@ -0,0 +1,16 @@
+{
+ "name": "named entities",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true}
+ },
+ "html": "&amp;el&lt;er&CounterClockwiseContourIntegral;foo&bar",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "&el<er\u2233foo&bar"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/20-xml_entities.json b/test/Events/20-xml_entities.json
new file mode 100644
index 0000000..ce82300
--- /dev/null
+++ b/test/Events/20-xml_entities.json
@@ -0,0 +1,16 @@
+{
+ "name": "xml entities",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true, "xmlMode": true}
+ },
+ "html": "&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#101",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "&>&amp<&uuml;a&#x62c&#100&#101"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/21-entity_in_attribute.json b/test/Events/21-entity_in_attribute.json
new file mode 100644
index 0000000..e0a3195
--- /dev/null
+++ b/test/Events/21-entity_in_attribute.json
@@ -0,0 +1,38 @@
+{
+ "name": "entity in attribute",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true}
+ },
+ "html": "<a href='http://example.com/p&#x61;ge?param=value&param2&param3=&lt;val&; & &'>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "a"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://example.com/page?param=value&param2&param3=<val&; & &"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "a",
+ {
+ "href": "http://example.com/page?param=value&param2&param3=<val&; & &"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "a"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/22-double_brackets.json b/test/Events/22-double_brackets.json
new file mode 100644
index 0000000..38a513b
--- /dev/null
+++ b/test/Events/22-double_brackets.json
@@ -0,0 +1,41 @@
+{
+ "name": "double brackets",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<<princess-purpose>>testing</princess-purpose>",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "<"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "princess-purpose"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "princess-purpose",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ ">testing"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "princess-purpose"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/23-legacy_entity_fail.json b/test/Events/23-legacy_entity_fail.json
new file mode 100644
index 0000000..4b4320b
--- /dev/null
+++ b/test/Events/23-legacy_entity_fail.json
@@ -0,0 +1,16 @@
+{
+ "name": "legacy entities",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true}
+ },
+ "html": "M&M",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "M&M"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/24-special_special.json b/test/Events/24-special_special.json
new file mode 100644
index 0000000..e80731f
--- /dev/null
+++ b/test/Events/24-special_special.json
@@ -0,0 +1,133 @@
+{
+ "name": "Special special tags",
+ "options": {},
+ "html": "<sCriPT></scripter</soo</sCript><STyLE></styler</STylE><sCiPt><stylee><scriptee><soo>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "script",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "</scripter</soo"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "style"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "style",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "</styler"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "style"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "scipt"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "scipt",
+ {}
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "stylee"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "stylee",
+ {}
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "scriptee"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "scriptee",
+ {}
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "soo"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "soo",
+ {}
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "soo"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "scriptee"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "stylee"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "scipt"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/25-empty_tag_name.json b/test/Events/25-empty_tag_name.json
new file mode 100644
index 0000000..b3b340c
--- /dev/null
+++ b/test/Events/25-empty_tag_name.json
@@ -0,0 +1,13 @@
+{
+ "name": "Empty tag name",
+ "options": {},
+ "html": "< ></ >",
+ "expected": [
+ {
+ "event": "text",
+ "data": [
+ "< ></ >"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/26-not-quite-closed.json b/test/Events/26-not-quite-closed.json
new file mode 100644
index 0000000..8504440
--- /dev/null
+++ b/test/Events/26-not-quite-closed.json
@@ -0,0 +1,35 @@
+{
+ "name": "Not quite closed",
+ "options": {},
+ "html": "<foo /bar></foo bar>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "foo"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "bar",
+ ""
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "foo",
+ {
+ "bar": ""
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "foo"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/27-entities_in_attributes.json b/test/Events/27-entities_in_attributes.json
new file mode 100644
index 0000000..b03cbdf
--- /dev/null
+++ b/test/Events/27-entities_in_attributes.json
@@ -0,0 +1,62 @@
+{
+ "name": "Entities in attributes",
+ "options": {
+ "handler": {},
+ "parser": {"decodeEntities": true}
+ },
+ "html": "<foo bar=&amp; baz=\"&amp;\" boo='&amp;' noo=>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "foo"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "bar",
+ "&"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "baz",
+ "&"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "boo",
+ "&"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "noo",
+ ""
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "foo",
+ {
+ "bar": "&",
+ "baz": "&",
+ "boo": "&",
+ "noo": ""
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "foo"
+ ]
+ }
+ ]
+}
diff --git a/test/Events/28-cdata_in_html.json b/test/Events/28-cdata_in_html.json
new file mode 100644
index 0000000..80c033b
--- /dev/null
+++ b/test/Events/28-cdata_in_html.json
@@ -0,0 +1,9 @@
+{
+ "name": "CDATA in HTML",
+ "options": {},
+ "html": "<![CDATA[ foo ]]>",
+ "expected": [
+ { "event": "comment", "data": [ "[CDATA[ foo ]]" ] },
+ { "event": "commentend", "data": [] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/29-comment_edge-cases.json b/test/Events/29-comment_edge-cases.json
new file mode 100644
index 0000000..9d9709a
--- /dev/null
+++ b/test/Events/29-comment_edge-cases.json
@@ -0,0 +1,18 @@
+{
+ "name": "Comment edge-cases",
+ "options": {},
+ "html": "<!-foo><!-- --- --><!--foo",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "!-foo",
+ "!-foo"
+ ]
+ },
+ { "event": "comment", "data": [ " --- " ] },
+ { "event": "commentend", "data": [] },
+ { "event": "comment", "data": [ "foo" ] },
+ { "event": "commentend", "data": [] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/30-cdata_edge-cases.json b/test/Events/30-cdata_edge-cases.json
new file mode 100644
index 0000000..d226f09
--- /dev/null
+++ b/test/Events/30-cdata_edge-cases.json
@@ -0,0 +1,22 @@
+{
+ "name": "CDATA edge-cases",
+ "options": {
+ "parser": {"recognizeCDATA": true}
+ },
+ "html": "<![CDATA><![CDATA[[]]sdaf]]><![CDATA[foo",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "![cdata",
+ "![CDATA"
+ ]
+ },
+ { "event": "cdatastart", "data": [] },
+ { "event": "text", "data": [ "[]]sdaf" ] },
+ { "event": "cdataend", "data": [] },
+ { "event": "cdatastart", "data": [] },
+ { "event": "text", "data": [ "foo" ] },
+ { "event": "cdataend", "data": [] }
+ ]
+}
\ No newline at end of file
diff --git a/test/Events/31-comment_false-ending.json b/test/Events/31-comment_false-ending.json
new file mode 100644
index 0000000..6658428
--- /dev/null
+++ b/test/Events/31-comment_false-ending.json
@@ -0,0 +1,9 @@
+{
+ "name": "Comment false ending",
+ "options": {},
+ "html": "<!-- a-b-> -->",
+ "expected": [
+ { "event": "comment", "data": [ " a-b-> " ] },
+ { "event": "commentend", "data": [] }
+ ]
+}
diff --git a/test/Events/32-script-ending-with-lessthan.json b/test/Events/32-script-ending-with-lessthan.json
new file mode 100644
index 0000000..dcf7690
--- /dev/null
+++ b/test/Events/32-script-ending-with-lessthan.json
@@ -0,0 +1,35 @@
+{
+ "name": "Scripts ending with <",
+ "options": {
+ "handler": {},
+ "parser": {}
+ },
+ "html": "<script><</script>",
+ "expected": [
+ {
+ "event": "opentagname",
+ "data": [
+ "script"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "script",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "<"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "script"
+ ]
+ }
+ ]
+}
diff --git a/test/Feeds/01-rss.js b/test/Feeds/01-rss.js
new file mode 100644
index 0000000..a3aae47
--- /dev/null
+++ b/test/Feeds/01-rss.js
@@ -0,0 +1,34 @@
+exports.name = "RSS (2.0)";
+exports.file = "/RSS_Example.xml";
+exports.expected = {
+ type: "rss",
+ id: "",
+ title: "Liftoff News",
+ link: "http://liftoff.msfc.nasa.gov/",
+ description: "Liftoff to Space Exploration.",
+ updated: new Date("Tue, 10 Jun 2003 09:41:01 GMT"),
+ author: "editor at example.com",
+ items: [{
+ id: "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
+ title: "Star City",
+ link: "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
+ description: "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\">Star City</a>.",
+ pubDate: new Date("Tue, 03 Jun 2003 09:39:21 GMT")
+ }, {
+ id: "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
+ description: "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\">partial eclipse of the Sun</a> on Saturday, May 31st.",
+ pubDate: new Date("Fri, 30 May 2003 11:06:42 GMT")
+ }, {
+ id: "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571",
+ title: "The Engine That Does More",
+ link: "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
+ description: "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.",
+ pubDate: new Date("Tue, 27 May 2003 08:37:32 GMT")
+ }, {
+ id: "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570",
+ title: "Astronauts' Dirty Laundry",
+ link: "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
+ description: "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.",
+ pubDate: new Date("Tue, 20 May 2003 08:56:02 GMT")
+ }]
+};
\ No newline at end of file
diff --git a/test/Feeds/02-atom.js b/test/Feeds/02-atom.js
new file mode 100644
index 0000000..5b5d88e
--- /dev/null
+++ b/test/Feeds/02-atom.js
@@ -0,0 +1,18 @@
+exports.name = "Atom (1.0)";
+exports.file = "/Atom_Example.xml";
+exports.expected = {
+ type: "atom",
+ id: "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6",
+ title: "Example Feed",
+ link: "http://example.org/feed/",
+ description: "A subtitle.",
+ updated: new Date("2003-12-13T18:30:02Z"),
+ author: "johndoe at example.com",
+ items: [{
+ id: "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a",
+ title: "Atom-Powered Robots Run Amok",
+ link: "http://example.org/2003/12/13/atom03",
+ description: "Some content.",
+ pubDate: new Date("2003-12-13T18:30:02Z")
+ }]
+};
diff --git a/test/Feeds/03-rdf.js b/test/Feeds/03-rdf.js
new file mode 100644
index 0000000..d8f92f5
--- /dev/null
+++ b/test/Feeds/03-rdf.js
@@ -0,0 +1,20 @@
+exports.name = "RDF test";
+exports.file = "/RDF_Example.xml";
+exports.expected = {
+ "type": "rdf",
+ "id": "",
+ "title": "A title to parse and remember",
+ "link": "https://github.com/fb55/htmlparser2/",
+ "items": [
+ {
+ "title": "Fast HTML Parsing",
+ "link": "http://somefakesite/path/to/something.html",
+ "description": "Great test content<br>A link: <a href=\"http://github.com\">Github</a>"
+ },
+ {
+ "title": "This space intentionally left blank",
+ "link": "http://somefakesite/path/to/something-else.html",
+ "description": "The early bird gets the worm"
+ }
+ ]
+};
diff --git a/test/Stream/01-basic.json b/test/Stream/01-basic.json
new file mode 100644
index 0000000..e0766e7
--- /dev/null
+++ b/test/Stream/01-basic.json
@@ -0,0 +1,83 @@
+{
+ "name": "Basic html",
+ "options": {},
+ "file": "Basic.html",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "!doctype",
+ "!DOCTYPE html"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "html",
+ {}
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "The Title"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "body"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "body",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Hello world"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "body"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "html"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Stream/02-RSS.json b/test/Stream/02-RSS.json
new file mode 100644
index 0000000..0d5921c
--- /dev/null
+++ b/test/Stream/02-RSS.json
@@ -0,0 +1,1093 @@
+{
+ "name": "RSS feed",
+ "options": {"xmlMode": true},
+ "file": "RSS_Example.xml",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "?xml",
+ "?xml version=\"1.0\"?"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "comment",
+ "data": [
+ " http://cyber.law.harvard.edu/rss/examples/rss2sample.xml "
+ ]
+ },
+ {
+ "event": "commentend",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "rss"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "version",
+ "2.0"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "rss",
+ {
+ "version": "2.0"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "channel"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "channel",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Liftoff News"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Liftoff to Space Exploration."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "language"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "language",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "en-us"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "language"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "pubDate",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Tue, 10 Jun 2003 04:00:00 GMT"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "lastBuildDate"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "lastBuildDate",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Tue, 10 Jun 2003 09:41:01 GMT"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "lastBuildDate"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "docs"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "docs",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://blogs.law.harvard.edu/tech/rss"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "docs"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "generator"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "generator",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Weblog Editor 2.0"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "generator"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "managingEditor"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "managingEditor",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "editor at example.com"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "managingEditor"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "webMaster"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "webMaster",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "webmaster at example.com"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "webMaster"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "item",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Star City"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\">Star City</a>."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "pubDate",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Tue, 03 Jun 2003 09:39:21 GMT"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "guid",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n "
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "item",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\">partial eclipse of the Sun</a> on Saturday, May 31st."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "pubDate",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Fri, 30 May 2003 11:06:42 GMT"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "guid",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n "
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "item",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "The Engine That Does More"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "pubDate",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Tue, 27 May 2003 08:37:32 GMT"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "guid",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n "
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "item",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Astronauts' Dirty Laundry"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "pubDate",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Tue, 20 May 2003 08:56:02 GMT"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "pubDate"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "guid",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "guid"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n "
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n "
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "channel"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "rss"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/Stream/03-Atom.json b/test/Stream/03-Atom.json
new file mode 100644
index 0000000..0cbf24e
--- /dev/null
+++ b/test/Stream/03-Atom.json
@@ -0,0 +1,678 @@
+{
+ "name": "Atom feed",
+ "options": {"xmlMode": true},
+ "file": "Atom_Example.xml",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "?xml",
+ "?xml version=\"1.0\" encoding=\"utf-8\"?"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "comment",
+ "data": [
+ " http://en.wikipedia.org/wiki/Atom_%28standard%29 "
+ ]
+ },
+ {
+ "event": "commentend",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "feed"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns",
+ "http://www.w3.org/2005/Atom"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "feed",
+ {
+ "xmlns": "http://www.w3.org/2005/Atom"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Example Feed"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "subtitle"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "subtitle",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "A subtitle."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "subtitle"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://example.org/feed/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rel",
+ "self"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {
+ "href": "http://example.org/feed/",
+ "rel": "self"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://example.org/"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {
+ "href": "http://example.org/"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "id"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "id",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "id"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "updated"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "updated",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2003-12-13T18:30:02Z"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "updated"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "author"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "author",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "name"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "name",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "John Doe"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "name"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "email"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "email",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "johndoe at example.com"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "email"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "author"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "entry"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "entry",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Atom-Powered Robots Run Amok"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://example.org/2003/12/13/atom03"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {
+ "href": "http://example.org/2003/12/13/atom03"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rel",
+ "alternate"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "type",
+ "text/html"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://example.org/2003/12/13/atom03.html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {
+ "rel": "alternate",
+ "type": "text/html",
+ "href": "http://example.org/2003/12/13/atom03.html"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rel",
+ "edit"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "href",
+ "http://example.org/2003/12/13/atom03/edit"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {
+ "rel": "edit",
+ "href": "http://example.org/2003/12/13/atom03/edit"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "id"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "id",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "id"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "updated"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "updated",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2003-12-13T18:30:02Z"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "updated"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "content"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "type",
+ "html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "content",
+ {
+ "type": "html"
+ }
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "p"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "p",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Some content."
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "p"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "content"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "entry"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "feed"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ }
+ ]
+}
diff --git a/test/Stream/04-RDF.json b/test/Stream/04-RDF.json
new file mode 100644
index 0000000..0150eb8
--- /dev/null
+++ b/test/Stream/04-RDF.json
@@ -0,0 +1,1399 @@
+{
+ "name": "RDF feed",
+ "options": {"xmlMode": true},
+ "file": "RDF_Example.xml",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "?xml",
+ "?xml version=\"1.0\" encoding=\"UTF-8\"?"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "rdf:RDF"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:rdf",
+ "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns",
+ "http://purl.org/rss/1.0/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:ev",
+ "http://purl.org/rss/1.0/modules/event/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:content",
+ "http://purl.org/rss/1.0/modules/content/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:taxo",
+ "http://purl.org/rss/1.0/modules/taxonomy/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:dc",
+ "http://purl.org/dc/elements/1.1/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:syn",
+ "http://purl.org/rss/1.0/modules/syndication/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:dcterms",
+ "http://purl.org/dc/terms/"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "xmlns:admin",
+ "http://webns.net/mvcb/"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "rdf:RDF",
+ {
+ "xmlns:rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+ "xmlns": "http://purl.org/rss/1.0/",
+ "xmlns:ev": "http://purl.org/rss/1.0/modules/event/",
+ "xmlns:content": "http://purl.org/rss/1.0/modules/content/",
+ "xmlns:taxo": "http://purl.org/rss/1.0/modules/taxonomy/",
+ "xmlns:dc": "http://purl.org/dc/elements/1.1/",
+ "xmlns:syn": "http://purl.org/rss/1.0/modules/syndication/",
+ "xmlns:dcterms": "http://purl.org/dc/terms/",
+ "xmlns:admin": "http://webns.net/mvcb/"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "channel"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rdf:about",
+ "https://github.com/fb55/htmlparser2/"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "channel",
+ {
+ "rdf:about": "https://github.com/fb55/htmlparser2/"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "A title to parse and remember"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "https://github.com/fb55/htmlparser2/"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:language"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:language",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "en-us"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:language"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:rights"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:rights",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Copyright 2015 the authors"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:rights"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:publisher"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:publisher",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "webmaster at thisisafakedoma.in"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:publisher"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:creator"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:creator",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "webmaster at thisisafakedoma.in"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:creator"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:source"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:source",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "https://github.com/fb55/htmlparser2/"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:source"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "A title to parse and remember"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:type"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:type",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Collection"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:type"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "syn:updateBase"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "syn:updateBase",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2011-11-04T09:39:10-07:00"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "syn:updateBase"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "syn:updateFrequency"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "syn:updateFrequency",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "4"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "syn:updateFrequency"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "syn:updatePeriod"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "syn:updatePeriod",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "hourly"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "syn:updatePeriod"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "items"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "items",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "rdf:Seq"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "rdf:Seq",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "rdf:li"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rdf:resource",
+ "http://somefakesite/path/to/something.html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "rdf:li",
+ {
+ "rdf:resource": "http://somefakesite/path/to/something.html"
+ }
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "rdf:li"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "rdf:Seq"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "items"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "channel"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rdf:about",
+ "http://somefakesite/path/to/something.html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "item",
+ {
+ "rdf:about": "http://somefakesite/path/to/something.html"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ " Fast HTML Parsing "
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nhttp://somefakesite/path/to/something.html\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nGreat test content<br>A link: <a href=\"http://github.com\">Github</a>\n"
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:date"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:date",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2011-11-04T09:35:17-07:00"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:date"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:language"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:language",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "en-us"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:language"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:rights"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:rights",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Copyright 2015 the authors"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:rights"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:source"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:source",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nhttp://somefakesite/path/to/something.html\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:source"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:title",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ " Fast HTML Parsing "
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:type"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:type",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "text"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:type"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dcterms:issued"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dcterms:issued",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2011-11-04T09:35:17-07:00"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dcterms:issued"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "rdf:about",
+ "http://somefakesite/path/to/something-else.html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "item",
+ {
+ "rdf:about": "http://somefakesite/path/to/something-else.html"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nThis space intentionally left blank\n"
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "link",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nhttp://somefakesite/path/to/something-else.html\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "link"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "description",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nThe early bird gets the worm\n"
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "description"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:date"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:date",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2011-11-04T09:34:54-07:00"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:date"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:language"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:language",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "en-us"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:language"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:rights"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:rights",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Copyright 2015 the authors"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:rights"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:source"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:source",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nhttp://somefakesite/path/to/something-else.html\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:source"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:title",
+ {}
+ ]
+ },
+ {
+ "event": "cdatastart",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\nThis space intentionally left blank\n"
+ ]
+ },
+ {
+ "event": "cdataend",
+ "data": []
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dc:type"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dc:type",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "text"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dc:type"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "dcterms:issued"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "dcterms:issued",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "2011-11-04T09:34:54-07:00"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "dcterms:issued"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "item"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "rdf:RDF"
+ ]
+ }
+ ]
+}
diff --git a/test/Stream/05-Attributes.json b/test/Stream/05-Attributes.json
new file mode 100644
index 0000000..ad364c0
--- /dev/null
+++ b/test/Stream/05-Attributes.json
@@ -0,0 +1,354 @@
+{
+ "name": "Attributes",
+ "options": {},
+ "file": "Attributes.html",
+ "expected": [
+ {
+ "event": "processinginstruction",
+ "data": [
+ "!doctype",
+ "!doctype html"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "html"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "html",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "head"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "head",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "title",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "Attributes test"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "title"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "head"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "body"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "body",
+ {}
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "comment",
+ "data": [
+ " Normal attributes "
+ ]
+ },
+ {
+ "event": "commentend",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "id",
+ "test0"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "value0"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "title",
+ "value1"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "button",
+ {
+ "id": "test0",
+ "class": "value0",
+ "title": "value1"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "class=\"value0\" title=\"value1\""
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n\t"
+ ]
+ },
+ {
+ "event": "comment",
+ "data": [
+ " Attributes with no quotes or value "
+ ]
+ },
+ {
+ "event": "commentend",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "id",
+ "test1"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "value2"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "disabled",
+ ""
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "button",
+ {
+ "id": "test1",
+ "class": "value2",
+ "disabled": ""
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "class=value2 disabled"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\n\t"
+ ]
+ },
+ {
+ "event": "comment",
+ "data": [
+ " Attributes with no space between them. No valid, but accepted by the browser "
+ ]
+ },
+ {
+ "event": "commentend",
+ "data": []
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n\t"
+ ]
+ },
+ {
+ "event": "opentagname",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "id",
+ "test2"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "class",
+ "value4"
+ ]
+ },
+ {
+ "event": "attribute",
+ "data": [
+ "title",
+ "value5"
+ ]
+ },
+ {
+ "event": "opentag",
+ "data": [
+ "button",
+ {
+ "id": "test2",
+ "class": "value4",
+ "title": "value5"
+ }
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "class=\"value4\"title=\"value5\""
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "button"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "body"
+ ]
+ },
+ {
+ "event": "text",
+ "data": [
+ "\n"
+ ]
+ },
+ {
+ "event": "closetag",
+ "data": [
+ "html"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/test/api.js b/test/api.js
new file mode 100644
index 0000000..a6a76ef
--- /dev/null
+++ b/test/api.js
@@ -0,0 +1,103 @@
+var htmlparser2 = require(".."),
+ assert = require("assert");
+
+describe("API", function(){
+
+ it("should load all modules", function(){
+		function assertExported(name, module){
+			//the export is read twice: per the messages, first a load, then a cached lookup
+			assert.strictEqual(htmlparser2[name], module, "should load module");
+			assert.strictEqual(htmlparser2[name], module, "should load it again (cache)");
+		}
+		assertExported("Stream", require("../lib/Stream.js"));
+		assertExported("ProxyHandler", require("../lib/ProxyHandler.js"));
+ });
+
+ it("should work without callbacks", function(){
+ var p = new htmlparser2.Parser(null, {xmlMode: true, lowerCaseAttributeNames: true});
+ //no handler object was passed (null), so none of the following calls may throw
+ p.end("<a foo><bar></a><!-- --><![CDATA[]]]><?foo?><!bar><boo/>boohay");
+ p.write("foo");
+
+ //once the parser has ended, write() and end() must each trigger onerror
+ p.end();
+ var err = false;
+ p._cbs.onerror = function(){ err = true; };
+ p.write("foo");
+ assert(err);
+ err = false;
+ p.end();
+ assert(err);
+
+ p.reset();
+
+ //a callback may be removed (set to null) while a tag is still being written
+ p._cbs.onopentag = function(){};
+ p.write("<a foo");
+ p._cbs.onopentag = null;
+ p.write(">");
+
+ //pause/resume: ontext must not fire while paused, only after resume()
+ var processed = false;
+ p._cbs.ontext = function(t){
+ assert.equal(t, "foo");
+ processed = true;
+ };
+ p.pause();
+ p.write("foo");
+ assert(!processed);
+ p.resume();
+ assert(processed);
+ processed = false;
+ p.pause();
+ assert(!processed);
+ p.resume();
+ assert(!processed); //nothing was written while paused, so resume() delivers nothing
+ p.pause();
+ p.end("foo"); //ending while paused: the text is still held back until resume()
+ assert(!processed);
+ p.resume();
+ assert(processed);
+
+ });
+
+ it("should update the position", function(){
+		var parser = new htmlparser2.Parser(null);
+
+		//a 3-character chunk: expected indices are 0 through 2
+		parser.write("foo");
+		assert.equal(parser.startIndex, 0);
+		assert.equal(parser.endIndex, 2);
+
+		//the 5-character tag is expected to continue at index 3 and end at index 7
+		parser.write("<bar>");
+		assert.equal(parser.startIndex, 3);
+		assert.equal(parser.endIndex, 7);
+ });
+
+ it("should update the position when a single tag is spread across multiple chunks", function(){
+		var parser = new htmlparser2.Parser(null);
+		//"<div foo=bar>" is 13 characters long and arrives in two writes
+		parser.write("<div ");
+		parser.write("foo=bar>");
+		//the expected indices are relative to the start of the first chunk
+		assert.equal(parser.startIndex, 0);
+		assert.equal(parser.endIndex, 12);
+ });
+
+ it("should support custom tokenizer", function(){
+		function CustomTokenizer(options, cbs){
+			htmlparser2.Tokenizer.call(this, options, cbs);
+			return this;
+		}
+		CustomTokenizer.prototype = Object.create(htmlparser2.Tokenizer.prototype);
+		CustomTokenizer.prototype.constructor = CustomTokenizer;
+		//the class passed as the Tokenizer option must be the one the parser instantiates
+		var handler = {
+			onparserinit: function(parser){
+				assert(parser._tokenizer instanceof CustomTokenizer);
+			}
+		};
+		new htmlparser2.Parser(handler, { Tokenizer: CustomTokenizer }).done();
+ });
+});
\ No newline at end of file
diff --git a/test/test-helper.js b/test/test-helper.js
new file mode 100644
index 0000000..3f39bf5
--- /dev/null
+++ b/test/test-helper.js
@@ -0,0 +1,83 @@
+var htmlparser2 = require(".."),
+ fs = require("fs"),
+ path = require("path"),
+ assert = require("assert"),
+ Parser = htmlparser2.Parser,
+ CollectingHandler = htmlparser2.CollectingHandler;
+
+exports.writeToParser = function(handler, options, data){
+	var parser = new Parser(handler, options), position = 0;
+	//pass 1: feed the input one character per write() call, then end the parser
+	while(position < data.length){
+		parser.write(data.charAt(position++));
+	}
+	parser.end();
+	//pass 2: hand the complete input to parseComplete() on the same parser
+	parser.parseComplete(data);
+};
+
+//returns a CollectingHandler: errors go to cb(err), onend calls cb(null, events) with the reduced event list
+exports.getEventCollector = function(cb){
+	var collector = new CollectingHandler({
+		onerror: cb,
+		onend: function onend(){
+			cb(null, collector.events.reduce(eventReducer, []));
+		}
+	});
+	return collector;
+};
+
+//reduce callback: folds one recorded call ([callbackName, ...args]) into the list of {event, data} entries
+function eventReducer(events, arr){
+	var name = arr[0],
+		last = events[events.length - 1];
+
+	//"onerror" and "onend" calls are left out of the result
+	if(name === "onerror" || name === "onend") return events;
+
+	//a text call directly following a "text" entry is appended to that entry
+	if(name === "ontext" && events.length && last.event === "text") last.data[0] += arr[1];
+	else events.push({event: name.substr(2), data: arr.slice(1)});
+	return events;
+}
+
+//returns a callback(err, actual) that checks actual against expected; done() runs from the second passing call on
+function getCallback(expected, done){
+	var calls = 0;
+
+	return function(err, actual){
+		assert.ifError(err);
+		try {
+			assert.deepEqual(expected, actual, "didn't get expected output");
+		} catch(failure){
+			//replace expected/actual on the error with pretty-printed JSON strings
+			failure.expected = JSON.stringify(expected, null, 2);
+			failure.actual = JSON.stringify(actual, null, 2);
+			throw failure;
+		}
+		if(++calls > 1) done();
+	};
+}
+
+//registers a mocha suite called `name` with one test per fixture module found in <root>/<name>
+exports.mochaTest = function(name, root, test){
+	describe(name, function(){
+		var dir = path.join(root, name),
+			visible = /^[^\._]/; //ignore all files with a leading dot or underscore
+
+		var fixtures = fs.readdirSync(dir)
+			.filter(function(entry){
+				return visible.test(entry);
+			})
+			.map(function(entry){
+				return require(path.join(dir, entry));
+			});
+
+		fixtures.forEach(function(file){
+			//every fixture module supplies its own test name and expected output
+			it(file.name, function(done){
+				test(file, getCallback(file.expected, done));
+			});
+		});
+	});
+};
diff --git a/test/unicode.js b/test/unicode.js
new file mode 100644
index 0000000..602b4ca
--- /dev/null
+++ b/test/unicode.js
@@ -0,0 +1,21 @@
+var htmlparser2 = require(".."),
+ assert = require("assert");
+
+describe("WritableStream", function(){
+	//0xE2 0x82 0xAC is the UTF-8 encoding of "€"; the bytes are split across two writes
+	it("should decode fragmented unicode characters", function(){
+		var sawText = false,
+			handler = {
+				ontext: function(text){
+					assert.equal(text, "€");
+					sawText = true;
+				}
+			},
+			stream = new htmlparser2.WritableStream(handler);
+
+		stream.write(new Buffer([0xE2, 0x82]));
+		stream.write(new Buffer([0xAC]));
+		stream.end();
+		assert(sawText);
+	});
+});
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-javascript/node-htmlparser2_new.git
More information about the Pkg-javascript-commits
mailing list