r2345 - in branches: . nekohtml nekohtml/upstream
nekohtml/upstream/0.9.5 nekohtml/upstream/0.9.5/data
nekohtml/upstream/0.9.5/data/html
nekohtml/upstream/0.9.5/data/html/canonical
nekohtml/upstream/0.9.5/doc nekohtml/upstream/0.9.5/doc/html
nekohtml/upstream/0.9.5/src nekohtml/upstream/0.9.5/src/html
nekohtml/upstream/0.9.5/src/html/META-INF
nekohtml/upstream/0.9.5/src/html/META-INF/services
nekohtml/upstream/0.9.5/src/html/org
nekohtml/upstream/0.9.5/src/html/org/cyberneko
nekohtml/upstream/0.9.5/src/html/org/cyberneko/html
nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters
nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers
nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res
nekohtml/upstream/0.9.5/src/html/sample
nekohtml/upstream/0.9.5/src/html/test
Marcus Better
marcusb-guest at costa.debian.org
Tue Aug 22 16:00:32 UTC 2006
Author: marcusb-guest
Date: 2006-08-22 16:00:27 +0000 (Tue, 22 Aug 2006)
New Revision: 2345
Added:
branches/nekohtml/
branches/nekohtml/upstream/
branches/nekohtml/upstream/0.9.5/
branches/nekohtml/upstream/0.9.5/LICENSE
branches/nekohtml/upstream/0.9.5/LICENSE_apache
branches/nekohtml/upstream/0.9.5/README_html
branches/nekohtml/upstream/0.9.5/TODO_html
branches/nekohtml/upstream/0.9.5/build-html-test.xml
branches/nekohtml/upstream/0.9.5/build-html.xml
branches/nekohtml/upstream/0.9.5/build.bat
branches/nekohtml/upstream/0.9.5/data/
branches/nekohtml/upstream/0.9.5/data/html/
branches/nekohtml/upstream/0.9.5/data/html/canonical/
branches/nekohtml/upstream/0.9.5/data/html/canonical/README
branches/nekohtml/upstream/0.9.5/data/html/canonical/test00.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test01.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test02.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test03.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test04.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test05.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test06.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test07.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test08.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test09.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test10.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test100.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test101.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test102.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test103.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test11.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test12.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test13.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test14.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test15.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test16.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test17.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test18.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test19.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test20.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test21.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test22.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test23.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test24.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test25.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test26.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test27.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test28.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test29.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test30.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test31.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test32.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test33.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test34.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test35.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test36.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test37.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test38.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test39.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test40.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test41.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test42.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test43.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test44.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test45.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test46.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test47.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test48.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test49.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test50.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test51.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test52.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test53.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test54.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test55.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test56.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test57.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test58.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test59.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test60.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test61.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test62.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test63.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test64.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test65.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test66.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test67.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test68.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test69.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test70.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test71.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test72.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test73.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test74.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test75.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test76.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test77.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test78.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test79.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test80.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test81.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test82.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test83.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test84.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test85.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test86.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test87.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test88.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test89.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test90.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test91.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test92.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test93.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test94.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test95.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test96.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test97.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test98.html
branches/nekohtml/upstream/0.9.5/data/html/canonical/test99.html
branches/nekohtml/upstream/0.9.5/data/html/test00.html
branches/nekohtml/upstream/0.9.5/data/html/test01.html
branches/nekohtml/upstream/0.9.5/data/html/test02.html
branches/nekohtml/upstream/0.9.5/data/html/test03.html
branches/nekohtml/upstream/0.9.5/data/html/test04.html
branches/nekohtml/upstream/0.9.5/data/html/test05.html
branches/nekohtml/upstream/0.9.5/data/html/test06.html
branches/nekohtml/upstream/0.9.5/data/html/test07.html
branches/nekohtml/upstream/0.9.5/data/html/test08.html
branches/nekohtml/upstream/0.9.5/data/html/test09.html
branches/nekohtml/upstream/0.9.5/data/html/test10.html
branches/nekohtml/upstream/0.9.5/data/html/test100.html
branches/nekohtml/upstream/0.9.5/data/html/test101.html
branches/nekohtml/upstream/0.9.5/data/html/test102.html
branches/nekohtml/upstream/0.9.5/data/html/test102.html.settings
branches/nekohtml/upstream/0.9.5/data/html/test103.html
branches/nekohtml/upstream/0.9.5/data/html/test11.html
branches/nekohtml/upstream/0.9.5/data/html/test12.html
branches/nekohtml/upstream/0.9.5/data/html/test13.html
branches/nekohtml/upstream/0.9.5/data/html/test14.html
branches/nekohtml/upstream/0.9.5/data/html/test15.html
branches/nekohtml/upstream/0.9.5/data/html/test16.html
branches/nekohtml/upstream/0.9.5/data/html/test17.html
branches/nekohtml/upstream/0.9.5/data/html/test18.html
branches/nekohtml/upstream/0.9.5/data/html/test19.html
branches/nekohtml/upstream/0.9.5/data/html/test20.html
branches/nekohtml/upstream/0.9.5/data/html/test21.html
branches/nekohtml/upstream/0.9.5/data/html/test22.html
branches/nekohtml/upstream/0.9.5/data/html/test23.html
branches/nekohtml/upstream/0.9.5/data/html/test24.html
branches/nekohtml/upstream/0.9.5/data/html/test25.html
branches/nekohtml/upstream/0.9.5/data/html/test26.html
branches/nekohtml/upstream/0.9.5/data/html/test27.html
branches/nekohtml/upstream/0.9.5/data/html/test28.html
branches/nekohtml/upstream/0.9.5/data/html/test29.html
branches/nekohtml/upstream/0.9.5/data/html/test30.html
branches/nekohtml/upstream/0.9.5/data/html/test31.html
branches/nekohtml/upstream/0.9.5/data/html/test32.html
branches/nekohtml/upstream/0.9.5/data/html/test33.html
branches/nekohtml/upstream/0.9.5/data/html/test34.html
branches/nekohtml/upstream/0.9.5/data/html/test35.html
branches/nekohtml/upstream/0.9.5/data/html/test36.html
branches/nekohtml/upstream/0.9.5/data/html/test37.html
branches/nekohtml/upstream/0.9.5/data/html/test38.html
branches/nekohtml/upstream/0.9.5/data/html/test39.html
branches/nekohtml/upstream/0.9.5/data/html/test40.html
branches/nekohtml/upstream/0.9.5/data/html/test41.html
branches/nekohtml/upstream/0.9.5/data/html/test42.html
branches/nekohtml/upstream/0.9.5/data/html/test43.html
branches/nekohtml/upstream/0.9.5/data/html/test44.html
branches/nekohtml/upstream/0.9.5/data/html/test45.html
branches/nekohtml/upstream/0.9.5/data/html/test46.html
branches/nekohtml/upstream/0.9.5/data/html/test47.html
branches/nekohtml/upstream/0.9.5/data/html/test48.html
branches/nekohtml/upstream/0.9.5/data/html/test49.html
branches/nekohtml/upstream/0.9.5/data/html/test50.html
branches/nekohtml/upstream/0.9.5/data/html/test51.html
branches/nekohtml/upstream/0.9.5/data/html/test52.html
branches/nekohtml/upstream/0.9.5/data/html/test53.html
branches/nekohtml/upstream/0.9.5/data/html/test54.html
branches/nekohtml/upstream/0.9.5/data/html/test55.html
branches/nekohtml/upstream/0.9.5/data/html/test56.html
branches/nekohtml/upstream/0.9.5/data/html/test57.html
branches/nekohtml/upstream/0.9.5/data/html/test58.html
branches/nekohtml/upstream/0.9.5/data/html/test59.html
branches/nekohtml/upstream/0.9.5/data/html/test60.html
branches/nekohtml/upstream/0.9.5/data/html/test61.html
branches/nekohtml/upstream/0.9.5/data/html/test62.html
branches/nekohtml/upstream/0.9.5/data/html/test63.html
branches/nekohtml/upstream/0.9.5/data/html/test64.html
branches/nekohtml/upstream/0.9.5/data/html/test65.html
branches/nekohtml/upstream/0.9.5/data/html/test66.html
branches/nekohtml/upstream/0.9.5/data/html/test67.html
branches/nekohtml/upstream/0.9.5/data/html/test68.html
branches/nekohtml/upstream/0.9.5/data/html/test69.html
branches/nekohtml/upstream/0.9.5/data/html/test70.html
branches/nekohtml/upstream/0.9.5/data/html/test71.html
branches/nekohtml/upstream/0.9.5/data/html/test72.html
branches/nekohtml/upstream/0.9.5/data/html/test73.html
branches/nekohtml/upstream/0.9.5/data/html/test74.html
branches/nekohtml/upstream/0.9.5/data/html/test75.html
branches/nekohtml/upstream/0.9.5/data/html/test76.html
branches/nekohtml/upstream/0.9.5/data/html/test77.html
branches/nekohtml/upstream/0.9.5/data/html/test78.html
branches/nekohtml/upstream/0.9.5/data/html/test79.html
branches/nekohtml/upstream/0.9.5/data/html/test80.html
branches/nekohtml/upstream/0.9.5/data/html/test81.html
branches/nekohtml/upstream/0.9.5/data/html/test82.html
branches/nekohtml/upstream/0.9.5/data/html/test83.html
branches/nekohtml/upstream/0.9.5/data/html/test84.html
branches/nekohtml/upstream/0.9.5/data/html/test85.html
branches/nekohtml/upstream/0.9.5/data/html/test86.html
branches/nekohtml/upstream/0.9.5/data/html/test87.html
branches/nekohtml/upstream/0.9.5/data/html/test88.html
branches/nekohtml/upstream/0.9.5/data/html/test89.html
branches/nekohtml/upstream/0.9.5/data/html/test90.html
branches/nekohtml/upstream/0.9.5/data/html/test91.html
branches/nekohtml/upstream/0.9.5/data/html/test92.html
branches/nekohtml/upstream/0.9.5/data/html/test93.html
branches/nekohtml/upstream/0.9.5/data/html/test94.html
branches/nekohtml/upstream/0.9.5/data/html/test95.html
branches/nekohtml/upstream/0.9.5/data/html/test95.html.settings
branches/nekohtml/upstream/0.9.5/data/html/test96.html
branches/nekohtml/upstream/0.9.5/data/html/test97.html
branches/nekohtml/upstream/0.9.5/data/html/test97.html.settings
branches/nekohtml/upstream/0.9.5/data/html/test98.html
branches/nekohtml/upstream/0.9.5/data/html/test98.html.settings
branches/nekohtml/upstream/0.9.5/data/html/test99.html
branches/nekohtml/upstream/0.9.5/doc/
branches/nekohtml/upstream/0.9.5/doc/html/
branches/nekohtml/upstream/0.9.5/doc/html/.htaccess
branches/nekohtml/upstream/0.9.5/doc/html/changes.html
branches/nekohtml/upstream/0.9.5/doc/html/faq.html
branches/nekohtml/upstream/0.9.5/doc/html/filters.html
branches/nekohtml/upstream/0.9.5/doc/html/index.html
branches/nekohtml/upstream/0.9.5/doc/html/settings.html
branches/nekohtml/upstream/0.9.5/doc/html/software.html
branches/nekohtml/upstream/0.9.5/doc/html/usage.html
branches/nekohtml/upstream/0.9.5/doc/style.css
branches/nekohtml/upstream/0.9.5/src/
branches/nekohtml/upstream/0.9.5/src/html/
branches/nekohtml/upstream/0.9.5/src/html/META-INF/
branches/nekohtml/upstream/0.9.5/src/html/META-INF/services/
branches/nekohtml/upstream/0.9.5/src/html/META-INF/services/org.apache.xerces.xni.parser.XMLParserConfiguration
branches/nekohtml/upstream/0.9.5/src/html/org/
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLAugmentations.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLComponent.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLConfiguration.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLElements.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEntities.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLErrorReporter.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEventInfo.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLScanner.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLTagBalancer.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/ObjectFactory.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport12.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/DefaultFilter.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/ElementRemover.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Identity.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/NamespaceBinder.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Purifier.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Writer.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMFragmentParser.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMParser.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/SAXParser.java
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/ErrorMessages.properties
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLlat1.properties
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLspecial.properties
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLsymbol.properties
branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/XMLbuiltin.properties
branches/nekohtml/upstream/0.9.5/src/html/sample/
branches/nekohtml/upstream/0.9.5/src/html/sample/HTMLSAXParser.java
branches/nekohtml/upstream/0.9.5/src/html/sample/RemoveElements.java
branches/nekohtml/upstream/0.9.5/src/html/sample/Script.java
branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOM.java
branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOMFragment.java
branches/nekohtml/upstream/0.9.5/src/html/test/
branches/nekohtml/upstream/0.9.5/src/html/test/Tester.java
branches/nekohtml/upstream/0.9.5/src/html/test/UTF8BOMSkipper.java
branches/nekohtml/upstream/0.9.5/src/html/test/Writer.java
Log:
Imported upstream sources.
Added: branches/nekohtml/upstream/0.9.5/LICENSE
===================================================================
--- branches/nekohtml/upstream/0.9.5/LICENSE 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/LICENSE 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,47 @@
+The CyberNeko Software License, Version 1.0
+
+
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+3. The end-user documentation included with the redistribution,
+ if any, must include the following acknowledgment:
+ "This product includes software developed by Andy Clark."
+ Alternately, this acknowledgment may appear in the software itself,
+ if and wherever such third-party acknowledgments normally appear.
+
+4. The names "CyberNeko" and "NekoHTML" must not be used to endorse
+ or promote products derived from this software without prior
+ written permission. For written permission, please contact
+ andyc at cyberneko.net.
+
+5. Products derived from this software may not be called "CyberNeko",
+ nor may "CyberNeko" appear in their name, without prior written
+ permission of the author.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR OTHER CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+====================================================================
+
+This license is based on the Apache Software License, version 1.1.
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/LICENSE_apache
===================================================================
--- branches/nekohtml/upstream/0.9.5/LICENSE_apache 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/LICENSE_apache 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,56 @@
+/*
+ * The Apache Software License, Version 1.1
+ *
+ *
+ * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Xerces" and "Apache Software Foundation" must
+ * not be used to endorse or promote products derived from this
+ * software without prior written permission. For written
+ * permission, please contact apache at apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * nor may "Apache" appear in their name, without prior written
+ * permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation and was
+ * originally based on software copyright (c) 1999, International
+ * Business Machines, Inc., http://www.ibm.com. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
Added: branches/nekohtml/upstream/0.9.5/README_html
===================================================================
--- branches/nekohtml/upstream/0.9.5/README_html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/README_html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,28 @@
+NekoHTML Readme
+===============
+
+Building
+--------
+
+You can build NekoHTML with the supplied build.bat file by
+typing the following:
+
+ > build -f build-html.xml (target ...)
+
+The default target will build the entire package.
+
+To build NekoHTML, you need Ant and Xerces2. The first time
+you try to build, it will tell you what is required and where to
+download the necessary packages.
+
+Documentation
+-------------
+
+The documentation for NekoHTML is located at the following URL:
+
+ doc/html/index.html
+
+Contact Information
+-------------------
+
+Andy Clark <andyc at apache.org>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/TODO_html
===================================================================
--- branches/nekohtml/upstream/0.9.5/TODO_html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/TODO_html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+TODO:
+ * scanner
+ * optimize (more)
+ * tag balancer
+ * optimize (more)
+ * incorporate ideas from JTidy
Added: branches/nekohtml/upstream/0.9.5/build-html-test.xml
===================================================================
--- branches/nekohtml/upstream/0.9.5/build-html-test.xml 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/build-html-test.xml 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+<project default='test'>
+ <taskdef name='tester' classname='test.Tester'/>
+ <target name='test'>
+ <tester canondir='data/html/canonical' outputdir='data/html/output'>
+ <fileset dir='data/html' includes='test*.html'/>
+ </tester>
+ </target>
+</project>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/build-html.xml
===================================================================
--- branches/nekohtml/upstream/0.9.5/build-html.xml 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/build-html.xml 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,186 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!-- $Id: build-html.xml,v 1.22 2005/05/27 04:03:31 andyc Exp $ -->
+<project default='all' basedir='.'>
+
+ <!-- PROPERTIES -->
+ <property name='version' value='0.9.5'/>
+ <property name='name' value='nekohtml'/>
+ <property name='fullname' value='${name}-${version}'/>
+ <property name='Title' value='NekoHTML'/>
+ <property name='FullTitle' value='CyberNeko HTML Parser'/>
+ <property name='Name' value='${Title} ${version}'/>
+ <property name='author' value='Andy Clark'/>
+ <property name='copyright' value='(C) Copyright 2002-2005, ${author}. All rights reserved.'/>
+ <property name='URL' value='http://www.apache.org/~andyc/neko/doc/html/index.html'/>
+
+ <property name='jarfile' value='${name}.jar'/>
+ <property name='jarfileXni' value='${name}Xni.jar'/>
+ <property name='jarfileSamples' value='${name}Samples.jar'/>
+ <property name='jarfileTest' value='${name}Test.jar'/>
+
+ <property name='zipfile' value='${fullname}.zip'/>
+ <property name='tarfile' value='${fullname}.tar'/>
+ <property name='tgzfile' value='${fullname}.tar.gz'/>
+
+ <property name='contents.misc' value='LICENSE,LICENSE_apache,README_html,TODO_html,build.bat,build-html.xml,build-html-test.xml'/>
+ <property name='contents.jars' value='${jarfile},${jarfileXni},${jarfileSamples}'/>
+ <property name='contents.source' value='src/html/META-INF/**,src/html/**/*.java,src/html/**/*.properties'/>
+ <property name='contents.docs' value='doc/style.css,doc/html/**,data/html/test*.html*,data/html/canonical/*'/>
+ <property name='contents.libs' value='lib/xercesMinimal.jar'/>
+ <property name='contents'
+ value='${contents.misc},${contents.jars},${contents.source},${contents.docs},${contents.libs}'/>
+
+ <property name='package' value='org.cyberneko.html'/>
+
+ <property name='version.dir' value='bin/html/src'/>
+ <property name='version.manifest' value='${version.dir}/MANIFEST_html'/>
+ <property name='version.dir.java' value='${version.dir}/org/cyberneko/html'/>
+ <property name='version.java' value='${version.dir.java}/Version.java'/>
+
+ <!-- TARGETS -->
+ <target name='universe' depends='full,all'/>
+
+ <target name='all' depends='zip,tgz'/>
+
+ <target name='full'>
+ <property name='contents.full' value='lib/**'/>
+ </target>
+
+ <target name='compile' depends='version'>
+ <mkdir dir="bin/html"/>
+ <javac srcdir='src/html' destdir='bin/html' includes='org/**,sample/**' debug='true'/>
+ <javac srcdir='bin/html/src' destdir='bin/html' includes='org/**'/>
+ </target>
+
+ <target name='jar' depends='compile'>
+ <copy todir='bin/html'>
+ <fileset dir='.' includes='LICENSE'/>
+ <fileset dir='src/html' includes='**/*.properties'/>
+ </copy>
+ <jar jarfile='${jarfile}' basedir='bin/html'
+ manifest='${version.manifest}'
+ includes='LICENSE,org/**/*.class,org/**/*.properties'/>
+ <jar jarfile='${jarfileSamples}' basedir='bin/html'
+ includes='LICENSE,sample/**'/>
+ </target>
+
+ <target name='jar-xni'>
+ <mkdir dir="bin/html"/>
+ <copy todir='bin/html'>
+ <fileset dir='.' includes='LICENSE'/>
+ <fileset dir='src/html' includes='META-INF/services/**'/>
+ </copy>
+ <jar jarfile='${jarfileXni}' basedir='bin/html'
+ includes='LICENSE,META-INF/services/**'/>
+ </target>
+
+ <target name='package' depends='jar,doc'>
+ <mkdir dir='bin/package/${fullname}'/>
+ <copy todir='bin/package/${fullname}'>
+ <fileset dir='.' includes='${contents},${contents.full}'/>
+ <fileset dir='bin' includes='${contents.jars}'/>
+ </copy>
+ </target>
+
+ <target name='package-nodir'>
+ <mkdir dir='bin/package-${name}'/>
+ <copy todir='bin/package-${name}'>
+ <fileset dir='bin/package/${fullname}' includes='**'/>
+ </copy>
+ </target>
+
+ <target name='zip' depends='package'>
+ <zip zipfile='${zipfile}' basedir='bin/package' includes='${fullname}/**'/>
+ </target>
+
+ <target name='tgz' depends='package'>
+ <tar tarfile='${tarfile}' basedir='bin/package' includes='${fullname}/**'/>
+ <gzip zipfile='${tgzfile}' src='${tarfile}'/>
+ <delete file='${tarfile}'/>
+ </target>
+
+ <target name='doc' unless='docs.done'>
+ <delete dir='doc/html/javadoc'/>
+ <mkdir dir='doc/html/javadoc'/>
+ <javadoc packagenames='${package},${package}.parsers,${package}.filters'
+ sourcepath='src/html' destdir='doc/html/javadoc'
+ author='true' version='true' use='true'
+ windowtitle="${Name} Implementation"
+ doctitle="${Name}"
+ bottom="${copyright}"
+ />
+ <property name='docs.done' value='true'/>
+ </target>
+
+ <target name='version-init'>
+ <mkdir dir='${version.dir.java}/'/>
+ <dependset>
+ <srcfilelist dir='.' files='build-html.xml'/>
+ <targetfilelist dir='.' files='${version.manifest},${version.java}'/>
+ </dependset>
+ <available property='available.version' file='${version.java}'/>
+ </target>
+
+ <target name='version' depends='version-init' unless='available.version'>
+ <echo message='Generating ${version.java}'/>
+ <echo file='${version.java}'>/* ${copyright} */
+
+package org.cyberneko.html;
+
+/**
+ * This class holds version information for the ${FullTitle}.
+ *
+ * @author ${author}
+ */
+public class Version {
+
+ /** Returns the version string. */
+ public static String getVersion() { return "${Name}"; }
+
+ /** Prints the version string to standard output. */
+ public static void main(String[] argv) {
+ System.out.println(getVersion());
+ } // main(String[])
+
+} // class Version</echo>
+ <echo message='Generating ${version.manifest}'/>
+ <echo file='${version.manifest}'>
+Name: org/cyberneko/html/
+Implementation-Title: ${FullTitle}
+Implementation-Version: ${version}
+Implementation-Vendor: ${author}
+Implementation-URL: ${URL}
+Specification-Title: Hyper-Text Markup Language (HTML)
+Specification-Vendor: World Wide Web Consortium (W3C)
+Specification-Version: 4.01
+
+</echo>
+ </target>
+
+ <target name='test' depends='jar'>
+ <javac srcdir='src/html' destdir='bin/html' includes='test/**'/>
+ <jar jarfile='${jarfileTest}' basedir='bin/html'
+ includes='test/**/*.class,test/**/*.properties'/>
+ <mkdir dir='data/html/output'/>
+ <java classname='org.apache.tools.ant.Main' fork='true' failonerror='true'>
+ <classpath>
+ <pathelement path='${java.class.path}'/>
+ <pathelement location='${jarfile}'/>
+ <pathelement location='${jarfileTest}'/>
+ </classpath>
+ <arg value='-f'/>
+ <arg value='build-html-test.xml'/>
+ </java>
+ </target>
+
+ <target name='clean'>
+ <delete dir='bin/html' quiet='true'/>
+ <delete dir='doc/html/javadoc' quiet='true'/>
+ <delete quiet='true'>
+ <fileset dir='.' includes='${name}*.jar,${name}*.zip,${name}*.tar.gz'/>
+ </delete>
+ <delete dir='bin/package' quiet='true'/>
+ <delete dir='bin/package-${name}' quiet='true'/>
+ </target>
+
+</project>
Added: branches/nekohtml/upstream/0.9.5/build.bat
===================================================================
--- branches/nekohtml/upstream/0.9.5/build.bat 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/build.bat 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,40 @@
+@echo off
+if "%JAVA_HOME%" == "" goto error
+if not exist lib\xalan.jar goto requirements
+set LOCALCLASSPATH=%JAVA_HOME%\lib\tools.jar;%JAVA_HOME%\jre\lib\rt.jar;lib\xml-apis.jar;lib\xalan.jar;lib\xercesImpl.jar;lib\ant.jar;lib\ant-launcher.jar;lib\jing.jar;lib\junit.jar
+"%JAVA_HOME%\bin\java" -classpath "%LOCALCLASSPATH%" org.apache.tools.ant.Main %1 %2 %3 %4 %5
+goto end
+:error
+echo error: JAVA_HOME not found in your environment.
+goto end
+:requirements
+echo error: Missing required jar files.
+echo.
+echo The Ant tool is required. Download Ant from the following URL
+echo http://jakarta.apache.org/ant/index.html and place the ant.jar
+echo file in the lib/ directory.
+echo.
+echo Please download Xalan2 from http://xml.apache.org/dist/xalan-j/
+echo and place the following files in the lib/ directory:
+echo.
+echo xml-apis.jar
+echo xalan.jar
+echo xercesImpl.jar
+echo.
+echo Please download Xerces2 from http://xml.apache.org/dist/xerces-j/
+echo and place the following files in the lib/ directory:
+echo.
+echo xercesSamples.jar
+echo.
+echo If building ManekiNeko, James Clark's Jing Relax NG validator
+echo is also required. Please download the Jar file distribution
+echo from http://www.thaiopensource.com/relaxng/jing.html and place
+echo the following file in the lib/ directory:
+echo.
+echo jing.jar
+echo.
+if not exist lib md lib
+goto end
+:end
+set LOCALCLASSPATH=
+@echo on
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/README
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/README 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/README 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+The "canonical" output of the files in this directory is in
+a modified NSGMLS format, as described below. Each piece
+of information is conveyed on a separate line, encoded in
+UTF-8.
+
+ startElement ::= '(' name
+ attribute ::= 'A' name ' ' value
+ endElement ::= ')' name
+ characters ::= '"' text
+ comment ::= '#' text
+
+ text ::= Unicode chars, with tab, carriage return, and
+ newline escaped as \t, \r, and \n, respectively.
+
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test00.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test00.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test00.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test01.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test01.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test01.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"Just text.
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test02.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test02.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test02.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(H1
+"Header
+)H1
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test03.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test03.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test03.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(P
+"Paragraph text.
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test04.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test04.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test04.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,10 @@
+(HTML
+(BODY
+(P
+"Paragraph text.\n
+)P
+(P
+"Additional text.
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test05.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test05.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test05.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(HEAD
+(SCRIPT
+Atype text/javascript
+"\n This is a <b>test</b>. Don't go crazy! </i>\n
+)SCRIPT
+)HEAD
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test06.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test06.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test06.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,17 @@
+(HTML
+(BODY
+"This
+(I
+"is
+(B
+Aclass test
+"unbalanced
+)B
+)I
+(B
+Aclass test
+" content
+)B
+", dude!
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test07.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test07.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test07.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,15 @@
+(HTML
+(BODY
+(TABLE
+(TR
+(TD
+Anowrap
+)TD
+(TD
+Aalign middle
+Anowrap
+)TD
+)TR
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test08.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test08.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test08.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(HEAD
+(FOOBAR
+"Text
+)FOOBAR
+)HEAD
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test09.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test09.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test09.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+(HTML
+(HEAD
+(META
+Acontent text/html; charset=utf-8
+Ahttp-equiv content-type
+)META
+"\n
+)HEAD
+(BODY
+(H1
+"アニメ
+)H1
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test10.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test10.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test10.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+(HTML
+(HEAD
+(META
+Acontent text/html; charset=utf-16
+Ahttp-equiv content-type
+)META
+"\n
+)HEAD
+(BODY
+(H1
+"アニメ(LE)
+)H1
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test100.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test100.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test100.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+(HTML
+(BODY
+(P
+"\n
+(A
+Ahref link.htm
+(H3
+"Header
+)H3
+)A
+"\n
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test101.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test101.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test101.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,4 @@
+?base http://foo.bar
+?tags :noads:
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test102.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test102.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test102.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+(HTML
+(HEAD
+(META
+Acontent text/html;charset=iso-8859-1
+Ahttp-equiv content-type
+)META
+"\n
+)HEAD
+(BODY
+"â
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test103.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test103.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test103.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+(HTML
+A{http://www.w3.org/2000/xmlns/}xmlns:A NSa
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test11.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test11.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test11.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+(HTML
+(HEAD
+(META
+Acontent text/html; charset=utf-16
+Ahttp-equiv content-type
+)META
+"\n
+)HEAD
+(BODY
+(H1
+"アニメ(BE)
+)H1
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test12.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test12.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test12.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,37 @@
+(HTML
+(BODY
+(TABLE
+"\n
+(TR
+"\n
+(TD
+"\n
+(TABLE
+"\n
+(TR
+"\n
+(TD
+"cell 1
+)TD
+"\n
+(TD
+"\n
+)TD
+)TR
+)TABLE
+"\n
+)TD
+"\n
+)TR
+"\n
+(TR
+"\n
+(TD
+"cell 2
+)TD
+"\n
+)TR
+"\n
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test13.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test13.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test13.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,23 @@
+(HTML
+(BODY
+(UL
+"\n
+(LI
+"One\n
+)LI
+(LI
+"Two\n
+(UL
+"\n
+(LI
+"Two.One\n
+)LI
+)UL
+"\n
+)LI
+(LI
+"Three\n
+)LI
+)UL
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test14.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test14.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test14.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,30 @@
+(HTML
+(BODY
+(TABLE
+"\n
+(TR
+"\n
+(TD
+"\n
+(OBJECT
+"\n
+(PARAM
+)PARAM
+"\n
+(EMBED
+"\n
+)EMBED
+"\n
+(NOEMBED
+"\n
+)NOEMBED
+"\n
+)OBJECT
+"\n
+)TD
+"\n
+)TR
+"\n
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test15.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test15.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test15.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+(A
+Ahref http://example.com/cgi-bin/redirect?s=www.candy.com&u=Andy
+"M & Ms
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test16.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test16.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test16.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"&unknown1; & &unknown2;
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test17.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test17.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test17.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+(HTML
+(BODY
+(TABLE
+"\n
+(TR
+(TD
+(INPUT
+Atype text
+)INPUT
+)TD
+)TR
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test18.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test18.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test18.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+(A
+Ahref /path/
+"blah
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test19.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test19.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test19.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(IMG
+Asrc me.gif
+)IMG
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test20.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test20.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test20.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+(SPAN
+Aclass note
+"Look Out!
+)SPAN
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test21.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test21.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test21.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"M & M
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test22.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test22.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test22.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"&foo;
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test23.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test23.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test23.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"<
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test24.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test24.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test24.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+(HTML
+(BODY
+(A
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test25.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test25.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test25.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test26.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test26.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test26.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test27.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test27.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test27.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test28.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test28.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test28.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"< =
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test29.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test29.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test29.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"&#foo;
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test30.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test30.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test30.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+(A
+Ahref /cgi-bin/myscript
+"happy
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test31.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test31.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test31.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+(A
+Ahref /broken/
+"Too Much to Ask
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test32.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test32.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test32.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,9 @@
+(HTML
+(BODY
+(IMG
+Aalt this ain't a real quote
+Asrc aint.gif
+)IMG
+"\n"this & that" ™\n -- Andy <andyc@apache.org>
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test33.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test33.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test33.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(HEAD
+(SCRIPT
+Atype text/x-nekoscript
+"\n(h1\n"Header\n)h1\n
+)SCRIPT
+)HEAD
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test34.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test34.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test34.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+(HTML
+(BODY
+(LI
+"Item1
+(UL
+(LI
+"Item2
+)LI
+)UL
+)LI
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test35.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test35.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test35.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,31 @@
+(HTML
+(BODY
+(TABLE
+"\n
+(TR
+"\n
+(TH
+"foo\n
+)TH
+(TH
+"bar\n
+)TH
+(TH
+"baz\n
+)TH
+)TR
+(TR
+"\n
+(TD
+"foo\n
+)TD
+(TD
+"bar\n
+)TD
+(TD
+"baz\n
+)TD
+)TR
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test36.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test36.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test36.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+(HTML
+(BODY
+(A
+Aname foo
+)A
+(P
+(A
+Aname foo
+"Blah
+)A
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test37.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test37.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test37.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,23 @@
+(HTML
+(BODY
+(TABLE
+"\n
+(TR
+"\n
+(TD
+"Alpha\n
+)TD
+)TR
+)TABLE
+"\n
+(TABLE
+"\n
+(TR
+"\n
+(TD
+"Beta\n
+)TD
+)TR
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test38.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test38.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test38.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,27 @@
+(HTML
+"\n
+(BODY
+"\n
+(P
+"Here we go!
+(A
+Ahref http://bigidea.com/
+"Bob
+)A
+"
+(BR
+)BR
+"\n and
+(A
+Ahref http://larryboy.com/
+" Larry
+)A
+"\n and friends
+(A
+Ahref http://google.com/
+"Google
+)A
+"\n
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test39.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test39.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test39.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,15 @@
+(HTML
+(BODY
+(H1
+"Header1
+)H1
+"\n
+(SCRIPT
+"\ndocument.write('ABC');\n<!--\ndocument.write("<script>document.write('Hello, World')
+)SCRIPT
+"");\n//-->\ndocument.write('XYZ');\n\n
+(H2
+"Header2
+)H2
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test40.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test40.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test40.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,26 @@
+(HTML
+(BODY
+(TABLE
+"\n
+(COL
+)COL
+"\n
+(COL
+Astyle text-align: right
+)COL
+"\n
+(TR
+"\n
+(TD
+"This
+)TD
+"\n
+(TD
+"That
+)TD
+"\n
+)TR
+"\n
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test41.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test41.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test41.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,18 @@
+(HTML
+(HEAD
+(TITLE
+"foo
+)TITLE
+)HEAD
+(BODY
+"\n
+(FORM
+Aid form1
+"\n
+(ISINDEX
+Aprompt enterSomeText
+)ISINDEX
+"\n
+)FORM
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test42.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test42.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test42.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,9 @@
+(HTML
+(HEAD
+(SCRIPT
+)SCRIPT
+(TITLE
+"Title
+)TITLE
+)HEAD
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test43.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test43.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test43.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,17 @@
+(HTML
+"\n
+(BODY
+"\n
+(P
+"\n
+(BUTTON
+" a button
+)BUTTON
+"
+(BR
+)BR
+"\n
+)P
+"\n
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test44.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test44.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test44.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+?target
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test45.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test45.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test45.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+?target data
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test46.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test46.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test46.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+?target data\t
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test47.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test47.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test47.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+?php print "Hello, World.\\n";\n
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test48.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test48.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test48.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(ISINDEX
+Aprompt enterSomeText
+)ISINDEX
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test49.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test49.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test49.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+!
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test50.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test50.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test50.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+(HTML
+(BODY
+(A
+Ahref foo
+)A
+(P
+(A
+Ahref foo
+"Blah
+)A
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test51.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test51.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test51.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,16 @@
+(HTML
+(BODY
+(UL
+"\n
+(LI
+"Item 1\n
+(P
+"Paragraph\n
+)P
+)LI
+(LI
+"Item 2\n
+)LI
+)UL
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test52.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test52.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test52.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test53.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test53.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test53.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,19 @@
+(HTML
+"\n
+(BODY
+"\n
+(FORM
+"\n
+(SPAN
+Aid span1
+(SPAN
+Aid span2
+(SELECT
+)SELECT
+)SPAN
+)SPAN
+"\n
+)FORM
+"\n
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test54.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test54.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test54.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,10 @@
+(HTML
+(BODY
+(P
+(FORM
+(P
+)P
+)FORM
+)P
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test55.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test55.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test55.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+(FONT
+(SELECT
+)SELECT
+)FONT
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test56.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test56.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test56.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+(HTML
+(BODY
+(FORM
+(CENTER
+(SELECT
+(OPTION
+)OPTION
+)SELECT
+)CENTER
+)FORM
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test57.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test57.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test57.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"Outside content
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test58.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test58.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test58.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(TEXTAREA
+"&
+)TEXTAREA
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test59.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test59.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test59.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(HEAD
+(SCRIPT
+"&
+)SCRIPT
+)HEAD
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test60.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test60.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test60.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,10 @@
+(HTML
+(BODY
+(A
+Ahref foo
+(FONT
+"text
+)FONT
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test61.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test61.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test61.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,23 @@
+(HTML
+"\n
+(HEAD
+(META
+Acontent no-cache
+Ahttp-equiv Pragma
+)META
+"\n\n
+(TITLE
+"Title
+)TITLE
+"\n
+(META
+Acontent text/html; charset=iso-8859-1
+Ahttp-equiv Content-Type
+)META
+"\n
+)HEAD
+"\n
+(BODY
+"\n\n
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test62.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test62.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test62.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+(HTML
+(HEAD
+"\n
+)HEAD
+(BODY
+(SPAN
+Aid cc
+Astyle behavior:url(#default#clientCaps)
+)SPAN
+"\n
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test63.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test63.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test63.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,16 @@
+(HTML
+(BODY
+(TABLE
+(TR
+(TD
+(TABLE
+(TR
+(TD
+)TD
+)TR
+)TABLE
+)TD
+)TR
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test64.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test64.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test64.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+Abgcolor white
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test65.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test65.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test65.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+!
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test66.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test66.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test66.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+!HTML
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test67.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test67.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test67.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,4 @@
+!HTML
+ppublic_id
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test68.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test68.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test68.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+!HTML
+ppublic_id
+ssystem_id
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test69.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test69.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test69.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+!HTML
+ppublic_id
+ssystem_id
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test70.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test70.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test70.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,4 @@
+!HTML
+ssystem_id
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test71.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test71.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test71.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+!ROOT
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test72.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test72.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test72.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(A
+Ahref a&b
+)A
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test73.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test73.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test73.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,11 @@
+(HTML
+(HEAD
+(SCRIPT
+)SCRIPT
+)HEAD
+(BODY
+(H1
+"Title
+)H1
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test74.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test74.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test74.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,15 @@
+!HTML
+p-//W3C//DTD HTML 4.01 Transitional//EN
+shttp://www.w3.o$
+(HTML
+(HEAD
+(TITLE
+"Title
+)TITLE
+)HEAD
+(BODY
+(H1
+"Header1
+)H1
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test75.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test75.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test75.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,9 @@
+(HTML
+(BODY
+(P
+?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office"
+(IMG
+)IMG
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test76.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test76.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test76.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,17 @@
+(HTML
+(BODY
+(P
+"outer paragraph\n
+)P
+(P
+"inner paragraph
+(BR
+)BR
+"second line in inner paragraph\n
+)P
+"second line in outer paragrapth\n
+(P
+)P
+"outside paragraph tags
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test77.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test77.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test77.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,4 @@
+(HTML
+(FRAMESET
+)FRAMESET
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test78.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test78.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test78.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+#\na\nb\nc\n
+(HTML
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test79.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test79.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test79.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+(HTML
+Aa12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567
8901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test80.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test80.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test80.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+(HTML
+(BODY
+(P
+"P1
+#[CDATA[<h1>Header</h1>]]
+"\n
+)P
+(P
+"P2
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test81.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test81.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test81.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+#one\ntwo\n\nthree\n\n\n
+?target one\ntwo\n\nthree\n\n\n
+(HTML
+Aattr one two three
+"\n
+(BODY
+"\n[one\ntwo\n\nthree\n\n\n]\n
+(TEXTAREA
+"one\ntwo\n\nthree\n\n\n
+)TEXTAREA
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test82.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test82.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test82.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(H:BODY
+A{http://www.w3.org/2000/xmlns/}xmlns:H http://www.w3.org/1999/xhtml
+)H:BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test83.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test83.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test83.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,22 @@
+(HTML
+(BODY
+(TABLE
+(TR
+(TD
+(TABLE
+(TR
+(TD
+(TABLE
+(TR
+(TD
+)TD
+)TR
+)TABLE
+)TD
+)TR
+)TABLE
+)TD
+)TR
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test84.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test84.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test84.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,28 @@
+(HTML
+A{http://www.w3.org/2000/xmlns/}xmlns:A NSa
+A{http://www.w3.org/2000/xmlns/}xmlns:B NSb
+A{http://www.w3.org/2000/xmlns/}xmlns:C NSc
+A{http://www.w3.org/2000/xmlns/}xmlns:D NSd
+A{http://www.w3.org/2000/xmlns/}xmlns:E NSe
+A{http://www.w3.org/2000/xmlns/}xmlns:F NSf
+A{http://www.w3.org/2000/xmlns/}xmlns:G NSg
+A{http://www.w3.org/2000/xmlns/}xmlns:H NSh
+A{http://www.w3.org/2000/xmlns/}xmlns:I NSi
+A{http://www.w3.org/2000/xmlns/}xmlns:J NSj
+A{http://www.w3.org/2000/xmlns/}xmlns:K NSk
+A{http://www.w3.org/2000/xmlns/}xmlns:L NSl
+A{http://www.w3.org/2000/xmlns/}xmlns:M NSm
+A{http://www.w3.org/2000/xmlns/}xmlns:N NSn
+A{http://www.w3.org/2000/xmlns/}xmlns:O NSo
+A{http://www.w3.org/2000/xmlns/}xmlns:P NSp
+A{http://www.w3.org/2000/xmlns/}xmlns:Q NSq
+A{http://www.w3.org/2000/xmlns/}xmlns:R NSr
+A{http://www.w3.org/2000/xmlns/}xmlns:S NSs
+A{http://www.w3.org/2000/xmlns/}xmlns:T NSt
+A{http://www.w3.org/2000/xmlns/}xmlns:U NSu
+A{http://www.w3.org/2000/xmlns/}xmlns:V NSv
+A{http://www.w3.org/2000/xmlns/}xmlns:W NSw
+A{http://www.w3.org/2000/xmlns/}xmlns:X NSx
+A{http://www.w3.org/2000/xmlns/}xmlns:Y NSy
+A{http://www.w3.org/2000/xmlns/}xmlns:Z NSz
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test85.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test85.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test85.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"&
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test86.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test86.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test86.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"&#x
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test87.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test87.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test87.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+!HTML
+p-//W3C//DTD HTML 4.0 Transitional//EN
+(HTML
+(BODY
+"Hello
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test88.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test88.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test88.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+(HTML
+(BODY
+"Hello
+(P
+"World
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test89.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test89.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test89.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+(HTML
+(BODY
+"&
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test90.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test90.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test90.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+xversion 1.0
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test91.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test91.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test91.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+(HTML
+(BODY
+(P
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test92.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test92.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test92.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,11 @@
+(HTML
+(HEAD
+(SCRIPT
+"document.write("</SCRIPT\\>");
+)SCRIPT
+)HEAD
+(BODY
+(P
+)P
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test93.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test93.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test93.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,12 @@
+(HTML
+(BODY
+(TABLE
+(FORM
+(TR
+(TD
+)TD
+)TR
+)FORM
+)TABLE
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test94.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test94.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test94.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+(HTML
+(HEAD
+(SCRIPT
+"\n<!--\nhtml script content\n//-->\n
+)SCRIPT
+"\n
+)HEAD
+(BODY
+(H1
+"Foo
+)H1
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test95.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test95.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test95.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+(HTML
+(HEAD
+(SCRIPT
+"\nhtml script content\n//\n
+)SCRIPT
+"\n
+)HEAD
+(BODY
+(H1
+"Foo
+)H1
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test96.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test96.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test96.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+(HTML
+(HEAD
+(SCRIPT
+"\n<![CDATA[\nxhtml script content\n]]>\n
+)SCRIPT
+"\n
+)HEAD
+(BODY
+(H1
+"Foo
+)H1
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test97.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test97.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test97.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+(HTML
+(HEAD
+(SCRIPT
+"\nxhtml script content\n\n
+)SCRIPT
+"\n
+)HEAD
+(BODY
+(H1
+"Foo
+)H1
+)BODY
+)HTML
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test98.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test98.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test98.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+(HTML
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/canonical/test99.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/canonical/test99.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/canonical/test99.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,16 @@
+(HTML
+A{http://www.w3.org/2000/xmlns/}xmlns:IE
+(HEAD
+(MAINA6
+(META
+Acontent text/html; charset=ISO-8859-1
+Ahttp-equiv Content-Type
+)META
+)MAINA6
+)HEAD
+(BODY
+(H1
+"Foo
+)H1
+)BODY
+)HTML
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test00.html
===================================================================
Added: branches/nekohtml/upstream/0.9.5/data/html/test01.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test01.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test01.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+Just text.
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test02.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test02.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test02.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<h1>Header</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test03.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test03.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test03.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<p>Paragraph text.
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test04.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test04.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test04.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<p>Paragraph text.
+<p>Additional text.
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test05.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test05.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test05.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+<script type=text/javascript>
+ This is a <b>test</b>. Don't go crazy! </i>
+</script>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test06.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test06.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test06.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+This <i>is <b class=test>unbalanced</i> content</b>, dude!
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test07.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test07.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test07.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<td nowrap><td nowrap align=middle>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test08.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test08.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test08.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<foobar>Text</foobar>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test09.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test09.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test09.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<meta http-equiv='content-type' content='text/html; charset=utf-8'>
+<h1>ã¢ãã¡</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test10.html
===================================================================
(Binary files differ)
Property changes on: branches/nekohtml/upstream/0.9.5/data/html/test10.html
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: branches/nekohtml/upstream/0.9.5/data/html/test100.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test100.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test100.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+<p>
+<a href="link.htm"><h3>Header</h3></a>
+</p>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test101.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test101.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test101.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<?base http://foo.bar/>
+<?tags :noads:?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test102.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test102.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test102.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<meta http-equiv='content-type' content='text/html;charset=iso-8859-1'>
+—
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test102.html.settings
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test102.html.settings 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test102.html.settings 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+feature http://cyberneko.org/html/features/scanner/fix-mswindows-refs true
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test103.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test103.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test103.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<html xmlns:a='NSa'>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test11.html
===================================================================
(Binary files differ)
Property changes on: branches/nekohtml/upstream/0.9.5/data/html/test11.html
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: branches/nekohtml/upstream/0.9.5/data/html/test12.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test12.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test12.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,14 @@
+<table>
+ <tr>
+ <td>
+ <table>
+ <tr>
+ <td>cell 1</td>
+ <td>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td>cell 2</td>
+ </tr>
+</table>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test13.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test13.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test13.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+<ul>
+ <li>One
+ <li>Two
+ <ul>
+ <li>Two.One
+ </ul>
+ <li>Three
+</ul>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test14.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test14.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test14.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,13 @@
+<table>
+ <tr>
+ <td>
+ <object>
+ <param>
+ <embed>
+ </embed>
+ <noembed>
+ </noembed>
+ </object>
+ </td>
+ </tr>
+</table>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test15.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test15.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test15.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href='http://example.com/cgi-bin/redirect?s=www.candy.com&u=Andy'>M & Ms</a>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test16.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test16.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test16.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+&unknown1; & &unknown2;
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test17.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test17.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test17.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<table>
+<tr><td><input type=text>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test18.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test18.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test18.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href=/path/>blah</a>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test19.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test19.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test19.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<Img Src='me.gif'>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test20.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test20.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test20.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<Span CLaSS='note'>Look Out!</spaN>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test21.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test21.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test21.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+M & M
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test22.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test22.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test22.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+&foo;
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test23.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test23.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test23.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test24.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test24.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test24.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<A =
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test25.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test25.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test25.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<A href=
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test26.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test26.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test26.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<A href='index.html
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test27.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test27.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test27.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<A href='index.html'
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test28.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test28.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test28.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+< =
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test29.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test29.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test29.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+&#foo;
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test30.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test30.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test30.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href=/cgi-bin/myscript>happy</a>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test31.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test31.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test31.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href=/broken/>Too Much to Ask</a>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test32.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test32.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test32.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+<img alt='this ain't a real quote' src='aint.gif'>
+"this & that" ™
+ -- Andy <andyc at apache.org>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test33.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test33.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test33.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+<script type='text/x-nekoscript'>
+(h1
+"Header
+)h1
+</script>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test34.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test34.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test34.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<li>Item1<ul></li><li>Item2
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test35.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test35.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test35.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,10 @@
+<table>
+ <tr>
+ <th>foo
+ <th>bar
+ <th>baz
+ <tr>
+ <td>foo
+ <td>bar
+ <td>baz
+</table>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test36.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test36.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test36.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a name=foo><p>Blah</p>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test37.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test37.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test37.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+<table>
+ <tr>
+ <td>Alpha
+</table>
+<table>
+ <tr>
+ <td>Beta
+</table>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test38.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test38.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test38.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<html>
+<body>
+ <p>Here we go! <a href="http://bigidea.com/">Bob</a> <br/.</p>
+ and <a href="http://larryboy.com/"> Larry </a>
+ and friends <a href="http://google.com/">Google</a>
+</body></html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test39.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test39.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test39.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,9 @@
+<h1>Header1</h1>
+<script>
+document.write('ABC');
+<!--
+document.write("<script>document.write('Hello, World')</script>");
+//-->
+document.write('XYZ');
+</script>
+<h2>Header2</h2>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test40.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test40.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test40.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,8 @@
+<table>
+ <col>
+ <col style="text-align: right">
+ <tr>
+ <td>This</td>
+ <td>That</td>
+ </tr>
+</table>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test41.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test41.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test41.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,4 @@
+<html><head><title>foo</title></head><body>
+<form id='form1'>
+<isindex prompt='enterSomeText'></isindex>
+</form></body></html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test42.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test42.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test42.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<head><script/><title>Title</title>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test43.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test43.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test43.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>
+<button> a button </button> <br>
+</p>
+</body></html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test44.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test44.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test44.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<?target?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test45.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test45.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test45.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<?target data?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test46.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test46.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test46.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<?target data ?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test47.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test47.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test47.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+<?php
+print "Hello, World.\n";
+?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test48.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test48.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test48.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<body><isindex prompt='enterSomeText'>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test49.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test49.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test49.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,22 @@
+<!DOCTYPE [
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+////////////////////////////////////////////////////////////////////////////////////////////////////
+]>
Added: branches/nekohtml/upstream/0.9.5/data/html/test50.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test50.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test50.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href=foo><p>Blah</p>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test51.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test51.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test51.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+<ul>
+<li>Item 1
+ <p>Paragraph
+<li>Item 2
+</ul>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test52.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test52.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test52.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<html></html
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test53.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test53.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test53.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<html>
+<body>
+<form>
+<span id=span1><span id=span2><select></select></span></span>
+</form>
+</body></html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test54.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test54.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test54.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<p><form><p>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test55.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test55.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test55.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<font><select></select></font>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test56.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test56.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test56.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<form><center><select><option>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test57.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test57.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test57.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<html><body></body></html>Outside content
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test58.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test58.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test58.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<textarea>&</textarea>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test59.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test59.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test59.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<script>&</script>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test60.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test60.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test60.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href='foo'><font>text</font></a>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test61.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test61.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test61.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,10 @@
+<html>
+<meta http-equiv="Pragma" content="no-cache">
+<head>
+<title>Title</title>
+<meta http-equiv="Content-Type" content="text/html;
+charset=iso-8859-1">
+</head>
+<body>
+</body>
+</html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test62.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test62.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test62.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+<html><head>
+<span style="behavior:url(#default#clientCaps)" id=cc></span></head>
+<body>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test63.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test63.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test63.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<table><tr><td><table><td>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test64.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test64.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test64.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<body bgcolor='white'>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test65.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test65.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test65.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!DOCTYPE>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test66.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test66.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test66.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!doctype html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test67.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test67.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test67.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!DOCTYPE html PUBLIC "public_id">
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test68.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test68.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test68.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!DOCTYPE html PUBLIC "public_id" "system_id">
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test69.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test69.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test69.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!DOCTYPE html PUBLIC 'public_id' 'system_id'>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test70.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test70.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test70.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!DOCTYPE html SYSTEM "system_id">
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test71.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test71.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test71.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<!doctype root>
+<!doctype html>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test72.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test72.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test72.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<a href=a&b>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test73.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test73.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test73.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<script></script><html><h1>Title
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test74.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test74.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test74.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+"http://www.w3.o$
+<title>Title</title><h1>Header1</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test75.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test75.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test75.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<P><?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /><IMG></P>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test76.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test76.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test76.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,4 @@
+<p>outer paragraph
+<p>inner paragraph<br>second line in inner paragraph
+</p>second line in outer paragrapth
+</p>outside paragraph tags
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test77.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test77.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test77.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<html><frameset>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test78.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test78.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test78.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,5 @@
+<!--
+a
+b
+c
+-
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test79.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test79.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test79.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<HTML A123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890></HTML>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test80.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test80.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test80.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+<p>P1<![CDATA[<h1>Header</h1>]]>
+<p>P2
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test81.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test81.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test81.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<!--one
two
three
-->
+<?target one
two
three
?>
+<html attr="one
two
three
">
+<body>
+[one
two
three
]
+<textarea>one
two
three
</textarea>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test82.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test82.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test82.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<html><h:body xmlns:h='http://www.w3.org/1999/xhtml'>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test83.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test83.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test83.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<table><tr><td><table><tr><td><table><tr><td>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test84.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test84.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test84.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,28 @@
+<html
+xmlns:a='NSa'
+xmlns:b='NSb'
+xmlns:c='NSc'
+xmlns:d='NSd'
+xmlns:e='NSe'
+xmlns:f='NSf'
+xmlns:g='NSg'
+xmlns:h='NSh'
+xmlns:i='NSi'
+xmlns:j='NSj'
+xmlns:k='NSk'
+xmlns:l='NSl'
+xmlns:m='NSm'
+xmlns:n='NSn'
+xmlns:o='NSo'
+xmlns:p='NSp'
+xmlns:q='NSq'
+xmlns:r='NSr'
+xmlns:s='NSs'
+xmlns:t='NSt'
+xmlns:u='NSu'
+xmlns:v='NSv'
+xmlns:w='NSw'
+xmlns:x='NSx'
+xmlns:y='NSy'
+xmlns:z='NSz'
+>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test85.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test85.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test85.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+&
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test86.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test86.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test86.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+&#x
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test87.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test87.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test87.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" >Hello
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test88.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test88.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test88.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<HTML>Hello<p>World
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test89.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test89.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test89.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+&
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test90.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test90.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test90.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<?xml version='1.0'?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test91.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test91.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test91.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<p><?xml version="1.0"?>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test92.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test92.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test92.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<script>document.write("</SCRIPT\>");</script><p>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test93.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test93.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test93.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<table><form><td>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test94.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test94.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test94.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<script>
+<!--
+html script content
+//-->
+</script>
+<h1>Foo</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test95.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test95.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test95.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<script>
+<!--
+html script content
+//-->
+</script>
+<h1>Foo</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test95.html.settings
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test95.html.settings 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test95.html.settings 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+feature http://cyberneko.org/html/features/scanner/script/strip-comment-delims true
Added: branches/nekohtml/upstream/0.9.5/data/html/test96.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test96.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test96.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<script>
+<![CDATA[
+xhtml script content
+]]>
+</script>
+<h1>Foo</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test97.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test97.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test97.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,6 @@
+<script>
+<![CDATA[
+xhtml script content
+]]>
+</script>
+<h1>Foo</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test97.html.settings
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test97.html.settings 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test97.html.settings 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+feature http://cyberneko.org/html/features/scanner/script/strip-cdata-delims true
Added: branches/nekohtml/upstream/0.9.5/data/html/test98.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test98.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test98.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+</html><h1>foo</h1>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test98.html.settings
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test98.html.settings 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test98.html.settings 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+feature http://cyberneko.org/html/features/balance-tags/ignore-outside-content true
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/data/html/test99.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/data/html/test99.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/data/html/test99.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+<HTML XMLNS:IE><head><mainA6><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=ISO-8859-1"><h1>Foo
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/doc/html/.htaccess
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/.htaccess 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/.htaccess 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,2 @@
+AddDefaultCharset Off
+
Added: branches/nekohtml/upstream/0.9.5/doc/html/changes.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/changes.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/changes.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,439 @@
+<title>NekoHTML | Change History</title>
+<link rel=stylesheet type=text/css href=../style.css>
+
+<h1>Change History</h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+<a href='index.html'>Top</a>
+|
+<a href='usage.html'>Usage</a>
+|
+<a href='settings.html'>Settings</a>
+|
+<a href='filters.html'>Filters</a>
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+<a href='faq.html'>FAQ</a>
+|
+<a href='software.html'>Software</a>
+|
+Changes
+]
+</div>
+
+<h2>Releases</h2>
+<dl>
+ <dt>Version 0.9.5 (18 Jun 2005)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.5.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.5.tar.gz'>tgz</a>]
+ <dd>Added feature submitted by Asgeir Asgeirsson to allow scanner to fix
+ character entity references for Microsoft Windows® characters;
+ stopped building nekohtmlXni.jar file by default;
+ fixed handling of <blockquote> reported by Joseph Walton
+ to better match browser behavior;
+ fixed tag-balancing bug for unknown elements reported by Marc
+ Guillemot and Vadim Tashlikovich;
+ fixed mapping of encoding name in <code><meta></code> element
+ reported by Marc Guillemot;
+ changed tag-balancing to allow headers inside of links suggested
+ by Laurens Fridael;
+ applied attribute namespace patch from Joseph Walton;
+ fixed namespace bug for "xml" prefixes reported by Asgeir
+ Asgeirsson;
+ fixed namespace bug for "xmlns" prefixes reported by
+ Johannes Koch;
+ and
+ fixed no-such-method exception bug when using augmentations feature
+ with older versions of Xerces2 reported by Hans Donner.
+ <dt>Version 0.9.4 (17 Nov 2004)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.4.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.4.tar.gz'>tgz</a>]
+ <dd>Fixed typo in proviso 5 of the license agreement;
+ added features to strip CDATA delimiters (i.e. "<![CDATA[" and
+ "]]>") from <script> and <style> elements suggested by Dan Sojka;
+ fixed tag-balancing problem reported by Egor Samarkhanov;
+ applied augmentations patches donated by Marc-André Morissette;
+ implemented augmentation performance enhancements inspired by
+ Marc-André Morissette;
+ fixed ignore-outside-content bug reported by Chris Erskine;
+ and
+ updated link to Xerces download site.
+ <dt>Version 0.9.3 (30 Jun 2004)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.3.tar.gz'>tgz</a>]
+ <dd>Implemented scanning of XML declaration;
+ fixed <script> tag scanning bug reported by Vasiliev Ivan;
+ added <code>Version</code> class and manifest entries to query
+ product information;
+ and fixed some Javadoc errors.
+ <dt>Version 0.9.2 (31 Mar 2004)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.2.tar.gz'>tgz</a>]
+ <dd>Fixed entity reference scanning and tag-balancing bugs identified
+ by Tommy Sandström;
+ fixed tag-balancing bug reported by Oliver Pfeiffer;
+ fixed doctype scanning bug reported by Jonathan Baxter;
+ updated Purifier filter to synthesize missing namespace bindings;
+ updated Writer filter to convert all known characters back to
+ their entity names;
+ and
+ updated implementation to work with Xerces-J 2.6.2 that removed
+ the <code>ObjectFactory</code> class in the
+ <code>org.apache.xerces.util</code> package.
+ <dt>Version 0.9.1 (24 Feb 2004)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.1.tar.gz'>tgz</a>]
+ <dd>Fixed namespace binding bug reported by Jonathan Baxter.
+ <dt>Version 0.9 (19 Feb 2004)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.9.tar.gz'>tgz</a>]
+ <dd>Implemented scanning of CDATA sections;
+ implemented namespace processing;
+ added features to
+ override namespace bindings,
+ insert namespace bindings if not present,
+ override doctype public and system identifiers, and
+ insert doctype declaration if not present;
+ added a filter to allow applications to "purify" the input, ensuring
+ that the output is well-formed XML;
+ added missing location augmentations from document type declaration
+ callback;
+ fixed newline scanning bugs reported by Jonathan Baxter;
+ and
+ fixed comment scanning bugs and infinite loop bug caused by extremely
+ long element and attribute names found by Ram Subbaroyan.
+ <dt>Version 0.8.3 (12 Dec 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.3.tar.gz'>tgz</a>]
+ <dd>Fixed null pointer exception for <frameset> tags reported by
+ Dawid Weiss;
+ and
+ added missing file to xercesMinimal.jar file reported by Brent Beardsley.
+ <dt>Version 0.8.2 (14 Nov 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.2.tar.gz'>tgz</a>]
+ <dd>Fixed array index out of bounds exception in special tags and
+ doctype scanning bug reported by Leo Galambos;
+ updated processing instruction scanning to handle weird PIs exported
+ from Microsoft products as reported by Gabriele Bulfon;
+ fixed erroneous reporting of missing whitespace before attributes
+ reported by Arno Schatz;
+ installed a default error handler that prints to standard error
+ suggested by Arno Schatz;
+ and
+ fixed handling of dangling </p> reported by Gopi Murthy to
+ better match browser behavior.
+ <dt>Version 0.8.1 (30 Sep 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.1.tar.gz'>tgz</a>]
+ <dd>Fixed bug reported by Yuan Ji that allowed multiple <html> tags;
+ fixed bug in stripping leading comments in <script> tags
+ as reported by Lawrence McCartin;
+ added feature to be able to strip HTML comment delimiters (i.e. "<!--"
+ and "-->") from <style> element content suggested by
+ Lawrence McCartin;
+ updated DOMParser to work around a bug in the Xerces HTML DOM
+ implementation when a doctype node was inserted into the document,
+ reported by Troy Waldrep;
+ updated the DOMFragmentParser to allow setting of features and
+ properties as requested by Paul Reeves;
+ changed the status of the document fragment parser from experimental
+ to <font color=green>supported</font>;
+ added feature to allow application to ignore a character encoding
+ specified in a <meta http-equiv='Content-Type'
+ content='text/html;charset=...'> tag requested by Roger Fullerton;
+ and
+ changed feature identifier for document fragment tag balancing to
+ be more in line with other features (but retained old feature
+ identifier for backwards compatibility).
+ <dt>Version 0.8 (05 Aug 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.8.tar.gz'>tgz</a>]
+ <dd>Implemented scanning of doctype declaration;
+ implemented non-normalized attribute value for XNI filters that want
+ to know original attribute value;
+ fixed bug scanning entity references inside of unquoted attributes;
+ fixed line counting bug in attribute values reported by Arno Schatz;
+ and
+ updated files in xercesMinimal.jar noted by Brent Beardsley.
+ <dt>Version 0.7.7 (25 Jun 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.7.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.7.tar.gz'>tgz</a>]
+ <dd>Fixed handling of <font> tags reported by Dave King;
+ fixed bugs that caused multiple <head> and <body> tags
+ as reported by Mike Bowler;
+ fixed missing <tr> bug in nested tables reported by Troy Waldrep;
+ and
+ normalized newlines in attribute values to spaces.
+ <dt>Version 0.7.6 (06 May 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.6.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.6.tar.gz'>tgz</a>]
+ <dd>Fixed infinite loop in special tags reported by Mike Bowler.
+ <dt>Version 0.7.5 (02 May 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.5.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.5.tar.gz'>tgz</a>]
+ <dd>Fixed parsing of entity reference within <textarea> tags reported
+ by Mattias Jiderhamn;
+ changed behavior of tag balancer to not consume content after the end
+ <body> and <html> tags but retained old behavior through
+ new feature;
+ fixed <noscript> bug reported by Takashi Tomokiyo;
+ and
+ updated implementation for XNI changes introduced in Xerces-J 2.4.0.
+ <dt>Version 0.7.4 (03 Mar 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.4.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.4.tar.gz'>tgz</a>]
+ <dd>Fixed <form> element balancing problem reported by Dan Rocco;
+ fixed null pointer exception reported by Michael Dynin that was
+ caused by a null XMLResourceIdentifier object passed to the
+ startGeneralEntity method in the Xerces DOM parser classes;
+ fixed handling of <font> element as requested by Arno Schatz
+ to better match current browsers;
+ replaced generic catch exception blocks with explicit catch blocks
+ suggested by Arno Schatz;
+ fixed <center> tag-balancing problem reported by Russell Gold;
+ fixed null pointer exception caused by null namespace context
+ object passed to Xerces SAX parser class reported by David Leslie;
+ and
+ added FAQ entry describing how to insert custom filters before
+ the tag-balancer.
+ <dt>Version 0.7.3 (28 Jan 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.3.tar.gz'>tgz</a>]
+ <dd>Updated implementation for XNI changes introduced in Xerces-J 2.3.0;
+ and
+ fixed hack string to accommodate XML4J build of Xerces included in
+ the Eclipse editor reported by Geoffrey Longman.
+ <dt>Version 0.7.2 (10 Jan 2003)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.2.tar.gz'>tgz</a>]
+ <dd>Fixed class-cast exception bug in DOMFragmentParser reported by
+ Joseph Artsimovich;
+ fixed <span> tag-balancing bug reported by Ron Cemer;
+ and
+ fixed handling of form tags missing a parent element reported by
+ Russell Gold in order to better match browser behavior.
+ <dt>Version 0.7.1 (06 Dec 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.1.tar.gz'>tgz</a>]
+ <dd>Fixed null pointer exception caused by null attributes object
+ passed to Xerces SAX parser class as reported by Kevin Huber;
+ and
+ fixed infinite loop condition when encountering "</html[eof]"
+ as reported by Matt Hurst.
+ <dt>Version 0.7 (27 Nov 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.7.tar.gz'>tgz</a>]
+ <dd>Changed behavior of tag balancer for unbalanced elements
+ as requested by Troy Waldrep to make output match that
+ produced by browsers such as Mozilla;
+ fixed other tag balancing problems identified by a bug
+ reported by Laurens Fridael;
+ added <font color='red'>experimental</font> HTML fragment
+ parsing feature and DOM fragment parser class;
+ fixed buffer boundary bug in skipMarkup method reported
+ by Mike Bowler;
+ added constructor to the Writer filter that accepts a
+ Java writer object parameter as requested by Alain
+ Gilbert;
+ fixed HTMLScanner class so that it can compile with JDK 1.1
+ as reported by Mikko Honkala;
+ and
+ fixed bug reported by Russell Gold that would ignore
+ the <param> element within an <applet>
+ element.
+ <dt>Version 0.6.8 (30 Sep 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.8.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.8.tar.gz'>tgz</a>]
+ <dd>Implemented scanning of processing instructions;
+ improved performance of HTMLElements#getElement method inspired
+ by Sam Cheung;
+ changed tag balancer algorithm as requested by Mike Bowler so
+ that it does not close the <body> element to insert a
+ proper parent element;
+ fixed <isindex> proper parent bug and <script> empty
+ element tag bug reported by Mike Bowler;
+ fixed bug reported by YingLCS that a <form> tag
+ would prematurely close a <p> tag;
+ and
+ updated implementation for XNI changes introduced in Xerces-J 2.2.0.
+ <dt>Version 0.6.7 (06 Sep 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.7.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.7.tar.gz'>tgz</a>]
+ <dd>Added a FAQ section;
+ and
+ updated implementation for XNI changes introduced in Xerces-J 2.1.0.
+ <dt>Version 0.6.6 (25 Aug 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.6.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.6.tar.gz'>tgz</a>]
+ <dd>Changed packaging to include product name and version in
+ directory name;
+ updated <code>HTMLConfiguration</code> to implement the
+ <code>XMLPullParserConfiguration</code> interface;
+ fixed bug reported by Martin Jericho to correct handling
+ of <col> element;
+ fixed bug reported by Dave King that would skip to end of
+ document if bad markup was found;
+ fixed numerous bugs related to scanning <script> tags
+ reported by Sam Cheung;
+ added feature to be able to strip HTML comment delimiters (i.e.
+ "<!--" and "-->") from <script> element content;
+ changed the status of the feature to dynamically insert
+ content from <em>experimental</em> to <font color=green>
+ supported</font>;
+ added code to be able to compare test files against canonical
+ output for regression testing;
+ and
+ fixed minor bugs found by the tests.
+ <dt>Version 0.6.5 (17 Jul 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.5.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.5.tar.gz'>tgz</a>]
+ <dd> Fixed bug in changing character encoding when "charset=..." is
+ not written in lowercase;
+ and
+ mark attributes as "specified".
+ <dt>Version 0.6.4 (15 Jun 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.4.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.4.tar.gz'>tgz</a>]
+ <dd>Re-organized package contents for integration into the CyberNeko
+ Tools for XNI package;
+ fixed table closing bug reported by Oskar Liljeblad;
+ fixed newline bug reported by OtisG;
+ and
+ fixed line counting bug reported by Donald Ball.
+ <dt>Version 0.6.3 (29 May 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.3.tar.gz'>tgz</a>]
+ <dd>Fixed bug in handling of <th> elements reported by
+ Oskar Liljeblad;
+ and
+ fixed various tag-balancing problems.
+ <dt>Version 0.6.2 (26 May 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.2.tar.gz'>tgz</a>]
+ <dd>Changed scanner behavior as requested by Alexey Shananin to
+ report malformed start elements (e.g. <...>) as
+ characters;
+ and
+ fixed tag balancing bug introduced in previous version. Oops!
+ <dt>Version 0.6.1 (23 May 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.1.tar.gz'>tgz</a>]
+ <dd>Changed tag balancer behavior to swallow events after the close
+ of the <html> tag to ensure that the document stream
+ remains well-formed;
+ added additional Ruby elements;
+ and
+ improved tag balancer performance.
+ <dt>Version 0.6 (12 May 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.6.tar.gz'>tgz</a>]
+ <dd>Added property to allow custom document filters to be appended
+ to the default NekoHTML parser pipeline;
+ added convenience filters for serializing HTML documents and
+ removing elements from the document event stream;
+ added samples to demonstrate the filtering feature;
+ added <font color=red>experimental</font> functionality to
+ allow applications to dynamically insert content into the
+ HTML document stream;
+ added a minimal Xerces2 Jar file containing just the files
+ required for using the HTMLConfiguration class directly to
+ alleviate full dependence on Xerces2 distribution;
+ applied patch from Serge Proskuryakov to fix handling of
+ misplaced <title> within <body>;
+ fixed minor tag balancing bug;
+ and
+ re-organized and added new documentation.
+ <dt>Version 0.5 (07 May 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.5.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.5.tar.gz'>tgz</a>]
+ <dd>Fixed some location reporting information bugs and added
+ feature to report character boundaries of events via the
+ associated augmentations object;
+ added feature to disable tag balancing;
+ and
+ added features to notify handlers of start and end of character
+ and built-in XML and HTML entity references.
+ <dt>Version 0.4.1 (03 May 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.4.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.4.1.tar.gz'>tgz</a>]
+ <dd>Fixed some unquoted attribute value scanning bugs reported
+ by Xiaowei Jiang;
+ fixed hack for Xerces-J 2.0.1 reported by Ron Cemer;
+ now passing locator object to <code>startDocument</code>
+ method;
+ and
+ celebrated opening of the Spider-Man movie.
+ <dt>Version 0.4 (14 Apr 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.4.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.4.tar.gz'>tgz</a>]
+ <dd>Added properties to control case of element and attribute names;
+ changed behavior of parser so that only known HTML elements
+ have their names modified according to the properties — all
+ unknown tags are left as-is;
+ added property to set default encoding;
+ added feature to augment infoset to report "synthesized" events;
+ added feature to be able to report errors and localized the error
+ messages;
+ implemented the locator so that location information can be
+ reported;
+ and
+ fixed element information so that more elements are properly
+ scanned as "special".
+ <dt>Version 0.3.3 (02 Apr 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.3.tar.gz'>tgz</a>]
+ <dd>Separated <tt>META-INF/services/*</tt> files into a separate Jar
+ so that HTML parser configuration selection can be controlled
+ more explicitly; added DOM and SAX parser classes for
+ convenience; and fixed bug so that parser now obeys the
+ encoding specified in the input source.
+ <dt>Version 0.3.2 (15 Mar 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.2.tar.gz'>tgz</a>]
+ <dd>Fixed problem with bare <input> elements appearing outside
+ of <form> tag.
+ <dt>Version 0.3.1 (07 Mar 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.1.tar.gz'>tgz</a>]
+ <dd>Fixed handling of bare ampersands in content and attribute
+ values.
+ <dt>Version 0.3 (25 Feb 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.3.tar.gz'>tgz</a>]
+ <dd>Changed license to an Apache style license and fixed a
+ few bugs.
+ <dt>Version 0.2.3 (19 Feb 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.3.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.3.tar.gz'>tgz</a>]
+ <dd>Nested tables bug fix.
+ <dt>Version 0.2.2 (17 Feb 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.2.tar.gz'>tgz</a>]
+ <dd>More bug fixes to allow the parser to be used with Xalan
+ 2.3.0. The parser wasn't keeping track of features and
+ properties and without namespaces turned on, Xalan would
+ not correctly transform the SAX events emitted using
+ NekoHTML.
+ <dt>Version 0.2.1 (16 Feb 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.1.tar.gz'>tgz</a>]
+ <dd>Minor bug fix to work around problem in Xerces-J 2.0.0 SAX
+ parser that drops attributes when parser configuration
+ doesn't have a symbol table.
+ <dt>Version 0.2 (14 Feb 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.2.tar.gz'>tgz</a>]
+ <dd>Added support for UTF-8, UTF-16, and other 8-bit encodings
+ supported by Java.
+ <dt>Version 0.1 (04 Feb 2002)
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.1.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-0.1.tar.gz'>tgz</a>]
+ <dd>Initial writing.
+</dl>
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/doc/html/faq.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/faq.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/faq.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,352 @@
+<title>NekoHTML | Frequently Asked Questions</title>
+<link rel=stylesheet type=text/css href=../style.css>
+
+<h1>Frequently Asked Questions</h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+<a href='index.html'>Top</a>
+|
+<a href='usage.html'>Usage</a>
+|
+<a href='settings.html'>Settings</a>
+|
+<a href='filters.html'>Filters</a>
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+FAQ
+|
+<a href='software.html'>Software</a>
+|
+<a href='changes.html'>Changes</a>
+]
+</div>
+
+<h2>Table of Contents</h2>
+
+<ul>
+<li><a href='#uppercase'>Why are the DOM element names always uppercase?</a>
+<li><a href='#hierarchy'>Why do I get a hierarchy request error using DOM?</a>
+<li><a href='#prefilter'>How do I add filters <em>before</em> the tag balancer?</a>
+<li><a href='#fragments'>How do I parse HTML document fragments?</a>
+<li><a href='#offsets'>How can I get the location of document information?</a>
+<li><a href='#xerces2'>Do I have to use all of Xerces2?</a>
+<li><a href='#version'>What version of NekoHTML am I using?</a>
+</ul>
+
+<hr>
+
+<a name='uppercase'></a>
+<h3>Why are the DOM element names always uppercase?</h3>
+
+<p>
+The <a href='http://www.w3.org/TR/1998/REC-DOM-Level-1-19981001/level-one-html.html'>HTML
+DOM</a> specification explicitly states that element and
+attribute names follow the semantics, including case-sensitivity,
+specified in the <a href='http://www.w3.org/TR/html4/'>HTML
+4</a> specification. In addition,
+<a href='http://www.w3.org/TR/html4/about.html#h-1.2.1'>section
+1.2.1</a> of the HTML 4.01 specification states:
+<blockquote>
+Element names are written in uppercase letters (e.g., BODY).
+Attribute names are written in lowercase letters (e.g., lang, onsubmit).
+</blockquote>
+<p>
+The Xerces HTML DOM implementation (used by default in the
+NekoHTML <code>DOMParser</code> class) follows this convention.
+Therefore, even if the
+"http://cyberneko.org/html/properties/names/elems" property is
+set to "lower", the DOM will still uppercase the element names.
+<p>
+To get around this problem, instantiate a Xerces2 <code>DOMParser</code>
+object using the NekoHTML parser configuration. By default, the
+Xerces DOM parser class creates a standard XML DOM tree, not
+an HTML DOM tree. Therefore, the element and attribute names
+will follow the settings for the
+"http://cyberneko.org/html/properties/names/elems" and
+"http://cyberneko.org/html/properties/names/attrs" properties.
+However, realize that the application will not be able to cast
+the document nodes to the HTML DOM interfaces for accessing the
+document's information.
+<p>
+The following sample code shows how to instantiate a DOM
+parser using the NekoHTML parser configuration:
+<pre class='code'>
+<span class='code-comment'>// import org.apache.xerces.parsers.DOMParser;
+// import org.cyberneko.html.HTMLConfiguration;</span>
+
+DOMParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DOMParser<span class='code-punct'>(</span><span class='code-keyword'>new</span> HTMLConfiguration<span class='code-punct'>());</span>
+</pre>
+
+<a name='hierarchy'></a>
+<h3>Why do I get a hierarchy request error using DOM?</h3>
+
+<p>
+Using the NekoHTML DOM parser to parse HTML documents with
+namespace information can result in a hierarchy request error
+being thrown. For example:
+<blockquote>
+org.w3c.dom.DOMException: HIERARCHY_REQUEST_ERR: An attempt was made
+to insert a node where it is not permitted.
+</blockquote>
+<p>
+The Xerces HTML DOM implementation does not support namespaces
+and cannot represent XHTML documents with namespace information.
+Therefore, in order to use the default HTML DOM implementation
+with NekoHTML's <code>DOMParser</code> to parse XHTML documents,
+you must turn off namespace processing. For example:
+<pre class='code'>
+<span class='code-comment'>// import org.cyberneko.html.parsers.DOMParser;</span>
+
+DOMParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DOMParser<span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span><span class='code-func'>setFeature</span><span class='code-punct'>(</span><span class='code-string'>"http://xml.org/sax/features/namespaces"</span><span class='code-punct'>,</span> <span class='code-keyword'>false</span><span class='code-punct'>);</span>
+</pre>
+<p>
+If your application requires namespace processing to be turned
+on <em>and</em> uses the DOM API, another option is to add a
+custom filter to the parsing pipeline to remove namespace
+information before the <code>DOMParser</code> constructs the
+document. For example:
+<pre class='code'>
+<span class='code-comment'>// import org.cyberneko.html.filters.DefaultFilter;
+// import org.cyberneko.html.parsers.DOMParser;
+// import org.apache.xerces.xni.*;
+// import org.apache.xerces.xni.parser.XMLDocumentFilter;</span>
+
+DOMParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DOMParser<span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span><span class='code-func'>setProperty</span><span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/properties/filters"</span><span class='code-punct'>,</span>
+ <span class='code-keyword'>new</span> XMLDocumentFilter<span class='code-punct'>[] {</span> <span class='code-keyword'>new</span> DefaultFilter<span class='code-punct'>() {</span>
+ <span class='code-keyword'>public void</span> <span class='code-func'>startElement</span><span class='code-punct'>(</span>QName element<span class='code-punct'>,</span> XMLAttributes attrs<span class='code-punct'>,</span>
+ Augmentations augs<span class='code-punct'>)</span> <span class='code-keyword'>throws</span> XNIException <span class='code-punct'>{</span>
+ element<span class='code-punct'>.</span>uri <span class='code-punct'>=</span> <span class='code-keyword'>null</span><span class='code-punct'>;</span>
+ <span class='code-keyword'>super</span><span class='code-punct'>.</span><span class='code-func'>startElement</span><span class='code-punct'>(</span>element<span class='code-punct'>,</span> attrs<span class='code-punct'>,</span> augs<span class='code-punct'>);</span>
+ <span class='code-punct'>}</span>
+ <span class='code-comment'>// ...etc...</span>
+ <span class='code-punct'>}
+});</span>
+</pre>
+
+<a name='prefilter'></a>
+<h3>How do I add filters <em>before</em> the tag balancer?</h3>
+
+<p>
+The NekoHTML parser has a property that allows you to append
+custom filter components at the end of the parser pipeline as
+detailed in the <a href='filters.html'>Pipeline Filters</a>
+documentation. But this means that processing occurs
+<em>after</em> the tag-balancer does its job. However, the same
+property can also be used to insert custom components before
+the tag-balancer as well.
+<p>
+The secret is to <em>disable</em> the tag-balancing feature and
+then add another instance of the <code>HTMLTagBalancer</code>
+component at the end of your custom filter pipeline. The following
+example shows how to add a custom filter before the tag-balancer
+in the DOM parser. (This also works on all other types of parsers
+that use the <code>HTMLConfiguration</code>.)
+<pre class='code'>
+<span class='code-comment'>// import org.cyberneko.html.HTMLConfiguration;
+// import org.cyberneko.html.parsers.DOMParser;
+// import org.apache.xerces.xni.parser.XMLDocumentFilter;</span>
+
+DOMParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DOMParser<span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span><span class='code-func'>setFeature</span><span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/features/balance-tags"</span><span class='code-punct'>,</span> <span class='code-keyword'>false</span><span class='code-punct'>);</span>
+XMLDocumentFilter<span class='code-punct'>[]</span> filters <span class='code-punct'>= {</span> <span class='code-keyword'>new</span> MyFilter<span class='code-punct'>(),</span> <span class='code-keyword'>new</span> HTMLTagBalancer<span class='code-punct'>() };</span>
+parser<span class='code-punct'>.</span><span class='code-func'>setProperty</span><span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/properties/filters"</span><span class='code-punct'>,</span> filters<span class='code-punct'>);</span>
+</pre>
+
+<a name='fragments'></a>
+<h3>How do I parse HTML document fragments?</h3>
+
+<p>
+Frequently, HTML is used within applications and online forms
+to allow users to enter rich-text. In these situations, it is
+useful to be able to parse the entered text as a document
+<i>fragment</i>. In other words, the entered text represents
+content within the HTML &lt;body&gt; element &mdash; it is
+<em>not</em> a full HTML document.
+<p>
+Starting with version 0.7.0, NekoHTML has added a feature that
+allows the application to parse HTML document fragments. Setting
+the "<code>http://cyberneko.org/html/features/balance-tags/document-fragment</code>"
+feature to <code>true</code> instructs the tag-balancer to
+balance only tags found within the HTML &lt;body&gt; element.
+The surrounding &lt;body&gt; and &lt;html&gt; elements are not
+inserted.
+<p>
+<strong>Note:</strong>
+The document-fragment feature should <strong>not</strong> be
+used on the <code>DOMParser</code> class since it relies on
+balanced elements in order to correctly construct the DOM
+tree. However, a new parser class has been added to NekoHTML
+to allow you to parse DOM document fragments. Please refer to
+the <a href='usage.html#convenience'>Usage Instructions</a>
+for more information.
+
+<a name='offsets'></a>
+<h3>How can I get the location of document information?</h3>
+
+<p>
+Many applications are interested in knowing where elements,
+attributes, and character data appear within the source
+document. To aid these applications, NekoHTML has a feature
+that reports the starting and ending character offsets of
+each piece of information in the document.
+<p>
+In order to tell NekoHTML to report the character offsets
+for document information, the
+<a href='settings.html#augmentations'>augmentations</a>
+feature needs to be turned on. For example:
+<p>
+<pre class='code'>
+<span class='code-comment'>// import org.cyberneko.html.parsers.SAXParser;</span>
+
+String AUGMENTATIONS <span class='code-punct'>=</span> <span class='code-string'>"http://cyberneko.org/html/features/augmentations"</span><span class='code-punct'>;</span>
+
+SAXParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> <span class='code-func'>SAXParser</span><span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span><span class='code-func'>setFeature</span><span class='code-punct'>(</span>AUGMENTATIONS<span class='code-punct'>,</span> <span class='code-keyword'>true</span><span class='code-punct'>);</span>
+</pre>
+<p>
+Once the feature is enabled, the location information can be
+obtained by querying the
+<code><a href='javadoc/org/cyberneko/html/HTMLEventInfo.html'>HTMLEventInfo</a></code>
+object in the <code>Augmentations</code> parameter passed to
+all XNI callbacks. This dependency is required because DOM
+and SAX lack the ability to communicate this detailed
+information to the application.
+<p>
+The XNI dependence does not restrict applications to only
+using the Xerces Native Interface, however. The best way to
+use this information is by extending one of the parsers in the
+<code>org.cyberneko.html.parsers</code> package and overriding
+the methods of interest. The following example extends the
+<code>SAXParser</code> class to retrieve the event information
+for start elements:
+<p>
+<pre class='code'>
+<span class='code-keyword'>public class</span> MySAXParser <span class='code-keyword'>extends</span> SAXParser <span class='code-punct'>{</span>
+
+ <span class='code-keyword'>static final</span> String AUGMENTATIONS <span class='code-punct'>=</span>
+ <span class='code-string'>"http://cyberneko.org/html/features/augmentations"</span><span class='code-punct'>;</span>
+
+ <span class='code-keyword'>public</span> <span class='code-func'>MySAXParser</span><span class='code-punct'>() {</span>
+ <span class='code-func'>setFeature</span><span class='code-punct'>(</span>AUGMENTATIONS<span class='code-punct'>,</span> <span class='code-keyword'>true</span><span class='code-punct'>);
+ }</span>
+
+ <span class='code-keyword'>public void</span> <span class='code-func'>startElement</span><span class='code-punct'>(</span>QName element<span class='code-punct'>,</span> XMLAttributes attrs<span class='code-punct'>,</span>
+ Augmentations augs<span class='code-punct'>)</span> <span class='code-keyword'>throws</span> XNIException <span class='code-punct'>{</span>
+
+ <span class='code-comment'>// get offset information</span>
+ HTMLEventInfo info <span class='code-punct'>=
+ (</span>HTMLEventInfo<span class='code-punct'>)</span>augs<span class='code-punct'>.</span><span class='code-func'>getItem</span><span class='code-punct'>(</span>AUGMENTATIONS<span class='code-punct'>);</span>
+
+ <span class='code-keyword'>boolean</span> synthesized <span class='code-punct'>=</span> info<span class='code-punct'>.</span><span class='code-func'>isSynthesized</span><span class='code-punct'>();</span>
+ <span class='code-keyword'>int</span> beginRow <span class='code-punct'>=</span> info<span class='code-punct'>.</span><span class='code-func'>getBeginLineNumber</span><span class='code-punct'>();</span>
+ <span class='code-keyword'>int</span> beginCol <span class='code-punct'>=</span> info<span class='code-punct'>.</span><span class='code-func'>getBeginColumnNumber</span><span class='code-punct'>();</span>
+ <span class='code-keyword'>int</span> endRow <span class='code-punct'>=</span> info<span class='code-punct'>.</span><span class='code-func'>getEndLineNumber</span><span class='code-punct'>();</span>
+ <span class='code-keyword'>int</span> endCol <span class='code-punct'>=</span> info<span class='code-punct'>.</span><span class='code-func'>getEndColumnNumber</span><span class='code-punct'>();</span>
+
+ <span class='code-comment'>// perform default processing</span>
+ <span class='code-keyword'>super</span><span class='code-punct'>.</span><span class='code-func'>startElement</span><span class='code-punct'>(</span>element<span class='code-punct'>,</span> attrs<span class='code-punct'>,</span> augs<span class='code-punct'>);
+ }
+
+}</span>
+</pre>
+<p>
+<strong>Note:</strong>
+The NekoHTML parser reports character offsets and is unable
+to report the byte offsets that map to the resulting characters.
+The parser takes advantage of the character decoders present in
+the JVM which do not report byte offsets. And because these
+decoders buffer blocks of bytes internally for performance
+reasons, it is not possible to write a custom input stream to
+perform this mapping between byte and character offsets. If you
+control the source documents and can restrict them to a single
+character encoding, then writing a custom reader to perform
+this mapping is more feasible.
+<p>
+<strong>Note:</strong>
+Currently, only the start and end row and column information
+can be queried. In the future, NekoHTML will be able
+to report character offsets from the beginning of the file.
+This does not, however, mean that byte offsets will also be
+supported at a future date.
+
+<a name='xerces2'></a>
+<h3>Do I have to use all of Xerces2?</h3>
+
+<p>
+While NekoHTML is a rather small library, many users complain
+about the size of the Xerces2 library. However, the full
+Xerces2 library is <em>not</em> required in order to use the
+NekoHTML parser. Because the CyberNeko HTML parser is written
+using the Xerces Native Interface (XNI) framework that forms
+the foundation of the Xerces2 implementation, only that part
+is required to write applications using NekoHTML.
+<p>
+For convenience, a small Jar file containing only the necessary
+parts of the framework and utility classes from Xerces2 is
+distributed with the NekoHTML package. The Jar file, called
+<code>xercesMinimal.jar</code>, can be found in the
+<code>lib/</code> directory of the distribution. Simply add
+this file to your classpath along with <code>nekohtml.jar</code>.
+<p>
+However, there are a few restrictions if you choose to use
+the <code>xercesMinimal.jar</code> file instead of the full
+Xerces2 package. First, you cannot use the DOM and SAX parsers
+included with NekoHTML because they use the Xerces2 base
+classes. Second, because you cannot use the convenience
+parser classes, your application must be written using the
+XNI framework. However, using the XNI framework is not
+difficult for programmers familiar with SAX. [Note: future
+versions of NekoHTML may include custom implementations of
+the DOM and SAX parsers to avoid this dependence on the
+Xerces2 library.]
+<p>
+Most users of the CyberNeko HTML parser will not have a
+problem including the full Xerces2 package because the
+application is likely to need an XML parser implementation.
+However, for those users who are concerned about Jar file
+size, using the <code>xercesMinimal.jar</code> file
+may be a useful alternative.
+
+<a name='version'></a>
+<h3>What version of NekoHTML am I using?</h3>
+
+<p>
+Since version 0.9.3, NekoHTML includes a class that can be
+used to query the product version within application code.
+The <code>Version</code> class in the
+<code>org.cyberneko.html</code> package contains a method,
+<code>getVersion</code>, that returns the NekoHTML version
+as a string. For example:
+<pre class='code'>
+<span class='code-comment'>// import org.cyberneko.html.Version;</span>
+
+System<span class='code-punct'>.</span>err<span class='code-punct'>.</span><span class='code-func'>println</span><span class='code-punct'>(</span>Version<span class='code-punct'>.</span><span class='code-func'>getVersion</span><span class='code-punct'>());</span>
+</pre>
+<p>
+The <code>Version</code> class also includes a <code>main</code>
+method that prints the version information to standard output.
+<p>
+The version and product information can also be queried using
+the Java package API. For example:
+<pre class='code'>
+Class cls <span class='code-punct'>=</span> Class<span class='code-punct'>.</span><span class='code-func'>forName</span><span class='code-punct'>(</span><span class='code-string'>"org.cyberneko.html.HTMLConfiguration"</span><span class='code-punct'>);</span>
+Package pkg <span class='code-punct'>=</span> cls<span class='code-punct'>.</span><span class='code-func'>getPackage</span><span class='code-punct'>();</span>
+
+String name <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getName</span><span class='code-punct'>();</span>
+
+String specTitle <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getSpecificationTitle</span><span class='code-punct'>();</span>
+String specVendor <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getSpecificationVendor</span><span class='code-punct'>();</span>
+String specVersion <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getSpecificationVersion</span><span class='code-punct'>();</span>
+
+String implTitle <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getImplementationTitle</span><span class='code-punct'>();</span>
+String implVendor <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getImplementationVendor</span><span class='code-punct'>();</span>
+String implVersion <span class='code-punct'>=</span> pkg<span class='code-punct'>.</span><span class='code-func'>getImplementationVersion</span><span class='code-punct'>();</span>
+</pre>
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
Added: branches/nekohtml/upstream/0.9.5/doc/html/filters.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/filters.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/filters.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,373 @@
+<title>NekoHTML | Pipeline Filters</title>
+<link rel=stylesheet type=text/css href=../style.css>
+
+<h1>Pipeline Filters</h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+<a href='index.html'>Top</a>
+|
+<a href='usage.html'>Usage</a>
+|
+<a href='settings.html'>Settings</a>
+|
+Filters
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+<a href='faq.html'>FAQ</a>
+|
+<a href='software.html'>Software</a>
+|
+<a href='changes.html'>Changes</a>
+]
+</div>
+
+<h2>Table of Contents</h2>
+<ul>
+<li><a href='#overview'>Overview</a>
+ <ul>
+ <li><a href='#overview.create'>Creating a New Filter</a>
+ <li><a href='#overview.append'>Appending Filters to the Pipeline</a>
+ </ul>
+<li><a href='#filters'>Sample Filters</a>
+ <ul>
+ <li><a href='#filters.serialize'>Serializing HTML Documents</a>
+ <li><a href='#filters.namespaces'>Namespace Processing</a>
+ <li><a href='#filters.well-formedness'>Ensuring XML Well-Formedness</a>
+ <li><a href='#filters.removing'>Removing Elements</a>
+ <li><a href='#filters.identity'>Performing Identity Transform</a>
+ <li><a href='#filters.dynamic'>Dynamically Inserting Content</a>
+ </ul>
+</ul>
+
+<hr>
+
+<a name='overview'></a>
+<h2>Overview</h2>
+<p>
+The Xerces Native Interface (XNI) defines a parser configuration
+framework in which parsers can be written as a pipeline of
+modular components. This allows new parser configurations to be
+constructed by re-arranging existing components and/or writing
+custom components. And because the NekoHTML parser is written using
+this modular framework, new functionality can be quickly and
+easily added to the parser by appending custom document filters
+to the end of the default NekoHTML parsing pipeline.
+
+<a name='overview.create'></a>
+<h3>Creating a New Filter</h3>
+<p>
+To write a custom filter, simply write a new class that implements
+the <code>XMLDocumentFilter</code> interface from the
+<code>org.apache.xerces.xni.parser</code> package of Xerces2. This
+interface allows the component to be both the <em>handler</em> of
+document events from the previous stage in the pipeline as well as
+the <em>source</em> for the next stage in the pipeline. The
+implementation of the new filter is completely arbitrary; it can
+remove events from the document stream, generate new events, or
+anything else you want!
+<p>
+NekoHTML includes a base filter class to simplify the creation of
+custom filters. To write a new filter, simply extend the
+<code>DefaultFilter</code> class located in the
+<code>org.cyberneko.html.filters</code> package and override the
+relevant methods to add your own behavior. Once done, the only
+thing you need to do is append the filter to the end of the
+parser pipeline.
+
+<a name='overview.append'></a>
+<h3>Appending Filters to the Pipeline</h3>
+<p>
+The NekoHTML parser has a <a href='settings.html#filters'>filters
+property</a> that allows you to append custom document filters to
+the end of the default parser pipeline. The value of this property
+is an array of objects that implement the <code>XMLDocumentFilter</code>
+interface in XNI. For example, the following code instantiates a
+default filter and appends it to the parser pipeline:
+<pre class='code'>
+XMLDocumentFilter noop <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DefaultFilter<span class='code-punct'>();</span>
+XMLDocumentFilter<span class='code-punct'>[]</span> filters <span class='code-punct'>= {</span> noop <span class='code-punct'>};</span>
+
+XMLParserConfiguration parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> HTMLConfiguration<span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span>setProperty<span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/properties/filters"</span><span class='code-punct'>,</span> filters<span class='code-punct'>);</span>
+</pre>
+
+<a name='filters'></a>
+<h2>Sample Filters</h2>
+<p>
+This section describes a few of the basic document filters
+that are included with the NekoHTML parser. The included filters
+enable applications to perform a variety of operations, including:
+<ul>
+<li>serializing HTML documents;
+<li>ensuring XML well-formedness;
+ and
+<li>performing identity transform.
+</ul>
+
+<a name='filters.serialize'></a>
+<h3>Serializing HTML Documents</h3>
+<p>
+NekoHTML includes a simple HTML serializer written as a filter.
+The <code>Writer</code> class is located in the
+<code>org.cyberneko.html.filters</code> package and contains two
+different constructors. The default constructor creates a writer
+that prints to the standard output. The other constructor allows
+the application to control the output stream and the encoding.
+For example:
+<pre class='code'>
+<span class='code-comment'>// write to standard output using UTF-8</span>
+XMLDocumentFilter writer <span class='code-punct'>=</span> <span class='code-keyword'>new</span> Writer<span class='code-punct'>();</span>
+
+<span class='code-comment'>// write to file with specified encoding</span>
+OutputStream stream <span class='code-punct'>=</span> <span class='code-keyword'>new</span> FileOutputStream<span class='code-punct'>(</span><span class='code-string'>"index.html"</span><span class='code-punct'>);</span>
+String encoding <span class='code-punct'>=</span> <span class='code-string'>"ISO-8859-1"</span><span class='code-punct'>;</span>
+XMLDocumentFilter writer <span class='code-punct'>=</span> <span class='code-keyword'>new</span> Writer<span class='code-punct'>(</span>stream, encoding<span class='code-punct'>);</span>
+</pre>
+<p>
+Besides serializing the HTML event stream, the writer also passes
+the document events to the next stage in the pipeline. This allows
+applications to insert writer filters between other custom filters
+for debugging purposes.
+<p>
+Since an HTML document may have specified its encoding using the
+&lt;META&gt; tag and http-equiv/content attributes, the writer will
+automatically change any character set specified in this tag to
+match the encoding of the output stream. Therefore, the character
+encoding name used to construct the writer should be an official
+<a href='http://www.iana.org/assignments/character-sets'>IANA</a>
+encoding name and not a Java encoding name.
+<strong>Note:</strong>
+The modified character set in the &lt;META&gt; tag is <em>not</em>
+propagated to the next stage in the pipeline. The changed value is
+only output to the stream; the original value is sent to the next
+stage in the pipeline.
+<p>
+For convenience, the <code>Writer</code> class contains a
+<code>main</code> method that allows you to run it as a program.
+This can be used for debugging purposes in order to see what the
+NekoHTML parser is generating as well as converting the character
+encoding of existing documents.
+<p>
+The following table shows the standard usage of the writer:
+<table cellspacing='0' cellpadding='3'>
+<tr><th style='border-bottom: 0'>Usage:
+<td style='border-bottom: 0'><tt>java org.cyberneko.html.filters.Writer (options) file ...</tt>
+<tr><th style='border-bottom: solid black 1'>Options:
+<td><pre>
+ -e name Specify IANA name of output encoding.
+ -i Perform identity transform.
+ -p Purify output to ensure XML well-formedness.
+ -h Display help screen.</pre>
+</td>
+</tr>
+</table>
+
+<a name='filters.namespaces'></a>
+<h3>Namespace Processing</h3>
+<p>
+A filter to perform namespace processing is included with NekoHTML,
+for convenience. You do not need to add this filter manually because
+it is automatically added to the parsing pipeline if the SAX namespaces
+feature is enabled. However, if you are interested, the
+<code>NamespaceBinder</code> component is included in the
+<code>org.cyberneko.html.filters</code> package.
+<p>
+<strong>Note:</strong>
+This component does not perform <em>any</em> namespace processing
+unless the SAX namespaces feature,
+"http://xml.org/sax/features/namespaces", is enabled.
+
+<a name='filters.well-formedness'></a>
+<h3>Ensuring XML Well-Formedness</h3>
+<p>
+HTML allows documents to be less strict than XML documents which
+means that most HTML documents cannot be parsed with an XML parser.
+But even if an HTML document can be parsed and accessed by
+applications using standard XML programming interfaces, many
+applications need to produce well-formed output. Not only do tags
+need to be balanced properly, but the document content must also
+be legal according to the XML specification. Therefore, the
+NekoHTML parser provides a filter that "purifies" the input,
+ensuring that the output is well-formed XML.
+<p>
+The <code>Purifier</code> class in the
+<code>org.cyberneko.html.filters</code> package lets the application
+convert the HTML input into well-formed XML output. Some of the
+changes that the Purifier performs are:
+<ul>
+<li>fixing illegal element and attribute names;
+<li>ensuring the string "--" does not appear in the content of
+ a comment;
+<li>escaping illegal characters appearing in the document;
+<li>etc.
+</ul>
+
+<a name='filters.removing'></a>
+<h3>Removing Elements</h3>
+<p>
+The NekoHTML parser also provides a basic document filter capable
+of removing specified elements from the processing stream. The
+<code>ElementRemover</code> class is located in the
+<code>org.cyberneko.html.filters</code> package and provides
+two options for processing document elements:
+<ul>
+<li>specifying those elements which should be accepted and,
+ optionally, which attributes of that element should be
+ kept; and
+<li>specifying those elements whose tags and content should be
+ completely removed from the event stream.
+</ul>
+<p>
+The first option allows the application to specify which elements
+appearing in the event stream should be accepted and, therefore,
+passed on to the next stage in the pipeline. All elements
+<em>not</em> in the list of acceptable elements have their start
+and end tags stripped from the event stream <em>unless</em> those
+elements appear in the list of elements to be removed.
+<p>
+The second option allows the application to specify which elements
+should be completely removed from the event stream. When an element
+appears that is to be removed, the element's start and end tag as
+well as all of that element's content is removed from the event
+stream.
+<p>
+A common use of this filter would be to only allow rich-text
+and linking elements as well as the character content to pass
+through the filter — all other elements would be stripped.
+The following code shows how to configure this filter to perform
+this task:
+<pre class='code'>
+ElementRemover remover <span class='code-punct'>=</span> <span class='code-keyword'>new</span> ElementRemover<span class='code-punct'>();</span>
+remover<span class='code-punct'>.</span>acceptElement<span class='code-punct'>(</span><span class='code-string'>"b"</span><span class='code-punct'>,</span> <span class='code-keyword'>null</span><span class='code-punct'>);</span>
+remover<span class='code-punct'>.</span>acceptElement<span class='code-punct'>(</span><span class='code-string'>"i"</span><span class='code-punct'>,</span> <span class='code-keyword'>null</span><span class='code-punct'>);</span>
+remover<span class='code-punct'>.</span>acceptElement<span class='code-punct'>(</span><span class='code-string'>"u"</span><span class='code-punct'>,</span> <span class='code-keyword'>null</span><span class='code-punct'>);</span>
+remover<span class='code-punct'>.</span>acceptElement<span class='code-punct'>(</span><span class='code-string'>"a"</span><span class='code-punct'>,</span> <span class='code-keyword'>new</span> String<span class='code-punct'>[] {</span> <span class='code-string'>"href"</span> <span class='code-punct'>});</span>
+</pre>
+<p>
+However, this would still allow the text content of other
+elements to pass through, which may not be desirable. In order
+to further "clean" the input, the <code>removeElement</code>
+option can be used. The following piece of code adds the ability
+to completely remove any <SCRIPT> tags and content
+from the stream.
+<pre class='code'>
+remover<span class='code-punct'>.</span>removeElement<span class='code-punct'>(</span><span class='code-string'>"script"</span><span class='code-punct'>);</span>
+</pre>
+<p>
+This source code is included in the <code>src/html/sample/</code>
+directory in the file named <code>RemoveElements.java</code>.
+<p>
+<strong>Note:</strong>
+When an element is "stripped", its start and end tags are
+removed from the event stream. However, all of the element's
+text content and elements (that are accepted) are not stripped.
+To completely remove an element's content, use the
+<code>removeElement</code> method.
+<p>
+<strong>Note:</strong>
+Care should be taken when using this filter because the output
+may not be a well-balanced tree. Specifically, if the application
+removes the <HTML> element (with or without retaining its
+children), the resulting document event stream will no longer be
+well-formed.
+
+<a name='filters.identity'></a>
+<h3>Performing Identity Transform</h3>
+<p>
+An identity filter is provided that performs an identity
+operation of the original document event stream generated by the
+HTML scanner by removing events that are synthesized by the tag
+balancer. This operation is essentially the same as turning off
+tag-balancing in the parser. However, this filter is useful when
+you want the tag balancer to report "errors" but do not want the
+synthesized events in the output.
+<p>
+<strong>Note:</strong>
+This filter requires the augmentations feature to be turned on.
+For example:
+<pre class='code'>
+XMLParserConfiguration parser <span class='code-punct'>=</span> new HTMLConfiguration<span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span>setFeature<span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/features/augmentations"</span><span class='code-punct'>,</span> <span class='code-keyword'>true</span><span class='code-punct'>);</span>
+</pre>
+<p>
+<strong>Note:</strong>
+This isn't <em>exactly</em> the identity transform because the
+element and attribute names may have been modified from the
+original document. For example, by default, NekoHTML converts
+element names to upper-case and attribute names to lower-case.
+
+<a name='filters.dynamic'></a>
+<h3>Dynamically Inserting Content</h3>
+<p>
+The NekoHTML parser has the ability to dynamically insert content
+into the parsed HTML document. This functionality can be used to
+insert the result of an embedded script (e.g. JavaScript) into the
+HTML document in place of the script element. <strong>Note:</strong>
+NekoHTML does not provide a scripting engine — only the
+ability to insert content to be parsed.
+<p>
+To insert content into the HTML document stream, call the
+<code>pushInputSource</code> method on the NekoHTML parser
+configuration class. This method takes an <code>XMLInputSource</code>
+object as a parameter. At the moment, the character stream
+(java.io.Reader) of the input source <strong>must</strong> be
+set or else the implementation will throw an illegal argument
+exception.
+<p>
+A sample program called <code>Script</code> is included in the
+<tt>src/html/sample/</tt> directory that demonstrates how to use the
+<code>pushInputSource</code> method of the HTMLConfiguration in order
+to dynamically insert content into the HTML stream.
+This sample defines a new script language called "NekoScript"
+that is a modified subset of the
+<a href='http://www.jclark.com/sp/sgmlsout.htm'>NSGMLS format</a>.
+In this format, each line specifies a new <i>command</i> where each
+command may indicate a start element tag, an attribute value,
+character content, an end element tag, etc. The following table
+enumerates the NSGMLS features supported by the NekoScript
+language:
+<table border='1' cellspacing='0' cellpadding='3'>
+<tr>
+<th style='font-weight:normal;border-bottom:solid black 1'><tt>(<i>name</i></tt>
+<td>A start element with the specified <i>name</i>.
+<tr>
+<th style='font-weight:normal;border-bottom:solid black 1'><tt>"<i>text</i></tt>
+<td>Character content with the specified <i>text</i>.
+<tr>
+<th style='font-weight:normal;border-bottom:solid black 1'><tt>)<i>name</i></tt>
+<td>An end element with the specified <i>name</i>.
+</table>
+<p>
+When processed with the <code>Script</code> filter, the following
+document:
+<pre class='document'>
+<script type='text/x-nekoscript'>
+(h1
+"Header
+)h1
+</script>
+</pre>
+<p>
+is equivalent to:
+<pre class='document'>
+<H1>Header</H1>
+</pre>
+<p>
+as seen by the document handler registered with the parser.
+<p>
+The <code>Script</code> class implements a <code>main</code>
+method so that it can be run as a program. Running the program
+produces the following output: [<strong>Note:</strong> The command
+should be contiguous. It is split among separate lines in this
+example to make it easier to read.]
+<pre class='cmdline'>
+<span class='cmdline-prompt'>></span> <span class='cmdline-cmd'>java -cp nekohtml.jar;nekohtmlSamples.jar;lib/xercesMinimal.jar
+ sample.Script data/test33.html</span>
+<H1>Header</H1>
+</pre>
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/doc/html/index.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/index.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/index.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,105 @@
+<title>NekoHTML</title>
+<link rel=stylesheet type=text/css href=../style.css>
+
+<h1>CyberNeko HTML Parser <sub>0.9.5</sub></h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+Top
+|
+<a href='usage.html'>Usage</a>
+|
+<a href='settings.html'>Settings</a>
+|
+<a href='filters.html'>Filters</a>
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+<a href='faq.html'>FAQ</a>
+|
+<a href='software.html'>Software</a>
+|
+<a href='changes.html'>Changes</a>
+]
+</div>
+
+<h2>About</h2>
+<p>
+NekoHTML is a simple <a href='http://www.w3.org/TR/html4/'>HTML</a>
+scanner and tag balancer that enables application programmers to
+parse HTML documents and access the information using standard XML
+interfaces. The parser can scan HTML files and "fix up" many common
+mistakes that human (and computer) authors make in writing HTML
+documents. NekoHTML adds missing parent elements; automatically
+closes elements with optional end tags; and can handle mismatched
+inline element tags.
+<p>
+NekoHTML is written using the
+<a href='http://xml.apache.org/xerces2-j/xni.html'>Xerces
+Native Interface</a> (XNI) that is the foundation of the
+<a href='http://xml.apache.org/xerces2-j/'>Xerces2</a>
+implementation. This enables you to use the NekoHTML parser
+with existing XNI tools without modification or rewriting code.
+
+<h2>License Agreement</h2>
+<p>
+The NekoHTML parser is distributed under an Apache-style license
+and is currently being considered as a sub-project of the Apache
+Xerces project. If accepted, the license and development of NekoHTML
+will move to Apache. However, this will not affect the ability of
+the parser to be freely used as specified by the current license.
+<p>
+For specific license details, please refer to the
+<a href='../../LICENSE'>LICENSE</a> file.
+
+<h2>Download</h2>
+<p>
+The NekoHTML parser includes complete Java source code and
+documentation. You can download the latest version from the
+following location:
+<ul>
+<li>NekoHTML
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-latest.zip'>zip</a>]
+ [<a href='http://www.apache.org/~andyc/neko/nekohtml-latest.tar.gz'>tgz</a>]
+</ul>
+
+<h2>Requirements and Limitations</h2>
+<p>
+This version of NekoHTML requires the following:
+<ul>
+<li><a href='http://java.sun.com/products/'>Java 1.1</a> (or
+ higher)
+<li><a href='http://xml.apache.org/xerces2-j/download.cgi'>Xerces 2.0.0</a>
+ (or higher)
+ [<a href='http://archive.apache.org/dist/xml/xerces-j/'>archive</a>]
+</ul>
+<p>
+This version has the following limitations:
+<ul>
+<li>There are HTML documents for which NekoHTML cannot properly
+ generate a well-formed XML document event stream. For example,
+ documents with multiple <html> tags are inherently
+ ill-formed because XML documents may only have a single root
+ element.
+<li>Code added to the core DOM implementation in Xerces-J 2.0.1
+ introduced a bug in the HTML DOM implementation based on it.
+ The bug causes the element nodes in the resultant HTML document
+ object to be of type <code>org.apache.xerces.dom.ElementNSImpl</code>
+ instead of the appropriate HTML DOM element objects. The problem
+ affects NekoHTML users who use the parser with Xerces-J 2.0.1 and
+ anyone using the HTML DOM implementation in Xerces-J 2.0.1.
+<li>There are no other known major limitations with this release.
+ However, additional work can always be done to improve
+ performance, fix bugs, and add functionality.
+</ul>
+<p>
+For a more complete list of items to be done, please refer to
+the <a href='../../TODO_html'>Todo Items</a>.
+
+<h2>Contact Information</h2>
+<p>
+Andy Clark <<a href='mailto:andyc at apache.org'>andyc at apache.org</a>>
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/doc/html/settings.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/settings.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/settings.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,412 @@
+<title>NekoHTML | Parser Settings</title>
+<link rel=stylesheet type=text/css href=../style.css>
+<style type='text/css'>
+.see {
+ margin: 0; margin-top: 0.25em;
+ font-size: 0.8em;
+}
+</style>
+
+<h1>Parser Settings</h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+<a href='index.html'>Top</a>
+|
+<a href='usage.html'>Usage</a>
+|
+Settings
+|
+<a href='filters.html'>Filters</a>
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+<a href='faq.html'>FAQ</a>
+|
+<a href='software.html'>Software</a>
+|
+<a href='changes.html'>Changes</a>
+]
+</div>
+
+<h2>Configuring Parser</h2>
+<p>
+The application can set a variety of NekoHTML settings to more
+precisely control the behavior of the parser. These settings
+can be set directly on the <code>HTMLConfiguration</code> class
+or on the supplied parser classes by calling the
+<code>setFeature</code> and <code>setProperty</code> methods.
+For example:
+<pre class='code'>
+<span class='code-comment'>// settings on HTMLConfiguration</span>
+org<span class='code-punct'>.</span>apache<span class='code-punct'>.</span>xerces<span class='code-punct'>.</span>xni<span class='code-punct'>.</span>parser<span class='code-punct'>.</span>XMLParserConfiguration config <span class='code-punct'>=</span>
+ <span class='code-keyword'>new</span> org<span class='code-punct'>.</span>cyberneko<span class='code-punct'>.</span>html<span class='code-punct'>.</span>HTMLConfiguration<span class='code-punct'>();</span>
+config<span class='code-punct'>.</span>setFeature<span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/features/augmentations"</span><span class='code-punct'>,</span> <span class='code-keyword'>true</span><span class='code-punct'>);</span>
+config<span class='code-punct'>.</span>setProperty<span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/properties/names/elems"</span><span class='code-punct'>,</span> <span class='code-string'>"lower"</span><span class='code-punct'>);</span>
+
+<span class='code-comment'>// settings on DOMParser</span>
+org<span class='code-punct'>.</span>cyberneko<span class='code-punct'>.</span>html<span class='code-punct'>.</span>parsers<span class='code-punct'>.</span>DOMParser parser <span class='code-punct'>=</span>
+ <span class='code-keyword'>new</span> org<span class='code-punct'>.</span>cyberneko<span class='code-punct'>.</span>html<span class='code-punct'>.</span>parsers<span class='code-punct'>.</span>DOMParser<span class='code-punct'>();</span>
+parser<span class='code-punct'>.</span>setFeature<span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/features/augmentations"</span><span class='code-punct'>,</span> <span class='code-keyword'>true</span><span class='code-punct'>);</span>
+parser<span class='code-punct'>.</span>setProperty<span class='code-punct'>(</span><span class='code-string'>"http://cyberneko.org/html/properties/names/elems"</span><span class='code-punct'>,</span> <span class='code-string'>"lower"</span><span class='code-punct'>);</span>
+</pre>
+
+<h2>Features</h2>
+The NekoHTML parser supports the following features:
+<table cellpadding='4' cellspacing='0'>
+ <tr><th>Feature Id / Description<th>Default
+ <tr>
+ <td>
+ <a name='namespaces'></a>
+ <span class='id'>http://xml.org/sax/features/namespaces</span>
+ <br>
+ Specifies if the NekoHTML parser should perform namespace
+ processing. If enabled, namespace binding attributes are
+ processed and elements and attributes are bound to the defined
+ namespaces.
+ <p class='see'>
+ See: <a href='#override-namespaces'>http://cyberneko.org/html/features/override-namespaces</a>
+ <td align='center'>true
+ <tr>
+ <td>
+ <a name='balance-tags'></a>
+ <span class='id'>http://cyberneko.org/html/features/balance-tags</span>
+ <br>
+ Specifies if the NekoHTML parser should attempt to balance
+ the tags in the parsed document. Balancing the tags fixes up many
+ common mistakes by adding missing parent elements, automatically
+ closing elements with optional end tags, and correcting unbalanced
+ inline element tags. In order to process HTML documents as XML, this
+ feature should <strong>not</strong> be turned off. This feature is
+ provided as a performance enhancement for applications that only
+ care about the appearance of specific elements, attributes, and/or
+ content regardless of the document's ill-formed structure.
+ <td align='center'>true
+ <tr>
+ <td>
+ <a name='override-doctype'></a>
+ <span class='id'>http://cyberneko.org/html/features/override-doctype</span>
+ <br>
+ Specifies whether the NekoHTML parser should override the public
+ and system identifier values specified in the document type declaration.
+ <p class='see'>
+ See: <a href='#doctype-pubid'>http://cyberneko.org/html/properties/doctype/pubid</a>
+ <br>
+ See: <a href='#doctype-sysid'>http://cyberneko.org/html/properties/doctype/sysid</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='insert-doctype'></a>
+ <span class='id'>http://cyberneko.org/html/features/insert-doctype</span>
+ <br>
+ Specifies whether the NekoHTML parser should insert a document type
+ declaration into the document handler callbacks. The values for the
+ public and system identifiers are taken from the sysid and pubid
+ properties. Therefore, those properties should be set if this
+ feature is turned on. Also, setting this feature to <code>true</code>
+ will cause the parser to ignore any document type declaration that
+ appears in the document.
+ <p class='see'>
+ See: <a href='#doctype-pubid'>http://cyberneko.org/html/properties/doctype/pubid</a>
+ <br>
+ See: <a href='#doctype-sysid'>http://cyberneko.org/html/properties/doctype/sysid</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='override-namespaces'></a>
+ <span class='id'>http://cyberneko.org/html/features/override-namespaces</span>
+ <br>
+ Specifies whether the NekoHTML parser should override the namespace
+ URI bound to HTML elements and attributes.
+ <p class='see'>
+ See: <a href='#namespaces-uri'>http://cyberneko.org/html/properties/namespaces-uri</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='insert-namespaces'></a>
+ <span class='id'>http://cyberneko.org/html/features/insert-namespaces</span>
+ <br>
+ Specifies whether the NekoHTML parser should insert namespace URI
+ bindings to HTML elements and attributes. The value for the
+ namespace URI is taken from the namespaces property. Therefore,
+ that property should be set if this feature is turned on.
+ <p class='see'>
+ See: <a href='#namespaces-uri'>http://cyberneko.org/html/properties/namespaces-uri</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='ignore-outside-content'></a>
+ <span class='id'>http://cyberneko.org/html/features/balance-tags/ignore-outside-content</span>
+ <br>
+ Specifies if the NekoHTML parser should ignore content after the end
+ of the document root element. If this feature is set to true, all
+ elements and character content appearing outside of the document body
+ are consumed. If set to false, the end elements for the <body>
+ and <html> are ignored, allowing content appearing outside of
+ the document to be parsed and communicated to the application.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='document-fragment'></a>
+ <span class='id'>http://cyberneko.org/html/features/balance-tags/document-fragment</span>
+ <br>
+ Specifies if the tag balancer should operate as if a fragment
+ of HTML is being parsed. With this feature set, the tag balancer
+ will not attempt to insert missing body elements around content
+ and markup. However, proper parents for elements contained within
+ the <body> element will still be inserted. This feature should
+ <strong>not</strong> be used when using the <code>DOMParser</code>
+ class. In order to parse a DOM <code>DocumentFragment</code>, use the
+ <code>DOMFragmentParser</code> class.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='cdata-sections'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/cdata-sections</span>
+ <br>
+ Specifies whether CDATA sections are reported as character content.
+ If set to <code>false</code>, CDATA sections are reported as comments.
+ When reported as comments, the comment text is prefixed with "[CDATA["
+ and ends with "]]". This prefix and suffix are <em>not</em>
+ included when reported as character content.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='notify-char-refs'></a>
+ <span class='id'>http://apache.org/xml/features/scanner/notify-char-refs</span>
+ <br>
+ Specifies whether character entity references (e.g. &#32;, &#x20;,
+ etc) should be reported to the registered document handler. The name of
+ the entity reported will contain the leading pound sign and optional 'x'
+ character. For example, the name of the character entity reference
+ <code>&#x20;</code> will be reported as "#x20".
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='notify-builtin-xml-refs'></a>
+ <span class='id'>http://apache.org/xml/features/scanner/notify-builtin-refs</span>
+ <br>
+ Specifies whether the XML built-in entity references (e.g. &amp;,
+ &lt;, etc) should be reported to the registered document handler.
+ This only applies to the five pre-defined XML general entities --
+ specifically, "amp", "lt", "gt", "quot", and "apos". This is done for
+ compatibility with the Xerces feature.
+ To be notified of the built-in entity references in HTML, set the
+ <code>http://cyberneko.org/html/features/scanner/notify-builtin-refs</code>
+ feature to <code>true</code>.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='notify-builtin-html-refs'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/notify-builtin-refs</span>
+ <br>
+ Specifies whether the HTML built-in entity references (e.g. &nobr;,
+ &copy;, etc) should be reported to the registered document
+ handler. This <em>includes</em> the five pre-defined XML general
+ entities.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='fix-mswindows-refs'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/fix-mswindows-refs</span>
+ <br>
+ Specifies whether to fix character entity references for Microsoft
+ Windows® characters as described at
+ <a href='http://www.cs.tut.fi/~jkorpela/www/windows-chars.html'>http://www.cs.tut.fi/~jkorpela/www/windows-chars.html</a>.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='ignore-specified-charset'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/ignore-specified-charset</span>
+ <br>
+ Specifies whether to ignore the character encoding specified within the
+ <meta http-equiv='Content-Type' content='text/html;charset=...'>
+ tag. By default, NekoHTML checks this tag for a charset and changes the
+ character encoding of the scanning reader object. Setting this feature
+ to <code>true</code> allows the application to override this behavior.
+ <p class='see'>
+ See: <a href='#default-encoding'>http://cyberneko.org/html/properties/default-encoding</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='script-strip-comment-delims'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/script/strip-comment-delims</span>
+ <br>
+ Specifies whether the scanner should strip HTML comment delimiters
+ (i.e. "<!--" and "-->") from <script> element content.
+ <p class='see'>
+ See: <a href='#style-strip-comment-delims'>http://cyberneko.org/html/features/scanner/style/strip-comment-delims</a>
+ <br>
+ See: <a href='#script-strip-cdata-delims'>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='script-strip-cdata-delims'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims</span>
+ <br>
+ Specifies whether the scanner should strip XHTML CDATA delimiters
+ (i.e. "<![CDATA[" and "]]>") from <script> element content.
+ <p class='see'>
+ See: <a href='#style-strip-cdata-delims'>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims</a>
+ <br>
+ See: <a href='#script-strip-comment-delims'>http://cyberneko.org/html/features/scanner/script/strip-comment-delims</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='style-strip-comment-delims'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/style/strip-comment-delims</span>
+ <br>
+ Specifies whether the scanner should strip HTML comment delimiters
+ (i.e. "<!--" and "-->") from <style> element content.
+ <p class='see'>
+ See: <a href='#script-strip-comment-delims'>http://cyberneko.org/html/features/scanner/script/strip-comment-delims</a>
+ <br>
+ See: <a href='#style-strip-cdata-delims'>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='style-strip-cdata-delims'></a>
+ <span class='id'>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims</span>
+ <br>
+ Specifies whether the scanner should strip XHTML CDATA delimiters
+ (i.e. "<![CDATA[" and "]]>") from <style> element content.
+ <p class='see'>
+ See: <a href='#script-strip-cdata-delims'>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims</a>
+ <br>
+ See: <a href='#style-strip-comment-delims'>http://cyberneko.org/html/features/scanner/style/strip-comment-delims</a>
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='augmentations'></a>
+ <span class='id'>http://cyberneko.org/html/features/augmentations</span>
+ <br>
+ Specifies whether infoset items that correspond to the
+ HTML events are included in the parsing pipeline. If
+ included, the augmented item will implement the
+ <code>HTMLEventInfo</code> interface found in the
+ <code>org.cyberneko.html</code> package. The augmentations
+ can be queried in XNI by calling the <code>getItem</code>
+ method with the key
+ "http://cyberneko.org/html/features/augmentations".
+ Currently, the HTML event info augmentation can report event
+ character boundaries and whether the event is synthesized.
+ <td align='center'>false
+ <tr>
+ <td>
+ <a name='report-errors'></a>
+ <span class='id'>http://cyberneko.org/html/features/report-errors</span>
+ <br>
+ Specifies whether errors should be reported to the registered error
+ handler. Since HTML applications are supposed to permit the
+ liberal use (and abuse) of HTML documents, errors should
+ normally be handled silently. However, if the application wants
+ to know about errors in the parsed HTML document, this feature
+ can be set to <code>true</code>.
+ <td align='center'>false
+</table>
+
+<h2>Properties</h2>
+The NekoHTML parser supports the following properties:
+<table cellpadding='4' cellspacing='0'>
+ <tr><th>Property Id / Description<th>Values<th>Default
+ <tr>
+ <td>
+ <a name='filters'></a>
+ <span class='id'>http://cyberneko.org/html/properties/filters</span>
+ <br>
+ This property allows applications to append custom document
+ processing components to the end of the default NekoHTML parser
+ pipeline. The value of this property must be an array of type
+ <code>org.apache.xerces.xni.parser.XMLDocumentFilter</code>
+ and no value of this array is allowed to be null. The document
+ filters are appended to the parser pipeline in array order.
+ Please refer to the <a href='filters.html'>filters</a>
+ documentation for more information.
+ <td align='center'><nobr>XMLDocumentFilter[]</nobr>
+ <td align='center'>null
+ <tr>
+ <td>
+ <a name='default-encoding'></a>
+ <span class='id'>http://cyberneko.org/html/properties/default-encoding</span>
+ <br>
+ Sets the default encoding the NekoHTML scanner should use
+ when parsing documents. In the absence of an
+ <code>http-equiv</code> directive in the source document,
+ this setting is important because the parser does not
+ have any support to <i>auto-detect</i> the encoding.
+ <p class='see'>
+ See: <a href='#ignore-specified-charset'>http://cyberneko.org/html/features/scanner/ignore-specified-charset</a>
+ <td align='center'>
+ <a href='http://www.iana.org/assignments/character-sets'>IANA</a>
+ encoding names
+ <td align='center'><nobr>"Windows-1252"</nobr>
+ <tr>
+ <td>
+ <a name='elem-names'></a>
+ <span class='id'>http://cyberneko.org/html/properties/names/elems</span>
+ <br>
+ Specifies how the NekoHTML components should modify recognized
+ element names. Names can be converted to upper-case, converted
+ to lower-case, or left as-is. The value of "match" specifies
+ that element names are to be left as-is but the end tag name will
+ be modified to match the start tag name. This is required to
+ ensure that the parser generates a well-formed XML document.
+ <td align='center'>"upper"<br>"lower"<br>"match"
+ <td align='center'>"upper"
+ <tr>
+ <td>
+ <a name='attr-names'></a>
+ <span class='id'>http://cyberneko.org/html/properties/names/attrs</span>
+ <br>
+ Specifies how the NekoHTML components should modify attribute names
+ of recognized elements. Names can be converted to upper-case,
+ converted to lower-case, or left as-is.
+ <td align='center'>"upper"<br>"lower"<br><nobr>"no-change"</nobr>
+ <td align='center'>"lower"
+ <tr>
+ <td>
+ <a name='doctype-pubid'></a>
+ <span class='id'>http://cyberneko.org/html/properties/doctype/pubid</span>
+ <br>
+ Specifies the document type declaration public identifier if the
+ <code>http://cyberneko.org/html/features/override-doctype</code>
+ feature is set to <code>true</code>. The default value is the HTML
+ 4.01 transitional public identifier, "-//W3C//DTD HTML 4.01 Transitional//EN".
+ <p class='see'>
+ See: <a href='#override-doctype'>http://cyberneko.org/html/features/override-doctype</a>
+ <td align='center'>String
+ <td align='center'>HTML 4.01 transitional public identifier
+ <tr>
+ <td>
+ <a name='doctype-sysid'></a>
+ <span class='id'>http://cyberneko.org/html/properties/doctype/sysid</span>
+ <br>
+ Specifies the document type declaration system identifier if the
+ <code>http://cyberneko.org/html/features/override-doctype</code>
+ feature is set to <code>true</code>. The default value is the HTML
+ 4.01 transitional system identifier, "http://www.w3.org/TR/html4/loose.dtd".
+ <p class='see'>
+ See: <a href='#override-doctype'>http://cyberneko.org/html/features/override-doctype</a>
+ <td align='center'>String
+ <td align='center'>HTML 4.01 transitional system identifier
+ <tr>
+ <td>
+ <a name='namespaces-uri'></a>
+ <span class='id'>http://cyberneko.org/html/properties/namespaces-uri</span>
+ <br>
+ Specifies the namespace binding URI if the
+ <code>http://cyberneko.org/html/features/override-namespaces</code>
+ feature is set to <code>true</code>. The default value is the XHTML
+ 1.0 namespace, "http://www.w3.org/1999/xhtml". This property does
+ <em>not</em> affect the case of element and attribute names and
+ does <em>not</em> ensure that the output of the NekoHTML parser is
+ valid according to the XHTML specification.
+ <p class='see'>
+ See: <a href='#override-namespaces'>http://cyberneko.org/html/features/override-namespaces</a>
+ <td align='center'>String
+ <td align='center'>XHTML 1.0 namespace URI
+</table>
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
Added: branches/nekohtml/upstream/0.9.5/doc/html/software.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/software.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/software.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,209 @@
+<title>NekoHTML | Software</title>
+<meta http-equiv='content-type' content='text/html;charset=utf-8'>
+<link rel=stylesheet type=text/css href=../style.css>
+<style type='text/css'>
+.desc { font-style: italic }
+.note { font-style: italic }
+</style>
+
+<h1>Software Projects</h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+<a href='index.html'>Top</a>
+|
+<a href='usage.html'>Usage</a>
+|
+<a href='settings.html'>Settings</a>
+|
+<a href='filters.html'>Filters</a>
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+<a href='faq.html'>FAQ</a>
+|
+Software
+|
+<a href='changes.html'>Changes</a>
+]
+</div>
+
+<h2>Available Projects</h2>
+<p>
+The world is full of HTML pages and there's a good chance that, at some
+time, your software project will need to read and process these documents.
+Perhaps NekoHTML can help! And, depending on what you need to do, there
+may already be a solution to your problem. This page lists a number of
+projects that use the NekoHTML parser.
+<p>
+Here are some software projects you may find useful, presented in
+alphabetical order:
+<table cellpadding='4' cellspacing='0'>
+<tr>
+<th rowspan='2'>Celware WebRecorder
+<td class='desc'>
+Celware WebRecorder rapidly and easily exposes information on existing
+Web sites to other IT systems via a programmatic service, using the same
+interface as a standard Web browser. If you can see the information on
+the Web, Celware WebRecorder can learn by example and create a service
+to include that information in your IT environment—without making any
+changes to the Web server!
+<tr>
+<td>
+<a href='http://www.celcorp.com/webrecorder.html'>http://www.celcorp.com/webrecorder.html</a>
+<tr>
+<th rowspan='2'>CVSGrab
+<td class='desc'>
+CVSGrab allows you to checkout files from a public CVS repository even
+if you are behind a corporate firewall that blocks access to the pserver.
+It provides read-only access to public CVS repositories through the
+ViewCVS web interface.
+<tr>
+<td>
+<a href='http://cvsgrab.sourceforge.net/'>http://cvsgrab.sourceforge.net/</a>
+<tr>
+<th rowspan='2'>Egothor
+<td class='desc'>
+Egothor is an open source, high-performance, full-featured text search
+engine written entirely in Java. It can be configured as a standalone
+engine, metasearcher, peer-to-peer HUB, and, moreover, it can be used
+as a library for an application that needs full-text search.
+<!--
+<br><br>
+<font size='-1'>Note: NekoHTML is used in Egothor starting with RC-1.2.4
+which is available from 16 Oct 2003.</font>
+-->
+<tr>
+<td>
+<a href='http://www.egothor.org/'>http://www.egothor.org/</a>
+<tr>
+<th rowspan='2'>HtmlUnit
+<td class='desc'>
+HtmlUnit is a java unit testing framework for testing web based applications.
+HtmlUnit models the returned document so that you deal with pages and forms
+and tables.
+<tr>
+<td>
+<a href='http://htmlunit.sourceforge.net/'>http://htmlunit.sourceforge.net/</a>
+<tr>
+<th rowspan='2'>HttpUnit
+<td class='desc'>
+HttpUnit is a free, open source Java API for accessing web sites
+without a browser, and is ideally suited for automated unit testing of
+web sites when combined with a Java unit test framework such as JUnit.
+<tr>
+<td>
+<a href='http://httpunit.sourceforge.net/'>http://httpunit.sourceforge.net/</a>
+<tr>
+<th rowspan='2'>Jakarta Jelly
+<td class='desc'>
+Jelly is a Java and XML based scripting and processing engine for turning
+XML into executable code. Jelly can be used as a more flexible and powerful
+front end to Ant such as in the Maven project, as a testing framework such
+as JellyUnit, in an integration or workflow system such as werkflow or as
+a page templating system inside engines like Cocoon.
+<tr>
+<td>
+<a href='http://jakarta.apache.org/commons/sandbox/jelly/'>http://jakarta.apache.org/commons/sandbox/jelly/</a>
+<tr>
+<!-- JPluck removed NekoHTML as of version 0.9. Oh well...
+<th rowspan='2'>JPluck
+<td class='desc'>
+JPluck is a Java-based toolkit for creating Plucker documents. JPluck
+is the perfect companion to the Plucker Viewer for the Palm OS platform.
+Together they provide a free solution for offline reading of web sites
+on Palm handhelds.
+<tr>
+<td>
+<a href='http://jpluck.sourceforge.net/'>http://jpluck.sourceforge.net/</a>
+-->
+<tr>
+<th rowspan='2'>Jivan</th>
+<td class='desc'>
+Jivan is a new Java web presentation technology that aids in the
+programming of dynamic web pages by separating program code from
+presentation layout. It enables you to use the W3C DOM API to
+push content into your HTML template. By directly copying
+unchanged sections of the source documents during page
+serialization, Jivan provides great performance to the web
+developer.
+<tr>
+<td>
+<a href='http://www.jivan.net/'>http://www.jivan.net/</a>
+<tr>
+<th rowspan='2'>jWebUnit
+<td class='desc'>
+jWebUnit is a Java framework that facilitates creation of acceptance tests
+for web applications. It provides a high-level API for navigating a
+web application combined with a set of assertions to verify the application's
+correctness. This includes navigation via links, form entry and submission,
+validation of table contents, and other typical business web application
+features.
+<tr>
+<td>
+<a href='http://jwebunit.sourceforge.net/'>http://jwebunit.sourceforge.net/</a>
+<tr>
+<th rowspan='2'>LingPipe
+<td class='desc'>
+LingPipe is a suite of Java tools designed to perform linguistic analysis on
+natural language data. While fast and robust enough to be used in a commercial
+system, LingPipe's flexibility and included source make it appropriate for
+research use. Tools include a statistical named-entity detector, a heuristic
+sentence boundary detector, and a heuristic within-document coreference
+resolution engine. Named entity extraction models are included for English
+news and English genomics domains, and can be trained for other languages
+and genres.
+<tr>
+<td>
+<a href='http://www.alias-i.com/lingpipe/'>http://www.alias-i.com/lingpipe/</a>
+<tr>
+<th rowspan='2'>Mockrunner
+<td class='desc'>
+Mockrunner is a lightweight framework for unit testing applications
+in the J2EE environment. It supports Struts actions and forms,
+servlets, filters and tag classes. Furthermore it includes a JDBC and
+a JMS test framework. The JDBC test framework can be used standalone
+or in conjunction with MockEJB to test EJB based applications.
+<tr>
+<td>
+<a href='http://mockrunner.sourceforge.net/'>http://mockrunner.sourceforge.net/</a>
+<tr>
+<th rowspan='2'>Pasta
+<td>
+Pastaã¯ãã¦ã§ãéçºã®ããã®ãã¬ã¼ã ã¯ã¼ã¯ã§ãã ãã®ãã¬ã¼ã ã¯ã¼ã¯ã¯ä¸»ã«ã
+ç¬èªã®ãã³ãã¬ã¼ãè¨èªãCLSãã¨ãããã¯ã°ã©ã¦ã³ã (ãªã¬ã¼ã·ã§ãã«ãã¼ã¿ãã¼ã¹ãªã©)
+ã¨ãã³ãã¬ã¼ãè¨èªãæ¥ç¶ããããã®ãã¡ãã»ã¼ã¸ãã¥ã¼ãããæ§æããã¦ãã¾ãã
+...
+<br>
+<strong>Note:</strong>
+<span class='desc'>The documentation for Pasta is only available in
+Japanese at this time.</span>
+<tr>
+<td>
+<a href='http://www.port4.info/pasta/'>http://www.port4.info/pasta/</a>
+<tr>
+<th rowspan='2'>X-Smiles
+<td class='desc'>
+X-Smiles is a Java based XML browser. It is intended for both desktop
+use and embedded network devices and to support multimedia services.
+<tr>
+<td>
+<a href='http://www.x-smiles.org/'>http://www.x-smiles.org/</a>
+</table>
+<p class='note'>
+<strong>Note:</strong> The author of NekoHTML does not officially endorse,
+recommend, or support any of the above software — they are merely
+presented for the benefit of the user. All questions and comments should
+be directed to the respective project owners. Mail sent to the author of
+NekoHTML regarding these projects will be ignored.
+
+<h2>Adding Your Project</h2>
+<p>
+If your project incorporates NekoHTML in some way and you think
+it would be useful to other users, please
+<a href='mailto:andyc at apache.org'>contact me</a> and I will add
+it to this list.
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/doc/html/usage.html
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/html/usage.html 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/html/usage.html 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,212 @@
+<title>NekoHTML | Usage Instructions</title>
+<link rel=stylesheet type=text/css href=../style.css>
+<style type='text/css'>
+.note {
+ margin-left: 2em; margin-right: 2em;
+ padding: .25em;
+ border: 1px solid black;
+ background-color: #fdd;
+}
+</style>
+
+<h1>Usage Instructions</h1>
+<div class='navbar'>
+[<a href='../index.html'>Home</a>]
+[
+<a href='index.html'>Top</a>
+|
+Usage
+|
+<a href='settings.html'>Settings</a>
+|
+<a href='filters.html'>Filters</a>
+|
+<a href='javadoc/index.html'>JavaDoc</a>
+|
+<a href='faq.html'>FAQ</a>
+|
+<a href='software.html'>Software</a>
+|
+<a href='changes.html'>Changes</a>
+]
+</div>
+
+<a name='transparent'></a>
+<h2>Transparent Parser Construction</h2>
+<p>
+NekoHTML is designed to be as lightweight and simple to use as
+possible. Using the Xerces 2.0.0 parser as a foundation, NekoHTML
+can be transparent for applications that instantiate parser objects
+with the <a href='http://java.sun.com/xml/jaxp/index.html'>Java
+API for XML Processing</a> (JAXP). Just put the appropriate NekoHTML
+jar files in the classpath <em>before</em> the Xerces jar files. For
+example (on Windows): [<strong>Note:</strong> The classpath should be
+contiguous. It is split among separate lines in this example to make
+it easier to read.]
+<pre class='cmdline'>
+<span class='cmdline-prompt'>></span> <span class='cmdline-cmd'>java -cp nekohtml.jar;nekohtmlXni.jar;
+ xmlParserAPIs.jar;xercesImpl.jar;xercesSamples.jar
+ sax.Counter doc/html/index.html</span>
+doc/html/index.html: 10 ms (49 elems, 21 attrs, 0 spaces, 2652 chars)
+</pre>
+<p>
+The Xerces2 implementation dynamically instantiates the default
+parser configuration to construct parser objects via the Jar
+service facility. The Jar file <code>nekohtmlXni.jar</code>
+contains a <code>META-INF/services</code> file that is read by
+the Xerces2 implementation for this purpose. Therefore, as long as
+this Jar file appears <em>before</em> the Xerces2 Jar files,
+the NekoHTML parser configuration will be used instead of the
+Xerces2 standard configuration.
+<p>
+Using this method will cause <em>every</em> Xerces2 parser
+constructed (using standard APIs) in the same JVM to use the
+HTML parser configuration. If this is not what you want to do,
+you should create the NekoHTML parser explicitly even though
+you parse and access the document contents using standard XML
+APIs. The following sections describe this method in more
+detail.
+<p class='note'>
+<strong>Note:</strong>
+The nekohtmlXni.jar file is no longer built by default. This
+change was made to alleviate confusion about which Jar files
+to add to the JVM classpath. If you still want to use this
+Jar file, you must build it using the "jar-xni" Ant task.
+</p>
+
+<a name='convenience'></a>
+<h2>Convenience Parser Classes</h2>
+<p>
+If you don't want to override the default Xerces2 parser
+instantiation mechanism, separate DOM and SAX parser classes are
+included in the <code>org.cyberneko.html.parsers</code> package
+for convenience. Both parsers use the <code>HTMLConfiguration</code>
+class to be able to parse HTML documents. In addition, the DOM
+parser uses the Xerces HTML DOM implementation so that the
+returned documents are of type
+<code>org.w3c.dom.html.HTMLDocument</code>. The following example
+shows how to use the NekoHTML <code>DOMParser</code> directly:
+<pre class='code'>
+<span class='code-keyword'>package</span> sample<span class='code-punct'>;</span>
+
+<span class='code-keyword'>import</span> org.cyberneko.html.parsers.DOMParser<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.w3c.dom.Document<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.w3c.dom.Node<span class='code-punct'>;</span>
+
+<span class='code-keyword'>public class</span> TestHTMLDOM <span class='code-punct'>{</span>
+ <span class='code-keyword'>public static void</span> <span class='code-func'>main</span><span class='code-punct'>(</span>String<span class='code-punct'>[]</span> argv<span class='code-punct'>)</span> <span class='code-keyword'>throws</span> Exception <span class='code-punct'>{</span>
+ DOMParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DOMParser<span class='code-punct'>();</span>
+ <span class='code-keyword'>for</span> <span class='code-punct'>(</span><span class='code-keyword'>int</span> i <span class='code-punct'>=</span> 0<span class='code-punct'>;</span> i <span class='code-punct'><</span> argv<span class='code-punct'>.</span>length<span class='code-punct'>;</span> i<span class='code-punct'>++) {</span>
+ parser<span class='code-punct'>.</span><span class='code-func'>parse</span><span class='code-punct'>(</span>argv<span class='code-punct'>[</span>i<span class='code-punct'>]);</span>
+ <span class='code-func'>print</span><span class='code-punct'>(</span>parser<span class='code-punct'>.</span><span class='code-func'>getDocument</span><span class='code-punct'>(),</span> <span class='code-string'>""</span><span class='code-punct'>);</span>
+ <span class='code-punct'>}</span>
+ <span class='code-punct'>}</span>
+ <span class='code-keyword'>public static void</span> <span class='code-func'>print</span><span class='code-punct'>(</span>Node node<span class='code-punct'>,</span> String indent<span class='code-punct'>) {</span>
+ System<span class='code-punct'>.</span>out<span class='code-punct'>.</span><span class='code-func'>println</span><span class='code-punct'>(</span>indent<span class='code-punct'>+</span>node<span class='code-punct'>.</span><span class='code-func'>getClass</span><span class='code-punct'>().</span><span class='code-func'>getName</span><span class='code-punct'>());</span>
+ Node child <span class='code-punct'>=</span> node<span class='code-punct'>.</span><span class='code-func'>getFirstChild</span><span class='code-punct'>();</span>
+ <span class='code-keyword'>while</span> <span class='code-punct'>(</span>child <span class='code-punct'>!=</span> <span class='code-keyword'>null</span><span class='code-punct'>) {</span>
+ print<span class='code-punct'>(</span>child<span class='code-punct'>,</span> indent<span class='code-punct'>+</span><span class='code-string'>" "</span><span class='code-punct'>);</span>
+ child <span class='code-punct'>=</span> child<span class='code-punct'>.</span><span class='code-func'>getNextSibling</span><span class='code-punct'>();</span>
+ <span class='code-punct'>}
+ }</span>
+<span class='code-punct'>}</span>
+</pre>
+<p>
+Running this program produces the following output:
+[<strong>Note:</strong> The classpath should be
+contiguous. It is split among separate lines in this example to make
+it easier to read.]
+<pre class='cmdline'>
+<span class='cmdline-prompt'>></span> <span class='cmdline-cmd'>java -cp nekohtml.jar;nekohtmlSamples.jar;
+ xmlParserAPIs.jar;xercesImpl.jar
+ sample.TestHTMLDOM data/html/test01.html</span>
+org.apache.html.dom.HTMLDocumentImpl
+ org.apache.html.dom.HTMLHtmlElementImpl
+ org.apache.html.dom.HTMLBodyElementImpl
+ org.apache.xerces.dom.TextImpl
+</pre>
+<p>
+This source code is included in the <code>src/html/sample/</code> directory.
+<p>
+In addition to the provided DOM and SAX parser classes, NekoHTML
+also provides a DOM fragment parser class. The <code>DOMFragmentParser</code>
+class, found in the <code>org.cyberneko.html.parsers</code>
+package, can be used to parse fragments of HTML documents
+into their corresponding DOM nodes. The following example shows
+how to use the NekoHTML <code>DOMFragmentParser</code> directly:
+<pre class='code'>
+<span class='code-keyword'>package</span> sample<span class='code-punct'>;</span>
+
+<span class='code-keyword'>import</span> org.cyberneko.html.parsers.DOMFragmentParser<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.apache.html.dom.HTMLDocumentImpl<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.w3c.dom.Document<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.w3c.dom.DocumentFragment<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.w3c.dom.Node<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.w3c.dom.html.HTMLDocument<span class='code-punct'>;</span>
+
+<span class='code-keyword'>public class</span> TestHTMLDOMFragment <span class='code-punct'>{</span>
+ <span class='code-keyword'>public static void</span> <span class='code-func'>main</span><span class='code-punct'>(</span>String<span class='code-punct'>[]</span> argv<span class='code-punct'>)</span> <span class='code-keyword'>throws</span> Exception <span class='code-punct'>{</span>
+ DOMFragmentParser parser <span class='code-punct'>=</span> <span class='code-keyword'>new</span> DOMFragmentParser<span class='code-punct'>();</span>
+ HTMLDocument document <span class='code-punct'>=</span> <span class='code-keyword'>new</span> HTMLDocumentImpl<span class='code-punct'>();</span>
+ <span class='code-keyword'>for</span> <span class='code-punct'>(</span><span class='code-keyword'>int</span> i <span class='code-punct'>=</span> 0<span class='code-punct'>;</span> i <span class='code-punct'><</span> argv<span class='code-punct'>.</span>length<span class='code-punct'>;</span> i<span class='code-punct'>++) {</span>
+ DocumentFragment fragment <span class='code-punct'>=</span> document<span class='code-punct'>.</span><span class='code-func'>createDocumentFragment</span><span class='code-punct'>();</span>
+ parser<span class='code-punct'>.</span><span class='code-func'>parse</span><span class='code-punct'>(</span>argv<span class='code-punct'>[</span>i<span class='code-punct'>],</span> fragment<span class='code-punct'>);</span>
+ <span class='code-func'>print</span><span class='code-punct'>(</span>fragment<span class='code-punct'>,</span> <span class='code-string'>""</span><span class='code-punct'>);</span>
+ <span class='code-punct'>}</span>
+ <span class='code-punct'>}</span>
+ <span class='code-keyword'>public static void</span> <span class='code-func'>print</span><span class='code-punct'>(</span>Node node<span class='code-punct'>,</span> String indent<span class='code-punct'>) {</span>
+ System<span class='code-punct'>.</span>out<span class='code-punct'>.</span><span class='code-func'>println</span><span class='code-punct'>(</span>indent<span class='code-punct'>+</span>node<span class='code-punct'>.</span><span class='code-func'>getClass</span><span class='code-punct'>().</span><span class='code-func'>getName</span><span class='code-punct'>());</span>
+ Node child <span class='code-punct'>=</span> node<span class='code-punct'>.</span><span class='code-func'>getFirstChild</span><span class='code-punct'>();</span>
+ <span class='code-keyword'>while</span> <span class='code-punct'>(</span>child <span class='code-punct'>!=</span> <span class='code-keyword'>null</span><span class='code-punct'>) {</span>
+ <span class='code-func'>print</span><span class='code-punct'>(</span>child<span class='code-punct'>,</span> indent<span class='code-punct'>+</span><span class='code-string'>" "</span><span class='code-punct'>);</span>
+ child <span class='code-punct'>=</span> child<span class='code-punct'>.</span><span class='code-func'>getNextSibling</span><span class='code-punct'>();</span>
+ <span class='code-punct'>}
+ }</span>
+<span class='code-punct'>}</span>
+</pre>
+<p>
+This source code is included in the <code>src/html/sample/</code>
+directory.
+<p>
+Notice that the application parses a document fragment a little
+bit differently than parsing a complete document. Instead of
+initiating a parse by passing in a system identifier (or an
+input source), parsing an HTML document fragment requires the
+application to pass a DOM <code>DocumentFragment</code> object
+to the <code>parse</code> method. The DOM fragment parser will
+use the owner document of the <code>DocumentFragment</code> as
+the factory for parsed nodes. These nodes are then appended in
+document order to the document fragment object.
+<p>
+<strong>Note:</strong>
+In order for HTML DOM objects to be created, the document fragment
+object passed to the <code>parse</code> method should be created from
+a DOM document object of type <code>org.w3c.dom.html.HTMLDocument</code>.
+
+<a name='custom'></a>
+<h2>Custom Parser Classes</h2>
+<p>
+Alternatively, you can construct any XNI-based parser class
+using the <code>HTMLConfiguration</code> parser configuration class
+found in the <code>org.cyberneko.html</code> package. The following
+example shows how to extend the abstract SAX parser provided with
+the Xerces2 implementation by passing the NekoHTML parser
+configuration to the base class in the constructor.
+<pre class='code'>
+<span class='code-keyword'>package</span> sample<span class='code-punct'>;</span>
+
+<span class='code-keyword'>import</span> org.apache.xerces.parsers.AbstractSAXParser<span class='code-punct'>;</span>
+<span class='code-keyword'>import</span> org.cyberneko.html.HTMLConfiguration<span class='code-punct'>;</span>
+
+<span class='code-keyword'>public class</span> HTMLSAXParser <span class='code-keyword'>extends</span> AbstractSAXParser <span class='code-punct'>{</span>
+ <span class='code-keyword'>public</span> HTMLSAXParser<span class='code-punct'>() {</span>
+ <span class='code-keyword'>super</span><span class='code-punct'>(</span><span class='code-keyword'>new</span> HTMLConfiguration<span class='code-punct'>());</span>
+ <span class='code-punct'>}</span>
+<span class='code-punct'>}</span>
+</pre>
+<p>
+This source code is included in the <code>src/html/sample/</code> directory.
+
+<div class='copyright'>
+(C) Copyright 2002-2005, Andy Clark. All rights reserved.
+</div>
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/doc/style.css
===================================================================
--- branches/nekohtml/upstream/0.9.5/doc/style.css 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/doc/style.css 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,74 @@
+BODY { background: #EEEEEE;
+ margin: 0; padding: 10; }
+H1 { text-align: center; background: #DDDDFF;
+ border-style: solid; border-color: black; border-width: 0;
+ border-top-width: 1; border-bottom-width: 1;
+ margin: -10; padding: 5 }
+H2 H3 H4 H5 H6 { }
+P { margin: 15; text-align: justify }
+UL { margin-right: 15 }
+OL { margin-right: 15 }
+TD DL { margin: 0 }
+DL { margin: 15 }
+DT { font-weight: bold }
+DD { font-style: italic }
+OL UL { margin: 15 }
+HR { background: black; color: black; height: 1;
+ border: none;
+ margin: 0; padding: 0 }
+
+TABLE { border-style: solid; border-color: black; border-width: 2;
+ border-right-width: 1; border-bottom-width: 1;
+ margin: 15; margin-left: 30; margin-right: 30 }
+TR { background: white; vertical-align: top }
+TH { border-style: solid; border-color: black; border-width: 0;
+ border-bottom-width: 1; border-right-width: 1;
+ background: #DDDDFF }
+TD { border-style: solid; border-color: black; border-width: 0;
+ border-right-width: 1; border-bottom-width: 1 }
+
+PRE.cmdline { background: black; color: silver;
+ border-style: solid; border-color: white; border-width: 1;
+ margin: 15; margin-left: 30; margin-right: 30; padding: 5 }
+.cmdline-prompt { color: magenta }
+.cmdline-cmd { color: yellow }
+
+PRE.code { background: darkblue; color: #00FF00;
+ border-style: solid; border-color: black; border-width: 1;
+ margin: 15; margin-left: 30; margin-right: 30; padding: 5 }
+.code-keyword { color: yellow }
+.code-punct { color: white }
+.code-number { color: silver }
+.code-string { color: cyan }
+.code-comment { color: silver }
+.code-func { font-weight: bold }
+
+PRE.document,PRE.xml { background: white; color: black;
+ border-style: solid; border-color: black; border-width: 1;
+ margin: 15; margin-left: 30; margin-right: 30;
+ padding: 5 }
+.xml-markup { color: blue }
+.xml-comment { color: silver }
+.xml-ename { color: maroon }
+.xml-aname { color: maroon }
+.xml-avalue { font-weight: bold }
+.xml-aname-xmlns { color: red }
+.xml-avalue-xmlns { color: red; font-weight: bold }
+.xml-chars { font-weight: bold }
+
+.id { font-family: sans-serif;
+ text-decoration: underline;
+ white-space: nowrap }
+
+.navbar { margin: 10; text-align: center }
+DIV.navbar A { text-decoration: none }
+
+.new { color: red; font-weight: bold }
+.updated { color: orange; font-weight: bold }
+
+.copyright { text-align: center;
+ background: #DDDDFF;
+ border-style: solid; border-color: black; border-width: 0;
+ border-top-width: 1; border-bottom-width: 1;
+ margin: -10; margin-top: 10 }
+
Added: branches/nekohtml/upstream/0.9.5/src/html/META-INF/services/org.apache.xerces.xni.parser.XMLParserConfiguration
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/META-INF/services/org.apache.xerces.xni.parser.XMLParserConfiguration 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/META-INF/services/org.apache.xerces.xni.parser.XMLParserConfiguration 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1 @@
+org.cyberneko.html.HTMLConfiguration
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLAugmentations.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLAugmentations.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLAugmentations.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,108 @@
+/*
+ * (C) Copyright 2004-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+import org.apache.xerces.xni.Augmentations;
+
+import java.util.Enumeration;
+import java.util.Hashtable;
+
+/**
+ * This class is here to overcome the XNI changes to the
+ * <code>Augmentations</code> interface. In early versions of XNI, the
+ * augmentations interface contained a <code>clear()</code> method to
+ * remove all of the items from the augmentations instance. A later
+ * version of XNI changed this method to <code>removeAllItems()</code>.
+ * Therefore, this class implements the augmentations interface and
+ * explicitly provides both of these methods.
+ * <p>
+ * <strong>Note:</strong>
+ * This code is inspired by performance enhancements submitted by
+ * Marc-André Morissette.
+ *
+ * @author Andy Clark
+ */
+public class HTMLAugmentations
+ implements Augmentations {
+
+ //
+ // Data
+ //
+
+ /** Augmentation items, keyed by their identifier. */
+ protected Hashtable fItems = new Hashtable();
+
+ //
+ // Public methods
+ //
+
+ // since Xerces 2.3.0
+
+ /** Removes all of the elements in this augmentations object. */
+ public void removeAllItems() {
+ fItems.clear();
+ } // removeAllItems()
+
+ // from Xerces 2.0.0 (beta4) until 2.3.0
+
+ /** Removes all of the elements in this augmentations object (older name of removeAllItems). */
+ public void clear() {
+ fItems.clear();
+ } // clear()
+
+ //
+ // Augmentations methods
+ //
+
+ /**
+ * Add additional information identified by a key to the Augmentations
+ * structure.
+ *
+ * @param key Identifier, can't be <code>null</code>
+ * @param item Additional information, can't be <code>null</code>
+ *
+ * @return The previous value of the specified key in the Augmentations
+ * structure, or <code>null</code> if it did not have one.
+ */
+ public Object putItem(String key, Object item) {
+ return fItems.put(key, item);
+ } // putItem(String, Object):Object
+
+
+ /**
+ * Get information identified by a key from the Augmentations structure.
+ *
+ * @param key Identifier, can't be <code>null</code>
+ *
+ * @return The value to which the key is mapped in the Augmentations
+ * structure; <code>null</code> if the key is not mapped to any
+ * value.
+ */
+ public Object getItem(String key) {
+ return fItems.get(key);
+ } // getItem(String):Object
+
+ /**
+ * Remove additional info identified by a key from the Augmentations structure.
+ *
+ * @param key Identifier, can't be <code>null</code>
+ * @return The previous value of the specified key in the Augmentations
+ * structure, or <code>null</code> if it did not have one.
+ */
+ public Object removeItem(String key) {
+ return fItems.remove(key);
+ } // removeItem(String):Object
+
+ /**
+ * Returns an enumeration of the keys in the Augmentations structure.
+ */
+ public Enumeration keys() {
+ return fItems.keys();
+ } // keys():Enumeration
+
+} // class HTMLAugmentations
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLComponent.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLComponent.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLComponent.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,42 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+import org.apache.xerces.xni.parser.XMLComponent;
+
+/**
+ * This interface extends the XNI <code>XMLComponent</code> interface
+ * to add methods that allow the preferred default values for features
+ * and properties to be queried.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLComponent.java,v 1.4 2005/02/14 03:56:54 andyc Exp $
+ */
+public interface HTMLComponent
+ extends XMLComponent {
+
+ //
+ // HTMLComponent methods
+ //
+
+ /**
+ * Returns the default state for the feature <code>featureId</code>, or <code>null</code> if this
+ * component does not want to report a default value for this
+ * feature.
+ */
+ public Boolean getFeatureDefault(String featureId);
+
+ /**
+ * Returns the default value for the property <code>propertyId</code>, or <code>null</code> if this
+ * component does not want to report a default value for this
+ * property.
+ */
+ public Object getPropertyDefault(String propertyId);
+
+} // interface HTMLComponent
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLConfiguration.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLConfiguration.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLConfiguration.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,696 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.lang.reflect.Method;
+import java.lang.reflect.InvocationTargetException;
+import java.text.MessageFormat;
+import java.util.Locale;
+import java.util.MissingResourceException;
+import java.util.Properties;
+import java.util.ResourceBundle;
+import java.util.Vector;
+
+import org.cyberneko.html.filters.NamespaceBinder;
+
+import org.apache.xerces.util.DefaultErrorHandler;
+import org.apache.xerces.util.ParserConfigurationSettings;
+import org.apache.xerces.xni.XMLDocumentHandler;
+import org.apache.xerces.xni.XMLDTDHandler;
+import org.apache.xerces.xni.XMLDTDContentModelHandler;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLDocumentSource;
+import org.apache.xerces.xni.parser.XMLEntityResolver;
+import org.apache.xerces.xni.parser.XMLErrorHandler;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParseException;
+import org.apache.xerces.xni.parser.XMLPullParserConfiguration;
+
+/**
+ * An XNI-based parser configuration that can be used to parse HTML
+ * documents. This configuration can be used directly in order to
+ * parse HTML documents or can be used in conjunction with any XNI
+ * based tools, such as the Xerces2 implementation.
+ * <p>
+ * This configuration recognizes the following features:
+ * <ul>
+ * <li>http://cyberneko.org/html/features/augmentations
+ * <li>http://cyberneko.org/html/features/report-errors
+ * <li>http://cyberneko.org/html/features/report-errors/simple
+ * <li>http://cyberneko.org/html/features/balance-tags
+ * <li><i>and</i>
+ * <li>the features supported by the scanner and tag balancer components.
+ * </ul>
+ * <p>
+ * This configuration recognizes the following properties:
+ * <ul>
+ * <li>http://cyberneko.org/html/properties/names/elems
+ * <li>http://cyberneko.org/html/properties/names/attrs
+ * <li>http://cyberneko.org/html/properties/filters
+ * <li>http://cyberneko.org/html/properties/error-reporter
+ * <li><i>and</i>
+ * <li>the properties supported by the scanner and tag balancer.
+ * </ul>
+ * <p>
+ * For complete usage information, refer to the documentation.
+ *
+ * @see HTMLScanner
+ * @see HTMLTagBalancer
+ * @see HTMLErrorReporter
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLConfiguration.java,v 1.9 2005/02/14 03:56:54 andyc Exp $
+ */
+public class HTMLConfiguration
+ extends ParserConfigurationSettings
+ implements XMLPullParserConfiguration {
+
+ //
+ // Constants
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
+
+ /** Include infoset augmentations. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Report errors. */
+ protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
+
+ /** Simple report format. */
+ protected static final String SIMPLE_ERROR_FORMAT = "http://cyberneko.org/html/features/report-errors/simple";
+
+ /** Balance tags. */
+ protected static final String BALANCE_TAGS = "http://cyberneko.org/html/features/balance-tags";
+
+ // properties
+
+ /** Modify HTML element names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
+
+ /** Modify HTML attribute names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
+
+ /** Pipeline filters. */
+ protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
+
+ /** Error reporter. */
+ protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
+
+ // other
+
+ /** Error domain. */
+ protected static final String ERROR_DOMAIN = "http://cyberneko.org/html";
+
+ // private
+
+ /** Document source class array. */
+ private static final Class[] DOCSOURCE = { XMLDocumentSource.class };
+
+ //
+ // Data
+ //
+
+ // handlers
+
+ /** Document handler. */
+ protected XMLDocumentHandler fDocumentHandler;
+
+ /** DTD handler. */
+ protected XMLDTDHandler fDTDHandler;
+
+ /** DTD content model handler. */
+ protected XMLDTDContentModelHandler fDTDContentModelHandler;
+
+ /** Error handler. */
+ protected XMLErrorHandler fErrorHandler = new DefaultErrorHandler();
+
+ // other settings
+
+ /** Entity resolver. */
+ protected XMLEntityResolver fEntityResolver;
+
+ /** Locale. */
+ protected Locale fLocale = Locale.getDefault();
+
+ // state
+
+ /**
+ * Stream opened by parser. Therefore, must close stream manually upon
+ * termination of parsing.
+ */
+ protected boolean fCloseStream;
+
+ // components
+
+ /** Components. */
+ protected Vector fHTMLComponents = new Vector(2);
+
+ // pipeline
+
+ /** Document scanner. */
+ protected HTMLScanner fDocumentScanner = new HTMLScanner();
+
+ /** HTML tag balancer. */
+ protected HTMLTagBalancer fTagBalancer = new HTMLTagBalancer();
+
+ /** Namespace binder. */
+ protected NamespaceBinder fNamespaceBinder = new NamespaceBinder();
+
+ // other components
+
+ /** Error reporter. */
+ protected HTMLErrorReporter fErrorReporter = new ErrorReporter();
+
+ // HACK: workarounds Xerces 2.0.x problems
+
+ /** Parser version is Xerces 2.0.0. */
+ protected static boolean XERCES_2_0_0 = false;
+
+ /** Parser version is Xerces 2.0.1. */
+ protected static boolean XERCES_2_0_1 = false;
+
+ /** Parser version is XML4J 4.0.x. */
+ protected static boolean XML4J_4_0_x = false;
+
+ //
+ // Static initializer
+ //
+
+ static {
+     try {
+         // read the public "fVersion" field of the Xerces Version class
+         String versionClass = "org.apache.xerces.impl.Version";
+         Object versionObject = ObjectFactory.createObject(versionClass, versionClass);
+         String name = String.valueOf(versionObject.getClass().getField("fVersion").get(versionObject));
+         XERCES_2_0_0 = "Xerces-J 2.0.0".equals(name);
+         XERCES_2_0_1 = "Xerces-J 2.0.1".equals(name);
+         XML4J_4_0_x = name.startsWith("XML4J 4.0.");
+     }
+     catch (Throwable e) {
+         // version could not be determined: the flags keep their initial values
+     }
+ } // <clinit>()
+
+ //
+ // Constructors
+ //
+
+ /** Default constructor. */
+ public HTMLConfiguration() {
+
+ // add components
+ addComponent(fDocumentScanner);
+ addComponent(fTagBalancer);
+ addComponent(fNamespaceBinder);
+
+ //
+ // features
+ //
+
+ // recognized features
+ String VALIDATION = "http://xml.org/sax/features/validation";
+ String[] recognizedFeatures = {
+ AUGMENTATIONS,
+ NAMESPACES,
+ VALIDATION,
+ REPORT_ERRORS,
+ SIMPLE_ERROR_FORMAT,
+ BALANCE_TAGS,
+ };
+ addRecognizedFeatures(recognizedFeatures);
+ setFeature(AUGMENTATIONS, false);
+ setFeature(NAMESPACES, true);
+ setFeature(VALIDATION, false);
+ setFeature(REPORT_ERRORS, false);
+ setFeature(SIMPLE_ERROR_FORMAT, false);
+ setFeature(BALANCE_TAGS, true);
+
+ // HACK: Xerces 2.0.0
+ if (XERCES_2_0_0) {
+ // NOTE: These features should not be required but it causes a
+ // problem if they're not there. This will be fixed in
+ // subsequent releases of Xerces.
+ recognizedFeatures = new String[] {
+ "http://apache.org/xml/features/scanner/notify-builtin-refs",
+ };
+ addRecognizedFeatures(recognizedFeatures);
+ }
+
+ // HACK: Xerces 2.0.1
+ if (XERCES_2_0_0 || XERCES_2_0_1 || XML4J_4_0_x) {
+ // NOTE: These features should not be required but it causes a
+ // problem if they're not there. This should be fixed in
+ // subsequent releases of Xerces.
+ recognizedFeatures = new String[] {
+ "http://apache.org/xml/features/validation/schema/normalized-value",
+ "http://apache.org/xml/features/scanner/notify-char-refs",
+ };
+ addRecognizedFeatures(recognizedFeatures);
+ }
+
+ //
+ // properties
+ //
+
+ // recognized properties
+ String[] recognizedProperties = {
+ NAMES_ELEMS,
+ NAMES_ATTRS,
+ FILTERS,
+ ERROR_REPORTER,
+ };
+ addRecognizedProperties(recognizedProperties);
+ setProperty(NAMES_ELEMS, "upper");
+ setProperty(NAMES_ATTRS, "lower");
+ setProperty(ERROR_REPORTER, fErrorReporter);
+
+ // HACK: Xerces 2.0.0
+ if (XERCES_2_0_0) {
+ // NOTE: This is a hack to get around a problem in the Xerces 2.0.0
+ // AbstractSAXParser. If it uses a parser configuration that
+ // does not have a SymbolTable, then it will remove *all*
+ // attributes. This will be fixed in subsequent releases of
+ // Xerces.
+ String SYMBOL_TABLE = "http://apache.org/xml/properties/internal/symbol-table";
+ recognizedProperties = new String[] {
+ SYMBOL_TABLE,
+ };
+ addRecognizedProperties(recognizedProperties);
+ Object symbolTable = ObjectFactory.createObject("org.apache.xerces.util.SymbolTable",
+ "org.apache.xerces.util.SymbolTable");
+ setProperty(SYMBOL_TABLE, symbolTable);
+ }
+
+ } // <init>()
+
+ //
+ // Public methods
+ //
+
+ /**
+ * Pushes an input source onto the current entity stack. This
+ * enables the scanner to transparently scan new content (e.g.
+ * the output written by an embedded script). At the end of the
+ * current entity, the scanner returns where it left off at the
+ * time this entity source was pushed.
+ * <p>
+ * <strong>Hint:</strong>
+ * To use this feature to insert the output of &lt;SCRIPT&gt;
+ * tags, remember to buffer the <em>entire</em> output of the
+ * processed instructions before pushing a new input source.
+ * Otherwise, events may appear out of sequence.
+ *
+ * @param inputSource The new input source to start scanning.
+ */
+ public void pushInputSource(XMLInputSource inputSource) {
+ fDocumentScanner.pushInputSource(inputSource);
+ } // pushInputSource(XMLInputSource)
+
+ // XMLParserConfiguration methods
+ //
+
+ /** Sets a feature on this configuration and on every registered component. */
+ public void setFeature(String featureId, boolean state)
+     throws XMLConfigurationException {
+     // the base settings class handles the feature first, then it is forwarded
+     super.setFeature(featureId, state);
+     final int componentCount = fHTMLComponents.size();
+     for (int index = 0; index < componentCount; index++) {
+         ((HTMLComponent)fHTMLComponents.elementAt(index)).setFeature(featureId, state);
+     }
+ } // setFeature(String,boolean)
+
+ /**
+  * Sets a property on this configuration and on every registered component.
+  * HTML components found among the filters are registered before forwarding.
+  */
+ public void setProperty(String propertyId, Object value)
+     throws XMLConfigurationException {
+     super.setProperty(propertyId, value);
+     if (propertyId.equals(FILTERS)) {
+         XMLDocumentFilter[] filters = (XMLDocumentFilter[])getProperty(FILTERS);
+         int filterCount = filters != null ? filters.length : 0;
+         for (int i = 0; i < filterCount; i++) {
+             // register each component only once, even if the filters are set again
+             if (filters[i] instanceof HTMLComponent && !fHTMLComponents.contains(filters[i])) {
+                 addComponent((HTMLComponent)filters[i]);
+             }
+         }
+     }
+
+     int size = fHTMLComponents.size();
+     for (int i = 0; i < size; i++) {
+         ((HTMLComponent)fHTMLComponents.elementAt(i)).setProperty(propertyId, value);
+     }
+ } // setProperty(String,Object)
+
+ /** Sets the document handler. */
+ public void setDocumentHandler(XMLDocumentHandler handler) {
+ fDocumentHandler = handler;
+ } // setDocumentHandler(XMLDocumentHandler)
+
+ /** Returns the document handler. */
+ public XMLDocumentHandler getDocumentHandler() {
+ return fDocumentHandler;
+ } // getDocumentHandler():XMLDocumentHandler
+
+ /** Sets the DTD handler. */
+ public void setDTDHandler(XMLDTDHandler handler) {
+ fDTDHandler = handler;
+ } // setDTDHandler(XMLDTDHandler)
+
+ /** Returns the DTD handler. */
+ public XMLDTDHandler getDTDHandler() {
+ return fDTDHandler;
+ } // getDTDHandler():XMLDTDHandler
+
+ /** Sets the DTD content model handler. */
+ public void setDTDContentModelHandler(XMLDTDContentModelHandler handler) {
+ fDTDContentModelHandler = handler;
+ } // setDTDContentModelHandler(XMLDTDContentModelHandler)
+
+ /** Returns the DTD content model handler. */
+ public XMLDTDContentModelHandler getDTDContentModelHandler() {
+ return fDTDContentModelHandler;
+ } // getDTDContentModelHandler():XMLDTDContentModelHandler
+
+ /** Sets the error handler. */
+ public void setErrorHandler(XMLErrorHandler handler) {
+ fErrorHandler = handler;
+ } // setErrorHandler(XMLErrorHandler)
+
+ /** Returns the error handler. */
+ public XMLErrorHandler getErrorHandler() {
+ return fErrorHandler;
+ } // getErrorHandler():XMLErrorHandler
+
+ /** Sets the entity resolver. */
+ public void setEntityResolver(XMLEntityResolver resolver) {
+ fEntityResolver = resolver;
+ } // setEntityResolver(XMLEntityResolver)
+
+ /** Returns the entity resolver. */
+ public XMLEntityResolver getEntityResolver() {
+ return fEntityResolver;
+ } // getEntityResolver():XMLEntityResolver
+
+ /**
+  * Sets the locale used when formatting error messages.
+  * A null argument selects the default locale.
+  */
+ public void setLocale(Locale locale) {
+     fLocale = (locale != null) ? locale : Locale.getDefault();
+ } // setLocale(Locale)
+
+ /** Returns the locale. */
+ public Locale getLocale() {
+ return fLocale;
+ } // getLocale():Locale
+
+ /** Parses a document. */
+ public void parse(XMLInputSource source) throws XNIException, IOException {
+ setInputSource(source);
+ parse(true);
+ } // parse(XMLInputSource)
+
+ //
+ // XMLPullParserConfiguration methods
+ //
+
+ // parsing
+
+ /**
+ * Sets the input source for the document to parse.
+ *
+ * @param inputSource The document's input source.
+ *
+ * @exception XMLConfigurationException Thrown if there is a
+ * configuration error when initializing the
+ * parser.
+ * @exception IOException Thrown on I/O error.
+ *
+ * @see #parse(boolean)
+ */
+ public void setInputSource(XMLInputSource inputSource)
+ throws XMLConfigurationException, IOException {
+ reset();
+ fCloseStream = inputSource.getByteStream() == null &&
+ inputSource.getCharacterStream() == null;
+ fDocumentScanner.setInputSource(inputSource);
+ } // setInputSource(XMLInputSource)
+
+ /**
+  * Parses the document in a pull parsing fashion. Cleanup is performed
+  * whenever parsing cannot continue: when the scanner reports the end of
+  * the document and when scanning fails with any exception or error.
+  *
+  * @param complete True if the pull parser should parse the
+  *                 remaining document completely.
+  *
+  * @return True if there is more document to parse.
+  *
+  * @exception XNIException Any XNI exception, possibly wrapping
+  *                         another exception.
+  * @exception IOException An IO exception from the parser, possibly
+  *                        from a byte stream or character stream
+  *                        supplied by the parser.
+  *
+  * @see #setInputSource
+  */
+ public boolean parse(boolean complete) throws XNIException, IOException {
+     // stays false unless the scanner returns normally with more to parse
+     boolean more = false;
+     try {
+         more = fDocumentScanner.scanDocument(complete);
+         return more;
+     }
+     finally {
+         // runs for unchecked exceptions and errors too, so an aborted
+         // parse does not skip the cleanup step
+         if (!more) {
+             cleanup();
+         }
+     }
+ } // parse(boolean):boolean
+
+ /**
+ * If the application decides to terminate parsing before the xml document
+ * is fully parsed, the application should call this method to free any
+ * resource allocated during parsing. For example, close all opened streams.
+ */
+ public void cleanup() {
+ fDocumentScanner.cleanup(fCloseStream);
+ } // cleanup()
+
+ //
+ // Protected methods
+ //
+
+ /**
+  * Registers a component, makes its features and properties recognized,
+  * and applies every default value the component reports.
+  */
+ protected void addComponent(HTMLComponent component) {
+
+     fHTMLComponents.addElement(component);
+
+     // features: recognize them, then apply the reported default states
+     String[] featureIds = component.getRecognizedFeatures();
+     addRecognizedFeatures(featureIds);
+     for (int i = 0; featureIds != null && i < featureIds.length; i++) {
+         Boolean defaultState = component.getFeatureDefault(featureIds[i]);
+         if (defaultState != null) {
+             setFeature(featureIds[i], defaultState.booleanValue());
+         }
+     }
+
+     // properties: recognize them, then apply the reported default values
+     String[] propertyIds = component.getRecognizedProperties();
+     addRecognizedProperties(propertyIds);
+     for (int i = 0; propertyIds != null && i < propertyIds.length; i++) {
+         Object defaultValue = component.getPropertyDefault(propertyIds[i]);
+         if (defaultValue != null) {
+             setProperty(propertyIds[i], defaultValue);
+         }
+     }
+
+ } // addComponent(HTMLComponent)
+
+ /**
+  * Resets every registered component and rebuilds the document pipeline:
+  * scanner, tag balancer and namespace binder (when enabled), then filters.
+  */
+ protected void reset() throws XMLConfigurationException {
+
+     // reset components
+     int size = fHTMLComponents.size();
+     for (int i = 0; i < size; i++) {
+         HTMLComponent component = (HTMLComponent)fHTMLComponents.elementAt(i);
+         component.reset(this);
+     }
+
+     // configure pipeline; every stage is linked to the stage before it
+     XMLDocumentSource lastSource = fDocumentScanner;
+     if (getFeature(BALANCE_TAGS)) {
+         lastSource.setDocumentHandler(fTagBalancer);
+         fTagBalancer.setDocumentSource(lastSource);
+         lastSource = fTagBalancer;
+     }
+     if (getFeature(NAMESPACES)) {
+         lastSource.setDocumentHandler(fNamespaceBinder);
+         fNamespaceBinder.setDocumentSource(lastSource); // the scanner when balancing is off
+         lastSource = fNamespaceBinder;
+     }
+     XMLDocumentFilter[] filters = (XMLDocumentFilter[])getProperty(FILTERS);
+     if (filters != null) {
+         for (int i = 0; i < filters.length; i++) {
+             XMLDocumentFilter filter = filters[i];
+             try {
+                 // filters without a public setDocumentSource are not told their source
+                 Method setSource = filter.getClass().getMethod("setDocumentSource", DOCSOURCE);
+                 setSource.invoke(filter, new Object[] { lastSource });
+             }
+             catch (IllegalAccessException e) {
+                 // ignore: the filter is still chained as a handler below
+             }
+             catch (InvocationTargetException e) {
+                 // ignore: the filter is still chained as a handler below
+             }
+             catch (NoSuchMethodException e) {
+                 // ignore: the filter is still chained as a handler below
+             }
+             lastSource.setDocumentHandler(filter);
+             lastSource = filter;
+         }
+     }
+     lastSource.setDocumentHandler(fDocumentHandler);
+ } // reset()
+
+ //
+ // Interfaces
+ //
+
+ /**
+  * Defines an error reporter for reporting HTML errors. There is no such
+  * thing as a fatal error in parsing HTML. I/O errors are fatal but should
+  * throw an <code>IOException</code> directly instead of reporting an error.
+  * <p>
+  * When used in a configuration, the error reporter instance should be
+  * set as a property with the following property identifier:
+  * <pre>
+  * "http://cyberneko.org/html/properties/error-reporter"
+  * </pre>
+  * Components in the configuration can query the error reporter using this
+  * property identifier.
+  * <p>
+  * <strong>Note:</strong>
+  * All reported errors are within the domain "http://cyberneko.org/html".
+  *
+  * @author Andy Clark
+  */
+ protected class ErrorReporter
+     implements HTMLErrorReporter {
+
+     //
+     // Data
+     //
+
+     /** Locale for which the cached message bundle was looked up. */
+     protected Locale fLastLocale;
+
+     /** Cached error messages; null until first needed or after a locale change. */
+     protected ResourceBundle fErrorMessages;
+
+     //
+     // HTMLErrorReporter methods
+     //
+
+     /**
+      * Formats a message without reporting an error. The localized text is
+      * used unless the simple format is enabled; if the message bundle or
+      * the key cannot be found, the simple format serves as the fallback.
+      */
+     public String formatMessage(String key, Object[] args) {
+         if (!getFeature(SIMPLE_ERROR_FORMAT)) {
+             if (!fLocale.equals(fLastLocale)) {
+                 fErrorMessages = null;
+                 fLastLocale = fLocale;
+             }
+             try {
+                 // loaded inside the try so a missing bundle also falls back
+                 if (fErrorMessages == null) {
+                     fErrorMessages = ResourceBundle.getBundle(
+                         "org/cyberneko/html/res/ErrorMessages", fLocale);
+                 }
+                 return MessageFormat.format(fErrorMessages.getString(key), args);
+             }
+             catch (MissingResourceException e) {
+                 // ignore and return a simple format
+             }
+         }
+         return formatSimpleMessage(key, args);
+     } // formatMessage(String,Object[]):String
+
+     /** Reports a warning to the error handler, if one is set. */
+     public void reportWarning(String key, Object[] args)
+         throws XMLParseException {
+         if (fErrorHandler != null) {
+             fErrorHandler.warning(ERROR_DOMAIN, key, createException(key, args));
+         }
+     } // reportWarning(String,Object[])
+
+     /** Reports an error to the error handler, if one is set. */
+     public void reportError(String key, Object[] args)
+         throws XMLParseException {
+         if (fErrorHandler != null) {
+             fErrorHandler.error(ERROR_DOMAIN, key, createException(key, args));
+         }
+     } // reportError(String,Object[])
+
+     //
+     // Protected methods
+     //
+
+     /** Creates a parse exception carrying the formatted message. */
+     protected XMLParseException createException(String key, Object[] args) {
+         String message = formatMessage(key, args);
+         return new XMLParseException(fDocumentScanner, message);
+     } // createException(String,Object[]):XMLParseException
+
+     /**
+      * Formats the simple message form: the error domain, '#', the key,
+      * then each argument preceded by a tab character.
+      */
+     protected String formatSimpleMessage(String key, Object[] args) {
+         StringBuffer str = new StringBuffer();
+         str.append(ERROR_DOMAIN);
+         str.append('#');
+         str.append(key);
+         for (int i = 0; args != null && i < args.length; i++) {
+             str.append('\t');
+             str.append(String.valueOf(args[i]));
+         }
+         return str.toString();
+     } // formatSimpleMessage(String,Object[]):String
+
+ } // class ErrorReporter
+
+} // class HTMLConfiguration
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLElements.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLElements.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLElements.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,752 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+/**
+ * Collection of HTML element information.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $
+ */
+public class HTMLElements {
+
+ //
+ // Constants
+ //
+
+ // element codes
+
+ // NOTE: The element codes *must* start with 0 and increment in
+ // sequence. The parent and closes references depend on
+ // this assumption. -Ac
+
+ public static final short A = 0;
+ public static final short ABBR = A+1;
+ public static final short ACRONYM = ABBR+1;
+ public static final short ADDRESS = ACRONYM+1;
+ public static final short APPLET = ADDRESS+1;
+ public static final short AREA = APPLET+1;
+ public static final short B = AREA+1;
+ public static final short BASE = B+1;
+ public static final short BASEFONT = BASE+1;
+ public static final short BDO = BASEFONT+1;
+ public static final short BGSOUND = BDO+1;
+ public static final short BIG = BGSOUND+1;
+ public static final short BLINK = BIG+1;
+ public static final short BLOCKQUOTE = BLINK+1;
+ public static final short BODY = BLOCKQUOTE+1;
+ public static final short BR = BODY+1;
+ public static final short BUTTON = BR+1;
+ public static final short CAPTION = BUTTON+1;
+ public static final short CENTER = CAPTION+1;
+ public static final short CITE = CENTER+1;
+ public static final short CODE = CITE+1;
+ public static final short COL = CODE+1;
+ public static final short COLGROUP = COL+1;
+ public static final short COMMENT = COLGROUP+1;
+ public static final short DEL = COMMENT+1;
+ public static final short DFN = DEL+1;
+ public static final short DIR = DFN+1;
+ public static final short DIV = DIR+1;
+ public static final short DD = DIV+1;
+ public static final short DL = DD+1;
+ public static final short DT = DL+1;
+ public static final short EM = DT+1;
+ public static final short EMBED = EM+1;
+ public static final short FIELDSET = EMBED+1;
+ public static final short FONT = FIELDSET+1;
+ public static final short FORM = FONT+1;
+ public static final short FRAME = FORM+1;
+ public static final short FRAMESET = FRAME+1;
+ public static final short H1 = FRAMESET+1;
+ public static final short H2 = H1+1;
+ public static final short H3 = H2+1;
+ public static final short H4 = H3+1;
+ public static final short H5 = H4+1;
+ public static final short H6 = H5+1;
+ public static final short HEAD = H6+1;
+ public static final short HR = HEAD+1;
+ public static final short HTML = HR+1;
+ public static final short I = HTML+1;
+ public static final short IFRAME = I+1;
+ public static final short ILAYER = IFRAME+1;
+ public static final short IMG = ILAYER+1;
+ public static final short INPUT = IMG+1;
+ public static final short INS = INPUT+1;
+ public static final short ISINDEX = INS+1;
+ public static final short KBD = ISINDEX+1;
+ public static final short KEYGEN = KBD+1;
+ public static final short LABEL = KEYGEN+1;
+ public static final short LAYER = LABEL+1;
+ public static final short LEGEND = LAYER+1;
+ public static final short LI = LEGEND+1;
+ public static final short LINK = LI+1;
+ public static final short LISTING = LINK+1;
+ public static final short MAP = LISTING+1;
+ public static final short MARQUEE = MAP+1;
+ public static final short MENU = MARQUEE+1;
+ public static final short META = MENU+1;
+ public static final short MULTICOL = META+1;
+ public static final short NEXTID = MULTICOL+1;
+ public static final short NOBR = NEXTID+1;
+ public static final short NOEMBED = NOBR+1;
+ public static final short NOFRAMES = NOEMBED+1;
+ public static final short NOLAYER = NOFRAMES+1;
+ public static final short NOSCRIPT = NOLAYER+1;
+ public static final short OBJECT = NOSCRIPT+1;
+ public static final short OL = OBJECT+1;
+ public static final short OPTION = OL+1;
+ public static final short OPTGROUP = OPTION+1;
+ public static final short P = OPTGROUP+1;
+ public static final short PARAM = P+1;
+ public static final short PLAINTEXT = PARAM+1;
+ public static final short PRE = PLAINTEXT+1;
+ public static final short Q = PRE+1;
+ public static final short RB = Q+1;
+ public static final short RBC = RB+1;
+ public static final short RP = RBC+1;
+ public static final short RT = RP+1;
+ public static final short RTC = RT+1;
+ public static final short RUBY = RTC+1;
+ public static final short S = RUBY+1;
+ public static final short SAMP = S+1;
+ public static final short SCRIPT = SAMP+1;
+ public static final short SELECT = SCRIPT+1;
+ public static final short SMALL = SELECT+1;
+ public static final short SOUND = SMALL+1;
+ public static final short SPACER = SOUND+1;
+ public static final short SPAN = SPACER+1;
+ public static final short STRIKE = SPAN+1;
+ public static final short STRONG = STRIKE+1;
+ public static final short STYLE = STRONG+1;
+ public static final short SUB = STYLE+1;
+ public static final short SUP = SUB+1;
+ public static final short TABLE = SUP+1;
+ public static final short TBODY = TABLE+1;
+ public static final short TD = TBODY+1;
+ public static final short TEXTAREA = TD+1;
+ public static final short TFOOT = TEXTAREA+1;
+ public static final short TH = TFOOT+1;
+ public static final short THEAD = TH+1;
+ public static final short TITLE = THEAD+1;
+ public static final short TR = TITLE+1;
+ public static final short TT = TR+1;
+ public static final short U = TT+1;
+ public static final short UL = U+1;
+ public static final short VAR = UL+1;
+ public static final short WBR = VAR+1;
+ public static final short XML = WBR+1;
+ public static final short XMP = XML+1;
+ public static final short UNKNOWN = XMP+1;
+
+ // information
+
+ /** Element information organized by first letter. */
+ protected static final Element[][] ELEMENTS_ARRAY = new Element[26][];
+
+ /** Element information as a contiguous list. */
+ protected static final ElementList ELEMENTS = new ElementList();
+
+ /** No such element. */
+ public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", 0, new short[]{HEAD,BODY}/*HTML*/, null);
+
+ //
+ // Static initializer
+ //
+
+ /**
+ * Initializes the element information.
+ * <p>
+ * <strong>Note:</strong>
+ * The <code>getElement</code> method requires that the HTML elements
+ * are added to the list in alphabetical order. If new elements are
+ * added, then they <em>must</em> be inserted in alphabetical order.
+ */
+ static {
+ // <!ENTITY % heading "H1|H2|H3|H4|H5|H6">
+ // <!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
+ // <!ENTITY % phrase "EM | STRONG | DFN | CODE | SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
+ // <!ENTITY % special "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
+ // <!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
+ // <!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
+ // <!ENTITY % block "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT | BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS">
+ // <!ENTITY % flow "%block; | %inline;">
+
+ // initialize array of element information
+ ELEMENTS_ARRAY['A'-'A'] = new Element[] {
+ // A - - (%inline;)* -(A)
+ new Element(A, "A", Element.INLINE, BODY, null),
+ // ABBR - - (%inline;)*
+ new Element(ABBR, "ABBR", Element.INLINE, BODY, null),
+ // ACRONYM - - (%inline;)*
+ new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null),
+ // ADDRESS - - (%inline;)*
+ new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null),
+ // APPLET
+ new Element(APPLET, "APPLET", 0, BODY, null),
+ // AREA - O EMPTY
+ new Element(AREA, "AREA", Element.EMPTY, MAP, null),
+ };
+ ELEMENTS_ARRAY['B'-'A'] = new Element[] {
+ // B - - (%inline;)*
+ new Element(B, "B", Element.INLINE, BODY, null),
+ // BASE - O EMPTY
+ new Element(BASE, "BASE", Element.EMPTY, HEAD, null),
+ // BASEFONT
+ new Element(BASEFONT, "BASEFONT", 0, HEAD, null),
+ // BDO - - (%inline;)*
+ new Element(BDO, "BDO", Element.INLINE, BODY, null),
+ // BGSOUND
+ new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null),
+ // BIG - - (%inline;)*
+ new Element(BIG, "BIG", Element.INLINE, BODY, null),
+ // BLINK
+ new Element(BLINK, "BLINK", Element.INLINE, BODY, null),
+ // BLOCKQUOTE - - (%block;|SCRIPT)+
+ new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}),
+ // BODY O O (%block;|SCRIPT)+ +(INS|DEL)
+ new Element(BODY, "BODY", 0, HTML, new short[]{HEAD}),
+ // BR - O EMPTY
+ new Element(BR, "BR", Element.EMPTY, BODY, null),
+ // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET)
+ new Element(BUTTON, "BUTTON", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['C'-'A'] = new Element[] {
+ // CAPTION - - (%inline;)*
+ new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null),
+ // CENTER,
+ new Element(CENTER, "CENTER", 0, BODY, null),
+ // CITE - - (%inline;)*
+ new Element(CITE, "CITE", Element.INLINE, BODY, null),
+ // CODE - - (%inline;)*
+ new Element(CODE, "CODE", Element.INLINE, BODY, null),
+ // COL - O EMPTY
+ new Element(COL, "COL", Element.EMPTY, TABLE, null),
+ // COLGROUP - O (COL)*
+ new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}),
+ // COMMENT
+ new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null),
+ };
+ ELEMENTS_ARRAY['D'-'A'] = new Element[] {
+ // DEL - - (%flow;)*
+ new Element(DEL, "DEL", 0, BODY, null),
+ // DFN - - (%inline;)*
+ new Element(DFN, "DFN", Element.INLINE, BODY, null),
+ // DIR
+ new Element(DIR, "DIR", 0, BODY, null),
+ // DIV - - (%flow;)*
+ new Element(DIV, "DIV", Element.BLOCK, BODY, null),
+ // DD - O (%flow;)*
+ new Element(DD, "DD", 0, DL, new short[]{DT,DD}),
+ // DL - - (DT|DD)+
+ new Element(DL, "DL", Element.BLOCK, BODY, null),
+ // DT - O (%inline;)*
+ new Element(DT, "DT", 0, DL, new short[]{DT,DD}),
+ };
+ ELEMENTS_ARRAY['E'-'A'] = new Element[] {
+ // EM - - (%inline;)*
+ new Element(EM, "EM", Element.INLINE, BODY, null),
+ // EMBED
+ new Element(EMBED, "EMBED", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['F'-'A'] = new Element[] {
+ // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*)
+ new Element(FIELDSET, "FIELDSET", 0, BODY, null),
+ // FONT
+ new Element(FONT, "FONT", Element.CONTAINER, BODY, null),
+ // FORM - - (%block;|SCRIPT)+ -(FORM)
+ new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,P,DIV}, new short[]{FORM,BUTTON}),
+ // FRAME - O EMPTY
+ new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null),
+ // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?)
+ new Element(FRAMESET, "FRAMESET", 0, HTML, null),
+ };
+ ELEMENTS_ARRAY['H'-'A'] = new Element[] {
+ // (H1|H2|H3|H4|H5|H6) - - (%inline;)*
+ new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ // HEAD O O (%head.content;) +(%head.misc;)
+ new Element(HEAD, "HEAD", 0, HTML, null),
+ // HR - O EMPTY
+ new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}),
+ // HTML O O (%html.content;)
+ new Element(HTML, "HTML", 0, null, null),
+ };
+ ELEMENTS_ARRAY['I'-'A'] = new Element[] {
+ // I - - (%inline;)*
+ new Element(I, "I", Element.INLINE, BODY, null),
+ // IFRAME
+ new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null),
+ // ILAYER
+ new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null),
+ // IMG - O EMPTY
+ new Element(IMG, "IMG", Element.EMPTY, BODY, null),
+ // INPUT - O EMPTY
+ new Element(INPUT, "INPUT", Element.EMPTY, BODY, null),
+ // INS - - (%flow;)*
+ new Element(INS, "INS", 0, BODY, null),
+ // ISINDEX
+ new Element(ISINDEX, "ISINDEX", 0, HEAD, null),
+ };
+ ELEMENTS_ARRAY['K'-'A'] = new Element[] {
+ // KBD - - (%inline;)*
+ new Element(KBD, "KBD", Element.INLINE, BODY, null),
+ // KEYGEN
+ new Element(KEYGEN, "KEYGEN", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['L'-'A'] = new Element[] {
+ // LABEL - - (%inline;)* -(LABEL)
+ new Element(LABEL, "LABEL", 0, BODY, null),
+ // LAYER
+ new Element(LAYER, "LAYER", Element.BLOCK, BODY, null),
+ // LEGEND - - (%inline;)*
+ new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null),
+ // LI - O (%flow;)*
+ new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}),
+ // LINK - O EMPTY
+ new Element(LINK, "LINK", Element.EMPTY, HEAD, null),
+ // LISTING
+ new Element(LISTING, "LISTING", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['M'-'A'] = new Element[] {
+ // MAP - - ((%block;) | AREA)+
+ new Element(MAP, "MAP", Element.INLINE, BODY, null),
+ // MARQUEE
+ new Element(MARQUEE, "MARQUEE", 0, BODY, null),
+ // MENU
+ new Element(MENU, "MENU", 0, BODY, null),
+ // META - O EMPTY
+ new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}),
+ // MULTICOL
+ new Element(MULTICOL, "MULTICOL", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['N'-'A'] = new Element[] {
+ // NEXTID
+ new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null),
+ // NOBR
+ new Element(NOBR, "NOBR", Element.INLINE, BODY, null),
+ // NOEMBED
+ new Element(NOEMBED, "NOEMBED", 0, BODY, null),
+ // NOFRAMES - - (BODY) -(NOFRAMES)
+ new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null),
+ // NOLAYER
+ new Element(NOLAYER, "NOLAYER", 0, BODY, null),
+ // NOSCRIPT - - (%block;)+
+ new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{HEAD,BODY}, null),
+ };
+ ELEMENTS_ARRAY['O'-'A'] = new Element[] {
+ // OBJECT - - (PARAM | %flow;)*
+ new Element(OBJECT, "OBJECT", 0, BODY, null),
+ // OL - - (LI)+
+ new Element(OL, "OL", Element.BLOCK, BODY, null),
+ // OPTGROUP - - (OPTION)+
+ new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}),
+ // OPTION - O (#PCDATA)
+ new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}),
+ };
+ ELEMENTS_ARRAY['P'-'A'] = new Element[] {
+ // P - O (%inline;)*
+ new Element(P, "P", 0, BODY, new short[]{P}),
+ // PARAM - O EMPTY
+ new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null),
+ // PLAINTEXT
+ new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null),
+ // PRE - - (%inline;)* -(%pre.exclusion;)
+ new Element(PRE, "PRE", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['Q'-'A'] = new Element[] {
+ // Q - - (%inline;)*
+ new Element(Q, "Q", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['R'-'A'] = new Element[] {
+ // RB
+ new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}),
+ // RBC
+ new Element(RBC, "RBC", 0, RUBY, null),
+ // RP
+ new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}),
+ // RT
+ new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}),
+ // RTC
+ new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}),
+ // RUBY
+ new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}),
+ };
+ ELEMENTS_ARRAY['S'-'A'] = new Element[] {
+ // S
+ new Element(S, "S", 0, BODY, null),
+ // SAMP - - (%inline;)*
+ new Element(SAMP, "SAMP", Element.INLINE, BODY, null),
+ // SCRIPT - - %Script;
+ new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null),
+ // SELECT - - (OPTGROUP|OPTION)+
+ new Element(SELECT, "SELECT", 0, BODY, new short[]{SELECT}),
+ // SMALL - - (%inline;)*
+ new Element(SMALL, "SMALL", Element.INLINE, BODY, null),
+ // SOUND
+ new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null),
+ // SPACER
+ new Element(SPACER, "SPACER", Element.EMPTY, BODY, null),
+ // SPAN - - (%inline;)*
+ new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null),
+ // STRIKE
+ new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null),
+ // STRONG - - (%inline;)*
+ new Element(STRONG, "STRONG", Element.INLINE, BODY, null),
+ // STYLE - - %StyleSheet;
+ new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}),
+ // SUB - - (%inline;)*
+ new Element(SUB, "SUB", Element.INLINE, BODY, null),
+ // SUP - - (%inline;)*
+ new Element(SUP, "SUP", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['T'-'A'] = new Element[] {
+ // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
+ new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null),
+ // TBODY O O (TR)+
+ new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}),
+ // TD - O (%flow;)*
+ new Element(TD, "TD", 0, TR, TABLE, new short[]{TD,TH}),
+ // TEXTAREA - - (#PCDATA)
+ new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null),
+ // TFOOT - O (TR)+
+ new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}),
+ // TH - O (%flow;)*
+ new Element(TH, "TH", 0, TR, TABLE, new short[]{TD,TH}),
+ // THEAD - O (TR)+
+ new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}),
+ // TITLE - - (#PCDATA) -(%head.misc;)
+ new Element(TITLE, "TITLE", 0, new short[]{HEAD,BODY}, null),
+ // TR - O (TH|TD)+
+ new Element(TR, "TR", Element.BLOCK, TABLE, new short[]{TD,TH,TR,COLGROUP}),
+ // TT - - (%inline;)*
+ new Element(TT, "TT", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['U'-'A'] = new Element[] {
+ // U,
+ new Element(U, "U", Element.INLINE, BODY, null),
+ // UL - - (LI)+
+ new Element(UL, "UL", Element.BLOCK, BODY, null),
+ };
+ ELEMENTS_ARRAY['V'-'A'] = new Element[] {
+ // VAR - - (%inline;)*
+ new Element(VAR, "VAR", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['W'-'A'] = new Element[] {
+ // WBR
+ new Element(WBR, "WBR", Element.EMPTY, BODY, null),
+ };
+ ELEMENTS_ARRAY['X'-'A'] = new Element[] {
+ // XML
+ new Element(XML, "XML", 0, BODY, null),
+ // XMP
+ new Element(XMP, "XMP", Element.SPECIAL, BODY, null),
+ };
+
+ // keep contiguous list of elements for lookups by code
+ for (int i = 0; i < ELEMENTS_ARRAY.length; i++) {
+ Element[] elements = ELEMENTS_ARRAY[i];
+ if (elements != null) {
+ for (int j = 0; j < elements.length; j++) {
+ Element element = elements[j];
+ ELEMENTS.addElement(element);
+ }
+ }
+ }
+ ELEMENTS.addElement(NO_SUCH_ELEMENT);
+
+ // initialize cross references to parent elements
+ for (int i = 0; i < ELEMENTS.size; i++) {
+ Element element = ELEMENTS.data[i];
+ if (element.parentCodes != null) {
+ element.parent = new Element[element.parentCodes.length];
+ for (int j = 0; j < element.parentCodes.length; j++) {
+ element.parent[j] = ELEMENTS.data[element.parentCodes[j]];
+ }
+ element.parentCodes = null;
+ }
+ }
+
+ } // <clinit>()
+
+ //
+ // Public static methods
+ //
+
+ /**
+  * Looks up an element descriptor by its numeric element code.
+  *
+  * @param code The element code, used directly as an index into the
+  *             contiguous element list.
+  *
+  * @return The entry stored at that index of the element list.
+  */
+ public static final Element getElement(short code) {
+     final Element[] table = ELEMENTS.data;
+     return table[code];
+ } // getElement(short):Element
+
+ /**
+  * Looks up an element descriptor by name, falling back to the
+  * NO_SUCH_ELEMENT placeholder when the name is not registered.
+  *
+  * @param ename The element name.
+  *
+  * @return The matching element, or NO_SUCH_ELEMENT if there is none.
+  */
+ public static final Element getElement(String ename) {
+     final Element fallback = NO_SUCH_ELEMENT;
+     return getElement(ename, fallback);
+ } // getElement(String):Element
+
+ /**
+  * Finds the element information registered under the given name,
+  * ignoring case, or hands back a caller-supplied fallback.
+  * <p>
+  * The first character of the name selects a per-letter bucket of
+  * ELEMENTS_ARRAY; only that bucket is searched.
+  *
+  * @param ename   The element name.
+  * @param element The default element to return if not found.
+  *
+  * @return The matching element, or <code>element</code> when the name
+  *         is empty, does not start with an ASCII letter, or matches
+  *         no entry in its bucket.
+  */
+ public static final Element getElement(String ename, Element element) {
+
+     // nothing to look up
+     if (ename.length() == 0) {
+         return element;
+     }
+
+     // fold an ASCII lowercase initial onto its uppercase counterpart
+     int initial = ename.charAt(0);
+     if (initial >= 'a' && initial <= 'z') {
+         initial = initial - 'a' + 'A';
+     }
+     if (initial < 'A' || initial > 'Z') {
+         return element;
+     }
+
+     // scan the bucket for that letter
+     Element[] bucket = ELEMENTS_ARRAY[initial - 'A'];
+     if (bucket == null) {
+         return element;
+     }
+     for (int k = 0; k < bucket.length; k++) {
+         if (bucket[k].name.equalsIgnoreCase(ename)) {
+             return bucket[k];
+         }
+     }
+     return element;
+
+ } // getElement(String,Element):Element
+
+ //
+ // Classes
+ //
+
+ /**
+  * Element information: the code, name, classification flags, natural
+  * parents and the set of open elements this element implicitly closes.
+  *
+  * @author Andy Clark
+  */
+ public static class Element {
+
+     //
+     // Constants
+     //
+
+     /** Inline element. */
+     public static final int INLINE = 0x01;
+
+     /** Block element. */
+     public static final int BLOCK = 0x02;
+
+     /** Empty element. */
+     public static final int EMPTY = 0x04;
+
+     /** Container element. */
+     public static final int CONTAINER = 0x08;
+
+     /** Special element. */
+     public static final int SPECIAL = 0x10;
+
+     /** Empty array. */
+     private static final short[] EMPTY_ARRAY = new short[0];
+
+     //
+     // Data
+     //
+
+     /** The element code. */
+     public short code;
+
+     /** The element name. */
+     public String name;
+
+     /** Informational flags (bitwise OR of the flag constants above). */
+     public int flags;
+
+     /**
+      * Codes of the parent elements, as passed to the constructor. The
+      * HTMLElements static initializer resolves these into
+      * {@link #parent} and then resets this field to null.
+      */
+     public short[] parentCodes;
+
+     /** Parent elements; null until resolved from the parent codes. */
+     public Element[] parent;
+
+     /** The bounding element code, or -1 when none was specified. */
+     public short bounds;
+
+     /** List of elements this element can close; may be null. */
+     public short[] closes;
+
+     //
+     // Constructors
+     //
+
+     /**
+      * Constructs an element object with a single parent and no
+      * bounding element.
+      *
+      * @param code   The element code.
+      * @param name   The element name.
+      * @param flags  Informational flags
+      * @param parent Natural closing parent name.
+      * @param closes List of elements this element can close.
+      */
+     public Element(short code, String name, int flags,
+                    short parent, short[] closes) {
+         this(code, name, flags, new short[]{parent}, (short)-1, closes);
+     } // <init>(short,String,int,short,short[]);
+
+     /**
+      * Constructs an element object with a single parent and a
+      * bounding element.
+      *
+      * @param code   The element code.
+      * @param name   The element name.
+      * @param flags  Informational flags
+      * @param parent Natural closing parent name.
+      * @param bounds The bounding element code.
+      * @param closes List of elements this element can close.
+      */
+     public Element(short code, String name, int flags,
+                    short parent, short bounds, short[] closes) {
+         this(code, name, flags, new short[]{parent}, bounds, closes);
+     } // <init>(short,String,int,short,short,short[])
+
+     /**
+      * Constructs an element object with several possible parents and
+      * no bounding element.
+      *
+      * @param code    The element code.
+      * @param name    The element name.
+      * @param flags   Informational flags
+      * @param parents Natural closing parent names.
+      * @param closes  List of elements this element can close.
+      */
+     public Element(short code, String name, int flags,
+                    short[] parents, short[] closes) {
+         this(code, name, flags, parents, (short)-1, closes);
+     } // <init>(short,String,int,short[],short[])
+
+     /**
+      * Constructs an element object. All other constructors delegate
+      * here.
+      *
+      * @param code    The element code.
+      * @param name    The element name.
+      * @param flags   Informational flags
+      * @param parents Natural closing parent names.
+      * @param bounds  The bounding element code (-1 for none).
+      * @param closes  List of elements this element can close.
+      */
+     public Element(short code, String name, int flags,
+                    short[] parents, short bounds, short[] closes) {
+         this.code = code;
+         this.name = name;
+         this.flags = flags;
+         this.parentCodes = parents;
+         this.parent = null;
+         this.bounds = bounds;
+         this.closes = closes;
+     } // <init>(short,String,int,short[],short,short[])
+
+     //
+     // Public methods
+     //
+
+     /** Returns true if this element is an inline element. */
+     public final boolean isInline() {
+         return (flags & INLINE) != 0;
+     } // isInline():boolean
+
+     /** Returns true if this element is a block element. */
+     public final boolean isBlock() {
+         return (flags & BLOCK) != 0;
+     } // isBlock():boolean
+
+     /** Returns true if this element is an empty element. */
+     public final boolean isEmpty() {
+         return (flags & EMPTY) != 0;
+     } // isEmpty():boolean
+
+     /** Returns true if this element is a container element. */
+     public final boolean isContainer() {
+         return (flags & CONTAINER) != 0;
+     } // isContainer():boolean
+
+     /**
+      * Returns true if this element is special -- if its content
+      * should be parsed ignoring markup.
+      */
+     public final boolean isSpecial() {
+         return (flags & SPECIAL) != 0;
+     } // isSpecial():boolean
+
+     /**
+      * Returns true if this element can close the specified Element.
+      *
+      * @param tag The code of the element to test.
+      *
+      * @return true if <code>tag</code> appears in the closes list;
+      *         false otherwise, including when the list is null.
+      */
+     public boolean closes(short tag) {
+
+         if (closes != null) {
+             for (int i = 0; i < closes.length; i++) {
+                 if (closes[i] == tag) {
+                     return true;
+                 }
+             }
+         }
+         return false;
+
+     } // closes(short):boolean
+
+     //
+     // Object methods
+     //
+
+     /** Returns a hash code for this object, derived from the name. */
+     public int hashCode() {
+         return name.hashCode();
+     } // hashCode():int
+
+     /**
+      * Returns true if the objects are equal, i.e. if the other object
+      * is an Element with the same name.
+      * <p>
+      * The comparison is made name-to-name so that the method is
+      * reflexive and symmetric and agrees with {@link #hashCode()};
+      * comparing the name string against the other object itself would
+      * never report two elements as equal.
+      */
+     public boolean equals(Object o) {
+         if (o == this) {
+             return true;
+         }
+         if (!(o instanceof Element)) {
+             return false;
+         }
+         return name.equals(((Element)o).name);
+     } // equals(Object):boolean
+
+ } // class Element
+
+ /**
+  * Minimal growable array of elements with no synchronization. Both
+  * the backing array and the element count are exposed as public
+  * fields.
+  */
+ public static class ElementList {
+
+     //
+     // Data
+     //
+
+     /** Number of slots of the backing array currently in use. */
+     public int size;
+
+     /** Backing array; replaced by a larger copy once it fills up. */
+     public Element[] data = new Element[120];
+
+     //
+     // Public methods
+     //
+
+     /**
+      * Appends an element, first growing the backing array by 20
+      * slots when it is full.
+      *
+      * @param element The element to append.
+      */
+     public void addElement(Element element) {
+         final int capacity = data.length;
+         if (capacity == size) {
+             Element[] grown = new Element[capacity + 20];
+             System.arraycopy(data, 0, grown, 0, capacity);
+             data = grown;
+         }
+         data[size++] = element;
+     } // addElement(Element)
+
+ } // class ElementList
+
+} // class HTMLElements
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEntities.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEntities.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEntities.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,126 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.Properties;
+
+/**
+ * Pre-defined HTML entities.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLEntities.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
+ */
+public class HTMLEntities {
+
+    //
+    // Constants
+    //
+
+    /** Entities: entity name mapped to its replacement text. */
+    protected static final Properties ENTITIES = new Properties();
+
+    /** Reverse mapping from characters to names. */
+    protected static final IntProperties SEITITNE = new IntProperties();
+
+    //
+    // Static initialization
+    //
+
+    static {
+        // load entities
+        load0("res/HTMLlat1.properties");
+        load0("res/HTMLspecial.properties");
+        load0("res/HTMLsymbol.properties");
+        load0("res/XMLbuiltin.properties");
+
+        // store reverse mappings; only single-character values can be
+        // mapped back from a character to a name
+        Enumeration keys = ENTITIES.propertyNames();
+        while (keys.hasMoreElements()) {
+            String key = (String)keys.nextElement();
+            String value = ENTITIES.getProperty(key);
+            if (value.length() == 1) {
+                int ivalue = value.charAt(0);
+                SEITITNE.put(ivalue, key);
+            }
+        }
+    }
+
+    //
+    // Public static methods
+    //
+
+    /**
+     * Returns the character associated to the given entity name, or
+     * -1 if the name is not known. An entity whose stored value is
+     * empty has no character to return and is reported as -1 as well.
+     *
+     * @param name The entity name.
+     */
+    public static int get(String name) {
+        String value = (String)ENTITIES.get(name);
+        if (value == null || value.length() == 0) {
+            return -1;
+        }
+        return value.charAt(0);
+    } // get(String):char
+
+    /**
+     * Returns the name associated to the given character or null if
+     * the character is not known. Any int is accepted, including the
+     * -1 that {@link #get(String)} uses for "unknown".
+     *
+     * @param c The character.
+     */
+    public static String get(int c) {
+        return SEITITNE.get(c);
+    } // get(int):String
+
+    //
+    // Private static methods
+    //
+
+    /**
+     * Loads the entity values in the specified resource. A resource
+     * that is missing or unreadable is reported on standard error and
+     * otherwise ignored, so class initialization still completes.
+     *
+     * @param filename Resource name, resolved relative to this class.
+     */
+    private static void load0(String filename) {
+        // getResourceAsStream returns null when the resource is absent
+        java.io.InputStream stream =
+            HTMLEntities.class.getResourceAsStream(filename);
+        if (stream == null) {
+            System.err.println("error: unable to load resource \""+filename+"\"");
+            return;
+        }
+        try {
+            ENTITIES.load(stream);
+        }
+        catch (IOException e) {
+            System.err.println("error: unable to load resource \""+filename+"\"");
+        }
+        finally {
+            try {
+                stream.close();
+            }
+            catch (IOException ignored) {
+                // nothing useful can be done if close fails
+            }
+        }
+    } // load0(String)
+
+    //
+    // Classes
+    //
+
+    /**
+     * Small chained hash table from int keys to string values. A later
+     * put for the same key shadows the earlier entry.
+     */
+    static class IntProperties {
+        private Entry[] entries = new Entry[101];
+        /** Adds a mapping at the head of the key's bucket chain. */
+        public void put(int key, String value) {
+            int hash = indexFor(key);
+            Entry entry = new Entry(key, value, entries[hash]);
+            entries[hash] = entry;
+        }
+        /** Returns the value for the key, or null if there is none. */
+        public String get(int key) {
+            int hash = indexFor(key);
+            Entry entry = entries[hash];
+            while (entry != null) {
+                if (entry.key == key) {
+                    return entry.value;
+                }
+                entry = entry.next;
+            }
+            return null;
+        }
+        /**
+         * Maps a key to a bucket index. The sign bit is cleared first
+         * because the remainder of a negative int is negative and
+         * would index outside the table.
+         */
+        private int indexFor(int key) {
+            return (key & 0x7FFFFFFF) % entries.length;
+        }
+        /** Single node of a bucket chain. */
+        static class Entry {
+            public int key;
+            public String value;
+            public Entry next;
+            public Entry(int key, String value, Entry next) {
+                this.key = key;
+                this.value = value;
+                this.next = next;
+            }
+        }
+    }
+
+} // class HTMLEntities
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLErrorReporter.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLErrorReporter.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLErrorReporter.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,47 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+import org.apache.xerces.xni.parser.XMLParseException;
+
+/**
+ * Defines an error reporter for reporting HTML errors. There is no such
+ * thing as a fatal error in parsing HTML. I/O errors are fatal but should
+ * throw an <code>IOException</code> directly instead of reporting an error.
+ * <p>
+ * When used in a configuration, the error reporter instance should be
+ * set as a property with the following property identifier:
+ * <pre>
+ * "http://cyberneko.org/html/internal/error-reporter"
+ * </pre>
+ * Components in the configuration can query the error reporter using this
+ * property identifier.
+ * <p>
+ * NOTE(review): HTMLScanner declares its error reporter property as
+ * "http://cyberneko.org/html/properties/error-reporter"; confirm which
+ * of the two identifiers is the current one.
+ * <p>
+ * <strong>Note:</strong>
+ * All reported errors are within the domain "http://cyberneko.org/html".
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLErrorReporter.java,v 1.4 2005/02/14 03:56:54 andyc Exp $
+ */
+public interface HTMLErrorReporter {
+
+ //
+ // HTMLErrorReporter methods
+ //
+
+ /**
+ * Format message without reporting error.
+ *
+ * @param key The message key.
+ * @param args The message arguments (presumably substituted into the
+ * message text; confirm against the implementation).
+ *
+ * @return The formatted message.
+ */
+ public String formatMessage(String key, Object[] args);
+
+ /**
+ * Reports a warning.
+ *
+ * @param key The message key.
+ * @param args The message arguments.
+ *
+ * @throws XMLParseException Declared for implementations; the
+ * conditions under which it is thrown are not defined here.
+ */
+ public void reportWarning(String key, Object[] args) throws XMLParseException;
+
+ /**
+ * Reports an error.
+ *
+ * @param key The message key.
+ * @param args The message arguments.
+ *
+ * @throws XMLParseException Declared for implementations; the
+ * conditions under which it is thrown are not defined here.
+ */
+ public void reportError(String key, Object[] args) throws XMLParseException;
+
+} // interface HTMLErrorReporter
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEventInfo.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEventInfo.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLEventInfo.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,95 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+/**
+ * Carries augmented information about a document event to the
+ * application through the XNI pipeline: where the event begins and
+ * ends in the source, and whether it was synthesized.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLEventInfo.java,v 1.4 2005/02/14 03:56:54 andyc Exp $
+ */
+public interface HTMLEventInfo {
+
+    //
+    // HTMLEventInfo methods
+    //
+
+    // location information
+
+    /** Line number at which this event begins. */
+    int getBeginLineNumber();
+
+    /** Column number at which this event begins. */
+    int getBeginColumnNumber();
+
+    /** Line number at which this event ends. */
+    int getEndLineNumber();
+
+    /** Column number at which this event ends. */
+    int getEndColumnNumber();
+
+    // other information
+
+    /** Tells whether the corresponding event was synthesized. */
+    boolean isSynthesized();
+
+    /**
+     * Event info for a synthesized infoset item: every position is
+     * reported as -1 and {@link #isSynthesized()} is always true.
+     *
+     * @author Andy Clark
+     */
+    class SynthesizedItem
+        implements HTMLEventInfo {
+
+        /** Value reported for every line and column number. */
+        private static final int UNKNOWN = -1;
+
+        //
+        // HTMLEventInfo methods
+        //
+
+        // location information
+
+        /** Always -1. */
+        public int getBeginLineNumber() {
+            return UNKNOWN;
+        } // getBeginLineNumber():int
+
+        /** Always -1. */
+        public int getBeginColumnNumber() {
+            return UNKNOWN;
+        } // getBeginColumnNumber():int
+
+        /** Always -1. */
+        public int getEndLineNumber() {
+            return UNKNOWN;
+        } // getEndLineNumber():int
+
+        /** Always -1. */
+        public int getEndColumnNumber() {
+            return UNKNOWN;
+        } // getEndColumnNumber():int
+
+        // other information
+
+        /** Always true. */
+        public boolean isSynthesized() {
+            return true;
+        } // isSynthesized():boolean
+
+        //
+        // Object methods
+        //
+
+        /** Returns the fixed string "synthesized". */
+        public String toString() {
+            return "synthesized";
+        } // toString():String
+
+    } // class SynthesizedItem
+
+} // interface HTMLEventInfo
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLScanner.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLScanner.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLScanner.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,3277 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ *
+ * NOTE: The URI fixing code in this source was taken from the Apache
+ * Xerces parser which is distributed under the Apache license.
+ * Refer to the LICENSE_apache file for details.
+ */
+
+package org.cyberneko.html;
+
+import java.io.EOFException;
+import java.io.FileInputStream;
+import java.io.FilterInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.net.URL;
+import java.util.Stack;
+
+import org.apache.xerces.util.EncodingMap;
+import org.apache.xerces.util.NamespaceSupport;
+import org.apache.xerces.util.URI;
+import org.apache.xerces.util.XMLAttributesImpl;
+import org.apache.xerces.util.XMLResourceIdentifierImpl;
+import org.apache.xerces.util.XMLStringBuffer;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLDocumentHandler;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLComponentManager;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+import org.apache.xerces.xni.parser.XMLDocumentScanner;
+import org.apache.xerces.xni.parser.XMLInputSource;
+
+/**
+ * A simple HTML scanner. This scanner makes no attempt to balance tags
+ * or fix other problems in the source document — it just scans what
+ * it can and generates XNI document "events", ignoring errors of all
+ * kinds.
+ * <p>
+ * This component recognizes the following features:
+ * <ul>
+ * <li>http://cyberneko.org/html/features/augmentations
+ * <li>http://cyberneko.org/html/features/report-errors
+ * <li>http://apache.org/xml/features/scanner/notify-char-refs
+ * <li>http://apache.org/xml/features/scanner/notify-builtin-refs
+ * <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
+ * <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
+ * <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
+ * <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
+ * <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
+ * <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
+ * <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
+ * <li>http://cyberneko.org/html/features/scanner/cdata-sections
+ * <li>http://cyberneko.org/html/features/override-doctype
+ * <li>http://cyberneko.org/html/features/insert-doctype
+ * </ul>
+ * <p>
+ * This component recognizes the following properties:
+ * <ul>
+ * <li>http://cyberneko.org/html/properties/names/elems
+ * <li>http://cyberneko.org/html/properties/names/attrs
+ * <li>http://cyberneko.org/html/properties/default-encoding
+ * <li>http://cyberneko.org/html/properties/error-reporter
+ * <li>http://cyberneko.org/html/properties/doctype/pubid
+ * <li>http://cyberneko.org/html/properties/doctype/sysid
+ * </ul>
+ *
+ * @see HTMLElements
+ * @see HTMLEntities
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $
+ */
+public class HTMLScanner
+ implements XMLDocumentScanner, XMLLocator, HTMLComponent {
+
+ //
+ // Constants
+ //
+
+ // doctype info: HTML 4.01 strict
+
+ /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */
+ public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN";
+
+ /** HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). */
+ public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd";
+
+ // doctype info: HTML 4.01 loose
+
+ /** HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN"). */
+ public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN";
+
+ /** HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd"). */
+ public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd";
+
+ // doctype info: HTML 4.01 frameset
+
+ /** HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). */
+ public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN";
+
+ /** HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd"). */
+ public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd";
+
+ // features
+
+ /** Include infoset augmentations. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Report errors. */
+ protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
+
+ /** Notify character entity references (e.g. &#32;, &#x20;, etc). */
+ public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
+
+ /**
+ * Notify handler of built-in entity references (e.g. &amp;,
+ * &lt;, etc).
+ * <p>
+ * <strong>Note:</strong>
+ * This only applies to the five pre-defined XML general entities.
+ * Specifically, "amp", "lt", "gt", "quot", and "apos". This is done
+ * for compatibility with the Xerces feature.
+ * <p>
+ * To be notified of the built-in entity references in HTML, set the
+ * <code>http://cyberneko.org/html/features/scanner/notify-builtin-refs</code>
+ * feature to <code>true</code>.
+ */
+ public static final String NOTIFY_XML_BUILTIN_REFS = "http://apache.org/xml/features/scanner/notify-builtin-refs";
+
+ /**
+ * Notify handler of built-in entity references (e.g. &nobr;,
+ * &copy;, etc).
+ * <p>
+ * <strong>Note:</strong>
+ * This <em>includes</em> the five pre-defined XML general entities.
+ */
+ public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
+
+ /** Fix Microsoft Windows® character entity references. */
+ public static final String FIX_MSWINDOWS_REFS = "http://cyberneko.org/html/features/scanner/fix-mswindows-refs";
+
+ /**
+ * Strip HTML comment delimiters ("<!−−" and
+ * "−−>") from SCRIPT tag contents.
+ */
+ public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims";
+
+ /**
+ * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from
+ * SCRIPT tag contents.
+ */
+ public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";
+
+ /**
+ * Strip HTML comment delimiters ("<!−−" and
+ * "−−>") from STYLE tag contents.
+ */
+ public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";
+
+ /**
+ * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from
+ * STYLE tag contents.
+ */
+ public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";
+
+ /**
+ * Ignore specified charset found in the <meta equiv='Content-Type'
+ * content='text/html;charset=…'> tag.
+ */
+ public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";
+
+ /** Scan CDATA sections. */
+ public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";
+
+ /** Override doctype declaration public and system identifiers. */
+ public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";
+
+ /** Insert document type declaration. */
+ public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";
+
+ /** Recognized features. */
+ private static final String[] RECOGNIZED_FEATURES = {
+ AUGMENTATIONS,
+ REPORT_ERRORS,
+ NOTIFY_CHAR_REFS,
+ NOTIFY_XML_BUILTIN_REFS,
+ NOTIFY_HTML_BUILTIN_REFS,
+ FIX_MSWINDOWS_REFS,
+ SCRIPT_STRIP_CDATA_DELIMS,
+ SCRIPT_STRIP_COMMENT_DELIMS,
+ STYLE_STRIP_CDATA_DELIMS,
+ STYLE_STRIP_COMMENT_DELIMS,
+ IGNORE_SPECIFIED_CHARSET,
+ CDATA_SECTIONS,
+ OVERRIDE_DOCTYPE,
+ INSERT_DOCTYPE,
+ };
+
+ /** Recognized features defaults. */
+ private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
+ null,
+ null,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ };
+
+ // properties
+
+ /** Modify HTML element names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
+
+ /** Modify HTML attribute names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
+
+ /** Default encoding. */
+ protected static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";
+
+ /** Error reporter. */
+ protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
+
+ /** Doctype declaration public identifier. */
+ protected static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";
+
+ /** Doctype declaration system identifier. */
+ protected static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";
+
+ /**
+ * Recognized properties. This array is parallel to
+ * RECOGNIZED_PROPERTIES_DEFAULTS: getPropertyDefault(String) finds a
+ * property identifier here and returns the default stored at the same
+ * index, so both arrays must be kept in the same order.
+ */
+ private static final String[] RECOGNIZED_PROPERTIES = {
+ NAMES_ELEMS,
+ NAMES_ATTRS,
+ DEFAULT_ENCODING,
+ ERROR_REPORTER,
+ DOCTYPE_PUBID,
+ DOCTYPE_SYSID,
+ };
+
+ /**
+ * Recognized properties defaults, one entry per RECOGNIZED_PROPERTIES
+ * element. For a null entry getPropertyDefault(String) returns null,
+ * i.e. this component supplies no default for that property.
+ */
+ private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
+ null, // NAMES_ELEMS
+ null, // NAMES_ATTRS
+ "Windows-1252", // DEFAULT_ENCODING
+ null, // ERROR_REPORTER
+ HTML_4_01_TRANSITIONAL_PUBID, // DOCTYPE_PUBID
+ HTML_4_01_TRANSITIONAL_SYSID, // DOCTYPE_SYSID
+ };
+
+ // states
+
+ /** State: content. */
+ protected static final short STATE_CONTENT = 0;
+
+ /** State: markup bracket. */
+ protected static final short STATE_MARKUP_BRACKET = 1;
+
+ /** State: start document. */
+ protected static final short STATE_START_DOCUMENT = 10;
+
+ /** State: end document. */
+ protected static final short STATE_END_DOCUMENT = 11;
+
+ // modify HTML names
+
+ /** Don't modify HTML names. */
+ protected static final short NAMES_NO_CHANGE = 0;
+
+ /** Uppercase HTML names. */
+ protected static final short NAMES_UPPERCASE = 1;
+
+ /** Lowercase HTML names. */
+ protected static final short NAMES_LOWERCASE = 2;
+
+ // defaults
+
+ /** Default buffer size. */
+ protected static final int DEFAULT_BUFFER_SIZE = 2048;
+
+ // debugging
+
+ /** Set to true to debug changes in the scanner. */
+ private static final boolean DEBUG_SCANNER = false;
+
+ /** Set to true to debug changes in the scanner state. */
+ private static final boolean DEBUG_SCANNER_STATE = false;
+
+ /** Set to true to debug the buffer. */
+ private static final boolean DEBUG_BUFFER = false;
+
+ /** Set to true to debug character encoding handling. */
+ private static final boolean DEBUG_CHARSET = false;
+
+ /** Set to true to debug callbacks. */
+ protected static final boolean DEBUG_CALLBACKS = false;
+
+ // static vars
+
+ /** Synthesized event info item. */
+ protected static final HTMLEventInfo SYNTHESIZED_ITEM =
+ new HTMLEventInfo.SynthesizedItem();
+
+ //
+ // Data
+ //
+
+ // features
+
+ /** Augmentations. */
+ protected boolean fAugmentations;
+
+ /** Report errors. */
+ protected boolean fReportErrors;
+
+ /** Notify character entity references. */
+ protected boolean fNotifyCharRefs;
+
+ /** Notify XML built-in general entity references. */
+ protected boolean fNotifyXmlBuiltinRefs;
+
+ /** Notify HTML built-in general entity references. */
+ protected boolean fNotifyHtmlBuiltinRefs;
+
+ /** Fix Microsoft Windows® character entity references. */
+ protected boolean fFixWindowsCharRefs;
+
+ /** Strip CDATA delimiters from SCRIPT tags. */
+ protected boolean fScriptStripCDATADelims;
+
+ /** Strip comment delimiters from SCRIPT tags. */
+ protected boolean fScriptStripCommentDelims;
+
+ /** Strip CDATA delimiters from STYLE tags. */
+ protected boolean fStyleStripCDATADelims;
+
+ /** Strip comment delimiters from STYLE tags. */
+ protected boolean fStyleStripCommentDelims;
+
+ /** Ignore specified character set. */
+ protected boolean fIgnoreSpecifiedCharset;
+
+ /** CDATA sections. */
+ protected boolean fCDATASections;
+
+ /** Override doctype declaration public and system identifiers. */
+ protected boolean fOverrideDoctype;
+
+ /** Insert document type declaration. */
+ protected boolean fInsertDoctype;
+
+ // properties
+
+ /** Modify HTML element names. */
+ protected short fNamesElems;
+
+ /** Modify HTML attribute names. */
+ protected short fNamesAttrs;
+
+ /** Default encoding. */
+ protected String fDefaultIANAEncoding;
+
+ /** Error reporter. */
+ protected HTMLErrorReporter fErrorReporter;
+
+ /** Doctype declaration public identifier. */
+ protected String fDoctypePubid;
+
+ /** Doctype declaration system identifier. */
+ protected String fDoctypeSysid;
+
+ // boundary locator information
+
+ /** Beginning line number. */
+ protected int fBeginLineNumber;
+
+ /** Beginning column number. */
+ protected int fBeginColumnNumber;
+
+ /** Ending line number. */
+ protected int fEndLineNumber;
+
+ /** Ending column number. */
+ protected int fEndColumnNumber;
+
+ // state
+
+ /** The playback byte stream. */
+ protected PlaybackInputStream fByteStream;
+
+ /** Current entity. */
+ protected CurrentEntity fCurrentEntity;
+
+ /** The current entity stack. */
+ protected final Stack fCurrentEntityStack = new Stack();
+
+ /** The current scanner. */
+ protected Scanner fScanner;
+
+ /** The current scanner state. */
+ protected short fScannerState;
+
+ /** The document handler. */
+ protected XMLDocumentHandler fDocumentHandler;
+
+ /** Auto-detected IANA encoding. */
+ protected String fIANAEncoding;
+
+ /** Auto-detected Java encoding. */
+ protected String fJavaEncoding;
+
+ /** True if the encoding matches "ISO-8859-*". */
+ protected boolean fIso8859Encoding;
+
+ /** Element count. */
+ protected int fElementCount;
+
+ /** Element depth. */
+ protected int fElementDepth;
+
+ // scanners
+
+ /** Content scanner. */
+ protected Scanner fContentScanner = new ContentScanner();
+
+ /**
+ * Special scanner used for elements whose content needs to be scanned
+ * as plain text, ignoring markup such as elements and entity references.
+ * For example: <SCRIPT> and <COMMENT>.
+ */
+ protected SpecialScanner fSpecialScanner = new SpecialScanner();
+
+ // temp vars
+
+ /** String. */
+ protected final XMLString fString = new XMLString();
+
+ /** String buffer. */
+ protected final XMLStringBuffer fStringBuffer = new XMLStringBuffer(1024);
+
+ /** String buffer. */
+ private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer(1024);
+
+ /** Non-normalized attribute string buffer. */
+ private final XMLStringBuffer fNonNormAttr = new XMLStringBuffer(128);
+
+ /** Augmentations. */
+ private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
+
+ /** Location infoset item. */
+ private final LocationItem fLocationItem = new LocationItem();
+
+ /** Single boolean array. */
+ private final boolean[] fSingleBoolean = { false };
+
+ /** Resource identifier. */
+ private final XMLResourceIdentifierImpl fResourceId = new XMLResourceIdentifierImpl();
+
+ //
+ // Public methods
+ //
+
+ /**
+  * Switches scanning to a new input source. The entity that is being
+  * scanned at the time of the call is saved on the current entity
+  * stack, which lets the scanner transparently scan new content (e.g.
+  * the output written by an embedded script) and later return to where
+  * it left off.
+  * <p>
+  * <strong>Note:</strong>
+  * This functionality is experimental at this time and is
+  * subject to change in future releases of NekoHTML.
+  *
+  * @param inputSource The new input source to start scanning. It must
+  *                    provide a character stream.
+  *
+  * @throws IllegalArgumentException Thrown if the input source has no
+  *                                  character stream.
+  */
+ public void pushInputSource(XMLInputSource inputSource) {
+     final Reader characters = inputSource.getCharacterStream();
+     if (characters == null) {
+         throw new IllegalArgumentException("pushed input source has no reader");
+     }
+
+     // remember the entity being scanned before replacing it
+     fCurrentEntityStack.push(fCurrentEntity);
+
+     final String baseId = inputSource.getBaseSystemId();
+     final String literalId = inputSource.getSystemId();
+     fCurrentEntity = new CurrentEntity(characters,
+                                        inputSource.getEncoding(),
+                                        inputSource.getPublicId(),
+                                        baseId,
+                                        literalId,
+                                        expandSystemId(literalId, baseId));
+ } // pushInputSource(XMLInputSource)
+
+ /**
+  * Cleans up used resources. For example, if scanning is terminated
+  * early, then this method ensures that remaining open streams are
+  * closed. I/O errors raised while closing are ignored.
+  *
+  * @param closeall If true, every stream is closed: the current one and
+  *                 all entries of the entity stack. If false, the
+  *                 current stream is closed only when the entity stack
+  *                 is non-empty, and the bottom-most stack entry is
+  *                 left on the stack without being closed.
+  */
+ public void cleanup(boolean closeall) {
+     final int depth = fCurrentEntityStack.size();
+     final boolean nested = depth > 0;
+
+     // the current entity is closed when it is not the original one,
+     // or when the caller asked for everything to be closed
+     if (fCurrentEntity != null && (nested || closeall)) {
+         try {
+             fCurrentEntity.stream.close();
+         }
+         catch (IOException e) {
+             // ignore
+         }
+     }
+
+     if (nested) {
+         // unwind the stack, sparing the bottom entry unless closeall
+         int remaining = closeall ? depth : depth - 1;
+         while (remaining > 0) {
+             fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop();
+             try {
+                 fCurrentEntity.stream.close();
+             }
+             catch (IOException e) {
+                 // ignore
+             }
+             remaining--;
+         }
+     }
+ } // cleanup(boolean)
+
+ //
+ // XMLLocator methods
+ //
+
+ /** Returns the encoding of the current entity, or null if there is none. */
+ public String getEncoding() {
+     if (fCurrentEntity == null) {
+         return null;
+     }
+     return fCurrentEntity.encoding;
+ } // getEncoding():String
+
+ /** Returns the public identifier of the current entity, or null if there is none. */
+ public String getPublicId() {
+     if (fCurrentEntity == null) {
+         return null;
+     }
+     return fCurrentEntity.publicId;
+ } // getPublicId():String
+
+ /** Returns the base system identifier of the current entity, or null if there is none. */
+ public String getBaseSystemId() {
+     if (fCurrentEntity == null) {
+         return null;
+     }
+     return fCurrentEntity.baseSystemId;
+ } // getBaseSystemId():String
+
+ /** Returns the literal system identifier of the current entity, or null if there is none. */
+ public String getLiteralSystemId() {
+     if (fCurrentEntity == null) {
+         return null;
+     }
+     return fCurrentEntity.literalSystemId;
+ } // getLiteralSystemId():String
+
+ /** Returns the expanded system identifier of the current entity, or null if there is none. */
+ public String getExpandedSystemId() {
+     if (fCurrentEntity == null) {
+         return null;
+     }
+     return fCurrentEntity.expandedSystemId;
+ } // getExpandedSystemId():String
+
+ /** Returns the line number of the current entity, or -1 if there is none. */
+ public int getLineNumber() {
+     if (fCurrentEntity == null) {
+         return -1;
+     }
+     return fCurrentEntity.lineNumber;
+ } // getLineNumber():int
+
+ /** Returns the column number of the current entity, or -1 if there is none. */
+ public int getColumnNumber() {
+     if (fCurrentEntity == null) {
+         return -1;
+     }
+     return fCurrentEntity.columnNumber;
+ } // getColumnNumber():int
+
+ //
+ // HTMLComponent methods
+ //
+
+ /**
+  * Returns the default state for a feature.
+  *
+  * @param featureId The feature identifier to look up.
+  *
+  * @return The default registered for the feature, or null if the
+  *         feature is not recognized or has no default.
+  */
+ public Boolean getFeatureDefault(String featureId) {
+     if (RECOGNIZED_FEATURES == null) {
+         return null;
+     }
+     int index = 0;
+     while (index < RECOGNIZED_FEATURES.length) {
+         // the defaults array is parallel to the identifiers array
+         if (RECOGNIZED_FEATURES[index].equals(featureId)) {
+             return RECOGNIZED_FEATURES_DEFAULTS[index];
+         }
+         index++;
+     }
+     return null;
+ } // getFeatureDefault(String):Boolean
+
+ /**
+  * Returns the default state for a property.
+  *
+  * @param propertyId The property identifier to look up.
+  *
+  * @return The default registered for the property, or null if the
+  *         property is not recognized or has no default.
+  */
+ public Object getPropertyDefault(String propertyId) {
+     if (RECOGNIZED_PROPERTIES == null) {
+         return null;
+     }
+     int index = 0;
+     while (index < RECOGNIZED_PROPERTIES.length) {
+         // the defaults array is parallel to the identifiers array
+         if (RECOGNIZED_PROPERTIES[index].equals(propertyId)) {
+             return RECOGNIZED_PROPERTIES_DEFAULTS[index];
+         }
+         index++;
+     }
+     return null;
+ } // getPropertyDefault(String):Object
+
+ //
+ // XMLComponent methods
+ //
+
+ /**
+  * Returns recognized features.
+  *
+  * @return A copy of the list of feature identifiers recognized by this
+  *         component. A copy is handed out so that callers cannot
+  *         modify the shared static array used by
+  *         getFeatureDefault(String).
+  */
+ public String[] getRecognizedFeatures() {
+     return (String[])RECOGNIZED_FEATURES.clone();
+ } // getRecognizedFeatures():String[]
+
+ /**
+  * Returns recognized properties.
+  *
+  * @return A copy of the list of property identifiers recognized by
+  *         this component. A copy is handed out so that callers cannot
+  *         modify the shared static array used by
+  *         getPropertyDefault(String).
+  */
+ public String[] getRecognizedProperties() {
+     return (String[])RECOGNIZED_PROPERTIES.clone();
+ } // getRecognizedProperties():String[]
+
+ /**
+ * Resets the component by re-reading, from the given component manager,
+ * every feature and property that this scanner keeps a cached copy of.
+ *
+ * @param manager The component manager that is queried for the current
+ * feature states and property values.
+ *
+ * @throws XMLConfigurationException Declared for failures while
+ * querying the manager.
+ */
+ public void reset(XMLComponentManager manager)
+ throws XMLConfigurationException {
+
+ // get features
+ fAugmentations = manager.getFeature(AUGMENTATIONS);
+ fReportErrors = manager.getFeature(REPORT_ERRORS);
+ fNotifyCharRefs = manager.getFeature(NOTIFY_CHAR_REFS);
+ fNotifyXmlBuiltinRefs = manager.getFeature(NOTIFY_XML_BUILTIN_REFS);
+ fNotifyHtmlBuiltinRefs = manager.getFeature(NOTIFY_HTML_BUILTIN_REFS);
+ fFixWindowsCharRefs = manager.getFeature(FIX_MSWINDOWS_REFS);
+ fScriptStripCDATADelims = manager.getFeature(SCRIPT_STRIP_CDATA_DELIMS);
+ fScriptStripCommentDelims = manager.getFeature(SCRIPT_STRIP_COMMENT_DELIMS);
+ fStyleStripCDATADelims = manager.getFeature(STYLE_STRIP_CDATA_DELIMS);
+ fStyleStripCommentDelims = manager.getFeature(STYLE_STRIP_COMMENT_DELIMS);
+ fIgnoreSpecifiedCharset = manager.getFeature(IGNORE_SPECIFIED_CHARSET);
+ fCDATASections = manager.getFeature(CDATA_SECTIONS);
+ fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE);
+ fInsertDoctype = manager.getFeature(INSERT_DOCTYPE);
+
+ // get properties
+ // NOTE: String.valueOf(Object) turns a null value into the string
+ // "null" instead of a null reference; for the names properties
+ // getNamesValue(String) maps any such unknown string to
+ // NAMES_NO_CHANGE.
+ fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
+ fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
+ fDefaultIANAEncoding = String.valueOf(manager.getProperty(DEFAULT_ENCODING));
+ fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER);
+ fDoctypePubid = String.valueOf(manager.getProperty(DOCTYPE_PUBID));
+ fDoctypeSysid = String.valueOf(manager.getProperty(DOCTYPE_SYSID));
+
+ } // reset(XMLComponentManager)
+
+ /**
+  * Sets a feature. Every feature that reset(XMLComponentManager) reads
+  * is also applied here, so that a change made between resets is not
+  * silently dropped. Unrecognized feature identifiers are ignored.
+  *
+  * @param featureId The feature identifier.
+  * @param state     The new state of the feature.
+  *
+  * @throws XMLConfigurationException Declared by the component
+  *                                   interface; not thrown by this
+  *                                   implementation.
+  */
+ public void setFeature(String featureId, boolean state)
+     throws XMLConfigurationException {
+
+     if (featureId.equals(AUGMENTATIONS)) {
+         fAugmentations = state;
+     }
+     else if (featureId.equals(REPORT_ERRORS)) {
+         fReportErrors = state;
+     }
+     else if (featureId.equals(NOTIFY_CHAR_REFS)) {
+         fNotifyCharRefs = state;
+     }
+     else if (featureId.equals(NOTIFY_XML_BUILTIN_REFS)) {
+         fNotifyXmlBuiltinRefs = state;
+     }
+     else if (featureId.equals(NOTIFY_HTML_BUILTIN_REFS)) {
+         fNotifyHtmlBuiltinRefs = state;
+     }
+     else if (featureId.equals(FIX_MSWINDOWS_REFS)) {
+         fFixWindowsCharRefs = state;
+     }
+     else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) {
+         fScriptStripCDATADelims = state;
+     }
+     else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) {
+         fScriptStripCommentDelims = state;
+     }
+     else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) {
+         fStyleStripCDATADelims = state;
+     }
+     else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) {
+         fStyleStripCommentDelims = state;
+     }
+     else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
+         fIgnoreSpecifiedCharset = state;
+     }
+     else if (featureId.equals(CDATA_SECTIONS)) {
+         fCDATASections = state;
+     }
+     else if (featureId.equals(OVERRIDE_DOCTYPE)) {
+         fOverrideDoctype = state;
+     }
+     else if (featureId.equals(INSERT_DOCTYPE)) {
+         fInsertDoctype = state;
+     }
+
+ } // setFeature(String,boolean)
+
+ /**
+  * Sets a property. Every property that reset(XMLComponentManager)
+  * reads is also applied here, using the same conversions, so that a
+  * change made between resets is not silently dropped. Unrecognized
+  * property identifiers are ignored.
+  *
+  * @param propertyId The property identifier.
+  * @param value      The new value of the property.
+  *
+  * @throws XMLConfigurationException Declared by the component
+  *                                   interface; not thrown by this
+  *                                   implementation.
+  */
+ public void setProperty(String propertyId, Object value)
+     throws XMLConfigurationException {
+
+     if (propertyId.equals(NAMES_ELEMS)) {
+         fNamesElems = getNamesValue(String.valueOf(value));
+         return;
+     }
+
+     if (propertyId.equals(NAMES_ATTRS)) {
+         fNamesAttrs = getNamesValue(String.valueOf(value));
+         return;
+     }
+
+     if (propertyId.equals(DEFAULT_ENCODING)) {
+         fDefaultIANAEncoding = String.valueOf(value);
+         return;
+     }
+
+     if (propertyId.equals(ERROR_REPORTER)) {
+         fErrorReporter = (HTMLErrorReporter)value;
+         return;
+     }
+
+     if (propertyId.equals(DOCTYPE_PUBID)) {
+         fDoctypePubid = String.valueOf(value);
+         return;
+     }
+
+     if (propertyId.equals(DOCTYPE_SYSID)) {
+         fDoctypeSysid = String.valueOf(value);
+         return;
+     }
+
+ } // setProperty(String,Object)
+
+ //
+ // XMLDocumentScanner methods
+ //
+
+ /**
+  * Sets the input source and prepares the scanner for a new document:
+  * counters, locator boundaries, the entity stack and the encoding
+  * information are reset, and the content scanner is installed in the
+  * start-document state.
+  * <p>
+  * If the source supplies no character stream, its byte stream is used
+  * (opened from the expanded system identifier when the source has
+  * none). The encoding is then taken from the source if it declares
+  * one, otherwise auto-detected from the stream, otherwise the default
+  * encoding is used.
+  *
+  * @param source The input source to scan.
+  *
+  * @throws IOException Thrown if the stream cannot be opened or a
+  *                     reader cannot be created for the encoding.
+  */
+ public void setInputSource(XMLInputSource source) throws IOException {
+
+     // reset state
+     fElementCount = 0;
+     fElementDepth = -1;
+     fByteStream = null;
+     fCurrentEntityStack.removeAllElements();
+
+     fBeginLineNumber = 1;
+     fBeginColumnNumber = 1;
+     fEndLineNumber = fBeginLineNumber;
+     fEndColumnNumber = fBeginColumnNumber;
+
+     // reset encoding information
+     fIANAEncoding = fDefaultIANAEncoding;
+     fJavaEncoding = fIANAEncoding;
+
+     // get location information
+     String encoding = source.getEncoding();
+     String publicId = source.getPublicId();
+     String baseSystemId = source.getBaseSystemId();
+     String literalSystemId = source.getSystemId();
+     String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
+
+     // open stream
+     Reader reader = source.getCharacterStream();
+     if (reader == null) {
+         InputStream inputStream = source.getByteStream();
+         if (inputStream == null) {
+             URL url = new URL(expandedSystemId);
+             inputStream = url.openStream();
+         }
+         fByteStream = new PlaybackInputStream(inputStream);
+         // encodings[0] is the IANA name, encodings[1] the Java name
+         String[] encodings = new String[2];
+         if (encoding == null) {
+             fByteStream.detectEncoding(encodings);
+         }
+         else {
+             encodings[0] = encoding;
+         }
+         if (encodings[0] == null) {
+             encodings[0] = fDefaultIANAEncoding;
+             if (fReportErrors) {
+                 fErrorReporter.reportWarning("HTML1000", null);
+             }
+         }
+         if (encodings[1] == null) {
+             // Encoding names are ASCII: upper-case them with a fixed
+             // locale so that the lookup does not depend on the default
+             // locale (e.g. Turkish maps 'i' to a dotted capital I).
+             encodings[1] = EncodingMap.getIANA2JavaMapping(
+                 encodings[0].toUpperCase(java.util.Locale.ENGLISH));
+             if (encodings[1] == null) {
+                 encodings[1] = encodings[0];
+                 if (fReportErrors) {
+                     fErrorReporter.reportWarning("HTML1001", new Object[]{encodings[0]});
+                 }
+             }
+         }
+         fIANAEncoding = encodings[0];
+         fJavaEncoding = encodings[1];
+         /* PATCH: Asgeir Asgeirsson */
+         fIso8859Encoding = fIANAEncoding == null
+                         || fIANAEncoding.toUpperCase(java.util.Locale.ENGLISH).startsWith("ISO-8859")
+                         || fIANAEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
+         encoding = fIANAEncoding;
+         reader = new InputStreamReader(fByteStream, fJavaEncoding);
+     }
+     fCurrentEntity = new CurrentEntity(reader, encoding,
+                                        publicId, baseSystemId,
+                                        literalSystemId, expandedSystemId);
+
+     // set scanner and state
+     setScanner(fContentScanner);
+     setScannerState(STATE_START_DOCUMENT);
+
+ } // setInputSource(XMLInputSource)
+
+ /** Scans the document. */
+ public boolean scanDocument(boolean complete) throws XNIException, IOException {
+ do {
+ if (!fScanner.scan(complete)) {
+ return false;
+ }
+ } while (complete);
+ return true;
+ } // scanDocument(boolean):boolean
+
+ /**
+  * Sets the document handler that receives the scanner's callbacks.
+  *
+  * @param handler The new document handler; may be null.
+  */
+ public void setDocumentHandler(XMLDocumentHandler handler) {
+     this.fDocumentHandler = handler;
+ } // setDocumentHandler(XMLDocumentHandler)
+
+ // @since Xerces 2.1.0
+
+ /**
+  * Returns the document handler.
+  *
+  * @return The handler last passed to setDocumentHandler, or null if
+  *         none has been set.
+  */
+ public XMLDocumentHandler getDocumentHandler() {
+     return this.fDocumentHandler;
+ } // getDocumentHandler():XMLDocumentHandler
+
+ //
+ // Protected static methods
+ //
+
+ /** Returns the value of the specified attribute, ignoring case. */
+ protected static String getValue(XMLAttributes attrs, String aname) {
+ int length = attrs != null ? attrs.getLength() : 0;
+ for (int i = 0; i < length; i++) {
+ if (attrs.getQName(i).equalsIgnoreCase(aname)) {
+ return attrs.getValue(i);
+ }
+ }
+ return null;
+ } // getValue(XMLAttributes,String):String
+
+ /**
+ * Expands a system id against a base system id and returns the result
+ * as a URI string.
+ * <p>
+ * The given system id is returned unchanged when it is null or empty,
+ * when the URI constructor accepts it on its own (it is then treated
+ * as already expanded), or when it cannot be resolved against the base.
+ * A malformed URI is never reported to the caller. When the base
+ * system id is null, empty or equal to the system id, the directory
+ * named by the "user.dir" system property is used as the base.
+ *
+ * @param systemId The systemId to be expanded.
+ * @param baseSystemId The system id that systemId is resolved against;
+ * may be null.
+ *
+ * @return Returns the URI string representing the expanded system
+ * identifier, or the given system identifier itself if no
+ * expansion was necessary or possible.
+ *
+ */
+ public static String expandSystemId(String systemId, String baseSystemId) {
+
+ // check for bad parameters id
+ if (systemId == null || systemId.length() == 0) {
+ return systemId;
+ }
+ // if id already expanded, return
+ try {
+ URI uri = new URI(systemId);
+ // always true at this point: the constructor either returned an
+ // object or threw
+ if (uri != null) {
+ return systemId;
+ }
+ }
+ catch (URI.MalformedURIException e) {
+ // continue on...
+ }
+ // normalize id
+ String id = fixURI(systemId);
+
+ // normalize base
+ URI base = null;
+ URI uri = null;
+ try {
+ if (baseSystemId == null || baseSystemId.length() == 0 ||
+ baseSystemId.equals(systemId)) {
+ // no usable base: fall back to the current working directory
+ String dir;
+ try {
+ dir = fixURI(System.getProperty("user.dir"));
+ }
+ catch (SecurityException se) {
+ dir = "";
+ }
+ if (!dir.endsWith("/")) {
+ dir = dir + "/";
+ }
+ base = new URI("file", "", dir, null, null);
+ }
+ else {
+ try {
+ base = new URI(fixURI(baseSystemId));
+ }
+ catch (URI.MalformedURIException e) {
+ // the base is not a URI by itself: treat it as a file name
+ String dir;
+ try {
+ dir = fixURI(System.getProperty("user.dir"));
+ }
+ catch (SecurityException se) {
+ dir = "";
+ }
+ if (baseSystemId.indexOf(':') != -1) {
+ // for xml schemas we might have baseURI with
+ // a specified drive
+ // (dir is not used on this path)
+ base = new URI("file", "", fixURI(baseSystemId), null, null);
+ }
+ else {
+ // base is taken relative to the working directory
+ if (!dir.endsWith("/")) {
+ dir = dir + "/";
+ }
+ dir = dir + fixURI(baseSystemId);
+ base = new URI("file", "", dir, null, null);
+ }
+ }
+ }
+ // expand id
+ uri = new URI(base, id);
+ }
+ catch (URI.MalformedURIException e) {
+ // let it go through
+ // (uri stays null and the original systemId is returned below)
+ }
+
+ if (uri == null) {
+ return systemId;
+ }
+ return uri.toString();
+
+ } // expandSystemId(String,String):String
+
+ /**
+ * Fixes a platform dependent filename to standard URI form.
+ *
+ * @param str The string to fix.
+ *
+ * @return Returns the fixed URI string.
+ */
+ protected static String fixURI(String str) {
+
+ // handle platform dependent strings
+ str = str.replace(java.io.File.separatorChar, '/');
+
+ // Windows fix
+ if (str.length() >= 2) {
+ char ch1 = str.charAt(1);
+ // change "C:blah" to "/C:blah"
+ if (ch1 == ':') {
+ char ch0 = Character.toUpperCase(str.charAt(0));
+ if (ch0 >= 'A' && ch0 <= 'Z') {
+ str = "/" + str;
+ }
+ }
+ // change "//blah" to "file://blah"
+ else if (ch1 == '/' && str.charAt(0) == '/') {
+ str = "file:" + str;
+ }
+ }
+
+ // done
+ return str;
+
+ } // fixURI(String):String
+
+ /**
+  * Modifies the given name based on the specified mode.
+  * <p>
+  * The case conversion uses a fixed locale so that the result does not
+  * depend on the default locale of the JVM (under a Turkish locale,
+  * for example, "title" would otherwise be upper-cased with a dotted
+  * capital I).
+  *
+  * @param name The name to modify.
+  * @param mode One of NAMES_UPPERCASE, NAMES_LOWERCASE or
+  *             NAMES_NO_CHANGE.
+  *
+  * @return The upper-cased or lower-cased name, or the name itself for
+  *         any other mode.
+  */
+ protected static final String modifyName(String name, short mode) {
+     switch (mode) {
+         case NAMES_UPPERCASE: return name.toUpperCase(java.util.Locale.ENGLISH);
+         case NAMES_LOWERCASE: return name.toLowerCase(java.util.Locale.ENGLISH);
+     }
+     return name;
+ } // modifyName(String,short):String
+
+ /**
+  * Converts HTML names string value to constant value: "lower" and
+  * "upper" select the corresponding case conversion and every other
+  * string means that names are left unchanged.
+  *
+  * @param value The string value of a names property; must not be null.
+  *
+  * @return NAMES_LOWERCASE, NAMES_UPPERCASE or NAMES_NO_CHANGE.
+  *
+  * @see #NAMES_NO_CHANGE
+  * @see #NAMES_LOWERCASE
+  * @see #NAMES_UPPERCASE
+  */
+ protected static final short getNamesValue(String value) {
+     final boolean lower = value.equals("lower");
+     if (!lower && !value.equals("upper")) {
+         return NAMES_NO_CHANGE;
+     }
+     return lower ? NAMES_LOWERCASE : NAMES_UPPERCASE;
+ } // getNamesValue(String):short
+
+ /**
+  * Fixes Microsoft Windows® specific characters: a value in the range
+  * 128-159 that is assigned a character in the Windows-1252 code page
+  * is replaced by the Unicode code point of that character.
+  * <p>
+  * Details about this common problem can be found at
+  * <a href='http://www.cs.tut.fi/~jkorpela/www/windows-chars.html'>http://www.cs.tut.fi/~jkorpela/www/windows-chars.html</a>
+  *
+  * @param origChar The character value to fix.
+  *
+  * @return The corresponding Unicode code point, or origChar itself if
+  *         Windows-1252 assigns no character to the value (129, 141,
+  *         143, 144, 157) or the value is outside the range 128-159.
+  */
+ protected int fixWindowsCharacter(int origChar) {
+     /* PATCH: Asgeir Asgeirsson */
+     switch(origChar) {
+         case 128: return 8364; // euro sign
+         case 130: return 8218;
+         case 131: return 402;
+         case 132: return 8222;
+         case 133: return 8230;
+         case 134: return 8224;
+         case 135: return 8225;
+         case 136: return 710;
+         case 137: return 8240;
+         case 138: return 352;
+         case 139: return 8249;
+         case 140: return 338;
+         case 142: return 381; // capital Z with caron
+         case 145: return 8216;
+         case 146: return 8217;
+         case 147: return 8220;
+         case 148: return 8221;
+         case 149: return 8226;
+         case 150: return 8211;
+         case 151: return 8212;
+         case 152: return 732;
+         case 153: return 8482;
+         case 154: return 353;
+         case 155: return 8250;
+         case 156: return 339;
+         case 158: return 382; // small z with caron
+         case 159: return 376;
+     }
+     return origChar;
+ } // fixWindowsCharacter(int):int
+
+ //
+ // Protected methods
+ //
+
+ // i/o
+
+ /** Reads a single character. */
+ protected int read() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(read: ");
+ printBuffer();
+ System.out.println();
+ }
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ if (load(0) == -1) {
+ if (DEBUG_BUFFER) {
+ System.out.println(")read: -> -1");
+ }
+ return -1;
+ }
+ }
+ int c = fCurrentEntity.buffer[fCurrentEntity.offset++];
+ fCurrentEntity.columnNumber++;
+ if (DEBUG_BUFFER) {
+ System.out.print(")read: ");
+ printBuffer();
+ System.out.print(" -> ");
+ System.out.print(c);
+ System.out.println();
+ }
+ return c;
+ } // read():int
+
+ /**
+ * Loads a new chunk of data into the buffer and returns the number of
+ * characters loaded or -1 if no additional characters were loaded.
+ *
+ * @param offset The offset at which new characters should be loaded.
+ */
+ protected int load(int offset) throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(load: ");
+ printBuffer();
+ System.out.println();
+ }
+ // resize buffer, if needed
+ if (offset == fCurrentEntity.buffer.length) {
+ int adjust = fCurrentEntity.buffer.length / 4;
+ char[] array = new char[fCurrentEntity.buffer.length + adjust];
+ System.arraycopy(fCurrentEntity.buffer, 0, array, 0, fCurrentEntity.length);
+ fCurrentEntity.buffer = array;
+ }
+ // read a block of characters
+ int count = fCurrentEntity.stream.read(fCurrentEntity.buffer, offset, fCurrentEntity.buffer.length - offset);
+ fCurrentEntity.length = count != -1 ? count + offset : offset;
+ fCurrentEntity.offset = offset;
+ if (DEBUG_BUFFER) {
+ System.out.print(")load: ");
+ printBuffer();
+ System.out.print(" -> ");
+ System.out.print(count);
+ System.out.println();
+ }
+ return count;
+ } // load():int
+
+ // debugging
+
+ /** Sets the scanner. */
+ protected void setScanner(Scanner scanner) {
+ fScanner = scanner;
+ if (DEBUG_SCANNER) {
+ System.out.print("$$$ setScanner(");
+ System.out.print(scanner!=null?scanner.getClass().getName():"null");
+ System.out.println(");");
+ }
+ } // setScanner(Scanner)
+
+ /**
+  * Sets the scanner state; the change is traced on standard output
+  * when DEBUG_SCANNER_STATE is enabled.
+  *
+  * @param state The new scanner state, one of the STATE_* constants.
+  */
+ protected void setScannerState(short state) {
+     fScannerState = state;
+     if (DEBUG_SCANNER_STATE) {
+         // states without a known name are traced with an empty label
+         String label = "";
+         if (state == STATE_CONTENT) {
+             label = "STATE_CONTENT";
+         }
+         else if (state == STATE_MARKUP_BRACKET) {
+             label = "STATE_MARKUP_BRACKET";
+         }
+         else if (state == STATE_START_DOCUMENT) {
+             label = "STATE_START_DOCUMENT";
+         }
+         else if (state == STATE_END_DOCUMENT) {
+             label = "STATE_END_DOCUMENT";
+         }
+         System.out.println("$$$ setScannerState(" + label + ");");
+     }
+ } // setScannerState(short)
+
+ // scanning
+
+ /**
+ * Scans a DOCTYPE line: reads the root element name and the optional
+ * PUBLIC / SYSTEM identifiers, skips the rest of the declaration and
+ * reports it to the document handler, if one is set.
+ * <p>
+ * When the override-doctype feature is enabled, the identifiers that
+ * were scanned are replaced by the configured doctype identifiers
+ * before the handler is called.
+ */
+ protected void scanDoctype() throws IOException {
+ String root = null;
+ String pubid = null;
+ String sysid = null;
+
+ // the root name and the identifiers are only looked for when each
+ // is preceded by whitespace
+ if (skipSpaces()) {
+ root = scanName();
+ if (root == null) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1014", null);
+ }
+ }
+ else {
+ root = modifyName(root, fNamesElems);
+ }
+ if (skipSpaces()) {
+ // the keywords are matched case-insensitively
+ if (skip("PUBLIC", false)) {
+ skipSpaces();
+ pubid = scanLiteral();
+ if (skipSpaces()) {
+ sysid = scanLiteral();
+ }
+ }
+ else if (skip("SYSTEM", false)) {
+ skipSpaces();
+ sysid = scanLiteral();
+ }
+ }
+ }
+ // skip whatever is left of the declaration
+ int c;
+ while ((c = read()) != -1) {
+ if (c == '<') {
+ // start of other markup: push the character back and stop
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ break;
+ }
+ if (c == '>') {
+ break;
+ }
+ if (c == '[') {
+ // bracketed section: skip it with balanced angle brackets
+ skipMarkup(true);
+ break;
+ }
+ }
+
+ if (fDocumentHandler != null) {
+ if (fOverrideDoctype) {
+ pubid = fDoctypePubid;
+ sysid = fDoctypeSysid;
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.doctypeDecl(root, pubid, sysid, locationAugs());
+ }
+
+ } // scanDoctype()
+
+ /**
+  * Scans a quoted literal. The literal ends at the matching quote; it
+  * also ends, without consuming the character, at a '<'. Each run of
+  * newlines inside the literal is collapsed to a single space.
+  *
+  * @return The text between the quotes, or null if the next character
+  *         is not a single or double quote. In that case the character
+  *         is pushed back; at the end of the input nothing was read, so
+  *         nothing is pushed back.
+  *
+  * @throws EOFException Thrown if the input ends inside the literal.
+  */
+ protected String scanLiteral() throws IOException {
+     int quote = read();
+     if (quote == '\'' || quote == '"') {
+         StringBuffer str = new StringBuffer();
+         int c;
+         while ((c = read()) != -1) {
+             if (c == quote) {
+                 break;
+             }
+             if (c == '\r' || c == '\n') {
+                 fCurrentEntity.offset--;
+                 fCurrentEntity.columnNumber--;
+                 // NOTE: This collapses newlines to a single space.
+                 //       [Q] Is this the right thing to do here? -Ac
+                 skipNewlines();
+                 str.append(' ');
+             }
+             else if (c == '<') {
+                 fCurrentEntity.offset--;
+                 fCurrentEntity.columnNumber--;
+                 break;
+             }
+             else {
+                 str.append((char)c);
+             }
+         }
+         if (c == -1) {
+             if (fReportErrors) {
+                 fErrorReporter.reportError("HTML1007", null);
+             }
+             throw new EOFException();
+         }
+         return str.toString();
+     }
+     else if (quote != -1) {
+         // read() consumes nothing when it returns -1, so only a real
+         // character may be pushed back; otherwise the offset would
+         // drop below the start of the buffer
+         fCurrentEntity.offset--;
+         fCurrentEntity.columnNumber--;
+     }
+     return null;
+ } // scanLiteral():String
+
+ /**
+ * Scans a name: the longest run of letters, digits and the characters
+ * '-', '.', ':' and '_' starting at the current position. The column
+ * number is advanced for every character of the name.
+ *
+ * @return The name, or null if the input is exhausted or the next
+ * character cannot be part of a name.
+ */
+ protected String scanName() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(scanName: ");
+ printBuffer();
+ System.out.println();
+ }
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ if (load(0) == -1) {
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanName: ");
+ printBuffer();
+ System.out.println(" -> null");
+ }
+ return null;
+ }
+ }
+ // buffer position at which the name starts
+ int offset = fCurrentEntity.offset;
+ while (true) {
+ // consume name characters that are already in the buffer
+ while (fCurrentEntity.offset < fCurrentEntity.length) {
+ char c = fCurrentEntity.buffer[fCurrentEntity.offset];
+ if (!Character.isLetterOrDigit(c) &&
+ !(c == '-' || c == '.' || c == ':' || c == '_')) {
+ break;
+ }
+ fCurrentEntity.offset++;
+ fCurrentEntity.columnNumber++;
+ }
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ // the name may continue past the buffered data: move the part
+ // scanned so far to the front of the buffer and load more
+ // characters behind it
+ int length = fCurrentEntity.length - offset;
+ System.arraycopy(fCurrentEntity.buffer, offset, fCurrentEntity.buffer, 0, length);
+ int count = load(length);
+ offset = 0;
+ if (count == -1) {
+ break;
+ }
+ }
+ else {
+ break;
+ }
+ }
+ int length = fCurrentEntity.offset - offset;
+ String name = length > 0 ? new String(fCurrentEntity.buffer, offset, length) : null;
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanName: ");
+ printBuffer();
+ System.out.print(" -> \"");
+ System.out.print(name);
+ System.out.println('"');
+ }
+ return name;
+ } // scanName():String
+
+ /**
+  * Scans an entity reference. The leading '&' is assumed to have been
+  * consumed by the caller; this method reads up to and including the
+  * terminating ';'.
+  * <p>
+  * Numeric character references may be decimal ("#65") or hexadecimal
+  * with either prefix case ("#x41", "#X41"). A reference to a code
+  * point above U+FFFF is reported as a surrogate pair. Text that does
+  * not form a resolvable reference is reported unchanged as characters.
+  *
+  * @param str     The buffer used to collect the reference text and to
+  *                pass characters to the document handler.
+  * @param content True if the result is to be reported to the document
+  *                handler; nothing is reported otherwise.
+  *
+  * @return The value of the referenced character, or -1 if the
+  *         reference is not terminated, cannot be parsed or is unknown.
+  */
+ protected int scanEntityRef(XMLStringBuffer str, boolean content)
+     throws IOException {
+     str.clear();
+     str.append('&');
+     while (true) {
+         int c = read();
+         if (c == ';') {
+             str.append(';');
+             break;
+         }
+         if (c == -1) {
+             if (fReportErrors) {
+                 fErrorReporter.reportWarning("HTML1004", null);
+             }
+             if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+                 fEndLineNumber = fCurrentEntity.lineNumber;
+                 fEndColumnNumber = fCurrentEntity.columnNumber;
+                 fDocumentHandler.characters(str, locationAugs());
+             }
+             return -1;
+         }
+         if (!Character.isLetterOrDigit((char)c) && c != '#') {
+             if (fReportErrors) {
+                 fErrorReporter.reportWarning("HTML1004", null);
+             }
+             // not part of a reference: push the character back
+             fCurrentEntity.offset--;
+             fCurrentEntity.columnNumber--;
+             if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+                 fEndLineNumber = fCurrentEntity.lineNumber;
+                 fEndColumnNumber = fCurrentEntity.columnNumber;
+                 fDocumentHandler.characters(str, locationAugs());
+             }
+             return -1;
+         }
+         str.append((char)c);
+     }
+     if (str.length == 1) {
+         if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+             fEndLineNumber = fCurrentEntity.lineNumber;
+             fEndColumnNumber = fCurrentEntity.columnNumber;
+             fDocumentHandler.characters(str, locationAugs());
+         }
+         return -1;
+     }
+
+     // the reference text without the leading '&' and the trailing ';'
+     String name = str.toString().substring(1, str.length-1);
+     if (name.startsWith("#")) {
+         int value = -1;
+         try {
+             // the hexadecimal prefix is accepted in either case
+             if (name.startsWith("#x") || name.startsWith("#X")) {
+                 value = Integer.parseInt(name.substring(2), 16);
+             }
+             else {
+                 value = Integer.parseInt(name.substring(1));
+             }
+             /* PATCH: Asgeir Asgeirsson */
+             if (fFixWindowsCharRefs && fIso8859Encoding) {
+                 value = fixWindowsCharacter(value);
+             }
+             if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+                 fEndLineNumber = fCurrentEntity.lineNumber;
+                 fEndColumnNumber = fCurrentEntity.columnNumber;
+                 if (fNotifyCharRefs) {
+                     XMLResourceIdentifier id = resourceId();
+                     String encoding = null;
+                     fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs());
+                 }
+                 str.clear();
+                 if (value > 0xFFFF && value <= 0x10FFFF) {
+                     // a supplementary code point does not fit into a
+                     // single char: emit it as a surrogate pair
+                     int supplementary = value - 0x10000;
+                     str.append((char)(0xD800 + (supplementary >> 10)));
+                     str.append((char)(0xDC00 + (supplementary & 0x3FF)));
+                 }
+                 else {
+                     str.append((char)value);
+                 }
+                 fDocumentHandler.characters(str, locationAugs());
+                 if (fNotifyCharRefs) {
+                     fDocumentHandler.endGeneralEntity(name, locationAugs());
+                 }
+             }
+         }
+         catch (NumberFormatException e) {
+             if (fReportErrors) {
+                 fErrorReporter.reportError("HTML1005", new Object[]{name});
+             }
+             // str still holds the reference as it was written
+             if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+                 fEndLineNumber = fCurrentEntity.lineNumber;
+                 fEndColumnNumber = fCurrentEntity.columnNumber;
+                 fDocumentHandler.characters(str, locationAugs());
+             }
+         }
+         return value;
+     }
+
+     int c = HTMLEntities.get(name);
+     if (c == -1) {
+         if (fReportErrors) {
+             fErrorReporter.reportWarning("HTML1006", new Object[]{name});
+         }
+         if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+             fEndLineNumber = fCurrentEntity.lineNumber;
+             fEndColumnNumber = fCurrentEntity.columnNumber;
+             fDocumentHandler.characters(str, locationAugs());
+         }
+         return -1;
+     }
+     if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
+         fEndLineNumber = fCurrentEntity.lineNumber;
+         fEndColumnNumber = fCurrentEntity.columnNumber;
+         boolean notify = fNotifyHtmlBuiltinRefs || (fNotifyXmlBuiltinRefs && builtinXmlRef(name));
+         if (notify) {
+             XMLResourceIdentifier id = resourceId();
+             String encoding = null;
+             fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs());
+         }
+         str.clear();
+         str.append((char)c);
+         fDocumentHandler.characters(str, locationAugs());
+         if (notify) {
+             fDocumentHandler.endGeneralEntity(name, locationAugs());
+         }
+     }
+     return c;
+
+ } // scanEntityRef(XMLStringBuffer,boolean):int
+
+ /**
+  * Returns true if the specified text is present and is skipped. If the
+  * text is not present, nothing is consumed: both the buffer offset and
+  * the column number are restored to their values at the time of the
+  * call.
+  *
+  * @param s             The text to look for; null and the empty string
+  *                      always match.
+  * @param caseSensitive True to compare the characters exactly; false
+  *                      to compare them after upper-casing both sides.
+  *
+  * @return True if the text was found at the current position.
+  */
+ protected boolean skip(String s, boolean caseSensitive) throws IOException {
+     int length = s != null ? s.length() : 0;
+     for (int i = 0; i < length; i++) {
+         if (fCurrentEntity.offset == fCurrentEntity.length) {
+             // keep the i characters matched so far at the front of the
+             // buffer and load more characters behind them
+             System.arraycopy(fCurrentEntity.buffer, fCurrentEntity.offset - i, fCurrentEntity.buffer, 0, i);
+             if (load(i) == -1) {
+                 // end of input: un-read the i characters consumed so far
+                 fCurrentEntity.offset = 0;
+                 fCurrentEntity.columnNumber -= i;
+                 return false;
+             }
+         }
+         char c0 = s.charAt(i);
+         char c1 = fCurrentEntity.buffer[fCurrentEntity.offset++];
+         fCurrentEntity.columnNumber++;
+         if (!caseSensitive) {
+             c0 = Character.toUpperCase(c0);
+             c1 = Character.toUpperCase(c1);
+         }
+         if (c0 != c1) {
+             // mismatch: un-read the i + 1 characters consumed so far
+             fCurrentEntity.offset -= i + 1;
+             fCurrentEntity.columnNumber -= i + 1;
+             return false;
+         }
+     }
+     return true;
+ } // skip(String):boolean
+
+ /**
+ * Skips markup up to and including the '>' that closes it.
+ * <p>
+ * Scanning starts with a nesting depth of one. Every '>' (also when it
+ * is part of "/>") decrements the depth and scanning stops as soon as
+ * the depth reaches zero. When <code>balance</code> is true every '<'
+ * increments the depth. Scanning also stops at the end of the input.
+ *
+ * @param balance True if nested '<' characters should be balanced
+ * against their closing '>' characters.
+ *
+ * @return True if a "/>" sequence was seen while skipping.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ protected boolean skipMarkup(boolean balance) throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(skipMarkup: ");
+ printBuffer();
+ System.out.println();
+ }
+ int depth = 1;
+ boolean slashgt = false;
+ OUTER: while (true) {
+ // refill the buffer when it is exhausted; stop at end of input
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ if (load(0) == -1) {
+ break OUTER;
+ }
+ }
+ while (fCurrentEntity.offset < fCurrentEntity.length) {
+ char c = fCurrentEntity.buffer[fCurrentEntity.offset++];
+ fCurrentEntity.columnNumber++;
+ if (balance && c == '<') {
+ depth++;
+ }
+ else if (c == '>') {
+ depth--;
+ if (depth == 0) {
+ break OUTER;
+ }
+ }
+ else if (c == '/') {
+ // look one character ahead for "/>"
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ if (load(0) == -1) {
+ break OUTER;
+ }
+ }
+ c = fCurrentEntity.buffer[fCurrentEntity.offset++];
+ fCurrentEntity.columnNumber++;
+ if (c == '>') {
+ slashgt = true;
+ depth--;
+ if (depth == 0) {
+ break OUTER;
+ }
+ }
+ else {
+ // not "/>": push the lookahead character back
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ }
+ }
+ else if (c == '\r' || c == '\n') {
+ // NOTE(review): the newline character itself has already been
+ // consumed above, so skipNewlines() only sees what follows it;
+ // confirm that line counting here is as intended.
+ skipNewlines();
+ }
+ }
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")skipMarkup: ");
+ printBuffer();
+ System.out.print(" -> "+slashgt);
+ System.out.println();
+ }
+ return slashgt;
+ } // skipMarkup(boolean):boolean
+
+ /** Skips whitespace. */
+ protected boolean skipSpaces() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(skipSpaces: ");
+ printBuffer();
+ System.out.println();
+ }
+ boolean spaces = false;
+ while (true) {
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ if (load(0) == -1) {
+ break;
+ }
+ }
+ char c = fCurrentEntity.buffer[fCurrentEntity.offset];
+ if (!Character.isSpace(c)) {
+ break;
+ }
+ spaces = true;
+ if (c == '\r' || c == '\n') {
+ skipNewlines();
+ continue;
+ }
+ fCurrentEntity.offset++;
+ fCurrentEntity.columnNumber++;
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")skipSpaces: ");
+ printBuffer();
+ System.out.print(" -> ");
+ System.out.print(spaces);
+ System.out.println();
+ }
+ return spaces;
+ } // skipSpaces()
+
+ /** Skips newlines and returns the number of newlines skipped. */
+ protected int skipNewlines() throws IOException {
+ return skipNewlines(Integer.MAX_VALUE);
+ } // skipNewlines():int
+
+ /**
+ * Skips newlines at the current position and returns the number of
+ * newlines skipped. A carriage return immediately followed by a line
+ * feed counts as a single newline. When at least one newline character
+ * is present, the entity's line number is advanced by the number of
+ * newlines skipped and its column number is reset to 1.
+ *
+ * @param maxlines The loop stops once this many newlines were counted.
+ *
+ * @return The number of newlines skipped; 0 if the current character
+ * is not a newline or the end of input was reached.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ protected int skipNewlines(int maxlines) throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(skipNewlines: ");
+ printBuffer();
+ System.out.println();
+ }
+ // make sure there is a character to look at
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ if (load(0) == -1) {
+ if (DEBUG_BUFFER) {
+ System.out.print(")skipNewlines: ");
+ printBuffer();
+ System.out.println();
+ }
+ return 0;
+ }
+ }
+ char c = fCurrentEntity.buffer[fCurrentEntity.offset];
+ int newlines = 0;
+ // NOTE(review): the local "offset" is assigned below but never read
+ // within this method.
+ int offset = fCurrentEntity.offset;
+ if (c == '\n' || c == '\r') {
+ do {
+ c = fCurrentEntity.buffer[fCurrentEntity.offset++];
+ if (c == '\r') {
+ newlines++;
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ // Buffer exhausted: reload with an offset of "newlines".
+ // Presumably load(n) places new input at index n, which
+ // leaves one slot per skipped newline in front of it;
+ // confirm against load().
+ offset = 0;
+ fCurrentEntity.offset = newlines;
+ if (load(newlines) == -1) {
+ break;
+ }
+ }
+ // treat "\r\n" as one newline
+ if (fCurrentEntity.buffer[fCurrentEntity.offset] == '\n') {
+ fCurrentEntity.offset++;
+ offset++;
+ }
+ }
+ else if (c == '\n') {
+ newlines++;
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ offset = 0;
+ fCurrentEntity.offset = newlines;
+ if (load(newlines) == -1) {
+ break;
+ }
+ }
+ }
+ else {
+ // not a newline character: push it back and stop
+ fCurrentEntity.offset--;
+ break;
+ }
+ } while (newlines < maxlines &&
+ fCurrentEntity.offset < fCurrentEntity.length - 1);
+ fCurrentEntity.lineNumber += newlines;
+ fCurrentEntity.columnNumber = 1;
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")skipNewlines: ");
+ printBuffer();
+ System.out.print(" -> ");
+ System.out.print(newlines);
+ System.out.println();
+ }
+ return newlines;
+ } // skipNewlines(int):int
+
+ // infoset utility methods
+
+ /**
+ * Builds the augmentations passed along with a document event.
+ * When augmentations are enabled, the shared location item is filled
+ * with the current begin/end line and column numbers and stored as the
+ * only item of the shared augmentations object.
+ *
+ * @return The shared augmentations object holding the location item,
+ * or null if augmentations are disabled.
+ */
+ protected final Augmentations locationAugs() {
+ if (!fAugmentations) {
+ return null;
+ }
+ fLocationItem.setValues(fBeginLineNumber, fBeginColumnNumber,
+ fEndLineNumber, fEndColumnNumber);
+ final HTMLAugmentations shared = fInfosetAugs;
+ shared.removeAllItems();
+ shared.putItem(AUGMENTATIONS, fLocationItem);
+ return shared;
+ } // locationAugs():Augmentations
+
+ /**
+ * Builds the augmentations passed along with an event that was
+ * synthesized by the scanner. When augmentations are enabled, the
+ * shared augmentations object is emptied and the synthesized marker
+ * item is stored in it.
+ *
+ * @return The shared augmentations object holding the synthesized
+ * item, or null if augmentations are disabled.
+ */
+ protected final Augmentations synthesizedAugs() {
+ if (!fAugmentations) {
+ return null;
+ }
+ final HTMLAugmentations shared = fInfosetAugs;
+ shared.removeAllItems();
+ shared.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
+ return shared;
+ } // synthesizedAugs():Augmentations
+
+ /**
+ * Returns an empty resource identifier. The shared identifier object
+ * is cleared and returned, so callers must not hold on to it across
+ * calls.
+ */
+ protected final XMLResourceIdentifier resourceId() {
+ // NOTE: Unfortunately, the Xerces DOM parser classes expect a
+ // non-null resource identifier object to be passed to
+ // startGeneralEntity, so null cannot be returned here. -Ac
+ fResourceId.clear();
+ return fResourceId;
+ } // resourceId():XMLResourceIdentifier
+
+ //
+ // Protected static methods
+ //
+
+ /**
+ * Tells whether a name is one of the five general entities built into
+ * XML: amp, lt, gt, quot and apos. The comparison is case-sensitive.
+ *
+ * @param name The entity name to check; must not be null.
+ *
+ * @return True if the name is a built-in XML general entity reference.
+ */
+ protected static boolean builtinXmlRef(String name) {
+ if (name.equals("amp") || name.equals("lt") || name.equals("gt")) {
+ return true;
+ }
+ if (name.equals("quot")) {
+ return true;
+ }
+ return name.equals("apos");
+ } // builtinXmlRef(String):boolean
+
+ //
+ // Private methods
+ //
+
+ /** Prints the contents of the character buffer to standard out. */
+ private void printBuffer() {
+ if (DEBUG_BUFFER) {
+ System.out.print('[');
+ System.out.print(fCurrentEntity.length);
+ System.out.print(' ');
+ System.out.print(fCurrentEntity.offset);
+ if (fCurrentEntity.length > 0) {
+ System.out.print(" \"");
+ for (int i = 0; i < fCurrentEntity.length; i++) {
+ if (i == fCurrentEntity.offset) {
+ System.out.print('^');
+ }
+ char c = fCurrentEntity.buffer[i];
+ switch (c) {
+ case '\r': {
+ System.out.print("\\r");
+ break;
+ }
+ case '\n': {
+ System.out.print("\\n");
+ break;
+ }
+ case '\t': {
+ System.out.print("\\t");
+ break;
+ }
+ case '"': {
+ System.out.print("\\\"");
+ break;
+ }
+ default: {
+ System.out.print(c);
+ }
+ }
+ }
+ if (fCurrentEntity.offset == fCurrentEntity.length) {
+ System.out.print('^');
+ }
+ System.out.print('"');
+ }
+ System.out.print(']');
+ }
+ } // printBuffer()
+
+ //
+ // Interfaces
+ //
+
+ /**
+ * Contract shared by the scanners of this class. A scanner processes
+ * the document piece by piece so that callers can pull the parse
+ * forward at their own pace.
+ *
+ * @author Andy Clark
+ */
+ public interface Scanner {
+
+ /**
+ * Scans part of the document.
+ *
+ * @param complete True if the scanner should keep going and not
+ * return until scanning is complete.
+ *
+ * @return True if additional scanning is required.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ boolean scan(boolean complete) throws IOException;
+
+ } // interface Scanner
+
+ //
+ // Classes
+ //
+
+ /**
+ * State of the entity that is currently being scanned: its character
+ * stream, its identifiers, the current location and the read buffer.
+ *
+ * @author Andy Clark
+ */
+ public static class CurrentEntity {
+
+ //
+ // Data
+ //
+
+ // source and identification
+
+ /** Reader supplying the characters of this entity. */
+ public Reader stream;
+
+ /** Name of the character encoding of this entity. */
+ public String encoding;
+
+ /** Public identifier of this entity. */
+ public String publicId;
+
+ /** Base system identifier of this entity. */
+ public String baseSystemId;
+
+ /** System identifier of this entity as it was literally given. */
+ public String literalSystemId;
+
+ /** System identifier of this entity after expansion. */
+ public String expandedSystemId;
+
+ // location
+
+ /** Current line number; starts at 1. */
+ public int lineNumber = 1;
+
+ /** Current column number; starts at 1. */
+ public int columnNumber = 1;
+
+ // buffer
+
+ /** Buffer holding the characters read from the stream. */
+ public char[] buffer = new char[DEFAULT_BUFFER_SIZE];
+
+ /** Index into the buffer of the next character to scan. */
+ public int offset = 0;
+
+ /** Number of characters in the buffer that hold read data. */
+ public int length = 0;
+
+ //
+ // Constructors
+ //
+
+ /**
+ * Constructs an entity that reads from the specified stream. The
+ * arguments are stored as given; the location and buffer fields
+ * keep their initial values.
+ */
+ public CurrentEntity(Reader stream,
+ String encoding,
+ String publicId,
+ String baseSystemId,
+ String literalSystemId,
+ String expandedSystemId) {
+ this.stream = stream;
+ this.encoding = encoding;
+ this.publicId = publicId;
+ this.baseSystemId = baseSystemId;
+ this.literalSystemId = literalSystemId;
+ this.expandedSystemId = expandedSystemId;
+ } // <init>(Reader,String,String,String,String,String)
+
+ } // class CurrentEntity
+
+ /**
+ * The primary HTML document scanner.
+ *
+ * @author Andy Clark
+ */
+ public class ContentScanner
+ implements Scanner {
+
+ //
+ // Data
+ //
+
+ // temp vars (reused across calls instead of being re-allocated)
+
+ /**
+ * Scratch qualified name, filled in before each element or attribute
+ * is reported by this scanner.
+ */
+ private final QName fQName = new QName();
+
+ /**
+ * Scratch attribute list, emptied and refilled for each start element
+ * and for the pseudo attributes of an XML declaration.
+ */
+ private final XMLAttributesImpl fAttributes = new XMLAttributesImpl();
+
+ //
+ // Scanner methods
+ //
+
+ /**
+ * Scans part of the document, dispatching on the current scanner
+ * state (content, markup bracket, start of document, end of document).
+ * <p>
+ * An <code>EOFException</code> raised while scanning ends the current
+ * entity: if no entity is left on the entity stack the scanner moves to
+ * the end-of-document state, otherwise the previous entity is resumed.
+ *
+ * @param complete True if the scanner should not return until
+ * scanning is complete.
+ *
+ * @return True if additional scanning is required; false once the end
+ * of the document has been processed.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ public boolean scan(boolean complete) throws IOException {
+ boolean next;
+ do {
+ try {
+ next = false;
+ switch (fScannerState) {
+ case STATE_CONTENT: {
+ fBeginLineNumber = fCurrentEntity.lineNumber;
+ fBeginColumnNumber = fCurrentEntity.columnNumber;
+ int c = read();
+ if (c == '<') {
+ setScannerState(STATE_MARKUP_BRACKET);
+ next = true;
+ }
+ else if (c == '&') {
+ scanEntityRef(fStringBuffer, true);
+ }
+ else if (c == -1) {
+ throw new EOFException();
+ }
+ else {
+ // ordinary text: push the character back and scan the run
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ scanCharacters();
+ }
+ break;
+ }
+ case STATE_MARKUP_BRACKET: {
+ // a '<' has been consumed; decide what kind of markup follows
+ int c = read();
+ if (c == '!') {
+ if (skip("--", false)) {
+ scanComment();
+ }
+ else if (skip("[CDATA[", false)) {
+ scanCDATA();
+ }
+ else if (skip("DOCTYPE", false)) {
+ scanDoctype();
+ }
+ else {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1002", null);
+ }
+ skipMarkup(true);
+ }
+ }
+ else if (c == '?') {
+ scanPI();
+ }
+ else if (c == '/') {
+ scanEndElement();
+ }
+ else if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1003", null);
+ }
+ // input ended right after '<': report the bracket as text
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fStringBuffer.clear();
+ fStringBuffer.append('<');
+ fDocumentHandler.characters(fStringBuffer, null);
+ }
+ throw new EOFException();
+ }
+ else {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ fElementCount++;
+ fSingleBoolean[0] = false;
+ String ename = scanStartElement(fSingleBoolean);
+ // a non-empty "special" element hands its content over to
+ // the special scanner
+ if (ename != null && !fSingleBoolean[0] &&
+ HTMLElements.getElement(ename).isSpecial()) {
+ setScanner(fSpecialScanner.setElementName(ename));
+ setScannerState(STATE_CONTENT);
+ return true;
+ }
+ }
+ setScannerState(STATE_CONTENT);
+ break;
+ }
+ case STATE_START_DOCUMENT: {
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ if (DEBUG_CALLBACKS) {
+ System.out.println("startDocument()");
+ }
+ XMLLocator locator = HTMLScanner.this;
+ String encoding = fIANAEncoding;
+ Augmentations augs = locationAugs();
+ try {
+ // NOTE: Hack to allow the default filter to work with
+ // old and new versions of the XNI document handler
+ // interface. -Ac
+ // First try the four-argument startDocument that takes a
+ // namespace context.
+ Class cls = fDocumentHandler.getClass();
+ Class[] types = {
+ XMLLocator.class, String.class,
+ NamespaceContext.class, Augmentations.class
+ };
+ Method method = cls.getMethod("startDocument", types);
+ NamespaceContext nscontext = new NamespaceSupport();
+ Object[] params = {
+ locator, encoding,
+ nscontext, augs
+ };
+ method.invoke(fDocumentHandler, params);
+ }
+ catch (IllegalAccessException e) {
+ throw new XNIException(e);
+ }
+ catch (InvocationTargetException e) {
+ throw new XNIException(e);
+ }
+ catch (NoSuchMethodException e) {
+ try {
+ // NOTE: Hack to allow the default filter to work with
+ // old and new versions of the XNI document handler
+ // interface. -Ac
+ // Fall back to the three-argument startDocument.
+ Class cls = fDocumentHandler.getClass();
+ Class[] types = {
+ XMLLocator.class, String.class, Augmentations.class
+ };
+ Method method = cls.getMethod("startDocument", types);
+ Object[] params = {
+ locator, encoding, augs
+ };
+ method.invoke(fDocumentHandler, params);
+ }
+ catch (IllegalAccessException ex) {
+ // NOTE: Should never reach here!
+ throw new XNIException(ex);
+ }
+ catch (InvocationTargetException ex) {
+ // NOTE: Should never reach here!
+ throw new XNIException(ex);
+ }
+ catch (NoSuchMethodException ex) {
+ // NOTE: Should never reach here!
+ throw new XNIException(ex);
+ }
+ }
+ }
+ // optionally report a synthesized doctype declaration
+ if (fInsertDoctype && fDocumentHandler != null) {
+ String root = HTMLElements.getElement(HTMLElements.HTML).name;
+ root = modifyName(root, fNamesElems);
+ String pubid = fDoctypePubid;
+ String sysid = fDoctypeSysid;
+ fDocumentHandler.doctypeDecl(root, pubid, sysid,
+ synthesizedAugs());
+ }
+ setScannerState(STATE_CONTENT);
+ break;
+ }
+ case STATE_END_DOCUMENT: {
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ if (DEBUG_CALLBACKS) {
+ System.out.println("endDocument()");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.endDocument(locationAugs());
+ }
+ return false;
+ }
+ default: {
+ throw new RuntimeException("unknown scanner state: "+fScannerState);
+ }
+ }
+ }
+ catch (EOFException e) {
+ // end of the current entity: finish the document or resume the
+ // entity that was being scanned before this one
+ if (fCurrentEntityStack.empty()) {
+ setScannerState(STATE_END_DOCUMENT);
+ }
+ else {
+ fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop();
+ }
+ next = true;
+ }
+ } while (next || complete);
+ return true;
+ } // scan(boolean):boolean
+
+ //
+ // Protected methods
+ //
+
+ /** Scans characters. */
+ protected void scanCharacters() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(scanCharacters: ");
+ printBuffer();
+ System.out.println();
+ }
+ int newlines = skipNewlines();
+ if (newlines == 0 && fCurrentEntity.offset == fCurrentEntity.length) {
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanCharacters: ");
+ printBuffer();
+ System.out.println();
+ }
+ return;
+ }
+ char c;
+ int offset = fCurrentEntity.offset - newlines;
+ for (int i = offset; i < fCurrentEntity.offset; i++) {
+ fCurrentEntity.buffer[i] = '\n';
+ }
+ while (fCurrentEntity.offset < fCurrentEntity.length) {
+ c = fCurrentEntity.buffer[fCurrentEntity.offset];
+ if (c == '<' || c == '&' || c == '\n' || c == '\r') {
+ break;
+ }
+ fCurrentEntity.offset++;
+ fCurrentEntity.columnNumber++;
+ }
+ if (fCurrentEntity.offset > offset &&
+ fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fString.setValues(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset);
+ if (DEBUG_CALLBACKS) {
+ System.out.println("characters("+fString+")");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.characters(fString, locationAugs());
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanCharacters: ");
+ printBuffer();
+ System.out.println();
+ }
+ } // scanCharacters()
+
+ /** Scans a CDATA section. */
+ protected void scanCDATA() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(scanCDATA: ");
+ printBuffer();
+ System.out.println();
+ }
+ fStringBuffer.clear();
+ if (fCDATASections) {
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ if (DEBUG_CALLBACKS) {
+ System.out.println("startCDATA()");
+ }
+ fDocumentHandler.startCDATA(locationAugs());
+ }
+ }
+ else {
+ fStringBuffer.append("[CDATA[");
+ }
+ boolean eof = scanMarkupContent(fStringBuffer, ']');
+ if (!fCDATASections) {
+ fStringBuffer.append("]]");
+ }
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ if (fCDATASections) {
+ if (DEBUG_CALLBACKS) {
+ System.out.println("characters("+fStringBuffer+")");
+ }
+ fDocumentHandler.characters(fStringBuffer, locationAugs());
+ if (DEBUG_CALLBACKS) {
+ System.out.println("endCDATA()");
+ }
+ fDocumentHandler.endCDATA(locationAugs());
+ }
+ else {
+ if (DEBUG_CALLBACKS) {
+ System.out.println("comment("+fStringBuffer+")");
+ }
+ fDocumentHandler.comment(fStringBuffer, locationAugs());
+ }
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanCDATA: ");
+ printBuffer();
+ System.out.println();
+ }
+ if (eof) {
+ throw new EOFException();
+ }
+ } // scanCDATA()
+
+ /** Scans a comment. */
+ protected void scanComment() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(scanComment: ");
+ printBuffer();
+ System.out.println();
+ }
+ fStringBuffer.clear();
+ boolean eof = scanMarkupContent(fStringBuffer, '-');
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ if (DEBUG_CALLBACKS) {
+ System.out.println("comment("+fStringBuffer+")");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.comment(fStringBuffer, locationAugs());
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanComment: ");
+ printBuffer();
+ System.out.println();
+ }
+ if (eof) {
+ throw new EOFException();
+ }
+ } // scanComment()
+
+ /**
+ * Scans markup content into the specified buffer until the terminator
+ * is found. The terminator is two or more <code>cend</code> characters
+ * followed by '>' (e.g. "-->" for comments, "]]>" for CDATA sections).
+ * Newlines in the content are appended as '\n' characters.
+ *
+ * @param buffer The buffer that receives the content, without the
+ * terminator.
+ * @param cend The character that, doubled and followed by '>',
+ * ends the content.
+ *
+ * @return True if the end of the input was reached before the
+ * terminator was found.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ protected boolean scanMarkupContent(XMLStringBuffer buffer,
+ char cend) throws IOException {
+ int c = -1;
+ OUTER: while (true) {
+ c = read();
+ if (c == cend) {
+ // count the run of consecutive end characters
+ int count = 1;
+ while (true) {
+ c = read();
+ if (c == cend) {
+ count++;
+ continue;
+ }
+ break;
+ }
+ if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ break OUTER;
+ }
+ if (count < 2) {
+ // a single end character is ordinary content; push back the
+ // character that followed it
+ buffer.append(cend);
+ //if (c != -1) {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ //}
+ continue;
+ }
+ if (c != '>') {
+ // a run not followed by '>' is ordinary content as well
+ for (int i = 0; i < count; i++) {
+ buffer.append(cend);
+ }
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ continue;
+ }
+ // terminator found: end characters beyond the final two belong
+ // to the content
+ for (int i = 0; i < count - 2; i++) {
+ buffer.append(cend);
+ }
+ break;
+ }
+ else if (c == '\n' || c == '\r') {
+ // push the newline back and let skipNewlines() count the lines
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ int newlines = skipNewlines();
+ for (int i = 0; i < newlines; i++) {
+ buffer.append('\n');
+ }
+ continue;
+ }
+ else if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ break;
+ }
+ buffer.append((char)c);
+ }
+ return c == -1;
+ } // scanMarkupContent(XMLStringBuffer,char):boolean
+
+ /**
+ * Scans a processing instruction; the leading "&lt;?" has already been
+ * consumed. A warning (HTML1008) is reported first if error reporting
+ * is enabled.
+ * <p>
+ * If the target is anything other than "xml" (ignoring case), the data
+ * up to the closing "?>" or "/>" is collected, with line breaks
+ * normalized to '\n', and reported as a processing instruction.
+ * Otherwise the pseudo attributes are scanned and reported as an XML
+ * declaration.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ protected void scanPI() throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(scanPI: ");
+ printBuffer();
+ System.out.println();
+ }
+ if (fReportErrors) {
+ fErrorReporter.reportWarning("HTML1008", null);
+ }
+
+ // scan processing instruction
+ String target = scanName();
+ if (target != null && !target.equalsIgnoreCase("xml")) {
+ // skip the whitespace between the target and the data
+ while (true) {
+ int c = read();
+ if (c == '\r' || c == '\n') {
+ if (c == '\r') {
+ c = read();
+ // Only push back a character that was actually read:
+ // at end of input nothing was consumed.
+ if (c != '\n' && c != -1) {
+ fCurrentEntity.offset--;
+ }
+ }
+ // Update the location after the lookahead so that the
+ // column is 1 at the start of the new line in every case.
+ fCurrentEntity.lineNumber++;
+ fCurrentEntity.columnNumber = 1;
+ continue;
+ }
+ if (c == -1) {
+ break;
+ }
+ if (c != ' ' && c != '\t') {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ break;
+ }
+ }
+ // collect the data up to "?>" or "/>"
+ fStringBuffer.clear();
+ while (true) {
+ int c = read();
+ if (c == '?' || c == '/') {
+ char c0 = (char)c;
+ c = read();
+ if (c == '>') {
+ break;
+ }
+ else {
+ fStringBuffer.append(c0);
+ // push the lookahead back, unless the input ended
+ if (c != -1) {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ }
+ continue;
+ }
+ }
+ else if (c == '\r' || c == '\n') {
+ fStringBuffer.append('\n');
+ if (c == '\r') {
+ c = read();
+ if (c != '\n' && c != -1) {
+ fCurrentEntity.offset--;
+ }
+ }
+ fCurrentEntity.lineNumber++;
+ fCurrentEntity.columnNumber = 1;
+ continue;
+ }
+ else if (c == -1) {
+ break;
+ }
+ else {
+ fStringBuffer.append((char)c);
+ }
+ }
+ XMLString data = fStringBuffer;
+ if (fDocumentHandler != null) {
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.processingInstruction(target, data, locationAugs());
+ }
+ }
+
+ // scan xml/text declaration
+ else {
+ // scanning the pseudo attributes overwrites the begin location,
+ // so remember it and restore it before reporting
+ int beginLineNumber = fBeginLineNumber;
+ int beginColumnNumber = fBeginColumnNumber;
+ fAttributes.removeAllAttributes();
+ int aindex = 0;
+ while (scanPseudoAttribute(fAttributes)) {
+ // lower-case the raw name so that the lookups below succeed
+ fAttributes.getName(aindex,fQName);
+ fQName.rawname = fQName.rawname.toLowerCase();
+ fAttributes.setName(aindex,fQName);
+ aindex++;
+ }
+ if (fDocumentHandler != null) {
+ String version = fAttributes.getValue("version");
+ String encoding = fAttributes.getValue("encoding");
+ String standalone = fAttributes.getValue("standalone");
+
+ fBeginLineNumber = beginLineNumber;
+ fBeginColumnNumber = beginColumnNumber;
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.xmlDecl(version, encoding, standalone,
+ locationAugs());
+ }
+ }
+
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanPI: ");
+ printBuffer();
+ System.out.println();
+ }
+ } // scanPI()
+
+ /**
+ * Scans a start element: its name, its attributes and the end of the
+ * tag, and reports it to the document handler.
+ * <p>
+ * If no valid element name follows (it must start with an ASCII
+ * letter), an error (HTML1009) is reported, the text scanned so far is
+ * reported as characters and null is returned.
+ * <p>
+ * While the byte stream is still being buffered for encoding
+ * detection, a META element that declares a content type with a
+ * charset switches the reader to that encoding and replays the
+ * buffered input; a BODY element, or an element whose first parent is
+ * BODY, stops the buffering.
+ *
+ * @param empty Is used for a second return value to indicate whether
+ * the start element tag is empty (e.g. "/>").
+ *
+ * @return The (possibly case-modified) element name, or null if no
+ * valid element name was found.
+ *
+ * @throws IOException Thrown if I/O error occurs.
+ */
+ protected String scanStartElement(boolean[] empty) throws IOException {
+ String ename = scanName();
+ int length = ename != null ? ename.length() : 0;
+ int c = length > 0 ? ename.charAt(0) : -1;
+ if (length == 0 || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1009", null);
+ }
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fStringBuffer.clear();
+ fStringBuffer.append('<');
+ if (length > 0) {
+ fStringBuffer.append(ename);
+ }
+ fDocumentHandler.characters(fStringBuffer, null);
+ }
+ return null;
+ }
+ ename = modifyName(ename, fNamesElems);
+ fAttributes.removeAllAttributes();
+ // scanning attributes overwrites the begin location, so remember it
+ // and restore it afterwards
+ int beginLineNumber = fBeginLineNumber;
+ int beginColumnNumber = fBeginColumnNumber;
+ while (scanAttribute(fAttributes, empty)) {
+ // do nothing
+ }
+ fBeginLineNumber = beginLineNumber;
+ fBeginColumnNumber = beginColumnNumber;
+ if (fByteStream != null && fElementDepth == -1) {
+ if (ename.equalsIgnoreCase("META")) {
+ if (DEBUG_CHARSET) {
+ System.out.println("+++ <META>");
+ }
+ String httpEquiv = getValue(fAttributes, "http-equiv");
+ if (httpEquiv != null && httpEquiv.equalsIgnoreCase("content-type")) {
+ if (DEBUG_CHARSET) {
+ System.out.println("+++ @content-type: \""+httpEquiv+'"');
+ }
+ String content = getValue(fAttributes, "content");
+ int index1 = content != null ? content.toLowerCase().indexOf("charset=") : -1;
+ if (index1 != -1 && !fIgnoreSpecifiedCharset) {
+ int index2 = content.indexOf(';', index1);
+ String charset = index2 != -1 ? content.substring(index1+8, index2) : content.substring(index1+8);
+ // Surrounding whitespace (e.g. "charset= utf-8 ") is not part
+ // of the encoding name and would make the lookup fail.
+ charset = charset.trim();
+ try {
+ String ianaEncoding = charset;
+ // Encoding names are ASCII; upper-case them with a fixed
+ // locale so that the result does not depend on the default
+ // locale (e.g. Turkish maps 'i' to a dotted capital I).
+ String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding.toUpperCase(java.util.Locale.ENGLISH));
+ if (DEBUG_CHARSET) {
+ System.out.println("+++ ianaEncoding: "+ianaEncoding);
+ System.out.println("+++ javaEncoding: "+javaEncoding);
+ }
+ if (javaEncoding == null) {
+ javaEncoding = ianaEncoding;
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1001", new Object[]{ianaEncoding});
+ }
+ }
+ fIso8859Encoding = ianaEncoding == null
+ || ianaEncoding.toUpperCase(java.util.Locale.ENGLISH).startsWith("ISO-8859")
+ || ianaEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
+ // switch the reader to the declared encoding and replay the
+ // buffered bytes from the start of the document
+ fCurrentEntity.stream = new InputStreamReader(fByteStream, javaEncoding);
+ fByteStream.playback();
+ fElementDepth = fElementCount;
+ fElementCount = 0;
+ fCurrentEntity.offset = fCurrentEntity.length = 0;
+ fCurrentEntity.lineNumber = 1;
+ fCurrentEntity.columnNumber = 1;
+ }
+ catch (UnsupportedEncodingException e) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1010", new Object[]{charset});
+ }
+ // NOTE: If the encoding change doesn't work,
+ // then there's no point in continuing to
+ // buffer the input stream.
+ fByteStream.clear();
+ }
+ }
+ }
+ }
+ else if (ename.equalsIgnoreCase("BODY")) {
+ fByteStream.clear();
+ }
+ else {
+ HTMLElements.Element element = HTMLElements.getElement(ename);
+ if (element.parent != null && element.parent.length > 0) {
+ if (element.parent[0].code == HTMLElements.BODY) {
+ fByteStream.clear();
+ }
+ }
+ }
+ }
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fQName.setValues(null, ename, ename, null);
+ if (DEBUG_CALLBACKS) {
+ System.out.println("startElement("+fQName+','+fAttributes+")");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ if (empty[0]) {
+ fDocumentHandler.emptyElement(fQName, fAttributes, locationAugs());
+ }
+ else {
+ fDocumentHandler.startElement(fQName, fAttributes, locationAugs());
+ }
+ }
+ return ename;
+ } // scanStartElement(boolean[]):String
+
+ /**
+ * Scans a real attribute.
+ *
+ * @param attributes The list of attributes.
+ * @param empty Is used for a second return value to indicate
+ * whether the start element tag is empty
+ * (e.g. "/>").
+ */
+ protected boolean scanAttribute(XMLAttributesImpl attributes,
+ boolean[] empty)
+ throws IOException {
+ return scanAttribute(attributes,empty,'/');
+ } // scanAttribute(XMLAttributesImpl,boolean[]):boolean
+
+ /**
+ * Scans a pseudo attribute.
+ *
+ * @param attributes The list of attributes.
+ */
+ protected boolean scanPseudoAttribute(XMLAttributesImpl attributes)
+ throws IOException {
+ return scanAttribute(attributes,fSingleBoolean,'?');
+ } // scanPseudoAttribute(XMLAttributesImpl):boolean
+
+ /**
+ * Scans an attribute, pseudo or real, and adds it to the specified
+ * attribute list.
+ * <p>
+ * The value may be missing, unquoted or quoted with single or double
+ * quotes. Entity references in the value are resolved. In quoted
+ * values tabs and line breaks are replaced by a single space in the
+ * normalized value, while the non-normalized value keeps them.
+ *
+ * @param attributes The list of attributes.
+ * @param empty Is used for a second return value to indicate
+ * whether the start element tag is empty
+ * (e.g. "/>").
+ * @param endc The end character that appears before the
+ * closing angle bracket ('>').
+ *
+ * @return True if an attribute was scanned and more may follow; false
+ * if the end of the tag was reached.
+ *
+ * @throws IOException Thrown if I/O error occurs; an
+ * <code>EOFException</code> is thrown if the input ends inside
+ * the tag.
+ */
+ protected boolean scanAttribute(XMLAttributesImpl attributes,
+ boolean[] empty, char endc)
+ throws IOException {
+ boolean skippedSpaces = skipSpaces();
+ fBeginLineNumber = fCurrentEntity.lineNumber;
+ fBeginColumnNumber = fCurrentEntity.columnNumber;
+ int c = read();
+ if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ throw new EOFException();
+ }
+ if (c == '>') {
+ return false;
+ }
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ String aname = scanName();
+ if (aname == null) {
+ // no attribute name here: give up on the rest of the tag
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1011", null);
+ }
+ empty[0] = skipMarkup(false);
+ return false;
+ }
+ if (!skippedSpaces && fReportErrors) {
+ fErrorReporter.reportError("HTML1013", new Object[] { aname });
+ }
+ aname = modifyName(aname, fNamesAttrs);
+ skipSpaces();
+ c = read();
+ if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ throw new EOFException();
+ }
+ if (c == '/' || c == '>') {
+ // attribute without a value directly followed by the tag end
+ fQName.setValues(null, aname, aname, null);
+ attributes.addAttribute(fQName, "CDATA", "");
+ attributes.setSpecified(attributes.getLength()-1, true);
+ if (fAugmentations) {
+ addLocationItem(attributes, attributes.getLength() - 1);
+ }
+ if (c == '/') {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ empty[0] = skipMarkup(false);
+ }
+ return false;
+ }
+ if (c == '=') {
+ skipSpaces();
+ c = read();
+ if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ throw new EOFException();
+ }
+ // Xiaowei/Ac: Fix for <a href=/cgi-bin/myscript>...</a>
+ if (c == '>') {
+ fQName.setValues(null, aname, aname, null);
+ attributes.addAttribute(fQName, "CDATA", "");
+ attributes.setSpecified(attributes.getLength()-1, true);
+ if (fAugmentations) {
+ addLocationItem(attributes, attributes.getLength() - 1);
+ }
+ return false;
+ }
+ fStringBuffer.clear();
+ fNonNormAttr.clear();
+ if (c != '\'' && c != '"') {
+ // unquoted value: runs up to the next space or '>'
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ while (true) {
+ c = read();
+ // Xiaowei/Ac: Fix for <a href=/broken/>...</a>
+ if (Character.isSpace((char)c) || c == '>') {
+ //fCharOffset--;
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ break;
+ }
+ if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ throw new EOFException();
+ }
+ if (c == '&') {
+ int ce = scanEntityRef(fStringBuffer2, false);
+ if (ce != -1) {
+ fStringBuffer.append((char)ce);
+ }
+ else {
+ fStringBuffer.append(fStringBuffer2);
+ }
+ fNonNormAttr.append(fStringBuffer2);
+ }
+ else {
+ fStringBuffer.append((char)c);
+ fNonNormAttr.append((char)c);
+ }
+ }
+ fQName.setValues(null, aname, aname, null);
+ String avalue = fStringBuffer.toString();
+ attributes.addAttribute(fQName, "CDATA", avalue);
+
+ int lastattr = attributes.getLength()-1;
+ attributes.setSpecified(lastattr, true);
+ attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
+ if (fAugmentations) {
+ addLocationItem(attributes, attributes.getLength() - 1);
+ }
+ return true;
+ }
+ // quoted value: runs up to the matching quote
+ char quote = (char)c;
+ do {
+ c = read();
+ if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ throw new EOFException();
+ }
+ if (c == '&') {
+ int ce = scanEntityRef(fStringBuffer2, false);
+ if (ce != -1) {
+ fStringBuffer.append((char)ce);
+ }
+ else {
+ fStringBuffer.append(fStringBuffer2);
+ }
+ fNonNormAttr.append(fStringBuffer2);
+ }
+ else if (c == '\t') {
+ fStringBuffer.append(' ');
+ fNonNormAttr.append('\t');
+ }
+ else if (c == '\r' || c == '\n') {
+ if (c == '\r') {
+ int c2 = read();
+ if (c2 == '\n') {
+ fNonNormAttr.append('\r');
+ c = c2;
+ }
+ else if (c2 != -1) {
+ // Push back the lookahead only if a character was
+ // actually read; at end of input nothing was consumed
+ // and the next read() reports the end of input.
+ fCurrentEntity.offset--;
+ }
+ }
+ // Update the location after the lookahead so that the column
+ // is 1 at the start of the new line, as everywhere else.
+ fCurrentEntity.lineNumber++;
+ fCurrentEntity.columnNumber = 1;
+ fStringBuffer.append(' ');
+ fNonNormAttr.append((char)c);
+ }
+ else if (c != quote) {
+ fStringBuffer.append((char)c);
+ fNonNormAttr.append((char)c);
+ }
+ } while (c != quote);
+ fQName.setValues(null, aname, aname, null);
+ String avalue = fStringBuffer.toString();
+ attributes.addAttribute(fQName, "CDATA", avalue);
+
+ int lastattr = attributes.getLength()-1;
+ attributes.setSpecified(lastattr, true);
+ attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
+ if (fAugmentations) {
+ addLocationItem(attributes, attributes.getLength() - 1);
+ }
+ }
+ else {
+ // attribute without a value followed by something else: push the
+ // character back so that it starts the next attribute
+ fQName.setValues(null, aname, aname, null);
+ attributes.addAttribute(fQName, "CDATA", "");
+ attributes.setSpecified(attributes.getLength()-1, true);
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ if (fAugmentations) {
+ addLocationItem(attributes, attributes.getLength() - 1);
+ }
+ }
+ return true;
+ } // scanAttribute(XMLAttributesImpl,boolean[],char):boolean
+
+ /** Adds location augmentations to the specified attribute. */
+ protected void addLocationItem(XMLAttributes attributes, int index) {
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ LocationItem locationItem = new LocationItem();
+ locationItem.setValues(fBeginLineNumber, fBeginColumnNumber,
+ fEndLineNumber, fEndColumnNumber);
+ Augmentations augs = attributes.getAugmentations(index);
+ augs.putItem(AUGMENTATIONS, locationItem);
+ } // addLocationItem(XMLAttributes,int)
+
+ /** Scans an end element. */
+ protected void scanEndElement() throws IOException {
+ String ename = scanName();
+ if (fReportErrors && ename == null) {
+ fErrorReporter.reportError("HTML1012", null);
+ }
+ skipMarkup(false);
+ if (ename != null) {
+ ename = modifyName(ename, fNamesElems);
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fQName.setValues(null, ename, ename, null);
+ if (DEBUG_CALLBACKS) {
+ System.out.println("endElement("+fQName+")");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.endElement(fQName, locationAugs());
+ }
+ }
+ } // scanEndElement()
+
+ } // class ContentScanner
+
+ /**
+ * Special scanner used for elements whose content needs to be scanned
+ * as plain text, ignoring markup such as elements and entity references.
+ * For example: <SCRIPT> and <COMMENT>.
+ *
+ * @author Andy Clark
+ */
+ public class SpecialScanner
+ implements Scanner {
+
+ //
+ // Data
+ //
+
+ /** Name of element whose content needs to be scanned as text. */
+ protected String fElementName;
+
+ /** True if <script> element. */
+ protected boolean fScript;
+
+ /** True if <style> element. */
+ protected boolean fStyle;
+
+ /** True if <textarea> element. */
+ protected boolean fTextarea;
+
+ // temp vars
+
+ /** A qualified name. */
+ private final QName fQName = new QName();
+
+ /** A string buffer. */
+ private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
+
+ //
+ // Public methods
+ //
+
+ /**
+ * Sets the name of the element whose content is scanned as text and
+ * updates the script, style and textarea flags by comparing the name,
+ * ignoring case, with "SCRIPT", "STYLE" and "TEXTAREA".
+ *
+ * @param ename The element name.
+ * @return This scanner.
+ */
+ public Scanner setElementName(String ename) {
+ fElementName = ename;
+ final boolean script = ename.equalsIgnoreCase("SCRIPT");
+ final boolean style = ename.equalsIgnoreCase("STYLE");
+ final boolean textarea = ename.equalsIgnoreCase("TEXTAREA");
+ fScript = script;
+ fStyle = style;
+ fTextarea = textarea;
+ return this;
+ } // setElementName(String):Scanner
+
+ //
+ // Scanner methods
+ //
+
+ /**
+ * Scans the content of the special element as plain text.
+ * <p>
+ * In the content state one character is read: '&lt;' switches to the
+ * markup bracket state; '&amp;' is scanned as an entity reference in a
+ * textarea and kept as a literal character otherwise; end of input
+ * raises an <code>EOFException</code>; any other character is pushed
+ * back. The text that follows is then handled by
+ * <code>scanCharacters</code>.
+ * <p>
+ * In the markup bracket state the comment and CDATA openers are either
+ * stripped or kept as text, depending on the strip settings for script
+ * and style. An end tag whose name matches the element name (ignoring
+ * case) and that is directly followed by '&gt;' ends the element,
+ * restores the content scanner and returns. Other markup is kept as
+ * text.
+ * <p>
+ * An <code>EOFException</code> restores the content scanner and either
+ * moves to the end-of-document state (entity stack empty) or resumes
+ * the entity popped from the stack.
+ *
+ * @param complete True to keep looping until one of the return
+ * statements is reached; false to perform a single
+ * pass through the loop.
+ * @return Always true.
+ */
+ public boolean scan(boolean complete) throws IOException {
+ boolean next; // always false here, so the loop is driven by "complete" alone
+ do {
+ try {
+ next = false;
+ int delimiter = -1; // -1 means no closing delimiter is expected by scanCharacters
+ switch (fScannerState) {
+ case STATE_CONTENT: {
+ fBeginLineNumber = fCurrentEntity.lineNumber;
+ fBeginColumnNumber = fCurrentEntity.columnNumber;
+ int c = read();
+ if (c == '<') {
+ setScannerState(STATE_MARKUP_BRACKET);
+ continue; // jumps to the loop condition; returns when "complete" is false
+ }
+ if (c == '&') {
+ if (fTextarea) {
+ scanEntityRef(fStringBuffer, true);
+ continue;
+ }
+ fStringBuffer.clear();
+ fStringBuffer.append('&'); // outside a textarea the ampersand is literal text
+ }
+ else if (c == -1) {
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML1007", null);
+ }
+ throw new EOFException(); // handled by the catch block below
+ }
+ else {
+ fCurrentEntity.offset--; // push the character back for scanCharacters
+ fCurrentEntity.columnNumber--;
+ fStringBuffer.clear();
+ }
+ scanCharacters(fStringBuffer, -1);
+ break;
+ } // case STATE_CONTENT
+ case STATE_MARKUP_BRACKET: {
+ int c = read();
+ if (c == '!') {
+ if (skip("--", false)) {
+ fStringBuffer.clear();
+ boolean strip = (fScript && fScriptStripCommentDelims) ||
+ (fStyle && fStyleStripCommentDelims);
+ if (strip) {
+ do { // discard the rest of the line that holds the comment opener
+ c = read();
+ if (c == '\r' || c == '\n') {
+ fCurrentEntity.columnNumber--;
+ fCurrentEntity.offset--;
+ break;
+ }
+ } while (c != -1);
+ skipNewlines(1);
+ delimiter = '-'; // scanCharacters then looks for the closing "-->"
+ }
+ else {
+ fStringBuffer.append("<!--");
+ }
+ }
+ else if (skip("[CDATA[", false)) {
+ fStringBuffer.clear();
+ boolean strip = (fScript && fScriptStripCDATADelims) ||
+ (fStyle && fStyleStripCDATADelims);
+ if (strip) {
+ do { // discard the rest of the line that holds the CDATA opener
+ c = read();
+ if (c == '\r' || c == '\n') {
+ fCurrentEntity.columnNumber--;
+ fCurrentEntity.offset--;
+ break;
+ }
+ } while (c != -1);
+ skipNewlines(1);
+ delimiter = ']'; // scanCharacters then looks for the closing "]]>"
+ }
+ else {
+ fStringBuffer.append("<![CDATA[");
+ }
+ }
+ } // NOTE(review): for "<!" followed by neither "--" nor "[CDATA[" the buffer is not cleared and "<!" is not appended -- looks like those characters are dropped; confirm
+ else if (c == '/') {
+ String ename = scanName();
+ if (ename != null) {
+ if (ename.equalsIgnoreCase(fElementName)) {
+ if (read() == '>') {
+ ename = modifyName(ename, fNamesElems);
+ if (fDocumentHandler != null && fElementCount >= fElementDepth) {
+ fQName.setValues(null, ename, ename, null);
+ if (DEBUG_CALLBACKS) {
+ System.out.println("endElement("+fQName+")");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.endElement(fQName, locationAugs());
+ }
+ setScanner(fContentScanner); // the special element is finished
+ setScannerState(STATE_CONTENT);
+ return true;
+ }
+ else {
+ fCurrentEntity.offset--; // not a complete end tag: push the character back
+ fCurrentEntity.columnNumber--;
+ }
+ }
+ fStringBuffer.clear();
+ fStringBuffer.append("</"); // keep the non-terminating end tag as text
+ fStringBuffer.append(ename);
+ }
+ else {
+ fStringBuffer.clear();
+ fStringBuffer.append("</");
+ }
+ }
+ else {
+ fStringBuffer.clear();
+ fStringBuffer.append('<');
+ fStringBuffer.append((char)c); // NOTE(review): c is not checked for -1 here; confirm end-of-input handling
+ }
+ scanCharacters(fStringBuffer, delimiter);
+ setScannerState(STATE_CONTENT);
+ break;
+ } // case STATE_MARKUP_BRACKET
+ } // switch
+ } // try
+ catch (EOFException e) {
+ setScanner(fContentScanner);
+ if (fCurrentEntityStack.empty()) {
+ setScannerState(STATE_END_DOCUMENT);
+ }
+ else {
+ fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop(); // resume the enclosing entity
+ setScannerState(STATE_CONTENT);
+ }
+ return true;
+ }
+ } // do
+ while (next || complete);
+ return true;
+ } // scan(boolean):boolean
+
+ //
+ // Protected methods
+ //
+
+ /**
+ * Scans character content into the buffer and passes the buffer to the
+ * document handler.
+ * <p>
+ * Scanning stops at the end of the input. When no delimiter is given
+ * (-1) it also stops at a '&lt;' or '&amp;', which is pushed back.
+ * For each newline reported by <code>skipNewlines()</code> one '\n' is
+ * appended. When a delimiter is given, scanning stops after a run of
+ * two or more delimiter characters followed by '&gt;'; that closing
+ * sequence is dropped when one of the strip settings applies to the
+ * current script or style element and is kept as text otherwise.
+ * <p>
+ * The characters event is only sent when the buffer is not empty, a
+ * document handler is set and the element count has reached the
+ * element depth.
+ *
+ * @param buffer The buffer to append to; text already in the
+ * buffer is sent together with the scanned text.
+ * @param delimiter The closing delimiter character, or -1 for none.
+ *
+ * @throws IOException Declared by this method; propagated from the
+ * read helpers it calls.
+ */
+ protected void scanCharacters(XMLStringBuffer buffer,
+ int delimiter) throws IOException {
+ if (DEBUG_BUFFER) {
+ System.out.print("(scanCharacters, delimiter="+delimiter+": ");
+ printBuffer();
+ System.out.println();
+ }
+ boolean strip = (fScript && fScriptStripCommentDelims) ||
+ (fScript && fScriptStripCDATADelims) ||
+ (fStyle && fStyleStripCommentDelims) ||
+ (fStyle && fStyleStripCDATADelims);
+ while (true) {
+ int c = read();
+ if (c == -1 || (delimiter == -1 && (c == '<' || c == '&'))) {
+ if (c != -1) {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ }
+ break;
+ }
+ // Patch supplied by Jonathan Baxter
+ else if (c == '\r' || c == '\n') {
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ int newlines = skipNewlines();
+ for (int i = 0; i < newlines; i++) {
+ buffer.append('\n');
+ }
+ }
+ else if (delimiter != -1 && c == (char)delimiter) {
+ int count = 0;
+ do {
+ count++;
+ c = read();
+ } while (c == (char)delimiter);
+ // only a run of two or more delimiters followed by '>' closes the
+ // section; a shorter run is ordinary text and must be kept in full
+ boolean closing = count >= 2 && c == '>';
+ for (int i = strip && closing ? 2 : 0; i < count; i++) {
+ buffer.append((char)delimiter);
+ }
+ if (c == -1 || closing) {
+ // never append the end-of-input marker: (char)-1 is '\uFFFF'
+ if (!strip && c != -1) {
+ buffer.append((char)c);
+ }
+ break;
+ }
+ fCurrentEntity.offset--;
+ fCurrentEntity.columnNumber--;
+ }
+ else {
+ // newlines never reach this branch; they are handled above
+ buffer.append((char)c);
+ }
+ }
+ if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) {
+ if (DEBUG_CALLBACKS) {
+ System.out.println("characters("+buffer+")");
+ }
+ fEndLineNumber = fCurrentEntity.lineNumber;
+ fEndColumnNumber = fCurrentEntity.columnNumber;
+ fDocumentHandler.characters(buffer, locationAugs());
+ }
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanCharacters: ");
+ printBuffer();
+ System.out.println();
+ }
+ } // scanCharacters(XMLStringBuffer,int)
+
+ } // class SpecialScanner
+
+ /**
+ * A playback input stream. This class has the ability to save the bytes
+ * read from the underlying input stream and play the bytes back later.
+ * This class is used by the HTML scanner to switch encodings when a
+ * <meta> tag is detected that specifies a different encoding.
+ * <p>
+ * If the encoding is changed, then the scanner calls the
+ * <code>playback</code> method and re-scans the beginning of the HTML
+ * document again. This should not be too much of a performance problem
+ * because the <meta> tag appears at the beginning of the document.
+ * <p>
+ * If the <body> tag is reached without playing back the bytes,
+ * then the buffer can be cleared by calling the <code>clear</code>
+ * method. This stops the buffering of bytes and allows the memory used
+ * by the buffer to be reclaimed.
+ * <p>
+ * <strong>Note:</strong>
+ * If the buffer is never played back or cleared, this input stream
+ * will continue to buffer the entire stream. Therefore, it is very
+ * important to use this stream correctly.
+ * <p>
+ * Single-byte reads always return a value in the range 0 to 255, or -1
+ * at the end of the stream, also for bytes served from the internal
+ * buffer.
+ *
+ * @author Andy Clark
+ */
+ public static class PlaybackInputStream
+ extends FilterInputStream {
+
+ //
+ // Constants
+ //
+
+ /** Set to true to debug playback. */
+ private static final boolean DEBUG_PLAYBACK = false;
+
+ //
+ // Data
+ //
+
+ // state
+
+ /** Playback mode. */
+ protected boolean fPlayback = false;
+
+ /** Buffer cleared. */
+ protected boolean fCleared = false;
+
+ /** Encoding detected. */
+ protected boolean fDetected = false;
+
+ // buffer info
+
+ /** Byte buffer. */
+ protected byte[] fByteBuffer = new byte[1024];
+
+ /** Offset into byte buffer during playback. */
+ protected int fByteOffset = 0;
+
+ /** Length of bytes read into byte buffer. */
+ protected int fByteLength = 0;
+
+ /** Pushback offset. */
+ public int fPushbackOffset = 0;
+
+ /** Pushback length. */
+ public int fPushbackLength = 0;
+
+ //
+ // Constructors
+ //
+
+ /**
+ * Constructor.
+ *
+ * @param in The underlying input stream.
+ */
+ public PlaybackInputStream(InputStream in) {
+ super(in);
+ } // <init>(InputStream)
+
+ //
+ // Public methods
+ //
+
+ /**
+ * Detects the encoding from a byte order mark at the start of the
+ * stream. For a UTF-8 mark the two entries are set to "UTF-8" and
+ * "UTF8"; for a UTF-16 mark they are set to "UTF-16" and either
+ * "UnicodeLittleUnmarked" or "UnicodeBigUnmarked". When no mark is
+ * found the array is left untouched and every byte that was read is
+ * pushed back, so that it is returned again by the following reads.
+ *
+ * @param encodings Array with at least two entries that receives
+ * the detected encoding names.
+ *
+ * @throws IOException Thrown if this method is called a second time;
+ * also declared for the reads it performs.
+ */
+ public void detectEncoding(String[] encodings) throws IOException {
+ if (fDetected) {
+ throw new IOException("Should not detect encoding twice.");
+ }
+ fDetected = true;
+ int b1 = read();
+ if (b1 == -1) {
+ return;
+ }
+ int b2 = read();
+ if (b2 == -1) {
+ fPushbackLength = 1;
+ return;
+ }
+ // UTF-8 BOM: 0xEFBBBF
+ if (b1 == 0xEF && b2 == 0xBB) {
+ int b3 = read();
+ if (b3 == 0xBF) {
+ fPushbackOffset = 3;
+ encodings[0] = "UTF-8";
+ encodings[1] = "UTF8";
+ return;
+ }
+ // not a BOM: push back exactly the bytes that were read; at the
+ // end of the stream the third byte was never buffered
+ fPushbackLength = b3 == -1 ? 2 : 3;
+ return;
+ }
+ // UTF-16 LE BOM: 0xFFFE
+ if (b1 == 0xFF && b2 == 0xFE) {
+ encodings[0] = "UTF-16";
+ encodings[1] = "UnicodeLittleUnmarked";
+ return;
+ }
+ // UTF-16 BE BOM: 0xFEFF
+ if (b1 == 0xFE && b2 == 0xFF) {
+ encodings[0] = "UTF-16";
+ encodings[1] = "UnicodeBigUnmarked";
+ return;
+ }
+ // unknown
+ fPushbackLength = 2;
+ } // detectEncoding(String[])
+
+ /** Playback buffer contents. */
+ public void playback() {
+ fPlayback = true;
+ } // playback()
+
+ /**
+ * Clears the buffer.
+ * <p>
+ * <strong>Note:</strong>
+ * The buffer cannot be cleared during playback. Therefore, calling
+ * this method during playback will not do anything. However, the
+ * buffer will be cleared automatically at the end of playback.
+ */
+ public void clear() {
+ if (!fPlayback) {
+ fCleared = true;
+ fByteBuffer = null;
+ }
+ } // clear()
+
+ //
+ // InputStream methods
+ //
+
+ /**
+ * Reads a byte. Pushed back bytes are returned first, then buffered
+ * bytes when in playback mode, and otherwise the next byte of the
+ * underlying stream, which is saved in the buffer unless the buffer
+ * has been cleared.
+ *
+ * @return The byte as a value from 0 to 255, or -1 at the end of the
+ * stream.
+ */
+ public int read() throws IOException {
+ if (DEBUG_PLAYBACK) {
+ System.out.println("(read");
+ }
+ if (fPushbackOffset < fPushbackLength) {
+ // mask: a plain byte-to-int conversion would turn 0x80..0xFF
+ // into negative values, and 0xFF into the end-of-stream value
+ return fByteBuffer[fPushbackOffset++] & 0xFF;
+ }
+ if (fCleared) {
+ return in.read();
+ }
+ if (fPlayback) {
+ if (fByteOffset >= fByteLength) {
+ // nothing was buffered, so there is nothing to play back
+ fCleared = true;
+ fByteBuffer = null;
+ return in.read();
+ }
+ int c = fByteBuffer[fByteOffset++] & 0xFF;
+ if (fByteOffset == fByteLength) {
+ fCleared = true;
+ fByteBuffer = null;
+ }
+ if (DEBUG_PLAYBACK) {
+ System.out.println(")read -> "+(char)c);
+ }
+ return c;
+ }
+ int c = in.read();
+ if (c != -1) {
+ if (fByteLength == fByteBuffer.length) {
+ byte[] newarray = new byte[fByteLength + 1024];
+ System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength);
+ fByteBuffer = newarray;
+ }
+ fByteBuffer[fByteLength++] = (byte)c;
+ }
+ if (DEBUG_PLAYBACK) {
+ System.out.println(")read -> "+(char)c);
+ }
+ return c;
+ } // read():int
+
+ /** Read an array of bytes. */
+ public int read(byte[] array) throws IOException {
+ return read(array, 0, array.length);
+ } // read(byte[]):int
+
+ /**
+ * Reads bytes into part of an array, using the same order of sources
+ * as the single-byte read: pushed back bytes, then buffered bytes in
+ * playback mode, then the underlying stream.
+ *
+ * @param array The destination array.
+ * @param offset The offset of the first byte to store.
+ * @param length The maximum number of bytes to read.
+ *
+ * @return The number of bytes stored, or -1 at the end of the stream.
+ */
+ public int read(byte[] array, int offset, int length) throws IOException {
+ if (DEBUG_PLAYBACK) {
+ System.out.println("(read("+offset+','+length+')');
+ }
+ if (fPushbackOffset < fPushbackLength) {
+ int count = fPushbackLength - fPushbackOffset;
+ if (count > length) {
+ count = length;
+ }
+ System.arraycopy(fByteBuffer, fPushbackOffset, array, offset, count);
+ fPushbackOffset += count;
+ return count;
+ }
+ if (fCleared) {
+ return in.read(array, offset, length);
+ }
+ if (fPlayback) {
+ if (fByteOffset >= fByteLength) {
+ // nothing was buffered, so there is nothing to play back
+ fCleared = true;
+ fByteBuffer = null;
+ return in.read(array, offset, length);
+ }
+ if (fByteOffset + length > fByteLength) {
+ length = fByteLength - fByteOffset;
+ }
+ System.arraycopy(fByteBuffer, fByteOffset, array, offset, length);
+ fByteOffset += length;
+ if (fByteOffset == fByteLength) {
+ fCleared = true;
+ fByteBuffer = null;
+ }
+ return length;
+ }
+ int count = in.read(array, offset, length);
+ if (count != -1) {
+ if (fByteLength + count > fByteBuffer.length) {
+ byte[] newarray = new byte[fByteLength + count + 512];
+ System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength);
+ fByteBuffer = newarray;
+ }
+ System.arraycopy(array, offset, fByteBuffer, fByteLength, count);
+ fByteLength += count;
+ }
+ if (DEBUG_PLAYBACK) {
+ System.out.println(")read("+offset+','+length+") -> "+count);
+ }
+ return count;
+ } // read(byte[],int,int):int
+
+ } // class PlaybackInputStream
+
+ /**
+ * Location infoset item. Holds the line and column numbers at which an
+ * event begins and ends.
+ *
+ * @author Andy Clark
+ */
+ protected static class LocationItem
+ implements HTMLEventInfo {
+
+ //
+ // Data
+ //
+
+ /** Line number at which the event begins. */
+ protected int fBeginLineNumber;
+
+ /** Column number at which the event begins. */
+ protected int fBeginColumnNumber;
+
+ /** Line number at which the event ends. */
+ protected int fEndLineNumber;
+
+ /** Column number at which the event ends. */
+ protected int fEndColumnNumber;
+
+ //
+ // Public methods
+ //
+
+ /**
+ * Sets all four location values of this item.
+ *
+ * @param beginLine The beginning line number.
+ * @param beginColumn The beginning column number.
+ * @param endLine The ending line number.
+ * @param endColumn The ending column number.
+ */
+ public void setValues(int beginLine, int beginColumn,
+ int endLine, int endColumn) {
+ this.fBeginLineNumber = beginLine;
+ this.fBeginColumnNumber = beginColumn;
+ this.fEndLineNumber = endLine;
+ this.fEndColumnNumber = endColumn;
+ } // setValues(int,int,int,int)
+
+ //
+ // HTMLEventInfo methods
+ //
+
+ // location information
+
+ /** Returns the line number at which this event begins. */
+ public int getBeginLineNumber() {
+ return this.fBeginLineNumber;
+ } // getBeginLineNumber():int
+
+ /** Returns the column number at which this event begins. */
+ public int getBeginColumnNumber() {
+ return this.fBeginColumnNumber;
+ } // getBeginColumnNumber():int
+
+ /** Returns the line number at which this event ends. */
+ public int getEndLineNumber() {
+ return this.fEndLineNumber;
+ } // getEndLineNumber():int
+
+ /** Returns the column number at which this event ends. */
+ public int getEndColumnNumber() {
+ return this.fEndColumnNumber;
+ } // getEndColumnNumber():int
+
+ // other information
+
+ /** Always returns false: a location item never reports its event as synthesized. */
+ public boolean isSynthesized() {
+ return false;
+ } // isSynthesized():boolean
+
+ //
+ // Object methods
+ //
+
+ /**
+ * Returns the four location values separated by colons, in the order
+ * begin line, begin column, end line, end column.
+ */
+ public String toString() {
+ return String.valueOf(fBeginLineNumber)
+ + ':' + fBeginColumnNumber
+ + ':' + fEndLineNumber
+ + ':' + fEndColumnNumber;
+ } // toString():String
+
+ } // class LocationItem
+
+} // class HTMLScanner
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLTagBalancer.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLTagBalancer.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/HTMLTagBalancer.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,1234 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+
+import org.apache.xerces.util.XMLAttributesImpl;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLDocumentHandler;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLComponentManager;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLDocumentSource;
+
+/**
+ * Balances tags in an HTML document. This component receives document events
+ * and tries to correct many common mistakes that human (and computer) HTML
+ * document authors make. This tag balancer can:
+ * <ul>
+ * <li>add missing parent elements;
+ * <li>automatically close elements with optional end tags; and
+ * <li>handle mis-matched inline element tags.
+ * </ul>
+ * <p>
+ * This component recognizes the following features:
+ * <ul>
+ * <li>http://cyberneko.org/html/features/augmentations
+ * <li>http://cyberneko.org/html/features/report-errors
+ * <li>http://cyberneko.org/html/features/balance-tags/document-fragment
+ * <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
+ * </ul>
+ * <p>
+ * This component recognizes the following properties:
+ * <ul>
+ * <li>http://cyberneko.org/html/properties/names/elems
+ * <li>http://cyberneko.org/html/properties/names/attrs
+ * <li>http://cyberneko.org/html/properties/error-reporter
+ * </ul>
+ *
+ * @see HTMLElements
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
+ */
+public class HTMLTagBalancer
+ implements XMLDocumentFilter, HTMLComponent {
+
+ //
+ // Constants
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
+
+ /** Include infoset augmentations. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Report errors. */
+ protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
+
+ /** Document fragment balancing only (deprecated). */
+ protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";
+
+ /** Document fragment balancing only. */
+ protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";
+
+ /** Ignore outside content. */
+ protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";
+
+ /** Recognized features. */
+ private static final String[] RECOGNIZED_FEATURES = {
+ NAMESPACES,
+ AUGMENTATIONS,
+ REPORT_ERRORS,
+ DOCUMENT_FRAGMENT_DEPRECATED,
+ DOCUMENT_FRAGMENT,
+ IGNORE_OUTSIDE_CONTENT,
+ };
+
+ /** Recognized features defaults. */
+ private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
+ null,
+ null,
+ null,
+ null,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ };
+
+ // properties
+
+ /** Modify HTML element names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
+
+ /** Modify HTML attribute names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
+
+ /** Error reporter. */
+ protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
+
+ /** Recognized properties. */
+ private static final String[] RECOGNIZED_PROPERTIES = {
+ NAMES_ELEMS,
+ NAMES_ATTRS,
+ ERROR_REPORTER,
+ };
+
+ /** Recognized properties defaults. */
+ private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
+ null,
+ null,
+ null,
+ };
+
+ // modify HTML names
+
+ /** Don't modify HTML names. */
+ protected static final short NAMES_NO_CHANGE = 0;
+
+ /** Match HTML element names. */
+ protected static final short NAMES_MATCH = 0;
+
+ /** Uppercase HTML names. */
+ protected static final short NAMES_UPPERCASE = 1;
+
+ /** Lowercase HTML names. */
+ protected static final short NAMES_LOWERCASE = 2;
+
+ // static vars
+
+ /** Synthesized event info item. */
+ protected static final HTMLEventInfo SYNTHESIZED_ITEM =
+ new HTMLEventInfo.SynthesizedItem();
+
+ //
+ // Data
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected boolean fNamespaces;
+
+ /** Include infoset augmentations. */
+ protected boolean fAugmentations;
+
+ /** Report errors. */
+ protected boolean fReportErrors;
+
+ /** Document fragment balancing only. */
+ protected boolean fDocumentFragment;
+
+ /** Ignore outside content. */
+ protected boolean fIgnoreOutsideContent;
+
+ // properties
+
+ /** Modify HTML element names. */
+ protected short fNamesElems;
+
+ /** Modify HTML attribute names. */
+ protected short fNamesAttrs;
+
+ /** Error reporter. */
+ protected HTMLErrorReporter fErrorReporter;
+
+ // connections
+
+ /** The document source. */
+ protected XMLDocumentSource fDocumentSource;
+
+ /** The document handler. */
+ protected XMLDocumentHandler fDocumentHandler;
+
+ // state
+
+ /** The element stack. */
+ protected final InfoStack fElementStack = new InfoStack();
+
+ /** The inline stack. */
+ protected final InfoStack fInlineStack = new InfoStack();
+
+ /** True if seen anything. Important for xml declaration. */
+ protected boolean fSeenAnything;
+
+ /** True if the doctype declaration has been seen. */
+ protected boolean fSeenDoctype;
+
+ /** True if root element has been seen. */
+ protected boolean fSeenRootElement;
+
+ /**
+ * True if seen the end of the document element. In other words,
+ * this variable is set to false <em>until</em> the end </HTML>
+ * tag is seen (or synthesized). This is used to ensure that
+ * extraneous events after the end of the document element do not
+ * make the document stream ill-formed.
+ */
+ protected boolean fSeenRootElementEnd;
+
+ /** True if seen <head> element. */
+ protected boolean fSeenHeadElement;
+
+ /** True if seen <body> element. */
+ protected boolean fSeenBodyElement;
+
+ // temp vars
+
+ /** A qualified name. */
+ private final QName fQName = new QName();
+
+ /** Empty attributes. */
+ private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();
+
+ /** Augmentations. */
+ private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
+
+ //
+ // HTMLComponent methods
+ //
+
+ /**
+ * Looks up the default state of a feature.
+ *
+ * @param featureId The feature identifier.
+ * @return The default state listed for the feature, or null when the
+ * feature is not recognized or has no default.
+ */
+ public Boolean getFeatureDefault(String featureId) {
+ int index = 0;
+ while (index < RECOGNIZED_FEATURES.length) {
+ if (RECOGNIZED_FEATURES[index].equals(featureId)) {
+ return RECOGNIZED_FEATURES_DEFAULTS[index];
+ }
+ index++;
+ }
+ return null;
+ } // getFeatureDefault(String):Boolean
+
+ /**
+ * Looks up the default value of a property.
+ *
+ * @param propertyId The property identifier.
+ * @return The default value listed for the property, or null when the
+ * property is not recognized or has no default.
+ */
+ public Object getPropertyDefault(String propertyId) {
+ int index = 0;
+ while (index < RECOGNIZED_PROPERTIES.length) {
+ if (RECOGNIZED_PROPERTIES[index].equals(propertyId)) {
+ return RECOGNIZED_PROPERTIES_DEFAULTS[index];
+ }
+ index++;
+ }
+ return null;
+ } // getPropertyDefault(String):Object
+
+ //
+ // XMLComponent methods
+ //
+
+ /**
+ * Returns the identifiers of the features recognized by this component.
+ * A copy is returned so that callers cannot modify the list shared by
+ * all instances of this class.
+ *
+ * @return A new array holding the recognized feature identifiers.
+ */
+ public String[] getRecognizedFeatures() {
+ return (String[])RECOGNIZED_FEATURES.clone();
+ } // getRecognizedFeatures():String[]
+
+ /**
+ * Returns the identifiers of the properties recognized by this
+ * component. A copy is returned so that callers cannot modify the list
+ * shared by all instances of this class.
+ *
+ * @return A new array holding the recognized property identifiers.
+ */
+ public String[] getRecognizedProperties() {
+ return (String[])RECOGNIZED_PROPERTIES.clone();
+ } // getRecognizedProperties():String[]
+
+ /**
+ * Resets the component by reading its feature states and property
+ * values from the given component manager.
+ *
+ * @param manager The component manager to query.
+ *
+ * @throws XMLConfigurationException Declared by this method;
+ * propagated from the manager.
+ */
+ public void reset(XMLComponentManager manager)
+ throws XMLConfigurationException {
+
+ // features
+ fNamespaces = manager.getFeature(NAMESPACES);
+ fAugmentations = manager.getFeature(AUGMENTATIONS);
+ fReportErrors = manager.getFeature(REPORT_ERRORS);
+ // the deprecated fragment feature is only consulted when the current one is off
+ boolean fragment = manager.getFeature(DOCUMENT_FRAGMENT);
+ if (!fragment) {
+ fragment = manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED);
+ }
+ fDocumentFragment = fragment;
+ fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT);
+
+ // properties
+ final Object elemNames = manager.getProperty(NAMES_ELEMS);
+ fNamesElems = getNamesValue(String.valueOf(elemNames));
+ final Object attrNames = manager.getProperty(NAMES_ATTRS);
+ fNamesAttrs = getNamesValue(String.valueOf(attrNames));
+ final Object reporter = manager.getProperty(ERROR_REPORTER);
+ fErrorReporter = (HTMLErrorReporter)reporter;
+
+ } // reset(XMLComponentManager)
+
+ /**
+ * Sets the state of a feature. Only the augmentations, report-errors
+ * and ignore-outside-content features are stored; any other identifier
+ * is ignored.
+ *
+ * @param featureId The feature identifier.
+ * @param state The new state of the feature.
+ */
+ public void setFeature(String featureId, boolean state)
+ throws XMLConfigurationException {
+
+ if (featureId.equals(AUGMENTATIONS)) {
+ fAugmentations = state;
+ }
+ else if (featureId.equals(REPORT_ERRORS)) {
+ fReportErrors = state;
+ }
+ else if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) {
+ fIgnoreOutsideContent = state;
+ }
+
+ } // setFeature(String,boolean)
+
+ /**
+ * Sets the value of a property. Only the element-names and
+ * attribute-names properties are stored; any other identifier is
+ * ignored.
+ *
+ * @param propertyId The property identifier.
+ * @param value The new value; it is converted with
+ * <code>String.valueOf</code> before it is interpreted.
+ */
+ public void setProperty(String propertyId, Object value)
+ throws XMLConfigurationException {
+
+ final boolean forElems = propertyId.equals(NAMES_ELEMS);
+ if (forElems) {
+ fNamesElems = getNamesValue(String.valueOf(value));
+ }
+ else if (propertyId.equals(NAMES_ATTRS)) {
+ fNamesAttrs = getNamesValue(String.valueOf(value));
+ }
+
+ } // setProperty(String,Object)
+
+ //
+ // XMLDocumentSource methods
+ //
+
+ /**
+ * Sets the document handler that receives the events passed on by this
+ * component.
+ *
+ * @param handler The document handler; may be null, in which case the
+ * methods of this class skip their handler calls.
+ */
+ public void setDocumentHandler(XMLDocumentHandler handler) {
+ fDocumentHandler = handler;
+ } // setDocumentHandler(XMLDocumentHandler)
+
+ // @since Xerces 2.1.0
+
+ /**
+ * Returns the document handler.
+ *
+ * @return The handler last passed to <code>setDocumentHandler</code>,
+ * or null if none has been set.
+ */
+ public XMLDocumentHandler getDocumentHandler() {
+ return fDocumentHandler;
+ } // getDocumentHandler():XMLDocumentHandler
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ // since Xerces-J 2.2.0
+
+ /**
+ * Start document. Resets the balancing state (element stack and the
+ * "seen" flags) and passes the event on to the document handler.
+ * <p>
+ * The handler is called through reflection: first the four-argument
+ * <code>startDocument</code> that takes a namespace context is looked
+ * up; if the handler class has no such method, the three-argument form
+ * without the namespace context is used instead.
+ *
+ * @param locator The document locator.
+ * @param encoding The document encoding.
+ * @param nscontext The namespace context; only passed on to handlers
+ * that declare the four-argument method.
+ * @param augs The augmentations.
+ *
+ * @throws XNIException Thrown, wrapping the reflection exception, if
+ * the handler method cannot be found, accessed
+ * or itself throws.
+ */
+ public void startDocument(XMLLocator locator, String encoding,
+ NamespaceContext nscontext, Augmentations augs)
+ throws XNIException {
+
+ // reset state
+ fElementStack.top = 0;
+ fSeenAnything = false;
+ fSeenDoctype = false;
+ fSeenRootElement = false;
+ fSeenRootElementEnd = false;
+ fSeenHeadElement = false;
+ fSeenBodyElement = false;
+
+ // pass on event
+ if (fDocumentHandler != null) {
+ try {
+ // NOTE: Hack to allow the default filter to work with
+ // old and new versions of the XNI document handler
+ // interface. -Ac
+ Class cls = fDocumentHandler.getClass();
+ Class[] types = {
+ XMLLocator.class, String.class,
+ NamespaceContext.class, Augmentations.class
+ };
+ Method method = cls.getMethod("startDocument", types); // newer, four-argument form
+ Object[] params = {
+ locator, encoding,
+ nscontext, augs
+ };
+ method.invoke(fDocumentHandler, params);
+ }
+ catch (IllegalAccessException e) {
+ throw new XNIException(e);
+ }
+ catch (InvocationTargetException e) {
+ throw new XNIException(e); // wraps the reflection exception, not the exception thrown by the handler
+ }
+ catch (NoSuchMethodException e) {
+ try {
+ // NOTE: Hack to allow the default filter to work with
+ // old and new versions of the XNI document handler
+ // interface. -Ac
+ Class cls = fDocumentHandler.getClass();
+ Class[] types = {
+ XMLLocator.class, String.class, Augmentations.class
+ };
+ Method method = cls.getMethod("startDocument", types); // older, three-argument form
+ Object[] params = {
+ locator, encoding, augs
+ };
+ method.invoke(fDocumentHandler, params);
+ }
+ catch (IllegalAccessException ex) {
+ // NOTE: Should never reach here!
+ throw new XNIException(ex);
+ }
+ catch (InvocationTargetException ex) {
+ // NOTE: Should never reach here!
+ throw new XNIException(ex);
+ }
+ catch (NoSuchMethodException ex) {
+ // NOTE: Should never reach here!
+ throw new XNIException(ex);
+ }
+ }
+ }
+
+ } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+ // old methods
+
+ /**
+ * XML declaration. The declaration is only passed on to the document
+ * handler when nothing else has been seen in the document yet.
+ *
+ * @param version The XML version.
+ * @param encoding The encoding.
+ * @param standalone The standalone value.
+ * @param augs The augmentations.
+ */
+ public void xmlDecl(String version, String encoding, String standalone,
+ Augmentations augs) throws XNIException {
+ if (fSeenAnything || fDocumentHandler == null) {
+ return;
+ }
+ fDocumentHandler.xmlDecl(version, encoding, standalone, augs);
+ } // xmlDecl(String,String,String,Augmentations)
+
+ /**
+ * Doctype declaration. Only the first declaration that arrives before
+ * the root element is passed on to the document handler. With error
+ * reporting on, a declaration after the root element is reported as
+ * "HTML2010" and a repeated declaration as "HTML2011".
+ *
+ * @param rootElementName The name of the root element.
+ * @param publicId The public identifier.
+ * @param systemId The system identifier.
+ * @param augs The augmentations.
+ */
+ public void doctypeDecl(String rootElementName, String publicId, String systemId,
+ Augmentations augs) throws XNIException {
+ fSeenAnything = true;
+ if (fReportErrors) {
+ final String code = fSeenRootElement ? "HTML2010"
+ : (fSeenDoctype ? "HTML2011" : null);
+ if (code != null) {
+ fErrorReporter.reportError(code, null);
+ }
+ }
+ if (fSeenRootElement || fSeenDoctype) {
+ return;
+ }
+ fSeenDoctype = true;
+ if (fDocumentHandler != null) {
+ fDocumentHandler.doctypeDecl(rootElementName, publicId, systemId, augs);
+ }
+ } // doctypeDecl(String,String,String,Augmentations)
+
+ /**
+ * End document. When no root element was seen and the component is not
+ * balancing a document fragment, an empty "html" element is
+ * synthesized. Otherwise every element left on the element stack is
+ * popped and ended. The end document event is then passed on to the
+ * document handler.
+ *
+ * @param augs The augmentations passed on with the end document event.
+ */
+ public void endDocument(Augmentations augs) throws XNIException {
+
+ final boolean emptyDocument = !fSeenRootElement && !fDocumentFragment;
+ if (emptyDocument) {
+ // no root element was seen: synthesize an empty html element
+ if (fReportErrors) {
+ fErrorReporter.reportError("HTML2000", null);
+ }
+ final String htmlName = modifyName("html", fNamesElems);
+ fQName.setValues(null, htmlName, htmlName, null);
+ if (fDocumentHandler != null) {
+ callStartElement(fQName, emptyAttributes(), synthesizedAugs());
+ callEndElement(fQName, synthesizedAugs());
+ }
+ }
+ else {
+ // end the elements that are still open, in the order they are popped
+ for (int remaining = fElementStack.top; remaining > 0; remaining--) {
+ final Info open = fElementStack.pop();
+ if (fReportErrors) {
+ final String openName = open.qname.rawname;
+ fErrorReporter.reportWarning("HTML2001", new Object[]{openName});
+ }
+ if (fDocumentHandler != null) {
+ callEndElement(open.qname, synthesizedAugs());
+ }
+ }
+ }
+
+ // finally pass on the event itself
+ if (fDocumentHandler != null) {
+ fDocumentHandler.endDocument(augs);
+ }
+
+ } // endDocument(Augmentations)
+
+ /**
+ * Comment. Marks that something has been seen in the document and
+ * passes the comment on to the document handler, if one is set.
+ *
+ * @param text The comment text.
+ * @param augs The augmentations.
+ */
+ public void comment(XMLString text, Augmentations augs) throws XNIException {
+ fSeenAnything = true; // an XML declaration arriving after this is no longer passed on
+ if (fDocumentHandler != null) {
+ fDocumentHandler.comment(text, augs);
+ }
+ } // comment(XMLString,Augmentations)
+
+ /**
+ * Processing instruction. Marks that something has been seen in the
+ * document and passes the instruction on to the document handler, if
+ * one is set.
+ *
+ * @param target The target of the processing instruction.
+ * @param data The data of the processing instruction.
+ * @param augs The augmentations.
+ */
+ public void processingInstruction(String target, XMLString data,
+ Augmentations augs) throws XNIException {
+ fSeenAnything = true; // an XML declaration arriving after this is no longer passed on
+ if (fDocumentHandler != null) {
+ fDocumentHandler.processingInstruction(target, data, augs);
+ }
+ } // processingInstruction(String,XMLString,Augmentations)
+
+ /**
+ * Start element. Balances the element against the currently open
+ * elements before passing it on to the document handler:
+ * <ol>
+ * <li>the event is dropped after the end of the root element and for
+ * repeated html, head and body elements;
+ * <li>a missing parent element is synthesized by calling this method
+ * recursively with the parent name;
+ * <li>for an element whose flags are zero, the inline elements open
+ * directly above it are ended and remembered;
+ * <li>open elements that this element closes are popped and ended;
+ * <li>the element is passed on, as an empty element if it is declared
+ * empty and otherwise as a start element that is pushed on the
+ * element stack;
+ * <li>the remembered inline elements are started again.
+ * </ol>
+ *
+ * @param elem The element name.
+ * @param attrs The element attributes; null is replaced by an empty
+ * attribute list before the handler is called.
+ * @param augs The augmentations.
+ */
+ public void startElement(QName elem, XMLAttributes attrs, Augmentations augs)
+ throws XNIException {
+ fSeenAnything = true;
+
+ // check for end of document
+ if (fSeenRootElementEnd) {
+ return;
+ }
+
+ // get element information
+ HTMLElements.Element element = getElement(elem.rawname);
+
+ // ignore multiple html, head, body elements
+ if (fSeenRootElement && element.code == HTMLElements.HTML) {
+ return;
+ }
+ if (element.code == HTMLElements.HEAD) {
+ if (fSeenHeadElement) {
+ return;
+ }
+ fSeenHeadElement = true;
+ }
+ if (element.code == HTMLElements.BODY) {
+ if (fSeenBodyElement) {
+ return;
+ }
+ fSeenBodyElement = true;
+ }
+
+ // check proper parent
+ if (element.parent != null) {
+ if (!fSeenRootElement && !fDocumentFragment) {
+ String pname = element.parent[0].name;
+ pname = modifyName(pname, fNamesElems);
+ if (fReportErrors) {
+ String ename = elem.rawname;
+ fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname});
+ }
+ QName qname = new QName(null, pname, pname, null);
+ startElement(qname, null, synthesizedAugs()); // recursive call synthesizes the first listed parent
+ }
+ else {
+ HTMLElements.Element pelement = element.parent[0];
+ if (pelement.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) {
+ int depth = getParentDepth(element.parent, element.bounds);
+ if (depth == -1) {
+ String pname = pelement.name;
+ pname = modifyName(pname, fNamesElems);
+ int pdepth = getParentDepth(pelement.parent, pelement.bounds);
+ if (pdepth != -1) {
+ QName qname = new QName(null, pname, pname, null);
+ if (fReportErrors) {
+ String ename = elem.rawname;
+ fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname});
+ }
+ startElement(qname, null, synthesizedAugs());
+ }
+ }
+ }
+ }
+ }
+
+ // if block element, save immediate parent inline elements
+ int depth = 0; // number of inline elements to re-open at the end
+ if (element.flags == 0) {
+ int length = fElementStack.top;
+ fInlineStack.top = 0;
+ for (int i = length - 1; i >= 0; i--) {
+ Info info = fElementStack.data[i];
+ if (!info.element.isInline()) {
+ break;
+ }
+ fInlineStack.push(info);
+ endElement(info.qname, synthesizedAugs());
+ }
+ depth = fInlineStack.top;
+ }
+
+ // close previous elements
+ if (element.closes != null) {
+ int length = fElementStack.top;
+ for (int i = length - 1; i >= 0; i--) {
+ Info info = fElementStack.data[i];
+
+ // does it close the element we're looking at?
+ if (element.closes(info.element.code)) {
+ if (fReportErrors) {
+ String ename = elem.rawname;
+ String iname = info.qname.rawname;
+ fErrorReporter.reportWarning("HTML2005", new Object[]{ename,iname});
+ }
+ for (int j = length - 1; j >= i; j--) {
+ info = fElementStack.pop();
+ if (fDocumentHandler != null) {
+ // PATCH: Marc-André Morissette
+ callEndElement(info.qname, synthesizedAugs());
+ }
+ }
+ length = i;
+ continue;
+ }
+
+ // should we stop searching?
+ boolean container = info.element.isContainer();
+ boolean parent = false;
+ if (!container) {
+ for (int j = 0; j < element.parent.length; j++) { // NOTE(review): element.parent is not null-checked here, unlike in the parent check above -- confirm it is never null when closes is set
+ parent = parent || info.element.code == element.parent[j].code;
+ }
+ }
+ if (container || parent) {
+ break;
+ }
+ }
+ }
+
+ // call handler
+ fSeenRootElement = true;
+ if (element != null && element.isEmpty()) { // NOTE(review): element was already dereferenced above, so this null check looks redundant
+ if (attrs == null) {
+ attrs = emptyAttributes();
+ }
+ if (fDocumentHandler != null) {
+ fDocumentHandler.emptyElement(elem, attrs, augs);
+ }
+ }
+ else {
+ boolean inline = element != null && element.isInline();
+ fElementStack.push(new Info(element, elem, inline ? attrs : null)); // attributes are only kept for inline elements
+ if (attrs == null) {
+ attrs = emptyAttributes();
+ }
+ if (fDocumentHandler != null) {
+ callStartElement(elem, attrs, augs);
+ }
+ }
+
+ // re-open inline elements
+ for (int i = 0; i < depth; i++) {
+ Info info = fInlineStack.pop();
+ startElement(info.qname, info.attributes, synthesizedAugs());
+ }
+
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+    /**
+     * Handles an empty element by running it through the regular start
+     * element processing immediately followed by the end element
+     * processing, so it is balanced like any other element.
+     *
+     * @param elem  The name of the element.
+     * @param attrs The attributes of the element.
+     * @param augs  Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the start/end element handling.
+     */
+    public void emptyElement(final QName elem, final XMLAttributes attrs, final Augmentations augs)
+        throws XNIException {
+        // open the element ...
+        startElement(elem, attrs, augs);
+        // ... and close it again right away
+        endElement(elem, augs);
+    } // emptyElement(QName,XMLAttributes,Augmentations)
+
+    /**
+     * Start of a general entity. Unless a document fragment is being
+     * balanced, a body element is synthesized first when no root element
+     * has been seen yet or when the innermost open element is the head or
+     * html element; the event is then forwarded to the document handler.
+     *
+     * @param name     The entity name.
+     * @param id       The resource identifier of the entity.
+     * @param encoding The encoding of the entity.
+     * @param augs     Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the element handling or handler.
+     */
+    public void startGeneralEntity(String name,
+                                   XMLResourceIdentifier id,
+                                   String encoding,
+                                   Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+
+        // nothing is forwarded once the root element has been closed
+        if (fSeenRootElementEnd) {
+            return;
+        }
+
+        if (!fDocumentFragment) {
+            boolean needsBody;
+            if (fSeenRootElement) {
+                // content directly below head or html has to move into the body
+                final Info current = fElementStack.peek();
+                needsBody = current.element.code == HTMLElements.HEAD ||
+                            current.element.code == HTMLElements.HTML;
+                if (needsBody) {
+                    final String headName = modifyName("head", fNamesElems);
+                    final String bodyName = modifyName("body", fNamesElems);
+                    if (fReportErrors) {
+                        fErrorReporter.reportWarning("HTML2009", new Object[]{headName,bodyName});
+                    }
+                    fQName.setValues(null, headName, headName, null);
+                    endElement(fQName, synthesizedAugs());
+                }
+            }
+            else {
+                // no root element yet: the body always has to be created
+                needsBody = true;
+            }
+
+            if (needsBody) {
+                final String bodyName = modifyName("body", fNamesElems);
+                fQName.setValues(null, bodyName, bodyName, null);
+                if (fReportErrors) {
+                    fErrorReporter.reportWarning("HTML2006", new Object[]{bodyName});
+                }
+                startElement(fQName, null, synthesizedAugs());
+            }
+        }
+
+        // pass the event on
+        if (fDocumentHandler != null) {
+            fDocumentHandler.startGeneralEntity(name, id, encoding, augs);
+        }
+
+    } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
+
+    /**
+     * Text declaration. Records that content has been seen and forwards
+     * the event to the document handler unless the root element has
+     * already been closed.
+     *
+     * @param version  The declared version.
+     * @param encoding The declared encoding.
+     * @param augs     Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the document handler.
+     */
+    public void textDecl(String version, String encoding, Augmentations augs)
+        throws XNIException {
+        fSeenAnything = true;
+
+        // forward only while the document is still open and a handler is set
+        if (!fSeenRootElementEnd && fDocumentHandler != null) {
+            fDocumentHandler.textDecl(version, encoding, augs);
+        }
+
+    } // textDecl(String,String,Augmentations)
+
+    /**
+     * End of a general entity. The event is forwarded to the document
+     * handler unless the root element has already been closed.
+     *
+     * @param name The entity name.
+     * @param augs Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the document handler.
+     */
+    public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
+
+        // forward only while the document is still open and a handler is set
+        if (!fSeenRootElementEnd && fDocumentHandler != null) {
+            fDocumentHandler.endGeneralEntity(name, augs);
+        }
+
+    } // endGeneralEntity(String,Augmentations)
+
+    /**
+     * Start of a CDATA section. Records that content has been seen and
+     * forwards the event to the document handler unless the root element
+     * has already been closed.
+     *
+     * @param augs Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the document handler.
+     */
+    public void startCDATA(Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+
+        // forward only while the document is still open and a handler is set
+        if (!fSeenRootElementEnd && fDocumentHandler != null) {
+            fDocumentHandler.startCDATA(augs);
+        }
+
+    } // startCDATA(Augmentations)
+
+    /**
+     * End of a CDATA section. The event is forwarded to the document
+     * handler unless the root element has already been closed.
+     *
+     * @param augs Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the document handler.
+     */
+    public void endCDATA(Augmentations augs) throws XNIException {
+
+        // forward only while the document is still open and a handler is set
+        if (!fSeenRootElementEnd && fDocumentHandler != null) {
+            fDocumentHandler.endCDATA(augs);
+        }
+
+    } // endCDATA(Augmentations)
+
+    /**
+     * Character content. Unless a document fragment is being balanced,
+     * whitespace-only text before the root element is dropped, other text
+     * before the root element opens a synthesized body element, and
+     * non-whitespace text found directly inside the head or html element
+     * closes the head and opens the body. The text is then forwarded to
+     * the document handler.
+     *
+     * @param text The character content.
+     * @param augs Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the element handling or handler.
+     */
+    public void characters(XMLString text, Augmentations augs) throws XNIException {
+
+        // nothing is forwarded once the root element has been closed
+        if (fSeenRootElementEnd) {
+            return;
+        }
+
+        // scan forward to the first non-whitespace character, if any
+        final int limit = text.offset + text.length;
+        int cursor = text.offset;
+        while (cursor < limit && Character.isWhitespace(text.ch[cursor])) {
+            cursor++;
+        }
+        final boolean blank = cursor >= limit;
+
+        if (!fDocumentFragment) {
+            if (fSeenRootElement) {
+                // character content in head
+                // NOTE: This frequently happens when the document looks like:
+                //       <title>Title</title>
+                //       And here's some text.
+                if (!blank) {
+                    final Info current = fElementStack.peek();
+                    if (current.element.code == HTMLElements.HEAD ||
+                        current.element.code == HTMLElements.HTML) {
+                        final String headName = modifyName("head", fNamesElems);
+                        final String bodyName = modifyName("body", fNamesElems);
+                        if (fReportErrors) {
+                            fErrorReporter.reportWarning("HTML2009", new Object[]{headName,bodyName});
+                        }
+                        fQName.setValues(null, headName, headName, null);
+                        endElement(fQName, synthesizedAugs());
+                        fQName.setValues(null, bodyName, bodyName, null);
+                        startElement(fQName, null, synthesizedAugs());
+                    }
+                }
+            }
+            else {
+                // bare characters before any element: whitespace is dropped,
+                // anything else goes into a synthesized body
+                if (blank) {
+                    return;
+                }
+                final String bodyName = modifyName("body", fNamesElems);
+                fQName.setValues(null, bodyName, bodyName, null);
+                if (fReportErrors) {
+                    fErrorReporter.reportWarning("HTML2006", new Object[]{bodyName});
+                }
+                startElement(fQName, null, synthesizedAugs());
+            }
+        }
+
+        // pass the text on
+        if (fDocumentHandler != null) {
+            fDocumentHandler.characters(text, augs);
+        }
+
+    } // characters(XMLString,Augmentations)
+
+    /**
+     * Ignorable whitespace. Handled exactly like ordinary character
+     * content by delegating to the characters callback.
+     *
+     * @param text The whitespace characters.
+     * @param augs Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the character handling.
+     */
+    public void ignorableWhitespace(final XMLString text, final Augmentations augs)
+        throws XNIException {
+        characters(text, augs);
+    } // ignorableWhitespace(XMLString,Augmentations)
+
+ /**
+ * End element. Closes the open element that matches the given end tag,
+ * first closing every element still open inside of it, and then re-opens
+ * the inline elements that were closed along the way.
+ *
+ * @param element The qualified name of the element being closed.
+ * @param augs Augmentations for the event; they are handed to the
+ * document handler only for the matching element itself,
+ * all other closed elements get synthesized augmentations.
+ *
+ * @throws XNIException Propagated from the element handling or handler.
+ */
+ public void endElement(QName element, Augmentations augs) throws XNIException {
+
+ // is there anything to do?
+ if (fSeenRootElementEnd) {
+ return;
+ }
+
+ // get element information
+ HTMLElements.Element elem = getElement(element.rawname);
+
+ // do we ignore outside content?
+ // NOTE: when outside content is not ignored, the body and html end
+ // tags are dropped here (presumably they get closed at the end
+ // of the document instead -- confirm against endDocument)
+ if (!fIgnoreOutsideContent &&
+ (elem.code == HTMLElements.BODY || elem.code == HTMLElements.HTML)) {
+ return;
+ }
+
+ // check for end of document
+ if (elem.code == HTMLElements.HTML) {
+ fSeenRootElementEnd = true;
+ }
+
+ // empty element
+ // NOTE: an end tag for a paragraph that is not open is turned into
+ // an empty paragraph: it is started with synthesized
+ // augmentations and then closed by calling this method again
+ int depth = getElementDepth(elem);
+ if (depth == -1 && elem.code == HTMLElements.P) {
+ startElement(element, emptyAttributes(), synthesizedAugs());
+ endElement(element, augs);
+ return;
+ }
+
+ // find unbalanced inline elements
+ // NOTE: only the elements above the matching one are inspected
+ // (depth - 1 entries counted from the top of the stack)
+ if (depth > 1 && elem.isInline()) {
+ int size = fElementStack.top;
+ fInlineStack.top = 0;
+ for (int i = 0; i < depth - 1; i++) {
+ Info info = fElementStack.data[size - i - 1];
+ HTMLElements.Element pelem = info.element;
+ if (pelem.isInline()) {
+ // NOTE: I don't have to make a copy of the info because
+ // it will just be popped off of the element stack
+ // as soon as we close it, anyway.
+ fInlineStack.push(info);
+ }
+ }
+ }
+
+ // close children up to appropriate element
+ // NOTE: when depth is -1 (no matching open element) this loop and
+ // the re-open step below do nothing
+ for (int i = 0; i < depth; i++) {
+ Info info = fElementStack.pop();
+ // every element except the last one popped is closed implicitly
+ if (fReportErrors && i < depth - 1) {
+ String ename = modifyName(element.rawname, fNamesElems);
+ String iname = info.qname.rawname;
+ fErrorReporter.reportWarning("HTML2007", new Object[]{ename,iname});
+ }
+ if (fDocumentHandler != null) {
+ // PATCH: Marc-André Morissette
+ callEndElement(info.qname, i < depth - 1 ? synthesizedAugs() : augs);
+ }
+ }
+
+ // re-open inline elements
+ // NOTE(review): the inline stack is reset above only when the closed
+ // element is itself inline; for other elements this pops whatever
+ // the stack currently holds -- confirm it is always empty then
+ if (depth > 1) {
+ int size = fInlineStack.top;
+ for (int i = 0; i < size; i++) {
+ Info info = (Info)fInlineStack.pop();
+ XMLAttributes attributes = info.attributes;
+ if (fReportErrors) {
+ String iname = info.qname.rawname;
+ fErrorReporter.reportWarning("HTML2008", new Object[]{iname});
+ }
+ startElement(info.qname, attributes, synthesizedAugs());
+ }
+ }
+
+ } // endElement(QName,Augmentations)
+
+ // @since Xerces 2.1.0
+
+    /**
+     * Stores the document source this component receives events from.
+     *
+     * @param source The document source.
+     */
+    public void setDocumentSource(final XMLDocumentSource source) {
+        this.fDocumentSource = source;
+    } // setDocumentSource(XMLDocumentSource)
+
+    /**
+     * Returns the document source previously stored with
+     * {@link #setDocumentSource(XMLDocumentSource)}.
+     *
+     * @return The document source.
+     */
+    public XMLDocumentSource getDocumentSource() {
+        return this.fDocumentSource;
+    } // getDocumentSource():XMLDocumentSource
+
+ // removed since Xerces-J 2.3.0
+
+ /** Start document. */
+ public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
+ throws XNIException {
+ startDocument(locator, encoding, null, augs);
+ } // startDocument(XMLLocator,String,Augmentations)
+
+    /**
+     * Start prefix mapping. The callback is looked up reflectively on the
+     * document handler and is skipped silently when the handler does not
+     * declare or expose it.
+     * <p>
+     * Unchecked exceptions and errors thrown by the handler's callback are
+     * unwrapped from the reflective invocation and rethrown, so a failure
+     * in the downstream handler is no longer lost.
+     *
+     * @param prefix The namespace prefix.
+     * @param uri    The namespace URI bound to the prefix.
+     * @param augs   Additional information passed along with the event.
+     *
+     * @throws XNIException Thrown by the document handler.
+     */
+    public void startPrefixMapping(String prefix, String uri, Augmentations augs)
+        throws XNIException {
+
+        // check for end of document
+        if (fSeenRootElementEnd) {
+            return;
+        }
+
+        // call handler
+        if (fDocumentHandler != null) {
+            Class cls = fDocumentHandler.getClass();
+            Class[] types = { String.class, String.class, Augmentations.class };
+            try {
+                Method method = cls.getMethod("startPrefixMapping", types);
+                Object[] args = { prefix, uri, augs };
+                method.invoke(fDocumentHandler, args);
+            }
+            catch (NoSuchMethodException e) {
+                // ignore: the handler does not provide this callback
+            }
+            catch (IllegalAccessException e) {
+                // ignore: the callback cannot be invoked reflectively
+            }
+            catch (InvocationTargetException e) {
+                // the handler itself failed: do not hide its exception
+                Throwable cause = e.getTargetException();
+                if (cause instanceof RuntimeException) {
+                    throw (RuntimeException)cause;
+                }
+                if (cause instanceof Error) {
+                    throw (Error)cause;
+                }
+                // anything else cannot be declared by the callback; ignore
+            }
+        }
+
+    } // startPrefixMapping(String,String,Augmentations)
+
+    /**
+     * End prefix mapping. The callback is looked up reflectively on the
+     * document handler and is skipped silently when the handler does not
+     * declare or expose it.
+     * <p>
+     * Unchecked exceptions and errors thrown by the handler's callback are
+     * unwrapped from the reflective invocation and rethrown, so a failure
+     * in the downstream handler is no longer lost.
+     *
+     * @param prefix The namespace prefix going out of scope.
+     * @param augs   Additional information passed along with the event.
+     *
+     * @throws XNIException Thrown by the document handler.
+     */
+    public void endPrefixMapping(String prefix, Augmentations augs)
+        throws XNIException {
+
+        // check for end of document
+        if (fSeenRootElementEnd) {
+            return;
+        }
+
+        // call handler
+        if (fDocumentHandler != null) {
+            Class cls = fDocumentHandler.getClass();
+            Class[] types = { String.class, Augmentations.class };
+            try {
+                Method method = cls.getMethod("endPrefixMapping", types);
+                Object[] args = { prefix, augs };
+                method.invoke(fDocumentHandler, args);
+            }
+            catch (NoSuchMethodException e) {
+                // ignore: the handler does not provide this callback
+            }
+            catch (IllegalAccessException e) {
+                // ignore: the callback cannot be invoked reflectively
+            }
+            catch (InvocationTargetException e) {
+                // the handler itself failed: do not hide its exception
+                Throwable cause = e.getTargetException();
+                if (cause instanceof RuntimeException) {
+                    throw (RuntimeException)cause;
+                }
+                if (cause instanceof Error) {
+                    throw (Error)cause;
+                }
+                // anything else cannot be declared by the callback; ignore
+            }
+        }
+
+    } // endPrefixMapping(String,Augmentations)
+
+ //
+ // Protected methods
+ //
+
+ /** Returns an HTML element. */
+ protected HTMLElements.Element getElement(String name) {
+ if (fNamespaces) {
+ int index = name.indexOf(':');
+ if (index != -1) {
+ name = name.substring(index+1);
+ }
+ }
+ return HTMLElements.getElement(name);
+ } // getElement(String):HTMLElements.Element
+
+    /**
+     * Forwards a start element event to the document handler. The
+     * handler is not checked for <code>null</code> here.
+     *
+     * @param element The element name.
+     * @param attrs   The element attributes.
+     * @param augs    Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the document handler.
+     */
+    protected final void callStartElement(final QName element, final XMLAttributes attrs,
+                                          final Augmentations augs)
+        throws XNIException {
+        this.fDocumentHandler.startElement(element, attrs, augs);
+    } // callStartElement(QName,XMLAttributes,Augmentations)
+
+    /**
+     * Forwards an end element event to the document handler. The
+     * handler is not checked for <code>null</code> here.
+     *
+     * @param element The element name.
+     * @param augs    Additional information passed along with the event.
+     *
+     * @throws XNIException Propagated from the document handler.
+     */
+    protected final void callEndElement(final QName element, final Augmentations augs)
+        throws XNIException {
+        this.fDocumentHandler.endElement(element, augs);
+    } // callEndElement(QName,Augmentations)
+
+    /**
+     * Searches the element stack from the top down for an open element
+     * with the same code as the given one. For an element that is not a
+     * container the search gives up at the first block element that does
+     * not match.
+     *
+     * @param element The element to look for.
+     *
+     * @return The distance of the match from the top of the stack (1 for
+     *         the topmost entry), or -1 if no match is found.
+     */
+    protected final int getElementDepth(HTMLElements.Element element) {
+        final boolean stopAtBlock = !element.isContainer();
+        int index = fElementStack.top;
+        while (--index >= 0) {
+            final HTMLElements.Element open = fElementStack.data[index].element;
+            if (open.code == element.code) {
+                return fElementStack.top - index;
+            }
+            if (stopAtBlock && open.isBlock()) {
+                return -1;
+            }
+        }
+        return -1;
+    } // getElementDepth(HTMLElements.Element)
+
+ /**
+ * Returns the depth of the open tag associated with the specified
+ * element parent names or -1 if no matching element is found.
+ *
+ * @param parents The parent elements.
+ */
+ protected int getParentDepth(HTMLElements.Element[] parents, short bounds) {
+ if (parents != null) {
+ for (int i = fElementStack.top - 1; i >= 0; i--) {
+ Info info = fElementStack.data[i];
+ if (info.element.code == bounds) {
+ break;
+ }
+ for (int j = 0; j < parents.length; j++) {
+ if (info.element.code == parents[j].code) {
+ return fElementStack.top - i;
+ }
+ }
+ }
+ }
+ return -1;
+ } // getParentDepth(HTMLElements.Element[],short):int
+
+    /**
+     * Clears the shared attributes object and returns it. The same
+     * instance is handed out on every call.
+     *
+     * @return The shared, now empty, attributes object.
+     */
+    protected final XMLAttributes emptyAttributes() {
+        final XMLAttributes shared = fEmptyAttrs;
+        shared.removeAllAttributes();
+        return shared;
+    } // emptyAttributes():XMLAttributes
+
+    /**
+     * Returns the augmentations to attach to events generated by the tag
+     * balancer itself. When augmentations are enabled the shared
+     * augmentations object is cleared and marked with the synthesized
+     * item; otherwise nothing is attached.
+     *
+     * @return The shared augmentations object, or null when
+     *         augmentations are disabled.
+     */
+    protected final Augmentations synthesizedAugs() {
+        if (!fAugmentations) {
+            return null;
+        }
+        final HTMLAugmentations shared = fInfosetAugs;
+        shared.removeAllItems();
+        shared.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
+        return shared;
+    } // synthesizedAugs():Augmentations
+
+ //
+ // Protected static methods
+ //
+
+    /**
+     * Modifies the given name based on the specified mode.
+     * <p>
+     * The case conversion is done with a fixed English locale so that
+     * the result does not depend on the default locale of the JVM (with
+     * a Turkish default locale, for example, "title" would otherwise be
+     * upper-cased with a dotted capital I).
+     *
+     * @param name The name to modify.
+     * @param mode One of NAMES_UPPERCASE or NAMES_LOWERCASE; with any
+     *             other value the name is returned unchanged.
+     *
+     * @return The name in the requested case.
+     */
+    protected static final String modifyName(String name, short mode) {
+        switch (mode) {
+            case NAMES_UPPERCASE: return name.toUpperCase(java.util.Locale.ENGLISH);
+            case NAMES_LOWERCASE: return name.toLowerCase(java.util.Locale.ENGLISH);
+        }
+        return name;
+    } // modifyName(String,short):String
+
+    /**
+     * Maps the textual names setting to its constant: "lower" and
+     * "upper" select the lower-case and upper-case modes, every other
+     * value selects the no-change mode.
+     *
+     * @param value The textual setting; must not be null.
+     *
+     * @return The matching names constant.
+     *
+     * @see #NAMES_NO_CHANGE
+     * @see #NAMES_LOWERCASE
+     * @see #NAMES_UPPERCASE
+     */
+    protected static final short getNamesValue(String value) {
+        final boolean lower = value.equals("lower");
+        if (lower) {
+            return NAMES_LOWERCASE;
+        }
+        final boolean upper = value.equals("upper");
+        if (upper) {
+            return NAMES_UPPERCASE;
+        }
+        return NAMES_NO_CHANGE;
+    } // getNamesValue(String):short
+
+ //
+ // Classes
+ //
+
+    /**
+     * Element info for each start element. This information is used when
+     * closing unbalanced inline elements. For example:
+     * <pre>
+     * &lt;i&gt;unbalanced &lt;b&gt;HTML&lt;/i&gt; content&lt;/b&gt;
+     * </pre>
+     * <p>
+     * It seems that it is a waste of processing and memory to copy the
+     * attributes for every start element even if there are no unbalanced
+     * inline elements in the document. However, if the attributes are
+     * <em>not</em> saved, then important attributes such as style
+     * information would be lost.
+     *
+     * @author Andy Clark
+     */
+    public static class Info {
+
+        //
+        // Data
+        //
+
+        /** The element. */
+        public HTMLElements.Element element;
+
+        /** The element qualified name. */
+        public QName qname;
+
+        /** The element attributes; null when none were saved. */
+        public XMLAttributes attributes;
+
+        //
+        // Constructors
+        //
+
+        /**
+         * Creates an element information object without attributes.
+         * <p>
+         * <strong>Note:</strong>
+         * This constructor makes a copy of the qualified name.
+         *
+         * @param element The element information.
+         * @param qname   The element qualified name.
+         */
+        public Info(HTMLElements.Element element, QName qname) {
+            this(element, qname, null);
+        } // <init>(HTMLElements.Element,QName)
+
+        /**
+         * Creates an element information object.
+         * <p>
+         * <strong>Note:</strong>
+         * This constructor makes a copy of the qualified name and, when
+         * at least one attribute is given, of the attributes. With a null
+         * or empty attribute list the attributes field stays null.
+         *
+         * @param element    The element information.
+         * @param qname      The element qualified name.
+         * @param attributes The element attributes; may be null.
+         */
+        public Info(HTMLElements.Element element,
+                    QName qname, XMLAttributes attributes) {
+            this.element = element;
+            this.qname = new QName(qname);
+            if (attributes != null) {
+                int length = attributes.getLength();
+                if (length > 0) {
+                    QName aqname = new QName();
+                    XMLAttributes newattrs = new XMLAttributesImpl();
+                    for (int i = 0; i < length; i++) {
+                        attributes.getName(i, aqname);
+                        String type = attributes.getType(i);
+                        String value = attributes.getValue(i);
+                        String nonNormalizedValue = attributes.getNonNormalizedValue(i);
+                        boolean specified = attributes.isSpecified(i);
+                        // use the index reported by the copy: it differs from
+                        // the source index when an attribute of the same name
+                        // was already added and is merely updated
+                        int index = newattrs.addAttribute(aqname, type, value);
+                        newattrs.setNonNormalizedValue(index, nonNormalizedValue);
+                        newattrs.setSpecified(index, specified);
+                    }
+                    this.attributes = newattrs;
+                }
+            }
+        } // <init>(HTMLElements.Element,QName,XMLAttributes)
+
+    } // class Info
+
+    /**
+     * A minimal, unsynchronized stack of element information backed by
+     * an array that grows in steps of ten entries. No bounds checking is
+     * done when peeking at or popping from an empty stack.
+     */
+    public static class InfoStack {
+
+        //
+        // Data
+        //
+
+        /** The number of entries on the stack; the next free slot. */
+        public int top;
+
+        /** The backing array; only the first top entries are in use. */
+        public Info[] data = new Info[10];
+
+        //
+        // Public methods
+        //
+
+        /**
+         * Pushes element information onto the stack, enlarging the
+         * backing array by ten slots when it is full.
+         */
+        public void push(Info info) {
+            if (data.length == top) {
+                final Info[] grown = new Info[top + 10];
+                System.arraycopy(data, 0, grown, 0, top);
+                data = grown;
+            }
+            data[top] = info;
+            top++;
+        } // push(Info)
+
+        /** Returns the topmost entry without removing it. */
+        public Info peek() {
+            final int last = top - 1;
+            return data[last];
+        } // peek():Info
+
+        /** Removes the topmost entry and returns it. */
+        public Info pop() {
+            top--;
+            return data[top];
+        } // pop():Info
+
+    } // class InfoStack
+
+} // class HTMLTagBalancer
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/ObjectFactory.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/ObjectFactory.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/ObjectFactory.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,510 @@
+/*
+ * Copyright 2001-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.cyberneko.html;
+
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.File;
+import java.io.FileInputStream;
+
+import java.util.Properties;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * This class is duplicated for each JAXP subpackage so keep it in sync.
+ * It is package private and therefore is not exposed as part of the JAXP
+ * API.
+ * <p>
+ * This code is designed to implement the JAXP 1.1 spec pluggability
+ * feature and is designed to run on JDK version 1.1 and
+ * later, and to compile on JDK 1.2 and onward.
+ * The code also runs both as part of an unbundled jar file and
+ * when bundled as part of the JDK.
+ * <p>
+ *
+ * @version $Id: ObjectFactory.java,v 1.1 2004/03/31 20:00:21 andyc Exp $
+ */
+class ObjectFactory {
+
+ //
+ // Constants
+ //
+
+ // name of default properties file to look for in JDK's jre/lib directory
+ private static final String DEFAULT_PROPERTIES_FILENAME = "xerces.properties";
+
+ /** Set to true for debugging */
+ private static final boolean DEBUG = false;
+
+ /**
+ * Default columns per line.
+ */
+ private static final int DEFAULT_LINE_LENGTH = 80;
+
+ /** cache the contents of the xerces.properties file.
+ * Until an attempt has been made to read this file, this will
+ * be null; if the file does not exist or we encounter some other error
+ * during the read, this will be empty.
+ */
+ private static Properties fXercesProperties = null;
+
+ /**
+ * Cache the time stamp of the xerces.properties file so
+ * that we know if it's been modified and can invalidate
+ * the cache when necessary.
+ */
+ private static long fLastModified = -1;
+
+ //
+ // static methods
+ //
+
+ /**
+ * Finds the implementation Class object in the specified order. The
+ * specified order is the following:
+ * <ol>
+ * <li>query the system property using <code>System.getProperty</code>
+ * <li>read <code>META-INF/services/<i>factoryId</i></code> file
+ * <li>use fallback classname
+ * </ol>
+ *
+ * @return Class object of factory, never null
+ *
+ * @param factoryId Name of the factory to find, same as
+ * a property name
+ * @param fallbackClassName Implementation class name, if nothing else
+ * is found. Use null to mean no fallback.
+ *
+ * @exception ObjectFactory.ConfigurationError
+ */
+ static Object createObject(String factoryId, String fallbackClassName)
+ throws ConfigurationError {
+ return createObject(factoryId, null, fallbackClassName);
+ } // createObject(String,String):Object
+
+    /**
+     * Creates an instance of the implementation class, which is looked
+     * up in the following order:
+     * <ol>
+     * <li>query the system property using <code>System.getProperty</code>
+     * <li>read <code>$java.home/lib/<i>propertiesFilename</i></code> file
+     * <li>read <code>META-INF/services/<i>factoryId</i></code> file
+     * <li>use fallback classname
+     * </ol>
+     * <p>
+     * Streams opened on a properties file are closed even when reading
+     * the file fails.
+     *
+     * @return A new instance of the implementation class, never null
+     *
+     * @param factoryId             Name of the factory to find, same as
+     *                              a property name
+     * @param propertiesFilename The filename in the $java.home/lib directory
+     *                           of the properties file.  If none specified,
+     *                           ${java.home}/lib/xerces.properties will be used.
+     * @param fallbackClassName     Implementation class name, if nothing else
+     *                              is found.  Use null to mean no fallback.
+     *
+     * @exception ObjectFactory.ConfigurationError
+     */
+    static Object createObject(String factoryId,
+                               String propertiesFilename,
+                               String fallbackClassName)
+        throws ConfigurationError
+    {
+        if (DEBUG) debugPrintln("debug is on");
+
+        SecuritySupport ss = SecuritySupport.getInstance();
+        ClassLoader cl = findClassLoader();
+
+        // Use the system property first
+        try {
+            String systemProp = ss.getSystemProperty(factoryId);
+            if (systemProp != null) {
+                if (DEBUG) debugPrintln("found system property, value=" + systemProp);
+                return newInstance(systemProp, cl, true);
+            }
+        } catch (SecurityException se) {
+            // Ignore and continue w/ next location
+        }
+
+        // Try to read from propertiesFilename, or $java.home/lib/xerces.properties
+        String factoryClassName = null;
+        // no properties file name specified; use $JAVA_HOME/lib/xerces.properties:
+        if (propertiesFilename == null) {
+            File propertiesFile = null;
+            boolean propertiesFileExists = false;
+            try {
+                String javah = ss.getSystemProperty("java.home");
+                propertiesFilename = javah + File.separator +
+                    "lib" + File.separator + DEFAULT_PROPERTIES_FILENAME;
+                propertiesFile = new File(propertiesFilename);
+                propertiesFileExists = ss.getFileExists(propertiesFile);
+            } catch (SecurityException e) {
+                // try again...
+                fLastModified = -1;
+                fXercesProperties = null;
+            }
+
+            synchronized (ObjectFactory.class) {
+                boolean loadProperties = false;
+                FileInputStream fis = null;
+                try {
+                    // file existed last time
+                    if(fLastModified >= 0) {
+                        if(propertiesFileExists &&
+                                (fLastModified < (fLastModified = ss.getLastModified(propertiesFile)))) {
+                            loadProperties = true;
+                        } else {
+                            // file has stopped existing...
+                            if(!propertiesFileExists) {
+                                fLastModified = -1;
+                                fXercesProperties = null;
+                            } // else, file wasn't modified!
+                        }
+                    } else {
+                        // file has started to exist:
+                        if(propertiesFileExists) {
+                            loadProperties = true;
+                            fLastModified = ss.getLastModified(propertiesFile);
+                        } // else, nothing's changed
+                    }
+                    if(loadProperties) {
+                        // must never have attempted to read xerces.properties before (or it's outdated)
+                        fXercesProperties = new Properties();
+                        fis = ss.getFileInputStream(propertiesFile);
+                        fXercesProperties.load(fis);
+                    }
+                } catch (Exception x) {
+                    fXercesProperties = null;
+                    fLastModified = -1;
+                    // assert(x instanceof FileNotFoundException
+                    //        || x instanceof SecurityException)
+                    // In both cases, ignore and continue w/ next location
+                } finally {
+                    // release the file handle even if loading failed
+                    if (fis != null) {
+                        try {
+                            fis.close();
+                        } catch (IOException ignored) {
+                            // nothing useful can be done about a failed close
+                        }
+                    }
+                }
+            }
+            if(fXercesProperties != null) {
+                factoryClassName = fXercesProperties.getProperty(factoryId);
+            }
+        } else {
+            FileInputStream fis = null;
+            try {
+                fis = ss.getFileInputStream(new File(propertiesFilename));
+                Properties props = new Properties();
+                props.load(fis);
+                factoryClassName = props.getProperty(factoryId);
+            } catch (Exception x) {
+                // assert(x instanceof FileNotFoundException
+                //        || x instanceof SecurityException)
+                // In both cases, ignore and continue w/ next location
+            } finally {
+                // release the file handle even if loading failed
+                if (fis != null) {
+                    try {
+                        fis.close();
+                    } catch (IOException ignored) {
+                        // nothing useful can be done about a failed close
+                    }
+                }
+            }
+        }
+        if (factoryClassName != null) {
+            if (DEBUG) debugPrintln("found in " + propertiesFilename + ", value=" + factoryClassName);
+            return newInstance(factoryClassName, cl, true);
+        }
+
+        // Try Jar Service Provider Mechanism
+        Object provider = findJarServiceProvider(factoryId);
+        if (provider != null) {
+            return provider;
+        }
+
+        if (fallbackClassName == null) {
+            throw new ConfigurationError(
+                "Provider for " + factoryId + " cannot be found", null);
+        }
+
+        if (DEBUG) debugPrintln("using fallback, value=" + fallbackClassName);
+        return newInstance(fallbackClassName, cl, true);
+    } // createObject(String,String,String):Object
+
+ //
+ // Private static methods
+ //
+
+    /**
+     * Writes a diagnostic message, prefixed with "JAXP: ", to standard
+     * error. Does nothing unless the debug flag is set.
+     *
+     * @param msg The message to print.
+     */
+    private static void debugPrintln(final String msg) {
+        if (!DEBUG) {
+            return;
+        }
+        System.err.println("JAXP: " + msg);
+    } // debugPrintln(String)
+
+ /**
+ * Figure out which ClassLoader to use. For JDK 1.2 and later use
+ * the context ClassLoader.
+ * <p>
+ * The parent chain of the system ClassLoader is walked looking for the
+ * context ClassLoader. If the context ClassLoader is not part of that
+ * chain it is returned. Otherwise the chain is walked a second time
+ * looking for the ClassLoader that loaded this class: the system
+ * ClassLoader is returned if it is found there, the ClassLoader of
+ * this class if it is not.
+ *
+ * @return The ClassLoader to use for loading provider classes; the
+ * value is null whenever the selected ClassLoader is null.
+ */
+ static ClassLoader findClassLoader()
+ throws ConfigurationError
+ {
+ SecuritySupport ss = SecuritySupport.getInstance();
+
+ // Figure out which ClassLoader to use for loading the provider
+ // class. If there is a Context ClassLoader then use it.
+ ClassLoader context = ss.getContextClassLoader();
+ ClassLoader system = ss.getSystemClassLoader();
+
+ // first walk: is the context ClassLoader the system ClassLoader or
+ // one of its ancestors (null included)?
+ ClassLoader chain = system;
+ while (true) {
+ if (context == chain) {
+ // Assert: we are on JDK 1.1 or we have no Context ClassLoader
+ // or any Context ClassLoader in chain of system classloader
+ // (including extension ClassLoader) so extend to widest
+ // ClassLoader (always look in system ClassLoader if Xerces
+ // is in boot/extension/system classpath and in current
+ // ClassLoader otherwise); normal classloaders delegate
+ // back to system ClassLoader first so this widening doesn't
+ // change the fact that context ClassLoader will be consulted
+ ClassLoader current = ObjectFactory.class.getClassLoader();
+
+ // second walk: same search, this time for the ClassLoader
+ // that loaded this class
+ chain = system;
+ while (true) {
+ if (current == chain) {
+ // Assert: Current ClassLoader in chain of
+ // boot/extension/system ClassLoaders
+ return system;
+ }
+ if (chain == null) {
+ break;
+ }
+ chain = ss.getParentClassLoader(chain);
+ }
+
+ // Assert: Current ClassLoader not in chain of
+ // boot/extension/system ClassLoaders
+ return current;
+ }
+
+ if (chain == null) {
+ // boot ClassLoader reached
+ break;
+ }
+
+ // Check for any extension ClassLoaders in chain up to
+ // boot ClassLoader
+ chain = ss.getParentClassLoader(chain);
+ };
+
+ // Assert: Context ClassLoader not in chain of
+ // boot/extension/system ClassLoaders
+ return context;
+ } // findClassLoader():ClassLoader
+
+ /**
+ * Create an instance of a class using the specified ClassLoader
+ */
+ static Object newInstance(String className, ClassLoader cl,
+ boolean doFallback)
+ throws ConfigurationError
+ {
+ // assert(className != null);
+ try{
+ Class providerClass = findProviderClass(className, cl, doFallback);
+ Object instance = providerClass.newInstance();
+ if (DEBUG) debugPrintln("created new instance of " + providerClass +
+ " using ClassLoader: " + cl);
+ return instance;
+ } catch (ClassNotFoundException x) {
+ throw new ConfigurationError(
+ "Provider " + className + " not found", x);
+ } catch (Exception x) {
+ throw new ConfigurationError(
+ "Provider " + className + " could not be instantiated: " + x,
+ x);
+ }
+ }
+
+    /**
+     * Finds a class by name. When a security manager is installed, access
+     * to the package of the class is checked first. The class is then
+     * loaded with the given ClassLoader or, when that is null, with
+     * <code>Class.forName</code>. If the given ClassLoader cannot find
+     * the class and fallback is allowed, the ClassLoader of this class is
+     * tried as well.
+     *
+     * @param className  The name of the class to load.
+     * @param cl         The ClassLoader to use; may be null.
+     * @param doFallback Whether to fall back to the ClassLoader of this
+     *                   class.
+     *
+     * @return The loaded class.
+     *
+     * @exception ClassNotFoundException If the class cannot be found.
+     * @exception SecurityException      If access to the package is denied.
+     */
+    static Class findProviderClass(String className, ClassLoader cl,
+                                   boolean doFallback)
+        throws ClassNotFoundException, ConfigurationError
+    {
+        // throw security exception if the calling thread is not allowed to
+        // access the package; restrict the access to the package as
+        // specified in the java.security policy
+        final SecurityManager security = System.getSecurityManager();
+        if (security != null) {
+            final int lastDot = className.lastIndexOf(".");
+            final String packageName =
+                lastDot != -1 ? className.substring(0, lastDot) : className;
+            security.checkPackageAccess(packageName);
+        }
+
+        if (cl == null) {
+            // XXX Use the bootstrap ClassLoader.  There is no way to
+            // load a class using the bootstrap ClassLoader that works
+            // in both JDK 1.1 and Java 2.  However, this should still
+            // work b/c the following should be true:
+            //
+            // (cl == null) iff current ClassLoader == null
+            //
+            // Thus Class.forName(String) will use the current
+            // ClassLoader which will be the bootstrap ClassLoader.
+            return Class.forName(className);
+        }
+
+        try {
+            return cl.loadClass(className);
+        }
+        catch (ClassNotFoundException notFound) {
+            if (!doFallback) {
+                throw notFound;
+            }
+            // Fall back to current classloader
+            final ClassLoader own = ObjectFactory.class.getClassLoader();
+            if (own == null) {
+                return Class.forName(className);
+            }
+            if (own == cl) {
+                // already tried with this ClassLoader
+                throw notFound;
+            }
+            return own.loadClass(className);
+        }
+    }
+
+    /**
+     * Try to find provider using Jar Service Provider Mechanism. The
+     * resource <code>META-INF/services/<i>factoryId</i></code> is looked
+     * up with the ClassLoader chosen by {@link #findClassLoader()} and
+     * then with the ClassLoader of this class; its first line is taken
+     * as the provider class name.
+     * <p>
+     * The resource is closed in all cases, including when reading it
+     * fails.
+     *
+     * @param factoryId Name of the factory to find.
+     *
+     * @return instance of provider class if found or null
+     *
+     * @exception ConfigurationError If the named provider cannot be
+     *                               loaded or instantiated.
+     */
+    private static Object findJarServiceProvider(String factoryId)
+        throws ConfigurationError
+    {
+        SecuritySupport ss = SecuritySupport.getInstance();
+        String serviceId = "META-INF/services/" + factoryId;
+        InputStream is = null;
+
+        // First try the Context ClassLoader
+        ClassLoader cl = findClassLoader();
+
+        is = ss.getResourceAsStream(cl, serviceId);
+
+        // If no provider found then try the current ClassLoader
+        if (is == null) {
+            ClassLoader current = ObjectFactory.class.getClassLoader();
+            if (cl != current) {
+                cl = current;
+                is = ss.getResourceAsStream(cl, serviceId);
+            }
+        }
+
+        if (is == null) {
+            // No provider found
+            return null;
+        }
+
+        if (DEBUG) debugPrintln("found jar resource=" + serviceId +
+               " using ClassLoader: " + cl);
+
+        // Read the service provider name in UTF-8 as specified in
+        // the jar spec.  Unfortunately this fails in Microsoft
+        // VJ++, which does not implement the UTF-8
+        // encoding. Theoretically, we should simply let it fail in
+        // that case, since the JVM is obviously broken if it
+        // doesn't support such a basic standard.  But since there
+        // are still some users attempting to use VJ++ for
+        // development, we have dropped in a fallback which makes a
+        // second attempt using the platform's default encoding. In
+        // VJ++ this is apparently ASCII, which is a subset of
+        // UTF-8... and since the strings we'll be reading here are
+        // also primarily limited to the 7-bit ASCII range (at
+        // least, in English versions), this should work well
+        // enough to keep us on the air until we're ready to
+        // officially decommit from VJ++. [Edited comment from
+        // jkesselm]
+        BufferedReader rd;
+        try {
+            rd = new BufferedReader(new InputStreamReader(is, "UTF-8"), DEFAULT_LINE_LENGTH);
+        } catch (java.io.UnsupportedEncodingException e) {
+            rd = new BufferedReader(new InputStreamReader(is), DEFAULT_LINE_LENGTH);
+        }
+
+        String factoryClassName = null;
+        try {
+            // XXX Does not handle all possible input as specified by the
+            // Jar Service Provider specification
+            factoryClassName = rd.readLine();
+        } catch (IOException x) {
+            // No provider found
+            return null;
+        } finally {
+            // closing the reader also closes the underlying resource
+            // stream; this has to happen even when reading failed
+            try {
+                rd.close();
+            } catch (IOException ignored) {
+                // nothing useful can be done about a failed close
+            }
+        }
+
+        if (factoryClassName != null &&
+            ! "".equals(factoryClassName)) {
+            if (DEBUG) debugPrintln("found in resource, value="
+                   + factoryClassName);
+
+            // Note: here we do not want to fall back to the current
+            // ClassLoader because we want to avoid the case where the
+            // resource file was found using one ClassLoader and the
+            // provider class was instantiated using a different one.
+            return newInstance(factoryClassName, cl, false);
+        }
+
+        // No provider found
+        return null;
+    }
+
+ //
+ // Classes
+ //
+
+    /**
+     * Error raised when a provider cannot be located or instantiated.
+     * It carries the exception that caused the failure, if there is one.
+     */
+    static class ConfigurationError
+        extends Error {
+
+        //
+        // Data
+        //
+
+        /** The underlying exception; null when there is none. */
+        private Exception exception;
+
+        //
+        // Constructors
+        //
+
+        /**
+         * Creates the error from a detail message and the exception
+         * that caused it.
+         *
+         * @param msg The detail message.
+         * @param x   The underlying exception; may be null.
+         */
+        ConfigurationError(String msg, Exception x) {
+            super(msg);
+            exception = x;
+        } // <init>(String,Exception)
+
+        //
+        // methods
+        //
+
+        /**
+         * Returns the exception handed to the constructor.
+         *
+         * @return The underlying exception, or null.
+         */
+        Exception getException() {
+            return this.exception;
+        } // getException():Exception
+
+    } // class ConfigurationError
+
+} // class ObjectFactory
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2002,2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.cyberneko.html;
+
+import java.io.*;
+
+/**
+ * This class is duplicated for each JAXP subpackage so keep it in sync.
+ * It is package private and therefore is not exposed as part of the JAXP
+ * API.
+ *
+ * Base class with security related methods that work on JDK 1.1. Every
+ * operation here is performed directly, without a privileged block.
+ */
+class SecuritySupport {
+
+    /*
+     * Declared as Object so that the verifier won't try to prove its
+     * type, thus possibly trying to load the SecuritySupport12 class.
+     */
+    private static final Object securitySupport;
+
+    static {
+        SecuritySupport support = null;
+        try {
+            // Probe for a class that only exists on 1.2 and later; the
+            // lookup throws (and we fall through to the catch) otherwise.
+            Class.forName("java.security.AccessController");
+            /*
+             * SecuritySupport12 is referenced directly rather than loaded
+             * through reflection because the class is package private,
+             * and it has to be package private so the APIs aren't exposed
+             * to other code that could use them to circumvent security.
+             * Thus, we accept the risk that the direct reference might
+             * fail on some JDK 1.1 JVMs, even though we would never
+             * execute this code in such a case.
+             */
+            support = new SecuritySupport12();
+        } catch (Exception ex) {
+            // ignore it; the JDK 1.1 implementation is installed below
+        } finally {
+            if (support == null) {
+                support = new SecuritySupport();
+            }
+            securitySupport = support;
+        }
+    }
+
+    /**
+     * Return an appropriate instance of this class, depending on whether
+     * we're on a JDK 1.1 or J2SE 1.2 (or later) system.
+     */
+    static SecuritySupport getInstance() {
+        return (SecuritySupport) securitySupport;
+    }
+
+    /** Always returns null in this base implementation. */
+    ClassLoader getContextClassLoader() {
+        return null;
+    }
+
+    /** Always returns null in this base implementation. */
+    ClassLoader getSystemClassLoader() {
+        return null;
+    }
+
+    /** Always returns null in this base implementation; cl is not consulted. */
+    ClassLoader getParentClassLoader(ClassLoader cl) {
+        return null;
+    }
+
+    /** Returns the value of the named system property via System.getProperty. */
+    String getSystemProperty(String propName) {
+        return System.getProperty(propName);
+    }
+
+    /**
+     * Opens the given file for reading.
+     *
+     * @throws FileNotFoundException as thrown by the FileInputStream constructor.
+     */
+    FileInputStream getFileInputStream(File file)
+        throws FileNotFoundException
+    {
+        return new FileInputStream(file);
+    }
+
+    /**
+     * Looks the named resource up through cl, or through the system
+     * resource lookup when cl is null.
+     */
+    InputStream getResourceAsStream(ClassLoader cl, String name) {
+        return (cl == null)
+            ? ClassLoader.getSystemResourceAsStream(name)
+            : cl.getResourceAsStream(name);
+    }
+
+    /** Returns the result of f.exists(). */
+    boolean getFileExists(File f) {
+        return f.exists();
+    }
+
+    /** Returns the result of f.lastModified(). */
+    long getLastModified(File f) {
+        return f.lastModified();
+    }
+}
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport12.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport12.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/SecuritySupport12.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2002,2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.cyberneko.html;
+
+import java.security.*;
+import java.io.*;
+
+/**
+ * This class is duplicated for each JAXP subpackage so keep it in sync.
+ * It is package private and therefore is not exposed as part of the JAXP
+ * API.
+ *
+ * Security related methods that only work on J2SE 1.2 and newer.
+ * Each method overrides its SecuritySupport counterpart and performs the
+ * operation inside AccessController.doPrivileged.
+ */
+class SecuritySupport12 extends SecuritySupport {
+
+ /**
+ * Returns the current thread's context class loader, or null if
+ * obtaining it raises a SecurityException.
+ */
+ ClassLoader getContextClassLoader() {
+ return (ClassLoader)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ ClassLoader cl = null;
+ try {
+ cl = Thread.currentThread().getContextClassLoader();
+ } catch (SecurityException ex) { }
+ return cl;
+ }
+ });
+ }
+
+ /**
+ * Returns the system class loader, or null if obtaining it raises a
+ * SecurityException.
+ */
+ ClassLoader getSystemClassLoader() {
+ return (ClassLoader)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ ClassLoader cl = null;
+ try {
+ cl = ClassLoader.getSystemClassLoader();
+ } catch (SecurityException ex) {}
+ return cl;
+ }
+ });
+ }
+
+ /**
+ * Returns the parent of the given class loader. The result is null
+ * when getParent() raises a SecurityException or when the loader
+ * reports itself as its own parent.
+ * NOTE(review): cl is dereferenced without a null check.
+ */
+ ClassLoader getParentClassLoader(final ClassLoader cl) {
+ return (ClassLoader)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ ClassLoader parent = null;
+ try {
+ parent = cl.getParent();
+ } catch (SecurityException ex) {}
+
+ // eliminate loops in case of the boot
+ // ClassLoader returning itself as a parent
+ return (parent == cl) ? null : parent;
+ }
+ });
+ }
+
+ /**
+ * Returns the value of the named system property, read inside a
+ * privileged block. A SecurityException is not caught here.
+ */
+ String getSystemProperty(final String propName) {
+ return (String)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ return System.getProperty(propName);
+ }
+ });
+ }
+
+ /**
+ * Opens the given file for reading inside a privileged block.
+ *
+ * @throws FileNotFoundException the exception raised by the
+ * FileInputStream constructor, unwrapped from the
+ * PrivilegedActionException.
+ */
+ FileInputStream getFileInputStream(final File file)
+ throws FileNotFoundException
+ {
+ try {
+ return (FileInputStream)
+ AccessController.doPrivileged(new PrivilegedExceptionAction() {
+ public Object run() throws FileNotFoundException {
+ return new FileInputStream(file);
+ }
+ });
+ } catch (PrivilegedActionException e) {
+ // run() only declares FileNotFoundException, hence the cast
+ throw (FileNotFoundException)e.getException();
+ }
+ }
+
+ /**
+ * Looks the named resource up through cl, or through the system
+ * resource lookup when cl is null, inside a privileged block.
+ */
+ InputStream getResourceAsStream(final ClassLoader cl,
+ final String name)
+ {
+ return (InputStream)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ InputStream ris;
+ if (cl == null) {
+ ris = ClassLoader.getSystemResourceAsStream(name);
+ } else {
+ ris = cl.getResourceAsStream(name);
+ }
+ return ris;
+ }
+ });
+ }
+
+ /** Returns the result of f.exists(), evaluated inside a privileged block. */
+ boolean getFileExists(final File f) {
+ return ((Boolean)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ return new Boolean(f.exists());
+ }
+ })).booleanValue();
+ }
+
+ /** Returns the result of f.lastModified(), evaluated inside a privileged block. */
+ long getLastModified(final File f) {
+ return ((Long)
+ AccessController.doPrivileged(new PrivilegedAction() {
+ public Object run() {
+ return new Long(f.lastModified());
+ }
+ })).longValue();
+ }
+
+}
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/DefaultFilter.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/DefaultFilter.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/DefaultFilter.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,437 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.filters;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+
+import org.cyberneko.html.HTMLComponent;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLDocumentHandler;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLComponentManager;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLDocumentSource;
+
+/**
+ * This class implements a filter that simply passes document
+ * events to the next handler. It can be used as a base class to
+ * simplify the development of new document filters.
+ * <p>
+ * Events are dropped silently while no document handler is set.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: DefaultFilter.java,v 1.7 2005/02/14 03:56:54 andyc Exp $
+ */
+public class DefaultFilter
+    implements XMLDocumentFilter, HTMLComponent {
+
+    //
+    // Data
+    //
+
+    /** Document handler. */
+    protected XMLDocumentHandler fDocumentHandler;
+
+    /** Document source. */
+    protected XMLDocumentSource fDocumentSource;
+
+    //
+    // XMLDocumentSource methods
+    //
+
+    /** Sets the document handler. */
+    public void setDocumentHandler(XMLDocumentHandler handler) {
+        fDocumentHandler = handler;
+    } // setDocumentHandler(XMLDocumentHandler)
+
+    // @since Xerces 2.1.0
+
+    /** Returns the document handler. */
+    public XMLDocumentHandler getDocumentHandler() {
+        return fDocumentHandler;
+    } // getDocumentHandler():XMLDocumentHandler
+
+    /** Sets the document source. */
+    public void setDocumentSource(XMLDocumentSource source) {
+        fDocumentSource = source;
+    } // setDocumentSource(XMLDocumentSource)
+
+    /** Returns the document source. */
+    public XMLDocumentSource getDocumentSource() {
+        return fDocumentSource;
+    } // getDocumentSource():XMLDocumentSource
+
+    //
+    // XMLDocumentHandler methods
+    //
+
+    // since Xerces-J 2.2.0
+
+    /**
+     * Start document. The event is delivered reflectively so that this
+     * filter works with old and new versions of the XNI document handler
+     * interface: the four-argument form is tried first and the
+     * three-argument form is used when the handler's class does not
+     * provide it.
+     *
+     * @throws XNIException Thrown by the handler, or when neither form
+     *                      of the method can be found or accessed.
+     */
+    public void startDocument(XMLLocator locator, String encoding,
+                              NamespaceContext nscontext, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            try {
+                // NOTE: Hack to allow the default filter to work with
+                //       old and new versions of the XNI document handler
+                //       interface. -Ac
+                Class[] types = {
+                    XMLLocator.class, String.class,
+                    NamespaceContext.class, Augmentations.class
+                };
+                Object[] params = {
+                    locator, encoding,
+                    nscontext, augs
+                };
+                invokeOnHandler("startDocument", types, params);
+            }
+            catch (IllegalAccessException e) {
+                throw new XNIException(e);
+            }
+            catch (NoSuchMethodException e) {
+                try {
+                    // fall back to the signature without NamespaceContext
+                    Class[] types = {
+                        XMLLocator.class, String.class, Augmentations.class
+                    };
+                    Object[] params = {
+                        locator, encoding, augs
+                    };
+                    invokeOnHandler("startDocument", types, params);
+                }
+                catch (NoSuchMethodException ex) {
+                    // NOTE: Should not happen!
+                    throw new XNIException(ex);
+                }
+                catch (IllegalAccessException ex) {
+                    // NOTE: Should not happen!
+                    throw new XNIException(ex);
+                }
+            }
+        }
+    } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+    // old methods
+
+    /** XML declaration. */
+    public void xmlDecl(String version, String encoding, String standalone, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.xmlDecl(version, encoding, standalone, augs);
+        }
+    } // xmlDecl(String,String,String,Augmentations)
+
+    /** Doctype declaration. */
+    public void doctypeDecl(String root, String publicId, String systemId, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.doctypeDecl(root, publicId, systemId, augs);
+        }
+    } // doctypeDecl(String,String,String,Augmentations)
+
+    /** Comment. */
+    public void comment(XMLString text, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.comment(text, augs);
+        }
+    } // comment(XMLString,Augmentations)
+
+    /** Processing instruction. */
+    public void processingInstruction(String target, XMLString data, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.processingInstruction(target, data, augs);
+        }
+    } // processingInstruction(String,XMLString,Augmentations)
+
+    /** Start element. */
+    public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.startElement(element, attributes, augs);
+        }
+    } // startElement(QName,XMLAttributes,Augmentations)
+
+    /** Empty element. */
+    public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.emptyElement(element, attributes, augs);
+        }
+    } // emptyElement(QName,XMLAttributes,Augmentations)
+
+    /** Characters. */
+    public void characters(XMLString text, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.characters(text, augs);
+        }
+    } // characters(XMLString,Augmentations)
+
+    /** Ignorable whitespace. */
+    public void ignorableWhitespace(XMLString text, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.ignorableWhitespace(text, augs);
+        }
+    } // ignorableWhitespace(XMLString,Augmentations)
+
+    /** Start general entity. */
+    public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.startGeneralEntity(name, id, encoding, augs);
+        }
+    } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
+
+    /** Text declaration. */
+    public void textDecl(String version, String encoding, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.textDecl(version, encoding, augs);
+        }
+    } // textDecl(String,String,Augmentations)
+
+    /** End general entity. */
+    public void endGeneralEntity(String name, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.endGeneralEntity(name, augs);
+        }
+    } // endGeneralEntity(String,Augmentations)
+
+    /** Start CDATA section. */
+    public void startCDATA(Augmentations augs) throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.startCDATA(augs);
+        }
+    } // startCDATA(Augmentations)
+
+    /** End CDATA section. */
+    public void endCDATA(Augmentations augs) throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.endCDATA(augs);
+        }
+    } // endCDATA(Augmentations)
+
+    /** End element. */
+    public void endElement(QName element, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.endElement(element, augs);
+        }
+    } // endElement(QName,Augmentations)
+
+    /** End document. */
+    public void endDocument(Augmentations augs) throws XNIException {
+        if (fDocumentHandler != null) {
+            fDocumentHandler.endDocument(augs);
+        }
+    } // endDocument(Augmentations)
+
+    // removed since Xerces-J 2.3.0
+
+    /**
+     * Start document. Forwards to the four-argument form with a null
+     * namespace context.
+     */
+    public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
+        throws XNIException {
+        startDocument(locator, encoding, null, augs);
+    } // startDocument(XMLLocator,String,Augmentations)
+
+    /**
+     * Start prefix mapping. The handler method is looked up reflectively;
+     * the event is dropped when the handler's class does not provide the
+     * method or the method cannot be accessed.
+     *
+     * @throws XNIException Thrown by the handler.
+     */
+    public void startPrefixMapping(String prefix, String uri, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            Class[] types = { String.class, String.class, Augmentations.class };
+            Object[] args = { prefix, uri, augs };
+            try {
+                invokeOnHandler("startPrefixMapping", types, args);
+            }
+            catch (NoSuchMethodException e) {
+                // ignore: the handler's class does not have this method
+            }
+            catch (IllegalAccessException e) {
+                // ignore
+            }
+        }
+    } // startPrefixMapping(String,String,Augmentations)
+
+    /**
+     * End prefix mapping. The handler method is looked up reflectively;
+     * the event is dropped when the handler's class does not provide the
+     * method or the method cannot be accessed.
+     *
+     * @throws XNIException Thrown by the handler.
+     */
+    public void endPrefixMapping(String prefix, Augmentations augs)
+        throws XNIException {
+        if (fDocumentHandler != null) {
+            Class[] types = { String.class, Augmentations.class };
+            Object[] args = { prefix, augs };
+            try {
+                invokeOnHandler("endPrefixMapping", types, args);
+            }
+            catch (NoSuchMethodException e) {
+                // ignore: the handler's class does not have this method
+            }
+            catch (IllegalAccessException e) {
+                // ignore
+            }
+        }
+    } // endPrefixMapping(String,Augmentations)
+
+    //
+    // HTMLComponent methods
+    //
+
+    /**
+     * Returns a list of feature identifiers that are recognized by
+     * this component. This method may return null if no features
+     * are recognized by this component.
+     */
+    public String[] getRecognizedFeatures() {
+        return null;
+    } // getRecognizedFeatures():String[]
+
+    /**
+     * Returns the default state for a feature, or null if this
+     * component does not want to report a default value for this
+     * feature.
+     */
+    public Boolean getFeatureDefault(String featureId) {
+        return null;
+    } // getFeatureDefault(String):Boolean
+
+    /**
+     * Returns a list of property identifiers that are recognized by
+     * this component. This method may return null if no properties
+     * are recognized by this component.
+     */
+    public String[] getRecognizedProperties() {
+        return null;
+    } // getRecognizedProperties():String[]
+
+    /**
+     * Returns the default state for a property, or null if this
+     * component does not want to report a default value for this
+     * property.
+     */
+    public Object getPropertyDefault(String propertyId) {
+        return null;
+    } // getPropertyDefault(String):Object
+
+    /**
+     * Resets the component. The component can query the component manager
+     * about any features and properties that affect the operation of the
+     * component. This implementation does nothing.
+     *
+     * @param componentManager The component manager.
+     *
+     * @throws XMLConfigurationException Thrown by component on
+     *                                   initialization error.
+     */
+    public void reset(XMLComponentManager componentManager)
+        throws XMLConfigurationException {
+    } // reset(XMLComponentManager)
+
+    /**
+     * Sets the state of a feature. This method is called by the component
+     * manager any time after reset when a feature changes state. This
+     * implementation does nothing.
+     * <p>
+     * <strong>Note:</strong> Components should silently ignore features
+     * that do not affect the operation of the component.
+     *
+     * @param featureId The feature identifier.
+     * @param state     The state of the feature.
+     *
+     * @throws XMLConfigurationException Thrown for configuration error.
+     *                                   In general, components should
+     *                                   only throw this exception if
+     *                                   it is <strong>really</strong>
+     *                                   a critical error.
+     */
+    public void setFeature(String featureId, boolean state)
+        throws XMLConfigurationException {
+    } // setFeature(String,boolean)
+
+    /**
+     * Sets the value of a property. This method is called by the component
+     * manager any time after reset when a property changes value. This
+     * implementation does nothing.
+     * <p>
+     * <strong>Note:</strong> Components should silently ignore properties
+     * that do not affect the operation of the component.
+     *
+     * @param propertyId The property identifier.
+     * @param value      The value of the property.
+     *
+     * @throws XMLConfigurationException Thrown for configuration error.
+     *                                   In general, components should
+     *                                   only throw this exception if
+     *                                   it is <strong>really</strong>
+     *                                   a critical error.
+     */
+    public void setProperty(String propertyId, Object value)
+        throws XMLConfigurationException {
+    } // setProperty(String,Object)
+
+    //
+    // Protected static methods
+    //
+
+    /**
+     * Utility method for merging string arrays for recognized features
+     * and recognized properties. When both arguments are the same
+     * reference, or one of them is null, an argument is returned as-is
+     * (no copy); otherwise a new array holding array1's entries followed
+     * by array2's entries is returned.
+     */
+    protected static String[] merge(String[] array1, String[] array2) {
+
+        // shortcut merge
+        if (array1 == array2) {
+            return array1;
+        }
+        if (array1 == null) {
+            return array2;
+        }
+        if (array2 == null) {
+            return array1;
+        }
+
+        // full merge
+        String[] array3 = new String[array1.length + array2.length];
+        System.arraycopy(array1, 0, array3, 0, array1.length);
+        System.arraycopy(array2, 0, array3, array1.length, array2.length);
+
+        return array3;
+
+    } // merge(String[],String[]):String[]
+
+    //
+    // Private methods
+    //
+
+    /**
+     * Looks up a public method on the document handler's class and
+     * invokes it on the handler. A failure raised by the handler itself
+     * is rethrown as-is when it is unchecked (this includes
+     * XNIException), so that callers see the same exception they would
+     * get from a direct call; a checked failure is wrapped in an
+     * XNIException.
+     *
+     * @param name  The name of the handler method.
+     * @param types The parameter types of the handler method.
+     * @param args  The arguments to pass.
+     *
+     * @throws NoSuchMethodException  The handler's class has no such
+     *                                public method.
+     * @throws IllegalAccessException The method cannot be accessed.
+     */
+    private void invokeOnHandler(String name, Class[] types, Object[] args)
+        throws NoSuchMethodException, IllegalAccessException {
+        Method method = fDocumentHandler.getClass().getMethod(name, types);
+        try {
+            method.invoke(fDocumentHandler, args);
+        }
+        catch (InvocationTargetException e) {
+            // surface the handler's own failure, not the reflection wrapper
+            Throwable target = e.getTargetException();
+            if (target instanceof RuntimeException) {
+                throw (RuntimeException)target;
+            }
+            if (target instanceof Error) {
+                throw (Error)target;
+            }
+            if (target instanceof Exception) {
+                throw new XNIException((Exception)target);
+            }
+            throw new XNIException(e);
+        }
+    } // invokeOnHandler(String,Class[],Object[])
+
+} // class DefaultFilter
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/ElementRemover.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/ElementRemover.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/ElementRemover.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,340 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.filters;
+
+import java.util.Hashtable;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+
+/**
+ * This class is a document filter capable of removing specified
+ * elements from the processing stream. There are two options for
+ * processing document elements:
+ * <ul>
+ * <li>specifying those elements which should be accepted and,
+ *     optionally, which attributes of that element should be
+ *     kept; and
+ * <li>specifying those elements whose tags and content should be
+ *     completely removed from the event stream.
+ * </ul>
+ * <p>
+ * The first option allows the application to specify which elements
+ * appearing in the event stream should be accepted and, therefore,
+ * passed on to the next stage in the pipeline. All elements
+ * <em>not</em> in the list of acceptable elements have their start
+ * and end tags stripped from the event stream <em>unless</em> those
+ * elements appear in the list of elements to be removed.
+ * <p>
+ * The second option allows the application to specify which elements
+ * should be completely removed from the event stream. When an element
+ * appears that is to be removed, the element's start and end tag as
+ * well as all of that element's content is removed from the event
+ * stream.
+ * <p>
+ * A common use of this filter would be to only allow rich-text
+ * and linking elements as well as the character content to pass
+ * through the filter &mdash; all other elements would be stripped.
+ * The following code shows how to configure this filter to perform
+ * this task:
+ * <pre>
+ *  ElementRemover remover = new ElementRemover();
+ *  remover.acceptElement("b", null);
+ *  remover.acceptElement("i", null);
+ *  remover.acceptElement("u", null);
+ *  remover.acceptElement("a", new String[] { "href" });
+ * </pre>
+ * <p>
+ * However, this would still allow the text content of other
+ * elements to pass through, which may not be desirable. In order
+ * to further "clean" the input, the <code>removeElement</code>
+ * option can be used. The following piece of code adds the ability
+ * to completely remove any &lt;SCRIPT&gt; tags and content
+ * from the stream.
+ * <pre>
+ *  remover.removeElement("script");
+ * </pre>
+ * <p>
+ * <strong>Note:</strong>
+ * All text and accepted element children of a stripped element are
+ * retained. To completely remove an element's content, use the
+ * <code>removeElement</code> method.
+ * <p>
+ * <strong>Note:</strong>
+ * Care should be taken when using this filter because the output
+ * may not be a well-balanced tree. Specifically, if the application
+ * removes the &lt;HTML&gt; element (with or without retaining its
+ * children), the resulting document event stream will no longer be
+ * well-formed.
+ * <p>
+ * Element and attribute names are compared case-insensitively.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: ElementRemover.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
+ */
+public class ElementRemover
+    extends DefaultFilter {
+
+    //
+    // Constants
+    //
+
+    /** A "null" object. */
+    protected static final Object NULL = new Object();
+
+    //
+    // Data
+    //
+
+    // information
+
+    /**
+     * Accepted elements: lower-cased element name mapped to either
+     * {@link #NULL} (keep no attributes) or a String[] of lower-cased
+     * attribute names to keep.
+     */
+    protected Hashtable fAcceptedElements = new Hashtable();
+
+    /** Removed elements: lower-cased element name mapped to {@link #NULL}. */
+    protected Hashtable fRemovedElements = new Hashtable();
+
+    // state
+
+    /** The element depth. */
+    protected int fElementDepth;
+
+    /**
+     * The element depth at element removal, or Integer.MAX_VALUE while
+     * no element is being removed.
+     */
+    protected int fRemovalElementDepth;
+
+    //
+    // Public methods
+    //
+
+    /**
+     * Specifies that the given element should be accepted and, optionally,
+     * which attributes of that element should be kept. The names are
+     * stored lower-cased; the caller's array is copied, not retained.
+     *
+     * @param element    The element to accept.
+     * @param attributes The list of attributes to be kept or null if no
+     *                   attributes should be kept for this element.
+     *
+     * @see #removeElement
+     */
+    public void acceptElement(String element, String[] attributes) {
+        Object key = lowerCase(element);
+        Object value = NULL;
+        if (attributes != null) {
+            String[] newarray = new String[attributes.length];
+            for (int i = 0; i < attributes.length; i++) {
+                newarray[i] = lowerCase(attributes[i]);
+            }
+            // store the lower-cased copy: handleOpenTag compares against
+            // lower-cased attribute names, so the caller's original
+            // (possibly mixed-case) array would never match
+            value = newarray;
+        }
+        fAcceptedElements.put(key, value);
+    } // acceptElement(String,String[])
+
+    /**
+     * Specifies that the given element should be completely removed. If an
+     * element is encountered during processing that is on the remove list,
+     * the element's start and end tags as well as all of content contained
+     * within the element will be removed from the processing stream.
+     *
+     * @param element The element to completely remove.
+     */
+    public void removeElement(String element) {
+        Object key = lowerCase(element);
+        Object value = NULL;
+        fRemovedElements.put(key, value);
+    } // removeElement(String)
+
+    //
+    // XMLDocumentHandler methods
+    //
+
+    // since Xerces-J 2.2.0
+
+    /** Start document. Resets the depth and removal state. */
+    public void startDocument(XMLLocator locator, String encoding,
+                              NamespaceContext nscontext, Augmentations augs)
+        throws XNIException {
+        fElementDepth = 0;
+        fRemovalElementDepth = Integer.MAX_VALUE;
+        super.startDocument(locator, encoding, nscontext, augs);
+    } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+    // old methods
+
+    /** Start document. */
+    public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
+        throws XNIException {
+        startDocument(locator, encoding, null, augs);
+    } // startDocument(XMLLocator,String,Augmentations)
+
+    /** Start prefix mapping. */
+    public void startPrefixMapping(String prefix, String uri, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.startPrefixMapping(prefix, uri, augs);
+        }
+    } // startPrefixMapping(String,String,Augmentations)
+
+    /** Start element. */
+    public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
+            super.startElement(element, attributes, augs);
+        }
+        fElementDepth++;
+    } // startElement(QName,XMLAttributes,Augmentations)
+
+    /**
+     * Empty element. An empty element has no content and is not followed
+     * by an endElement call, so it must never leave a removal region
+     * open behind it.
+     */
+    public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            int removalDepth = fRemovalElementDepth;
+            if (handleOpenTag(element, attributes)) {
+                super.emptyElement(element, attributes, augs);
+            }
+            // handleOpenTag records the current depth when the element is
+            // on the remove list; undo that, because only endElement
+            // clears it and none will arrive for an empty element
+            fRemovalElementDepth = removalDepth;
+        }
+    } // emptyElement(QName,XMLAttributes,Augmentations)
+
+    /** Comment. */
+    public void comment(XMLString text, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.comment(text, augs);
+        }
+    } // comment(XMLString,Augmentations)
+
+    /** Processing instruction. */
+    public void processingInstruction(String target, XMLString data, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.processingInstruction(target, data, augs);
+        }
+    } // processingInstruction(String,XMLString,Augmentations)
+
+    /** Characters. */
+    public void characters(XMLString text, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.characters(text, augs);
+        }
+    } // characters(XMLString,Augmentations)
+
+    /** Ignorable whitespace. */
+    public void ignorableWhitespace(XMLString text, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.ignorableWhitespace(text, augs);
+        }
+    } // ignorableWhitespace(XMLString,Augmentations)
+
+    /** Start general entity. */
+    public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.startGeneralEntity(name, id, encoding, augs);
+        }
+    } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
+
+    /** Text declaration. */
+    public void textDecl(String version, String encoding, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.textDecl(version, encoding, augs);
+        }
+    } // textDecl(String,String,Augmentations)
+
+    /** End general entity. */
+    public void endGeneralEntity(String name, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.endGeneralEntity(name, augs);
+        }
+    } // endGeneralEntity(String,Augmentations)
+
+    /** Start CDATA section. */
+    public void startCDATA(Augmentations augs) throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.startCDATA(augs);
+        }
+    } // startCDATA(Augmentations)
+
+    /** End CDATA section. */
+    public void endCDATA(Augmentations augs) throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.endCDATA(augs);
+        }
+    } // endCDATA(Augmentations)
+
+    /**
+     * End element. Closes the removal region when the element that
+     * opened it ends.
+     */
+    public void endElement(QName element, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth && elementAccepted(element.rawname)) {
+            super.endElement(element, augs);
+        }
+        fElementDepth--;
+        if (fElementDepth == fRemovalElementDepth) {
+            fRemovalElementDepth = Integer.MAX_VALUE;
+        }
+    } // endElement(QName,Augmentations)
+
+    /** End prefix mapping. */
+    public void endPrefixMapping(String prefix, Augmentations augs)
+        throws XNIException {
+        if (fElementDepth <= fRemovalElementDepth) {
+            super.endPrefixMapping(prefix, augs);
+        }
+    } // endPrefixMapping(String,Augmentations)
+
+    //
+    // Protected methods
+    //
+
+    /** Returns true if the specified element is accepted. */
+    protected boolean elementAccepted(String element) {
+        Object key = lowerCase(element);
+        return fAcceptedElements.containsKey(key);
+    } // elementAccepted(String):boolean
+
+    /** Returns true if the specified element should be removed. */
+    protected boolean elementRemoved(String element) {
+        Object key = lowerCase(element);
+        return fRemovedElements.containsKey(key);
+    } // elementRemoved(String):boolean
+
+    /**
+     * Handles an open tag. For an accepted element the attributes that
+     * are not on the element's keep-list are removed from
+     * <code>attributes</code> in place and true is returned. For an
+     * element on the remove list the current depth is recorded as the
+     * removal depth and false is returned. Any other element yields
+     * false without a state change.
+     */
+    protected boolean handleOpenTag(QName element, XMLAttributes attributes) {
+        if (elementAccepted(element.rawname)) {
+            Object key = lowerCase(element.rawname);
+            Object value = fAcceptedElements.get(key);
+            if (value != NULL) {
+                String[] anames = (String[])value;
+                int attributeCount = attributes.getLength();
+                LOOP: for (int i = 0; i < attributeCount; i++) {
+                    String aname = lowerCase(attributes.getQName(i));
+                    for (int j = 0; j < anames.length; j++) {
+                        if (anames[j].equals(aname)) {
+                            continue LOOP;
+                        }
+                    }
+                    // not on the keep-list: drop it and revisit this index,
+                    // which now holds the next attribute
+                    attributes.removeAttributeAt(i--);
+                    attributeCount--;
+                }
+            }
+            else {
+                attributes.removeAllAttributes();
+            }
+            return true;
+        }
+        else if (elementRemoved(element.rawname)) {
+            fRemovalElementDepth = fElementDepth;
+        }
+        return false;
+    } // handleOpenTag(QName,XMLAttributes):boolean
+
+    //
+    // Private methods
+    //
+
+    /**
+     * Lower-cases a name independently of the default locale, so that
+     * e.g. "SCRIPT" and "script" match under a Turkish locale as well.
+     */
+    private static String lowerCase(String name) {
+        return name.toLowerCase(java.util.Locale.ENGLISH);
+    } // lowerCase(String):String
+
+} // class ElementRemover
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Identity.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Identity.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Identity.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,99 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.filters;
+
+import org.cyberneko.html.HTMLConfiguration;
+import org.cyberneko.html.HTMLEventInfo;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParserConfiguration;
+
+/**
+ * This filter performs the identity operation of the original
+ * document event stream generated by the HTML scanner by removing
+ * events that are synthesized by the tag balancer. This operation
+ * is essentially the same as turning off tag-balancing in the
+ * parser. However, this filter is useful when you want the tag
+ * balancer to report "errors" but do not want the synthesized
+ * events in the output.
+ * <p>
+ * <strong>Note:</strong>
+ * This filter requires the augmentations feature to be turned on.
+ * For example:
+ * <pre>
+ * XMLParserConfiguration parser = new HTMLConfiguration();
+ * parser.setFeature("http://cyberneko.org/html/features/augmentations", true);
+ * </pre>
+ * <p>
+ * <strong>Note:</strong>
+ * This isn't <em>exactly</em> the identify transform because the
+ * element and attributes names may have been modified from the
+ * original document. For example, by default, NekoHTML converts
+ * element names to upper-case and attribute names to lower-case.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: Identity.java,v 1.4 2005/02/14 03:56:54 andyc Exp $
+ */
+public class Identity
+ extends DefaultFilter {
+
+ //
+ // Constants
+ //
+
+ /** Augmentations feature identifier. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Filters property identifier. */
+ protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ /** Start element. */
+ public void startElement(QName element, XMLAttributes attributes,
+ Augmentations augs) throws XNIException {
+ if (augs == null || !synthesized(augs)) {
+ super.startElement(element, attributes, augs);
+ }
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+ /** Empty element. */
+ public void emptyElement(QName element, XMLAttributes attributes,
+ Augmentations augs) throws XNIException {
+ if (augs == null || !synthesized(augs)) {
+ super.emptyElement(element, attributes, augs);
+ }
+ } // emptyElement(QName,XMLAttributes,Augmentations)
+
+ /** End element. */
+ public void endElement(QName element, Augmentations augs)
+ throws XNIException {
+ if (augs == null || !synthesized(augs)) {
+ super.endElement(element, augs);
+ }
+ } // endElement(QName,XMLAttributes,Augmentations)
+
+ //
+ // Protected static methods
+ //
+
+ /** Returns true if the information provided is synthesized. */
+ protected static boolean synthesized(Augmentations augs) {
+ HTMLEventInfo info = (HTMLEventInfo)augs.getItem(AUGMENTATIONS);
+ return info != null ? info.isSynthesized() : false;
+ } // synthesized(Augmentations):boolean
+
+} // class Identity
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/NamespaceBinder.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/NamespaceBinder.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/NamespaceBinder.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,693 @@
+/*
+ * (C) Copyright 2004-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.filters;
+
+import org.cyberneko.html.HTMLElements;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Enumeration;
+import java.util.Vector;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLComponentManager;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+
+/**
+ * This filter binds namespaces if namespace processing is turned on
+ * by setting the feature "http://xml.org/sax/features/namespaces"
+ * to <code>true</code>.
+ * <p>
+ * This configuration recognizes the following features:
+ * <ul>
+ * <li>http://xml.org/sax/features/namespaces
+ * </ul>
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: NamespaceBinder.java,v 1.8 2005/05/30 00:19:28 andyc Exp $
+ */
+public class NamespaceBinder
+ extends DefaultFilter {
+
+ //
+ // Constants
+ //
+
+ // namespace uris
+
+ /** XHTML 1.0 namespace URI (http://www.w3.org/1999/xhtml). */
+ public static final String XHTML_1_0_URI = "http://www.w3.org/1999/xhtml";
+
+ /** XML namespace URI (http://www.w3.org/XML/1998/namespace). */
+ public static final String XML_URI = "http://www.w3.org/XML/1998/namespace";
+
+ /** XMLNS namespace URI (http://www.w3.org/2000/xmlns/). */
+ public static final String XMLNS_URI = "http://www.w3.org/2000/xmlns/";
+
+ // features
+
+ /** Namespaces. */
+ protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
+
+ /** Override namespace binding URI. */
+ protected static final String OVERRIDE_NAMESPACES = "http://cyberneko.org/html/features/override-namespaces";
+
+ /** Insert namespace binding URIs. */
+ protected static final String INSERT_NAMESPACES = "http://cyberneko.org/html/features/insert-namespaces";
+
+ /** Recognized features. */
+ private static final String[] RECOGNIZED_FEATURES = {
+ NAMESPACES,
+ OVERRIDE_NAMESPACES,
+ INSERT_NAMESPACES,
+ };
+
+ /** Feature defaults. */
+ private static final Boolean[] FEATURE_DEFAULTS = {
+ null,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ };
+
+ // properties
+
+ /** Modify HTML element names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
+
+ /** Modify HTML attribute names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
+
+ /** Namespaces URI. */
+ protected static final String NAMESPACES_URI = "http://cyberneko.org/html/properties/namespaces-uri";
+
+ /** Recognized properties. */
+ private static final String[] RECOGNIZED_PROPERTIES = new String[] {
+ NAMES_ELEMS,
+ NAMES_ATTRS,
+ NAMESPACES_URI,
+ };
+
+ /** Property defaults. */
+ private static final Object[] PROPERTY_DEFAULTS = {
+ null,
+ null,
+ XHTML_1_0_URI,
+ };
+
+ // modify HTML names
+
+ /** Don't modify HTML names. */
+ protected static final short NAMES_NO_CHANGE = 0;
+
+ /** Uppercase HTML names. */
+ protected static final short NAMES_UPPERCASE = 1;
+
+ /** Lowercase HTML names. */
+ protected static final short NAMES_LOWERCASE = 2;
+
+ //
+ // Data
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected boolean fNamespaces;
+
+ /** Namespace prefixes. */
+ protected boolean fNamespacePrefixes;
+
+ /** Override namespaces. */
+ protected boolean fOverrideNamespaces;
+
+ /** Insert namespaces. */
+ protected boolean fInsertNamespaces;
+
+ // properties
+
+ /** Modify HTML element names. */
+ protected short fNamesElems;
+
+ /** Modify HTML attribute names. */
+ protected short fNamesAttrs;
+
+ /** Namespaces URI. */
+ protected String fNamespacesURI;
+
+ // state
+
+ /** Namespace context. */
+ protected final NamespaceSupport fNamespaceContext = new NamespaceSupport();
+
+ // temp vars
+
+ /** QName. */
+ private final QName fQName = new QName();
+
+ //
+ // HTMLComponent methods
+ //
+
+ /**
+ * Returns a list of feature identifiers that are recognized by
+ * this component. This method may return null if no features
+ * are recognized by this component.
+ */
+ public String[] getRecognizedFeatures() {
+ return merge(super.getRecognizedFeatures(), RECOGNIZED_FEATURES);
+ } // getRecognizedFeatures():String[]
+
+ /**
+ * Returns the default state for a feature, or null if this
+ * component does not want to report a default value for this
+ * feature.
+ */
+ public Boolean getFeatureDefault(String featureId) {
+ for (int i = 0; i < RECOGNIZED_FEATURES.length; i++) {
+ if (RECOGNIZED_FEATURES[i].equals(featureId)) {
+ return FEATURE_DEFAULTS[i];
+ }
+ }
+ return super.getFeatureDefault(featureId);
+ } // getFeatureDefault(String):Boolean
+
+ /**
+ * Returns a list of property identifiers that are recognized by
+ * this component. This method may return null if no properties
+ * are recognized by this component.
+ */
+ public String[] getRecognizedProperties() {
+ return merge(super.getRecognizedProperties(), RECOGNIZED_PROPERTIES);
+ } // getRecognizedProperties():String[]
+
+ /**
+ * Returns the default value for a property, or null if this
+ * component does not want to report a default value for this
+ * property.
+ */
+ public Object getPropertyDefault(String propertyId) {
+ for (int i = 0; i < RECOGNIZED_PROPERTIES.length; i++) {
+ if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
+ return PROPERTY_DEFAULTS[i];
+ }
+ }
+ return super.getPropertyDefault(propertyId);
+ } // getPropertyDefault(String):Object
+
+ /**
+  * Resets the component: lets the superclass reset itself, re-reads
+  * the namespace features and name-handling properties from the
+  * component manager, and resets the namespace context.
+  *
+  * @param manager The component manager.
+  *
+  * @throws XMLConfigurationException Thrown on initialization error.
+  */
+ public void reset(XMLComponentManager manager)
+     throws XMLConfigurationException {
+     super.reset(manager);
+
+     // features
+     fNamespaces = manager.getFeature(NAMESPACES);
+     fOverrideNamespaces = manager.getFeature(OVERRIDE_NAMESPACES);
+     fInsertNamespaces = manager.getFeature(INSERT_NAMESPACES);
+
+     // properties
+     // NOTE: String.valueOf turns an unset (null) property into the
+     //       text "null"; getNamesValue maps any unknown text to
+     //       NAMES_NO_CHANGE.
+     Object elemsMode = manager.getProperty(NAMES_ELEMS);
+     fNamesElems = getNamesValue(String.valueOf(elemsMode));
+     Object attrsMode = manager.getProperty(NAMES_ATTRS);
+     fNamesAttrs = getNamesValue(String.valueOf(attrsMode));
+     Object namespacesURI = manager.getProperty(NAMESPACES_URI);
+     fNamespacesURI = String.valueOf(namespacesURI);
+
+     // state
+     fNamespaceContext.reset();
+
+ } // reset(XMLComponentManager)
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ /**
+  * Start document. The namespace context supplied by the caller is
+  * ignored; the event is forwarded with this filter's own context.
+  */
+ public void startDocument(XMLLocator locator, String encoding,
+                           NamespaceContext nscontext, Augmentations augs)
+     throws XNIException {
+
+     // NOTE: deliberately passing fNamespaceContext, not nscontext
+     super.startDocument(locator, encoding, fNamespaceContext, augs);
+
+ } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+ /**
+  * Start element. When namespace processing is on, a new namespace
+  * context is pushed, the element and its attributes are bound, and
+  * every prefix declared on the element is reported to the document
+  * handler through a reflectively located
+  * <code>startPrefixMapping(String,String)</code> method, if the
+  * handler has one. The event is then forwarded.
+  */
+ public void startElement(QName element, XMLAttributes attrs,
+                          Augmentations augs) throws XNIException {
+
+     if (fNamespaces) {
+         // open scope and bind
+         fNamespaceContext.pushContext();
+         bindNamespaces(element, attrs);
+
+         // report the prefixes declared on this element
+         int declared = fNamespaceContext.getDeclaredPrefixCount();
+         if (fDocumentHandler != null && declared > 0) {
+             try {
+                 Class[] signature = { String.class, String.class };
+                 Method callback = fDocumentHandler.getClass()
+                     .getMethod("startPrefixMapping", signature);
+                 int index = 0;
+                 while (index < declared) {
+                     String prefix = fNamespaceContext.getDeclaredPrefixAt(index);
+                     Object[] args = { prefix, fNamespaceContext.getURI(prefix) };
+                     callback.invoke(fDocumentHandler, args);
+                     index++;
+                 }
+             }
+             catch (NoSuchMethodException e) {
+                 // handler has no such callback; ignore
+             }
+             catch (InvocationTargetException e) {
+                 // ignore
+             }
+             catch (IllegalAccessException e) {
+                 // ignore
+             }
+         }
+     }
+
+     // perform default handling
+     super.startElement(element, attrs, augs);
+
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+ /**
+  * Empty element. When namespace processing is on, this opens a
+  * namespace context, binds the element, reports the declared
+  * prefixes via a reflectively located
+  * <code>startPrefixMapping(String,String)</code>, forwards the event,
+  * reports the same prefixes in reverse order via
+  * <code>endPrefixMapping(String)</code>, and closes the context
+  * again. Handlers lacking either callback are silently skipped.
+  */
+ public void emptyElement(QName element, XMLAttributes attrs,
+                          Augmentations augs) throws XNIException {
+
+     if (fNamespaces) {
+         // open scope and bind
+         fNamespaceContext.pushContext();
+         bindNamespaces(element, attrs);
+
+         // report the prefixes declared on this element
+         int declared = fNamespaceContext.getDeclaredPrefixCount();
+         if (fDocumentHandler != null && declared > 0) {
+             try {
+                 Class[] signature = { String.class, String.class };
+                 Method callback = fDocumentHandler.getClass()
+                     .getMethod("startPrefixMapping", signature);
+                 int index = 0;
+                 while (index < declared) {
+                     String prefix = fNamespaceContext.getDeclaredPrefixAt(index);
+                     Object[] args = { prefix, fNamespaceContext.getURI(prefix) };
+                     callback.invoke(fDocumentHandler, args);
+                     index++;
+                 }
+             }
+             catch (NoSuchMethodException e) {
+                 // handler has no such callback; ignore
+             }
+             catch (InvocationTargetException e) {
+                 // ignore
+             }
+             catch (IllegalAccessException e) {
+                 // ignore
+             }
+         }
+     }
+
+     // perform default handling
+     super.emptyElement(element, attrs, augs);
+
+     if (fNamespaces) {
+         // end the prefix mappings, last declared first
+         int declared = fNamespaceContext.getDeclaredPrefixCount();
+         if (fDocumentHandler != null && declared > 0) {
+             try {
+                 Class[] signature = { String.class };
+                 Method callback = fDocumentHandler.getClass()
+                     .getMethod("endPrefixMapping", signature);
+                 int index = declared;
+                 while (--index >= 0) {
+                     Object[] args = { fNamespaceContext.getDeclaredPrefixAt(index) };
+                     callback.invoke(fDocumentHandler, args);
+                 }
+             }
+             catch (NoSuchMethodException e) {
+                 // handler has no such callback; ignore
+             }
+             catch (InvocationTargetException e) {
+                 // ignore
+             }
+             catch (IllegalAccessException e) {
+                 // ignore
+             }
+         }
+
+         // close scope
+         fNamespaceContext.popContext();
+     }
+
+ } // emptyElement(QName,XMLAttributes,Augmentations)
+
+ /**
+  * End element. When namespace processing is on, the element name is
+  * bound (no attributes are available here, so null is passed), the
+  * event is forwarded, the prefixes declared in the current context
+  * are reported in reverse order through a reflectively located
+  * <code>endPrefixMapping(String)</code>, and the context is popped.
+  */
+ public void endElement(QName element, Augmentations augs)
+     throws XNIException {
+
+     // bind the element name only
+     if (fNamespaces) {
+         bindNamespaces(element, null);
+     }
+
+     // perform default handling
+     super.endElement(element, augs);
+
+     if (fNamespaces) {
+         // end the prefix mappings, last declared first
+         int declared = fNamespaceContext.getDeclaredPrefixCount();
+         if (fDocumentHandler != null && declared > 0) {
+             try {
+                 Class[] signature = { String.class };
+                 Method callback = fDocumentHandler.getClass()
+                     .getMethod("endPrefixMapping", signature);
+                 int index = declared;
+                 while (--index >= 0) {
+                     Object[] args = { fNamespaceContext.getDeclaredPrefixAt(index) };
+                     callback.invoke(fDocumentHandler, args);
+                 }
+             }
+             catch (NoSuchMethodException e) {
+                 // handler has no such callback; ignore
+             }
+             catch (InvocationTargetException e) {
+                 // ignore
+             }
+             catch (IllegalAccessException e) {
+                 // ignore
+             }
+         }
+
+         // close scope
+         fNamespaceContext.popContext();
+     }
+
+ } // endElement(QName,Augmentations)
+
+ //
+ // Protected static methods
+ //
+
+ /**
+  * Splits a qualified name at its first colon, storing the text
+  * before it as the prefix and the text after it as the local part.
+  * A raw name without a colon leaves the object untouched.
+  *
+  * @param qname The name to split; modified in place.
+  */
+ protected static void splitQName(QName qname) {
+     String rawname = qname.rawname;
+     int colon = rawname.indexOf(':');
+     if (colon == -1) {
+         return;
+     }
+     qname.prefix = rawname.substring(0, colon);
+     qname.localpart = rawname.substring(colon + 1);
+ } // splitQName(QName)
+
+ /**
+  * Converts an HTML names mode string to its constant: "lower" and
+  * "upper" map to the corresponding case constants, any other text
+  * means the names are left unchanged.
+  *
+  * @see #NAMES_NO_CHANGE
+  * @see #NAMES_LOWERCASE
+  * @see #NAMES_UPPERCASE
+  */
+ protected static final short getNamesValue(String value) {
+     short mode = NAMES_NO_CHANGE;
+     if (value.equals("lower")) {
+         mode = NAMES_LOWERCASE;
+     }
+     else if (value.equals("upper")) {
+         mode = NAMES_UPPERCASE;
+     }
+     return mode;
+ } // getNamesValue(String):short
+
+ /**
+  * Re-cases a name according to the given mode; any mode other than
+  * the upper-case and lower-case constants returns the name as is.
+  *
+  * @param name The name to modify.
+  * @param mode One of the <code>NAMES_*</code> constants.
+  */
+ protected static final String modifyName(String name, short mode) {
+     if (mode == NAMES_UPPERCASE) {
+         return name.toUpperCase();
+     }
+     if (mode == NAMES_LOWERCASE) {
+         return name.toLowerCase();
+     }
+     return name;
+ } // modifyName(String,short):String
+
+ //
+ // Protected methods
+ //
+
+ /**
+  * Binds namespaces for an element and, when present, its attributes.
+  * <p>
+  * The steps are:
+  * <ol>
+  * <li>split the element name into prefix and local part;
+  * <li>scan the attributes for <code>xmlns</code> and
+  *     <code>xmlns:*</code> declarations (matched case-insensitively),
+  *     re-case their names and declare the prefixes in the current
+  *     namespace context;
+  * <li>resolve the element's namespace URI;
+  * <li>if namespace insertion is enabled and a known HTML element is
+  *     still unbound, add the missing declaration and start over;
+  * <li>resolve the namespace URI of every attribute.
+  * </ol>
+  *
+  * @param element The element name; modified in place.
+  * @param attrs   The element attributes, modified in place, or null
+  *                when none are available (the end-element path
+  *                passes null). A missing declaration can only be
+  *                inserted when an attribute list is supplied.
+  */
+ protected void bindNamespaces(QName element, XMLAttributes attrs) {
+
+     // split element qname
+     splitQName(element);
+
+     // declare namespace prefixes
+     int attrCount = attrs != null ? attrs.getLength() : 0;
+     for (int i = attrCount - 1; i >= 0; i--) {
+         attrs.getName(i, fQName);
+         String aname = fQName.rawname;
+         String ANAME = aname.toUpperCase();
+         if (ANAME.startsWith("XMLNS:") || ANAME.equals("XMLNS")) {
+             // anything longer than "xmlns" has the form "xmlns:<prefix>"
+             boolean prefixed = aname.length() > 5;
+
+             // get parts
+             String aprefix = prefixed ? aname.substring(0,5) : null;
+             String alocal = prefixed ? aname.substring(6) : aname;
+             String avalue = attrs.getValue(i);
+
+             // re-case parts and set them back into attributes
+             if (prefixed) {
+                 aprefix = modifyName(aprefix, NAMES_LOWERCASE);
+                 alocal = modifyName(alocal, fNamesElems);
+                 aname = aprefix + ':' + alocal;
+             }
+             else {
+                 alocal = modifyName(alocal, NAMES_LOWERCASE);
+                 aname = alocal;
+             }
+             fQName.setValues(aprefix, alocal, aname, null);
+             attrs.setName(i, fQName);
+
+             // declare prefix
+             // NOTE: The default namespace is declared with the empty
+             //       prefix. This is decided from the shape of the
+             //       name rather than by comparing string references.
+             String prefix = prefixed ? alocal : "";
+             // an empty (or missing) value undeclares the prefix
+             String uri = avalue != null && avalue.length() > 0 ? avalue : null;
+             if (fOverrideNamespaces &&
+                 prefix.equals(element.prefix) &&
+                 HTMLElements.getElement(element.localpart, null) != null) {
+                 uri = fNamespacesURI;
+             }
+             fNamespaceContext.declarePrefix(prefix, uri);
+         }
+     }
+
+     // bind element
+     String prefix = element.prefix != null ? element.prefix : "";
+     element.uri = fNamespaceContext.getURI(prefix);
+     // REVISIT: The prefix of a qualified element name that is
+     //          bound to a namespace is passed (as recent as
+     //          Xerces 2.4.0) as "" for start elements and null
+     //          for end elements. Why? One of them is a bug,
+     //          clearly. -Ac
+     if (element.uri != null && element.prefix == null) {
+         element.prefix = "";
+     }
+
+     // do we need to insert namespace bindings?
+     // NOTE: Insertion needs an attribute list to add the declaration
+     //       to. Without one (end elements) it is skipped instead of
+     //       failing with a NullPointerException.
+     if (fInsertNamespaces && attrs != null &&
+         HTMLElements.getElement(element.localpart,null) != null) {
+         if (element.prefix == null ||
+             fNamespaceContext.getURI(element.prefix) == null) {
+             String xmlns = "xmlns" + ((element.prefix != null)
+                          ? ":"+element.prefix : "");
+             fQName.setValues(null, xmlns, xmlns, null);
+             attrs.addAttribute(fQName, "CDATA", fNamespacesURI);
+             // start over so the new declaration is picked up
+             bindNamespaces(element, attrs);
+             return;
+         }
+     }
+
+     // bind attributes
+     attrCount = attrs != null ? attrs.getLength() : 0;
+     for (int i = 0; i < attrCount; i++) {
+         attrs.getName(i, fQName);
+         splitQName(fQName);
+         prefix = !fQName.rawname.equals("xmlns")
+                ? (fQName.prefix != null ? fQName.prefix : "") : "xmlns";
+         // PATCH: Joseph Walton
+         if (!prefix.equals("")) {
+             fQName.uri = prefix.equals("xml") ? XML_URI : fNamespaceContext.getURI(prefix);
+         }
+         // NOTE: You would think the xmlns namespace would be handled
+         //       by NamespaceSupport but it's not. -Ac
+         if (prefix.equals("xmlns") && fQName.uri == null) {
+             fQName.uri = XMLNS_URI;
+         }
+         attrs.setName(i, fQName);
+     }
+
+ } // bindNamespaces(QName,XMLAttributes)
+
+ //
+ // Classes
+ //
+
+ /**
+ * This namespace context object implements the old and new XNI
+ * <code>NamespaceContext</code> interface methods so that it can
+ * be used across all versions of Xerces2.
+ */
+ public static class NamespaceSupport
+ implements NamespaceContext {
+
+ //
+ // Data
+ //
+
+ /** Top of the levels list. */
+ protected int fTop = 0;
+
+ /** The levels of the entries. */
+ protected int[] fLevels = new int[10];
+
+ /** The entries. */
+ protected Entry[] fEntries = new Entry[10];
+
+ //
+ // Constructors
+ //
+
+ /**
+  * Creates the namespace support with an initial context in which
+  * the "xml" and "xmlns" prefixes are declared.
+  */
+ public NamespaceSupport() {
+     // open the root context
+     pushContext();
+     // pre-declare the built-in prefixes
+     declarePrefix("xml", NamespaceContext.XML_URI);
+     declarePrefix("xmlns", NamespaceContext.XMLNS_URI);
+ } // <init>()
+
+ //
+ // NamespaceContext methods
+ //
+
+ // since Xerces 2.0.0-beta2 (old XNI namespaces)
+
+ /** Get URI. */
+ public String getURI(String prefix) {
+ for (int i = fLevels[fTop]-1; i >= 0; i--) {
+ Entry entry = (Entry)fEntries[i];
+ if (entry.prefix.equals(prefix)) {
+ return entry.uri;
+ }
+ }
+ return null;
+ } // getURI(String):String
+
+ /** Get declared prefix count. */
+ public int getDeclaredPrefixCount() {
+ return fLevels[fTop] - fLevels[fTop-1];
+ } // getDeclaredPrefixCount():int
+
+ /** Get declared prefix at. */
+ public String getDeclaredPrefixAt(int index) {
+ return fEntries[fLevels[fTop-1] + index].prefix;
+ } // getDeclaredPrefixAt(int):String
+
+ /**
+  * Returns the parent context. This implementation keeps all levels
+  * in one object and therefore always returns itself.
+  */
+ public NamespaceContext getParentContext() {
+     return this;
+ } // getParentContext():NamespaceContext
+
+ // since Xerces #.#.# (new XNI namespaces)
+
+ /**
+  * Resets the support to a single context (level 1) that starts out
+  * with no declarations beyond those of level 0.
+  */
+ public void reset() {
+     fTop = 1;
+     fLevels[1] = fLevels[0];
+ } // reset()
+
+ /**
+  * Opens a new context. The levels array grows by ten slots whenever
+  * it is full; the new context starts where the previous one ends.
+  */
+ public void pushContext() {
+     fTop++;
+     if (fTop == fLevels.length) {
+         int[] grown = new int[fLevels.length + 10];
+         System.arraycopy(fLevels, 0, grown, 0, fLevels.length);
+         fLevels = grown;
+     }
+     fLevels[fTop] = fLevels[fTop-1];
+ } // pushContext()
+
+ /** Closes the current context by stepping back one level. */
+ public void popContext() {
+     fTop -= 1;
+ } // popContext()
+
+ /**
+  * Declares a prefix in the current context.
+  *
+  * @param prefix The prefix to declare.
+  * @param uri    The URI to bind to it; may be null.
+  *
+  * @return False if the prefix is already declared in the current
+  *         context (the existing binding is kept), true otherwise.
+  */
+ public boolean declarePrefix(String prefix, String uri) {
+     // refuse duplicates within the current context
+     int declared = getDeclaredPrefixCount();
+     int index = 0;
+     while (index < declared) {
+         if (getDeclaredPrefixAt(index).equals(prefix)) {
+             return false;
+         }
+         index++;
+     }
+
+     // grow the entries array by ten slots when it is full
+     Entry added = new Entry(prefix, uri);
+     if (fLevels[fTop] == fEntries.length) {
+         Entry[] grown = new Entry[fEntries.length + 10];
+         System.arraycopy(fEntries, 0, grown, 0, fEntries.length);
+         fEntries = grown;
+     }
+
+     // append and extend the current context by one entry
+     fEntries[fLevels[fTop]] = added;
+     fLevels[fTop]++;
+     return true;
+ } // declarePrefix(String,String):boolean
+
+ /**
+  * Returns a prefix bound to the given URI, searching from the most
+  * recent declaration backwards, or null if there is none.
+  * <p>
+  * Entries may carry a null URI (a prefix can be declared with a
+  * null URI); such entries never match and are skipped instead of
+  * causing a NullPointerException.
+  *
+  * @param uri The namespace URI to look up.
+  */
+ public String getPrefix(String uri) {
+     for (int i = fLevels[fTop]-1; i >= 0; i--) {
+         Entry entry = (Entry)fEntries[i];
+         if (entry.uri != null && entry.uri.equals(uri)) {
+             return entry.prefix;
+         }
+     }
+     return null;
+ } // getPrefix(String):String
+
+ /** Get all prefixes. */
+ public Enumeration getAllPrefixes() {
+ Vector prefixes = new Vector();
+ for (int i = fLevels[1]; i < fLevels[fTop]; i++) {
+ String prefix = fEntries[i].prefix;
+ if (!prefixes.contains(prefix)) {
+ prefixes.addElement(prefix);
+ }
+ }
+ return prefixes.elements();
+ } // getAllPrefixes():Enumeration
+
+ //
+ // Classes
+ //
+
+ /** A single namespace binding: a prefix and the URI bound to it. */
+ static class Entry {
+
+     //
+     // Data
+     //
+
+     /** The declared prefix. */
+     public String prefix;
+
+     /** The URI bound to the prefix; may be null. */
+     public String uri;
+
+     //
+     // Constructors
+     //
+
+     /** Constructs an entry binding the given prefix to the given URI. */
+     public Entry(String prefix, String uri) {
+         this.uri = uri;
+         this.prefix = prefix;
+     } // <init>(String,String)
+
+ } // class Entry
+
+ } // class NamespaceSupport
+
+} // class NamespaceBinder
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Purifier.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Purifier.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Purifier.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,470 @@
+/*
+ * (C) Copyright 2004-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.filters;
+
+import org.cyberneko.html.HTMLAugmentations;
+import org.cyberneko.html.HTMLEventInfo;
+
+import java.lang.reflect.Method;
+import java.lang.reflect.InvocationTargetException;
+
+import org.apache.xerces.util.XMLChar;
+import org.apache.xerces.util.XMLStringBuffer;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLComponentManager;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+
+/**
+ * This filter purifies the HTML input to ensure XML well-formedness.
+ * The purification process includes:
+ * <ul>
+ * <li>fixing illegal characters in the document, including
+ * <ul>
+ * <li>element and attribute names,
+ * <li>processing instruction target and data,
+ * <li>document text;
+ * </ul>
+ * <li>ensuring the string "--" does not appear in the content of
+ * a comment;
+ * <li>ensuring the string "]]>" does not appear in the content of
+ * a CDATA section;
+ * <li>ensuring that the XML declaration has required pseudo-attributes
+ * and that the values are correct;
+ * and
+ * <li>synthesizing missing namespace bindings.
+ * </ul>
+ * <p>
+ * Illegal characters in XML names are converted to the character
+ * sequence "_u####_" where "####" is the value of the Unicode
+ * character represented in hexadecimal. Illegal characters
+ * appearing in document content, in contrast, are converted to the
+ * character sequence "\\u####".
+ * <p>
+ * In comments, the character '-' is replaced by the character
+ * sequence "- " to prevent "--" from ever appearing in the comment
+ * content. For CDATA sections, the character ']' is replaced by
+ * the character sequence "] " to prevent "]]" from appearing.
+ * <p>
+ * The URI used for synthesized namespace bindings is
+ * "http://cyberneko.org/html/ns/synthesized/<i>number</i>" where
+ * <i>number</i> is generated to ensure uniqueness.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: Purifier.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
+ */
+public class Purifier
+ extends DefaultFilter {
+
+ //
+ // Constants
+ //
+
+ /** Synthesized namespace binding prefix. */
+ public static final String SYNTHESIZED_NAMESPACE_PREFX =
+ "http://cyberneko.org/html/ns/synthesized/";
+
+ /** Namespaces. */
+ protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
+
+ /** Include infoset augmentations. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Recognized features. */
+ private static final String[] RECOGNIZED_FEATURES = {
+ NAMESPACES,
+ AUGMENTATIONS,
+ };
+
+ /** Recognized features defaults. */
+ private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
+ null,
+ null,
+ };
+
+ // static vars
+
+ /** Synthesized event info item. */
+ protected static final HTMLEventInfo SYNTHESIZED_ITEM =
+ new HTMLEventInfo.SynthesizedItem();
+
+ //
+ // Data
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected boolean fNamespaces;
+
+ /** Augmentations. */
+ protected boolean fAugmentations;
+
+ // state
+
+ /** True if the doctype declaration was seen. */
+ protected boolean fSeenDoctype;
+
+ /** True if root element was seen. */
+ protected boolean fSeenRootElement;
+
+ /** True if inside a CDATA section. */
+ protected boolean fInCDATASection;
+
+ // doctype declaration info
+
+ /** Public identifier of doctype declaration. */
+ protected String fPublicId;
+
+ /** System identifier of doctype declaration. */
+ protected String fSystemId;
+
+ // namespace info
+
+ /** Namespace information. */
+ protected NamespaceContext fNamespaceContext;
+
+ /** Synthesized namespace binding count. */
+ protected int fSynthesizedNamespaceCount;
+
+ // temp vars
+
+ /** Qualified name. */
+ private QName fQName = new QName();
+
+ /** Augmentations. */
+ private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
+
+ /** String buffer. */
+ private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
+
+ //
+ // XMLComponent methods
+ //
+
+ public void reset(XMLComponentManager manager)
+ throws XMLConfigurationException {
+
+ // state
+ fInCDATASection = false;
+
+ // features
+ fNamespaces = manager.getFeature(NAMESPACES);
+ fAugmentations = manager.getFeature(AUGMENTATIONS);
+
+ } // reset(XMLComponentManager)
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ /**
+  * Start document (variant without a namespace context). A private
+  * namespace context is created when namespace processing is on;
+  * otherwise none is kept.
+  */
+ public void startDocument(XMLLocator locator, String encoding,
+                           Augmentations augs) throws XNIException {
+     if (fNamespaces) {
+         fNamespaceContext = new NamespaceBinder.NamespaceSupport();
+     }
+     else {
+         fNamespaceContext = null;
+     }
+     fSynthesizedNamespaceCount = 0;
+     handleStartDocument();
+     super.startDocument(locator, encoding, augs);
+ } // startDocument(XMLLocator,String,Augmentations)
+
+ /**
+  * Start document (variant with a namespace context). The supplied
+  * context is remembered for later prefix lookups and the count of
+  * synthesized namespace bindings restarts at zero.
+  */
+ public void startDocument(XMLLocator locator, String encoding,
+                           NamespaceContext nscontext, Augmentations augs)
+     throws XNIException {
+     // remember the caller's context
+     fNamespaceContext = nscontext;
+     // restart per-document state
+     fSynthesizedNamespaceCount = 0;
+     handleStartDocument();
+     // perform default handling
+     super.startDocument(locator, encoding, nscontext, augs);
+ } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+ /** XML declaration. */
+ public void xmlDecl(String version, String encoding, String standalone,
+ Augmentations augs) throws XNIException {
+ if (version == null || !version.equals("1.0")) {
+ version = "1.0";
+ }
+ if (encoding != null && encoding.length() == 0) {
+ encoding = null;
+ }
+ if (standalone != null) {
+ if (!standalone.equalsIgnoreCase("true") &&
+ !standalone.equalsIgnoreCase("false")) {
+ standalone = null;
+ }
+ else {
+ standalone = standalone.toLowerCase();
+ }
+ }
+ super.xmlDecl(version,encoding,standalone,augs);
+ } // xmlDecl(String,String,String,Augmentations)
+
+ /**
+  * Comment. The text is purified and a space is inserted after every
+  * '-' so that the sequence "--" cannot occur in the forwarded text.
+  */
+ public void comment(XMLString text, Augmentations augs)
+     throws XNIException {
+     // copy the purified text out before the shared buffer is reused
+     String purified = purifyText(text).toString();
+     int length = purified.length();
+     StringBuffer spaced = new StringBuffer(length);
+     for (int i = 0; i < length; i++) {
+         char c = purified.charAt(i);
+         spaced.append(c);
+         if (c == '-') {
+             spaced.append(' ');
+         }
+     }
+     fStringBuffer.length = 0;
+     fStringBuffer.append(spaced.toString());
+     text = fStringBuffer;
+     super.comment(text, augs);
+ } // comment(XMLString,Augmentations)
+
+ /** Processing instruction. */
+ public void processingInstruction(String target, XMLString data,
+ Augmentations augs)
+ throws XNIException {
+ target = purifyName(target, true);
+ data = purifyText(data);
+ super.processingInstruction(target, data, augs);
+ } // processingInstruction(String,XMLString,Augmentations)
+
+ /**
+  * Doctype declaration. The event is not forwarded here; only the
+  * identifiers are recorded, together with the fact that a doctype
+  * was seen.
+  */
+ public void doctypeDecl(String root, String pubid, String sysid,
+                         Augmentations augs) throws XNIException {
+     fSeenDoctype = true;
+     // NOTE: It doesn't matter what the root element name is because
+     //       it must match the root element. -Ac
+     fPublicId = pubid;
+     // NOTE: If the public identifier is specified, then a system
+     //       identifier must also be specified. -Ac
+     fSystemId = (pubid != null && sysid == null) ? "" : sysid;
+     // NOTE: Can't save the augmentations because the object state
+     //       is transient. -Ac
+ } // doctypeDecl(String,String,String,Augmentations)
+
+ /** Start element: purifies the element and attributes, then forwards the event. */
+ public void startElement(QName element, XMLAttributes attrs,
+                          Augmentations augs) throws XNIException {
+     // purify first
+     handleStartElement(element, attrs);
+     // perform default handling
+     super.startElement(element, attrs, augs);
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+ /** Empty element: purifies the element and attributes, then forwards the event. */
+ public void emptyElement(QName element, XMLAttributes attrs,
+                          Augmentations augs) throws XNIException {
+     // purify first
+     handleStartElement(element, attrs);
+     // perform default handling
+     super.emptyElement(element, attrs, augs);
+ } // emptyElement(QName,XMLAttributes,Augmentations)
+
+ /** Start CDATA section: raises the in-CDATA flag, then forwards the event. */
+ public void startCDATA(Augmentations augs) throws XNIException {
+     // character data is treated specially while this flag is set
+     fInCDATASection = true;
+     super.startCDATA(augs);
+ } // startCDATA(Augmentations)
+
+ /** End CDATA section: clears the in-CDATA flag, then forwards the event. */
+ public void endCDATA(Augmentations augs) throws XNIException {
+     // back to ordinary character data handling
+     fInCDATASection = false;
+     super.endCDATA(augs);
+ } // endCDATA(Augmentations)
+
+ /**
+  * Characters. The text is purified; inside a CDATA section a space
+  * is additionally inserted after every ']' so that the sequence
+  * "]]" cannot occur in the forwarded text.
+  */
+ public void characters(XMLString text, Augmentations augs)
+     throws XNIException {
+     text = purifyText(text);
+     if (fInCDATASection) {
+         // copy the purified text out before the shared buffer is reused
+         String purified = text.toString();
+         int length = purified.length();
+         StringBuffer spaced = new StringBuffer(length);
+         for (int i = 0; i < length; i++) {
+             char c = purified.charAt(i);
+             spaced.append(c);
+             if (c == ']') {
+                 spaced.append(' ');
+             }
+         }
+         fStringBuffer.length = 0;
+         fStringBuffer.append(spaced.toString());
+         text = fStringBuffer;
+     }
+     super.characters(text,augs);
+ } // characters(XMLString,Augmentations)
+
+ /** End element. */
+ public void endElement(QName element, Augmentations augs)
+ throws XNIException {
+ element = purifyQName(element);
+ if (fNamespaces) {
+ if (element.prefix != null && element.uri == null) {
+ element.uri = fNamespaceContext.getURI(element.prefix);
+ }
+ }
+ super.endElement(element, augs);
+ } // endElement(QName,Augmentations)
+
+ //
+ // Protected methods
+ //
+
+ /**
+ * Handle start document. Resets the per-document state flags that
+ * track whether a doctype declaration and the root element have been
+ * seen.
+ */
+ protected void handleStartDocument() {
+ fSeenDoctype = false;
+ fSeenRootElement = false;
+ } // handleStartDocument()
+
+ /**
+ * Handle start element. Shared by startElement and emptyElement:
+ * purifies the element name and every attribute name in place,
+ * synthesizes namespace bindings for prefixes without a URI when
+ * namespace processing is on, and forwards the remembered doctype
+ * declaration (using this element's raw name as the root name) when
+ * a doctype was seen and no element has been handled yet.
+ *
+ * @param element The element name; modified in place.
+ * @param attrs The attributes; names are rewritten in place and
+ * synthesized xmlns attributes may be added.
+ */
+ protected void handleStartElement(QName element, XMLAttributes attrs) {
+
+ // handle element and attributes
+ element = purifyQName(element);
+ // the count is captured once, so attributes added by
+ // synthesizeBinding inside the loop are not visited
+ int attrCount = attrs != null ? attrs.getLength() : 0;
+ for (int i = attrCount-1; i >= 0; i--) {
+ // purify attribute name
+ attrs.getName(i, fQName);
+ attrs.setName(i, purifyQName(fQName));
+
+ // synthesize namespace bindings
+ if (fNamespaces) {
+ if (!fQName.rawname.equals("xmlns") &&
+ !fQName.rawname.startsWith("xmlns:")) {
+ // NOTE: Must get attribute name again because the
+ // purifyQName method does not guarantee that
+ // the same QName object is returned. -Ac
+ attrs.getName(i, fQName);
+ if (fQName.prefix != null && fQName.uri == null) {
+ synthesizeBinding(attrs, fQName.prefix);
+ }
+ }
+ }
+ }
+
+ // synthesize namespace bindings
+ if (fNamespaces) {
+ if (element.prefix != null && element.uri == null) {
+ // NOTE(review): attrs is null-checked for the loop above but is
+ // passed on unchecked here; confirm callers never pass null
+ synthesizeBinding(attrs, element.prefix);
+ }
+ }
+
+ // synthesize doctype declaration
+ if (!fSeenRootElement && fSeenDoctype) {
+ Augmentations augs = synthesizedAugs();
+ super.doctypeDecl(element.rawname, fPublicId, fSystemId, augs);
+ }
+
+ // mark start element as seen
+ fSeenRootElement = true;
+
+ } // handleStartElement(QName,XMLAttributes)
+
+ /** Synthesize namespace binding. */
+ protected void synthesizeBinding(XMLAttributes attrs, String ns) {
+ String prefix = "xmlns";
+ String localpart = ns;
+ String qname = prefix+':'+localpart;
+ String uri = NamespaceBinder.NAMESPACES_URI;
+ String atype = "CDATA";
+ String avalue = SYNTHESIZED_NAMESPACE_PREFX+fSynthesizedNamespaceCount++;
+
+ // add attribute
+ fQName.setValues(prefix, localpart, qname, uri);
+ attrs.addAttribute(fQName, atype, avalue);
+
+ // bind namespace
+ fNamespaceContext.declarePrefix(ns, avalue);
+
+ } // synthesizeBinding(XMLAttributes,String)
+
+ /**
+ * Returns an augmentations object with a synthesized item added, or
+ * <code>null</code> when the augmentations feature is turned off.
+ * The shared fInfosetAugs instance is cleared and reused on each call.
+ */
+ protected final Augmentations synthesizedAugs() {
+     if (!fAugmentations) {
+         return null;
+     }
+     HTMLAugmentations marked = fInfosetAugs;
+     marked.removeAllItems();
+     marked.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
+     return marked;
+ } // synthesizedAugs():Augmentations
+
+ //
+ // Protected methods
+ //
+
+ /**
+ * Purify qualified name. Rewrites the prefix, local part and raw name
+ * of the given QName in place using purifyName and returns that same
+ * object. The prefix and local part are purified with the
+ * <code>localpart</code> flag set; the raw name is not.
+ */
+ protected QName purifyQName(QName qname) {
+ qname.prefix = purifyName(qname.prefix, true);
+ qname.localpart = purifyName(qname.localpart, true);
+ qname.rawname = purifyName(qname.rawname, false);
+ return qname;
+ } // purifyQName(QName):QName
+
+ /** Purify name. */
+ protected String purifyName(String name, boolean localpart) {
+ if (name == null) {
+ return name;
+ }
+ StringBuffer str = new StringBuffer();
+ int length = name.length();
+ boolean seenColon = localpart;
+ for (int i = 0; i < length; i++) {
+ char c = name.charAt(i);
+ if (i == 0) {
+ if (!XMLChar.isNameStart(c)) {
+ str.append("_u"+toHexString(c,4)+"_");
+ }
+ else {
+ str.append(c);
+ }
+ }
+ else {
+ if ((fNamespaces && c == ':' && seenColon) || !XMLChar.isName(c)) {
+ str.append("_u"+toHexString(c,4)+"_");
+ }
+ else {
+ str.append(c);
+ }
+ seenColon = seenColon || c == ':';
+ }
+ }
+ return str.toString();
+ } // purifyName(String):String
+
+ /** Purify content. */
+ protected XMLString purifyText(XMLString text) {
+ fStringBuffer.length = 0;
+ for (int i = 0; i < text.length; i++) {
+ char c = text.ch[text.offset+i];
+ if (XMLChar.isInvalid(c)) {
+ fStringBuffer.append("\\u"+toHexString(c,4));
+ }
+ else {
+ fStringBuffer.append(c);
+ }
+ }
+ return fStringBuffer;
+ } // purifyText(XMLString):XMLString
+
+ //
+ // Protected static methods
+ //
+
+ /**
+ * Returns a padded hexadecimal string for the given value.
+ *
+ * @param c The value to format.
+ * @param padlen The minimum number of digits; shorter results are
+ * left-padded with zeros, longer ones are not truncated.
+ * @return The upper-case hexadecimal representation.
+ */
+ protected static String toHexString(int c, int padlen) {
+     String digits = Integer.toHexString(c);
+     StringBuffer padded = new StringBuffer(padlen);
+     for (int missing = padlen - digits.length(); missing > 0; missing--) {
+         padded.append('0');
+     }
+     padded.append(digits);
+     return padded.toString().toUpperCase();
+ } // toHexString(int,int):String
+
+} // class Purifier
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Writer.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Writer.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/filters/Writer.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,474 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.filters;
+
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+
+import org.cyberneko.html.HTMLConfiguration;
+import org.cyberneko.html.HTMLElements;
+import org.cyberneko.html.HTMLEntities;
+import org.cyberneko.html.filters.DefaultFilter;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParserConfiguration;
+
+/**
+ * An HTML writer written as a filter. Besides serializing the HTML
+ * event stream, the writer also passes the document events to the next
+ * stage in the pipeline. This allows applications to insert writer
+ * filters between other custom filters for debugging purposes.
+ * <p>
+ * Since an HTML document may have specified its encoding using the
+ * &lt;META&gt; tag and http-equiv/content attributes, the writer will
+ * automatically change any character set specified in this tag to
+ * match the encoding of the output stream. Therefore, the character
+ * encoding name used to construct the writer should be an official
+ * <a href='http://www.iana.org/assignments/character-sets'>IANA</a>
+ * encoding name and not a Java encoding name.
+ * <p>
+ * <strong>Note:</strong>
+ * The modified character set in the &lt;META&gt; tag is <em>not</em>
+ * propagated to the next stage in the pipeline. The changed value is
+ * only output to the stream; the original value is sent to the next
+ * stage in the pipeline.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: Writer.java,v 1.7 2005/02/14 04:01:33 andyc Exp $
+ */
+public class Writer
+ extends DefaultFilter {
+
+ //
+ // Constants
+ //
+
+ /** Notify character entity references. */
+ public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
+
+ /** Notify built-in entity references. */
+ public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
+
+ /** Augmentations feature identifier. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Filters property identifier. */
+ protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
+
+ //
+ // Data
+ //
+
+ /** The encoding. */
+ protected String fEncoding;
+
+ /**
+ * The print writer used for serializing the document with the
+ * appropriate character encoding.
+ */
+ protected PrintWriter fPrinter;
+
+ // state
+
+ /** Seen root element. */
+ protected boolean fSeenRootElement;
+
+ /** Seen http-equiv directive. */
+ protected boolean fSeenHttpEquiv;
+
+ /** Element depth. */
+ protected int fElementDepth;
+
+ /** Normalize character content. */
+ protected boolean fNormalize;
+
+ /** Print characters. */
+ protected boolean fPrintChars;
+
+ //
+ // Constructors
+ //
+
+ /**
+ * Constructs a writer filter that prints to standard out using the
+ * UTF-8 encoding.
+ *
+ * @throws RuntimeException If the JVM does not support UTF-8; the
+ * original UnsupportedEncodingException is
+ * attached as the cause.
+ */
+ public Writer() {
+     // Note: UTF-8 should *always* be a supported encoding. Although,
+     //       I've heard of the old M$ JVM not supporting it! Amazing. -Ac
+     fEncoding = "UTF-8";
+     try {
+         fPrinter = new PrintWriter(new OutputStreamWriter(System.out, fEncoding));
+     }
+     catch (UnsupportedEncodingException e) {
+         // keep the cause so the original stack trace is not lost
+         throw new RuntimeException(e.getMessage(), e);
+     }
+ } // <init>()
+
+ /**
+ * Constructs a writer filter using the specified output stream and
+ * encoding. The stream is wrapped in an OutputStreamWriter for the
+ * given encoding and handed to the (java.io.Writer,String) constructor.
+ *
+ * @param outputStream The output stream to write to.
+ * @param encoding The encoding to be used for the output. The encoding name
+ * should be an official IANA encoding name.
+ *
+ * @throws UnsupportedEncodingException If OutputStreamWriter does not
+ * support the named encoding.
+ */
+ public Writer(OutputStream outputStream, String encoding)
+ throws UnsupportedEncodingException {
+ this(new OutputStreamWriter(outputStream, encoding), encoding);
+ } // <init>(OutputStream,String)
+
+ /**
+ * Constructs a writer filter using the specified Java writer and
+ * encoding. A writer that already is a PrintWriter is used directly;
+ * any other writer is wrapped in a new PrintWriter.
+ *
+ * @param writer The Java writer to write to.
+ * @param encoding The encoding to be used for the output. The encoding name
+ * should be an official IANA encoding name. It is only
+ * recorded here (and later written into the META
+ * content-type value); it does not reconfigure the writer.
+ */
+ public Writer(java.io.Writer writer, String encoding) {
+     fEncoding = encoding;
+     fPrinter = writer instanceof PrintWriter
+              ? (PrintWriter)writer
+              : new PrintWriter(writer);
+ } // <init>(java.io.Writer,String)
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ // since Xerces-J 2.2.0
+
+ /**
+ * Start document. Resets all per-document serialization state (root
+ * element and http-equiv flags, element depth, normalization and
+ * character printing) and then forwards the event.
+ */
+ public void startDocument(XMLLocator locator, String encoding,
+ NamespaceContext nscontext, Augmentations augs)
+ throws XNIException {
+ fSeenRootElement = false;
+ fSeenHttpEquiv = false;
+ fElementDepth = 0;
+ fNormalize = true;
+ fPrintChars = true;
+ super.startDocument(locator, encoding, nscontext, augs);
+ } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+ // old methods
+
+ /**
+ * Start document (older signature without a namespace context).
+ * Delegates to the four-argument form with a null namespace context.
+ */
+ public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
+ throws XNIException {
+ startDocument(locator, encoding, null, augs);
+ } // startDocument(XMLLocator,String,Augmentations)
+
+ /**
+ * Comment. Serializes the comment and then passes the event on to the
+ * next stage in the pipeline.
+ * <p>
+ * A comment that appears before the root element is followed by a
+ * line break; a comment that appears after the root element has been
+ * closed is preceded by one.
+ *
+ * @param text The comment text; written without entity normalization.
+ * @param augs The augmentations, forwarded unchanged.
+ */
+ public void comment(XMLString text, Augmentations augs)
+     throws XNIException {
+     if (fSeenRootElement && fElementDepth <= 0) {
+         fPrinter.println();
+     }
+     fPrinter.print("<!--");
+     printCharacters(text, false);
+     fPrinter.print("-->");
+     if (!fSeenRootElement) {
+         fPrinter.println();
+     }
+     fPrinter.flush();
+     // forward the event like the other handlers of this filter do;
+     // without this call comments never reach the downstream handlers
+     super.comment(text, augs);
+ } // comment(XMLString,Augmentations)
+
+ /**
+ * Start element. Marks the root element as seen, increases the element
+ * depth, prints the start tag and forwards the event. Character
+ * normalization is switched off while inside an element that
+ * HTMLElements reports as special.
+ */
+ public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
+ throws XNIException {
+ fSeenRootElement = true;
+ fElementDepth++;
+ fNormalize = !HTMLElements.getElement(element.rawname).isSpecial();
+ printStartElement(element, attributes);
+ super.startElement(element, attributes, augs);
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+ /**
+ * Empty element. Prints only a start tag and forwards the event; the
+ * element depth and normalization state are left untouched.
+ */
+ public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
+ throws XNIException {
+ fSeenRootElement = true;
+ printStartElement(element, attributes);
+ super.emptyElement(element, attributes, augs);
+ } // emptyElement(QName,XMLAttributes,Augmentations)
+
+ /**
+ * Characters. Prints the text unless printing is suppressed, and always
+ * forwards the event. Printing is suppressed between
+ * startGeneralEntity and endGeneralEntity, where the entity reference
+ * itself has already been written.
+ */
+ public void characters(XMLString text, Augmentations augs)
+ throws XNIException {
+ if (fPrintChars) {
+ printCharacters(text, fNormalize);
+ }
+ super.characters(text, augs);
+ } // characters(XMLString,Augmentations)
+
+ /**
+ * End element. Decreases the element depth, re-enables character
+ * normalization, prints the end tag and forwards the event. The
+ * disabled section below would insert a META content-type element
+ * when HEAD ends without one.
+ */
+ public void endElement(QName element, Augmentations augs)
+ throws XNIException {
+ fElementDepth--;
+ fNormalize = true;
+ /***
+ // NOTE: Not sure if this is what should be done in the case where
+ // the encoding is not explicitly declared within the HEAD. So
+ // I'm leaving it commented out for now. -Ac
+ if (element.rawname.equalsIgnoreCase("head") && !fSeenHttpEquiv) {
+ boolean capitalize = Character.isUpperCase(element.rawname.charAt(0));
+ String ename = capitalize ? "META" : "meta";
+ QName qname = new QName(null, ename, ename, null);
+ XMLAttributes attrs = new XMLAttributesImpl();
+ QName aname = new QName(null, "http-equiv", "http-equiv", null);
+ attrs.addAttribute(aname, "CDATA", "Content-Type");
+ aname.setValues(null, "content", "content", null);
+ attrs.addAttribute(aname, "CDATA", "text/html; charset="+fEncoding);
+ super.emptyElement(qname, attrs, null);
+ }
+ /***/
+ printEndElement(element);
+ super.endElement(element, augs);
+ } // endElement(QName,Augmentations)
+
+ /**
+ * Start general entity. Suppresses printing of the entity's
+ * replacement text, prints the entity reference itself and forwards
+ * the event. A numeric character reference (<code>#NNN</code> or
+ * <code>#xHHH</code>) is replaced by its named HTML entity when
+ * HTMLEntities knows one for that code.
+ */
+ public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
+     throws XNIException {
+     fPrintChars = false;
+     if (name.startsWith("#")) {
+         final boolean isHex = name.startsWith("#x");
+         final String digits = name.substring(isHex ? 2 : 1);
+         try {
+             int code = Integer.parseInt(digits, isHex ? 16 : 10);
+             String symbolic = HTMLEntities.get(code);
+             if (symbolic != null) {
+                 name = symbolic;
+             }
+         }
+         catch (NumberFormatException e) {
+             // ignore: keep the reference exactly as it was written
+         }
+     }
+     printEntity(name);
+     // NOTE(review): the possibly replaced name is forwarded here, while
+     //               endGeneralEntity forwards whatever name it is given;
+     //               confirm downstream handlers tolerate the difference
+     super.startGeneralEntity(name, id, encoding, augs);
+ } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
+
+ /**
+ * End general entity. Re-enables printing of character content, which
+ * startGeneralEntity switched off, and forwards the event.
+ */
+ public void endGeneralEntity(String name, Augmentations augs)
+ throws XNIException {
+ fPrintChars = true;
+ super.endGeneralEntity(name, augs);
+ } // endGeneralEntity(String,Augmentations)
+
+ //
+ // Protected methods
+ //
+
+ /**
+ * Print attribute value. Writes the text unchanged except that every
+ * double quote is written as <code>&amp;quot;</code>, because the
+ * caller always wraps the value in double quotes.
+ *
+ * @param text The attribute value to print.
+ */
+ protected void printAttributeValue(String text) {
+     int length = text.length();
+     for (int j = 0; j < length; j++) {
+         char c = text.charAt(j);
+         if (c == '"') {
+             // a raw quote would terminate the attribute value early
+             fPrinter.print("&quot;");
+         }
+         else {
+             fPrinter.print(c);
+         }
+     }
+     fPrinter.flush();
+ } // printAttributeValue(String)
+
+ /** Print characters. */
+ protected void printCharacters(XMLString text, boolean normalize) {
+ if (normalize) {
+ for (int i = 0; i < text.length; i++) {
+ char c = text.ch[text.offset + i];
+ if (c != '\n') {
+ String entity = HTMLEntities.get(c);
+ if (entity != null) {
+ printEntity(entity);
+ }
+ else {
+ fPrinter.print(c);
+ }
+ }
+ else {
+ fPrinter.println();
+ }
+ }
+ }
+ else {
+ for (int i = 0; i < text.length; i++) {
+ char c = text.ch[text.offset + i];
+ fPrinter.print(c);
+ }
+ }
+ fPrinter.flush();
+ } // printCharacters(XMLString,boolean)
+
+ /**
+ * Print start element. Writes the start tag with all attributes.
+ * <p>
+ * For a META element whose http-equiv attribute is "content-type",
+ * the charset in the content attribute is replaced (or appended) so
+ * that the printed value names the output encoding. The change is
+ * only visible in the output: the original content value is put back
+ * before returning so the next pipeline stage sees it unchanged.
+ *
+ * @param element The element name.
+ * @param attributes The element attributes; may be null.
+ */
+ protected void printStartElement(QName element, XMLAttributes attributes) {
+
+     // modify META[@http-equiv='content-type']/@content value
+     int contentIndex = -1;
+     String originalContent = null;
+     if (attributes != null && element.rawname.toLowerCase().equals("meta")) {
+         String httpEquiv = null;
+         int length = attributes.getLength();
+         for (int i = 0; i < length; i++) {
+             String aname = attributes.getQName(i).toLowerCase();
+             if (aname.equals("http-equiv")) {
+                 httpEquiv = attributes.getValue(i);
+             }
+             else if (aname.equals("content")) {
+                 contentIndex = i;
+             }
+         }
+         if (httpEquiv != null && httpEquiv.toLowerCase().equals("content-type")) {
+             fSeenHttpEquiv = true;
+             String content = null;
+             if (contentIndex != -1) {
+                 originalContent = attributes.getValue(contentIndex);
+                 content = originalContent.toLowerCase();
+             }
+             if (content != null) {
+                 int charsetIndex = content.indexOf("charset=");
+                 if (charsetIndex != -1) {
+                     // keep everything up to and including "charset="
+                     content = content.substring(0, charsetIndex + 8);
+                 }
+                 else {
+                     content += ";charset=";
+                 }
+                 content += fEncoding;
+                 attributes.setValue(contentIndex, content);
+             }
+         }
+     }
+
+     // print element
+     fPrinter.print('<');
+     fPrinter.print(element.rawname);
+     int attrCount = attributes != null ? attributes.getLength() : 0;
+     for (int i = 0; i < attrCount; i++) {
+         String aname = attributes.getQName(i);
+         String avalue = attributes.getValue(i);
+         fPrinter.print(' ');
+         fPrinter.print(aname);
+         fPrinter.print("=\"");
+         printAttributeValue(avalue);
+         fPrinter.print('"');
+     }
+     fPrinter.print('>');
+     fPrinter.flush();
+
+     // return original META[@http-equiv]/@content value; originalContent
+     // is only set when the value was actually rewritten above, so other
+     // META elements must not have their content attribute touched
+     if (contentIndex != -1 && originalContent != null) {
+         attributes.setValue(contentIndex, originalContent);
+     }
+
+ } // printStartElement(QName,XMLAttributes)
+
+ /**
+ * Print end element. Writes <code>&lt;/rawname&gt;</code> for the given
+ * element and flushes the printer.
+ */
+ protected void printEndElement(QName element) {
+     fPrinter.print("</" + element.rawname + '>');
+     fPrinter.flush();
+ } // printEndElement(QName)
+
+ /**
+ * Print entity. Writes the entity reference <code>&amp;name;</code>
+ * and flushes the printer.
+ */
+ protected void printEntity(String name) {
+     fPrinter.print('&' + name + ';');
+     fPrinter.flush();
+ } // printEntity(String)
+
+ //
+ // MAIN
+ //
+
+ /** Main. */
+ public static void main(String[] argv) throws Exception {
+ if (argv.length == 0) {
+ printUsage();
+ System.exit(1);
+ }
+ XMLParserConfiguration parser = new HTMLConfiguration();
+ parser.setFeature(NOTIFY_CHAR_REFS, true);
+ parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
+ String iencoding = null;
+ String oencoding = "Windows-1252";
+ boolean identity = false;
+ boolean purify = false;
+ for (int i = 0; i < argv.length; i++) {
+ String arg = argv[i];
+ if (arg.equals("-ie")) {
+ iencoding = argv[++i];
+ continue;
+ }
+ if (arg.equals("-e") || arg.equals("-oe")) {
+ oencoding = argv[++i];
+ continue;
+ }
+ if (arg.equals("-i")) {
+ identity = true;
+ continue;
+ }
+ if (arg.equals("-p")) {
+ purify = true;
+ continue;
+ }
+ if (arg.equals("-h")) {
+ printUsage();
+ System.exit(1);
+ }
+ java.util.Vector filtersVector = new java.util.Vector(2);
+ if (identity) {
+ filtersVector.addElement(new Identity());
+ }
+ else if (purify) {
+ filtersVector.addElement(new Purifier());
+ }
+ filtersVector.addElement(new Writer(System.out, oencoding));
+ XMLDocumentFilter[] filters =
+ new XMLDocumentFilter[filtersVector.size()];
+ filtersVector.copyInto(filters);
+ parser.setProperty(FILTERS, filters);
+ XMLInputSource source = new XMLInputSource(null, arg, null);
+ source.setEncoding(iencoding);
+ parser.parse(source);
+ }
+ } // main(String[])
+
+ /** Print usage. Writes the command line synopsis, options and notes to standard error. */
+ private static void printUsage() {
+ System.err.println("usage: java "+Writer.class.getName()+" (options) file ...");
+ System.err.println();
+ System.err.println("options:");
+ System.err.println(" -ie name Specify IANA name of input encoding.");
+ System.err.println(" -oe name Specify IANA name of output encoding.");
+ System.err.println(" -i Perform identity transform.");
+ System.err.println(" -p Purify output to ensure XML well-formedness.");
+ System.err.println(" -h Display help screen.");
+ System.err.println();
+ System.err.println("notes:");
+ System.err.println(" The -i and -p options are mutually exclusive.");
+ System.err.println(" The -e option has been replaced with -oe.");
+ } // printUsage()
+
+} // class Writer
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMFragmentParser.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMFragmentParser.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMFragmentParser.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,577 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ * ==============================================================
+ * This file contains some code from Apache Xerces-J which is
+ * used in accordance with the Apache license. Please refer to
+ * the LICENSE_apache file for specific details.
+ */
+
+package org.cyberneko.html.parsers;
+
+import org.cyberneko.html.HTMLConfiguration;
+
+import org.apache.xerces.impl.Constants;
+import org.apache.xerces.util.ErrorHandlerWrapper;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLDocumentHandler;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+import org.apache.xerces.xni.parser.XMLDocumentSource;
+import org.apache.xerces.xni.parser.XMLErrorHandler;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParseException;
+import org.apache.xerces.xni.parser.XMLParserConfiguration;
+
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.w3c.dom.Attr;
+import org.w3c.dom.CDATASection;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.EntityReference;
+import org.w3c.dom.Node;
+import org.w3c.dom.ProcessingInstruction;
+import org.w3c.dom.Text;
+
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
+/**
+ * A DOM parser for HTML fragments.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: DOMFragmentParser.java,v 1.8 2005/02/14 03:56:54 andyc Exp $
+ */
+public class DOMFragmentParser
+ implements XMLDocumentHandler {
+
+ //
+ // Constants
+ //
+
+ // features
+
+ /** Document fragment balancing only. */
+ protected static final String DOCUMENT_FRAGMENT =
+ "http://cyberneko.org/html/features/document-fragment";
+
+ /** Recognized features. */
+ protected static final String[] RECOGNIZED_FEATURES = {
+ DOCUMENT_FRAGMENT,
+ };
+
+ // properties
+
+ /** Property identifier: error handler. */
+ protected static final String ERROR_HANDLER =
+ Constants.XERCES_PROPERTY_PREFIX + Constants.ERROR_HANDLER_PROPERTY;
+
+ /** Current element node. */
+ protected static final String CURRENT_ELEMENT_NODE =
+ Constants.XERCES_PROPERTY_PREFIX + Constants.CURRENT_ELEMENT_NODE_PROPERTY;
+
+ /** Recognized properties. */
+ protected static final String[] RECOGNIZED_PROPERTIES = {
+ ERROR_HANDLER,
+ CURRENT_ELEMENT_NODE,
+ };
+
+ //
+ // Data
+ //
+
+ /** Parser configuration. */
+ protected XMLParserConfiguration fParserConfiguration;
+
+ /** Document source. */
+ protected XMLDocumentSource fDocumentSource;
+
+ /** DOM document fragment. */
+ protected DocumentFragment fDocumentFragment;
+
+ /** Document. */
+ protected Document fDocument;
+
+ /** Current node. */
+ protected Node fCurrentNode;
+
+ /** True if within a CDATA section. */
+ protected boolean fInCDATASection;
+
+ //
+ // Constructors
+ //
+
+ /**
+ * Default constructor. Creates an HTMLConfiguration, registers this
+ * parser's recognized features and properties with it, turns on the
+ * document-fragment feature and installs this object as the document
+ * handler.
+ */
+ public DOMFragmentParser() {
+ fParserConfiguration = new HTMLConfiguration();
+ fParserConfiguration.addRecognizedFeatures(RECOGNIZED_FEATURES);
+ fParserConfiguration.addRecognizedProperties(RECOGNIZED_PROPERTIES);
+ fParserConfiguration.setFeature(DOCUMENT_FRAGMENT, true);
+ fParserConfiguration.setDocumentHandler(this);
+ } // <init>()
+
+ //
+ // Public methods
+ //
+
+ /**
+ * Parses a document fragment from the given system identifier.
+ * Wraps the identifier in an InputSource and delegates to
+ * parse(InputSource,DocumentFragment).
+ */
+ public void parse(String systemId, DocumentFragment fragment)
+ throws SAXException, IOException {
+ parse(new InputSource(systemId), fragment);
+ } // parse(String,DocumentFragment)
+
+ /**
+ * Parses a document fragment. The nodes built from the input are
+ * appended to the given fragment and are created with the fragment's
+ * owner document.
+ *
+ * @param source The SAX input source to read from.
+ * @param fragment The fragment that receives the parsed nodes.
+ *
+ * @throws SAXException A SAXParseException (without locator) if the
+ * underlying configuration reports an
+ * XMLParseException.
+ * @throws IOException If reading the input fails.
+ */
+ public void parse(InputSource source, DocumentFragment fragment)
+     throws SAXException, IOException {
+
+     fDocumentFragment = fragment;
+     fCurrentNode = fragment;
+     fDocument = fDocumentFragment.getOwnerDocument();
+
+     try {
+         // translate the SAX input source into its XNI counterpart
+         String pubid = source.getPublicId();
+         String sysid = source.getSystemId();
+         XMLInputSource xniSource = new XMLInputSource(pubid, sysid, sysid);
+         xniSource.setEncoding(source.getEncoding());
+         xniSource.setByteStream(source.getByteStream());
+         xniSource.setCharacterStream(source.getCharacterStream());
+
+         fParserConfiguration.parse(xniSource);
+     }
+     catch (XMLParseException e) {
+         Exception nested = e.getException();
+         if (nested == null) {
+             throw new SAXParseException(e.getMessage(), null);
+         }
+         throw new SAXParseException(e.getMessage(), null, nested);
+     }
+
+ } // parse(InputSource,DocumentFragment)
+
+ /**
+ * Allow an application to register an error event handler.
+ *
+ * <p>If the application does not register an error handler, all
+ * error events reported by the SAX parser will be silently
+ * ignored; however, normal processing may not continue. It is
+ * highly recommended that all SAX applications implement an
+ * error handler to avoid unexpected bugs.</p>
+ *
+ * <p>Applications may register a new or different handler in the
+ * middle of a parse, and the SAX parser must begin using the new
+ * handler immediately.</p>
+ *
+ * @param errorHandler The error handler.
+ * @exception java.lang.NullPointerException If the handler
+ * argument is null.
+ * @see #getErrorHandler
+ */
+ public void setErrorHandler(ErrorHandler errorHandler) {
+ // the SAX handler is wrapped before being handed to the configuration;
+ // getErrorHandler unwraps it again
+ fParserConfiguration.setErrorHandler(new ErrorHandlerWrapper(errorHandler));
+ } // setErrorHandler(ErrorHandler)
+
+ /**
+ * Return the current error handler.
+ *
+ * @return The SAX error handler wrapped by the configuration's
+ * current error handler, or null if none has been registered,
+ * the registered handler is not an ErrorHandlerWrapper, or
+ * the property cannot be queried.
+ * @see #setErrorHandler
+ */
+ public ErrorHandler getErrorHandler() {
+
+     try {
+         XMLErrorHandler registered =
+             (XMLErrorHandler)fParserConfiguration.getProperty(ERROR_HANDLER);
+         // instanceof is false for null, so no separate null test is needed
+         if (registered instanceof ErrorHandlerWrapper) {
+             return ((ErrorHandlerWrapper)registered).getErrorHandler();
+         }
+     }
+     catch (XMLConfigurationException e) {
+         // do nothing
+     }
+     return null;
+
+ } // getErrorHandler():ErrorHandler
+
+ /**
+ * Set the state of a feature on the underlying parser configuration.
+ * Configuration errors are translated into their SAX equivalents.
+ *
+ * @param featureId The unique identifier (URI) of the feature.
+ * @param state The requested state of the feature (true or false).
+ *
+ * @exception SAXNotRecognizedException If the
+ * requested feature is not known.
+ * @exception SAXNotSupportedException If the
+ * requested feature is known, but the requested
+ * state is not supported.
+ */
+ public void setFeature(String featureId, boolean state)
+     throws SAXNotRecognizedException, SAXNotSupportedException {
+
+     try {
+         fParserConfiguration.setFeature(featureId, state);
+     }
+     catch (XMLConfigurationException e) {
+         final String reason = e.getMessage();
+         if (e.getType() != XMLConfigurationException.NOT_RECOGNIZED) {
+             throw new SAXNotSupportedException(reason);
+         }
+         throw new SAXNotRecognizedException(reason);
+     }
+
+ } // setFeature(String,boolean)
+
+ /**
+ * Query the state of a feature on the underlying parser
+ * configuration. Configuration errors are translated into their SAX
+ * equivalents.
+ *
+ * @param featureId The unique identifier (URI) of the feature
+ * being queried.
+ * @return The current state of the feature.
+ * @exception org.xml.sax.SAXNotRecognizedException If the
+ * requested feature is not known.
+ * @exception SAXNotSupportedException If the
+ * requested feature is known but not supported.
+ */
+ public boolean getFeature(String featureId)
+     throws SAXNotRecognizedException, SAXNotSupportedException {
+
+     try {
+         return fParserConfiguration.getFeature(featureId);
+     }
+     catch (XMLConfigurationException e) {
+         final String reason = e.getMessage();
+         if (e.getType() != XMLConfigurationException.NOT_RECOGNIZED) {
+             throw new SAXNotSupportedException(reason);
+         }
+         throw new SAXNotRecognizedException(reason);
+     }
+
+ } // getFeature(String):boolean
+
+ /**
+ * Set the value of a property on the underlying parser
+ * configuration. Configuration errors are translated into their SAX
+ * equivalents.
+ *
+ * @param propertyId The unique identifier (URI) of the property
+ * being set.
+ * @param value The value to which the property is being set.
+ *
+ * @exception SAXNotRecognizedException If the
+ * requested property is not known.
+ * @exception SAXNotSupportedException If the
+ * requested property is known, but the requested
+ * value is not supported.
+ */
+ public void setProperty(String propertyId, Object value)
+     throws SAXNotRecognizedException, SAXNotSupportedException {
+
+     try {
+         fParserConfiguration.setProperty(propertyId, value);
+     }
+     catch (XMLConfigurationException e) {
+         final String reason = e.getMessage();
+         if (e.getType() != XMLConfigurationException.NOT_RECOGNIZED) {
+             throw new SAXNotSupportedException(reason);
+         }
+         throw new SAXNotRecognizedException(reason);
+     }
+
+ } // setProperty(String,Object)
+
+ /**
+ * Query the value of a property. The current-element-node property is
+ * answered by this parser itself: it yields the node currently being
+ * built if that node is an element, otherwise null. Every other
+ * property is looked up in the underlying parser configuration, with
+ * configuration errors translated into their SAX equivalents.
+ *
+ * @param propertyId The unique identifier (URI) of the property
+ * being queried.
+ * @return The current value of the property.
+ * @exception org.xml.sax.SAXNotRecognizedException If the
+ * requested property is not known.
+ * @exception SAXNotSupportedException If the
+ * requested property is known but not supported.
+ */
+ public Object getProperty(String propertyId)
+     throws SAXNotRecognizedException, SAXNotSupportedException {
+
+     if (propertyId.equals(CURRENT_ELEMENT_NODE)) {
+         boolean atElement = fCurrentNode != null
+             && fCurrentNode.getNodeType() == Node.ELEMENT_NODE;
+         return atElement ? fCurrentNode : null;
+     }
+
+     try {
+         return fParserConfiguration.getProperty(propertyId);
+     }
+     catch (XMLConfigurationException e) {
+         final String reason = e.getMessage();
+         if (e.getType() != XMLConfigurationException.NOT_RECOGNIZED) {
+             throw new SAXNotSupportedException(reason);
+         }
+         throw new SAXNotRecognizedException(reason);
+     }
+
+ } // getProperty(String):Object
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ /** Sets the document source. The source is only stored; see getDocumentSource. */
+ public void setDocumentSource(XMLDocumentSource source) {
+ fDocumentSource = source;
+ } // setDocumentSource(XMLDocumentSource)
+
+ /** Returns the document source last passed to setDocumentSource, or null if none was set. */
+ public XMLDocumentSource getDocumentSource() {
+ return fDocumentSource;
+ } // getDocumentSource():XMLDocumentSource
+
+ /**
+ * Start document (signature without a namespace context). Delegates
+ * to the four-argument form with a null namespace context.
+ */
+ public void startDocument(XMLLocator locator, String encoding,
+ Augmentations augs) throws XNIException {
+ startDocument(locator, encoding, null, augs);
+ } // startDocument(XMLLocator,String,Augmentations)
+
+ // since Xerces 2.2.0
+
+ /**
+ * Start document. Resets the CDATA-section flag; the locator,
+ * encoding, namespace context and augmentations are not used.
+ */
+ public void startDocument(XMLLocator locator, String encoding,
+ NamespaceContext nscontext,
+ Augmentations augs) throws XNIException {
+ fInCDATASection = false;
+ } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+ /** XML declaration. No-op: nothing is added to the fragment for this event. */
+ public void xmlDecl(String version, String encoding,
+ String standalone, Augmentations augs)
+ throws XNIException {
+ } // xmlDecl(String,String,String,Augmentations)
+
+ /** Document type declaration. No-op: nothing is added to the fragment for this event. */
+ public void doctypeDecl(String root, String pubid, String sysid,
+ Augmentations augs) throws XNIException {
+ } // doctypeDecl(String,String,String,Augmentations)
+
+ /**
+ * Processing instruction. Creates a processing instruction node with
+ * the given target and data and appends it to the current node.
+ */
+ public void processingInstruction(String target, XMLString data,
+                                   Augmentations augs)
+     throws XNIException {
+     String content = data.toString();
+     fCurrentNode.appendChild(
+         fDocument.createProcessingInstruction(target, content));
+ } // processingInstruction(String,XMLString,Augmentations)
+
+ /**
+ * Comment. Creates a comment node holding the given text and appends
+ * it to the current node.
+ */
+ public void comment(XMLString text, Augmentations augs)
+     throws XNIException {
+     String content = text.toString();
+     fCurrentNode.appendChild(fDocument.createComment(content));
+ } // comment(XMLString,Augmentations)
+
+ /** Start prefix mapping; a no-op in this parser. @deprecated Since Xerces 2.2.0. */
+ public void startPrefixMapping(String prefix, String uri,
+ Augmentations augs) throws XNIException {
+ } // startPrefixMapping(String,String,Augmentations)
+
+ /** End prefix mapping; a no-op in this parser. @deprecated Since Xerces 2.2.0. */
+ public void endPrefixMapping(String prefix, Augmentations augs)
+ throws XNIException {
+ } // endPrefixMapping(String,Augmentations)
+
+ /** Start element: creates a DOM element, copies the attributes onto it, and makes it the current node. */
+ public void startElement(QName element, XMLAttributes attrs,
+ Augmentations augs) throws XNIException {
+ final Element created = fDocument.createElement(element.rawname);
+ if (attrs != null) {
+ // copy every attribute, keyed by its qualified name
+ for (int index = 0, total = attrs.getLength(); index < total; index++) {
+ created.setAttribute(attrs.getQName(index), attrs.getValue(index));
+ }
+ }
+ fCurrentNode.appendChild(created);
+ fCurrentNode = created; // later events become children of this element
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+ /** Empty element: handled as a start element immediately followed by the matching end element. */
+ public void emptyElement(QName element, XMLAttributes attrs,
+ Augmentations augs) throws XNIException {
+ startElement(element, attrs, augs);
+ endElement(element, augs);
+ } // emptyElement(QName,XMLAttributes,Augmentations)
+
+ /** Characters. */
+ public void characters(XMLString text, Augmentations augs)
+ throws XNIException {
+
+ if (fInCDATASection) {
+ Node node = fCurrentNode.getLastChild();
+ if (node != null && node.getNodeType() == Node.CDATA_SECTION_NODE) {
+ CDATASection cdata = (CDATASection)node;
+ cdata.appendData(text.toString());
+ }
+ else {
+ CDATASection cdata = fDocument.createCDATASection(text.toString());
+ fCurrentNode.appendChild(cdata);
+ }
+ }
+ else {
+ Node node = fCurrentNode.getLastChild();
+ if (node != null && node.getNodeType() == Node.TEXT_NODE) {
+ Text textNode = (Text)node;
+ textNode.appendData(text.toString());
+ }
+ else {
+ Text textNode = fDocument.createTextNode(text.toString());
+ fCurrentNode.appendChild(textNode);
+ }
+ }
+
+ } // characters(XMLString,Augmentations)
+
+ /** Ignorable whitespace: handled exactly like ordinary character content. */
+ public void ignorableWhitespace(XMLString text, Augmentations augs)
+ throws XNIException {
+ characters(text, augs);
+ } // ignorableWhitespace(XMLString,Augmentations)
+
+ /** Start general entity: appends an entity reference node named {@code name} and makes it the current node. */
+ public void startGeneralEntity(String name, XMLResourceIdentifier id,
+ String encoding, Augmentations augs)
+ throws XNIException {
+ EntityReference entityRef = fDocument.createEntityReference(name);
+ fCurrentNode.appendChild(entityRef);
+ fCurrentNode = entityRef;
+ } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
+
+ /** Text declaration. No-op: nothing is added to the fragment. */
+ public void textDecl(String version, String encoding,
+ Augmentations augs) throws XNIException {
+ } // textDecl(String,String,Augmentations)
+
+ /** End general entity: moves the current node up to its parent, undoing the descent made by startGeneralEntity. */
+ public void endGeneralEntity(String name, Augmentations augs)
+ throws XNIException {
+ fCurrentNode = fCurrentNode.getParentNode();
+ } // endGeneralEntity(String,Augmentations)
+
+ /** Start CDATA section: from here on characters() stores content in CDATA section nodes. */
+ public void startCDATA(Augmentations augs) throws XNIException {
+ fInCDATASection = true;
+ } // startCDATA(Augmentations)
+
+ /** End CDATA section: from here on characters() stores content in text nodes again. */
+ public void endCDATA(Augmentations augs) throws XNIException {
+ fInCDATASection = false;
+ } // endCDATA(Augmentations)
+
+ /** End element: moves the current node up to its parent; {@code element} itself is not inspected. */
+ public void endElement(QName element, Augmentations augs)
+ throws XNIException {
+ fCurrentNode = fCurrentNode.getParentNode();
+ } // endElement(QName,Augmentations)
+
+ /** End document. No-op: nothing is done when the document ends. */
+ public void endDocument(Augmentations augs) throws XNIException {
+ } // endDocument(Augmentations)
+
+ //
+ // DEBUG
+ //
+
+ /***
+ public static void print(Node node) {
+ short type = node.getNodeType();
+ switch (type) {
+ case Node.ELEMENT_NODE: {
+ System.out.print('<');
+ System.out.print(node.getNodeName());
+ org.w3c.dom.NamedNodeMap attrs = node.getAttributes();
+ int attrCount = attrs != null ? attrs.getLength() : 0;
+ for (int i = 0; i < attrCount; i++) {
+ Node attr = attrs.item(i);
+ System.out.print(' ');
+ System.out.print(attr.getNodeName());
+ System.out.print("='");
+ System.out.print(attr.getNodeValue());
+ System.out.print('\'');
+ }
+ System.out.print('>');
+ break;
+ }
+ case Node.TEXT_NODE: {
+ System.out.print(node.getNodeValue());
+ break;
+ }
+ }
+ Node child = node.getFirstChild();
+ while (child != null) {
+ print(child);
+ child = child.getNextSibling();
+ }
+ if (type == Node.ELEMENT_NODE) {
+ System.out.print("</");
+ System.out.print(node.getNodeName());
+ System.out.print('>');
+ }
+ else if (type == Node.DOCUMENT_NODE || type == Node.DOCUMENT_FRAGMENT_NODE) {
+ System.out.println();
+ }
+ System.out.flush();
+ }
+
+ public static void main(String[] argv) throws Exception {
+ DOMFragmentParser parser = new DOMFragmentParser();
+ HTMLDocument document = new org.apache.html.dom.HTMLDocumentImpl();
+ for (int i = 0; i < argv.length; i++) {
+ String sysid = argv[i];
+ System.err.println("# "+sysid);
+ DocumentFragment fragment = document.createDocumentFragment();
+ parser.parse(sysid, fragment);
+ print(fragment);
+ }
+ }
+ /***/
+
+} // class DOMFragmentParser
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMParser.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMParser.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/DOMParser.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,111 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.parsers;
+
+import org.cyberneko.html.HTMLConfiguration;
+
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.XNIException;
+
+import org.w3c.dom.DOMException;
+
+/**
+ * A DOM parser for HTML documents.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: DOMParser.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
+ */
+public class DOMParser
+ /***/
+ extends org.apache.xerces.parsers.DOMParser {
+ /***
+ // NOTE: It would be better to extend from AbstractDOMParser but
+ // most users will find it easier if the API is just like the
+ // Xerces DOM parser. By extending directly from DOMParser,
+ // users can register SAX error handlers, entity resolvers,
+ // and the like. -Ac
+ extends org.apache.xerces.parsers.AbstractDOMParser {
+ /***/
+
+ //
+ // Constructors
+ //
+
+ /** Default constructor: wraps a new HTMLConfiguration and selects HTMLDocumentImpl as the DOM document class. */
+ public DOMParser() {
+ super(new HTMLConfiguration());
+ /*** extending DOMParser ***/
+ try {
+ setProperty("http://apache.org/xml/properties/dom/document-class-name",
+ "org.apache.html.dom.HTMLDocumentImpl");
+ }
+ catch (org.xml.sax.SAXNotRecognizedException e) {
+ throw new RuntimeException("http://apache.org/xml/properties/dom/document-class-name property not recognized");
+ }
+ catch (org.xml.sax.SAXNotSupportedException e) {
+ throw new RuntimeException("http://apache.org/xml/properties/dom/document-class-name property not supported");
+ }
+ /*** extending AbstractDOMParser ***
+ fConfiguration.setProperty("http://apache.org/xml/properties/dom/document-class-name",
+ "org.apache.html.dom.HTMLDocumentImpl");
+ /***/
+ } // <init>()
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ /** Doctype declaration: forwarded to the superclass only when the detected Xerces version is considered safe. */
+ public void doctypeDecl(String root, String pubid, String sysid,
+ Augmentations augs) throws XNIException {
+
+ // NOTE: Xerces HTML DOM implementation (up to and including
+ // 2.5.0) throws a hierarchy request error exception
+ // when a doctype node is appended to the tree. So,
+ // don't insert this node into the tree for those
+ // versions... -Ac
+
+ String VERSION = org.apache.xerces.impl.Version.fVersion;
+ boolean okay = true;
+ if (VERSION.startsWith("Xerces-J 2.")) {
+ okay = getParserSubVersion() > 5;
+ }
+ // REVISIT: As soon as XML4J is updated with the latest code
+ // from Xerces, then this needs to be updated to
+ // check XML4J's version. -Ac
+ else if (VERSION.startsWith("XML4J")) {
+ okay = false;
+ }
+
+ // if okay, insert doctype; otherwise, don't risk it
+ if (okay) {
+ super.doctypeDecl(root, pubid, sysid, augs);
+ }
+
+ } // doctypeDecl(String,String,String,Augmentations)
+
+ //
+ // Private static methods
+ //
+
+ /** Returns the parser's sub-version number. */
+ private static int getParserSubVersion() {
+ try {
+ String VERSION = org.apache.xerces.impl.Version.fVersion;
+ int index1 = VERSION.indexOf('.') + 1;
+ int index2 = VERSION.indexOf('.', index1);
+ if (index2 == -1) { index2 = VERSION.length(); }
+ return Integer.parseInt(VERSION.substring(index1, index2));
+ }
+ catch (Exception e) {
+ return -1;
+ }
+ } // getParserSubVersion():int
+
+} // class DOMParser
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/SAXParser.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/SAXParser.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/parsers/SAXParser.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,32 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package org.cyberneko.html.parsers;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+
+/**
+ * A SAX parser for HTML documents.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: SAXParser.java,v 1.4 2005/02/14 03:56:54 andyc Exp $
+ */
+public class SAXParser
+ extends AbstractSAXParser {
+
+ //
+ // Constructors
+ //
+
+ /** Default constructor: builds the SAX parser on top of a new {@code HTMLConfiguration}. */
+ public SAXParser() {
+ super(new HTMLConfiguration());
+ } // <init>()
+
+} // class SAXParser
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/ErrorMessages.properties
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/ErrorMessages.properties 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/ErrorMessages.properties 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,39 @@
+##
+# NekoHTML error messages.
+#
+# @author Andy Clark
+#
+# @version $Id: ErrorMessages.properties,v 1.4 2004/11/01 00:28:35 andyc Exp $
+
+# internal messages
+HTML0000=General internal error.
+
+# scanner messages
+HTML1000=No character encoding indicator at beginning of document.
+HTML1001=No Java character encoding mapping for IANA character encoding "{0}".
+HTML1002=Unsupported syntax starting with "<!". Skipping to '>'.
+HTML1003=Bare markup character '<' found.
+HTML1004=Bare ampersand found.
+HTML1005=Invalid character entity "{0}".
+HTML1006=Unknown general entity "{0}".
+HTML1007=Premature end of file encountered.
+HTML1008=Skipping processing instruction.
+HTML1009=Missing start element name.
+HTML1010=Unsupported character encoding "{0}". Ignoring charset directive.
+HTML1011=Missing attribute name.
+HTML1012=Missing end element name.
+HTML1013=Missing whitespace before attribute "{0}".
+HTML1014=Missing root element name in DOCTYPE.
+
+# tag balancer messages
+HTML2000=Empty document.
+HTML2001=Element <{0}> not closed properly.
+HTML2002=Missing parent chain. Inserting proper parent <{1}> for element <{0}>.
+HTML2004=Inserting proper parent element <{1}> for element <{0}>.
+HTML2005=Start element <{0}> automatically closes element <{1}>.
+HTML2006=Bare character content found. Inserting parent element <{0}>.
+HTML2007=End element <{0}> automatically closes element <{1}>.
+HTML2008=Re-opening unbalanced inline element <{0}>.
+HTML2009=Character content found within element <{0}>. Inserting proper parent element <{1}>.
+HTML2010=DOCTYPE declaration found inside document content.
+HTML2011=Multiple DOCTYPE declaration.
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLlat1.properties
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLlat1.properties 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLlat1.properties 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,101 @@
+##
+# This file was generated from the HTMLlat1.ent file.
+#
+# @version $Id: HTMLlat1.properties,v 1.1 2004/07/28 09:11:52 andyc Exp $
+
+nbsp=\u00a0
+iexcl=\u00a1
+cent=\u00a2
+pound=\u00a3
+curren=\u00a4
+yen=\u00a5
+brvbar=\u00a6
+sect=\u00a7
+uml=\u00a8
+copy=\u00a9
+ordf=\u00aa
+laquo=\u00ab
+not=\u00ac
+shy=\u00ad
+reg=\u00ae
+macr=\u00af
+deg=\u00b0
+plusmn=\u00b1
+sup2=\u00b2
+sup3=\u00b3
+acute=\u00b4
+micro=\u00b5
+para=\u00b6
+middot=\u00b7
+cedil=\u00b8
+sup1=\u00b9
+ordm=\u00ba
+raquo=\u00bb
+frac14=\u00bc
+frac12=\u00bd
+frac34=\u00be
+iquest=\u00bf
+Agrave=\u00c0
+Aacute=\u00c1
+Acirc=\u00c2
+Atilde=\u00c3
+Auml=\u00c4
+Aring=\u00c5
+AElig=\u00c6
+Ccedil=\u00c7
+Egrave=\u00c8
+Eacute=\u00c9
+Ecirc=\u00ca
+Euml=\u00cb
+Igrave=\u00cc
+Iacute=\u00cd
+Icirc=\u00ce
+Iuml=\u00cf
+ETH=\u00d0
+Ntilde=\u00d1
+Ograve=\u00d2
+Oacute=\u00d3
+Ocirc=\u00d4
+Otilde=\u00d5
+Ouml=\u00d6
+times=\u00d7
+Oslash=\u00d8
+Ugrave=\u00d9
+Uacute=\u00da
+Ucirc=\u00db
+Uuml=\u00dc
+Yacute=\u00dd
+THORN=\u00de
+szlig=\u00df
+agrave=\u00e0
+aacute=\u00e1
+acirc=\u00e2
+atilde=\u00e3
+auml=\u00e4
+aring=\u00e5
+aelig=\u00e6
+ccedil=\u00e7
+egrave=\u00e8
+eacute=\u00e9
+ecirc=\u00ea
+euml=\u00eb
+igrave=\u00ec
+iacute=\u00ed
+icirc=\u00ee
+iuml=\u00ef
+eth=\u00f0
+ntilde=\u00f1
+ograve=\u00f2
+oacute=\u00f3
+ocirc=\u00f4
+otilde=\u00f5
+ouml=\u00f6
+divide=\u00f7
+oslash=\u00f8
+ugrave=\u00f9
+uacute=\u00fa
+ucirc=\u00fb
+uuml=\u00fc
+yacute=\u00fd
+thorn=\u00fe
+yuml=\u00ff
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLspecial.properties
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLspecial.properties 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLspecial.properties 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,37 @@
+##
+# This file was generated from the HTMLspecial.ent file.
+#
+# @version $Id: HTMLspecial.properties,v 1.1 2004/07/28 09:11:52 andyc Exp $
+
+quot=\u0022
+amp=\u0026
+lt=\u003c
+gt=\u003e
+OElig=\u0152
+oelig=\u0153
+Scaron=\u0160
+scaron=\u0161
+Yuml=\u0178
+circ=\u02c6
+tilde=\u02dc
+ensp=\u2002
+emsp=\u2003
+thinsp=\u2009
+zwnj=\u200c
+zwj=\u200d
+lrm=\u200e
+rlm=\u200f
+ndash=\u2013
+mdash=\u2014
+lsquo=\u2018
+rsquo=\u2019
+sbquo=\u201a
+ldquo=\u201c
+rdquo=\u201d
+bdquo=\u201e
+dagger=\u2020
+Dagger=\u2021
+permil=\u2030
+lsaquo=\u2039
+rsaquo=\u203a
+euro=\u20ac
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLsymbol.properties
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLsymbol.properties 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/HTMLsymbol.properties 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,129 @@
+##
+# This file was generated from the HTMLsymbol.ent file.
+#
+# @version $Id: HTMLsymbol.properties,v 1.1 2004/07/28 09:11:52 andyc Exp $
+
+fnof=\u0192
+Alpha=\u0391
+Beta=\u0392
+Gamma=\u0393
+Delta=\u0394
+Epsilon=\u0395
+Zeta=\u0396
+Eta=\u0397
+Theta=\u0398
+Iota=\u0399
+Kappa=\u039a
+Lambda=\u039b
+Mu=\u039c
+Nu=\u039d
+Xi=\u039e
+Omicron=\u039f
+Pi=\u03a0
+Rho=\u03a1
+Sigma=\u03a3
+Tau=\u03a4
+Upsilon=\u03a5
+Phi=\u03a6
+Chi=\u03a7
+Psi=\u03a8
+Omega=\u03a9
+alpha=\u03b1
+beta=\u03b2
+gamma=\u03b3
+delta=\u03b4
+epsilon=\u03b5
+zeta=\u03b6
+eta=\u03b7
+theta=\u03b8
+iota=\u03b9
+kappa=\u03ba
+lambda=\u03bb
+mu=\u03bc
+nu=\u03bd
+xi=\u03be
+omicron=\u03bf
+pi=\u03c0
+rho=\u03c1
+sigmaf=\u03c2
+sigma=\u03c3
+tau=\u03c4
+upsilon=\u03c5
+phi=\u03c6
+chi=\u03c7
+psi=\u03c8
+omega=\u03c9
+thetasym=\u03d1
+upsih=\u03d2
+piv=\u03d6
+bull=\u2022
+hellip=\u2026
+prime=\u2032
+Prime=\u2033
+oline=\u203e
+frasl=\u2044
+weierp=\u2118
+image=\u2111
+real=\u211c
+trade=\u2122
+alefsym=\u2135
+larr=\u2190
+uarr=\u2191
+rarr=\u2192
+darr=\u2193
+harr=\u2194
+crarr=\u21b5
+lArr=\u21d0
+uArr=\u21d1
+rArr=\u21d2
+dArr=\u21d3
+hArr=\u21d4
+forall=\u2200
+part=\u2202
+exist=\u2203
+empty=\u2205
+nabla=\u2207
+isin=\u2208
+notin=\u2209
+ni=\u220b
+prod=\u220f
+sum=\u2211
+minus=\u2212
+lowast=\u2217
+radic=\u221a
+prop=\u221d
+infin=\u221e
+ang=\u2220
+and=\u2227
+or=\u2228
+cap=\u2229
+cup=\u222a
+int=\u222b
+there4=\u2234
+sim=\u223c
+cong=\u2245
+asymp=\u2248
+ne=\u2260
+equiv=\u2261
+le=\u2264
+ge=\u2265
+sub=\u2282
+sup=\u2283
+nsub=\u2284
+sube=\u2286
+supe=\u2287
+oplus=\u2295
+otimes=\u2297
+perp=\u22a5
+sdot=\u22c5
+lceil=\u2308
+rceil=\u2309
+lfloor=\u230a
+rfloor=\u230b
+lang=\u2329
+rang=\u232a
+loz=\u25ca
+spades=\u2660
+clubs=\u2663
+hearts=\u2665
+diams=\u2666
Added: branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/XMLbuiltin.properties
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/XMLbuiltin.properties 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/org/cyberneko/html/res/XMLbuiltin.properties 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,7 @@
+##
+# This file exists because the list of HTML entities does not include
+# apostrophe ("apos") which should be recognized, IMHO. -Ac
+#
+# @version $Id: XMLbuiltin.properties,v 1.1 2004/07/28 09:11:52 andyc Exp $
+
+apos='
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/src/html/sample/HTMLSAXParser.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/sample/HTMLSAXParser.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/sample/HTMLSAXParser.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,33 @@
+/*
+ * (C) Copyright 2002-2004, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package sample;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+
+/**
+ * This sample shows how to extend a Xerces2 parser class, replacing
+ * the default parser configuration with the NekoHTML configuration.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: HTMLSAXParser.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
+ */
+public class HTMLSAXParser
+ extends AbstractSAXParser {
+
+ //
+ // Constructors
+ //
+
+ /** Default constructor: passes a new {@code HTMLConfiguration} to the Xerces parser in place of its default. */
+ public HTMLSAXParser() {
+ super(new HTMLConfiguration());
+ } // <init>()
+
+} // class HTMLSAXParser
Added: branches/nekohtml/upstream/0.9.5/src/html/sample/RemoveElements.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/sample/RemoveElements.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/sample/RemoveElements.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,69 @@
+/*
+ * (C) Copyright 2002-2004, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package sample;
+
+import org.cyberneko.html.HTMLConfiguration;
+import org.cyberneko.html.filters.ElementRemover;
+
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParserConfiguration;
+
+/**
+ * This is a sample that illustrates how to use the
+ * <code>ElementRemover</code> filter.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: RemoveElements.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
+ */
+public class RemoveElements {
+
+ //
+ // MAIN
+ //
+
+ /** Main: parses each file named in {@code argv} through an ElementRemover followed by a Writer filter. */
+ public static void main(String[] argv) throws Exception {
+
+ // create element remover filter
+ ElementRemover remover = new ElementRemover();
+
+ // set which elements to accept: b, i and u with a null attribute list, a with href
+ remover.acceptElement("b", null);
+ remover.acceptElement("i", null);
+ remover.acceptElement("u", null);
+ remover.acceptElement("a", new String[] { "href" });
+
+ // completely remove script elements
+ remover.removeElement("script");
+
+ // create writer filter
+ org.cyberneko.html.filters.Writer writer =
+ new org.cyberneko.html.filters.Writer();
+
+ // setup filter chain: the remover comes first, the writer last
+ XMLDocumentFilter[] filters = {
+ remover,
+ writer,
+ };
+
+ // create HTML parser and register the filter chain on it
+ XMLParserConfiguration parser = new HTMLConfiguration();
+ parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
+
+ // parse documents, treating each argument as a system identifier
+ for (int i = 0; i < argv.length; i++) {
+ String systemId = argv[i];
+ XMLInputSource source = new XMLInputSource(null, systemId, null);
+ parser.parse(source);
+ }
+
+ } // main(String[])
+
+} // class RemoveElements
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/src/html/sample/Script.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/sample/Script.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/sample/Script.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,227 @@
+/*
+ * (C) Copyright 2002-2004, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package sample;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringReader;
+import java.io.StringWriter;
+
+import org.cyberneko.html.HTMLConfiguration;
+import org.cyberneko.html.filters.DefaultFilter;
+import org.cyberneko.html.filters.Identity;
+import org.cyberneko.html.filters.Writer;
+
+import org.apache.xerces.util.XMLAttributesImpl;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLInputSource;
+
+/**
+ * This sample demonstrates how to use the <code>pushInputSource</code>
+ * method of the HTMLConfiguration in order to dynamically insert content
+ * into the HTML stream. The typical use for this functionality is to
+ * insert the result of an embedded script into the HTML document in place
+ * of the script.
+ * <p>
+ * This particular example defines a new script language called "NekoHTML"
+ * script that is a tiny subset of the NSGMLS format. The following table
+ * enumerates the NSGMLS features supported by this script language:
+ * <table border='1' cellspacing='0' cellpadding='3'>
+ * <tr><th>(<i>name</i><td>A start element with the specified <i>name</i>.
+ * <tr><th>"<i>text</i><td>Character content with the specified <i>text</i>.
+ * <tr><th>)<i>name</i><td>An end element with the specified <i>name</i>.
+ * </table>
+ * <p>
+ * In this format, every <i>command</i> is specified on a line by itself.
+ * For example, the following document:
+ * <pre>
+ * <script type='text/x-nekoscript'>
+ * (h1
+ * "Header
+ * )h1
+ * </script>
+ * </pre>
+ * is equivalent to the following HTML document:
+ * <pre>
+ * <H1>Header</H1>
+ * </pre>
+ * as seen by document handler registered with the parser, when processed
+ * by this filter.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: Script.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
+ */
+public class Script
+ extends DefaultFilter {
+
+ //
+ // Constants
+ //
+
+ /** Augmentations feature identifier. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Filters property identifier. */
+ protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
+
+ /** Script type ("text/x-nekoscript"). */
+ protected static final String SCRIPT_TYPE = "text/x-nekoscript";
+
+ //
+ // Data
+ //
+
+ /** The NekoHTML configuration. */
+ protected HTMLConfiguration fConfiguration;
+
+ /** A string buffer to collect the "script". */
+ protected StringBuffer fBuffer;
+
+ /** The system identifier of the source document. */
+ protected String fSystemId;
+
+ /** The script count. */
+ protected int fScriptCount;
+
+ //
+ // Constructors
+ //
+
+ /** Constructs a script filter; {@code config} is the configuration that receives the generated input sources. */
+ public Script(HTMLConfiguration config) {
+ fConfiguration = config;
+ } // <init>(HTMLConfiguration)
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ /** Start document: drops any script buffer, records the locator's literal system id, and resets the script count. */
+ public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
+ throws XNIException {
+ fBuffer = null;
+ fSystemId = locator != null ? locator.getLiteralSystemId() : null;
+ fScriptCount = 0;
+ super.startDocument(locator, encoding, augs);
+ } // startDocument(XMLLocator,String,Augmentations)
+
+ /** Start element: a script element of the NekoHTML script type starts buffering; anything else is passed on. */
+ public void startElement(QName element, XMLAttributes attrs, Augmentations augs)
+ throws XNIException {
+ // only a script element whose type attribute equals SCRIPT_TYPE (ignoring case) is intercepted
+ final boolean isScript = element.rawname.equalsIgnoreCase("script") && attrs != null;
+ final String type = isScript ? attrs.getValue("type") : null;
+ if (type == null || !type.equalsIgnoreCase(SCRIPT_TYPE)) {
+ super.startElement(element, attrs, augs);
+ return;
+ }
+ fBuffer = new StringBuffer(); // swallow the tag and start collecting its content
+ } // startElement(QName,XMLAttributes,Augmentations)
+
+ /** Empty element: an empty script element of the NekoHTML script type is dropped; anything else is passed on. */
+ public void emptyElement(QName element, XMLAttributes attrs, Augmentations augs)
+ throws XNIException {
+ final boolean isScript = element.rawname.equalsIgnoreCase("script") && attrs != null;
+ final String type = isScript ? attrs.getValue("type") : null;
+ // a matching empty script has no content to run, so the event is simply not forwarded
+ if (type != null && type.equalsIgnoreCase(SCRIPT_TYPE)) {
+ return;
+ }
+ super.emptyElement(element, attrs, augs);
+ } // emptyElement(QName,XMLAttributes,Augmentations)
+
+ /** Characters. */
+ public void characters(XMLString text, Augmentations augs)
+ throws XNIException {
+ if (fBuffer != null) {
+ fBuffer.append(text.ch, text.offset, text.length);
+ }
+ else {
+ super.characters(text, augs);
+ }
+ } // characters(XMLString,Augmentations)
+
+ /** End element: runs the buffered script and pushes its HTML output as a new input source; otherwise passes on. */
+ public void endElement(QName element, Augmentations augs) throws XNIException {
+ if (fBuffer != null) {
+ try {
+ // run "script" and generate HTML output, one command per line
+ BufferedReader in = new BufferedReader(new StringReader(fBuffer.toString()));
+ StringWriter sout = new StringWriter();
+ PrintWriter out = new PrintWriter(sout);
+ String line;
+ while ((line = in.readLine()) != null) {
+ line = line.trim(); // keep the result: trim() returns a new String, so indented commands are recognized
+ if (line.length() == 0) {
+ continue;
+ }
+ switch (line.charAt(0)) {
+ case '(': {
+ out.print('<');
+ out.print(line.substring(1));
+ out.print('>');
+ break;
+ }
+ case '"': {
+ out.print(line.substring(1));
+ break;
+ }
+ case ')': {
+ out.print("</");
+ out.print(line.substring(1));
+ out.print('>');
+ break;
+ }
+ }
+ }
+
+ // push new input source, named <system id>_script<n> (or script<n> without a system id)
+ String systemId = fSystemId != null ? fSystemId+'_' : "";
+ fScriptCount++;
+ systemId += "script"+fScriptCount;
+ XMLInputSource source = new XMLInputSource(null, systemId, null,
+ new StringReader(sout.toString()),
+ "UTF-8");
+ fConfiguration.pushInputSource(source);
+ }
+ catch (IOException e) {
+ // ignore -- the reader is backed by an in-memory string
+ }
+ finally {
+ fBuffer = null;
+ }
+ }
+ else {
+ super.endElement(element, augs);
+ }
+ } // endElement(QName,Augmentations)
+
+ //
+ // MAIN
+ //
+
+ /** Main: parses each argument through a Script, Identity, Writer filter chain with the augmentations feature on. */
+ public static void main(String[] argv) throws Exception {
+ HTMLConfiguration parser = new HTMLConfiguration();
+ parser.setFeature(AUGMENTATIONS, true);
+ XMLDocumentFilter[] filters = { new Script(parser), new Identity(), new Writer() };
+ parser.setProperty(FILTERS, filters);
+ for (int i = 0; i < argv.length; i++) {
+ parser.parse(new XMLInputSource(null, argv[i], null));
+ }
+ } // main(String[])
+
+} // class Script
Added: branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOM.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOM.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOM.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,52 @@
+/*
+ * (C) Copyright 2002-2004, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package sample;
+
+import org.cyberneko.html.parsers.DOMParser;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+
+/**
+ * This program tests the NekoHTML parser's use of the HTML DOM
+ * implementation by printing the class names of all the nodes in
+ * the parsed document.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: TestHTMLDOM.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
+ */
+public class TestHTMLDOM {
+
+ //
+ // MAIN
+ //
+
+ /** Main: parses each argument with one shared DOMParser and prints the class name of every node in the result. */
+ public static void main(String[] argv) throws Exception {
+ DOMParser parser = new DOMParser();
+ for (int i = 0; i < argv.length; i++) {
+ parser.parse(argv[i]);
+ print(parser.getDocument(), "");
+ }
+ } // main(String[])
+
+ //
+ // Public static methods
+ //
+
+ /** Prints the class name of {@code node} after {@code indent}, then each descendant with a longer indent. */
+ public static void print(Node node, String indent) {
+ System.out.println(indent+node.getClass().getName());
+ // depth-first walk over the children, first to last
+ final String deeper = indent+" ";
+ for (Node kid = node.getFirstChild(); kid != null; kid = kid.getNextSibling()) {
+ print(kid, deeper);
+ }
+ } // print(Node,String)
+
+} // class TestHTMLDOM
Added: branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOMFragment.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOMFragment.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/sample/TestHTMLDOMFragment.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,57 @@
+/*
+ * (C) Copyright 2002-2004, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package sample;
+
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Node;
+import org.w3c.dom.html.HTMLDocument;
+
+/**
+ * This program tests the NekoHTML parser's use of the HTML DOM
+ * implementation to parse document fragments by printing the
+ * class names of all the nodes in the parsed document.
+ *
+ * @author Andy Clark
+ *
+ * @version $Id: TestHTMLDOMFragment.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
+ */
+public class TestHTMLDOMFragment {
+
+ //
+ // MAIN
+ //
+
+ /** Main: parses each argument into a new fragment of one shared HTML document and prints its node class names. */
+ public static void main(String[] argv) throws Exception {
+ DOMFragmentParser parser = new DOMFragmentParser();
+ HTMLDocument document = new HTMLDocumentImpl();
+ for (int i = 0; i < argv.length; i++) {
+ DocumentFragment fragment = document.createDocumentFragment();
+ parser.parse(argv[i], fragment);
+ print(fragment, "");
+ }
+ } // main(String[])
+
+ //
+ // Public static methods
+ //
+
+ /** Prints the class name of {@code node} after {@code indent}, then recurses into its children. */
+ public static void print(Node node, String indent) {
+ System.out.println(indent+node.getClass().getName());
+ // children are printed with a longer indent than their parent
+ final String childIndent = indent+" ";
+ for (Node next = node.getFirstChild(); next != null; next = next.getNextSibling()) {
+ print(next, childIndent);
+ }
+ } // print(Node,String)
+
+} // class TestHTMLDOMFragment
\ No newline at end of file
Added: branches/nekohtml/upstream/0.9.5/src/html/test/Tester.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/test/Tester.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/test/Tester.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,256 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package test;
+
+import org.cyberneko.html.HTMLConfiguration;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.tools.ant.BuildException;
+import org.apache.tools.ant.DirectoryScanner;
+import org.apache.tools.ant.Project;
+import org.apache.tools.ant.Task;
+import org.apache.tools.ant.types.FileSet;
+
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParserConfiguration;
+
+/**
+ * A simple regression tester written as an Ant task. This task
+ * generates canonical output using the <code>Writer</code> class
+ * and compares it against the expected canonical output. Simple
+ * as that.
+ *
+ * @author Andy Clark
+ */
+public class Tester
+    extends Task {
+
+    //
+    // Data
+    //
+
+    /** Canonical test directory. */
+    protected String fCanonicalDir;
+
+    /** Output directory for generated files. */
+    protected String fOutputDir;
+
+    /** List of test filesets. */
+    protected Vector fFileSets = new Vector();
+
+    //
+    // Public methods
+    //
+
+    /** Sets the canonical test directory. */
+    public void setCanonDir(String canondir) {
+        fCanonicalDir = canondir;
+    } // setCanonDir(String)
+
+    /** Sets the output directory for generated files. */
+    public void setOutputDir(String outdir) {
+        fOutputDir = outdir;
+    } // setOutputDir(String)
+
+    /** Adds a fileset to the list of test filesets. */
+    public void addFileSet(FileSet fileset) {
+        fFileSets.addElement(fileset);
+    } // addFileSet(FileSet)
+
+    //
+    // Task methods
+    //
+
+    /**
+     * Performs the test in two passes: first every included file is
+     * parsed and its canonical form written to the output directory,
+     * then each generated file is compared with the file of the same
+     * relative name in the canonical directory.
+     *
+     * @throws BuildException if no fileset was given, if a file cannot
+     *                        be parsed or written, or if any comparison
+     *                        fails
+     */
+    public void execute() throws BuildException {
+
+        // check params
+        String canonicaldir = fCanonicalDir;
+        if (canonicaldir == null) {
+            canonicaldir = ".";
+            log("Canonical directory not specified. Assuming current directory.",
+                Project.MSG_WARN);
+        }
+        String outputdir = fOutputDir;
+        if (outputdir == null) {
+            outputdir = ".";
+            log("Output directory not specified. Assuming current directory.",
+                Project.MSG_WARN);
+        }
+        if (fFileSets.size() == 0) {
+            throw new BuildException("must specify at least one fileset");
+        }
+
+        // parse input files and produce output files
+        log("Parsing test files and generating output...");
+        File outdir = new File(outputdir);
+        int size = fFileSets.size();
+        for (int i = 0; i < size; i++) {
+            FileSet fileset = (FileSet)fFileSets.elementAt(i);
+            DirectoryScanner dirscanner = fileset.getDirectoryScanner(project);
+            File indir = dirscanner.getBasedir();
+            String[] files = dirscanner.getIncludedFiles();
+            for (int j = 0; j < files.length; j++) {
+                File infile = new File(indir, files[j]);
+                File outfile = new File(outdir, files[j]);
+                log(" "+outfile, Project.MSG_VERBOSE);
+                generate(infile, outfile);
+            }
+        }
+
+        // compare against canonical output
+        log("Comparing parsed output against canonical output...");
+        File canondir = new File(canonicaldir);
+        int errors = 0;
+        for (int i = 0; i < size; i++) {
+            FileSet fileset = (FileSet)fFileSets.elementAt(i);
+            DirectoryScanner dirscanner = fileset.getDirectoryScanner(project);
+            String[] files = dirscanner.getIncludedFiles();
+            for (int j = 0; j < files.length; j++) {
+                File canonfile = new File(canondir, files[j]);
+                if (!canonfile.exists()) {
+                    errors++;
+                    log(" canonical file missing, "+canonfile);
+                    continue;
+                }
+                File outfile = new File(outdir, files[j]);
+                if (!outfile.exists()) {
+                    errors++;
+                    log(" output file missing, "+outfile);
+                    continue;
+                }
+                log(" comparing "+canonfile+" and "+outfile, Project.MSG_VERBOSE);
+                try {
+                    if (compare(canonfile, outfile)) {
+                        errors++;
+                    }
+                }
+                catch (IOException e) {
+                    errors++;
+                    // say which pair failed and why instead of a bare "i/o error"
+                    log(" i/o error comparing "+canonfile+" and "+outfile+": "+e.getMessage());
+                }
+            }
+        }
+
+        // finished
+        if (errors > 0) {
+            log("Finished with errors.");
+            throw new BuildException("regression test failed for "+errors+" file(s)");
+        }
+        log("Done.");
+
+    } // execute()
+
+    //
+    // Private methods
+    //
+
+    /**
+     * Parses one input file and writes its canonical form to the given
+     * output file. If a file named like the input plus ".settings"
+     * exists, its entries are applied to the parser first.
+     *
+     * @param infile  the file to parse
+     * @param outfile the file receiving the canonical output
+     * @throws BuildException if parsing fails or the output file cannot
+     *                        be opened or closed
+     */
+    private void generate(File infile, File outfile) throws BuildException {
+        OutputStream out = null;
+        boolean parsed = false;
+        try {
+            // create filters
+            out = new FileOutputStream(outfile);
+            XMLDocumentFilter[] filters = { new Writer(out) };
+
+            // create parser
+            XMLParserConfiguration parser = new HTMLConfiguration();
+
+            // parser settings
+            parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
+            String infilename = infile.toString();
+            File insettings = new File(infilename+".settings");
+            if (insettings.exists()) {
+                applySettings(parser, insettings);
+            }
+
+            // parse
+            parser.parse(new XMLInputSource(null, infilename, null));
+            parsed = true;
+        }
+        catch (Exception e) {
+            log(" error parsing input file, "+infile);
+            throw new BuildException(e);
+        }
+        finally {
+            // out stays null when the output file could not be opened;
+            // closing it blindly would replace the real error with a
+            // NullPointerException
+            if (out != null) {
+                try {
+                    out.close();
+                }
+                catch (Exception e) {
+                    log(" error closing output file, "+outfile);
+                    // a close failure must not hide a parse failure that
+                    // is already on its way to the caller
+                    if (parsed) {
+                        throw new BuildException(e);
+                    }
+                }
+            }
+        }
+    } // generate(File,File)
+
+    /**
+     * Applies the entries of a settings file to the parser. Each
+     * non-blank line holds three whitespace-separated tokens: a type,
+     * an identifier and a value. Lines whose type is "feature" set a
+     * boolean feature (true only for the value "true"); all other lines
+     * set a property with the value as a string.
+     *
+     * @param parser     the parser configuration to modify
+     * @param insettings the settings file to read
+     * @throws IOException if the settings file cannot be read
+     */
+    private void applySettings(XMLParserConfiguration parser, File insettings)
+        throws IOException {
+        BufferedReader settings = new BufferedReader(new FileReader(insettings));
+        try {
+            String settingline;
+            while ((settingline = settings.readLine()) != null) {
+                StringTokenizer tokenizer = new StringTokenizer(settingline);
+                if (!tokenizer.hasMoreTokens()) {
+                    // tolerate blank lines, e.g. a trailing newline
+                    continue;
+                }
+                String type = tokenizer.nextToken();
+                String id = tokenizer.nextToken();
+                String value = tokenizer.nextToken();
+                if (type.equals("feature")) {
+                    parser.setFeature(id, value.equals("true"));
+                }
+                else {
+                    parser.setProperty(id, value);
+                }
+            }
+        }
+        finally {
+            // release the reader even when a line is malformed or the
+            // parser rejects a setting
+            settings.close();
+        }
+    } // applySettings(XMLParserConfiguration,File)
+
+    //
+    // Protected methods
+    //
+
+    /**
+     * Compares two files line by line as UTF-8 text. A leading UTF-8
+     * byte order mark in the first file is skipped. Comparison stops at
+     * the first difference, which is logged.
+     *
+     * @param f1 the canonical (expected) file
+     * @param f2 the generated file
+     * @return true if the files differ, false if they match
+     * @throws IOException if either file cannot be opened or read
+     */
+    protected boolean compare(File f1, File f2) throws IOException {
+        BufferedReader i1 = null;
+        BufferedReader i2 = null;
+        try {
+            i1 = new BufferedReader(new InputStreamReader(new UTF8BOMSkipper(new FileInputStream(f1)), "UTF8"));
+            i2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2), "UTF8"));
+            String l1;
+            String l2;
+            int errors = 0;
+            long n = 0;
+            while ((l1 = i1.readLine()) != null) {
+                n++;
+                if ((l2 = i2.readLine()) == null) {
+                    errors++;
+                    log(" file lengths don't match ("+f1+")");
+                    break;
+                }
+                if (compare(f1.getName(), n, l1, l2)) {
+                    errors++;
+                    break;
+                }
+            }
+            // the generated file must not have lines left over
+            if (errors == 0 && (l2 = i2.readLine()) != null) {
+                errors++;
+                log(" file lengths don't match ("+f1+")");
+            }
+            return errors > 0;
+        }
+        finally {
+            // close both readers on every path, including read errors
+            try {
+                if (i1 != null) {
+                    i1.close();
+                }
+            }
+            finally {
+                if (i2 != null) {
+                    i2.close();
+                }
+            }
+        }
+    } // compare(File,File):boolean
+
+    /**
+     * Compares one expected line with one generated line and logs the
+     * kind of mismatch together with both strings.
+     *
+     * @param f  name of the file, used in log messages
+     * @param n  line number, used in log messages
+     * @param s1 the expected line
+     * @param s2 the generated line
+     * @return true if the strings differ, false if they are equal
+     */
+    protected boolean compare(String f, long n, String s1, String s2) {
+        int l1 = s1.length();
+        int l2 = s2.length();
+        boolean error = false;
+        if (l1 < l2) {
+            error = true;
+            log(" "+f+':'+n+" output string too long");
+        }
+        else if (l1 > l2) {
+            error = true;
+            log(" "+f+':'+n+" output string too short");
+        }
+        else if (!s1.equals(s2)) {
+            error = true;
+            log(" "+f+':'+n+" strings don't match");
+        }
+        if (error) {
+            log(" [in: "+s1+']');
+            log(" [out: "+s2+']');
+        }
+        return error;
+    } // compare(String,long,String,String):boolean
+
+} // class Tester
Added: branches/nekohtml/upstream/0.9.5/src/html/test/UTF8BOMSkipper.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/test/UTF8BOMSkipper.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/test/UTF8BOMSkipper.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,111 @@
+/*
+ * (C) Copyright 2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package test;
+
+import java.io.*;
+
+/**
+ * This class is an input stream filter that skips the first
+ * three bytes read if they match the UTF-8 byte order mark,
+ * 0xEFBBBF. The UTF-8 BOM is most often generated by Windows®
+ * tools.
+ *
+ * @author Andy Clark
+ */
+public class UTF8BOMSkipper
+    extends FilterInputStream {
+
+    //
+    // Data
+    //
+
+    /** True until the head of the stream has been inspected. */
+    private boolean fStart = true;
+
+    /** Index of the next byte to replay from {@link #fFirst3Bytes}. */
+    private int fOffset;
+
+    /**
+     * Bytes that were consumed while looking for the byte order mark
+     * and still have to be handed to the caller, or null when nothing
+     * is pending. Only real bytes are stored, never an end-of-stream
+     * marker, so the array may hold fewer than three entries.
+     */
+    private int[] fFirst3Bytes;
+
+    //
+    // Constructors
+    //
+
+    /** Constructs a UTF-8 BOM skipper. */
+    public UTF8BOMSkipper(InputStream stream) {
+        super(stream);
+    } // <init>(InputStream)
+
+    //
+    // InputStream methods
+    //
+
+    /**
+     * Returns the next byte, skipping a UTF-8 byte order mark at the
+     * very start of the stream.
+     *
+     * @return the next byte as a value from 0 to 255, or -1 at the end
+     *         of the stream
+     * @throws IOException if the underlying stream fails
+     */
+    public int read() throws IOException {
+
+        // read first three bytes in order to skip UTF-8 BOM, if present
+        inspectStart();
+
+        // return read bytes
+        if (fFirst3Bytes != null) {
+            int b = fFirst3Bytes[fOffset++];
+            if (fOffset == fFirst3Bytes.length) {
+                fFirst3Bytes = null;
+            }
+            return b;
+        }
+
+        // return next byte
+        return super.read();
+
+    } // read():int
+
+    /**
+     * Reads bytes into the specified buffer and returns the number of
+     * bytes read. As permitted for input streams, fewer bytes than
+     * requested may be returned.
+     *
+     * @param buffer destination array
+     * @param offset index in the buffer of the first byte to store
+     * @param length maximum number of bytes to read
+     * @return the number of bytes stored, or -1 at the end of the stream
+     * @throws IOException if the underlying stream fails
+     */
+    public int read(byte[] buffer, int offset, int length) throws IOException {
+
+        if (buffer == null) {
+            throw new NullPointerException("buffer");
+        }
+        if (offset < 0 || length < 0 || length > buffer.length - offset) {
+            throw new IndexOutOfBoundsException();
+        }
+        if (length == 0) {
+            return 0;
+        }
+
+        inspectStart();
+        if (fFirst3Bytes == null) {
+            return super.read(buffer, offset, length);
+        }
+
+        // hand out the replayed bytes first
+        int count = 0;
+        while (count < length && fFirst3Bytes != null) {
+            buffer[offset + count] = (byte)read();
+            count++;
+        }
+
+        // top up from the underlying stream only when that cannot block;
+        // reading the remainder byte by byte would be slow and could
+        // stall on data that has not arrived yet
+        if (count < length && super.available() > 0) {
+            int more = super.read(buffer, offset + count, length - count);
+            if (more > 0) {
+                count += more;
+            }
+        }
+        return count;
+
+    } // read(byte[],int,int):int
+
+    /**
+     * Skips up to the given number of bytes. Bytes that are pending
+     * replay are consumed before the underlying stream is asked to
+     * skip, so the byte order stays intact and the BOM check is not
+     * bypassed.
+     *
+     * @param n number of bytes to skip
+     * @return the number of bytes actually skipped
+     * @throws IOException if the underlying stream fails
+     */
+    public long skip(long n) throws IOException {
+        if (n <= 0) {
+            return 0;
+        }
+        inspectStart();
+        long skipped = 0;
+        while (skipped < n && fFirst3Bytes != null) {
+            read();
+            skipped++;
+        }
+        if (skipped < n) {
+            skipped += super.skip(n - skipped);
+        }
+        return skipped;
+    } // skip(long):long
+
+    /** Mark is not supported for this input stream. */
+    public boolean markSupported() {
+        return false;
+    } // markSupported():boolean
+
+    /**
+     * Returns the number of bytes available. While replayed bytes are
+     * pending only those are counted.
+     */
+    public int available() throws IOException {
+        if (fFirst3Bytes != null) {
+            return fFirst3Bytes.length - fOffset;
+        }
+        return super.available();
+    } // available():int
+
+    //
+    // Private methods
+    //
+
+    /**
+     * On the first call, reads up to three bytes from the underlying
+     * stream. If they are exactly 0xEF 0xBB 0xBF they are dropped;
+     * otherwise the bytes actually read are queued for replay. Later
+     * calls do nothing.
+     */
+    private void inspectStart() throws IOException {
+        if (!fStart) {
+            return;
+        }
+        fStart = false;
+        int[] head = new int[3];
+        int count = 0;
+        while (count < head.length) {
+            int b = super.read();
+            if (b == -1) {
+                // stream is shorter than a BOM; keep only the real bytes
+                break;
+            }
+            head[count++] = b;
+        }
+        boolean bom = count == 3
+            && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF;
+        if (!bom && count > 0) {
+            fFirst3Bytes = new int[count];
+            System.arraycopy(head, 0, fFirst3Bytes, 0, count);
+        }
+    } // inspectStart()
+
+} // class UTF8BOMSkipper
Added: branches/nekohtml/upstream/0.9.5/src/html/test/Writer.java
===================================================================
--- branches/nekohtml/upstream/0.9.5/src/html/test/Writer.java 2006-08-18 12:36:39 UTC (rev 2344)
+++ branches/nekohtml/upstream/0.9.5/src/html/test/Writer.java 2006-08-22 16:00:27 UTC (rev 2345)
@@ -0,0 +1,310 @@
+/*
+ * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
+ *
+ * This file is distributed under an Apache style license. Please
+ * refer to the LICENSE file for specific details.
+ */
+
+package test;
+
+import org.cyberneko.html.filters.DefaultFilter;
+
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.xerces.util.XMLStringBuffer;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+
+/**
+ * This class implements a filter to output "canonical" files for
+ * regression testing.
+ *
+ * @author Andy Clark
+ */
+public class Writer
+    extends DefaultFilter {
+
+    //
+    // Data
+    //
+
+    /**
+     * Destination of the canonical output. Every constructor assigns
+     * it, so no default writer is created here.
+     */
+    protected PrintWriter out;
+
+    // temp vars
+
+    /** String buffer for collecting text content. */
+    private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
+
+    //
+    // Constructors
+    //
+
+    /**
+     * Creates a writer to the standard output stream using UTF-8
+     * encoding.
+     */
+    public Writer() {
+        this(System.out);
+    } // <init>()
+
+    /**
+     * Creates a writer with the specified output stream using UTF-8
+     * encoding.
+     */
+    public Writer(OutputStream stream) {
+        this(stream, "UTF8");
+    } // <init>(OutputStream)
+
+    /**
+     * Creates a writer with the specified output stream and encoding.
+     *
+     * @throws RuntimeException if the encoding is not supported; the
+     *                          original exception is kept as the cause
+     */
+    public Writer(OutputStream stream, String encoding) {
+        try {
+            out = new PrintWriter(new OutputStreamWriter(stream, encoding), true);
+        }
+        catch (UnsupportedEncodingException e) {
+            // keep the cause so the failing charset lookup stays traceable
+            throw new RuntimeException("JVM must have "+encoding+" decoder", e);
+        }
+    } // <init>(OutputStream,String)
+
+    /** Creates a writer with the specified Java Writer. */
+    public Writer(java.io.Writer writer) {
+        out = new PrintWriter(writer);
+    } // <init>(java.io.Writer)
+
+    //
+    // XMLDocumentHandler methods
+    //
+
+    // since Xerces-J 2.2.0
+
+    /** Start document. Discards any text still buffered. */
+    public void startDocument(XMLLocator locator, String encoding,
+                              NamespaceContext nscontext, Augmentations augs) throws XNIException {
+        fStringBuffer.clear();
+    } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+    // old methods
+
+    /** Start document. Delegates to the namespace-aware variant. */
+    public void startDocument(XMLLocator locator, String encoding, Augmentations augs) throws XNIException {
+        startDocument(locator, encoding, null, augs);
+    } // startDocument(XMLLocator,String,Augmentations)
+
+    /**
+     * XML declaration. Writes one line per non-null pseudo attribute,
+     * prefixed with "xversion ", "xencoding " or "xstandalone ".
+     */
+    public void xmlDecl(String version, String encoding, String standalone,
+                        Augmentations augs) throws XNIException {
+        if (version!=null) {
+            out.print("xversion ");
+            out.println(version);
+        }
+        if (encoding!=null) {
+            out.print("xencoding ");
+            out.println(encoding);
+        }
+        if (standalone!=null) {
+            out.print("xstandalone ");
+            out.println(standalone);
+        }
+        out.flush();
+    } // xmlDecl(String,String,String,Augmentations)
+
+    /**
+     * Doctype declaration. Writes a '!' line with the root name,
+     * followed by a 'p' line for the public identifier and an 's' line
+     * for the system identifier when they are present.
+     */
+    public void doctypeDecl(String root, String pubid, String sysid, Augmentations augs) throws XNIException {
+        chars();
+        out.print('!');
+        if (root != null) {
+            out.print(root);
+        }
+        out.println();
+        if (pubid != null) {
+            out.print('p');
+            out.print(pubid);
+            out.println();
+        }
+        if (sysid != null) {
+            out.print('s');
+            out.print(sysid);
+            out.println();
+        }
+        out.flush();
+    } // doctypeDecl(String,String,String,Augmentations)
+
+    /**
+     * Processing instruction. Writes a '?' line with the target and,
+     * if there is any, the escaped data after a space.
+     */
+    public void processingInstruction(String target, XMLString data, Augmentations augs) throws XNIException {
+        chars();
+        out.print('?');
+        out.print(target);
+        if (data != null && data.length > 0) {
+            out.print(' ');
+            print(data.toString());
+        }
+        out.println();
+        out.flush();
+    } // processingInstruction(String,XMLString,Augmentations)
+
+    /** Comment. Writes a '#' line with the escaped comment text. */
+    public void comment(XMLString text, Augmentations augs) throws XNIException {
+        chars();
+        out.print('#');
+        print(text.toString());
+        out.println();
+        out.flush();
+    } // comment(XMLString,Augmentations)
+
+    /**
+     * Start element. Writes a '(' line with the raw element name and
+     * then one 'A' line per attribute, ordered by attribute name. An
+     * attribute's namespace URI, when present, precedes its name in
+     * braces; the escaped value follows after a space.
+     */
+    public void startElement(QName element, XMLAttributes attrs, Augmentations augs) throws XNIException {
+        chars();
+        out.print('(');
+        out.print(element.rawname);
+        int acount = attrs != null ? attrs.getLength() : 0;
+        if (acount > 0) {
+            String[] anames = new String[acount];
+            String[] auris = new String[acount];
+            sortAttrNames(attrs, anames, auris);
+            for (int i = 0; i < acount; i++) {
+                String aname = anames[i];
+                // terminate the previous line before starting the attribute line
+                out.println();
+                out.flush();
+                out.print('A');
+                if (auris[i] != null) {
+                    out.print('{');
+                    out.print(auris[i]);
+                    out.print('}');
+                }
+                out.print(aname);
+                out.print(' ');
+                print(attrs.getValue(aname));
+            }
+        }
+        out.println();
+        out.flush();
+    } // startElement(QName,XMLAttributes,Augmentations)
+
+    /** End element. Writes a ')' line with the raw element name. */
+    public void endElement(QName element, Augmentations augs) throws XNIException {
+        chars();
+        out.print(')');
+        out.print(element.rawname);
+        out.println();
+        out.flush();
+    } // endElement(QName,Augmentations)
+
+    /** Empty element. Written as a start element followed by an end element. */
+    public void emptyElement(QName element, XMLAttributes attrs, Augmentations augs) throws XNIException {
+        startElement(element, attrs, augs);
+        endElement(element, augs);
+    } // emptyElement(QName,XMLAttributes,Augmentations)
+
+    /** Characters. Text is only buffered here and written by {@link #chars()}. */
+    public void characters(XMLString text, Augmentations augs) throws XNIException {
+        fStringBuffer.append(text);
+    } // characters(XMLString,Augmentations)
+
+    /** Ignorable whitespace. Treated like ordinary characters. */
+    public void ignorableWhitespace(XMLString text, Augmentations augs) throws XNIException {
+        characters(text, augs);
+    } // ignorableWhitespace(XMLString,Augmentations)
+
+    //
+    // Protected methods
+    //
+
+    /**
+     * Prints collected characters as a single '"' line and empties the
+     * buffer. Does nothing when no text is buffered.
+     * <p>
+     * NOTE(review): this class does not override endDocument, so text
+     * buffered after the last markup event appears never to be written.
+     * Confirm whether the canonical data depends on that before
+     * changing it.
+     */
+    protected void chars() {
+        if (fStringBuffer.length == 0) {
+            return;
+        }
+        out.print('"');
+        print(fStringBuffer.toString());
+        out.println();
+        out.flush();
+        fStringBuffer.clear();
+    } // chars()
+
+    /**
+     * Prints the specified string with newline, carriage return, tab
+     * and backslash replaced by two-character backslash escapes, so
+     * that every record stays on one output line. A null string prints
+     * nothing.
+     */
+    protected void print(String s) {
+        int length = s != null ? s.length() : 0;
+        for (int i = 0; i < length; i++) {
+            char c = s.charAt(i);
+            switch (c) {
+                case '\n': {
+                    out.print("\\n");
+                    break;
+                }
+                case '\r': {
+                    out.print("\\r");
+                    break;
+                }
+                case '\t': {
+                    out.print("\\t");
+                    break;
+                }
+                case '\\': {
+                    out.print("\\\\");
+                    break;
+                }
+                default: {
+                    out.print(c);
+                }
+            }
+        }
+    } // print(String)
+
+    //
+    // Protected static methods
+    //
+
+    /**
+     * Fills the given arrays with the qualified names and namespace
+     * URIs of the attributes and sorts both arrays in parallel by
+     * ascending name.
+     *
+     * @param attrs  the attributes to read
+     * @param anames receives the sorted qualified names; its length
+     *               determines how many attributes are read
+     * @param auris  receives the URI belonging to each name
+     */
+    protected static void sortAttrNames(XMLAttributes attrs,
+                                        String[] anames, String[] auris) {
+        for (int i = 0; i < anames.length; i++) {
+            anames[i] = attrs.getQName(i);
+            auris[i] = attrs.getURI(i);
+        }
+        // NOTE: This is super inefficient but it doesn't really matter. -Ac
+        // (selection sort that keeps the two arrays aligned)
+        for (int i = 0; i < anames.length - 1; i++) {
+            int index = i;
+            for (int j = i + 1; j < anames.length; j++) {
+                if (anames[j].compareTo(anames[index]) < 0) {
+                    index = j;
+                }
+            }
+            if (index != i) {
+                String tn = anames[i];
+                anames[i] = anames[index];
+                anames[index] = tn;
+                String tu = auris[i];
+                auris[i] = auris[index];
+                auris[index] = tu;
+            }
+        }
+    } // sortAttrNames(XMLAttributes,String[])
+
+    //
+    // MAIN
+    //
+
+    /**
+     * Main program. Parses every argument with an HTML parser
+     * configuration that has a single Writer to standard output
+     * installed as its filter.
+     */
+    public static void main(String[] argv) throws Exception {
+        org.apache.xerces.xni.parser.XMLDocumentFilter[] filters = {
+            new Writer(),
+        };
+        org.apache.xerces.xni.parser.XMLParserConfiguration parser =
+            new org.cyberneko.html.HTMLConfiguration();
+        parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
+        for (int i = 0; i < argv.length; i++) {
+            org.apache.xerces.xni.parser.XMLInputSource source =
+                new org.apache.xerces.xni.parser.XMLInputSource(null, argv[i], null);
+            parser.parse(source);
+        }
+    } // main(String[])
+
+} // class Writer
More information about the pkg-java-commits
mailing list