[xml/sgml-pkgs] Bug#770836: libxml2: please consider adding a patch fixing invalid output
Thorsten Glaser
t.glaser at tarent.de
Mon Nov 24 14:22:44 UTC 2014
Source: libxml2
Version: 2.9.2+dfsg1-1
Severity: wishlist
Tags: patch upstream forwarded-upstream
Forwarded: https://bugzilla.gnome.org/show_bug.cgi?id=739574
Hi,
please consider applying the attached patch in subsequent uploads,
at least until upstream has integrated it. The patch:
• replaces several ad hoc UTF-8 decoders with calls to a single one
  that does the job correctly (it validates the input string length
  and encoding, and checks that values are minimally encoded)
• in several places, checks that the values are actually permitted in
  XML documents, which restricts the Unicode code points that may be used
  ‣ where error handling was already in place, it is re-used
  ‣ otherwise the characters are silently dropped, so as not to break
    any existing application
This prevents, for example, a SOAP-WS client written in PHP from sending
invalid XML as a SOAP request over the wire for strings containing
e.g. literal backspace characters.
Thanks,
//mirabilos
--
tarent solutions GmbH
Rochusstraße 2-4, D-53123 Bonn • http://www.tarent.de/
Tel: +49 228 54881-393 • Fax: +49 228 54881-235
HRB 5168 (AG Bonn) • USt-ID (VAT): DE122264941
Geschäftsführer: Dr. Stefan Barth, Kai Ebenrett, Boris Esser, Alexander Steeg
-------------- next part --------------
diff -Nru libxml2-2.9.2+dfsg1/debian/changelog libxml2-2.9.2+dfsg1/debian/changelog
--- libxml2-2.9.2+dfsg1/debian/changelog 2014-10-26 02:45:27.000000000 +0200
+++ libxml2-2.9.2+dfsg1/debian/changelog 2014-11-24 14:05:49.000000000 +0100
@@ -1,3 +1,10 @@
+libxml2 (2.9.2+dfsg1-1.0tarent1) tarent; urgency=medium
+
+ * Non-maintainer upload.
+ * Add patch fixing XML and UTF-8 character validity of output
+
+ -- Thorsten Glaser <t.glaser at tarent.de> Mon, 24 Nov 2014 14:05:46 +0100
+
libxml2 (2.9.2+dfsg1-1) unstable; urgency=low
* New upstream release (Closes: #765722, CVE-2014-3660)
diff -Nru libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch
--- libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch 1970-01-01 01:00:00.000000000 +0100
+++ libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch 2014-11-24 14:05:44.000000000 +0100
@@ -0,0 +1,330 @@
+# DP: Fix emitting invalid XML (things not IS_BYTE_CHAR or not IS_CHAR).
+# DP: Bonus: be correct when decoding UTF-8.
+# DP: Invalid XML or UTF-8 is silently skipped, unless existing code
+# DP: dealt with error conditions already.
+# DP: Bug: https://bugzilla.gnome.org/show_bug.cgi?id=739574
+# DP: Author: mirabilos <t.glaser at tarent.de>
+
+--- a/entities.c
++++ b/entities.c
+@@ -25,6 +25,84 @@
+ #include "save.h"
+
+ /*
++ * Bonus: strict UTF-8 decoder, for use here and elsewhere. Decodes one
++ * sequence of at most buflen octets at buf into *wcp: U+0000‥U+D7FF,
++ * U+E000‥U+FFFD or U-00010000‥U-0010FFFF; returns number of octets used.
++ * In error case, returns 0 and does not change *wcp. Use 5 for buflen if
++ * buf is NUL-terminated (NUL is no valid trail octet, so reading stops).
++ */
++unsigned int
++xmlInternalUTF8decode(unsigned int *wcp, const void *buf, size_t buflen)
++{
++	unsigned int wc, to, lo;
++	const unsigned char *src = buf;
++
++	if (buflen < 1)
++		goto xmlInternalUTF8decode_error;
++	wc = *src++;
++	/* plain ASCII (including NUL) is a complete one-octet sequence */
++	if (wc < 0x80)
++		goto xmlInternalUTF8decode_success;
++	/* reject stray trail octets, overlong leads C0/C1 and leads above F4 */
++	if (wc < 0xC2 || wc > 0xF4)
++		goto xmlInternalUTF8decode_error;
++	/* first trail octet: must be 10xxxxxx, i.e. 0x00..0x3F after XOR 0x80 */
++	if (buflen < 2)
++		goto xmlInternalUTF8decode_error;
++	if ((to = *src++ ^ 0x80) > 0x3F)
++		goto xmlInternalUTF8decode_error;
++	to &= 0x3F;
++	/* check for 2-octet sequence; lo keeps the lead octet for later tests */
++	if ((lo = wc) < 0xE0) {
++		wc = ((wc & 0x1F) << 6) | to;
++		/* check for minimal encoding */
++		if (wc < 0x80)
++			goto xmlInternalUTF8decode_error;
++		goto xmlInternalUTF8decode_success;
++	}
++	/* differentiate between 3-octet and 4-octet sequences */
++	if (lo < 0xF0)
++		wc = ((wc & 0x0F) << 12) | (to << 6);
++	else
++		wc = ((wc & 0x07) << 18) | (to << 12);
++
++	/* second trail octet: same 10xxxxxx test as above */
++	if (buflen < 3)
++		goto xmlInternalUTF8decode_error;
++	if ((to = *src++ ^ 0x80) > 0x3F)
++		goto xmlInternalUTF8decode_error;
++	to &= 0x3F;
++	/* check for 3-octet sequence */
++	if (lo < 0xF0) {
++		wc |= to;
++		/* reject overlong forms, U+FFFE/U+FFFF and UTF-16 surrogates */
++		if (wc < 0x800 || wc > 0xFFFD || (wc & 0xF800) == 0xD800)
++			goto xmlInternalUTF8decode_error;
++		goto xmlInternalUTF8decode_success;
++	}
++	wc |= (to << 6);
++	/* third trail octet: same 10xxxxxx test as above */
++	if (buflen < 4)
++		goto xmlInternalUTF8decode_error;
++	if ((to = *src++ ^ 0x80) > 0x3F)
++		goto xmlInternalUTF8decode_error;
++	/* handle 4-octet sequence */
++	wc |= to & 0x3F;
++	/* reject overlong forms and values beyond the Unicode range */
++	if (wc < 0x10000 || wc > 0x10FFFF)
++		goto xmlInternalUTF8decode_error;
++
++ xmlInternalUTF8decode_success:
++	if (wcp)
++		*wcp = wc;
++	return ((unsigned int)(src - ((const unsigned char *)buf)));
++
++ xmlInternalUTF8decode_error:
++	return (0);
++}
++
++
++/*
+ * The XML predefined entities.
+ */
+
+@@ -663,46 +741,35 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc,
+ * We assume we have UTF-8 input.
+ */
+ char buf[11], *ptr;
+- int val = 0, l = 1;
++ unsigned int val, l;
+
+- if (*cur < 0xC0) {
++ if (!(l = xmlInternalUTF8decode(&val, cur, 5))) {
+ xmlEntitiesErr(XML_CHECK_NOT_UTF8,
+ "xmlEncodeEntities: input not UTF-8");
+ if (doc != NULL)
+ doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
++ if (!IS_BYTE_CHAR(*cur)) {
++ /* just skip the offending character */
++ cur++;
++ continue;
++ }
+ snprintf(buf, sizeof(buf), "&#%d;", *cur);
+ buf[sizeof(buf) - 1] = 0;
+ ptr = buf;
+ while (*ptr != 0) *out++ = *ptr++;
+ cur++;
+ continue;
+- } else if (*cur < 0xE0) {
+- val = (cur[0]) & 0x1F;
+- val <<= 6;
+- val |= (cur[1]) & 0x3F;
+- l = 2;
+- } else if (*cur < 0xF0) {
+- val = (cur[0]) & 0x0F;
+- val <<= 6;
+- val |= (cur[1]) & 0x3F;
+- val <<= 6;
+- val |= (cur[2]) & 0x3F;
+- l = 3;
+- } else if (*cur < 0xF8) {
+- val = (cur[0]) & 0x07;
+- val <<= 6;
+- val |= (cur[1]) & 0x3F;
+- val <<= 6;
+- val |= (cur[2]) & 0x3F;
+- val <<= 6;
+- val |= (cur[3]) & 0x3F;
+- l = 4;
+ }
+ if ((l == 1) || (!IS_CHAR(val))) {
+ xmlEntitiesErr(XML_ERR_INVALID_CHAR,
+ "xmlEncodeEntities: char out of range\n");
+ if (doc != NULL)
+ doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
++ if (!IS_BYTE_CHAR(*cur)) {
++ /* just skip the offending character */
++ cur++;
++ continue;
++ }
+ snprintf(buf, sizeof(buf), "&#%d;", *cur);
+ buf[sizeof(buf) - 1] = 0;
+ ptr = buf;
+@@ -842,11 +909,17 @@ xmlEncodeSpecialChars(const xmlDoc *doc
+ *out++ = '3';
+ *out++ = ';';
+ } else {
+- /*
+- * Works because on UTF-8, all extended sequences cannot
+- * result in bytes in the ASCII range.
+- */
+- *out++ = *cur;
++ unsigned int wc, wl;
++
++ if ((wl = xmlInternalUTF8decode(&wc, cur, 5)) && IS_CHAR(wc)) {
++ /* copy correct UTF-8 sequence */
++ while (wl--)
++ *out++ = *cur++;
++ continue;
++ }
++ /* we can still copy it, but only if allowed */
++ if (IS_BYTE_CHAR(*cur))
++ *out++ = *cur;
+ }
+ cur++;
+ }
+--- a/include/libxml/parserInternals.h
++++ b/include/libxml/parserInternals.h
+@@ -636,6 +636,9 @@ XMLPUBFUN void XMLCALL
+ XMLPUBFUN void XMLCALL
+ xmlErrMemory (xmlParserCtxtPtr ctxt,
+ const char *extra);
++
++unsigned int xmlInternalUTF8decode(unsigned int *wcp, const void *buf, size_t buflen)
++ __attribute__((__visibility__("hidden")));
+ #endif
+
+ #ifdef __cplusplus
+--- a/xmlIO.c
++++ b/xmlIO.c
+@@ -3570,7 +3570,15 @@ xmlEscapeContent(unsigned char* out, int
+ *out++ = '3';
+ *out++ = ';';
+ } else {
+- *out++ = (unsigned char) *in;
++ unsigned int wc, wl;
++
++ if ((wl = xmlInternalUTF8decode(&wc, in, inend - in)) && IS_CHAR(wc)) {
++ if (outend - out < wl) break;
++ /* copy correct UTF-8 sequence */
++ while (wl--)
++ *out++ = *in++;
++ continue;
++ }
+ }
+ ++in;
+ }
+--- a/xmlsave.c
++++ b/xmlsave.c
+@@ -249,44 +249,19 @@ xmlEscapeEntities(unsigned char* out, in
+ *out++ = *in++;
+ continue;
+ } else if (*in >= 0x80) {
++ unsigned int wc, wl;
++
+ /*
+ * We assume we have UTF-8 input.
+ */
+ if (outend - out < 11) break;
+
+- if (*in < 0xC0) {
++ if (!(wl = xmlInternalUTF8decode(&wc, in, inend - in))) {
+ xmlSaveErr(XML_SAVE_NOT_UTF8, NULL, NULL);
+ in++;
+ goto error;
+- } else if (*in < 0xE0) {
+- if (inend - in < 2) break;
+- val = (in[0]) & 0x1F;
+- val <<= 6;
+- val |= (in[1]) & 0x3F;
+- in += 2;
+- } else if (*in < 0xF0) {
+- if (inend - in < 3) break;
+- val = (in[0]) & 0x0F;
+- val <<= 6;
+- val |= (in[1]) & 0x3F;
+- val <<= 6;
+- val |= (in[2]) & 0x3F;
+- in += 3;
+- } else if (*in < 0xF8) {
+- if (inend - in < 4) break;
+- val = (in[0]) & 0x07;
+- val <<= 6;
+- val |= (in[1]) & 0x3F;
+- val <<= 6;
+- val |= (in[2]) & 0x3F;
+- val <<= 6;
+- val |= (in[3]) & 0x3F;
+- in += 4;
+- } else {
+- xmlSaveErr(XML_SAVE_CHAR_INVALID, NULL, NULL);
+- in++;
+- goto error;
+ }
++ val = wc;
+ if (!IS_CHAR(val)) {
+ xmlSaveErr(XML_SAVE_CHAR_INVALID, NULL, NULL);
+ in++;
+@@ -2103,48 +2078,31 @@ xmlBufAttrSerializeTxtContent(xmlBufPtr
+ * We assume we have UTF-8 content.
+ */
+ unsigned char tmp[12];
+- int val = 0, l = 1;
++ unsigned int val, l;
+
+ if (base != cur)
+ xmlBufAdd(buf, base, cur - base);
+- if (*cur < 0xC0) {
++ if (!(l = xmlInternalUTF8decode(&val, cur, 5))) {
+ xmlSaveErr(XML_SAVE_NOT_UTF8, (xmlNodePtr) attr, NULL);
+ if (doc != NULL)
+ doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
+- xmlSerializeHexCharRef(tmp, *cur);
+- xmlBufAdd(buf, (xmlChar *) tmp, -1);
++ if (IS_BYTE_CHAR(*cur)) {
++ xmlSerializeHexCharRef(tmp, *cur);
++ xmlBufAdd(buf, (xmlChar *) tmp, -1);
++ }
+ cur++;
+ base = cur;
+ continue;
+- } else if (*cur < 0xE0) {
+- val = (cur[0]) & 0x1F;
+- val <<= 6;
+- val |= (cur[1]) & 0x3F;
+- l = 2;
+- } else if (*cur < 0xF0) {
+- val = (cur[0]) & 0x0F;
+- val <<= 6;
+- val |= (cur[1]) & 0x3F;
+- val <<= 6;
+- val |= (cur[2]) & 0x3F;
+- l = 3;
+- } else if (*cur < 0xF8) {
+- val = (cur[0]) & 0x07;
+- val <<= 6;
+- val |= (cur[1]) & 0x3F;
+- val <<= 6;
+- val |= (cur[2]) & 0x3F;
+- val <<= 6;
+- val |= (cur[3]) & 0x3F;
+- l = 4;
+- }
++ }
+ if ((l == 1) || (!IS_CHAR(val))) {
+ xmlSaveErr(XML_SAVE_CHAR_INVALID, (xmlNodePtr) attr, NULL);
+ if (doc != NULL)
+ doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
+
+- xmlSerializeHexCharRef(tmp, *cur);
+- xmlBufAdd(buf, (xmlChar *) tmp, -1);
++ if (IS_BYTE_CHAR(*cur)) {
++ xmlSerializeHexCharRef(tmp, *cur);
++ xmlBufAdd(buf, (xmlChar *) tmp, -1);
++ }
+ cur++;
+ base = cur;
+ continue;
+@@ -2157,6 +2115,11 @@ xmlBufAttrSerializeTxtContent(xmlBufPtr
+ xmlBufAdd(buf, (xmlChar *) tmp, -1);
+ cur += l;
+ base = cur;
++ } else if (!IS_BYTE_CHAR(*cur)) {
++ if (base != cur)
++ xmlBufAdd(buf, base, cur - base);
++ cur++;
++ base = cur;
+ } else {
+ cur++;
+ }
diff -Nru libxml2-2.9.2+dfsg1/debian/patches/series libxml2-2.9.2+dfsg1/debian/patches/series
--- libxml2-2.9.2+dfsg1/debian/patches/series 2014-10-26 01:04:04.000000000 +0200
+++ libxml2-2.9.2+dfsg1/debian/patches/series 2014-11-21 15:36:57.000000000 +0100
@@ -1,2 +1,3 @@
0001-modify-xml2-config-and-pkgconfig-behaviour.patch
0002-fix-python-multiarch-includes.patch
+quell-omitting-invalid-XML-chars.patch
More information about the debian-xml-sgml-pkgs
mailing list