[PATCH 08/12] deb822.Deb822Dict.dump: Add a text_mode parameter.

Sun Aug 31 21:26:14 UTC 2014

From: John Wright <jsw at google.com>

This will allow dumping to file(-like) objects that assume text/unicode
input (e.g. files that are opened in text mode, and io.StringIO).

(There is a lot of encoding stuff in deb822 that makes my head hurt.
Most of it is almost certainly wrong.  Lots of it could be ripped out,
but it's hard to know which users would break as a result or how.  It's
almost tempting to rip it out anyway and just commit to helping the
reverse-dependencies properly deal with text...)
---
 debian/changelog     |  2 ++
 lib/debian/deb822.py | 27 ++++++++++++++++++++-------
 tests/test_deb822.py | 14 ++++++++++++++
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 96dd58f..6d3ee73 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -25,6 +25,8 @@ python-debian (0.1.23) UNRELEASED; urgency=medium
   * Add a deb822.RestrictedWrapper class, for exposing read-only access
     to a Deb822 instance's field values as strings, while restricting
     write access to some fields, which are exposed via properties.
+  * deb822.Deb822Dict.dump: Add a text_mode parameter for dumping to
+    file(-like) objects that assume text/unicode input.
 
  -- Stuart Prescott <stuart at debian.org>  Fri, 13 Jun 2014 00:27:59 +1000
 
diff --git a/lib/debian/deb822.py b/lib/debian/deb822.py
index ec3f011..3f79e4a 100644
--- a/lib/debian/deb822.py
+++ b/lib/debian/deb822.py
@@ -249,6 +249,11 @@ class Deb822Dict(collections.MutableMapping):
             else:
                 raise
 
+        # TODO(jsw): Move the decoding logic into __setitem__ so that we decode
+        # it once instead of every time somebody asks for it.  Even better if
+        # Deb822* classes dealt in pure unicode and didn't care about the
+        # encoding of the files they came from...but I don't know how to fix
+        # that without breaking a bunch of users.
         return self._detect_encoding(value)
 
     def __delitem__(self, key):
@@ -490,16 +495,24 @@ class Deb822(Deb822Dict):
         """
         return six.text_type(self[key])
 
-    def dump(self, fd=None, encoding=None):
+    def dump(self, fd=None, encoding=None, text_mode=False):
         """Dump the the contents in the original format
 
-        If fd is None, return a unicode object.
+        If fd is None, returns a unicode object.  Otherwise, fd is assumed to
+        be a file-like object, and this method will write the data to it
+        instead of returning a unicode object.
 
-        If fd is not None, attempt to encode the output to the encoding the
-        object was initialized with, or the value of the encoding argument if
-        it is not None.  This will raise UnicodeEncodeError if the encoding
-        can't support all the characters in the Deb822Dict values.
+        If fd is not None and text_mode is False, the data will be encoded
+        to a byte string before writing to the file.  The encoding used is
+        chosen via the encoding parameter; None means to use the encoding the
+        object was initialized with (utf-8 by default).  This will raise
+        UnicodeEncodeError if the encoding can't support all the characters in
+        the Deb822Dict values.
         """
+        # Ideally this would never try to encode (that should be up to the
+        # caller when opening the file), but we may still have users who rely
+        # on the binary mode encoding.  But...might it be better to break them
+        # than to introduce yet another parameter relating to encoding?
 
         if fd is None:
             fd = StringIO()
@@ -522,7 +535,7 @@ class Deb822(Deb822Dict):
                 entry = '%s:%s\n' % (key, value)
             else:
                 entry = '%s: %s\n' % (key, value)
-            if not return_string:
+            if not return_string and not text_mode:
                 fd.write(entry.encode(encoding))
             else:
                 fd.write(entry)
diff --git a/tests/test_deb822.py b/tests/test_deb822.py
index 910a958..40049a6 100755
--- a/tests/test_deb822.py
+++ b/tests/test_deb822.py
@@ -19,6 +19,7 @@
 
 from __future__ import absolute_import
 
+import io
 import os
 import re
 import sys
@@ -284,6 +285,12 @@ PARSED_PARAGRAPHS_WITH_COMMENTS = [
 def open_utf8(filename, mode='r'):
     """Open a UTF-8 text file in text mode."""
     if sys.version < '3':
+        # TODO(jsw): This isn't actually doing what the docstring says.  The
+        # correct code (for both 2 and 3) is
+        #   io.open(filename, mode=mode, encoding='utf-8')
+        # but that makes a couple of other tests fail on 2.x (both related to
+        # apt_pkg - not surprisingly, its behavior with unicode objects isn't
+        # very consistent).
         return open(filename, mode=mode)
     else:
         return open(filename, mode=mode, encoding='UTF-8')
@@ -847,6 +854,13 @@ Description: python modules to work with Debian-related data formats
         f2.close()
         f1.close()
 
+    def test_dump_text_mode(self):
+        d = deb822.Deb822(CHANGES_FILE.splitlines())
+        buf = io.StringIO()
+        d.dump(fd=buf, text_mode=True)
+        self.assertEqual(CHANGES_FILE, buf.getvalue())
+
+
     def test_bug597249_colon_as_first_value_character(self):
         """Colon should be allowed as the first value character. See #597249.
         """
-- 
2.1.0