[PATCH 3/4] introduce utils/uni.py module

Nicolas Sebrecht nicolas.s-dev at laposte.net
Tue Feb 10 17:04:42 GMT 2015


This module gets all unicode-related stuff.

Signed-off-by: Nicolas Sebrecht <nicolas.s-dev at laposte.net>
---
 offlineimap/utils/uni.py | 487 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 487 insertions(+)
 create mode 100644 offlineimap/utils/uni.py

diff --git a/offlineimap/utils/uni.py b/offlineimap/utils/uni.py
new file mode 100644
index 0000000..a9df9df
--- /dev/null
+++ b/offlineimap/utils/uni.py
@@ -0,0 +1,487 @@
+# Copyright (C) 2015 Nicolas Sebrecht
+#
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+#
+
+# Low-level functions to work with unicode.
+#
+# Do our best to prevent double encoding/decoding. For this, we rely on the
+# types of the variables.
+
+#
+# Unicode in OfflineIMAP is documented in API.rst.
+#
+
+import sys
+import shutil
+import os
+import logging
+
+from .IMAPClient import imap_utf7
+
+# TODO: move out, add CLI option for -d.
+DEBUG = False
+if DEBUG:
+    import inspect
+
+
+# Expected encoding to work with, outside specific requirements. This encoding
+# can be used as a transition encoding.
+# There are assumptions that the ASCII charset is a subset
+# of this encoding.
+# TODO: add an option in configuration file.
+ENCODING = 'UTF-8'
+
+# Depends on the LANG or LC_CTYPE environment variables.
+FS_ENCODING = sys.getfilesystemencoding()
+
+# Standard legacy encoding is fixed. Mostly used to encode exception
+# messages.
+STD_ENCODING = 'ascii'
+
+
+
+class UniError(Exception):  # raised by this module when a conversion fails
+    pass
+
+def __exc_message(original_msg, exception_msg=None, add_msg=[]):
+    """Format exception message to be raised with UniError.
+
+    Meant for internal use only (functions on this module).
+
+    Understanding Python Unicode exceptions might be hard. Improve basic
+    exception by adding surrounding informations about context.
+
+    :params: handles 3 kinds of informations passed as arguments:
+    - original_msg: message from the original Python Unicode exception;
+    - exception_msg: optional context message about the caller's context (from
+      outside this module);
+    - add_msg: additional optional message from the function in this module
+      where the Python Unicode exception is raised.
+
+    Third argument is a list of lines nicely printed."""
+
+    # str() call ensure correct casting from other types like int.
+    exception_msg = str(uni2std(exception_msg))
+
+    msg = "Module 'uni:"
+    msg += "\nException message\n  (from original exception): %s"% \
+        uni2std(original_msg)
+    if exception_msg:
+        msg += "\nException message"
+        for line in exception_msg.split('\n'):
+            msg += "\n  (from caller): %s"% uni2std(line)
+    if add_msg:
+        msg += "\nException message"
+        if type(add_msg) == list:
+            for line in add_msg:
+                msg += "\n  (from function in module uni): %s"% \
+                    uni2std(line)
+        else:
+            msg += uni2std(add_msg)
+    return msg
+
+
+"""
+#############################################################
+
+# Actually work on encodings.
+
+# Preventing from double encoding/decoding is not an easy task. We try to avoid
+# that mess by relying on variable types.
+
+# Most functions here accept the unusual 'exception_msg' parameter, optional,
+# allowing better information on Unicode exceptions.
+
+#############################################################
+"""
+
+def convert(direction, s, encoding, errors, exception_msg=None):
+    """Lowest level function of the module to encode/decode."""
+
+    try:
+        if direction == 'uni2bytes' and type(s) == unicode:
+            target = s.encode(encoding, errors=errors)
+        elif direction == 'bytes2uni' and type(s) == str:
+            target = s.decode(encoding, errors=errors)
+        else:
+            target = s  # type does not fit direction: hand s back untouched
+        return target
+    except (UnicodeDecodeError, UnicodeEncodeError):
+        eclass, ex, tb = sys.exc_info()
+        msg = __exc_message(ex, uni2std(exception_msg), [
+            "direction=%s"% uni2std(direction),
+            "s: %s"% uni2std(s),
+            "type(s): %s"% str(type(s)),
+            "encoding=%s"% uni2std(encoding),
+            "errors=%s"% uni2std(errors)])
+        raise UniError(msg), None, tb  # keep the original traceback
+
+
+def uni2bytes(u, encoding=ENCODING, errors='strict', exception_msg=None):
+    """Wrapper to encode unicode types back to string of bytes.
+
+    The advantages are:
+    - tune the error raised to make it more user-friendly
+    - simplify/compact the code: do not require extra try/except"""
+
+    return convert('uni2bytes', u, encoding, errors, exception_msg)
+
+
+def bytes2uni(b, encoding=ENCODING, errors='strict', exception_msg=None):
+    """Wrapper to decode a string of bytes (str) to unicode; see convert()."""
+
+    return convert('bytes2uni', b, encoding, errors, exception_msg)
+
+
+def uni2str(u, exception_msg=None):
+    """Convert unicode to str type the hard way.
+
+    Will raise an exception if the string has any character outside of the ASCII
+    subset of Unicode."""
+
+    try:
+        if type(u) == unicode:
+            target = str(u)  # encodes with Python's default codec (ASCII)
+        else:
+            target = u
+        return target
+    except (UnicodeDecodeError, UnicodeEncodeError):
+        eclass, ex, tb = sys.exc_info()
+        msg = __exc_message(ex, uni2std(exception_msg), [
+            "unsupported character in input '%s'"% uni2std(u)])
+        raise UniError(msg), None, tb
+
+
+def uni2std(s):
+    """Convert string of bytes or unicode to a string of ASCII characters only.
+
+    Must always work without error so that it can safely be called whatever
+    Unicode support is enabled or not.
+
+    Usefull to encode text in exceptions if you're not sure encoding will work,
+    for example."""
+
+    # Handle string of bytes since it could already by encoded.
+    # In this case, we assume encoding to be ENCODING.
+    if type(s) == str:
+        s = bytes2uni(s)
+    return uni2bytes(s, encoding=STD_ENCODING, errors='replace')
+
+
+def fs2uni(path, errors='strict', exception_msg=None):
+    """Expected argument path is type str, encoded with FS_ENCODING.
+
+    Returns path decoded to type unicode; a path which is not of type str
+    is returned unchanged."""
+
+    try:
+        if type(path) == str:
+            target = bytes2uni(path, encoding=FS_ENCODING, errors=errors)
+        else:
+            target = path
+        return target
+    except (UnicodeDecodeError, UnicodeEncodeError):
+        eclass, ex, tb = sys.exc_info()
+        msg = __exc_message(ex, uni2std(exception_msg), [
+            "unsupported character in path '%s'"% uni2std(path)])
+        raise UniError(msg), None, tb
+
+
+def uni2fs(path, errors='strict', exception_msg=None):
+    """Expected argument path is unicode, or str which is returned unchanged.
+
+    Returns a unicode path encoded with FS_ENCODING (type str)."""
+
+    try:
+        if type(path) == unicode:
+            target = uni2bytes(path, encoding=FS_ENCODING, errors=errors)
+        else:
+            target = path
+        return target
+    except UnicodeEncodeError:
+        eclass, ex, tb = sys.exc_info()
+        msg = __exc_message(ex, uni2std(exception_msg), [
+            "unsupported character in path '%s'"% uni2std(path)])
+        raise UniError(msg), None, tb
+
+
+def isASCII(s):
+    try:
+        s.encode('ascii')
+        return True
+    except:
+        return False
+
+
+"""
+# IMAP modified UTF-7 charset has to be handled differently.
+#
+# IMAP charset is a way to encode non-ASCII characters with only ASCII
+# characters. Encoded characters are variable-length.  E.g. lowercase e-acute
+# is encoded to '&AOk-'.
+#
+# UTF-7 is NOT a Unicode standard but it is more efficient on the internet and
+# legacy compatible with the expectations of the server-side softwares running
+# Usenet, SMTP, etc.
+#
+# IMAP uses a modified version of UTF-7. See http://tools.ietf.org/html/rfc2060
+#
+# Anyway, this pure-ASCII encoding means that the encoded string can either be
+# in bytes or unicode types in Python.
+#
+# Have fun! ,-)
+"""
+
+def imap2uni(b, exception_msg=None):
+    """Input may still be a unicode string.
+
+    Returned value is Unicode."""
+
+    try:
+        return imap_utf7.decode(b)
+    except Exception as e:
+        eclass, ex, tb = sys.exc_info()
+        msg = __exc_message(ex, exception_msg, [
+            "unsupported character in input '%s'"% uni2std(b)])
+        raise UniError(msg), None, tb
+
+
+def uni2imap(u, exception_msg=None):
+    """Output is still a unicode string."""
+
+    try:
+        return imap_utf7.encode(u)
+    except Exception as e:
+        eclass, ex, tb = sys.exc_info()
+        msg = __exc_message(ex, exception_msg, [
+            "unsupported character in input '%s'"% uni2std(u)])
+        raise UniError(msg), None, tb
+
+
+"""
+#############################################################
+
+# Debugging and logging facilities.
+
+#############################################################
+"""
+
+def warn(msg):
+    logging.warn(u"UNICODE WARNING: %s"% msg)
+
+
+"""
+#############################################################
+
+# Factorized stuff.
+
+# From here, add the factorized functions/classes. They are useful for us only
+# when dealing with Unicode, that's why they stand in this module.
+
+# On the other hand, they are not purely related to Unicode in the sense that
+# they suppose knowledge of OfflineIMAP logic. No other kind of software would
+# make use of them. That's why they are considered as outside functions
+# regarding the exception_msg handling point of view of this module.
+
+#############################################################
+"""
+
+class UnicodeFormatter(logging.Formatter):
+    """Wrap logging.Formatter to handle Unicode.
+
+    We have to do this because each Handler handles Unicode in its own way.
+    Some handlers might not handle Unicode at all.
+    On top of that, the encoding varies with the Handler."""
+
+    def __init__(self, fmt, datefmt=None, encode_function=None):
+        logging.Formatter.__init__(self, fmt, datefmt)
+        self.encode_function = encode_function
+
+    def format(self, record):
+        """Format record, then pass the result to self.encode_function if
+        one was given to the constructor."""
+
+        result = logging.Formatter.format(self, record)
+        if self.encode_function:
+            if DEBUG and type(record.msg) == str:
+                infos = inspect.stack()[9]  # NOTE(review): fixed stack depth
+                warn(u"logger '%s' called with str type in: %s:%s\n"
+                    "  in function %s() \"%s\""% (record.name,
+                    record.pathname, record.lineno, infos[3], record.msg))
+            result = self.encode_function(result)
+        return result
+
+
+def diverged_foldernames(s, use_unicode):
+    """Compare foldernames between the expected and unexpected encodings.
+
+    Support of Unicode for foldernames has to do more than just encoding strings
+    right. If previously run without Unicode support, the folder might exist on
+    disk with the wrong encoding.
+
+    Above statement is also true for the opposite: if currently running with
+    Unicode support disabled while previously run with it enabled.
+
+    :param s: of type unicode if unicode support is enabled, of type str if
+              not. If not, we assume that encoding is IMAP UTF-7 (plain ASCII).
+              It MUST be the basename to avoid mixing unicode in the dirname part.
+
+    Returns 3 values:
+    - True if foldernames diverged, False otherwise (bool)
+    - The string encoded with the unexpected encoding (bytes)
+    - The string encoded with the expected encoding (bytes)
+    """
+
+    if use_unicode:
+        assert type(s) == unicode
+        uni_s = s
+    else:
+        assert type(s) == str
+        uni_s = fs2uni(s)
+
+
+    if use_unicode:
+        # Currently, s is standard Unicode code points.
+        expected = uni2fs(s)
+        # If unicode support were disabled we would have worked with a ASCII
+        # string of bytes encoded with IMAP UTF-7.
+        # Again, s is currently standard Unicode code points.
+        unexpected = str(uni2imap(s))
+    else:
+        # Currently, s is an ASCII string of bytes encoded with IMAP UTF-7.
+        expected = s
+        # If unicode support were enabled we would have worked with a filesystem
+        # encoded string of bytes.
+        # Again, s is currently an ASCII string of bytes encoded with IMAP UTF-7.
+        unexpected = uni2fs(imap2uni(s))
+
+    diverged = ( expected != unexpected )
+
+    if diverged and DEBUG:
+        warn(u"diverged_foldernames: got: %s (%s)"% (uni_s, str(type(s))))
+        warn(u"diverged_foldernames: unexpected: %s"% fs2uni(unexpected))
+        warn(u"diverged_foldernames: expected: %s"% fs2uni(expected))
+    return diverged, unexpected, expected
+
+
+def rename_diverged(root, old, new):
+    """Move root/old to root/new (all arguments are bytes).
+
+    We require root to avoid mixing encodings. Returns True once moved and
+    False when the move fails with errno 2; other IOError are re-raised.
+    :param:
+    - root: the dirname (as opposed to basename) (bytes)
+    - old: old filename (bytes)
+    - new: new filename (bytes)
+    """
+
+    assert type(root) == str
+    assert type(old) == str
+    assert type(new) == str
+
+    old = os.path.sep.join([root, old])
+    new = os.path.sep.join([root, new])
+
+    try:
+        if DEBUG:
+            warn(u"rename_diverged (old): %s (%s)"%
+                (fs2uni(old), str(type(old))))
+            warn(u"rename_diverged (new): %s (%s)"%
+                (fs2uni(new), str(type(new))))
+        shutil.move(old, new)
+        return True
+    except IOError as e:
+        if e.errno == 2:  # ENOENT: no such file or directory
+            if DEBUG:
+                warn(u"rename_diverged: not renaming "
+                    "folder '%s'"% fs2uni(new))
+            return False
+        else:
+            raise
+
+
+def help_message():
+    print("""
+Welcome to the Unicode world with OfflineIMAP. :-)
+
+Unicode is still an EXPERIMENTAL feature. Toying with it is very welcome because
+I can't test all possible options but you're advised to make good backups of
+both your mails and the cache. I aim to make Unicode the default but it won't
+happen without your help. So, here is a good way to play with this new feature.
+
+Some configuration options support UTF-8, some not. First, check the
+'offlineimap.conf' coming with your version for details. The very last WIP
+version (standing in the "next" branch) can be found online at 
+
+ https://github.com/OfflineIMAP/offlineimap/blob/next/offlineimap.conf
+
+but it might not match your local version of OfflineIMAP.
+
+Do keep your current configuration file intact. The best approach is to copy
+your 'offlineimaprc' to 'offlineimaprc.utf-8' and update the latter with UTF-8
+in mind.  Then the correct configuration file can be set (with the -c CLI
+option), according to the unicode CLI option you use.
+
+It's a good thing to also copy the content of your current 'metadata' and
+'localfolders'. Then, you'll have free hands to play on the copy (don't forget
+to update the paths in your 'offlineimaprc.utf8' accordingly).
+
+Working on a copy does not mean you should bypass the backups steps. Something
+might go very bad and delete all your mails from the server. Make REGULARY
+backups.
+
+Now that you are warned, I can tell you: the true option is --enable-unicode.
+Please, keep the existence of this option for you (don't communicate it to
+others) so that new comers will fall on this warning message, too.
+
+Not afraid? Good, I need you!
+
+Python 2 is not really consistent when it comes to Unicode and I expect unicode
+to come with subtle bugs. Subtle bugs require meticulous bug reports. This is
+not something hard to do, it just asks to be a bit rigorous. If you have to
+report bugs, follow the procedure at
+
+ https://github.com/OfflineIMAP/offlineimap/wiki/Unicode:-Reporting-bugs-about-Unicode-issues
+
+I intend to REJECT all the bug reports not following this procedure. I'm not a
+strong guy. I'd just like to keep both your life and mine as easy as possible
+while communicating about such bugs. I'm providing you all the ressources you
+might need to do so. It asked me a significant amount of time. Please, take the
+10 minutes to read the doc and follow the steps!
+
+I'm also requesting for POSITIVE feedbacks. For them to be usefull, read the
+link page above. Positive feedbacks will help to know when it will be suitable
+to turn Unicode support from EXPERIMENTAL to TESTING, remove this message, and
+finally make it the default.
+
+Last but not least, as soon as Unicode is used once it might not be possible to
+come back to --no-unicode safely (e.g. if any Unicode character was written to
+the cache). I've tried hard to make it not happen but I can't be categorical.
+This is code. Well, you actually took my advices into account and made a copy of
+your mails, metadata and configuration... Good! You're not concerned by this
+issue anymore. 
+
+If you intend to hack on Unicode, you should read both the API documentation and
+the utils/uni.py module. Last online versions can be found here:
+
+ http://docs.offlineimap.org/en/latest/API.html
+ https://github.com/OfflineIMAP/offlineimap/blob/next/offlineimap/utils/uni.py
+
+
+Have fun!
+
+--
+Nicolas Sebrecht """)
-- 
2.2.2





More information about the OfflineIMAP-project mailing list