[Python-modules-commits] r13407 - in packages/pdfminer/trunk/debian (9 files)

jwilk at users.alioth.debian.org jwilk at users.alioth.debian.org
Sat Jun 12 17:45:30 UTC 2010


    Date: Saturday, June 12, 2010 @ 17:45:29
  Author: jwilk
Revision: 13407

Overhaul of encoding data storage.

Added:
  packages/pdfminer/trunk/debian/patches/encoding-data.diff
  packages/pdfminer/trunk/debian/pdfminer-data.install
    (from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer-cmap.install)
  packages/pdfminer/trunk/debian/python-pdfminer.install
    (from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer.install)
Modified:
  packages/pdfminer/trunk/debian/changelog
  packages/pdfminer/trunk/debian/clean
  packages/pdfminer/trunk/debian/control
  packages/pdfminer/trunk/debian/patches/series
  packages/pdfminer/trunk/debian/rules
Deleted:
  packages/pdfminer/trunk/debian/patches/poppler-data.diff

Modified: packages/pdfminer/trunk/debian/changelog
===================================================================
--- packages/pdfminer/trunk/debian/changelog	2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/changelog	2010-06-12 17:45:29 UTC (rev 13407)
@@ -6,7 +6,7 @@
       [dfsg-testsuite.diff]
   * Disable test suite for psparser.py, as it is currently broken.
     [psparser-testsuite.diff]
-  * Reuse CMaps provided by the poppler-data package.
-    [poppler-data.diff]
+  * Store encoding data in gzipped pickles rather than in Python modules.
+    This way we can save lots of disk space. [encoding-data.diff]
 
  -- Jakub Wilk <jwilk at debian.org>  Sat, 12 Jun 2010 16:54:14 +0200

Modified: packages/pdfminer/trunk/debian/clean
===================================================================
--- packages/pdfminer/trunk/debian/clean	2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/clean	2010-06-12 17:45:29 UTC (rev 13407)
@@ -1,2 +1,3 @@
 debian/manpages/*.[0-9]
 docs/changelog
+pdfminer/cmap/*.gz

Modified: packages/pdfminer/trunk/debian/control
===================================================================
--- packages/pdfminer/trunk/debian/control	2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/control	2010-06-12 17:45:29 UTC (rev 13407)
@@ -16,7 +16,7 @@
 Package: python-pdfminer
 Architecture: all
 Depends: ${misc:Depends}, ${python:Depends}
-Suggests: poppler-data
+Suggests: pdfminer-data
 Description: PDF parser and analyser
  PDFMiner is a tool for extracting information from PDF documents, which
  focuses entirely on getting and analyzing text data. It allows to obtain the
@@ -27,3 +27,14 @@
  .
  This package provides the Python module and the command-line tools: pdf2txt
  and dumppdf.
+
+Package: pdfminer-data
+Architecture: all
+Depends: ${misc:Depends}
+Recommends: python-pdfminer
+Description: PDF parser and analyser (encoding data)
+ PDFMiner is a tool for extracting information from PDF documents, which
+ focuses entirely on getting and analyzing text data. 
+ .
+ This package contains the encoding data needed to read some PDF documents in
+ CJK (Chinese, Japanese, Korean) languages.

Added: packages/pdfminer/trunk/debian/patches/encoding-data.diff
===================================================================
--- packages/pdfminer/trunk/debian/patches/encoding-data.diff	                        (rev 0)
+++ packages/pdfminer/trunk/debian/patches/encoding-data.diff	2010-06-12 17:45:29 UTC (rev 13407)
@@ -0,0 +1,193 @@
+Description:
+  Store encoding data in gzipped pickles rather than in Python modules.
+  This way we can save lots of disk space.
+Author: Jakub Wilk <jwilk at debian.org>
+Forwarded: not-needed
+Last-Update: 2010-06-12
+
+--- a/setup.py
++++ b/setup.py
+@@ -22,6 +22,9 @@
+     'pdfminer',
+     'pdfminer.cmap'
+     ],
++    package_data={
++    'pdfminer.cmap': ['*.pickle.gz'],
++    },
+     scripts=[
+     'tools/pdf2txt.py',
+     'tools/dumppdf.py'
+--- a/Makefile
++++ b/Makefile
+@@ -36,17 +36,17 @@
+ CONV_CMAP=$(PYTHON) tools/conv_cmap.py
+ CMAPSRC=cmaprsrc
+ CMAPDST=pdfminer/cmap
+-cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
+-	$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
++cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
++	$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
+ cmap_clean:
+ 	cd $(CMAPDST) && make cmap_clean
+-$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
++$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
+ 	$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+-$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
++$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
+ 	$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+-$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
++$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
+ 	$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+-$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
++$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
+ 	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+ 
+ test: cmap
+--- a/pdfminer/cmapdb.py
++++ b/pdfminer/cmapdb.py
+@@ -15,6 +15,9 @@
+ import re
+ import os
+ import os.path
++import gzip
++import cPickle as pickle
++import cmap
+ from struct import pack, unpack
+ from psparser import PSStackParser
+ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
+@@ -210,34 +213,50 @@
+ class CMapDB(object):
+ 
+     debug = 0
++    _cmap_cache = {}
++    _umap_cache = {}
+     
+     class CMapNotFound(CMapError): pass
+ 
+     @classmethod
++    def _load_data(klass, name):
++        filename = '%s.pickle.gz' % name
++        if klass.debug:
++            print >>sys.stderr, 'loading:', name
++        for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/':
++            path = os.path.join(directory, filename)
++            if os.path.exists(path):
++                gzfile = gzip.open(path)
++                try:
++                    return type(name, (), pickle.loads(gzfile.read()))
++                finally:
++                    gzfile.close()
++        else:
++            raise CMapDB.CMapNotFound(name)
++
++    @classmethod
+     def get_cmap(klass, name):
+         if name == 'Identity-H':
+             return IdentityCMap(False)
+         elif name == 'Identity-V':
+             return IdentityCMap(True)
+-        modname = 'pdfminer.cmap.%s' % name.replace('-','_')
+-        if klass.debug:
+-            print >>sys.stderr, 'loading:', modname
+         try:
+-            module = __import__(modname, fromlist=['pdfminer.cmap'])
+-        except ImportError:
+-            raise CMapDB.CMapNotFound(name)
+-        return PyCMap(name, module)
++            return klass._cmap_cache[name]
++        except KeyError:
++            pass
++        data = klass._load_data(name)
++        klass._cmap_cache[name] = cmap = PyCMap(name, data)
++        return cmap
+ 
+     @classmethod
+     def get_unicode_map(klass, name, vertical=False):
+-        modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
+-        if klass.debug:
+-            print >>sys.stderr, 'loading:', modname, vertical
+         try:
+-            module = __import__(modname, fromlist=['pdfminer.cmap'])
+-        except ImportError:
+-            raise CMapDB.CMapNotFound(name)
+-        return PyUnicodeMap(name, module, vertical)
++            return klass._umap_cache[name][vertical]
++        except KeyError:
++            pass
++        data = klass._load_data('to-unicode-%s' % name)
++        klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
++        return umaps[vertical]
+ 
+ 
+ ##  CMapParser
+--- a/pdfminer/cmap/Makefile
++++ b/pdfminer/cmap/Makefile
+@@ -6,5 +6,4 @@
+ 	-rm *.pyc *.pyo
+ 
+ cmap_clean:
+-	-rm *.py
+-	touch __init__.py
++	rm -f *.pickle.gz
+--- a/tools/conv_cmap.py
++++ b/tools/conv_cmap.py
+@@ -1,6 +1,8 @@
+ #!/usr/bin/env python
+ import sys
+ import os.path
++import gzip
++import cPickle as pickle
+ 
+ def process_cid2code(fp, check_codecs=[]):
+ 
+@@ -118,9 +120,6 @@
+         print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
+         return 100
+     
+-    def pyname(name):
+-        return name.replace('-','_')+'.py'
+-
+     args = argv[1:]
+     if len(args) < 3: return usage()
+     (outdir, regname, src) = args[:3]
+@@ -132,22 +131,24 @@
+     fp.close()
+ 
+     for (name, cmap) in code2cid.iteritems():
+-        fname = pyname(name)
++        fname = '%s.pickle.gz' % name
+         print >>sys.stderr, 'writing %r...' % fname
+-        fp = file(os.path.join(outdir, fname), 'w')
+-        print >>fp, '#!/usr/bin/env python'
+-        print >>fp, '#', fname
+-        print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
+-        print >>fp, 'CODE2CID = %r' % cmap
++        fp = gzip.open(os.path.join(outdir, fname), 'wb')
++        data = dict(
++            IS_VERTICAL=is_vertical.get(name, False),
++            CODE2CID=cmap,
++        )
++        fp.write(pickle.dumps(data))
+         fp.close()
+ 
+-    fname = 'TO_UNICODE_'+pyname(regname)
++    fname = 'to-unicode-%s.pickle.gz' % regname
+     print >>sys.stderr, 'writing %r...' % fname
+-    fp = file(os.path.join(outdir, fname), 'w')
+-    print >>fp, '#!/usr/bin/env python'
+-    print >>fp, '#', fname
+-    print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
+-    print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
++    fp = gzip.open(os.path.join(outdir, fname), 'wb')
++    data = dict(
++        CID2UNICHR_H=cid2unichr_h,
++        CID2UNICHR_V=cid2unichr_v,
++    )
++    fp.write(pickle.dumps(data))
+     fp.close()
+ 
+     return 0
+--- /dev/null
++++ b/pdfminer/cmap/__init__.py
+@@ -0,0 +1 @@
++#

Deleted: packages/pdfminer/trunk/debian/patches/poppler-data.diff
===================================================================
--- packages/pdfminer/trunk/debian/patches/poppler-data.diff	2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/patches/poppler-data.diff	2010-06-12 17:45:29 UTC (rev 13407)
@@ -1,95 +0,0 @@
-Description:
-  Reuse CMaps provided by the poppler-data package. This way we don't need to
-  ship pdfminer.cmap.* modules in the Debian package.
-Author: Jakub Wilk <jwilk at debian.org>
-Bug-Debian: http://bugs.debian.org/584555
-Forwarded: not-needed
-Last-Update: 2010-06-08
-
---- a/pdfminer/cmapdb.py
-+++ b/pdfminer/cmapdb.py
-@@ -15,6 +15,7 @@
- import re
- import os
- import os.path
-+import glob
- from struct import pack, unpack
- from psparser import PSStackParser
- from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
-@@ -204,6 +205,22 @@
-     def __repr__(self):
-         return '<PyUnicodeMap: %s>' % (self.name)
- 
-+def _cache(method):
-+
-+    def wrapped_method(self, *args):
-+        cache_attr_name = '_%s_cache' % method.__name__
-+        cache = getattr(self, cache_attr_name, None)
-+        if cache is None:
-+            cache = {}
-+            setattr(self, cache_attr_name, cache)
-+        try:
-+            return cache[args]
-+        except KeyError:
-+            result = method(self, *args)
-+            cache[args] = result
-+            return result
-+
-+    return wrapped_method
- 
- ##  CMapDB
- ##
-@@ -214,6 +231,7 @@
-     class CMapNotFound(CMapError): pass
- 
-     @classmethod
-+    @_cache
-     def get_cmap(klass, name):
-         if name == 'Identity-H':
-             return IdentityCMap(False)
-@@ -225,10 +243,22 @@
-         try:
-             module = __import__(modname, fromlist=['pdfminer.cmap'])
-         except ImportError:
-+            for directory in glob.glob('/usr/share/poppler/cMap/*/'):
-+                if not os.path.exists(directory + name):
-+                    continue
-+                cmap = FileCMap()
-+                fp = file(directory + name, 'rb')
-+                try:
-+                    CMapParser(cmap, fp).run()
-+                finally:
-+                    fp.close()
-+                return cmap
-             raise CMapDB.CMapNotFound(name)
--        return PyCMap(name, module)
-+        else:
-+            return PyCMap(name, module)
- 
-     @classmethod
-+    @_cache
-     def get_unicode_map(klass, name, vertical=False):
-         modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
-         if klass.debug:
-@@ -236,8 +266,20 @@
-         try:
-             module = __import__(modname, fromlist=['pdfminer.cmap'])
-         except ImportError:
-+            for directory in glob.glob('/usr/share/poppler/cMap/*/'):
-+                filename = directory + name + '-UCS2'
-+                if not os.path.exists(filename):
-+                    continue
-+                cmap = FileUnicodeMap()
-+                fp = file(filename, 'rb')
-+                try:
-+                    CMapParser(cmap, fp).run()
-+                finally:
-+                    fp.close()
-+                return cmap
-             raise CMapDB.CMapNotFound(name)
--        return PyUnicodeMap(name, module, vertical)
-+        else:
-+            return PyUnicodeMap(name, module, vertical)
- 
- 
- ##  CMapParser

Modified: packages/pdfminer/trunk/debian/patches/series
===================================================================
--- packages/pdfminer/trunk/debian/patches/series	2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/patches/series	2010-06-12 17:45:29 UTC (rev 13407)
@@ -1,3 +1,3 @@
 pdf-testsuite.diff
 psparser-testsuite.diff
-poppler-data.diff
+encoding-data.diff

Copied: packages/pdfminer/trunk/debian/pdfminer-data.install (from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer-cmap.install)
===================================================================
--- packages/pdfminer/trunk/debian/pdfminer-data.install	                        (rev 0)
+++ packages/pdfminer/trunk/debian/pdfminer-data.install	2010-06-12 17:45:29 UTC (rev 13407)
@@ -0,0 +1 @@
+usr/lib/python*/*-packages/pdfminer/cmap/*.pickle.gz /usr/share/pdfminer/

Copied: packages/pdfminer/trunk/debian/python-pdfminer.install (from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer.install)
===================================================================
--- packages/pdfminer/trunk/debian/python-pdfminer.install	                        (rev 0)
+++ packages/pdfminer/trunk/debian/python-pdfminer.install	2010-06-12 17:45:29 UTC (rev 13407)
@@ -0,0 +1,5 @@
+/usr/bin/pdf2txt
+/usr/bin/dumppdf
+/usr/lib/python*/*-packages/pdfminer-*.egg-info
+/usr/lib/python*/*-packages/pdfminer/*.py
+/usr/lib/python*/*-packages/pdfminer/cmap/*.py

Modified: packages/pdfminer/trunk/debian/rules
===================================================================
--- packages/pdfminer/trunk/debian/rules	2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/rules	2010-06-12 17:45:29 UTC (rev 13407)
@@ -5,6 +5,7 @@
 
 .PHONY: override_dh_auto_build
 override_dh_auto_build:
+	$(MAKE) cmap
 	dh_auto_build -Spython_distutils
 
 .PHONY: override_dh_auto_install
@@ -13,7 +14,7 @@
 
 .PHONY: override_dh_install
 override_dh_install:
-	rename.ul .py '' debian/python-pdfminer/usr/bin/*.py
+	rename.ul .py '' debian/tmp/usr/bin/*.py
 	dh_install
 
 .PHONY: override_dh_installman




More information about the Python-modules-commits mailing list