[Python-modules-commits] r13407 - in packages/pdfminer/trunk/debian (9 files)
jwilk at users.alioth.debian.org
jwilk at users.alioth.debian.org
Sat Jun 12 17:45:30 UTC 2010
Date: Saturday, June 12, 2010 @ 17:45:29
Author: jwilk
Revision: 13407
Overhaul in encoding data storage.
Added:
packages/pdfminer/trunk/debian/patches/encoding-data.diff
packages/pdfminer/trunk/debian/pdfminer-data.install
(from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer-cmap.install)
packages/pdfminer/trunk/debian/python-pdfminer.install
(from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer.install)
Modified:
packages/pdfminer/trunk/debian/changelog
packages/pdfminer/trunk/debian/clean
packages/pdfminer/trunk/debian/control
packages/pdfminer/trunk/debian/patches/series
packages/pdfminer/trunk/debian/rules
Deleted:
packages/pdfminer/trunk/debian/patches/poppler-data.diff
Modified: packages/pdfminer/trunk/debian/changelog
===================================================================
--- packages/pdfminer/trunk/debian/changelog 2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/changelog 2010-06-12 17:45:29 UTC (rev 13407)
@@ -6,7 +6,7 @@
[dfsg-testsuite.diff]
* Disable test suite for psparser.py, as it is currently broken.
[psparser-testsuite.diff]
- * Reuse CMaps provided by the poppler-data package.
- [poppler-data.diff]
+ * Store encoding data in gzipped pickles rather than in Python modules.
+ This way we can save lots of disk space. [encoding-data.diff]
-- Jakub Wilk <jwilk at debian.org> Sat, 12 Jun 2010 16:54:14 +0200
Modified: packages/pdfminer/trunk/debian/clean
===================================================================
--- packages/pdfminer/trunk/debian/clean 2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/clean 2010-06-12 17:45:29 UTC (rev 13407)
@@ -1,2 +1,3 @@
debian/manpages/*.[0-9]
docs/changelog
+pdfminer/cmap/*.gz
Modified: packages/pdfminer/trunk/debian/control
===================================================================
--- packages/pdfminer/trunk/debian/control 2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/control 2010-06-12 17:45:29 UTC (rev 13407)
@@ -16,7 +16,7 @@
Package: python-pdfminer
Architecture: all
Depends: ${misc:Depends}, ${python:Depends}
-Suggests: poppler-data
+Suggests: pdfminer-data
Description: PDF parser and analyser
PDFMiner is a tool for extracting information from PDF documents, which
focuses entirely on getting and analyzing text data. It allows to obtain the
@@ -27,3 +27,14 @@
.
This package provides the Python module and the command-line tools: pdf2txt
and dumppdf.
+
+Package: pdfminer-data
+Architecture: all
+Depends: ${misc:Depends}
+Recommends: python-pdfminer
+Description: PDF parser and analyser (encoding data)
+ PDFMiner is a tool for extracting information from PDF documents, which
+ focuses entirely on getting and analyzing text data.
+ .
+ This package contains the encoding data needed to read some PDF documents in
+ CJK (Chinese, Japanese, Korean) languages.
Added: packages/pdfminer/trunk/debian/patches/encoding-data.diff
===================================================================
--- packages/pdfminer/trunk/debian/patches/encoding-data.diff (rev 0)
+++ packages/pdfminer/trunk/debian/patches/encoding-data.diff 2010-06-12 17:45:29 UTC (rev 13407)
@@ -0,0 +1,193 @@
+Description:
+ Store encoding data in gzipped pickles rather than in Python modules.
+ This way we can save lots of disk space.
+Author: Jakub Wilk <jwilk at debian.org>
+Forwarded: not-needed
+Last-Update: 2010-06-12
+
+--- a/setup.py
++++ b/setup.py
+@@ -22,6 +22,9 @@
+ 'pdfminer',
+ 'pdfminer.cmap'
+ ],
++ package_data={
++ 'pdfminer.cmap': ['*.pickle.gz'],
++ },
+ scripts=[
+ 'tools/pdf2txt.py',
+ 'tools/dumppdf.py'
+--- a/Makefile
++++ b/Makefile
+@@ -36,17 +36,17 @@
+ CONV_CMAP=$(PYTHON) tools/conv_cmap.py
+ CMAPSRC=cmaprsrc
+ CMAPDST=pdfminer/cmap
+-cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
+- $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
++cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
++ $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
+ cmap_clean:
+ cd $(CMAPDST) && make cmap_clean
+-$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
++$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
+ $(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+-$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
++$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
+ $(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+-$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
++$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
+ $(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+-$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
++$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
+ $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+
+ test: cmap
+--- a/pdfminer/cmapdb.py
++++ b/pdfminer/cmapdb.py
+@@ -15,6 +15,9 @@
+ import re
+ import os
+ import os.path
++import gzip
++import cPickle as pickle
++import cmap
+ from struct import pack, unpack
+ from psparser import PSStackParser
+ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
+@@ -210,34 +213,50 @@
+ class CMapDB(object):
+
+ debug = 0
++ _cmap_cache = {}
++ _umap_cache = {}
+
+ class CMapNotFound(CMapError): pass
+
+ @classmethod
++ def _load_data(klass, name):
++ filename = '%s.pickle.gz' % name
++ if klass.debug:
++ print >>sys.stderr, 'loading:', name
++ for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/':
++ path = os.path.join(directory, filename)
++ if os.path.exists(path):
++ gzfile = gzip.open(path)
++ try:
++ return type(name, (), pickle.loads(gzfile.read()))
++ finally:
++ gzfile.close()
++ else:
++ raise CMapDB.CMapNotFound(name)
++
++ @classmethod
+ def get_cmap(klass, name):
+ if name == 'Identity-H':
+ return IdentityCMap(False)
+ elif name == 'Identity-V':
+ return IdentityCMap(True)
+- modname = 'pdfminer.cmap.%s' % name.replace('-','_')
+- if klass.debug:
+- print >>sys.stderr, 'loading:', modname
+ try:
+- module = __import__(modname, fromlist=['pdfminer.cmap'])
+- except ImportError:
+- raise CMapDB.CMapNotFound(name)
+- return PyCMap(name, module)
++ return klass._cmap_cache[name]
++ except KeyError:
++ pass
++ data = klass._load_data(name)
++ klass._cmap_cache[name] = cmap = PyCMap(name, data)
++ return cmap
+
+ @classmethod
+ def get_unicode_map(klass, name, vertical=False):
+- modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
+- if klass.debug:
+- print >>sys.stderr, 'loading:', modname, vertical
+ try:
+- module = __import__(modname, fromlist=['pdfminer.cmap'])
+- except ImportError:
+- raise CMapDB.CMapNotFound(name)
+- return PyUnicodeMap(name, module, vertical)
++ return klass._umap_cache[name][vertical]
++ except KeyError:
++ pass
++ data = klass._load_data('to-unicode-%s' % name)
++ klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
++ return umaps[vertical]
+
+
+ ## CMapParser
+--- a/pdfminer/cmap/Makefile
++++ b/pdfminer/cmap/Makefile
+@@ -6,5 +6,4 @@
+ -rm *.pyc *.pyo
+
+ cmap_clean:
+- -rm *.py
+- touch __init__.py
++ rm -f *.pickle.gz
+--- a/tools/conv_cmap.py
++++ b/tools/conv_cmap.py
+@@ -1,6 +1,8 @@
+ #!/usr/bin/env python
+ import sys
+ import os.path
++import gzip
++import cPickle as pickle
+
+ def process_cid2code(fp, check_codecs=[]):
+
+@@ -118,9 +120,6 @@
+ print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
+ return 100
+
+- def pyname(name):
+- return name.replace('-','_')+'.py'
+-
+ args = argv[1:]
+ if len(args) < 3: return usage()
+ (outdir, regname, src) = args[:3]
+@@ -132,22 +131,24 @@
+ fp.close()
+
+ for (name, cmap) in code2cid.iteritems():
+- fname = pyname(name)
++ fname = '%s.pickle.gz' % name
+ print >>sys.stderr, 'writing %r...' % fname
+- fp = file(os.path.join(outdir, fname), 'w')
+- print >>fp, '#!/usr/bin/env python'
+- print >>fp, '#', fname
+- print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
+- print >>fp, 'CODE2CID = %r' % cmap
++ fp = gzip.open(os.path.join(outdir, fname), 'wb')
++ data = dict(
++ IS_VERTICAL=is_vertical.get(name, False),
++ CODE2CID=cmap,
++ )
++ fp.write(pickle.dumps(data))
+ fp.close()
+
+- fname = 'TO_UNICODE_'+pyname(regname)
++ fname = 'to-unicode-%s.pickle.gz' % regname
+ print >>sys.stderr, 'writing %r...' % fname
+- fp = file(os.path.join(outdir, fname), 'w')
+- print >>fp, '#!/usr/bin/env python'
+- print >>fp, '#', fname
+- print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
+- print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
++ fp = gzip.open(os.path.join(outdir, fname), 'wb')
++ data = dict(
++ CID2UNICHR_H=cid2unichr_h,
++ CID2UNICHR_V=cid2unichr_v,
++ )
++ fp.write(pickle.dumps(data))
+ fp.close()
+
+ return 0
+--- /dev/null
++++ b/pdfminer/cmap/__init__.py
+@@ -0,0 +1 @@
++#
Deleted: packages/pdfminer/trunk/debian/patches/poppler-data.diff
===================================================================
--- packages/pdfminer/trunk/debian/patches/poppler-data.diff 2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/patches/poppler-data.diff 2010-06-12 17:45:29 UTC (rev 13407)
@@ -1,95 +0,0 @@
-Description:
- Reuse CMaps provided by the poppler-data package. This way we don't need to
- ship pdfminer.cmap.* modules in the Debian package.
-Author: Jakub Wilk <jwilk at debian.org>
-Bug-Debian: http://bugs.debian.org/584555
-Forwarded: not-needed
-Last-Update: 2010-06-08
-
---- a/pdfminer/cmapdb.py
-+++ b/pdfminer/cmapdb.py
-@@ -15,6 +15,7 @@
- import re
- import os
- import os.path
-+import glob
- from struct import pack, unpack
- from psparser import PSStackParser
- from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
-@@ -204,6 +205,22 @@
- def __repr__(self):
- return '<PyUnicodeMap: %s>' % (self.name)
-
-+def _cache(method):
-+
-+ def wrapped_method(self, *args):
-+ cache_attr_name = '_%s_cache' % method.__name__
-+ cache = getattr(self, cache_attr_name, None)
-+ if cache is None:
-+ cache = {}
-+ setattr(self, cache_attr_name, cache)
-+ try:
-+ return cache[args]
-+ except KeyError:
-+ result = method(self, *args)
-+ cache[args] = result
-+ return result
-+
-+ return wrapped_method
-
- ## CMapDB
- ##
-@@ -214,6 +231,7 @@
- class CMapNotFound(CMapError): pass
-
- @classmethod
-+ @_cache
- def get_cmap(klass, name):
- if name == 'Identity-H':
- return IdentityCMap(False)
-@@ -225,10 +243,22 @@
- try:
- module = __import__(modname, fromlist=['pdfminer.cmap'])
- except ImportError:
-+ for directory in glob.glob('/usr/share/poppler/cMap/*/'):
-+ if not os.path.exists(directory + name):
-+ continue
-+ cmap = FileCMap()
-+ fp = file(directory + name, 'rb')
-+ try:
-+ CMapParser(cmap, fp).run()
-+ finally:
-+ fp.close()
-+ return cmap
- raise CMapDB.CMapNotFound(name)
-- return PyCMap(name, module)
-+ else:
-+ return PyCMap(name, module)
-
- @classmethod
-+ @_cache
- def get_unicode_map(klass, name, vertical=False):
- modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
- if klass.debug:
-@@ -236,8 +266,20 @@
- try:
- module = __import__(modname, fromlist=['pdfminer.cmap'])
- except ImportError:
-+ for directory in glob.glob('/usr/share/poppler/cMap/*/'):
-+ filename = directory + name + '-UCS2'
-+ if not os.path.exists(filename):
-+ continue
-+ cmap = FileUnicodeMap()
-+ fp = file(filename, 'rb')
-+ try:
-+ CMapParser(cmap, fp).run()
-+ finally:
-+ fp.close()
-+ return cmap
- raise CMapDB.CMapNotFound(name)
-- return PyUnicodeMap(name, module, vertical)
-+ else:
-+ return PyUnicodeMap(name, module, vertical)
-
-
- ## CMapParser
Modified: packages/pdfminer/trunk/debian/patches/series
===================================================================
--- packages/pdfminer/trunk/debian/patches/series 2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/patches/series 2010-06-12 17:45:29 UTC (rev 13407)
@@ -1,3 +1,3 @@
pdf-testsuite.diff
psparser-testsuite.diff
-poppler-data.diff
+encoding-data.diff
Copied: packages/pdfminer/trunk/debian/pdfminer-data.install (from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer-cmap.install)
===================================================================
--- packages/pdfminer/trunk/debian/pdfminer-data.install (rev 0)
+++ packages/pdfminer/trunk/debian/pdfminer-data.install 2010-06-12 17:45:29 UTC (rev 13407)
@@ -0,0 +1 @@
+usr/lib/python*/*-packages/pdfminer/cmap/*.pickle.gz /usr/share/pdfminer/
Copied: packages/pdfminer/trunk/debian/python-pdfminer.install (from rev 13336, packages/pdfminer/trunk/debian/python-pdfminer.install)
===================================================================
--- packages/pdfminer/trunk/debian/python-pdfminer.install (rev 0)
+++ packages/pdfminer/trunk/debian/python-pdfminer.install 2010-06-12 17:45:29 UTC (rev 13407)
@@ -0,0 +1,5 @@
+/usr/bin/pdf2txt
+/usr/bin/dumppdf
+/usr/lib/python*/*-packages/pdfminer-*.egg-info
+/usr/lib/python*/*-packages/pdfminer/*.py
+/usr/lib/python*/*-packages/pdfminer/cmap/*.py
Modified: packages/pdfminer/trunk/debian/rules
===================================================================
--- packages/pdfminer/trunk/debian/rules 2010-06-12 14:54:34 UTC (rev 13406)
+++ packages/pdfminer/trunk/debian/rules 2010-06-12 17:45:29 UTC (rev 13407)
@@ -5,6 +5,7 @@
.PHONY: override_dh_auto_build
override_dh_auto_build:
+ $(MAKE) cmap
dh_auto_build -Spython_distutils
.PHONY: override_dh_auto_install
@@ -13,7 +14,7 @@
.PHONY: override_dh_install
override_dh_install:
- rename.ul .py '' debian/python-pdfminer/usr/bin/*.py
+ rename.ul .py '' debian/tmp/usr/bin/*.py
dh_install
.PHONY: override_dh_installman
More information about the Python-modules-commits mailing list