[Python-modules-team] Bug#939044: ocrmypdf: autopkgtest not compatible with new pikepdf, ghostscript and/or pytest
Paul Gevers
elbrus at debian.org
Sat Aug 31 14:25:04 BST 2019
Source: ocrmypdf
Version: 8.0.1+dfsg-1
Severity: serious
Tags: sid bullseye
User: debian-ci at lists.debian.org
Usertags: needs-update
Control: affects -1 src:pikepdf
Control: affects -1 src:ghostscript
Control: affects -1 src:pytest
[X-Debbugs-CC: debian-ci at lists.debian.org, pikepdf at packages.debian.org,
ghostscript at packages.debian.org, pytest at packages.debian.org]
Dear maintainers,
With a recent upload of pikepdf and with a recent upload of ghostscript
and with a recent upload of pytest (although that pulls in the others)
the autopkgtest of ocrmypdf fails in testing when that autopkgtest is
run with the binary packages of those packages from unstable. It passes
when run with only packages from testing. In tabular form, e.g.:
                       pass            fail
pikepdf                from testing    1.6.1+dfsg-1
ocrmypdf               from testing    8.0.1+dfsg-1
all others             from testing    from testing
I copied some of the output at the bottom of this report.
Currently this regression is blocking the migration of pikepdf,
ghostscript and pytest to testing [1]. Because failure is triggered by
two packages separately, I filed the bug against ocrmypdf, please
reassign (and clone) if that wasn't correct.
More information about this bug and the reason for filing it can be found on
https://wiki.debian.org/ContinuousIntegration/RegressionEmailInformation
Paul
[1] https://qa.debian.org/excuses.php?package=pikepdf
https://ci.debian.net/data/autopkgtest/testing/amd64/o/ocrmypdf/2854254/log.gz
=================================== FAILURES
===================================
_______________________ test_non_square_resolution[hocr]
_______________________
renderer = 'hocr'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_hoc0/out.pdf'
@pytest.mark.parametrize('renderer', RENDERERS)
def test_non_square_resolution(renderer, spoof_tesseract_cache,
resources, outpdf):
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0].xres != in_pageinfo[0].yres
check_ocrmypdf(
resources / 'aspect.pdf',
outpdf,
'--pdf-renderer',
renderer,
> env=spoof_tesseract_cache,
)
tests/test_main.py:481:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _
input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_hoc0/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--pdf-renderer', 'hocr')
p = <subprocess.Popen object at 0x7f12ee1bcb90>, out = ''
@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
"Run ocrmypdf and confirmed that a valid file was created"
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
# ensure py.test collects the output, use -s to view
print(err, file=sys.stderr)
> assert p.returncode == 0
E assert 15 == 0
E + where 15 = <subprocess.Popen object at
0x7f12ee1bcb90>.returncode
tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
INFO - 1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt
- HIT
ERROR - Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
register_cleanup, touch_files_only)
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
ret_val = user_defined_work_func(*params)
File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file
_____________________ test_non_square_resolution[sandwich]
_____________________
renderer = 'sandwich'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_san0/out.pdf'
@pytest.mark.parametrize('renderer', RENDERERS)
def test_non_square_resolution(renderer, spoof_tesseract_cache,
resources, outpdf):
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0].xres != in_pageinfo[0].yres
check_ocrmypdf(
resources / 'aspect.pdf',
outpdf,
'--pdf-renderer',
renderer,
> env=spoof_tesseract_cache,
)
tests/test_main.py:481:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _
input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_san0/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--pdf-renderer', 'sandwich')
p = <subprocess.Popen object at 0x7f12ee1bcb50>, out = ''
@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
"Run ocrmypdf and confirmed that a valid file was created"
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
# ensure py.test collects the output, use -s to view
print(err, file=sys.stderr)
> assert p.returncode == 0
E assert 15 == 0
E + where 15 = <subprocess.Popen object at
0x7f12ee1bcb50>.returncode
tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
INFO - 1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001.text__pdf__txt
- HIT
ERROR - Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
register_cleanup, touch_files_only)
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
ret_val = user_defined_work_func(*params)
File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file
___________________ test_convert_to_square_resolution[hocr]
____________________
renderer = 'hocr'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut0/out.pdf'
@pytest.mark.parametrize('renderer', RENDERERS)
def test_convert_to_square_resolution(
renderer, spoof_tesseract_cache, resources, outpdf
):
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0].xres != in_pageinfo[0].yres
# --force-ocr requires means forced conversion to square resolution
check_ocrmypdf(
resources / 'aspect.pdf',
outpdf,
'--force-ocr',
'--pdf-renderer',
renderer,
> env=spoof_tesseract_cache,
)
tests/test_main.py:506:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _
input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut0/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--force-ocr', '--pdf-renderer', 'hocr')
p = <subprocess.Popen object at 0x7f12ee17c690>, out = ''
@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
"Run ocrmypdf and confirmed that a valid file was created"
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
# ensure py.test collects the output, use -s to view
print(err, file=sys.stderr)
> assert p.returncode == 0
E assert 15 == 0
E + where 15 = <subprocess.Popen object at
0x7f12ee17c690>.returncode
tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
INFO - 1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt
- HIT
ERROR - Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
register_cleanup, touch_files_only)
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
ret_val = user_defined_work_func(*params)
File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file
_________________ test_convert_to_square_resolution[sandwich]
__________________
renderer = 'sandwich'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut1/out.pdf'
@pytest.mark.parametrize('renderer', RENDERERS)
def test_convert_to_square_resolution(
renderer, spoof_tesseract_cache, resources, outpdf
):
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0].xres != in_pageinfo[0].yres
# --force-ocr requires means forced conversion to square resolution
check_ocrmypdf(
resources / 'aspect.pdf',
outpdf,
'--force-ocr',
'--pdf-renderer',
renderer,
> env=spoof_tesseract_cache,
)
tests/test_main.py:506:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _
input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut1/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--force-ocr', '--pdf-renderer', 'sandwich')
p = <subprocess.Popen object at 0x7f12edfd38d0>, out = ''
@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
"Run ocrmypdf and confirmed that a valid file was created"
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
# ensure py.test collects the output, use -s to view
print(err, file=sys.stderr)
> assert p.returncode == 0
E assert 15 == 0
E + where 15 = <subprocess.Popen object at
0x7f12edfd38d0>.returncode
tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
INFO - 1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001.text__pdf__txt
- HIT
ERROR - Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
register_cleanup, touch_files_only)
File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
ret_val = user_defined_work_func(*params)
File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file
_________________________ test_metadata_fixup_warning
__________________________
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outdir =
PosixPath('/tmp/pytest-of-debci/pytest-0/test_metadata_fixup_warning0')
def test_metadata_fixup_warning(resources, outdir):
from ocrmypdf._pipeline import metadata_fixup
input_files = [
str(outdir / 'graph.repaired.pdf'),
str(outdir / 'layers.rendered.pdf'),
str(outdir / 'pdfa.pdf'), # It is okay that this is not a PDF/A
]
for f in input_files:
copyfile(resources / 'graph.pdf', f)
log = MagicMock()
context = MagicMock()
metadata_fixup(
input_files_groups=input_files,
output_file=outdir / 'out.pdf',
log=log,
context=context,
)
log.warning.assert_not_called()
# Now add some metadata that will not be copyable
graph = pikepdf.open(outdir / 'graph.repaired.pdf')
with graph.open_metadata() as meta:
meta['prism2:publicationName'] = 'OCRmyPDF Test'
> graph.save(outdir / 'graph.repaired.pdf')
E ValueError: Cannot overwrite input file
tests/test_metadata.py:314: ValueError
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: OpenPGP digital signature
URL: <http://alioth-lists.debian.net/pipermail/python-modules-team/attachments/20190831/d1ed11a5/attachment-0001.sig>
More information about the Python-modules-team
mailing list