[Python-modules-team] Bug#939044: ocrmypdf: autopkgtest not compatible with new pikepdf, ghostscript and/or pytest

Paul Gevers elbrus at debian.org
Sat Aug 31 14:25:04 BST 2019


Source: ocrmypdf
Version: 8.0.1+dfsg-1
Severity: serious
Tags: sid bullseye
User: debian-ci at lists.debian.org
Usertags: needs-update
Control: affects -1 src:pikepdf
Control: affects -1 src:ghostscript
Control: affects -1 src:pytest

[X-Debbugs-CC: debian-ci at lists.debian.org, pikepdf at packages.debian.org,
ghostscript at packages.debian.org, pytest at packages.debian.org]

Dear maintainers,

With a recent upload of pikepdf and with a recent upload of ghostscript
and with a recent upload of pytest (although that pulls in the others)
the autopkgtest of ocrmypdf fails in testing when that autopkgtest is
run with the binary packages of those packages from unstable. It passes
when run with only packages from testing. In tabular form, e.g.:
                       pass            fail
pikepdf                from testing    1.6.1+dfsg-1
ocrmypdf               from testing    8.0.1+dfsg-1
all others             from testing    from testing

I copied some of the output at the bottom of this report.

Currently this regression is blocking the migration of pikepdf,
ghostscript and pytest to testing [1]. Because the failure is triggered by
two packages separately, I filed the bug against ocrmypdf; please
reassign (and clone) if that wasn't correct.

More information about this bug and the reason for filing it can be found on
https://wiki.debian.org/ContinuousIntegration/RegressionEmailInformation

Paul

[1] https://qa.debian.org/excuses.php?package=pikepdf

https://ci.debian.net/data/autopkgtest/testing/amd64/o/ocrmypdf/2854254/log.gz

=================================== FAILURES
===================================
_______________________ test_non_square_resolution[hocr]
_______________________

renderer = 'hocr'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_hoc0/out.pdf'

    @pytest.mark.parametrize('renderer', RENDERERS)
    def test_non_square_resolution(renderer, spoof_tesseract_cache,
resources, outpdf):
        # Confirm input image is non-square resolution
        in_pageinfo = PdfInfo(resources / 'aspect.pdf')
        assert in_pageinfo[0].xres != in_pageinfo[0].yres

        check_ocrmypdf(
            resources / 'aspect.pdf',
            outpdf,
            '--pdf-renderer',
            renderer,
>           env=spoof_tesseract_cache,
        )

tests/test_main.py:481:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _

input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_hoc0/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--pdf-renderer', 'hocr')
p = <subprocess.Popen object at 0x7f12ee1bcb90>, out = ''

    @pytest.helpers.register
    def check_ocrmypdf(input_file, output_file, *args, env=None):
        "Run ocrmypdf and confirmed that a valid file was created"

        p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
        # ensure py.test collects the output, use -s to view
        print(err, file=sys.stderr)
>       assert p.returncode == 0
E       assert 15 == 0
E        +  where 15 = <subprocess.Popen object at
0x7f12ee1bcb90>.returncode

tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
   INFO -    1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt
- HIT
  ERROR - Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
    register_cleanup, touch_files_only)
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
    ret_val = user_defined_work_func(*params)
  File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
    pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file


_____________________ test_non_square_resolution[sandwich]
_____________________

renderer = 'sandwich'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_san0/out.pdf'

    @pytest.mark.parametrize('renderer', RENDERERS)
    def test_non_square_resolution(renderer, spoof_tesseract_cache,
resources, outpdf):
        # Confirm input image is non-square resolution
        in_pageinfo = PdfInfo(resources / 'aspect.pdf')
        assert in_pageinfo[0].xres != in_pageinfo[0].yres

        check_ocrmypdf(
            resources / 'aspect.pdf',
            outpdf,
            '--pdf-renderer',
            renderer,
>           env=spoof_tesseract_cache,
        )

tests/test_main.py:481:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _

input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_non_square_resolution_san0/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--pdf-renderer', 'sandwich')
p = <subprocess.Popen object at 0x7f12ee1bcb50>, out = ''

    @pytest.helpers.register
    def check_ocrmypdf(input_file, output_file, *args, env=None):
        "Run ocrmypdf and confirmed that a valid file was created"

        p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
        # ensure py.test collects the output, use -s to view
        print(err, file=sys.stderr)
>       assert p.returncode == 0
E       assert 15 == 0
E        +  where 15 = <subprocess.Popen object at
0x7f12ee1bcb50>.returncode

tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
   INFO -    1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001.text__pdf__txt
- HIT
  ERROR - Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
    register_cleanup, touch_files_only)
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
    ret_val = user_defined_work_func(*params)
  File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
    pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file


___________________ test_convert_to_square_resolution[hocr]
____________________

renderer = 'hocr'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut0/out.pdf'

    @pytest.mark.parametrize('renderer', RENDERERS)
    def test_convert_to_square_resolution(
        renderer, spoof_tesseract_cache, resources, outpdf
    ):
        # Confirm input image is non-square resolution
        in_pageinfo = PdfInfo(resources / 'aspect.pdf')
        assert in_pageinfo[0].xres != in_pageinfo[0].yres

        # --force-ocr requires means forced conversion to square resolution
        check_ocrmypdf(
            resources / 'aspect.pdf',
            outpdf,
            '--force-ocr',
            '--pdf-renderer',
            renderer,
>           env=spoof_tesseract_cache,
        )

tests/test_main.py:506:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _

input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut0/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--force-ocr', '--pdf-renderer', 'hocr')
p = <subprocess.Popen object at 0x7f12ee17c690>, out = ''

    @pytest.helpers.register
    def check_ocrmypdf(input_file, output_file, *args, env=None):
        "Run ocrmypdf and confirmed that a valid file was created"

        p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
        # ensure py.test collects the output, use -s to view
        print(err, file=sys.stderr)
>       assert p.returncode == 0
E       assert 15 == 0
E        +  where 15 = <subprocess.Popen object at
0x7f12ee17c690>.returncode

tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
   INFO -    1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt
- HIT
  ERROR - Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
    register_cleanup, touch_files_only)
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
    ret_val = user_defined_work_func(*params)
  File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
    pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file


_________________ test_convert_to_square_resolution[sandwich]
__________________

renderer = 'sandwich'
spoof_tesseract_cache = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut1/out.pdf'

    @pytest.mark.parametrize('renderer', RENDERERS)
    def test_convert_to_square_resolution(
        renderer, spoof_tesseract_cache, resources, outpdf
    ):
        # Confirm input image is non-square resolution
        in_pageinfo = PdfInfo(resources / 'aspect.pdf')
        assert in_pageinfo[0].xres != in_pageinfo[0].yres

        # --force-ocr requires means forced conversion to square resolution
        check_ocrmypdf(
            resources / 'aspect.pdf',
            outpdf,
            '--force-ocr',
            '--pdf-renderer',
            renderer,
>           env=spoof_tesseract_cache,
        )

tests/test_main.py:506:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _

input_file =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources/aspect.pdf')
output_file =
'/tmp/pytest-of-debci/pytest-0/test_convert_to_square_resolut1/out.pdf'
env = {'ADTTMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc._q0vjo65/do...q0vjo65/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc._q0vjo65/downtmp/autopkgtest_tmp', ...}
args = ('--force-ocr', '--pdf-renderer', 'sandwich')
p = <subprocess.Popen object at 0x7f12edfd38d0>, out = ''

    @pytest.helpers.register
    def check_ocrmypdf(input_file, output_file, *args, env=None):
        "Run ocrmypdf and confirmed that a valid file was created"

        p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
        # ensure py.test collects the output, use -s to view
        print(err, file=sys.stderr)
>       assert p.returncode == 0
E       assert 15 == 0
E        +  where 15 = <subprocess.Popen object at
0x7f12edfd38d0>.returncode

tests/conftest.py:155: AssertionError
----------------------------- Captured stderr call
-----------------------------
   INFO -    1: [tesseract] Tesseract cache folder
/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/cache/aspect/__-l__eng__000001.ocr.png__000001.text__pdf__txt
- HIT
  ERROR - Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 712, in
run_pooled_job_without_exceptions
    register_cleanup, touch_files_only)
  File "/usr/lib/python3/dist-packages/ruffus/task.py", line 544, in
job_wrapper_io_files
    ret_val = user_defined_work_func(*params)
  File "/usr/lib/python3/dist-packages/ocrmypdf/_pipeline.py", line 827,
in convert_to_pdfa
    pdf_layers_file.save(layers_file)
ValueError: Cannot overwrite input file


_________________________ test_metadata_fixup_warning
__________________________

resources =
PosixPath('/tmp/autopkgtest-lxc._q0vjo65/downtmp/build.Oxe/src/tests/resources')
outdir =
PosixPath('/tmp/pytest-of-debci/pytest-0/test_metadata_fixup_warning0')

    def test_metadata_fixup_warning(resources, outdir):
        from ocrmypdf._pipeline import metadata_fixup

        input_files = [
            str(outdir / 'graph.repaired.pdf'),
            str(outdir / 'layers.rendered.pdf'),
            str(outdir / 'pdfa.pdf'),  # It is okay that this is not a PDF/A
        ]
        for f in input_files:
            copyfile(resources / 'graph.pdf', f)

        log = MagicMock()
        context = MagicMock()
        metadata_fixup(
            input_files_groups=input_files,
            output_file=outdir / 'out.pdf',
            log=log,
            context=context,
        )
        log.warning.assert_not_called()

        # Now add some metadata that will not be copyable
        graph = pikepdf.open(outdir / 'graph.repaired.pdf')
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
>       graph.save(outdir / 'graph.repaired.pdf')
E       ValueError: Cannot overwrite input file

tests/test_metadata.py:314: ValueError

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: OpenPGP digital signature
URL: <http://alioth-lists.debian.net/pipermail/python-modules-team/attachments/20190831/d1ed11a5/attachment-0001.sig>


More information about the Python-modules-team mailing list