Bug#879011: diffoscope: zipinfo diff shows warning differences that are due to temporary file names

Mike Hommey mh at glandium.org
Thu Oct 19 10:15:35 UTC 2017


On Thu, Oct 19, 2017 at 06:27:07PM +0900, Mike Hommey wrote:
> On Thu, Oct 19, 2017 at 04:41:04PM +0900, Mike Hommey wrote:
> > On Thu, Oct 19, 2017 at 03:57:29PM +0900, Mike Hommey wrote:
> > > On Wed, Oct 18, 2017 at 09:27:29PM +0900, Mike Hommey wrote:
> > > > Package: diffoscope
> > > > Version: 87
> > > > Severity: normal
> > > > 
> > > > While diffing firefox, zipinfo is run on omni.ja, and issues warnings
> > > > like:
> > > > warning [/tmp/tmplgigxgm__diffoscope/0/24]:  17283883 extra bytes at beginning or within zipfile
> > > > 
> > > > Now, when both ends have the same warning, as expected, the diff still
> > > > shows a difference because of the /tmp/tmp*__diffoscope/0/fd path.
> > > 
> > > Note these messages are sent to stderr, and I noticed that for readelf,
> > > stderr is not part of the diff. It seems it would make sense to do the
> > > same for zipinfo.
> > 
> > The code actually does the same thing as for readelf. The problem is
> > that zipinfo is not consistently sending its warning to stderr.
> > 
> > $ zipinfo omni.ja > /dev/null
> > warning [omni.ja]:  17283876 extra bytes at beginning or within zipfile
> >   (attempting to process anyway)
> > error [omni.ja]:  reported length of central directory is
> >   -17283876 bytes too long (Atari STZip zipfile?  J.H.Holm ZIPSPLIT 1.1
> >   zipfile?).  Compensating...
> > 
> > ^ means the warnings are on stderr, right? Nope
> > 
> > $ zipinfo omni.ja 2> /dev/null | grep -v 10-Jan-01
> > Archive:  omni.ja
> > Zip file size: 17400328 bytes, number of entries: 1313
> > warning [omni.ja]:  17283876 extra bytes at beginning or within zipfile
> >   (attempting to process anyway)
> > error [omni.ja]:  reported length of central directory is
> >   -17283876 bytes too long (Atari STZip zipfile?  J.H.Holm ZIPSPLIT 1.1
> >   zipfile?).  Compensating...
> > 1313 files, 17188436 bytes uncompressed, 17188436 bytes compressed:  0.0%
> > 
> > It looks like it only sends those messages to stderr when it's a tty.
> 
> I haven't found a way to make things work with zipinfo's crazy behavior,
> however, I was able to get the zip comparator to work by using
> libarchive, like for other archive formats. I still need to do some more
> cleanup of the code, adjust the apk comparator, and adjust tests.

Attached is my work in progress. I won't finish it right now because
it's getting late and I don't have all the optional tools to run the
test suite.

It's worth noting that this changes the output, and the notable
difference is that the information whether files are stored or deflated
is lost. It doesn't seem at first glance that libarchive exposes that,
so that might be a showstopper :-/

Mike
-------------- next part --------------
diff --git a/diffoscope/comparators/apk.py b/diffoscope/comparators/apk.py
index 7623f82..e00d9f2 100644
--- a/diffoscope/comparators/apk.py
+++ b/diffoscope/comparators/apk.py
@@ -26,10 +26,9 @@ from diffoscope.tools import tool_required
 from diffoscope.tempfiles import get_temporary_directory
 from diffoscope.difference import Difference
 
-from .utils.file import File
 from .utils.archive import Archive
 from .utils.compare import compare_files
-from .zip import Zipinfo, ZipinfoVerbose
+from .zip import ZipFile
 from .missing_file import MissingFile
 
 logger = logging.getLogger(__name__)
@@ -146,17 +145,12 @@ class ApkContainer(Archive):
         return differences
 
 
-class ApkFile(File):
+class ApkFile(ZipFile):
     FILE_TYPE_HEADER_PREFIX = b"PK\x03\x04"
     FILE_TYPE_RE = re.compile(r'^(Java|Zip) archive data.*\b')
     FILE_EXTENSION_SUFFIX = '.apk'
     CONTAINER_CLASS = ApkContainer
 
-    def compare_details(self, other, source=None):
-        zipinfo_difference = Difference.from_command(Zipinfo, self.path, other.path) or \
-                             Difference.from_command(ZipinfoVerbose, self.path, other.path)
-        return [zipinfo_difference]
-
 
 def filter_apk_metadata(filepath, archive_name):
     new_filename = os.path.join(os.path.dirname(filepath), 'APK metadata')
diff --git a/diffoscope/comparators/zip.py b/diffoscope/comparators/zip.py
index 0815ddc..01f5071 100644
--- a/diffoscope/comparators/zip.py
+++ b/diffoscope/comparators/zip.py
@@ -23,84 +23,14 @@ import shutil
 import os.path
 import zipfile
 
-from diffoscope.tools import tool_required
 from diffoscope.difference import Difference
-
 from .utils.file import File
+from .utils.libarchive import LibarchiveContainer, list_libarchive
 from .directory import Directory
-from .utils.archive import Archive, ArchiveMember
-from .utils.command import Command
-
-
-class Zipinfo(Command):
-    @tool_required('zipinfo')
-    def cmdline(self):
-        return ['zipinfo', self.path]
-
-    def filter(self, line):
-        # we don't care about the archive file path
-        if line.startswith(b'Archive:'):
-            return b''
-        return line
-
-
-class ZipinfoVerbose(Zipinfo):
-    @tool_required('zipinfo')
-    def cmdline(self):
-        return ['zipinfo', '-v', self.path]
-
-
-class BsdtarVerbose(Command):
-    @tool_required('bsdtar')
-    def cmdline(self):
-        return ['bsdtar', '-tvf', self.path]
-
-
-class ZipDirectory(Directory, ArchiveMember):
-    def __init__(self, archive, member_name):
-        ArchiveMember.__init__(self, archive, member_name)
-
-    def compare(self, other, source=None):
-        return None
-
-    def has_same_content_as(self, other):
-        return False
-
-    def is_directory(self):
-        return True
-
-    def get_member_names(self):
-        raise ValueError("Zip archives are compared as a whole.")  # noqa
-
-    def get_member(self, member_name):
-        raise ValueError("Zip archives are compared as a whole.")  # noqa
-
 
-class ZipContainer(Archive):
-    def open_archive(self):
-        return zipfile.ZipFile(self.source.path, 'r')
 
-    def close_archive(self):
-        self.archive.close()
-
-    def get_member_names(self):
-        return self.archive.namelist()
-
-    def extract(self, member_name, dest_dir):
-        # We don't really want to crash if the filename in the zip archive
-        # can't be encoded using the filesystem encoding. So let's replace
-        # any weird character so we can get to the bytes.
-        targetpath = os.path.join(dest_dir, os.path.basename(member_name)).encode(sys.getfilesystemencoding(), errors='replace')
-        with self.archive.open(member_name) as source, open(targetpath, 'wb') as target:
-            shutil.copyfileobj(source, target)
-        return targetpath.decode(sys.getfilesystemencoding())
-
-    def get_member(self, member_name):
-        zipinfo = self.archive.getinfo(member_name)
-        if zipinfo.filename[-1] == '/':
-            return ZipDirectory(self, member_name)
-        else:
-            return ArchiveMember(self, member_name)
+class ZipContainer(LibarchiveContainer):
+    pass
 
 
 class ZipFile(File):
@@ -108,47 +38,16 @@ class ZipFile(File):
     FILE_TYPE_RE = re.compile(r'^(Zip archive|Java archive|EPUB document|OpenDocument (Text|Spreadsheet|Presentation|Drawing|Formula|Template|Text Template))\b')
 
     def compare_details(self, other, source=None):
-        zipinfo_difference = Difference.from_command(Zipinfo, self.path, other.path) or \
-                             Difference.from_command(ZipinfoVerbose, self.path, other.path) or \
-                             Difference.from_command(BsdtarVerbose, self.path, other.path)
-        return [zipinfo_difference]
-
-
-class MozillaZipCommandMixin(object):
-    def wait(self):
-        # zipinfo emits an error when reading Mozilla-optimized ZIPs,
-        # which is fine to ignore.
-        super(Zipinfo, self).wait()
-        return 0
-
-
-class MozillaZipinfo(MozillaZipCommandMixin, Zipinfo):
-    pass
-
-
-class MozillaZipinfoVerbose(MozillaZipCommandMixin, ZipinfoVerbose):
-    pass
+        return [Difference.from_text_readers(list_libarchive(self.path),
+                                             list_libarchive(other.path),
+                                             self.path, other.path, source="file list")]
 
 
 class MozillaZipContainer(ZipContainer):
-    def open_archive(self):
-        # This is gross: Monkeypatch zipfile._EndRecData to work with
-        # Mozilla-optimized ZIPs
-        _orig_EndRecData = zipfile._EndRecData
-
-        def _EndRecData(fh):
-            endrec = _orig_EndRecData(fh)
-            if endrec:
-                endrec[zipfile._ECD_LOCATION] = (endrec[zipfile._ECD_OFFSET] +
-                                                 endrec[zipfile._ECD_SIZE])
-            return endrec
-        zipfile._EndRecData = _EndRecData
-        result = super(MozillaZipContainer, self).open_archive()
-        zipfile._EndRecData = _orig_EndRecData
-        return result
+    pass
 
 
-class MozillaZipFile(File):
+class MozillaZipFile(ZipFile):
     CONTAINER_CLASS = MozillaZipContainer
 
     @classmethod
@@ -157,9 +56,3 @@ class MozillaZipFile(File):
         # indicating the amount of data to preload, followed by the ZIP
         # central directory (with a PK\x01\x02 signature)
         return file.file_header[4:8] == b'PK\x01\x02'
-
-    def compare_details(self, other, source=None):
-        zipinfo_difference = Difference.from_command(MozillaZipinfo, self.path, other.path) or \
-                             Difference.from_command(MozillaZipinfoVerbose, self.path, other.path) or \
-                             Difference.from_command(BsdtarVerbose, self.path, other.path)
-        return [zipinfo_difference]
diff --git a/tests/comparators/test_apk.py b/tests/comparators/test_apk.py
index 379cbce..94eb625 100644
--- a/tests/comparators/test_apk.py
+++ b/tests/comparators/test_apk.py
@@ -49,20 +49,20 @@ def differences2(apk1, apk3):
     return apk1.compare(apk3).details
 
 
- at skip_unless_tools_exist('apktool', 'zipinfo')
+ at skip_unless_tools_exist('apktool')
 def test_compare_non_existing(monkeypatch, apk1):
     assert_non_existing(monkeypatch, apk1)
 
 
- at skip_unless_tools_exist('apktool', 'zipinfo')
+ at skip_unless_tools_exist('apktool')
 def test_zipinfo(differences):
-    assert differences[0].source1 == 'zipinfo {}'
-    assert differences[0].source2 == 'zipinfo {}'
+    assert differences[0].source1 == 'file list'
+    assert differences[0].source2 == 'file list'
     expected_diff = get_data('apk_zipinfo_expected_diff')
     assert differences[0].unified_diff == expected_diff
 
 
- at skip_unless_tools_exist('apktool', 'zipinfo')
+ at skip_unless_tools_exist('apktool')
 def test_android_manifest(differences):
     assert differences[1].source1 == 'AndroidManifest.xml (decoded)'
     assert differences[1].source2 == 'AndroidManifest.xml (decoded)'
@@ -70,13 +70,13 @@ def test_android_manifest(differences):
     assert differences[1].details[0].unified_diff == expected_diff
 
 
- at skip_unless_tools_exist('apktool', 'zipinfo')
+ at skip_unless_tools_exist('apktool')
 def test_apk_metadata_source(differences):
     assert differences[2].source1 == 'APK metadata'
     assert differences[2].source2 == 'APK metadata'
 
 
- at skip_unless_tools_exist('apktool', 'zipinfo')
+ at skip_unless_tools_exist('apktool')
 def test_skip_undecoded_android_manifest(differences):
     assert not any(difference.source1 == 'original/AndroidManifest.xml'
                    for difference in differences)
@@ -89,7 +89,7 @@ def test_skip_undecoded_android_manifest(differences):
                    for difference in differences)
 
 
- at skip_unless_tools_exist('apktool', 'zipinfo')
+ at skip_unless_tools_exist('apktool')
 def test_no_android_manifest(differences2):
     undecoded_manifest = 'AndroidManifest.xml (original / undecoded)'
     assert differences2[2].source1 == undecoded_manifest
diff --git a/tests/comparators/test_dex.py b/tests/comparators/test_dex.py
index 7565c81..c7bcba5 100644
--- a/tests/comparators/test_dex.py
+++ b/tests/comparators/test_dex.py
@@ -60,7 +60,7 @@ def differences(dex1, dex2):
     return dex1.compare(dex2).details
 
 
- at skip_unless_tools_exist('enjarify', 'zipinfo', 'javap')
+ at skip_unless_tools_exist('enjarify', 'javap')
 @skip_unless_tool_is_at_least('javap', javap_version, '1.8')
 @skip_unless_tool_is_at_least('enjarify', enjarify_version, '1.0.3')
 def test_differences(differences):
@@ -68,8 +68,8 @@ def test_differences(differences):
     assert differences[0].source2 == 'test2.jar'
     zipinfo = differences[0].details[0]
     classdiff = differences[0].details[1]
-    assert zipinfo.source1 == 'zipinfo -v {}'
-    assert zipinfo.source2 == 'zipinfo -v {}'
+    assert zipinfo.source1 == 'file list'
+    assert zipinfo.source2 == 'file list'
     assert classdiff.source1 == 'com/example/MainActivity.class'
     assert classdiff.source2 == 'com/example/MainActivity.class'
     expected_diff = get_data('dex_expected_diffs')
@@ -77,7 +77,7 @@ def test_differences(differences):
     assert expected_diff == found_diff
 
 
- at skip_unless_tools_exist('enjarify', 'zipinfo', 'javap')
+ at skip_unless_tools_exist('enjarify', 'javap')
 def test_compare_non_existing(monkeypatch, dex1):
     monkeypatch.setattr(Config(), 'new_file', True)
     difference = dex1.compare(MissingFile('/nonexisting', dex1))
diff --git a/tests/comparators/test_epub.py b/tests/comparators/test_epub.py
index 893d9d5..d774b01 100644
--- a/tests/comparators/test_epub.py
+++ b/tests/comparators/test_epub.py
@@ -45,10 +45,9 @@ def differences(epub1, epub2):
     return epub1.compare(epub2).details
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_differences(differences):
-    assert differences[0].source1 == 'zipinfo {}'
-    assert differences[0].source2 == 'zipinfo {}'
+    assert differences[0].source1 == 'file list'
+    assert differences[0].source2 == 'file list'
     assert differences[1].source1 == 'content.opf'
     assert differences[1].source2 == 'content.opf'
     assert differences[2].source1 == 'toc.ncx'
@@ -59,7 +58,6 @@ def test_differences(differences):
     assert expected_diff == "".join(map(lambda x: x.unified_diff, differences))
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_compare_non_existing(monkeypatch, epub1):
     monkeypatch.setattr(Config(), 'new_file', True)
     difference = epub1.compare(MissingFile('/nonexisting', epub1))
diff --git a/tests/comparators/test_zip.py b/tests/comparators/test_zip.py
index 31c4b60..9ae6d7b 100644
--- a/tests/comparators/test_zip.py
+++ b/tests/comparators/test_zip.py
@@ -52,13 +52,11 @@ def differences2(zip1, zip3):
     return zip1.compare(zip3).details
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_metadata(differences):
     expected_diff = get_data('zip_zipinfo_expected_diff')
     assert differences[0].unified_diff == expected_diff
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_compressed_files(differences):
     assert differences[1].source1 == 'dir/text'
     assert differences[1].source2 == 'dir/text'
@@ -66,13 +64,11 @@ def test_compressed_files(differences):
     assert differences[1].unified_diff == expected_diff
 
 
- at skip_unless_tools_exist('zipinfo', 'bsdtar')
 def test_extra_fields(differences2):
     expected_diff = get_data('zip_bsdtar_expected_diff')
     assert differences2[0].unified_diff == expected_diff
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_compare_non_existing(monkeypatch, zip1):
     assert_non_existing(monkeypatch, zip1)
 
@@ -91,7 +87,6 @@ def mozzip_differences(mozzip1, mozzip2):
     return mozzip1.compare(mozzip2).details
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_mozzip_metadata(mozzip_differences, mozzip1, mozzip2):
     expected_diff = get_data('mozzip_zipinfo_expected_diff')
     diff = mozzip_differences[0].unified_diff
@@ -99,7 +94,6 @@ def test_mozzip_metadata(mozzip_differences, mozzip1, mozzip2):
                 .replace(mozzip2.path, 'test2.mozzip')) == expected_diff
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_mozzip_compressed_files(mozzip_differences):
     assert mozzip_differences[1].source1 == 'dir/text'
     assert mozzip_differences[1].source2 == 'dir/text'
@@ -107,6 +101,5 @@ def test_mozzip_compressed_files(mozzip_differences):
     assert mozzip_differences[1].unified_diff == expected_diff
 
 
- at skip_unless_tools_exist('zipinfo')
 def test_mozzip_compare_non_existing(monkeypatch, mozzip1):
     assert_non_existing(monkeypatch, mozzip1)
diff --git a/tests/data/epub_expected_diffs b/tests/data/epub_expected_diffs
index a111c76..8dbac6b 100644
--- a/tests/data/epub_expected_diffs
+++ b/tests/data/epub_expected_diffs
@@ -1,26 +1,22 @@
-@@ -1,11 +1,11 @@
--Zip file size: 3320 bytes, number of entries: 9
---rw----     0.0 fat       20 b- stor 15-Oct-27 11:32 mimetype
---rw----     0.0 fat      246 b- defX 15-Oct-27 11:32 META-INF/container.xml
---rw----     0.0 fat      160 b- defX 15-Oct-27 11:32 META-INF/com.apple.ibooks.display-options.xml
---rw----     0.0 fat      439 b- defX 15-Oct-27 11:32 stylesheet.css
---rw----     0.0 fat      529 b- defX 15-Oct-27 11:32 title_page.xhtml
---rw----     0.0 fat     1129 b- defX 15-Oct-27 11:32 content.opf
---rw----     0.0 fat      742 b- defX 15-Oct-27 11:32 toc.ncx
---rw----     0.0 fat      655 b- defX 15-Oct-27 11:32 nav.xhtml
---rw----     0.0 fat      615 b- defX 15-Oct-27 11:32 ch001.xhtml
--9 files, 4535 bytes uncompressed, 2328 bytes compressed:  48.7%
-+Zip file size: 3317 bytes, number of entries: 9
-+-rw----     0.0 fat       20 b- stor 15-Oct-27 11:33 mimetype
-+-rw----     0.0 fat      246 b- defX 15-Oct-27 11:33 META-INF/container.xml
-+-rw----     0.0 fat      160 b- defX 15-Oct-27 11:33 META-INF/com.apple.ibooks.display-options.xml
-+-rw----     0.0 fat      439 b- defX 15-Oct-27 11:33 stylesheet.css
-+-rw----     0.0 fat      529 b- defX 15-Oct-27 11:33 title_page.xhtml
-+-rw----     0.0 fat     1129 b- defX 15-Oct-27 11:33 content.opf
-+-rw----     0.0 fat      742 b- defX 15-Oct-27 11:33 toc.ncx
-+-rw----     0.0 fat      655 b- defX 15-Oct-27 11:33 nav.xhtml
-+-rw----     0.0 fat      615 b- defX 15-Oct-27 11:33 ch001.xhtml
-+9 files, 4535 bytes uncompressed, 2325 bytes compressed:  48.7%
+@@ -1,9 +1,9 @@
+--rw-rw-r--   0        0        0       20 2015-10-27 11:32:54.000000 mimetype
+--rw-rw-r--   0        0        0      246 2015-10-27 11:32:54.000000 META-INF/container.xml
+--rw-rw-r--   0        0        0      160 2015-10-27 11:32:54.000000 META-INF/com.apple.ibooks.display-options.xml
+--rw-rw-r--   0        0        0      439 2015-10-27 11:32:54.000000 stylesheet.css
+--rw-rw-r--   0        0        0      529 2015-10-27 11:32:54.000000 title_page.xhtml
+--rw-rw-r--   0        0        0     1129 2015-10-27 11:32:54.000000 content.opf
+--rw-rw-r--   0        0        0      742 2015-10-27 11:32:54.000000 toc.ncx
+--rw-rw-r--   0        0        0      655 2015-10-27 11:32:54.000000 nav.xhtml
+--rw-rw-r--   0        0        0      615 2015-10-27 11:32:54.000000 ch001.xhtml
++-rw-rw-r--   0        0        0       20 2015-10-27 11:33:20.000000 mimetype
++-rw-rw-r--   0        0        0      246 2015-10-27 11:33:20.000000 META-INF/container.xml
++-rw-rw-r--   0        0        0      160 2015-10-27 11:33:20.000000 META-INF/com.apple.ibooks.display-options.xml
++-rw-rw-r--   0        0        0      439 2015-10-27 11:33:20.000000 stylesheet.css
++-rw-rw-r--   0        0        0      529 2015-10-27 11:33:20.000000 title_page.xhtml
++-rw-rw-r--   0        0        0     1129 2015-10-27 11:33:20.000000 content.opf
++-rw-rw-r--   0        0        0      742 2015-10-27 11:33:20.000000 toc.ncx
++-rw-rw-r--   0        0        0      655 2015-10-27 11:33:20.000000 nav.xhtml
++-rw-rw-r--   0        0        0      615 2015-10-27 11:33:20.000000 ch001.xhtml
 @@ -1,13 +1,13 @@
  <?xml version="1.0" encoding="UTF-8"?>
  <package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="epub-id-1">


More information about the Reproducible-builds mailing list