[Pkg-privacy-commits] [Git][pkg-privacy-team/mat2][upstream] New upstream version 0.6.0
Georg Faerber
gitlab at salsa.debian.org
Sat Nov 10 13:20:47 GMT 2018
Georg Faerber pushed to branch upstream at Privacy Maintainers / mat2
Commits:
8a41f99d by Georg Faerber at 2018-11-10T13:06:42Z
New upstream version 0.6.0
- - - - -
18 changed files:
- .gitlab-ci.yml
- CHANGELOG.md
- doc/mat2.1
- libmat2/archive.py
- libmat2/exiftool.py
- libmat2/images.py
- libmat2/office.py
- libmat2/video.py
- mat2
- nautilus/mat2.py
- setup.py
- + tests/data/control_chars.jpg
- + tests/data/dirty.mp4
- tests/test_climat2.py
- tests/test_corrupted_files.py
- tests/test_deep_cleaning.py
- tests/test_libmat2.py
- tests/test_lightweigh_cleaning.py
Changes:
=====================================
.gitlab-ci.yml
=====================================
@@ -35,8 +35,7 @@ mypy:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-pip
- pip3 install mypy
- - mypy mat2 libmat2/*.py --ignore-missing-imports
- - mypy --ignore-missing-imports ./nautilus/mat2.py
+ - mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py
tests:debian:
stage: test
=====================================
CHANGELOG.md
=====================================
@@ -1,3 +1,14 @@
+# 0.6.0 - 2018-11-10
+
+- Add lightweight cleaning for jpeg
+- Add support for zip files
+- Add support for mp4 files
+- Improve metadata extraction for archives
+- Improve robustness against corrupted embedded files
+- Fix a possible security issue on some terminals (control character
+ injection via --show)
+- Various internal cleanup/improvements
+
# 0.5.0 - 2018-10-23
- Video (.avi files for now) support, via FFmpeg, optionally
@@ -5,7 +16,7 @@
- Processing files starting with a dash is now quicker
- Metadata are now displayed sorted
- Recursive metadata support for FLAC files
-- Unsupported extensions aren't displayed in `/.mat -l` anymore
+- Unsupported extensions aren't displayed in `./mat2 -l` anymore
- Improve the display when no metadata are found
- Update the logo according to the GNOME guidelines
- The testsuite is now runnable on the installed version of mat2
=====================================
doc/mat2.1
=====================================
@@ -1,4 +1,4 @@
-.TH MAT2 "1" "October 2018" "MAT2 0.5.0" "User Commands"
+.TH MAT2 "1" "November 2018" "MAT2 0.6.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
=====================================
libmat2/archive.py
=====================================
@@ -67,6 +67,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return metadata
+ def get_meta(self) -> Dict[str, Union[str, dict]]:
+ meta = dict() # type: Dict[str, Union[str, dict]]
+
+ with zipfile.ZipFile(self.filename) as zin:
+ temp_folder = tempfile.mkdtemp()
+
+ for item in zin.infolist():
+ if item.filename[-1] == '/': # pragma: no cover
+ # `is_dir` is added in Python3.6
+ continue # don't keep empty folders
+
+ zin.extract(member=item, path=temp_folder)
+ full_path = os.path.join(temp_folder, item.filename)
+
+ tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore
+ if not tmp_parser:
+ continue
+
+ local_meta = tmp_parser.get_meta()
+ if local_meta:
+ meta[item.filename] = local_meta
+
+ shutil.rmtree(temp_folder)
+ return meta
+
def remove_all(self) -> bool:
# pylint: disable=too-many-branches
@@ -113,7 +138,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
abort = True
continue
if tmp_parser:
- tmp_parser.remove_all()
+ if tmp_parser.remove_all() is False:
+ logging.warning("In file %s, something went wrong \
+ with the cleaning of %s \
+ (format: %s)",
+ self.filename, item.filename, mtype)
+ abort = True
+ continue
os.rename(tmp_parser.output_filename, full_path)
zinfo = zipfile.ZipInfo(item.filename) # type: ignore
@@ -126,3 +157,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
os.remove(self.output_filename)
return False
return True
+
+
+
+class ZipParser(ArchiveBasedAbstractParser):
+ mimetypes = {'application/zip'}
=====================================
libmat2/exiftool.py
=====================================
@@ -53,15 +53,14 @@ class ExiftoolParser(abstract.AbstractParser):
return True
def _get_exiftool_path() -> str: # pragma: no cover
- exiftool_path = '/usr/bin/exiftool'
- if os.path.isfile(exiftool_path):
- if os.access(exiftool_path, os.X_OK):
- return exiftool_path
+ possible_pathes = {
+ '/usr/bin/exiftool', # debian/fedora
+ '/usr/bin/vendor_perl/exiftool', # archlinux
+ }
- # ArchLinux
- exiftool_path = '/usr/bin/vendor_perl/exiftool'
- if os.path.isfile(exiftool_path):
- if os.access(exiftool_path, os.X_OK):
- return exiftool_path
+ for possible_path in possible_pathes:
+ if os.path.isfile(possible_path):
+ if os.access(possible_path, os.X_OK):
+ return possible_path
raise RuntimeError("Unable to find exiftool")
=====================================
libmat2/images.py
=====================================
@@ -6,7 +6,7 @@ import cairo
import gi
gi.require_version('GdkPixbuf', '2.0')
-from gi.repository import GdkPixbuf
+from gi.repository import GdkPixbuf, GLib
from . import exiftool
@@ -50,15 +50,21 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
def __init__(self, filename):
super().__init__(filename)
- if imghdr.what(filename) != self._type: # better safe than sorry
+ # we can't use imghdr here because of https://bugs.python.org/issue28591
+ try:
+ GdkPixbuf.Pixbuf.new_from_file(self.filename)
+ except GLib.GError:
raise ValueError
def remove_all(self) -> bool:
+ if self.lightweight_cleaning:
+ return self._lightweight_cleanup()
+
_, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
if extension.lower() == '.jpg':
extension = '.jpeg' # gdk is picky
- pixbuf.savev(self.output_filename, extension[1:], [], [])
+ pixbuf.savev(self.output_filename, type=extension[1:], option_keys=[], option_values=[])
return True
=====================================
libmat2/office.py
=====================================
@@ -301,7 +301,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
"""
- metadata = {}
+ metadata = super().get_meta()
zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
=====================================
libmat2/video.py
=====================================
@@ -2,10 +2,37 @@ import os
import subprocess
import logging
+from typing import Dict, Union
+
from . import exiftool
-class AVIParser(exiftool.ExiftoolParser):
+class AbstractFFmpegParser(exiftool.ExiftoolParser):
+ """ Abstract parser for all FFmpeg-based ones, mainly for video. """
+ def remove_all(self) -> bool:
+ cmd = [_get_ffmpeg_path(),
+ '-i', self.filename, # input file
+ '-y', # overwrite existing output file
+ '-map', '0', # copy everything all streams from input to output
+ '-codec', 'copy', # don't decode anything, just copy (speed!)
+ '-loglevel', 'panic', # Don't show log
+ '-hide_banner', # hide the banner
+ '-map_metadata', '-1', # remove supperficial metadata
+ '-map_chapters', '-1', # remove chapters
+ '-disposition', '0', # Remove dispositions (check ffmpeg's manpage)
+ '-fflags', '+bitexact', # don't add any metadata
+ '-flags:v', '+bitexact', # don't add any metadata
+ '-flags:a', '+bitexact', # don't add any metadata
+ self.output_filename]
+ try:
+ subprocess.check_call(cmd)
+ except subprocess.CalledProcessError as e:
+ logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
+ return False
+ return True
+
+
+class AVIParser(AbstractFFmpegParser):
mimetypes = {'video/x-msvideo', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate',
@@ -24,25 +51,55 @@ class AVIParser(exiftool.ExiftoolParser):
'SampleRate', 'AvgBytesPerSec', 'BitsPerSample',
'Duration', 'ImageSize', 'Megapixels'}
+class MP4Parser(AbstractFFmpegParser):
+ mimetypes = {'video/mp4', }
+ meta_whitelist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
+ 'XResolution', 'YResolution', 'ExifToolVersion',
+ 'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',
+ 'FileName', 'FilePermissions', 'MIMEType', 'FileType',
+ 'FileTypeExtension', 'Directory', 'ImageWidth',
+ 'ImageSize', 'ImageHeight', 'FileSize', 'SourceFile',
+ 'BitDepth', 'Duration', 'AudioChannels',
+ 'AudioBitsPerSample', 'AudioSampleRate', 'Megapixels',
+ 'MovieDataSize', 'VideoFrameRate', 'MediaTimeScale',
+ 'SourceImageHeight', 'SourceImageWidth',
+ 'MatrixStructure', 'MediaDuration'}
+ meta_key_value_whitelist = { # some metadata are mandatory :/
+ 'CreateDate': '0000:00:00 00:00:00',
+ 'CurrentTime': '0 s',
+ 'MediaCreateDate': '0000:00:00 00:00:00',
+ 'MediaLanguageCode': 'und',
+ 'MediaModifyDate': '0000:00:00 00:00:00',
+ 'ModifyDate': '0000:00:00 00:00:00',
+ 'OpColor': '0 0 0',
+ 'PosterTime': '0 s',
+ 'PreferredRate': '1',
+ 'PreferredVolume': '100.00%',
+ 'PreviewDuration': '0 s',
+ 'PreviewTime': '0 s',
+ 'SelectionDuration': '0 s',
+ 'SelectionTime': '0 s',
+ 'TrackCreateDate': '0000:00:00 00:00:00',
+ 'TrackModifyDate': '0000:00:00 00:00:00',
+ 'TrackVolume': '0.00%',
+ }
+
def remove_all(self) -> bool:
- cmd = [_get_ffmpeg_path(),
- '-i', self.filename, # input file
- '-y', # overwrite existing output file
- '-loglevel', 'panic', # Don't show log
- '-hide_banner', # hide the banner
- '-codec', 'copy', # don't decode anything, just copy (speed!)
- '-map_metadata', '-1', # remove supperficial metadata
- '-map_chapters', '-1', # remove chapters
- '-fflags', '+bitexact', # don't add any metadata
- '-flags:v', '+bitexact', # don't add any metadata
- '-flags:a', '+bitexact', # don't add any metadata
- self.output_filename]
- try:
- subprocess.check_call(cmd)
- except subprocess.CalledProcessError as e:
- logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
- return False
- return True
+ logging.warning('The format of "%s" (video/mp4) has some mandatory '
+ 'metadata fields; mat2 filled them with standard data.',
+ self.filename)
+ return super().remove_all()
+
+ def get_meta(self) -> Dict[str, Union[str, dict]]:
+ meta = super().get_meta()
+
+ ret = dict() # type: Dict[str, Union[str, dict]]
+ for key, value in meta.items():
+ if key in self.meta_key_value_whitelist.keys():
+ if value == self.meta_key_value_whitelist[key]:
+ continue
+ ret[key] = value
+ return ret
def _get_ffmpeg_path() -> str: # pragma: no cover
=====================================
mat2
=====================================
@@ -6,6 +6,7 @@ import sys
import mimetypes
import argparse
import logging
+import unicodedata
try:
from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
@@ -14,12 +15,14 @@ except ValueError as e:
print(e)
sys.exit(1)
-__version__ = '0.5.0'
+__version__ = '0.6.0'
# Make pyflakes happy
assert Tuple
assert Union
+logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
+
def __check_file(filename: str, mode: int=os.R_OK) -> bool:
if not os.path.exists(filename):
@@ -81,6 +84,15 @@ def __print_meta(filename: str, metadata: dict, depth: int=1):
if isinstance(v, dict):
__print_meta(k, v, depth+1)
continue
+
+ # Remove control characters
+ # We might use 'Cc' instead of 'C', but better safe than sorry
+ # https://www.unicode.org/reports/tr44/#GC_Values_Table
+ try:
+ v = ''.join(ch for ch in v if not unicodedata.category(ch).startswith('C'))
+ except TypeError:
+ pass # for things that aren't iterable
+
try: # FIXME this is ugly.
print(padding + " %s: %s" % (k, v))
except UnicodeEncodeError:
=====================================
nautilus/mat2.py
=====================================
@@ -14,7 +14,7 @@ thread, so we'll have to resort to using a `queue` to pass "messages" around.
import queue
import threading
-from typing import Tuple
+from typing import Tuple, Optional, List
from urllib.parse import unquote
import gi
@@ -25,10 +25,8 @@ from gi.repository import Nautilus, GObject, Gtk, Gio, GLib, GdkPixbuf
from libmat2 import parser_factory
-# make pyflakes happy
-assert Tuple
-def _remove_metadata(fpath):
+def _remove_metadata(fpath) -> Tuple[bool, Optional[str]]:
""" This is a simple wrapper around libmat2, because it's
easier and cleaner this way.
"""
@@ -63,7 +61,7 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
self.infobar.get_content_area().pack_start(self.infobar_hbox, True, True, 0)
self.infobar.show_all()
- def get_widget(self, uri, window):
+ def get_widget(self, uri, window) -> Gtk.Widget:
""" This is the method that we have to implement (because we're
a LocationWidgetProvider) in order to show our infobar.
"""
@@ -228,7 +226,7 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
""" https://bugzilla.gnome.org/show_bug.cgi?id=784278 """
return None
- def get_file_items(self, window, files):
+ def get_file_items(self, window, files) -> Optional[List[Nautilus.MenuItem]]:
""" This method is the one allowing us to create a menu item.
"""
# Do not show the menu item if not a single file has a chance to be
=====================================
setup.py
=====================================
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name="mat2",
- version='0.5.0',
+ version='0.6.0',
author="Julien (jvoisin) Voisin",
author_email="julien.voisin+mat2 at dustri.org",
description="A handy tool to trash your metadata",
=====================================
tests/data/control_chars.jpg
=====================================
Binary files /dev/null and b/tests/data/control_chars.jpg differ
=====================================
tests/data/dirty.mp4
=====================================
Binary files /dev/null and b/tests/data/dirty.mp4 differ
=====================================
tests/test_climat2.py
=====================================
@@ -121,7 +121,7 @@ class TestGetMeta(unittest.TestCase):
proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.pdf'],
stdout=subprocess.PIPE)
stdout, _ = proc.communicate()
- self.assertIn(b'producer: pdfTeX-1.40.14', stdout)
+ self.assertIn(b'Producer: pdfTeX-1.40.14', stdout)
def test_png(self):
proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.png'],
@@ -174,3 +174,10 @@ class TestGetMeta(unittest.TestCase):
self.assertIn(b'genre: Python', stdout)
self.assertIn(b'i am a : various comment', stdout)
self.assertIn(b'artist: jvoisin', stdout)
+
+class TestControlCharInjection(unittest.TestCase):
+ def test_jpg(self):
+ proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/control_chars.jpg'],
+ stdout=subprocess.PIPE)
+ stdout, _ = proc.communicate()
+ self.assertIn(b'Comment: GQ\n', stdout)
=====================================
tests/test_corrupted_files.py
=====================================
@@ -4,6 +4,7 @@ import unittest
import shutil
import os
import logging
+import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video
@@ -222,3 +223,17 @@ class TestCorruptedFiles(unittest.TestCase):
p = video.AVIParser('./tests/data/--output.avi')
self.assertFalse(p.remove_all())
os.remove('./tests/data/--output.avi')
+
+ def test_zip(self):
+ with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
+ zout.write('./tests/data/dirty.flac')
+ zout.write('./tests/data/dirty.docx')
+ zout.write('./tests/data/dirty.jpg')
+ zout.write('./tests/data/embedded_corrupted.docx')
+ p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
+ self.assertEqual(mimetype, 'application/zip')
+ meta = p.get_meta()
+ self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
+ self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
+ self.assertFalse(p.remove_all())
+ os.remove('./tests/data/dirty.zip')
=====================================
tests/test_deep_cleaning.py
=====================================
@@ -36,6 +36,7 @@ class TestZipMetadata(unittest.TestCase):
meta = p.get_meta()
self.assertIsNotNone(meta)
+ self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
ret = p.remove_all()
self.assertTrue(ret)
=====================================
tests/test_libmat2.py
=====================================
@@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video
+from libmat2 import check_dependencies, video, archive
class TestCheckDependencies(unittest.TestCase):
@@ -153,6 +153,18 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta, {})
+ def test_zip(self):
+ with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
+ zout.write('./tests/data/dirty.flac')
+ zout.write('./tests/data/dirty.docx')
+ zout.write('./tests/data/dirty.jpg')
+ p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
+ self.assertEqual(mimetype, 'application/zip')
+ meta = p.get_meta()
+ self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
+ self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
+ os.remove('./tests/data/dirty.zip')
+
class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self):
@@ -488,3 +500,47 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.avi')
os.remove('./tests/data/clean.cleaned.avi')
os.remove('./tests/data/clean.cleaned.cleaned.avi')
+
+ def test_zip(self):
+ with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
+ zout.write('./tests/data/dirty.flac')
+ zout.write('./tests/data/dirty.docx')
+ zout.write('./tests/data/dirty.jpg')
+ p = archive.ZipParser('./tests/data/dirty.zip')
+ meta = p.get_meta()
+ self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
+
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = archive.ZipParser('./tests/data/dirty.cleaned.zip')
+ self.assertEqual(p.get_meta(), {})
+ self.assertTrue(p.remove_all())
+
+ os.remove('./tests/data/dirty.zip')
+ os.remove('./tests/data/dirty.cleaned.zip')
+ os.remove('./tests/data/dirty.cleaned.cleaned.zip')
+
+
+ def test_mp4(self):
+ try:
+ video._get_ffmpeg_path()
+ except RuntimeError:
+ raise unittest.SkipTest
+
+ shutil.copy('./tests/data/dirty.mp4', './tests/data/clean.mp4')
+ p = video.MP4Parser('./tests/data/clean.mp4')
+
+ meta = p.get_meta()
+ self.assertEqual(meta['Encoder'], 'HandBrake 0.9.4 2009112300')
+
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = video.MP4Parser('./tests/data/clean.cleaned.mp4')
+ self.assertNotIn('Encoder', p.get_meta())
+ self.assertTrue(p.remove_all())
+
+ os.remove('./tests/data/clean.mp4')
+ os.remove('./tests/data/clean.cleaned.mp4')
+ os.remove('./tests/data/clean.cleaned.cleaned.mp4')
=====================================
tests/test_lightweigh_cleaning.py
=====================================
@@ -4,7 +4,7 @@ import unittest
import shutil
import os
-from libmat2 import pdf, images
+from libmat2 import pdf, images, torrent
class TestLightWeightCleaning(unittest.TestCase):
def test_pdf(self):
@@ -63,3 +63,44 @@ class TestLightWeightCleaning(unittest.TestCase):
os.remove('./tests/data/clean.jpg')
os.remove('./tests/data/clean.cleaned.jpg')
+
+ def test_torrent(self):
+ shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.torrent')
+ p = torrent.TorrentParser('./tests/data/clean.torrent')
+
+ meta = p.get_meta()
+ self.assertEqual(meta['created by'], b'mktorrent 1.0')
+
+ p.lightweight_cleaning = True
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = torrent.TorrentParser('./tests/data/clean.cleaned.torrent')
+ self.assertEqual(p.get_meta(), {})
+
+ os.remove('./tests/data/clean.torrent')
+ os.remove('./tests/data/clean.cleaned.torrent')
+
+ def test_tiff(self):
+ shutil.copy('./tests/data/dirty.tiff', './tests/data/clean.tiff')
+ p = images.TiffParser('./tests/data/clean.tiff')
+
+ meta = p.get_meta()
+ self.assertEqual(meta['ImageDescription'], 'OLYMPUS DIGITAL CAMERA ')
+
+ p.lightweight_cleaning = True
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = images.TiffParser('./tests/data/clean.cleaned.tiff')
+ self.assertEqual(p.get_meta(),
+ {
+ 'Orientation': 'Horizontal (normal)',
+ 'ResolutionUnit': 'inches',
+ 'XResolution': 72,
+ 'YResolution': 72
+ }
+ )
+
+ os.remove('./tests/data/clean.tiff')
+ os.remove('./tests/data/clean.cleaned.tiff')
View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/commit/8a41f99d29f0c9097d34bc5b8227f05f7dc45d3c
--
View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/commit/8a41f99d29f0c9097d34bc5b8227f05f7dc45d3c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/pkg-privacy-commits/attachments/20181110/8803585f/attachment-0001.html>
More information about the Pkg-privacy-commits mailing list