[Pkg-privacy-commits] [Git][pkg-privacy-team/mat2][upstream] New upstream version 0.12.1
Georg Faerber
georg at debian.org
Sat Mar 20 19:26:04 GMT 2021
Georg Faerber pushed to branch upstream at Privacy Maintainers / mat2
Commits:
44d17389 by Georg Faerber at 2021-03-20T19:09:46+00:00
New upstream version 0.12.1
- - - - -
6 changed files:
- CHANGELOG.md
- doc/mat2.1
- libmat2/epub.py
- libmat2/office.py
- mat2
- setup.py
Changes:
=====================================
CHANGELOG.md
=====================================
@@ -1,3 +1,8 @@
+# 0.12.1 - 2021-03-19
+
+- Improve epub support
+- Improve MS Office support
+
# 0.12.0 - 2020-12-18
- Improve significantly MS Office formats support
=====================================
doc/mat2.1
=====================================
@@ -1,4 +1,4 @@
-.TH mat2 "1" "December 2020" "mat2 0.12.0" "User Commands"
+.TH mat2 "1" "March 2021" "mat2 0.12.1" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
=====================================
libmat2/epub.py
=====================================
@@ -1,7 +1,9 @@
import logging
import re
import uuid
+import zipfile
import xml.etree.ElementTree as ET # type: ignore
+from typing import Dict, Any
from . import archive, office
@@ -15,11 +17,28 @@ class EPUBParser(archive.ZipParser):
'META-INF/container.xml',
'mimetype',
'OEBPS/content.opf',
+ 'content.opf',
+ 'hmh.opf',
+ 'OPS/.+.xml'
}))
+ self.files_to_omit = set(map(re.compile, { # type: ignore
+ 'iTunesMetadata.plist',
+ 'META-INF/calibre_bookmarks.txt',
+ 'OEBPS/package.opf',
+ }))
self.uniqid = uuid.uuid4()
- def _specific_get_meta(self, full_path, file_path):
- if file_path != 'OEBPS/content.opf':
+
+ def is_archive_valid(self):
+ super().is_archive_valid()
+ with zipfile.ZipFile(self.filename) as zin:
+ for item in self._get_all_members(zin):
+ member_name = self._get_member_name(item)
+ if member_name.endswith('META-INF/encryption.xml'):
+ raise ValueError('the file contains encrypted fonts')
+
+ def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
+ if not file_path.endswith('.opf'):
return {}
with open(full_path, encoding='utf-8') as f:
@@ -30,14 +49,32 @@ class EPUBParser(archive.ZipParser):
except (TypeError, UnicodeDecodeError):
return {file_path: 'harmful content', }
- def _specific_cleanup(self, full_path: str):
- if full_path.endswith('OEBPS/content.opf'):
+ def _specific_cleanup(self, full_path: str) -> bool:
+ if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
return self.__handle_contentopf(full_path)
elif full_path.endswith('OEBPS/toc.ncx'):
return self.__handle_tocncx(full_path)
+ elif re.search('/OPS/[^/]+.xml$', full_path):
+ return self.__handle_ops_xml(full_path)
return True
- def __handle_tocncx(self, full_path: str):
+ def __handle_ops_xml(self, full_path: str) -> bool:
+ try:
+ tree, namespace = office._parse_xml(full_path)
+ except ET.ParseError: # pragma: nocover
+ logging.error("Unable to parse %s in %s.", full_path, self.filename)
+ return False
+
+ for item in tree.iterfind('.//', namespace): # pragma: nocover
+ if item.tag.strip().lower().endswith('head'):
+ item.clear()
+ break
+ tree.write(full_path, xml_declaration=True, encoding='utf-8',
+ short_empty_elements=False)
+ return True
+
+
+ def __handle_tocncx(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover
@@ -53,7 +90,7 @@ class EPUBParser(archive.ZipParser):
short_empty_elements=False)
return True
- def __handle_contentopf(self, full_path: str):
+ def __handle_contentopf(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError:
=====================================
libmat2/office.py
=====================================
@@ -87,6 +87,7 @@ class MSOfficeParser(ZipParser):
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
+ r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$',
r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
@@ -108,6 +109,7 @@ class MSOfficeParser(ZipParser):
r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
+ r'^\[trash\]/',
r'^customXml/',
r'webSettings\.xml$',
r'^docProps/custom\.xml$',
=====================================
mat2
=====================================
@@ -17,7 +17,7 @@ except ValueError as e:
print(e)
sys.exit(1)
-__version__ = '0.12.0'
+__version__ = '0.12.1'
# Make pyflakes happy
assert Set
=====================================
setup.py
=====================================
@@ -5,7 +5,7 @@ with open("README.md", encoding='utf-8') as fh:
setuptools.setup(
name="mat2",
- version='0.12.0',
+ version='0.12.1',
author="Julien (jvoisin) Voisin",
author_email="julien.voisin+mat2 at dustri.org",
description="A handy tool to trash your metadata",
View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/-/commit/44d17389f94ef2855bb2c40a76a7a2e3316a53fb
--
View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/-/commit/44d17389f94ef2855bb2c40a76a7a2e3316a53fb
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/pkg-privacy-commits/attachments/20210320/a5b41f40/attachment-0001.htm>
More information about the Pkg-privacy-commits mailing list