[Pkg-privacy-commits] [Git][pkg-privacy-team/mat2][upstream] New upstream version 0.12.1

Sat Mar 20 19:26:04 GMT 2021


Georg Faerber pushed to branch upstream at Privacy Maintainers / mat2


Commits:
44d17389 by Georg Faerber at 2021-03-20T19:09:46+00:00
New upstream version 0.12.1
- - - - -


6 changed files:

- CHANGELOG.md
- doc/mat2.1
- libmat2/epub.py
- libmat2/office.py
- mat2
- setup.py


Changes:

=====================================
CHANGELOG.md
=====================================
@@ -1,3 +1,8 @@
+# 0.12.1 - 2021-03-19
+
+- Improve epub support
+- Improve MS Office support
+
 # 0.12.0 - 2020-12-18
 
 - Improve significantly MS Office formats support


=====================================
doc/mat2.1
=====================================
@@ -1,4 +1,4 @@
-.TH mat2 "1" "December 2020" "mat2 0.12.0" "User Commands"
+.TH mat2 "1" "March 2021" "mat2 0.12.1" "User Commands"
 
 .SH NAME
 mat2 \- the metadata anonymisation toolkit 2


=====================================
libmat2/epub.py
=====================================
@@ -1,7 +1,9 @@
 import logging
 import re
 import uuid
+import zipfile
 import xml.etree.ElementTree as ET  # type: ignore
+from typing import Dict, Any
 
 from . import archive, office
 
@@ -15,11 +17,28 @@ class EPUBParser(archive.ZipParser):
             'META-INF/container.xml',
             'mimetype',
             'OEBPS/content.opf',
+            'content.opf',
+            'hmh.opf',
+            'OPS/.+.xml'
             }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            'iTunesMetadata.plist',
+            'META-INF/calibre_bookmarks.txt',
+            'OEBPS/package.opf',
+             }))
         self.uniqid = uuid.uuid4()
 
-    def _specific_get_meta(self, full_path, file_path):
-        if file_path != 'OEBPS/content.opf':
+
+    def is_archive_valid(self):
+        super().is_archive_valid()
+        with zipfile.ZipFile(self.filename) as zin:
+            for item in self._get_all_members(zin):
+                member_name = self._get_member_name(item)
+                if member_name.endswith('META-INF/encryption.xml'):
+                    raise ValueError('the file contains encrypted fonts')
+
+    def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
+        if not file_path.endswith('.opf'):
             return {}
 
         with open(full_path, encoding='utf-8') as f:
@@ -30,14 +49,32 @@ class EPUBParser(archive.ZipParser):
             except (TypeError, UnicodeDecodeError):
                 return {file_path: 'harmful content', }
 
-    def _specific_cleanup(self, full_path: str):
-        if full_path.endswith('OEBPS/content.opf'):
+    def _specific_cleanup(self, full_path: str) -> bool:
+        if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
             return self.__handle_contentopf(full_path)
         elif full_path.endswith('OEBPS/toc.ncx'):
             return self.__handle_tocncx(full_path)
+        elif re.search('/OPS/[^/]+.xml$', full_path):
+            return self.__handle_ops_xml(full_path)
         return True
 
-    def __handle_tocncx(self, full_path: str):
+    def __handle_ops_xml(self, full_path: str) -> bool:
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.strip().lower().endswith('head'):
+                item.clear()
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
+
+
+    def __handle_tocncx(self, full_path: str) -> bool:
         try:
             tree, namespace = office._parse_xml(full_path)
         except ET.ParseError:  # pragma: nocover
@@ -53,7 +90,7 @@ class EPUBParser(archive.ZipParser):
                    short_empty_elements=False)
         return True
 
-    def __handle_contentopf(self, full_path: str):
+    def __handle_contentopf(self, full_path: str) -> bool:
         try:
             tree, namespace = office._parse_xml(full_path)
         except ET.ParseError:


=====================================
libmat2/office.py
=====================================
@@ -87,6 +87,7 @@ class MSOfficeParser(ZipParser):
         self.files_to_keep = set(map(re.compile, {  # type: ignore
             r'^\[Content_Types\]\.xml$',
             r'^_rels/\.rels$',
+            r'^xl/sharedStrings\.xml$',  # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
             r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$',
             r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
             r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
@@ -108,6 +109,7 @@ class MSOfficeParser(ZipParser):
             r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
         }))
         self.files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^\[trash\]/',
             r'^customXml/',
             r'webSettings\.xml$',
             r'^docProps/custom\.xml$',


=====================================
mat2
=====================================
@@ -17,7 +17,7 @@ except ValueError as e:
     print(e)
     sys.exit(1)
 
-__version__ = '0.12.0'
+__version__ = '0.12.1'
 
 # Make pyflakes happy
 assert Set


=====================================
setup.py
=====================================
@@ -5,7 +5,7 @@ with open("README.md", encoding='utf-8') as fh:
 
 setuptools.setup(
     name="mat2",
-    version='0.12.0',
+    version='0.12.1',
     author="Julien (jvoisin) Voisin",
     author_email="julien.voisin+mat2@dustri.org",
     description="A handy tool to trash your metadata",



View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/-/commit/44d17389f94ef2855bb2c40a76a7a2e3316a53fb

-- 
View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/-/commit/44d17389f94ef2855bb2c40a76a7a2e3316a53fb
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/pkg-privacy-commits/attachments/20210320/a5b41f40/attachment-0001.htm>