[Pkg-privacy-commits] [Git][pkg-privacy-team/mat2][upstream] New upstream version 0.8.0
Georg Faerber
gitlab at salsa.debian.org
Thu Feb 28 11:27:45 GMT 2019
Georg Faerber pushed to branch upstream at Privacy Maintainers / mat2
Commits:
dd721b09 by Georg Faerber at 2019-02-28T11:18:58Z
New upstream version 0.8.0
- - - - -
16 changed files:
- CHANGELOG.md
- README.md
- doc/mat2.1
- libmat2/archive.py
- + libmat2/epub.py
- − libmat2/html.py
- libmat2/parser_factory.py
- + libmat2/web.py
- mat2
- setup.py
- + tests/data/dirty.css
- + tests/data/dirty.epub
- + tests/dirty.epub
- tests/test_corrupted_files.py
- tests/test_deep_cleaning.py
- tests/test_libmat2.py
Changes:
=====================================
CHANGELOG.md
=====================================
@@ -1,3 +1,10 @@
+# 0.8.0 - 2019-02-28
+
+- Add support for epub files
+- Fix the setup.py file crashing on non-utf8 platforms
+- Improve css support
+- Improve html support
+
# 0.7.0 - 2019-02-17
- Add support for wmv files
=====================================
README.md
=====================================
@@ -32,6 +32,7 @@ metadata.
- `gir1.2-gdkpixbuf-2.0` for images support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else
+- `bubblewrap`, optionally, for sandboxing
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3).
=====================================
doc/mat2.1
=====================================
@@ -1,4 +1,4 @@
-.TH MAT2 "1" "February 2019" "MAT2 0.7.0" "User Commands"
+.TH MAT2 "1" "February 2019" "MAT2 0.8.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
=====================================
libmat2/archive.py
=====================================
@@ -4,13 +4,14 @@ import tempfile
import os
import logging
import shutil
-from typing import Dict, Set, Pattern, Union, Any
+from typing import Dict, Set, Pattern, Union, Any, List
from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
+assert List
assert Union
@@ -115,9 +116,18 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
temp_folder = tempfile.mkdtemp()
abort = False
+ items = list() # type: List[zipfile.ZipInfo]
+ for item in sorted(zin.infolist(), key=lambda z: z.filename):
+ # Some file formats require the `mimetype` file to be
+ # the first file in the archive.
+ if item.filename == 'mimetype':
+ items = [item] + items
+ else:
+ items.append(item)
+
# Since files order is a fingerprint factor,
# we're iterating (and thus inserting) them in lexicographic order.
- for item in sorted(zin.infolist(), key=lambda z: z.filename):
+ for item in items:
if item.filename[-1] == '/': # `is_dir` is added in Python3.6
continue # don't keep empty folders
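
For context, the reordering added above restates MAT2's trade-off between fingerprinting and format validity: members are normally written back in lexicographic order so the original insertion order doesn't leak, but EPUB containers expect the `mimetype` entry to come first, so that single entry is pulled to the front. A minimal sketch of the same idea outside the diff (the function name is illustrative, not taken from this commit):

    import zipfile

    def ordered_members(zin: zipfile.ZipFile) -> list:
        """Sort members lexicographically, but force a `mimetype`
        entry to the front, as required by e.g. EPUB containers."""
        items = []
        for item in sorted(zin.infolist(), key=lambda z: z.filename):
            if item.filename == 'mimetype':
                items.insert(0, item)
            else:
                items.append(item)
        return items
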
=====================================
libmat2/epub.py
=====================================
@@ -0,0 +1,79 @@
+import logging
+import re
+import uuid
+import xml.etree.ElementTree as ET # type: ignore
+
+from . import archive, office
+
+class EPUBParser(archive.ArchiveBasedAbstractParser):
+ mimetypes = {'application/epub+zip', }
+ metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
+
+ def __init__(self, filename):
+ super().__init__(filename)
+ self.files_to_keep = set(map(re.compile, { # type: ignore
+ 'META-INF/container.xml',
+ 'mimetype',
+ 'OEBPS/content.opf',
+ }))
+ self.uniqid = uuid.uuid4()
+
+ def _specific_get_meta(self, full_path, file_path):
+ if file_path != 'OEBPS/content.opf':
+ return {}
+
+ with open(full_path, encoding='utf-8') as f:
+ try:
+ results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
+ f.read(), re.I|re.M)
+ return {k:v for (k, v) in results}
+ except (TypeError, UnicodeDecodeError):
+ return {file_path: 'harmful content', }
+
+ def _specific_cleanup(self, full_path: str):
+ if full_path.endswith('OEBPS/content.opf'):
+ return self.__handle_contentopf(full_path)
+ elif full_path.endswith('OEBPS/toc.ncx'):
+ return self.__handle_tocncx(full_path)
+ return True
+
+ def __handle_tocncx(self, full_path: str):
+ try:
+ tree, namespace = office._parse_xml(full_path)
+ except ET.ParseError: # pragma: nocover
+ logging.error("Unable to parse %s in %s.", full_path, self.filename)
+ return False
+
+ for item in tree.iterfind('.//', namespace): # pragma: nocover
+ if item.tag.strip().lower().endswith('head'):
+ item.clear()
+ ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
+ break
+ tree.write(full_path, xml_declaration=True, encoding='utf-8',
+ short_empty_elements=False)
+ return True
+
+ def __handle_contentopf(self, full_path: str):
+ try:
+ tree, namespace = office._parse_xml(full_path)
+ except ET.ParseError:
+ logging.error("Unable to parse %s in %s.", full_path, self.filename)
+ return False
+
+ for item in tree.iterfind('.//', namespace): # pragma: nocover
+ if item.tag.strip().lower().endswith('metadata'):
+ item.clear()
+
+ # item with mandatory content
+ uniqid = ET.Element(self.metadata_namespace + 'identifier')
+ uniqid.text = str(self.uniqid)
+ uniqid.set('id', 'id')
+ item.append(uniqid)
+
+ # items without mandatory content
+ for name in {'language', 'title'}:
+ uniqid = ET.Element(self.metadata_namespace + name)
+ item.append(uniqid)
+ break # there is only a single <metadata> block
+ tree.write(full_path, xml_declaration=True, encoding='utf-8')
+ return True
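
In practice the new parser is driven like any other MAT2 parser: get_meta() scrapes key/value pairs out of OEBPS/content.opf, and remove_all() rewrites the archive, replacing the <metadata> block with a fresh dc:identifier plus empty dc:title and dc:language elements. A hedged usage sketch (the path is illustrative; the output name follows MAT2's usual `.cleaned.` convention seen in the tests below):

    from libmat2 import epub

    p = epub.EPUBParser('dirty.epub')              # illustrative path
    meta = p.get_meta()
    print(meta['OEBPS/content.opf'].get('dc:creator'))
    p.remove_all()                                 # writes dirty.cleaned.epub
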
=====================================
libmat2/html.py deleted
=====================================
@@ -1,69 +0,0 @@
-from html import parser
-from typing import Dict, Any, List, Tuple
-
-from . import abstract
-
-
-class HTMLParser(abstract.AbstractParser):
- mimetypes = {'text/html', }
- def __init__(self, filename):
- super().__init__(filename)
- self.__parser = _HTMLParser()
- with open(filename) as f:
- self.__parser.feed(f.read())
- self.__parser.close()
-
- def get_meta(self) -> Dict[str, Any]:
- return self.__parser.get_meta()
-
- def remove_all(self) -> bool:
- return self.__parser.remove_all(self.output_filename)
-
-
-class _HTMLParser(parser.HTMLParser):
- """Python doesn't have a validating html parser in its stdlib, so
- we're using an internal queue to track all the opening/closing tags,
- and hoping for the best.
- """
- def __init__(self):
- super().__init__()
- self.__textrepr = ''
- self.__meta = {}
- self.__validation_queue = []
-
- def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
- self.__textrepr += self.get_starttag_text()
- self.__validation_queue.append(tag)
-
- def handle_endtag(self, tag: str):
- if not self.__validation_queue:
- raise ValueError
- elif tag != self.__validation_queue.pop():
- raise ValueError
- # There is no `get_endtag_text()` method :/
- self.__textrepr += '</' + tag + '>\n'
-
- def handle_data(self, data: str):
- if data.strip():
- self.__textrepr += data
-
- def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
- if tag == 'meta':
- meta = {k:v for k, v in attrs}
- name = meta.get('name', 'harmful metadata')
- content = meta.get('content', 'harmful data')
- self.__meta[name] = content
- else:
- self.__textrepr += self.get_starttag_text()
-
- def remove_all(self, output_filename: str) -> bool:
- if self.__validation_queue:
- raise ValueError
- with open(output_filename, 'w') as f:
- f.write(self.__textrepr)
- return True
-
- def get_meta(self) -> Dict[str, Any]:
- if self.__validation_queue:
- raise ValueError
- return self.__meta
=====================================
libmat2/parser_factory.py
=====================================
@@ -1,3 +1,4 @@
+import logging
import glob
import os
import mimetypes
@@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser')
+mimetypes.add_type('application/epub+zip', '.epub')
+# EPUB Navigation Control XML File
+mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
+
def __load_all_parsers():
""" Loads every parser in a dynamic way """
@@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
if mtype in parser_class.mimetypes:
try:
return parser_class(filename), mtype
- except ValueError:
+ except ValueError as e:
+ logging.info("Got an exception when trying to instanciate "
+ "%s for %s: %s", parser_class, filename, e)
return None, mtype
return None, mtype
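
The two add_type() calls are what make the factory resolve .epub and .ncx files at all, since Python's mimetypes database doesn't know them on every platform; the logging change means a parser that refuses a file by raising ValueError is now visible in the logs instead of failing silently. A small sketch of the call site, mirroring how the testsuite uses it (the path is illustrative):

    from libmat2 import parser_factory

    parser, mtype = parser_factory.get_parser('book.epub')   # illustrative path
    if parser is None:
        print('no parser available for', mtype)
    else:
        print(mtype)             # 'application/epub+zip'
        print(parser.get_meta())
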
=====================================
libmat2/web.py
=====================================
@@ -0,0 +1,173 @@
+from html import parser, escape
+from typing import Dict, Any, List, Tuple, Set
+import re
+import string
+
+from . import abstract
+
+assert Set
+
+# pylint: disable=too-many-instance-attributes
+
+class CSSParser(abstract.AbstractParser):
+ """There is no such things as metadata in CSS files,
+ only comments of the form `/* … */`, so we're removing the laters."""
+ mimetypes = {'text/css', }
+ flags = re.MULTILINE | re.DOTALL
+
+ def remove_all(self) -> bool:
+ with open(self.filename, encoding='utf-8') as f:
+ cleaned = re.sub(r'/\*.*?\*/', '', f.read(), 0, self.flags)
+ with open(self.output_filename, 'w', encoding='utf-8') as f:
+ f.write(cleaned)
+ return True
+
+ def get_meta(self) -> Dict[str, Any]:
+ metadata = {}
+ with open(self.filename, encoding='utf-8') as f:
+ cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)
+ for match in cssdoc:
+ for line in match.splitlines():
+ try:
+ k, v = line.split(':')
+ metadata[k.strip(string.whitespace + '*')] = v.strip()
+ except ValueError:
+ metadata['harmful data'] = line.strip()
+ return metadata
+
+
+class AbstractHTMLParser(abstract.AbstractParser):
+ tags_blacklist = set() # type: Set[str]
+ # In some html/xml-based formats some tags are mandatory,
+ # so we're keeping them, but discarding their content
+ tags_required_blacklist = set() # type: Set[str]
+
+ def __init__(self, filename):
+ super().__init__(filename)
+ self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
+ self.tags_required_blacklist)
+ with open(filename, encoding='utf-8') as f:
+ self.__parser.feed(f.read())
+ self.__parser.close()
+
+ def get_meta(self) -> Dict[str, Any]:
+ return self.__parser.get_meta()
+
+ def remove_all(self) -> bool:
+ return self.__parser.remove_all(self.output_filename)
+
+
+class HTMLParser(AbstractHTMLParser):
+ mimetypes = {'text/html', }
+ tags_blacklist = {'meta', }
+ tags_required_blacklist = {'title', }
+
+
+class DTBNCXParser(AbstractHTMLParser):
+ mimetypes = {'application/x-dtbncx+xml', }
+ tags_required_blacklist = {'title', 'doctitle', 'meta'}
+
+
+class _HTMLParser(parser.HTMLParser):
+ """Python doesn't have a validating html parser in its stdlib, so
+ we're using an internal queue to track all the opening/closing tags,
+ and hoping for the best.
+
+ Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text
+ method, so we have to use get_starttag_text instead, put its result in a
+ LIFO, and transform it into a closing tag when needed.
+
+ Also, gotcha: the `tag` parameters are always in lowercase.
+ """
+ def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
+ super().__init__()
+ self.filename = filename
+ self.__textrepr = ''
+ self.__meta = {}
+ self.__validation_queue = [] # type: List[str]
+
+ # We're using counters instead of booleans, to handle nested tags
+ self.__in_dangerous_but_required_tag = 0
+ self.__in_dangerous_tag = 0
+
+ if required_blacklisted_tags & blacklisted_tags: # pragma: nocover
+ raise ValueError("There is an overlap between %s and %s" % (
+ required_blacklisted_tags, blacklisted_tags))
+ self.tag_required_blacklist = required_blacklisted_tags
+ self.tag_blacklist = blacklisted_tags
+
+ def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
+ original_tag = self.get_starttag_text()
+ self.__validation_queue.append(original_tag)
+
+ if tag in self.tag_blacklist:
+ self.__in_dangerous_tag += 1
+
+ if self.__in_dangerous_tag == 0:
+ if self.__in_dangerous_but_required_tag == 0:
+ self.__textrepr += original_tag
+
+ if tag in self.tag_required_blacklist:
+ self.__in_dangerous_but_required_tag += 1
+
+ def handle_endtag(self, tag: str):
+ if not self.__validation_queue:
+ raise ValueError("The closing tag %s doesn't have a corresponding "
+ "opening one in %s." % (tag, self.filename))
+
+ previous_tag = self.__validation_queue.pop()
+ previous_tag = previous_tag[1:-1] # remove < and >
+ previous_tag = previous_tag.split(' ')[0] # remove attributes
+ if tag != previous_tag.lower():
+ raise ValueError("The closing tag %s doesn't match the previous "
+ "tag %s in %s" %
+ (tag, previous_tag, self.filename))
+
+ if tag in self.tag_required_blacklist:
+ self.__in_dangerous_but_required_tag -= 1
+
+ if self.__in_dangerous_tag == 0:
+ if self.__in_dangerous_but_required_tag == 0:
+ # There is no `get_endtag_text()` method :/
+ self.__textrepr += '</' + previous_tag + '>'
+
+ if tag in self.tag_blacklist:
+ self.__in_dangerous_tag -= 1
+
+ def handle_data(self, data: str):
+ if self.__in_dangerous_but_required_tag == 0:
+ if self.__in_dangerous_tag == 0:
+ if data.strip():
+ self.__textrepr += escape(data)
+
+ def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
+ if tag in self.tag_required_blacklist | self.tag_blacklist:
+ meta = {k:v for k, v in attrs}
+ name = meta.get('name', 'harmful metadata')
+ content = meta.get('content', 'harmful data')
+ self.__meta[name] = content
+
+ if self.__in_dangerous_tag == 0:
+ if tag in self.tag_required_blacklist:
+ self.__textrepr += '<' + tag + ' />'
+ return
+
+ if self.__in_dangerous_tag == 0:
+ if self.__in_dangerous_but_required_tag == 0:
+ self.__textrepr += self.get_starttag_text()
+
+ def remove_all(self, output_filename: str) -> bool:
+ if self.__validation_queue:
+ raise ValueError("Some tags (%s) were left unclosed in %s" % (
+ ', '.join(self.__validation_queue),
+ self.filename))
+ with open(output_filename, 'w', encoding='utf-8') as f:
+ f.write(self.__textrepr)
+ return True
+
+ def get_meta(self) -> Dict[str, Any]:
+ if self.__validation_queue:
+ raise ValueError("Some tags (%s) were left unclosed in %s" % (
+ ', '.join(self.__validation_queue),
+ self.filename))
+ return self.__meta
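
The two blacklists drive the cleaning policy: tags in tags_blacklist (like <meta> for HTML) are dropped together with their content, while tags in tags_required_blacklist (like <title>) are kept for document validity but emptied. A hedged usage sketch, mirroring the assertions in test_libmat2.py further down (the path is illustrative):

    from libmat2 import web

    p = web.HTMLParser('page.html')      # illustrative path
    print(p.get_meta())                  # e.g. {'author': 'jvoisin'} from a <meta> tag
    p.remove_all()                       # writes page.cleaned.html:
                                         # <meta> is dropped, <title> is kept but emptied
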
=====================================
mat2
=====================================
@@ -15,7 +15,7 @@ except ValueError as e:
print(e)
sys.exit(1)
-__version__ = '0.7.0'
+__version__ = '0.8.0'
# Make pyflakes happy
assert Tuple
=====================================
setup.py
=====================================
@@ -1,11 +1,11 @@
import setuptools
-with open("README.md", "r") as fh:
+with open("README.md", encoding='utf-8') as fh:
long_description = fh.read()
setuptools.setup(
name="mat2",
- version='0.7.0',
+ version='0.8.0',
author="Julien (jvoisin) Voisin",
author_email="julien.voisin+mat2 at dustri.org",
description="A handy tool to trash your metadata",
=====================================
tests/data/dirty.css
=====================================
@@ -0,0 +1,14 @@
+/**
+ * This is my super css framework
+ * version: 1.0
+ * author : jvoisin
+ */
+
+body {
+ color: red;
+ background-color: blue;
+}
+
+.underline {
+ text-decoration: underline; /* underline is cool */
+}
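
This fixture exercises the new CSSParser: every /* … */ comment is treated as potential metadata, lines containing a colon become key/value pairs, and anything else is reported as 'harmful data'. Running it over this file yields the values asserted in test_libmat2.py below; a short sketch using the testsuite's own path:

    from libmat2 import web

    p = web.CSSParser('tests/data/dirty.css')
    print(p.get_meta())
    # {'author': 'jvoisin', 'version': '1.0', 'harmful data': 'underline is cool'}
    p.remove_all()   # writes tests/data/dirty.cleaned.css with all comments stripped
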
=====================================
tests/data/dirty.epub
=====================================
Binary files /dev/null and b/tests/data/dirty.epub differ
=====================================
tests/dirty.epub
=====================================
Binary files /dev/null and b/tests/dirty.epub differ
=====================================
tests/test_corrupted_files.py
=====================================
@@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
-from libmat2 import harmless, video, html
+from libmat2 import harmless, video, web
# No need to log messages; should something go wrong,
# the testsuite _will_ fail.
@@ -220,52 +220,74 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/--output.avi')
def test_zip(self):
- with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
+ with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
zout.write('./tests/data/dirty.flac')
zout.write('./tests/data/dirty.docx')
zout.write('./tests/data/dirty.jpg')
zout.write('./tests/data/embedded_corrupted.docx')
- p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
+ p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
self.assertEqual(mimetype, 'application/zip')
meta = p.get_meta()
self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all())
- os.remove('./tests/data/dirty.zip')
+ os.remove('./tests/data/clean.zip')
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f:
f.write('<open>but not</closed>')
with self.assertRaises(ValueError):
- html.HTMLParser('./tests/data/clean.html')
+ web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
# Yes, we're able to deal with malformed html :/
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f:
f.write('<meta name=\'this" is="weird"/>')
- p = html.HTMLParser('./tests/data/clean.html')
+ p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all())
- p = html.HTMLParser('./tests/data/clean.cleaned.html')
+ p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
with open('./tests/data/clean.html', 'w') as f:
- f.write('</close>')
+ f.write('</meta>')
with self.assertRaises(ValueError):
- html.HTMLParser('./tests/data/clean.html')
+ web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
- f.write('<notclosed>')
- p = html.HTMLParser('./tests/data/clean.html')
+ f.write('<meta><a>test</a><set/></meta><title></title><meta>')
+ p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
- p = html.HTMLParser('./tests/data/clean.html')
+ p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
+ with open('./tests/data/clean.html', 'w') as f:
+ f.write('<doctitle><br/></doctitle><br/><notclosed>')
+ p = web.HTMLParser('./tests/data/clean.html')
+ with self.assertRaises(ValueError):
+ p.get_meta()
+ p = web.HTMLParser('./tests/data/clean.html')
+ with self.assertRaises(ValueError):
+ p.remove_all()
+ os.remove('./tests/data/clean.html')
+
+
+ def test_epub(self):
+ with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
+ zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
+ p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
+ self.assertEqual(mimetype, 'application/epub+zip')
+ meta = p.get_meta()
+ self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
+ 'harmful content')
+
+ self.assertFalse(p.remove_all())
+ os.remove('./tests/data/clean.epub')
=====================================
tests/test_deep_cleaning.py
=====================================
@@ -83,6 +83,8 @@ class TestZipOrder(unittest.TestCase):
previous_name = ''
for item in zin.infolist():
if previous_name == '':
+ if item.filename == 'mimetype':
+ continue
previous_name = item.filename
continue
elif item.filename < previous_name:
@@ -97,6 +99,8 @@ class TestZipOrder(unittest.TestCase):
previous_name = ''
for item in zin.infolist():
if previous_name == '':
+ if item.filename == 'mimetype':
+ continue
previous_name = item.filename
continue
self.assertGreaterEqual(item.filename, previous_name)
=====================================
tests/test_libmat2.py
=====================================
@@ -3,10 +3,11 @@
import unittest
import shutil
import os
+import re
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video, archive, html
+from libmat2 import check_dependencies, video, archive, web, epub
class TestCheckDependencies(unittest.TestCase):
@@ -177,6 +178,23 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'this is a test comment')
+ def test_epub(self):
+ p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
+ self.assertEqual(mimetype, 'application/epub+zip')
+ meta = p.get_meta()
+ self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
+ self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
+ self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
+ self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
+
+ def test_css(self):
+ p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
+ self.assertEqual(mimetype, 'text/css')
+ meta = p.get_meta()
+ self.assertEqual(meta['author'], 'jvoisin')
+ self.assertEqual(meta['version'], '1.0')
+ self.assertEqual(meta['harmful data'], 'underline is cool')
+
class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@@ -599,7 +617,7 @@ class TestCleaning(unittest.TestCase):
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
- p = html.HTMLParser('./tests/data/clean.html')
+ p = web.HTMLParser('./tests/data/clean.html')
meta = p.get_meta()
self.assertEqual(meta['author'], 'jvoisin')
@@ -607,10 +625,80 @@ class TestCleaning(unittest.TestCase):
ret = p.remove_all()
self.assertTrue(ret)
- p = html.HTMLParser('./tests/data/clean.cleaned.html')
+ p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {})
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html')
+
+ with open('./tests/data/clean.html', 'w') as f:
+ f.write('<title><title><pouet/><meta/></title></title><test/>')
+ p = web.HTMLParser('./tests/data/clean.html')
+ self.assertTrue(p.remove_all())
+ with open('./tests/data/clean.cleaned.html', 'r') as f:
+ self.assertEqual(f.read(), '<title></title><test/>')
+ os.remove('./tests/data/clean.html')
+ os.remove('./tests/data/clean.cleaned.html')
+
+ with open('./tests/data/clean.html', 'w') as f:
+ f.write('<test><title>Some<b>metadata</b><br/></title></test>')
+ p = web.HTMLParser('./tests/data/clean.html')
+ self.assertTrue(p.remove_all())
+ with open('./tests/data/clean.cleaned.html', 'r') as f:
+ self.assertEqual(f.read(), '<test><title></title></test>')
+ os.remove('./tests/data/clean.html')
+ os.remove('./tests/data/clean.cleaned.html')
+
+ with open('./tests/data/clean.html', 'w') as f:
+ f.write('<meta><meta/><!----><!-- test--></meta>')
+ p = web.HTMLParser('./tests/data/clean.html')
+ self.assertTrue(p.remove_all())
+ with open('./tests/data/clean.cleaned.html', 'r') as f:
+ self.assertEqual(f.read(), '')
+ os.remove('./tests/data/clean.html')
+ os.remove('./tests/data/clean.cleaned.html')
+
+
+ def test_epub(self):
+ shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
+ p = epub.EPUBParser('./tests/data/clean.epub')
+
+ meta = p.get_meta()
+ self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
+
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
+ meta = p.get_meta()
+ res = re.match(meta['OEBPS/content.opf']['metadata'], '^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$')
+ self.assertNotEqual(res, False)
+
+ self.assertTrue(p.remove_all())
+
+ os.remove('./tests/data/clean.epub')
+ os.remove('./tests/data/clean.cleaned.epub')
+ os.remove('./tests/data/clean.cleaned.cleaned.epub')
+
+
+ def test_css(self):
+ shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
+ p = web.CSSParser('./tests/data/clean.css')
+
+ self.assertEqual(p.get_meta(), {
+ 'harmful data': 'underline is cool',
+ 'version': '1.0',
+ 'author': 'jvoisin'})
+
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = web.CSSParser('./tests/data/clean.cleaned.css')
+ self.assertEqual(p.get_meta(), {})
+ self.assertTrue(p.remove_all())
+
+ os.remove('./tests/data/clean.css')
+ os.remove('./tests/data/clean.cleaned.css')
+ os.remove('./tests/data/clean.cleaned.cleaned.css')
View it on GitLab: https://salsa.debian.org/pkg-privacy-team/mat2/commit/dd721b09c46745de8a16fa7cde10f682305b3a2f