[Python-modules-commits] [pynliner] 01/08: Import pynliner_0.7.2.orig.tar.gz

Mon Sep 26 16:35:58 UTC 2016

This is an automated email from the git hooks/post-receive script.

morph pushed a commit to branch master
in repository pynliner.

commit 1bb44b25c3a8fc63e6544cef67e34f1058ccc460
Author: Sandro Tosi <morph at debian.org>
Date:   Mon Sep 26 15:33:46 2016 +0100

    Import pynliner_0.7.2.orig.tar.gz
---
 PKG-INFO                       |  16 +++-
 pynliner.egg-info/PKG-INFO     |  16 +++-
 pynliner.egg-info/SOURCES.txt  |   1 +
 pynliner.egg-info/requires.txt |   4 +-
 pynliner/__init__.py           | 165 +++++++++++++++++++++++++----------------
 pynliner/soupselect.py         |  41 ++++++----
 setup.cfg                      |   3 +
 setup.py                       |  35 +++++----
 8 files changed, 180 insertions(+), 101 deletions(-)

diff --git a/PKG-INFO b/PKG-INFO
index d1772d7..be79e72 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,11 +1,21 @@
 Metadata-Version: 1.1
 Name: pynliner
-Version: 0.5.2
+Version: 0.7.2
 Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
 Home-page: UNKNOWN
 Author: Tanner Netterville
 Author-email: tannern at gmail.com
-License: UNKNOWN
+License: MIT
 Description: UNKNOWN
 Platform: UNKNOWN
-Provides: pynliner
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
diff --git a/pynliner.egg-info/PKG-INFO b/pynliner.egg-info/PKG-INFO
index d1772d7..be79e72 100644
--- a/pynliner.egg-info/PKG-INFO
+++ b/pynliner.egg-info/PKG-INFO
@@ -1,11 +1,21 @@
 Metadata-Version: 1.1
 Name: pynliner
-Version: 0.5.2
+Version: 0.7.2
 Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
 Home-page: UNKNOWN
 Author: Tanner Netterville
 Author-email: tannern at gmail.com
-License: UNKNOWN
+License: MIT
 Description: UNKNOWN
 Platform: UNKNOWN
-Provides: pynliner
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
diff --git a/pynliner.egg-info/SOURCES.txt b/pynliner.egg-info/SOURCES.txt
index 6ca44b7..c442cbe 100644
--- a/pynliner.egg-info/SOURCES.txt
+++ b/pynliner.egg-info/SOURCES.txt
@@ -1,3 +1,4 @@
+setup.cfg
 setup.py
 pynliner/__init__.py
 pynliner/soupselect.py
diff --git a/pynliner.egg-info/requires.txt b/pynliner.egg-info/requires.txt
index fe1d0f7..ac8a5f4 100644
--- a/pynliner.egg-info/requires.txt
+++ b/pynliner.egg-info/requires.txt
@@ -1,2 +1,2 @@
-BeautifulSoup >=3.2.1,<4.0
-cssutils >=0.9.7
\ No newline at end of file
+BeautifulSoup4 >= 4.4.1
+cssutils >=0.9.7
diff --git a/pynliner/__init__.py b/pynliner/__init__.py
index de346c9..76c552e 100644
--- a/pynliner/__init__.py
+++ b/pynliner/__init__.py
@@ -5,7 +5,7 @@
 Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and
 cssutils
 
-Copyright (c) 2011-2013 Tanner Netterville
+Copyright (c) 2011-2016 Tanner Netterville
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -30,14 +30,29 @@ THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 """
 
-__version__ = "0.5.2"
-
 import re
-import urlparse
-import urllib2
+
 import cssutils
-from BeautifulSoup import BeautifulSoup, Comment
-from soupselect import select
+from bs4 import BeautifulSoup
+
+from .soupselect import select
+
+try:
+    from urllib.parse import urljoin
+    from urllib.request import urlopen
+    unicode = str
+except ImportError:
+    from urlparse import urljoin
+    from urllib2 import urlopen
+
+__version__ = "0.7.2"
+
+
+# this pattern may be too aggressive
+HTML_ENTITY_PATTERN = re.compile(r'&(#([0-9]+|x[a-fA-F0-9]+)|[a-zA-Z][^\s;]+);')
+
+SUBSTITUTION_FORMAT = '[pynlinerSubstitute:{0}]'
+SUBSTITUTION_PATTERN = re.compile(r'\[pynlinerSubstitute:(\d+)\]')
 
 
 class Pynliner(object):
@@ -48,13 +63,16 @@ class Pynliner(object):
     stylesheet = False
     output = False
 
-    def __init__(self, log=None, allow_conditional_comments=False):
+    def __init__(self, log=None, allow_conditional_comments=False,
+                 preserve_entities=True):
         self.log = log
         cssutils.log.enabled = False if log is None else True
         self.extra_style_strings = []
         self.allow_conditional_comments = allow_conditional_comments
+        self.preserve_entities = preserve_entities
         self.root_url = None
         self.relative_url = None
+        self._substitutions = None
 
     def from_url(self, url):
         """Gets remote HTML page for conversion
@@ -111,19 +129,52 @@ class Pynliner(object):
         >>> Pynliner().from_string(html).run()
         u'<h1 style="color: #fc0">Hello World!</h1>'
         """
+        self._substitutions = []
+        if self.preserve_entities:
+            self._substitute_entities()
         if not self.soup:
             self._get_soup()
         if not self.stylesheet:
             self._get_styles()
         self._apply_styles()
+        self._insert_media_rules()
         self._get_output()
-        self._clean_output()
+        self._unsubstitute_output()
         return self.output
 
+    def _store_substitute(self, value):
+        """
+        store a string and return its substitute
+        """
+        index = len(self._substitutions)
+        self._substitutions.append(value)
+        return SUBSTITUTION_FORMAT.format(index)
+
     def _get_url(self, url):
         """Returns the response content from the given url
         """
-        return urllib2.urlopen(url).read()
+        return urlopen(url).read()
+
+    def _substitute_entities(self):
+        """
+        Add HTML entities to the substitutions list and replace with
+        placeholders in HTML source
+        """
+        self.source_string = re.sub(
+            HTML_ENTITY_PATTERN,
+            lambda m: self._store_substitute(m.group(0)),
+            self.source_string
+        )
+
+    def _unsubstitute_output(self):
+        """
+        Put substitutions back into the output
+        """
+        self.output = re.sub(
+            SUBSTITUTION_PATTERN,
+            lambda m: self._substitutions[int(m.group(1))],
+            self.output
+        )
 
     def _get_soup(self):
         """Convert source string to BeautifulSoup object. Sets it to self.soup.
@@ -136,8 +187,8 @@ class Pynliner(object):
         try:
             from mod_wsgi import version
             self.soup = BeautifulSoup(self.source_string, "html5lib")
-        except:
-            self.soup = BeautifulSoup(self.source_string)
+        except ImportError:
+            self.soup = BeautifulSoup(self.source_string, "html.parser")
 
     def _get_styles(self):
         """Gets all CSS content from and removes all <link rel="stylesheet"> and
@@ -166,7 +217,7 @@ class Pynliner(object):
 
             # Convert the relative URL to an absolute URL ready to pass to urllib
             base_url = self.relative_url or self.root_url
-            url = urlparse.urljoin(base_url, url)
+            url = urljoin(base_url, url)
 
             self.style_string += self._get_url(url)
             tag.extract()
@@ -184,20 +235,19 @@ class Pynliner(object):
             self.style_string += u'\n'.join(tag.contents) + u'\n'
             tag.extract()
 
-    def _get_specificity_from_list(self, lst):
+    def _insert_media_rules(self):
+        """If there are any media rules, re-insert a style tag at the top and
+        dump them all in.
         """
-        Takes an array of ints and returns an integer formed
-        by adding all ints multiplied by the power of 10 of the current index
-
-        (1, 0, 0, 1) => (1 * 10**3) + (0 * 10**2) + (0 * 10**1) + (1 * 10**0) => 1001
-        """
-        return int(''.join(map(str, lst)))
-
-    def _get_rule_specificity(self, rule):
-        """
-        For a given CSSRule get its selector specificity in base 10
-        """
-        return sum(map(self._get_specificity_from_list, (s.specificity for s in rule.selectorList)))
+        rules = list(self.stylesheet.cssRules.rulesOfType(cssutils.css.CSSRule.MEDIA_RULE))
+        if rules:
+            style = BeautifulSoup(
+                "<style>" + "\n".join(re.sub(r'\s+', ' ', x.cssText) for x in rules) +
+                "</style>",
+                "html.parser"
+            )
+            target = self.soup.body or self.soup
+            target.insert(0, style)
 
     def _apply_styles(self):
         """Steps through CSS rules and applies each to all the proper elements
@@ -206,43 +256,39 @@ class Pynliner(object):
         rules = self.stylesheet.cssRules.rulesOfType(1)
         elem_prop_map = {}
         elem_style_map = {}
-
         # build up a property list for every styled element
         for rule in rules:
-            # select elements for every selector
-            selectors = rule.selectorText.split(',')
-            elements = []
-            for selector in selectors:
-                elements += select(self.soup, selector.strip())
-            # build prop_list for each selected element
-            for elem in elements:
-                if elem not in elem_prop_map:
-                    elem_prop_map[elem] = []
-                elem_prop_map[elem].append({
-                    'specificity': self._get_rule_specificity(rule),
-                    'props': rule.style.getProperties(),
-                })
+            for selector in rule.selectorList:
+                for element in select(self.soup, selector.selectorText):
+                    element_tuple = (element, id(element))
+                    if element_tuple not in elem_prop_map:
+                        elem_prop_map[element_tuple] = []
+                    elem_prop_map[element_tuple].append({
+                        'specificity': selector.specificity,
+                        'props': rule.style.getProperties(),
+                    })
 
         # build up another property list using selector specificity
-        for elem, props in elem_prop_map.items():
-            if elem not in elem_style_map:
-                elem_style_map[elem] = cssutils.css.CSSStyleDeclaration()
+        for elem_tuple, props in elem_prop_map.items():
+            elem, elem_id = elem_tuple
+            if elem_tuple not in elem_style_map:
+                elem_style_map[elem_tuple] = cssutils.css.CSSStyleDeclaration()
             # ascending sort of prop_lists based on specificity
             props = sorted(props, key=lambda p: p['specificity'])
             # for each prop_list, apply to CSSStyleDeclaration
             for prop_list in map(lambda obj: obj['props'], props):
                 for prop in prop_list:
-                    elem_style_map[elem].removeProperty(prop.name)
-                    elem_style_map[elem].setProperty(prop.name, prop.value)
-
+                    elem_style_map[elem_tuple].removeProperty(prop.name)
+                    elem_style_map[elem_tuple].setProperty(prop.name, prop.value)
 
         # apply rules to elements
-        for elem, style_declaration in elem_style_map.items():
-            if elem.has_key('style'):
+        for elem_tuple, style_declaration in elem_style_map.items():
+            elem, elem_id = elem_tuple
+            if elem.has_attr('style'):
                 elem['style'] = u'%s; %s' % (style_declaration.cssText.replace('\n', ' '), elem['style'])
             else:
                 elem['style'] = style_declaration.cssText.replace('\n', ' ')
-        
+
     def _get_output(self):
         """Generate Unicode string of `self.soup` and set it to `self.output`
 
@@ -250,34 +296,23 @@ class Pynliner(object):
         """
         self.output = unicode(self.soup)
         return self.output
-    
-    def _clean_output(self):
-        """Clean up after BeautifulSoup's output.
-        """
-        if self.allow_conditional_comments:
-            matches = re.finditer('(<!--\[if .+\].+?<!\[endif\]-->)', self.output)
-            for match in matches:
-                comment = match.group()
-                comment = comment.replace('>', '>')
-                comment = comment.replace('<', '<')
-                self.output = (self.output[:match.start()] + comment +
-                               self.output[match.end():])
 
 
-def fromURL(url, log=None):
+def fromURL(url, **kwargs):
     """Shortcut Pynliner constructor. Equivalent to:
 
     >>> Pynliner().from_url(someURL).run()
 
     Returns processed HTML string.
     """
-    return Pynliner(log).from_url(url).run()
+    return Pynliner(**kwargs).from_url(url).run()
+
 
-def fromString(string, log=None):
+def fromString(string, **kwargs):
     """Shortcut Pynliner constructor. Equivalent to:
 
     >>> Pynliner().from_string(someString).run()
 
     Returns processed HTML string.
     """
-    return Pynliner(log).from_string(string).run()
+    return Pynliner(**kwargs).from_string(string).run()
diff --git a/pynliner/soupselect.py b/pynliner/soupselect.py
index 70cfe26..4a081e2 100644
--- a/pynliner/soupselect.py
+++ b/pynliner/soupselect.py
@@ -15,10 +15,15 @@ select(soup, 'div#main ul a')
 patched to support multiple class selectors here http://code.google.com/p/soupselect/issues/detail?id=4#c0
 """
 import re
-import BeautifulSoup
+import operator as operator_
+from functools import partial
+
+import bs4
+
+ATTRIBUTE_PATTERN = re.compile(r'\[(?P<attribute>[^\s\]=~\|\^\$\*]+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*)["\']?\]')
+PSEUDO_CLASS_PATTERN = re.compile(u':(([^:.#(*\\[]|\\([^)]+\\))+)')
+SELECTOR_TOKEN_PATTERN = re.compile(r'([_0-9a-zA-Z-#.:*]+|\[[^\]]+\])$')
 
-attribute_regex = re.compile('\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*)["\']?\]')
-pseudo_class_regex = re.compile(ur':(([^:.#(*\[]|\([^)]+\))+)')
 
 def get_attribute_checker(operator, attribute, value=''):
     """
@@ -38,15 +43,17 @@ def get_attribute_checker(operator, attribute, value=''):
         # attribute is either exactly value or starts with value-
         '|': lambda el: el.get(attribute, '') == value \
             or el.get(attribute, '').startswith('%s-' % value),
-    }.get(operator, lambda el: el.has_key(attribute))
+    }.get(operator, lambda el: el.has_attr(attribute))
+
 
 def is_white_space(el):
-    if isinstance(el, BeautifulSoup.NavigableString) and str(el).strip() == '':
+    if isinstance(el, bs4.NavigableString) and str(el).strip() == '':
         return True
-    if isinstance(el, BeautifulSoup.Comment):
+    if isinstance(el, bs4.Comment):
         return True
     return False
 
+
 def is_last_content_node(el):
     result = False
     if el is None:
@@ -55,6 +62,7 @@ def is_last_content_node(el):
         result = is_last_content_node(el.nextSibling)
     return result
 
+
 def is_first_content_node(el):
     result = False
     if el is None:
@@ -63,6 +71,7 @@ def is_first_content_node(el):
         result = is_first_content_node(el.previousSibling)
     return result
 
+
 def get_pseudo_class_checker(psuedo_class):
     """
     Takes a psuedo_class, like "first-child" or "last-child"
@@ -74,6 +83,7 @@ def get_pseudo_class_checker(psuedo_class):
         'last-child': lambda el: is_last_content_node(getattr(el, 'nextSibling', None))
     }.get(psuedo_class, lambda el: False)
 
+
 def get_checker(functions):
     def checker(el):
         for func in functions:
@@ -95,7 +105,7 @@ def select(soup, selector):
         if handle_token:
             # Get the rightmost token
             handle_token = False
-            match = re.search('([_0-9a-zA-Z-#.:*"\'\[\\]=]+)$', selector)
+            match = SELECTOR_TOKEN_PATTERN.search(selector)
             if not match:
                 raise Exception("No match was found. We're done or something is broken")
             token = match.groups(1)[0]
@@ -107,14 +117,14 @@ def select(soup, selector):
             #
             # Get attribute selectors from token
             #
-            matches = attribute_regex.findall(token)
+            matches = ATTRIBUTE_PATTERN.findall(token)
             for match in matches:
                 checker_functions.append(get_attribute_checker(match[1], match[0], match[2]))
 
             #
             # Get pseudo classes from token
             #
-            for match in pseudo_class_regex.finditer(token):
+            for match in PSEUDO_CLASS_PATTERN.finditer(token):
                 checker_functions.append(get_pseudo_class_checker(match.groups(1)[0]))
 
             checker = get_checker(checker_functions)
@@ -148,11 +158,11 @@ def select(soup, selector):
             if ids:
                 find_dict['id'] = ids
             if classes:
-                find_dict['class'] = lambda attr: attr and set(classes).issubset(attr.split())
+                find_dict['class'] = partial(operator_.contains, classes)
             if operator is None:
                 # This is the first token: simply find all matches
                 for context in current_context:
-                    context_matches = [el for el in context[0].findAll(tag, find_dict) if checker(el)]
+                    context_matches = [el for el in context[0].find_all(tag, find_dict) if checker(el)]
                     for context_match in context_matches:
                         found.append(
                             (context_match, [context_match]),
@@ -205,25 +215,26 @@ def select(soup, selector):
         else:
             # Get the next operator (whitespace, >, ~, +)
             handle_token = True
-            operator = None
             match = re.search('([>~+]+)$', selector)
             if match:
                 operator = match.groups(1)[0]
+                selector = selector.rsplit(operator, 1)[0].rstrip()
             else:
                 operator = ' '
-            selector = selector.rsplit(operator, 1)[0].rstrip()
     return [entry[0] for entry in current_context]
 
+
 def monkeypatch(BeautifulSoupClass=None):
     """
     If you don't explicitly state the class to patch, defaults to the most 
     common import location for BeautifulSoup.
     """
     if not BeautifulSoupClass:
-        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+        from bs4 import BeautifulSoup as BeautifulSoupClass
     BeautifulSoupClass.findSelect = select
 
+
 def unmonkeypatch(BeautifulSoupClass=None):
     if not BeautifulSoupClass:
-        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+        from bs4 import BeautifulSoup as BeautifulSoupClass
     delattr(BeautifulSoupClass, 'findSelect')
diff --git a/setup.cfg b/setup.cfg
index 861a9f5..6f08d0e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,6 @@
+[bdist_wheel]
+universal = 1
+
 [egg_info]
 tag_build = 
 tag_date = 0
diff --git a/setup.py b/setup.py
index a34f7cc..711aeed 100644
--- a/setup.py
+++ b/setup.py
@@ -3,23 +3,32 @@
 
 from setuptools import setup
 
-install_requires = [
-    'BeautifulSoup >=3.2.1,<4.0',
-    'cssutils >=0.9.7',
-]
-
-tests_require = [
-    'mock'
-] + install_requires
-
 setup(name='pynliner',
-      version='0.5.2',
+      version='0.7.2',
       description='Python CSS-to-inline-styles conversion tool for HTML using'
                   ' BeautifulSoup and cssutils',
       author='Tanner Netterville',
       author_email='tannern at gmail.com',
-      install_requires=install_requires,
-      tests_require=tests_require,
+      install_requires=[
+          'BeautifulSoup4 >= 4.4.1',
+          'cssutils >=0.9.7',
+      ],
+      tests_require=[
+          'mock'
+      ],
       test_suite='tests',
       packages=['pynliner'],
-      provides=['pynliner'])
+      license='MIT',
+      classifiers=[
+          'Development Status :: 5 - Production/Stable',
+          'Intended Audience :: Developers',
+          'License :: OSI Approved :: MIT License',
+          'Topic :: Text Processing :: Markup :: HTML',
+          'Programming Language :: Python',
+          'Programming Language :: Python :: 2.6',
+          'Programming Language :: Python :: 2.7',
+          'Programming Language :: Python :: 3',
+          'Programming Language :: Python :: 3.3',
+          'Programming Language :: Python :: 3.4',
+          'Programming Language :: Python :: 3.5'
+      ])

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/pynliner.git