[Python-modules-commits] [pynliner] 01/03: Imported Upstream version 0.5.2

Sandro Tosi morph at moszumanska.debian.org
Tue Jul 7 22:14:41 UTC 2015


This is an automated email from the git hooks/post-receive script.

morph pushed a commit to branch bpo8
in repository pynliner.

commit aebab24111597829a86d44575bde508449768330
Author: Sandro Tosi <morph at debian.org>
Date:   Tue Jul 7 18:08:36 2015 -0400

    Imported Upstream version 0.5.2
---
 PKG-INFO                               |  11 ++
 pynliner.egg-info/PKG-INFO             |  11 ++
 pynliner.egg-info/SOURCES.txt          |   8 +
 pynliner.egg-info/dependency_links.txt |   1 +
 pynliner.egg-info/requires.txt         |   2 +
 pynliner.egg-info/top_level.txt        |   1 +
 pynliner/__init__.py                   | 283 +++++++++++++++++++++++++++++++++
 pynliner/soupselect.py                 | 229 ++++++++++++++++++++++++++
 setup.cfg                              |   5 +
 setup.py                               |  25 +++
 10 files changed, 576 insertions(+)

diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..d1772d7
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.1
+Name: pynliner
+Version: 0.5.2
+Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
+Home-page: UNKNOWN
+Author: Tanner Netterville
+Author-email: tannern at gmail.com
+License: UNKNOWN
+Description: UNKNOWN
+Platform: UNKNOWN
+Provides: pynliner
diff --git a/pynliner.egg-info/PKG-INFO b/pynliner.egg-info/PKG-INFO
new file mode 100644
index 0000000..d1772d7
--- /dev/null
+++ b/pynliner.egg-info/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.1
+Name: pynliner
+Version: 0.5.2
+Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
+Home-page: UNKNOWN
+Author: Tanner Netterville
+Author-email: tannern at gmail.com
+License: UNKNOWN
+Description: UNKNOWN
+Platform: UNKNOWN
+Provides: pynliner
diff --git a/pynliner.egg-info/SOURCES.txt b/pynliner.egg-info/SOURCES.txt
new file mode 100644
index 0000000..6ca44b7
--- /dev/null
+++ b/pynliner.egg-info/SOURCES.txt
@@ -0,0 +1,8 @@
+setup.py
+pynliner/__init__.py
+pynliner/soupselect.py
+pynliner.egg-info/PKG-INFO
+pynliner.egg-info/SOURCES.txt
+pynliner.egg-info/dependency_links.txt
+pynliner.egg-info/requires.txt
+pynliner.egg-info/top_level.txt
\ No newline at end of file
diff --git a/pynliner.egg-info/dependency_links.txt b/pynliner.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pynliner.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pynliner.egg-info/requires.txt b/pynliner.egg-info/requires.txt
new file mode 100644
index 0000000..fe1d0f7
--- /dev/null
+++ b/pynliner.egg-info/requires.txt
@@ -0,0 +1,2 @@
+BeautifulSoup >=3.2.1,<4.0
+cssutils >=0.9.7
\ No newline at end of file
diff --git a/pynliner.egg-info/top_level.txt b/pynliner.egg-info/top_level.txt
new file mode 100644
index 0000000..3f906af
--- /dev/null
+++ b/pynliner.egg-info/top_level.txt
@@ -0,0 +1 @@
+pynliner
diff --git a/pynliner/__init__.py b/pynliner/__init__.py
new file mode 100644
index 0000000..de346c9
--- /dev/null
+++ b/pynliner/__init__.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Pynliner : Convert CSS to inline styles
+
+Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and
+cssutils
+
+Copyright (c) 2011-2013 Tanner Netterville
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+The generated output of this software shall not be used in a mass marketing
+service.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 
+THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+
+__version__ = "0.5.2"
+
+import re
+import urlparse
+import urllib2
+import cssutils
+from BeautifulSoup import BeautifulSoup, Comment
+from soupselect import select
+
+
+class Pynliner(object):
+    """Pynliner class"""
+
+    soup = False
+    style_string = False
+    stylesheet = False
+    output = False
+
+    def __init__(self, log=None, allow_conditional_comments=False):
+        self.log = log
+        cssutils.log.enabled = False if log is None else True
+        self.extra_style_strings = []
+        self.allow_conditional_comments = allow_conditional_comments
+        self.root_url = None
+        self.relative_url = None
+
+    def from_url(self, url):
+        """Gets remote HTML page for conversion
+
+        Downloads HTML page from `url` as a string and passes it to the
+        `from_string` method. Also sets `self.root_url` and `self.relative_url`
+        for use in importing <link> elements.
+
+        Returns self.
+
+        >>> p = Pynliner()
+        >>> p.from_url('http://somewebsite.com/file.html')
+        <Pynliner object at 0x26ac70>
+        """
+        self.url = url
+        self.relative_url = '/'.join(url.split('/')[:-1]) + '/'
+        self.root_url = '/'.join(url.split('/')[:3])
+        self.source_string = self._get_url(self.url)
+        return self
+
+    def from_string(self, string):
+        """Generates a Pynliner object from the given HTML string.
+
+        Returns self.
+
+        >>> p = Pynliner()
+        >>> p.from_string('<style>h1 {color:#ffcc00;}</style><h1>Hi</h1>')
+        <Pynliner object at 0x26ac70>
+        """
+        self.source_string = string
+        return self
+
+    def with_cssString(self, css_string):
+        """Adds external CSS to the Pynliner object. Can be "chained".
+
+        Returns self.
+
+        >>> html = "<h1>Hello World!</h1>"
+        >>> css = "h1 { color:#ffcc00; }"
+        >>> p = Pynliner()
+        >>> p.from_string(html).with_cssString(css)
+        <pynliner.Pynliner object at 0x2ca810>
+        """
+        self.extra_style_strings.append(css_string)
+        return self
+
+    def run(self):
+        """Applies each step of the process if they have not already been
+        performed.
+
+        Returns Unicode output with applied styles.
+
+        >>> html = "<style>h1 { color:#ffcc00; }</style><h1>Hello World!</h1>"
+        >>> Pynliner().from_string(html).run()
+        u'<h1 style="color: #fc0">Hello World!</h1>'
+        """
+        if not self.soup:
+            self._get_soup()
+        if not self.stylesheet:
+            self._get_styles()
+        self._apply_styles()
+        self._get_output()
+        self._clean_output()
+        return self.output
+
+    def _get_url(self, url):
+        """Returns the response content from the given url
+        """
+        return urllib2.urlopen(url).read()
+
+    def _get_soup(self):
+        """Convert source string to BeautifulSoup object. Sets it to self.soup.
+
+        If using mod_wgsi, use html5 parsing to prevent BeautifulSoup
+        incompatibility.
+        """
+        # Check if mod_wsgi is running
+        # - see http://code.google.com/p/modwsgi/wiki/TipsAndTricks
+        try:
+            from mod_wsgi import version
+            self.soup = BeautifulSoup(self.source_string, "html5lib")
+        except:
+            self.soup = BeautifulSoup(self.source_string)
+
+    def _get_styles(self):
+        """Gets all CSS content from and removes all <link rel="stylesheet"> and
+        <style> tags concatenating into one CSS string which is then parsed with
+        cssutils and the resulting CSSStyleSheet object set to
+        `self.stylesheet`.
+        """
+        self._get_external_styles()
+        self._get_internal_styles()
+        for style_string in self.extra_style_strings:
+            self.style_string += style_string
+        cssparser = cssutils.CSSParser(log=self.log)
+        self.stylesheet = cssparser.parseString(self.style_string)
+
+    def _get_external_styles(self):
+        """Gets <link> element styles
+        """
+        if not self.style_string:
+            self.style_string = u''
+        else:
+            self.style_string += u'\n'
+
+        link_tags = self.soup.findAll('link', {'rel': 'stylesheet'})
+        for tag in link_tags:
+            url = tag['href']
+
+            # Convert the relative URL to an absolute URL ready to pass to urllib
+            base_url = self.relative_url or self.root_url
+            url = urlparse.urljoin(base_url, url)
+
+            self.style_string += self._get_url(url)
+            tag.extract()
+
+    def _get_internal_styles(self):
+        """Gets <style> element styles
+        """
+        if not self.style_string:
+            self.style_string = u''
+        else:
+            self.style_string += u'\n'
+
+        style_tags = self.soup.findAll('style')
+        for tag in style_tags:
+            self.style_string += u'\n'.join(tag.contents) + u'\n'
+            tag.extract()
+
+    def _get_specificity_from_list(self, lst):
+        """
+        Takes an array of ints and returns an integer formed
+        by adding all ints multiplied by the power of 10 of the current index
+
+        (1, 0, 0, 1) => (1 * 10**3) + (0 * 10**2) + (0 * 10**1) + (1 * 10**0) => 1001
+        """
+        return int(''.join(map(str, lst)))
+
+    def _get_rule_specificity(self, rule):
+        """
+        For a given CSSRule get its selector specificity in base 10
+        """
+        return sum(map(self._get_specificity_from_list, (s.specificity for s in rule.selectorList)))
+
+    def _apply_styles(self):
+        """Steps through CSS rules and applies each to all the proper elements
+        as @style attributes prepending any current @style attributes.
+        """
+        rules = self.stylesheet.cssRules.rulesOfType(1)
+        elem_prop_map = {}
+        elem_style_map = {}
+
+        # build up a property list for every styled element
+        for rule in rules:
+            # select elements for every selector
+            selectors = rule.selectorText.split(',')
+            elements = []
+            for selector in selectors:
+                elements += select(self.soup, selector.strip())
+            # build prop_list for each selected element
+            for elem in elements:
+                if elem not in elem_prop_map:
+                    elem_prop_map[elem] = []
+                elem_prop_map[elem].append({
+                    'specificity': self._get_rule_specificity(rule),
+                    'props': rule.style.getProperties(),
+                })
+
+        # build up another property list using selector specificity
+        for elem, props in elem_prop_map.items():
+            if elem not in elem_style_map:
+                elem_style_map[elem] = cssutils.css.CSSStyleDeclaration()
+            # ascending sort of prop_lists based on specificity
+            props = sorted(props, key=lambda p: p['specificity'])
+            # for each prop_list, apply to CSSStyleDeclaration
+            for prop_list in map(lambda obj: obj['props'], props):
+                for prop in prop_list:
+                    elem_style_map[elem].removeProperty(prop.name)
+                    elem_style_map[elem].setProperty(prop.name, prop.value)
+
+
+        # apply rules to elements
+        for elem, style_declaration in elem_style_map.items():
+            if elem.has_key('style'):
+                elem['style'] = u'%s; %s' % (style_declaration.cssText.replace('\n', ' '), elem['style'])
+            else:
+                elem['style'] = style_declaration.cssText.replace('\n', ' ')
+        
+    def _get_output(self):
+        """Generate Unicode string of `self.soup` and set it to `self.output`
+
+        Returns self.output
+        """
+        self.output = unicode(self.soup)
+        return self.output
+    
+    def _clean_output(self):
+        """Clean up after BeautifulSoup's output.
+        """
+        if self.allow_conditional_comments:
+            matches = re.finditer('(<!--\[if .+\].+?&lt;!\[endif\]-->)', self.output)
+            for match in matches:
+                comment = match.group()
+                comment = comment.replace('&gt;', '>')
+                comment = comment.replace('&lt;', '<')
+                self.output = (self.output[:match.start()] + comment +
+                               self.output[match.end():])
+
+
+def fromURL(url, log=None):
+    """Shortcut Pynliner constructor. Equivalent to:
+
+    >>> Pynliner().from_url(someURL).run()
+
+    Returns processed HTML string.
+    """
+    return Pynliner(log).from_url(url).run()
+
+def fromString(string, log=None):
+    """Shortcut Pynliner constructor. Equivalent to:
+
+    >>> Pynliner().from_string(someString).run()
+
+    Returns processed HTML string.
+    """
+    return Pynliner(log).from_string(string).run()
diff --git a/pynliner/soupselect.py b/pynliner/soupselect.py
new file mode 100644
index 0000000..70cfe26
--- /dev/null
+++ b/pynliner/soupselect.py
@@ -0,0 +1,229 @@
+"""
+# Included with pynliner since it isn't on PyPI #
+
+soupselect.py
+
+CSS selector support for BeautifulSoup.
+
+soup = BeautifulSoup('<html>...')
+select(soup, 'div')
+    - returns a list of div elements
+
+select(soup, 'div#main ul a')
+    - returns a list of links inside a ul inside div#main
+
+patched to support multiple class selectors here http://code.google.com/p/soupselect/issues/detail?id=4#c0
+"""
+import re
+import BeautifulSoup
+
+attribute_regex = re.compile('\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*)["\']?\]')
+pseudo_class_regex = re.compile(ur':(([^:.#(*\[]|\([^)]+\))+)')
+
+def get_attribute_checker(operator, attribute, value=''):
+    """
+    Takes an operator, attribute and optional value; returns a function that
+    will return True for elements that match that combination.
+    """
+    return {
+        '=': lambda el: el.get(attribute) == value,
+        # attribute includes value as one of a set of space separated tokens
+        '~': lambda el: value in el.get(attribute, '').split(),
+        # attribute starts with value
+        '^': lambda el: el.get(attribute, '').startswith(value),
+        # attribute ends with value
+        '$': lambda el: el.get(attribute, '').endswith(value),
+        # attribute contains value
+        '*': lambda el: value in el.get(attribute, ''),
+        # attribute is either exactly value or starts with value-
+        '|': lambda el: el.get(attribute, '') == value \
+            or el.get(attribute, '').startswith('%s-' % value),
+    }.get(operator, lambda el: el.has_key(attribute))
+
+def is_white_space(el):
+    if isinstance(el, BeautifulSoup.NavigableString) and str(el).strip() == '':
+        return True
+    if isinstance(el, BeautifulSoup.Comment):
+        return True
+    return False
+
+def is_last_content_node(el):
+    result = False
+    if el is None:
+        result = True
+    elif is_white_space(el):
+        result = is_last_content_node(el.nextSibling)
+    return result
+
+def is_first_content_node(el):
+    result = False
+    if el is None:
+        result = True
+    if is_white_space(el):
+        result = is_first_content_node(el.previousSibling)
+    return result
+
+def get_pseudo_class_checker(psuedo_class):
+    """
+    Takes a psuedo_class, like "first-child" or "last-child"
+    and returns a function that will check if the element satisfies
+    that psuedo class
+    """
+    return {
+        'first-child': lambda el: is_first_content_node(getattr(el, 'previousSibling', None)),
+        'last-child': lambda el: is_last_content_node(getattr(el, 'nextSibling', None))
+    }.get(psuedo_class, lambda el: False)
+
+def get_checker(functions):
+    def checker(el):
+        for func in functions:
+            if not func(el):
+                return False
+        return el
+    return checker
+
+
+def select(soup, selector):
+    """
+    soup should be a BeautifulSoup instance; selector is a CSS selector 
+    specifying the elements you want to retrieve.
+    """
+    handle_token = True
+    current_context = [(soup, [])]
+    operator = None
+    while selector:
+        if handle_token:
+            # Get the rightmost token
+            handle_token = False
+            match = re.search('([_0-9a-zA-Z-#.:*"\'\[\\]=]+)$', selector)
+            if not match:
+                raise Exception("No match was found. We're done or something is broken")
+            token = match.groups(1)[0]
+
+            # remove this token from the selector
+            selector = selector.rsplit(token, 1)[0].rstrip()
+            
+            checker_functions = []
+            #
+            # Get attribute selectors from token
+            #
+            matches = attribute_regex.findall(token)
+            for match in matches:
+                checker_functions.append(get_attribute_checker(match[1], match[0], match[2]))
+
+            #
+            # Get pseudo classes from token
+            #
+            for match in pseudo_class_regex.finditer(token):
+                checker_functions.append(get_pseudo_class_checker(match.groups(1)[0]))
+
+            checker = get_checker(checker_functions)
+            #
+            # Get tag
+            #
+            tag = re.findall('^([a-zA-Z0-9]+)', token)
+            if len(tag) == 0:
+                tag = True
+            elif len(tag) == 1:
+                tag = tag[0]
+            else:
+                raise Exception("Multiple tags found (invalid CSS)")
+
+            #
+            # Get ID
+            #
+            ids = re.findall('#([a-zA-Z0-9_-]+)', token)
+            if len(ids) > 1:
+                raise Exception("Only single # OK")
+            #
+            # Get classes
+            #
+            classes = re.findall('\.([a-zA-Z0-9_-]+)', token)
+
+            #
+            # Search contexts for matches
+            #
+            found = []
+            find_dict = {}
+            if ids:
+                find_dict['id'] = ids
+            if classes:
+                find_dict['class'] = lambda attr: attr and set(classes).issubset(attr.split())
+            if operator is None:
+                # This is the first token: simply find all matches
+                for context in current_context:
+                    context_matches = [el for el in context[0].findAll(tag, find_dict) if checker(el)]
+                    for context_match in context_matches:
+                        found.append(
+                            (context_match, [context_match]),
+                        )
+            elif operator == ' ':
+                # for each context in current_context, ensure there
+                # exists an element somewhere above that element that
+                # matches the provided token
+                # ("descendant" selector)
+                for context in current_context:
+                    context_matches = []
+                    for el in context[1]:
+                        if checker(el.findParent(tag, find_dict)):
+                            context_matches.append(el)
+                    if context_matches:
+                        found.append(
+                            (context[0], context_matches),
+                        )
+            elif operator == '>':
+                # for each context in current_context,
+                # check if the parent satisfies the provided
+                # arguments.
+                for context in current_context:
+                    context_matches = []
+                    for el in context[1]:
+                        if checker(el.findParent(tag, find_dict)) == el.parent:
+                            context_matches.append(el.parent)
+                    if context_matches:
+                        found.append(
+                            (context[0], context_matches),
+                        )
+            elif operator == '~':
+                # for each context in current_context
+                # check 
+                raise NotImplementedError("~ operator is not implemented. Sad face :(")
+            elif operator == '+':
+                # for each context in current_context
+                # check if the preceding sibling satisfies the
+                # provided arguments
+                for context in current_context:
+                    context_matches = []
+                    for el in context[1]:
+                        if checker(el.findPreviousSibling(tag, find_dict)) == el.previousSibling:
+                            context_matches.append(el.previousSibling)
+                    if context_matches:
+                        found.append(
+                            (context[0], context_matches)
+                        )
+            current_context = found
+        else:
+            # Get the next operator (whitespace, >, ~, +)
+            handle_token = True
+            operator = None
+            match = re.search('([>~+]+)$', selector)
+            if match:
+                operator = match.groups(1)[0]
+            else:
+                operator = ' '
+            selector = selector.rsplit(operator, 1)[0].rstrip()
+    return [entry[0] for entry in current_context]
+
+def monkeypatch(BeautifulSoupClass=None):
+    """
+    If you don't explicitly state the class to patch, defaults to the most 
+    common import location for BeautifulSoup.
+    """
+    if not BeautifulSoupClass:
+        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+    BeautifulSoupClass.findSelect = select
+
+def unmonkeypatch(BeautifulSoupClass=None):
+    if not BeautifulSoupClass:
+        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+    delattr(BeautifulSoupClass, 'findSelect')
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..861a9f5
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,5 @@
+[egg_info]
+tag_build = 
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a34f7cc
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+install_requires = [
+    'BeautifulSoup >=3.2.1,<4.0',
+    'cssutils >=0.9.7',
+]
+
+tests_require = [
+    'mock'
+] + install_requires
+
+setup(name='pynliner',
+      version='0.5.2',
+      description='Python CSS-to-inline-styles conversion tool for HTML using'
+                  ' BeautifulSoup and cssutils',
+      author='Tanner Netterville',
+      author_email='tannern at gmail.com',
+      install_requires=install_requires,
+      tests_require=tests_require,
+      test_suite='tests',
+      packages=['pynliner'],
+      provides=['pynliner'])

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/pynliner.git



More information about the Python-modules-commits mailing list