[Python-modules-commits] [pynliner] 01/03: Imported Upstream version 0.5.2
Sandro Tosi
morph at moszumanska.debian.org
Tue Jul 7 22:14:41 UTC 2015
This is an automated email from the git hooks/post-receive script.
morph pushed a commit to branch bpo8
in repository pynliner.
commit aebab24111597829a86d44575bde508449768330
Author: Sandro Tosi <morph at debian.org>
Date: Tue Jul 7 18:08:36 2015 -0400
Imported Upstream version 0.5.2
---
PKG-INFO | 11 ++
pynliner.egg-info/PKG-INFO | 11 ++
pynliner.egg-info/SOURCES.txt | 8 +
pynliner.egg-info/dependency_links.txt | 1 +
pynliner.egg-info/requires.txt | 2 +
pynliner.egg-info/top_level.txt | 1 +
pynliner/__init__.py | 283 +++++++++++++++++++++++++++++++++
pynliner/soupselect.py | 229 ++++++++++++++++++++++++++
setup.cfg | 5 +
setup.py | 25 +++
10 files changed, 576 insertions(+)
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..d1772d7
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.1
+Name: pynliner
+Version: 0.5.2
+Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
+Home-page: UNKNOWN
+Author: Tanner Netterville
+Author-email: tannern at gmail.com
+License: UNKNOWN
+Description: UNKNOWN
+Platform: UNKNOWN
+Provides: pynliner
diff --git a/pynliner.egg-info/PKG-INFO b/pynliner.egg-info/PKG-INFO
new file mode 100644
index 0000000..d1772d7
--- /dev/null
+++ b/pynliner.egg-info/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.1
+Name: pynliner
+Version: 0.5.2
+Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
+Home-page: UNKNOWN
+Author: Tanner Netterville
+Author-email: tannern at gmail.com
+License: UNKNOWN
+Description: UNKNOWN
+Platform: UNKNOWN
+Provides: pynliner
diff --git a/pynliner.egg-info/SOURCES.txt b/pynliner.egg-info/SOURCES.txt
new file mode 100644
index 0000000..6ca44b7
--- /dev/null
+++ b/pynliner.egg-info/SOURCES.txt
@@ -0,0 +1,8 @@
+setup.py
+pynliner/__init__.py
+pynliner/soupselect.py
+pynliner.egg-info/PKG-INFO
+pynliner.egg-info/SOURCES.txt
+pynliner.egg-info/dependency_links.txt
+pynliner.egg-info/requires.txt
+pynliner.egg-info/top_level.txt
\ No newline at end of file
diff --git a/pynliner.egg-info/dependency_links.txt b/pynliner.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pynliner.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pynliner.egg-info/requires.txt b/pynliner.egg-info/requires.txt
new file mode 100644
index 0000000..fe1d0f7
--- /dev/null
+++ b/pynliner.egg-info/requires.txt
@@ -0,0 +1,2 @@
+BeautifulSoup >=3.2.1,<4.0
+cssutils >=0.9.7
\ No newline at end of file
diff --git a/pynliner.egg-info/top_level.txt b/pynliner.egg-info/top_level.txt
new file mode 100644
index 0000000..3f906af
--- /dev/null
+++ b/pynliner.egg-info/top_level.txt
@@ -0,0 +1 @@
+pynliner
diff --git a/pynliner/__init__.py b/pynliner/__init__.py
new file mode 100644
index 0000000..de346c9
--- /dev/null
+++ b/pynliner/__init__.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Pynliner : Convert CSS to inline styles
+
+Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and
+cssutils
+
+Copyright (c) 2011-2013 Tanner Netterville
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+The generated output of this software shall not be used in a mass marketing
+service.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+
+__version__ = "0.5.2"
+
+import re
+import urlparse
+import urllib2
+import cssutils
+from BeautifulSoup import BeautifulSoup, Comment
+from soupselect import select
+
+
+class Pynliner(object):
+ """Pynliner class"""
+
+ soup = False
+ style_string = False
+ stylesheet = False
+ output = False
+
+ def __init__(self, log=None, allow_conditional_comments=False):
+ self.log = log
+ cssutils.log.enabled = False if log is None else True
+ self.extra_style_strings = []
+ self.allow_conditional_comments = allow_conditional_comments
+ self.root_url = None
+ self.relative_url = None
+
+ def from_url(self, url):
+ """Gets remote HTML page for conversion
+
+ Downloads HTML page from `url` as a string and passes it to the
+ `from_string` method. Also sets `self.root_url` and `self.relative_url`
+ for use in importing <link> elements.
+
+ Returns self.
+
+ >>> p = Pynliner()
+ >>> p.from_url('http://somewebsite.com/file.html')
+ <Pynliner object at 0x26ac70>
+ """
+ self.url = url
+ self.relative_url = '/'.join(url.split('/')[:-1]) + '/'
+ self.root_url = '/'.join(url.split('/')[:3])
+ self.source_string = self._get_url(self.url)
+ return self
+
+ def from_string(self, string):
+ """Generates a Pynliner object from the given HTML string.
+
+ Returns self.
+
+ >>> p = Pynliner()
+ >>> p.from_string('<style>h1 {color:#ffcc00;}</style><h1>Hi</h1>')
+ <Pynliner object at 0x26ac70>
+ """
+ self.source_string = string
+ return self
+
+ def with_cssString(self, css_string):
+ """Adds external CSS to the Pynliner object. Can be "chained".
+
+ Returns self.
+
+ >>> html = "<h1>Hello World!</h1>"
+ >>> css = "h1 { color:#ffcc00; }"
+ >>> p = Pynliner()
+ >>> p.from_string(html).with_cssString(css)
+ <pynliner.Pynliner object at 0x2ca810>
+ """
+ self.extra_style_strings.append(css_string)
+ return self
+
+ def run(self):
+ """Applies each step of the process if they have not already been
+ performed.
+
+ Returns Unicode output with applied styles.
+
+ >>> html = "<style>h1 { color:#ffcc00; }</style><h1>Hello World!</h1>"
+ >>> Pynliner().from_string(html).run()
+ u'<h1 style="color: #fc0">Hello World!</h1>'
+ """
+ if not self.soup:
+ self._get_soup()
+ if not self.stylesheet:
+ self._get_styles()
+ self._apply_styles()
+ self._get_output()
+ self._clean_output()
+ return self.output
+
+ def _get_url(self, url):
+ """Returns the response content from the given url
+ """
+ return urllib2.urlopen(url).read()
+
+ def _get_soup(self):
+ """Convert source string to BeautifulSoup object. Sets it to self.soup.
+
+ If using mod_wsgi, use html5 parsing to prevent BeautifulSoup
+ incompatibility.
+ """
+ # Check if mod_wsgi is running
+ # - see http://code.google.com/p/modwsgi/wiki/TipsAndTricks
+ try:
+ from mod_wsgi import version
+ self.soup = BeautifulSoup(self.source_string, "html5lib")
+ except:
+ self.soup = BeautifulSoup(self.source_string)
+
+ def _get_styles(self):
+ """Gets all CSS content from and removes all <link rel="stylesheet"> and
+ <style> tags concatenating into one CSS string which is then parsed with
+ cssutils and the resulting CSSStyleSheet object set to
+ `self.stylesheet`.
+ """
+ self._get_external_styles()
+ self._get_internal_styles()
+ for style_string in self.extra_style_strings:
+ self.style_string += style_string
+ cssparser = cssutils.CSSParser(log=self.log)
+ self.stylesheet = cssparser.parseString(self.style_string)
+
+ def _get_external_styles(self):
+ """Gets <link> element styles
+ """
+ if not self.style_string:
+ self.style_string = u''
+ else:
+ self.style_string += u'\n'
+
+ link_tags = self.soup.findAll('link', {'rel': 'stylesheet'})
+ for tag in link_tags:
+ url = tag['href']
+
+ # Convert the relative URL to an absolute URL ready to pass to urllib
+ base_url = self.relative_url or self.root_url
+ url = urlparse.urljoin(base_url, url)
+
+ self.style_string += self._get_url(url)
+ tag.extract()
+
+ def _get_internal_styles(self):
+ """Gets <style> element styles
+ """
+ if not self.style_string:
+ self.style_string = u''
+ else:
+ self.style_string += u'\n'
+
+ style_tags = self.soup.findAll('style')
+ for tag in style_tags:
+ self.style_string += u'\n'.join(tag.contents) + u'\n'
+ tag.extract()
+
+ def _get_specificity_from_list(self, lst):
+ """
+ Takes a sequence of ints and returns the integer formed by treating
+ them as base-10 digits, i.e. each entry is multiplied by a descending
+ power of 10 and the results are summed
+
+ (1, 0, 0, 1) => (1 * 10**3) + (0 * 10**2) + (0 * 10**1) + (1 * 10**0) => 1001
+ """
+ return int(''.join(map(str, lst)))
+
+ def _get_rule_specificity(self, rule):
+ """
+ For a given CSSRule get its selector specificity in base 10
+ """
+ return sum(map(self._get_specificity_from_list, (s.specificity for s in rule.selectorList)))
+
+ def _apply_styles(self):
+ """Steps through CSS rules and applies each to all the proper elements
+ as @style attributes prepending any current @style attributes.
+ """
+ rules = self.stylesheet.cssRules.rulesOfType(1)
+ elem_prop_map = {}
+ elem_style_map = {}
+
+ # build up a property list for every styled element
+ for rule in rules:
+ # select elements for every selector
+ selectors = rule.selectorText.split(',')
+ elements = []
+ for selector in selectors:
+ elements += select(self.soup, selector.strip())
+ # build prop_list for each selected element
+ for elem in elements:
+ if elem not in elem_prop_map:
+ elem_prop_map[elem] = []
+ elem_prop_map[elem].append({
+ 'specificity': self._get_rule_specificity(rule),
+ 'props': rule.style.getProperties(),
+ })
+
+ # build up another property list using selector specificity
+ for elem, props in elem_prop_map.items():
+ if elem not in elem_style_map:
+ elem_style_map[elem] = cssutils.css.CSSStyleDeclaration()
+ # ascending sort of prop_lists based on specificity
+ props = sorted(props, key=lambda p: p['specificity'])
+ # for each prop_list, apply to CSSStyleDeclaration
+ for prop_list in map(lambda obj: obj['props'], props):
+ for prop in prop_list:
+ elem_style_map[elem].removeProperty(prop.name)
+ elem_style_map[elem].setProperty(prop.name, prop.value)
+
+
+ # apply rules to elements
+ for elem, style_declaration in elem_style_map.items():
+ if elem.has_key('style'):
+ elem['style'] = u'%s; %s' % (style_declaration.cssText.replace('\n', ' '), elem['style'])
+ else:
+ elem['style'] = style_declaration.cssText.replace('\n', ' ')
+
+ def _get_output(self):
+ """Generate Unicode string of `self.soup` and set it to `self.output`
+
+ Returns self.output
+ """
+ self.output = unicode(self.soup)
+ return self.output
+
+ def _clean_output(self):
+ """Clean up after BeautifulSoup's output.
+ """
+ if self.allow_conditional_comments:
+ matches = re.finditer('(<!--\[if .+\].+?<!\[endif\]-->)', self.output)
+ for match in matches:
+ comment = match.group()
+ comment = comment.replace('&gt;', '>')
+ comment = comment.replace('&lt;', '<')
+ self.output = (self.output[:match.start()] + comment +
+ self.output[match.end():])
+
+
+def fromURL(url, log=None):
+ """Shortcut Pynliner constructor. Equivalent to:
+
+ >>> Pynliner().from_url(someURL).run()
+
+ Returns processed HTML string.
+ """
+ return Pynliner(log).from_url(url).run()
+
+def fromString(string, log=None):
+ """Shortcut Pynliner constructor. Equivalent to:
+
+ >>> Pynliner().from_string(someString).run()
+
+ Returns processed HTML string.
+ """
+ return Pynliner(log).from_string(string).run()
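
The docstrings above define the public API: build a Pynliner from a URL or string, optionally chain in extra CSS with with_cssString(), and call run() (or use the fromURL/fromString shortcuts). A minimal usage sketch, assuming Python 2 with BeautifulSoup 3.x and cssutils installed as the module requires; the exact serialization of colors and whitespace depends on the cssutils version:

    import pynliner

    html = '<style>h1 { color:#ffcc00; }</style><h1>Hello World!</h1>'

    # shortcut form, equivalent to Pynliner().from_string(html).run()
    print pynliner.fromString(html)
    # per the run() doctest above: u'<h1 style="color: #fc0">Hello World!</h1>'

    # chained form, mixing in CSS from another source before running
    extra_css = 'h1 { font-weight: bold; }'
    print pynliner.Pynliner().from_string(html).with_cssString(extra_css).run()
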
diff --git a/pynliner/soupselect.py b/pynliner/soupselect.py
new file mode 100644
index 0000000..70cfe26
--- /dev/null
+++ b/pynliner/soupselect.py
@@ -0,0 +1,229 @@
+"""
+# Included with pynliner since it isn't on PyPI #
+
+soupselect.py
+
+CSS selector support for BeautifulSoup.
+
+soup = BeautifulSoup('<html>...')
+select(soup, 'div')
+ - returns a list of div elements
+
+select(soup, 'div#main ul a')
+ - returns a list of links inside a ul inside div#main
+
+patched to support multiple class selectors here http://code.google.com/p/soupselect/issues/detail?id=4#c0
+"""
+import re
+import BeautifulSoup
+
+attribute_regex = re.compile('\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*)["\']?\]')
+pseudo_class_regex = re.compile(ur':(([^:.#(*\[]|\([^)]+\))+)')
+
+def get_attribute_checker(operator, attribute, value=''):
+ """
+ Takes an operator, attribute and optional value; returns a function that
+ will return True for elements that match that combination.
+ """
+ return {
+ '=': lambda el: el.get(attribute) == value,
+ # attribute includes value as one of a set of space separated tokens
+ '~': lambda el: value in el.get(attribute, '').split(),
+ # attribute starts with value
+ '^': lambda el: el.get(attribute, '').startswith(value),
+ # attribute ends with value
+ '$': lambda el: el.get(attribute, '').endswith(value),
+ # attribute contains value
+ '*': lambda el: value in el.get(attribute, ''),
+ # attribute is either exactly value or starts with value-
+ '|': lambda el: el.get(attribute, '') == value \
+ or el.get(attribute, '').startswith('%s-' % value),
+ }.get(operator, lambda el: el.has_key(attribute))
+
+def is_white_space(el):
+ if isinstance(el, BeautifulSoup.NavigableString) and str(el).strip() == '':
+ return True
+ if isinstance(el, BeautifulSoup.Comment):
+ return True
+ return False
+
+def is_last_content_node(el):
+ result = False
+ if el is None:
+ result = True
+ elif is_white_space(el):
+ result = is_last_content_node(el.nextSibling)
+ return result
+
+def is_first_content_node(el):
+ result = False
+ if el is None:
+ result = True
+ if is_white_space(el):
+ result = is_first_content_node(el.previousSibling)
+ return result
+
+def get_pseudo_class_checker(pseudo_class):
+ """
+ Takes a pseudo_class, like "first-child" or "last-child",
+ and returns a function that will check if the element satisfies
+ that pseudo class
+ """
+ return {
+ 'first-child': lambda el: is_first_content_node(getattr(el, 'previousSibling', None)),
+ 'last-child': lambda el: is_last_content_node(getattr(el, 'nextSibling', None))
+ }.get(pseudo_class, lambda el: False)
+
+def get_checker(functions):
+ def checker(el):
+ for func in functions:
+ if not func(el):
+ return False
+ return el
+ return checker
+
+
+def select(soup, selector):
+ """
+ soup should be a BeautifulSoup instance; selector is a CSS selector
+ specifying the elements you want to retrieve.
+ """
+ handle_token = True
+ current_context = [(soup, [])]
+ operator = None
+ while selector:
+ if handle_token:
+ # Get the rightmost token
+ handle_token = False
+ match = re.search('([_0-9a-zA-Z-#.:*"\'\[\\]=]+)$', selector)
+ if not match:
+ raise Exception("No match was found. We're done or something is broken")
+ token = match.groups(1)[0]
+
+ # remove this token from the selector
+ selector = selector.rsplit(token, 1)[0].rstrip()
+
+ checker_functions = []
+ #
+ # Get attribute selectors from token
+ #
+ matches = attribute_regex.findall(token)
+ for match in matches:
+ checker_functions.append(get_attribute_checker(match[1], match[0], match[2]))
+
+ #
+ # Get pseudo classes from token
+ #
+ for match in pseudo_class_regex.finditer(token):
+ checker_functions.append(get_pseudo_class_checker(match.groups(1)[0]))
+
+ checker = get_checker(checker_functions)
+ #
+ # Get tag
+ #
+ tag = re.findall('^([a-zA-Z0-9]+)', token)
+ if len(tag) == 0:
+ tag = True
+ elif len(tag) == 1:
+ tag = tag[0]
+ else:
+ raise Exception("Multiple tags found (invalid CSS)")
+
+ #
+ # Get ID
+ #
+ ids = re.findall('#([a-zA-Z0-9_-]+)', token)
+ if len(ids) > 1:
+ raise Exception("Only single # OK")
+ #
+ # Get classes
+ #
+ classes = re.findall('\.([a-zA-Z0-9_-]+)', token)
+
+ #
+ # Search contexts for matches
+ #
+ found = []
+ find_dict = {}
+ if ids:
+ find_dict['id'] = ids
+ if classes:
+ find_dict['class'] = lambda attr: attr and set(classes).issubset(attr.split())
+ if operator is None:
+ # This is the first token: simply find all matches
+ for context in current_context:
+ context_matches = [el for el in context[0].findAll(tag, find_dict) if checker(el)]
+ for context_match in context_matches:
+ found.append(
+ (context_match, [context_match]),
+ )
+ elif operator == ' ':
+ # for each context in current_context, ensure there
+ # exists an element somewhere above that element that
+ # matches the provided token
+ # ("descendant" selector)
+ for context in current_context:
+ context_matches = []
+ for el in context[1]:
+ if checker(el.findParent(tag, find_dict)):
+ context_matches.append(el)
+ if context_matches:
+ found.append(
+ (context[0], context_matches),
+ )
+ elif operator == '>':
+ # for each context in current_context,
+ # check if the parent satisfies the provided
+ # arguments.
+ for context in current_context:
+ context_matches = []
+ for el in context[1]:
+ if checker(el.findParent(tag, find_dict)) == el.parent:
+ context_matches.append(el.parent)
+ if context_matches:
+ found.append(
+ (context[0], context_matches),
+ )
+ elif operator == '~':
+ # for each context in current_context
+ # check
+ raise NotImplementedError("~ operator is not implemented. Sad face :(")
+ elif operator == '+':
+ # for each context in current_context
+ # check if the preceding sibling satisfies the
+ # provided arguments
+ for context in current_context:
+ context_matches = []
+ for el in context[1]:
+ if checker(el.findPreviousSibling(tag, find_dict)) == el.previousSibling:
+ context_matches.append(el.previousSibling)
+ if context_matches:
+ found.append(
+ (context[0], context_matches)
+ )
+ current_context = found
+ else:
+ # Get the next operator (whitespace, >, ~, +)
+ handle_token = True
+ operator = None
+ match = re.search('([>~+]+)$', selector)
+ if match:
+ operator = match.groups(1)[0]
+ else:
+ operator = ' '
+ selector = selector.rsplit(operator, 1)[0].rstrip()
+ return [entry[0] for entry in current_context]
+
+def monkeypatch(BeautifulSoupClass=None):
+ """
+ If you don't explicitly state the class to patch, defaults to the most
+ common import location for BeautifulSoup.
+ """
+ if not BeautifulSoupClass:
+ from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+ BeautifulSoupClass.findSelect = select
+
+def unmonkeypatch(BeautifulSoupClass=None):
+ if not BeautifulSoupClass:
+ from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+ delattr(BeautifulSoupClass, 'findSelect')
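
The module docstring above already shows the intended call pattern for select(); a slightly fuller sketch under the same assumptions (Python 2 with BeautifulSoup 3.x installed and pynliner importable):

    from BeautifulSoup import BeautifulSoup
    from pynliner.soupselect import select, monkeypatch

    soup = BeautifulSoup('<div id="main"><ul><li><a href="#">link</a></li></ul></div>')

    # tag selector: all div elements
    print select(soup, 'div')
    # descendant selector, as in the docstring: links inside a ul inside div#main
    print select(soup, 'div#main ul a')

    # optional: attach select() to BeautifulSoup as a findSelect() method
    monkeypatch()
    print soup.findSelect('div#main ul a')
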
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..861a9f5
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,5 @@
+[egg_info]
+tag_build =
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a34f7cc
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+install_requires = [
+ 'BeautifulSoup >=3.2.1,<4.0',
+ 'cssutils >=0.9.7',
+]
+
+tests_require = [
+ 'mock'
+] + install_requires
+
+setup(name='pynliner',
+ version='0.5.2',
+ description='Python CSS-to-inline-styles conversion tool for HTML using'
+ ' BeautifulSoup and cssutils',
+ author='Tanner Netterville',
+ author_email='tannern at gmail.com',
+ install_requires=install_requires,
+ tests_require=tests_require,
+ test_suite='tests',
+ packages=['pynliner'],
+ provides=['pynliner'])
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/pynliner.git