[Python-modules-commits] [pynliner] 01/08: Import pynliner_0.7.2.orig.tar.gz
Sandro Tosi
morph at moszumanska.debian.org
Mon Sep 26 16:35:58 UTC 2016
This is an automated email from the git hooks/post-receive script.
morph pushed a commit to branch master
in repository pynliner.
commit 1bb44b25c3a8fc63e6544cef67e34f1058ccc460
Author: Sandro Tosi <morph at debian.org>
Date: Mon Sep 26 15:33:46 2016 +0100
Import pynliner_0.7.2.orig.tar.gz
---
PKG-INFO | 16 +++-
pynliner.egg-info/PKG-INFO | 16 +++-
pynliner.egg-info/SOURCES.txt | 1 +
pynliner.egg-info/requires.txt | 4 +-
pynliner/__init__.py | 165 +++++++++++++++++++++++++----------------
pynliner/soupselect.py | 41 ++++++----
setup.cfg | 3 +
setup.py | 35 +++++----
8 files changed, 180 insertions(+), 101 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index d1772d7..be79e72 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,11 +1,21 @@
Metadata-Version: 1.1
Name: pynliner
-Version: 0.5.2
+Version: 0.7.2
Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
Home-page: UNKNOWN
Author: Tanner Netterville
Author-email: tannern at gmail.com
-License: UNKNOWN
+License: MIT
Description: UNKNOWN
Platform: UNKNOWN
-Provides: pynliner
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
diff --git a/pynliner.egg-info/PKG-INFO b/pynliner.egg-info/PKG-INFO
index d1772d7..be79e72 100644
--- a/pynliner.egg-info/PKG-INFO
+++ b/pynliner.egg-info/PKG-INFO
@@ -1,11 +1,21 @@
Metadata-Version: 1.1
Name: pynliner
-Version: 0.5.2
+Version: 0.7.2
Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils
Home-page: UNKNOWN
Author: Tanner Netterville
Author-email: tannern at gmail.com
-License: UNKNOWN
+License: MIT
Description: UNKNOWN
Platform: UNKNOWN
-Provides: pynliner
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
diff --git a/pynliner.egg-info/SOURCES.txt b/pynliner.egg-info/SOURCES.txt
index 6ca44b7..c442cbe 100644
--- a/pynliner.egg-info/SOURCES.txt
+++ b/pynliner.egg-info/SOURCES.txt
@@ -1,3 +1,4 @@
+setup.cfg
setup.py
pynliner/__init__.py
pynliner/soupselect.py
diff --git a/pynliner.egg-info/requires.txt b/pynliner.egg-info/requires.txt
index fe1d0f7..ac8a5f4 100644
--- a/pynliner.egg-info/requires.txt
+++ b/pynliner.egg-info/requires.txt
@@ -1,2 +1,2 @@
-BeautifulSoup >=3.2.1,<4.0
-cssutils >=0.9.7
\ No newline at end of file
+BeautifulSoup4 >= 4.4.1
+cssutils >=0.9.7
diff --git a/pynliner/__init__.py b/pynliner/__init__.py
index de346c9..76c552e 100644
--- a/pynliner/__init__.py
+++ b/pynliner/__init__.py
@@ -5,7 +5,7 @@
Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and
cssutils
-Copyright (c) 2011-2013 Tanner Netterville
+Copyright (c) 2011-2016 Tanner Netterville
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -30,14 +30,29 @@ THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
-__version__ = "0.5.2"
-
import re
-import urlparse
-import urllib2
+
import cssutils
-from BeautifulSoup import BeautifulSoup, Comment
-from soupselect import select
+from bs4 import BeautifulSoup
+
+from .soupselect import select
+
+try:
+ from urllib.parse import urljoin
+ from urllib.request import urlopen
+ unicode = str
+except ImportError:
+ from urlparse import urljoin
+ from urllib2 import urlopen
+
+__version__ = "0.7.2"
+
+
+# this pattern may be too aggressive
+HTML_ENTITY_PATTERN = re.compile(r'&(#([0-9]+|x[a-fA-F0-9]+)|[a-zA-Z][^\s;]+);')
+
+SUBSTITUTION_FORMAT = '[pynlinerSubstitute:{0}]'
+SUBSTITUTION_PATTERN = re.compile(r'\[pynlinerSubstitute:(\d+)\]')
class Pynliner(object):
@@ -48,13 +63,16 @@ class Pynliner(object):
stylesheet = False
output = False
- def __init__(self, log=None, allow_conditional_comments=False):
+ def __init__(self, log=None, allow_conditional_comments=False,
+ preserve_entities=True):
self.log = log
cssutils.log.enabled = False if log is None else True
self.extra_style_strings = []
self.allow_conditional_comments = allow_conditional_comments
+ self.preserve_entities = preserve_entities
self.root_url = None
self.relative_url = None
+ self._substitutions = None
def from_url(self, url):
"""Gets remote HTML page for conversion
@@ -111,19 +129,52 @@ class Pynliner(object):
>>> Pynliner().from_string(html).run()
u'<h1 style="color: #fc0">Hello World!</h1>'
"""
+ self._substitutions = []
+ if self.preserve_entities:
+ self._substitute_entities()
if not self.soup:
self._get_soup()
if not self.stylesheet:
self._get_styles()
self._apply_styles()
+ self._insert_media_rules()
self._get_output()
- self._clean_output()
+ self._unsubstitute_output()
return self.output
+ def _store_substitute(self, value):
+ """
+ store a string and return it's substitute
+ """
+ index = len(self._substitutions)
+ self._substitutions.append(value)
+ return SUBSTITUTION_FORMAT.format(index)
+
def _get_url(self, url):
"""Returns the response content from the given url
"""
- return urllib2.urlopen(url).read()
+ return urlopen(url).read()
+
+ def _substitute_entities(self):
+ """
+ Add HTML entities to the substitutions list and replace with
+ placeholders in HTML source
+ """
+ self.source_string = re.sub(
+ HTML_ENTITY_PATTERN,
+ lambda m: self._store_substitute(m.group(0)),
+ self.source_string
+ )
+
+ def _unsubstitute_output(self):
+ """
+ Put substitutions back into the output
+ """
+ self.output = re.sub(
+ SUBSTITUTION_PATTERN,
+ lambda m: self._substitutions[int(m.group(1))],
+ self.output
+ )
def _get_soup(self):
"""Convert source string to BeautifulSoup object. Sets it to self.soup.
@@ -136,8 +187,8 @@ class Pynliner(object):
try:
from mod_wsgi import version
self.soup = BeautifulSoup(self.source_string, "html5lib")
- except:
- self.soup = BeautifulSoup(self.source_string)
+ except ImportError:
+ self.soup = BeautifulSoup(self.source_string, "html.parser")
def _get_styles(self):
"""Gets all CSS content from and removes all <link rel="stylesheet"> and
@@ -166,7 +217,7 @@ class Pynliner(object):
# Convert the relative URL to an absolute URL ready to pass to urllib
base_url = self.relative_url or self.root_url
- url = urlparse.urljoin(base_url, url)
+ url = urljoin(base_url, url)
self.style_string += self._get_url(url)
tag.extract()
@@ -184,20 +235,19 @@ class Pynliner(object):
self.style_string += u'\n'.join(tag.contents) + u'\n'
tag.extract()
- def _get_specificity_from_list(self, lst):
+ def _insert_media_rules(self):
+ """If there are any media rules, re-insert a style tag at the top and
+ dump them all in.
"""
- Takes an array of ints and returns an integer formed
- by adding all ints multiplied by the power of 10 of the current index
-
- (1, 0, 0, 1) => (1 * 10**3) + (0 * 10**2) + (0 * 10**1) + (1 * 10**0) => 1001
- """
- return int(''.join(map(str, lst)))
-
- def _get_rule_specificity(self, rule):
- """
- For a given CSSRule get its selector specificity in base 10
- """
- return sum(map(self._get_specificity_from_list, (s.specificity for s in rule.selectorList)))
+ rules = list(self.stylesheet.cssRules.rulesOfType(cssutils.css.CSSRule.MEDIA_RULE))
+ if rules:
+ style = BeautifulSoup(
+ "<style>" + "\n".join(re.sub(r'\s+', ' ', x.cssText) for x in rules) +
+ "</style>",
+ "html.parser"
+ )
+ target = self.soup.body or self.soup
+ target.insert(0, style)
def _apply_styles(self):
"""Steps through CSS rules and applies each to all the proper elements
@@ -206,43 +256,39 @@ class Pynliner(object):
rules = self.stylesheet.cssRules.rulesOfType(1)
elem_prop_map = {}
elem_style_map = {}
-
# build up a property list for every styled element
for rule in rules:
- # select elements for every selector
- selectors = rule.selectorText.split(',')
- elements = []
- for selector in selectors:
- elements += select(self.soup, selector.strip())
- # build prop_list for each selected element
- for elem in elements:
- if elem not in elem_prop_map:
- elem_prop_map[elem] = []
- elem_prop_map[elem].append({
- 'specificity': self._get_rule_specificity(rule),
- 'props': rule.style.getProperties(),
- })
+ for selector in rule.selectorList:
+ for element in select(self.soup, selector.selectorText):
+ element_tuple = (element, id(element))
+ if element_tuple not in elem_prop_map:
+ elem_prop_map[element_tuple] = []
+ elem_prop_map[element_tuple].append({
+ 'specificity': selector.specificity,
+ 'props': rule.style.getProperties(),
+ })
# build up another property list using selector specificity
- for elem, props in elem_prop_map.items():
- if elem not in elem_style_map:
- elem_style_map[elem] = cssutils.css.CSSStyleDeclaration()
+ for elem_tuple, props in elem_prop_map.items():
+ elem, elem_id = elem_tuple
+ if elem_tuple not in elem_style_map:
+ elem_style_map[elem_tuple] = cssutils.css.CSSStyleDeclaration()
# ascending sort of prop_lists based on specificity
props = sorted(props, key=lambda p: p['specificity'])
# for each prop_list, apply to CSSStyleDeclaration
for prop_list in map(lambda obj: obj['props'], props):
for prop in prop_list:
- elem_style_map[elem].removeProperty(prop.name)
- elem_style_map[elem].setProperty(prop.name, prop.value)
-
+ elem_style_map[elem_tuple].removeProperty(prop.name)
+ elem_style_map[elem_tuple].setProperty(prop.name, prop.value)
# apply rules to elements
- for elem, style_declaration in elem_style_map.items():
- if elem.has_key('style'):
+ for elem_tuple, style_declaration in elem_style_map.items():
+ elem, elem_id = elem_tuple
+ if elem.has_attr('style'):
elem['style'] = u'%s; %s' % (style_declaration.cssText.replace('\n', ' '), elem['style'])
else:
elem['style'] = style_declaration.cssText.replace('\n', ' ')
-
+
def _get_output(self):
"""Generate Unicode string of `self.soup` and set it to `self.output`
@@ -250,34 +296,23 @@ class Pynliner(object):
"""
self.output = unicode(self.soup)
return self.output
-
- def _clean_output(self):
- """Clean up after BeautifulSoup's output.
- """
- if self.allow_conditional_comments:
- matches = re.finditer('(<!--\[if .+\].+?&lt;!\[endif\]-->)', self.output)
- for match in matches:
- comment = match.group()
- comment = comment.replace('&gt;', '>')
- comment = comment.replace('&lt;', '<')
- self.output = (self.output[:match.start()] + comment +
- self.output[match.end():])
-def fromURL(url, log=None):
+def fromURL(url, **kwargs):
"""Shortcut Pynliner constructor. Equivalent to:
>>> Pynliner().from_url(someURL).run()
Returns processed HTML string.
"""
- return Pynliner(log).from_url(url).run()
+ return Pynliner(**kwargs).from_url(url).run()
+
-def fromString(string, log=None):
+def fromString(string, **kwargs):
"""Shortcut Pynliner constructor. Equivalent to:
>>> Pynliner().from_string(someString).run()
Returns processed HTML string.
"""
- return Pynliner(log).from_string(string).run()
+ return Pynliner(**kwargs).from_string(string).run()
diff --git a/pynliner/soupselect.py b/pynliner/soupselect.py
index 70cfe26..4a081e2 100644
--- a/pynliner/soupselect.py
+++ b/pynliner/soupselect.py
@@ -15,10 +15,15 @@ select(soup, 'div#main ul a')
patched to support multiple class selectors here http://code.google.com/p/soupselect/issues/detail?id=4#c0
"""
import re
-import BeautifulSoup
+import operator as operator_
+from functools import partial
+
+import bs4
+
+ATTRIBUTE_PATTERN = re.compile(r'\[(?P<attribute>[^\s\]=~\|\^\$\*]+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*)["\']?\]')
+PSEUDO_CLASS_PATTERN = re.compile(u':(([^:.#(*\\[]|\\([^)]+\\))+)')
+SELECTOR_TOKEN_PATTERN = re.compile(r'([_0-9a-zA-Z-#.:*]+|\[[^\]]+\])$')
-attribute_regex = re.compile('\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*)["\']?\]')
-pseudo_class_regex = re.compile(ur':(([^:.#(*\[]|\([^)]+\))+)')
def get_attribute_checker(operator, attribute, value=''):
"""
@@ -38,15 +43,17 @@ def get_attribute_checker(operator, attribute, value=''):
# attribute is either exactly value or starts with value-
'|': lambda el: el.get(attribute, '') == value \
or el.get(attribute, '').startswith('%s-' % value),
- }.get(operator, lambda el: el.has_key(attribute))
+ }.get(operator, lambda el: el.has_attr(attribute))
+
def is_white_space(el):
- if isinstance(el, BeautifulSoup.NavigableString) and str(el).strip() == '':
+ if isinstance(el, bs4.NavigableString) and str(el).strip() == '':
return True
- if isinstance(el, BeautifulSoup.Comment):
+ if isinstance(el, bs4.Comment):
return True
return False
+
def is_last_content_node(el):
result = False
if el is None:
@@ -55,6 +62,7 @@ def is_last_content_node(el):
result = is_last_content_node(el.nextSibling)
return result
+
def is_first_content_node(el):
result = False
if el is None:
@@ -63,6 +71,7 @@ def is_first_content_node(el):
result = is_first_content_node(el.previousSibling)
return result
+
def get_pseudo_class_checker(psuedo_class):
"""
Takes a psuedo_class, like "first-child" or "last-child"
@@ -74,6 +83,7 @@ def get_pseudo_class_checker(psuedo_class):
'last-child': lambda el: is_last_content_node(getattr(el, 'nextSibling', None))
}.get(psuedo_class, lambda el: False)
+
def get_checker(functions):
def checker(el):
for func in functions:
@@ -95,7 +105,7 @@ def select(soup, selector):
if handle_token:
# Get the rightmost token
handle_token = False
- match = re.search('([_0-9a-zA-Z-#.:*"\'\[\\]=]+)$', selector)
+ match = SELECTOR_TOKEN_PATTERN.search(selector)
if not match:
raise Exception("No match was found. We're done or something is broken")
token = match.groups(1)[0]
@@ -107,14 +117,14 @@ def select(soup, selector):
#
# Get attribute selectors from token
#
- matches = attribute_regex.findall(token)
+ matches = ATTRIBUTE_PATTERN.findall(token)
for match in matches:
checker_functions.append(get_attribute_checker(match[1], match[0], match[2]))
#
# Get pseudo classes from token
#
- for match in pseudo_class_regex.finditer(token):
+ for match in PSEUDO_CLASS_PATTERN.finditer(token):
checker_functions.append(get_pseudo_class_checker(match.groups(1)[0]))
checker = get_checker(checker_functions)
@@ -148,11 +158,11 @@ def select(soup, selector):
if ids:
find_dict['id'] = ids
if classes:
- find_dict['class'] = lambda attr: attr and set(classes).issubset(attr.split())
+ find_dict['class'] = partial(operator_.contains, classes)
if operator is None:
# This is the first token: simply find all matches
for context in current_context:
- context_matches = [el for el in context[0].findAll(tag, find_dict) if checker(el)]
+ context_matches = [el for el in context[0].find_all(tag, find_dict) if checker(el)]
for context_match in context_matches:
found.append(
(context_match, [context_match]),
@@ -205,25 +215,26 @@ def select(soup, selector):
else:
# Get the next operator (whitespace, >, ~, +)
handle_token = True
- operator = None
match = re.search('([>~+]+)$', selector)
if match:
operator = match.groups(1)[0]
+ selector = selector.rsplit(operator, 1)[0].rstrip()
else:
operator = ' '
- selector = selector.rsplit(operator, 1)[0].rstrip()
return [entry[0] for entry in current_context]
+
def monkeypatch(BeautifulSoupClass=None):
"""
If you don't explicitly state the class to patch, defaults to the most
common import location for BeautifulSoup.
"""
if not BeautifulSoupClass:
- from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+ from bs4 import BeautifulSoup as BeautifulSoupClass
BeautifulSoupClass.findSelect = select
+
def unmonkeypatch(BeautifulSoupClass=None):
if not BeautifulSoupClass:
- from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+ from bs4 import BeautifulSoup as BeautifulSoupClass
delattr(BeautifulSoupClass, 'findSelect')
diff --git a/setup.cfg b/setup.cfg
index 861a9f5..6f08d0e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,6 @@
+[bdist_wheel]
+universal = 1
+
[egg_info]
tag_build =
tag_date = 0
diff --git a/setup.py b/setup.py
index a34f7cc..711aeed 100644
--- a/setup.py
+++ b/setup.py
@@ -3,23 +3,32 @@
from setuptools import setup
-install_requires = [
- 'BeautifulSoup >=3.2.1,<4.0',
- 'cssutils >=0.9.7',
-]
-
-tests_require = [
- 'mock'
-] + install_requires
-
setup(name='pynliner',
- version='0.5.2',
+ version='0.7.2',
description='Python CSS-to-inline-styles conversion tool for HTML using'
' BeautifulSoup and cssutils',
author='Tanner Netterville',
author_email='tannern at gmail.com',
- install_requires=install_requires,
- tests_require=tests_require,
+ install_requires=[
+ 'BeautifulSoup4 >= 4.4.1',
+ 'cssutils >=0.9.7',
+ ],
+ tests_require=[
+ 'mock'
+ ],
test_suite='tests',
packages=['pynliner'],
- provides=['pynliner'])
+ license='MIT',
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'License :: OSI Approved :: MIT License',
+ 'Topic :: Text Processing :: Markup :: HTML',
+ 'Programming Language :: Python',
+ 'Programming Language :: Python :: 2.6',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.3',
+ 'Programming Language :: Python :: 3.4',
+ 'Programming Language :: Python :: 3.5'
+ ])
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/pynliner.git
More information about the Python-modules-commits
mailing list