[Python-modules-commits] [python-pyld] 03/04: Implement URL parsing/unparsing per RFC 3986.
Wolfgang Borgert
debacle at moszumanska.debian.org
Tue Oct 13 22:56:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
debacle pushed a commit to tag 0.6.5
in repository python-pyld.
commit fd7ad30c8e8d3a238f60e0355a7294943ebe4f38
Author: Dave Longley <dlongley at digitalbazaar.com>
Date: Tue Dec 2 17:34:14 2014 -0500
Implement URL parsing/unparsing per RFC 3986.
- Section 5.3 Component Recomposition in RFC 3986 makes a
differentiation between undefined components and empty
components that the built-in urlparse in python does not. This
patch deals with that issue and preserves empty queries and
fragments (despite them being semantically equivalent).
---
lib/pyld/jsonld.py | 116 +++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 85 insertions(+), 31 deletions(-)
diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py
index d6cfe93..9fa489c 100644
--- a/lib/pyld/jsonld.py
+++ b/lib/pyld/jsonld.py
@@ -36,7 +36,7 @@ import ssl
import string
import sys
import traceback
-from collections import deque
+from collections import deque, namedtuple
from contextlib import closing
from numbers import Integral, Real
@@ -437,40 +437,67 @@ def prepend_base(base, iri):
return iri
# parse IRIs
- base = urllib_parse.urlsplit(base)
- rel = urllib_parse.urlsplit(iri)
-
- # IRI represents an absolute path
- if rel.path.startswith('/'):
- path = rel.path
+ base = parse_url(base)
+ rel = parse_url(iri)
+
+ # per RFC3986 5.2.2
+ transform = {
+ 'scheme': base.scheme
+ };
+
+ if rel.authority is not None:
+ transform['authority'] = rel.authority
+ transform['path'] = rel.path
+ transform['query'] = rel.query
else:
- path = base.path
+ transform['authority'] = base.authority
- # append relative path to the end of the last directory from base
- if rel.path != '':
- path = path[0:path.rfind('/') + 1]
- if len(path) > 0 and not path.endswith('/'):
- path += '/'
- path += rel.path
+ if rel.path == '':
+ transform['path'] = base.path
+ if rel.query != None:
+ transform['query'] = rel.query
+ else:
+ transform['query'] = base.query
+ else:
+ if rel.path.startswith('/'):
+ # IRI represents an absolute path
+ transform['path'] = rel.path
+ else:
+ # merge paths
+ path = base.path
- add_slash = path.endswith('/')
+ # append relative path to the end of the last directory from base
+ if rel.path != '':
+ path = path[0:path.rfind('/') + 1]
+ if len(path) > 0 and not path.endswith('/'):
+ path += '/'
+ path += rel.path
+
+ transform['path'] = path
+
+ transform['query'] = rel.query
# normalize path
+ path = transform['path']
+ add_slash = path.endswith('/')
path = posixpath.normpath(path)
if not path.endswith('/') and add_slash:
path += '/'
-
- # do not include '.' path for fragments
- if path == '.' and rel.fragment != '':
+ # do not include '.' path
+ if path == '.':
path = ''
+ transform['path'] = path
- return urllib_parse.urlunsplit((
- base.scheme,
- rel.netloc or base.netloc,
- path,
- rel.query,
- rel.fragment
- ))
+ transform['fragment'] = rel.fragment
+
+ # construct URL
+ rval = unparse_url(transform)
+
+ # handle empty base case
+ if rval == '':
+ rval = './'
+
+ return rval
def remove_base(base, iri):
@@ -486,11 +513,11 @@ def remove_base(base, iri):
if base is None:
return iri
- base = urllib_parse.urlsplit(base)
- rel = urllib_parse.urlsplit(iri)
+ base = parse_url(base)
+ rel = parse_url(iri)
- # schemes and network locations don't match, don't alter IRI
- if not (base.scheme == rel.scheme and base.netloc == rel.netloc):
+ # schemes and network locations (authorities) don't match, don't alter IRI
+ if not (base.scheme == rel.scheme and base.authority == rel.authority):
return iri
path = posixpath.relpath(rel.path, base.path) if rel.path else ''
@@ -513,8 +540,35 @@ def remove_base(base, iri):
elif path.startswith('.'):
path = path[1:]
- return urllib_parse.urlunsplit((
- '', '', path, rel.query, rel.fragment)) or './'
+ return unparse_url((None, None, path, rel.query, rel.fragment)) or './'
+
+
+ParsedUrl = namedtuple(
+ 'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])
+
+def parse_url(url):
+ # regex from RFC 3986
+ p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
+ m = re.match(p, url)
+ return ParsedUrl(*m.groups())
+
+
+def unparse_url(parsed):
+ if isinstance(parsed, dict):
+ parsed = ParsedUrl(**parsed)
+ elif isinstance(parsed, list) or isinstance(parsed, tuple):
+ parsed = ParsedUrl(*parsed)
+ rval = ''
+ if parsed.scheme:
+ rval += parsed.scheme + ':'
+ if parsed.authority is not None:
+ rval += '//' + parsed.authority
+ rval += parsed.path
+ if parsed.query is not None:
+ rval += '?' + parsed.query
+ if parsed.fragment is not None:
+ rval += '#' + parsed.fragment
+ return rval
# The default JSON-LD document loader.
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-pyld.git
More information about the Python-modules-commits
mailing list