[Python-modules-commits] [python-pyld] 85/276: Add URL resolution support.
Wolfgang Borgert
debacle at moszumanska.debian.org
Wed Oct 8 23:47:56 UTC 2014
This is an automated email from the git hooks/post-receive script.
debacle pushed a commit to branch master
in repository python-pyld.
commit 39ef982253186b033bd3b9062a89d02f36d2cbdc
Author: Dave Longley <dlongley at digitalbazaar.com>
Date: Wed May 9 16:06:56 2012 -0400
Add URL resolution support.
---
lib/pyld/jsonld.py | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++---
tests/runtests.py | 16 ++-
2 files changed, 279 insertions(+), 20 deletions(-)
diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py
index 09e7076..5a4bf88 100644
--- a/lib/pyld/jsonld.py
+++ b/lib/pyld/jsonld.py
@@ -16,11 +16,14 @@ __copyright__ = 'Copyright (c) 2011-2012 Digital Bazaar, Inc.'
__license__ = 'New BSD license'
__all__ = ['compact', 'expand', 'frame', 'normalize', 'from_rdf', 'to_rdf',
- 'JsonLdProcessor']
+ 'set_url_resolver', 'resolve_url', 'JsonLdProcessor', 'ContextCache']
-import copy, hashlib, re
+import copy, hashlib, json, os, re, string, sys, time, traceback
+import urllib2, urlparse
+from contextlib import closing
from functools import cmp_to_key
from numbers import Integral, Real
+from httplib import HTTPSConnection
# XSD constants
XSD_BOOLEAN = 'http://www.w3.org/2001/XMLSchema#boolean'
@@ -50,6 +53,9 @@ KEYWORDS = [
'@type',
'@value']
+# Restraints
+MAX_CONTEXT_URLS = 10
+
def compact(input, ctx, options=None):
"""
@@ -143,6 +149,48 @@ def to_rdf(input, options=None):
return JsonLdProcessor().to_rdf(input, options)
+def set_url_resolver(resolver):
+ """
+ Sets the default JSON-LD URL resolver.
+
+ :param resolver(url): the URL resolver to use.
+ """
+ _jsonld_default_url_resolver = resolver
+
+
+def resolve_url(url):
+ """
+ Retrieves JSON-LD as the given URL.
+
+ :param url: the URL to resolve.
+
+ :return: the JSON-LD.
+ """
+ global _jsonld_default_url_resolver
+ global _jsonld_context_cache
+ if (_jsonld_default_url_resolver is None or
+ _jsonld_default_url_resolver == resolve_url):
+ # create context cache as needed
+ if _jsonld_context_cache is None:
+ _jsonld_context_cache = ContextCache()
+
+ # default JSON-LD GET implementation
+ ctx = _jsonld_context_cache.get(url)
+ if ctx is None:
+ https_handler = VerifiedHTTPSHandler()
+ url_opener = urllib2.build_opener(https_handler)
+ with closing(url_opener.open(url)) as handle:
+ ctx = handle.read()
+ _jsonld_context_cache.set(url, ctx)
+ return ctx
+ return _jsonld_default_url_resolver(url)
+
+
+# The default JSON-LD URL resolver and cache.
+_jsonld_default_url_resolver = resolve_url
+_jsonld_context_cache = None
+
+
class JsonLdProcessor:
"""
A JSON-LD processor.
@@ -180,6 +228,7 @@ class JsonLdProcessor:
options.setdefault('optimize', False)
options.setdefault('graph', False)
options.setdefault('activeCtx', False)
+ options.setdefault('resolver', _jsonld_default_url_resolver)
# expand input
try:
@@ -263,10 +312,16 @@ class JsonLdProcessor:
# set default options
options = options or {}
options.setdefault('base', '')
+ options.setdefault('resolver', _jsonld_default_url_resolver)
# resolve all @context URLs in the input
input = copy.deepcopy(input)
- #self._resolveUrls(input, options['resolver'])
+ try:
+ self._resolve_context_urls(input, {}, options['resolver'])
+ except Exception as cause:
+ raise JsonLdError(
+ 'Could not perform JSON-LD expansion.',
+ 'jsonld.ExpandError', None, cause)
# do expansion
ctx = self._get_initial_context()
@@ -302,6 +357,7 @@ class JsonLdProcessor:
options.setdefault('explicit', False)
options.setdefault('omitDefault', False)
options.setdefault('optimize', False)
+ options.setdefault('resolver', _jsonld_default_url_resolver)
# preserve frame context
ctx = frame.get('@context', {})
@@ -358,6 +414,7 @@ class JsonLdProcessor:
# set default options
options = options or {}
options.setdefault('base', '')
+ options.setdefault('resolver', _jsonld_default_url_resolver)
try:
# expand input then do normalization
@@ -408,6 +465,7 @@ class JsonLdProcessor:
# set default options
options = options or {}
options.setdefault('base', '')
+ options.setdefault('resolver', _jsonld_default_url_resolver)
try:
# expand input
@@ -455,13 +513,18 @@ class JsonLdProcessor:
# set default options
options = options or {}
options.setdefault('base', '')
+ options.setdefault('resolver', _jsonld_default_url_resolver)
# resolve URLs in local_ctx
- local_ctx = copy.deepcopy(local_ctx)
- if _is_object(local_ctx) and '@context' not in local_ctx:
- local_ctx = {'@context': local_ctx}
- #ctx = self._resolveUrls(local_ctx, options['resolver'])
- ctx = local_ctx
+ ctx = copy.deepcopy(local_ctx)
+ if _is_object(ctx) and '@context' not in ctx:
+ ctx = {'@context': ctx}
+ try:
+ self._resolve_context_urls(ctx, {}, options['resolver'])
+ except Exception as cause:
+ raise JsonLdError(
+ 'Could not process JSON-LD context.',
+ 'jsonld.ContextError', None, cause)
# process context
return self._process_context(active_ctx, ctx, options)
@@ -710,7 +773,7 @@ class JsonLdProcessor:
# element is a @value
if _is_value(element):
# if @value is the only key, return its value
- if len(element.keys()) == 1:
+ if len(element) == 1:
return element['@value']
# get type and language context rules
@@ -1208,7 +1271,7 @@ class JsonLdProcessor:
if 'first' not in entry:
raise JsonLdError(
'Invalid RDF list entry.',
- 'jsonld.RdfError', {bnode: rest})
+ 'jsonld.RdfError', {'bnode': rest})
list_.append(entry['first'])
# build default graph in subject @id order
@@ -2486,6 +2549,134 @@ class JsonLdProcessor:
# prepend base to term
return self._prepend_base(base, term)
+ def _find_context_urls(self, input, urls, replace):
+ """
+ Finds all @context URLs in the given JSON-LD input.
+
+ :param input: the JSON-LD input.
+ :param urls: a map of URLs (url => false/@contexts).
+ :param replace: true to replace the URLs in the given input with
+ the @contexts from the urls map, false not to.
+ """
+ count = len(urls)
+ if _is_array(input):
+ for e in input:
+ self._find_context_urls(e, urls, replace)
+ elif _is_object(input):
+ for k, v in input.items():
+ if k != '@context':
+ self._find_context_urls(v, urls, replace)
+ continue
+
+ # array @context
+ if _is_array(v):
+ length = len(v)
+ i = 0
+ while i < length:
+ if _is_string(v[i]):
+ url = v[i]
+ # replace w/@context if requested
+ if replace:
+ ctx = urls[url]
+ if _is_array(ctx):
+ # add flattened context
+ v.pop(i)
+ for e in reversed(ctx):
+ v.insert(i, e)
+ i += len(ctx)
+ length += len(ctx)
+ else:
+ v[i] = ctx
+ # @context URL found
+ elif url not in urls:
+ urls[url] = False
+ i += 1
+ # string @context
+ elif _is_string(v):
+ # replace w/@context if requested
+ if replace:
+ input[k] = urls[v]
+ # @context URL found
+ elif v not in urls:
+ urls[v] = False
+
+ def _resolve_context_urls(self, input, cycles, resolver):
+ """
+ Resolves external @context URLs using the given URL resolver. Each
+ instance of @context in the input that refers to a URL will be
+ replaced with the JSON @context found at that URL.
+
+ :param input: the JSON-LD input with possible contexts.
+ :param cycles: an object for tracking context cycles.
+ :param resolver(url): the URL resolver.
+
+ :return: the result.
+ """
+ if len(cycles) > MAX_CONTEXT_URLS:
+ raise JsonLdError(
+ 'Maximum number of @context URLs exceeded.',
+ 'jsonld.ContextUrlError', {'max': MAX_CONTEXT_URLS})
+
+ # for tracking URLs to resolve
+ urls = {}
+
+ # find all URLs in the given input
+ self._find_context_urls(input, urls, False)
+
+ # queue all unresolved URLs
+ queue = []
+ for url, ctx in urls.items():
+ if ctx == False:
+ # validate URL
+ pieces = urlparse.urlparse(url)
+ if (not all([pieces.scheme, pieces.netloc]) or
+ pieces.scheme not in ['http', 'https'] or
+ set(pieces.netloc) > set(
+ string.letters + string.digits + '-.:')):
+ raise JsonLdError(
+ 'Malformed or unsupported URL.',
+ 'jsonld.InvalidUrl', {'url': url})
+ queue.append(url)
+
+ # resolve URLs in queue
+ for url in queue:
+ # check for context URL cycle
+ if url in cycles:
+ raise JsonLdError(
+ 'Cyclical @context URLs detected.',
+ 'jsonld.ContextUrlError', {'url': url})
+ _cycles = copy.deepcopy(cycles)
+ _cycles[url] = True
+
+ # resolve URL
+ ctx = resolver(url)
+
+ # parse string context as JSON
+ if _is_string(ctx):
+ try:
+ ctx = json.loads(ctx)
+ except Exception as cause:
+ raise JsonLdError(
+ 'Could not parse JSON from URL.',
+ 'jsonld.ParseError', {'url': url}, cause)
+
+ # ensure ctx is an object
+ if not _is_object(ctx):
+ raise JsonLdError(
+ 'URL does not resolve to a valid JSON-LD context.',
+ 'jsonld.InvalidUrl', {'url': url})
+
+ # use empty context if no @context key is present
+ if '@context' not in ctx:
+ ctx = {'@context': {}}
+
+ # recurse
+ self._resolve_context_urls(ctx, cycles, resolver)
+ urls[url] = ctx['@context']
+
+ # replace all URLs in the input
+ self._find_context_urls(input, urls, True)
+
def _prepend_base(self, base, iri):
"""
Prepends a base IRI to the given relative IRI.
@@ -2559,7 +2750,7 @@ class JsonLdProcessor:
if match is None:
raise JsonLdError(
'Error while parsing N-Quads invalid quad.',
- 'jsonld.ParseError', {line: lineNumber})
+ 'jsonld.ParseError', {'line': lineNumber})
match = match.groups()
# create RDF statement
@@ -2669,6 +2860,7 @@ class JsonLdError(Exception):
self.type = type
self.details = details
self.cause = cause
+ self.causeTrace = traceback.extract_tb(*sys.exc_info()[2:])
def __str__(self):
rval = repr(self.message)
@@ -2677,6 +2869,7 @@ class JsonLdError(Exception):
rval += '\nDetails: ' + repr(self.details)
if self.cause:
rval += '\nCause: ' + str(self.cause)
+ rval += ''.join(traceback.format_list(self.causeTrace))
return rval
@@ -3028,3 +3221,71 @@ def _get_adjacent_bnode_name(node, id):
if node['interfaceName'] == 'BlankNode' and node['nominalValue'] != id:
return node['nominalValue']
return None
+
+
+class ContextCache:
+ """
+ A simple JSON-LD context cache.
+ """
+
+ def __init__(self, size=50):
+ self.order = []
+ self.cache = {}
+ self.size = size
+ self.expires = 30 * 60 * 1000
+
+ def get(self, url):
+ if url in self.cache:
+ entry = self.cache[url]
+ if entry['expires'] >= time.time():
+ return entry['ctx']
+ del self.cache[url]
+ self.order.remove(url)
+ return None
+
+ def set(self, url, ctx):
+ if(len(self.order) == self.size):
+ del self.cache[self.order.pop(0)]
+ self.order.append(url)
+ self.cache[url] = {
+ 'ctx': ctx, 'expires': (time.time() + self.expires)}
+
+
+class VerifiedHTTPSConnection(HTTPSConnection):
+ """
+ Used to verify SSL certificates when resolving URLs.
+ Taken from: http://thejosephturner.com/blog/2011/03/19/https-certificate-verification-in-python-with-urllib2/
+ """
+
+ def connect(self):
+ global _trust_root_certificates
+ # overrides the version in httplib to do certificate verification
+ sock = socket.create_connection((self.host, self.port), self.timeout)
+ if self._tunnel_host:
+ self.sock = sock
+ self._tunnel()
+ # wrap the socket using verification with trusted_root_certs
+ self.sock = ssl.wrap_socket(sock,
+ self.key_file,
+ self.cert_file,
+ cert_reqs=ssl.CERT_REQUIRED,
+ ca_certs=_trust_root_certificates)
+
+
+class VerifiedHTTPSHandler(urllib2.HTTPSHandler):
+ """
+ Wraps urllib2 HTTPS connections enabling SSL certificate verification.
+ """
+
+ def __init__(self, connection_class=VerifiedHTTPSConnection):
+ self.specialized_conn_class = connection_class
+ urllib2.HTTPSHandler.__init__(self)
+
+ def https_open(self, req):
+ return self.do_open(self.specialized_conn_class, req)
+
+
+# the path to the system's default trusted root SSL certificates
+_trust_root_certificates = None
+if os.path.exists('/etc/ssl/certs'):
+ _trust_root_certificates = '/etc/ssl/certs'
diff --git a/tests/runtests.py b/tests/runtests.py
index a6cc446..6430b41 100644
--- a/tests/runtests.py
+++ b/tests/runtests.py
@@ -37,7 +37,6 @@ class TestRunner:
# command line options
self.options = {}
self.parser = OptionParser()
- self.test_dir = None
self.manifest_files = []
def main(self):
@@ -66,7 +65,6 @@ class TestRunner:
os.path.isfile(self.options.file)):
# add manifest file to the file list
self.manifest_files.append(os.path.abspath(self.options.file))
- self.test_dir = os.path.dirname(self.options.file)
else:
raise Exception('Invalid test file: "%s"' % self.options.file)
@@ -75,14 +73,13 @@ class TestRunner:
if (os.path.exists(self.options.directory) and
os.path.isdir(self.options.directory)):
# load manifest files from test directory
- for self.test_dir, dirs, files in os.walk(
- self.options.directory):
+ for test_dir, dirs, files in os.walk(self.options.directory):
for manifest in files:
# add all .jsonld manifest files to the file list
if (manifest.find('manifest') != -1 and
manifest.endswith('.jsonld')):
self.manifest_files.append(
- join(self.test_dir, manifest))
+ join(test_dir, manifest))
else:
raise Exception('Invalid test directory: "%s"' %
self.options.directory)
@@ -97,6 +94,7 @@ class TestRunner:
# run the tests from each manifest file
for manifest_file in self.manifest_files:
+ test_dir = os.path.dirname(manifest_file)
manifest = json.load(open(manifest_file, 'r'))
count = 1
@@ -119,13 +117,13 @@ class TestRunner:
count += 1
# read input file
- with open(join(self.test_dir, test['input'])) as f:
+ with open(join(test_dir, test['input'])) as f:
if test['input'].endswith('.jsonld'):
input = json.load(f)
else:
input = f.read().decode('utf8')
# read expect file
- with open(join(self.test_dir, test['expect'])) as f:
+ with open(join(test_dir, test['expect'])) as f:
if test['expect'].endswith('.jsonld'):
expect = json.load(f)
else:
@@ -143,10 +141,10 @@ class TestRunner:
elif 'jld:ExpandTest' in test_type:
result = jsonld.expand(input, options)
elif 'jld:CompactTest' in test_type:
- ctx = json.load(open(join(self.test_dir, test['context'])))
+ ctx = json.load(open(join(test_dir, test['context'])))
result = jsonld.compact(input, ctx, options)
elif 'jld:FrameTest' in test_type:
- frame = json.load(open(join(self.test_dir, test['frame'])))
+ frame = json.load(open(join(test_dir, test['frame'])))
result = jsonld.frame(input, frame, options)
elif 'jld:FromRDFTest' in test_type:
result = jsonld.from_rdf(input, options)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-pyld.git
More information about the Python-modules-commits mailing list