[Python-modules-commits] [adapt-parser] 01/04: import adapt-parser_0.3.0.orig.tar.gz
Ethan Ward
ethanward-guest at moszumanska.debian.org
Mon Jul 24 19:35:23 UTC 2017
This is an automated email from the git hooks/post-receive script.
ethanward-guest pushed a commit to branch master
in repository adapt-parser.
commit 8c8c37ec251c99fd3185b3b2ca2782f0169ce28f
Author: ethan <ethan at debian>
Date: Mon Jul 24 14:29:26 2017 -0500
import adapt-parser_0.3.0.orig.tar.gz
---
PKG-INFO | 11 +
adapt/__init__.py | 6 +
adapt/context.py | 91 ++++++++
adapt/engine.py | 329 +++++++++++++++++++++++++++++
adapt/entity_tagger.py | 109 ++++++++++
adapt/expander.py | 179 ++++++++++++++++
adapt/intent.py | 190 +++++++++++++++++
adapt/parser.py | 77 +++++++
adapt/tools/__init__.py | 2 +
adapt/tools/text/__init__.py | 1 +
adapt/tools/text/tokenizer.py | 58 +++++
adapt/tools/text/trie.py | 132 ++++++++++++
adapt_parser.egg-info/PKG-INFO | 11 +
adapt_parser.egg-info/SOURCES.txt | 17 ++
adapt_parser.egg-info/dependency_links.txt | 1 +
adapt_parser.egg-info/requires.txt | 2 +
adapt_parser.egg-info/top_level.txt | 1 +
setup.cfg | 4 +
setup.py | 20 ++
19 files changed, 1241 insertions(+)
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..e617dad
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.0
+Name: adapt-parser
+Version: 0.3.0
+Summary: A text-to-intent parsing framework.
+Home-page: https://github.com/MycroftAI/adapt
+Author: Sean Fitzgerald
+Author-email: sean at fitzgeralds.me
+License: LGPL-3
+Description: UNKNOWN
+Keywords: natural language processing
+Platform: UNKNOWN
diff --git a/adapt/__init__.py b/adapt/__init__.py
new file mode 100644
index 0000000..41c15bc
--- /dev/null
+++ b/adapt/__init__.py
@@ -0,0 +1,6 @@
+__author__ = 'seanfitz'
+import os
+
+# If a README.md exists in the current working directory, reuse its text as
+# the package docstring.
+if os.path.exists('README.md'):
+    import codecs
+    # Read through a context manager so the file handle is always closed.
+    with codecs.open('README.md', encoding='utf-8', mode='r') as readme_file:
+        __doc__ = readme_file.read()
+
diff --git a/adapt/context.py b/adapt/context.py
new file mode 100644
index 0000000..66bc6f3
--- /dev/null
+++ b/adapt/context.py
@@ -0,0 +1,91 @@
+from six.moves import xrange
+
+__author__ = "seanfitz"
+
+
+class ContextManagerFrame(object):
+    """
+    Manages entities and context for a single frame of conversation.
+    Provides simple equality querying.
+    """
+    def __init__(self, entities=None, metadata=None):
+        """
+        :param entities: optional list of entities that seed this frame
+
+        :param metadata: optional dict of metadata describing this frame
+        """
+        # Do not use mutable default arguments: a shared default list/dict
+        # would leak entities and metadata between unrelated frames.
+        self.entities = [] if entities is None else entities
+        self.metadata = {} if metadata is None else metadata
+
+    def metadata_matches(self, query=None):
+        """
+        Check whether every key/value pair of the query equals this frame's metadata.
+
+        :param query: dict of metadata to compare against
+
+        :return: True if the query is non-empty and all of its items match,
+            False otherwise (an empty query never matches)
+        """
+        if not query:
+            return False
+        return all(value == self.metadata.get(key) for key, value in query.items())
+
+    def merge_context(self, tag, metadata):
+        """
+        Add an entity to this frame and adopt any metadata keys it does not have yet.
+
+        :param tag: the entity to append to this frame
+
+        :param metadata: dict of metadata that accompanies the entity
+
+        :return: None
+        """
+        self.entities.append(tag)
+        for key, value in metadata.items():
+            if key not in self.metadata:
+                # Store the metadata value (the original stored the key itself).
+                self.metadata[key] = value
+
+
+class ContextManager(object):
+    """
+    ContextManager
+    Use to track context throughout the course of a conversational session. How to manage a session's
+    lifecycle is not captured here.
+    """
+    def __init__(self):
+        # Frames are inserted at index 0, so the newest frame comes first.
+        self.frame_stack = []
+
+    def inject_context(self, entity, metadata=None):
+        """
+        Add an entity to the context. The entity is merged into the newest frame when
+        that frame's metadata matches; otherwise a new frame is pushed on the stack.
+
+        :param entity:
+            format {'data': 'Entity tag as <str>', 'key': 'entity proper name as <str>', 'confidence': <float>'}
+
+        :param metadata: dict, arbitrary metadata about the entity being added
+
+        :return: None
+        """
+        metadata = {} if metadata is None else metadata
+        top_frame = self.frame_stack[0] if self.frame_stack else None
+        if top_frame and top_frame.metadata_matches(metadata):
+            top_frame.merge_context(entity, metadata)
+        else:
+            # Copy the metadata so later changes by the caller do not alter the frame.
+            frame = ContextManagerFrame(entities=[entity], metadata=metadata.copy())
+            self.frame_stack.insert(0, frame)
+
+    def get_context(self, max_frames=None, missing_entities=None):
+        """
+        Constructs a list of entities from the context.
+
+        :param max_frames: integer, max number of frames to look back. Values larger
+            than the number of stored frames are clamped to it.
+
+        :param missing_entities: a list or set of tag names, as strings
+
+        :return: a list of entities (copies, with confidence reduced by frame depth)
+        """
+        frame_count = len(self.frame_stack)
+        # Clamp so that asking for more frames than exist cannot raise IndexError.
+        if not max_frames or max_frames > frame_count:
+            max_frames = frame_count
+
+        missing_entities = list(missing_entities) if missing_entities else []
+        context = []
+        for depth, frame in enumerate(self.frame_stack[:max(max_frames, 0)]):
+            # Work on copies so the stored entities keep their original confidence.
+            frame_entities = [entity.copy() for entity in frame.entities]
+            for entity in frame_entities:
+                # Older frames contribute progressively less confidence.
+                entity['confidence'] = entity.get('confidence', 1.0) / (2.0 + depth)
+            context += frame_entities
+
+        if not missing_entities:
+            return context
+
+        result = []
+        for entity in context:
+            if entity.get('data') in missing_entities:
+                result.append(entity)
+                # NOTE: this implies that we will only ever get one
+                # of an entity kind from context, unless specified
+                # multiple times in missing_entities. Cannot get
+                # an arbitrary number of an entity kind.
+                missing_entities.remove(entity.get('data'))
+
+        return result
+
+
+
diff --git a/adapt/engine.py b/adapt/engine.py
new file mode 100644
index 0000000..8704603
--- /dev/null
+++ b/adapt/engine.py
@@ -0,0 +1,329 @@
+import re
+import heapq
+import pyee
+from adapt.entity_tagger import EntityTagger
+from adapt.parser import Parser
+from adapt.tools.text.tokenizer import EnglishTokenizer
+from adapt.tools.text.trie import Trie
+
+__author__ = 'seanfitz'
+
+
+class IntentDeterminationEngine(pyee.EventEmitter):
+    """
+    IntentDeterminationEngine
+
+    A greedy and naive implementation of intent determination. For an utterance, the Adapt
+    parsing tools produce a sorted collection of tagged parses. A valid parse result contains
+    no overlapping tagged entities, and its confidence is the sum of the tagged entity
+    confidences, weighted by the percentage of the utterance (per character) that the entity
+    match represents.
+
+    Generators are used heavily so that greedy algorithms can short circuit large portions
+    of computation.
+    """
+    def __init__(self, tokenizer=None, trie=None):
+        """
+        :param tokenizer: tokenizer to use; an EnglishTokenizer is created when omitted
+
+        :param trie: trie of known entities; a new Trie is created when omitted
+        """
+        pyee.EventEmitter.__init__(self)
+        self.tokenizer = tokenizer or EnglishTokenizer()
+        self.trie = trie or Trie()
+        self.regular_expressions_entities = []
+        self._regex_strings = set()
+        # The tagger receives the regex list by reference, so expressions
+        # registered later are visible to it.
+        self.tagger = EntityTagger(self.trie, self.tokenizer, self.regular_expressions_entities)
+        self.intent_parsers = []
+
+    def __best_intent(self, parse_result, context=[]):
+        """
+        Ask every registered intent parser to validate the parse result and keep the
+        highest-confidence intent.
+
+        :param parse_result: dict with 'tags' and 'confidence' entries
+
+        :param context: context entities not already used by the parse result
+
+        :return: tuple of (best intent, tags used by that intent)
+        """
+        winner = None
+        winner_tags = None
+        # Wrap each context entity so it has the same shape as a tag's 'entities' entry.
+        wrapped_context = [{'entities': [c]} for c in context]
+        for parser in self.intent_parsers:
+            candidate, used = parser.validate_with_tags(
+                parse_result.get('tags') + wrapped_context, parse_result.get('confidence'))
+            if winner and not (candidate and candidate.get('confidence') > winner.get('confidence')):
+                continue
+            winner = candidate
+            winner_tags = used
+
+        return winner, winner_tags
+
+    def __get_unused_context(self, parse_result, context):
+        """
+        Filter out the context entities whose key was already used by a context-derived tag.
+
+        :param parse_result: dict whose 'tags' carry 'key' and 'from_context'
+
+        :param context: list of context entities
+
+        :return: list of context entities not used in the parse result
+        """
+        consumed_keys = set()
+        for t in parse_result['tags']:
+            if t['from_context']:
+                consumed_keys.add(t['key'])
+        return [c for c in context if c['key'] not in consumed_keys]
+
+    def determine_intent(self, utterance, num_results=1, include_tags=False, context_manager=None):
+        """
+        Given an utterance, provide a valid intent.
+
+        :param utterance: an ascii or unicode string representing natural language speech
+
+        :param num_results: a maximum number of results to be returned.
+
+        :param include_tags: includes the parsed tags (including position and confidence)
+            as part of result
+
+        :param context_manager: a context manager to provide context to the utterance
+
+        :return: A generator that yields dictionaries.
+        """
+        parser = Parser(self.tokenizer, self.tagger)
+
+        def forward_tagged_entities(result):
+            # Re-emit the parser's event on this engine.
+            return self.emit("tagged_entities", result)
+
+        parser.on('tagged_entities', forward_tagged_entities)
+
+        context = context_manager.get_context() if context_manager else []
+
+        for result in parser.parse(utterance, N=num_results, context=context):
+            self.emit("parse_result", result)
+            # create a context without entities used in result
+            remaining_context = self.__get_unused_context(result, context)
+            best_intent, tags = self.__best_intent(result, remaining_context)
+            if not best_intent or not best_intent.get('confidence', 0.0) > 0:
+                continue
+            if include_tags:
+                best_intent['__tags__'] = tags
+            yield best_intent
+
+    def register_entity(self, entity_value, entity_type, alias_of=None):
+        """
+        Register an entity to be tagged in potential parse results
+
+        :param entity_value: the value/proper name of an entity instance (Ex: "The Big Bang Theory")
+
+        :param entity_type: the type/tag of an entity instance (Ex: "Television Show")
+
+        :param alias_of: optional canonical value that entity_value is an alias of
+
+        :return: None
+        """
+        lowered_value = entity_value.lower()
+        if alias_of:
+            self.trie.insert(lowered_value, data=(alias_of, entity_type))
+            return
+        self.trie.insert(lowered_value, data=(entity_value, entity_type))
+        # The entity type itself is also indexed, tagged as a 'Concept'.
+        self.trie.insert(entity_type.lower(), data=(entity_type, 'Concept'))
+
+    def register_regex_entity(self, regex_str):
+        """
+        A regular expression making use of python named group expressions.
+
+        Example: (?P<Artist>.*)
+
+        :param regex_str: a string representing a regular expression as defined above
+
+        :return: None
+        """
+        # Ignore empty expressions and expressions that were already registered.
+        if not regex_str or regex_str in self._regex_strings:
+            return
+        self._regex_strings.add(regex_str)
+        self.regular_expressions_entities.append(re.compile(regex_str, re.IGNORECASE))
+
+    def register_intent_parser(self, intent_parser):
+        """
+        "Enforce" the intent parser interface at registration time.
+
+        :param intent_parser: object exposing a callable ``validate`` attribute
+
+        :return: None
+
+        :raises ValueError on invalid intent
+        """
+        if not (hasattr(intent_parser, 'validate') and callable(intent_parser.validate)):
+            raise ValueError("%s is not an intent parser" % str(intent_parser))
+        self.intent_parsers.append(intent_parser)
+
+
+class DomainIntentDeterminationEngine(object):
+    """
+    DomainIntentDeterminationEngine.
+
+    The DomainIntentDeterminationEngine is a greedy and naive implementation of intent
+    determination. Given an utterance, it uses the Adapt parsing tools to come up with a
+    sorted collection of tagged parses. A valid parse result contains no overlapping
+    tagged entities in a single domain, and its confidence is the sum of the tagged
+    entity confidences, which are weighted based on the percentage of the utterance
+    (per character) that the entity match represents.
+
+    This system makes heavy use of generators to enable greedy algorithms to short circuit
+    large portions of computation.
+    """
+
+    def __init__(self):
+        """
+        Initialize DomainIntentDeterminationEngine with no registered domains.
+
+        Domains are created explicitly with register_domain(), or lazily the first
+        time a domain is referenced by one of the register_* methods or properties.
+        """
+        # Maps a domain identifier to its IntentDeterminationEngine.
+        self.domains = {}
+
+    def _engine_for(self, domain):
+        """Return the engine registered for ``domain``, registering the domain on first use."""
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain]
+
+    @property
+    def tokenizer(self):
+        """
+        A property to link into IntentEngine's tokenizer.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the tokenizer of the default domain's IntentEngine
+        """
+        return self._engine_for(0).tokenizer
+
+    @property
+    def trie(self):
+        """
+        A property to link into IntentEngine's trie.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the trie of the default domain's IntentEngine
+        """
+        return self._engine_for(0).trie
+
+    @property
+    def tagger(self):
+        """
+        A property to link into IntentEngine's tagger.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the tagger of the default domain's IntentEngine
+        """
+        return self._engine_for(0).tagger
+
+    @property
+    def intent_parsers(self):
+        """
+        A property to link into IntentEngine's intent_parsers.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the intent_parsers of the default domain's IntentEngine
+        """
+        return self._engine_for(0).intent_parsers
+
+    @property
+    def _regex_strings(self):
+        """
+        A property to link into IntentEngine's _regex_strings.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the _regex_strings of the default domain's IntentEngine
+        """
+        return self._engine_for(0)._regex_strings
+
+    @property
+    def regular_expressions_entities(self):
+        """
+        A property to link into IntentEngine's regular_expressions_entities.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the regular_expressions_entities of the default domain's IntentEngine
+        """
+        return self._engine_for(0).regular_expressions_entities
+
+    def register_domain(self, domain=0, tokenizer=None, trie=None):
+        """
+        Register a domain with the intent engine. Registering an existing domain
+        replaces its engine.
+
+        :param tokenizer: The tokenizer you wish to use.
+
+        :param trie: the Trie() you wish to use.
+
+        :param domain: the identifier of the domain you wish to add (defaults to 0)
+        """
+        self.domains[domain] = IntentDeterminationEngine(
+            tokenizer=tokenizer, trie=trie)
+
+    def register_entity(self, entity_value, entity_type, alias_of=None, domain=0):
+        """
+        Register an entity to be tagged in potential parse results.
+
+        :param entity_value: the value/proper name of an entity instance
+            (Ex: "The Big Bang Theory")
+
+        :param entity_type: the type/tag of an entity instance (Ex: "Television Show")
+
+        :param alias_of: optional canonical value that entity_value is an alias of
+
+        :param domain: the identifier of the domain you wish to add the entity to
+
+        :return: None
+        """
+        self._engine_for(domain).register_entity(entity_value=entity_value,
+                                                 entity_type=entity_type,
+                                                 alias_of=alias_of)
+
+    def register_regex_entity(self, regex_str, domain=0):
+        """
+        A regular expression making use of python named group expressions.
+
+        Example: (?P<Artist>.*)
+
+        :param regex_str: a string representing a regular expression as defined above
+
+        :param domain: the identifier of the domain you wish to add the entity to
+
+        :return: None
+        """
+        self._engine_for(domain).register_regex_entity(regex_str=regex_str)
+
+    def determine_intent(self, utterance, num_results=1):
+        """
+        Given an utterance, provide a valid intent.
+
+        Each domain contributes the intents of its single best parse; the
+        ``num_results`` most confident of those are yielded, best first.
+
+        :param utterance: an ascii or unicode string representing natural language speech
+
+        :param num_results: a maximum number of results to be returned.
+
+        :return: A generator that yields dictionaries.
+        """
+        intents = []
+        for domain in self.domains:
+            gen = self.domains[domain].determine_intent(utterance=utterance,
+                                                        num_results=1)
+            for intent in gen:
+                intents.append(intent)
+
+        # nlargest returns a new list; the original discarded it and yielded
+        # every collected intent in arbitrary domain order.
+        best_intents = heapq.nlargest(
+            num_results, intents, key=lambda intent: intent['confidence'])
+        for intent in best_intents:
+            yield intent
+
+    def register_intent_parser(self, intent_parser, domain=0):
+        """
+        Register an intent parser with a domain.
+
+        :param intent_parser: The intent parser you wish to register.
+
+        :param domain: the identifier of the domain you wish to register the intent
+            parser to.
+        """
+        self._engine_for(domain).register_intent_parser(
+            intent_parser=intent_parser)
\ No newline at end of file
diff --git a/adapt/entity_tagger.py b/adapt/entity_tagger.py
new file mode 100644
index 0000000..3112028
--- /dev/null
+++ b/adapt/entity_tagger.py
@@ -0,0 +1,109 @@
+from adapt.tools.text.trie import Trie
+from six.moves import xrange
+
+__author__ = 'seanfitz'
+
+
+class EntityTagger(object):
+    """
+    Known Entity Tagger
+    Given an index of known entities, can efficiently search for those entities within a provided utterance.
+    """
+    def __init__(self, trie, tokenizer, regex_entities=[], max_tokens=20):
+        """
+        :param trie: index of known entities; must provide ``gather(text)``
+
+        :param tokenizer: object providing ``tokenize(text)``
+
+        :param regex_entities: list of compiled regular expressions with named groups.
+            The list is stored by reference, so expressions the caller appends later
+            are seen by this tagger.
+
+        :param max_tokens: stored on the instance; within this class it is only
+            forwarded to the nested sub-tagger
+        """
+        self.trie = trie
+        self.tokenizer = tokenizer
+        self.max_tokens = max_tokens
+        self.regex_entities = regex_entities
+
+    def _iterate_subsequences(self, tokens):
+        """
+        Using regex invokes this function, which significantly impacts performance of adapt.
+        It yields every contiguous subsequence of the tokens, i.e. N * (N + 1) / 2 of them
+        for N tokens.
+
+        :param tokens: list of token strings
+
+        :return: generator of (space-joined subsequence, index of its first token)
+        """
+        for start_idx in xrange(len(tokens)):
+            for end_idx in xrange(start_idx + 1, len(tokens) + 1):
+                yield ' '.join(tokens[start_idx:end_idx]), start_idx
+
+    def _sort_and_merge_tags(self, tags):
+        """
+        Order tags by (start_token, end_token).
+
+        NOTE(review): despite the name, no merging happens here; tags are only sorted.
+
+        :param tags: list of tag dicts carrying 'start_token' and 'end_token'
+
+        :return: a new list with the same tags in sorted order
+        """
+        decorated = [(tag['start_token'], tag['end_token'], tag) for tag in tags]
+        decorated.sort(key=lambda x: (x[0], x[1]))
+        return [tag for start_token, end_token, tag in decorated]
+
+    def tag(self, utterance, context_trie=None):
+        """
+        Tag known entities within the utterance.
+
+        :param utterance: a string of natural language text
+
+        :param context_trie: optional, a trie containing only entities from context
+            for this request
+
+        :return: a list of dictionaries, each with the following keys
+
+            match: str - the proper entity matched
+
+            key: str - the string that was matched to the entity
+
+            start_token: int - 0-based index of the first token matched
+
+            end_token: int - 0-based index of the last token matched
+
+            entities: list - a list of entity kinds as strings (Ex: Artist, Location)
+
+            from_context: bool - True when the tag came from context_trie
+        """
+        tokens = self.tokenizer.tokenize(utterance)
+        entities = []
+        if len(self.regex_entities) > 0:
+            for part, idx in self._iterate_subsequences(tokens):
+                # Build a throwaway trie holding whatever the regexes capture in this
+                # subsequence, then tag the subsequence against it.
+                local_trie = Trie()
+                for regex_entity in self.regex_entities:
+                    match = regex_entity.match(part)
+                    groups = match.groupdict() if match else {}
+                    for key in list(groups):
+                        match_str = groups.get(key)
+                        # NOTE(review): an optional group that did not participate yields
+                        # None here - confirm that Trie.insert tolerates it.
+                        local_trie.insert(match_str, (match_str, key))
+                sub_tagger = EntityTagger(local_trie, self.tokenizer, max_tokens=self.max_tokens)
+                for sub_entity in sub_tagger.tag(part):
+                    # Shift token positions from subsequence-relative to utterance-relative.
+                    sub_entity['start_token'] += idx
+                    sub_entity['end_token'] += idx
+                    # Regex matches get a fixed confidence of 0.5.
+                    for e in sub_entity['entities']:
+                        e['confidence'] = 0.5
+                    entities.append(sub_entity)
+        additional_sort = len(entities) > 0
+
+        context_entities = []
+        for i in xrange(len(tokens)):
+            # Search the trie with the remainder of the utterance starting at token i.
+            part = ' '.join(tokens[i:])
+
+            for new_entity in self.trie.gather(part):
+                new_entity['data'] = list(new_entity['data'])
+                entities.append({
+                    'match': new_entity.get('match'),
+                    'key': new_entity.get('key'),
+                    'start_token': i,
+                    'entities': [new_entity],
+                    'end_token': i + len(self.tokenizer.tokenize(new_entity.get('match'))) - 1,
+                    'from_context': False
+                })
+
+            if context_trie:
+                for new_entity in context_trie.gather(part):
+                    new_entity['data'] = list(new_entity['data'])
+                    new_entity['confidence'] *= 2.0  # context entities get double the weight!
+                    context_entities.append({
+                        'match': new_entity.get('match'),
+                        'key': new_entity.get('key'),
+                        'start_token': i,
+                        'entities': [new_entity],
+                        'end_token': i + len(self.tokenizer.tokenize(new_entity.get('match'))) - 1,
+                        'from_context': True
+                    })
+
+            additional_sort = additional_sort or len(entities) > 0
+
+        # NOTE(review): context entities are only merged into the result when at least
+        # one non-context entity was found; otherwise they are dropped. Confirm this
+        # is intended.
+        if additional_sort:
+            entities = self._sort_and_merge_tags(entities + context_entities)
+
+        return entities
diff --git a/adapt/expander.py b/adapt/expander.py
new file mode 100644
index 0000000..4ad1764
--- /dev/null
+++ b/adapt/expander.py
@@ -0,0 +1,179 @@
+from six.moves import xrange
+
+__author__ = 'seanfitz'
+
+
+class SimpleGraph(object):
+    """
+    Minimal undirected graph kept as a dict mapping each vertex to the set of its neighbors.
+    """
+    def __init__(self):
+        self.adjacency_lists = {}
+
+    def add_edge(self, a, b):
+        """
+        Connect vertices a and b in both directions, creating them when needed.
+
+        :param a: hashable vertex
+
+        :param b: hashable vertex
+
+        :return: None
+        """
+        for source, target in ((a, b), (b, a)):
+            neighbors = self.adjacency_lists.get(source)
+            if not neighbors:
+                neighbors = set()
+                self.adjacency_lists[source] = neighbors
+            neighbors.add(target)
+
+    def get_neighbors_of(self, a):
+        """
+        :param a: vertex to look up
+
+        :return: the set of neighbors of a, or an empty set for an unknown vertex
+        """
+        if a in self.adjacency_lists:
+            return self.adjacency_lists[a]
+        return set()
+
+    def vertex_set(self):
+        """
+        :return: a list of all vertices that take part in at least one edge
+        """
+        return [vertex for vertex in self.adjacency_lists]
+
+
+def bronk(r, p, x, graph):
+    """
+    Recursive Bron-Kerbosch enumeration of maximal cliques (no pivoting).
+
+    :param r: list of vertices in the clique built so far
+
+    :param p: list of candidate vertices; emptied in place as vertices are processed
+
+    :param x: list of already-processed vertices; extended in place
+
+    :param graph: object providing ``get_neighbors_of(vertex)``
+
+    :return: generator yielding each maximal clique as a list of vertices
+    """
+    if not p and not x:
+        yield r
+        return
+    # Iterate over a snapshot because p is modified inside the loop.
+    for vertex in list(p):
+        grown_clique = r + [vertex]
+        candidates = [val for val in p if val in graph.get_neighbors_of(vertex)]  # p intersects N(vertex)
+        excluded = [val for val in x if val in graph.get_neighbors_of(vertex)]  # x intersects N(vertex)
+        for clique in bronk(grown_clique, candidates, excluded, graph):
+            yield clique
+        p.remove(vertex)
+        x.append(vertex)
+
+
+def get_cliques(vertices, graph):
+    """
+    Enumerate the maximal cliques of the graph restricted to the given vertices.
+
+    :param vertices: list of candidate vertices (consumed in place by the search)
+
+    :param graph: object providing ``get_neighbors_of(vertex)``
+
+    :return: generator yielding each maximal clique as a list of vertices
+    """
+    clique_search = bronk([], vertices, [], graph)
+    for found in clique_search:
+        yield found
+
+
+def graph_key_from_tag(tag, entity_index):
+    """
+    Build the graph vertex name for one entity of a tag.
+
+    :param tag: tag dict with 'start_token' and 'entities'
+
+    :param entity_index: index into the tag's 'entities' list
+
+    :return: string of the form "<start_token>-<entity key>-<entity confidence>"
+    """
+    start_part = str(tag.get('start_token'))
+    chosen = tag.get('entities', [])[entity_index]
+    return start_part + '-' + chosen.get('key') + '-' + str(chosen.get('confidence'))
+
+
+class Lattice(object):
+    """
+    Ordered sequence of nodes, each holding a list of alternatives; traversing it yields
+    every combination that picks one alternative per node.
+    """
+    def __init__(self):
+        self.nodes = []
+
+    def append(self, data):
+        """
+        Add a node. A non-empty list is stored as the node's alternatives; anything else
+        (including an empty list) is wrapped as a node's single alternative.
+
+        :param data: list of alternatives, or a single alternative
+
+        :return: None
+        """
+        is_alternative_list = isinstance(data, list) and len(data) > 0
+        self.nodes.append(data if is_alternative_list else [data])
+
+    def traverse(self, index=0):
+        """
+        :param index: position of the first node to include
+
+        :return: generator of lists, one per combination of alternatives from node
+            ``index`` onward; list-valued alternatives are concatenated, not nested
+        """
+        if index >= len(self.nodes):
+            yield []
+            return
+        for entity in self.nodes[index]:
+            for tail in self.traverse(index=index + 1):
+                head = entity if isinstance(entity, list) else [entity]
+                yield head + tail
+
+
+class BronKerboschExpander(object):
+    """
+    BronKerboschExpander
+
+    Given a list of tagged entities (from the existing entity tagger implementation or another), expand out
+    valid parse results.
+
+    A parse result is considered valid if it contains no overlapping spans.
+
+    Since total confidence of a parse result is based on the sum of confidences of the entities, there is no sense
+    in yielding any potential parse results that are a subset/sequence of a larger valid parse result. By comparing
+    this concept to that of maximal cliques (https://en.wikipedia.org/wiki/Clique_problem), we can use well known
+    solutions to the maximal clique problem like the Bron/Kerbosch algorithm (https://en.wikipedia.org/wiki/Bron%E2%80%93Kerbosch_algorithm).
+
+    By considering tagged entities that do not overlap to be "neighbors", BronKerbosch will yield a set of maximal
+    cliques that are also valid parse results.
+    """
+    def __init__(self, tokenizer):
+        """
+        :param tokenizer: object providing ``tokenize(text)``; used to measure how many
+            tokens an entity match spans
+        """
+        self.tokenizer = tokenizer
+
+    def _build_graph(self, tags):
+        """
+        Build a graph whose vertices are entity names (see graph_key_from_tag) and whose
+        edges connect an entity to every entity of a later tag that starts after it ends.
+
+        :param tags: list of tag dicts; presumably ordered by start_token - verify against caller
+
+        :return: a SimpleGraph of non-overlapping entity pairs
+        """
+        graph = SimpleGraph()
+        for tag_index in xrange(len(tags)):
+            for entity_index in xrange(len(tags[tag_index].get('entities'))):
+                a_entity_name = graph_key_from_tag(tags[tag_index], entity_index)
+                # The token count of the match gives the length of this entity's span.
+                tokens = self.tokenizer.tokenize(tags[tag_index].get('entities', [])[entity_index].get('match'))
+                for tag in tags[tag_index + 1:]:
+                    start_token = tag.get('start_token')
+                    # Only later tags that begin at or after the end of this span are neighbors.
+                    if start_token >= tags[tag_index].get('start_token') + len(tokens):
+                        for b_entity_index in xrange(len(tag.get('entities'))):
+                            b_entity_name = graph_key_from_tag(tag, b_entity_index)
+                            graph.add_edge(a_entity_name, b_entity_name)
+
+        return graph
+
+    def _sub_expand(self, tags):
+        """
+        Expand a group of overlapping tags into every maximal set of non-overlapping entities.
+
+        :param tags: list of overlapping tag dicts
+
+        :return: generator yielding lists of new tag dicts, each list sorted by start_token
+        """
+        entities = {}
+        graph = self._build_graph(tags)
+
+        # name entities
+        for tag in tags:
+            for entity_index in xrange(len(tag.get('entities'))):
+                node_name = graph_key_from_tag(tag, entity_index)
+                if not node_name in entities:
+                    entities[node_name] = []
+                # Stored layout: [entity, entity confidence, originating tag]. A repeated
+                # node name appends another triple, but only indexes 0-2 are read below.
+                entities[node_name] += [
+                    tag.get('entities', [])[entity_index],
+                    tag.get('entities', [])[entity_index].get('confidence'),
+                    tag
+                ]
+
+        for clique in get_cliques(list(entities), graph):
+            result = []
+            for entity_name in clique:
+                # The node name begins with "<start_token>-".
+                start_token = int(entity_name.split("-")[0])
+                old_tag = entities[entity_name][2]
+                tag = {
+                    'start_token': start_token,
+                    'entities': [entities.get(entity_name)[0]],
+                    'confidence': entities.get(entity_name)[1] * old_tag.get('confidence', 1.0),
+                    'end_token': old_tag.get('end_token'),
+                    'match': old_tag.get('entities')[0].get('match'),
+                    'key': old_tag.get('entities')[0].get('key'),
+                    'from_context': old_tag.get('from_context', False)
+                }
+                result.append(tag)
+            result = sorted(result, key=lambda e: e.get('start_token'))
+            yield result
+
+    def expand(self, tags, clique_scoring_func=None):
+        """
+        Group the tags into runs of overlapping spans, expand each run of two or more tags
+        into its cliques, and combine the runs through a Lattice.
+
+        :param tags: list of tag dicts; presumably ordered by start_token - verify against caller
+
+        :param clique_scoring_func: optional callable scoring a clique; when given, the
+            cliques of each run are ordered from highest to lowest score
+
+        :return: generator (Lattice.traverse) of candidate parse results
+        """
+        lattice = Lattice()
+        overlapping_spans = []
+
+        def end_token_index():
+            # Last token covered by any tag in the current run.
+            return max([t.get('end_token') for t in overlapping_spans])
+
+        for i in xrange(len(tags)):
+            tag = tags[i]
+
+            if len(overlapping_spans) > 0 and end_token_index() >= tag.get('start_token'):
+                # The tag overlaps the current run: extend the run.
+                overlapping_spans.append(tag)
+            elif len(overlapping_spans) > 1:
+                # The run is finished and contains overlaps: expand it into cliques.
+                cliques = list(self._sub_expand(overlapping_spans))
+                if clique_scoring_func:
+                    cliques = sorted(cliques, key=lambda e: -1 * clique_scoring_func(e))
+                lattice.append(cliques)
+                overlapping_spans = [tag]
+            else:
+                # The run holds zero or one tag; it is appended as is (on the first
+                # iteration this is the empty list).
+                lattice.append(overlapping_spans)
+                overlapping_spans = [tag]
+        # Flush the final run.
+        if len(overlapping_spans) > 1:
+            cliques = list(self._sub_expand(overlapping_spans))
+            if clique_scoring_func:
+                cliques = sorted(cliques, key=lambda e: -1 * clique_scoring_func(e))
+            lattice.append(cliques)
+        else:
+            lattice.append(overlapping_spans)
+
+        return lattice.traverse()
+
diff --git a/adapt/intent.py b/adapt/intent.py
new file mode 100644
index 0000000..a1f8892
--- /dev/null
+++ b/adapt/intent.py
@@ -0,0 +1,190 @@
+__author__ = 'seanfitz'
+
+CLIENT_ENTITY_NAME = 'Client'
+
+
+def is_entity(tag, entity_name):
+    """
+    Check whether a tag carries an entity of the given type (case-insensitive).
+
+    :param tag: tag dict whose 'entities' each hold 'data' as (value, type) pairs
+
+    :param entity_name: the entity type to look for
+
+    :return: True if any entity of the tag has that type, False otherwise
+    """
+    return any(
+        kind.lower() == entity_name.lower()
+        for candidate in tag.get('entities')
+        for _value, kind in candidate.get('data')
+    )
+
+
+def find_first_tag(tags, entity_type, after_index=-1):
+    """
+    Find the first tag holding an entity of the given type (case-insensitive) that
+    starts after a token index.
+
+    :param tags: list of tag dicts
+
+    :param entity_type: the entity type to look for
+
+    :param after_index: only tags whose start_token is greater than this are accepted
+
+    :return: tuple (tag, canonical value, entity confidence), or (None, None, None)
+    """
+    for candidate in tags:
+        for entity in candidate.get('entities'):
+            for value, kind in entity.get('data'):
+                if kind.lower() != entity_type.lower():
+                    continue
+                if candidate.get('start_token', 0) > after_index:
+                    return candidate, value, entity.get('confidence')
+
+    return None, None, None
+
+
+def find_next_tag(tags, end_index=0):
+    """
+    Return the first tag that starts after the given token index.
+
+    :param tags: list of tag dicts carrying 'start_token'
+
+    :param end_index: only tags whose start_token is greater than this are accepted
+
+    :return: the first matching tag, or None when there is none
+    """
+    later_tags = (candidate for candidate in tags if candidate.get('start_token') > end_index)
+    return next(later_tags, None)
+
+
+def choose_1_from_each(lists):
+    """
+    Lazily enumerate every way of picking one element from each of the given sequences.
+
+    :param lists: a sequence of sequences
+
+    :return: generator of lists, one element per input sequence, in input order;
+        a single empty list is yielded when ``lists`` is empty
+    """
+    if len(lists) == 0:
+        yield []
+        return
+    first_choices = lists[0]
+    for choice in first_choices:
+        for remainder in choose_1_from_each(lists[1:]):
+            yield [choice] + remainder
+
+
+def resolve_one_of(tags, at_least_one):
+    """
+    Find tags that satisfy every "one of" group.
+
+    Each combination that picks one entity type per group is tried in turn; the first
+    combination for which a tag can be found for every picked type wins.
+
+    :param tags: list of tag dicts
+
+    :param at_least_one: sequence of groups, each a sequence of entity type names
+
+    :return: dict mapping entity type to the list of tags resolved for it, or None
+        when no combination can be satisfied
+    """
+    if len(tags) < len(at_least_one):
+        return None
+    for possible_resolution in choose_1_from_each(at_least_one):
+        resolution = {}
+        for entity_type in possible_resolution:
+            last_end_index = -1
+            if entity_type in resolution:
+                # The same type was already resolved for an earlier group, so the next
+                # match must come after it. (The original subscripted the bound
+                # method ``resolution.get`` here, which raised TypeError.)
+                last_end_index = resolution[entity_type][-1].get('end_token')
+            tag, value, c = find_first_tag(tags, entity_type, after_index=last_end_index)
+            if not tag:
+                break
+            resolution.setdefault(entity_type, []).append(tag)
+        else:
+            # No break: every picked entity type was resolved. Counting keys instead
+            # would reject combinations that use one type for two groups.
+            return resolution
+
+    return None
+
+
+class Intent(object):
+    """
+    An intent definition: a name plus the entity types it requires, the groups of which
+    at least one member must be present, and the entity types it may optionally use.
+    """
+    def __init__(self, name, requires, at_least_one, optional):
+        """
+        :param name: the intent type name reported in results
+
+        :param requires: list of (entity_type, attribute_name) pairs that must be present
+
+        :param at_least_one: list of groups of entity types; one per group must be present
+
+        :param optional: list of (entity_type, attribute_name) pairs that may be present
+        """
+        self.name = name
+        self.requires = requires
+        self.at_least_one = at_least_one
+        self.optional = optional
+
+    def validate(self, tags, confidence):
+        """
+        Validate the tags against this intent.
+
+        :param tags: list of tag dicts from a parse result
+
+        :param confidence: confidence of the parse result
+
+        :return: the intent dict (see validate_with_tags), without the used tags
+        """
+        intent, tags = self.validate_with_tags(tags, confidence)
+        return intent
+
+    def validate_with_tags(self, tags, confidence):
+        """
+        Validate the tags against this intent and report which tags were used.
+
+        :param tags: list of tag dicts from a parse result
+
+        :param confidence: confidence of the parse result
+
+        :return: tuple (intent dict, used tags). The intent dict always has 'intent_type'
+            and 'confidence'; its confidence is 0.0 and the tag list is empty when a
+            required or "one of" entity is missing.
+        """
+        result = {'intent_type': self.name}
+        intent_confidence = 0.0
+        local_tags = tags[:]
+        used_tags = []
+
+        for require_type, attribute_name in self.requires:
+            # Keep the per-tag confidence in its own name; the original overwrote the
+            # ``confidence`` parameter here, corrupting the final calculation.
+            required_tag, canonical_form, tag_confidence = find_first_tag(local_tags, require_type)
+            if not required_tag:
+                result['confidence'] = 0.0
+                return result, []
+
+            result[attribute_name] = canonical_form
+            if required_tag in local_tags:
+                local_tags.remove(required_tag)
+            used_tags.append(required_tag)
+            # TODO: use confidence based on edit distance and context
+            intent_confidence += tag_confidence
+
+        if len(self.at_least_one) > 0:
+            best_resolution = resolve_one_of(tags, self.at_least_one)
+            if not best_resolution:
+                result['confidence'] = 0.0
+                return result, []
+            else:
+                for key in best_resolution:
+                    result[key] = best_resolution[key][0].get('key')  # TODO: at least one must support aliases
+                    intent_confidence += 1.0
+                # NOTE(review): this appends the whole resolution dict rather than the
+                # tags it contains, and the membership test below compares that dict
+                # against tag dicts. Kept as is because callers may rely on the
+                # current shape of the used-tags list.
+                used_tags.append(best_resolution)
+                if best_resolution in local_tags:
+                    local_tags.remove(best_resolution)
+
+        for optional_type, attribute_name in self.optional:
+            optional_tag, canonical_form, conf = find_first_tag(local_tags, optional_type)
+            if not optional_tag or attribute_name in result:
+                continue
+            result[attribute_name] = canonical_form
+            if optional_tag in local_tags:
+                local_tags.remove(optional_tag)
+            used_tags.append(optional_tag)
+            intent_confidence += 1.0
+
+        # Guard against an empty tag list, which would otherwise divide by zero.
+        total_confidence = (intent_confidence / len(tags) * confidence) if tags else 0.0
+
+        target_client, canonical_form, client_confidence = find_first_tag(local_tags, CLIENT_ENTITY_NAME)
+
+        result['target'] = target_client.get('key') if target_client else None
+        result['confidence'] = total_confidence
+
+        return result, used_tags
+
+
+class IntentBuilder(object):
+    """
+    Fluent builder for intent parsers: chain require/optionally/one_of calls, then build().
+    """
+    def __init__(self, intent_name):
+        """
+        Constructor
+
+        :param intent_name: the name of the intents that this parser parses/validates
+
+        :return: an instance of IntentBuilder
+        """
+        self.name = intent_name
+        self.requires = []
+        self.optional = []
+        self.at_least_one = []
+
+    def one_of(self, *args):
+        """
+        Add a group of entity types of which one must be present for the intent to validate.
+
+        :param args: *args notation list of entity names
+
+        :return: self
+        """
+        self.at_least_one.append(args)
+        return self
+
+    def require(self, entity_type, attribute_name=None):
+        """
+        Add an entity type that must be present for the intent to validate.
+
+        :param entity_type: string, an entity type
+
+        :param attribute_name: string, the name of the attribute on the parsed intent. Defaults to match entity_type.
+
+        :return: self
+        """
+        self.requires.append((entity_type, attribute_name or entity_type))
+        return self
+
+    def optionally(self, entity_type, attribute_name=None):
+        """
+        Add an entity type that parsed intents may include but do not need.
+
+        :param entity_type: string, an entity type
+
+        :param attribute_name: string, the name of the attribute on the parsed intent. Defaults to match entity_type.
+
+        :return: self
+        """
+        self.optional.append((entity_type, attribute_name or entity_type))
+        return self
+
+    def build(self):
+        """
+        Create the intent described by the calls made on this builder so far.
+
+        :return: an Intent instance.
+        """
+        return Intent(self.name, self.requires, self.at_least_one, self.optional)
\ No newline at end of file
diff --git a/adapt/parser.py b/adapt/parser.py
new file mode 100644
index 0000000..8226e01
--- /dev/null
+++ b/adapt/parser.py
@@ -0,0 +1,77 @@
+import pyee
+import time
+from adapt.expander import BronKerboschExpander
+from adapt.tools.text.trie import Trie
+
+__author__ = 'seanfitz'
+
+
... 386 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/adapt-parser.git
More information about the Python-modules-commits
mailing list