[Python-modules-commits] [adapt-parser] 01/04: import adapt-parser_0.3.0.orig.tar.gz

Ethan Ward ethanward-guest at moszumanska.debian.org
Mon Jul 24 19:35:23 UTC 2017


This is an automated email from the git hooks/post-receive script.

ethanward-guest pushed a commit to branch master
in repository adapt-parser.

commit 8c8c37ec251c99fd3185b3b2ca2782f0169ce28f
Author: ethan <ethan at debian>
Date:   Mon Jul 24 14:29:26 2017 -0500

    import adapt-parser_0.3.0.orig.tar.gz
---
 PKG-INFO                                   |  11 +
 adapt/__init__.py                          |   6 +
 adapt/context.py                           |  91 ++++++++
 adapt/engine.py                            | 329 +++++++++++++++++++++++++++++
 adapt/entity_tagger.py                     | 109 ++++++++++
 adapt/expander.py                          | 179 ++++++++++++++++
 adapt/intent.py                            | 190 +++++++++++++++++
 adapt/parser.py                            |  77 +++++++
 adapt/tools/__init__.py                    |   2 +
 adapt/tools/text/__init__.py               |   1 +
 adapt/tools/text/tokenizer.py              |  58 +++++
 adapt/tools/text/trie.py                   | 132 ++++++++++++
 adapt_parser.egg-info/PKG-INFO             |  11 +
 adapt_parser.egg-info/SOURCES.txt          |  17 ++
 adapt_parser.egg-info/dependency_links.txt |   1 +
 adapt_parser.egg-info/requires.txt         |   2 +
 adapt_parser.egg-info/top_level.txt        |   1 +
 setup.cfg                                  |   4 +
 setup.py                                   |  20 ++
 19 files changed, 1241 insertions(+)

diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..e617dad
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.0
+Name: adapt-parser
+Version: 0.3.0
+Summary: A text-to-intent parsing framework.
+Home-page: https://github.com/MycroftAI/adapt
+Author: Sean Fitzgerald
+Author-email: sean at fitzgeralds.me
+License: LGPL-3
+Description: UNKNOWN
+Keywords: natural language processing
+Platform: UNKNOWN
diff --git a/adapt/__init__.py b/adapt/__init__.py
new file mode 100644
index 0000000..41c15bc
--- /dev/null
+++ b/adapt/__init__.py
@@ -0,0 +1,6 @@
+__author__ = 'seanfitz'
+import os
+if os.path.exists('README.md'):
+  import codecs
+  __doc__ = codecs.open('README.md', encoding='utf-8', mode='r').read()
+
diff --git a/adapt/context.py b/adapt/context.py
new file mode 100644
index 0000000..66bc6f3
--- /dev/null
+++ b/adapt/context.py
@@ -0,0 +1,91 @@
+from six.moves import xrange
+
+__author__ = "seanfitz"
+
+
+class ContextManagerFrame(object):
+    """
+    Manages entities and context for a single frame of conversation.
+    Provides simple equality querying.
+    """
+    def __init__(self, entities=None, metadata=None):
+        # Avoid shared mutable defaults: merge_context() mutates these in place.
+        self.entities = entities if entities is not None else []
+        self.metadata = metadata if metadata is not None else {}
+
+    def metadata_matches(self, query={}):
+        result = len(query.keys()) > 0
+        for key in query.keys():
+            result = result and query[key] == self.metadata.get(key)
+
+        return result
+
+    def merge_context(self, tag, metadata):
+        self.entities.append(tag)
+        for k in metadata.keys():
+            if k not in self.metadata:
+                self.metadata[k] = metadata[k]
+
+
+class ContextManager(object):
+    """
+    ContextManager
+    Used to track context throughout a conversational session. Managing the
+    session's lifecycle is not handled here.
+    """
+    def __init__(self):
+        self.frame_stack = []
+
+    def inject_context(self, entity, metadata={}):
+        """
+        :param entity:
+            format {'data': 'Entity tag as <str>', 'key': 'entity proper name as <str>', 'confidence': <float>}
+
+        :param metadata: dict, arbitrary metadata about the entity being added
+
+        :return:
+        """
+        top_frame = self.frame_stack[0] if len(self.frame_stack) > 0 else None
+        if top_frame and top_frame.metadata_matches(metadata):
+            top_frame.merge_context(entity, metadata)
+        else:
+            frame = ContextManagerFrame(entities=[entity], metadata=metadata.copy())
+            self.frame_stack.insert(0, frame)
+
+    def get_context(self, max_frames=None, missing_entities=[]):
+        """
+        Constructs a list of entities from the context.
+
+        :param max_frames: integer, max number of frames to look back
+
+        :param missing_entities: a list or set of tag names, as strings
+
+        :return: a list of entities
+        """
+        if not max_frames:
+            max_frames = len(self.frame_stack)
+
+        missing_entities = list(missing_entities)
+        context = []
+        for i in xrange(max_frames):
+            frame_entities = [entity.copy() for entity in self.frame_stack[i].entities]
+            for entity in frame_entities:
+                entity['confidence'] = entity.get('confidence', 1.0) / (2.0 + i)
+            context += frame_entities
+
+        result = []
+        if len(missing_entities) > 0:
+            for entity in context:
+                if entity.get('data') in missing_entities:
+                    result.append(entity)
+                    # NOTE: this implies that we will only ever get one
+                    # of an entity kind from context, unless specified
+                    # multiple times in missing_entities. Cannot get
+                    # an arbitrary number of an entity kind.
+                    missing_entities.remove(entity.get('data'))
+        else:
+            result = context
+
+        return result
+
+
+
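For orientation, a minimal sketch of how the ContextManager above might be
used; the entity dicts follow the inject_context docstring, and the place
names are illustrative:

    from adapt.context import ContextManager

    cm = ContextManager()
    cm.inject_context({'data': 'Seattle', 'key': 'Location', 'confidence': 1.0})
    cm.inject_context({'data': 'tomorrow', 'key': 'Date', 'confidence': 1.0})
    # get_context() decays each entity's confidence by frame age,
    # confidence / (2.0 + frame_index), so newer frames weigh more.
    for entity in cm.get_context():
        print(entity['key'], entity['confidence'])
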
diff --git a/adapt/engine.py b/adapt/engine.py
new file mode 100644
index 0000000..8704603
--- /dev/null
+++ b/adapt/engine.py
@@ -0,0 +1,329 @@
+import re
+import heapq
+import pyee
+from adapt.entity_tagger import EntityTagger
+from adapt.parser import Parser
+from adapt.tools.text.tokenizer import EnglishTokenizer
+from adapt.tools.text.trie import Trie
+
+__author__ = 'seanfitz'
+
+
+class IntentDeterminationEngine(pyee.EventEmitter):
+    """
+    IntentDeterminationEngine
+
+    The IntentDeterminationEngine is a greedy and naive implementation of intent determination. Given an utterance,
+    it uses the Adapt parsing tools to come up with a sorted collection of tagged parses. A valid parse result contains
+    no overlapping tagged entities, and its confidence is the sum of the tagged entity confidences, which are
+    weighted based on the percentage of the utterance (per character) that the entity match represents.
+
+    This system makes heavy use of generators to enable greedy algorithms to short circuit large portions of
+    computation.
+    """
+    def __init__(self, tokenizer=None, trie=None):
+        pyee.EventEmitter.__init__(self)
+        self.tokenizer = tokenizer or EnglishTokenizer()
+        self.trie = trie or Trie()
+        self.regular_expressions_entities = []
+        self._regex_strings = set()
+        self.tagger = EntityTagger(self.trie, self.tokenizer, self.regular_expressions_entities)
+        self.intent_parsers = []
+
+    def __best_intent(self, parse_result, context=[]):
+        best_intent = None
+        best_tags = None
+        context_as_entities = [{'entities': [c]} for c in context]
+        for intent in self.intent_parsers:
+            i, tags = intent.validate_with_tags(parse_result.get('tags') + context_as_entities, parse_result.get('confidence'))
+            if not best_intent or (i and i.get('confidence') > best_intent.get('confidence')):
+                best_intent = i
+                best_tags = tags
+
+        return best_intent, best_tags
+
+    def __get_unused_context(self, parse_result, context):
+        tags_keys = set([t['key'] for t in parse_result['tags'] if t['from_context']])
+        result_context = [c for c in context if c['key'] not in tags_keys]
+        return result_context
+
+    def determine_intent(self, utterance, num_results=1, include_tags=False, context_manager=None):
+        """
+        Given an utterance, provide a valid intent.
+
+        :param utterance: an ascii or unicode string representing natural language speech
+
+        :param include_tags: includes the parsed tags (including position and confidence)
+            as part of result
+
+        :param context_manager: a context manager to provide context to the utterance
+
+        :param num_results: a maximum number of results to be returned.
+
+        :return: A generator that yields dictionaries.
+        """
+        parser = Parser(self.tokenizer, self.tagger)
+        parser.on('tagged_entities',
+                  (lambda result:
+                   self.emit("tagged_entities", result)))
+
+        context = []
+        if context_manager:
+            context = context_manager.get_context()
+
+        for result in parser.parse(utterance, N=num_results, context=context):
+            self.emit("parse_result", result)
+            # create a context without entities used in result
+            remaining_context = self.__get_unused_context(result, context)
+            best_intent, tags = self.__best_intent(result, remaining_context)
+            if best_intent and best_intent.get('confidence', 0.0) > 0:
+                if include_tags:
+                    best_intent['__tags__'] = tags
+                yield best_intent
+
+    def register_entity(self, entity_value, entity_type, alias_of=None):
+        """
+        Register an entity to be tagged in potential parse results
+
+        :param entity_value: the value/proper name of an entity instance (Ex: "The Big Bang Theory")
+
+        :param entity_type: the type/tag of an entity instance (Ex: "Television Show")
+
+        :return: None
+        """
+        if alias_of:
+            self.trie.insert(entity_value.lower(), data=(alias_of, entity_type))
+        else:
+            self.trie.insert(entity_value.lower(), data=(entity_value, entity_type))
+            self.trie.insert(entity_type.lower(), data=(entity_type, 'Concept'))
+
+    def register_regex_entity(self, regex_str):
+        """
+        Register a regular expression entity, making use of python named group expressions.
+
+        Example: (?P<Artist>.*)
+
+        :param regex_str: a string representing a regular expression as defined above
+
+        :return: None
+        """
+        if regex_str and regex_str not in self._regex_strings:
+            self._regex_strings.add(regex_str)
+            self.regular_expressions_entities.append(re.compile(regex_str, re.IGNORECASE))
+
+    def register_intent_parser(self, intent_parser):
+        """
+        "Enforce" the intent parser interface at registration time.
+
+        :param intent_parser:
+
+        :return: None
+
+        :raises ValueError on invalid intent
+        """
+        if hasattr(intent_parser, 'validate') and callable(intent_parser.validate):
+            self.intent_parsers.append(intent_parser)
+        else:
+            raise ValueError("%s is not an intent parser" % str(intent_parser))
+
+
+class DomainIntentDeterminationEngine(object):
+    """
+    DomainIntentDeterminationEngine.
+
+    The DomainIntentDeterminationEngine is a greedy and naive implementation of intent
+    determination. Given an utterance, it uses the Adapt parsing tools to come up with a
+    sorted collection of tagged parses. A valid parse result contains no overlapping
+    tagged entities in a single domain, and its confidence is the sum of the tagged
+    entity confidences, which are weighted based on the percentage of the utterance
+    (per character) that the entity match represents.
+
+    This system makes heavy use of generators to enable greedy algorithms to short circuit
+    large portions of computation.
+    """
+
+    def __init__(self):
+        """
+        Initialize DomainIntentDeterminationEngine.
+
+        :param tokenizer: The tokenizer you wish to use.
+
+        :param trie: the Trie() you wish to use.
+
+        :param domain: a string representing the domain you wish to add
+        """
+        self.domains = {}
+
+    @property
+    def tokenizer(self):
+        """
+        A property to link into IntentEngine's tokenizer.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the domain's tokenizer from its IntentEngine
+        """
+        domain = 0
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain].tokenizer
+
+    @property
+    def trie(self):
+        """
+        A property to link into IntentEngine's trie.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the domain's trie from its IntentEngine
+        """
+        domain = 0
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain].trie
+
+    @property
+    def tagger(self):
+        """
+        A property to link into IntentEngine's tagger.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the domain's tagger from its IntentEngine
+        """
+        domain = 0
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain].tagger
+
+    @property
+    def intent_parsers(self):
+        """
+        A property to link into IntentEngine's intent_parsers.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the domain's intent_parsers from its IntentEngine
+        """
+        domain = 0
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain].intent_parsers
+
+    @property
+    def _regex_strings(self):
+        """
+        A property to link into IntentEngine's _regex_strings.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the domain's _regex_strings from its IntentEngine
+        """
+        domain = 0
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain]._regex_strings
+
+    @property
+    def regular_expressions_entities(self):
+        """
+        A property to link into IntentEngine's regular_expressions_entities.
+
+        warning:: this is only for backwards compatibility and should not be used if you
+        intend on using domains.
+
+        :return: the domain's regular_expressions_entities from its IntentEngine
+        """
+        domain = 0
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        return self.domains[domain].regular_expressions_entities
+
+    def register_domain(self, domain=0, tokenizer=None, trie=None):
+        """
+        Register a domain with the intent engine.
+
+        :param tokenizer: The tokenizer you wish to use.
+
+        :param trie: the Trie() you wish to use.
+
+        :param domain: a string representing the domain you wish to add
+        """
+        self.domains[domain] = IntentDeterminationEngine(
+            tokenizer=tokenizer, trie=trie)
+
+    def register_entity(self, entity_value, entity_type, alias_of=None, domain=0):
+        """
+        Register an entity to be tagged in potential parse results.
+
+        :param entity_value: the value/proper name of an entity instance
+        (Ex: "The Big Bang Theory")
+
+        :param entity_type: the type/tag of an entity instance (Ex: "Television Show")
+
+        :param domain: a string representing the domain you wish to add the entity to
+
+        :return: None
+        """
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        self.domains[domain].register_entity(entity_value=entity_value,
+                                             entity_type=entity_type,
+                                             alias_of=alias_of)
+
+    def register_regex_entity(self, regex_str, domain=0):
+        """
+        Register a regular expression entity, making use of python named group expressions.
+
+        Example: (?P<Artist>.*)
+
+        :param regex_str: a string representing a regular expression as defined above
+
+        :param domain: a string representing the domain you wish to add the entity to
+
+        :return: None
+        """
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        self.domains[domain].register_regex_entity(regex_str=regex_str)
+
+    def determine_intent(self, utterance, num_results=1):
+        """
+        Given an utterance, provide a valid intent.
+
+        :param utterance: an ascii or unicode string representing natural language speech
+
+        :param num_results: a maximum number of results to be returned.
+
+        :return: A generator that yields dictionaries.
+        """
+        intents = []
+        for domain in self.domains:
+            gen = self.domains[domain].determine_intent(utterance=utterance,
+                                                        num_results=1)
+            for intent in gen:
+                intents.append(intent)
+
+        # keep only the top num_results intents across all domains
+        intents = heapq.nlargest(
+            num_results, intents, key=lambda intent: intent['confidence'])
+        for intent in intents:
+            yield intent
+
+    def register_intent_parser(self, intent_parser, domain=0):
+        """
+        Register an intent parser with a domain.
+
+        :param intent_parser: The intent parser you wish to register.
+
+        :param domain: a string representing the domain you wish to register the
+        intent parser to.
+        """
+        if domain not in self.domains:
+            self.register_domain(domain=domain)
+        self.domains[domain].register_intent_parser(
+            intent_parser=intent_parser)
\ No newline at end of file
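
Taken together, the engine supports an end-to-end flow like the sketch below.
It relies only on the APIs shown in this diff; the show names and regex are
illustrative, and IntentBuilder comes from adapt/intent.py further down:

    from adapt.engine import IntentDeterminationEngine
    from adapt.intent import IntentBuilder

    engine = IntentDeterminationEngine()
    for show in ["the big bang theory", "the simpsons"]:
        engine.register_entity(show, "TelevisionShow")
    engine.register_regex_entity("on (?P<Channel>.*)")

    parser = IntentBuilder("TelevisionIntent")\
        .require("TelevisionShow")\
        .optionally("Channel")\
        .build()
    engine.register_intent_parser(parser)

    # determine_intent() is a generator; each yielded dict carries
    # 'intent_type', 'confidence', 'target' and the matched attributes.
    for intent in engine.determine_intent("watch the big bang theory"):
        print(intent)
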
diff --git a/adapt/entity_tagger.py b/adapt/entity_tagger.py
new file mode 100644
index 0000000..3112028
--- /dev/null
+++ b/adapt/entity_tagger.py
@@ -0,0 +1,109 @@
+from adapt.tools.text.trie import Trie
+from six.moves import xrange
+
+__author__ = 'seanfitz'
+
+
+class EntityTagger(object):
+    """
+    Known Entity Tagger
+    Given an index of known entities, can efficiently search for those entities within a provided utterance.
+    """
+    def __init__(self, trie, tokenizer, regex_entities=[], max_tokens=20):
+        self.trie = trie
+        self.tokenizer = tokenizer
+        self.max_tokens = max_tokens
+        self.regex_entities = regex_entities
+
+    def _iterate_subsequences(self, tokens):
+        """
+        Using regex entities invokes this function, which significantly impacts adapt's
+        performance: enumerating every contiguous token subsequence is an O(n^2) operation.
+
+        :param tokens:
+
+        :return:
+        """
+        for start_idx in xrange(len(tokens)):
+            for end_idx in xrange(start_idx + 1, len(tokens) + 1):
+                yield ' '.join(tokens[start_idx:end_idx]), start_idx
+
+    def _sort_and_merge_tags(self, tags):
+        decorated = [(tag['start_token'], tag['end_token'], tag) for tag in tags]
+        decorated.sort(key=lambda x: (x[0], x[1]))
+        return [tag for start_token, end_token, tag in decorated]
+
+    def tag(self, utterance, context_trie=None):
+        """
+        Tag known entities within the utterance.
+
+        :param utterance: a string of natural language text
+
+        :param context_trie: optional, a trie containing only entities from context
+            for this request
+
+        :return: dictionary, with the following keys
+
+        match: str - the proper entity matched
+
+        key: str - the string that was matched to the entity
+
+        start_token: int - 0-based index of the first token matched
+
+        end_token: int - 0-based index of the last token matched
+
+        entities: list - a list of entity kinds as strings (Ex: Artist, Location)
+        """
+        tokens = self.tokenizer.tokenize(utterance)
+        entities = []
+        if len(self.regex_entities) > 0:
+            for part, idx in self._iterate_subsequences(tokens):
+                local_trie = Trie()
+                for regex_entity in self.regex_entities:
+                    match = regex_entity.match(part)
+                    groups = match.groupdict() if match else {}
+                    for key in list(groups):
+                        match_str = groups.get(key)
+                        local_trie.insert(match_str, (match_str, key))
+                sub_tagger = EntityTagger(local_trie, self.tokenizer, max_tokens=self.max_tokens)
+                for sub_entity in sub_tagger.tag(part):
+                    sub_entity['start_token'] += idx
+                    sub_entity['end_token'] += idx
+                    for e in sub_entity['entities']:
+                        e['confidence'] = 0.5
+                    entities.append(sub_entity)
+        additional_sort = len(entities) > 0
+
+        context_entities = []
+        for i in xrange(len(tokens)):
+            part = ' '.join(tokens[i:])
+
+            for new_entity in self.trie.gather(part):
+                new_entity['data'] = list(new_entity['data'])
+                entities.append({
+                    'match': new_entity.get('match'),
+                    'key': new_entity.get('key'),
+                    'start_token': i,
+                    'entities': [new_entity],
+                    'end_token': i + len(self.tokenizer.tokenize(new_entity.get('match'))) - 1,
+                    'from_context': False
+                })
+
+            if context_trie:
+                for new_entity in context_trie.gather(part):
+                    new_entity['data'] = list(new_entity['data'])
+                    new_entity['confidence'] *= 2.0  # context entities get double the weight!
+                    context_entities.append({
+                        'match': new_entity.get('match'),
+                        'key': new_entity.get('key'),
+                        'start_token': i,
+                        'entities': [new_entity],
+                        'end_token': i + len(self.tokenizer.tokenize(new_entity.get('match'))) - 1,
+                        'from_context': True
+                    })
+
+        additional_sort = additional_sort or len(entities) > 0
+
+        if additional_sort:
+            entities = self._sort_and_merge_tags(entities + context_entities)
+
+        return entities
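
As a standalone sketch of the tagger above (the trie.insert signature is
inferred from its use in engine.py; the entity is illustrative):

    from adapt.entity_tagger import EntityTagger
    from adapt.tools.text.tokenizer import EnglishTokenizer
    from adapt.tools.text.trie import Trie

    trie = Trie()
    trie.insert("seattle", data=("Seattle", "Location"))
    tagger = EntityTagger(trie, EnglishTokenizer())
    # each tag reports the matched string plus 0-based token positions
    for tag in tagger.tag("fly to seattle"):
        print(tag['match'], tag['start_token'], tag['end_token'])
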
diff --git a/adapt/expander.py b/adapt/expander.py
new file mode 100644
index 0000000..4ad1764
--- /dev/null
+++ b/adapt/expander.py
@@ -0,0 +1,179 @@
+from six.moves import xrange
+
+__author__ = 'seanfitz'
+
+
+class SimpleGraph(object):
+    def __init__(self):
+        self.adjacency_lists = {}
+
+    def add_edge(self, a, b):
+        neighbors_of_a = self.adjacency_lists.get(a)
+        if not neighbors_of_a:
+            neighbors_of_a = set()
+            self.adjacency_lists[a] = neighbors_of_a
+
+        neighbors_of_a.add(b)
+
+        neighbors_of_b = self.adjacency_lists.get(b)
+        if not neighbors_of_b:
+            neighbors_of_b = set()
+            self.adjacency_lists[b] = neighbors_of_b
+
+        neighbors_of_b.add(a)
+
+    def get_neighbors_of(self, a):
+        return self.adjacency_lists.get(a, set())
+
+    def vertex_set(self):
+        return list(self.adjacency_lists)
+
+
+def bronk(r, p, x, graph):
+    if len(p) == 0 and len(x) == 0:
+        yield r
+        return
+    for vertex in p[:]:
+        r_new = r[::]
+        r_new.append(vertex)
+        p_new = [val for val in p if val in graph.get_neighbors_of(vertex)] # p intersects N(vertex)
+        x_new = [val for val in x if val in graph.get_neighbors_of(vertex)] # x intersects N(vertex)
+        for result in bronk(r_new, p_new, x_new, graph):
+            yield result
+        p.remove(vertex)
+        x.append(vertex)
+
+
+def get_cliques(vertices, graph):
+    for clique in bronk([], vertices, [], graph):
+        yield clique
+
+
+def graph_key_from_tag(tag, entity_index):
+    start_token = tag.get('start_token')
+    entity = tag.get('entities', [])[entity_index]
+    return str(start_token) + '-' + entity.get('key') + '-' + str(entity.get('confidence'))
+
+
+class Lattice(object):
+    def __init__(self):
+        self.nodes = []
+
+    def append(self, data):
+        if isinstance(data, list) and len(data) > 0:
+            self.nodes.append(data)
+        else:
+            self.nodes.append([data])
+
+    def traverse(self, index=0):
+        if index < len(self.nodes):
+            for entity in self.nodes[index]:
+                for next_result in self.traverse(index=index+1):
+                    if isinstance(entity, list):
+                        yield entity + next_result
+                    else:
+                        yield [entity] + next_result
+        else:
+            yield []
+
+
+class BronKerboschExpander(object):
+    """
+    BronKerboschExpander
+
+    Given a list of tagged entities (from the existing entity tagger implementation or another), expand out
+    valid parse results.
+
+    A parse result is considered valid if it contains no overlapping spans.
+
+    Since total confidence of a parse result is based on the sum of confidences of the entities, there is no sense
+    in yielding any potential parse results that are a subset/sequence of a larger valid parse result. By comparing
+    this concept to that of maximal cliques (https://en.wikipedia.org/wiki/Clique_problem), we can use well known
+    solutions to the maximal clique problem like the Bron/Kerbosch algorithm (https://en.wikipedia.org/wiki/Bron%E2%80%93Kerbosch_algorithm).
+
+    By considering tagged entities that do not overlap to be "neighbors", BronKerbosch will yield a set of maximal
+    cliques that are also valid parse results.
+    """
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    def _build_graph(self, tags):
+        graph = SimpleGraph()
+        for tag_index in xrange(len(tags)):
+            for entity_index in xrange(len(tags[tag_index].get('entities'))):
+                a_entity_name = graph_key_from_tag(tags[tag_index], entity_index)
+                tokens = self.tokenizer.tokenize(tags[tag_index].get('entities', [])[entity_index].get('match'))
+                for tag in tags[tag_index + 1:]:
+                    start_token = tag.get('start_token')
+                    if start_token >= tags[tag_index].get('start_token') + len(tokens):
+                        for b_entity_index in xrange(len(tag.get('entities'))):
+                            b_entity_name = graph_key_from_tag(tag, b_entity_index)
+                            graph.add_edge(a_entity_name, b_entity_name)
+
+        return graph
+
+    def _sub_expand(self, tags):
+        entities = {}
+        graph = self._build_graph(tags)
+
+        # name entities
+        for tag in tags:
+            for entity_index in xrange(len(tag.get('entities'))):
+                node_name = graph_key_from_tag(tag, entity_index)
+                if node_name not in entities:
+                    entities[node_name] = []
+                entities[node_name] += [
+                    tag.get('entities', [])[entity_index],
+                    tag.get('entities', [])[entity_index].get('confidence'),
+                    tag
+                ]
+
+        for clique in get_cliques(list(entities), graph):
+            result = []
+            for entity_name in clique:
+                start_token = int(entity_name.split("-")[0])
+                old_tag = entities[entity_name][2]
+                tag = {
+                    'start_token': start_token,
+                    'entities': [entities.get(entity_name)[0]],
+                    'confidence': entities.get(entity_name)[1] * old_tag.get('confidence', 1.0),
+                    'end_token': old_tag.get('end_token'),
+                    'match': old_tag.get('entities')[0].get('match'),
+                    'key': old_tag.get('entities')[0].get('key'),
+                    'from_context': old_tag.get('from_context', False)
+                }
+                result.append(tag)
+            result = sorted(result, key=lambda e: e.get('start_token'))
+            yield result
+
+    def expand(self, tags, clique_scoring_func=None):
+        lattice = Lattice()
+        overlapping_spans = []
+
+        def end_token_index():
+            return max([t.get('end_token') for t in overlapping_spans])
+
+        for i in xrange(len(tags)):
+            tag = tags[i]
+
+            if len(overlapping_spans) > 0 and end_token_index() >= tag.get('start_token'):
+                overlapping_spans.append(tag)
+            elif len(overlapping_spans) > 1:
+                cliques = list(self._sub_expand(overlapping_spans))
+                if clique_scoring_func:
+                    cliques = sorted(cliques, key=lambda e: -1 * clique_scoring_func(e))
+                lattice.append(cliques)
+                overlapping_spans = [tag]
+            else:
+                lattice.append(overlapping_spans)
+                overlapping_spans = [tag]
+        if len(overlapping_spans) > 1:
+            cliques = list(self._sub_expand(overlapping_spans))
+                if clique_scoring_func:
+                    cliques = sorted(cliques, key=lambda e: -1 * clique_scoring_func(e))
+            lattice.append(cliques)
+        else:
+            lattice.append(overlapping_spans)
+
+        return lattice.traverse()
+
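The clique machinery above can be exercised on its own; a small sketch with
arbitrary vertex labels:

    from adapt.expander import SimpleGraph, get_cliques

    graph = SimpleGraph()
    for a, b in [('a', 'b'), ('b', 'c'), ('a', 'c'), ('c', 'd')]:
        graph.add_edge(a, b)

    # Bron-Kerbosch yields the maximal cliques: ['a', 'b', 'c'] and ['c', 'd']
    for clique in get_cliques(['a', 'b', 'c', 'd'], graph):
        print(clique)
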
diff --git a/adapt/intent.py b/adapt/intent.py
new file mode 100644
index 0000000..a1f8892
--- /dev/null
+++ b/adapt/intent.py
@@ -0,0 +1,190 @@
+__author__ = 'seanfitz'
+
+CLIENT_ENTITY_NAME = 'Client'
+
+
+def is_entity(tag, entity_name):
+    for entity in tag.get('entities'):
+        for v, t in entity.get('data'):
+            if t.lower() == entity_name.lower():
+                return True
+    return False
+
+
+def find_first_tag(tags, entity_type, after_index=-1):
+    for tag in tags:
+        for entity in tag.get('entities'):
+            for v, t in entity.get('data'):
+                if t.lower() == entity_type.lower() and tag.get('start_token', 0) > after_index:
+                    return tag, v, entity.get('confidence')
+
+    return None, None, None
+
+
+def find_next_tag(tags, end_index=0):
+    for tag in tags:
+        if tag.get('start_token') > end_index:
+            return tag
+    return None
+
+
+def choose_1_from_each(lists):
+    if len(lists) == 0:
+        yield []
+    else:
+        for el in lists[0]:
+            for next_list in choose_1_from_each(lists[1:]):
+                yield [el] + next_list
+
+
+def resolve_one_of(tags, at_least_one):
+    if len(tags) < len(at_least_one):
+        return None
+    for possible_resolution in choose_1_from_each(at_least_one):
+        resolution = {}
+        pr = possible_resolution[:]
+        for entity_type in pr:
+            last_end_index = -1
+            if entity_type in resolution:
+                last_end_index = resolution[entity_type][-1].get('end_token')
+            tag, value, c = find_first_tag(tags, entity_type, after_index=last_end_index)
+            if not tag:
+                break
+            else:
+                if entity_type not in resolution:
+                    resolution[entity_type] = []
+                resolution[entity_type].append(tag)
+        if len(resolution) == len(possible_resolution):
+            return resolution
+
+    return None
+
+
+class Intent(object):
+    def __init__(self, name, requires, at_least_one, optional):
+        self.name = name
+        self.requires = requires
+        self.at_least_one = at_least_one
+        self.optional = optional
+
+    def validate(self, tags, confidence):
+        intent, tags = self.validate_with_tags(tags, confidence)
+        return intent
+
+    def validate_with_tags(self, tags, confidence):
+        result = {'intent_type': self.name}
+        intent_confidence = 0.0
+        local_tags = tags[:]
+        used_tags = []
+
+        for require_type, attribute_name in self.requires:
+            required_tag, canonical_form, tag_confidence = find_first_tag(local_tags, require_type)
+            if not required_tag:
+                result['confidence'] = 0.0
+                return result, []
+
+            result[attribute_name] = canonical_form
+            if required_tag in local_tags:
+                local_tags.remove(required_tag)
+            used_tags.append(required_tag)
+            # TODO: use confidence based on edit distance and context
+            intent_confidence += tag_confidence
+
+        if len(self.at_least_one) > 0:
+            best_resolution = resolve_one_of(tags, self.at_least_one)
+            if not best_resolution:
+                result['confidence'] = 0.0
+                return result, []
+            else:
+                for key in best_resolution:
+                    result[key] = best_resolution[key][0].get('key') # TODO: at least one must support aliases
+                    intent_confidence += 1.0
+                used_tags.append(best_resolution)
+                if best_resolution in local_tags:
+                    local_tags.remove(best_resolution)
+
+        for optional_type, attribute_name in self.optional:
+            optional_tag, canonical_form, conf = find_first_tag(local_tags, optional_type)
+            if not optional_tag or attribute_name in result:
+                continue
+            result[attribute_name] = canonical_form
+            if optional_tag in local_tags:
+                local_tags.remove(optional_tag)
+            used_tags.append(optional_tag)
+            intent_confidence += 1.0
+
+        total_confidence = intent_confidence / len(tags) * confidence
+
+        target_client, canonical_form, tag_confidence = find_first_tag(local_tags, CLIENT_ENTITY_NAME)
+
+        result['target'] = target_client.get('key') if target_client else None
+        result['confidence'] = total_confidence
+
+        return result, used_tags
+
+
+class IntentBuilder(object):
+    """
+    IntentBuilder, used to construct intent parsers.
+    """
+    def __init__(self, intent_name):
+        """
+        Constructor
+
+        :param intent_name: the name of the intents that this parser parses/validates
+
+        :return: an instance of IntentBuilder
+        """
+        self.at_least_one = []
+        self.requires = []
+        self.optional = []
+        self.name = intent_name
+
+    def one_of(self, *args):
+        """
+        The intent parser should require one of the provided entity types to validate this clause.
+
+        :param args: *args notation list of entity names
+
+        :return: self
+        """
+        self.at_least_one.append(args)
+        return self
+
+    def require(self, entity_type, attribute_name=None):
+        """
+        The intent parser should require an entity of the provided type.
+
+        :param entity_type: string, an entity type
+
+        :param attribute_name: string, the name of the attribute on the parsed intent. Defaults to match entity_type.
+
+        :return: self
+        """
+        if not attribute_name:
+            attribute_name = entity_type
+        self.requires += [(entity_type, attribute_name)]
+        return self
+
+    def optionally(self, entity_type, attribute_name=None):
+        """
+        Parsed intents from this parser can optionally include an entity of the provided type.
+
+        :param entity_type: string, an entity type
+
+        :param attribute_name: string, the name of the attribute on the parsed intent. Defaults to match entity_type.
+
+        :return: self
+        """
+        if not attribute_name:
+            attribute_name = entity_type
+        self.optional += [(entity_type, attribute_name)]
+        return self
+
+    def build(self):
+        """
+        Constructs an intent from the builder's specifications.
+
+        :return: an Intent instance.
+        """
+        return Intent(self.name, self.requires, self.at_least_one, self.optional)
\ No newline at end of file
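
resolve_one_of above enumerates candidate resolutions with choose_1_from_each,
a small cartesian-product generator defined in the same file; for illustration:

    from adapt.intent import choose_1_from_each

    # one element from each list, in order:
    # [1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']
    for combo in choose_1_from_each([[1, 2], ['a', 'b']]):
        print(combo)
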
diff --git a/adapt/parser.py b/adapt/parser.py
new file mode 100644
index 0000000..8226e01
--- /dev/null
+++ b/adapt/parser.py
@@ -0,0 +1,77 @@
+import pyee
+import time
+from adapt.expander import BronKerboschExpander
+from adapt.tools.text.trie import Trie
+
+__author__ = 'seanfitz'
+
+
... 386 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/adapt-parser.git


