[Python-modules-commits] [fuzzywuzzy] 02/05: Import fuzzywuzzy_0.11.0.orig.tar.gz

Edward Betts edward at moszumanska.debian.org
Mon Jul 4 10:43:20 UTC 2016


This is an automated email from the git hooks/post-receive script.

edward pushed a commit to branch master
in repository fuzzywuzzy.

commit 2b65cf045c7161b2435bf736eb75fc13dac1ba80
Author: Edward Betts <edward at 4angle.com>
Date:   Mon Jul 4 08:03:21 2016 +0100

    Import fuzzywuzzy_0.11.0.orig.tar.gz
---
 CHANGES.rst                  |  19 +++++++-
 PKG-INFO                     |  20 +++++----
 README                       |  18 ++++----
 README.rst                   |  18 ++++----
 fuzzywuzzy.egg-info/PKG-INFO |  20 +++++----
 fuzzywuzzy.egg-info/pbr.json |   2 +-
 fuzzywuzzy/StringMatcher.py  |   1 +
 fuzzywuzzy/__init__.py       |   2 +-
 fuzzywuzzy/fuzz.py           |  45 ++++++++++---------
 fuzzywuzzy/process.py        | 103 +++++++++++++++++++++++++++++++++----------
 10 files changed, 165 insertions(+), 83 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index d95eb31..a0cada3 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,7 +1,24 @@
 Changelog
 =========
 
-0.10.0 (2016-03-13)
+0.11.0 (2016-06-30)
+-------------------
+
+- Clean-up. [desmaisons_david]
+
+- Improving performance. [desmaisons_david]
+
+- Performance Improvement. [desmaisons_david]
+
+- Fix link to Levenshtein. [Brian J. McGuirk]
+
+- Fix readme links. [Brian J. McGuirk]
+
+- Add license to StringMatcher.py. [Jose Diaz-Gonzalez]
+
+  Closes #113
+
+0.10.0 (2016-03-14)
 -------------------
 
 - Handle None inputs same as empty string (Issue #94) [Nick Miller]
diff --git a/PKG-INFO b/PKG-INFO
index 21a33fc..2596b7c 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: fuzzywuzzy
-Version: 0.10.0
+Version: 0.11.0
 Summary: Fuzzy string matching in python
 Home-page: https://github.com/seatgeek/fuzzywuzzy
 Author: Adam Cohen
@@ -26,19 +26,20 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-Description: |Build Status|
+Description: .. image:: https://travis-ci.org/seatgeek/fuzzywuzzy.svg?branch=master
+            :target: https://travis-ci.org/seatgeek/fuzzywuzzy
         
         FuzzyWuzzy
         ==========
         
-        Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to help calculate differences between sequences in a simple to use package.
+        Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to calculate the differences between sequences in a simple-to-use package.
         
         Requirements
         ============
         
         -  Python 2.4 or higher
         -  difflib
-        -  python-Levenshtein (optional, provides a 4-10x speedup in String
+        -  `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
            Matching)
         
         Installation
@@ -54,13 +55,13 @@ Description: |Build Status|
         
         .. code:: bash
         
-            pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+            pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
         
         Adding to your ``requirements.txt`` file (run ``pip install -r requirements.txt`` afterwards)
         
         .. code:: bash
         
-            git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+            git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
             
         Manually via GIT
         
@@ -85,7 +86,7 @@ Description: |Build Status|
         .. code:: python
         
             >>> fuzz.ratio("this is a test", "this is a test!")
-                96
+                97
         
         Partial Ratio
         ~~~~~~~~~~~~~
@@ -101,7 +102,7 @@ Description: |Build Status|
         .. code:: python
         
             >>> fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
-                90
+                91
             >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
                 100
         
@@ -131,7 +132,8 @@ Description: |Build Status|
         
         Known Ports
         ============
-        Some people are porting FuzzyWuzzy to other languages. Here is one port we know about:
+        
+        FuzzyWuzzy is being ported to other languages too! Here is one port we know about:
         
         -  Java: https://github.com/WantedTechnologies/xpresso/wiki/Approximate-string-comparison-and-pattern-matching-in-Java
         
diff --git a/README b/README
index 0356270..3e35178 100644
--- a/README
+++ b/README
@@ -1,16 +1,17 @@
-|Build Status|
+.. image:: https://travis-ci.org/seatgeek/fuzzywuzzy.svg?branch=master
+    :target: https://travis-ci.org/seatgeek/fuzzywuzzy
 
 FuzzyWuzzy
 ==========
 
-Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to help calculate differences between sequences in a simple to use package.
+Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to calculate the differences between sequences in a simple-to-use package.
 
 Requirements
 ============
 
 -  Python 2.4 or higher
 -  difflib
--  python-Levenshtein (optional, provides a 4-10x speedup in String
+-  `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
    Matching)
 
 Installation
@@ -26,13 +27,13 @@ Using PIP via Github
 
 .. code:: bash
 
-    pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+    pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
 
 Adding to your ``requirements.txt`` file (run ``pip install -r requirements.txt`` afterwards)
 
 .. code:: bash
 
-    git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+    git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
     
 Manually via GIT
 
@@ -57,7 +58,7 @@ Simple Ratio
 .. code:: python
 
     >>> fuzz.ratio("this is a test", "this is a test!")
-        96
+        97
 
 Partial Ratio
 ~~~~~~~~~~~~~
@@ -73,7 +74,7 @@ Token Sort Ratio
 .. code:: python
 
     >>> fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
-        90
+        91
     >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
         100
 
@@ -103,6 +104,7 @@ Process
 
 Known Ports
 ============
-Some people are porting FuzzyWuzzy to other languages. Here is one port we know about:
+
+FuzzyWuzzy is being ported to other languages too! Here is one port we know about:
 
 -  Java: https://github.com/WantedTechnologies/xpresso/wiki/Approximate-string-comparison-and-pattern-matching-in-Java
diff --git a/README.rst b/README.rst
index 0356270..3e35178 100644
--- a/README.rst
+++ b/README.rst
@@ -1,16 +1,17 @@
-|Build Status|
+.. image:: https://travis-ci.org/seatgeek/fuzzywuzzy.svg?branch=master
+    :target: https://travis-ci.org/seatgeek/fuzzywuzzy
 
 FuzzyWuzzy
 ==========
 
-Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to help calculate differences between sequences in a simple to use package.
+Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to calculate the differences between sequences in a simple-to-use package.
 
 Requirements
 ============
 
 -  Python 2.4 or higher
 -  difflib
--  python-Levenshtein (optional, provides a 4-10x speedup in String
+-  `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
    Matching)
 
 Installation
@@ -26,13 +27,13 @@ Using PIP via Github
 
 .. code:: bash
 
-    pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+    pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
 
 Adding to your ``requirements.txt`` file (run ``pip install -r requirements.txt`` afterwards)
 
 .. code:: bash
 
-    git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+    git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
     
 Manually via GIT
 
@@ -57,7 +58,7 @@ Simple Ratio
 .. code:: python
 
     >>> fuzz.ratio("this is a test", "this is a test!")
-        96
+        97
 
 Partial Ratio
 ~~~~~~~~~~~~~
@@ -73,7 +74,7 @@ Token Sort Ratio
 .. code:: python
 
     >>> fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
-        90
+        91
     >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
         100
 
@@ -103,6 +104,7 @@ Process
 
 Known Ports
 ============
-Some people are porting FuzzyWuzzy to other languages. Here is one port we know about:
+
+FuzzyWuzzy is being ported to other languages too! Here is one port we know about:
 
 -  Java: https://github.com/WantedTechnologies/xpresso/wiki/Approximate-string-comparison-and-pattern-matching-in-Java
diff --git a/fuzzywuzzy.egg-info/PKG-INFO b/fuzzywuzzy.egg-info/PKG-INFO
index 21a33fc..2596b7c 100644
--- a/fuzzywuzzy.egg-info/PKG-INFO
+++ b/fuzzywuzzy.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: fuzzywuzzy
-Version: 0.10.0
+Version: 0.11.0
 Summary: Fuzzy string matching in python
 Home-page: https://github.com/seatgeek/fuzzywuzzy
 Author: Adam Cohen
@@ -26,19 +26,20 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-Description: |Build Status|
+Description: .. image:: https://travis-ci.org/seatgeek/fuzzywuzzy.svg?branch=master
+            :target: https://travis-ci.org/seatgeek/fuzzywuzzy
         
         FuzzyWuzzy
         ==========
         
-        Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to help calculate differences between sequences in a simple to use package.
+        Fuzzy string matching like a boss. It uses `Levenshtein Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ to calculate the differences between sequences in a simple-to-use package.
         
         Requirements
         ============
         
         -  Python 2.4 or higher
         -  difflib
-        -  python-Levenshtein (optional, provides a 4-10x speedup in String
+        -  `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
            Matching)
         
         Installation
@@ -54,13 +55,13 @@ Description: |Build Status|
         
         .. code:: bash
         
-            pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+            pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
         
         Adding to your ``requirements.txt`` file (run ``pip install -r requirements.txt`` afterwards)
         
         .. code:: bash
         
-            git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.10.0#egg=fuzzywuzzy
+            git+ssh://git@github.com/seatgeek/fuzzywuzzy.git@0.11.0#egg=fuzzywuzzy
             
         Manually via GIT
         
@@ -85,7 +86,7 @@ Description: |Build Status|
         .. code:: python
         
             >>> fuzz.ratio("this is a test", "this is a test!")
-                96
+                97
         
         Partial Ratio
         ~~~~~~~~~~~~~
@@ -101,7 +102,7 @@ Description: |Build Status|
         .. code:: python
         
             >>> fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
-                90
+                91
             >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
                 100
         
@@ -131,7 +132,8 @@ Description: |Build Status|
         
         Known Ports
         ============
-        Some people are porting FuzzyWuzzy to other languages. Here is one port we know about:
+        
+        FuzzyWuzzy is being ported to other languages too! Here is one port we know about:
         
         -  Java: https://github.com/WantedTechnologies/xpresso/wiki/Approximate-string-comparison-and-pattern-matching-in-Java
         
diff --git a/fuzzywuzzy.egg-info/pbr.json b/fuzzywuzzy.egg-info/pbr.json
index c3e7452..fea72b7 100644
--- a/fuzzywuzzy.egg-info/pbr.json
+++ b/fuzzywuzzy.egg-info/pbr.json
@@ -1 +1 @@
-{"is_release": true, "git_version": "6244ce3"}
\ No newline at end of file
+{"is_release": true, "git_version": "23f4709"}
\ No newline at end of file
diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py
index f27f1ff..d35e075 100644
--- a/fuzzywuzzy/StringMatcher.py
+++ b/fuzzywuzzy/StringMatcher.py
@@ -5,6 +5,7 @@ StringMatcher.py
 
 ported from python-Levenshtein
 [https://github.com/miohtama/python-Levenshtein]
+License available here: https://github.com/miohtama/python-Levenshtein/blob/master/COPYING
 """
 
 from Levenshtein import *
diff --git a/fuzzywuzzy/__init__.py b/fuzzywuzzy/__init__.py
index 4462bb4..a52f1c2 100644
--- a/fuzzywuzzy/__init__.py
+++ b/fuzzywuzzy/__init__.py
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__ = '0.10.0'
+__version__ = '0.11.0'
diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py
index da2bc55..843c72b 100644
--- a/fuzzywuzzy/fuzz.py
+++ b/fuzzywuzzy/fuzz.py
@@ -94,10 +94,11 @@ def partial_ratio(s1, s2):
 # Advanced Scoring Functions #
 ##############################
 
-def _process_and_sort(s, force_ascii):
+def _process_and_sort(s, force_ascii, full_process=True):
     """Return a cleaned string with token sorted."""
     # pull tokens
-    tokens = utils.full_process(s, force_ascii=force_ascii).split()
+    ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
+    tokens = ts.split()
 
     # sort tokens and join
     sorted_string = u" ".join(sorted(tokens))
@@ -109,9 +110,9 @@ def _process_and_sort(s, force_ascii):
 #   sort those tokens and take ratio of resulting joined strings
 #   controls for unordered string elements
 @utils.check_for_none
-def _token_sort(s1, s2, partial=True, force_ascii=True):
-    sorted1 = _process_and_sort(s1, force_ascii)
-    sorted2 = _process_and_sort(s2, force_ascii)
+def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
+    sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
+    sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)
 
     if partial:
         return partial_ratio(sorted1, sorted2)
@@ -119,22 +120,22 @@ def _token_sort(s1, s2, partial=True, force_ascii=True):
         return ratio(sorted1, sorted2)
 
 
-def token_sort_ratio(s1, s2, force_ascii=True):
+def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
     """Return a measure of the sequences' similarity between 0 and 100
     but sorting the token before comparing.
     """
-    return _token_sort(s1, s2, partial=False, force_ascii=force_ascii)
+    return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
 
 
-def partial_token_sort_ratio(s1, s2, force_ascii=True):
+def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
     """Return the ratio of the most similar substring as a number between
     0 and 100 but sorting the token before comparing.
     """
-    return _token_sort(s1, s2, partial=True, force_ascii=force_ascii)
+    return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
 
 
 @utils.check_for_none
-def _token_set(s1, s2, partial=True, force_ascii=True):
+def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
     """Find all alphanumeric tokens in each string...
         - treat them as a set
         - construct two strings of the form:
@@ -142,8 +143,8 @@ def _token_set(s1, s2, partial=True, force_ascii=True):
         - take ratios of those two strings
         - controls for unordered partial matches"""
 
-    p1 = utils.full_process(s1, force_ascii=force_ascii)
-    p2 = utils.full_process(s2, force_ascii=force_ascii)
+    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
+    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
 
     if not utils.validate_string(p1):
         return 0
@@ -151,8 +152,8 @@ def _token_set(s1, s2, partial=True, force_ascii=True):
         return 0
 
     # pull tokens
-    tokens1 = set(utils.full_process(p1).split())
-    tokens2 = set(utils.full_process(p2).split())
+    tokens1 = set(p1.split())
+    tokens2 = set(p2.split())
 
     intersection = tokens1.intersection(tokens2)
     diff1to2 = tokens1.difference(tokens2)
@@ -183,12 +184,12 @@ def _token_set(s1, s2, partial=True, force_ascii=True):
     return max(pairwise)
 
 
-def token_set_ratio(s1, s2, force_ascii=True):
-    return _token_set(s1, s2, partial=False, force_ascii=force_ascii)
+def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
+    return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
 
 
-def partial_token_set_ratio(s1, s2, force_ascii=True):
-    return _token_set(s1, s2, partial=True, force_ascii=force_ascii)
+def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
+    return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
 
 
 ###################
@@ -245,15 +246,15 @@ def WRatio(s1, s2, force_ascii=True):
 
     if try_partial:
         partial = partial_ratio(p1, p2) * partial_scale
-        ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
+        ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
             * unbase_scale * partial_scale
-        ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
+        ptser = partial_token_set_ratio(p1, p2, full_process=False) \
             * unbase_scale * partial_scale
 
         return utils.intr(max(base, partial, ptsor, ptser))
     else:
-        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
-        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
+        tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
+        tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale
 
         return utils.intr(max(base, tsor, tser))
 
diff --git a/fuzzywuzzy/process.py b/fuzzywuzzy/process.py
index 88eaa83..a023692 100644
--- a/fuzzywuzzy/process.py
+++ b/fuzzywuzzy/process.py
@@ -24,17 +24,17 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 """
-import itertools
 
 from . import fuzz
 from . import utils
+import heapq
 
 
-def extract(query, choices, processor=None, scorer=None, limit=5):
+def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutoff=0):
     """Select the best match in a list or dictionary of choices.
 
     Find best matches in a list or dictionary of choices, return a
-    list of tuples containing the match and it's score. If a dictionary
+    generator of tuples containing the match and its score. If a dictionary
     is used, also returns the key for each match.
 
     Arguments:
@@ -58,11 +58,11 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
 
             By default, fuzz.WRatio() is used and expects both query and
             choice to be strings.
-        limit: Optional maximum for the number of elements returned. Defaults
-            to 5.
+        score_cutoff: Optional argument for score threshold. No matches with
+            a score less than this number will be returned. Defaults to 0.
 
     Returns:
-        List of tuples containing the match and its score.
+        Generator of tuples containing the match and its score.
 
         If a list is used for choices, then the result will be 2-tuples.
         If a dictionary is used, then the result will be 3-tuples containing
@@ -74,44 +74,96 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
 
         may return
 
-        [('train', 22, 'bard'), ('man', 0, 'dog')]
+        ('train', 22, 'bard'), ('man', 0, 'dog')
     """
+    def no_process(x):
+        return x
 
     if choices is None:
-        return []
+        raise StopIteration
 
     # Catch generators without lengths
     try:
         if len(choices) == 0:
-            return []
+            raise StopIteration
     except TypeError:
         pass
 
-    # default, turn whatever the choice is into a workable string
-    if not processor:
-        processor = utils.full_process
-
     # default: wratio
     if not scorer:
         scorer = fuzz.WRatio
+        # fuzz.WRatio already processes the strings, so no extra step is needed
+        if not processor:
+            processor = no_process
 
-    sl = []
+    # default, turn whatever the choice is into a workable string
+    if not processor:
+        processor = utils.full_process
 
     try:
         # See if choices is a dictionary-like object.
         for key, choice in choices.items():
             processed = processor(choice)
             score = scorer(query, processed)
-            sl.append((choice, score, key))
+            if score >= score_cutoff:
+                yield (choice, score, key)
     except AttributeError:
         # It's a list; just iterate over it.
         for choice in choices:
             processed = processor(choice)
             score = scorer(query, processed)
-            sl.append((choice, score))
+            if score >= score_cutoff:
+                yield (choice, score)
+
 
-    sl.sort(key=lambda i: i[1], reverse=True)
-    return sl[:limit]
+def extract(query, choices, processor=None, scorer=None, limit=5):
+    """Select the best match in a list or dictionary of choices.
+
+    Find best matches in a list or dictionary of choices, return a
+    list of tuples containing the match and its score. If a dictionary
+    is used, also returns the key for each match.
+
+    Arguments:
+        query: An object representing the thing we want to find.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query. Dictionary arguments of
+            {key: value} pairs will attempt to match the query against
+            each value.
+        processor: Optional function of the form f(a) -> b, where a is an
+            individual choice and b is the choice to be used in matching.
+
+            This can be used to match against, say, the first element of
+            a list:
+
+            lambda x: x[0]
+
+            Defaults to fuzzywuzzy.utils.full_process().
+        scorer: Optional function for scoring matches between the query and
+            an individual processed choice. This should be a function
+            of the form f(query, choice) -> int.
+            By default, fuzz.WRatio() is used and expects both query and
+            choice to be strings.
+        limit: Optional maximum for the number of elements returned. Defaults
+            to 5.
+
+    Returns:
+        List of tuples containing the match and its score.
+
+        If a list is used for choices, then the result will be 2-tuples.
+        If a dictionary is used, then the result will be 3-tuples containing
+        the key for each match.
+
+        For example, searching for 'bird' in the dictionary
+
+        {'bard': 'train', 'dog': 'man'}
+
+        may return
+
+        [('train', 22, 'bard'), ('man', 0, 'dog')]
+    """
+    sl = extractWithoutOrder(query, choices, processor, scorer)
+    return heapq.nlargest(limit, sl, key=lambda i: i[1]) if limit is not None else \
+        sorted(sl, key=lambda i: i[1], reverse=True)
 
 
 def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
@@ -133,8 +185,10 @@ def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, li
 
     Returns: A a list of (match, score) tuples.
     """
-    best_list = extract(query, choices, processor, scorer, limit)
-    return list(itertools.takewhile(lambda x: x[1] >= score_cutoff, best_list))
+
+    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
+    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
+        sorted(best_list, key=lambda i: i[1], reverse=True)
 
 
 def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
@@ -158,10 +212,11 @@ def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
         A tuple containing a single match and its score, if a match
         was found that was above score_cutoff. Otherwise, returns None.
     """
-    best_list = extract(query, choices, processor, scorer, limit=1)
-    if len(best_list) > 0 and best_list[0][1] >= score_cutoff:
-        return best_list[0]
-    return None
+    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
+    try:
+        return max(best_list, key=lambda i: i[1])
+    except ValueError:
+        return None
 
 
 def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/fuzzywuzzy.git



More information about the Python-modules-commits mailing list