[Python-modules-commits] [python-jellyfish] 01/02: Imported Upstream version 0.5.2

Víctor Cuadrado Juan viccuad-guest at moszumanska.debian.org
Mon Feb 29 12:16:46 UTC 2016


This is an automated email from the git hooks/post-receive script.

viccuad-guest pushed a commit to branch master
in repository python-jellyfish.

commit 5162e4d7579d630c2621ad2a93b1ce667fe5dfb8
Author: Víctor Cuadrado Juan <me at viccuad.me>
Date:   Sun Feb 28 20:57:16 2016 +0100

    Imported Upstream version 0.5.2
---
 LICENSE                                 |  25 ++
 MANIFEST.in                             |   1 +
 PKG-INFO                                |  85 ++++++
 README.rst                              |  66 +++++
 cjellyfish/damerau_levenshtein.c        |  72 +++++
 cjellyfish/hamming.c                    |  25 ++
 cjellyfish/jaro.c                       | 145 ++++++++++
 cjellyfish/jellyfish.h                  |  40 +++
 cjellyfish/jellyfishmodule.c            | 436 ++++++++++++++++++++++++++++
 cjellyfish/levenshtein.c                |  47 +++
 cjellyfish/metaphone.c                  | 197 +++++++++++++
 cjellyfish/mra.c                        | 115 ++++++++
 cjellyfish/nysiis.c                     | 191 +++++++++++++
 cjellyfish/porter.c                     | 395 ++++++++++++++++++++++++++
 cjellyfish/soundex.c                    |  70 +++++
 jellyfish.egg-info/PKG-INFO             |  85 ++++++
 jellyfish.egg-info/SOURCES.txt          |  24 ++
 jellyfish.egg-info/dependency_links.txt |   1 +
 jellyfish.egg-info/top_level.txt        |   1 +
 jellyfish/__init__.py                   |   4 +
 jellyfish/_jellyfish.py                 | 489 ++++++++++++++++++++++++++++++++
 jellyfish/compat.py                     |  13 +
 jellyfish/porter.py                     | 218 ++++++++++++++
 jellyfish/test.py                       | 198 +++++++++++++
 setup.cfg                               |   5 +
 setup.py                                | 124 ++++++++
 26 files changed, 3072 insertions(+)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b563a37
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) 2015, James Turk
+Copyright (c) 2015, Sunlight Foundation
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..dffddd0
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include LICENSE *.rst *.py cjellyfish/*.c cjellyfish/*.h
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..8eb43df
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,85 @@
+Metadata-Version: 1.1
+Name: jellyfish
+Version: 0.5.2
+Summary: a library for doing approximate and phonetic matching of strings.
+Home-page: http://github.com/jamesturk/jellyfish
+Author: UNKNOWN
+Author-email: UNKNOWN
+License: UNKNOWN
+Description: =========
+        jellyfish
+        =========
+        
+        .. image:: https://travis-ci.org/jamesturk/jellyfish.svg?branch=master
+            :target: https://travis-ci.org/jamesturk/jellyfish
+        
+        .. image:: https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master
+            :target: https://coveralls.io/r/jamesturk/jellyfish
+        
+        .. image:: https://img.shields.io/pypi/v/jellyfish.svg
+            :target: https://pypi.python.org/pypi/jellyfish
+        
+        .. image:: https://readthedocs.org/projects/jellyfish/badge/?version=latest
+            :target: https://readthedocs.org/projects/jellyfish/?badge=latest
+            :alt: Documentation Status
+        
+        .. image:: https://ci.appveyor.com/api/projects/status/t5o03rqcusxhhe41/branch/master?svg=true
+            :target: https://ci.appveyor.com/project/jamesturk/jellyfish/
+        
+        Jellyfish is a Python library for doing approximate and phonetic matching of strings.
+        
+        Written by James Turk <james.p.turk at gmail.com> and Michael Stephens.
+        
+        See https://github.com/jamesturk/jellyfish/graphs/contributors for contributors.
+        
+        Source is available at http://github.com/jamesturk/jellyfish.
+        
+        Included Algorithms
+        ===================
+        
+        String comparison:
+        
+          * Levenshtein Distance
+          * Damerau-Levenshtein Distance
+          * Jaro Distance
+          * Jaro-Winkler Distance
+          * Match Rating Approach Comparison
+          * Hamming Distance
+        
+        Phonetic encoding:
+        
+          * American Soundex
+          * Metaphone
+          * NYSIIS (New York State Identification and Intelligence System)
+          * Match Rating Codex
+        
+        Example Usage
+        =============
+        
+        >>> import jellyfish
+        >>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
+        2
+        >>> jellyfish.jaro_distance('jellyfish', 'smellyfish')
+        0.89629629629629637
+        >>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
+        1
+        
+        >>> jellyfish.metaphone('Jellyfish')
+        'JLFX'
+        >>> jellyfish.soundex('Jellyfish')
+        'J412'
+        >>> jellyfish.nysiis('Jellyfish')
+        'JALYF'
+        >>> jellyfish.match_rating_codex('Jellyfish')
+        'JLLFSH'
+        
+Platform: any
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Topic :: Text Processing :: Linguistic
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..931b3b3
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,66 @@
+=========
+jellyfish
+=========
+
+.. image:: https://travis-ci.org/jamesturk/jellyfish.svg?branch=master
+    :target: https://travis-ci.org/jamesturk/jellyfish
+
+.. image:: https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master
+    :target: https://coveralls.io/r/jamesturk/jellyfish
+
+.. image:: https://img.shields.io/pypi/v/jellyfish.svg
+    :target: https://pypi.python.org/pypi/jellyfish
+
+.. image:: https://readthedocs.org/projects/jellyfish/badge/?version=latest
+    :target: https://readthedocs.org/projects/jellyfish/?badge=latest
+    :alt: Documentation Status
+
+.. image:: https://ci.appveyor.com/api/projects/status/t5o03rqcusxhhe41/branch/master?svg=true
+    :target: https://ci.appveyor.com/project/jamesturk/jellyfish/
+
+Jellyfish is a Python library for doing approximate and phonetic matching of strings.
+
+Written by James Turk <james.p.turk at gmail.com> and Michael Stephens.
+
+See https://github.com/jamesturk/jellyfish/graphs/contributors for contributors.
+
+Source is available at http://github.com/jamesturk/jellyfish.
+
+Included Algorithms
+===================
+
+String comparison:
+
+  * Levenshtein Distance
+  * Damerau-Levenshtein Distance
+  * Jaro Distance
+  * Jaro-Winkler Distance
+  * Match Rating Approach Comparison
+  * Hamming Distance
+
+Phonetic encoding:
+
+  * American Soundex
+  * Metaphone
+  * NYSIIS (New York State Identification and Intelligence System)
+  * Match Rating Codex
+
+Example Usage
+=============
+
+>>> import jellyfish
+>>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
+2
+>>> jellyfish.jaro_distance('jellyfish', 'smellyfish')
+0.89629629629629637
+>>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
+1
+
+>>> jellyfish.metaphone('Jellyfish')
+'JLFX'
+>>> jellyfish.soundex('Jellyfish')
+'J412'
+>>> jellyfish.nysiis('Jellyfish')
+'JALYF'
+>>> jellyfish.match_rating_codex('Jellyfish')
+'JLLFSH'
diff --git a/cjellyfish/damerau_levenshtein.c b/cjellyfish/damerau_levenshtein.c
new file mode 100644
index 0000000..a31a246
--- /dev/null
+++ b/cjellyfish/damerau_levenshtein.c
@@ -0,0 +1,72 @@
+#include "jellyfish.h"
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+
+
+int damerau_levenshtein_distance(const JFISH_UNICODE *s1, const JFISH_UNICODE *s2, size_t len1, size_t len2)
+{
+    size_t infinite = len1 + len2;
+    size_t cols = len2 + 2;
+
+    size_t i, j, i1, j1;
+    size_t db;
+    size_t d1, d2, d3, d4, result;
+    unsigned short cost;
+
+    size_t *dist = NULL;
+
+    size_t *da = calloc(256, sizeof(size_t));
+    if (!da) {
+        return -1;
+    }
+
+    dist = malloc((len1 + 2) * cols * sizeof(size_t));
+    if (!dist) {
+        free(da);
+        return -1;
+    }
+
+    dist[0] = infinite;
+
+    for (i = 0; i <= len1; i++) {
+        dist[((i + 1) * cols) + 0] = infinite;
+        dist[((i + 1) * cols) + 1] = i;
+    }
+
+    for (i = 0; i <= len2; i++) {
+        dist[i + 1] = infinite;       // 0*cols + row
+        dist[cols + i + 1] = i;       // 1*cols + row
+    }
+
+    for (i = 1; i <= len1; i++) {
+        db = 0;
+        for (j = 1; j <= len2; j++) {
+            i1 = da[(JFISH_UNICODE)s2[j-1]];
+            j1 = db;
+
+            if (s1[i - 1] == s2[j - 1]) {
+                cost = 0;
+                db = j;
+            } else {
+                cost = 1;
+            }
+
+            d1 = dist[(i * cols) + j] + cost;
+            d2 = dist[((i + 1) * cols) + j] + 1;
+            d3 = dist[(i * cols) + j + 1] + 1;
+            d4 = dist[(i1 * cols) + j1] + (i - i1 - 1) + 1 + (j - j1 - 1);
+
+            dist[((i+1)*cols) + j + 1] = MIN(MIN(d1, d2), MIN(d3, d4));
+        }
+
+        da[(JFISH_UNICODE)s1[i-1]] = i;
+    }
+
+    result = dist[((len1+1) * cols) + len2 + 1];
+
+    free(dist);
+    free(da);
+
+    return result;
+}
diff --git a/cjellyfish/hamming.c b/cjellyfish/hamming.c
new file mode 100644
index 0000000..d48540e
--- /dev/null
+++ b/cjellyfish/hamming.c
@@ -0,0 +1,25 @@
+#include "jellyfish.h"
+#include <ctype.h>
+
+size_t hamming_distance(const Py_UNICODE *s1, int len1,
+                        const Py_UNICODE *s2, int len2) {
+    unsigned distance = 0;
+    int i1 = 0;
+    int i2 = 0;
+
+    for (; i1 < len1 && i2 < len2; i1++, i2++, s1++, s2++) {
+        if (*s1 != *s2) {
+            distance++;
+        }
+    }
+
+    for ( ; i1 < len1; i1++, s1++) {
+        distance++;
+    }
+
+    for ( ; i2 < len2; i2++, s2++) {
+        distance++;
+    }
+
+    return distance;
+}
diff --git a/cjellyfish/jaro.c b/cjellyfish/jaro.c
new file mode 100644
index 0000000..7f02928
--- /dev/null
+++ b/cjellyfish/jaro.c
@@ -0,0 +1,145 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "jellyfish.h"
+
+#define NOTNUM(c)   ((c>57) || (c<48))
+
+/* borrowed heavily from strcmp95.c
+ *    http://www.census.gov/geo/msb/stand/strcmp.c
+ */
+double _jaro_winkler(const JFISH_UNICODE *ying, int ying_length,
+                     const JFISH_UNICODE *yang, int yang_length,
+                     int long_tolerance, int winklerize)
+{
+    /* Arguments:
+
+       ying
+       yang
+         pointers to the 2 strings to be compared.
+
+       long_tolerance
+         Increase the probability of a match when the number of matched
+         characters is large.  This option allows for a little more
+         tolerance when the strings are large.  It is not an appropriate
+         test when comparing fixed length fields such as phone and
+         social security numbers.
+    */
+    JFISH_UNICODE *ying_flag=0, *yang_flag=0;
+
+    double weight;
+
+    long min_len;
+    long search_range;
+    long lowlim, hilim;
+    long trans_count, common_chars;
+
+    int i, j, k;
+
+    // ensure that neither string is blank
+    if (!ying_length || !yang_length) return 0;
+
+    search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;
+
+    // Blank out the flags
+    ying_flag = calloc((ying_length + 1), sizeof(JFISH_UNICODE));
+    if (!ying_flag) {
+        return -100;
+    }
+
+    yang_flag = calloc((yang_length + 1), sizeof(JFISH_UNICODE));
+    if (!yang_flag) {
+        free(ying_flag);
+        return -100;
+    }
+
+    search_range = (search_range/2) - 1;
+    if (search_range < 0) search_range = 0;
+
+
+    // Looking only within the search range, count and flag the matched pairs.
+    common_chars = 0;
+    for (i = 0; i < ying_length; i++) {
+        lowlim = (i >= search_range) ? i - search_range : 0;
+        hilim = (i + search_range <= yang_length-1) ? (i + search_range) : yang_length-1;
+        for (j = lowlim; j <= hilim; j++)  {
+            if (!yang_flag[j] && yang[j] == ying[i]) {
+                yang_flag[j] = 1;
+                ying_flag[i] = 1;
+                common_chars++;
+                break;
+            }
+        }
+    }
+
+    // If no characters in common - return
+    if (!common_chars) {
+        free(ying_flag);
+        free(yang_flag);
+        return 0;
+    }
+
+    // Count the number of transpositions
+    k = trans_count = 0;
+    for (i = 0; i < ying_length; i++) {
+        if (ying_flag[i]) {
+            for (j = k; j < yang_length; j++) {
+                if (yang_flag[j]) {
+                    k = j + 1;
+                    break;
+                }
+            }
+            if (ying[i] != yang[j]) {
+                trans_count++;
+            }
+        }
+    }
+    trans_count /= 2;
+
+    // adjust for similarities in nonmatched characters
+
+    // Main weight computation.
+    weight= common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
+        + ((double) (common_chars - trans_count)) / ((double) common_chars);
+    weight /=  3.0;
+
+    // Continue to boost the weight if the strings are similar
+    if (winklerize && weight > 0.7 && ying_length > 3 && yang_length > 3) {
+
+        // Adjust for having up to the first 4 characters in common
+        j = (min_len >= 4) ? 4 : min_len;
+        for (i=0; ((i<j) && (ying[i] == yang[i]) && (NOTNUM(ying[i]))); i++);
+        if (i) {
+            weight += i * 0.1 * (1.0 - weight);
+        }
+
+        /* Optionally adjust for long strings. */
+        /* After agreeing beginning chars, at least two more must agree and
+           the agreeing characters must be > .5 of remaining characters.
+        */
+        if ((long_tolerance) && (min_len>4) && (common_chars>i+1) && (2*common_chars>=min_len+i)) {
+            if (NOTNUM(ying[0])) {
+                weight += (double) (1.0-weight) *
+                    ((double) (common_chars-i-1) / ((double) (ying_length+yang_length-i*2+2)));
+            }
+        }
+    }
+
+    free(ying_flag);
+    free(yang_flag);
+    return weight;
+}
+
+
+double jaro_winkler(const JFISH_UNICODE *ying, int ying_len,
+        const JFISH_UNICODE *yang, int yang_len,
+        int long_tolerance)
+{
+    return _jaro_winkler(ying, ying_len, yang, yang_len, long_tolerance, 1);
+}
+
+double jaro_distance(const JFISH_UNICODE *ying, int ying_len, const JFISH_UNICODE *yang, int yang_len)
+{
+    return _jaro_winkler(ying, ying_len, yang, yang_len, 0, 0);
+}
diff --git a/cjellyfish/jellyfish.h b/cjellyfish/jellyfish.h
new file mode 100644
index 0000000..3d50d14
--- /dev/null
+++ b/cjellyfish/jellyfish.h
@@ -0,0 +1,40 @@
+#ifndef _JELLYFISH_H_
+#define _JELLYFISH_H_
+
+#include <stdlib.h>
+
+#if CJELLYFISH_PYTHON
+#include <Python.h>
+#define JFISH_UNICODE Py_UNICODE
+#endif
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+double jaro_winkler(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2, int long_tolerance);
+double jaro_distance(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2);
+
+size_t hamming_distance(const JFISH_UNICODE *str1, int len1,
+        const JFISH_UNICODE *str2, int len2);
+
+int levenshtein_distance(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2);
+
+int damerau_levenshtein_distance(const JFISH_UNICODE *str1, const JFISH_UNICODE *str2,
+        size_t len1, size_t len2);
+
+char* soundex(const char *str);
+
+char* metaphone(const char *str);
+
+JFISH_UNICODE *nysiis(const JFISH_UNICODE *str, int len);
+
+JFISH_UNICODE* match_rating_codex(const JFISH_UNICODE *str, size_t len);
+int match_rating_comparison(const JFISH_UNICODE *str1, size_t len1, const JFISH_UNICODE *str2, size_t len2);
+
+struct stemmer;
+extern struct stemmer * create_stemmer(void);
+extern void free_stemmer(struct stemmer * z);
+extern int stem(struct stemmer * z, JFISH_UNICODE * b, int k);
+
+#endif
diff --git a/cjellyfish/jellyfishmodule.c b/cjellyfish/jellyfishmodule.c
new file mode 100644
index 0000000..ae6d560
--- /dev/null
+++ b/cjellyfish/jellyfishmodule.c
@@ -0,0 +1,436 @@
+#include <Python.h>
+#include <math.h>
+#include "jellyfish.h"
+
+struct jellyfish_state {
+    PyObject *unicodedata_normalize;
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m))
+#else
+#define GETSTATE(m) (&_state)
+static struct jellyfish_state _state;
+#endif
+
+#ifdef _MSC_VER
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+#define UTF8_BYTES(s) (PyBytes_AS_STRING(s))
+#define NO_BYTES_ERR_STR "expected str, got bytes"
+#else
+#define UTF8_BYTES(s) (PyString_AS_STRING(s))
+#define NO_BYTES_ERR_STR "expected unicode, got str"
+#endif
+
+
+/* Returns a new reference to a PyString (python < 3) or
+ * PyBytes (python >= 3.0).
+ *
+ * If passed a PyUnicode, the returned object will be NFKD-normalized UTF-8.
+ * Any other type sets a TypeError (NO_BYTES_ERR_STR) and returns NULL.
+ */
+static INLINE PyObject* normalize(PyObject *mod, PyObject *pystr) {
+    PyObject *unicodedata_normalize;
+    PyObject *normalized;
+    PyObject *utf8;
+
+    if (PyUnicode_Check(pystr)) {
+        unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize;
+        normalized = PyObject_CallFunction(unicodedata_normalize,
+                                           "sO", "NFKD", pystr);
+        if (!normalized) {
+            return NULL;
+        }
+        utf8 = PyUnicode_AsUTF8String(normalized);
+        Py_DECREF(normalized);
+        return utf8;
+    }
+
+    PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+    return NULL;
+}
+
+static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args, PyObject *kw)
+{
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    double result;
+    int long_tolerance = 0;
+    static char *keywords[] = {"s1", "s2", "long_tolerance", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kw, "u#u#|i", keywords, &s1, &len1, &s2, &len2, &long_tolerance)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = jaro_winkler(s1, len1, s2, len2, long_tolerance);
+    // jaro returns a big negative number on error, don't use
+    // 0 here in case there's floating point inaccuracy
+    // .. used to use NaN but different compilers (*cough*MSVC*cough)
+    // handle it really poorly
+    if (result < -1) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("d", result);
+}
+
+static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args)
+{
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    double result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = jaro_distance(s1, len1, s2, len2);
+    // see earlier note about jaro_distance return value
+    if (result < -1) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("d", result);
+}
+
+static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args)
+{
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    unsigned result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = hamming_distance(s1, len1, s2, len2);
+
+    return Py_BuildValue("I", result);
+}
+
+static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
+{
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    int result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = levenshtein_distance(s1, len1, s2, len2);
+    if (result == -1) {
+        // levenshtein_distance only returns failure code (-1) on
+        // failed malloc
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("i", result);
+}
+
+
+static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
+                                                        PyObject *args)
+{
+    Py_UNICODE *s1, *s2;
+    int len1, len2;
+    int result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = damerau_levenshtein_distance(s1, s2, len1, len2);
+    if (result == -1) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("i", result);
+}
+
+static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
+{
+    PyObject *pystr;
+    PyObject *normalized;
+    PyObject* ret;
+    char *result;
+
+    if (!PyArg_ParseTuple(args, "O", &pystr)) {
+        return NULL;
+    }
+
+    normalized = normalize(self, pystr);
+    if (!normalized) {
+        return NULL;
+    }
+
+    result = soundex(UTF8_BYTES(normalized));
+    Py_DECREF(normalized);
+
+    if (!result) {
+        // soundex only fails on bad malloc
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("s", result);
+    free(result);
+
+    return ret;
+}
+
+static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
+{
+    PyObject *pystr;
+    PyObject *normalized;
+    PyObject *ret;
+    char *result;
+
+    if (!PyArg_ParseTuple(args, "O", &pystr)) {
+        return NULL;
+    }
+
+    normalized = normalize(self, pystr);
+    if (!normalized) {
+        return NULL;
+    }
+
+    result = metaphone((const char*)UTF8_BYTES(normalized));
+    Py_DECREF(normalized);
+
+    if (!result) {
+        // metaphone only fails on bad malloc
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("s", result);
+    free(result);
+
+    return ret;
+}
+
+static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
+{
+    const Py_UNICODE *str;
+    int len;
+    Py_UNICODE *result;
+    PyObject *ret;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = match_rating_codex(str, len);
+    if (!result) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("u", result);
+    free(result);
+
+    return ret;
+}
+
+static PyObject* jellyfish_match_rating_comparison(PyObject *self,
+                                                   PyObject *args)
+{
+    const Py_UNICODE *str1, *str2;
+    int len1, len2;
+    int result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &str1, &len1, &str2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = match_rating_comparison(str1, len1, str2, len2);
+
+    if (result == -1) {
+        Py_RETURN_NONE;
+    } else if (result) {
+        Py_RETURN_TRUE;
+    } else {
+        Py_RETURN_FALSE;
+    }
+}
+
+static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
+{
+    const Py_UNICODE *str;
+    Py_UNICODE *result;
+    int len;
+    PyObject *ret;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    result = nysiis(str, len);
+    if (!result) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("u", result);
+    free(result);
+
+    return ret;
+}
+
+static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
+{
+    const Py_UNICODE *str;
+    int len;
+    Py_UNICODE *result;
+    PyObject *ret;
+    struct stemmer *z;
+    int end;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);
+        return NULL;
+    }
+
+    z = create_stemmer();
+    if (!z) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    result = malloc((len+1) * sizeof(Py_UNICODE));
+    if (!result) {
+        free_stemmer(z);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    memcpy(result, str, len * sizeof(Py_UNICODE));
+
+    end = stem(z, result, len - 1);
+    result[end + 1] = '\0';
+
+    ret = Py_BuildValue("u", result);
+
+    free(result);
+    free_stemmer(z);
+
+    return ret;
+}
+
+static PyMethodDef jellyfish_methods[] = {
+    {"jaro_winkler", (PyCFunction)jellyfish_jaro_winkler, METH_VARARGS|METH_KEYWORDS,
+     "jaro_winkler(string1, string2, long_tolerance)\n\n"
+     "Do a Jaro-Winkler string comparison between string1 and string2."},
+
+    {"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
+     "jaro_distance(string1, string2)\n\n"
+     "Get a Jaro string distance metric for string1 and string2."},
+
+    {"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
+     "hamming_distance(string1, string2)\n\n"
+     "Compute the Hamming distance between string1 and string2."},
+
+    {"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
+     "levenshtein_distance(string1, string2)\n\n"
+     "Compute the Levenshtein distance between string1 and string2."},
+
+    {"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
+     METH_VARARGS,
+     "damerau_levenshtein_distance(string1, string2)\n\n"
+     "Compute the Damerau-Levenshtein distance between string1 and string2."},
+
+    {"soundex", jellyfish_soundex, METH_VARARGS,
+     "soundex(string)\n\n"
+     "Calculate the soundex code for a given name."},
+
+    {"metaphone", jellyfish_metaphone, METH_VARARGS,
+     "metaphone(string)\n\n"
+     "Calculate the metaphone representation of a given string."},
+
+    {"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
+     "match_rating_codex(string)\n\n"
+     "Calculate the Match Rating Approach representation of a given string."},
+
+    {"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
+     "match_rating_comparison(string, string)\n\n"
+     "Compute the Match Rating Approach similarity between string1 and"
+     "string2."},
+
+    {"nysiis", jellyfish_nysiis, METH_VARARGS,
+     "nysiis(string)\n\n"
+     "Compute the NYSIIS (New York State Identification and Intelligence\n"
+     "System) code for a string."},
+
+    {"porter_stem", jellyfish_porter_stem, METH_VARARGS,
+     "porter_stem(string)\n\n"
+     "Return the result of running the Porter stemming algorithm on "
+     "a single-word string."},
+
+    {NULL, NULL, 0, NULL}
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define INITERROR return NULL
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "jellyfish.cjellyfish",
+    NULL,
+    sizeof(struct jellyfish_state),
+    jellyfish_methods,
+    NULL,
+    NULL,
+    NULL,
+    NULL
+};
+
+PyObject* PyInit_cjellyfish(void)
+#else
+
+#define INITERROR return
+
+PyMODINIT_FUNC initcjellyfish(void)
+#endif
+{
+    PyObject *unicodedata;
+
+#if PY_MAJOR_VERSION >= 3
+    PyObject *module = PyModule_Create(&moduledef);
+#else
+    PyObject *module = Py_InitModule("jellyfish.cjellyfish", jellyfish_methods);
+#endif
+
+    if (module == NULL) {
+        INITERROR;
+    }
+
+    unicodedata = PyImport_ImportModule("unicodedata");
+    if (!unicodedata) {
+        INITERROR;
+    }
+
+    GETSTATE(module)->unicodedata_normalize =
+        PyObject_GetAttrString(unicodedata, "normalize");
+    Py_DECREF(unicodedata);
+
+#if PY_MAJOR_VERSION >= 3
+    return module;
+#endif
+}
diff --git a/cjellyfish/levenshtein.c b/cjellyfish/levenshtein.c
new file mode 100644
index 0000000..0218df2
--- /dev/null
+++ b/cjellyfish/levenshtein.c
@@ -0,0 +1,47 @@
+#include "jellyfish.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int levenshtein_distance(const JFISH_UNICODE *s1, int s1_len, const JFISH_UNICODE *s2, int s2_len)
+{
+    size_t rows = s1_len + 1;
+    size_t cols = s2_len + 1;
+    size_t i, j;
+
... 2263 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-jellyfish.git



More information about the Python-modules-commits mailing list