[Python-modules-commits] [python-jellyfish] 01/06: import python-jellyfish_0.5.6.orig.tar.gz
Diego M. Rodriguez
diegom-guest at moszumanska.debian.org
Thu Sep 15 15:33:33 UTC 2016
This is an automated email from the git hooks/post-receive script.
diegom-guest pushed a commit to branch master
in repository python-jellyfish.
commit ea4a1ce384845f8bf6cec0ad6db58ec01b975c96
Author: Diego M. Rodriguez <diego.plan9 at gmail.com>
Date: Thu Sep 15 16:15:29 2016 +0200
import python-jellyfish_0.5.6.orig.tar.gz
---
LICENSE | 25 +
MANIFEST.in | 2 +
PKG-INFO | 85 +
README.rst | 66 +
cjellyfish/damerau_levenshtein.c | 86 +
cjellyfish/hamming.c | 25 +
cjellyfish/jaro.c | 145 +
cjellyfish/jellyfish.h | 40 +
cjellyfish/jellyfishmodule.c | 441 +
cjellyfish/levenshtein.c | 47 +
cjellyfish/metaphone.c | 197 +
cjellyfish/mra.c | 115 +
cjellyfish/nysiis.c | 191 +
cjellyfish/porter.c | 395 +
cjellyfish/soundex.c | 70 +
docs/Makefile | 177 +
docs/changelog.rst | 91 +
docs/comparison.rst | 77 +
docs/conf.py | 259 +
docs/index.rst | 33 +
docs/phonetic.rst | 62 +
docs/stemming.rst | 15 +
jellyfish.egg-info/PKG-INFO | 85 +
jellyfish.egg-info/SOURCES.txt | 43 +
jellyfish.egg-info/dependency_links.txt | 1 +
jellyfish.egg-info/top_level.txt | 1 +
jellyfish/__init__.py | 4 +
jellyfish/_jellyfish.py | 488 +
jellyfish/compat.py | 13 +
jellyfish/porter.py | 218 +
jellyfish/test.py | 213 +
setup.cfg | 5 +
setup.py | 124 +
testdata/README.md | 1 +
testdata/damerau_levenshtein.csv | 9 +
testdata/hamming.csv | 8 +
testdata/jaro_distance.csv | 4 +
testdata/jaro_winkler.csv | 7 +
testdata/levenshtein.csv | 6 +
testdata/match_rating_codex.csv | 9 +
testdata/match_rating_comparison.csv | 6 +
testdata/metaphone.csv | 28 +
testdata/nysiis.csv | 33 +
testdata/porter.csv | 23531 ++++++++++++++++++++++++++++++
testdata/soundex.csv | 9 +
45 files changed, 27490 insertions(+)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b563a37
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) 2015, James Turk
+Copyright (c) 2015, Sunlight Foundation
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..c7c6f27
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include LICENSE *.rst *.py cjellyfish/*.c cjellyfish/*.h docs/* testdata/*
+global-exclude .git
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..cf4b132
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,85 @@
+Metadata-Version: 1.1
+Name: jellyfish
+Version: 0.5.6
+Summary: a library for doing approximate and phonetic matching of strings.
+Home-page: http://github.com/jamesturk/jellyfish
+Author: UNKNOWN
+Author-email: UNKNOWN
+License: UNKNOWN
+Description: =========
+ jellyfish
+ =========
+
+ .. image:: https://travis-ci.org/jamesturk/jellyfish.svg?branch=master
+ :target: https://travis-ci.org/jamesturk/jellyfish
+
+ .. image:: https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master
+ :target: https://coveralls.io/r/jamesturk/jellyfish
+
+ .. image:: https://img.shields.io/pypi/v/jellyfish.svg
+ :target: https://pypi.python.org/pypi/jellyfish
+
+ .. image:: https://readthedocs.org/projects/jellyfish/badge/?version=latest
+ :target: https://readthedocs.org/projects/jellyfish/?badge=latest
+ :alt: Documentation Status
+
+ .. image:: https://ci.appveyor.com/api/projects/status/t5o03rqcusxhhe41/branch/master?svg=true
+ :target: https://ci.appveyor.com/project/jamesturk/jellyfish/
+
+ Jellyfish is a python library for doing approximate and phonetic matching of strings.
+
+ Written by James Turk <james.p.turk at gmail.com> and Michael Stephens.
+
+ See https://github.com/jamesturk/jellyfish/graphs/contributors for contributors.
+
+ Source is available at http://github.com/jamesturk/jellyfish.
+
+ Included Algorithms
+ ===================
+
+ String comparison:
+
+ * Levenshtein Distance
+ * Damerau-Levenshtein Distance
+ * Jaro Distance
+ * Jaro-Winkler Distance
+ * Match Rating Approach Comparison
+ * Hamming Distance
+
+ Phonetic encoding:
+
+ * American Soundex
+ * Metaphone
+ * NYSIIS (New York State Identification and Intelligence System)
+ * Match Rating Codex
+
+ Example Usage
+ =============
+
+ >>> import jellyfish
+ >>> jellyfish.levenshtein_distance(u'jellyfish', u'smellyfish')
+ 2
+ >>> jellyfish.jaro_distance(u'jellyfish', u'smellyfish')
+ 0.89629629629629637
+ >>> jellyfish.damerau_levenshtein_distance(u'jellyfish', u'jellyfihs')
+ 1
+
+ >>> jellyfish.metaphone(u'Jellyfish')
+ 'JLFX'
+ >>> jellyfish.soundex(u'Jellyfish')
+ 'J412'
+ >>> jellyfish.nysiis(u'Jellyfish')
+ 'JALYF'
+ >>> jellyfish.match_rating_codex(u'Jellyfish')
+ 'JLLFSH'
+
+Platform: any
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Topic :: Text Processing :: Linguistic
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..41ffcd9
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,66 @@
+=========
+jellyfish
+=========
+
+.. image:: https://travis-ci.org/jamesturk/jellyfish.svg?branch=master
+ :target: https://travis-ci.org/jamesturk/jellyfish
+
+.. image:: https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master
+ :target: https://coveralls.io/r/jamesturk/jellyfish
+
+.. image:: https://img.shields.io/pypi/v/jellyfish.svg
+ :target: https://pypi.python.org/pypi/jellyfish
+
+.. image:: https://readthedocs.org/projects/jellyfish/badge/?version=latest
+ :target: https://readthedocs.org/projects/jellyfish/?badge=latest
+ :alt: Documentation Status
+
+.. image:: https://ci.appveyor.com/api/projects/status/t5o03rqcusxhhe41/branch/master?svg=true
+ :target: https://ci.appveyor.com/project/jamesturk/jellyfish/
+
+Jellyfish is a python library for doing approximate and phonetic matching of strings.
+
+Written by James Turk <james.p.turk at gmail.com> and Michael Stephens.
+
+See https://github.com/jamesturk/jellyfish/graphs/contributors for contributors.
+
+Source is available at http://github.com/jamesturk/jellyfish.
+
+Included Algorithms
+===================
+
+String comparison:
+
+ * Levenshtein Distance
+ * Damerau-Levenshtein Distance
+ * Jaro Distance
+ * Jaro-Winkler Distance
+ * Match Rating Approach Comparison
+ * Hamming Distance
+
+Phonetic encoding:
+
+ * American Soundex
+ * Metaphone
+ * NYSIIS (New York State Identification and Intelligence System)
+ * Match Rating Codex
+
+Example Usage
+=============
+
+>>> import jellyfish
+>>> jellyfish.levenshtein_distance(u'jellyfish', u'smellyfish')
+2
+>>> jellyfish.jaro_distance(u'jellyfish', u'smellyfish')
+0.89629629629629637
+>>> jellyfish.damerau_levenshtein_distance(u'jellyfish', u'jellyfihs')
+1
+
+>>> jellyfish.metaphone(u'Jellyfish')
+'JLFX'
+>>> jellyfish.soundex(u'Jellyfish')
+'J412'
+>>> jellyfish.nysiis(u'Jellyfish')
+'JALYF'
+>>> jellyfish.match_rating_codex(u'Jellyfish')
+'JLLFSH'
diff --git a/cjellyfish/damerau_levenshtein.c b/cjellyfish/damerau_levenshtein.c
new file mode 100644
index 0000000..43f5a43
--- /dev/null
+++ b/cjellyfish/damerau_levenshtein.c
@@ -0,0 +1,86 @@
+#include "jellyfish.h"
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+
+
+int damerau_levenshtein_distance(const JFISH_UNICODE *s1, const JFISH_UNICODE *s2, size_t len1, size_t len2)
+{ // Fills a (len1+2) x (len2+2) table and returns its last cell; returns -1 on allocation failure, -2 when a character value >= 256 is met while filling.
+ size_t infinite = len1 + len2; // sentinel written into the outermost border cells
+ size_t cols = len2 + 2; // row stride of the flattened table
+
+ size_t i, j, i1, j1;
+ size_t db; // last column j of the current row where s1[i-1] == s2[j-1] (0 = none yet)
+ size_t d1, d2, d3, d4, result;
+ size_t da_idx;
+ unsigned short cost;
+
+ size_t *dist = NULL;
+
+ const size_t len_da = 256; // da is indexed by character value, so only values 0..255 are accepted
+ size_t *da = calloc(len_da, sizeof(size_t)); // da[c]: last completed row i with s1[i-1] == c; 0 = not seen
+ if (!da) {
+ return -1;
+ }
+
+ dist = malloc((len1 + 2) * cols * sizeof(size_t)); // NOTE(review): the size product is not checked for overflow
+ if (!dist) {
+ free(da);
+ return -1;
+ }
+
+ dist[0] = infinite;
+
+ for (i = 0; i <= len1; i++) {
+ dist[((i + 1) * cols) + 0] = infinite; // column 0 is the sentinel border
+ dist[((i + 1) * cols) + 1] = i; // column 1 holds 0..len1
+ }
+
+ for (i = 0; i <= len2; i++) {
+ dist[i + 1] = infinite; // row 0 (index 0*cols + i + 1) is the sentinel border
+ dist[cols + i + 1] = i; // row 1 (index 1*cols + i + 1) holds 0..len2
+ }
+
+ for (i = 1; i <= len1; i++) {
+ db = 0;
+ for (j = 1; j <= len2; j++) {
+ da_idx = (JFISH_UNICODE)s2[j-1];
+ if (da_idx >= len_da) { // character outside the da table: give up with -2
+ free(dist);
+ free(da);
+ return -2;
+ }
+ i1 = da[da_idx]; // last earlier row of s1 holding the character s2[j-1]
+ j1 = db; // last earlier column in this row that matched s1[i-1]
+
+ if (s1[i - 1] == s2[j - 1]) {
+ cost = 0;
+ db = j;
+ } else {
+ cost = 1;
+ }
+
+ d1 = dist[(i * cols) + j] + cost; // diagonal neighbour: match or substitution
+ d2 = dist[((i + 1) * cols) + j] + 1; // left neighbour in the same row, plus one edit
+ d3 = dist[(i * cols) + j + 1] + 1; // neighbour in the row above, plus one edit
+ d4 = dist[(i1 * cols) + j1] + (i - i1 - 1) + 1 + (j - j1 - 1); // transposition candidate anchored at (i1, j1)
+
+ dist[((i+1)*cols) + j + 1] = MIN(MIN(d1, d2), MIN(d3, d4));
+ }
+
+ da_idx = (JFISH_UNICODE)s1[i-1];
+ if (da_idx >= len_da) { // same range check for the s1 character
+ free(dist);
+ free(da);
+ return -2;
+ }
+ da[da_idx] = i; // remember this row as the latest occurrence of s1[i-1]
+ }
+
+ result = dist[((len1+1) * cols) + len2 + 1]; // bottom-right cell is the answer
+
+ free(dist);
+ free(da);
+
+ return result; // NOTE(review): size_t is implicitly converted to the int return type
+}
diff --git a/cjellyfish/hamming.c b/cjellyfish/hamming.c
new file mode 100644
index 0000000..d48540e
--- /dev/null
+++ b/cjellyfish/hamming.c
@@ -0,0 +1,25 @@
+#include "jellyfish.h"
+#include <ctype.h>
+
+size_t hamming_distance(const Py_UNICODE *s1, int len1,
+ const Py_UNICODE *s2, int len2) { /* Counts positions where s1 and s2 differ; each surplus character of the longer string adds one. NOTE(review): jellyfish.h declares the parameters as JFISH_UNICODE, not Py_UNICODE - confirm the two are always the same type. */
+ unsigned distance = 0;
+ int i1 = 0;
+ int i2 = 0;
+
+ for (; i1 < len1 && i2 < len2; i1++, i2++, s1++, s2++) { /* walk the common prefix length of both strings */
+ if (*s1 != *s2) {
+ distance++;
+ }
+ }
+
+ for ( ; i1 < len1; i1++, s1++) { /* leftover characters of s1, if it is longer */
+ distance++;
+ }
+
+ for ( ; i2 < len2; i2++, s2++) { /* leftover characters of s2, if it is longer */
+ distance++;
+ }
+
+ return distance; /* unsigned counter widened to the size_t return type */
+}
diff --git a/cjellyfish/jaro.c b/cjellyfish/jaro.c
new file mode 100644
index 0000000..7f02928
--- /dev/null
+++ b/cjellyfish/jaro.c
@@ -0,0 +1,145 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "jellyfish.h"
+
+#define NOTNUM(c) ((c>57) || (c<48)) /* true when c is outside 48..57, i.e. not an ASCII digit '0'..'9'; c is evaluated twice */
+
+/* borrowed heavily from strcmp95.c
+ * http://www.census.gov/geo/msb/stand/strcmp.c
+ */
+double _jaro_winkler(const JFISH_UNICODE *ying, int ying_length,
+ const JFISH_UNICODE *yang, int yang_length,
+ int long_tolerance, int winklerize)
+{ // Shared worker for jaro_distance and jaro_winkler. Returns the weight, 0 for an empty input or no common characters, -100 on allocation failure.
+ /* Arguments:
+
+ ying
+ yang
+ pointers to the 2 strings to be compared.
+
+ long_tolerance
+ Increase the probability of a match when the number of matched
+ characters is large. This option allows for a little more
+ tolerance when the strings are large. It is not an appropriate
+ test when comparing fixed length fields such as phone and
+ social security numbers.
+ */
+ JFISH_UNICODE *ying_flag=0, *yang_flag=0; // per-character "already matched" markers
+
+ double weight;
+
+ long min_len;
+ long search_range;
+ long lowlim, hilim;
+ long trans_count, common_chars;
+
+ int i, j, k;
+
+ // ensure that neither string is blank
+ if (!ying_length || !yang_length) return 0;
+
+ search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length; // NOTE(review): both receive the LONGER length, so min_len is a misnomer - confirm this is intended
+
+ // Blank out the flags
+ ying_flag = calloc((ying_length + 1), sizeof(JFISH_UNICODE));
+ if (!ying_flag) {
+ return -100;
+ }
+
+ yang_flag = calloc((yang_length + 1), sizeof(JFISH_UNICODE));
+ if (!yang_flag) {
+ free(ying_flag);
+ return -100;
+ }
+
+ search_range = (search_range/2) - 1; // how far from position i a match in yang may lie
+ if (search_range < 0) search_range = 0;
+
+
+ // Looking only within the search range, count and flag the matched pairs.
+ common_chars = 0;
+ for (i = 0; i < ying_length; i++) {
+ lowlim = (i >= search_range) ? i - search_range : 0; // clamp window start to 0
+ hilim = (i + search_range <= yang_length-1) ? (i + search_range) : yang_length-1; // clamp window end to the last index of yang
+ for (j = lowlim; j <= hilim; j++) {
+ if (!yang_flag[j] && yang[j] == ying[i]) { // first unused equal character in the window wins
+ yang_flag[j] = 1;
+ ying_flag[i] = 1;
+ common_chars++;
+ break;
+ }
+ }
+ }
+
+ // If no characters in common - return
+ if (!common_chars) {
+ free(ying_flag);
+ free(yang_flag);
+ return 0;
+ }
+
+ // Count the number of transpositions
+ k = trans_count = 0;
+ for (i = 0; i < ying_length; i++) {
+ if (ying_flag[i]) {
+ for (j = k; j < yang_length; j++) { // find the next flagged character of yang, resuming at k
+ if (yang_flag[j]) {
+ k = j + 1;
+ break;
+ }
+ }
+ if (ying[i] != yang[j]) { // flagged characters taken in order differ: half a transposition
+ trans_count++;
+ }
+ }
+ }
+ trans_count /= 2;
+
+ // adjust for similarities in nonmatched characters (no code for this step is present here)
+
+ // Main weight computation.
+ weight= common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
+ + ((double) (common_chars - trans_count)) / ((double) common_chars);
+ weight /= 3.0;
+
+ // Continue to boost the weight if the strings are similar
+ if (winklerize && weight > 0.7 && ying_length > 3 && yang_length > 3) {
+
+ // Adjust for having up to the first 4 characters in common
+ j = (min_len >= 4) ? 4 : min_len;
+ for (i=0; ((i<j) && (ying[i] == yang[i]) && (NOTNUM(ying[i]))); i++); // i = length of the shared non-digit prefix, at most j
+ if (i) {
+ weight += i * 0.1 * (1.0 - weight);
+ }
+
+ /* Optionally adjust for long strings. */
+ /* After agreeing beginning chars, at least two more must agree and
+ the agreeing characters must be > .5 of remaining characters.
+ */
+ if ((long_tolerance) && (min_len>4) && (common_chars>i+1) && (2*common_chars>=min_len+i)) {
+ if (NOTNUM(ying[0])) {
+ weight += (double) (1.0-weight) *
+ ((double) (common_chars-i-1) / ((double) (ying_length+yang_length-i*2+2)));
+ }
+ }
+ }
+
+ free(ying_flag);
+ free(yang_flag);
+ return weight;
+}
+
+
+double jaro_winkler(const JFISH_UNICODE *ying, int ying_len,
+ const JFISH_UNICODE *yang, int yang_len,
+ int long_tolerance)
+{ // Public entry point: runs _jaro_winkler with winklerize = 1 and the caller's long_tolerance.
+ return _jaro_winkler(ying, ying_len, yang, yang_len, long_tolerance, 1);
+}
+
+double jaro_distance(const JFISH_UNICODE *ying, int ying_len, const JFISH_UNICODE *yang, int yang_len)
+{ // Public entry point: runs _jaro_winkler with long_tolerance = 0 and winklerize = 0.
+ return _jaro_winkler(ying, ying_len, yang, yang_len, 0, 0);
+}
diff --git a/cjellyfish/jellyfish.h b/cjellyfish/jellyfish.h
new file mode 100644
index 0000000..3d50d14
--- /dev/null
+++ b/cjellyfish/jellyfish.h
@@ -0,0 +1,40 @@
+#ifndef _JELLYFISH_H_
+#define _JELLYFISH_H_ /* include guard for the shared cjellyfish declarations */
+
+#include <stdlib.h>
+
+#if CJELLYFISH_PYTHON /* when set, take the character type from the Python headers */
+#include <Python.h>
+#define JFISH_UNICODE Py_UNICODE
+#endif /* NOTE(review): JFISH_UNICODE is not defined here when CJELLYFISH_PYTHON is unset - presumably supplied by the build; confirm */
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) /* evaluates one of its arguments twice */
+#endif
+
+double jaro_winkler(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2, int long_tolerance);
+double jaro_distance(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2);
+
+size_t hamming_distance(const JFISH_UNICODE *str1, int len1,
+ const JFISH_UNICODE *str2, int len2);
+
+int levenshtein_distance(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2);
+
+int damerau_levenshtein_distance(const JFISH_UNICODE *str1, const JFISH_UNICODE *str2,
+ size_t len1, size_t len2); /* note: both strings come first, then both lengths */
+
+char* soundex(const char *str);
+
+char* metaphone(const char *str);
+
+JFISH_UNICODE *nysiis(const JFISH_UNICODE *str, int len);
+
+JFISH_UNICODE* match_rating_codex(const JFISH_UNICODE *str, size_t len);
+int match_rating_comparison(const JFISH_UNICODE *str1, size_t len1, const JFISH_UNICODE *str2, size_t len2);
+
+struct stemmer; /* opaque here; only pointers to it are used in this header */
+extern struct stemmer * create_stemmer(void);
+extern void free_stemmer(struct stemmer * z);
+extern int stem(struct stemmer * z, JFISH_UNICODE * b, int k);
+
+#endif
diff --git a/cjellyfish/jellyfishmodule.c b/cjellyfish/jellyfishmodule.c
new file mode 100644
index 0000000..5c60429
--- /dev/null
+++ b/cjellyfish/jellyfishmodule.c
@@ -0,0 +1,441 @@
+#include <Python.h>
+#include <math.h>
+#include "jellyfish.h"
+
+struct jellyfish_state { /* per-module state reached through GETSTATE() */
+ PyObject *unicodedata_normalize; /* callable invoked by normalize(); presumably unicodedata.normalize, assigned outside this view */
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m)) /* Python 3: state lives in the module object */
+#else
+#define GETSTATE(m) (&_state) /* Python 2: single file-level state; m is ignored */
+static struct jellyfish_state _state;
+#endif
+
+#ifdef _MSC_VER
+#define INLINE __inline /* MSVC spelling of the inline keyword */
+#else
+#define INLINE inline
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+#define UTF8_BYTES(s) (PyBytes_AS_STRING(s)) /* raw char buffer of the bytes object produced by normalize() */
+#define NO_BYTES_ERR_STR "expected str, got bytes"
+#else
+#define UTF8_BYTES(s) (PyString_AS_STRING(s))
+#define NO_BYTES_ERR_STR "expected unicode, got str"
+#endif
+
+#define UNSUPPORTED_CODEPOINT "Encountered unsupported code point in string." /* message used when damerau_levenshtein_distance returns -2 */
+
+
+/* Returns a new reference to a PyString (python < 3) or
+ * PyBytes (python >= 3.0), or NULL on failure.
+ *
+ * pystr is handed to the module's stored unicodedata_normalize callable
+ * together with the form "NFKD"; the result is then encoded as UTF-8.
+ */
+static INLINE PyObject* normalize(PyObject *mod, const Py_UNICODE *pystr) {
+ PyObject *unicodedata_normalize;
+ PyObject *normalized;
+ PyObject *utf8;
+
+ unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize;
+ normalized = PyObject_CallFunction(unicodedata_normalize,
+ "su", "NFKD", pystr); // "s" = the C string "NFKD", "u" = the Py_UNICODE buffer (no length is passed)
+ if (!normalized) {
+ return NULL;
+ }
+ utf8 = PyUnicode_AsUTF8String(normalized);
+ Py_DECREF(normalized); // the intermediate unicode object is no longer needed
+ return utf8; // may itself be NULL if the UTF-8 encoding step failed
+}
+
+static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args, PyObject *kw)
+{ // Python wrapper: jaro_winkler(s1, s2, long_tolerance=0) -> float; raises TypeError on bad arguments, MemoryError on allocation failure.
+ const Py_UNICODE *s1, *s2;
+ int len1, len2;
+ double result;
+ int long_tolerance = 0;
+ static char *keywords[] = {"s1", "s2", "long_tolerance", NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, kw, "u#u#|i", keywords, &s1, &len1, &s2, &len2, &long_tolerance)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = jaro_winkler(s1, len1, s2, len2, long_tolerance);
+ // jaro returns a big negative number on error, don't use
+ // 0 here in case there's floating point inaccuracy
+ // .. used to use NaN but different compilers (*cough*MSVC*cough)
+ // handle it really poorly
+ if (result < -1) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ return Py_BuildValue("d", result);
+}
+
+static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args)
+{ // Python wrapper: jaro_distance(s1, s2) -> float; raises TypeError on bad arguments, MemoryError on allocation failure.
+ const Py_UNICODE *s1, *s2;
+ int len1, len2;
+ double result;
+
+ if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = jaro_distance(s1, len1, s2, len2);
+ // see earlier note about jaro_distance return value
+ if (result < -1) { // the C routine signals allocation failure with -100
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ return Py_BuildValue("d", result);
+}
+
+static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args)
+{ // Python wrapper: hamming_distance(s1, s2) -> int; raises TypeError on bad arguments.
+ const Py_UNICODE *s1, *s2;
+ int len1, len2;
+ unsigned result;
+
+ if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = hamming_distance(s1, len1, s2, len2); // declared size_t in jellyfish.h, stored here as unsigned
+
+ return Py_BuildValue("I", result);
+}
+
+static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
+{ // Python wrapper: levenshtein_distance(s1, s2) -> int; raises TypeError on bad arguments, MemoryError when the C routine returns -1.
+ const Py_UNICODE *s1, *s2;
+ int len1, len2;
+ int result;
+
+ if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = levenshtein_distance(s1, len1, s2, len2);
+ if (result == -1) {
+ // levenshtein_distance only returns failure code (-1) on
+ // failed malloc
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ return Py_BuildValue("i", result);
+}
+
+
+static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
+ PyObject *args)
+{ // Python wrapper: damerau_levenshtein_distance(s1, s2) -> int; maps the C return codes -1 / -2 to MemoryError / ValueError.
+ Py_UNICODE *s1, *s2;
+ int len1, len2;
+ int result;
+
+ if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = damerau_levenshtein_distance(s1, s2, len1, len2); // argument order is strings first, then lengths
+ if (result == -1) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ else if (result == -2) {
+ PyErr_SetString(PyExc_ValueError, UNSUPPORTED_CODEPOINT);
+ return NULL;
+ }
+
+ return Py_BuildValue("i", result);
+}
+
+static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
+{ // Python wrapper: soundex(string) -> str; the input is normalized to UTF-8 bytes via normalize() before encoding.
+ const Py_UNICODE *str;
+ int len;
+ PyObject *normalized;
+ PyObject* ret;
+ char *result;
+
+ if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ normalized = normalize(self, str); // len is parsed but not passed on
+ if (!normalized) {
+ return NULL;
+ }
+
+ result = soundex(UTF8_BYTES(normalized));
+ Py_DECREF(normalized); // bytes object is released once the C routine has returned
+
+ if (!result) {
+ // soundex only fails on bad malloc
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ ret = Py_BuildValue("s", result);
+ free(result); // the C buffer is freed here after Py_BuildValue has consumed it
+
+ return ret;
+}
+
+static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
+{ // Python wrapper: metaphone(string) -> str; the input is normalized to UTF-8 bytes via normalize() before encoding.
+ const Py_UNICODE *str;
+ int len;
+ PyObject *normalized;
+ PyObject *ret;
+ char *result;
+
+ if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ normalized = normalize(self, str); // len is parsed but not passed on
+ if (!normalized) {
+ return NULL;
+ }
+
+ result = metaphone((const char*)UTF8_BYTES(normalized));
+ Py_DECREF(normalized); // bytes object is released once the C routine has returned
+
+ if (!result) {
+ // metaphone only fails on bad malloc
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ ret = Py_BuildValue("s", result);
+ free(result); // the C buffer is freed here after Py_BuildValue has consumed it
+
+ return ret;
+}
+
+static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
+{ // Python wrapper: match_rating_codex(string) -> unicode; a NULL result from the C routine is reported as MemoryError.
+ const Py_UNICODE *str;
+ int len;
+ Py_UNICODE *result;
+ PyObject *ret;
+
+ if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = match_rating_codex(str, len);
+ if (!result) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ ret = Py_BuildValue("u", result);
+ free(result); // the C buffer is freed here after Py_BuildValue has consumed it
+
+ return ret;
+}
+
+static PyObject* jellyfish_match_rating_comparison(PyObject *self,
+ PyObject *args)
+{ // Python wrapper: match_rating_comparison(s1, s2) -> True, False, or None (None when the C routine returns -1).
+ const Py_UNICODE *str1, *str2;
+ int len1, len2;
+ int result;
+
+ if (!PyArg_ParseTuple(args, "u#u#", &str1, &len1, &str2, &len2)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = match_rating_comparison(str1, len1, str2, len2);
+
+ if (result == -1) { // -1 is surfaced as None rather than as an exception; its meaning is defined by the C routine, not shown here
+ Py_RETURN_NONE;
+ } else if (result) {
+ Py_RETURN_TRUE;
+ } else {
+ Py_RETURN_FALSE;
+ }
+}
+
+static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
+{ // Python wrapper: nysiis(string) -> unicode; a NULL result from the C routine is reported as MemoryError.
+ const Py_UNICODE *str;
+ Py_UNICODE *result;
+ int len;
+ PyObject *ret;
+
+ if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ result = nysiis(str, len);
+ if (!result) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ ret = Py_BuildValue("u", result);
+ free(result); // the C buffer is freed here after Py_BuildValue has consumed it
+
+ return ret;
+}
+
+static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
+{ // Python wrapper: porter_stem(string) -> unicode; stems a private copy of the input in place and returns the truncated copy.
+ const Py_UNICODE *str;
+ int len;
+ Py_UNICODE *result;
+ PyObject *ret;
+ struct stemmer *z;
+ int end;
+
+ if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+ PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR); // replaces whatever error the parser set with the fixed TypeError text
+ return NULL;
+ }
+
+ z = create_stemmer();
+ if (!z) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ result = malloc((len+1) * sizeof(Py_UNICODE)); // one extra slot for the terminator written below
+ if (!result) {
+ free_stemmer(z);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ memcpy(result, str, len * sizeof(Py_UNICODE)); // work on a copy; the parsed input buffer is const
+
+ end = stem(z, result, len - 1); // NOTE(review): for an empty string this passes -1; behaviour depends on stem(), not shown here
+ result[end + 1] = '\0'; // terminate just past the index stem() returned
+
+ ret = Py_BuildValue("u", result);
+
+ free(result);
+ free_stemmer(z);
+
+ return ret;
+}
+
+static PyMethodDef jellyfish_methods[] = { /* Method table: Python-visible name, C wrapper, calling convention, docstring; ends with an all-NULL sentinel. */
+ {"jaro_winkler", (PyCFunction)jellyfish_jaro_winkler, METH_VARARGS|METH_KEYWORDS, /* cast needed: this wrapper also takes a keyword dict */
+ "jaro_winkler(string1, string2, long_tolerance)\n\n"
+ "Do a Jaro-Winkler string comparison between string1 and string2."},
+
+ {"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
+ "jaro_distance(string1, string2)\n\n"
+ "Get a Jaro string distance metric for string1 and string2."},
+
+ {"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
+ "hamming_distance(string1, string2)\n\n"
+ "Compute the Hamming distance between string1 and string2."},
+
+ {"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
+ "levenshtein_distance(string1, string2)\n\n"
+ "Compute the Levenshtein distance between string1 and string2."},
+
+ {"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
+ METH_VARARGS,
+ "damerau_levenshtein_distance(string1, string2)\n\n"
+ "Compute the Damerau-Levenshtein distance between string1 and string2."},
+
+ {"soundex", jellyfish_soundex, METH_VARARGS,
+ "soundex(string)\n\n"
+ "Calculate the soundex code for a given name."},
+
+ {"metaphone", jellyfish_metaphone, METH_VARARGS,
+ "metaphone(string)\n\n"
+ "Calculate the metaphone representation of a given string."},
+
+ {"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
+ "match_rating_codex(string)\n\n"
+ "Calculate the Match Rating Approach representation of a given string."},
+
+ {"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
+ "match_rating_comparison(string, string)\n\n"
+ "Compute the Match Rating Approach similarity between string1 and " /* trailing space keeps the concatenated literals from reading "andstring2." */
+ "string2."},
+
+ {"nysiis", jellyfish_nysiis, METH_VARARGS,
+ "nysiis(string)\n\n"
+ "Compute the NYSIIS (New York State Identification and Intelligence\n"
+ "System) code for a string."},
+
+ {"porter_stem", jellyfish_porter_stem, METH_VARARGS,
+ "porter_stem(string)\n\n"
+ "Return the result of running the Porter stemming algorithm on "
+ "a single-word string."},
+
+ {NULL, NULL, 0, NULL} /* sentinel entry marking the end of the table */
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define INITERROR return NULL
+
+static struct PyModuleDef moduledef = { /* Python 3 module definition (this sits inside the PY_MAJOR_VERSION >= 3 branch) */
+ PyModuleDef_HEAD_INIT,
+ "jellyfish.cjellyfish", /* module name */
+ NULL, /* no module docstring */
+ sizeof(struct jellyfish_state), /* size of the per-module state that GETSTATE() returns */
+ jellyfish_methods, /* method table */
+ NULL, /* remaining optional members are left unset */
+ NULL,
+ NULL,
+ NULL
+};
+
+PyObject* PyInit_cjellyfish(void)
+#else
+
+#define INITERROR return
+
+PyMODINIT_FUNC initcjellyfish(void)
+#endif
+{
+ PyObject *unicodedata;
+
+#if PY_MAJOR_VERSION >= 3
... 26814 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-jellyfish.git
More information about the Python-modules-commits
mailing list