[Python-modules-commits] [python-nameparser] 01/02: Import python-nameparser_0.5.1.orig.tar.gz
Edward Betts
edward at moszumanska.debian.org
Wed Aug 31 10:41:23 UTC 2016
This is an automated email from the git hooks/post-receive script.
edward pushed a commit to branch master
in repository python-nameparser.
commit 4b6a3a2de78752f9e3917830e14d2c9b34c4b0fb
Author: Edward Betts <edward at 4angle.com>
Date: Wed Aug 31 10:07:21 2016 +0100
Import python-nameparser_0.5.1.orig.tar.gz
---
PKG-INFO | 2 +-
nameparser.egg-info/PKG-INFO | 2 +-
nameparser/__init__.py | 2 +-
nameparser/config/titles.py | 1 -
nameparser/parser.py | 172 ++++++++++++++++++++++++++-----------------
tests.py | 35 ++++++++-
6 files changed, 142 insertions(+), 72 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index 87a9820..6b0218e 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nameparser
-Version: 0.4.0
+Version: 0.5.1
Summary: A simple Python module for parsing human names into their individual components.
Home-page: https://github.com/derek73/python-nameparser
Author: Derek Gulbranson
diff --git a/nameparser.egg-info/PKG-INFO b/nameparser.egg-info/PKG-INFO
index 87a9820..6b0218e 100644
--- a/nameparser.egg-info/PKG-INFO
+++ b/nameparser.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nameparser
-Version: 0.4.0
+Version: 0.5.1
Summary: A simple Python module for parsing human names into their individual components.
Home-page: https://github.com/derek73/python-nameparser
Author: Derek Gulbranson
diff --git a/nameparser/__init__.py b/nameparser/__init__.py
index 80ae1e7..71c4f7a 100644
--- a/nameparser/__init__.py
+++ b/nameparser/__init__.py
@@ -1,4 +1,4 @@
-VERSION = (0, 4, 0)
+VERSION = (0, 5, 1)
__version__ = '.'.join(map(str, VERSION))
__author__ = "Derek Gulbranson"
__author_email__ = 'derek73 at gmail.com'
diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py
index 0fb78e6..28c6d01 100644
--- a/nameparser/config/titles.py
+++ b/nameparser/config/titles.py
@@ -93,7 +93,6 @@ TITLES = FIRST_NAME_TITLES | set([
'bench',
'bg',
'bgen',
- 'bishop',
'blessed',
'bodhisattva',
'brigadier',
diff --git a/nameparser/parser.py b/nameparser/parser.py
index fc2c173..d697a69 100644
--- a/nameparser/parser.py
+++ b/nameparser/parser.py
@@ -2,6 +2,9 @@
from __future__ import unicode_literals
import sys
+from operator import itemgetter
+from itertools import groupby
+
from nameparser.util import u
from nameparser.util import text_types, binary_type
from nameparser.util import lc
@@ -9,9 +12,19 @@ from nameparser.util import log
from nameparser.config import CONSTANTS
from nameparser.config import Constants
-
ENCODING = 'utf-8'
+def group_contiguous_integers(data):
+ """
+ return list of tuples containing first and last index
+ position of contiguous numbers in a series
+ """
+ ranges = []
+ for key, group in groupby(enumerate(data), lambda i: i[0] - i[1]):
+ group = list(map(itemgetter(1), group))
+ if len(group) > 1:
+ ranges.append((group[0], group[-1]))
+ return ranges
class HumanName(object):
"""
@@ -60,12 +73,9 @@ class HumanName(object):
def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING,
string_format=None):
- global CONSTANTS
self.C = constants
- if not self.C:
+ if type(self.C) is not type(CONSTANTS):
self.C = Constants()
- if self.C is not CONSTANTS:
- self.has_own_config = True
self.ENCODING = encoding
self.string_format = string_format or self.C.string_format
@@ -170,7 +180,11 @@ class HumanName(object):
if val:
d[m] = val
return d
-
+
+ @property
+ def has_own_config(self):
+ return self.C is not CONSTANTS
+
### attributes
@property
@@ -273,8 +287,8 @@ class HumanName(object):
return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)
def is_prefix(self, piece):
- """Is in the prefixes set and not :py:func:`is_an_initial()`."""
- return piece.lower() in self.C.prefixes and not self.is_an_initial(piece)
+ """Lowercase and no periods version of piece is in the `~nameparser.config.titles.PREFIXES` set."""
+ return lc(piece) in self.C.prefixes
def is_roman_numeral(self, value):
"""
@@ -551,8 +565,8 @@ class HumanName(object):
if self.C.regexes.period_not_at_end.match(part):
# split on periods, any of the split pieces titles or suffixes? ("Lt.Gov.")
period_chunks = part.split(".")
- titles = filter(self.is_title, period_chunks)
- suffixes = filter(self.is_suffix, period_chunks)
+ titles = list(filter(self.is_title, period_chunks))
+ suffixes = list(filter(self.is_suffix, period_chunks))
# add the part to the constant so it will be found
if len(list(titles)):
@@ -566,8 +580,18 @@ class HumanName(object):
def join_on_conjunctions(self, pieces, additional_parts_count=0):
"""
- Join conjunctions to surrounding pieces, e.g.:
- ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia']
+ Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:
+
+ ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
+ ['Mr. and Mrs.', 'John', 'Doe']
+
+ ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
+ ['The Secretary of State', 'Hillary', 'Clinton']
+
+ When joining titles, saves newly formed piece to the instance's titles
+ constant so they will be parsed correctly later. E.g. after parsing the
+ example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
+ be present in the titles constant set.
:param list pieces: name pieces strings after split on spaces
:param int additional_parts_count:
@@ -579,70 +603,84 @@ class HumanName(object):
# don't join on conjuctions if there's only 2 parts
if length < 3:
return pieces
-
- for conj in filter(self.is_conjunction, pieces[::-1]): # reverse sorted list
-
- # loop through the pieces backwards, starting at the end of the list.
- # Join conjunctions to the pieces on either side of them.
-
- rootname_pieces = [p for p in pieces if self.is_rootname(p)]
- total_length= len(rootname_pieces) + additional_parts_count
- if len(conj) == 1 and total_length < 4:
- # if there are only 3 total parts (minus known titles, suffixes and prefixes)
- # and this conjunction is a single letter, prefer treating it as an initial
- # rather than a conjunction.
- # http://code.google.com/p/python-nameparser/issues/detail?id=11
- continue
+ rootname_pieces = [p for p in pieces if self.is_rootname(p)]
+ total_length= len(rootname_pieces) + additional_parts_count
+
+ # find all the conjunctions, join any conjunctions that are next to each
+ # other, then join those newly joined conjunctions and any single
+ # conjunctions to the piece before and after it
+ conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
+
+ contiguous_conj_i = []
+ for i, val in enumerate(conj_index):
try:
- i = pieces.index((conj))
- except ValueError:
- log.error("Couldn't find '{conj}' in pieces. i={i}, pieces={pieces}".format(**locals()))
+ if conj_index[i+1] == val+1:
+ contiguous_conj_i += [val]
+ except IndexError:
+ pass
+
+ contiguous_conj_i = group_contiguous_integers(conj_index)
+
+ delete_i = []
+ for i in contiguous_conj_i:
+ if type(i) == tuple:
+ new_piece = " ".join(pieces[ i[0] : i[1]+1] )
+ delete_i += list(range( i[0]+1, i[1]+1 ))
+ pieces[i[0]] = new_piece
+ else:
+ new_piece = " ".join(pieces[ i : i+2 ])
+ delete_i += [i+1]
+ pieces[i] = new_piece
+ #add newly joined conjunctions to constants to be found later
+ self.C.conjunctions.add(new_piece)
+
+ for i in reversed(delete_i):
+ # delete pieces in reverse order or the index changes on each delete
+ del pieces[i]
+
+ # refresh conjunction index locations
+ conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
+
+ for i in conj_index:
+ if len(pieces[i]) == 1 and total_length < 4:
+ # if there are only 3 total parts (minus known titles, suffixes
+ # and prefixes) and this conjunction is a single letter, prefer
+ # treating it as an initial rather than a conjunction.
+ # http://code.google.com/p/python-nameparser/issues/detail?id=11
continue
- if i < len(pieces) - 1:
- # if this is not the last piece
-
- if i is 0:
- # if this is the first piece and it's a conjunction
- nxt = pieces[i+1]
- const = self.C.conjunctions
- if self.is_title(nxt):
- const = self.C.titles
- new_piece = ' '.join(pieces[0:2])
- const.add(new_piece)
- pieces[i] = new_piece
- pieces.pop(i+1)
- continue
-
- if self.is_conjunction(pieces[i-1]):
-
- # if the piece in front of this one is a conjunction too,
- # add new_piece (this conjuction and the following piece)
- # to the conjuctions constant so that it is recognized
- # as a conjunction in the next loop.
- # e.g. for ["Lord","of","the Universe"], put "the Universe"
- # into the conjunctions constant.
-
- new_piece = ' '.join(pieces[i:i+2])
- self.C.conjunctions.add(new_piece)
- pieces[i] = new_piece
- pieces.pop(i+1)
- continue
+ if i is 0:
+ new_piece = " ".join(pieces[i:i+2])
+ if self.is_title(pieces[i+1]):
+ # when joining to a title, make new_piece a title too
+ self.C.titles.add(new_piece)
+ pieces[i] = new_piece
+ pieces.pop(i+1)
+ # subtract 1 from the index of all the remaining conjunctions
+ for j,val in enumerate(conj_index):
+ if val > i:
+ conj_index[j]=val-1
- new_piece = ' '.join(pieces[i-1:i+2])
+ else:
+ new_piece = " ".join(pieces[i-1:i+2])
if self.is_title(pieces[i-1]):
-
- # if the second name is a title, assume the first one is too and add the
- # two titles with the conjunction between them to the titles constant
- # so the combo we just created gets parsed as a title.
- # e.g. "Mr. and Mrs." becomes a title.
-
+ # when joining to a title, make new_piece a title too
self.C.titles.add(new_piece)
-
pieces[i-1] = new_piece
pieces.pop(i)
- pieces.pop(i)
+ rm_count = 2
+ try:
+ pieces.pop(i)
+ except IndexError:
+ rm_count = 1
+ pass
+ # subtract the number of removed pieces from the index
+ # of all the remaining conjunctions
+ for j,val in enumerate(conj_index):
+ if val > i:
+ conj_index[j] = val - rm_count
+
# join prefixes to following lastnames: ['de la Vega'], ['van Buren']
prefixes = list(filter(self.is_prefix, pieces))
diff --git a/tests.py b/tests.py
index 579bf13..32e0b74 100644
--- a/tests.py
+++ b/tests.py
@@ -1061,11 +1061,39 @@ class HumanNameBruteForceTests(HumanNameTestBase):
class HumanNameConjunctionTestCase(HumanNameTestBase):
# Last name with conjunction
- def test117(self):
+ def test_last_name_with_conjunction(self):
hn = HumanName('Jose Aznar y Lopez')
self.m(hn.first, "Jose", hn)
self.m(hn.last, "Aznar y Lopez", hn)
+ def test_multiple_conjunctions(self):
+ hn = HumanName("part1 of The part2 of the part3 and part4")
+ self.m(hn.first, "part1 of The part2 of the part3 and part4", hn)
+
+ def test_multiple_conjunctions2(self):
+ hn = HumanName("part1 of and The part2 of the part3 And part4")
+ self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn)
+
+ def test_ends_with_conjunction(self):
+ hn = HumanName("Jon Dough and")
+ self.m(hn.first, "Jon", hn)
+ self.m(hn.last, "Dough and", hn)
+
+ def test_ends_with_two_conjunctions(self):
+ hn = HumanName("Jon Dough and of")
+ self.m(hn.first, "Jon", hn)
+ self.m(hn.last, "Dough and of", hn)
+
+ def test_starts_with_conjunction(self):
+ hn = HumanName("and Jon Dough")
+ self.m(hn.first, "and Jon", hn)
+ self.m(hn.last, "Dough", hn)
+
+ def test_starts_with_two_conjunctions(self):
+ hn = HumanName("the and Jon Dough")
+ self.m(hn.first, "the and Jon", hn)
+ self.m(hn.last, "Dough", hn)
+
# Potential conjunction/prefix treated as initial (because uppercase)
def test_uppercase_middle_initial_conflict_with_conjunction(self):
hn = HumanName('John E Smith')
@@ -1378,6 +1406,11 @@ class PrefixesTestCase(HumanNameTestBase):
self.m(hn.first, "Juan", hn)
self.m(hn.last, "del Sur", hn)
+ def test_prefix_with_period(self):
+ hn = HumanName("Jill St. John")
+ self.m(hn.first, "Jill", hn)
+ self.m(hn.last, "St. John", hn)
+
def test_prefix_before_two_part_last_name(self):
hn = HumanName("pennie von bergen wessels")
self.m(hn.first, "pennie", hn)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-nameparser.git
More information about the Python-modules-commits
mailing list