[Python-modules-commits] [python-nameparser] 01/02: Import python-nameparser_0.5.1.orig.tar.gz

Wed Aug 31 10:41:23 UTC 2016

This is an automated email from the git hooks/post-receive script.

edward pushed a commit to branch master
in repository python-nameparser.

commit 4b6a3a2de78752f9e3917830e14d2c9b34c4b0fb
Author: Edward Betts <edward at 4angle.com>
Date:   Wed Aug 31 10:07:21 2016 +0100

    Import python-nameparser_0.5.1.orig.tar.gz
---
 PKG-INFO                     |   2 +-
 nameparser.egg-info/PKG-INFO |   2 +-
 nameparser/__init__.py       |   2 +-
 nameparser/config/titles.py  |   1 -
 nameparser/parser.py         | 172 ++++++++++++++++++++++++++-----------------
 tests.py                     |  35 ++++++++-
 6 files changed, 142 insertions(+), 72 deletions(-)

diff --git a/PKG-INFO b/PKG-INFO
index 87a9820..6b0218e 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: nameparser
-Version: 0.4.0
+Version: 0.5.1
 Summary: A simple Python module for parsing human names into their individual components.
 Home-page: https://github.com/derek73/python-nameparser
 Author: Derek Gulbranson
diff --git a/nameparser.egg-info/PKG-INFO b/nameparser.egg-info/PKG-INFO
index 87a9820..6b0218e 100644
--- a/nameparser.egg-info/PKG-INFO
+++ b/nameparser.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: nameparser
-Version: 0.4.0
+Version: 0.5.1
 Summary: A simple Python module for parsing human names into their individual components.
 Home-page: https://github.com/derek73/python-nameparser
 Author: Derek Gulbranson
diff --git a/nameparser/__init__.py b/nameparser/__init__.py
index 80ae1e7..71c4f7a 100644
--- a/nameparser/__init__.py
+++ b/nameparser/__init__.py
@@ -1,4 +1,4 @@
-VERSION = (0, 4, 0)
+VERSION = (0, 5, 1)
 __version__ = '.'.join(map(str, VERSION))
 __author__ = "Derek Gulbranson"
 __author_email__ = 'derek73 at gmail.com'
diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py
index 0fb78e6..28c6d01 100644
--- a/nameparser/config/titles.py
+++ b/nameparser/config/titles.py
@@ -93,7 +93,6 @@ TITLES = FIRST_NAME_TITLES | set([
     'bench',
     'bg',
     'bgen',
-    'bishop',
     'blessed',
     'bodhisattva',
     'brigadier',
diff --git a/nameparser/parser.py b/nameparser/parser.py
index fc2c173..d697a69 100644
--- a/nameparser/parser.py
+++ b/nameparser/parser.py
@@ -2,6 +2,9 @@
 from __future__ import unicode_literals
 
 import sys
+from operator import itemgetter
+from itertools import groupby
+
 from nameparser.util import u
 from nameparser.util import text_types, binary_type
 from nameparser.util import lc
@@ -9,9 +12,19 @@ from nameparser.util import log
 from nameparser.config import CONSTANTS
 from nameparser.config import Constants
 
-
 ENCODING = 'utf-8'
 
+def group_contiguous_integers(data):
+    """
+    Return a list of (first, last) tuples, one for each run of two or
+    more consecutive integers found in data.
+    """
+    ranges = []
+    for key, group in groupby(enumerate(data), lambda i: i[0] - i[1]):
+        group = list(map(itemgetter(1), group))
+        if len(group) > 1:
+            ranges.append((group[0], group[-1]))
+    return ranges
 
 class HumanName(object):
     """
@@ -60,12 +73,9 @@ class HumanName(object):
     
     def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING, 
                 string_format=None):
-        global CONSTANTS
         self.C = constants
-        if not self.C:
+        if type(self.C) is not type(CONSTANTS):
             self.C = Constants()
-        if self.C is not CONSTANTS:
-            self.has_own_config = True
         
         self.ENCODING = encoding
         self.string_format = string_format or self.C.string_format
@@ -170,7 +180,11 @@ class HumanName(object):
                 if val:
                     d[m] = val
         return d
-        
+    
+    @property
+    def has_own_config(self):
+        return self.C is not CONSTANTS
+    
     ### attributes
     
     @property
@@ -273,8 +287,8 @@ class HumanName(object):
         return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)
     
     def is_prefix(self, piece):
-        """Is in the prefixes set and not :py:func:`is_an_initial()`."""
-        return piece.lower() in self.C.prefixes and not self.is_an_initial(piece)
+        """Lowercase and no periods version of piece is in the `~nameparser.config.titles.PREFIXES` set."""
+        return lc(piece) in self.C.prefixes
 
     def is_roman_numeral(self, value):
         """
@@ -551,8 +565,8 @@ class HumanName(object):
             if self.C.regexes.period_not_at_end.match(part):
                 # split on periods, any of the split pieces titles or suffixes? ("Lt.Gov.")
                 period_chunks = part.split(".")
-                titles   = filter(self.is_title,  period_chunks)
-                suffixes = filter(self.is_suffix, period_chunks)
+                titles   = list(filter(self.is_title,  period_chunks))
+                suffixes = list(filter(self.is_suffix, period_chunks))
                 
                 # add the part to the constant so it will be found
                 if len(list(titles)):
@@ -566,8 +580,18 @@ class HumanName(object):
         
     def join_on_conjunctions(self, pieces, additional_parts_count=0):
         """
-        Join conjunctions to surrounding pieces, e.g.:
-        ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia']
+        Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:
+            
+            ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
+                            ['Mr. and Mrs.', 'John', 'Doe']
+            
+            ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
+                            ['The Secretary of State', 'Hillary', 'Clinton']
+        
+        When joining titles, saves newly formed piece to the instance's titles
+        constant so they will be parsed correctly later. E.g. after parsing the
+        example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
+        be present in the titles constant set.
         
         :param list pieces: name pieces strings after split on spaces
         :param int additional_parts_count: 
@@ -579,70 +603,84 @@ class HumanName(object):
         # don't join on conjuctions if there's only 2 parts
         if length < 3:
             return pieces
-        
-        for conj in filter(self.is_conjunction, pieces[::-1]): # reverse sorted list
-            
-            # loop through the pieces backwards, starting at the end of the list.
-            # Join conjunctions to the pieces on either side of them.
-            
-            rootname_pieces = [p for p in pieces if self.is_rootname(p)]
-            total_length= len(rootname_pieces) + additional_parts_count
-            if len(conj) == 1 and total_length < 4:
-                # if there are only 3 total parts (minus known titles, suffixes and prefixes) 
-                # and this conjunction is a single letter, prefer treating it as an initial
-                # rather than a conjunction.
-                # http://code.google.com/p/python-nameparser/issues/detail?id=11
-                continue
             
+        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
+        total_length= len(rootname_pieces) + additional_parts_count
+        
+        # find all the conjunctions, join any conjunctions that are next to each
+        # other, then join those newly joined conjunctions and any single
+        # conjunctions to the piece before and after it
+        conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
+        
+        contiguous_conj_i = []
+        for i, val in enumerate(conj_index):
             try:
-                i = pieces.index((conj))
-            except ValueError:
-                log.error("Couldn't find '{conj}' in pieces. i={i}, pieces={pieces}".format(**locals()))
+                if conj_index[i+1] == val+1:
+                     contiguous_conj_i += [val]
+            except IndexError:
+                pass
+        
+        contiguous_conj_i = group_contiguous_integers(conj_index)
+        
+        delete_i = [] 
+        for i in contiguous_conj_i:
+            if type(i) == tuple:
+                new_piece = " ".join(pieces[ i[0] : i[1]+1] )
+                delete_i += list(range( i[0]+1, i[1]+1 ))
+                pieces[i[0]] = new_piece
+            else:
+                new_piece = " ".join(pieces[ i : i+2 ])
+                delete_i += [i+1]
+                pieces[i] = new_piece
+            #add newly joined conjunctions to constants to be found later
+            self.C.conjunctions.add(new_piece)
+        
+        for i in reversed(delete_i):
+            # delete pieces in reverse order or the index changes on each delete
+            del pieces[i]
+        
+        # refresh conjunction index locations
+        conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
+        
+        for i in conj_index:
+            if len(pieces[i]) == 1 and total_length < 4:
+                # if there are only 3 total parts (minus known titles, suffixes
+                # and prefixes) and this conjunction is a single letter, prefer
+                # treating it as an initial rather than a conjunction.
+                # http://code.google.com/p/python-nameparser/issues/detail?id=11
                 continue
             
-            if i < len(pieces) - 1: 
-                # if this is not the last piece
-                
-                if i is 0:
-                    # if this is the first piece and it's a conjunction
-                    nxt = pieces[i+1]
-                    const = self.C.conjunctions
-                    if self.is_title(nxt):
-                        const = self.C.titles
-                    new_piece = ' '.join(pieces[0:2])
-                    const.add(new_piece)
-                    pieces[i] = new_piece
-                    pieces.pop(i+1)
-                    continue
-                
-                if self.is_conjunction(pieces[i-1]):
-                    
-                    # if the piece in front of this one is a conjunction too,
-                    # add new_piece (this conjuction and the following piece) 
-                    # to the conjuctions constant so that it is recognized
-                    # as a conjunction in the next loop. 
-                    # e.g. for ["Lord","of","the Universe"], put "the Universe"
-                    # into the conjunctions constant.
-                    
-                    new_piece = ' '.join(pieces[i:i+2])
-                    self.C.conjunctions.add(new_piece)
-                    pieces[i] = new_piece
-                    pieces.pop(i+1)
-                    continue
+            if i is 0:
+                new_piece = " ".join(pieces[i:i+2])
+                if self.is_title(pieces[i+1]):
+                    # when joining to a title, make new_piece a title too
+                    self.C.titles.add(new_piece)
+                pieces[i] = new_piece
+                pieces.pop(i+1)
+                # subtract 1 from the index of all the remaining conjunctions
+                for j,val in enumerate(conj_index):
+                    if val > i:
+                        conj_index[j]=val-1
                 
-                new_piece = ' '.join(pieces[i-1:i+2])
+            else:    
+                new_piece = " ".join(pieces[i-1:i+2])
                 if self.is_title(pieces[i-1]):
-                    
-                    # if the second name is a title, assume the first one is too and add the 
-                    # two titles with the conjunction between them to the titles constant 
-                    # so the combo we just created gets parsed as a title. 
-                    # e.g. "Mr. and Mrs." becomes a title.
-                    
+                    # when joining to a title, make new_piece a title too
                     self.C.titles.add(new_piece)
-                
                 pieces[i-1] = new_piece
                 pieces.pop(i)
-                pieces.pop(i)
+                rm_count = 2
+                try:
+                    pieces.pop(i)
+                except IndexError:
+                    rm_count = 1
+                    pass
+                # subtract the number of removed pieces from the index
+                # of all the remaining conjunctions
+                for j,val in enumerate(conj_index):
+                    if val > i:
+                        conj_index[j] = val - rm_count
+        
         
         # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
         prefixes = list(filter(self.is_prefix, pieces))
diff --git a/tests.py b/tests.py
index 579bf13..32e0b74 100644
--- a/tests.py
+++ b/tests.py
@@ -1061,11 +1061,39 @@ class HumanNameBruteForceTests(HumanNameTestBase):
 
 class HumanNameConjunctionTestCase(HumanNameTestBase):
     # Last name with conjunction
-    def test117(self):
+    def test_last_name_with_conjunction(self):
         hn = HumanName('Jose Aznar y Lopez')
         self.m(hn.first, "Jose", hn)
         self.m(hn.last, "Aznar y Lopez", hn)
 
+    def test_multiple_conjunctions(self):
+        hn = HumanName("part1 of The part2 of the part3 and part4")
+        self.m(hn.first, "part1 of The part2 of the part3 and part4", hn)
+
+    def test_multiple_conjunctions2(self):
+        hn = HumanName("part1 of and The part2 of the part3 And part4")
+        self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn)
+    
+    def test_ends_with_conjunction(self):
+        hn = HumanName("Jon Dough and")
+        self.m(hn.first, "Jon", hn)
+        self.m(hn.last, "Dough and", hn)
+
+    def test_ends_with_two_conjunctions(self):
+        hn = HumanName("Jon Dough and of")
+        self.m(hn.first, "Jon", hn)
+        self.m(hn.last, "Dough and of", hn)
+
+    def test_starts_with_conjunction(self):
+        hn = HumanName("and Jon Dough")
+        self.m(hn.first, "and Jon", hn)
+        self.m(hn.last, "Dough", hn)
+
+    def test_starts_with_two_conjunctions(self):
+        hn = HumanName("the and Jon Dough")
+        self.m(hn.first, "the and Jon", hn)
+        self.m(hn.last, "Dough", hn)
+
     # Potential conjunction/prefix treated as initial (because uppercase)
     def test_uppercase_middle_initial_conflict_with_conjunction(self):
         hn = HumanName('John E Smith')
@@ -1378,6 +1406,11 @@ class PrefixesTestCase(HumanNameTestBase):
         self.m(hn.first, "Juan", hn)
         self.m(hn.last, "del Sur", hn)
     
+    def test_prefix_with_period(self):
+        hn = HumanName("Jill St. John")
+        self.m(hn.first, "Jill", hn)
+        self.m(hn.last, "St. John", hn)
+    
     def test_prefix_before_two_part_last_name(self):
         hn = HumanName("pennie von bergen wessels")
         self.m(hn.first, "pennie", hn)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-nameparser.git