From 8d6ef523d98b736b1229dda91bcd95e6c499ca7b Mon Sep 17 00:00:00 2001 From: Matt VanEseltine Date: Thu, 20 Jun 2019 16:26:16 -0400 Subject: [PATCH] Prevent false nicknames due to multiple quotes Certain Anglicized names, such as those from some Hawaiian, Samoan, and Kenyan traditions, include multiple single quotation marks. This adjusts the quoted_word regex to only capture single quote marks that are not inside words. Without this fix, false nicknames are extracted from inside names like Ng'ang'a and Kawai'ae'a. Tests are included to cover these names; the existing Benjamin 'Ben' Franklin test ensures that the typical nickname case is unchanged. --- nameparser/config/regexes.py | 2 +- nameparser/parser.py | 6 +++--- tests.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index beac95f..bd4b320 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -23,7 +23,7 @@ ("word", re.compile(r"(\w|\.)+", re.U)), ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("quoted_word", re.compile(r'\'([^\s]*?)\'', re.U)), + ("quoted_word", re.compile(r'(?