Last active
July 9, 2017 08:53
-
-
Save ettorerizza/67ece768253c56eca7f613389b4f544a to your computer and use it in GitHub Desktop.
Naive Jython method to detect potential person names in OpenRefine, based on a list of first names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from unidecode import unidecode

# Naive person-name detector: find tokens of `value` that appear in a list of
# known first names, then collect the neighbouring token(s) as the probable
# family name, including particles such as "van" or "de".
#
# NOTE(review): the nesting of the chained `if` statements below was
# reconstructed from a whitespace-stripped copy of this script; confirm it
# against the author's original indentation.

# Reference first names, one per line, normalised to lower case.
# Blank lines are skipped so that an empty token can never count as a first
# name; a set gives constant-time membership tests in the loop below.
with open(r"C:\Users\Boulot\Desktop\prenoms.txt", 'r') as f:
    prenoms = set(name.strip().lower() for name in f if name.strip())

# Only these characters are kept from the input; everything else is dropped.
CHARS = "abcdefghijklmnopqrstuvwxyzéèàçüûùABCDEFGHIJKLMNOPQRSTUVWXYZ- "

# Particles that may sit between a first name and the family name.
# NOTE(review): "d'" can never match a token because CHARS contains no
# apostrophe, so apostrophes are removed before tokenising.
family_joint = ["d'", "de", "du", "der", "den", "vander", "vanden", "van", "le"]

# TEST: sample input.
value = "mexico pierre françois van pip test"

# Keep the allowed characters, lower-case and transliterate each one, then
# tokenise. split() without an argument collapses runs of spaces (left behind
# when punctuation is dropped), so no empty tokens are produced.
valeurs = "".join(unidecode(c.lower()) for c in value if c in CHARS).split()

liste = []
if len(valeurs) > 1:
    for i, token in enumerate(valeurs):
        if token not in prenoms:
            continue
        liste.append(token)
        try:
            # Forward lookup: the token after a first name is taken as the
            # family name; after a particle, keep going (up to two particles).
            liste.append(valeurs[i + 1])
            if valeurs[i + 1] in family_joint and valeurs[i + 2] not in liste:
                liste.append(valeurs[i + 2])
                if valeurs[i + 2] in family_joint and valeurs[i + 3] not in liste:
                    liste.append(valeurs[i + 3])
        except IndexError:
            # The forward lookup ran past the end of the input, so look
            # backwards instead ("family-name first-name" order). Tokens are
            # inserted at position 1 of the result list.
            try:
                if valeurs[i - 1] not in liste:
                    liste.insert(1, valeurs[i - 1])
                    if valeurs[i - 2] in family_joint and valeurs[i - 2] not in liste:
                        liste.insert(1, valeurs[i - 2])
                        if valeurs[i - 3] in family_joint and valeurs[i - 3] not in liste:
                            liste.insert(1, valeurs[i - 3])
            except IndexError:
                # Negative indexes wrap around instead of raising, so this is
                # only a defensive guard for very short inputs.
                pass

# De-duplicate the result while preserving first-occurrence order.
seen = set()
uniques = []
for item in liste:
    if item not in seen:
        seen.add(item)
        uniques.append(item)
liste = uniques

print(" ".join(liste))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment