Created
October 7, 2021 20:39
-
-
Save restrepo/8c4f1c8721dfe06232bab713a53be4cd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
def split_names(s, exceptions=('Gil', 'Lew', 'Liz', 'Paz', 'Rey', 'Rio', 'Roa', 'Rua', 'Sus', 'Zea'),
                nacionality='Colombiana'):
    """
    Split a full name written as ``NAMES FIRST_LAST_NAME [SECOND_LAST_NAME]``
    into its parts.

    The input is title-cased first. Words of one to three characters that are
    followed by another word (typically connectors such as "De", "La", "Del")
    are glued to the word that follows them, so 'De La Cuesta' is treated as a
    single name part. Short words listed in `exceptions` are real last names
    and are kept as separate parts.

    After that grouping the last part is taken as the second last name, the
    one before it as the first last name, and everything else as given names.

    Special cases, decided on the raw word count of the input:

    * exactly 3 words: read as NAME LAST LAST. When `nacionality` is
      'Extranjero - otra' or 'Brasilera' the user is prompted on stdin; an
      answer of 'n', 'no' or 'not' (any case) switches the reading to
      NAME NAME LAST with an empty second last name.
    * exactly 2 words: read as NAME LAST with an empty second last name.

    Parameters
    ----------
    s : str
        Full name, given names first.
    exceptions : iterable of str
        Short (<= 3 characters) last names, in title case, that must not be
        glued to the following word. Pass a longer collection if necessary.
    nacionality : str
        Nationality label; only consulted for 3-word names.

    Returns
    -------
    dict
        Keys 'NOMBRE COMPLETO' (last names followed by given names),
        'PRIMER APELLIDO', 'SEGUNDO APELLIDO', 'NOMBRES' and 'INICIALES'
        (initials of the given names, e.g. 'M. D. C.').

    Raises
    ------
    IndexError
        If fewer than two name parts remain after grouping (for example an
        empty or single-word input).

    Works with:
    ----
    s='DANIEL ANDRES LA ROTTA FORERO'
    s='MARIA DEL CONSUELO MONTES RAMIREZ'
    s='RICARDO DE LA MERCED CALLEJAS POSADA'
    s='MARIA DEL CARMEN DE LA CUESTA BENJUMEA'
    s='NICOLAS CARLOS MARTI JARAMILLO OCAMPO'
    s='DIEGO ALEJANDRO RESTREPO QUINTERO'
    s='JAIRO HUMBERTO RESTREPO ZEA'
    s='MARLEN JIMENEZ DEL RIO'
    Fails with more than 2 last names:
    ----
    s='ANDRES MAURICIO RANGEL MARTINEZ VILLAL'
    """
    s = s.title()

    # Mark every short word with a trailing '::' so that str.split() keeps it
    # attached to the next word. `flags` must be passed by keyword: the fourth
    # positional argument of re.sub is `count`, not `flags`.
    # 1) short word in the middle of the name
    sl = re.sub(r'(\s\w{1,3})\s', r'\1::', s, flags=re.UNICODE)
    # 2) second short word of a pair ("De La X"), skipped by the first pass
    #    because that pass consumed the whitespace in front of it
    sl = re.sub(r'(\s\w{1,3}::\w{1,3})\s', r'\1::', sl, flags=re.UNICODE)
    # 3) short word at the very beginning
    sl = re.sub(r'^(\w{1,3})\s', r'\1::', sl, flags=re.UNICODE)

    # Un-glue the short words that are known last names.
    short_words = [w for w in re.split(r'(\w{1,3})::', sl) if 1 <= len(w) <= 3]
    for short_last_name in exceptions:
        if short_last_name in short_words:
            sl = sl.replace(f'{short_last_name}::', f'{short_last_name} ')

    words = s.split()
    sll = sl.split()
    if len(words) == 3:
        ibero = True
        if nacionality in ['Extranjero - otra', 'Brasilera']:
            answer = input(f'Is {s} from Ibero-America? (y/n)')
            if answer.lower() in ('n', 'not', 'no'):
                ibero = False
        # Pad to four slots: one given name + two last names, or
        # two given names + one last name.
        sll = [''] + words if ibero else words + ['']
    elif len(words) == 2:
        sll = [''] + words + ['']

    given = [part for part in sll[:-2] if part]
    last = [part for part in sll[-2:] if part]
    names = ' '.join(given).replace('::', ' ')

    return {
        # Slice from the end, like every other field, so the full name is
        # still "last names + given names" when there are not exactly 4 parts.
        'NOMBRE COMPLETO': ' '.join(last + given).replace('::', ' '),
        'PRIMER APELLIDO': sll[-2].replace('::', ' '),
        'SEGUNDO APELLIDO': sll[-1].replace('::', ' '),
        'NOMBRES': names,
        'INICIALES': ' '.join(word[0] + '.' for word in names.split()),
    }
# Import-time regression checks for split_names.
# Each row is (full name, extra keyword arguments, result key, expected value).
# NOTE: the last row is a 3-word name with nacionality='Extranjero - otra', so
# split_names prompts on stdin; the check passes only when the answer is
# 'n', 'no' or 'not'.
_SPLIT_NAMES_CASES = (
    ('DANIEL ANDRES LA ROTTA FORERO', {}, 'NOMBRES', 'Daniel Andres'),
    ('Jairo Humberto Zea Restrepo', {}, 'NOMBRES', 'Jairo Humberto'),
    ('RICARDO DE LA MERCED CALLEJAS POSADA', {}, 'NOMBRES', 'Ricardo De La Merced'),
    ('MARIA DEL CARMEN DE LA CUESTA BENJUMEA', {}, 'NOMBRES', 'Maria Del Carmen'),
    ('MARIA DEL CARMEN DE LA CUESTA BENJUMEA', {}, 'PRIMER APELLIDO', 'De La Cuesta'),
    ('NICOLAS CARLOS MARTI JARAMILLO OCAMPO', {}, 'NOMBRES', 'Nicolas Carlos Marti'),
    ('DIEGO ALEJANDRO RESTREPO QUINTERO', {}, 'NOMBRES', 'Diego Alejandro'),
    ('DIEGO RESTREPO QUINTERO', {'nacionality': 'Colombiana'}, 'NOMBRES', 'Diego'),
    ('DIEGO RESTREPO', {'nacionality': 'Colombiana'}, 'NOMBRES', 'Diego'),
    ('Anand Jagadeesh Puppala', {'nacionality': 'Extranjero - otra'}, 'NOMBRES', 'Anand Jagadeesh'),
)
for _full_name, _extra, _field, _expected in _SPLIT_NAMES_CASES:
    assert split_names(_full_name, **_extra)[_field] == _expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment