You can use the `re` module together with string manipulation to extract the dates easily:
import requests
import re
import json
# Selects from the beginning of date-like text (a run of day numbers separated
# by whitespace, commas or "y") through the four-digit year 20xx; group 1 is
# that whole date region.
# NOTE(review): the nested quantifier in this pattern may backtrack heavily on
# very long digit runs that are not followed by a year; fine for short snippets.
DATE_REGION_PATTERN = re.compile(r'\s*((\d+[\sy\,]*)+[\D\s]+20\d{2})')
YEAR_PATTERN = re.compile(r'(20\d{2})')
DAY_PATTERN = re.compile(r'(\d+)')

# Month names recognised by default (matched case-insensitively).
SPANISH_MONTH_NAMES = (
    'enero',
    'febrero',
    'marzo',
    'abril',
    'mayo',
    'junio',
    'julio',
    'agosto',
    'septiembre',
    'octubre',
    'noviembre',
    'diciembre',
)


def extract_dates(texts, month_names=SPANISH_MONTH_NAMES):
    """Extract ``day/month/year`` strings from free-text snippets.

    Each text is searched for a region that looks like a list of day numbers
    followed by a month name and a year 20xx. Every number found before the
    month name is assumed to be a day of that month.

    Args:
        texts: Iterable of strings to scan.
        month_names: Iterable of month names to look for; matching is
            case-insensitive and the month is reported as spelled in the text.

    Returns:
        A list with one entry per text that contained a recognisable date
        region, each entry being a list of ``'<day>/<month>/<year>'`` strings.
        Texts without a date region, or whose region contains none of
        ``month_names``, are skipped.
    """
    # Escape the names so that any regex metacharacter in them stays literal.
    alternation = "|".join(re.escape(name) for name in month_names)
    month_pattern = re.compile(f'({alternation})', flags=re.IGNORECASE)

    all_dates = []
    for item in texts:
        match = DATE_REGION_PATTERN.search(item)
        if not match:
            continue
        date_region = match.group(1)

        # The date-region pattern ends with a year, so this search always hits.
        year = YEAR_PATTERN.search(date_region).group(1)

        month_match = month_pattern.search(date_region)
        if month_match is None:
            # Unknown month name: skip instead of failing on `.group`.
            continue
        month = month_match.group(1)

        # Everything before the month name holds the day numbers.
        days = DAY_PATTERN.findall(date_region[: month_match.start()])
        all_dates.append([f'{d}/{month}/{year}' for d in days])
    return all_dates


if __name__ == "__main__":
    texts = [
        'en los dias 3,06,8 ,9, 15 y 29 de diciembre de 2018.Por c',
        'n en: 2,04,05,8,9,10,11,14,15,22,24, y 27 de junio de 2018.Valor de',
    ]
    print(extract_dates(texts))
I don't know the remaining month names (is this Portuguese?), but adding them to `month_names` is a trivial task. Output:
[['3/diciembre/2018',
'06/diciembre/2018',
'8/diciembre/2018',
'9/diciembre/2018',
'15/diciembre/2018',
'29/diciembre/2018'],
['2/junio/2018',
'04/junio/2018',
'05/junio/2018',
'8/junio/2018',
'9/junio/2018',
'10/junio/2018',
'11/junio/2018',
'14/junio/2018',
'15/junio/2018',
'22/junio/2018',
'24/junio/2018',
'27/junio/2018']]