Skip to content

Instantly share code, notes, and snippets.

@dansondergaard
Created May 20, 2016 13:44
Show Gist options
  • Select an option

  • Save dansondergaard/66ba2e048f38db683a2bb957f894a253 to your computer and use it in GitHub Desktop.

Select an option

Save dansondergaard/66ba2e048f38db683a2bb957f894a253 to your computer and use it in GitHub Desktop.
Script for parsing the international UN days page.
"""Script for parsing the international UN days page."""
from datetime import datetime, timedelta
import requests
import requests_cache
requests_cache.install_cache('ido_cache', expire_after=timedelta(weeks=4))
from bs4 import BeautifulSoup
def date_of_international_day(tag):
    """Tell whether *tag* is an ``h4`` heading with a linked paragraph after it.

    Intended as a filter function for ``soup.find_all``: a tag qualifies when
    it is an ``h4`` element, a later sibling ``p`` element exists, and that
    paragraph contains at least one ``a`` element.

    Returns:
        ``True`` if all three conditions hold, ``False`` otherwise.
    """
    if tag.name != 'h4':
        return False
    following_paragraph = tag.find_next_sibling('p')
    return (
        following_paragraph is not None
        and following_paragraph.find('a') is not None
    )
def extract_title(tag):
    """Return the ASCII-only, single-line title found in the paragraph after *tag*.

    Args:
        tag: A heading element; its next sibling ``p`` element holds the title.

    Returns:
        The paragraph text with non-ASCII characters removed and every run of
        whitespace collapsed to a single space.

    Raises:
        AttributeError: if *tag* has no following ``p`` sibling.
    """
    text = tag.find_next_sibling('p').get_text()
    # Convert non-breaking spaces to plain spaces *before* dropping non-ASCII
    # characters; otherwise the words they separate would be glued together.
    text = text.replace('\xa0', ' ')
    ascii_text = text.encode('ascii', 'ignore').decode('ascii')
    # Collapse whitespace (including newlines from the markup) so the title
    # prints on a single line.
    return ' '.join(ascii_text.split())
def extract_date(tag, year=None):
    """Parse the date or date range in a heading into a pair of datetimes.

    Accepted forms are a single day (``"21 March"``), a range within one
    month (``"4-10 October"``, ``"4 - 10 October"``) and a range spanning
    two months (``"29 September-5 October"``). Anything from the first
    ``(`` onwards is ignored.

    Args:
        tag: The heading element. If it contains a link, the date text is
            read from the sibling that follows the heading's first child
            node instead of from the heading itself.
        year: Year to assign to the parsed dates. Defaults to the current
            year.

    Returns:
        A ``(begin, end)`` tuple: ``begin`` is midnight on the first day and
        ``end`` is 23:59:59 on the last day.

    Raises:
        ValueError: if the text cannot be parsed as a date, or the date does
            not exist in *year* (e.g. 29 February in a non-leap year).
    """
    if tag.find('a'):
        tag = tag.next_element.next_sibling.string
    text = tag.string

    # Treat en/em dashes as ordinary hyphens so ranges written with them parse.
    for dash in ('\u2013', '\u2014'):
        text = text.replace(dash, '-')
    # Every other non-ASCII character (typically a non-breaking space) is
    # treated as a separator rather than being left behind as '?'.
    text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)
    # Drop a trailing parenthesised note without assuming a space precedes it.
    text = text.partition('(')[0]
    text = ' '.join(text.split())

    first, hyphen, last = text.partition('-')
    first = first.strip()
    last = last.strip()
    if hyphen:
        if ' ' not in first:
            # "4-10 October": the start day borrows the month of the end day.
            first = first + ' ' + last.partition(' ')[2]
    else:
        last = first

    if year is None:
        year = datetime.now().year
    # Parse with the year included: without it strptime assumes 1900, which
    # rejects 29 February.
    date_format = '%d %B %Y'
    begindate = datetime.strptime('{} {}'.format(first, year), date_format)
    enddate = datetime.strptime('{} {}'.format(last, year), date_format)
    enddate = enddate.replace(hour=23, minute=59, second=59)
    return begindate, enddate
def main():
    """Fetch the UN international-days page and print one row per observance.

    Each row shows a marker, the begin date, the end date and the title. The
    marker is ``*`` when the current moment falls inside the observance's
    date range and a blank otherwise.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    res = requests.get(
        'http://www.un.org/en/sections/observances/international-days/',
        # Without a timeout an unresponsive server would hang the script.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error page and
    # printing nothing.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # Take one timestamp so every row is compared against the same moment.
    now = datetime.today()
    for tag in soup.find_all(date_of_international_day):
        begindate, enddate = extract_date(tag)
        title = extract_title(tag)
        marker = '*' if begindate <= now <= enddate else ' '
        print('{:<2} {:<20} {:<20} {}'.format(
            marker,
            begindate.strftime(r'%d %B'),
            enddate.strftime(r'%d %B'),
            title))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment