Created
January 25, 2019 16:22
-
-
Save Attumm/825a6bc90fdb8d1c3de9a6eccbd8f09a to your computer and use it in GitHub Desktop.
parse date
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ======== parsing pdf ======= | |
| woensdag 2 januari 2019 | |
| dinsdag 31 december 2019 | |
| vrijdag 31 december 2021 | |
| vrijdag 31 december 2021 | |
| donderdag 6 december 2018 Om 23:59 | |
| vrijdag 7 december 2018 Vóór 23:59 | |
| maandag 10 december 2018 Vóór 23:59 | |
| woensdag 12 december 2018 Vóór 23:59 | |
| maandag 31 december 2018 | |
| ======== parsing pdf ======= | |
| maandag 7 januari 2019 | |
| dinsdag 30 april 2019 | |
| donderdag 30 april 2020 | |
| donderdag 6 december 2018 Om 23:59 | |
| vrijdag 7 december 2018 Vóór 23:59 | |
| maandag 10 december 2018 Vóór 23:59 | |
| woensdag 12 december 2018 Vóór 23:59 | |
| maandag 31 december 2018 | |
| ======== parsing pdf ======= | |
| dinsdag 1 januari 2019 | |
| donderdag 28 februari 2019 | |
| donderdag 31 december 2020 | |
| donderdag 31 december 2020 | |
| donderdag 6 december 2018 Om 23:59 | |
| vrijdag 7 december 2018 Vóór 23:59 | |
| maandag 10 december 2018 Vóór 23:59 | |
| woensdag 12 december 2018 Vóór 23:59 | |
| maandag 31 december 2018 | |
| ======== parsing pdf ======= | |
| maandag 11 februari 2019 | |
| dinsdag 31 december 2019 | |
| zaterdag 31 december 2022 | |
| donderdag 17 januari 2019 Om 23:59 | |
| vrijdag 18 januari 2019 Vóór 23:59 | |
| maandag 21 januari 2019 Vóór 23:59 | |
| woensdag 23 januari 2019 Vóór 23:59 | |
| donderdag 7 februari 2019 | |
| ======== parsing pdf ======= | |
| maandag 11 februari 2019 | |
| dinsdag 31 december 2019 | |
| vrijdag 31 december 2021 | |
| vrijdag 31 december 2021 | |
| donderdag 17 januari 2019 Om 23:59 | |
| vrijdag 18 januari 2019 Vóór 23:59 | |
| maandag 21 januari 2019 Vóór 23:59 | |
| donderdag 24 donderdag 2019 Vóór 23:59 | |
| donderdag 7 februari 2019 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import datetime | |
class UnableToParse(Exception):
    """Signal that a tokenised line could not be read as a date."""
# Dutch weekday name -> weekday number, counting up from 'zondag' as 0.
trans_day = {
    name: number
    for number, name in enumerate((
        'zondag',
        'maandag',
        'dinsdag',
        'woensdag',
        'donderdag',
        'vrijdag',
        'zaterdag',
    ))
}
# Dutch month name -> month number, counting up from 'januari' as 1.
trans_month = {
    name: number
    for number, name in enumerate((
        'januari',
        'februari',
        'maart',
        'april',
        'mei',
        'juni',
        'juli',
        'augustus',
        'september',
        'oktober',
        'november',
        'december',
    ), start=1)
}
def get_day_name(raw_item):
    """Return the first token of ``raw_item`` unchanged.

    Raises:
        UnableToParse: if ``raw_item`` has no first token.
    """
    try:
        first_token = raw_item[0]
    except IndexError:
        raise UnableToParse
    return first_token
def get_day_week(raw_item):
    """Return the weekday number for the day name in ``raw_item[0]``.

    The number is looked up in the module-level ``trans_day`` table.

    Raises:
        UnableToParse: if ``raw_item`` is empty or its first token is not
            a key of ``trans_day``.
    """
    try:
        return trans_day[raw_item[0]]
    # IndexError: there is no first token at all; KeyError: the token is
    # not a known day name. Both mean the line cannot be parsed.
    except (IndexError, KeyError):
        raise UnableToParse
def get_day(raw_item):
    """Return the second token of ``raw_item`` converted to ``int``.

    Raises:
        UnableToParse: if the token is missing or ``int()`` rejects it.
    """
    try:
        day_token = raw_item[1]
        day_number = int(day_token)
    except (IndexError, ValueError):
        raise UnableToParse
    return day_number
def get_month_name(raw_item):
    """Return the third token of ``raw_item`` unchanged.

    Raises:
        UnableToParse: if ``raw_item`` has fewer than three tokens.
    """
    try:
        third_token = raw_item[2]
    except IndexError:
        raise UnableToParse
    return third_token
def get_month(raw_item):
    """Return the month number for the month name in ``raw_item[2]``.

    The number is looked up in the module-level ``trans_month`` table.

    Raises:
        UnableToParse: if ``raw_item`` has fewer than three tokens or the
            third token is not a key of ``trans_month``.
    """
    try:
        return trans_month[raw_item[2]]
    # IndexError: the month token is missing; KeyError: the token is not a
    # known month name. Both mean the line cannot be parsed.
    except (IndexError, KeyError):
        raise UnableToParse
def get_year(raw_item):
    """Return the fourth token of ``raw_item`` converted to ``int``.

    Raises:
        UnableToParse: if the token is missing or ``int()`` rejects it.
    """
    try:
        year_token = raw_item[3]
        year_number = int(year_token)
    except (IndexError, ValueError):
        raise UnableToParse
    return year_number
def parse_item(line):
    """Split ``line`` on whitespace and extract its date fields.

    Returns:
        A dict with the keys ``day_name``, ``day_week``, ``day``,
        ``month_name``, ``month`` and ``year``, or ``None`` when any of the
        field extractors raises ``UnableToParse``.
    """
    tokens = line.split()
    # (result key, extractor) pairs, kept in the order the fields are read.
    field_extractors = (
        ('day_name', get_day_name),
        ('day_week', get_day_week),
        ('day', get_day),
        ('month_name', get_month_name),
        ('month', get_month),
        ('year', get_year),
    )
    try:
        parsed = {key: extract(tokens) for key, extract in field_extractors}
    except UnableToParse:
        return None
    return parsed
def valid_lines(filename):
    """Yield the lines of ``filename`` that are candidates for date parsing.

    Lines are yielded unchanged (including their line ending). A line is
    skipped when it is shorter than 10 characters or contains ``'===='`` or
    ``'parsing'``.

    The file is opened in a ``with`` block so the handle is closed once the
    generator is exhausted or closed.
    """
    with open(filename) as handle:
        for line in handle:
            if len(line) < 10:
                continue
            if '====' in line or 'parsing' in line:
                continue
            yield line
# validation not yet used
def valid_day_of_month(i):
    """Return whether the day number in item ``i`` can exist in its month.

    The day is read from ``i['day']`` (the key ``parse_item`` produces); the
    older ``i['number']`` key is still accepted when ``'day'`` is absent.
    When ``i`` also has ``'month'`` and ``'year'``, the day is checked
    against the real length of that month; otherwise only the generic
    1..31 range is checked.
    """
    import calendar  # local import: only this helper needs it

    day = i['day'] if 'day' in i else i['number']
    if day < 1:
        return False
    month = i.get('month')
    year = i.get('year')
    if month is None or year is None:
        return day < 32
    # monthrange() returns (weekday of first day, number of days in month).
    return day <= calendar.monthrange(year, month)[1]
if __name__ == '__main__':
    # Input file: the argument following '-f', or 'dates.txt' by default.
    filename = 'dates.txt'
    if '-f' in sys.argv:
        flag_pos = sys.argv.index('-f')
        if flag_pos + 1 >= len(sys.argv):
            # '-f' was the last argument, so there is no filename to read.
            sys.exit('-f requires a filename')
        filename = sys.argv[flag_pos + 1]

    for line in valid_lines(filename):
        item = parse_item(line)
        date = None
        if item is not None:
            try:
                date = datetime.datetime(
                    year=item['year'], month=item['month'], day=item['day'])
            except ValueError:
                # The tokens parsed but do not form a real calendar date
                # (e.g. day 31 in a 30-day month); report the line instead
                # of aborting the whole run.
                date = None
        if date is None:
            # Echo lines that could not be turned into a date.
            print(line)
        else:
            print(date.isoformat())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment