Created
April 11, 2022 14:43
-
-
Save cutterkom/dee09712bdda23c91cf28e4e536d166e to your computer and use it in GitHub Desktop.
Spacy German date patterns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Token-level regular expressions for German date expressions, followed by the
# label/pattern rules that combine them.
#
# All regexes are raw strings so that sequences such as \d, \. and \s reach the
# regex engine unchanged instead of being parsed as (invalid) Python string
# escapes.
#
# NOTE(review): most expressions below are not anchored on both ends. If the
# {"TEXT": {"REGEX": ...}} predicate uses search semantics (spaCy's appears to),
# they can match inside longer tokens. Some rules seem to depend on this, e.g.
# `month` matching a compact token like "10.10.2000" -- confirm before
# tightening any of them.

# Words that introduce a date or the start of a range:
# ab, zwischen, bis, seit, von (first letter in either case).
date_marker_start = r"[Aa]b$|[Zz]wischen$|[Bb]is$|[Ss]eit$|[Vv]on$"

# Connector between the two ends of a range: a hyphen, "und" or "bis".
date_marker_between = r"\-|und|bis"

# Decade suffixes. Not referenced by any rule in `patterns` below; kept so the
# module-level name stays available.
date_marker_decade = "er|er Jahre"

# Day of month, 1-31, with an optional leading zero for single digits.
day = r"0?[1-9]|[12]\d|3[01]"

# Month, either as a German month name or as a number followed by a dot.
month = (
    r"Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember"
    r"|0?1\.|0?2\.|0?3\.|0?4\.|0?5\.|0?6\.|0?7\.|0?8\.|0?9\.|10\.|11\.|12\."
)

# Four-digit year: years from 1000 to 2999.
year = "[12][0-9]{3}"

# Decade written as a year ending in 0 plus "er" or "ern": 1970er, 1990ern.
decade_years = "[12][0-9]{2}0ern?"

# Compact numeric date "D.M.YYYY", anchored to the whole token. Covers all
# combinations of German compact dates for the years 1600-9999, enforcing the
# number of days per month and allowing 29.02. only in leap years.
daymonthyear = (
    r"^("
    # days 1-31 in the months 1, 3, 5, 7, 8, 10, 12
    r"((0?[1-9]|[12]\d|3[01])\.(0[13578]|[13578]|1[02])\.((1[6-9]|[2-9]\d)\d{2}))"
    # days 1-30 in every month except February
    r"|((0?[1-9]|[12]\d|30)\.(0[13456789]|[13456789]|1[012])\.((1[6-9]|[2-9]\d)\d{2}))"
    # days 1-28 in February
    r"|((0?[1-9]|1\d|2[0-8])\.0?2\.((1[6-9]|[2-9]\d)\d{2}))"
    # 29 February, leap years only
    r"|(29\.0?2\.((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))"
    r")$"
)

# Rules in {"label": ..., "pattern": [token specs]} form. Each token spec
# constrains one token; "OP": "?" marks that token as optional.
patterns = [
    {
        "label": "DATE_BETWEEN",
        "pattern": [
            # e.g. "zwischen 21. Juni 2000 und 25. Juli 2000"
            # e.g. "von 21. Januar bis 10.10.2000"
            {"TEXT": {"REGEX": date_marker_start}},
            {"TEXT": {"REGEX": day}, "OP": "?"},
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}, "OP": "?"},
            {"TEXT": {"REGEX": date_marker_between}},
            {"TEXT": {"REGEX": day}, "OP": "?"},
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}, "OP": "?"},
        ],
    },
    {
        "label": "DATE_SINGLE",
        "pattern": [
            # e.g. "1. Mai 2022"
            {"TEXT": {"REGEX": day}},
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    {
        "label": "DATE_SINGLE_WITHOUT_DAY",
        "pattern": [
            # e.g. "Mai 2022"
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    {
        "label": "DATE_SINGLE_COMPACT",
        "pattern": [
            # e.g. "1.10.2000"
            {"TEXT": {"REGEX": daymonthyear}},
        ],
    },
    {
        "label": "DATE_BETWEEN",
        "pattern": [
            # e.g. "zwischen 1988 und 2020"; years are matched by token shape
            # (four digits) rather than by the `year` regex.
            {"TEXT": {"REGEX": date_marker_start}},
            {"SHAPE": "dddd"},
            {"TEXT": {"REGEX": date_marker_between}},
            {"SHAPE": "dddd"},
        ],
    },
    {
        "label": "DATE_DECADE",
        "pattern": [
            # e.g. "1970er", "1990ern"
            {"TEXT": {"REGEX": decade_years}},
        ],
    },
    {
        "label": "DATE_START",
        "pattern": [
            # e.g. "Ab 1820", "bis 1990"
            {"TEXT": {"REGEX": date_marker_start}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    {
        "label": "DATE_YEAR_FROM_TO",
        "pattern": [
            # e.g. "1999 - 2022", "1999-2022"
            {"TEXT": {"REGEX": year}},
            {"TEXT": {"REGEX": r"\s?-\s?"}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    {
        "label": "DATE_YEAR",
        "pattern": [
            # e.g. "1999"
            {"TEXT": {"REGEX": year}},
        ],
    },
]
Works on:
text = "von April bis 10.Januar 2000, 10.1.2000, 1999 - 2022, 1999-2022 Ab 1820, ab 1830, 1. Mai 2022, 31. Dezember 1200 ,2000, bis 1954, August 1924, Zwischen 1988 und 2020, April bis 10. Mai, April 1929 bis 10. Mai 2000, bis in die 1970er Jahre, ab 1992, zwischen 21. Juni 2000 und 25. Juli 2000, Seit 2000, 1988, 1.1.2000, von 21. Januar bis 10.10.2000"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Pattern for `daymonthyear`: https://regexlib.com/REDetails.aspx?regexp_id=319