Skip to content

Instantly share code, notes, and snippets.

@cutterkom
Created April 11, 2022 14:43
Show Gist options
  • Save cutterkom/dee09712bdda23c91cf28e4e536d166e to your computer and use it in GitHub Desktop.
Save cutterkom/dee09712bdda23c91cf28e4e536d166e to your computer and use it in GitHub Desktop.
spaCy German date patterns
# Regex fragments for German date expressions. Each one is meant to be used as
# a per-token {"TEXT": {"REGEX": ...}} predicate in spaCy token patterns.
# NOTE(review): spaCy evaluates REGEX with search semantics (a match anywhere
# inside the token text counts), so every fragment without ^...$ also accepts
# longer tokens that merely contain a match -- confirm against the spaCy docs.

# Word that opens a date or date range: ab, zwischen, bis, seit, von
# (first letter in either case). Anchored on both sides so that only the
# whole token matches; without "^" words such as "davon" or "Kürbis" would
# qualify. The class [Zz] replaces the former [Zw], which only matched
# "Zwischen" by accident through its tail "wischen".
date_marker_start = r"^(?:[Aa]b|[Zz]wischen|[Bb]is|[Ss]eit|[Vv]on)$"

# Separator between the two halves of a range: "-", "und" or "bis".
# Anchored so that tokens like "Hund" or "bisher" are not taken as separators.
date_marker_between = r"^(?:-|und|bis)$"

# Suffix of a decade expression ("er", "er Jahre").
date_marker_decade = "er|er Jahre"

# Day of month 1-31 with optional leading zero. Deliberately left unanchored:
# tokens such as "21." (with the ordinal dot) must still match.
day = r"0?[1-9]|[12]\d|3[01]"

# German month name, or a numeric month followed by a dot ("1." ... "12.").
# Left unanchored: a compact token such as "10.10.2000" appears to be picked
# up as the closing part of a range through the "10\." alternative --
# TODO confirm against the tokenizer output before tightening this.
month = r"Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|0?1\.|0?2\.|0?3\.|0?4\.|0?5\.|0?6\.|0?7\.|0?8\.|0?9\.|10\.|11\.|12\."

# Four digits starting with 1 or 2, i.e. the years 1000 to 2999.
year = "[12][0-9]{3}"

# Decade written as one token, e.g. "1970er" or "1990ern".
decade_years = "[12][0-9]{2}0ern?"

# Fully anchored compact date "D.M.YYYY" (leading zeros optional) that checks
# the day against the month length:
#   1) 31-day months, 2) days up to 30 for every month except February,
#   3) 1-28 February, 4) 29 February in leap years only.
# The year part accepts 1600-9999, which is a different range than `year`.
daymonthyear = r"^(((0?[1-9]|[12]\d|3[01])\.(0[13578]|[13578]|1[02])\.((1[6-9]|[2-9]\d)\d{2}))|((0?[1-9]|[12]\d|30)\.(0[13456789]|[13456789]|1[012])\.((1[6-9]|[2-9]\d)\d{2}))|((0?[1-9]|1\d|2[0-8])\.0?2\.((1[6-9]|[2-9]\d)\d{2}))|(29\.0?2\.((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))$"
# Token patterns for German date expressions, one dict per rule with a
# "label" and a "pattern" (a list of per-token predicates).
# NOTE(review): this is the shape spaCy's EntityRuler accepts via
# add_patterns(); the consuming code is not part of this file -- confirm.
# Each {"TEXT": {"REGEX": ...}} entry tests the text of exactly one token;
# "OP": "?" makes that token optional.
patterns = [
    # Range with month names and optional day/year on both sides, e.g.
    #   "zwischen 21. Juni 2000 und 25. Juli 2000"
    #   "von 21. Januar bis 10.10.2000"
    {
        "label": "DATE_BETWEEN",
        "pattern": [
            {"TEXT": {"REGEX": date_marker_start}},
            {"TEXT": {"REGEX": day}, "OP": "?"},
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}, "OP": "?"},
            {"TEXT": {"REGEX": date_marker_between}},
            {"TEXT": {"REGEX": day}, "OP": "?"},
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}, "OP": "?"},
        ],
    },
    # Full date as three tokens, e.g. "1. Mai 2022"
    {
        "label": "DATE_SINGLE",
        "pattern": [
            {"TEXT": {"REGEX": day}},
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    # Month and year only, e.g. "Mai 2022"
    {
        "label": "DATE_SINGLE_WITHOUT_DAY",
        "pattern": [
            {"TEXT": {"REGEX": month}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    # Compact numeric date in a single token, e.g. "1.10.2000"
    {
        "label": "DATE_SINGLE_COMPACT",
        "pattern": [
            {"TEXT": {"REGEX": daymonthyear}},
        ],
    },
    # Range of bare years, e.g. "zwischen 1988 und 2020".
    # SHAPE "dddd" accepts any token made of exactly four digits, so this rule
    # is not limited to the year range that `year` describes.
    {
        "label": "DATE_BETWEEN",
        "pattern": [
            {"TEXT": {"REGEX": date_marker_start}},
            {"SHAPE": "dddd"},
            {"TEXT": {"REGEX": date_marker_between}},
            {"SHAPE": "dddd"},
        ],
    },
    # Decade in one token, e.g. "1970er", "1990ern"
    {
        "label": "DATE_DECADE",
        "pattern": [
            {"TEXT": {"REGEX": decade_years}},
        ],
    },
    # Marker word followed by a year, e.g. "Ab 1820", "bis 1990"
    {
        "label": "DATE_START",
        "pattern": [
            {"TEXT": {"REGEX": date_marker_start}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    # Year range with a hyphen, e.g. "1999 - 2022", "1999-2022"
    {
        "label": "DATE_YEAR_FROM_TO",
        "pattern": [
            {"TEXT": {"REGEX": year}},
            # Raw string: "\s" is not a valid escape in a normal Python string
            # literal and triggers a SyntaxWarning on recent Python versions.
            {"TEXT": {"REGEX": r"\s?-\s?"}},
            {"TEXT": {"REGEX": year}},
        ],
    },
    # Single year, e.g. "1999"
    {
        "label": "DATE_YEAR",
        "pattern": [
            {"TEXT": {"REGEX": year}},
        ],
    },
]
@cutterkom
Copy link
Author

Works on:

# Sample input that the gist author reports the patterns work on.
text = "von April bis 10.Januar 2000, 10.1.2000, 1999 - 2022, 1999-2022 Ab 1820, ab 1830, 1. Mai 2022, 31. Dezember 1200 ,2000, bis 1954, August 1924, Zwischen 1988 und 2020, April bis 10. Mai, April 1929 bis 10. Mai 2000, bis in die 1970er Jahre, ab 1992, zwischen 21. Juni 2000 und 25. Juli 2000, Seit 2000, 1988, 1.1.2000, von 21. Januar bis 10.10.2000"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment