justin@justin-XPS-13-9360:~/work$ python team_matcher.py
matched:
AFC Bournemouth: Bournemouth
Arsenal: Arsenal
Aston Villa: Aston Villa
Bournemouth: Bournemouth
Brighton: Brighton
Brighton & Hove Albion: Brighton
Burnley: Burnley
Chelsea: Chelsea
Crystal Palace: Crystal Palace
Everton: Everton
Leicester: Leicester
Leicester City: Leicester
Liverpool: Liverpool
Man City: Man City
Man United: Man Utd
Man Utd: Man Utd
Manchester City: Man City
Manchester United: Man Utd
Newcastle: Newcastle
Newcastle United: Newcastle
Norwich: Norwich
Norwich City: Norwich
Sheffield United: Sheffield Utd
Sheffield Utd: Sheffield Utd
Southampton: Southampton
Tottenham: Tottenham
Tottenham Hotspur: Tottenham
Watford: Watford
West Ham: West Ham
West Ham United: West Ham
Wolverhampton Wanderers: Wolves
Wolves: Wolves
unmatched: []
Last active
October 20, 2019 16:14
-
-
Save jhw/8e07a52d77c9d49ea571e46784e56054 to your computer and use it in GitHub Desktop.
Team matcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pyyaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
AFC Bournemouth: Bournemouth | |
Arsenal: Arsenal | |
Aston Villa: Aston Villa | |
Bournemouth: Bournemouth | |
Brighton: Brighton | |
Brighton & Hove Albion: Brighton | |
Burnley: Burnley | |
Chelsea: Chelsea | |
Crystal Palace: Crystal Palace | |
Everton: Everton | |
Leicester: Leicester | |
Leicester City: Leicester | |
Liverpool: Liverpool | |
Man City: Man City | |
Man United: Man Utd | |
Man Utd: Man Utd | |
Manchester City: Man City | |
Manchester United: Man Utd | |
Newcastle: Newcastle | |
Newcastle United: Newcastle | |
Norwich: Norwich | |
Norwich City: Norwich | |
Sheffield United: Sheffield Utd | |
Sheffield Utd: Sheffield Utd | |
Southampton: Southampton | |
Tottenham: Tottenham | |
Tottenham Hotspur: Tottenham | |
Watford: Watford | |
West Ham: West Ham | |
West Ham United: West Ham | |
Wolverhampton Wanderers: Wolves | |
Wolves: Wolves |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
""" | |
- don't use \\W here as it will remove accented characters
""" | |
def clean_text(text,
               pattern=r"\'|\-|\.|\,|\:|\;|\(|\)|\/",
               reject=("Real",)):
    """Normalise a raw team name into a lowercase, space-separated key.

    The text is split on single whitespace characters. Empty tokens,
    tokens listed in ``reject`` and tokens made only of digits are dropped;
    every match of ``pattern`` is then deleted from the surviving tokens.
    When the result is not exactly one token, tokens that equal their own
    upper-cased form (e.g. "AFC", "&") are dropped as well.

    Args:
        text: raw name to normalise.
        pattern: regular expression whose matches are deleted from each
            token; the default is an explicit list of punctuation marks.
        reject: collection of tokens to discard (exact, case-sensitive
            comparison against the raw token).

    Returns:
        The remaining tokens, lower-cased and joined with single spaces
        (possibly the empty string).
    """
    kept = []
    for tok in re.split(r"\s", text):
        if tok == "" or tok in reject:
            continue
        if re.search(r"^\d+$", tok) is not None:
            continue
        kept.append(re.sub(pattern, "", tok))
    # A lone token is kept even when it is all upper-case (e.g. "QPR");
    # otherwise upper-case-only tokens are dropped.
    if len(kept) != 1:
        kept = [tok for tok in kept if tok != tok.upper()]
    return " ".join(tok.lower() for tok in kept)
def run_matcher(matcher, text, teams):
    """Return the canonical name of the first team that ``matcher`` accepts.

    Args:
        matcher: callable invoked as ``matcher(cleaned_text, cleaned_name)``
            and returning a truthy value on a match.
        text: raw input name; normalised with ``clean_text`` before use.
        teams: iterable of mappings with a "name" key and an optional
            "alt_names" key holding further spellings.

    Returns:
        ``team["name"]`` of the first team whose name or alternative name
        matches, or None when nothing matches.
    """
    cleantext = clean_text(text)
    # Text that normalises to "" carries no information, yet the
    # abbreviation matcher treats an empty abbreviation as matching
    # anything; refuse it here instead of returning the first team.
    if not cleantext:
        return None
    for team in teams:
        candidates = [team["name"]]
        if "alt_names" in team:
            candidates.extend(team["alt_names"])
        for candidate in candidates:
            if matcher(cleantext, clean_text(candidate)):
                return team["name"]
    return None
""" | |
https://stackoverflow.com/questions/7331462/check-if-a-string-is-a-possible-abbrevation-for-a-name | |
""" | |
def is_abbrev(abbrev, text):
    """Recursively test whether ``abbrev`` can abbreviate ``text``.

    The first characters must be equal. The rest of ``abbrev`` must then
    abbreviate either the text with its first word removed, or one of the
    suffixes of ``text`` obtained by dropping between one and
    ``len(first word)`` leading characters.

    Args:
        abbrev: candidate abbreviation.
        text: full text to test against.

    Returns:
        True when ``abbrev`` is empty or can be matched as described,
        False otherwise.
    """
    if not abbrev:
        return True
    if not text or abbrev[0] != text[0]:
        return False
    rest = abbrev[1:]
    words = text.split()
    # Option 1: jump straight to the start of the next word.
    if is_abbrev(rest, " ".join(words[1:])):
        return True
    # Option 2: continue at a later position bounded by the first word's
    # length.  Whitespace-only text has no first word; it used to raise
    # IndexError here and now simply offers no further positions.
    head_len = len(words[0]) if words else 0
    return any(is_abbrev(rest, text[i + 1:]) for i in range(head_len))
def tokenmatch(x, y):
    """Score the token overlap between two space-separated strings.

    Both strings are split on single spaces; every (x token, y token)
    pair that compares equal counts as one hit.

    Returns:
        hits / (number of x tokens * number of y tokens).
    """
    left = x.split(" ")
    right = y.split(" ")
    hits = sum(1 for a in left for b in right if a == b)
    return hits / (len(left) * len(right))
""" | |
http://hetland.org/coding/python/levenshtein.py | |
""" | |
def levenshtein(a, b):
    """Return the Levenshtein edit distance between sequences ``a`` and ``b``.

    Only two rows of the dynamic-programming table are held at a time; the
    shorter input is used as the row dimension.
    """
    if len(a) > len(b):
        shorter, longer = b, a
    else:
        shorter, longer = a, b
    width = len(shorter)
    row = list(range(width + 1))
    for i, long_item in enumerate(longer, start=1):
        above, row = row, [i] + [0] * width
        for j, short_item in enumerate(shorter, start=1):
            substitution = above[j - 1] + (0 if short_item == long_item else 1)
            row[j] = min(above[j] + 1, row[j - 1] + 1, substitution)
    return row[width]
""" | |
- NB don't make abbreviation matching two-way, or "Dundee Utd" will get matched against "Dundee"
- NB levenshtein matching threshold increased to <= 2 for "Saint-Étienne" | |
- NB levenshtein promoted to 2nd place to avoid "Atlético" being matched to Real Madrid by token matcher | |
""" | |
# Matching strategies, tried in this order by match_team.  Each predicate
# receives (x, y); run_matcher passes the cleaned input text as x and the
# cleaned candidate team name as y.


def _exact_match(x, y):
    # Equality after (re-)normalising both sides.
    return clean_text(x) == clean_text(y)


def _levenshtein_match(x, y):
    # Tolerate up to two single-character edits.
    return levenshtein(x, y) <= 2


def _abbrev0_match(x, y):
    # The input abbreviates the team name.
    return is_abbrev(x, y)


def _abbrev1_match(x, y):
    # The team name abbreviates the input.
    return is_abbrev(y, x)


def _token_match(x, y):
    # At least half of the token pairs coincide.
    return tokenmatch(x, y) >= 0.5


Matchers = [
    ("exact", _exact_match),
    ("levenshtein", _levenshtein_match),
    ("abbrev0", _abbrev0_match),
    ("abbrev1", _abbrev1_match),
    ("token", _token_match),
]
def match_team(text, teams):
    """Resolve ``text`` to a team name using each entry of ``Matchers`` in turn.

    Returns:
        The first truthy result produced by ``run_matcher`` across the
        strategies, or None when no strategy yields one.
    """
    results = (run_matcher(matcher, text, teams)
               for _label, matcher in Matchers)
    return next((found for found in results if found), None)
def match_teams(texts, teams):
    """Match every text in ``texts`` against ``teams``.

    ``texts`` is consumed in a single pass, so one-shot iterators work as
    well as lists and dict views (a second pass over an exhausted iterator
    would silently report nothing as unmatched).

    Args:
        texts: iterable of raw team names.
        teams: team definitions, passed through to ``match_team``.

    Returns:
        A dict with "matched" (text -> team name) and "unmatched" (list of
        the texts for which ``match_team`` returned nothing, in input
        order).
    """
    matched = {}
    unmatched = []
    for text in texts:
        teamname = match_team(text, teams)
        if teamname:
            matched[text] = teamname
        else:
            unmatched.append(text)
    return {"matched": matched,
            "unmatched": unmatched}
def match_event(text, teams):
    """Resolve an "A v B" / "A vs B" event string to canonical team names.

    Args:
        text: event description containing exactly one " v " or " vs "
            separator (matched case-insensitively).
        teams: team definitions, passed through to ``match_team``.

    Returns:
        "<team> vs <team>" built from the matched names, or None when the
        text does not split into exactly two parts or either part cannot
        be matched.
    """
    # re.I must be passed as ``flags``: the third positional argument of
    # re.split is ``maxsplit``, so passing re.I there never enabled
    # case-insensitive matching.
    rawteamnames = re.split(" vs? ", text, flags=re.I)
    if len(rawteamnames) != 2:
        return None
    teamnames = [match_team(rawteamname, teams)
                 for rawteamname in rawteamnames]
    if None in teamnames:
        return None
    return "%s vs %s" % tuple(teamnames)
def match_events(texts, teams):
    """Match every event string in ``texts`` against ``teams``.

    ``texts`` is consumed in a single pass, so one-shot iterators work as
    well as lists and dict views (a second pass over an exhausted iterator
    would silently report nothing as unmatched).

    Args:
        texts: iterable of raw event descriptions.
        teams: team definitions, passed through to ``match_event``.

    Returns:
        A dict with "matched" (text -> event name) and "unmatched" (list
        of the texts for which ``match_event`` returned nothing, in input
        order).
    """
    matched = {}
    unmatched = []
    for text in texts:
        eventname = match_event(text, teams)
        if eventname:
            matched[text] = eventname
        else:
            unmatched.append(text)
    return {"matched": matched,
            "unmatched": unmatched}
if __name__ == "__main__":
    # Script entry point: load the team definitions and the sample names,
    # match every sample key and print the result as block-style YAML.
    import yaml

    # NOTE(review): yaml.FullLoader is kept as in the original; prefer
    # yaml.safe_load if these files can ever come from an untrusted source.
    # Context managers close the file handles deterministically.
    with open("teams.yaml") as teams_file:
        teams = yaml.load(teams_file.read(),
                          Loader=yaml.FullLoader)
    with open("samples.yaml") as samples_file:
        samples = yaml.load(samples_file.read(),
                            Loader=yaml.FullLoader)
    print(yaml.safe_dump(match_teams(samples.keys(), teams),
                         default_flow_style=False))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
- name: Arsenal | |
- name: Aston Villa | |
- name: Brighton | |
- name: Bournemouth | |
- name: Burnley | |
- name: Chelsea | |
- name: Crystal Palace | |
- name: Everton | |
- name: Leicester | |
- name: Liverpool | |
- name: Man City | |
- name: Man Utd | |
- name: Newcastle | |
- name: Norwich | |
- name: Sheffield Utd | |
- name: Southampton | |
- name: Tottenham | |
alt_names: | |
- Spurs | |
- name: Watford | |
- name: West Ham | |
- name: Wolves | |
alt_names: | |
- Wolverhampton Wanderers |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment