Skip to content

Instantly share code, notes, and snippets.

@jhw
Last active October 20, 2019 16:14
Show Gist options
  • Save jhw/8e07a52d77c9d49ea571e46784e56054 to your computer and use it in GitHub Desktop.
Save jhw/8e07a52d77c9d49ea571e46784e56054 to your computer and use it in GitHub Desktop.
Team matcher
justin@justin-XPS-13-9360:~/work$ python team_matcher.py 
matched:
  AFC Bournemouth: Bournemouth
  Arsenal: Arsenal
  Aston Villa: Aston Villa
  Bournemouth: Bournemouth
  Brighton: Brighton
  Brighton & Hove Albion: Brighton
  Burnley: Burnley
  Chelsea: Chelsea
  Crystal Palace: Crystal Palace
  Everton: Everton
  Leicester: Leicester
  Leicester City: Leicester
  Liverpool: Liverpool
  Man City: Man City
  Man United: Man Utd
  Man Utd: Man Utd
  Manchester City: Man City
  Manchester United: Man Utd
  Newcastle: Newcastle
  Newcastle United: Newcastle
  Norwich: Norwich
  Norwich City: Norwich
  Sheffield United: Sheffield Utd
  Sheffield Utd: Sheffield Utd
  Southampton: Southampton
  Tottenham: Tottenham
  Tottenham Hotspur: Tottenham
  Watford: Watford
  West Ham: West Ham
  West Ham United: West Ham
  Wolverhampton Wanderers: Wolves
  Wolves: Wolves
unmatched: []
---
AFC Bournemouth: Bournemouth
Arsenal: Arsenal
Aston Villa: Aston Villa
Bournemouth: Bournemouth
Brighton: Brighton
Brighton & Hove Albion: Brighton
Burnley: Burnley
Chelsea: Chelsea
Crystal Palace: Crystal Palace
Everton: Everton
Leicester: Leicester
Leicester City: Leicester
Liverpool: Liverpool
Man City: Man City
Man United: Man Utd
Man Utd: Man Utd
Manchester City: Man City
Manchester United: Man Utd
Newcastle: Newcastle
Newcastle United: Newcastle
Norwich: Norwich
Norwich City: Norwich
Sheffield United: Sheffield Utd
Sheffield Utd: Sheffield Utd
Southampton: Southampton
Tottenham: Tottenham
Tottenham Hotspur: Tottenham
Watford: Watford
West Ham: West Ham
West Ham United: West Ham
Wolverhampton Wanderers: Wolves
Wolves: Wolves
import re
"""
- don't use \\W here, as it would also remove accented characters
"""
def clean_text(text,
               pattern="\\'|\\-|\\.|\\,|\\:|\\;|\\(|\\)|\\/",
               reject=("Real",)):
    """Normalise a raw team name into a lower-case, space-separated key.

    The text is split on single whitespace characters.  A token is
    dropped when it is empty, is listed in ``reject``, or consists only
    of digits; these checks run on the raw token, before punctuation is
    stripped.  Every match of ``pattern`` is then removed from the
    surviving tokens.  Unless exactly one token is left, tokens that are
    equal to their own upper-case form (acronyms, bare symbols) are
    removed as well.  The remaining tokens are lower-cased and joined
    with single spaces.

    Args:
        text: raw name to normalise.
        pattern: regular expression whose matches are deleted from
            every token.
        reject: collection of tokens to drop outright (compared
            case-sensitively).  The default is an immutable tuple so it
            cannot be shared and mutated across calls.

    Returns:
        The normalised name; possibly an empty string.
    """
    tokens = [re.sub(pattern, "", tok)
              for tok in re.split("\\s", text)
              if tok != ""
              and tok not in reject
              and re.search("^\\d+$", tok) is None]
    # A lone token is kept even when upper-case, so that a name which is
    # nothing but an acronym (e.g. "QPR") does not collapse to "".
    if len(tokens) != 1:
        tokens = [tok for tok in tokens if tok != tok.upper()]
    return " ".join(tok.lower() for tok in tokens)
def run_matcher(matcher, text, teams):
    """Return the canonical name of the first team accepted by ``matcher``.

    ``text`` is normalised with ``clean_text`` once.  For each team, its
    ``name`` followed by any ``alt_names`` is normalised the same way and
    handed to ``matcher(cleaned_text, cleaned_candidate)``.  The ``name``
    of the first team with an accepted candidate is returned; ``None``
    when no candidate of any team is accepted.
    """
    needle = clean_text(text)
    for team in teams:
        candidates = [team["name"], *team.get("alt_names", [])]
        for candidate in candidates:
            if matcher(needle, clean_text(candidate)):
                return team["name"]
    return None
"""
https://stackoverflow.com/questions/7331462/check-if-a-string-is-a-possible-abbrevation-for-a-name
"""
def is_abbrev(abbrev, text):
    """Return True when ``abbrev`` can be read as an abbreviation of ``text``.

    An empty ``abbrev`` always matches; a non-empty ``abbrev`` never
    matches an empty ``text``.  Otherwise the first characters must be
    equal, and the rest of ``abbrev`` must abbreviate either the text
    with its first word removed, or one of the suffixes of ``text``
    that start after the 1st .. len(first word)-th character.
    """
    words = text.split()
    if not abbrev:
        return True
    if not text:
        return False
    if abbrev[0] != text[0]:
        return False
    remainder = abbrev[1:]
    # Option 1: skip the remainder of the first word entirely.
    if is_abbrev(remainder, " ".join(words[1:])):
        return True
    # Option 2: continue from any later position reachable within the
    # span of the first word.
    for cut in range(1, len(words[0]) + 1):
        if is_abbrev(remainder, text[cut:]):
            return True
    return False
def tokenmatch(x, y):
    """Score the token overlap of two strings.

    Both strings are split on single spaces.  The score is the number of
    equal (x-token, y-token) pairs divided by the total number of pairs,
    i.e. ``len(x tokens) * len(y tokens)``.
    """
    left = x.split(" ")
    right = y.split(" ")
    equal_pairs = sum(1 for a in left for b in right if a == b)
    return equal_pairs / (len(left) * len(right))
"""
http://hetland.org/coding/python/levenshtein.py
"""
def levenshtein(a, b):
    """Return the Levenshtein edit distance between sequences a and b.

    Only two rows of the dynamic-programming table are held at a time;
    the shorter input is laid along the row to keep them small.
    """
    shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
    width = len(shorter)
    row = list(range(width + 1))
    for i, long_item in enumerate(longer, start=1):
        above, row = row, [i] + [0] * width
        for j, short_item in enumerate(shorter, start=1):
            # Substitution is free when the two items are equal.
            substitute = above[j - 1] + (1 if short_item != long_item else 0)
            row[j] = min(above[j] + 1, row[j - 1] + 1, substitute)
    return row[width]
"""
- NB don't make abbreviation matching two way or "Dundee Utd" will get matched against "Dundee"
- NB levenshtein matching threshold increased to <= 2 for "Saint-Étienne"
- NB levenshtein promoted to 2nd place to avoid "Atlético" being matched to Real Madrid by token matcher
"""
# Ordered (label, predicate) pairs; match_team tries them first to last
# and stops at the first predicate that accepts a team.  Each predicate
# receives (cleaned input text, cleaned candidate team name).
# NOTE(review): the note preceding this list warns against two-way
# abbreviation matching, yet both directions are registered here
# ("abbrev0" and "abbrev1") - confirm which is intended.
Matchers = [
    # Equality after (re-)normalising both sides.
    ("exact", lambda left, right: clean_text(left) == clean_text(right)),
    # At most two single-character edits apart.
    ("levenshtein", lambda left, right: levenshtein(left, right) <= 2),
    # Input text abbreviates the candidate name.
    ("abbrev0", lambda left, right: is_abbrev(left, right)),
    # Candidate name abbreviates the input text.
    ("abbrev1", lambda left, right: is_abbrev(right, left)),
    # At least half of all token pairs are equal.
    ("token", lambda left, right: tokenmatch(left, right) >= 0.5),
]
def match_team(text, teams):
    """Resolve ``text`` to a canonical team name, or ``None``.

    The entries of ``Matchers`` are applied in order through
    ``run_matcher``; the first truthy result is returned and later
    matchers are not evaluated.
    """
    results = (run_matcher(matcher, text, teams) for _label, matcher in Matchers)
    return next((found for found in results if found), None)
def match_teams(texts, teams):
    """Match every raw team name in ``texts`` against ``teams``.

    Args:
        texts: iterable of raw team names.  It is consumed in a single
            pass, so one-shot iterators and generators are reported
            correctly (a second pass over an exhausted iterator would
            silently produce an empty "unmatched" list).
        teams: team definitions as accepted by ``match_team``.

    Returns:
        A dict with "matched" (raw name -> canonical team name) and
        "unmatched" (raw names for which ``match_team`` returned a falsy
        result, in input order).
    """
    matched = {}
    unmatched = []
    for text in texts:
        teamname = match_team(text, teams)
        if teamname:
            matched[text] = teamname
        else:
            unmatched.append(text)
    return {"matched": matched,
            "unmatched": unmatched}
def match_event(text, teams):
    """Resolve an event description such as "A v B" or "A vs B".

    The text is split on " v " / " vs " (case-insensitively).  Exactly
    two sides are required and both must resolve through ``match_team``.

    Args:
        text: raw event description.
        teams: team definitions as accepted by ``match_team``.

    Returns:
        "<team> vs <team>" built from the canonical names, or ``None``
        when the text does not split into exactly two sides or either
        side cannot be matched.
    """
    # The flag must be passed by keyword: the third positional parameter
    # of re.split is maxsplit, so a positional re.I is read as a split
    # limit and the pattern stays case-sensitive.
    rawteamnames = re.split(" vs? ", text, flags=re.I)
    if len(rawteamnames) != 2:
        return None
    teamnames = [match_team(rawteamname, teams)
                 for rawteamname in rawteamnames]
    if None in teamnames:
        return None
    return "%s vs %s" % tuple(teamnames)
def match_events(texts, teams):
    """Match every raw event description in ``texts`` against ``teams``.

    Args:
        texts: iterable of raw event descriptions.  It is consumed in a
            single pass, so one-shot iterators and generators are
            reported correctly (a second pass over an exhausted iterator
            would silently produce an empty "unmatched" list).
        teams: team definitions as accepted by ``match_event``.

    Returns:
        A dict with "matched" (raw text -> normalised event name) and
        "unmatched" (raw texts for which ``match_event`` returned a
        falsy result, in input order).
    """
    matched = {}
    unmatched = []
    for text in texts:
        eventname = match_event(text, teams)
        if eventname:
            matched[text] = eventname
        else:
            unmatched.append(text)
    return {"matched": matched,
            "unmatched": unmatched}
if __name__ == "__main__":
    # Demo run: match every key of samples.yaml against the teams listed
    # in teams.yaml and print the matched/unmatched report as YAML.
    import yaml

    def _load_yaml(path):
        """Parse the YAML file at ``path``, closing the file afterwards."""
        # NOTE(review): FullLoader is kept as in the original; it is only
        # appropriate because these are local, trusted files.
        with open(path) as stream:
            return yaml.load(stream.read(), Loader=yaml.FullLoader)

    teams = _load_yaml("teams.yaml")
    samples = _load_yaml("samples.yaml")
    print(yaml.safe_dump(match_teams(samples.keys(), teams),
                         default_flow_style=False))
---
- name: Arsenal
- name: Aston Villa
- name: Brighton
- name: Bournemouth
- name: Burnley
- name: Chelsea
- name: Crystal Palace
- name: Everton
- name: Leicester
- name: Liverpool
- name: Man City
- name: Man Utd
- name: Newcastle
- name: Norwich
- name: Sheffield Utd
- name: Southampton
- name: Tottenham
  alt_names:
  - Spurs
- name: Watford
- name: West Ham
- name: Wolves
  alt_names:
  - Wolverhampton Wanderers
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment