jhw · October 20, 2019 16:14
diff --git a/.TEAM_MATCHER.md b/.TEAM_MATCHER.md
diff --git a/requirements.txt b/requirements.txt
 pyyaml
diff --git a/samples.yaml b/samples.yaml
 ---
 AFC Bournemouth: Bournemouth
 Arsenal: Arsenal
 Aston Villa: Aston Villa
 Bournemouth: Bournemouth
 Brighton: Brighton
 Brighton & Hove Albion: Brighton
 Burnley: Burnley
 Chelsea: Chelsea
 Crystal Palace: Crystal Palace
 Everton: Everton
 Leicester: Leicester
 Leicester City: Leicester
 Liverpool: Liverpool
 Man City: Man City
 Man United: Man Utd
 Man Utd: Man Utd
 Manchester City: Man City
 Manchester United: Man Utd
 Newcastle: Newcastle
 Newcastle United: Newcastle
 Norwich: Norwich
 Norwich City: Norwich
 Sheffield United: Sheffield Utd
 Sheffield Utd: Sheffield Utd
 Southampton: Southampton
 Tottenham: Tottenham
 Tottenham Hotspur: Tottenham
 Watford: Watford
 West Ham: West Ham
 West Ham United: West Ham
 Wolverhampton Wanderers: Wolves
 Wolves: Wolves
diff --git a/team_matcher.py b/team_matcher.py
 import re

 """
 - don't use \\W here as it would remove accented characters
 """

 def clean_text(text,
                pattern="\\'|\\-|\\.|\\,|\\:|\\;|\\(|\\)|\\/",
                reject=["Real"]):
     """Normalise a team name so that two spellings can be compared.

     The text is split on whitespace. Empty tokens, tokens listed in
     ``reject`` and tokens made only of digits are discarded; everything
     matched by ``pattern`` is stripped from the tokens that remain.
     Unless exactly one token is left, tokens that equal their own
     upper-casing (e.g. a leading "AFC") are dropped as well; a lone token
     such as "QPR" is kept. The survivors are lower-cased and joined with
     single spaces.
     """
     kept = []
     for word in re.split("\\s", text):
         if word == '' or word in reject:
             continue
         if re.search("^\\d+$", word) is not None:
             # purely numeric token
             continue
         kept.append(re.sub(pattern, "", word))
     if len(kept) != 1:  # a single token (e.g. QPR) is never filtered out
         kept = [word for word in kept if word != word.upper()]
     return " ".join(word.lower() for word in kept)

 def run_matcher(matcher, text, teams):
     """Return the name of the first team accepted by ``matcher``, else None.

     ``text`` is cleaned once with clean_text. Each team's "name", followed
     by any "alt_names", is cleaned and passed to ``matcher`` as
     ``matcher(cleaned_text, cleaned_candidate)``. Teams are tried in the
     order given, and the canonical ``team["name"]`` is returned even when
     an alternative name produced the hit.
     """
     needle = clean_text(text)
     for team in teams:
         candidates = [team["name"]]
         if "alt_names" in team:
             candidates.extend(team["alt_names"])
         # any() stops at the first accepted candidate
         if any(matcher(needle, clean_text(candidate))
                for candidate in candidates):
             return team["name"]
     return None

 """
 https://stackoverflow.com/questions/7331462/check-if-a-string-is-a-possible-abbrevation-for-a-name
 """

 def is_abbrev(abbrev, text):
     """Return True when ``abbrev`` can be read as an abbreviation of ``text``.

     An empty ``abbrev`` always matches; a non-empty ``abbrev`` never
     matches empty ``text``. Otherwise the first characters must be equal,
     and the rest of ``abbrev`` must abbreviate either the text after the
     first word or some suffix of ``text`` starting within the span of the
     first word's length.
     """
     words = text.split()
     if not abbrev:
         return True
     if not text:
         return False
     if abbrev[0] != text[0]:
         return False
     rest = abbrev[1:]
     # Option 1: jump straight to the next word.
     if is_abbrev(rest, ' '.join(words[1:])):
         return True
     # Option 2: consume further characters, one offset at a time.
     for offset in range(len(words[0])):
         if is_abbrev(rest, text[offset + 1:]):
             return True
     return False

 def tokenmatch(x, y):
     """Score the token overlap between two space-separated strings.

     Counts every (token of x, token of y) pair that is equal and divides
     by the total number of such pairs, giving a value between 0 and 1.
     """
     left = x.split(" ")
     right = y.split(" ")
     shared = sum(1 for a in left for b in right if a == b)
     return shared / (len(left) * len(right))
    
 """
 http://hetland.org/coding/python/levenshtein.py
 """

 def levenshtein(a, b):
     """Return the edit distance between ``a`` and ``b``.

     Insertions, deletions and substitutions each cost 1. Only two rows of
     the table are held at a time, and the shorter input is used for the
     row width.
     """
     if len(a) > len(b):
         a, b = b, a
     width = len(a)
     row = range(width + 1)
     for i, long_item in enumerate(b, start=1):
         above, row = row, [i] + [0] * width
         for j, short_item in enumerate(a, start=1):
             substitute = above[j - 1] + (1 if short_item != long_item else 0)
             row[j] = min(above[j] + 1, row[j - 1] + 1, substitute)
     return row[width]

 """
 - NB don't make abbreviation matching two way or "Dundee Utd" will get matched against "Dundee"
 - NB levenshtein matching threshold increased to <= 2 for "Saint-Étienne"
 - NB levenshtein promoted to 2nd place to avoid "Atlético" being matched to Real Madrid by token matcher
 """

 # Ordered (label, predicate) pairs. match_team tries them first to last and
 # keeps the first hit; run_matcher calls each predicate as
 # predicate(cleaned_text, cleaned_candidate_name).
 # NOTE(review): both abbreviation directions are registered here although
 # the notes preceding this list warn against two-way abbreviation
 # matching - confirm which is intended.
 Matchers=[
     ("exact", lambda a, b: clean_text(a)==clean_text(b)),
     ("levenshtein", lambda a, b: levenshtein(a, b) <= 2),
     ("abbrev0", lambda a, b: is_abbrev(a, b)),
     ("abbrev1", lambda a, b: is_abbrev(b, a)),
     ("token", lambda a, b: tokenmatch(a, b) >= 0.5),
 ]

 def match_team(text, teams):
     """Return the canonical team name for ``text``, or None.

     The entries of ``Matchers`` are tried in order through run_matcher;
     the first truthy result is returned and later matchers are not run.
     """
     attempts = (run_matcher(matcher, text, teams)
                 for _label, matcher in Matchers)
     return next((found for found in attempts if found), None)

 def match_teams(texts, teams):
     """Match every entry of ``texts`` against ``teams``.

     Returns a dict with "matched" (input text -> canonical team name) and
     "unmatched" (the input texts, in order, for which match_team found
     nothing).
     """
     lookups = ((raw, match_team(raw, teams)) for raw in texts)
     matched = {raw: found for raw, found in lookups if found}
     unmatched = [raw for raw in texts if raw not in matched]
     return {"matched": matched, "unmatched": unmatched}

 def match_event(text, teams):
     """Resolve an event label such as "Arsenal v Chelsea" to canonical names.

     ``text`` is split on " v " / " vs " (case-insensitively). When exactly
     two sides result and match_team resolves both, the string
     "<team> vs <team>" is returned; otherwise None.
     """
     # re.I must be passed as ``flags``; given positionally it is taken as
     # ``maxsplit`` and the split stays case-sensitive.
     rawteamnames = re.split(" vs? ", text, flags=re.I)
     if len(rawteamnames) != 2:
         return None
     teamnames = [match_team(rawteamname, teams)
                  for rawteamname in rawteamnames]
     if None in teamnames:
         return None
     return "%s vs %s" % tuple(teamnames)

 def match_events(texts, teams):
     """Match every event label in ``texts`` against ``teams``.

     Returns a dict with "matched" (input text -> value returned by
     match_event) and "unmatched" (the input texts, in order, for which
     match_event returned nothing).
     """
     lookups = ((raw, match_event(raw, teams)) for raw in texts)
     matched = {raw: event for raw, event in lookups if event}
     unmatched = [raw for raw in texts if raw not in matched]
     return {"matched": matched, "unmatched": unmatched}
 if __name__=="__main__":
     # Demo run: match every key of samples.yaml against teams.yaml and
     # print the matched/unmatched report as YAML.
     import yaml
     # NOTE(review): FullLoader is kept as in the original; if these files
     # only ever hold plain mappings and lists, yaml.safe_load would be the
     # stricter choice.
     # Context managers ensure both files are closed after reading.
     with open("teams.yaml") as teams_file:
         teams=yaml.load(teams_file.read(),
                         Loader=yaml.FullLoader)
     with open("samples.yaml") as samples_file:
         samples=yaml.load(samples_file.read(),
                           Loader=yaml.FullLoader)
     print (yaml.safe_dump(match_teams(samples.keys(), teams),
                           default_flow_style=False))
diff --git a/teams.yaml b/teams.yaml
 ---
 - name: Arsenal
 - name: Aston Villa
 - name: Brighton
 - name: Bournemouth
 - name: Burnley
 - name: Chelsea
 - name: Crystal Palace
 - name: Everton
 - name: Leicester
 - name: Liverpool
 - name: Man City
 - name: Man Utd
 - name: Newcastle
 - name: Norwich
 - name: Sheffield Utd
 - name: Southampton
 - name: Tottenham
  alt_names:
    - Spurs
 - name: Watford
 - name: West Ham
 - name: Wolves
  alt_names:
    - Wolverhampton Wanderers
	---
	AFC Bournemouth: Bournemouth
	Arsenal: Arsenal
	Aston Villa: Aston Villa
	Bournemouth: Bournemouth
	Brighton: Brighton
	Brighton & Hove Albion: Brighton
	Burnley: Burnley
	Chelsea: Chelsea
	Crystal Palace: Crystal Palace
	Everton: Everton
	Leicester: Leicester
	Leicester City: Leicester
	Liverpool: Liverpool
	Man City: Man City
	Man United: Man Utd
	Man Utd: Man Utd
	Manchester City: Man City
	Manchester United: Man Utd
	Newcastle: Newcastle
	Newcastle United: Newcastle
	Norwich: Norwich
	Norwich City: Norwich
	Sheffield United: Sheffield Utd
	Sheffield Utd: Sheffield Utd
	Southampton: Southampton
	Tottenham: Tottenham
	Tottenham Hotspur: Tottenham
	Watford: Watford
	West Ham: West Ham
	West Ham United: West Ham
	Wolverhampton Wanderers: Wolves
	Wolves: Wolves
	import re

	"""
	- don't use \\W here as it would remove accented characters
	"""

	def clean_text(text,
	               pattern="\\'|\\-|\\.|\\,|\\:|\\;|\\(|\\)|\\/",
	               reject=["Real"]):
	    """Normalise a team name so that two spellings can be compared.

	    The text is split on whitespace. Empty tokens, tokens listed in
	    ``reject`` and tokens made only of digits are discarded; everything
	    matched by ``pattern`` is stripped from the tokens that remain.
	    Unless exactly one token is left, tokens that equal their own
	    upper-casing (e.g. a leading "AFC") are dropped as well; a lone token
	    such as "QPR" is kept. The survivors are lower-cased and joined with
	    single spaces.
	    """
	    # The default pattern is an alternation of single punctuation
	    # characters; the "|" separators must stay unescaped.
	    tokens = [re.sub(pattern, "", tok)
	              for tok in re.split("\\s", text)
	              if (tok != '' and
	                  tok not in reject and
	                  re.search("^\\d+$", tok) is None)]
	    if len(tokens) != 1:  # a single token (e.g. QPR) is never filtered out
	        tokens = [tok for tok in tokens
	                  if not tok == tok.upper()]
	    return " ".join([tok.lower()
	                     for tok in tokens])

	def run_matcher(matcher, text, teams):
	    """Return the name of the first team accepted by ``matcher``, else None.

	    ``text`` is cleaned once with clean_text. Each team's "name", followed
	    by any "alt_names", is cleaned and passed to ``matcher`` as
	    ``matcher(cleaned_text, cleaned_candidate)``. The canonical
	    ``team["name"]`` is returned even when an alternative name matched.
	    """
	    cleantext = clean_text(text)
	    for team in teams:
	        teamnames = [team["name"]]
	        if "alt_names" in team:
	            teamnames += team["alt_names"]
	        for teamname in teamnames:
	            cleanteamname = clean_text(teamname)
	            if matcher(cleantext, cleanteamname):
	                return team["name"]
	    return None

	"""
	https://stackoverflow.com/questions/7331462/check-if-a-string-is-a-possible-abbrevation-for-a-name
	"""

	def is_abbrev(abbrev, text):
	    """Return True when ``abbrev`` can be read as an abbreviation of ``text``.

	    An empty ``abbrev`` always matches; a non-empty ``abbrev`` never
	    matches empty ``text``. Otherwise the first characters must be equal,
	    and the rest of ``abbrev`` must abbreviate either the text after the
	    first word or some suffix of ``text`` starting within the span of the
	    first word's length.
	    """
	    words = text.split()
	    if not abbrev:
	        return True
	    if abbrev and not text:
	        return False
	    if abbrev[0] != text[0]:
	        return False
	    else:
	        return (is_abbrev(abbrev[1:], ' '.join(words[1:])) or
	                any(is_abbrev(abbrev[1:], text[i+1:])
	                    for i in range(len(words[0]))))

	def tokenmatch(x, y):
	    """Score the token overlap between two space-separated strings.

	    Counts every (token of x, token of y) pair that is equal and divides
	    by the total number of such pairs, giving a value between 0 and 1.
	    """
	    xtokens, ytokens = x.split(" "), y.split(" ")
	    count = 0
	    for xtok in xtokens:
	        for ytok in ytokens:
	            if xtok == ytok:
	                count += 1
	    return count/(len(xtokens)*len(ytokens))

	"""
	http://hetland.org/coding/python/levenshtein.py
	"""

	def levenshtein(a, b):
	    """Return the edit distance between ``a`` and ``b``.

	    Insertions, deletions and substitutions each cost 1. Only two rows of
	    the table are held at a time, and the shorter input is used for the
	    row width.
	    """
	    n, m = len(a), len(b)
	    if n > m:
	        # keep ``a`` as the shorter input
	        a, b = b, a
	        n, m = m, n
	    current = range(n+1)
	    for i in range(1, m+1):
	        previous, current = current, [i]+[0]*n
	        for j in range(1, n+1):
	            add, delete = previous[j]+1, current[j-1]+1
	            change = previous[j-1]
	            if a[j-1] != b[i-1]:
	                change = change+1
	            current[j] = min(add, delete, change)
	    return current[n]

	"""
	- NB don't make abbreviation matching two way or "Dundee Utd" will get matched against "Dundee"
	- NB levenshtein matching threshold increased to <= 2 for "Saint-Étienne"
	- NB levenshtein promoted to 2nd place to avoid "Atlético" being matched to Real Madrid by token matcher
	"""

	# Ordered (label, predicate) pairs. match_team tries them first to last and
	# keeps the first hit; run_matcher calls each predicate as
	# predicate(cleaned_text, cleaned_candidate_name).
	# NOTE(review): both abbreviation directions are registered here although
	# the notes preceding this list warn against two-way abbreviation
	# matching - confirm which is intended.
	Matchers=[
	    ("exact", lambda a, b: clean_text(a)==clean_text(b)),
	    ("levenshtein", lambda a, b: levenshtein(a, b) <= 2),
	    ("abbrev0", lambda a, b: is_abbrev(a, b)),
	    ("abbrev1", lambda a, b: is_abbrev(b, a)),
	    ("token", lambda a, b: tokenmatch(a, b) >= 0.5),
	]

	def match_team(text, teams):
	    """Return the canonical team name for ``text``, or None.

	    The entries of ``Matchers`` are tried in order through run_matcher;
	    the first truthy result is returned and later matchers are not run.
	    """
	    for matchername, matcher in Matchers:
	        teamname = run_matcher(matcher, text, teams)
	        if teamname:
	            return teamname
	    return None

	def match_teams(texts, teams):
	    """Match every entry of ``texts`` against ``teams``.

	    Returns a dict with "matched" (input text -> canonical team name) and
	    "unmatched" (the input texts, in order, for which match_team found
	    nothing).
	    """
	    matched = {}
	    for text in texts:
	        teamname = match_team(text, teams)
	        if teamname:
	            matched[text] = teamname
	    return {"matched": matched,
	            "unmatched": [text for text in texts
	                          if text not in matched]}

	def match_event(text, teams):
	    """Resolve an event label such as "Arsenal v Chelsea" to canonical names.

	    ``text`` is split on " v " / " vs " (case-insensitively). When exactly
	    two sides result and match_team resolves both, the string
	    "<team> vs <team>" is returned; otherwise None.
	    """
	    # re.I must be passed as ``flags``; given positionally it is taken as
	    # ``maxsplit`` and the split stays case-sensitive.
	    rawteamnames = re.split(" vs? ", text, flags=re.I)
	    if len(rawteamnames) != 2:
	        return None
	    teamnames = [match_team(rawteamname, teams)
	                 for rawteamname in rawteamnames]
	    if None in teamnames:
	        return None
	    return "%s vs %s" % tuple(teamnames)

	def match_events(texts, teams):
	    """Match every event label in ``texts`` against ``teams``.

	    Returns a dict with "matched" (input text -> value returned by
	    match_event) and "unmatched" (the input texts, in order, for which
	    match_event returned nothing).
	    """
	    matched = {}
	    for text in texts:
	        eventname = match_event(text, teams)
	        if eventname:
	            matched[text] = eventname
	    return {"matched": matched,
	            "unmatched": [text for text in texts
	                          if text not in matched]}
	if __name__=="__main__":
	    # Demo run: match every key of samples.yaml against teams.yaml and
	    # print the matched/unmatched report as YAML.
	    import yaml
	    # NOTE(review): FullLoader is kept as in the original; if these files
	    # only ever hold plain mappings and lists, yaml.safe_load would be the
	    # stricter choice.
	    # Context managers ensure both files are closed after reading.
	    with open("teams.yaml") as teams_file:
	        teams=yaml.load(teams_file.read(),
	                        Loader=yaml.FullLoader)
	    with open("samples.yaml") as samples_file:
	        samples=yaml.load(samples_file.read(),
	                          Loader=yaml.FullLoader)
	    print (yaml.safe_dump(match_teams(samples.keys(), teams),
	                          default_flow_style=False))
	---
	- name: Arsenal
	- name: Aston Villa
	- name: Brighton
	- name: Bournemouth
	- name: Burnley
	- name: Chelsea
	- name: Crystal Palace
	- name: Everton
	- name: Leicester
	- name: Liverpool
	- name: Man City
	- name: Man Utd
	- name: Newcastle
	- name: Norwich
	- name: Sheffield Utd
	- name: Southampton
	- name: Tottenham
	  alt_names:
	    - Spurs
	- name: Watford
	- name: West Ham
	- name: Wolves
	  alt_names:
	    - Wolverhampton Wanderers