Last active
February 23, 2024 04:18
-
-
Save sonnyksimon/bc1c65bdfd28844c395a4d6af751bec3 to your computer and use it in GitHub Desktop.
string similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Can we implement all of these algorithms? | |
Similarity Algorithms: | |
- Cosine | |
- Fuzzy Wuzzy | |
- Jaccard | |
- Jaro | |
- Jaro Winkler | |
- Q-gram | |
- Sørensen-Dice | |
Distance Algorithms: | |
- Damerau Levenshtein | |
- LCS Edit | |
- Levenshtein | |
- OSA (Optimal String Alignment) Damerau Levenshtein | |
Phonetic Algorithms: | |
- Cologne | |
- Soundex | |
- Metaphone | |
/path/to/python run.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import math | |
def _tokenize(text):
    """Split *text* into runs of ASCII letters and digits.

    Every other character acts as a separator. The tokens are matched
    directly instead of splitting on the separators, so leading or trailing
    separators (and an empty input) never produce empty-string tokens.

    Args:
        text: The string to tokenize.

    Returns:
        The alphanumeric runs found in *text*, in order of appearance; an
        empty list when there are none.
    """
    return re.findall(r'[A-Za-z0-9]+', text)
def _get_tokens(text):
    """Return the lower-cased tokens of *text*, leaving out numeric ones.

    Args:
        text: The string to break into tokens.

    Returns:
        A list of lower-cased tokens in their original order. Tokens for
        which ``str.isnumeric()`` is true are dropped.
    """
    tokens = []
    for word in _tokenize(text):
        if word.isnumeric():
            # Purely numeric tokens are excluded from the comparison.
            continue
        tokens.append(word.lower())
    return tokens
def _compute_frequency(arr, commons):
    """Count how often each word of *commons* occurs in *arr*.

    Args:
        arr: Sequence of (hashable) tokens to count.
        commons: Words to look up; the result follows this order.

    Returns:
        A list with one count per entry of *commons*; words that do not
        occur in *arr* get 0.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # Tally arr once instead of rescanning it for every word in commons.
    counts = Counter(arr)
    return [counts[word] for word in commons]
def _compute_vector_ab(v1, v2):
    """Return the dot product of two frequency vectors.

    Components are paired positionally; if the vectors differ in length the
    extra components of the longer one are ignored.

    Args:
        v1: First vector (iterable of numbers).
        v2: Second vector (iterable of numbers).

    Returns:
        The sum of the pairwise products (0 for empty input).
    """
    dot_product = 0
    for left, right in zip(v1, v2):
        dot_product += left * right
    return dot_product
def _abs_vector(v):
    """Return the Euclidean length (magnitude) of vector *v*.

    Args:
        v: Iterable of numeric components.

    Returns:
        The square root of the sum of the squared components, as a float.
    """
    squared_sum = 0
    for component in v:
        squared_sum += component * component
    return math.sqrt(squared_sum)
def _vector_similarity(vAB, a, b):
    """Return the cosine of the angle between two vectors.

    Args:
        vAB: Dot product of the two vectors.
        a: Magnitude of the first vector.
        b: Magnitude of the second vector.

    Returns:
        ``vAB / (a * b)``. When either magnitude is zero the cosine is
        undefined; ``0.0`` is returned in that case instead of raising
        ``ZeroDivisionError``.
    """
    denominator = a * b
    if not denominator:
        # A zero-length vector has no direction to compare against.
        return 0.0
    return vAB / denominator
def cosine_similarity(string1, string2):
    """Compute the cosine similarity of two strings' token-frequency vectors.

    Both strings are tokenized, each is turned into a vector of token counts
    over the combined vocabulary, and the cosine of the angle between the
    two vectors is returned.

    Args:
        string1: First string to compare.
        string2: Second string to compare.

    Returns:
        A float: 1.0 for identical token distributions, 0.0 when the strings
        share no tokens or when either string yields no tokens at all.
    """
    arr1 = _get_tokens(string1)
    arr2 = _get_tokens(string2)
    if not arr1 or not arr2:
        # A string without tokens gives a zero vector, for which the cosine
        # is undefined; report "no similarity" rather than dividing by zero.
        return 0.0
    # The vocabulary must cover BOTH strings and list each token once:
    # using only string1's tokens ignores words unique to string2 (inflating
    # the score) and repeated tokens would be weighted more than once.
    commons = list(dict.fromkeys(arr1 + arr2))
    v1 = _compute_frequency(arr1, commons)
    v2 = _compute_frequency(arr2, commons)
    vAB = _compute_vector_ab(v1, v2)
    a = _abs_vector(v1)
    b = _abs_vector(v2)
    return _vector_similarity(vAB, a, b)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import datetime | |
# Building blocks of the terminal escape sequences used for coloured output.
color_prefix = "\x1b["
color_separator = ";"
color_suffix = "m"
# NOTE(review): "20" is the parameter emitted for non-bold text; the usual
# normal-intensity SGR code is 22 - confirm this value is intentional.
color_thin = "20"
color_bold = "1"


def color(_color, bold):
    """Build the escape sequence for colour code *_color*.

    Args:
        _color: The colour parameter placed first in the sequence.
        bold: When truthy ``color_bold`` is used as the second parameter,
            otherwise ``color_thin``.

    Returns:
        The string ``color_prefix + _color + ";" + weight + color_suffix``.
    """
    weight = color_bold if bold else color_thin
    return f"{color_prefix}{_color}{color_separator}{weight}{color_suffix}"


# Ready-made sequences, regular and bold, for each colour in use.
color_red = color("31", bold=False)
color_bold_red = color("31", bold=True)
color_green = color("32", bold=False)
color_bold_green = color("32", bold=True)
color_yellow = color("33", bold=False)
color_bold_yellow = color("33", bold=True)
color_blue = color("34", bold=False)
color_bold_blue = color("34", bold=True)
# NOTE(review): SGR 38 normally introduces an extended colour and expects
# further parameters - confirm that "38;20" renders as intended.
color_grey = color("38", bold=False)
# Sequence that switches every attribute back to the terminal default.
color_reset = "\x1b[0m"
def log_timestamped(msgs, _level, _color):
    """Print one timestamped, colourised log line to standard output.

    Args:
        msgs: Iterable of message parts; each is converted with ``str`` and
            the parts are joined with single spaces.
        _level: Level label printed after the timestamp.
        _color: Escape sequence applied to the level label.
    """
    timestamp = datetime.datetime.now()
    body = ' '.join(str(m) for m in msgs)
    print(f"{color_bold_blue}[{timestamp}]{color_reset} {_color}{_level}{color_reset} {body}")


def log_info(*msgs):
    """Log *msgs* at INFO level (bold green label)."""
    log_timestamped(msgs=msgs, _level="INFO", _color=color_bold_green)


def log_error(*msgs):
    """Log *msgs* at ERROR level (bold red label)."""
    log_timestamped(msgs=msgs, _level="ERROR", _color=color_bold_red)


def log_debug(*msgs):
    """Log *msgs* at DEBUG level (bold yellow label)."""
    log_timestamped(msgs=msgs, _level="DEBUG", _color=color_bold_yellow)
# Matches any single non-word character (punctuation, symbols, whitespace).
_normalize_regex = re.compile(r'(?ui)\W')
# Matches a run of one or more whitespace characters.
_whitespace_regex = re.compile(r'\s+')


def normalize_text(text):
    """Reduce *text* to lower-case words separated by single spaces.

    Args:
        text: Any value; ``None`` is treated as empty and everything else is
            converted with ``str`` first.

    Returns:
        The text with every non-word character replaced by a space, outer
        whitespace removed, letters lower-cased and whitespace runs collapsed
        to one space. Underscores are kept because ``\\W`` does not match
        them.
    """
    if text is None:
        return ""
    spaced = _normalize_regex.sub(' ', str(text))
    return _whitespace_regex.sub(' ', spaced.strip().lower())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import cossim | |
from helpers import normalize_text, log_info, log_error, log_debug | |
# Demo: compare two sample names and log every intermediate step.
s1 = "ELON MUSK"
s2 = "COLON MUSK"

# Show the raw inputs.
log_debug("string1", s1)
log_debug("string2", s2)

# Normalize both strings before comparing them, then show the result.
s1, s2 = normalize_text(s1), normalize_text(s2)
log_debug("string1-normalized", s1)
log_debug("string2-normalized", s2)

try:
    cosine_similarity = cossim.cosine_similarity(s1, s2)
except Exception as exc:
    # Best effort: report the failure and fall back to "no similarity".
    cosine_similarity = 0.0
    log_error("compute-error", str(exc))

log_info("cosine-similarity", cosine_similarity)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment