Last active
October 30, 2020 15:53
-
-
Save selimslab/8e80403b84c635e87bbf4e03455b9306 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List

from fuzzywuzzy import process, fuzz
def match_by_fuzzy_string_search(
    possible_matches: List[str],
    string_to_be_searched: str,
    threshold: int = 80,
) -> str:
    """Return the candidate that best fuzzy-matches a part of a longer string.

    For every distinct word count found among ``possible_matches``, the
    searched string is cut into word windows of that size with
    ``generate_ngrams``; each window is compared against all candidates with
    ``process.extractOne`` using the ``fuzz.ratio`` scorer. The best score
    seen for each candidate is remembered.

    Args:
        possible_matches: Candidate strings to look for.
        string_to_be_searched: Text that may contain one of the candidates.
        threshold: Score the winning candidate must strictly exceed in order
            to be returned. Defaults to 80.

    Returns:
        The highest-scoring candidate, or ``""`` when there are no candidates
        or none scores above ``threshold``.
    """
    # Candidates with the same word count yield identical windows, so each
    # window size is processed only once. dict.fromkeys keeps first-seen
    # order, which keeps the tie-breaking of max() below stable.
    window_sizes = dict.fromkeys(
        len(candidate.split()) for candidate in possible_matches
    )

    scores = {}
    for n in window_sizes:
        for n_gram in generate_ngrams(string_to_be_searched, n):
            possible_match, score = process.extractOne(
                n_gram, possible_matches, scorer=fuzz.ratio
            )
            if score > scores.get(possible_match, 0):
                scores[possible_match] = score

    if not scores:
        return ""

    most_possible_match = max(scores, key=scores.get)
    if scores[most_possible_match] > threshold:
        return most_possible_match
    return ""
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Turkish letters and their closest ASCII letters, in both cases.
_TR_TO_ENG_TABLE = str.maketrans(
    {
        "ş": "s",
        "ğ": "g",
        "ç": "c",
        "ı": "i",
        "ö": "o",
        "ü": "u",
        "Ş": "S",
        "Ğ": "G",
        "Ç": "C",
        "\u0130": "I",  # capital dotted I
        "Ö": "O",
        "Ü": "U",
    }
)


def tr_chars_to_eng(tr_str):
    """Replace Turkish-specific letters in ``tr_str`` with ASCII letters.

    Both lowercase and uppercase letters are converted; every other
    character is left untouched.

    Args:
        tr_str: Text that may contain Turkish letters.

    Returns:
        A new string with the Turkish letters replaced.
    """
    # One pass over the string instead of one .replace() call per letter.
    return tr_str.translate(_TR_TO_ENG_TABLE)
def create_eng_char_name_map(d, eng_map):
    """Record the ASCII spelling of the names found in a (nested) dict.

    Walks ``d`` recursively. For every entry whose value is not a dict, both
    the key and the value are added to ``eng_map``, each mapped to the result
    of ``tr_chars_to_eng``. Keys whose value is a nested dict are not added
    themselves; only the contents of the nested dict are visited.

    Args:
        d: Dictionary to walk; values may be nested dictionaries.
        eng_map: Dictionary that is updated in place with the results.

    Returns:
        The same ``eng_map`` object that was passed in.
    """
    # dict.items() replaces the Python 2-only iteritems().
    for k, v in d.items():
        if isinstance(v, dict):
            # Descend into the nested dict itself; recursing on ``d`` again
            # would never terminate.
            create_eng_char_name_map(v, eng_map)
        else:
            eng_map[k] = tr_chars_to_eng(k)
            eng_map[v] = tr_chars_to_eng(v)
    return eng_map
def generate_ngrams(s, n):
    """Split ``s`` into lowercase word windows of up to ``n`` tokens.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and the result is split into
    tokens on any whitespace.

    One window is produced per token, starting at that token. Windows that
    start fewer than ``n`` tokens before the end are therefore shorter than
    ``n`` tokens.

    Args:
        s: Text to tokenize.
        n: Maximum number of tokens per window.

    Returns:
        A list with one space-joined window per token of ``s``.
    """
    s = s.lower()
    # Replace everything that is not alphanumeric or whitespace with a space.
    s = re.sub(r"[^a-zA-Z0-9\s]", " ", s)
    # split() with no argument breaks on tabs and newlines as well as spaces
    # and never yields empty tokens.
    tokens = s.split()
    return [" ".join(tokens[i: i + n]) for i in range(len(tokens))]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def normalize(s: str) -> str:
    """Return an ASCII-only version of ``s``.

    The string is decomposed with Unicode NFKD normalization and then encoded
    to ASCII with errors ignored, so accents are stripped from letters and
    any character that still has no ASCII form is dropped.

    Args:
        s: Text to convert.

    Returns:
        The ASCII-only string.
    """
    # Imported here so the function does not depend on a module-level import.
    import unicodedata

    decomposed = unicodedata.normalize("NFKD", s)
    return decomposed.encode("ascii", "ignore").decode("ascii")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def string_sliding_windows(s: str):
    """Return every contiguous run of whitespace-separated words in ``s``.

    Runs are ordered by start position, then by length, e.g.
    ``"a b c"`` -> ``["a", "a b", "a b c", "b", "b c", "c"]``.
    """
    words = s.split()
    count = len(words)
    return [
        " ".join(words[start:stop])
        for start in range(count)
        for stop in range(start + 1, count + 1)
    ]
def string_to_extending_windows(s: str, end: int = None) -> list:
    """Return the growing prefixes of the words in ``s``.

    Example: ``"a b c"`` -> ``["a", "a b", "a b c"]``.

    Args:
        s: Text to split on whitespace.
        end: Maximum number of prefixes to return. ``None`` (the default)
            returns all of them; values larger than the number of words are
            clamped so the full string is not repeated; zero or negative
            values give an empty list.

    Returns:
        A list of space-joined prefixes, shortest first.
    """
    tokens = s.split()
    # Compare against None explicitly so that end=0 means "no prefixes"
    # rather than silently falling back to "all prefixes".
    if end is None:
        end = len(tokens)
    else:
        end = min(end, len(tokens))
    return [" ".join(tokens[:i]) for i in range(1, end + 1)]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 | |
import requests | |
def get_soup(url, timeout=30, verify=False):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the caller forever. Defaults to 30.
        verify: Whether to verify the server's TLS certificate. Defaults to
            ``False`` to keep the previous behavior.

    Returns:
        A ``bs4.BeautifulSoup`` object built from the response content with
        the ``lxml`` parser.
    """
    # SECURITY: with verify=False the TLS certificate is not checked, which
    # allows man-in-the-middle attacks. Pass verify=True unless the target
    # host is known to serve an invalid certificate.
    r = requests.get(url, verify=verify, timeout=timeout)
    return bs4.BeautifulSoup(r.content, features="lxml")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment