Last active
June 10, 2021 10:43
-
-
Save alexsavio/bcfd5b17e041833ef73f62d90f56951c to your computer and use it in GitHub Desktop.
Recapitalize names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Recapitalize a string of words that has gone through pre-processing, word-cutting and case-lowering.
""" | |
import re | |
import difflib | |
from typing import Iterator, List, Tuple | |
def recapitalize_name(original: str, result: str) -> str:
    """Return the `result` with the words capitalized as they appear in `original`.

    When both strings have the same length, the capital-letter positions of
    `original` are applied to `result` character by character.

    Otherwise both strings are split on whitespace, every word of `result` is
    paired with its closest word of `original`, and the capital-letter
    positions of that original word are applied to the result word. The
    recapitalized words are joined back with single spaces.
    """
    if len(result) == len(original):
        indices = _find_capital_letters(original)
        return _capitalize(result, indices)

    original_words = original.split()
    result_words = result.split()
    word_matches = _find_word_matches(result_words, original_words)

    recapitalized_words = []
    for word, original_word in word_matches:
        if word == original_word:
            recapitalized_word = word
        else:
            # The match is fuzzy, so `original_word` may be longer than `word`
            # (e.g. "HelloWorlD" matched by "helloworl"). Drop the capital
            # positions that fall past the end of `word`; passing them on
            # would make `_capitalize` fail with an IndexError.
            word_length = len(word)
            indices = [
                index
                for index in _find_capital_letters(original_word)
                if index < word_length
            ]
            recapitalized_word = _capitalize(word, indices)
        recapitalized_words.append(recapitalized_word)
    return ' '.join(recapitalized_words)
def _capitalize(word: str, indices: Iterator[int]) -> str: | |
"""Capitalize the characters in `word` indicated in `indices`. | |
To make this function public, it should raise an IndexError exception.""" | |
if not indices or not word: | |
return word | |
characters = list(word) | |
for i in indices: | |
characters[i] = word[i].upper() | |
return "".join(characters) | |
def _find_capital_letters(word: str) -> List[int]: | |
"""Return the list of indices where the capital letters are found in `word`. | |
Only works on [A-Z] letters. | |
""" | |
if not word: | |
return [] | |
capital_letters = re.compile(r'[A-Z]') | |
return [group.start() for group in capital_letters.finditer(word)] | |
def _find_word_matches(result_words: Iterator[str], original_words: Iterator[str],) -> List[Tuple[str, str]]: | |
"""Find the words in `original_words` that match the words in `result_words`. | |
Return a list of pairs (<result word>, <matching original>). | |
This is not perfect with words lengths < 3. | |
""" | |
def _get_diff_match_cutoff(word_length: int) -> float: | |
if word_length < 4: | |
return 1 - 1/word_length | |
return 0.8 | |
lowered_source_words = {word.lower(): word for word in original_words} | |
result_words_matches = [] | |
for word in result_words: | |
cutoff = _get_diff_match_cutoff(len(word)) | |
closest_matches = difflib.get_close_matches(word, lowered_source_words.keys(), n=1, cutoff=cutoff) | |
closest_match = lowered_source_words[closest_matches[0]] if closest_matches else word | |
result_words_matches.append((word, closest_match)) | |
return result_words_matches |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment