Skip to content

Instantly share code, notes, and snippets.

@bgusach
Last active July 10, 2024 14:14
Show Gist options
  • Save bgusach/a967e0587d6e01e889fd1d776c5f3729 to your computer and use it in GitHub Desktop.
Save bgusach/a967e0587d6e01e889fd1d776c5f3729 to your computer and use it in GitHub Desktop.
Python string multireplacement
def multireplace(string, replacements, ignore_case=False):
    """
    Given a string and a replacement map, it returns the replaced string.

    All replacements are performed in a single pass over ``string``, so the
    output of one replacement is never fed into another one.

    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :rtype: str
    """
    if not replacements:
        # Edge case: an empty map would produce an empty regex that matches
        # at every position of the string.
        return string

    if ignore_case:
        # Collapse keys that differ only by case so that each alternative
        # appears once in the regex. For instance with {"HEY": "lol"} we should
        # match "hey", "HEY", "hEy", etc. Later entries win, as in a plain dict.
        replacements = {key.lower(): val for key, val in replacements.items()}
        re_mode = re.IGNORECASE
    else:
        re_mode = 0

    # Place longer ones first to keep shorter substrings from matching where
    # the longer ones should take place. For instance given the replacements
    # {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should
    # produce 'hey ABC' and not 'hey ABc'.
    rep_sorted = sorted(replacements, key=len, reverse=True)
    new_values = [replacements[old] for old in rep_sorted]

    # Create a big OR regex in which every substring to replace gets its own
    # capturing group, so the group number identifies the alternative.
    pattern = re.compile(
        "|".join("({})".format(re.escape(old)) for old in rep_sorted),
        re_mode,
    )

    # Pick the replacement by the number of the group that matched instead of
    # re-normalizing the matched text: re.IGNORECASE folds more characters than
    # str.lower() does (e.g. "s" also matches U+017F, whose lower() is itself),
    # so a dictionary lookup on the lowered match could raise KeyError.
    return pattern.sub(lambda match: new_values[match.lastindex - 1], string)
@mnesarco
Copy link

mnesarco commented Sep 15, 2021

Based on @bgusach and @elidchan proposals, I have created a version with support for basic regex replacement. The main restriction is that expressions must not contain subgroups, and there may be some edge cases:

import re

class StringReplacer:
    """
    Callable that applies several regex replacements to a string in one pass.

    Every key of ``replacements`` is used verbatim as a regular expression and
    every value is the literal text that replaces a match of that expression.
    Longer expressions (by source length) are tried first.

    Expressions may contain their own capturing groups, but numeric
    backreferences inside an expression are not supported because the
    expressions are renumbered when combined into a single pattern.

    :param dict replacements: {regular expression: replacement text}
    :param bool ignore_case: whether the match should be case insensitive
    """

    def __init__(self, replacements, ignore_case=False):
        patterns = sorted(replacements, key=len, reverse=True)
        self.replacements = [replacements[k] for k in patterns]
        re_mode = re.IGNORECASE if ignore_case else 0

        # Map the number of each wrapping group to the position of its
        # replacement. Capturing groups inside an expression shift the numbers
        # of the wrappers that follow it, so they are counted here.
        group_to_index = {}
        next_group = 1
        for index, p in enumerate(patterns):
            group_to_index[next_group] = index
            next_group += 1 + re.compile(p, re_mode).groups
        self._group_to_index = group_to_index

        if patterns:
            combined = '|'.join("({})".format(p) for p in patterns)
        else:
            # "(?!)" can never match, so an empty table leaves input untouched
            combined = "(?!)"
        self.pattern = re.compile(combined, re_mode)

        def tr(matcher):
            # The wrapping group is always the last group to close, so
            # lastindex names it even when it matched the empty string.
            return self.replacements[group_to_index[matcher.lastindex]]
        self.tr = tr

    def __call__(self, string):
        """Return ``string`` with every match replaced by its replacement text."""
        return self.pattern.sub(self.tr, string)

Tests

# Demo: each key is used as a regular expression, each value is the literal
# replacement text for matches of that expression.
table = {
    "aaa"    : "[This is three a]",
    "b+"     : "[This is one or more b]",
    r"<\w+>" : "[This is a tag]"
}

# Second positional argument is ignore_case, so "BBB" is matched by "b+".
replacer = StringReplacer(table, True)

sample1 = "whatever bb, aaa, <star> BBB <end>"

print(replacer(sample1))

# output: whatever [This is one or more b], [This is three a], [This is a tag] [This is one or more b] [This is a tag]

The trick is to identify the matched group by its position. It is not super efficient (O(n)), but it works.

index = next((index for index,value in enumerate(matcher.groups()) if value), None)

Replacement is done in one pass.

@gsalfourn
Copy link

How would one apply multireplace to strings in a pandas DataFrame?

@jrom99
Copy link

jrom99 commented Nov 8, 2021

Based on @mnesarco approach, I tried a functional one with support for one subgroup per expression:

import re
from typing import Dict, Union


def multireplace(table: Dict[str, str], string: str, flags: Union[int, re.RegexFlag] = 0) -> str:
    """
    Apply several regex replacements to ``string`` in a single pass.

    Every key of ``table`` is a regular expression and every value is the
    replacement text for its matches. A replacement text may refer to named
    groups of its own expression with ``\\g<name>``; references to groups
    that did not take part in the match are left untouched. Groups can only
    be referenced by name, not by number.

    :param table: {regular expression: replacement text}
    :param string: string to execute replacements on
    :param flags: ``re`` flags used for the combined expression
    :return: the string with all matches replaced
    """
    if not table:
        # An empty table would build an empty regex that matches everywhere
        # and has no replacement to look up.
        return string

    # Wrap every expression in a uniquely named group to identify which one matched
    patterns = {
        f"_g{n}": pattern for n, pattern in enumerate(table)
    }
    reference = re.compile(r"\\g<(\w+)>")

    def repl(match: re.Match) -> str:
        # The wrapping group is the last one to close, hence it is lastgroup
        repval = table[patterns[match.lastgroup]]

        # Named groups defined by the user that took part in this match
        captured = {
            key: value
            for key, value in match.groupdict().items()
            if value is not None and key not in patterns
        }
        if not captured:
            return repval

        # Substitute all references in one pass so that captured text which
        # itself looks like a reference is not expanded again.
        return reference.sub(lambda ref: captured.get(ref.group(1), ref.group(0)), repval)

    regex = "|".join(fr"(?P<{group}>{rep})" for group, rep in patterns.items())
    return re.sub(regex, repl, string, flags=flags)

Test

# Demo 1: expressions without named subgroups; the lookaround expression
# targets only the text between <spam> and </spam>.
table = {
    "aaa": "[This is three a]",
    "b+": "[This is one or more b]",
    r"(?<=<spam>).+(?=</spam>)": "[REDACTED]",
    r"</?\w+>": "[This is a tag]",
}

txt = multireplace(table, "whatever bb, aaa, <star> BBB <end> <tag>keep me</tag> and <spam>delete me</spam>", re.IGNORECASE)
print(txt)
# output: whatever [This is one or more b], [This is three a], [This is a tag] [This is one or more b] [This is a tag] [This is a tag]keep me[This is a tag] and [This is a tag][REDACTED][This is a tag]

# Demo 2: an expression with named subgroups whose replacement text refers
# to one of them via \g<value>.
table = {
    "aaa": "[This is three a]",
    "b+": "[This is one or more b]",
    r"<(?P<name>\w+)>(?P<value>.+)</(?P=name)>": r"[This is an HTML tag with text (\g<value>)]",
    r"</?\w+>": "[This is a tag]",
}

txt = multireplace(table, "whatever bb, aaa, <star> BBB <end> <tag>keep me</tag> and <spam>delete me</spam>", re.IGNORECASE)
print(txt)

# output: whatever [This is one or more b], [This is three a], [This is a tag] [This is one or more b] [This is a tag] [This is an HTML tag with text (keep me)] and [This is an HTML tag with text (delete me)]

It's still O(n). I don't know how priorities are set inside the main regex: they should be based on the dictionary order, but when there is competition (e.g. r"<(?P<name>\w+)>(?P<value>.+)</(?P=name)>" versus r"(?<=<spam>).+(?=</spam>)") the first one has precedence. Also, one cannot reference a group by its position, only by its name.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment