Skip to content

Instantly share code, notes, and snippets.

@andriisoldatenko
Created September 3, 2016 08:38
Show Gist options
  • Save andriisoldatenko/5014143931a57436205bfc96720720be to your computer and use it in GitHub Desktop.
Save andriisoldatenko/5014143931a57436205bfc96720720be to your computer and use it in GitHub Desktop.
import re
# Ordered (regex, replacement) pairs used to expand English contractions.
#
# Order matters: the specific irregular forms ("won't", "can't", "i'm",
# "ain't") come first so they are rewritten before the generic suffix rules
# (e.g. the "n't" rule) get a chance to match them.
#
# Both members of each pair are raw strings so that regex escapes such as
# ``\w`` and the group reference ``\g<1>`` reach the ``re`` module untouched
# instead of being treated as (invalid) Python string escapes.
replacement_patterns = [
    (r"won't", r"will not"),
    (r"can't", r"can not"),
    (r"i'm", r"i am"),
    (r"ain't", r"is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]
class RegexpReplacer(object):
    """Apply an ordered list of regex substitutions to a piece of text.

    Each pattern is compiled once at construction time; ``replace`` then
    runs every substitution in sequence, feeding the output of one into
    the next.
    """

    def __init__(self, patterns=None):
        """Compile the substitution rules.

        Args:
            patterns: Iterable of ``(regex, replacement)`` pairs. When
                omitted (or ``None``), the module-level
                ``replacement_patterns`` list is used.
        """
        # Resolve the default at call time rather than binding the shared
        # module-level list as a default argument value.
        if patterns is None:
            patterns = replacement_patterns
        self.patterns = [
            (re.compile(regex), repl) for regex, repl in patterns
        ]

    def replace(self, text):
        """Return ``text`` with every configured substitution applied.

        Substitutions run in the order the patterns were supplied, so an
        earlier rule can change what a later rule sees.
        """
        result = text
        for pattern, repl in self.patterns:
            # The pattern is already compiled; call its own ``sub``.
            result = pattern.sub(repl, result)
        return result
# Module-level replacer built with the default contraction patterns.
replacer = RegexpReplacer()

if __name__ == "__main__":
    # Demo output; guarded so that importing this module prints nothing.
    print(replacer.replace("can't tokenize this"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment