Last active
September 24, 2015 22:58
-
-
Save pschwede/823057 to your computer and use it in GitHub Desktop.
Reduces a set of strings to one regular expression string. Respects a black list.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
===RegexReduce=== | |
Goal is to generate regular expressions out of lists of strings to match | |
all strings in that list. | |
This happens by reducing strings to their unique substrings.. | |
@author: spazzpp2 | |
''' | |
import re | |
class RegexReduce: | |
def __init__(self): | |
self.regex = "" | |
self.allowed = [] | |
self.forbidden = [] | |
pass | |
def whitestr(self, str): | |
whitelist([str]) | |
def whitelist(self, listOfStrings): | |
self.allowed += listOfStrings | |
def blackstr(self, str): | |
self.blacklist([str]) | |
def blacklist(self, listOfStrings): | |
self.forbidden += listOfStrings | |
def __same(self, a, b): | |
if len(a) < len(b): | |
a, b = b, a | |
res = "" | |
i = 0 | |
while i < len(a)-len(res): | |
j = b.find(a[i]) | |
if j > -1: | |
c = 0 | |
while i+c < len(a) and j+c < len(b) and a[i+c] == b[j+c]: | |
c += 1 | |
if len(res) < c: | |
res = b[j:j+c] | |
i += 1 | |
return res | |
def __do(self, l, nolist): | |
# remove double strings | |
l = list(set(l)) | |
nolist = list(set(nolist)) | |
# gather similarites between the strings in l | |
eq = set([self.__same(a,b) for a in l for b in l if a!=b]) - set([""]) | |
# gather similarities between the strings in l and the strings in nolist | |
neq = set([self.__same(a,b) for a in l for b in nolist if a!=b]) - set([""]) | |
# remove similarities from eq which may lead to allow forbidden strings | |
eq -= neq | |
# make sure all allowed strings are still recognized | |
# for that we need to gather all strings which arent covered by eq | |
#eqall = [self.__same(a,b) for a in l+nolist for b in l+nolist] | |
res = set([]) | |
for k in l: | |
for e in eq: | |
if e in k: | |
break | |
else: | |
t = str(k) | |
allsame = list(set([self.__same(a,b) for a in neq|set(l) for b in neq|set(l) if a!=b])-set([''])) | |
allsame.sort(lambda a,b: cmp(len(a), len(b))) | |
while len(allsame) and len(allsame[0]) > 2: | |
t = t.replace(self.__escape(allsame[0]),".+",1) | |
allsame.remove(allsame[0]) | |
res |= set([t]) | |
s = "(" | |
if len(res): | |
s += "|".join(map(lambda a: self.__escape(a), res)) | |
if len(eq): | |
return s + "|" + ".*" + ".*|.*".join(map(lambda a: self.__escape(a), eq)) + ".*)" | |
else: | |
return s + ")" | |
def __escape(self, word): | |
return word.replace(".","\.").replace("^","\^") | |
def __str__(self): | |
self.regex = self.__do(self.allowed, self.forbidden) | |
return self.regex | |
if __name__ == "__main__": | |
rg = RegexGen() | |
#rg.addlist(["xx","abcabc", "efabc","ref","abef"]) | |
#rg.forbidlist(["xxx","cabc"]) | |
rg.addlist(["Python - Mozilla Firefox", "Python.org - Mozilla Firefox", "Pythonizer - Mozilla Firefox"]) | |
rg.forbidlist(["Ministry of Silly Walking (Monty) - Mozilla Firefox"]) | |
print rg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment