pschwede · September 24, 2015 22:58
diff --git a/RegexReduce.py b/RegexReduce.py
 '''
 ===RegexReduce===
 Goal is to generate regular expressions out of lists of strings to match
 all strings in that list.
 This happens by reducing strings to their unique substrings..

 @author: spazzpp2
 '''

 import re

 class RegexReduce:
  def __init__(self):
    self.regex = ""
    self.allowed = []
    self.forbidden = []
    pass
  
  def whitestr(self, str):
    whitelist([str])
    
  def whitelist(self, listOfStrings):
    self.allowed += listOfStrings
    
  def blackstr(self, str):
    self.blacklist([str])
  
  def blacklist(self, listOfStrings):
    self.forbidden += listOfStrings
  
  def __same(self, a, b):
    if len(a) < len(b):
      a, b = b, a
    res = ""
    i = 0
    while i < len(a)-len(res):
      j = b.find(a[i])
      if j > -1:
        c = 0
        while i+c < len(a) and j+c < len(b) and a[i+c] == b[j+c]:
          c += 1
          if len(res) < c:
            res = b[j:j+c]
      i += 1
    return res
    
  def __do(self, l, nolist):
    # remove double strings
    l = list(set(l))
    nolist = list(set(nolist))
    # gather similarites between the strings in l
    eq = set([self.__same(a,b) for a in l for b in l if a!=b]) - set([""])
    # gather similarities between the strings in l and the strings in nolist
    neq = set([self.__same(a,b) for a in l for b in nolist if a!=b]) - set([""])
    # remove similarities from eq which may lead to allow forbidden strings
    eq -= neq
    # make sure all allowed strings are still recognized
    # for that we need to gather all strings which arent covered by eq
    #eqall = [self.__same(a,b) for a in l+nolist for b in l+nolist]
    res = set([])
    for k in l:
        for e in eq:
            if e in k:
                break
        else:
            t = str(k)
            allsame = list(set([self.__same(a,b) for a in neq|set(l) for b in neq|set(l) if a!=b])-set(['']))
            allsame.sort(lambda a,b: cmp(len(a), len(b)))
            while len(allsame) and len(allsame[0]) > 2:
                t = t.replace(self.__escape(allsame[0]),".+",1)
                allsame.remove(allsame[0])
            res |= set([t])
    s = "("
    if len(res):
        s += "|".join(map(lambda a: self.__escape(a), res))
    if len(eq):
        return s + "|" + ".*" + ".*|.*".join(map(lambda a: self.__escape(a), eq)) + ".*)"
    else:
        return s + ")"
    
  def __escape(self, word):
    return word.replace(".","\.").replace("^","\^")
  
  def __str__(self):
    self.regex = self.__do(self.allowed, self.forbidden)
    return self.regex

 if __name__ == "__main__":
  rg = RegexGen()
  #rg.addlist(["xx","abcabc", "efabc","ref","abef"])
  #rg.forbidlist(["xxx","cabc"])
  rg.addlist(["Python - Mozilla Firefox", "Python.org - Mozilla Firefox", "Pythonizer - Mozilla Firefox"])
  rg.forbidlist(["Ministry of Silly Walking (Monty) - Mozilla Firefox"])
  print rg
	'''
	===RegexReduce===
	Goal is to generate regular expressions out of lists of strings to match
	all strings in that list.
	This happens by reducing strings to their unique substrings..

	@author: spazzpp2
	'''

	import re

	class RegexReduce:
	def __init__(self):
	self.regex = ""
	self.allowed = []
	self.forbidden = []
	pass

	def whitestr(self, str):
	whitelist([str])

	def whitelist(self, listOfStrings):
	self.allowed += listOfStrings

	def blackstr(self, str):
	self.blacklist([str])

	def blacklist(self, listOfStrings):
	self.forbidden += listOfStrings

	def __same(self, a, b):
	if len(a) < len(b):
	a, b = b, a
	res = ""
	i = 0
	while i < len(a)-len(res):
	j = b.find(a[i])
	if j > -1:
	c = 0
	while i+c < len(a) and j+c < len(b) and a[i+c] == b[j+c]:
	c += 1
	if len(res) < c:
	res = b[j:j+c]
	i += 1
	return res

	def __do(self, l, nolist):
	# remove double strings
	l = list(set(l))
	nolist = list(set(nolist))
	# gather similarites between the strings in l
	eq = set([self.__same(a,b) for a in l for b in l if a!=b]) - set([""])
	# gather similarities between the strings in l and the strings in nolist
	neq = set([self.__same(a,b) for a in l for b in nolist if a!=b]) - set([""])
	# remove similarities from eq which may lead to allow forbidden strings
	eq -= neq
	# make sure all allowed strings are still recognized
	# for that we need to gather all strings which arent covered by eq
	#eqall = [self.__same(a,b) for a in l+nolist for b in l+nolist]
	res = set([])
	for k in l:
	for e in eq:
	if e in k:
	break
	else:
	t = str(k)
	allsame = list(set([self.__same(a,b) for a in neq\|set(l) for b in neq\|set(l) if a!=b])-set(['']))
	allsame.sort(lambda a,b: cmp(len(a), len(b)))
	while len(allsame) and len(allsame[0]) > 2:
	t = t.replace(self.__escape(allsame[0]),".+",1)
	allsame.remove(allsame[0])
	res \|= set([t])
	s = "("
	if len(res):
	s += "\|".join(map(lambda a: self.__escape(a), res))
	if len(eq):
	return s + "\|" + "." + ".\|.".join(map(lambda a: self.__escape(a), eq)) + ".)"
	else:
	return s + ")"

	def __escape(self, word):
	return word.replace(".","\.").replace("^","\^")

	def __str__(self):
	self.regex = self.__do(self.allowed, self.forbidden)
	return self.regex

	if __name__ == "__main__":
	rg = RegexGen()
	#rg.addlist(["xx","abcabc", "efabc","ref","abef"])
	#rg.forbidlist(["xxx","cabc"])
	rg.addlist(["Python - Mozilla Firefox", "Python.org - Mozilla Firefox", "Pythonizer - Mozilla Firefox"])
	rg.forbidlist(["Ministry of Silly Walking (Monty) - Mozilla Firefox"])
	print rg