Skip to content

Instantly share code, notes, and snippets.

@wildonion
Created October 28, 2020 12:01
Show Gist options
  • Select an option

  • Save wildonion/d625a4ea62417bbce730df7b64fda1ac to your computer and use it in GitHub Desktop.

Select an option

Save wildonion/d625a4ea62417bbce730df7b64fda1ac to your computer and use it in GitHub Desktop.
pattern searching problem
# PATTERN SEARCHING (REGX)
# help : https://www.geeksforgeeks.org/finite-automata-algorithm-for-pattern-searching/
class FA:
def __init__(self) -> None:
self.__state_char_mat = None
self.__pattern = ""
def compute(self, pattern: str) -> None:
self.__pattern = pattern
'''
pattern: ACACAGA
Number of states in FA will be M+1 where M is length of the pattern. The main thing to construct FA
is to get the next state from the current state for every possible character. Given a character x and a state k,
we can get the next state by considering the string “pat[0..k-1]x” which is basically concatenation of pattern
characters pat[0], pat[1] … pat[k-1] and the character x. The idea is to get length of the longest prefix of
the given pattern such that the prefix is also suffix of “pat[0..k-1]x”. The value of length gives us the next state.
For example, let us see how to get the next state from current state 5 and character ‘C’ in the above diagram.
We need to consider the string, “pat[0..4]C” which is “ACACAC”. The length of the longest prefix of the pattern
such that the prefix is suffix of “ACACAC”is 4 (“ACAC”). So the next state (from state 5) is 4 for character ‘C’.
'''
number_of_states = len(pattern) + 1
cols = {pattern[i]: 0 for i in range(len(pattern))}
code_ascii = [ord(c) for c in cols.keys()]
self.__state_char_mat = [[0 for i in range(len(code_ascii))] for _ in range(number_of_states)]
for state in range(number_of_states):
for x in range(len(code_ascii)):
next_state = self.__next_state(state, code_ascii[x])
self.__state_char_mat[state][x] = next_state
def __next_state(self, state, x):
'''
there are “len(pattern) + 1” states and when
we move from state k with char x the next state
will be one of the remaining states except the one
that has accepted x which we have ”pat[0..k-1]” states
and the -1 is because of x after that which is a
final state that accept our pattern.
so to calculate the next state we have to find
length of the longest prefix of “pat[0..k-1]x”
such that the prefix is also suffix of “pat[0..k-1]x”.
trying all possible prefixes starting from the
longest possible that can be a suffix of “pat[0..k-1]x”.
'''
# if the char x is same as next char in pattern[state], then simply increment state
if state < len(self.__pattern) and x == ord(self.__pattern[state]):
return state + 1
i = 0
for next_state in range(state, 0, -1): # 0 to k
if ord(self.__pattern[next_state-1]) == x:
while(i<next_state-1): # not next_state-1 because self.__pattern[next_state-1] is x and we want to check every chars before x
if self.__pattern[i] != self.__pattern[state-next_state+1+i]:
break
i+=1
if i == next_state-1: # we found prefix which is also a suffix
return next_state
return 0
def accept(self, string: str) -> bool:
'''
build a FA machine which accept the pattern then
we pass our string through our machine if it reach
the final state at any position meaning that we found our pattern
at that position.
'''
state = 0
for i in range(len(string)):
state = self.__state_char_mat[state][ord(string[i])]
if state == len(self.__pattern):
print(f"[+] pattern found at index: {i-len(self.__pattern)+1}")
return True
else:
return False
class Regx(FA):
def __init__(self, _input: str) -> None:
self.__input = _input
self.compute(_input)
def match(self, pattern) -> bool:
not_in_string = []
built_pattern = []
code_ascii = [ord(c) for c in self.__input]
ascii_count = {ord(c): self.__input.count(c) for c in self.__input}
if len(pattern) < len(self.__input):
return False
else:
for i in range(len(pattern)):
if ord(pattern[i]) in ascii_count:
built_pattern.append(pattern[i])
elif ord(pattern[i]) not in ascii_count and i+1 < len(pattern) and pattern[i] != "*" and pattern[i+1] == "*":
built_pattern.append(pattern[i])
elif pattern[i] == "*":
if i == 0:
break
star_no_rep = ascii_count[ord(built_pattern[i-1])] if ord(built_pattern[i-1]) in ascii_count else 0
if star_no_rep == 0:
built_pattern.append("0")
else:
while built_pattern.count(built_pattern[i-1]) != star_no_rep:
built_pattern.append(built_pattern[i-1])
elif pattern[i] == ".":
built_pattern.append(self.__input[i])
built_pattern = [ord(c) for c in built_pattern if ord(c) in ascii_count and c != "0"]
if built_pattern == code_ascii:
return True
else:
return False
_input = input("Enter string ::: ")
pattern = input("Enter pattern ::: ")
reg = Regx(_input)
if not reg.match(pattern):
print("[-] pattern doesn't match the entire string")
else:
print("[+] pattern match with the entire string")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment