Created
October 28, 2020 12:01
-
-
Save wildonion/d625a4ea62417bbce730df7b64fda1ac to your computer and use it in GitHub Desktop.
pattern searching problem
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # PATTERN SEARCHING (REGX) | |
| # help : https://www.geeksforgeeks.org/finite-automata-algorithm-for-pattern-searching/ | |
| class FA: | |
| def __init__(self) -> None: | |
| self.__state_char_mat = None | |
| self.__pattern = "" | |
| def compute(self, pattern: str) -> None: | |
| self.__pattern = pattern | |
| ''' | |
| pattern: ACACAGA | |
| Number of states in FA will be M+1 where M is length of the pattern. The main thing to construct FA | |
| is to get the next state from the current state for every possible character. Given a character x and a state k, | |
| we can get the next state by considering the string “pat[0..k-1]x” which is basically concatenation of pattern | |
| characters pat[0], pat[1] … pat[k-1] and the character x. The idea is to get length of the longest prefix of | |
| the given pattern such that the prefix is also suffix of “pat[0..k-1]x”. The value of length gives us the next state. | |
| For example, let us see how to get the next state from current state 5 and character ‘C’ in the above diagram. | |
| We need to consider the string, “pat[0..4]C” which is “ACACAC”. The length of the longest prefix of the pattern | |
| such that the prefix is suffix of “ACACAC”is 4 (“ACAC”). So the next state (from state 5) is 4 for character ‘C’. | |
| ''' | |
| number_of_states = len(pattern) + 1 | |
| cols = {pattern[i]: 0 for i in range(len(pattern))} | |
| code_ascii = [ord(c) for c in cols.keys()] | |
| self.__state_char_mat = [[0 for i in range(len(code_ascii))] for _ in range(number_of_states)] | |
| for state in range(number_of_states): | |
| for x in range(len(code_ascii)): | |
| next_state = self.__next_state(state, code_ascii[x]) | |
| self.__state_char_mat[state][x] = next_state | |
| def __next_state(self, state, x): | |
| ''' | |
| there are “len(pattern) + 1” states and when | |
| we move from state k with char x the next state | |
| will be one of the remaining states except the one | |
| that has accepted x which we have ”pat[0..k-1]” states | |
| and the -1 is because of x after that which is a | |
| final state that accept our pattern. | |
| so to calculate the next state we have to find | |
| length of the longest prefix of “pat[0..k-1]x” | |
| such that the prefix is also suffix of “pat[0..k-1]x”. | |
| trying all possible prefixes starting from the | |
| longest possible that can be a suffix of “pat[0..k-1]x”. | |
| ''' | |
| # if the char x is same as next char in pattern[state], then simply increment state | |
| if state < len(self.__pattern) and x == ord(self.__pattern[state]): | |
| return state + 1 | |
| i = 0 | |
| for next_state in range(state, 0, -1): # 0 to k | |
| if ord(self.__pattern[next_state-1]) == x: | |
| while(i<next_state-1): # not next_state-1 because self.__pattern[next_state-1] is x and we want to check every chars before x | |
| if self.__pattern[i] != self.__pattern[state-next_state+1+i]: | |
| break | |
| i+=1 | |
| if i == next_state-1: # we found prefix which is also a suffix | |
| return next_state | |
| return 0 | |
| def accept(self, string: str) -> bool: | |
| ''' | |
| build a FA machine which accept the pattern then | |
| we pass our string through our machine if it reach | |
| the final state at any position meaning that we found our pattern | |
| at that position. | |
| ''' | |
| state = 0 | |
| for i in range(len(string)): | |
| state = self.__state_char_mat[state][ord(string[i])] | |
| if state == len(self.__pattern): | |
| print(f"[+] pattern found at index: {i-len(self.__pattern)+1}") | |
| return True | |
| else: | |
| return False | |
| class Regx(FA): | |
| def __init__(self, _input: str) -> None: | |
| self.__input = _input | |
| self.compute(_input) | |
| def match(self, pattern) -> bool: | |
| not_in_string = [] | |
| built_pattern = [] | |
| code_ascii = [ord(c) for c in self.__input] | |
| ascii_count = {ord(c): self.__input.count(c) for c in self.__input} | |
| if len(pattern) < len(self.__input): | |
| return False | |
| else: | |
| for i in range(len(pattern)): | |
| if ord(pattern[i]) in ascii_count: | |
| built_pattern.append(pattern[i]) | |
| elif ord(pattern[i]) not in ascii_count and i+1 < len(pattern) and pattern[i] != "*" and pattern[i+1] == "*": | |
| built_pattern.append(pattern[i]) | |
| elif pattern[i] == "*": | |
| if i == 0: | |
| break | |
| star_no_rep = ascii_count[ord(built_pattern[i-1])] if ord(built_pattern[i-1]) in ascii_count else 0 | |
| if star_no_rep == 0: | |
| built_pattern.append("0") | |
| else: | |
| while built_pattern.count(built_pattern[i-1]) != star_no_rep: | |
| built_pattern.append(built_pattern[i-1]) | |
| elif pattern[i] == ".": | |
| built_pattern.append(self.__input[i]) | |
| built_pattern = [ord(c) for c in built_pattern if ord(c) in ascii_count and c != "0"] | |
| if built_pattern == code_ascii: | |
| return True | |
| else: | |
| return False | |
| _input = input("Enter string ::: ") | |
| pattern = input("Enter pattern ::: ") | |
| reg = Regx(_input) | |
| if not reg.match(pattern): | |
| print("[-] pattern doesn't match the entire string") | |
| else: | |
| print("[+] pattern match with the entire string") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment