Created
April 12, 2022 15:29
-
-
Save Steboss89/b76455eb583368c72b33b91b996a0c38 to your computer and use it in GitHub Desktop.
Second approach: a non-frequent approach to building up a regex
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _number_to_pattern(number):
    r"""Return the regex fragment matching *number* written out in words.

    The ``num2words`` spelling has " and " removed and is then split on
    commas, hyphens and whitespace; every remaining word is prefixed with
    ``\s``. For example "three thousand, four hundred and twenty-one"
    becomes ``\sthree\sthousand\sfour\shundred\stwenty\sone``.
    """
    words = re.sub(" and ", " ", num2words(number))
    words = words.replace(",", " ").replace("-", " ")
    return "".join(rf"\s{word}" for word in words.split())


# Build one big alternation of every number from 10000 down to 0 in words.
#
# `re` picks the FIRST alternative that matches, so a spelling that is a
# prefix of a longer spelling must come after it, otherwise the longer
# number can never match ("three thousand" would swallow
# "three thousand four"). A prefix spelling always belongs to the smaller
# number, so listing ambiguous numbers in descending order is sufficient.
remaining_numbers = []   # multiples of ten, e.g. 3040 vs 3041
remaining_numbers2 = []  # last two digits 01-09, e.g. 3004 vs 3400
alternatives = []
for i in range(10000, 1000, -1):
    if i % 10 == 0:
        # ends with zero: the text may continue with more number words
        # three thousand forty (3040) vs three thousand forty one
        remaining_numbers.append(i)
    elif 0 < i % 100 < 10:
        # ends in 01-09: the final unit word may be followed by "hundred"
        # three thousand four (3004) vs three thousand four hundred
        # (01 is included: three thousand one vs three thousand one hundred)
        remaining_numbers2.append(i)
    else:
        # last two digits 11-99 and not a multiple of ten: these spellings
        # are never a prefix of another one, so they can go first
        alternatives.append(_number_to_pattern(i))

# The two deferred groups are merged into ONE descending sequence. Emitting
# all multiples of ten before the 01-09 group would put 3000 ahead of 3004
# and 3100 ahead of 3104, shadowing the longer spellings.
deferred_numbers = sorted(remaining_numbers + remaining_numbers2, reverse=True)
alternatives.extend(_number_to_pattern(val) for val in deferred_numbers)

# add the final numbers, 1000 down to 0, again in descending order
alternatives.extend(_number_to_pattern(i) for i in range(1000, -1, -1))

# Every alternative starts with \s, so a match needs a word character
# directly followed by whitespace in front of the number; the captured text
# therefore carries leading whitespace and is stripped before counting.
regex = r"\b(" + "|".join(alternatives) + r")\b"

# run the query - pick the book name from the books
_compiled_regex = re.compile(regex)  # compile the large pattern only once
match_dict = {}
for i, val in enumerate(glob.glob("books/*.txt")):
    # file name without directory, cut at the first dot
    book_name = val.split("/")[-1].split(".")[0]
    counts = match_dict[book_name] = {}
    # NOTE(review): assumes data[i] is the text of the i-th globbed file,
    # i.e. `data` was filled in the same glob order - confirm at the caller.
    for numb in _compiled_regex.findall(data[i]):
        key = numb.strip()
        counts[key] = counts.get(key, 0) + 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment