Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Created April 12, 2022 15:29
Show Gist options
  • Save Steboss89/b76455eb583368c72b33b91b996a0c38 to your computer and use it in GitHub Desktop.
Save Steboss89/b76455eb583368c72b33b91b996a0c38 to your computer and use it in GitHub Desktop.
Second approach: a non-frequent approach to building up the regex
def _number_to_pattern(number):
    r"""Return the regex fragment matching *number* spelled out in English.

    The ``num2words`` spelling is reduced to its bare words: " and " is
    dropped, commas and hyphens are treated as word separators. Every word
    is then prefixed with ``\s``, so a spelling made of the words
    "twenty" and "one" becomes ``\stwenty\sone``.
    """
    spelled = num2words(number).replace(" and ", " ")
    spelled = spelled.replace(",", " ").replace("-", " ")
    return "".join(rf"\s{word}" for word in spelled.split())


# `re` alternation takes the FIRST alternative that matches, not the longest
# one, so a phrase must never be listed before a longer phrase it is a
# word-prefix of.  The numbers 1001..10000 are therefore split in three groups
# that are emitted in this order:
#   1. everything else (nothing longer starts with these phrases),
#   2. remaining_numbers:  multiples of ten ("three thousand forty" must come
#      after "three thousand forty one"),
#   3. remaining_numbers2: bare thousands and thousands plus one digit
#      ("three thousand" / "three thousand four" must come after
#      "three thousand four hundred ...").
# Group 3 is tested first so that X000 and X001 land in it as well; otherwise
# "three thousand" would shadow "three thousand four" and "three thousand one"
# would shadow "three thousand one hundred".
remaining_numbers = []
remaining_numbers2 = []
alternatives = []
for i in range(10000, 1000, -1):
    if i % 1000 < 10:
        remaining_numbers2.append(i)
    elif i % 10 == 0:
        remaining_numbers.append(i)
    else:
        alternatives.append(_number_to_pattern(i))

# Descending order inside each group keeps longer phrases ahead of their
# prefixes (e.g. 3410 before 3400, 3004 before 3000).
remaining_numbers.sort(reverse=True)
alternatives.extend(_number_to_pattern(val) for val in remaining_numbers)

remaining_numbers2.sort(reverse=True)
alternatives.extend(_number_to_pattern(val) for val in remaining_numbers2)

# Finally 1000 down to 0; plain descending order is already prefix-safe here.
alternatives.extend(_number_to_pattern(i) for i in range(1000, -1, -1))

# NOTE: every alternative starts with \s and the group is preceded by \b, so a
# number is only found when it follows a word character plus whitespace; the
# matched text keeps that leading whitespace.
regex = r"\b(" + "|".join(alternatives) + r")\b"
# Run the regex over every book and count how often each spelled-out number
# occurs.  Result: match_dict[book name][number phrase] -> occurrences, where
# the book name is the file name up to its first dot.
match_dict = {}
for index, book_path in enumerate(glob.glob("books/*.txt")):
    file_name = book_path.split("/")[-1]
    book_name = file_name.split(".")[0]
    counts = match_dict[book_name] = {}
    # NOTE(review): data[index] is presumably the text of the index-th globbed
    # file, loaded elsewhere in the same order — confirm against the loader.
    for found in re.findall(regex, data[index]):
        phrase = found.strip()  # drop the whitespace captured by the leading \s
        counts[phrase] = counts.get(phrase, 0) + 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment