Steboss89 · April 12, 2022 15:29
diff --git a/check_numbers2a.py b/check_numbers2a.py
 # Build a single alternation that matches every number from 0 to 10000
 # written out in English words.
 #
 # `re` tries the alternatives of a group from left to right and keeps the
 # first one that matches.  Whenever one spelling is a prefix of another
 # ("three thousand" / "three thousand four" / "three thousand four hundred")
 # the longer spelling therefore has to be listed first, otherwise the short
 # one wins and the rest of the number is left behind.  Ordering all
 # alternatives by decreasing length guarantees this for every such pair.
 number_patterns = []
 for value in range(10001):
    # num2words output such as "three thousand, four hundred and forty-one"
    # is reduced to bare words: " and " is dropped, commas and hyphens are
    # treated as word separators.
    spelled = num2words(value).replace(" and ", " ")
    words = spelled.replace(",", " ").replace("-", " ").split()
    # Each word is preceded by exactly one whitespace character, so the words
    # of a number may be separated by a space or by a line break.
    number_patterns.append("".join(rf"\s{word}" for word in words))

 # Longest first.  The sort is stable, so patterns of equal length keep their
 # ascending numeric order; that order is irrelevant for matching because
 # patterns of equal length cannot be prefixes of one another.
 number_patterns.sort(key=len, reverse=True)

 # NOTE(review): every alternative starts with `\s` directly after `\b`, so a
 # number is only found when a word character immediately precedes the
 # whitespace in front of it; a number at the very start of the text or right
 # after punctuation is skipped.  Captured matches include that leading
 # whitespace character.
 regex = r"\b(" + "|".join(number_patterns) + r")\b"

 # Run the query: for every book file, count how often each spelled-out
 # number occurs.  The result maps book name -> {matched number: count}.
 match_dict = {}
 for index, path in enumerate(glob.glob("books/*.txt")):
    # Book name = file name up to its first dot ("books/dracula.txt" -> "dracula").
    file_name = path.split("/")[-1]
    book_name = file_name.split(".")[0]
    counts = {}
    match_dict[book_name] = counts
    # NOTE(review): presumably data[index] holds the text of the index-th
    # file returned by the same glob call - confirm where `data` is built.
    for hit in re.findall(regex, data[index]):
        # Matches carry the leading whitespace captured by the pattern.
        key = hit.strip()
        counts[key] = counts.get(key, 0) + 1
	# Build a single alternation that matches every number from 0 to 10000
	# written out in English words.
	#
	# `re` tries the alternatives of a group from left to right and keeps the
	# first one that matches.  Whenever one spelling is a prefix of another
	# ("three thousand" / "three thousand four" / "three thousand four hundred")
	# the longer spelling therefore has to be listed first, otherwise the short
	# one wins and the rest of the number is left behind.  Ordering all
	# alternatives by decreasing length guarantees this for every such pair.
	number_patterns = []
	for value in range(10001):
		# num2words output such as "three thousand, four hundred and forty-one"
		# is reduced to bare words: " and " is dropped, commas and hyphens are
		# treated as word separators.
		spelled = num2words(value).replace(" and ", " ")
		words = spelled.replace(",", " ").replace("-", " ").split()
		# Each word is preceded by exactly one whitespace character, so the
		# words of a number may be separated by a space or by a line break.
		number_patterns.append("".join(rf"\s{word}" for word in words))

	# Longest first.  The sort is stable, so patterns of equal length keep their
	# ascending numeric order; that order is irrelevant for matching because
	# patterns of equal length cannot be prefixes of one another.
	number_patterns.sort(key=len, reverse=True)

	# The alternatives are joined with a bare "|": an escaped "\|" would match a
	# literal pipe character instead of separating alternatives.
	#
	# NOTE(review): every alternative starts with `\s` directly after `\b`, so a
	# number is only found when a word character immediately precedes the
	# whitespace in front of it; a number at the very start of the text or right
	# after punctuation is skipped.  Captured matches include that leading
	# whitespace character.
	regex = r"\b(" + "|".join(number_patterns) + r")\b"

	# Run the query: for every book file, count how often each spelled-out
	# number occurs.  The result maps book name -> {matched number: count}.
	match_dict = {}
	for index, path in enumerate(glob.glob("books/*.txt")):
		# Book name = file name up to its first dot ("books/dracula.txt" -> "dracula").
		file_name = path.split("/")[-1]
		book_name = file_name.split(".")[0]
		counts = {}
		match_dict[book_name] = counts
		# NOTE(review): presumably data[index] holds the text of the index-th
		# file returned by the same glob call - confirm where `data` is built.
		for hit in re.findall(regex, data[index]):
			# Matches carry the leading whitespace captured by the pattern.
			key = hit.strip()
			counts[key] = counts.get(key, 0) + 1