This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a DataFrame from the match dictionary.
# NOTE(review): presumably match_dict maps book name -> {number: occurrences},
# so books become columns and numbers the row index -- confirm against the
# code that builds match_dict.
df = pd.DataFrame.from_dict(match_dict)

# Books we want to analyse.
books = ['Genesis','Exodus','1Kings','2Kings','Psalms','Isaias','Mark','Matthew','Luke','John','Apocalypse']

# Select the columns of all chosen books (the `books` list), drop every row
# holding a NaN so only the numbers common to all of these books remain, then
# transpose so the numbers become the column names.
subset = df[books].dropna().T

# Melt to long format: one row per (book, number) pair.
subset = subset.reset_index().melt(id_vars='index')

# Give the melted columns meaningful names.
subset.rename(columns={"index":"Book", "variable":"Number","value":"Occurrence"},inplace=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preprocessing on data.
# `data` is a list of all the Bible's books (one text per book).
cvec = CountVectorizer()

# Learn the vocabulary and build the sparse document-term matrix in a single
# pass over the corpus; a separate transform(data) call afterwards would only
# recompute the same matrix.
all_df = cvec.fit_transform(data)
| # create a dataframe: sum on all the term occurrences |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regex building blocks for English number words. Every pattern is a
# non-capturing group, so each one can be embedded in the next larger pattern.

# Units: one, two, three, four, five, six, seven, eight, nine.
one_to_9 = r"(?:f(?:ive|our)|s(?:even|ix)|t(?:hree|wo)|(?:ni|o)ne|eight)"
# Ten through nineteen.
ten_to_19 = r"(?:(?:(?:s(?:even|ix)|f(?:our|if)|nine)te|e(?:ighte|lev))en|t(?:(?:hirte)?en|welve))"
# Tens: twenty, thirty, forty, fifty, sixty, seventy, eighty, ninety.
two_digit_prefix = r"(?:(?:s(?:even|ix)|t(?:hir|wen)|f(?:if|or)|eigh|nine)ty)"
# A tens word optionally followed by a hyphen or whitespace and a unit,
# otherwise ten..nineteen, otherwise a bare unit. The longer forms are listed
# first so that e.g. "seventeen" is not cut short at "seven".
one_to_99 = fr"(?:{two_digit_prefix}(?:[-\s]{one_to_9})?|{ten_to_19}|{one_to_9})"
# "<unit> hundred", optionally followed by "[and ]<1..99>", or just <1..99>.
one_to_999 = fr"(?:{one_to_9}\shundred(?:\s(?:and\s)?{one_to_99})?|{one_to_99})"
# "<1..999> thousand", optionally followed by <1..999>, or just <1..999>.
one_to_999_999 = fr"(?:{one_to_999}\sthousand(?:\s{one_to_999})?|{one_to_999})"
# Same construction one level up for "million".
one_to_999_999_999 = fr"(?:{one_to_999}\smillion(?:\s{one_to_999_999})?|{one_to_999_999})"
# Same construction one level up for "billion".
one_to_999_999_999_999 = fr"(?:{one_to_999}\sbillion(?:\s{one_to_999_999_999})?|{one_to_999_999_999})"
| one_to_999_999_999_999_999 = fr"(?:{one_to_999}\strillion(?:\s{one_to_999_999_999_99 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Start of the pattern: a word boundary followed by an opening capturing group.
regex = r"\b("
# Accumulators filled while walking the number range below.
remaining_numbers = []
remaining_numbers2 = []
# Walk downwards from 10000 to 1001 (the stop value 1000 is excluded).
for i in range(10000, 1000, -1):
    if i%10==0:
        # These are numbers that could cause false matching: they end with
        # zero, but the text may continue with another number word,
        # e.g. "three thousand forty" (3040) vs "three thousand forty one".
        remaining_numbers.append(i)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a test string with numbers to be found.
test_text = "this is a string with one number and then twenty thousand numbers and three thousand thirty four and three thousand five hundred forty five numbers"

# First attempt: a simple alternation that lists the short, frequent numbers
# first. The regex engine takes the first alternative that matches at a
# position, so longer numbers are cut short.
naive_regex = r"\b(one|two|three|four|five|twenty|thirty four|forty five|three thousand|twenty thousand|three thousand thirty four|three thousand five hundred forty five)\b"
re.findall(naive_regex, test_text)
# -> ['one', 'twenty', 'three', 'thirty four', 'three', 'five', 'forty five']
# The result is not what we were expecting.

# Recalibrate the order from "rare" (long) numbers to "frequent" (short) ones,
# so the longest possible number is tried first at every position.
regex = r"\b(three thousand five hundred forty five|three thousand thirty four|twenty thousand|three thousand|forty five|thirty four|twenty|five|four|three|two|one)\b"
re.findall(regex, test_text)
# -> ['one', 'twenty thousand', 'three thousand thirty four',
#     'three thousand five hundred forty five']
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample sentence containing number words of different lengths: "one",
# "twenty thousand", "three thousand thirty four" and
# "three thousand five hundred forty five".
test_text = "this is a string with one number and then twenty thousand numbers and three thousand thirty four and three thousand five hundred forty five numbers"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert every integer from 0 to 100000 (inclusive) to its string form
# with num2words.
numbers = [num2words(value) for value in range(100001)]

# clean the book and obtain the data list
# for each book collect the number occurrences
numb_dict = {}
| for i, book in enumerate(data, 0): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Append every book, each followed by a single space, to big_string in one
# join instead of growing the string piece by piece inside a loop.
# NOTE(review): big_string must already be defined before this point
# (presumably as an empty string) -- confirm.
big_string += "".join(book + " " for book in data)

# Build the word cloud from the whole corpus and display it without axes.
wordcloud = WordCloud(width=1600, height=800,max_font_size=200).generate(big_string)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read every text file in the books/ directory into `data`, one stripped
# string per file. Files are visited in the order glob.glob returns them.
data = []
ifiles = glob.glob("books/*.txt")
for ifile in ifiles:
    # The context manager closes each file as soon as it has been read.
    with open(ifile, "r") as book_file:
        book = book_file.read().strip()
    data.append(book)

# English stop words, extended with extra words to ignore in this corpus.
stop_words = stopwords.words('english')
stop_words.extend(["thy","thou","thee", "hath", "upon", "me", "him", "them", "shall","ye", "one", "unto", "us"])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# Directory that will receive the output; create it when missing.
# exist_ok=True avoids the race between checking for the directory and
# creating it.
output = "books"
os.makedirs(output, exist_ok=True)

# Load all lines of the cleaned Bible text; the context manager closes the
# file once it has been read.
with open("cleaned_bible.txt", "r") as bible_file:
    lines = bible_file.readlines()

# Containers filled further down.
books = []
books_idx = {}
| # there are books which are First of, second of etc |