Steboss89’s gists

Steboss89 / histogram_numbers.py

Created April 13, 2022 08:54

Create a histogram to compare numbers across principal books

	# Build a DataFrame from the number-occurrence dictionary.
	# NOTE(review): presumably match_dict maps book -> {number: occurrences}; confirm against the caller.
	df = pd.DataFrame.from_dict(match_dict)
	# Extract a subset from the books we want to analyse
	books = ['Genesis','Exodus','1Kings','2Kings','Psalms','Isaias','Mark','Matthew','Luke','John','Apocalypse']
	# Select the chosen book columns (the list is named `books`; `book` was undefined),
	# drop rows containing NaN so that only the numbers common to all selected books remain,
	# then transpose so the numbers become the column names.
	subset = df[books].dropna().T
	# Melt to long format: one row per (book, number) pair
	subset = subset.reset_index().melt(id_vars='index')
	# Give the melted columns meaningful names
	subset.rename(columns={"index":"Book", "variable":"Number","value":"Occurrence"},inplace=True)

Steboss89 / zipf_law.py

Created April 12, 2022 20:33

Compute Zipf's law

	# preprocessing on data
	# data is a list of all the Bible's books

	# call the CountVectorizer
	cvec = CountVectorizer()
	# Learn the vocabulary and build the document-term matrix in a single pass
	# over the corpus: fit_transform already returns what a separate
	# transform(data) call would, so the corpus is not vectorized twice.
	# The result is a sparse matrix (documents x terms).
	all_df = cvec.fit_transform(data)
	# create a dataframe: sum on all the term occurrences

Steboss89 / check_numbers3.py

Created April 12, 2022 15:34

Third winning approach for regex

	# Regex fragments matching English number words, composed bottom-up:
	# each larger range embeds the smaller ones through f-string interpolation.
	# Alternation must be a bare `|`; an escaped `\|` matches a literal pipe
	# character and would stop every alternative below from matching.
	one_to_9 = r"(?:f(?:ive|our)|s(?:even|ix)|t(?:hree|wo)|(?:ni|o)ne|eight)" # end one_to_9 definition
	ten_to_19 = r"(?:(?:(?:s(?:even|ix)|f(?:our|if)|nine)te|e(?:ighte|lev))en|t(?:(?:hirte)?en|welve))" # end ten_to_19 definition
	two_digit_prefix = r"(?:(?:s(?:even|ix)|t(?:hir|wen)|f(?:if|or)|eigh|nine)ty)" # end two_digit_prefix definition
	# Tens optionally followed by a hyphen/space and a unit; tried before the teens and the bare units
	one_to_99 = fr"(?:{two_digit_prefix}(?:[-\s]{one_to_9})?|{ten_to_19}|{one_to_9})" # end one_to_99 definition
	# "<unit> hundred", optionally followed by "[and] <1-99>"
	one_to_999 = fr"(?:{one_to_9}\shundred(?:\s(?:and\s)?{one_to_99})?|{one_to_99})" # end one_to_999 definition
	one_to_999_999 = fr"(?:{one_to_999}\sthousand(?:\s{one_to_999})?|{one_to_999})" # end one_to_999_999 definition
	one_to_999_999_999 = fr"(?:{one_to_999}\smillion(?:\s{one_to_999_999})?|{one_to_999_999})" # end one_to_999_999_999 definition
	one_to_999_999_999_999 = fr"(?:{one_to_999}\sbillion(?:\s{one_to_999_999_999})?|{one_to_999_999_999})" # end one_to_999_999_999_999 definition
	one_to_999_999_999_999_999 = fr"(?:{one_to_999}\strillion(?:\s{one_to_999_999_999_99

Steboss89 / check_numbers2a.py

Created April 12, 2022 15:29

Second approach: a non-frequency-based way to build up a regex

	# Opening of the number regex: a word boundary followed by an open capture group.
	# NOTE(review): the alternatives and the closing part are presumably appended later; not visible here.
	regex = r"\b("

	remaining_numbers = []
	remaining_numbers2 = []
	# Walk from 10000 down to 1001 (the stop value 1000 is excluded)
	for i in range(10000, 1000, -1):
		if i % 10 == 0:
			# these are numbers that could cause false matching
			# these numbers end with zero but we may have something more in the text
			# three thousand forty (3040) vs three thousand forty one
			remaining_numbers.append(i)

Steboss89 / check_numbers2.py

Created April 12, 2022 15:22

Second approach use regex

	# this is a test string with numbers to be found
	test_text = "this is a string with one number and then twenty thousand numbers and three thousand thirty four and three thousand five hundred forty five numbers"
	# firstly we could think of a simple regex to match numbers
	# (alternatives are separated by a bare `|`; an escaped `\|` would match a literal pipe)
	regex = r"\b(three thousand five hundred forty five|three thousand thirty four|twenty thousand|three thousand|forty five|thirty four|twenty|five|four|three|two|one)\b"
	re.findall(regex, test_text)
	# the result is not what we were expecting
	# NOTE(review): this pattern is identical to the "recalibrated" one below, so both
	# calls return the same matches; the first was presumably meant to use a naive
	# ordering - confirm against the original gist.

	# recalibrate the order from "rare" numbers to "frequent" ones, so the longest
	# phrases are tried before the short number words they contain
	regex = r"\b(three thousand five hundred forty five|three thousand thirty four|twenty thousand|three thousand|forty five|thirty four|twenty|five|four|three|two|one)\b"
	re.findall(regex, test_text)

Steboss89 / check_numbers2.py

Created April 12, 2022 15:19

Second approach, use regex

# Sample sentence containing several spelled-out numbers, used to try the regexes on.
test_text = (
    "this is a string with one number and then twenty thousand numbers "
    "and three thousand thirty four "
    "and three thousand five hundred forty five numbers"
)

Steboss89 / check_numbers1.py

Created April 12, 2022 15:00

First approach from int to string to list

	# Convert every integer from 0 to 100000 (inclusive) to a string with num2words
	# and collect the results in order.
	numbers = [num2words(i) for i in range(100001)]

	# clean the book and obtain the data list

	# for each book collect the number occurrences
	numb_dict = {}
	for i, book in enumerate(data, 0):

Steboss89 / wordcloud1.py

Created April 8, 2022 15:30

Create a wordcloud with all the input books

	# Concatenate every book into one text, each book followed by a single space.
	# Built with str.join so the string is defined here and not grown by repeated +=.
	big_string = "".join(book + " " for book in data)

	# Generate the word cloud from the combined text and display it without axes
	wordcloud = WordCloud(width=1600, height=800,max_font_size=200).generate(big_string)
	plt.figure(figsize=(12,10))
	plt.imshow(wordcloud, interpolation="bilinear")
	plt.axis("off")
	plt.show()

Steboss89 / cleaning_2.py

Last active April 8, 2022 15:30

cleaning of input text data

	# Load every text file under books/ into `data`, one stripped string per file.
	data = []
	ifiles = glob.glob("books/*.txt")
	for ifile in ifiles:
		# Context manager closes each file handle as soon as it has been read
		with open(ifile, "r") as book_file:
			book = book_file.read().strip()
		data.append(book)

	# Start from the English stop-word list and append extra words
	# (mostly archaic pronouns and verb forms) to it.
	stop_words = stopwords.words('english')
	stop_words.extend(
		[
			"thy",
			"thou",
			"thee",
			"hath",
			"upon",
			"me",
			"him",
			"them",
			"shall",
			"ye",
			"one",
			"unto",
			"us",
		]
	)

Steboss89 / cleaning_1.py

Created April 8, 2022 15:00

Subdivide books into Old and New Testament

	import os
	# Directory that will hold the output files
	output = "books"
	# exist_ok=True creates the directory only when missing, without a separate
	# exists() check that could race with another process
	os.makedirs(output, exist_ok=True)

	# Read all lines of the cleaned Bible text; the context manager closes the file
	with open("cleaned_bible.txt", "r") as bible_file:
		lines = bible_file.readlines()

	books = []
	books_idx = {}
	# there are books which are First of, second of etc

	# Regex fragments matching English number words, composed bottom-up:
	# each larger range embeds the smaller ones through f-string interpolation.
	# Alternation must be a bare `|`; an escaped `\|` matches a literal pipe
	# character and would stop every alternative below from matching.
	one_to_9 = r"(?:f(?:ive|our)|s(?:even|ix)|t(?:hree|wo)|(?:ni|o)ne|eight)" # end one_to_9 definition
	ten_to_19 = r"(?:(?:(?:s(?:even|ix)|f(?:our|if)|nine)te|e(?:ighte|lev))en|t(?:(?:hirte)?en|welve))" # end ten_to_19 definition
	two_digit_prefix = r"(?:(?:s(?:even|ix)|t(?:hir|wen)|f(?:if|or)|eigh|nine)ty)" # end two_digit_prefix definition
	# Tens optionally followed by a hyphen/space and a unit; tried before the teens and the bare units
	one_to_99 = fr"(?:{two_digit_prefix}(?:[-\s]{one_to_9})?|{ten_to_19}|{one_to_9})" # end one_to_99 definition
	# "<unit> hundred", optionally followed by "[and] <1-99>"
	one_to_999 = fr"(?:{one_to_9}\shundred(?:\s(?:and\s)?{one_to_99})?|{one_to_99})" # end one_to_999 definition
	one_to_999_999 = fr"(?:{one_to_999}\sthousand(?:\s{one_to_999})?|{one_to_999})" # end one_to_999_999 definition
	one_to_999_999_999 = fr"(?:{one_to_999}\smillion(?:\s{one_to_999_999})?|{one_to_999_999})" # end one_to_999_999_999 definition
	one_to_999_999_999_999 = fr"(?:{one_to_999}\sbillion(?:\s{one_to_999_999_999})?|{one_to_999_999_999})" # end one_to_999_999_999_999 definition
	one_to_999_999_999_999_999 = fr"(?:{one_to_999}\strillion(?:\s{one_to_999_999_999_99

Stefano Bosisio Steboss89