Skip to content

Instantly share code, notes, and snippets.

View Steboss89's full-sized avatar

Stefano Bosisio Steboss89

View GitHub Profile
@Steboss89
Steboss89 / histogram_numbers.py
Created April 13, 2022 08:54
Create a histogram to compare numbers across principal books
# Build a long-format table (Book, Number, Occurrence) for a few principal books.
# `match_dict` is turned into a DataFrame whose columns are selected by book
# name below, so its top-level keys are expected to be book names.
df = pd.DataFrame.from_dict(match_dict)
# Extract a subset from the books we want to analyse
books = ['Genesis', 'Exodus', '1Kings', '2Kings', 'Psalms', 'Isaias',
         'Mark', 'Matthew', 'Luke', 'John', 'Apocalypse']
# Select the chosen books (the `books` list above; the original indexed with
# the undefined name `book`), then drop every row containing a NaN so that
# only the numbers common to all selected books remain.
# Transpose so the numbers become the column names.
subset = df[books].dropna().T
# melt the dataframe: one row per (book, number) pair
subset = subset.reset_index().melt(id_vars='index')
# rename columns to meaningful names
subset = subset.rename(
    columns={"index": "Book", "variable": "Number", "value": "Occurrence"}
)
@Steboss89
Steboss89 / zipf_law.py
Created April 12, 2022 20:33
Compute Zipf's law
# preprocessing on data
# data is a list of all the Bible's books
# call the CountVectorizer
cvec = CountVectorizer()
# Learn the vocabulary and build the sparse term-count matrix in one pass.
# The original called fit_transform (discarding its result) and then ran
# transform over the very same corpus again, doing the counting twice.
all_df = cvec.fit_transform(data)
# create a dataframe: sum on all the term occurrences
@Steboss89
Steboss89 / check_numbers3.py
Created April 12, 2022 15:34
Third winning approach for regex
# Number-word patterns, built bottom-up: each level reuses the smaller ones.

# "one" .. "nine"
one_to_9 = (
    r"(?:"
    r"f(?:ive|our)"
    r"|s(?:even|ix)"
    r"|t(?:hree|wo)"
    r"|(?:ni|o)ne"
    r"|eight"
    r")"
)

# "ten" .. "nineteen"
ten_to_19 = (
    r"(?:"
    r"(?:(?:s(?:even|ix)|f(?:our|if)|nine)te|e(?:ighte|lev))en"
    r"|t(?:(?:hirte)?en|welve)"
    r")"
)

# "twenty", "thirty", ... "ninety"
two_digit_prefix = (
    r"(?:"
    r"(?:s(?:even|ix)|t(?:hir|wen)|f(?:if|or)|eigh|nine)"
    r"ty"
    r")"
)

# A tens word optionally followed by "-" or whitespace and a unit,
# otherwise a teen, otherwise a bare unit.
one_to_99 = (
    r"(?:"
    fr"{two_digit_prefix}(?:[-\s]{one_to_9})?"
    fr"|{ten_to_19}"
    fr"|{one_to_9}"
    r")"
)

# "<unit> hundred", optionally followed by "[and ]<1-99>", or a bare 1-99.
one_to_999 = (
    r"(?:"
    fr"{one_to_9}\shundred(?:\s(?:and\s)?{one_to_99})?"
    fr"|{one_to_99}"
    r")"
)

# Each larger scale: "<1-999> <scale>" optionally followed by the next
# smaller level, or just the next smaller level on its own.
one_to_999_999 = (
    r"(?:"
    fr"{one_to_999}\sthousand(?:\s{one_to_999})?"
    fr"|{one_to_999}"
    r")"
)
one_to_999_999_999 = (
    r"(?:"
    fr"{one_to_999}\smillion(?:\s{one_to_999_999})?"
    fr"|{one_to_999_999}"
    r")"
)
one_to_999_999_999_999 = (
    r"(?:"
    fr"{one_to_999}\sbillion(?:\s{one_to_999_999_999})?"
    fr"|{one_to_999_999_999}"
    r")"
)
one_to_999_999_999_999_999 = fr"(?:{one_to_999}\strillion(?:\s{one_to_999_999_999_99
@Steboss89
Steboss89 / check_numbers2a.py
Created April 12, 2022 15:29
Second approach a non-frequent approach to build up a regex
# Start of the pattern: a word boundary followed by an opening capture group.
regex = r"\b("
remaining_numbers = []
remaining_numbers2 = []
# Walk downwards from 10000 to 1001 so larger numbers are visited first.
for i in range(10000, 1000, -1):
    if i % 10 == 0:
        # These numbers could cause false matches: they end with zero, but
        # the text may continue with one more word, e.g.
        # "three thousand forty" (3040) vs "three thousand forty one".
        remaining_numbers.append(i)
@Steboss89
Steboss89 / check_numbers2.py
Created April 12, 2022 15:22
Second approach use regex
# this is a test string with numbers to be found
test_text = "this is a string with one number and then twenty thousand numbers and three thousand thirty four and three thousand five hundred forty five numbers"
# Firstly we could think of a simple regex listing the numbers in ascending
# order. (The original repeated the already-reordered pattern here, so both
# calls returned the same result and the problem was never shown.)
regex = r"\b(one|two|three|four|five|twenty|thirty four|forty five|three thousand|twenty thousand|three thousand thirty four|three thousand five hundred forty five)\b"
re.findall(regex, test_text)
# The result is not what we were expecting: alternation takes the first
# alternative that matches, so short words win over the compound numbers:
# ['one', 'twenty', 'three', 'thirty four', 'three', 'five', 'forty five']
# Recalibrate the order from "rare" (long, compound) numbers to "frequent" ones
regex = r"\b(three thousand five hundred forty five|three thousand thirty four|twenty thousand|three thousand|forty five|thirty four|twenty|five|four|three|two|one)\b"
re.findall(regex, test_text)
# ['one', 'twenty thousand', 'three thousand thirty four',
#  'three thousand five hundred forty five']
@Steboss89
Steboss89 / check_numbers2.py
Created April 12, 2022 15:19
Second approach, use regex
# Sample sentence containing both single-word and multi-word numbers.
test_text = (
    "this is a string with one number and then twenty thousand numbers "
    "and three thousand thirty four and three thousand five hundred "
    "forty five numbers"
)
@Steboss89
Steboss89 / check_numbers1.py
Created April 12, 2022 15:00
First approach from int to string to list
# Convert every integer from 0 to 100000 (inclusive) to its string form
# with num2words, keeping them in ascending order.
numbers = [num2words(i) for i in range(100001)]
# clean the book and obtain the data list
# for each book collect the number occurrences
numb_dict = {}
for i, book in enumerate(data, 0):
@Steboss89
Steboss89 / wordcloud1.py
Created April 8, 2022 15:30
Create a wordcloud with all the input books
# Concatenate all the books into one string, each followed by a single space.
# The original appended to `big_string` in a loop without initialising it in
# this snippet; building it with join defines it here and avoids repeated
# string copying, while producing the same text as starting from "".
big_string = "".join(book + " " for book in data)
# Render the word cloud from the combined text
wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(big_string)
# Show the image without axes
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
@Steboss89
Steboss89 / cleaning_2.py
Last active April 8, 2022 15:30
cleaning of input text data
# Read every .txt file under books/ into `data`, one stripped string per file.
data = []
ifiles = glob.glob("books/*.txt")
for ifile in ifiles:
    # Use a context manager so each file handle is closed as soon as it has
    # been read (the original left every handle open).
    with open(ifile, "r") as handle:
        book = handle.read().strip()
    data.append(book)
# English stop words plus extra words added for this corpus.
stop_words = [
    *stopwords.words('english'),
    "thy", "thou", "thee", "hath", "upon", "me", "him",
    "them", "shall", "ye", "one", "unto", "us",
]
@Steboss89
Steboss89 / cleaning_1.py
Created April 8, 2022 15:00
Subdivide books into Old and New Testament
import os
# Directory that will hold the output
output = "books"
# Create it if missing; exist_ok avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(output, exist_ok=True)
# Read the whole cleaned text as a list of lines, closing the file afterwards
# (the original left the handle open).
with open("cleaned_bible.txt", "r") as bible_file:
    lines = bible_file.readlines()
books = []
books_idx = {}
# there are books which are First of, second of etc