Victor Murcia victormurcia

😀

Playing with data :]

Data Scientist, PhD in Materials Science and Engineering (WSU, 2022). I'm interested in algorithm development, data science, and mathematical modeling.

33 followers · 39 following

US Department of Veterans Affairs
Jersey City, NJ
04:15 (UTC -05:00)
https://victormurcia.github.io/
in/vmmr5596
https://medium.com/@victormurcia-53351

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

victormurcia / make_midi.py

Created September 3, 2022 05:39

make_midi

	# Pull the MIDI note numbers out of the dataframe as a NumPy array
	midi_number = catterina_df['midi_number'].to_numpy()

	# Note sequence and the settings applied to every note
	degrees = list(midi_number)  # MIDI note numbers, one per note
	track = channel = 0
	time = 0      # start position, in beats
	duration = 1  # note length, in beats
	tempo = 240   # beats per minute
	volume = 100  # 0-127, as per the MIDI standard

victormurcia / download_url.py

Created September 5, 2022 04:25

download a file from a URL, returns content of downloaded file

	def download_url(urlpath, timeout=3):
	    """Download the resource at *urlpath* and return its raw bytes.

	    This is a best-effort helper: expected download failures are reported
	    by returning ``None`` instead of raising.

	    Args:
	        urlpath: URL to fetch.
	        timeout: Seconds to wait on blocking connection operations.
	            Defaults to 3.

	    Returns:
	        The response body as ``bytes``, or ``None`` when the URL is
	        malformed or the request fails.
	    """
	    # Function-scope import so the block does not depend on a new
	    # module-level import.
	    from http.client import HTTPException

	    try:
	        # open a connection to the server and read the whole body as bytes
	        with urlopen(urlpath, timeout=timeout) as connection:
	            return connection.read()
	    except (OSError, ValueError, HTTPException):
	        # OSError covers URLError/HTTPError, timeouts and socket errors;
	        # ValueError covers malformed URLs; HTTPException covers protocol
	        # errors such as a truncated body. KeyboardInterrupt and SystemExit
	        # are deliberately NOT caught so the user can still abort a crawl.
	        return None

victormurcia / get_urls_from_html.py

Created September 5, 2022 04:28

decode downloaded html and extract all <a href=""> links

	# Decode downloaded HTML and extract all <a href=""> links.
	# `content` must support .decode('utf-8'), i.e. it is the raw bytes of the page.
	# NOTE(review): the rest of this function is not visible in this excerpt, so
	# the return value is not documented here — confirm against the full gist.
	def get_urls_from_html(content):

	# decode the provided bytes as UTF-8 text
	html = content.decode('utf-8')

	# parse the document as best we can using the 'html.parser' backend
	soup = BeautifulSoup(html, 'html.parser')

	# find all of the <a href=""> tags in the document

victormurcia / get_book_identifiers.py

Created September 5, 2022 04:30

return all book unique identifiers from a list of raw links

	# Return all unique book identifiers from a list of raw links.
	# NOTE(review): only the filtering step is visible in this excerpt; how the
	# identifier is extracted and what is returned is not shown — confirm
	# against the full gist.
	def get_book_identifiers(links):
	# define the url pattern we are looking for: "/ebooks/" followed by digits
	pattern = re.compile('/ebooks/[0-9]+')
	# process the list of links for those that match the pattern
	# (presumably the set is what de-duplicates the ids — verify below)
	books = set()
	for link in links:
	# check if the link matches the pattern; pattern.match() only matches at
	# the start of the string, so the link must begin with "/ebooks/<digits>"
	if not pattern.match(link):
	continue

victormurcia / download_book.py

Created September 5, 2022 04:38

download one book from project gutenberg

	# Download one book from Project Gutenberg.
	# book_id is interpolated into the download URL; save_path is printed below
	# (its further use is not visible in this excerpt).
	def download_book(book_id, save_path):
	print(save_path)
	# construct the download url; the "<id>-0.txt" file name is tried first
	url = f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt'
	# download the content
	data = download_url(url)
	if data is None:
	#print(f'Failed to download {url}')
	# the first attempt returned None: switch to the plain "<id>.txt" file name
	# NOTE(review): the retry with this url is presumably on the following
	# lines, which are not visible here — confirm
	url = f'https://www.gutenberg.org/files/{book_id}/{book_id}.txt'

victormurcia / download_all_books.py

Created September 5, 2022 04:46

download_all_books from a search query in Project Gutenberg

	# Fetch the listing page at `url` and collect the book ids it links to.
	# NOTE(review): the per-book download step and the use of save_path are not
	# visible in this excerpt — confirm against the full gist.
	def download_all_books(url, save_path):
	# download the page that lists top books
	data = download_url(url)
	# NOTE(review): this message is printed without checking `data`; if
	# download_url signals failure by returning None, verify that
	# get_urls_from_html below can cope with it
	print(f'.downloaded {url}')
	# extract all links from the page
	links = get_urls_from_html(data)
	print(f'.found {len(links)} links on the page')
	# retrieve all unique book ids
	book_ids = get_book_identifiers(links)
	print(f'.found {len(book_ids)} unique book ids')

victormurcia / make_chapter_files.py

Created September 5, 2022 05:04

routine that makes chapters out of books from Project Gutenberg

	# File name of the book to process; the leading backslash acts as the path
	# separator when appended to book_dir (presumably a Windows path — confirm)
	book_id = '\\174.txt'
	book_name = book_dir + book_id  # Location of the book

	# Read the whole book as text; the context manager guarantees the file
	# handle is closed once the contents are in memory
	with open(book_name, "r", encoding="utf8") as book_file:
	    book = book_file.read()
	# Split the book into chapters at every occurrence of the marker "CHAPTER "
	chapters = re.split(r"CHAPTER ", book)
	#Remove first 21 CHAPTER instances since they are just fluff

victormurcia / list_chapter.py

Created September 5, 2022 05:06

list chapters in book

	# Gather every .txt file found in the current working directory
	chapter_list = [entry for entry in os.listdir(".") if entry.endswith(".txt")]

	# Order the files numerically: left-padding each name with zeros to 8
	# characters makes "2.txt" sort ahead of "10.txt"
	chapter_list.sort(key=lambda name: f'{name:0>8}'.lower())
	chapter_list

victormurcia / read_chapter.py

Created September 5, 2022 05:07

read contents of book chapter

	# Load chapter file '2.txt' as UTF-8 text, dropping trailing whitespace
	with open('2.txt', encoding='utf-8') as chapter_file:
	    contents = chapter_file.read().rstrip()

	# Preview the first 248 characters of the chapter
	print(contents[:248])

victormurcia / tokenize_chapter.py

Created September 5, 2022 14:54

tokenize_chapter

	# Split the chapter text into tokens
	chap_token = word_tokenize(contents)
	# Drop the first two tokens in place
	del chap_token[:2]

Older Newer