Skip to content

Instantly share code, notes, and snippets.

View victormurcia's full-sized avatar
😀
Playing with data :]

Victor Murcia victormurcia

😀
Playing with data :]
View GitHub Profile
@victormurcia
victormurcia / make_midi.py
Created September 3, 2022 05:39
make_midi
#Pull the midi_number column out of the dataframe as a numpy array
midi_number = catterina_df['midi_number'].to_numpy()
#MIDI note numbers, unpacked from the array into a plain list
degrees = [*midi_number]

#Track index, channel index and start time (in beats) all begin at zero
track = channel = time = 0
#Duration is expressed in beats, tempo in BPM
duration, tempo = 1, 240
#Volume must stay within 0-127, as per the MIDI standard
volume = 100
@victormurcia
victormurcia / download_url.py
Created September 5, 2022 04:25
download a file from a URL, returns content of downloaded file
# download a file from a URL, returns content of downloaded file
def download_url(urlpath):
    """Download the resource at ``urlpath`` and return its raw content.

    This is a best-effort helper: if opening or reading the URL fails with
    an ordinary exception, ``None`` is returned instead of raising, so the
    caller can check ``if data is None``.

    Args:
        urlpath: URL to fetch; passed straight to ``urlopen``.

    Returns:
        The bytes read from the connection, or ``None`` on failure.
    """
    try:
        # open a connection to the server, giving up after 3 seconds
        with urlopen(urlpath, timeout=3) as connection:
            # read the contents of the url as bytes and return it
            return connection.read()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate; every ordinary
        # failure keeps the original "return None" contract.
        return None
@victormurcia
victormurcia / get_urls_from_html.py
Created September 5, 2022 04:28
decode downloaded html and extract all <a href=""> links
# decode downloaded html and extract all <a href=""> links
def get_urls_from_html(content):
# decode the provided content as UTF-8 text
html = content.decode('utf-8')
# parse the document as best we can, using the 'html.parser' backend
soup = BeautifulSoup(html, 'html.parser')
# find all of the <a href=""> tags in the document
@victormurcia
victormurcia / get_book_identifiers.py
Created September 5, 2022 04:30
return all book unique identifiers from a list of raw links
# return all book unique identifiers from a list of raw links
def get_book_identifiers(links):
# define the url pattern we are looking for: '/ebooks/' followed by one or more digits
pattern = re.compile('/ebooks/[0-9]+')
# process the list of links for those that match the pattern
books = set()
for link in links:
# check if the link matches the pattern (match() only tests from the start of the link)
if not pattern.match(link):
# skip links that do not match
continue
@victormurcia
victormurcia / download_book.py
Created September 5, 2022 04:38
download one book from project gutenberg
# download one book from project gutenberg
def download_book(book_id, save_path):
print(save_path)
# construct the download url, trying the '<book_id>-0.txt' file name first
url = f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt'
# download the content
data = download_url(url)
if data is None:
#print(f'Failed to download {url}')
# nothing came back for the first url, so switch to the plain '<book_id>.txt' file name
url = f'https://www.gutenberg.org/files/{book_id}/{book_id}.txt'
@victormurcia
victormurcia / download_all_books.py
Created September 5, 2022 04:46
download_all_books from search query in Project Gutenberg
# download the page at url and collect the unique book ids it links to
def download_all_books(url, save_path):
# download the page that lists top books
data = download_url(url)
# NOTE(review): this message is printed whether or not data is None
print(f'.downloaded {url}')
# extract all links from the page
# NOTE(review): data is passed on without a None check - confirm get_urls_from_html copes with that
links = get_urls_from_html(data)
print(f'.found {len(links)} links on the page')
# retrieve all unique book ids
book_ids = get_book_identifiers(links)
print(f'.found {len(book_ids)} unique book ids')
@victormurcia
victormurcia / make_chapter_files.py
Created September 5, 2022 05:04
routine that makes chapters out of books from Project Gutenberg
book_id = '\\174.txt' #What book are we processing?
book_name = book_dir + book_id #Location of book
#Read the whole book into a string; the with-block closes the file handle
#as soon as the read finishes instead of leaving it open
with open(book_name, "r", encoding="utf8") as book_file:
    book = book_file.read()
#Use regex to split the book into chapters at every occurrence of "CHAPTER "
chapters = re.split("CHAPTER ", book)
#Remove first 21 CHAPTER instances since they are just fluff
@victormurcia
victormurcia / list_chapter.py
Created September 5, 2022 05:06
list chapters in book
#Gather the name of every .txt file found in the current working directory
chapter_list = [name for name in os.listdir(".") if name.endswith(".txt")]
#Order the names by their lowercase form, left-padded with zeros to 8 characters
chapter_list = sorted(
    chapter_list,
    key=lambda name: '{0:0>8}'.format(name).lower(),
)
#Bare expression so a notebook cell displays the resulting list
chapter_list
@victormurcia
victormurcia / read_chapter.py
Created September 5, 2022 05:07
read contents of book chapter
#Read the full text of '2.txt'; the file is closed when the with-block ends
with open('2.txt', encoding = 'utf-8') as f:
    raw_text = f.read()
#Drop trailing whitespace from the text
contents = raw_text.rstrip()
#Show the first 248 characters as a preview
print(contents[:248])
@victormurcia
victormurcia / tokenize_chapter.py
Created September 5, 2022 14:54
tokenize_chapter
#Tokenize the chapter text
chap_token = word_tokenize(contents)
#Discard the first two tokens in place
del chap_token[:2]