This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pull the 'midi_number' column out of the dataframe as a NumPy array,
# then turn it into a plain list of note numbers.
midi_number = catterina_df['midi_number'].to_numpy()
degrees = list(midi_number)  # MIDI note number

# Settings for the MIDI output
track, channel = 0, 0
time = 0        # In beats
duration = 1    # In beats
tempo = 240     # In BPM
volume = 100    # 0-127, as per the MIDI standard
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def download_url(urlpath):
    """Download a URL and return the raw response body.

    This is a best-effort helper: a network, HTTP, protocol or malformed-URL
    failure yields ``None`` instead of raising, so the caller can test the
    result and fall back to another URL.

    Args:
        urlpath: URL to fetch; passed straight to ``urlopen``.

    Returns:
        The response body as ``bytes``, or ``None`` if the download failed.
    """
    # Imported locally so this function does not rely on a file-level import.
    from http.client import HTTPException

    try:
        # open a connection to the server, giving up after 3 seconds
        with urlopen(urlpath, timeout=3) as connection:
            # read the contents of the url as bytes and return it
            return connection.read()
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError, timeouts and socket errors;
        # ValueError covers malformed URLs; HTTPException covers protocol
        # errors such as a truncated response. KeyboardInterrupt and
        # SystemExit are deliberately NOT caught, so a long batch of
        # downloads can still be interrupted.
        return None
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # decode downloaded html and extract all <a href=""> links | |
| def get_urls_from_html(content): | |
| # decode the provided content as UTF-8 text | |
| html = content.decode('utf-8') | |
| # parse the document as best we can | |
| soup = BeautifulSoup(html, 'html.parser') | |
| # find all of the <a href=""> tags in the document |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # return all book unique identifiers from a list of raw links | |
| def get_book_identifiers(links): | |
| # define a url pattern we are looking for | |
| pattern = re.compile('/ebooks/[0-9]+') | |
| # process the list of links for those that match the pattern | |
| books = set() | |
| for link in links: | |
| # check if the link matches the pattern | |
| if not pattern.match(link): | |
| continue |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # download one book from project gutenberg | |
| def download_book(book_id, save_path): | |
| print(save_path) | |
| # construct the download url | |
| url = f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt' | |
| # download the content | |
| data = download_url(url) | |
| if data is None: | |
| #print(f'Failed to download {url}') | |
| url = f'https://www.gutenberg.org/files/{book_id}/{book_id}.txt' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def download_all_books(url, save_path): | |
| # download the page that lists top books | |
| data = download_url(url) | |
| print(f'.downloaded {url}') | |
| # extract all links from the page | |
| links = get_urls_from_html(data) | |
| print(f'.found {len(links)} links on the page') | |
| # retrieve all unique book ids | |
| book_ids = get_book_identifiers(links) | |
| print(f'.found {len(book_ids)} unique book ids') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
book_id = '\\174.txt'  # What book are we processing?
book_name = book_dir + book_id  # Location of book
# Read the whole book into a string. The with-block closes the file handle
# as soon as the text has been read, instead of leaving it open.
with open(book_name, "r", encoding="utf8") as book_file:
    book = book_file.read()
# Split the book into chapters at every occurrence of "CHAPTER "
chapters = re.split("CHAPTER ", book)
| #Remove first 21 CHAPTER instances since they are just fluff |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gather every .txt file in the current directory. Names are left-padded
# with zeros to 8 characters (case-insensitively) for the sort key, so that
# short numbered files such as "2.txt" come before "10.txt".
chapter_list = sorted(
    (name for name in os.listdir(".") if name.endswith(".txt")),
    key=lambda name: name.rjust(8, "0").lower(),
)
chapter_list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read 2.txt as UTF-8 and strip trailing whitespace from its text
with open('2.txt', encoding='utf-8') as chapter_file:
    contents = chapter_file.read().rstrip()

# Show the first 248 characters as a quick preview
print(contents[:248])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenize the chapter text, then discard the first two tokens in place
chap_token = word_tokenize(contents)
del chap_token[:2]