Created
January 17, 2023 12:09
-
-
Save endes0/edb7c63fa492a9e7f530b3306c9a6f72 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from TTS.api import TTS | |
import PyPDF2 | |
import subprocess | |
import os | |
import torch | |
import re | |
import argparse | |
# helpers | |
def limit_text_len(text, max_len=200, seps=('.', ',', '\n', ' ')) -> list:
    """Split over-long strings into smaller pieces at separator boundaries.

    Every element of ``text`` longer than ``max_len`` is split on the first
    separator in ``seps``; the separator stays attached to the end of the
    piece that preceded it. If any resulting piece is still longer than
    ``max_len`` and more separators remain, all pieces are split again with
    the remaining separators, in order.

    Args:
        text: Iterable of strings to process.
        max_len: Maximum desired length of each returned string.
        seps: Separators to try, highest priority first.

    Returns:
        A new list of strings in the original order. A piece may still
        exceed ``max_len`` when none of the separators can shorten it.
    """
    # Nothing to split on: hand the elements back untouched.
    if not seps:
        return list(text)

    sep, remaining = seps[0], seps[1:]
    result = []
    for item in text:
        if len(item) <= max_len:
            result.append(item)
            continue

        pieces = item.split(sep)
        # str.split drops the separator; restore it on every piece but the last.
        pieces[:-1] = [piece + sep for piece in pieces[:-1]]

        # If the split is not good enough, try with the next separator.
        if remaining and max(map(len, pieces)) > max_len:
            pieces = limit_text_len(pieces, max_len, remaining)
        result.extend(pieces)
    return result
def force_limit_len(text, max_len=200) -> list:
    """Hard-slice strings so that no element is longer than ``max_len``.

    Unlike a separator-based split, this cuts at fixed offsets regardless of
    word or sentence boundaries.

    Args:
        text: Iterable of strings to process.
        max_len: Maximum length of each returned string; must be at least 1.

    Returns:
        A new list where every string longer than ``max_len`` is replaced by
        its consecutive ``max_len``-sized slices. Shorter strings (including
        empty ones) are kept as they are.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    # A non-positive step would either raise an opaque range() error (0) or
    # produce no slices at all and silently drop the text (negative).
    if max_len < 1:
        raise ValueError('max_len must be at least 1, got ' + str(max_len))

    result = []
    for item in text:
        if len(item) > max_len:
            result.extend(item[start:start + max_len]
                          for start in range(0, len(item), max_len))
        else:
            result.append(item)
    return result
def generate_index(pdfobj, outlines, top='', recur=True) -> dict:
    """Build a mapping from bookmark titles to destination page numbers.

    Args:
        pdfobj: Reader object providing ``get_destination_page_number``.
        outlines: Outline sequence; items are bookmark objects with a
            ``title`` attribute, or nested lists holding the children of the
            bookmark that precedes them.
        top: Prefix prepended to every title produced at this level.
        recur: When true, nested lists are descended into; otherwise they
            are skipped and only this level is indexed.

    Returns:
        Dict mapping ``top + cleaned title`` to the value returned by
        ``pdfobj.get_destination_page_number`` for that bookmark. Nested
        entries are keyed as ``"<parent> - <child>"``.
    """
    result = {}
    last = None  # cleaned title of the most recent bookmark at this level
    for bookmark in outlines:
        raw_title = getattr(bookmark, 'title', None)
        if raw_title is not None:
            # Drop newlines and path separators, then collapse space runs.
            cleaned = raw_title.replace('\n', '').replace(
                '/', '').replace('\\', '').strip()
            title = re.sub(' +', ' ', cleaned)
            result[top + title] = pdfobj.get_destination_page_number(bookmark)
            last = title
        elif isinstance(bookmark, list) and recur:
            # A nested list with no preceding titled bookmark has no parent
            # name to prefix with; keep the current prefix instead of failing.
            prefix = top if last is None else top + last + ' - '
            result.update(generate_index(pdfobj, bookmark, prefix, True))
    return result
def search_page_in_index(index, page):
    """Return the index entry that ``page`` belongs to.

    The matching entry is the one whose page number is closest to ``page``
    without being greater than it. When several entries share that page
    number, the first one in the dict's iteration order wins.

    Args:
        index: Dict mapping entry names to page numbers.
        page: Page number to look up.

    Returns:
        The key of the matching entry, or ``'No chapter'`` when every entry
        starts after ``page`` (or the index is empty).
    """
    best_key = None
    best_page = None
    for key, start in index.items():
        if start > page:
            continue
        # Strict comparison keeps the first key among equal page numbers.
        if best_page is None or start > best_page:
            best_key, best_page = key, start
    return 'No chapter' if best_page is None else best_key
# Script body: read a PDF page by page, synthesize the text of each page with
# the TTS model and write one WAV file per page into a folder named after the
# page's top-level chapter, plus an M3U playlist listing the files.

# Get the command line arguments
# NOTE(review): --model, --vocoder and --speaker are parsed but never used
# below; the TTS model name is hard-coded.
parser = argparse.ArgumentParser()
parser.add_argument('--pdf', type=str, default='example.pdf',
                    help='PDF file to read')
parser.add_argument('--out', type=str, default='out', help='Output folder')
parser.add_argument('--page', type=int, default=1,
                    help='Page to start reading')
parser.add_argument('--limit', type=int, default=350,
                    help='Maximum number of characters to synthesize at once')
parser.add_argument('--model', type=str, default='facebook/fastspeech2-en-ljspeech',
                    help='fairseq model to use from HuggingFace Hub')
parser.add_argument('--vocoder', type=str, default='hifigan',
                    help='Vocoder to use from the model')
parser.add_argument('--speaker', type=int, default=0,
                    help='Speaker to use from the model')
args = parser.parse_args()

# Only ask for the GPU when CUDA is actually available
use_cuda = torch.cuda.is_available()
if use_cuda:
    print("Using GPU")

# Init TTS with the target model name
tts = TTS(model_name="tts_models/en/ljspeech/vits",
          progress_bar=True, gpu=use_cuda)

# read the PDF file
pdfReader = PyPDF2.PdfReader(args.pdf)

# number of pages in the PDF file
num_pages = len(pdfReader.pages)
print(num_pages)

# get the chapter names: the full (nested) index and the top-level one
index = generate_index(pdfReader, pdfReader.outlines)
top_index = generate_index(pdfReader, pdfReader.outlines, recur=False)

# show the index
for key, value in index.items():
    print(key, value)

# create the output folders; existing ones are reused so that a run can be
# resumed with --page
os.makedirs(os.path.join(args.out, 'No chapter'), exist_ok=True)
for key in top_index:
    os.makedirs(os.path.join(args.out, key), exist_ok=True)

# create a playlist; the context manager closes it even if synthesis fails
with open(args.out + '/playlist.m3u', 'w') as m3u:
    m3u.write('#EXTM3U\n')

    last_top_chapter = None
    last_chapter = None

    # iterate through the pages, from the requested start page to the last
    # page; i is the 1-based page number passed to pdftotext
    for i in range(args.page, num_pages + 1):
        # extract the text from the page using pdftotext command
        text = subprocess.check_output(
            ['pdftotext', '-f', str(i), '-l', str(i), '-layout', args.pdf, '-']).decode('utf-8')

        # Remove \x0c characters
        text = text.replace('\x0c', '')
        # Remove duplicated spaces
        text = re.sub(' +', ' ', text)
        text = text.strip()

        # Check if the page is empty
        if not text:
            continue

        # Divide the text into pieces short enough to synthesize at once
        text = limit_text_len([text], args.limit)

        waveforms = []
        for t in text:
            t = t.replace('\n', ' ').strip()

            # Skip empty text
            if not t:
                continue

            # check if contains words or numbers
            if not re.search('[a-zA-Z0-9]', t):
                continue

            print(t)

            # Synthesize the text and append its samples to the page audio
            wav = tts.tts(text=t, speaker=None, language=None)
            waveforms.extend(wav)

        # Nothing was synthesized for this page
        if not waveforms:
            continue

        # Get the chapter name
        # NOTE(review): i is 1-based while the index stores whatever
        # get_destination_page_number returned; confirm that the `i + 1`
        # offset is intended.
        chapter = search_page_in_index(index, i + 1)
        out_dir = search_page_in_index(top_index, i + 1) + '/'
        file_name = '[' + str(i) + '] ' + chapter + '.wav'

        # Save the waveform
        tts.synthesizer.save_wav(wav=waveforms, path=os.path.join(
            args.out, out_dir, file_name))

        # Add the file to the playlist, emitting group/title headers only
        # when the chapter changes
        if last_top_chapter != out_dir:
            m3u.write('#EXTGRP:' + out_dir[:-1] + '\n')
            last_top_chapter = out_dir
        if last_chapter != chapter:
            m3u.write('#EXTINF:-1,' + chapter + '\n')
            last_chapter = chapter
        m3u.write(out_dir + file_name + '\n')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
# Read all the files in the current directory and subdirectories | |
def read_files(path):
    """Return the paths of all files under ``path``, including subfolders.

    Each returned path is the containing folder (as reported by ``os.walk``)
    joined with the file name.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]
# Convert the files to ogg | |
def convert(files):
    """Convert every ``.wav`` file in ``files`` to Ogg Vorbis using ffmpeg.

    The output is written next to the input with the extension replaced by
    ``.ogg``. The source ``.wav`` is removed only when ffmpeg exits
    successfully, so a failed conversion does not lose the audio.

    Args:
        files: Iterable of file paths; paths not ending in ``.wav`` are
            ignored.
    """
    # Local import: this script's top-level imports do not include subprocess.
    import subprocess
    import sys

    for wav_path in files:
        if not wav_path.endswith('.wav'):
            continue
        print(wav_path)
        ogg_path = wav_path[:-4] + '.ogg'
        # Argument list instead of a shell string: file names containing
        # quotes, `$` or backticks are passed to ffmpeg verbatim.
        completed = subprocess.run(
            ['ffmpeg', '-i', wav_path, '-acodec', 'libvorbis', '-aq', '4', ogg_path])
        if completed.returncode == 0:
            os.remove(wav_path)
        else:
            print('ffmpeg failed (exit code ' + str(completed.returncode)
                  + '), keeping ' + wav_path, file=sys.stderr)
# Add metadata to the ogg files | |
def add_metadata(files):
    """Append TITLE and ALBUM Vorbis comments to every ``.ogg`` file.

    The tags are derived from the file name (without folder and extension)
    split on ``'->'``: the last part becomes TITLE; ALBUM is the parts
    ``[1:-2]`` joined with ``' - '`` when there are more than two parts,
    otherwise everything but the last part.

    Args:
        files: Iterable of file paths; paths not ending in ``.ogg`` are
            ignored.
    """
    # NOTE(review): the '->' separator has to match how the audio files were
    # named; confirm against the naming scheme that produced them.

    # Local import: this script's top-level imports do not include subprocess.
    import subprocess

    for ogg_path in files:
        if not ogg_path.endswith('.ogg'):
            continue
        print(ogg_path)
        parts = ogg_path.split('/')[-1][:-4].split('->')
        title = parts[-1]
        album_parts = parts[1:-2] if len(parts) > 2 else parts[0:-1]
        album = " - ".join(album_parts)
        # Argument lists instead of shell strings: titles containing quotes,
        # `$` or backticks are passed to vorbiscomment verbatim.
        for tag in ('TITLE=' + title, 'ALBUM=' + album):
            subprocess.run(['vorbiscomment', '-a', '-t', tag, ogg_path])
# Script body: convert every WAV file under the given folder to OGG, then tag
# the resulting OGG files.
if len(sys.argv) < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit('Usage: ' + sys.argv[0] + ' <folder>')
path = sys.argv[1]
files = read_files(path)
convert(files)
# Scan again: the conversion replaced .wav files with new .ogg files.
files = read_files(path)
add_metadata(files)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment