Skip to content

Instantly share code, notes, and snippets.

@endes0
Created January 17, 2023 12:09
Show Gist options
  • Save endes0/edb7c63fa492a9e7f530b3306c9a6f72 to your computer and use it in GitHub Desktop.
Save endes0/edb7c63fa492a9e7f530b3306c9a6f72 to your computer and use it in GitHub Desktop.
from TTS.api import TTS
import PyPDF2
import subprocess
import os
import torch
import re
import argparse
# helpers
def limit_text_len(text, max_len=200, seps=('.', ',', '\n', ' ')) -> list:
    """Split over-long strings on a cascade of separators.

    Every element of ``text`` longer than ``max_len`` is split on the first
    separator, and that separator is re-attached to the end of each piece
    except the last one. If any piece is still longer than ``max_len`` and
    more separators remain, all pieces are split again with the remaining
    separators. Elements that already fit are passed through unchanged.

    Args:
        text: Iterable of strings to process.
        max_len: Maximum desired length of each returned string.
        seps: Separators to try, in priority order. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        A new list of strings in the original order. A piece may still be
        longer than ``max_len`` when none of the separators occurs in it,
        and empty pieces (e.g. after a trailing separator) are kept.
    """
    # Nothing to split on: hand the elements back untouched instead of
    # failing on seps[0].
    if not seps:
        return list(text)

    sep, remaining = seps[0], seps[1:]
    result = []
    for chunk in text:
        if len(chunk) <= max_len:
            result.append(chunk)
            continue
        pieces = chunk.split(sep)
        # Restore the separator that str.split() removed.
        pieces[:-1] = [piece + sep for piece in pieces[:-1]]
        # If the split is not good enough, try with the next separator.
        if remaining and max(map(len, pieces)) > max_len:
            pieces = limit_text_len(pieces, max_len, remaining)
        result.extend(pieces)
    return result
def force_limit_len(text, max_len=200) -> list:
    """Hard-cut strings so that none is longer than ``max_len`` characters.

    Unlike a separator-based split, this slices blindly at fixed offsets and
    may therefore cut in the middle of a word.

    Args:
        text: Iterable of strings to process.
        max_len: Maximum length of each returned string; must be >= 1.

    Returns:
        A new list in the original order where every element longer than
        ``max_len`` has been replaced by consecutive slices of at most
        ``max_len`` characters. Shorter elements (including empty strings)
        are kept as they are.

    Raises:
        ValueError: If ``max_len`` is smaller than 1. A negative value would
            otherwise produce an empty slice range and silently drop text.
    """
    if max_len < 1:
        raise ValueError('max_len must be a positive integer')

    result = []
    for chunk in text:
        if len(chunk) > max_len:
            result.extend(
                chunk[start:start + max_len]
                for start in range(0, len(chunk), max_len)
            )
        else:
            result.append(chunk)
    return result
def generate_index(pdfobj, outlines, top='', recur=True) -> dict:
    """Flatten a PDF outline (bookmark tree) into a ``{title: page}`` mapping.

    Items of ``outlines`` that carry a non-None ``title`` attribute are
    treated as bookmarks; items that are lists are treated as the children
    of the bookmark that precedes them. Child titles are prefixed with the
    parent's title followed by ``' - '``.

    Titles are cleaned before use: newlines, ``/`` and ``\\`` are removed,
    surrounding whitespace is stripped and runs of spaces are collapsed.
    The script uses these titles as folder and file names.

    Args:
        pdfobj: Reader object providing ``get_destination_page_number``.
        outlines: Outline list to walk (bookmarks and nested lists).
        top: Prefix prepended to every title found at this level.
        recur: When False, nested lists are ignored and only the bookmarks
            of this level are returned.

    Returns:
        Dict mapping each (prefixed, cleaned) title to whatever
        ``pdfobj.get_destination_page_number`` returns for that bookmark.
        Later bookmarks overwrite earlier ones with the same title.
    """
    result = {}
    parent_title = None
    for bookmark in outlines:
        raw_title = getattr(bookmark, 'title', None)
        if raw_title is not None:
            cleaned = raw_title.replace('\n', '').replace('/', '')
            cleaned = cleaned.replace('\\', '').strip()
            title = re.sub(' +', ' ', cleaned)
            result[top + title] = pdfobj.get_destination_page_number(bookmark)
            parent_title = title
        elif recur and isinstance(bookmark, list):
            # A nested list can appear before any titled bookmark; in that
            # case there is no parent title to add to the prefix.
            if parent_title is None:
                prefix = top
            else:
                prefix = top + parent_title + ' - '
            result.update(generate_index(pdfobj, bookmark, prefix, True))
    return result
def search_page_in_index(index, page):
    """Return the index entry that a page belongs to.

    Args:
        index: Mapping of title -> page number, as built by
            ``generate_index``.
        page: Page number to look up (same numbering as the index values).

    Returns:
        The first title whose page equals ``page``. Otherwise the title
        with the closest page number below ``page`` (the first such title
        in iteration order on ties), or ``'No chapter'`` when no entry
        starts at or before ``page``.
    """
    best_key = None
    best_page = None
    for key, value in index.items():
        if value == page:
            return key
        # Track the nearest preceding entry without an arbitrary distance
        # cap, so very large page gaps are still resolved.
        if value < page and (best_page is None or value > best_page):
            best_key, best_page = key, value
    return best_key if best_key is not None else 'No chapter'
# Command-line interface. Every option is optional and has a default.
# NOTE(review): --model, --vocoder and --speaker are parsed here but are not
# read anywhere in the visible part of this script.
parser = argparse.ArgumentParser()
for option, kind, fallback, description in (
    ('--pdf', str, 'example.pdf', 'PDF file to read'),
    ('--out', str, 'out', 'Output folder'),
    ('--page', int, 1, 'Page to start reading'),
    ('--limit', int, 350,
     'Maximum number of characters to synthesize at once'),
    ('--model', str, 'facebook/fastspeech2-en-ljspeech',
     'fairseq model to use from HuggingFace Hub'),
    ('--vocoder', str, 'hifigan', 'Vocoder to use from the model'),
    ('--speaker', int, 0, 'Speaker to use from the model'),
):
    parser.add_argument(option, type=kind, default=fallback, help=description)
args = parser.parse_args()
# Check for CUDA first and only request the GPU when it is available;
# unconditionally passing gpu=True makes the script unusable on CPU-only
# machines.
use_cuda = torch.cuda.is_available()
if use_cuda:
    print("Using GPU")
# Init TTS with the target model name
tts = TTS(model_name="tts_models/en/ljspeech/vits",
          progress_bar=True, gpu=use_cuda)
# read the PDF file
pdfReader = PyPDF2.PdfFileReader(args.pdf)
# number of pages in the PDF file
print(pdfReader.numPages)
# get the chapter names: `index` contains every bookmark (nested ones
# flattened), `top_index` only the top-level bookmarks
index = generate_index(pdfReader, pdfReader.outlines)
top_index = generate_index(pdfReader, pdfReader.outlines, recur=False)
# show the index
for key, value in index.items():
    print(key, value)
# create the output folders: one per top-level chapter plus a fallback.
# exist_ok lets the script be re-run (e.g. resumed with --page) against an
# output folder that already exists instead of crashing on the first mkdir.
os.makedirs(args.out, exist_ok=True)
os.makedirs(os.path.join(args.out, 'No chapter'), exist_ok=True)
for key in top_index:
    os.makedirs(os.path.join(args.out, key), exist_ok=True)
# create a playlist; chapter titles can contain non-ASCII characters, so the
# encoding is fixed rather than left to the platform default
m3u = open(os.path.join(args.out, 'playlist.m3u'), 'w', encoding='utf-8')
m3u.write('#EXTM3U\n')
# last group / chapter written to the playlist, used to emit each header once
last_top_chapter = None
last_chapter = None
# Iterate through the pages, from the requested start page up to the last
# page of the document. `i` is the page number handed to pdftotext.
# (Counting one iteration per page of the PDF while starting at --page would
# run past the end of the document whenever --page is greater than 1.)
try:
    for i in range(max(args.page, 1), len(pdfReader.pages) + 1):
        # extract the text from the page using pdftotext command
        text = subprocess.check_output(
            ['pdftotext', '-f', str(i), '-l', str(i), '-layout', args.pdf, '-']
        ).decode('utf-8')
        # Remove \x0c characters, collapse duplicated spaces and trim
        text = re.sub(' +', ' ', text.replace('\x0c', '')).strip()
        # Check if the page is empty
        if not text:
            continue

        # Divide the text into pieces small enough to synthesize at once
        waveforms = []
        for t in limit_text_len([text], args.limit):
            t = t.replace('\n', ' ').strip()
            # Skip pieces that contain no ASCII letter or digit (this also
            # covers empty pieces)
            if not re.search('[a-zA-Z0-9]', t):
                continue
            print(t)
            # Synthesize the text and append its samples to the page audio
            waveforms += tts.tts(text=t, speaker=None, language=None)
        if not waveforms:
            continue

        # Get the chapter name and the top-level chapter folder.
        # NOTE(review): the lookup uses i + 1 although `i` is already the
        # pdftotext page number; if the index stores zero-based pages this
        # is off by two. Left unchanged - confirm against a real PDF.
        chapter = search_page_in_index(index, i + 1)
        out_dir = search_page_in_index(top_index, i + 1) + '/'
        file_name = '[' + str(i) + '] ' + chapter + '.wav'

        # Save the waveform
        tts.synthesizer.save_wav(
            wav=waveforms, path=os.path.join(args.out, out_dir, file_name))

        # Add the file to the playlist, writing the group / title headers
        # only when they change
        if last_top_chapter != out_dir:
            m3u.write('#EXTGRP:' + out_dir[:-1] + '\n')
            last_top_chapter = out_dir
        if last_chapter != chapter:
            m3u.write('#EXTINF:-1,' + chapter + '\n')
            last_chapter = chapter
        m3u.write(out_dir + file_name + '\n')
finally:
    # Close the playlist even if pdftotext or the synthesis fails half-way
    m3u.close()
import os
import sys
def read_files(path):
    """Return the paths of all files below ``path``, including subfolders.

    Each entry is the folder reported by ``os.walk`` joined with the file
    name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
    ]
def convert(files):
    """Convert every ``.wav`` file in ``files`` to Ogg Vorbis using ffmpeg.

    The output is written next to the input with the extension replaced by
    ``.ogg``. The original WAV is deleted only after ffmpeg reports success,
    so a failed or missing ffmpeg never destroys the source audio.

    Args:
        files: Iterable of file paths; entries not ending in ``.wav`` are
            ignored.
    """
    # Local import keeps this block self-contained.
    import subprocess

    for wav_path in files:
        if not wav_path.endswith('.wav'):
            continue
        print(wav_path)
        ogg_path = wav_path[:-4] + '.ogg'
        # Argument list instead of a shell string: file names are derived
        # from PDF chapter titles and may contain quotes, '$' or backticks.
        command = ['ffmpeg', '-i', wav_path,
                   '-acodec', 'libvorbis', '-aq', '4', ogg_path]
        try:
            completed = subprocess.run(command, check=False)
        except OSError as err:
            # ffmpeg cannot be started at all; later files would fail too.
            print('Could not run ffmpeg: ' + str(err), file=sys.stderr)
            return
        if completed.returncode != 0:
            print('ffmpeg failed for ' + wav_path + ', keeping the WAV file',
                  file=sys.stderr)
            continue
        os.remove(wav_path)
def add_metadata(files):
    """Append TITLE and ALBUM Vorbis comments to every ``.ogg`` file.

    The tags are derived from the file name (without folder and extension)
    split on ``'->'``: the last part becomes TITLE; ALBUM is the remaining
    parts joined with ``' - '`` (for more than two parts, the first part and
    the last two are left out). A name without ``'->'`` therefore gets the
    whole name as TITLE and an empty ALBUM.

    Args:
        files: Iterable of file paths; entries not ending in ``.ogg`` are
            ignored.
    """
    # Local import keeps this block self-contained.
    import subprocess

    for ogg_path in files:
        if not ogg_path.endswith('.ogg'):
            continue
        print(ogg_path)
        # basename() instead of split('/') so paths built with os.path.join
        # are handled on every platform.
        parts = os.path.basename(ogg_path)[:-4].split('->')
        album_parts = parts[1:-2] if len(parts) > 2 else parts[0:-1]
        tags = ('TITLE=' + parts[-1], 'ALBUM=' + ' - '.join(album_parts))
        for tag in tags:
            # Argument list instead of a shell string, so quotes or shell
            # metacharacters in the name cannot break or hijack the command.
            command = ['vorbiscomment', '-a', '-t', tag, ogg_path]
            try:
                completed = subprocess.run(command, check=False)
            except OSError as err:
                # The tool cannot be started; later files would fail too.
                print('Could not run vorbiscomment: ' + str(err),
                      file=sys.stderr)
                return
            if completed.returncode != 0:
                print('vorbiscomment failed for ' + ogg_path,
                      file=sys.stderr)
# Entry point of the conversion script: the folder to process is the first
# command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit('Usage: ' + sys.argv[0] + ' <folder>')
path = sys.argv[1]
files = read_files(path)
convert(files)
# Scan again: the WAV files are gone and the OGG files are new.
files = read_files(path)
add_metadata(files)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment