Skip to content

Instantly share code, notes, and snippets.

@THEMVFFINMAN
Created October 4, 2016 16:42
Show Gist options
  • Save THEMVFFINMAN/1563e284c3343f5f8785cc79ff51a55d to your computer and use it in GitHub Desktop.
Save THEMVFFINMAN/1563e284c3343f5f8785cc79ff51a55d to your computer and use it in GitHub Desktop.
# Cleans up a specific type of bad conversion from PDF to EPUB.
import re, os, sys
# Directory holding the unpacked EPUB; every ``.html`` file directly inside it
# is rewritten in place when this file is run as a script.
path = "EXTRACTED EPUB CONTENTS"

# Literal fragments removed (or rewritten) first, in this exact order.  Order
# matters: the bare '1S' / '1L' removals run after the whole-paragraph
# removals so that those paragraphs are dropped entirely rather than being
# left behind as empty <p> elements.
LITERAL_REPLACEMENTS = (
    ('<p class="calibre1"><a id="12"></a>3RD PASS MASTER</p>', ''),
    ('<p class="calibre1">12/29/11 11:58 AM</p>', ''),
    ('<p class="calibre1">1L</p>', ''),
    ('<p class="calibre1">index</p>', ''),
    ('<p class="calibre1">R</p>', ''),
    ('<p class="calibre1">1S</p>', ''),
    ('<p class="calibre1">Intentional Blank</p>', ''),
    ('1S', ''),
    ('1L', ''),
    # Repeated after the bare '1S' removal, as in the original rule order.
    ('<p class="calibre1">1S</p>', ''),
    (' R<', ' <'),
)

# Regular-expression rules applied after the literal ones, again in order.
# Raw strings are used so that the backslashes reach the regex engine without
# going through (invalid) string escapes first.
REGEX_REPLACEMENTS = (
    # Everything from a '1S' paragraph through the id="11" master marker.
    (re.compile(r'<p class="calibre1">1S</p>((.|\n)*)'
                r'<p class="calibre1"><a id="11"></a>3RD PASS MASTER</p>'), ''),
    # Paragraphs that carry the source ".indd" file name.
    (re.compile(r'<p class="calibre1">85885_TheArt(.*).indd(.*)</p>'), ''),
    # "3RD PASS MASTER" paragraphs regardless of the anchor id.
    (re.compile(r'<p class="calibre1"><a id="(.*)"></a>3RD PASS MASTER</p>'), ''),
    # Paragraphs consisting of a space followed by a single italic run.
    (re.compile(r'<p class="calibre1"> <i class="calibre3">(.*)</i></p>'), ''),
    # A paragraph break directly after "Th" is always removed.
    (re.compile(r'Th</p>((\n)*)<p class="calibre1">'), 'Th'),
    # Re-join text split across paragraphs: letter|letter, letter-|letter
    # (the hyphen is dropped too) and letter+whitespace|letter.
    # [A-Za-z] replaces the original [A-z], which also matched the ASCII
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    (re.compile(r'(?<=[A-Za-z])</p>(\n*)<p class="calibre1">(?=[A-Za-z])'), ''),
    (re.compile(r'(?<=[A-Za-z])-</p>(\n*)<p class="calibre1">(?=[A-Za-z])'), ''),
    (re.compile(r'(?<=[A-Za-z]\s)</p>(\n*)<p class="calibre1">(?=[A-Za-z])'), ''),
    # Collapse runs of blank / whitespace-only lines into one newline.
    (re.compile(r'\n\s*\n'), '\n'),
)


def clean_html(data):
    """Return *data* with the conversion artefacts stripped out.

    Applies every rule in ``LITERAL_REPLACEMENTS`` and then every rule in
    ``REGEX_REPLACEMENTS``, each in table order.  The function is pure: it
    performs no I/O and does not modify its argument.

    :param data: text content of one HTML file.
    :returns: the cleaned text.
    """
    for old, new in LITERAL_REPLACEMENTS:
        data = data.replace(old, new)
    for pattern, new in REGEX_REPLACEMENTS:
        data = pattern.sub(new, data)
    return data


def clean_directory(directory):
    """Rewrite in place every ``.html`` file directly inside *directory*.

    Sub-directories are not descended into.  Errors from ``os.listdir`` or
    ``open`` (for example a missing directory) propagate to the caller.

    :param directory: path of the folder containing the extracted EPUB files.
    """
    for name in os.listdir(directory):
        if not name.endswith('.html'):
            continue
        # os.path.join keeps the script usable outside Windows.
        with open(os.path.join(directory, name), "r+") as f:
            data = clean_html(f.read())
            # Overwrite from the start, then cut off any leftover tail,
            # since the cleaned text is usually shorter than the original.
            f.seek(0)
            f.write(data)
            f.truncate()


if __name__ == "__main__":
    clean_directory(path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment