Created
July 19, 2020 10:39
-
-
Save cftang0827/36caa8c594e02d06c8634c05fda97c7c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from glob import glob | |
import subprocess as sb | |
def parse_html(filename):
    """Convert the OCR sidecar text of ``filename`` into a minimal HTML page.

    Reads ``./txt/<stem>.txt`` and writes ``./html/<stem>_text.html``, where
    ``<stem>`` is the part of ``filename`` before its first ``.``.  Every
    line of text is HTML-escaped and its newline is replaced by ``<br>``.

    Args:
        filename: Name of the processed PDF (e.g. ``"scan.pdf"``); only the
            stem is used to locate the sidecar and to name the HTML file.

    Raises:
        OSError: If the sidecar text cannot be read or the HTML file cannot
            be written (for example when ``txt``/``html`` do not exist).
    """
    import html  # local import: the file's import header does not provide it

    # Keep the "text before the first dot" rule: the driver script names the
    # sidecar file the same way, so both sides must agree on the stem.
    stem = filename.split(".")[0]
    sidecar_path = os.path.join(".", "txt", stem + ".txt")
    html_path = os.path.join(".", "html", stem + "_text.html")

    # The original print had a misplaced bracket and split on "..txt",
    # printing a list; report the sidecar name that is about to be read.
    print(stem + ".txt")

    with open(sidecar_path, encoding="utf-8") as src:
        lines = src.readlines()

    with open(html_path, "w", encoding="utf-8") as dst:
        dst.write("<html>\n<div>")
        # Escape markup characters so OCR text containing "<" or "&" cannot
        # break the generated page; quotes are harmless in element content.
        dst.writelines(
            html.escape(line, quote=False).replace("\n", "<br>")
            for line in lines
        )
        dst.write("</div>\n</html>")
# Driver script: OCR every PDF in the current directory with ocrmypdf, keep
# the recognised text in ./txt and render it as a simple page in ./html.

print("Generate txt folder")
os.makedirs("txt", exist_ok=True)

print("Generate html folder")
os.makedirs("html", exist_ok=True)

print("Get all pdf files in this folder!")
all_pdfs = glob("*.pdf")
print("Overall {} pdf files.".format(len(all_pdfs)))
print("################################")

for old_file in all_pdfs:
    print("Remove space in file name")
    # str.replace returns a new string; the result has to be kept, otherwise
    # the name never changes and the rename below is a no-op.
    file = old_file.replace(" ", "")
    if file != old_file:
        if os.path.exists(file):
            # Renaming would overwrite (or fail on) another PDF; leave both
            # files untouched and move on.
            print("Skip {}: {} already exists".format(old_file, file))
            continue
        os.rename(old_file, file)

    print("Analyzing file {}".format(file))
    # Stem is the text before the first dot, matching what parse_html uses.
    sidecar = "./txt/" + file.split(".")[0] + ".txt"
    # NOTE: every iteration writes the OCRed PDF to the same "output.pdf",
    # so only the last processed document's PDF survives.
    status = sb.call(
        ["ocrmypdf", file, "output.pdf", "--sidecar", sidecar, "--force-ocr"]
    )
    if status != 0:
        print("ocrmypdf exited with status {} for {}".format(status, file))
    if not os.path.isfile(sidecar):
        # Without the sidecar text parse_html would raise and abort the
        # remaining files; skip just this one.
        print("No text extracted for {}, skip html".format(file))
        continue

    print("Generate html file")
    parse_html(file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment