Created
March 24, 2019 07:52
-
-
Save epogrebnyak/30d73ae57642173d0b9c31d85a362b7d to your computer and use it in GitHub Desktop.
PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source of truth: | |
# https://stackoverflow.com/questions/34837707/how-to-extract-text-from-a-pdf-file | |
# Option 0
# ========= | |
# tika | |
import os | |
from tika import parser | |
from shutil import copyfile | |
from pathlib import Path | |
# Short, fixed file name (relative to the current working directory) that
# make_copy() uses as the destination for the temporary copy of each PDF.
WORKER = "worker"
def make_copy(path):
    """Copy the file at *path* to the fixed name ``WORKER`` and return that name.

    Working on a short-named copy avoids problems caused by long file names.
    Any existing file called ``WORKER`` is overwritten.
    """
    target = WORKER
    copyfile(path, target)
    return target
def get_raw_text(path):
    """Extract the text of the document at *path* with tika, newlines removed.

    Returns:
        The extracted text as a single string with every ``"\\n"`` deleted,
        or an empty string when tika reports no content for the document.
    """
    content = parser.from_file(path)["content"]
    if content is None:
        # tika yields None when nothing could be extracted (e.g. image-only
        # PDFs); str(None) would otherwise produce the literal text "None".
        return ""
    return str(content).replace("\n", "")
def text_path(path, subfolder="text"):
    """Return the ``.txt`` output path inside *subfolder* for the file *path*.

    The extension of *path* is replaced by ``.txt``, spaces become
    underscores and apostrophes are dropped. *subfolder* is created,
    including any missing parent directories, if it does not exist yet.

    Args:
        path: Name of the source file (e.g. a PDF file name).
        subfolder: Directory that will hold the text file.

    Returns:
        ``os.path.join(subfolder, <sanitized name>.txt)``.
    """
    stem, _ext = os.path.splitext(path)
    txt_name = stem.replace(" ", "_").replace("'", "") + ".txt"
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # exists() + mkdir() and also works for nested subfolders.
    os.makedirs(subfolder, exist_ok=True)
    return os.path.join(subfolder, txt_name)
def to_text(path, content):
    """Write *content* as UTF-8 to the text file that text_path() maps *path* to."""
    destination = Path(text_path(path))
    destination.write_text(content, encoding="utf-8")
def list_pdf(folder):
    """Yield ``(worker_copy, name)`` for every PDF file directly inside *folder*.

    Args:
        folder: Directory to scan (not recursive).

    Yields:
        A pair of the path returned by make_copy() for the file and the
        file's bare name as listed by ``os.listdir``.
    """
    for name in os.listdir(folder):
        _stem, ext = os.path.splitext(name)
        # Compare case-insensitively so that "REPORT.PDF" is picked up too.
        if ext.lower() == ".pdf":
            # os.listdir() returns bare names, so the folder has to be joined
            # back on before copying; otherwise only folder="." would work.
            yield make_copy(os.path.join(folder, name)), name
# Script driver: convert every PDF in the current directory to a text file.
# NOTE(review): this runs at import time (no ``if __name__ == "__main__"``
# guard) and the temporary copy created by make_copy() is never removed.
for path_temp, path in list_pdf("."):
    # Show which source file is being processed.
    print(path)
    # Extract from the short-named copy, not from the original file name.
    content = get_raw_text(path_temp)
    print(content)
    # The output file name is derived from the original file name.
    to_text(path, content)
# Option 1
# ========= | |
#import PyPDF2 | |
#read_pdf = PyPDF2.PdfFileReader(PATH) | |
#number_of_pages = read_pdf.getNumPages() | |
#page = read_pdf.getPage(0) | |
#page_content = page.extractText() | |
#print (page_content) | |
# Option 2
# ========= | |
#import textract | |
#text = textract.process(PATH) | |
# a wrapper around pdftotext / pdfminer.six
# Option 3
# ========= | |
# pdfminer.six
# Option 4
# ========= | |
# xpdf binary | |
# Option 5
# ========= | |
# pdftotext |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://stackoverflow.com/questions/34837707/how-to-extract-text-from-a-pdf-file