Skip to content

Instantly share code, notes, and snippets.

Created March 24, 2019 07:52
Show Gist options
  • Save epogrebnyak/30d73ae57642173d0b9c31d85a362b7d to your computer and use it in GitHub Desktop.
Save epogrebnyak/30d73ae57642173d0b9c31d85a362b7d to your computer and use it in GitHub Desktop.
# Source of truth:
# Option 0
# =========
# tika
import os
from tika import parser
from shutil import copyfile
from pathlib import Path
# Fixed, short filename that every processed file is copied to.
WORKER = "worker"


def make_copy(path):
    """Copy the file at *path* to the short name ``WORKER`` and return it.

    Working on a copy with a short, fixed name avoids problems caused by
    long original filenames. The copy is overwritten on every call.
    """
    short_name = WORKER
    copyfile(path, short_name)
    return short_name
def get_raw_text(path):
    """Return the text tika extracts from *path*, with newlines removed.

    The file is parsed with ``tika.parser.from_file`` and the ``'content'``
    entry of the result is used. When tika reports no content (``None``),
    an empty string is returned instead of the literal text ``"None"``.
    """
    content = parser.from_file(path)['content']
    if content is None:
        # str(None) would otherwise end up as the word "None" in the output.
        return ""
    return str(content).replace("\n", "")
def text_path(path, subfolder = "text"):
    """Return the ``.txt`` output path inside *subfolder* for *path*.

    The extension of *path* is replaced by ``.txt``; spaces in the name
    become underscores and apostrophes are dropped. *subfolder* is created
    if it does not exist yet, so the returned path can be written to
    directly.

    Args:
        path: Name of the source file (e.g. a PDF).
        subfolder: Directory that receives the text files.

    Returns:
        ``os.path.join(subfolder, <sanitised name>.txt)``.
    """
    filename, _ext = os.path.splitext(path)
    txt_name = filename.replace(" ", "_").replace("'", "") + ".txt"
    # Always create the folder and always return the path; previously the
    # path was only returned when the folder was missing.
    os.makedirs(subfolder, exist_ok=True)
    return os.path.join(subfolder, txt_name)
def to_text(path, content):
    """Write *content* as UTF-8 to the text file derived from *path*.

    The destination is whatever ``text_path(path)`` returns.
    """
    destination = Path(text_path(path))
    destination.write_text(content, encoding="utf-8")
def list_pdf(folder):
    """Yield ``(worker_copy, original_name)`` for each PDF in *folder*.

    Each PDF is first copied with ``make_copy``; the first item of the pair
    is the path of that copy and the second is the file's name as listed in
    *folder*. ``make_copy`` reuses one destination file, so consume each
    pair before advancing the generator.

    Args:
        folder: Directory to scan (not recursive).
    """
    for name in os.listdir(folder):
        _stem, ext = os.path.splitext(name)
        # Compare case-insensitively so "REPORT.PDF" is picked up as well.
        if ext.lower() == ".pdf":
            # os.listdir returns bare names; join with the folder so the
            # copy also works when *folder* is not the current directory.
            yield make_copy(os.path.join(folder, name)), name
# Extract the text of every PDF in the current directory and store it
# under the name derived from the original filename.
for worker_copy, original_name in list_pdf("."):
    to_text(original_name, get_raw_text(worker_copy))
# Option 1
# =========
#import PyPDF2
#read_pdf = PyPDF2.PdfFileReader(PATH)
#number_of_pages = read_pdf.getNumPages()
#page = read_pdf.getPage(0)
#page_content = page.extractText()
#print (page_content)
# Option 2
# =========
#import textract
#text = textract.process(PATH)
# wrapper around pdftotext / pdfminer.six
# Option 3
# =========
# pdfminer.six
# Option 4
# =========
# xpdf binary
# Option 5
# =========
# pdftotext
Copy link

Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment