Skip to content

Instantly share code, notes, and snippets.

@alonsopg
Last active October 24, 2016 01:28
Show Gist options
  • Save alonsopg/4fed1fc15bf76fd74f4bf76dc95af0c7 to your computer and use it in GitHub Desktop.
Save alonsopg/4fed1fc15bf76fd74f4bf76dc95af0c7 to your computer and use it in GitHub Desktop.
pdf -> txt
from textract import process
import sys
# Python 2-only workaround: `sys` is reloaded so that `setdefaultencoding` can
# be called, making implicit str/unicode conversions use UTF-8.
# NOTE(review): the `reload` builtin and `sys.setdefaultencoding` do not exist
# on Python 3 -- confirm the target interpreter before reusing this script.
reload(sys)
sys.setdefaultencoding("utf-8")
# For Spanish ('spa'), the language data has to be downloaded from
# https://github.com/tesseract-ocr/langdata/tree/master/spa and placed in
# the corresponding folder.
def transform_files(input_directory, output_directory):
    """OCR every PDF in ``input_directory`` into a UTF-8 ``.txt`` file.

    Each ``*.pdf`` directly inside ``input_directory`` is passed to
    ``textract.process`` using the Tesseract method with the Spanish
    (``'spa'``) language model. The extracted text, followed by a single
    newline, is written to ``output_directory`` under the same base name
    with a ``.txt`` extension.

    Files are handled one at a time (OCR, then write), so results that were
    already written survive a failure on a later PDF and only one document's
    text is held in memory at once.

    Args:
        input_directory: Folder that is scanned (non-recursively) for PDFs.
        output_directory: Folder that receives the text files; it is created
            if it does not exist yet.

    Returns:
        None. One progress line per file is printed to stdout.
    """
    import codecs
    import glob
    import os

    # Create the destination up front so the first write cannot fail on a
    # missing folder. An empty string means "current directory".
    if output_directory and not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    # glob gives no ordering guarantee; sort for a reproducible run order.
    pdf_paths = sorted(glob.glob(os.path.join(input_directory, '*.pdf')))

    for pdf_path in pdf_paths:
        original_filename = os.path.basename(pdf_path)
        stem, _ = os.path.splitext(original_filename)
        path = os.path.join(output_directory, stem + '.txt')
        print('Transforming: %s => %s' % (original_filename, path,))

        text = process(pdf_path, method='tesseract', language='spa')
        # Decode explicitly instead of relying on the interpreter's default
        # encoding: a byte string cannot be concatenated with "\n" on
        # Python 3, and on Python 2 the implicit conversion is ASCII unless
        # the default encoding was changed globally.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        with codecs.open(path, "w", encoding='utf8') as out_file:
            out_file.write(text + "\n")
# Hard-coded locations: the folder scanned for PDFs and the folder that
# receives the generated .txt files.
input_d = '/Users/user/Desktop/Imagenes/'
out_d = '/Users/user/Desktop/ImagenesTXT_OCR/'
# NOTE(review): `%time` is an IPython/Jupyter line magic, not Python syntax;
# this line only runs inside an IPython session, where it also reports how
# long the call took.
%time transform_files(input_d, out_d)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment