Created
November 5, 2019 05:47
-
-
Save dzakyputra/cdddbc0d59ed6a1bbb33c377a57b361b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import io | |
import sys | |
import subprocess | |
import tesserocr | |
from pyspark.sql import SparkSession | |
# Create (or reuse) the Spark session shared by the rest of the script
spark = (
    SparkSession.builder
    .appName("parallel-test")
    .getOrCreate()
)
# Define a function to run tesserocr (jpg to txt)
def run_tesseract(x, lang='ktp'):
    """OCR a single image with tesserocr and write the text to disk.

    The text is saved as ``<image stem>.txt`` inside an ``output`` directory
    that sits beside the directory containing the image (for
    ``/data/img/a.jpg`` the result is ``/data/output/a.txt``).

    Args:
        x: The image path, or a list/tuple whose first element is the image
            path (the shape produced by splitting a line of the path list).
        lang: Language name handed to ``tesserocr.file_to_text``.
            Defaults to ``'ktp'``, the value the script originally
            hard-coded.

    Returns:
        None. The recognised text is written to disk as a side effect.

    Raises:
        OSError: If the output directory cannot be created or the text file
            cannot be written.
    """
    image_path = x[0] if isinstance(x, (list, tuple)) else x

    # Output goes to a sibling of the image's directory: <image dir>/../output
    image_dir, image_name = os.path.split(image_path)
    output_dir = os.path.join(os.path.dirname(image_dir), 'output')

    # Several tasks may try to create the directory at once, so "already
    # exists" is expected; any other failure (e.g. permissions) is re-raised
    # instead of being silently swallowed.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    # Convert jpg to txt using tesserocr
    text = tesserocr.file_to_text(image_path, lang=lang)

    # splitext copes with extensions of any length (.jpg, .jpeg, .png, ...)
    # rather than assuming exactly three characters after the dot.
    stem = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_dir, stem + '.txt')

    # The with-statement closes the file; no explicit close() is needed.
    with io.open(output_file, 'w', encoding='utf8') as out:
        out.write(text)
# Read the list of jpg paths from tes.txt into an RDD and OCR every entry
path_lines = spark.sparkContext.textFile('tes.txt')
# Each record becomes a list, which run_tesseract indexes with [0]
data = path_lines.map(lambda line: line.split('\n'))
result = data.map(run_tesseract)
# map() is lazy; collect() is the action that actually runs the OCR tasks
result.collect()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment