Skip to content

Instantly share code, notes, and snippets.

@dzakyputra
Created November 5, 2019 05:47
Show Gist options
  • Save dzakyputra/cdddbc0d59ed6a1bbb33c377a57b361b to your computer and use it in GitHub Desktop.
Save dzakyputra/cdddbc0d59ed6a1bbb33c377a57b361b to your computer and use it in GitHub Desktop.
import os
import io
import sys
import subprocess
import tesserocr
from pyspark.sql import SparkSession
# Create the Spark session for this job, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("parallel-test")
    .getOrCreate()
)
# Define a function to run tesserocr (jpg to txt)
def run_tesseract(x):
    """OCR one image with tesserocr and save the text next to its folder.

    The text is written to ``<grandparent of image>/output/<image stem>.txt``,
    e.g. ``/data/images/a.jpg`` -> ``/data/output/a.txt``.

    Args:
        x: Sequence whose first element is the path of the image to convert.

    Returns:
        None. The recognised text is written to disk encoded as UTF-8.

    Raises:
        OSError: If the output directory cannot be created or the text file
            cannot be written.
    """
    # Drop stray whitespace (such as a trailing carriage return from a
    # Windows-style path list) so it does not end up in the file name.
    image_path = x[0].strip()
    if not image_path:
        # Blank record: there is nothing to OCR.
        return

    # The output directory is a sibling of the directory holding the image.
    parent_of_image_dir = os.path.dirname(os.path.dirname(image_path))
    output_dir = os.path.join(parent_of_image_dir, 'output')

    # Several calls may try to create the directory at the same time, so an
    # "already exists" failure is fine; any other failure is a real error.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    # Convert jpg to txt using tesserocr
    text = tesserocr.file_to_text(image_path, lang='ktp')

    # splitext copes with extensions of any length (.jpg, .jpeg, .tiff, ...).
    stem = os.path.splitext(os.path.basename(image_path))[0]
    txt_path = os.path.join(output_dir, stem + '.txt')

    # Write the result to .txt file; the context manager closes the handle.
    with io.open(txt_path, 'w', encoding='utf8') as out_file:
        out_file.write(text)
# Read the list of jpg paths (one per line of tes.txt) into an RDD. Each
# line is wrapped in a list because run_tesseract reads the path from x[0].
data = (
    spark.sparkContext
    .textFile('tes.txt')
    .map(lambda line: line.split('\n'))
)
# Apply the OCR step to every record, then collect to run the job.
result = data.map(run_tesseract)
result.collect()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment