dzakyputra · November 5, 2019 05:47
diff --git a/parallel_spark.py b/parallel_spark.py
 import os
 import io
 import sys
 import subprocess
 import tesserocr

 from pyspark.sql import SparkSession

 # Get the Spark session for this job, creating it if none exists yet
 spark = (
     SparkSession.builder
     .appName("parallel-test")
     .getOrCreate()
 )

 # Define a function to run tesserocr (jpg to txt)
 def run_tesseract(x):
     """Run OCR on one image and write the recognised text to a .txt file.

     Args:
         x: Indexable record whose first element is the image path, with
             '/' as the path separator.

     The text is written to ``<two levels above the file>/output/<name>.txt``,
     i.e. an ``output`` directory that sits beside the directory holding the
     image. Nothing is returned.

     Raises:
         OSError: If the output directory cannot be created or the text
             file cannot be written.
     """
     image_path = x[0]

     # Define path: swap the image's parent directory for 'output'
     path_breakdown = image_path.split('/')
     output_path = '/'.join(path_breakdown[:-2] + ['output'])

     # Make output directory. Several tasks may attempt this at once, so an
     # already existing directory is fine; any other failure is re-raised
     # instead of being silently swallowed.
     try:
         os.makedirs(output_path)
     except OSError:
         if not os.path.isdir(output_path):
             raise

     # Convert jpg to txt using tesserocr
     text = tesserocr.file_to_text(image_path, lang='ktp')

     # Strip the real extension rather than assuming it is 4 characters
     # long, so names such as 'scan.jpeg' do not become 'scan..txt'.
     base_name = os.path.splitext(path_breakdown[-1])[0]

     # Write the result to .txt file; the with block closes it on exit
     with io.open(output_path + '/' + base_name + '.txt', 'w', encoding='utf8') as out_file:
         out_file.write(text)


 # Load the image paths listed in tes.txt as an RDD. split('\n') turns each
 # record into a list, which is why run_tesseract reads the path from x[0].
 data = (
     spark.sparkContext
     .textFile('tes.txt')
     .map(lambda line: line.split('\n'))
 )

 # Apply the OCR step to every record; collect() triggers the evaluation
 result = data.map(run_tesseract)
 result.collect()
	import os
	import io
	import sys
	import subprocess
	import tesserocr

	from pyspark.sql import SparkSession

	# Get the Spark session for this job, creating it if none exists yet
	spark = (
	    SparkSession.builder
	    .appName("parallel-test")
	    .getOrCreate()
	)

	# Define a function to run tesserocr (jpg to txt)
	def run_tesseract(x):
	    """Run OCR on one image and write the recognised text to a .txt file.

	    Args:
	        x: Indexable record whose first element is the image path, with
	            '/' as the path separator.

	    The text is written to ``<two levels above the file>/output/<name>.txt``,
	    i.e. an ``output`` directory that sits beside the directory holding the
	    image. Nothing is returned.

	    Raises:
	        OSError: If the output directory cannot be created or the text
	            file cannot be written.
	    """
	    image_path = x[0]

	    # Define path: swap the image's parent directory for 'output'
	    path_breakdown = image_path.split('/')
	    output_path = '/'.join(path_breakdown[:-2] + ['output'])

	    # Make output directory. Several tasks may attempt this at once, so an
	    # already existing directory is fine; any other failure is re-raised
	    # instead of being silently swallowed.
	    try:
	        os.makedirs(output_path)
	    except OSError:
	        if not os.path.isdir(output_path):
	            raise

	    # Convert jpg to txt using tesserocr
	    text = tesserocr.file_to_text(image_path, lang='ktp')

	    # Strip the real extension rather than assuming it is 4 characters
	    # long, so names such as 'scan.jpeg' do not become 'scan..txt'.
	    base_name = os.path.splitext(path_breakdown[-1])[0]

	    # Write the result to .txt file; the with block closes it on exit
	    with io.open(output_path + '/' + base_name + '.txt', 'w', encoding='utf8') as out_file:
	        out_file.write(text)


	# Load the image paths listed in tes.txt as an RDD. split('\n') turns each
	# record into a list, which is why run_tesseract reads the path from x[0].
	data = (
	    spark.sparkContext
	    .textFile('tes.txt')
	    .map(lambda line: line.split('\n'))
	)

	# Apply the OCR step to every record; collect() triggers the evaluation
	result = data.map(run_tesseract)
	result.collect()