Created
November 3, 2017 16:36
-
-
Save KFDCompiled/e2fd68df6652d7d35ed679b9d5651a3c to your computer and use it in GitHub Desktop.
Second Attempt at Python Implementation of scal.pl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Locate the block of calendar entries for one judge inside the text
# extracted from SLC_Calendar.pdf.
#
# NOTE: the download step is disabled; SLC_Calendar.pdf must already exist in
# the working directory. It was originally fetched with wget from
# https://www.utcourts.gov/cal/data/SLC_Calendar.pdf

PDF_PATH = "SLC_Calendar.pdf"
PAGE_MARKER = "Page"
DISTRICT = "3RD"
JUDGE = "BERNARDS-GOODMAN"
# Tokens are produced by str.split(), so the day and year are strings.
# NOTE(review): if the PDF renders the date as "September 29, 2017" the day
# token will be "29," -- confirm against real extractor output.
DATE_TOKENS = ("September", "29", "2017")


def token_at(lines, row, col):
    """Return lines[row][col], or None when either index is out of range."""
    if 0 <= row < len(lines) and 0 <= col < len(lines[row]):
        return lines[row][col]
    return None


def find_start(lines):
    """Return (row, col) of the "Page" token that opens the wanted entries.

    A row qualifies when it contains "Page", the second token two rows below
    is the district, the second token three rows below is the judge, and the
    row seven rows below starts with the expected date tokens.

    lines is a list of token lists, one per text line.
    Returns None when no row qualifies.
    """
    for row, tokens in enumerate(lines):
        if PAGE_MARKER not in tokens:
            continue
        date_matches = all(
            token_at(lines, row + 7, offset) == expected
            for offset, expected in enumerate(DATE_TOKENS)
        )
        if (token_at(lines, row + 2, 1) == DISTRICT
                and token_at(lines, row + 3, 1) == JUDGE
                and date_matches):
            # Stop at the first hit instead of letting later pages overwrite it.
            return (row, tokens.index(PAGE_MARKER))
    return None


def find_end(lines, start_row):
    """Return the first row after start_row that opens another judge's page.

    A row qualifies when its second token is "Page" and the second token of
    the following row is not the judge. A missing following row or token
    counts as "not the judge".

    Scanning begins on the row after start_row so the start header itself is
    never reported as the end. Returns None when no row qualifies.
    """
    for pos in range(start_row + 1, len(lines)):
        if (token_at(lines, pos, 1) == PAGE_MARKER
                and token_at(lines, pos + 1, 1) != JUDGE):
            return pos
    return None


def main():
    """Extract the PDF text and report where the wanted entries start and end."""
    # Imported here so the search helpers can be used without textract installed.
    import textract

    print("Processing pdf into text...")
    pdf_text_raw = textract.process(PDF_PATH)
    if isinstance(pdf_text_raw, bytes):
        # textract returns a byte string; decode once so tokens compare as text.
        pdf_text_raw = pdf_text_raw.decode("utf-8")

    print("Formatting...")
    pdf_text_lines = pdf_text_raw.splitlines()

    print("Structuring data...")
    pdf_text_array = [line.split() for line in pdf_text_lines]

    print("Searching for start of entries...")
    start = find_start(pdf_text_array)
    if start is None:
        raise SystemExit("Start of entries not found")
    print("Found start position %s" % (start,))

    end = find_end(pdf_text_array, start[0])
    if end is None:
        raise SystemExit("End of entries not found")
    print("Found end position %s" % end)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment