Created
November 3, 2017 16:35
-
-
Save KFDCompiled/f778eee765d831ac4605e906e0ad41f7 to your computer and use it in GitHub Desktop.
Python implementation of scal.pl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Locate one judge's section in the text extracted from the SLC court calendar.

The script converts ``SLC_Calendar.pdf`` to text with textract, splits it into
whitespace-separated tokens per line, and then searches for:

* ``start`` - the index of the "Page" header line that opens the section for
  judge BERNARDS-GOODMAN on September 29, 2017;
* ``end``   - the index of the first later page marker that is not followed by
  that judge's name.

Single-argument ``print(...)`` calls are used so the script runs unchanged on
both Python 2 and Python 3.
"""
# import wget
import textract

# url = 'https://www.utcourts.gov/cal/data/SLC_Calendar.pdf'
# pdf = wget.download(url)

PDF_PATH = "SLC_Calendar.pdf"
JUDGE_NAME = "BERNARDS-GOODMAN"
COURT_DISTRICT = "3RD"
HEARING_MONTH = "September"
HEARING_DAY = "29"
HEARING_YEAR = "2017"


def _token(rows, row, col):
    """Return ``rows[row][col]``, or ``""`` when that row or token is missing.

    Blank lines split into empty lists and the look-ahead checks reach past
    the current line, so unguarded indexing would raise IndexError.
    """
    if 0 <= row < len(rows) and 0 <= col < len(rows[row]):
        return rows[row][col]
    return ""


def _find_start(rows):
    """Return the index of the page header opening the wanted section, or None."""
    for pos in range(len(rows)):
        if (_token(rows, pos, 0) == "Page"
                and _token(rows, pos + 2, 0) == COURT_DISTRICT
                and _token(rows, pos + 3, 1) == JUDGE_NAME
                and _token(rows, pos + 7, 0) == HEARING_MONTH
                # Tokens are strings, so compare against strings (the original
                # compared against ints and could never match).  The day may
                # carry a trailing comma, e.g. "29,".
                and _token(rows, pos + 7, 1).rstrip(",") == HEARING_DAY
                # NOTE(review): the year is read from token index 3 as in the
                # original; confirm against the real PDF layout.
                and _token(rows, pos + 7, 3) == HEARING_YEAR):
            return pos
    return None


def _find_end(rows, first):
    """Return the index of the first page marker at or after ``first`` that is
    not followed by the judge's name, or None if there is none.
    """
    for pos in range(first, len(rows)):
        # NOTE(review): "Page" is looked up at token index 1 here but at index
        # 0 in the start search; kept as in the original - confirm intent.
        if (_token(rows, pos, 1) == "Page"
                and _token(rows, pos + 1, 1) != JUDGE_NAME):
            return pos
    return None


print("Processing pdf into text...")
pdf_text_raw = textract.process(PDF_PATH)  # Extract the PDF's text
if isinstance(pdf_text_raw, bytes):
    # Decode so the comparisons against str literals also work on Python 3.
    pdf_text_raw = pdf_text_raw.decode("utf-8")

print("Formatting...")
pdf_text_lines = pdf_text_raw.splitlines()  # One entry per line of text

print("Structuring data...")
pdf_text_array = [line.split() for line in pdf_text_lines]  # Lines -> token lists

print("Searching for start of entries...")
start = _find_start(pdf_text_array)
if start is None:
    raise SystemExit("Start of entries not found in %s" % PDF_PATH)
print("Found start position %d" % start)

end = _find_end(pdf_text_array, start)
if end is None:
    raise SystemExit("End of entries not found in %s" % PDF_PATH)
print("Found end position %d" % end)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment