Skip to content

Instantly share code, notes, and snippets.

@blu3r4d0n
Last active August 11, 2024 04:46
Show Gist options
  • Save blu3r4d0n/bc9d5f63e246fd9a9fa7efc031b20dc2 to your computer and use it in GitHub Desktop.
Save blu3r4d0n/bc9d5f63e246fd9a9fa7efc031b20dc2 to your computer and use it in GitHub Desktop.
This is a quick and dirty script to get a dataframe of parties to SC Supreme Court cases that have filed opinions. This is the first step to extracting information about opinions filed by the Court. NOT ALL EDGE CASES ARE HANDLED YET.
import pandas as pd
import requests
from io import BytesIO
import urllib
from pathlib import Path
import pdfplumber
# HTTP headers sent with every PDF request; replace the placeholder with a
# real user-agent string before running the script.
headers = {'user-agent': 'INSERT YOUR DESIRED USER AGENT HERE'}
def extract_text_between(start_text, stop_text, full_text):
    """Return the part of *full_text* between *start_text* and *stop_text*.

    The result starts right after the first occurrence of *start_text* and
    ends right before the first occurrence of *stop_text* that follows it.

    Args:
        start_text: Marker that precedes the wanted text.
        stop_text: Marker that follows the wanted text.
        full_text: String to search.

    Returns:
        The extracted substring, or a fixed explanatory message (a string,
        so callers can keep treating the result as text) when either marker
        is missing.
    """
    not_found = "Both start or stop text not found in the input string."

    # Test the raw ``find`` result *before* adding the marker length;
    # otherwise a missing marker (-1) is masked by the addition.
    marker_position = full_text.find(start_text)
    if marker_position == -1:
        return not_found
    start_index = marker_position + len(start_text)

    # Only look for the stop marker after the start marker, so an earlier
    # stray occurrence cannot produce an empty or reversed slice.
    stop_index = full_text.find(stop_text, start_index)
    if stop_index == -1:
        return not_found

    return full_text[start_index:stop_index]
def get_pdf(url, timeout=30):
    """Download *url* and return the response body as an in-memory stream.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        A ``BytesIO`` wrapping the raw response content. The HTTP status is
        not checked, so an error page is returned as-is.
    """
    print(url)
    response = requests.get(url, headers=headers, timeout=timeout)
    return BytesIO(response.content)
def get_fonts(pdf_byte_array):
    """Print a PDF's metadata and the distinct rounded font sizes it uses.

    Args:
        pdf_byte_array: Anything ``pdfplumber.open`` accepts; presumably the
            ``BytesIO`` produced by ``get_pdf`` (verify against callers).

    Prints the metadata, the set of character sizes rounded to whole
    numbers, and how many such sizes there are. Returns nothing.
    """
    with pdfplumber.open(pdf_byte_array) as document:
        metadata = document.metadata
        # Collapse to unique sizes first, then round, in ascending order.
        unique_sizes = {char['size'] for char in document.chars}
        sizes = {round(value, 0) for value in sorted(unique_sizes)}
        print(metadata)
        print(sizes)
        print(len(sizes))
# URLs of the opinion PDFs to process, one per number in the range.
pdfs = [
    f'https://www.sccourts.org/opinions/HTMLFiles/SC/{number}.pdf'
    for number in range(28000, 28228)
]

# Download every PDF up front, in URL order, and index the streams by URL.
pdf_bytes = list(zip(pdfs, map(get_pdf, pdfs)))
pdf_dict = {url: stream for url, stream in pdf_bytes}

# Accumulators filled while parsing the downloaded opinions.
cases = []
max_parties = 0
case_max_parties = ''
# Header layouts seen at the top of an opinion, and the line that follows the
# party names in the caption.
_LONG_HEADER = 'THE STATE OF SOUTH CAROLINA\nIn The Supreme Court\n'
_SHORT_HEADER = 'The Supreme Court of South Carolina\n'
_CAPTION_END = '\nAppellate Case'
_PARTY_SLOTS = 4


def _caption_text(pdf, first_rect):
    """Return the text that precedes the first rectangle drawn in *pdf*.

    Pages before the rectangle's page contribute their full text; the
    rectangle's own page contributes only what lies above the rectangle.
    """
    # pdfplumber numbers pages from 1, while ``pdf.pages`` is a 0-based list.
    # Using the number directly as an index made the cropping branch
    # unreachable, so the whole page text was always taken.
    rect_page_index = first_rect['page_number'] - 1
    chunks = [page.extract_text() or '' for page in pdf.pages[:rect_page_index]]

    rect_page = pdf.pages[rect_page_index]
    try:
        # Use the page's own width rather than assuming US-letter (612pt).
        region = rect_page.crop((0, 0, rect_page.width, first_rect['top']))
    except ValueError:
        # NOTE(review): pdfplumber is expected to reject empty or
        # out-of-bounds boxes with ValueError (e.g. a rectangle at the very
        # top of the page) - confirm against the installed version. Fall
        # back to the full page text, which is what the script used before.
        region = rect_page
    chunks.append(region.extract_text() or '')
    return ''.join(chunks)


def _adversarial_parties(infoblock):
    """Split an "A v. B" caption into its parties, padded/cut to four slots."""
    header = _LONG_HEADER if 'In The Supreme Court' in infoblock else _SHORT_HEADER
    caption = extract_text_between(header, _CAPTION_END, infoblock)
    parties = caption.split('\nv.\n')
    # Parties beyond the fourth are dropped; missing ones are None.
    return (parties + [None] * _PARTY_SLOTS)[:_PARTY_SLOTS]


def _matter_party(infoblock):
    """Return the caption of an "In the Matter of" opinion, or None."""
    if _LONG_HEADER in infoblock:
        header = _LONG_HEADER
    elif _SHORT_HEADER + 'In the Matter of' in infoblock:
        header = _SHORT_HEADER
    else:
        # Unrecognised header layout: leave the party empty.
        return None
    party = extract_text_between(header, _CAPTION_END, infoblock)
    print(party)
    return party


# Parse the caption of every downloaded opinion into one record per case.
# TODO: consolidated captions joined by "\nAnd\n" are not handled yet.
for case_url, pdf_bytes in pdf_dict.items():
    print('-----------------')
    print(case_url)
    with pdfplumber.open(pdf_bytes) as pdf:
        case_num = Path(urllib.parse.urlparse(case_url).path).stem
        print(case_num)
        rects = pdf.rects
        if not rects:
            # No rectangle to anchor on, so the caption cannot be located.
            print("BAD FORMAT")
            continue
        infoblock = _caption_text(pdf, rects[0])

    first_party = second_party = third_party = fourth_party = None
    single_party = "in the matter of" in infoblock.lower()
    if single_party:
        first_party = _matter_party(infoblock)
    else:
        first_party, second_party, third_party, fourth_party = (
            _adversarial_parties(infoblock)
        )

    cases.append({"case_num": case_num,
                  "single_party": single_party,
                  "first_party": first_party,
                  "second_party": second_party,
                  "third_party": third_party,
                  "fourth_party": fourth_party
                  })
# Build the result table. Columns are declared explicitly so the frame keeps
# its schema (and the URL column can still be derived) even when no opinion
# could be parsed and ``cases`` is empty.
df = pd.DataFrame(
    cases,
    columns=[
        "case_num",
        "single_party",
        "first_party",
        "second_party",
        "third_party",
        "fourth_party",
    ],
)
df['url'] = 'https://www.sccourts.org/opinions/HTMLFiles/SC/' + df['case_num'] + '.pdf'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment