Last active
August 11, 2024 04:46
-
-
Save blu3r4d0n/bc9d5f63e246fd9a9fa7efc031b20dc2 to your computer and use it in GitHub Desktop.
This is a quick-and-dirty script that builds a DataFrame of the parties to South Carolina Supreme Court cases in which opinions have been filed. It is the first step toward extracting information about opinions filed by the Court. NOT ALL EDGE CASES ARE HANDLED YET.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib
import urllib.parse
from io import BytesIO
from pathlib import Path

import pandas as pd
import pdfplumber
import requests
# HTTP headers sent with each PDF download in get_pdf(); replace the
# placeholder with a real User-Agent string before running.
headers = {'user-agent': 'INSERT YOUR DESIRED USER AGENT HERE'}
def extract_text_between(start_text, stop_text, full_text):
    """Return the part of *full_text* between *start_text* and *stop_text*.

    The first occurrence of *start_text* is located, then the first
    occurrence of *stop_text* that follows it. Neither marker is included
    in the result.

    Args:
        start_text: Marker that precedes the wanted text.
        stop_text: Marker that follows the wanted text.
        full_text: String to search.

    Returns:
        The text between the two markers, or the fixed message
        "Both start or stop text not found in the input string." when
        either marker is missing. The message is returned rather than
        raised because callers use the result directly as a string.
    """
    not_found = "Both start or stop text not found in the input string."

    # Check the raw find() result *before* adding the marker length;
    # otherwise the -1 "not found" sentinel is masked and never detected.
    marker_pos = full_text.find(start_text)
    if marker_pos == -1:
        return not_found
    content_start = marker_pos + len(start_text)

    # Only a stop marker located after the start marker can close the span.
    content_stop = full_text.find(stop_text, content_start)
    if content_stop == -1:
        return not_found

    return full_text[content_start:content_stop]
def get_pdf(url, timeout=30):
    """Download *url* and return the response body as an in-memory stream.

    The URL is printed before the request is made, and the module-level
    ``headers`` dict is sent with the request.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait on the server before giving up. requests
            applies no timeout unless one is passed, so without it a
            stalled server would block the whole run indefinitely.

    Returns:
        A ``BytesIO`` wrapping the raw response content. The HTTP status
        code is not checked, so an error page is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    print(url)
    response = requests.get(url, headers=headers, timeout=timeout)
    return BytesIO(response.content)
def get_fonts(pdf_byte_array):
    """Print a PDF's metadata and the distinct font sizes of its characters.

    Font sizes are rounded to zero decimal places before being
    de-duplicated. Three things are printed, in order: the metadata, the
    set of rounded sizes, and how many rounded sizes there are. Nothing is
    returned.

    Args:
        pdf_byte_array: Anything ``pdfplumber.open`` accepts, e.g. the
            stream returned by ``get_pdf``.
    """
    with pdfplumber.open(pdf_byte_array) as document:
        metadata = document.metadata
        raw_sizes = {char['size'] for char in document.chars}

    # Round in ascending order of the raw sizes, then de-duplicate.
    rounded_sizes = set()
    for raw in sorted(raw_sizes):
        rounded_sizes.add(round(raw, 0))

    for item in (metadata, rounded_sizes, len(rounded_sizes)):
        print(item)
# ---------------------------------------------------------------------------
# Script body: download a range of opinion PDFs, pull the caption block that
# sits above the first rectangle in each one, split it into parties, and
# collect the results in the DataFrame ``df``.
# ---------------------------------------------------------------------------

_OPINION_BASE_URL = 'https://www.sccourts.org/opinions/HTMLFiles/SC/'

# Markers that delimit the party names inside the caption block.
_STATE_HEADER = 'THE STATE OF SOUTH CAROLINA\nIn The Supreme Court\n'
_COURT_HEADER = 'The Supreme Court of South Carolina\n'
_CAPTION_END = '\nAppellate Case'
_PARTY_SEPARATOR = '\nv.\n'

_PARTY_KEYS = ('first_party', 'second_party', 'third_party', 'fourth_party')


def _read_infoblock(pdf):
    """Return the text that precedes the first rectangle in *pdf*.

    Pages before the rectangle's page contribute their full text; the
    rectangle's own page contributes only the area above the rectangle.
    Returns ``None`` when the document contains no rectangles.
    """
    rects = pdf.rects
    if not rects:
        return None

    first_rect = rects[0]
    # pdfplumber numbers pages from 1, while ``pdf.pages`` is a 0-based list.
    rect_page_index = first_rect['page_number'] - 1

    # ``or ""`` guards against extract_text() yielding None for a page
    # without text.
    chunks = [
        pdf.pages[index].extract_text() or ""
        for index in range(rect_page_index)
    ]

    rect_page = pdf.pages[rect_page_index]
    # Use the page's own width instead of assuming a 612pt-wide page.
    above_rect = rect_page.crop((0, 0, rect_page.width, first_rect['top']))
    chunks.append(above_rect.extract_text() or "")

    return "".join(chunks)


def _parse_parties(infoblock):
    """Split a caption block into ``(single_party, parties)``.

    "In the Matter of" captions are treated as single-party: the whole
    caption is printed and returned as the only party (or no party at all
    if neither known header is present). Any other caption is split on
    the ``v.`` line.
    """
    if "in the matter of" in infoblock.lower():
        if _STATE_HEADER in infoblock:
            start_marker = _STATE_HEADER
        elif _COURT_HEADER + "In the Matter of" in infoblock:
            start_marker = _COURT_HEADER
        else:
            return True, []
        caption = extract_text_between(start_marker, _CAPTION_END, infoblock)
        print(caption)
        return True, [caption]

    if "In The Supreme Court" in infoblock:
        start_marker = _STATE_HEADER
    else:
        start_marker = _COURT_HEADER
    caption = extract_text_between(start_marker, _CAPTION_END, infoblock)
    return False, caption.split(_PARTY_SEPARATOR)


pdfs = [f'{_OPINION_BASE_URL}{x}.pdf' for x in range(28000, 28228)]

# Every PDF is downloaded up front and kept in memory, keyed by its URL.
pdf_dict = {url: get_pdf(url) for url in pdfs}

cases = []

# Not updated by the loop below; kept so that existing references to these
# names continue to resolve.
max_parties = 0
case_max_parties = ''

for case_url, pdf_stream in pdf_dict.items():
    print('-----------------')
    print(case_url)

    # The case number is the PDF's file name without its extension.
    case_num = Path(urllib.parse.urlparse(case_url).path).stem
    print(case_num)

    # The context manager closes the PDF; no explicit close() is needed.
    with pdfplumber.open(pdf_stream) as pdf:
        infoblock = _read_infoblock(pdf)

    if infoblock is None:
        # No rectangle to anchor the caption block on: skip this document.
        print("BAD FORMAT")
        continue

    single_party, parties = _parse_parties(infoblock)

    # Keep at most four parties, padding the missing ones with None.
    padded_parties = (parties + [None] * len(_PARTY_KEYS))[:len(_PARTY_KEYS)]

    record = {"case_num": case_num, "single_party": single_party}
    record.update(zip(_PARTY_KEYS, padded_parties))
    cases.append(record)

# Explicit columns keep the frame well-formed even when no case was parsed.
df = pd.DataFrame(cases, columns=["case_num", "single_party", *_PARTY_KEYS])
df['url'] = _OPINION_BASE_URL + df.case_num + '.pdf'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment