scrapes caselaw website to extract data for analysis
import furl
import requests
import htmlmin
from bs4 import BeautifulSoup
from datetime import date, datetime
from dateutil import parser
import datefinder
import dateparser
from difflib import SequenceMatcher as SM
from fuzzywuzzy import fuzz, process
from nltk.util import ngrams
import codecs
import pandas as pd
import re
import json
import locale
locale.setlocale(locale.LC_ALL, '')
# scrapes caselaw website and calculates decision times for each case
# could be used to examine the efficiency of the court system
# in my case it was used to see just how abnormal a wait time of 756 days was

# takes case title as argument e.g. ALZ v WorkCover NSW [2014] NSWCATAD 93
# returns medium neutral citation e.g. [2014] NSWCATAD 93
def medium_neutral_citation(title):
    sliced = re.split(r'(\[\d{4}\])', title)
    return "".join(sliced[1:]).strip()
# takes a string from a coversheet field (and the field name, for logging)
# returns "" if the matter was decided on the papers
# otherwise splits the string on commas/"&"/"and"/newlines, tries to parse the last piece
# against several date formats, and falls back to datefinder for free-text dates
def date_parser(text, key = "Hearing"):
    print(f"\n\nkey: {key}, checking string for date: {text}")
    if text.lower() == "on the papers":
        print("returning nothing")
        return ""
    stringsplit = re.split(',|&|and|\n', text)
    # note: %#d and %#m are Windows-specific strftime/strptime flags (%-d / %-m on Linux)
    for fmt in ('%d %B %Y', '%d/%m/%Y', '%d/%m/%y', '%d/%#m/%Y', '%d.%m.%y', '%m/%d/%Y'):
        try:
            x = datetime.strptime(stringsplit[-1].rstrip('.').strip(), fmt)
            return str(x.strftime("%#d %B %Y"))
        except Exception as e:
            print(f"Error parsing: {stringsplit[-1].rstrip('.').strip()} into {fmt} e: {e}")
            pass
    try:
        print("checking if string has any dates in it...")
        matches = datefinder.find_dates(text)
        parsed = []
        for m in matches:
            parsed.append(m)
        if len(parsed) > 0:
            print(f'latest date is: {max(parsed).strftime("%#d %B %Y")}')
            return str(max(parsed).strftime("%#d %B %Y"))
    except Exception as e:
        print(f"Error parsing: {text} e: {e}")
        pass
    return f"invalid: {text}"
# create dataframe columns for scraper
columns = ['Citation','Title','URL','Corrected','Papers','Submissions','Hearing','Decision','Days','Missing','Jurisdiction','Catchwords','Content']
df = pd.DataFrame(columns = columns)
all_columns = list(df)
df[all_columns] = df[all_columns].astype(str)
# constructs url query
baseurl = "https://www.caselaw.nsw.gov.au"
# url of search query used
search = "https://www.caselaw.nsw.gov.au/search/advanced?sort=decisionDate%2Casc&body=&title=&before=&catchwords=&party=&mnc=&startDate=&endDate=&fileNumber=&legislationCited=%22Privacy+and+Personal+Information+Protection+Act%22+OR+%22Health+Records+and+Information+Privacy+Act%22&casesCited=&courts=54a634063004de94513d827a&_courts=on&courts=54a634063004de94513d827b&_courts=on&courts=54a634063004de94513d8278&_courts=on&courts=54a634063004de94513d8279&_courts=on&courts=54a634063004de94513d827c&_courts=on&courts=54a634063004de94513d827d&_courts=on&courts=54a634063004de94513d828e&_courts=on&courts=54a634063004de94513d8285&_courts=on&courts=54a634063004de94513d827e&_courts=on&courts=54a634063004de94513d827f&_courts=on&courts=54a634063004de94513d8286&_courts=on&courts=54a634063004de94513d8280&_courts=on&courts=54a634063004de94513d8281&_courts=on&tribunals=54a634063004de94513d8282&_tribunals=on&tribunals=54a634063004de94513d8287&_tribunals=on&tribunals=54a634063004de94513d8289&_tribunals=on&tribunals=54a634063004de94513d828d&_tribunals=on&tribunals=54a634063004de94513d828b&_tribunals=on&tribunals=173b71a8beab2951cc1fab8d&_tribunals=on&tribunals=54a634063004de94513d828c&_tribunals=on&tribunals=54a634063004de94513d828a&_tribunals=on&tribunals=54a634063004de94513d8283&_tribunals=on&tribunals=1723173e41f6b6d63f2105d3&_tribunals=on&tribunals=5e5c92e1e4b0c8604babc749&_tribunals=on&tribunals=5e5c92c5e4b0c8604babc748&_tribunals=on&tribunals=54a634063004de94513d8284&_tribunals=on&tribunals=54a634063004de94513d8288&_tribunals=on"
pages = 32
data = []
# iterates through each row on each page of search results and extracts basic details of each case
# case title, medium neutral citation, url, decision date, catchwords
for count in range(0, pages):
    try:
        print(f"Search results page: {count}")
        page = requests.get(search, params={'page': count})
        soup = BeautifulSoup(page.text, 'html.parser')
        results = soup.select_one("div.container.searchresults")
        cases = results.find_all("div", {"class": ["row", "result"]})
        for row in cases:
            details = row.find("h4").find_next('a')
            published = baseurl + details.get('href')
            title = details.get_text()
            citation = medium_neutral_citation(title)
            decision = row.find(lambda tag: tag.name == "li" and "Decision date" in tag.text).find_next('li').get_text().strip()
            try:
                catchwords = row.find(lambda tag: tag.name == "p" and "Catchwords" in tag.text).find_next('p').get_text().strip() or "n/a"
            except:
                # some cases have no catchwords paragraph
                catchwords = "n/a"
            data.append({'Title': title, 'Citation': citation, 'URL': published, 'Decision': decision, 'Catchwords': catchwords})
    except:
        pass
# creates dataframe containing every case from search results
df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
# iterates through each case in dataframe and performs http get request of case url
# extracts the html table/coversheet containing case details, then does a fuzzy search
# for the tags containing submission and hearing info
# extracts dates and checks if on the papers, then calculates decision time in days
# the print statements below are left in for debugging; comment them out to reduce noise
for index, row in df.iterrows():
    try:
        print(f"\n\n----- {df.loc[index,'Citation']} -----\n\n")
        response = requests.get(df.loc[index,'URL'])
        page_html = response.text
        soup = BeautifulSoup(page_html, "html.parser")
        table = soup.find(lambda tag: tag.name == "table" and "HEARING DATE" in tag.text) or soup.find("div", {"class": "coversheet"})
        df.loc[index,'Content'] = htmlmin.minify(str(table))
        keywords = ['Hearing date', 'SUBMISSIONS CLOSED', 'HEARING DATE', 'Submissions close', 'Papers', 'papers', "DATE OF DECISION", 'Jurisdiction', 'JURISDICTION']
        v1 = table.find_all("dt", string=re.compile('|'.join(keywords)))
        v2 = table.find_all("td", string=re.compile('|'.join(keywords)))
        matches = {'appeal': "", 'hearing': "", 'submissions': "", 'papers': "", 'jurisdiction': ""}
        # fuzzy search match options
        choices = ["appeal", "hearing", "submissions", "papers", "jurisdiction"]
        for tag in v1:
            print(tag.get_text())
            print(process.extract(tag.get_text(), choices))
            fuzzymatch = process.extractOne(tag.get_text(), choices)
            matches[fuzzymatch[0]] = tag.find_next('dd').get_text().strip()
            print((tag.find_next('dd').get_text().strip()).lower())
        for tag in v2:
            print(tag)
            print(tag.get_text())
            print(process.extract(tag.get_text(), choices))
            fuzzymatch = process.extractOne(tag.get_text(), choices)
            matches[fuzzymatch[0]] = tag.find_next('td').get_text().strip()
            print((tag.find_next('td').get_text().strip()).lower())
        # prints dates extracted from page before being parsed
        print(json.dumps(matches, indent=4, sort_keys=True, default=str))
        print(f"values extracted: \n\n{list(matches.values())}")
        # checks if on papers
        df.loc[index,'Papers'] = "Yes" if any("paper" in string.lower() for string in list(matches.values())) else ""
        # 'jurisdiction' always exists in matches, so fall back to "n/a" only when its value is empty
        df.loc[index,'Jurisdiction'] = matches.pop('jurisdiction', "") or "n/a"
        for key, value in matches.items():
            print(f"key: {key}, value: {value}")
            matches[key] = str(date_parser(value, key)) if len(value) > 0 else ""
        print(json.dumps(matches, indent=4, sort_keys=True, default=str))
        df.loc[index,'Hearing'] = pd.to_datetime(matches['hearing'], errors='coerce')
        df.loc[index,'Submissions'] = pd.to_datetime(matches['submissions'], errors='coerce')
        df.loc[index,'Decision'] = pd.to_datetime(df.loc[index, 'Decision'], errors='coerce')
        # 'Days' has no value yet, so this just initialises it to NaT before the calculation below
        df.loc[index,'Days'] = pd.to_datetime(df.loc[index, 'Days'], errors='coerce')
        # checks if valid dates have been extracted for either hearing or submissions
        # and calculates decision time from the later of the two
        valid = [d for d in [df.loc[index,'Hearing'], df.loc[index,'Submissions']] if type(d) is pd.Timestamp]
        if len(valid) > 0:
            df.loc[index,'Days'] = df.loc[index,'Decision'] - max(valid)
        # flags case if unable to calculate decision time but only if decision was not on the papers
        if pd.isnull(df.loc[index,'Days']) and df.loc[index,'Papers'] != "Yes":
            df.loc[index,'Missing'] = "Yes"
        # prints final dataframe row constructed for case
        print(df.loc[index])
    except Exception as e:
        print(f"Error: {e}")
        pass
# outputs entire dataframe to spreadsheet file
with pd.ExcelWriter(r'caselawscraper.ppip.hrip.act.v2.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='results', index=False)
print("Saved")