Created
October 21, 2024 00:08
-
-
Save yanlesin/3e080c777dfd69a017ccd99815c406c7 to your computer and use it in GitHub Desktop.
Python parsing of SEC 13(f) list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from poppler import load_from_file, PageRenderer | |
import pandas as pd | |
import polars as pl | |
import numpy as np | |
pdf_document = load_from_file("/Users/yanlyesin/Downloads/13flist2024q3.pdf") | |
df_from_pdf = pd.DataFrame(columns=['text', 'page', 'cusip', 'issuer_name', 'issuer_description', 'status', 'text_length']) | |
for page in range(2, pdf_document.pages): | |
text_from_pdf = pdf_document.create_page(page).text().split("\n") | |
df_text = pd.DataFrame({'text': text_from_pdf, 'page': [page] * len(text_from_pdf)}) | |
df_text['cusip'] = df_text['text'].apply(lambda x: x.find('CUSIP NO') if 'CUSIP NO' in x else -1) | |
df_text['issuer_name'] = df_text['text'].apply(lambda x: x.find('ISSUER NAME') if 'ISSUER NAME' in x else -1) | |
df_text['issuer_description'] = df_text['text'].apply(lambda x: x.find('ISSUER DESCRIPTION') if 'ISSUER DESCRIPTION' in x else -1) | |
df_text['status'] = df_text['text'].apply(lambda x: x.find('STATUS') if 'STATUS' in x else -1) | |
df_text['text_length'] = df_text['text'].str.len() | |
df_from_pdf = pd.concat([df_from_pdf, df_text], ignore_index=True) | |
columns_to_replace = ['cusip','issuer_name','issuer_description','status'] | |
df_from_pdf.loc[:, columns_to_replace] = df_from_pdf.loc[:, columns_to_replace].replace(-1, pd.NA) | |
df_from_pdf.loc[:, columns_to_replace] = df_from_pdf.loc[:, columns_to_replace].ffill() | |
df_from_pdf = df_from_pdf.dropna(subset=['cusip']) | |
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.startswith('Run ')] | |
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.startswith('CUSIP NO')] | |
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.startswith('\f')] | |
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.contains('Total C')] | |
df_from_pdf = df_from_pdf[df_from_pdf['text'].str.len() > 0] | |
def extract_cusip(row): | |
return row['text'][0:11] | |
def extract_issuer_name(row): | |
return row['text'][row['issuer_name']:row['issuer_description']] | |
def extract_issuer_desc(row): | |
return row['text'][row['issuer_description']:row['status']] | |
def extract_status(row): | |
return row['text'][row['status']:row['text_length']] | |
def extract_options(row): | |
return row['text'][11:row['issuer_name']] | |
# Apply the function to create a new column 'substring' | |
df_from_pdf['cusip'] = df_from_pdf.apply(extract_cusip, axis=1) | |
df_from_pdf['has_listed_options'] = df_from_pdf.apply(extract_options, axis=1) | |
df_from_pdf['issuer_name'] = df_from_pdf.apply(extract_issuer_name, axis=1) | |
df_from_pdf['issuer_description'] = df_from_pdf.apply(extract_issuer_desc, axis=1) | |
df_from_pdf['status'] = df_from_pdf.apply(extract_status, axis=1) | |
#print(df_from_pdf['status'].unique()) | |
#[ | |
#'' | |
#'DELETED' | |
#'ADDED' | |
#' ADDED' | |
#' DELETED' | |
#'DDED' | |
#'ELETED' | |
#' DELETED' | |
#' ADDED' | |
#' ADDED' | |
# 'RG' | |
# ] | |
#Need to handle broken statuses | |
df_from_pdf['cusip'] = df_from_pdf['cusip'].str.replace(' ', '') | |
df_from_pdf['has_listed_options'] = df_from_pdf['has_listed_options'].str.strip() | |
df_from_pdf['status'] = df_from_pdf['status'].str.strip() | |
df_from_pdf['issuer_name'] = df_from_pdf['issuer_name'].str.strip() | |
df_from_pdf['issuer_description'] = df_from_pdf['issuer_description'].str.strip() | |
df_from_pdf = df_from_pdf.drop(columns= ['text', 'text_length']) | |
def modify_columns(row): | |
if row['status'] =='DDED' or row['status'] =='ELETED': | |
return row['issuer_description'][:-1], row['issuer_description'][-1]+row['status'] | |
elif row['status'] == 'RG': | |
return row['issuer_description']+row['status'], row['status'][:-2] | |
else: | |
return row['issuer_description'], row['status'] | |
# Apply the function to each row | |
df_from_pdf[['issuer_description', 'status']] = df_from_pdf.apply(modify_columns, axis=1, result_type='expand') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment