Skip to content

Instantly share code, notes, and snippets.

@yanlesin
Created October 21, 2024 00:08
Show Gist options
  • Save yanlesin/3e080c777dfd69a017ccd99815c406c7 to your computer and use it in GitHub Desktop.
Save yanlesin/3e080c777dfd69a017ccd99815c406c7 to your computer and use it in GitHub Desktop.
Python parsing of SEC 13(f) list
from poppler import load_from_file, PageRenderer
import pandas as pd
import polars as pl
import numpy as np
pdf_document = load_from_file("/Users/yanlyesin/Downloads/13flist2024q3.pdf")
df_from_pdf = pd.DataFrame(columns=['text', 'page', 'cusip', 'issuer_name', 'issuer_description', 'status', 'text_length'])
for page in range(2, pdf_document.pages):
text_from_pdf = pdf_document.create_page(page).text().split("\n")
df_text = pd.DataFrame({'text': text_from_pdf, 'page': [page] * len(text_from_pdf)})
df_text['cusip'] = df_text['text'].apply(lambda x: x.find('CUSIP NO') if 'CUSIP NO' in x else -1)
df_text['issuer_name'] = df_text['text'].apply(lambda x: x.find('ISSUER NAME') if 'ISSUER NAME' in x else -1)
df_text['issuer_description'] = df_text['text'].apply(lambda x: x.find('ISSUER DESCRIPTION') if 'ISSUER DESCRIPTION' in x else -1)
df_text['status'] = df_text['text'].apply(lambda x: x.find('STATUS') if 'STATUS' in x else -1)
df_text['text_length'] = df_text['text'].str.len()
df_from_pdf = pd.concat([df_from_pdf, df_text], ignore_index=True)
columns_to_replace = ['cusip','issuer_name','issuer_description','status']
df_from_pdf.loc[:, columns_to_replace] = df_from_pdf.loc[:, columns_to_replace].replace(-1, pd.NA)
df_from_pdf.loc[:, columns_to_replace] = df_from_pdf.loc[:, columns_to_replace].ffill()
df_from_pdf = df_from_pdf.dropna(subset=['cusip'])
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.startswith('Run ')]
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.startswith('CUSIP NO')]
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.startswith('\f')]
df_from_pdf = df_from_pdf.loc[~df_from_pdf['text'].str.contains('Total C')]
df_from_pdf = df_from_pdf[df_from_pdf['text'].str.len() > 0]
def extract_cusip(row):
return row['text'][0:11]
def extract_issuer_name(row):
return row['text'][row['issuer_name']:row['issuer_description']]
def extract_issuer_desc(row):
return row['text'][row['issuer_description']:row['status']]
def extract_status(row):
return row['text'][row['status']:row['text_length']]
def extract_options(row):
return row['text'][11:row['issuer_name']]
# Apply the function to create a new column 'substring'
df_from_pdf['cusip'] = df_from_pdf.apply(extract_cusip, axis=1)
df_from_pdf['has_listed_options'] = df_from_pdf.apply(extract_options, axis=1)
df_from_pdf['issuer_name'] = df_from_pdf.apply(extract_issuer_name, axis=1)
df_from_pdf['issuer_description'] = df_from_pdf.apply(extract_issuer_desc, axis=1)
df_from_pdf['status'] = df_from_pdf.apply(extract_status, axis=1)
#print(df_from_pdf['status'].unique())
#[
#''
#'DELETED'
#'ADDED'
#' ADDED'
#' DELETED'
#'DDED'
#'ELETED'
#' DELETED'
#' ADDED'
#' ADDED'
# 'RG'
# ]
#Need to handle broken statuses
df_from_pdf['cusip'] = df_from_pdf['cusip'].str.replace(' ', '')
df_from_pdf['has_listed_options'] = df_from_pdf['has_listed_options'].str.strip()
df_from_pdf['status'] = df_from_pdf['status'].str.strip()
df_from_pdf['issuer_name'] = df_from_pdf['issuer_name'].str.strip()
df_from_pdf['issuer_description'] = df_from_pdf['issuer_description'].str.strip()
df_from_pdf = df_from_pdf.drop(columns= ['text', 'text_length'])
def modify_columns(row):
if row['status'] =='DDED' or row['status'] =='ELETED':
return row['issuer_description'][:-1], row['issuer_description'][-1]+row['status']
elif row['status'] == 'RG':
return row['issuer_description']+row['status'], row['status'][:-2]
else:
return row['issuer_description'], row['status']
# Apply the function to each row
df_from_pdf[['issuer_description', 'status']] = df_from_pdf.apply(modify_columns, axis=1, result_type='expand')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment