Skip to content

Instantly share code, notes, and snippets.

@mgao6767
Created March 19, 2023 22:42
Show Gist options
  • Save mgao6767/137049c4b0d591ebfdf46e075999171d to your computer and use it in GitHub Desktop.
Save mgao6767/137049c4b0d591ebfdf46e075999171d to your computer and use it in GitHub Desktop.
Script to get firm historical HQ state and zipcode from 8K filings. See https://mingze-gao.com/posts/firm-historical-headquarter-state-from-10k/
import sqlite3, sys, os, pathlib, logging
import pandas as pd
# fmt: off
states = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
# fmt: on
logging.basicConfig(
stream=sys.stdout,
level=logging.DEBUG,
format="%(levelname)s - %(asctime)s - %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
)
def main(db: str, out: str):
conn = sqlite3.connect(db)
sql = """SELECT
DISTINCT cik, strftime("%Y", date) as year, state, zipcode
FROM files_zipcode
WHERE file_type="8-K"
ORDER BY cik, year;"""
df = pd.read_sql_query(sql, conn, parse_dates=["date"])
conn.close()
logging.info(f"{len(df)} records")
# restrict to US firms
df = df[df.state.isin(states)]
logging.info(f"{len(df)} records of US firms")
dups = df.duplicated(subset=["cik", "year"])
logging.info(f"{sum(dups)} duplicates")
df = df.drop_duplicates(subset=["cik", "year"], keep="last")
logging.info(f"{len(df)} records of US firms, keeping last record by year")
df.to_csv(out, index=False)
logging.info(f"result saved to {out}")
if __name__ == "__main__":
assert len(sys.argv) == 3
db = pathlib.Path(sys.argv[1]).expanduser().resolve().as_posix()
out = pathlib.Path(sys.argv[2]).expanduser().resolve().as_posix()
if os.path.isfile(db):
main(db, out)
else:
sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment