Skip to content

Instantly share code, notes, and snippets.

@iangow
Created March 21, 2026 11:16
Show Gist options
  • Select an option

  • Save iangow/ef9a80612c69a9dbf6e6e8af3509aad3 to your computer and use it in GitHub Desktop.

Select an option

Save iangow/ef9a80612c69a9dbf6e6e8af3509aad3 to your computer and use it in GitHub Desktop.
Code adapted from Tidy Microstructure
library(db2pq)
library(dplyr, warn.conflicts = FALSE)
library(farr, warn.conflicts = FALSE)

wrds_update_pq("stksecurityinfohist", "crsp", force = TRUE)
#> Forcing update based on user request.
#> Beginning file download at 2026-03-21 11:15:00 UTC.
#> Completed file download at 2026-03-21 11:15:10 UTC.
wrds_update_pq("dsf_v2", "crsp")
#> crsp.dsf_v2 already up to date.

db <- DBI::dbConnect(duckdb::duckdb())

dsf_db <- farr::load_parquet(db, "dsf_v2", "crsp")
stksecurityinfohist_db <- farr::load_parquet(db, "stksecurityinfohist", "crsp")
crsp_daily_sub <-
  dsf_db |>
  # SELECTION DATE
  # filter(between(dlycaldt, start_date, end_date)) |> 
  inner_join(
    stksecurityinfohist_db |>
      # no special share types (sharetype = 'NS')
      filter(sharetype == "NS",
               # security type equity (securitytype = 'EQTY')
               securitytype == "EQTY",
               # security sub type common stock (securitysubtype = 'COM'), 
               securitysubtype == "COM",
               # only US-listed stocks
               usincflg == "Y",
               # issuers that are a corporation 
               # (issuertype %in% c("ACOR", "CORP"))
               issuertype %in% c("ACOR", "CORP"),
               # we use only stock prices from NYSE, Amex, and
               # NASDAQ (primaryexch %in% c("N", "A", "Q")) 
               primaryexch %in% c("N", "A", "Q"),
               # when or after issuance (conditionaltype %in% c("RW", "NW")) 
               conditionaltype %in% c("RW", "NW"),
               # for actively traded stocks (tradingstatusflg == "A")
               tradingstatusflg == "A") |> 
      select(permno, secinfostartdt, secinfoenddt),
    join_by(permno)
  ) |>
  # (iv) we keep only months within permno-specific start dates
  # (secinfostartdt) and end dates (secinfoenddt)
  filter(between(dlycaldt, secinfostartdt, secinfoenddt)) |> 
  mutate(
    mktcap = (1000 * shrout / dlycumfacshr) * dlyprc / 10^6,
    mktcap = na_if(mktcap, 0),
    dlyprc = dlyprc/ dlycumfacpr
  )

crsp_daily_sub |> count()
#> # Source:   SQL [?? x 1]
#> # Database: DuckDB 1.4.4 [root@Darwin 25.4.0:R 4.5.2/:memory:]
#>          n
#>      <dbl>
#> 1 81665021

Created on 2026-03-21 with reprex v2.1.1

@iangow
Copy link
Copy Markdown
Author

iangow commented Mar 21, 2026

library(db2pq)
library(dplyr, warn.conflicts = FALSE)
library(wrds)
#> ── Checking WRDS credentials ───────────────────────────────────── wrds 0.0.1 ──
#> ✔ Credentials found for user iangow

db <- wrds::wrds_connect()

dsf_db <- tbl(db, I("crsp.dsf_v2"))
stksecurityinfohist_db <- tbl(db, I("crsp.stksecurityinfohist"))

crsp_daily_sub <-
  dsf_db |>
  # SELECTION DATE
  # filter(between(dlycaldt, start_date, end_date)) |> 
  inner_join(
    stksecurityinfohist_db |>
      # no special share types (sharetype = 'NS')
      filter(sharetype == "NS",
             # security type equity (securitytype = 'EQTY')
             securitytype == "EQTY",
             # security sub type common stock (securitysubtype = 'COM'), 
             securitysubtype == "COM",
             # only US-listed stocks
             usincflg == "Y",
             # issuers that are a corporation 
             # (issuertype %in% c("ACOR", "CORP"))
             issuertype %in% c("ACOR", "CORP"),
             # we use only stock prices from NYSE, Amex, and
             # NASDAQ (primaryexch %in% c("N", "A", "Q")) 
             primaryexch %in% c("N", "A", "Q"),
             # when or after issuance (conditionaltype %in% c("RW", "NW")) 
             conditionaltype %in% c("RW", "NW"),
             # for actively traded stocks (tradingstatusflg == "A")
             tradingstatusflg == "A") |> 
      select(permno, secinfostartdt, secinfoenddt),
    join_by(permno)
  ) |>
  # (iv) we keep only months within permno-specific start dates
  # (secinfostartdt) and end dates (secinfoenddt)
  filter(between(dlycaldt, secinfostartdt, secinfoenddt)) |> 
  mutate(
    mktcap = if_else(dlycumfacshr != 0, 
                     (1000 * shrout / dlycumfacshr) * dlyprc / 10^6,
                     NA),
    mktcap = na_if(mktcap, 0),
    dlyprc = if_else(dlycumfacpr != 0, dlyprc / dlycumfacpr, NA)
  ) |>
  select(permno, dlycaldt, dlyret, dlyprc) |>
  filter(dlycaldt >= "2020-01-01")

crsp_daily_sub |> 
  lazy_tbl_to_pq("crsp_daily.parquet") |>
  farr::system_time()
#>    user  system elapsed 
#>   5.498   0.993  37.752
#> [1] "crsp_daily.parquet"

Created on 2026-03-21 with reprex v2.1.1

@iangow
Copy link
Copy Markdown
Author

iangow commented Mar 21, 2026

@NiklasLandsberg Note that I added some if_else() calls to make sure that the denominator is positive. I also added fairly arbitrary select() and filter() calls, so the data sets weren't too large.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment