The full raw file can be downloaded from this WaPo article page (direct link here]
Here's a link to the Drug Enforcement Agency Office of Diversion Control's ARCOS Registrant Handbook
$ curl -Lo arcos_all_washpost.tsv.gz \
https://d2ty8gaf6rmowa.cloudfront.net/dea-pain-pill-database/bulk/arcos_all_washpost.tsv.gz
$ gunzip arcos_all_washpost.tsv.gz
This creates a file named arcos_all_washpost.tsv
Here are its wc
stats:
$ time wc arcos_all_washpost.tsv
178598027 11298317334 79996770669 arcos_all_washpost.tsv
real 3m57.846s
user 3m22.978s
sys 0m33.727s
Here is a Google Sheet containing a random sample of 10,000 rows: https://docs.google.com/spreadsheets/d/19NZtjOEPxNdNfZS9l-yZzmM5b7KqJkREL06N7nRm_n8/edit?usp=sharing
$ csvcut -n arcos_all_washpost.csv
1: REPORTER_DEA_NO
2: REPORTER_BUS_ACT
3: REPORTER_NAME
4: REPORTER_ADDL_CO_INFO
5: REPORTER_ADDRESS1
6: REPORTER_ADDRESS2
7: REPORTER_CITY
8: REPORTER_STATE
9: REPORTER_ZIP
10: REPORTER_COUNTY
11: BUYER_DEA_NO
12: BUYER_BUS_ACT
13: BUYER_NAME
14: BUYER_ADDL_CO_INFO
15: BUYER_ADDRESS1
16: BUYER_ADDRESS2
17: BUYER_CITY
18: BUYER_STATE
19: BUYER_ZIP
20: BUYER_COUNTY
21: TRANSACTION_CODE
22: DRUG_CODE
23: NDC_NO
24: DRUG_NAME
25: QUANTITY
26: UNIT
27: ACTION_INDICATOR
28: ORDER_FORM_NO
29: CORRECTION_NO
30: STRENGTH
31: TRANSACTION_DATE
32: CALC_BASE_WT_IN_GM
33: DOSAGE_UNIT
34: TRANSACTION_ID
35: Product_Name
36: Ingredient_Name
37: Measure
38: MME_Conversion_Factor
39: Combined_Labeler_Name
40: Revised_Company_Name
41: Reporter_family
42: dos_str
$ xsv sample -d '\t' 10000000 arcos_all_washpost.tsv > 10m-sample-arcos.csv
Finding most frequent records for:
- REPORTER_DEA_NO
- BUYER_DEA_NO
- NDC_NO
- TRANSACTION_ID
$ for fname in REPORTER_DEA_NO BUYER_DEA_NO NDC_NO TRANSACTION_ID; do
echo $fname
xsv select ${fname} 10m-sample-arcos.csv | sort | uniq -c | sort -rn | head -n 5
done
REPORTER_DEA_NO
342887 RW0294493
317986 PW0186785
264713 RW0277752
235747 RW0282145
216266 RW0204026
BUYER_DEA_NO
2735 BK4844658
1232 BW5431084
1129 BW8910879
1060 FS1106194
1025 BP8791091
NDC_NO
540640 00406035705
335789 00591034905
280216 00406051201
233583 00406035801
222033 00406036601
TRANSACTION_ID
340 29
337 39
334 30
326 20
316 23
Find fields related to NDC_NO, i.e National Drug Code
# given a NDC_NO of 00406035705
$ csvgrep -c NDC_NO -m 00406035705 random-sample-arcos.csv | csvcut -c 24,35,36,37,38,39,40 | sort | uniq -c
53921 HYDROCODONE,HYDROCODONE BIT/ACETAMINOPHEN 5MG/50,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,SpecGx LLC,Mallinckrodt
# alternative way to filter for columns that seem redundant
$ xsv search --select NDC_NO 00406035705 random-sample-arcos.csv \
| xsv frequency | xsv select field | sed '1d' | sort | uniq -c | sort -n
1 Combined_Labeler_Name
1 DRUG_CODE
1 DRUG_NAME
1 Ingredient_Name
1 MME_Conversion_Factor
1 Measure
1 NDC_NO
1 ORDER_FORM_NO
1 Product_Name
1 Revised_Company_Name
1 TRANSACTION_CODE
1 UNIT
1 dos_str
2 REPORTER_BUS_ACT
3 ACTION_INDICATOR
4 CORRECTION_NO
5 BUYER_BUS_ACT
10 BUYER_ADDL_CO_INFO
10 BUYER_ADDRESS1
10 BUYER_ADDRESS2
10 BUYER_CITY
10 BUYER_COUNTY
10 BUYER_DEA_NO
10 BUYER_NAME
10 BUYER_STATE
10 BUYER_ZIP
10 CALC_BASE_WT_IN_GM
10 DOSAGE_UNIT
10 QUANTITY
10 REPORTER_ADDL_CO_INFO
10 REPORTER_ADDRESS1
10 REPORTER_ADDRESS2
10 REPORTER_CITY
10 REPORTER_COUNTY
10 REPORTER_DEA_NO
10 REPORTER_NAME
10 REPORTER_STATE
10 REPORTER_ZIP
10 Reporter_family
10 STRENGTH
10 TRANSACTION_DATE
10 TRANSACTION_ID
$ xsv search --select REPORTER_DEA_NO RW0204026 random-sample-arcos.csv | xsv frequency | xsv select field | sed '1d' | sort | uniq -c | sort -n
1 BUYER_BUS_ACT
1 CORRECTION_NO
1 Measure
1 REPORTER_ADDL_CO_INFO
1 REPORTER_ADDRESS1
1 REPORTER_ADDRESS2
1 REPORTER_BUS_ACT
1 REPORTER_CITY
1 REPORTER_COUNTY
1 REPORTER_DEA_NO
1 REPORTER_NAME
1 REPORTER_STATE
1 REPORTER_ZIP
1 Reporter_family
1 TRANSACTION_CODE
1 UNIT
2 ACTION_INDICATOR
2 DRUG_CODE
2 DRUG_NAME
2 Ingredient_Name
2 MME_Conversion_Factor
2 STRENGTH
10 BUYER_ADDL_CO_INFO
10 BUYER_ADDRESS1
10 BUYER_ADDRESS2
10 BUYER_CITY
10 BUYER_COUNTY
10 BUYER_DEA_NO
10 BUYER_NAME
10 BUYER_STATE
10 BUYER_ZIP
10 CALC_BASE_WT_IN_GM
10 Combined_Labeler_Name
10 DOSAGE_UNIT
10 NDC_NO
10 ORDER_FORM_NO
10 Product_Name
10 QUANTITY
10 Revised_Company_Name
10 TRANSACTION_DATE
10 TRANSACTION_ID
10 dos_str
#
$ xsv search --select BUYER_DEA_NO BW7767277 random-sample-arcos.csv | xsv frequency | xsv select field | sed '1d' | sort | uniq -c | sort -n
1 ACTION_INDICATOR
1 BUYER_ADDL_CO_INFO
1 BUYER_ADDRESS1
1 BUYER_ADDRESS2
1 BUYER_BUS_ACT
1 BUYER_CITY
1 BUYER_COUNTY
1 BUYER_DEA_NO
1 BUYER_NAME
1 BUYER_STATE
1 BUYER_ZIP
1 CORRECTION_NO
1 Measure
1 REPORTER_ADDL_CO_INFO
1 REPORTER_BUS_ACT
1 TRANSACTION_CODE
1 UNIT
2 DRUG_CODE
2 DRUG_NAME
2 Ingredient_Name
2 MME_Conversion_Factor
2 Reporter_family
2 STRENGTH
3 REPORTER_ADDRESS2
3 REPORTER_NAME
5 REPORTER_ADDRESS1
5 REPORTER_CITY
5 REPORTER_COUNTY
5 REPORTER_DEA_NO
5 REPORTER_STATE
5 REPORTER_ZIP
5 Revised_Company_Name
6 Combined_Labeler_Name
6 dos_str
7 QUANTITY
8 DOSAGE_UNIT
10 CALC_BASE_WT_IN_GM
10 NDC_NO
10 ORDER_FORM_NO
10 Product_Name
10 TRANSACTION_DATE
10 TRANSACTION_ID