Created
July 27, 2020 19:13
-
-
Save vaclavdekanovsky/c4595c32802d721af8e9c0e65013300b to your computer and use it in GitHub Desktop.
What rows in the SEC 2020Q1 dataset are duplicated?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-07-27T19:12:38.965303Z", | |
| "start_time": "2020-07-27T19:12:38.386717Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import os" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-07-27T19:12:39.128238Z", | |
| "start_time": "2020-07-27T19:12:38.966333Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Folder with the unzipped data\n", | |
| "folder = r\"..\\Data_Sec\\2020Q1\"\n", | |
| "\n", | |
| "# load the content of the sub file into the dataFrame\n", | |
| "sub = pd.read_csv(os.path.join(folder,\"sub.txt\"),sep=\"\\t\",dtype={\"cik\":str})\n", | |
| "\n", | |
| "# filter 10-K/Q\n", | |
| "tens = sub[sub[\"form\"].isin([\"10-K\",\"10-Q\"])]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-07-27T19:12:39.162147Z", | |
| "start_time": "2020-07-27T19:12:39.130232Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "cik period \n", | |
| "1191334 20130930 2\n", | |
| "1635748 20190630 2\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# what are the companies and the periods which are more than once? \n", | |
| "duplicates = tens.groupby([\"cik\",\"period\"]).size()[tens.groupby([\"cik\",\"period\"]).size()>1].T\n", | |
| "duplicates" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-07-27T19:12:39.202156Z", | |
| "start_time": "2020-07-27T19:12:39.165139Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>2</th>\n", | |
| " <th>3</th>\n", | |
| " <th>0</th>\n", | |
| " <th>1</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td>adsh</td>\n", | |
| " <td>0001483798-20-000031</td>\n", | |
| " <td>0001483798-20-000033</td>\n", | |
| " <td>0001079974-20-000047</td>\n", | |
| " <td>0001079974-20-000048</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>cik</td>\n", | |
| " <td>1191334</td>\n", | |
| " <td>1191334</td>\n", | |
| " <td>1635748</td>\n", | |
| " <td>1635748</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>name</td>\n", | |
| " <td>CHUN CAN CAPITAL GROUP</td>\n", | |
| " <td>CHUN CAN CAPITAL GROUP</td>\n", | |
| " <td>ZOOMPASS HOLDINGS, INC.</td>\n", | |
| " <td>ZOOMPASS HOLDINGS, INC.</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>sic</td>\n", | |
| " <td>3674</td>\n", | |
| " <td>3674</td>\n", | |
| " <td>7374</td>\n", | |
| " <td>7374</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>countryba</td>\n", | |
| " <td>DO</td>\n", | |
| " <td>DO</td>\n", | |
| " <td>CA</td>\n", | |
| " <td>CA</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " 2 3 \\\n", | |
| "adsh 0001483798-20-000031 0001483798-20-000033 \n", | |
| "cik 1191334 1191334 \n", | |
| "name CHUN CAN CAPITAL GROUP CHUN CAN CAPITAL GROUP \n", | |
| "sic 3674 3674 \n", | |
| "countryba DO DO \n", | |
| "\n", | |
| " 0 1 \n", | |
| "adsh 0001079974-20-000047 0001079974-20-000048 \n", | |
| "cik 1635748 1635748 \n", | |
| "name ZOOMPASS HOLDINGS, INC. ZOOMPASS HOLDINGS, INC. \n", | |
| "sic 7374 7374 \n", | |
| "countryba CA CA " | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# groupby().size() will results in a serie with multiindex, which can be merged on the original dataframe after the index is reset\n", | |
| "duplicates = tens.merge(duplicates.reset_index(), on=[\"cik\",\"period\"]).sort_values(by=[\"cik\",\"period\"]).T\n", | |
| "duplicates.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-07-27T19:12:39.221989Z", | |
| "start_time": "2020-07-27T19:12:39.205033Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>0</th>\n", | |
| " <th>1</th>\n", | |
| " <th>same01</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td>adsh</td>\n", | |
| " <td>0001079974-20-000047</td>\n", | |
| " <td>0001079974-20-000048</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>filed</td>\n", | |
| " <td>20200121</td>\n", | |
| " <td>20200122</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>accepted</td>\n", | |
| " <td>2020-01-21 16:51:00.0</td>\n", | |
| " <td>2020-01-21 17:53:00.0</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " 0 1 same01\n", | |
| "adsh 0001079974-20-000047 0001079974-20-000048 False\n", | |
| "filed 20200121 20200122 False\n", | |
| "accepted 2020-01-21 16:51:00.0 2020-01-21 17:53:00.0 False" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# let's have a look at rows 2,3 which belongs to the company \"ZOOMPASS HOLDINGS, INC.\"\"\n", | |
| "duplicates[\"same01\"] = duplicates[0]==duplicates[1]\n", | |
| "duplicates.loc[duplicates[\"same01\"]==False,[0,1,\"same01\"]].dropna()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| }, | |
| "varInspector": { | |
| "cols": { | |
| "lenName": 16, | |
| "lenType": 16, | |
| "lenVar": 40 | |
| }, | |
| "kernels_config": { | |
| "python": { | |
| "delete_cmd_postfix": "", | |
| "delete_cmd_prefix": "del ", | |
| "library": "var_list.py", | |
| "varRefreshCmd": "print(var_dic_list())" | |
| }, | |
| "r": { | |
| "delete_cmd_postfix": ") ", | |
| "delete_cmd_prefix": "rm(", | |
| "library": "var_list.r", | |
| "varRefreshCmd": "cat(var_dic_list()) " | |
| } | |
| }, | |
| "types_to_exclude": [ | |
| "module", | |
| "function", | |
| "builtin_function_or_method", | |
| "instance", | |
| "_Feature" | |
| ], | |
| "window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment