Created
October 5, 2023 17:08
-
-
Save simon-mo/7446ef286e3fc938d0e177bc4ea1cdaf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"from tqdm import tqdm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
" 0%| | 0/2335590 [00:00<?, ?it/s]" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 2335590/2335590 [00:18<00:00, 126098.37it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of lines with 'categories' key starting with 'cs.': 448448/2335590 = 19.20%\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"filename = \"arxiv-metadata-oai-snapshot.json\"\n", | |
"# Expected record count of this snapshot; only used to size the progress bar.\n", | |
"total_lines = 2335590\n", | |
"cs_lines = []\n", | |
"seen_lines = 0\n", | |
"\n", | |
"# Pass the encoding explicitly so the read does not depend on the platform's\n", | |
"# default locale encoding (JSON text is UTF-8).\n", | |
"with open(filename, \"r\", encoding=\"utf-8\") as f:\n", | |
" for line in tqdm(f, total=total_lines):\n", | |
" if not line.strip():\n", | |
" continue # tolerate blank lines, e.g. a trailing newline at end of file\n", | |
" seen_lines += 1\n", | |
" data = json.loads(line)\n", | |
" # Keep records whose category string begins with \"cs.\".\n", | |
" if \"categories\" in data and data[\"categories\"].startswith(\"cs.\"):\n", | |
" cs_lines.append(data)\n", | |
"\n", | |
"# Report against the number of records actually read, not the hard-coded estimate,\n", | |
"# and avoid dividing by zero on an empty file.\n", | |
"share = len(cs_lines) / seen_lines * 100 if seen_lines else 0.0\n", | |
"print(f\"Number of lines with 'categories' key starting with 'cs.': {len(cs_lines)}/{seen_lines} = {share:.2f}%\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"# One row per kept arXiv record; the columns come from the record keys.\n", | |
"df = pd.DataFrame(data=cs_lines)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>submitter</th>\n", | |
" <th>authors</th>\n", | |
" <th>title</th>\n", | |
" <th>comments</th>\n", | |
" <th>journal-ref</th>\n", | |
" <th>doi</th>\n", | |
" <th>report-no</th>\n", | |
" <th>categories</th>\n", | |
" <th>license</th>\n", | |
" <th>abstract</th>\n", | |
" <th>versions</th>\n", | |
" <th>update_date</th>\n", | |
" <th>authors_parsed</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0704.0047</td>\n", | |
" <td>Igor Grabec</td>\n", | |
" <td>T. Kosel and I. Grabec</td>\n", | |
" <td>Intelligent location of simultaneously active ...</td>\n", | |
" <td>5 pages, 5 eps figures, uses IEEEtran.cls</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>cs.NE cs.AI</td>\n", | |
" <td>None</td>\n", | |
" <td>The intelligent acoustic emission locator is...</td>\n", | |
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n", | |
" <td>2009-09-29</td>\n", | |
" <td>[[Kosel, T., ], [Grabec, I., ]]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0704.0050</td>\n", | |
" <td>Igor Grabec</td>\n", | |
" <td>T. Kosel and I. Grabec</td>\n", | |
" <td>Intelligent location of simultaneously active ...</td>\n", | |
" <td>5 pages, 7 eps figures, uses IEEEtran.cls</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>cs.NE cs.AI</td>\n", | |
" <td>None</td>\n", | |
" <td>Part I describes an intelligent acoustic emi...</td>\n", | |
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n", | |
" <td>2007-05-23</td>\n", | |
" <td>[[Kosel, T., ], [Grabec, I., ]]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0704.0062</td>\n", | |
" <td>Tom\\'a\\v{s} Vina\\v{r}</td>\n", | |
" <td>Rastislav \\v{S}r\\'amek, Bro\\v{n}a Brejov\\'a, T...</td>\n", | |
" <td>On-line Viterbi Algorithm and Its Relationship...</td>\n", | |
" <td>None</td>\n", | |
" <td>Algorithms in Bioinformatics: 7th Internationa...</td>\n", | |
" <td>10.1007/978-3-540-74126-8_23</td>\n", | |
" <td>None</td>\n", | |
" <td>cs.DS</td>\n", | |
" <td>None</td>\n", | |
" <td>In this paper, we introduce the on-line Vite...</td>\n", | |
" <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n", | |
" <td>2010-01-25</td>\n", | |
" <td>[[Šrámek, Rastislav, ], [Brejová, Broňa, ], [V...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0704.0090</td>\n", | |
" <td>Lester Ingber</td>\n", | |
" <td>Lester Ingber</td>\n", | |
" <td>Real Options for Project Schedules (ROPS)</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>Report 2007:ROPS</td>\n", | |
" <td>cs.CE cond-mat.stat-mech cs.MS cs.NA physics.d...</td>\n", | |
" <td>None</td>\n", | |
" <td>Real Options for Project Schedules (ROPS) ha...</td>\n", | |
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n", | |
" <td>2007-05-23</td>\n", | |
" <td>[[Ingber, Lester, ]]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0704.0098</td>\n", | |
" <td>Jack Raymond</td>\n", | |
" <td>Jack Raymond, David Saad</td>\n", | |
" <td>Sparsely-spread CDMA - a statistical mechanics...</td>\n", | |
" <td>23 pages, 5 figures, figure 1 amended since pu...</td>\n", | |
" <td>J. Phys. A: Math. Theor. 40 No 41 (12 October ...</td>\n", | |
" <td>10.1088/1751-8113/40/41/004</td>\n", | |
" <td>None</td>\n", | |
" <td>cs.IT math.IT</td>\n", | |
" <td>None</td>\n", | |
" <td>Sparse Code Division Multiple Access (CDMA),...</td>\n", | |
" <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n", | |
" <td>2009-11-13</td>\n", | |
" <td>[[Raymond, Jack, ], [Saad, David, ]]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id submitter \\\n", | |
"0 0704.0047 Igor Grabec \n", | |
"1 0704.0050 Igor Grabec \n", | |
"2 0704.0062 Tom\\'a\\v{s} Vina\\v{r} \n", | |
"3 0704.0090 Lester Ingber \n", | |
"4 0704.0098 Jack Raymond \n", | |
"\n", | |
" authors \\\n", | |
"0 T. Kosel and I. Grabec \n", | |
"1 T. Kosel and I. Grabec \n", | |
"2 Rastislav \\v{S}r\\'amek, Bro\\v{n}a Brejov\\'a, T... \n", | |
"3 Lester Ingber \n", | |
"4 Jack Raymond, David Saad \n", | |
"\n", | |
" title \\\n", | |
"0 Intelligent location of simultaneously active ... \n", | |
"1 Intelligent location of simultaneously active ... \n", | |
"2 On-line Viterbi Algorithm and Its Relationship... \n", | |
"3 Real Options for Project Schedules (ROPS) \n", | |
"4 Sparsely-spread CDMA - a statistical mechanics... \n", | |
"\n", | |
" comments \\\n", | |
"0 5 pages, 5 eps figures, uses IEEEtran.cls \n", | |
"1 5 pages, 7 eps figures, uses IEEEtran.cls \n", | |
"2 None \n", | |
"3 None \n", | |
"4 23 pages, 5 figures, figure 1 amended since pu... \n", | |
"\n", | |
" journal-ref \\\n", | |
"0 None \n", | |
"1 None \n", | |
"2 Algorithms in Bioinformatics: 7th Internationa... \n", | |
"3 None \n", | |
"4 J. Phys. A: Math. Theor. 40 No 41 (12 October ... \n", | |
"\n", | |
" doi report-no \\\n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 10.1007/978-3-540-74126-8_23 None \n", | |
"3 None Report 2007:ROPS \n", | |
"4 10.1088/1751-8113/40/41/004 None \n", | |
"\n", | |
" categories license \\\n", | |
"0 cs.NE cs.AI None \n", | |
"1 cs.NE cs.AI None \n", | |
"2 cs.DS None \n", | |
"3 cs.CE cond-mat.stat-mech cs.MS cs.NA physics.d... None \n", | |
"4 cs.IT math.IT None \n", | |
"\n", | |
" abstract \\\n", | |
"0 The intelligent acoustic emission locator is... \n", | |
"1 Part I describes an intelligent acoustic emi... \n", | |
"2 In this paper, we introduce the on-line Vite... \n", | |
"3 Real Options for Project Schedules (ROPS) ha... \n", | |
"4 Sparse Code Division Multiple Access (CDMA),... \n", | |
"\n", | |
" versions update_date \\\n", | |
"0 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2009-09-29 \n", | |
"1 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2007-05-23 \n", | |
"2 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2010-01-25 \n", | |
"3 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2007-05-23 \n", | |
"4 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2009-11-13 \n", | |
"\n", | |
" authors_parsed \n", | |
"0 [[Kosel, T., ], [Grabec, I., ]] \n", | |
"1 [[Kosel, T., ], [Grabec, I., ]] \n", | |
"2 [[Šrámek, Rastislav, ], [Brejová, Broňa, ], [V... \n", | |
"3 [[Ingber, Lester, ]] \n", | |
"4 [[Raymond, Jack, ], [Saad, David, ]] " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Each `versions` value is a list of dicts such as {'version': 'v1', 'created': ...};\n", | |
"# take the version tag of the last entry (presumably the most recent one -- confirm).\n", | |
"df[\"latest_version\"] = df[\"versions\"].map(lambda history: history[-1][\"version\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Drop rows whose id starts with \"cs\"; this removes 7147 of the 448448 rows.\n", | |
"has_cs_prefix = df[\"id\"].str.startswith(\"cs\")\n", | |
"df = df.loc[~has_cs_prefix]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# PDF file name: <id><latest_version>.pdf\n", | |
"df[\"filename\"] = df[\"id\"].str.cat(df[\"latest_version\"]) + \".pdf\"\n", | |
"\n", | |
"# Bucket URL: the directory is the part of the file name before its first dot.\n", | |
"df[\"url\"] = [\n", | |
" f\"https://storage.googleapis.com/arxiv-dataset/arxiv/arxiv/pdf/{pdf_name.split('.')[0]}/{pdf_name}\"\n", | |
" for pdf_name in df[\"filename\"]\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def _download_command(url):\n", | |
" \"\"\"Return the `curl | pdftotext` pipeline that converts one PDF url to a .txt file.\"\"\"\n", | |
" pdf_name = url.split(\"/\")[-1]\n", | |
" # Swap only the trailing extension; a blanket replace(\"pdf\", \"txt\") would also\n", | |
" # rewrite any other \"pdf\" substring in the file name.\n", | |
" txt_name = pdf_name.rsplit(\".\", 1)[0] + \".txt\"\n", | |
" return f\"curl -sSo- {url} | pdftotext -q - {txt_name}\"\n", | |
"\n", | |
"\n", | |
"# Write the commands verbatim, one per line. Going through to_csv(sep=\" \") makes the\n", | |
"# CSV writer wrap every command in double quotes (each one contains the separator),\n", | |
"# so a shell running the script would see one quoted word instead of a pipeline.\n", | |
"with open(\"download.sh\", \"w\", encoding=\"utf-8\") as f:\n", | |
" f.writelines(_download_command(url) + \"\\n\" for url in df[\"url\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Persist the metadata DataFrame as a Parquet file.\n", | |
"df.to_parquet(path=\"arxiv-cs-metadata.parquet\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Dump the kept raw records as JSON Lines: one json.dumps() result per line.\n", | |
"with open(\"cs_lines.json\", \"w\") as out_file:\n", | |
" out_file.writelines(json.dumps(record) + \"\\n\" for record in cs_lines)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 441301 download.sh\n" | |
] | |
} | |
], | |
"source": [ | |
"# Count the lines of the generated script; the saved output reports 441301.\n", | |
"!wc -l download.sh" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "base", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.13" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment