Created
March 4, 2023 18:07
-
-
Save adzcai/f527e8a1df08c18cf22093e783ab8b9b to your computer and use it in GitHub Desktop.
Scrape arXiv categories from https://arxiv.org/category_taxonomy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scrapes the top-level groups and their subcategories out of the
// "#category_taxonomy_list" element (see https://arxiv.org/category_taxonomy).
// Meant to be pasted into the browser console on that page; the result is
// left in `categoriesList`.

/**
 * Removes one pair of enclosing parentheses from a label, if present.
 * Unlike a blind `slice(1, -1)`, text without parentheses is returned intact.
 *
 * @param {string} text - Raw text content of the description element.
 * @returns {string} The trimmed label without its surrounding parentheses.
 */
function stripParentheses(text) {
  return text.trim().replace(/^\(([\s\S]*)\)$/, "$1");
}

/**
 * Reads one subcategory element.
 *
 * The element's first grandchild is treated as the heading: its first child
 * node carries the subcategory id and its first child element carries the
 * parenthesised description.
 *
 * @param {Element} subcategory - One entry of a subcategory list.
 * @returns {{name: string, description: string} | null} The parsed record,
 *   or `null` when the element does not have the expected structure.
 */
function parseSubcategory(subcategory) {
  const wrapper = subcategory.children[0];
  const heading = wrapper ? wrapper.children[0] : undefined;
  const code = heading ? heading.firstChild : null;
  const label = heading ? heading.children[0] : undefined;
  if (!code || !label) {
    return null;
  }
  return {
    name: code.textContent.trim(),
    description: stripParentheses(label.textContent),
  };
}

/**
 * Pairs every H2 with the next DIV that follows it.
 *
 * A DIV that is not preceded by an unpaired H2, or an H2 that never gets a
 * DIV, is dropped, so one stray element cannot shift all later pairs.
 *
 * @param {Element[]} elements - H2 and DIV elements in document order.
 * @returns {Array<[Element, Element]>} `[header, body]` pairs.
 */
function groupHeaderBodyPairs(elements) {
  const pairs = [];
  let pendingHeader = null;
  for (const element of elements) {
    if (element.tagName === "H2") {
      pendingHeader = element;
    } else if (element.tagName === "DIV" && pendingHeader !== null) {
      pairs.push([pendingHeader, element]);
      pendingHeader = null;
    }
  }
  return pairs;
}

/**
 * Converts one `[header, body]` pair into a category record.
 *
 * @param {[Element, Element]} pair - The H2 header and the DIV that follows it.
 * @returns {{name: string, subcategories: Array<{name: string, description: string}>}}
 */
function parseCategory([header, body]) {
  // two nested divs inside the body hold the list of subcategory divs
  const outer = body.children[0];
  const list = outer ? outer.children[0] : undefined;
  const subcategories = [];
  for (const element of list ? Array.from(list.children) : []) {
    const parsed = parseSubcategory(element);
    if (parsed === null) {
      // Report the skipped element instead of leaving a `null` hole in the output.
      console.warn("Skipping subcategory element with unexpected structure:", element);
    } else {
      subcategories.push(parsed);
    }
  }
  return {
    name: header.textContent.trim(),
    subcategories,
  };
}

// `document` only exists in a browser; without it the script yields an empty list.
const categories =
  typeof document === "undefined"
    ? null
    : document.querySelector("#category_taxonomy_list");
if (typeof document !== "undefined" && categories === null) {
  throw new Error('Element "#category_taxonomy_list" was not found on this page.');
}
// keep only the h2 headers and the divs that follow them
const categoryPairs =
  categories === null
    ? []
    : Array.from(categories.children).filter(
        (child) => child.tagName === "H2" || child.tagName === "DIV"
      );
// group the h2 and div pairs into an array of arrays
const categoryGroups = groupHeaderBodyPairs(categoryPairs);
// get the category name and subcategories from each group
const categoriesList = categoryGroups.map(parseCategory);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"name": "Computer Science", | |
"subcategories": [ | |
{ | |
"name": "cs.AI", | |
"description": "Artificial Intelligence" | |
}, | |
{ | |
"name": "cs.AR", | |
"description": "Hardware Architecture" | |
}, | |
{ | |
"name": "cs.CC", | |
"description": "Computational Complexity" | |
}, | |
{ | |
"name": "cs.CE", | |
"description": "Computational Engineering, Finance, and Science" | |
}, | |
{ | |
"name": "cs.CG", | |
"description": "Computational Geometry" | |
}, | |
{ | |
"name": "cs.CL", | |
"description": "Computation and Language" | |
}, | |
{ | |
"name": "cs.CR", | |
"description": "Cryptography and Security" | |
}, | |
{ | |
"name": "cs.CV", | |
"description": "Computer Vision and Pattern Recognition" | |
}, | |
{ | |
"name": "cs.CY", | |
"description": "Computers and Society" | |
}, | |
{ | |
"name": "cs.DB", | |
"description": "Databases" | |
}, | |
{ | |
"name": "cs.DC", | |
"description": "Distributed, Parallel, and Cluster Computing" | |
}, | |
{ | |
"name": "cs.DL", | |
"description": "Digital Libraries" | |
}, | |
{ | |
"name": "cs.DM", | |
"description": "Discrete Mathematics" | |
}, | |
{ | |
"name": "cs.DS", | |
"description": "Data Structures and Algorithms" | |
}, | |
{ | |
"name": "cs.ET", | |
"description": "Emerging Technologies" | |
}, | |
{ | |
"name": "cs.FL", | |
"description": "Formal Languages and Automata Theory" | |
}, | |
{ | |
"name": "cs.GL", | |
"description": "General Literature" | |
}, | |
{ | |
"name": "cs.GR", | |
"description": "Graphics" | |
}, | |
{ | |
"name": "cs.GT", | |
"description": "Computer Science and Game Theory" | |
}, | |
{ | |
"name": "cs.HC", | |
"description": "Human-Computer Interaction" | |
}, | |
{ | |
"name": "cs.IR", | |
"description": "Information Retrieval" | |
}, | |
{ | |
"name": "cs.IT", | |
"description": "Information Theory" | |
}, | |
{ | |
"name": "cs.LG", | |
"description": "Machine Learning" | |
}, | |
{ | |
"name": "cs.LO", | |
"description": "Logic in Computer Science" | |
}, | |
{ | |
"name": "cs.MA", | |
"description": "Multiagent Systems" | |
}, | |
{ | |
"name": "cs.MM", | |
"description": "Multimedia" | |
}, | |
{ | |
"name": "cs.MS", | |
"description": "Mathematical Software" | |
}, | |
{ | |
"name": "cs.NA", | |
"description": "Numerical Analysis" | |
}, | |
{ | |
"name": "cs.NE", | |
"description": "Neural and Evolutionary Computing" | |
}, | |
{ | |
"name": "cs.NI", | |
"description": "Networking and Internet Architecture" | |
}, | |
{ | |
"name": "cs.OH", | |
"description": "Other Computer Science" | |
}, | |
{ | |
"name": "cs.OS", | |
"description": "Operating Systems" | |
}, | |
{ | |
"name": "cs.PF", | |
"description": "Performance" | |
}, | |
{ | |
"name": "cs.PL", | |
"description": "Programming Languages" | |
}, | |
{ | |
"name": "cs.RO", | |
"description": "Robotics" | |
}, | |
{ | |
"name": "cs.SC", | |
"description": "Symbolic Computation" | |
}, | |
{ | |
"name": "cs.SD", | |
"description": "Sound" | |
}, | |
{ | |
"name": "cs.SE", | |
"description": "Software Engineering" | |
}, | |
{ | |
"name": "cs.SI", | |
"description": "Social and Information Networks" | |
}, | |
{ | |
"name": "cs.SY", | |
"description": "Systems and Control" | |
} | |
] | |
}, | |
{ | |
"name": "Economics", | |
"subcategories": [ | |
{ | |
"name": "econ.EM", | |
"description": "Econometrics" | |
}, | |
{ | |
"name": "econ.GN", | |
"description": "General Economics" | |
}, | |
{ | |
"name": "econ.TH", | |
"description": "Theoretical Economics" | |
} | |
] | |
}, | |
{ | |
"name": "Electrical Engineering and Systems Science", | |
"subcategories": [ | |
{ | |
"name": "eess.AS", | |
"description": "Audio and Speech Processing" | |
}, | |
{ | |
"name": "eess.IV", | |
"description": "Image and Video Processing" | |
}, | |
{ | |
"name": "eess.SP", | |
"description": "Signal Processing" | |
}, | |
{ | |
"name": "eess.SY", | |
"description": "Systems and Control" | |
} | |
] | |
}, | |
{ | |
"name": "Mathematics", | |
"subcategories": [ | |
{ | |
"name": "math.AC", | |
"description": "Commutative Algebra" | |
}, | |
{ | |
"name": "math.AG", | |
"description": "Algebraic Geometry" | |
}, | |
{ | |
"name": "math.AP", | |
"description": "Analysis of PDEs" | |
}, | |
{ | |
"name": "math.AT", | |
"description": "Algebraic Topology" | |
}, | |
{ | |
"name": "math.CA", | |
"description": "Classical Analysis and ODEs" | |
}, | |
{ | |
"name": "math.CO", | |
"description": "Combinatorics" | |
}, | |
{ | |
"name": "math.CT", | |
"description": "Category Theory" | |
}, | |
{ | |
"name": "math.CV", | |
"description": "Complex Variables" | |
}, | |
{ | |
"name": "math.DG", | |
"description": "Differential Geometry" | |
}, | |
{ | |
"name": "math.DS", | |
"description": "Dynamical Systems" | |
}, | |
{ | |
"name": "math.FA", | |
"description": "Functional Analysis" | |
}, | |
{ | |
"name": "math.GM", | |
"description": "General Mathematics" | |
}, | |
{ | |
"name": "math.GN", | |
"description": "General Topology" | |
}, | |
{ | |
"name": "math.GR", | |
"description": "Group Theory" | |
}, | |
{ | |
"name": "math.GT", | |
"description": "Geometric Topology" | |
}, | |
{ | |
"name": "math.HO", | |
"description": "History and Overview" | |
}, | |
{ | |
"name": "math.IT", | |
"description": "Information Theory" | |
}, | |
{ | |
"name": "math.KT", | |
"description": "K-Theory and Homology" | |
}, | |
{ | |
"name": "math.LO", | |
"description": "Logic" | |
}, | |
{ | |
"name": "math.MG", | |
"description": "Metric Geometry" | |
}, | |
{ | |
"name": "math.MP", | |
"description": "Mathematical Physics" | |
}, | |
{ | |
"name": "math.NA", | |
"description": "Numerical Analysis" | |
}, | |
{ | |
"name": "math.NT", | |
"description": "Number Theory" | |
}, | |
{ | |
"name": "math.OA", | |
"description": "Operator Algebras" | |
}, | |
{ | |
"name": "math.OC", | |
"description": "Optimization and Control" | |
}, | |
{ | |
"name": "math.PR", | |
"description": "Probability" | |
}, | |
{ | |
"name": "math.QA", | |
"description": "Quantum Algebra" | |
}, | |
{ | |
"name": "math.RA", | |
"description": "Rings and Algebras" | |
}, | |
{ | |
"name": "math.RT", | |
"description": "Representation Theory" | |
}, | |
{ | |
"name": "math.SG", | |
"description": "Symplectic Geometry" | |
}, | |
{ | |
"name": "math.SP", | |
"description": "Spectral Theory" | |
}, | |
{ | |
"name": "math.ST", | |
"description": "Statistics Theory" | |
} | |
] | |
}, | |
{ | |
"name": "Quantitative Biology", | |
"subcategories": [ | |
{ | |
"name": "q-bio.BM", | |
"description": "Biomolecules" | |
}, | |
{ | |
"name": "q-bio.CB", | |
"description": "Cell Behavior" | |
}, | |
{ | |
"name": "q-bio.GN", | |
"description": "Genomics" | |
}, | |
{ | |
"name": "q-bio.MN", | |
"description": "Molecular Networks" | |
}, | |
{ | |
"name": "q-bio.NC", | |
"description": "Neurons and Cognition" | |
}, | |
{ | |
"name": "q-bio.OT", | |
"description": "Other Quantitative Biology" | |
}, | |
{ | |
"name": "q-bio.PE", | |
"description": "Populations and Evolution" | |
}, | |
{ | |
"name": "q-bio.QM", | |
"description": "Quantitative Methods" | |
}, | |
{ | |
"name": "q-bio.SC", | |
"description": "Subcellular Processes" | |
}, | |
{ | |
"name": "q-bio.TO", | |
"description": "Tissues and Organs" | |
} | |
] | |
}, | |
{ | |
"name": "Quantitative Finance", | |
"subcategories": [ | |
{ | |
"name": "q-fin.CP", | |
"description": "Computational Finance" | |
}, | |
{ | |
"name": "q-fin.EC", | |
"description": "Economics" | |
}, | |
{ | |
"name": "q-fin.GN", | |
"description": "General Finance" | |
}, | |
{ | |
"name": "q-fin.MF", | |
"description": "Mathematical Finance" | |
}, | |
{ | |
"name": "q-fin.PM", | |
"description": "Portfolio Management" | |
}, | |
{ | |
"name": "q-fin.PR", | |
"description": "Pricing of Securities" | |
}, | |
{ | |
"name": "q-fin.RM", | |
"description": "Risk Management" | |
}, | |
{ | |
"name": "q-fin.ST", | |
"description": "Statistical Finance" | |
}, | |
{ | |
"name": "q-fin.TR", | |
"description": "Trading and Market Microstructure" | |
} | |
] | |
}, | |
{ | |
"name": "Statistics", | |
"subcategories": [ | |
{ | |
"name": "stat.AP", | |
"description": "Applications" | |
}, | |
{ | |
"name": "stat.CO", | |
"description": "Computation" | |
}, | |
{ | |
"name": "stat.ME", | |
"description": "Methodology" | |
}, | |
{ | |
"name": "stat.ML", | |
"description": "Machine Learning" | |
}, | |
{ | |
"name": "stat.OT", | |
"description": "Other Statistics" | |
}, | |
{ | |
"name": "stat.TH", | |
"description": "Statistics Theory" | |
} | |
] | |
} | |
] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scrapes the nested groups found inside the 10th child of
// "#category_taxonomy_list". Each child of that div is one group whose first
// element is the header and whose second element holds the subcategory list.
// Meant to be pasted into the browser console; the result is left in
// `categoriesList`.

/**
 * Removes one pair of enclosing parentheses from a label, if present.
 * Unlike a blind `slice(1, -1)`, text without parentheses is returned intact.
 *
 * @param {string} text - Raw text content of the description element.
 * @returns {string} The trimmed label without its surrounding parentheses.
 */
function unwrapParentheses(text) {
  return text.trim().replace(/^\(([\s\S]*)\)$/, "$1");
}

/**
 * Reads one subcategory element of a group body.
 *
 * The element's first grandchild is treated as the heading: its first child
 * node carries the subcategory id and its first child element carries the
 * parenthesised description.
 *
 * @param {Element} subcategory - One child of a group body.
 * @returns {{name: string, description: string} | null} The parsed record,
 *   or `null` when the element does not have the expected structure.
 */
function parseGroupSubcategory(subcategory) {
  const wrapper = subcategory.children[0];
  const heading = wrapper ? wrapper.children[0] : undefined;
  const code = heading ? heading.firstChild : null;
  const label = heading ? heading.children[0] : undefined;
  if (!code || !label) {
    return null;
  }
  return {
    name: code.textContent.trim(),
    description: unwrapParentheses(label.textContent),
  };
}

/**
 * Converts one `[header, body]` group into a record.
 *
 * @param {Iterable<Element>} group - The header element followed by the body element.
 * @returns {{name: string, subtitle: string, subcategories: Array<{name: string, description: string}>}}
 */
function parseArchiveGroup([header, body]) {
  // the group title is the first node of the header's first child element
  const name = header.children[0].firstChild.textContent.trim();
  // the first span inside the header supplies the subtitle
  const subtitle = header.querySelector('span').textContent.trim();
  // the direct children of the body are the subcategory elements
  const subcategories = [];
  for (const element of Array.from(body.children)) {
    const parsed = parseGroupSubcategory(element);
    if (parsed === null) {
      // Report the skipped element instead of leaving a `null` hole in the output.
      console.warn("Skipping subcategory element with unexpected structure:", element);
    } else {
      subcategories.push(parsed);
    }
  }
  return {
    name,
    subtitle,
    subcategories,
  };
}

/**
 * Parses every group that has both a header and a body element.
 *
 * @param {Array<ArrayLike<Element> & Iterable<Element>>} groups - Child
 *   collections, one per group.
 * @returns {Array<{name: string, subtitle: string, subcategories: Array<{name: string, description: string}>}>}
 */
function parseArchiveGroups(groups) {
  const parsedGroups = [];
  for (const group of groups) {
    if (group.length < 2) {
      // Without a body there is nothing to read; report it rather than crash.
      console.warn("Skipping group without a header/body pair:", group);
      continue;
    }
    parsedGroups.push(parseArchiveGroup(group));
  }
  return parsedGroups;
}

// `document` only exists in a browser; without it the script yields an empty list.
const categories =
  typeof document === "undefined"
    ? null
    : document.querySelector("#category_taxonomy_list > div:nth-child(10)");
if (typeof document !== "undefined" && categories === null) {
  throw new Error('Element "#category_taxonomy_list > div:nth-child(10)" was not found on this page.');
}
// each child of the container is one group; keep its child elements (header, body)
const categoryGroups =
  categories === null
    ? []
    : Array.from(categories.children).map((c) => c.children);
// get the name, subtitle and subcategories from each group
const categoriesList = parseArchiveGroups(categoryGroups);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"name": "Astrophysics", | |
"subtitle": "astro-ph", | |
"subcategories": [ | |
{ | |
"name": "astro-ph.CO", | |
"description": "Cosmology and Nongalactic Astrophysics" | |
}, | |
{ | |
"name": "astro-ph.EP", | |
"description": "Earth and Planetary Astrophysics" | |
}, | |
{ | |
"name": "astro-ph.GA", | |
"description": "Astrophysics of Galaxies" | |
}, | |
{ | |
"name": "astro-ph.HE", | |
"description": "High Energy Astrophysical Phenomena" | |
}, | |
{ | |
"name": "astro-ph.IM", | |
"description": "Instrumentation and Methods for Astrophysics" | |
}, | |
{ | |
"name": "astro-ph.SR", | |
"description": "Solar and Stellar Astrophysics" | |
} | |
] | |
}, | |
{ | |
"name": "Condensed Matter", | |
"subtitle": "cond-mat", | |
"subcategories": [ | |
{ | |
"name": "cond-mat.dis-nn", | |
"description": "Disordered Systems and Neural Networks" | |
}, | |
{ | |
"name": "cond-mat.mes-hall", | |
"description": "Mesoscale and Nanoscale Physics" | |
}, | |
{ | |
"name": "cond-mat.mtrl-sci", | |
"description": "Materials Science" | |
}, | |
{ | |
"name": "cond-mat.other", | |
"description": "Other Condensed Matter" | |
}, | |
{ | |
"name": "cond-mat.quant-gas", | |
"description": "Quantum Gases" | |
}, | |
{ | |
"name": "cond-mat.soft", | |
"description": "Soft Condensed Matter" | |
}, | |
{ | |
"name": "cond-mat.stat-mech", | |
"description": "Statistical Mechanics" | |
}, | |
{ | |
"name": "cond-mat.str-el", | |
"description": "Strongly Correlated Electrons" | |
}, | |
{ | |
"name": "cond-mat.supr-con", | |
"description": "Superconductivity" | |
} | |
] | |
}, | |
{ | |
"name": "General Relativity and Quantum Cosmology", | |
"subtitle": "gr-qc", | |
"subcategories": [ | |
{ | |
"name": "gr-qc", | |
"description": "General Relativity and Quantum Cosmology" | |
} | |
] | |
}, | |
{ | |
"name": "High Energy Physics - Experiment", | |
"subtitle": "hep-ex", | |
"subcategories": [ | |
{ | |
"name": "hep-ex", | |
"description": "High Energy Physics - Experiment" | |
} | |
] | |
}, | |
{ | |
"name": "High Energy Physics - Lattice", | |
"subtitle": "hep-lat", | |
"subcategories": [ | |
{ | |
"name": "hep-lat", | |
"description": "High Energy Physics - Lattice" | |
} | |
] | |
}, | |
{ | |
"name": "High Energy Physics - Phenomenology", | |
"subtitle": "hep-ph", | |
"subcategories": [ | |
{ | |
"name": "hep-ph", | |
"description": "High Energy Physics - Phenomenology" | |
} | |
] | |
}, | |
{ | |
"name": "High Energy Physics - Theory", | |
"subtitle": "hep-th", | |
"subcategories": [ | |
{ | |
"name": "hep-th", | |
"description": "High Energy Physics - Theory" | |
} | |
] | |
}, | |
{ | |
"name": "Mathematical Physics", | |
"subtitle": "math-ph", | |
"subcategories": [ | |
{ | |
"name": "math-ph", | |
"description": "Mathematical Physics" | |
} | |
] | |
}, | |
{ | |
"name": "Nonlinear Sciences", | |
"subtitle": "nlin", | |
"subcategories": [ | |
{ | |
"name": "nlin.AO", | |
"description": "Adaptation and Self-Organizing Systems" | |
}, | |
{ | |
"name": "nlin.CD", | |
"description": "Chaotic Dynamics" | |
}, | |
{ | |
"name": "nlin.CG", | |
"description": "Cellular Automata and Lattice Gases" | |
}, | |
{ | |
"name": "nlin.PS", | |
"description": "Pattern Formation and Solitons" | |
}, | |
{ | |
"name": "nlin.SI", | |
"description": "Exactly Solvable and Integrable Systems" | |
} | |
] | |
}, | |
{ | |
"name": "Nuclear Experiment", | |
"subtitle": "nucl-ex", | |
"subcategories": [ | |
{ | |
"name": "nucl-ex", | |
"description": "Nuclear Experiment" | |
} | |
] | |
}, | |
{ | |
"name": "Nuclear Theory", | |
"subtitle": "nucl-th", | |
"subcategories": [ | |
{ | |
"name": "nucl-th", | |
"description": "Nuclear Theory" | |
} | |
] | |
}, | |
{ | |
"name": "Physics", | |
"subtitle": "physics", | |
"subcategories": [ | |
{ | |
"name": "physics.acc-ph", | |
"description": "Accelerator Physics" | |
}, | |
{ | |
"name": "physics.ao-ph", | |
"description": "Atmospheric and Oceanic Physics" | |
}, | |
{ | |
"name": "physics.app-ph", | |
"description": "Applied Physics" | |
}, | |
{ | |
"name": "physics.atm-clus", | |
"description": "Atomic and Molecular Clusters" | |
}, | |
{ | |
"name": "physics.atom-ph", | |
"description": "Atomic Physics" | |
}, | |
{ | |
"name": "physics.bio-ph", | |
"description": "Biological Physics" | |
}, | |
{ | |
"name": "physics.chem-ph", | |
"description": "Chemical Physics" | |
}, | |
{ | |
"name": "physics.class-ph", | |
"description": "Classical Physics" | |
}, | |
{ | |
"name": "physics.comp-ph", | |
"description": "Computational Physics" | |
}, | |
{ | |
"name": "physics.data-an", | |
"description": "Data Analysis, Statistics and Probability" | |
}, | |
{ | |
"name": "physics.ed-ph", | |
"description": "Physics Education" | |
}, | |
{ | |
"name": "physics.flu-dyn", | |
"description": "Fluid Dynamics" | |
}, | |
{ | |
"name": "physics.gen-ph", | |
"description": "General Physics" | |
}, | |
{ | |
"name": "physics.geo-ph", | |
"description": "Geophysics" | |
}, | |
{ | |
"name": "physics.hist-ph", | |
"description": "History and Philosophy of Physics" | |
}, | |
{ | |
"name": "physics.ins-det", | |
"description": "Instrumentation and Detectors" | |
}, | |
{ | |
"name": "physics.med-ph", | |
"description": "Medical Physics" | |
}, | |
{ | |
"name": "physics.optics", | |
"description": "Optics" | |
}, | |
{ | |
"name": "physics.plasm-ph", | |
"description": "Plasma Physics" | |
}, | |
{ | |
"name": "physics.pop-ph", | |
"description": "Popular Physics" | |
}, | |
{ | |
"name": "physics.soc-ph", | |
"description": "Physics and Society" | |
}, | |
{ | |
"name": "physics.space-ph", | |
"description": "Space Physics" | |
} | |
] | |
}, | |
{ | |
"name": "Quantum Physics", | |
"subtitle": "quant-ph", | |
"subcategories": [ | |
{ | |
"name": "quant-ph", | |
"description": "Quantum Physics" | |
} | |
] | |
} | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment