Last active
April 3, 2024 15:36
-
-
Save dmarx/88cb3cae143f357d546a5a8c72bf71d3 to your computer and use it in GitHub Desktop.
Will not work on the newest articles because ar5iv is not indexed live (yet); at present the lag is two weeks.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NB: the ar5iv endpoint lags behind the main arXiv site
import subprocess | |
def convert_arxiv_url_to_markdown(arxiv_url, target_format='markdown_github'):
    """Convert an arXiv article to Markdown via its ar5iv HTML rendering.

    The article id is extracted from ``arxiv_url``, the matching
    ``https://ar5iv.labs.arxiv.org/html/<id>`` page is handed to ``pandoc``,
    and pandoc's stdout is returned.

    Args:
        arxiv_url: An arXiv URL such as ``https://arxiv.org/abs/1910.06709``
            or ``https://arxiv.org/pdf/hep-th/9901001.pdf``. A trailing
            slash, query string, or fragment is ignored.
        target_format: Pandoc output format passed to ``pandoc -t``.

    Returns:
        The converted document as a string. On failure (no id could be
        extracted, pandoc is not installed, or pandoc exits non-zero) a
        string starting with ``"Error during conversion:"`` is returned
        instead of raising.
    """
    # Query strings and fragments are never part of the article id.
    path = arxiv_url.split('#', 1)[0].split('?', 1)[0]

    # Take everything after the route marker so that old-style ids, which
    # contain a slash (e.g. "hep-th/9901001"), keep their category prefix.
    for marker in ('/abs/', '/pdf/', '/html/'):
        if marker in path:
            article_id = path.split(marker, 1)[1].strip('/')
            break
    else:
        # No known route: fall back to the last non-empty path segment.
        article_id = path.rstrip('/').rsplit('/', 1)[-1]

    # Only strip ".pdf" when it is a real suffix, not wherever it occurs.
    if article_id.endswith('.pdf'):
        article_id = article_id[:-len('.pdf')]

    if not article_id:
        return (
            "Error during conversion: "
            f"could not extract an article id from {arxiv_url!r}"
        )

    html_url = f'https://ar5iv.labs.arxiv.org/html/{article_id}'
    cmd = ['pandoc', '-t', target_format, '-o', '-', html_url]
    try:
        output = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except FileNotFoundError as e:
        # Raised when the pandoc executable is not on PATH; report it the
        # same way as any other conversion failure.
        return f"Error during conversion: {e}"
    except subprocess.CalledProcessError as e:
        # stderr was captured, so surface pandoc's own explanation too.
        detail = (e.stderr or '').strip()
        message = f"Error during conversion: {e}"
        return f"{message}\n{detail}" if detail else message
    return output.stdout
# demo: convert one sample arXiv abstract page and print the result
if __name__ == '__main__':
    demo_url = 'https://arxiv.org/abs/1910.06709'
    print(convert_arxiv_url_to_markdown(demo_url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment