Last active
May 21, 2023 15:28
-
-
Save mjlavin80/506d58f0b8183e8804b29446424e5118 to your computer and use it in GitHub Desktop.
Download all Github-archived EEBO-TCP xml files from their associated repositories on Github
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Download all Github-archived EEBO-TCP xml files from their associated repositories on Github | |
| # Files were created "by converting TCP files to TEI P5 using tcp2tei.xsl,TEI @ Oxford." | |
| # Running this script requires two preparatory steps. Either could be eliminated with a simple modification | |
| # 1. Creating a destination folder called tcp (all lowercase) that is placed in the same folder as this script | |
| # 2. Having "TCP.csv" (all caps filename) from https://github.com/textcreationpartnership/Texts in the same folder as this script. By default the script downloads it for you; if you already have the file, comment out the download lines below | |
| import requests | |
| import pandas as pd | |
# Download every EEBO-TCP XML file listed in TCP.csv into the existing "tcp" folder.
#
# Step 1: fetch the catalogue of TCP ids.
# Comment these lines out (down to the blank line) if you have the file already.
r = requests.get(
    "https://raw.githubusercontent.com/textcreationpartnership/Texts/master/TCP.csv",
    timeout=60,
)
# Stop on an HTTP error instead of saving an error page as the catalogue.
r.raise_for_status()
# "wb" overwrites any earlier copy (append mode duplicated the rows on every
# re-run) and stores the bytes exactly as served, independent of the
# platform's default text encoding.
with open("TCP.csv", "wb") as f:
    f.write(r.content)

# Step 2: read the ids from the "TCP" column, ignoring empty cells.
df = pd.read_csv("TCP.csv")
ids = list(df["TCP"].dropna())

# Step 3: download one XML file per id. Each id names both the repository and
# the file inside it. A single session reuses the connection across requests.
failed = []
with requests.Session() as session:
    for i in ids:
        x = "https://raw.githubusercontent.com/textcreationpartnership/%s/master/%s.xml" % (i, i)
        try:
            xml_file = session.get(x, timeout=60)
            # Treat 404/5xx as a failure so an error body is never saved as XML.
            xml_file.raise_for_status()
        except requests.RequestException as err:
            # Best effort: report the id that failed and carry on with the rest.
            print("Skipping %s: %s" % (i, err))
            failed.append(i)
            continue
        # Overwrite rather than append so a re-run cannot produce a file
        # holding two concatenated XML documents. A missing "tcp" folder
        # raises here instead of being silently ignored for every id.
        with open("tcp/%s.xml" % i, "wb") as myfile:
            myfile.write(xml_file.content)

if failed:
    print("%d of %d files could not be downloaded" % (len(failed), len(ids)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is great! Thanks.