Last active
January 31, 2022 01:05
-
-
Save morisy/86cdd91858207e1626e23fc49055fdd1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
file: proj_get_doc_urls.py | |
what: script to download all the PDFs from a specified project | |
To use, install the python DocumentCloud wrapper (often using `pip install documentcloud` from the command line will do), and then run this script from the directory you want to save your files to. | |
Read more about the DocumentCloud Python Wrapper here: | |
https://documentcloud.readthedocs.io/en/latest/index.html | |
""" | |
# import the modules for this script | |
from documentcloud import DocumentCloud | |
from getpass import getpass | |
# Obtain DocumentCloud credentials. An optional local credentials.py module
# (defining `username` and `password`) is tried first; if it cannot be used
# for any reason, the user is prompted on the terminal instead.
try:
    import credentials
    username = credentials.username
    password = credentials.password
    print("Retrieved password from credentials.py. NOTE: YOUR PASSWORD IS SAVED IN PLAINTEXT ON THIS DEVICE.")
except Exception as e:
    # Deliberately broad: a missing, incomplete, or malformed credentials.py
    # should all lead to the interactive prompt rather than a crash. Say why
    # we are prompting so the raw exception text is not mistaken for a failure.
    print("Could not load credentials from credentials.py (" + str(e) + "); prompting instead.")
    username = input('Username: ')
    password = getpass()
    # Note: Uncommenting the following lines will save your password in the
    # current directory, which is convenient but also risky.
    # print("Note: This will save your username and password in plaintext on your local device in credentials.py")
    # with open("credentials.py", "w") as credentials_file:
    #     credentials_file.write("username = " + repr(username) + "\n")
    #     credentials_file.write("password = " + repr(password) + "\n")

# Authenticate with DocumentCloud using the username and password obtained above.
client = DocumentCloud(username, password)

# ID of the project whose documents will be downloaded (asked for at start-up).
project_id = input('Project ID (Number at end of URL): ')
def proj_download_docs(project_id):
    """Download every document in a DocumentCloud project as a PDF.

    Looks the project up through the module-level ``client``, then fetches
    each document listed in ``project.document_ids`` and writes its PDF bytes
    to a file named ``"<document id> <document title>.pdf"``. The file name is
    relative, so files land in the current working directory.

    Args:
        project_id: ID of the project to download; it is converted to ``str``
            before the lookup.

    Returns:
        None. The function's effect is the files it writes.
    """
    project = client.projects.get(id=str(project_id))
    print("Saving files from " + project.title + " locally.")

    # TODO: save into a per-project folder (e.g. "<project id> <project title>/")
    # instead of the current working directory.
    for document_id in project.document_ids:
        document = client.documents.get(document_id)

        # A title containing a path separator would be treated as a directory
        # path by open() and fail (or write somewhere unintended), so replace
        # separators before building the file name.
        safe_title = document.title.replace("/", "_").replace("\\", "_")
        filename = str(document.id) + " " + safe_title + ".pdf"

        # The context manager closes the file even if fetching or writing the
        # PDF bytes raises part-way through.
        with open(filename, "wb") as pdf_file:
            pdf_file.write(document.pdf)
# Start the download only when this file is executed as a script, passing the
# project ID that was read from the user at module level.
if __name__ == "__main__":
    proj_download_docs(project_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment