Skip to content

Instantly share code, notes, and snippets.

@morisy
Last active January 31, 2022 01:05
Show Gist options
  • Save morisy/86cdd91858207e1626e23fc49055fdd1 to your computer and use it in GitHub Desktop.
Save morisy/86cdd91858207e1626e23fc49055fdd1 to your computer and use it in GitHub Desktop.
"""
file: proj_get_doc_urls.py
what: script to download all the PDFs from a specified project
To use, first install the Python DocumentCloud wrapper (running `pip install documentcloud` from the command line will often do), then run this script from the directory where you want the files saved.
Read more about the DocumentCloud Python Wrapper here:
https://documentcloud.readthedocs.io/en/latest/index.html
"""
# import the modules for this script
from documentcloud import DocumentCloud
from getpass import getpass
# Prefer credentials stored in a local credentials.py module. If that module
# cannot be imported or lacks an attribute, show the reason and fall back to
# prompting interactively. The catch is deliberately broad so that any problem
# with credentials.py degrades to the prompt instead of aborting the script.
try:
    import credentials
    username = credentials.username
    password = credentials.password
except Exception as load_error:
    print(load_error)
    username = input("Username: ")
    password = getpass()
    # Optional: uncomment the lines below to store the credentials entered
    # above in credentials.py so later runs skip the prompt. This is
    # convenient but risky: the password is written to disk in plaintext.
    # print("Note: This will save your username and password in plaintext on your local device in credentials.py")
    # with open("credentials.py", "w") as credentials_file:
    #     credentials_file.write("username = " + repr(username) + "\n")
    #     credentials_file.write("password = " + repr(password) + "\n")
else:
    print("Retrieved password from credentials.py. NOTE: YOUR PASSWORD IS SAVED IN PLAINTEXT ON THIS DEVICE.")

# Create the DocumentCloud client with the username and password gathered above.
client = DocumentCloud(username, password)

# ID of the project whose documents will be downloaded.
project_id = input("Project ID (Number at end of URL): ")
def proj_download_docs(project_id):
    """
    Save every document of a DocumentCloud project as a local PDF file.

    The project is looked up through the module-level ``client``. Each
    document is written to the current working directory as
    ``"<document id> <document title>.pdf"``. Characters in the title that
    are path separators or are not allowed in file names on common
    platforms are replaced with ``_`` so that the title cannot break the
    path or redirect the file into another directory.

    Args:
        project_id: ID of the project (the number at the end of its URL).
            It is converted to ``str`` before the lookup.
    """
    # Imported locally because only this function needs it.
    import re

    project = client.projects.get(id=str(project_id))
    print("Saving files from " + project.title + " locally.")

    # TODO: optionally save into a per-project folder instead of the
    # current working directory.
    for document_id in project.document_ids:
        document = client.documents.get(document_id)
        # Titles are free text; neutralise separators, reserved punctuation
        # and control characters before using the title in a file name.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", document.title)
        filename = str(document.id) + " " + safe_title + ".pdf"
        # document.pdf is written in binary mode (presumably the raw PDF
        # bytes fetched by the wrapper); the context manager closes the
        # file even if the write fails.
        with open(filename, "wb") as pdf_file:
            pdf_file.write(document.pdf)
# Script entry point: download the documents of the project entered at the prompt.
if __name__ == "__main__":
    proj_download_docs(project_id=project_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment