Last active
March 3, 2020 15:58
-
-
Save Romern/225ceeca7a7825c0d2be7554c03b2bea to your computer and use it in GitHub Desktop.
StudyDriveDownloader: Python implementation of the StudyDrive API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import os | |
from datetime import datetime | |
# Base URL of the StudyDrive REST API; every endpoint path below is appended to it.
baseurl = "https://api.studydrive.net/"
def login(user, passwd):
    """Authenticate against the StudyDrive OAuth password grant and return the access token."""
    payload = {
        "client_id": 4,
        "client_secret": "nmGaT4rJ3VVGQXu75ymi5Cu5bdqb3tFnkWw9f1IX",
        "grant_type": "password",
        "username": user,
        "password": passwd,
    }
    response = requests.post('{}oauth/token'.format(baseurl), data=payload)
    response.raise_for_status()
    return json.loads(response.text)['access_token']
def getTime():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # strftime already yields a str, so no extra conversion is needed.
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
def getUniversityData(universityid, token):
    """Fetch every course of the given university and return the decoded JSON payload."""
    auth_header = {"authorization": "Bearer " + token}
    url = '{}api/app/v1/universities/{}/courses'.format(baseurl, universityid)
    response = requests.get(url, headers=auth_header)
    response.raise_for_status()
    return json.loads(response.text)
def getCourseData(courseid, token, page=0, reference_time=None):
    """Fetch one page of a course's document feed and return the decoded JSON payload.

    For pages beyond the first the API requires a reference_time anchor
    (format "2019-09-24 14:52:08"); when none is supplied the current time is used.
    """
    query = {
        "sort": "time",
        "page": page,
        "semester_from": 0,
        "semester_until": 0,
        "type_ids": 0,
    }
    if page > 0:
        query["reference_time"] = reference_time if reference_time is not None else getTime()
    auth_header = {"authorization": "Bearer " + token}
    url = '{}api/app/v1/feed/courses/{}/documents'.format(baseurl, courseid)
    response = requests.get(url, params=query, headers=auth_header)
    response.raise_for_status()
    return json.loads(response.text)
def getFullCourseData(courseid, token, until=None):
    """Collect a course's document feed across all pages.

    until: optional datetime (e.g. datetime.now()); paging stops early once the
    files gathered so far contain nothing uploaded after that moment.
    Returns the first page's payload with its "files" list extended to cover
    every fetched page.
    """
    reference_time = getTime()
    data = getCourseData(courseid, token, page=0, reference_time=reference_time)
    collected = data["files"]
    for page in range(1, int(data["last_page"]) + 1):
        if until is not None:
            newer = [entry for entry in collected
                     if datetime.strptime(entry["uploaded"], '%Y-%m-%d %H:%M:%S') > until]
            if not newer:
                break
        next_page = getCourseData(courseid, token, page=page, reference_time=reference_time)
        collected.extend(next_page["files"])
    data["files"] = collected
    return data
def getDocument(docid, token):
    """Download a single document by id and return its raw bytes."""
    auth_header = {"authorization": "Bearer " + token}
    url = '{}api/app/v1/documents/{}/download'.format(baseurl, docid)
    response = requests.get(url, headers=auth_header)
    response.raise_for_status()
    return response.content
def downloadAllFilesInCourse(filelist, token, folder="."):
    """Download every document in filelist into folder.

    filelist: file entries from a course feed's "files" list; each entry needs
    "file_id" and "file_name" keys.
    token: OAuth access token used for the download requests.
    folder: destination directory (assumed to already exist).

    Files already present in the destination are reported and skipped.
    """
    for entry in filelist:
        docid = entry['file_id']
        # Insert the document id before the extension so distinct uploads that
        # share a display name cannot clobber each other. str() is required:
        # file_id arrives from JSON as an int, and the original concatenation
        # raised TypeError (and also dropped the dot before the extension).
        base, dot, ext = entry['file_name'].rpartition(".")
        if dot:
            docname = "{}_{}.{}".format(base, docid, ext)
        else:
            docname = "{}_{}".format(entry['file_name'], docid)
        path = os.path.join(folder, docname)
        # Bug fix: check the destination path, not the current directory, and
        # actually skip the download (previously the file was re-fetched and
        # overwritten right after the duplicate was reported).
        if os.path.isfile(path):
            print("Found duplicate: {} already exists".format(docname))
            continue
        print("Downloading {}...".format(docname))
        # Bug fix: was `getDocument(docid, login_token)` — an undefined global.
        doc = getDocument(docid, token)
        # Context manager guarantees the handle is closed even if write fails.
        with open(path, "wb") as out:
            out.write(doc)
def crawlAllCourses(lastcrawled, university_id, token):
    """Download new documents for every course of a university.

    lastcrawled: maps course_id -> timestamp string of the newest file seen in
    a previous run, e.g. {'50936': "2019-09-27 11:38:57"}; courses not listed
    are crawled in full.
    university_id: id of the university whose courses are crawled.
    token: OAuth access token.

    Returns lastcrawled updated with the newest upload time per course.
    """
    # Bug fix: was `getUniversityData(universityid, token)` — the parameter is
    # named university_id, so this raised NameError.
    courses = getUniversityData(university_id, token)
    for course in courses:
        course_id = course["course_id"]
        if course_id in lastcrawled:
            # Bug fix: getFullCourseData compares parsed upload datetimes
            # against `until`, so the stored timestamp string must be parsed
            # instead of being passed through verbatim.
            until = datetime.strptime(lastcrawled[course_id], '%Y-%m-%d %H:%M:%S')
        else:
            until = None
        data = getFullCourseData(course_id, token, until=until)
        if not os.path.exists(course["course_name"]):
            os.mkdir(course["course_name"])
        downloadAllFilesInCourse(data["files"], token, folder=course["course_name"])
        # Bug fix: guard against a course with no documents — indexing
        # data["files"][0] unconditionally raised IndexError.
        if data["files"]:
            lastcrawled[course_id] = data["files"][0]["uploaded"]
    return lastcrawled  # return updated lastcrawled
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for your quick answer — for testing purposes I used curl instead of Python and forgot the `-X POST` 🙈