Last active
September 14, 2024 19:48
-
-
Save branw/195a6f1a34f3068ba63d9ebfaca17b47 to your computer and use it in GitHub Desktop.
Chegg textbook solutions scraper (Sept. 2018) -- exploited client-side enforcement of trial mode on old Android app (hardcoded account is now deleted)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from requests.auth import HTTPBasicAuth | |
from pprint import pprint | |
import secrets | |
import uuid | |
from urllib.parse import urlparse | |
from collections import deque | |
import pickle | |
import sys | |
# Shared HTTP session used by every request in this script. It carries the
# Basic-auth credentials plus a freshly generated device id and session id.
s = requests.Session()
s.auth = HTTPBasicAuth('hlDpZAPF05mqjAmg7cqtIKLOhUryB8p1', 'uBjzakmxGx6WtqAr')
s.headers['X-CHEGG-DEVICEID'] = secrets.token_hex(8)
s.headers['X-CHEGG-SESSIONID'] = str(uuid.uuid4())
s.headers['X-CHEGG-XYZPASS'] = '1'
s.headers['User-Agent'] = 'Dalvik/2.1.0 (Linux; U; Android 9; Pixel XL Build/PPR1.180610.009)'
def get_tbs_book(book_id):
    """Fetch the metadata of a single book.

    Returns a dict with the keys ``id``, ``name``, ``full_name``,
    ``edition``, ``image`` and ``has_solutions``, or ``None`` when the
    server answers 404. Any other HTTP error status is raised by
    ``raise_for_status()``.
    """
    r = s.get(f'https://hub.chegg.com/v1/book/{book_id}')
    if r.status_code == 404:
        return None
    r.raise_for_status()

    result = r.json()['result']
    return {
        'id': book_id,
        'name': result['title'],
        'full_name': result['fullTitle'],
        'edition': result['edition'],
        # Prefer the large cover image; fall back to the thumbnail if absent.
        'image': result['imgLarge'] if 'imgLarge' in result else result['imgThumb'],
        'has_solutions': result['hasSolutions'],
    }
def get_tbs_chapters(book_id, offset=0, all=True):
    """Return the chapter entries of a book as a list.

    The first request is made with the given ``offset``. While ``all`` is
    truthy and the response contains a ``nextPage`` URL, that URL is
    fetched and its ``result`` entries are appended. HTTP error statuses
    are raised by ``raise_for_status()``.
    """
    url = f'https://hub.chegg.com/v1/book/{book_id}/chapters'
    params = {'offset': offset}
    chapters = []
    while True:
        response = s.get(url, params=params)
        response.raise_for_status()
        page = response.json()
        chapters += page['result']
        if not (all and 'nextPage' in page):
            return chapters
        # Follow-up pages are addressed by the URL the server handed back,
        # so no query parameters are added.
        url, params = page['nextPage'], None
def get_tbs_problems(chapter_id, offset=0, all_problems=True):
    """Return the problem entries of a chapter as a list.

    The first request is made with the given ``offset``. While
    ``all_problems`` is truthy and the response contains a ``nextPage``
    URL, that URL is fetched and its ``result`` entries are appended.
    HTTP error statuses are raised by ``raise_for_status()``.
    """
    next_url = f'https://hub.chegg.com/v1/chapter/{chapter_id}/problems'
    query = {'offset': offset}
    problems = []
    while True:
        resp = s.get(next_url, params=query)
        resp.raise_for_status()
        payload = resp.json()
        problems += payload['result']
        if not (all_problems and 'nextPage' in payload):
            return problems
        # Later pages use the server-provided URL as-is, without parameters.
        next_url, query = payload['nextPage'], None
def get_tbs_problem_text(problem_id):
    """Return the HTML body of a problem statement, or ``None`` on a 404.

    Any other HTTP error status is raised by ``raise_for_status()``.
    """
    response = s.get(f'https://hub.chegg.com/content/tbs-problem/{problem_id}.html')
    if response.status_code != 404:
        response.raise_for_status()
        return response.text
    return None
def get_tbs_solutions(problem_id):
    """Return the ``result`` field of the solutions response for a problem.

    HTTP error statuses (including 404) are raised by ``raise_for_status()``.
    """
    response = s.get(f'https://hub.chegg.com/v1/problem/{problem_id}/solutions')
    response.raise_for_status()
    return response.json()['result']
def load_solutions(problem_id):
    """Download every solution of a problem, including the body of each step.

    Returns a list with one dict per solution:

    * ``num_steps`` -- number of steps in that solution
    * ``steps`` -- list of ``{'i': <1-based step number>, 'text': <body>}``

    Errors from ``get_tbs_solutions`` propagate to the caller.
    """
    output = []
    for solution in get_tbs_solutions(problem_id):
        # Each solution carries its own step list; indexing solutions[0] here
        # would make every entry a copy of the first solution.
        steps = solution['steps']
        # NOTE: the status of the step response is not checked, so the body
        # of an error response would be stored as the step text.
        fetched_steps = [
            {'i': number, 'text': s.get(step['link']).text}
            for number, step in enumerate(steps, start=1)
        ]
        output.append({
            'num_steps': len(steps),
            'steps': fetched_steps,
        })
    return output
def load_problems(chapter_id):
    """Return ``{'name', 'id'}`` dicts for every problem of a chapter."""
    return [
        {'name': entry['name'], 'id': entry['id']}
        for entry in get_tbs_problems(chapter_id)
    ]
def load_chapters(book_id):
    """Return ``{'name', 'id'}`` dicts for every chapter of a book."""
    return [
        {'name': entry['name'], 'id': entry['id']}
        for entry in get_tbs_chapters(book_id)
    ]
if __name__ == '__main__':
    # Placeholder from the published script: replace with a real book id.
    book_id = REDACTED

    all_solutions = {}

    book = get_tbs_book(book_id)
    if book is None:
        # get_tbs_book returns None on a 404; stop before indexing into it.
        sys.exit(f'Book {book_id} not found')

    chapters = load_chapters(book_id)

    # Collect HTML fragments in a list and join once at the end instead of
    # repeatedly concatenating onto an ever-growing string.
    parts = [
        f"<section><img src='{book['image']}'><h1>{book['full_name']}</h1><p>"
        + ' '.join(f"<a href='#{chapter['id']}'>{chapter['name']}</a>" for chapter in chapters)
        + '</p></section>'
    ]

    for i, chapter in enumerate(chapters):
        print(i, 'Chapter', chapter['name'])

        problems = load_problems(chapter['id'])

        # Chapter section: heading plus a link to each problem in it.
        parts.append(
            f"<section id='{chapter['id']}'><h2>Chapter {chapter['name']}</h2><p>"
            + ' '.join(f"<a href='#{chapter['id']}-{problem['id']}'>{problem['name']}</a>" for problem in problems)
            + "</p></section>"
        )

        for j, problem in enumerate(problems):
            print(i, j, 'Problem', problem['name'])

            solutions = load_solutions(problem['id'])
            if not solutions:
                continue

            # Only the first solution of each problem is rendered and stored.
            solution = solutions[0]
            all_solutions[problem['id']] = solution

            parts.append(f"<section id='{chapter['id']}-{problem['id']}'><a href='#{chapter['id']}'>Go to Chapter {chapter['name']}</a><h3>Problem {problem['name']}</h3>")
            parts.append(
                "<ul class='list-group mb-3'>"
                + ''.join(f"<li class='list-group-item justify-content-between'><h4>Step {step['i']} <span class='text-muted'>of {solution['num_steps']}</span></h4>{step['text']}</li>" for step in solution['steps'])
                + "</ul>"
            )
            parts.append("</section>")

    output = ''.join(parts)

    html_output = """<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>Chegg</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script>
<style>
:root { font-size: 18px; }
section { page-break-after: always; }
</style>
</head>
<body class="bg-light">
""" + output + """
</body>
</html>
"""

    title = book['name'].lower().replace(' ', '-')

    # The file is opened in text mode, so write str (not bytes). UTF-8 matches
    # the <meta charset> declared in the document itself.
    with open(f'book-{book_id}-{title}.html', 'w', encoding='utf-8') as f:
        f.write(html_output)

    # Close the pickle file deterministically instead of leaking the handle.
    with open('book.pickle', 'wb') as f:
        pickle.dump(all_solutions, f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment