branw · September 14, 2024 19:48
diff --git a/scrape_chegg.py b/scrape_chegg.py
 import requests
 from requests.auth import HTTPBasicAuth
 from pprint import pprint
 import secrets
 import uuid
 from urllib.parse import urlparse
 from collections import deque
 import pickle
 import sys

 # Shared HTTP session: every request helper in this file goes through ``s``,
 # so the credentials and headers configured here apply to all of them.
 # NOTE(review): the basic-auth pair below is hard-coded in source; consider
 # loading it from configuration instead of committing it.
 s = requests.Session()
 s.auth = HTTPBasicAuth('hlDpZAPF05mqjAmg7cqtIKLOhUryB8p1', 'uBjzakmxGx6WtqAr')
 # A fresh random device id (16 hex characters) and session UUID per run.
 s.headers['X-CHEGG-DEVICEID'] = secrets.token_hex(8)
 s.headers['X-CHEGG-SESSIONID'] = str(uuid.uuid4())
 s.headers['X-CHEGG-XYZPASS'] = '1'
 s.headers['User-Agent'] = 'Dalvik/2.1.0 (Linux; U; Android 9; Pixel XL Build/PPR1.180610.009)'

 def get_tbs_book(book_id):
    """Fetch the metadata of one book.

    Args:
        book_id: Identifier interpolated into the ``/v1/book/{book_id}`` URL.

    Returns:
        A dict with the keys ``id``, ``name``, ``full_name``, ``edition``,
        ``image`` and ``has_solutions``, or ``None`` when the server
        answers 404.

    Raises:
        requests.HTTPError: For any other error status.
        KeyError: If the response lacks one of the fields read below
            (``image`` needs ``imgLarge`` or, failing that, ``imgThumb``).
    """
    r = s.get(f'https://hub.chegg.com/v1/book/{book_id}')
    if r.status_code == 404:
        return None

    r.raise_for_status()

    result = r.json()['result']

    return {
        'id': book_id,
        'name': result['title'],
        'full_name': result['fullTitle'],
        'edition': result['edition'],
        # Prefer the large cover image; fall back to the thumbnail.
        'image': result['imgLarge'] if 'imgLarge' in result else result['imgThumb'],
        'has_solutions': result['hasSolutions']
    }

 def get_tbs_chapters(book_id, offset=0, all=True):
    """Return the raw chapter records of a book.

    Args:
        book_id: Identifier interpolated into the chapters endpoint URL.
        offset: Sent as the ``offset`` query parameter of the first request.
        all: When true, keep following the ``nextPage`` URL of each response
            until a response has none; when false, return the first page
            only.

    Returns:
        A list holding the ``result`` entries of every fetched page, in
        fetch order.

    Raises:
        requests.HTTPError: If any response carries an error status.
    """
    endpoint = f'https://hub.chegg.com/v1/book/{book_id}/chapters'
    response = s.get(endpoint, params={'offset': offset})
    response.raise_for_status()
    page = response.json()
    chapters = list(page['result'])

    # Each page names its successor under 'nextPage'; stop when it is absent.
    while all and 'nextPage' in page:
        response = s.get(page['nextPage'])
        response.raise_for_status()
        page = response.json()
        chapters += page['result']

    return chapters

 def get_tbs_problems(chapter_id, offset=0, all_problems=True):
    """Return the raw problem records of a chapter.

    Args:
        chapter_id: Identifier interpolated into the problems endpoint URL.
        offset: Sent as the ``offset`` query parameter of the first request.
        all_problems: When true, keep following the ``nextPage`` URL of each
            response until a response has none; when false, return the first
            page only.

    Returns:
        A list holding the ``result`` entries of every fetched page, in
        fetch order.

    Raises:
        requests.HTTPError: If any response carries an error status.
    """
    first_page_url = f'https://hub.chegg.com/v1/chapter/{chapter_id}/problems'
    reply = s.get(first_page_url, params={'offset': offset})
    reply.raise_for_status()
    payload = reply.json()
    problems = list(payload['result'])

    # Follow the pagination links until a page no longer advertises one.
    while all_problems and 'nextPage' in payload:
        reply = s.get(payload['nextPage'])
        reply.raise_for_status()
        payload = reply.json()
        problems += payload['result']

    return problems

 def get_tbs_problem_text(problem_id):
    """Fetch the ``.html`` document of a problem.

    Args:
        problem_id: Identifier interpolated into the content URL.

    Returns:
        The response body as text, or ``None`` when the server answers 404.

    Raises:
        requests.HTTPError: For any other error status.
    """
    response = s.get(f'https://hub.chegg.com/content/tbs-problem/{problem_id}.html')
    if response.status_code != 404:
        response.raise_for_status()
        return response.text
    return None

 def get_tbs_solutions(problem_id):
    """Return the ``result`` field of the solutions endpoint for a problem.

    Args:
        problem_id: Identifier interpolated into the solutions endpoint URL.

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    response = s.get(f'https://hub.chegg.com/v1/problem/{problem_id}/solutions')
    response.raise_for_status()
    return response.json()['result']

 def load_solutions(problem_id):
    """Download every step of every solution to a problem.

    Args:
        problem_id: Identifier passed to ``get_tbs_solutions``.

    Returns:
        A list with one dict per solution, each shaped as
        ``{'num_steps': <int>, 'steps': [{'i': <1-based index>,
        'text': <response body>}, ...]}``.

    Raises:
        requests.HTTPError: If the solutions request or any step request
            carries an error status.
    """
    output = []

    for solution in get_tbs_solutions(problem_id):
        # Read the steps of the solution being iterated. Indexing
        # solutions[0] here would repeat the first solution's steps for
        # every entry.
        steps = solution['steps']
        solution_output = []
        for i, step in enumerate(steps, start=1):
            r = s.get(step['link'])
            # Fail loudly like the other fetch helpers rather than storing
            # an error page as the step text.
            r.raise_for_status()
            solution_output.append({
                'i': i,
                'text': r.text
                })
        output.append({
            'num_steps': len(steps),
            'steps': solution_output
            })

    return output

 def load_problems(chapter_id):
    """Return ``{'name', 'id'}`` dicts for every problem of a chapter.

    Args:
        chapter_id: Identifier passed to ``get_tbs_problems``.

    Returns:
        A list of dicts, one per problem record, keeping only the ``name``
        and ``id`` fields, in the order the records were returned.
    """
    return [
        {'name': record['name'], 'id': record['id']}
        for record in get_tbs_problems(chapter_id)
    ]

 def load_chapters(book_id):
    """Return ``{'name', 'id'}`` dicts for every chapter of a book.

    Args:
        book_id: Identifier passed to ``get_tbs_chapters``.

    Returns:
        A list of dicts, one per chapter record, keeping only the ``name``
        and ``id`` fields, in the order the records were returned.
    """
    kept_fields = ('name', 'id')
    return [
        {field: record[field] for field in kept_fields}
        for record in get_tbs_chapters(book_id)
    ]

 if __name__ == '__main__':
    # Script entry point: fetch one book, render every problem's first
    # solution into a single HTML file, and pickle the collected solutions.
    # NOTE(review): REDACTED looks like a redaction placeholder rather than a
    # defined name; supply the real book id before running.
    book_id = REDACTED

    all_solutions = {}

    book = get_tbs_book(book_id)
    if book is None:
        # get_tbs_book returns None on a 404; there is nothing to render.
        sys.exit(f'Book {book_id} not found')

    chapters = load_chapters(book_id)

    # Collect the HTML fragments and join them once at the end instead of
    # growing one string with repeated +=.
    # NOTE(review): names and step text are interpolated without HTML
    # escaping.
    parts = [
        f"<section><img src='{book['image']}'><h1>{book['full_name']}</h1><p>"
        + ' '.join(f"<a href='#{chapter['id']}'>{chapter['name']}</a>" for chapter in chapters)
        + '</p></section>'
    ]

    for i, chapter in enumerate(chapters):
        print(i, 'Chapter', chapter['name'])

        problems = load_problems(chapter['id'])

        # Chapter section: heading plus links to each of its problems.
        parts.append(
            f"<section id='{chapter['id']}'><h2>Chapter {chapter['name']}</h2><p>"
            + ' '.join(f"<a href='#{chapter['id']}-{problem['id']}'>{problem['name']}</a>" for problem in problems)
            + "</p></section>"
        )

        for j, problem in enumerate(problems):
            print(i, j, 'Problem', problem['name'])

            solutions = load_solutions(problem['id'])
            if not solutions:
                continue
            # Only the first solution is rendered and stored.
            solution = solutions[0]

            all_solutions[problem['id']] = solution

            steps_html = ''.join(
                f"<li class='list-group-item justify-content-between'><h4>Step {step['i']} <span class='text-muted'>of {solution['num_steps']}</span></h4>{step['text']}</li>"
                for step in solution['steps']
            )
            parts.append(
                f"<section id='{chapter['id']}-{problem['id']}'><a href='#{chapter['id']}'>Go to Chapter {chapter['name']}</a><h3>Problem {problem['name']}</h3>"
                f"<ul class='list-group mb-3'>{steps_html}</ul>"
                "</section>"
            )

    output = ''.join(parts)

    html_output = """<!doctype html>
 <html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

    <title>Chegg</title>

    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script>

    <style>
      :root { font-size: 18px; }
      section { page-break-after: always; }
    </style>
  </head>

  <body class="bg-light">
  """ + output + """
  </body>
 </html>
 """

    title = book['name'].lower().replace(' ', '-')
    # Write text, not bytes: a file opened in 'w' mode rejects the bytes that
    # str.encode() returns. UTF-8 matches the <meta charset> declared above.
    with open(f'book-{book_id}-{title}.html', 'w', encoding='utf-8') as f:
        f.write(html_output)

    # Use a context manager so the pickle file is flushed and closed.
    with open('book.pickle', 'wb') as pickle_file:
        pickle.dump(all_solutions, pickle_file)
	import requests
	from requests.auth import HTTPBasicAuth
	from pprint import pprint
	import secrets
	import uuid
	from urllib.parse import urlparse
	from collections import deque
	import pickle
	import sys

	# Shared HTTP session: every request helper in this file goes through ``s``,
	# so the credentials and headers configured here apply to all of them.
	# NOTE(review): the basic-auth pair below is hard-coded in source; consider
	# loading it from configuration instead of committing it.
	s = requests.Session()
	s.auth = HTTPBasicAuth('hlDpZAPF05mqjAmg7cqtIKLOhUryB8p1', 'uBjzakmxGx6WtqAr')
	# A fresh random device id (16 hex characters) and session UUID per run.
	s.headers['X-CHEGG-DEVICEID'] = secrets.token_hex(8)
	s.headers['X-CHEGG-SESSIONID'] = str(uuid.uuid4())
	s.headers['X-CHEGG-XYZPASS'] = '1'
	s.headers['User-Agent'] = 'Dalvik/2.1.0 (Linux; U; Android 9; Pixel XL Build/PPR1.180610.009)'

	def get_tbs_book(book_id):
	    """Fetch the metadata of one book.

	    Args:
	        book_id: Identifier interpolated into the ``/v1/book/{book_id}`` URL.

	    Returns:
	        A dict with the keys ``id``, ``name``, ``full_name``, ``edition``,
	        ``image`` and ``has_solutions``, or ``None`` when the server
	        answers 404.

	    Raises:
	        requests.HTTPError: For any other error status.
	        KeyError: If the response lacks one of the fields read below
	            (``image`` needs ``imgLarge`` or, failing that, ``imgThumb``).
	    """
	    r = s.get(f'https://hub.chegg.com/v1/book/{book_id}')
	    if r.status_code == 404:
	        return None

	    r.raise_for_status()

	    result = r.json()['result']

	    return {
	        'id': book_id,
	        'name': result['title'],
	        'full_name': result['fullTitle'],
	        'edition': result['edition'],
	        # Prefer the large cover image; fall back to the thumbnail.
	        'image': result['imgLarge'] if 'imgLarge' in result else result['imgThumb'],
	        'has_solutions': result['hasSolutions']
	    }

	def get_tbs_chapters(book_id, offset=0, all=True):
	    """Return the raw chapter records of a book.

	    Args:
	        book_id: Identifier interpolated into the chapters endpoint URL.
	        offset: Sent as the ``offset`` query parameter of the first request.
	        all: When true, keep following the ``nextPage`` URL of each response
	            until a response has none; when false, return the first page
	            only.

	    Returns:
	        A list holding the ``result`` entries of every fetched page, in
	        fetch order.

	    Raises:
	        requests.HTTPError: If any response carries an error status.
	    """
	    endpoint = f'https://hub.chegg.com/v1/book/{book_id}/chapters'
	    response = s.get(endpoint, params={'offset': offset})
	    response.raise_for_status()
	    page = response.json()
	    chapters = list(page['result'])

	    # Each page names its successor under 'nextPage'; stop when it is absent.
	    while all and 'nextPage' in page:
	        response = s.get(page['nextPage'])
	        response.raise_for_status()
	        page = response.json()
	        chapters += page['result']

	    return chapters

	def get_tbs_problems(chapter_id, offset=0, all_problems=True):
	    """Return the raw problem records of a chapter.

	    Args:
	        chapter_id: Identifier interpolated into the problems endpoint URL.
	        offset: Sent as the ``offset`` query parameter of the first request.
	        all_problems: When true, keep following the ``nextPage`` URL of each
	            response until a response has none; when false, return the first
	            page only.

	    Returns:
	        A list holding the ``result`` entries of every fetched page, in
	        fetch order.

	    Raises:
	        requests.HTTPError: If any response carries an error status.
	    """
	    first_page_url = f'https://hub.chegg.com/v1/chapter/{chapter_id}/problems'
	    reply = s.get(first_page_url, params={'offset': offset})
	    reply.raise_for_status()
	    payload = reply.json()
	    problems = list(payload['result'])

	    # Follow the pagination links until a page no longer advertises one.
	    while all_problems and 'nextPage' in payload:
	        reply = s.get(payload['nextPage'])
	        reply.raise_for_status()
	        payload = reply.json()
	        problems += payload['result']

	    return problems

	def get_tbs_problem_text(problem_id):
	    """Fetch the ``.html`` document of a problem.

	    Args:
	        problem_id: Identifier interpolated into the content URL.

	    Returns:
	        The response body as text, or ``None`` when the server answers 404.

	    Raises:
	        requests.HTTPError: For any other error status.
	    """
	    response = s.get(f'https://hub.chegg.com/content/tbs-problem/{problem_id}.html')
	    if response.status_code != 404:
	        response.raise_for_status()
	        return response.text
	    return None

	def get_tbs_solutions(problem_id):
	    """Return the ``result`` field of the solutions endpoint for a problem.

	    Args:
	        problem_id: Identifier interpolated into the solutions endpoint URL.

	    Raises:
	        requests.HTTPError: If the response carries an error status.
	    """
	    response = s.get(f'https://hub.chegg.com/v1/problem/{problem_id}/solutions')
	    response.raise_for_status()
	    return response.json()['result']

	def load_solutions(problem_id):
	    """Download every step of every solution to a problem.

	    Args:
	        problem_id: Identifier passed to ``get_tbs_solutions``.

	    Returns:
	        A list with one dict per solution, each shaped as
	        ``{'num_steps': <int>, 'steps': [{'i': <1-based index>,
	        'text': <response body>}, ...]}``.

	    Raises:
	        requests.HTTPError: If the solutions request or any step request
	            carries an error status.
	    """
	    output = []

	    for solution in get_tbs_solutions(problem_id):
	        # Read the steps of the solution being iterated. Indexing
	        # solutions[0] here would repeat the first solution's steps for
	        # every entry.
	        steps = solution['steps']
	        solution_output = []
	        for i, step in enumerate(steps, start=1):
	            r = s.get(step['link'])
	            # Fail loudly like the other fetch helpers rather than storing
	            # an error page as the step text.
	            r.raise_for_status()
	            solution_output.append({
	                'i': i,
	                'text': r.text
	                })
	        output.append({
	            'num_steps': len(steps),
	            'steps': solution_output
	            })

	    return output

	def load_problems(chapter_id):
	    """Return ``{'name', 'id'}`` dicts for every problem of a chapter.

	    Args:
	        chapter_id: Identifier passed to ``get_tbs_problems``.

	    Returns:
	        A list of dicts, one per problem record, keeping only the ``name``
	        and ``id`` fields, in the order the records were returned.
	    """
	    return [
	        {'name': record['name'], 'id': record['id']}
	        for record in get_tbs_problems(chapter_id)
	    ]

	def load_chapters(book_id):
	    """Return ``{'name', 'id'}`` dicts for every chapter of a book.

	    Args:
	        book_id: Identifier passed to ``get_tbs_chapters``.

	    Returns:
	        A list of dicts, one per chapter record, keeping only the ``name``
	        and ``id`` fields, in the order the records were returned.
	    """
	    kept_fields = ('name', 'id')
	    return [
	        {field: record[field] for field in kept_fields}
	        for record in get_tbs_chapters(book_id)
	    ]

	if __name__ == '__main__':
	    # Script entry point: fetch one book, render every problem's first
	    # solution into a single HTML file, and pickle the collected solutions.
	    # NOTE(review): REDACTED looks like a redaction placeholder rather than a
	    # defined name; supply the real book id before running.
	    book_id = REDACTED

	    all_solutions = {}

	    book = get_tbs_book(book_id)
	    if book is None:
	        # get_tbs_book returns None on a 404; there is nothing to render.
	        sys.exit(f'Book {book_id} not found')

	    chapters = load_chapters(book_id)

	    # Collect the HTML fragments and join them once at the end instead of
	    # growing one string with repeated +=.
	    # NOTE(review): names and step text are interpolated without HTML
	    # escaping.
	    parts = [
	        f"<section><img src='{book['image']}'><h1>{book['full_name']}</h1><p>"
	        + ' '.join(f"<a href='#{chapter['id']}'>{chapter['name']}</a>" for chapter in chapters)
	        + '</p></section>'
	    ]

	    for i, chapter in enumerate(chapters):
	        print(i, 'Chapter', chapter['name'])

	        problems = load_problems(chapter['id'])

	        # Chapter section: heading plus links to each of its problems.
	        parts.append(
	            f"<section id='{chapter['id']}'><h2>Chapter {chapter['name']}</h2><p>"
	            + ' '.join(f"<a href='#{chapter['id']}-{problem['id']}'>{problem['name']}</a>" for problem in problems)
	            + "</p></section>"
	        )

	        for j, problem in enumerate(problems):
	            print(i, j, 'Problem', problem['name'])

	            solutions = load_solutions(problem['id'])
	            if not solutions:
	                continue
	            # Only the first solution is rendered and stored.
	            solution = solutions[0]

	            all_solutions[problem['id']] = solution

	            steps_html = ''.join(
	                f"<li class='list-group-item justify-content-between'><h4>Step {step['i']} <span class='text-muted'>of {solution['num_steps']}</span></h4>{step['text']}</li>"
	                for step in solution['steps']
	            )
	            parts.append(
	                f"<section id='{chapter['id']}-{problem['id']}'><a href='#{chapter['id']}'>Go to Chapter {chapter['name']}</a><h3>Problem {problem['name']}</h3>"
	                f"<ul class='list-group mb-3'>{steps_html}</ul>"
	                "</section>"
	            )

	    output = ''.join(parts)

	    html_output = """<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

	<title>Chegg</title>

	<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
	<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script>

	<style>
	:root { font-size: 18px; }
	section { page-break-after: always; }
	</style>
	</head>

	<body class="bg-light">
	""" + output + """
	</body>
	</html>
	"""

	    title = book['name'].lower().replace(' ', '-')
	    # Write text, not bytes: a file opened in 'w' mode rejects the bytes that
	    # str.encode() returns. UTF-8 matches the <meta charset> declared above.
	    with open(f'book-{book_id}-{title}.html', 'w', encoding='utf-8') as f:
	        f.write(html_output)

	    # Use a context manager so the pickle file is flushed and closed.
	    with open('book.pickle', 'wb') as pickle_file:
	        pickle.dump(all_solutions, pickle_file)