
@Akasurde
Created October 27, 2020 08:24
Scribd document download: the script below fetches a Scribd document page, extracts the per-page '.jsonp' URLs from the embedded scripts, rewrites them into image URLs, and saves each page as a JPEG in the current directory.
import requests
from bs4 import BeautifulSoup
import os
import shutil

# URL of the Scribd document page to download (left blank in the original gist).
url = ""
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
def download_stream(url, filepath):
    """
    Stream stuff from the Internet to a local file.
    """
    response = requests.get(url, stream=True)
    with open(filepath, "wb") as out_file:
        shutil.copyfileobj(response.raw, out_file)
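# Note: with stream=True, requests defers downloading the body, and
# response.raw exposes the underlying urllib3 stream, so copyfileobj()
# writes the image to disk without holding it all in memory. If a server
# returns gzip-encoded content, setting response.raw.decode_content = True
# before copying is a commonly used workaround; it is not part of the
# original gist and plain JPEG responses do not need it.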
def _save_image(url, imagename):
    """
    Skips downloading if the image is already downloaded,
    otherwise downloads it locally.
    """
    print("Downloading", imagename)
    already_present = os.listdir(".")
    if imagename in already_present:
        return
    download_stream(url, imagename)
def _convert_jsonp_url_to_image_url(jsonp_url, found):
    """
    Gets the image URL corresponding to the '.jsonp' URL.
    """
    if jsonp_url.endswith(".jsonp"):
        replacement = jsonp_url.replace("/pages/", "/images/")
        if found:
            replacement = replacement.replace(".jsonp", "/000.jpg")
        else:
            replacement = replacement.replace(".jsonp", ".jpg")
    else:
        replacement = jsonp_url
    return replacement
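# Illustration (hypothetical URL, not taken from the gist): a page URL such as
#   https://html.scribdassets.com/abc123/pages/1-xyz.jsonp
# would be rewritten to
#   https://html.scribdassets.com/abc123/images/1-xyz.jpg
# or, when found is True, to
#   https://html.scribdassets.com/abc123/images/1-xyz/000.jpg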
def _extract_jsonp_url(inner_opening):
    """
    Extracts URLs ending with '.jsonp'. These URLs contain the
    raw document text.
    """
    portion1 = inner_opening.find("https://")
    if portion1 == -1:
        jsonp = None
    else:
        portion2 = inner_opening.find(".jsonp")
        jsonp = inner_opening[portion1 : portion2 + 6]  # 6 == len(".jsonp")
    return jsonp
# Each <script> tag's children are strings of JavaScript; scan them for
# the '.jsonp' page URLs embedded in the page.
js_text = soup.find_all("script", type="text/javascript")
jsonp_urls = []
for opening in js_text:
    for inner_opening in opening:
        jsonp = _extract_jsonp_url(inner_opening)
        if jsonp:
            jsonp_urls.append(jsonp)
page_counter = 1
initial_filename = "hacking_"  # filename prefix for the saved pages
found = False  # keep the plain '.jpg' form of the image URLs
downloaded_images = []
for jsonp_url in jsonp_urls:
    filename = "{}_{}.jpg".format(initial_filename, page_counter)
    img_url = _convert_jsonp_url_to_image_url(jsonp_url, found=found)
    # _save_image() returns None, so call it without reassigning img_url.
    _save_image(img_url, filename)
    downloaded_images.append(filename)
    page_counter += 1
print(jsonp_urls)
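# Usage note: fill in `url` at the top with a Scribd document page before
# running. Pages are saved in the working directory as '<prefix>_<n>.jpg'
# (e.g. 'hacking__1.jpg' with the prefix above), and downloaded_images
# collects the filenames in page order.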