Created
October 27, 2020 08:24
-
-
Save Akasurde/00fcc74f7366a6e0117f6ee36a4b9b7c to your computer and use it in GitHub Desktop.
Scribd document download
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import os | |
import shutil | |
# URL of the Scribd document page to scrape; must be filled in before running
# (requests rejects an empty URL).
url = ""
response = requests.get(url)
# Stop on a 4xx/5xx answer instead of parsing an error page as the document.
response.raise_for_status()
# Parsed page; the inline <script> tags are searched for '.jsonp' URLs later.
soup = BeautifulSoup(response.text, "html.parser")
def download_stream(url, filepath):
    """
    Stream the resource at ``url`` from the Internet to the local file
    ``filepath``.

    The body is copied chunk by chunk, so it is never held in memory as a
    whole. ``filepath`` is opened in binary write mode and overwritten if it
    already exists.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            The check runs before the output file is opened, so no file is
            created for a failed request.
    """
    # Using the response as a context manager releases the connection even
    # when copying fails half-way.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # ``raw`` is the undecoded urllib3 stream; ask it to undo gzip/deflate
        # transfer encoding so the bytes written are the real payload.
        response.raw.decode_content = True
        with open(filepath, "wb") as out_file:
            shutil.copyfileobj(response.raw, out_file)
def _save_image(url, imagename): | |
""" | |
Skips downloading if the image is already downloaded, | |
otherwise downloads it locally. | |
""" | |
print("Downloading", imagename) | |
already_present = os.listdir(".") | |
if imagename in already_present: | |
return | |
download_stream(url, imagename) | |
def _convert_jsonp_url_to_image_url(jsonp_url, found): | |
""" | |
Gets the image URL corresponding to the '.jsonp' URL. | |
""" | |
if jsonp_url.endswith(".jsonp"): | |
replacement = jsonp_url.replace("/pages/", "/images/") | |
if found: | |
replacement = replacement.replace(".jsonp", "/000.jpg") | |
else: | |
replacement = replacement.replace(".jsonp", ".jpg") | |
else: | |
replacement = jsonp_url | |
return replacement | |
def _extract_jsonp_url(inner_opening): | |
""" | |
Extracts URLs ending with '.jsonp'. These URLs contain the | |
raw document text. | |
""" | |
portion1 = inner_opening.find("https://") | |
if portion1 == -1: | |
jsonp = None | |
else: | |
portion2 = inner_opening.find(".jsonp") | |
jsonp = inner_opening[portion1 : portion2 + 6] | |
return jsonp | |
# Collect one '.jsonp' URL (at most) from every child of each inline
# JavaScript <script> tag of the parsed page.
js_text = soup.find_all("script", type="text/javascript")
jsonp_urls = []
for opening in js_text:
    for inner_opening in opening:
        jsonp = _extract_jsonp_url(inner_opening)
        if jsonp:
            jsonp_urls.append(jsonp)

# Prefix of the local file names. Combined with the "_" in the format string
# below this yields names such as "hacking__1.jpg" (kept as-is so files from
# earlier runs are still recognised and skipped).
initial_filename = "hacking_"
# Passed through to the URL converter to choose between its two image
# suffixes; this script always uses the plain '.jpg' form.
found = False
downloaded_images = []
# Pages are numbered from 1 in the order their URLs were found.
for page_counter, jsonp_url in enumerate(jsonp_urls, start=1):
    filename = "{}_{}.jpg".format(initial_filename, page_counter)
    img_url = _convert_jsonp_url_to_image_url(jsonp_url, found=found)
    # _save_image returns nothing, so its result is not stored.
    _save_image(img_url, filename)
    downloaded_images.append(filename)
print(jsonp_urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment