Created July 13, 2020 16:58
archive.py: soup.io scraper (almost the same as https://www.reddit.com/r/DataHoarder/comments/hpy65f/soupio_backup/fxv4ibi/)
#!/usr/bin/env python3
import re
import sys
import urllib.parse
from pathlib import PurePath, Path

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from requests.packages.urllib3.exceptions import InsecureRequestWarning


def einfo(msg):
    print("[INFO] >>> {}".format(msg))


def download(url, directory, session=None):
    """Download url into directory, skipping files that are already present."""
    if not session:
        session = requests.session()
    Path(directory).mkdir(exist_ok=True, parents=True)
    # Use the (decoded) last path component of the URL as the local filename.
    filename = PurePath(urllib.parse.unquote(urllib.parse.urlparse(url).path)).parts[-1]
    target = Path(PurePath().joinpath(directory, filename))
    target_partial = Path("{}.__partial__".format(target))
    if target.exists():
        einfo("Target {} already exists.".format(filename))
        return
    einfo("Downloading {} ...".format(url))
    with session.get(url, stream=True) as r:
        # dumb hack to ignore 404.
        if r.status_code == 404:
            return
        else:
            r.raise_for_status()
        # Stream into a .__partial__ file and rename only when complete, so an
        # interrupted download is not mistaken for a finished one on the next run.
        with target_partial.open(mode='wb') as f:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    target_partial.rename(target)


def scrape_post(url, soup_name, session=None):
    """Save all og:image assets referenced by a post plus the raw post HTML."""
    einfo("Scraping {} ...".format(url))
    url_parsed = urllib.parse.urlparse(url)
    if not session:
        session = requests.session()
    payload = session.get(url).content.decode('utf-8')
    bs = BeautifulSoup(payload, 'html.parser')
    images_download_directory = PurePath().joinpath('archives', soup_name, 'images')
    posts_download_directory = Path().joinpath('archives', soup_name, 'posts')
    Path(posts_download_directory).mkdir(exist_ok=True, parents=True)
    # Post URLs look like https://<soup domain>/post/<id>/<slug>; use the id as the filename.
    post_id = url_parsed.path.split('/')[2]
    for image in bs.find_all("meta", property="og:image"):
        image_url = image.attrs['content']
        if image_url:
            download(image_url, images_download_directory, session=session)
    post_payload_target = Path(PurePath().joinpath(posts_download_directory, post_id))
    if not post_payload_target.exists():
        post_payload_target.write_text(payload)


def get_posts(url, session=None):
    """Return the set of post URLs on an index page and the URL of the next page (or None)."""
    einfo("Getting posts from {} ...".format(url))
    url_parsed = urllib.parse.urlparse(url)
    if not session:
        session = requests.session()
    payload = session.get(url).content.decode('utf-8')
    bs = BeautifulSoup(payload, 'html.parser')
    post_urls = set()
    for post_url in bs.find_all('a', href=re.compile(r'{}/post/.*'.format(url_parsed.netloc), re.IGNORECASE)):
        post_urls.add(post_url.attrs['href'])
    # Pagination: follow the "more posts" link, which points at /since/<id>?mode=own.
    next_page = bs.find('a', href=re.compile(r'/since/[0-9]+\?mode=own$'), text='more posts')
    if next_page:
        next_page = urllib.parse.urljoin(url, next_page.attrs['href'])
    if not post_urls:
        # Dump the page for debugging if no post links were found.
        print(payload)
    return post_urls, next_page


def main():
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    session = requests.session()
    # custom domains will bitch about ssl cert.
    session.verify = False
    # Retry transient failures (rate limiting, gateway errors) with exponential backoff.
    # Note: urllib3 >= 1.26 renames method_whitelist to allowed_methods.
    retry_strategy = Retry(
        total=10,
        status_forcelist=[429, 500, 502, 503, 504],
        method_whitelist=["HEAD", "GET", "OPTIONS"],
        backoff_factor=1
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    # The soup domain to archive is the only command-line argument.
    domain = sys.argv[1]
    page = "https://{}".format(domain)
    while page:
        posts, page = get_posts(page, session=session)
        for post in posts:
            scrape_post(post, domain, session=session)


if __name__ == "__main__":
    main()
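For reference, a minimal run might look like the line below (the domain is a made-up example); the script takes the soup domain as its only argument and writes into archives/<domain>/images and archives/<domain>/posts:

python3 archive.py example.soup.io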
Got an encoding error. Adding encoding="utf-8" on line 77 seems to have fixed it.
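The only text write in the script is the post_payload_target.write_text(payload) call at the end of scrape_post(), so line 77 of the original gist presumably refers to that line; the fix would then look roughly like this (a sketch, not the committed version):

    # write the raw post HTML as UTF-8 instead of the platform's default encoding,
    # which avoids a UnicodeEncodeError on systems where that default is not UTF-8
    post_payload_target.write_text(payload, encoding="utf-8")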
original script: https://www.reddit.com/r/DataHoarder/comments/hpy65f/soupio_backup/fxv4ibi/