
@neingeist
Created July 13, 2020 16:58
#!/usr/bin/env python3
import re
from bs4 import BeautifulSoup
import requests
import urllib.parse
from pathlib import PurePath, Path
import sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from requests.packages.urllib3.exceptions import InsecureRequestWarning
def einfo(msg):
    print("[INFO] >>> {}".format(msg))

def download(url, directory, session=None):
    if not session:
        session = requests.session()
    Path(directory).mkdir(exist_ok=True, parents=True)
    # Derive the local filename from the last path component of the URL.
    filename = PurePath(urllib.parse.unquote(urllib.parse.urlparse(url).path)).parts[-1]
    target = Path(PurePath().joinpath(directory, filename))
    target_partial = Path("{}.__partial__".format(target))
    if target.exists():
        einfo("Target {} already exists.".format(filename))
        return
    einfo("Downloading {} ...".format(url))
    with session.get(url, stream=True) as r:
        # Dumb hack to ignore 404s: skip missing assets instead of aborting the run.
        if r.status_code == 404:
            return
        else:
            r.raise_for_status()
        # Stream into a partial file and rename only when complete, so an
        # interrupted download is redone on the next run.
        with target_partial.open(mode='wb') as f:
            for chunk in r.iter_content(chunk_size=64*1024):
                if chunk:
                    f.write(chunk)
    target_partial.rename(target)

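# Example call (hypothetical asset URL, matching the directory layout that
# scrape_post() below uses for images):
# download("https://asset.soup.io/asset/0001/2345_ab67.jpeg",
#          "archives/example.soup.io/images")
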
def scrape_post(url, soup_name, session=None):
    einfo("Scraping {} ...".format(url))
    url_parsed = urllib.parse.urlparse(url)
    if not session:
        session = requests.session()
    payload = session.get(url).content.decode('utf-8')
    bs = BeautifulSoup(payload, 'html.parser')
    images_download_directory = PurePath().joinpath('archives', soup_name, 'images')
    posts_download_directory = Path().joinpath('archives', soup_name, 'posts')
    Path(posts_download_directory).mkdir(exist_ok=True, parents=True)
    # Post paths look like /post/<id>/<slug>, so the id is the second component.
    post_id = url_parsed.path.split('/')[2]
    # Fetch every image the post references via Open Graph metadata.
    for image in bs.find_all("meta", property="og:image"):
        image_url = image.attrs['content']
        if image_url:
            download(image_url, images_download_directory, session=session)
    post_payload_target = Path(PurePath().joinpath(posts_download_directory, post_id))
    if not post_payload_target.exists():
        # Explicit encoding avoids a UnicodeEncodeError on platforms whose
        # default locale encoding can't represent the page (see comments below).
        post_payload_target.write_text(payload, encoding="utf-8")

def get_posts(url, session=None):
    einfo("Getting posts from {} ...".format(url))
    url_parsed = urllib.parse.urlparse(url)
    if not session:
        session = requests.session()
    payload = session.get(url).content.decode('utf-8')
    bs = BeautifulSoup(payload, 'html.parser')
    # Collect all links to posts on this soup.
    post_urls = set()
    for post_url in bs.find_all('a', href=re.compile(r'{}/post/.*'.format(url_parsed.netloc), re.IGNORECASE)):
        post_urls.add(post_url.attrs['href'])
    # The "more posts" link points to the next page of the endless scroll.
    next_page = bs.find('a', href=re.compile(r'/since/[0-9]+\?mode=own$'), text='more posts')
    if next_page:
        next_page = urllib.parse.urljoin(url, next_page.attrs['href'])
    if not post_urls:
        # Nothing found: dump the page so the problem can be inspected.
        print(payload)
    return post_urls, next_page

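# Hypothetical example of the return value: a set of post URLs plus the URL of
# the next page (next_page is None once the last page is reached):
# get_posts("https://example.soup.io")
#   -> ({"https://example.soup.io/post/123456/some-title", ...},
#       "https://example.soup.io/since/123456?mode=own")
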
def main():
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    session = requests.session()
    # Custom soup domains fail SSL certificate verification, so skip it.
    session.verify = False
    retry_strategy = Retry(
        total=10,
        status_forcelist=[429, 500, 502, 503, 504],
        # Note: renamed to allowed_methods in urllib3 >= 1.26.
        method_whitelist=["HEAD", "GET", "OPTIONS"],
        backoff_factor=1
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    domain = sys.argv[1]
    page = "https://{}".format(domain)
    # Follow the pagination until get_posts() finds no "more posts" link.
    while page:
        posts, page = get_posts(page, session=session)
        for post in posts:
            scrape_post(post, domain, session=session)


if __name__ == "__main__":
    main()
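
Usage sketch (the gist doesn't name the script file, so the filename and domain below are assumptions): save the script, install its dependencies (requests, beautifulsoup4), and pass the soup domain to archive as the only argument:

    python3 soup_backup.py example.soup.io

Posts are then written to archives/example.soup.io/posts/ and images to archives/example.soup.io/images/, relative to the working directory.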
@couchsofa
Got an encoding error. Adding encoding="utf-8" on line 77 seems to have fixed it.
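
For reference, that line is the write_text() call at the end of scrape_post(). Without an explicit encoding, pathlib uses the platform's preferred locale encoding (often cp1252 on Windows), which can't represent every character in a soup page; the fix pins the dump to UTF-8:

    post_payload_target.write_text(payload, encoding="utf-8")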
