Download your entire Fotolog to disk, comments included [DEPRECATED]
#!/usr/bin/env python3
# encoding: utf-8
"""
Download your entire Fotolog to disk, comments included

@jaimergp, 2016

Dependencies: requests, beautifulsoup4
"""
# Standard library
import os
import json
import sys
import time
# Dependencies
import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.mount("http://", requests.adapters.HTTPAdapter(max_retries=5))
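# Note: max_retries on the HTTPAdapter only retries failed *connections*;
# HTTP error responses (4xx/5xx) are not retried here -- those surface via
# raise_for_status() and the retry loops in the methods below.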
class FotologClient:
    """
    Create an API client for the given username.
    """

    def __init__(self, username):
        self.username = username
        self.profile_url = 'http://www.fotolog.com/{}'.format(username)
        self.mosaic_url = '{}/mosaic'.format(self.profile_url)
        self.profile_bio = self.bio()

    def bio(self):
        """
        Parses the `/mosaic` page to retrieve the profile stats and bio.

        Returns
        -------
        data : dict
            Metadata of the user, including stats (number of posts, views,
            number of friends, groups, flashs), avatar, gender, marital
            status, birthday, register date, user location, and profile
            description.
        """
        try:
            r = session.get(self.mosaic_url)
            r.raise_for_status()
        except requests.exceptions.HTTPError as e:
            sys.exit("HTTP Error. Please try again!\n{}".format(e))
        soup = BeautifulSoup(r.content, 'html.parser')
        data = {}
        # Stats bar: one <li> per counter, in a fixed order
        stats_bar = soup.find('ul', attrs={'id': 'profile_bar'})
        categories = ['posts_number', 'views', 'friends_number', 'groups', 'flashs']
        for li, category in zip(stats_bar.find_all('li'), categories):
            num, _label = li.text.split('\n')
            data[category] = num
        # Bio
        bio_col = soup.find('div', attrs={'id': 'wall_infos_profile'})
        bio_p = bio_col.find_all('p')
        avatar_url = bio_col.find('img', attrs={'alt': self.username}).get('src')
        personal, member_since = bio_p[0].text.split('\n')
        member_since = member_since.split()[-1]
        gender, marital, birthday = personal.split(' - ')
        location = bio_p[1].text.strip()
        description = '\n'.join([p.text for p in bio_p[2:]])
        data.update({'avatar_url': avatar_url,
                     'gender': gender,
                     'marital': marital,
                     'birthday': birthday,
                     'member_since': member_since,
                     'location': location,
                     'description': description})
        return data
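    # Illustrative shape of the dict returned by bio() -- the values here are
    # hypothetical, only the keys are set by the parser above:
    #
    #     {'posts_number': '120', 'views': '4302', 'friends_number': '35',
    #      'groups': '2', 'flashs': '0', 'avatar_url': 'http://...',
    #      'gender': 'Male', 'marital': 'Single', 'birthday': '01/01/1990',
    #      'member_since': '2005', 'location': 'Madrid', 'description': '...'}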
    def all_links(self, url=None):
        """
        Scrapes /mosaic to retrieve all links to published photos.

        Yields
        ------
        href : str
            The link to each post
        """
        if url is None:
            url = self.mosaic_url
        r = session.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        links = soup.find_all('a', {'class': 'wall_img_container'})
        for a in links:
            yield a.get('href')
        # Follow the "next page" (>) arrow of the paginator, if present
        navigation = soup.find('div', {'id': 'pagination'}).find_all('a')
        for page in navigation:
            if page.text == '>':
                yield from self.all_links(page.get('href'))
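    # Usage sketch (hypothetical username): collect every post URL at once.
    #
    #     client = FotologClient('someuser')
    #     urls = list(client.all_links())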
    def all_posts(self, resume_url=''):
        """
        Iterates over the Fotolog profile, from newer to older, building a
        dict for each post: image url, comments, date, views.

        Parameters
        ----------
        resume_url : str
            Starting URL. User frontpage by default.

        Yields
        ------
        post : dict
            Metadata dict of each post
        """
        total = self.profile_bio['posts_number'] if not resume_url else '???'
        print('Scraping', total, 'photos for user', self.username)
        url = "{}/{}".format(self.profile_url, resume_url)
        i = 1
        while url:
            try:
                print('Getting {}/{}... [{}] '.format(i, total, url), end='\r')
                post = self.post(url)
            except (requests.exceptions.HTTPError, AttributeError):
                # Transient errors and half-rendered pages: wait and retry
                print('Getting {}/{}... [{}] Retrying...'.format(i, total, url), end='\r')
                time.sleep(1.0)
                continue
            except Exception:
                # Anything else ends the iteration (raising StopIteration
                # inside a generator is an error since PEP 479)
                return
            else:
                url = post['next']
                i += 1
                yield post
        print('\nDone!')
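    # all_posts() is a generator, so posts can be processed as they arrive,
    # e.g. (illustrative):
    #
    #     for post in client.all_posts():
    #         print(post['date'], post['image'])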
    def download(self, path=None, resume_url=''):
        """
        Download everything (images and metadata) to disk.

        Parameters
        ----------
        path : str
            Base location of files. <user>/ by default.
        resume_url : str
            Starting url to scrape. User frontpage by default. It
            will iterate from newer to older.
        """
        if path is None:
            path = self.username
        self.mkdir(path)
        posts = []
        for post in self.all_posts(resume_url=resume_url):
            self.download_image(post['image'], basedir=path)
            posts.append(post)
        self.download_image(self.profile_bio['avatar_url'], basedir=path)
        data = {'bio': self.profile_bio, 'posts': posts}
        json_path = os.path.join(path, '{}.json'.format(self.username))
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
    def download_images(self, path=None):
        """
        Download all images of the profile, with no metadata.

        Parameters
        ----------
        path : str
            Base location of files. <user>/img by default.
        """
        if path is None:
            path = self.username
        img_path = os.path.join(path, 'img')
        self.mkdir(img_path)
        for post in self.all_posts():
            self.download_image(post['image'], basedir=img_path)
    def download_metadata(self, path=None):
        """
        Dump each post's metadata to JSON.

        Parameters
        ----------
        path : str
            Name of the dumped JSON file. <user>/<user>.json by default.
        """
        if path is None:
            self.mkdir(self.username)
            path = '{0}/{0}.json'.format(self.username)
        bio = self.bio()
        d = {'bio': bio, 'posts': list(self.all_posts())}
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(d, f, ensure_ascii=False)
    @staticmethod
    def post(url):
        """
        Scrapes a post url to obtain image url, description, comments, date, views.

        Parameters
        ----------
        url : str
            URL of desired post

        Returns
        -------
        data : dict
            Dict with all metadata
        """
        r = session.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        image = soup.find('div', {'id': 'flog_img_holder'}).find('img').get('src')
        description_photo = soup.find('div', {'id': 'description_photo'})
        title = getattr(description_photo.find('h1'), 'text', '')
        description_lines = getattr(description_photo.find('p'), 'text', '').split('\n')
        description = '\n'.join(description_lines[:-1])
        # The last line of the description block holds the post date and the
        # view counter; split it into words and pick the relevant pieces
        date_and_views = description_lines[-1].split()
        date = ' '.join(date_and_views[1:-2])
        views = date_and_views[-2]
        comments = list(FotologClient.parse_comments(soup))
        # The right arrow links to the next (older) post, if any
        next_post = soup.find('a', {'class': 'arrow_change_photo_right'})
        next_url = next_post.get('href') if next_post else None
        return {'image': image,
                'title': title,
                'description': description,
                'date': date,
                'views': views,
                'comments': comments,
                'url': url,
                'next': next_url}
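    # Illustrative shape of the dict returned by post() (values are
    # hypothetical; 'next' is None on the oldest post):
    #
    #     {'image': 'http://...jpg', 'title': '...', 'description': '...',
    #      'date': '20.08.2010', 'views': '57', 'comments': [...],
    #      'url': 'http://www.fotolog.com/someuser/...', 'next': 'http://...'}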
    @staticmethod
    def parse_comments(soup):
        """
        Parse the comment section to obtain text, user, and date.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML of post page

        Yields
        ------
        data : dict
            User, date and text of each comment
        """
        wrapper = soup.find('div', {'id': 'list_all_comments'})
        divs = wrapper.find_all('div', {'class': 'flog_img_comments'})[1:]
        for div in divs:
            lines = [l.strip() for l in div.get_text('\n').split('\n')
                     if l and '<![CDATA[' not in l]
            user = lines.pop(0) if lines else ''
            date = lines.pop(0) if lines else ''
            text = '\n'.join(lines) if lines else ''
            yield {'user': user, 'date': date, 'text': text}
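    # Each yielded comment looks like this (hypothetical values):
    #
    #     {'user': 'somefriend', 'date': '21.08.2010', 'text': 'Nice shot!'}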
    @staticmethod
    def download_image(url, basedir=''):
        """
        Download an image url to disk.

        Parameters
        ----------
        url : str
            URL of desired image
        basedir : str
            Base location of the downloaded image. Working directory by default.
        """
        while url:
            try:
                r = session.get(url, stream=True)
                r.raise_for_status()
            except requests.exceptions.HTTPError:
                time.sleep(1.0)
                continue
            else:
                # Name the file after the last component of the URL path
                path = os.path.join(basedir, url.split('/')[-1])
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                url = None
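    # Example (hypothetical URL):
    #
    #     FotologClient.download_image('http://img.example/t123/photo.jpg',
    #                                  basedir='someuser')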
    @staticmethod
    def mkdir(path):
        """
        Create directory if it doesn't exist.
        """
        try:
            os.makedirs(path)
        except (OSError, IOError):
            if os.path.isfile(path):
                raise IOError('[!] Path {} is a file'.format(path))
if __name__ == '__main__':
    try:
        client = FotologClient(sys.argv[1])
    except IndexError:
        sys.exit('Usage: python fotologbackup.py <username>')
    else:
        resume = sys.argv[2] if sys.argv[2:3] else ''
        client.download(resume_url=resume)
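A minimal usage sketch, assuming the script is saved as fotologbackup.py and that someuser is a placeholder Fotolog username:

    # From the shell:
    #     python3 fotologbackup.py someuser
    #
    # Or programmatically:
    from fotologbackup import FotologClient

    client = FotologClient('someuser')   # scrapes the bio on construction
    client.download()                    # images + someuser/someuser.json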
Hi Jaime. Do you know how to recover my Fotolog? Even if it's just the text?
Thanks