Skip to content

Instantly share code, notes, and snippets.

@yfe404
Created May 25, 2018 16:57
Show Gist options
  • Save yfe404/6837e7ca67d0c29e02f5e03f3b18bb7a to your computer and use it in GitHub Desktop.
Save yfe404/6837e7ca67d0c29e02f5e03f3b18bb7a to your computer and use it in GitHub Desktop.
scrap-instagram
"""
Scrape captions and images from Instagram using https://deskgram.org
"""
import requests
from bs4 import BeautifulSoup
BASE_URL = 'https://deskgram.org'
USER = 'healthymealsberlin'
# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error
    page is never silently parsed as if it were a page of posts.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _report_posts(page):
    """Print how many captions and images *page* holds; return both lists."""
    found_captions = page.find_all("div", {"class": "post-caption"})
    found_images = page.find_all("div", {"class": "post-img"})
    print('Found {0} captions.'.format(len(found_captions)))
    print('Found {0} images.'.format(len(found_images)))
    return found_captions, found_images


def _next_page_href(page):
    """Return the href of the first link containing 'next_id', or None."""
    for link in page.find_all('a'):
        # Anchors without an href attribute would raise KeyError when
        # indexed with link['href'], so read the attribute with a default.
        href = link.get('href', '')
        if 'next_id' in href:
            return href
    return None


start_url = BASE_URL + '/' + USER
soup = _fetch_soup(start_url)
captions, images = _report_posts(soup)

# URLs already fetched; stops the loop if pagination ever links back to a
# page that was already visited.
visited = {start_url}
while True:
    dest = _next_page_href(soup)
    if dest is None:
        break
    next_url = BASE_URL + dest
    if next_url in visited:
        break
    visited.add(next_url)
    print('fetching {0}'.format(next_url))
    soup = _fetch_soup(next_url)
    captions, images = _report_posts(soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment