@smaybius
Last active November 10, 2024 03:56
DeviantArt gallery folder page scraper (written by ChatGPT)
import requests
from bs4 import BeautifulSoup
import re
import sys
import time
import random
import datetime
import json
base_url = f"https://www.deviantart.com/{sys.argv[1]}/gallery"  # sys.argv[1] is the DeviantArt username
scraps_url = f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps"
pattern = re.compile(f"https://www.deviantart.com/{sys.argv[1]}/gallery/\\d+/.*")  # matches folder URLs (not used below)
page_pattern = re.compile(r'\?page=(\d+)')
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
]
final_links = []
def print_to_stderr(*a):
    print(*a, file=sys.stderr)
def fetch_links(url):
    # Parse the gallery folder list out of the page's embedded window.__INITIAL_STATE__ JSON.
    headers = {"User-Agent": random.choice(user_agents)}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        scripts = soup.find_all('script')
        for script in scripts:
            if 'window.__INITIAL_STATE__' in script.text:
                json_text = script.text.split('window.__INITIAL_STATE__ = JSON.parse("')[1].split('");')[0]
                json_text = json_text.replace('\\"', '"')  # Unescape double quotes
                json_text = json_text.encode().decode('unicode_escape')  # Decode any remaining escape sequences
                json_data = json.loads(json_text)
                return json_data['@@entities']['galleryFolder']
    print_to_stderr(f"Failed to fetch links from {url} with status code {response.status_code}")
    return {}
def get_highest_page_number(link):
    # Find the largest ?page=N value among the pagination links on the given gallery page.
    headers = {"User-Agent": random.choice(user_agents)}
    max_page = 0
    response = requests.get(link, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            match = page_pattern.search(a['href'])
            if match:
                page_number = int(match.group(1))
                if page_number > max_page:
                    max_page = page_number
    return max_page
if __name__ == "__main__":
    processed_links = set()
    to_process_links = {base_url, scraps_url}
    # Crawl the main gallery and scraps pages for custom gallery folders.
    while to_process_links:
        current_link = to_process_links.pop()
        if current_link in processed_links:
            continue
        try:
            gallery_folders = fetch_links(current_link)
            if not gallery_folders:
                print_to_stderr(f"No links found for {current_link}. Moving to next link.")
                continue
            for folder_id, folder_info in gallery_folders.items():
                link = f"https://www.deviantart.com/{sys.argv[1]}/gallery/{folder_id}/{folder_info['name'].lower().replace(' ', '-')}"
                if link not in processed_links and int(folder_id) > 0:
                    highest_page_number = get_highest_page_number(link)
                    print(f"{link}")
                    final_links.append(f"{link}")
                    processed_links.add(link)
                    if highest_page_number >= 2:
                        for page in range(2, highest_page_number + 1):
                            print(f"{link}?page={page}")
                            final_links.append(f"{link}?page={page}")
                        to_process_links.add(f"{link}?page={highest_page_number}")
                    time.sleep(random.uniform(1, 3))  # Delay between requests to avoid being blocked
        except Exception as e:
            print_to_stderr(f"Error processing {current_link}: {e}")
            break  # Exit the loop on error
        processed_links.add(current_link)
        if not to_process_links:  # Check if there are no more links to process
            break
    # The "all" and "scraps" views are not folders, so handle their pagination separately.
    highest_page_number = get_highest_page_number(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all")
    print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all")
    final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all")
    if highest_page_number >= 2:
        for page in range(2, highest_page_number + 1):
            print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all?page={page}")
            final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all?page={page}")
    highest_page_number = get_highest_page_number(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps")
    print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps")
    final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps")
    if highest_page_number >= 2:
        for page in range(2, highest_page_number + 1):
            print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps?page={page}")
            final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps?page={page}")
    # Write every collected link to a timestamped text file in the current directory.
    with open(f"{sys.argv[1]} gallery on DeviantArt {str(datetime.datetime.now()).replace(':', '-')}.txt", "w") as file:
        for final_link in final_links:
            file.write(final_link + "\n")
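
Usage sketch (the gist itself does not name the script file, so the filename below is an assumption): save the code as, say, da_gallery_scraper.py and pass the DeviantArt username as the only argument:

    python da_gallery_scraper.py someartist

The script prints each gallery folder URL (plus ?page=N URLs for multi-page folders, and the "all" and "scraps" views) to stdout, and also writes the full list to a timestamped text file in the current directory.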