DeviantArt gallery folder page scraper (written by ChatGPT)
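The script takes a DeviantArt username as its first command-line argument. It fetches the user's gallery pages, extracts the list of gallery folders from the window.__INITIAL_STATE__ JSON blob that DeviantArt embeds in each page, discovers how many pages each folder has by scanning for ?page=N links, prints every page URL to stdout, and finally writes the full list to a timestamped text file. Requests use a random delay and rotating User-Agent strings to reduce the chance of being blocked.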
import requests
from bs4 import BeautifulSoup
import re
import sys
import time
import random
import datetime
import json

base_url = f"https://www.deviantart.com/{sys.argv[1]}/gallery"
scraps_url = f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps"
pattern = re.compile(f"https://www.deviantart.com/{sys.argv[1]}/gallery/\\d+/.*")  # Note: defined but never used
page_pattern = re.compile(r'\?page=(\d+)')

# Rotate between a few common User-Agent strings to look less like a bot
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
]

final_links = []


def print_to_stderr(*a):
    print(*a, file=sys.stderr)


def fetch_links(url):
    """Fetch a gallery page and return the 'galleryFolder' entities from the
    window.__INITIAL_STATE__ JSON blob embedded in the page's HTML."""
    headers = {"User-Agent": random.choice(user_agents)}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        scripts = soup.find_all('script')
        for script in scripts:
            if 'window.__INITIAL_STATE__' in script.text:
                json_text = script.text.split('window.__INITIAL_STATE__ = JSON.parse("')[1].split('");')[0]
                json_text = json_text.replace('\\"', '"')  # Unescape double quotes
                json_text = json_text.encode().decode('unicode_escape')  # Decode remaining escape sequences (may mangle non-ASCII text)
                json_data = json.loads(json_text)
                return json_data['@@entities']['galleryFolder']
    print_to_stderr(f"Failed to fetch links from {url} with status code {response.status_code}")
    return {}


def get_highest_page_number(link):
    """Scan a gallery page for '?page=N' links and return the largest N found (0 if none)."""
    headers = {"User-Agent": random.choice(user_agents)}
    max_page = 0
    response = requests.get(link, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            match = page_pattern.search(a['href'])
            if match:
                page_number = int(match.group(1))
                if page_number > max_page:
                    max_page = page_number
    return max_page


if __name__ == "__main__":
    processed_links = set()
    to_process_links = {base_url, scraps_url}
    while to_process_links:
        current_link = to_process_links.pop()
        if current_link in processed_links:
            continue
        try:
            gallery_folders = fetch_links(current_link)
            if not gallery_folders:
                print_to_stderr(f"No links found for {current_link}. Moving to next link.")
                processed_links.add(current_link)  # Mark as processed so it isn't refetched
                continue
            for folder_id, folder_info in gallery_folders.items():
                link = f"https://www.deviantart.com/{sys.argv[1]}/gallery/{folder_id}/{folder_info['name'].lower().replace(' ', '-')}"
                if link not in processed_links and int(folder_id) > 0:
                    highest_page_number = get_highest_page_number(link)
                    print(link)
                    final_links.append(link)
                    processed_links.add(link)
                    if highest_page_number >= 2:
                        for page in range(2, highest_page_number + 1):
                            print(f"{link}?page={page}")
                            final_links.append(f"{link}?page={page}")
                        # Re-queue the last page so any folders listed there are also discovered
                        to_process_links.add(f"{link}?page={highest_page_number}")
                    time.sleep(random.uniform(1, 3))  # Delay between requests to avoid being blocked
        except Exception as e:
            print_to_stderr(f"Error processing {current_link}: {e}")
            break  # Exit the loop on error
        processed_links.add(current_link)
        if not to_process_links:  # No more links to process
            break

    # The folder crawl above prints folder URLs only, so the 'all' and 'scraps'
    # views are added explicitly here, with their paginated variants.
    highest_page_number = get_highest_page_number(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all")
    print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all")
    final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all")
    if highest_page_number >= 2:
        for page in range(2, highest_page_number + 1):
            print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all?page={page}")
            final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/all?page={page}")
    highest_page_number = get_highest_page_number(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps")
    print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps")
    final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps")
    if highest_page_number >= 2:
        for page in range(2, highest_page_number + 1):
            print(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps?page={page}")
            final_links.append(f"https://www.deviantart.com/{sys.argv[1]}/gallery/scraps?page={page}")

    # Write all collected links to a timestamped text file
    with open(f"{sys.argv[1]} gallery on DeviantArt {str(datetime.datetime.now()).replace(':', '-')}.txt", "w") as file:
        for final_link in final_links:
            file.write(final_link + "\n")
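A hypothetical invocation, assuming the script is saved as da_gallery_scraper.py (the gist itself does not name the file) and that the requests and beautifulsoup4 packages are installed:

pip install requests beautifulsoup4
python da_gallery_scraper.py someusername

Output is one URL per line, for example (illustrative values):

https://www.deviantart.com/someusername/gallery/12345/example-folder
https://www.deviantart.com/someusername/gallery/12345/example-folder?page=2
https://www.deviantart.com/someusername/gallery/all
https://www.deviantart.com/someusername/gallery/scraps

The same list is also written to a file named like "someusername gallery on DeviantArt 2024-11-10 03-56-12.345678.txt" in the working directory.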