@SansPapyrus683
Last active January 13, 2026 04:32
download all your twitter anime girls!
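Three scripts follow. The first prunes near-duplicate reposts from a folder of saved images by comparing perceptual hashes; the second pulls retweeted media out of a Twitter archive export and re-downloads the originals where it can; the third rewrites file modification times so the images sort in tweet order in a file browser.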
# script 1: delete near-duplicate reposts by comparing perceptual hashes
import os
import re
from collections import defaultdict

from PIL import Image
import imagehash

# saved images are named handle_tweetid_pos.ext
fmt = re.compile(r"(.*)_(\d+)_(\d+)")
path = os.path.expanduser("~/OneDrive/Pictures/twitter")
os.chdir(path)

# author -> tweet id -> [(position in tweet, filename, perceptual hash)]
tweets = defaultdict(lambda: defaultdict(list))
for i in os.listdir():
    if i.endswith(".mp4"):
        continue
    match = fmt.match(i)
    author = match.group(1)
    id_ = int(match.group(2))
    pos = int(match.group(3))
    tweets[author][id_].append((pos, i, imagehash.average_hash(Image.open(i))))

to_del = []
for author, g in tweets.items():
    ids = sorted(g.keys())
    for i in ids:
        g[i].sort()  # order each tweet's images by position
    for i in range(len(ids)):
        g1 = g[ids[i]]
        for j in range(i + 1, len(ids)):
            g2 = g[ids[j]]
            if len(g1) != len(g2):
                continue
            # two tweets are duplicates if every image pair is within
            # 3 bits of Hamming distance between their average hashes
            for a, b in zip(g1, g2):
                if a[2] - b[2] >= 3:
                    break
            else:
                to_del.extend(g1)  # keep the newer tweet, drop the older
                break

for d in to_del:
    os.remove(d[1])
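For reference, a minimal sketch of the duplicate test above, with made-up filenames: imagehash's hash objects subtract to a Hamming distance (the number of differing bits between the two 64-bit average hashes), and a distance under 3 on every image pair is what the script treats as "same picture, reposted".

from PIL import Image
import imagehash

# any two images on disk will do; these names are hypothetical
h1 = imagehash.average_hash(Image.open("a.jpg"))
h2 = imagehash.average_hash(Image.open("b.jpg"))

dist = h1 - h2  # Hamming distance between the 64-bit hashes
print(f"{dist} bits differ:", "near-duplicate" if dist < 3 else "distinct")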
# script 2: pull retweeted media out of a Twitter archive export and
# re-download the originals where possible
import json
import os
import re
import shutil
import sys
from datetime import datetime, timedelta
from email import utils

import requests


def extract_id(tweet: str) -> int:
    tweet = os.path.splitext(tweet)[0]
    first = tweet.rfind("_")
    second = tweet[:first].rfind("_")
    return int(tweet[second + 1 : first])
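# e.g. extract_id("some_handle_1234567890_1.jpg") -> 1234567890
# (that filename is made up; the pattern is the handle_tweetid_pos.ext
#  scheme this script writes out below)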

def load_twt_obj(file: str) -> list:
    raw = open(file, encoding="utf8").read()
    return json.loads(raw[raw.find("=") + 1 :])
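# (context: data/tweets.js in a Twitter/X archive is not bare JSON -- it
# opens with a JS assignment along the lines of `window.YTD.tweets.part0 =
# [...]`, which is why load_twt_obj slices off everything up to the first "=")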

# also add deleted-tweets.js if you're like that
tweets = load_twt_obj("data/tweets.js")

# fold deleted-tweet media into the main media folder
del_dir = "data/deleted_tweets_media"
gen_dir = "data/tweets_media"
for fn in os.listdir(del_dir):
    shutil.copy(os.path.join(del_dir, fn), gen_dir)

# directories passed on the command line hold already-downloaded media;
# collect their tweet IDs so those tweets can be skipped
have_alr = set()
for store in sys.argv[1:]:
    for name in os.listdir(store):
        have_alr.add(extract_id(name))

# after getting the actual images this isn't needed, but just in case:
# index the archive media as tweet id -> image id -> extension
all_raw_media = os.listdir(gen_dir)
all_media = {}
for i in all_raw_media:
    post_id = i[: i.find("-")]
    img_id = i[i.find("-") + 1 : i.rfind(".")]
    _, ext = os.path.splitext(i)
    if post_id not in all_media:
        all_media[post_id] = {}
    all_media[post_id][img_id] = ext

# sort them from oldest to newest
tweets.sort(key=lambda t: utils.parsedate_to_datetime(t["tweet"]["created_at"]))

handle_fmt = re.compile(r"RT @([^:]*):")  # retweets start with "RT @handle:"
img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")

os.makedirs("good_media", exist_ok=True)
all_paths = []
print(f"alright, a total of {len(tweets)} tweets to go through. let's go!")
for v, t in enumerate(tweets):
    if (v + 1) % 100 == 0:
        print(f"at tweet #{v + 1}")
    t = t["tweet"]
    match = handle_fmt.match(t["full_text"])
    if match is None:
        continue  # not a retweet
    handle = match.group(1)
    og_id = t["id"]
    if "media" not in t["entities"]:
        continue
    media = t["extended_entities"]["media"]
    src_id = [m["source_status_id"] for m in media]
    assert len(set(src_id)) == 1  # just a sanity check
    src_id = int(src_id[0])
    if src_id in have_alr:
        continue
    curr_paths = []
    # quick hack to get videos to download
    vid = all_media[og_id]
    # most videos are standalone. there's one (1) tweet so far that violates this
    if ".mp4" in vid.values() and len(vid) == 1:
        vid_id = list(vid.keys())[0]
        stupid_path = os.path.join(gen_dir, f"{og_id}-{vid_id}.mp4")
        sigma_path = f"good_media/{handle}_{src_id}_1.mp4"
        shutil.copy(stupid_path, sigma_path)
        curr_paths.append(sigma_path)
    for img_at, m in enumerate(media):
        img_id = img_id_fmt.match(m["media_url"])
        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
        if img_id is None:
            continue
        img_id = img_id.group(1)
        if img_id not in all_media.get(og_id, []):
            continue
        ext = all_media[og_id][img_id]
        sigma_path = f"good_media/{handle}_{src_id}_{img_at + 1}{ext}"
        stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
        # ":orig" asks pbs.twimg.com for the original-resolution file
        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
        img_data = requests.get(dl_url).content
        if not img_data:
            # download came back empty; fall back to the archive copy
            shutil.copy(stupid_path, sigma_path)
        else:
            with open(sigma_path, "wb") as written:
                written.write(img_data)
        curr_paths.append(sigma_path)
    all_paths.extend(reversed(curr_paths))

# space the modification times two seconds apart, newest tweet first
now = datetime.now()
epoch = datetime(1970, 1, 1)
for v, p in enumerate(reversed(all_paths)):
    delta = (now - timedelta(seconds=2 * v) - epoch).total_seconds()
    os.utime(p, times=(delta, delta))
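The closing loop above counts back from now in two-second steps, so a file browser sorted by "date modified" shows the media in tweet order. The last script applies the same trick to an already-populated folder: it re-derives tweet order from the IDs embedded in the filenames rather than from the archive, and sanity-checks that each tweet's images share one author and are numbered 1..n with no gaps.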
# script 3: redo the modification times on an existing folder, ordering
# by the tweet ID embedded in each filename
import os
from collections import defaultdict
from datetime import datetime, timedelta

path = os.path.expanduser("~/OneDrive/Pictures/twitter")
os.chdir(path)

# filenames look like handle_tweetid_pos.ext; group files by tweet ID
groups = defaultdict(list)
for i in os.listdir():
    start = i.rfind("_") + 1
    end = i.rfind(".")
    num = int(i[start:end])  # position within the tweet
    author_end = i.rfind("_", 0, start - 1)
    author = i[:author_end]
    id_ = int(i[author_end + 1 : start - 1])
    groups[id_].append((author, num, i))

now = datetime.now()
epoch = datetime(1970, 1, 1)
at = 0
for id_, tweets in sorted(groups.items(), reverse=True):  # newest tweet first
    tweets.sort(key=lambda t: t[1])
    all_authors = {t[0].lower() for t in tweets}
    all_nums = [t[1] for t in tweets]
    assert len(all_authors) == 1  # one author per tweet
    assert all_nums == list(range(1, len(all_nums) + 1))  # positions are 1..n
    for t in tweets:
        delta = (now - timedelta(seconds=2 * at) - epoch).total_seconds()
        os.utime(t[2], times=(delta, delta))
        at += 1