Last active
January 23, 2025 23:58
-
-
Save Lucent/b999cf0c1c98e799d097d81e1b9ff8a5 to your computer and use it in GitHub Desktop.
Convert Bluesky archive to plain text file with threaded replies and quotes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires goat: https://github.com/bluesky-social/indigo/tree/main/cmd/goat
# Usage:
#   goat repo export username.bsky.social
#   goat repo unpack username.car
#   python thread_replies.py did:plc:string/ > threaded_replies.txt
import json | |
import os | |
import sys | |
def read_json(filename):
    """Parse the JSON document stored at ``filename`` and return it.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, so non-ASCII text loads the same way on
    every system.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
def transform_text_to_markdown(text, facets):
    """Rewrite the linked spans of ``text`` as Markdown ``[label](uri)`` links.

    Facet ranges (``index.byteStart`` / ``index.byteEnd``) are applied to the
    UTF-8 encoding of ``text`` rather than to the ``str`` itself, so spans stay
    aligned when the text contains non-ASCII characters.

    Args:
        text: The post text.
        facets: Iterable of facet dicts, each with an ``index`` range and a
            ``features`` list. Only the first feature of a facet is looked at.

    Returns:
        The text with every facet whose first feature carries a ``uri``
        replaced by a Markdown link. Mentions, and features that have no
        ``uri``, are left untouched.
    """
    # Collect the replaceable spans first so facets that are skipped never
    # need a valid index.
    spans = []
    for facet in facets:
        features = facet.get('features') or []
        if not features:
            continue
        feature = features[0]
        if feature.get('$type') == "app.bsky.richtext.facet#mention":
            continue
        link = feature.get('uri')
        if link is None:
            continue
        spans.append((facet['index']['byteStart'], facet['index']['byteEnd'], link))

    raw = text.encode("utf-8")
    # Replace from the end of the text backwards: earlier offsets stay valid
    # even though each replacement changes the length of the buffer.
    for start, end, link in sorted(spans, reverse=True):
        link_text = raw[start:end].decode("utf-8", errors="replace")
        markdown_link = f"[{link_text}]({link})"
        raw = raw[:start] + markdown_link.encode("utf-8") + raw[end:]
    return raw.decode("utf-8", errors="replace")
def read_posts_from_directory(directory):
    """Load every post record found under ``directory``.

    Walks the directory tree, parses each file with ``read_json`` and stores
    the file name without its extension on the record under ``'rkey'``.

    Args:
        directory: Root folder to walk.

    Returns:
        A list of the loaded records ordered by ascending ``'createdAt'``.
    """
    collected = []
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            record = read_json(os.path.join(folder, name))
            # The record key is the file name minus its extension.
            record['rkey'] = os.path.splitext(name)[0]
            collected.append(record)
    return sorted(collected, key=lambda record: record['createdAt'])
def process_posts(posts):
    """Attach embeds, quotes and replies to ``posts`` and return the roots.

    Every post dict is mutated in place:

    * ``'replies'`` is set to a list of the posts whose reply parent is this
      post.
    * Image alt texts (``[alt]``, one per line) or an external link
      (``[title](uri)``) are appended to ``'text'``.
    * When the post quotes another post from ``posts``, ``'quotedText'`` and
      ``'quotedDate'`` (the part of ``createdAt`` before ``'T'``) are set.
    * ``'external_reply'`` is set to ``1`` when the reply parent is not among
      ``posts``.

    Quotes and media are handled both for the plain embed types and for
    ``app.bsky.embed.recordWithMedia``, which nests a quote together with
    media. Rich-text facets are deliberately not rendered; the text is kept
    as-is.

    Posts are matched by the last path segment of the referenced URI against
    each post's ``'rkey'``.

    Args:
        posts: List of post dicts, each carrying ``'rkey'``, ``'text'`` and
            ``'createdAt'``.

    Returns:
        The posts that start a thread, in their original order: posts that
        are not replies, plus replies whose parent is not among ``posts``.
    """
    posts_by_rkey = {post['rkey']: post for post in posts}

    for post in posts:
        post['replies'] = []
        embed = post.get('embed') or {}
        embed_type = embed.get('$type')

        # Split the embed into its media part and its quoted-record part.
        if embed_type == "app.bsky.embed.recordWithMedia":
            media = embed.get('media') or {}
            quoted_ref = (embed.get('record') or {}).get('record')
        elif embed_type == "app.bsky.embed.record":
            media = {}
            quoted_ref = embed.get('record')
        else:
            media = embed
            quoted_ref = None

        media_type = media.get('$type')
        if media_type == "app.bsky.embed.images":
            images_text = '\n'.join(f"[{image['alt']}]" for image in media['images'])
            post['text'] += f"\n{images_text}"
        elif media_type == "app.bsky.embed.external":
            external = media['external']
            post['text'] += f"\n[{external['title']}]({external['uri']})"

        if quoted_ref:
            quoted_post = posts_by_rkey.get(quoted_ref['uri'].split('/')[-1])
            if quoted_post is not None:
                # Store the quoted text as it currently stands on that post.
                post['quotedText'] = quoted_post['text']
                post['quotedDate'] = quoted_post['createdAt'].split('T')[0]

    # Link replies to their parents and collect thread roots in one pass.
    root_posts = []
    for post in posts:
        parent = (post.get('reply') or {}).get('parent')
        if parent is None:
            root_posts.append(post)
            continue
        parent_post = posts_by_rkey.get(parent['uri'].split('/')[-1])
        if parent_post is not None:
            parent_post['replies'].append(post)
        else:
            # The parent is not in this archive: treat the reply as a root.
            post['external_reply'] = 1
            root_posts.append(post)
    return root_posts
def print_posts(posts):
    """Write the given thread roots and their nested replies to stdout.

    A ``--date--`` header is printed before a depth-0 post whenever its date
    differs from the previously printed header. Replies are prefixed once per
    nesting level and followed by their date; quoted text is printed
    underneath the post, one ``> `` line per quoted line, followed by the
    quoted post's date.

    Args:
        posts: Post dicts carrying ``'text'``, ``'createdAt'`` and
            ``'replies'``; ``'external_reply'``, ``'quotedText'`` and
            ``'quotedDate'`` are read when present.
    """
    previous_root_day = None  # Date of the most recent header that was printed

    def emit(entry, level=0):
        nonlocal previous_root_day
        day = entry['createdAt'].split('T')[0]
        prefix = ' ↳ ' * level  # One marker per nesting level
        is_external = entry.get('external_reply')

        print()
        if level == 0 or is_external:
            # Extra blank line ahead of anything that starts a thread.
            print()
        if level == 0 and day != previous_root_day:
            print()
            print("--" + day + "--")
            previous_root_day = day

        print(f"{prefix}{entry['text']}", end="")
        if level != 0 and not is_external:
            print(f" —{day}", end="")

        if 'quotedText' in entry:
            pad = ' ' * len(prefix)
            for quoted_line in entry['quotedText'].split('\n'):
                print(f"\n{pad}> {quoted_line}", end="")
            print(f" —{entry['quotedDate']}", end="")

        for child in entry['replies']:
            emit(child, level + 1)

    for top in posts:
        # Replies to posts outside the archive start one level deep.
        emit(top, top.get('external_reply', 0))
def main(argv=None):
    """Print the profile header and the threaded posts of an unpacked repo.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is the unpacked repository
            directory, which is expected to hold ``app.bsky.feed.post`` and
            ``app.bsky.actor.profile`` sub-directories.

    Returns:
        ``0`` on success, ``2`` when the directory argument is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        program = os.path.basename(sys.argv[0]) if sys.argv else "thread_replies.py"
        print(f"usage: python {program} <unpacked-repo-directory>", file=sys.stderr)
        return 2
    directory = args[0]

    # The output contains non-ASCII markers; force UTF-8 so redirecting to a
    # file does not fail under a narrower locale encoding.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    posts = read_posts_from_directory(os.path.join(directory, "app.bsky.feed.post"))
    root_posts = process_posts(posts)

    profile_path = os.path.join(directory, "app.bsky.actor.profile", "self.json")
    # Skip the header when there is no profile record; missing name or
    # description fields print as empty lines.
    if os.path.isfile(profile_path):
        profile = read_json(profile_path)
        print(profile.get('displayName', ''))
        print()
        print(profile.get('description', ''))

    print_posts(root_posts)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment