@deepfates
Created November 17, 2024 19:33
Convert your Twitter archive into a training dataset and markdown files
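# Usage sketch (the filename convert_twitter_archive.py is illustrative, not part of the gist;
# the flags are defined in the argparse block at the bottom of this file):
#   python convert_twitter_archive.py --archive-path ./twitter-archive --output-dir ./output --output-formats markdown oai
#
# Expected archive layout, inferred from the paths used below:
#   <archive-path>/data/manifest.js
#   <archive-path>/data/tweets_media/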
import argparse
import json
import logging
import os
import re
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class MediaFile:
    id: str
    content_type: str
    path: str
    metadata: Dict[str, Any]

@dataclass
class Content:
    id: str
    text: str
    metadata: Dict[str, Any]
    timestamp: str
    parent_id: Optional[str]
    media_files: List[Dict[str, Any]]
    content_source: str

@dataclass
class Thread:
    id: str
    contents: List[Content]

@dataclass
class Message:
    role: Literal["assistant", "user"]
    content: str

# Data extraction functions
def clean_json_string(json_string: str) -> str:
    return re.sub(r'^window\.[^=]+=\s*', '', json_string.strip()).rstrip(';')

def process_file(file_path: str) -> List[Dict[str, Any]]:
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = clean_json_string(f.read())
            results = json.loads(data)
        return results
    except Exception as e:
        logger.warning(f"Error processing file {file_path}: {e}")
        return []

def extract_manifest(file_path: str) -> Dict[str, Any]:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = clean_json_string(file.read())
        return json.loads(content)
    except json.JSONDecodeError:
        match = re.search(r'window\.__THAR_CONFIG\s*=\s*({.*})', content, re.DOTALL)
        if match:
            return json.loads(match.group(1))
        logger.error(f"Could not parse __THAR_CONFIG in manifest file: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error extracting manifest from {file_path}: {e}")
        raise

def get_media_files(tweet_id: str, media_folder: str) -> List[str]:
    try:
        all_files = os.listdir(media_folder)
        media_files = [
            f for f in all_files
            if f.startswith(f"{tweet_id}-") and os.path.getsize(os.path.join(media_folder, f)) > 0
        ]
        return media_files
    except Exception as e:
        logger.error(f"Error getting media files for tweet_id {tweet_id}: {e}")
        return []

def get_media_type(filename: str) -> str:
    ext = os.path.splitext(filename)[1].lower()
    if ext in ('.mp4', '.mov'):
        return 'video'
    elif ext in ('.jpg', '.jpeg', '.png', '.gif'):
        return 'photo'
    return 'unknown'

def extract_content(item: Dict[str, Any], content_source: str, media_folder: str) -> List[Content]:
    content_id = item.get('id') or item.get('tweetId')
    text = item.get('text') or item.get('fullText') or item.get('full_text')
    media_files = get_media_files(content_id, media_folder)
    media_file_objects = [{
        'id': f"{content_id}_{os.path.splitext(media_file)[0]}",
        'content_type': get_media_type(media_file),
        'path': os.path.join(media_folder, media_file),
        'metadata': {
            'parent_tweet': item,
            'media_info': item.get('extended_entities', {}).get('media', [])
        }
    } for media_file in media_files]
    return [Content(
        id=content_id,
        text=text,
        metadata=item,
        timestamp=item.get('created_at', ''),
        parent_id=item.get('in_reply_to_status_id', None),
        media_files=media_file_objects,
        content_source=content_source
    )]

def process_file_wrapper(args: Tuple[str, Dict[str, Any], str, str]) -> List[Content]:
    archive_path, file_info, extractor_name, media_folder = args
    file_path = os.path.join(archive_path, file_info['fileName'])
    file_data = process_file(file_path)
    extractor = globals()[extractor_name]  # Get the extractor function by name
    return extractor(file_data, media_folder)

def extract_content_data(archive_path: str, file_info: Dict[str, Any], extractor: Callable, media_folder: str) -> List[Content]:
    try:
        return extractor(file_info['data'], media_folder)
    except Exception as e:
        logger.error(f"Error extracting data with {extractor.__name__}: {e}")
        return []

def extract_data(archive_path: str, type_info: Dict[str, Any], extractor: Callable) -> List[Content]:
    media_folder = os.path.join(archive_path, 'data', 'tweets_media')
    contents = []
    extractor_name = extractor.__name__
    with ProcessPoolExecutor() as executor:
        args_list = [
            (archive_path, file_info, extractor_name, media_folder)
            for file_info in type_info.get('files', [])
        ]
        futures = [executor.submit(process_file_wrapper, args) for args in args_list]
        total_futures = len(futures)
        logger.info(f"Processing {total_futures} files with {extractor_name}")
        completed_count = 0
        for future in as_completed(futures):
            result = future.result()
            if result:
                contents.extend(result)
            completed_count += 1
            if completed_count % 10 == 0 or completed_count == total_futures:
                logger.info(f"Processed {completed_count}/{total_futures} files")
    logger.info(f"Total {extractor_name} extracted: {len(contents)} from {len(type_info.get('files', []))} files")
    return contents

def extract_tweets(file_data: List[Dict[str, Any]], media_folder: str) -> List[Content]:
    logger.info(f"Extracting tweets from {len(file_data)} items")
    contents = [
        content
        for tweet in file_data if 'tweet' in tweet
        for content in extract_content(tweet['tweet'], 'tweet', media_folder)
    ]
    logger.info(f"Extracted {len(contents)} tweet contents")
    return contents

def extract_likes(file_data: List[Dict[str, Any]], media_folder: str) -> List[Content]:
    logger.info(f"Extracting likes from {len(file_data)} items")
    contents = [
        content
        for like in file_data if 'like' in like
        for content in extract_content(like['like'], 'like', media_folder)
    ]
    logger.info(f"Extracted {len(contents)} like contents")
    return contents

def extract_archive_data(archive_path: str) -> Dict[str, List[Content]]:
    try:
        manifest_path = os.path.join(archive_path, 'data', 'manifest.js')
        manifest = extract_manifest(manifest_path)
        data_types = manifest.get('dataTypes', {})
        extractors = {
            'tweets': extract_tweets,
            'like': extract_likes,
            # Add more extractors as needed
        }
        response = {}
        for data_type, extractor in extractors.items():
            if data_type in data_types:
                response[data_type] = extract_data(archive_path, data_types[data_type], extractor)
        return response
    except Exception as e:
        logger.error(f"Error occurred during data extraction: {e}")
        return {}

# Data transformation functions
def clean_text(text: str, entities: Optional[Dict] = None) -> str:
    if entities:
        for url in entities.get('urls', []):
            short_url = url.get('url', '')
            expanded_url = url.get('expanded_url', '')
            if short_url and expanded_url:
                text = text.replace(short_url, expanded_url)
    text = re.sub(r'https://t\.co/\w+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def get_all_tweets(data: Dict[str, List[Content]]) -> Dict[str, Content]:
    logger.info("Combining tweets and likes into all_tweets")
    all_tweets = {tweet.id: tweet for tweet in data.get('tweets', []) if tweet.id}
    logger.info(f"Added {len(data.get('tweets', []))} tweets to all_tweets")
    likes = data.get('like', [])
    for like in likes:
        if like.id:
            all_tweets[like.id] = like
        else:
            logger.warning("Like without id encountered and skipped.")
    logger.info(f"Added {len(likes)} likes to all_tweets")
    logger.info(f"Total {len(all_tweets)} tweets/likes in all_tweets")
    return all_tweets

def get_conversation_texts(conversation: List[Content]) -> List[Tuple[str, str]]:
    return [
        (tweet.text, "assistant" if 'full_text' in tweet.metadata else "user")
        for tweet in conversation
        if tweet.text
    ]

def trim_conversation_to_last_assistant(conversation_data: List[Message]) -> List[Message]:
    for i in range(len(conversation_data) - 1, -1, -1):
        if conversation_data[i].role == "assistant":
            return conversation_data[:i+1]
    return []

def get_conversation_data(conversation: List[Content]) -> List[Message]:
    conversation_data = []
    current_role = None
    current_content = []
    for text, role in get_conversation_texts(conversation):
        cleaned_text = clean_text(text)
        if cleaned_text:
            if role != current_role and current_role is not None:
                conversation_data.append(format_message(current_content, current_role))
                current_content = []
            current_role = role
            current_content.append(cleaned_text)
    if current_content:
        conversation_data.append(format_message(current_content, current_role))
    return trim_conversation_to_last_assistant(conversation_data)

def extract_threads_and_conversations(all_tweets: Dict[str, Content]) -> Tuple[List[Thread], List[List[Content]]]:
    """Extract threads and conversations from all tweets."""
    threads = []
    conversations = []
    # Keep track of processed tweet IDs to avoid duplicates
    processed_ids = set()
    for tweet in all_tweets.values():
        if tweet.id in processed_ids:
            continue
        if tweet.content_source == 'tweet' and tweet.parent_id and tweet.parent_id in all_tweets and not tweet.text.startswith('RT'):
            # Initialize the chain
            chain = [tweet]
            current_tweet = tweet
            # Walk up the chain of replies
            while current_tweet.parent_id and current_tweet.parent_id in all_tweets:
                parent_tweet = all_tweets[current_tweet.parent_id]
                chain.append(parent_tweet)
                current_tweet = parent_tweet
                if current_tweet.id in processed_ids:
                    break  # Avoid cycles
            # Mark tweets as processed
            for t in chain:
                processed_ids.add(t.id)
            # Determine if it's a thread or conversation
            if all(t.content_source == 'tweet' for t in chain):
                # This is a thread (user replying to themselves)
                threads.append(Thread(id=tweet.id, contents=list(reversed(chain))))
            else:
                # This is a conversation (user replying to others)
                conversations.append(list(reversed(chain)))
    return threads, conversations

# Data export functions
def process_media_files(media_files: List[Dict[str, Any]], images_folder: str) -> List[str]:
    media_links = []
    for media_file in media_files:
        media_path = media_file.get('path')
        if media_path and os.path.isfile(media_path):
            orig_filename = os.path.basename(media_path)
            new_filename = f"_{orig_filename}"
            dest_path = os.path.join(images_folder, new_filename)
            shutil.copy(media_path, dest_path)
            media_links.append(f"![{new_filename}](images/{new_filename})")
        else:
            logger.warning(f"Invalid or missing media path: {media_path}")
    return media_links

def save_thread_markdown(thread: Thread, output_dir: str, media_folder: str, images_folder: str):
    if not thread.contents:
        logger.warning("Attempted to save an empty thread.")
        return
    try:
        date_str = thread.contents[0].timestamp
        date = datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y').date()
    except ValueError:
        logger.warning(f"Invalid date format: {date_str}")
        date = datetime.today().date()
    frontmatter = f"---\nDate: {date.isoformat()}\n---\n"
    thread_text = []
    for tweet in thread.contents:
        media_links = process_media_files(tweet.media_files, images_folder)
        cleaned_text = clean_text(tweet.text, tweet.metadata.get('entities'))
        combined_text = f"{cleaned_text}\n\n" + '\n\n'.join(media_links)
        thread_text.append(combined_text)
    first_words = ' '.join(thread_text[0].split()[:5])
    sanitized_filename = re.sub(r'[^\w\-_ ]', '', first_words).strip().replace(' ', '_')[:50]
    filename = f"{sanitized_filename}.md"
    file_path = os.path.join(output_dir, filename)
    top_tweet_id = thread.contents[0].id
    top_tweet_link = f"https://twitter.com/i/web/status/{top_tweet_id}"
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(f"{frontmatter}\n\n" + '\n\n'.join(thread_text) + f"\n\n[View on Twitter]({top_tweet_link})")

def save_tweets_by_date(all_tweets: Dict[str, Content], threads: List[Thread], output_dir: str, images_folder: str):
    thread_ids = {tweet.id for thread in threads for tweet in thread.contents}
    non_thread_tweets = [
        tweet for tweet_id, tweet in all_tweets.items()
        if tweet_id not in thread_ids
        and not tweet.parent_id
        and tweet.content_source == 'tweet'
        and not tweet.text.startswith('RT')
    ]
    tweets_by_date: Dict[datetime.date, List[Content]] = {}
    for tweet in non_thread_tweets:
        date_str = tweet.timestamp
        if not date_str:
            logger.warning(f"Tweet missing date information: {tweet}")
            continue
        try:
            date = datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y').date()
            tweets_by_date.setdefault(date, []).append(tweet)
        except ValueError:
            logger.warning(f"Invalid date format: {date_str}")
    for date, tweets_on_date in tweets_by_date.items():
        filename = f"{date.isoformat()}.md"
        file_path = os.path.join(output_dir, filename)
        tweets_on_date.sort(key=lambda x: x.timestamp)
        content = '\n\n---\n\n'.join(
            f"*{datetime.strptime(tweet.timestamp, '%a %b %d %H:%M:%S %z %Y').strftime('%I:%M %p')}* \n{clean_text(tweet.text, tweet.metadata.get('entities'))}" +
            ''.join(process_media_files(tweet.media_files, images_folder))
            for tweet in tweets_on_date
        )
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

def format_message(content: List[str], role: Literal['assistant', 'user']) -> Message:
    return Message(role=role, content="\n\n".join(content))

def format_conversation(conversation_data: List[Message], system_message: str) -> Dict[str, Any]:
    messages = [{"role": "system", "content": system_message}]
    messages.extend([message.__dict__ for message in conversation_data])
    return {"messages": messages}

def save_conversations_to_jsonl(threads: List[Thread], conversations: List[List[Content]], output_path: str, system_message: str = "You have been uploaded to the internet"):
    logger.info(f"Saving {len(conversations) + len(threads)} conversations to {output_path} in oai format")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for thread in threads:
            formatted_thread = get_conversation_data(thread.contents)
            if not formatted_thread:
                continue
            formatted_thread = format_conversation(formatted_thread, system_message)
            f.write(json.dumps(formatted_thread) + '\n')
        for conversation in conversations:
            formatted_conv = get_conversation_data(conversation)
            if not formatted_conv:
                continue
            formatted_conv = format_conversation(formatted_conv, system_message)
            f.write(json.dumps(formatted_conv) + '\n')

def main(archive_path: str, output_dir: str, output_formats: List[str], system_message: str):
    data = extract_archive_data(archive_path)
    all_tweets = get_all_tweets(data)
    threads, conversations = extract_threads_and_conversations(all_tweets)
    if 'markdown' in output_formats:
        threads_output_dir = os.path.join(output_dir, 'threads')
        images_folder = os.path.join(output_dir, 'images')
        non_thread_output_dir = os.path.join(output_dir, 'tweets_by_date')
        os.makedirs(threads_output_dir, exist_ok=True)
        os.makedirs(images_folder, exist_ok=True)
        os.makedirs(non_thread_output_dir, exist_ok=True)
        logger.info(f"Saving {len(threads)} threads")
        for i, thread in enumerate(threads, start=1):
            save_thread_markdown(
                thread,
                threads_output_dir,
                os.path.join(archive_path, 'data', 'tweets_media'),
                images_folder
            )
            if i % 10 == 0 or i == len(threads):
                logger.info(f"Saved {i}/{len(threads)} threads")
        save_tweets_by_date(all_tweets, threads, non_thread_output_dir, images_folder)
    if 'oai' in output_formats:
        output_path = os.path.join(output_dir, 'conversations_oai.jsonl')
        save_conversations_to_jsonl(threads, conversations, output_path, system_message)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process Twitter archive")
    parser.add_argument("--archive-path", default="test", help="Path to the Twitter archive directory")
    parser.add_argument("--output-dir", default="output", help="Directory where outputs will be saved")
    parser.add_argument("--output-formats", nargs='+', default=['markdown', 'oai'],
                        help="Output formats to generate (markdown, oai)")
    parser.add_argument("--system-message", default="You have been uploaded to the internet",
                        help="System message for the conversation")
    args = parser.parse_args()
    main(args.archive_path, args.output_dir, args.output_formats, args.system_message)
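
For reference, each line of conversations_oai.jsonl written by save_conversations_to_jsonl is json.dumps of a dict built by format_conversation. A sketch of that shape is below; the user/assistant content values are made-up placeholders, and the system message shown is the script's default.

# Illustrative shape of one serialized conversation (placeholder values, not real archive data):
example_line = {
    "messages": [
        {"role": "system", "content": "You have been uploaded to the internet"},
        {"role": "user", "content": "cleaned text of the tweet being replied to"},
        {"role": "assistant", "content": "cleaned text of the archive owner's reply"},
    ]
}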