@jongan69 · Created December 17, 2024 08:35
scrape vydamo_.exe

import tweepy
import os
import json
import time
import calendar
from dotenv import load_dotenv
from datetime import datetime, timedelta

# Load environment variables from .env
load_dotenv()
TWITTER_CONSUMER_KEY = os.getenv("TWITTER_CONSUMER_KEY")
TWITTER_CONSUMER_SECRET = os.getenv("TWITTER_CONSUMER_SECRET")
TWITTER_ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_TOKEN_SECRET = os.getenv("TWITTER_ACCESS_TOKEN_SECRET")
TWITTER_BEARER_TOKEN = os.getenv("TWITTER_BEARER_TOKEN")
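
# A sketch of the .env file this script expects (placeholder values, not real
# credentials); the variable names match the os.getenv() calls above:
#
#   TWITTER_CONSUMER_KEY=your-consumer-key
#   TWITTER_CONSUMER_SECRET=your-consumer-secret
#   TWITTER_ACCESS_TOKEN=your-access-token
#   TWITTER_ACCESS_TOKEN_SECRET=your-access-token-secret
#   TWITTER_BEARER_TOKEN=your-bearer-token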

# Define a monthly tweet scrape limit
MONTHLY_TWEET_LIMIT = 100  # Adjust based on your monthly quota
current_usage = 20  # Track how many tweets have been pulled (update this dynamically)

# The scrape quota resets at the end of the current month (UTC)
current_date = datetime.utcnow()
last_day_of_month = calendar.monthrange(current_date.year, current_date.month)[1]
reset_date = datetime(current_date.year, current_date.month, last_day_of_month, 23, 59, 59)

# Function to calculate the remaining time until the reset
def time_until_reset():
    # Recompute "now" on each call so repeated checks stay accurate
    return reset_date - datetime.utcnow()
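
# Worked example (using the gist's creation time): with "now" at
# 2024-12-17 08:35 UTC, reset_date is 2024-12-31 23:59:59 UTC, and
# time_until_reset() returns a timedelta of about 14 days and 15.4 hours.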

# Function to create a Twitter client
def create_twitter_client():
    retries = 5  # Max retry attempts
    for attempt in range(retries):
        try:
            # Set up Twitter API v2 authentication
            twitter_api = tweepy.Client(
                bearer_token=TWITTER_BEARER_TOKEN,
                consumer_key=TWITTER_CONSUMER_KEY,
                consumer_secret=TWITTER_CONSUMER_SECRET,
                access_token=TWITTER_ACCESS_TOKEN,
                access_token_secret=TWITTER_ACCESS_TOKEN_SECRET
            )
            print("Twitter client created successfully.")
            return twitter_api
        except tweepy.errors.TooManyRequests as e:
            # Handle rate limit error (429) by sleeping until the reset time
            reset_time = int(e.response.headers.get("x-rate-limit-reset", time.time() + 15 * 60))
            wait_time = max(reset_time - int(time.time()), 0)  # Never sleep a negative duration
            print(f"Rate limit exceeded. Sleeping for {wait_time} seconds before retrying...")
            time.sleep(wait_time)
        except tweepy.TweepyException as e:
            # Handle other types of API exceptions (e.g., authentication errors)
            print(f"Error creating Twitter client: {e}")
            time.sleep(5)  # Short sleep before retrying
    print("Max retries reached. Exiting.")
    return None

# Initialize Twitter client
twitter_api = create_twitter_client()

if twitter_api:
    username = 'vydamo_'  # Replace with the target username
    output_file = "data.json"  # File to save the data
    batch_size = 10  # Number of tweets to fetch per request
    max_tweets = 1000  # Total number of tweets to scrape

    # Initialize variables
    fetched_tweets = []
    pagination_token = None  # Token to fetch the next batch of tweets
    total_scraped = 0

    def wait_for_rate_limit_reset(reset_time):
        """
        Wait until the rate limit is reset.
        """
        current_time = int(time.time())
        wait_time = reset_time - current_time
        if wait_time > 0:
            print(f"Rate limit exceeded. Sleeping for {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            print("Rate limit reset time has passed.")

    def handle_rate_limit_error(e):
        """
        Handle the 429 rate limit error and sleep until the reset time.
        """
        if e.response is not None:
            # Extract the reset time from the rate limit response headers
            reset_time = int(e.response.headers.get("x-rate-limit-reset", time.time() + 15 * 60))
            wait_for_rate_limit_reset(reset_time)
        else:
            print("Rate limit error without response headers.")
            time.sleep(15 * 60)  # Default backoff if headers are not found
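
    # Worked example (illustrative epoch values): if the x-rate-limit-reset
    # header reads 1734430500 and the current time is 1734429900,
    # wait_for_rate_limit_reset() sleeps for 1734430500 - 1734429900 = 600
    # seconds (10 minutes) before the next request is attempted.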

    try:
        # Get the user's Twitter ID
        user = twitter_api.get_user(username=username)
        if user.data:
            user_id = user.data.id
            print(f"Fetching tweets for user: {username} (User ID: {user_id})")

            # Loop to fetch tweets in batches
            while total_scraped < max_tweets:
                # If the current usage exceeds the monthly limit, pause scraping
                if current_usage >= MONTHLY_TWEET_LIMIT:
                    remaining_time = time_until_reset()
                    print(f"Tweet scrape limit reached. Sleeping for {remaining_time} until the reset.")
                    time.sleep(max(remaining_time.total_seconds(), 0))  # Sleep until the reset
                    current_usage = 0  # New quota window; reset the local counter

                try:
                    # Fetch a batch of tweets
                    response = twitter_api.get_users_tweets(
                        id=user_id,
                        max_results=batch_size,
                        pagination_token=pagination_token,
                        tweet_fields=["text", "id"]
                    )

                    # Check if there are tweets in the response
                    if response.data:
                        for tweet in response.data:
                            fetched_tweets.append({
                                "prompt": "Tweet:",  # Static prompt for fine-tuning
                                "completion": tweet.text.strip()  # Text of the tweet
                            })
                            total_scraped += 1
                            current_usage += 1  # Update the usage count

                            # Stop if we've reached the max_tweets limit
                            if total_scraped >= max_tweets:
                                break

                        # Update the pagination token for the next request
                        pagination_token = response.meta.get("next_token")
                        print(f"Scraped {total_scraped} tweets so far...")

                        # Stop if there's no next page of tweets
                        if not pagination_token:
                            print("No more tweets available to fetch.")
                            break
                    else:
                        print("No tweets found in the current batch.")
                        break
                except tweepy.errors.TooManyRequests as e:
                    # Handle the rate limit error and retry after waiting for the reset
                    print("Rate limit exceeded.")
                    handle_rate_limit_error(e)

            # Save all fetched tweets to a JSON file
            with open(output_file, "w") as f:
                json.dump(fetched_tweets, f, indent=4)
            print(f"Scraped {total_scraped} tweets. Saved to {output_file}")
        else:
            print(f"User {username} not found.")
    except tweepy.errors.TweepyException as e:
        print(f"Error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
else:
    print("Failed to create Twitter client after retries.")