@jongan69 · Created December 17, 2024 08:35
scrape vydamo_.exe

import tweepy
import os
import json
import time
import calendar
from dotenv import load_dotenv
from datetime import datetime, timedelta

# Load environment variables from .env
load_dotenv()
TWITTER_CONSUMER_KEY = os.getenv("TWITTER_CONSUMER_KEY")
TWITTER_CONSUMER_SECRET = os.getenv("TWITTER_CONSUMER_SECRET")
TWITTER_ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_TOKEN_SECRET = os.getenv("TWITTER_ACCESS_TOKEN_SECRET")
TWITTER_BEARER_TOKEN = os.getenv("TWITTER_BEARER_TOKEN")
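
# A sketch of the .env file this script expects (placeholder values, not real
# credentials); the variable names match the os.getenv() calls above:
#
#   TWITTER_CONSUMER_KEY=your-consumer-key
#   TWITTER_CONSUMER_SECRET=your-consumer-secret
#   TWITTER_ACCESS_TOKEN=your-access-token
#   TWITTER_ACCESS_TOKEN_SECRET=your-access-token-secret
#   TWITTER_BEARER_TOKEN=your-bearer-token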

# Define a monthly tweet scrape limit
MONTHLY_TWEET_LIMIT = 100  # Adjust based on your monthly quota
current_usage = 20  # Track how many tweets have been pulled (update this dynamically)

# The scrape quota resets at the end of the current month (UTC)
current_date = datetime.utcnow()
last_day_of_month = calendar.monthrange(current_date.year, current_date.month)[1]
reset_date = datetime(current_date.year, current_date.month, last_day_of_month, 23, 59, 59)

# Function to calculate the remaining time until the reset
def time_until_reset():
    # Recompute "now" on each call so repeated checks stay accurate
    return reset_date - datetime.utcnow()
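
# Worked example (using the gist's creation time): with "now" at
# 2024-12-17 08:35 UTC, reset_date is 2024-12-31 23:59:59 UTC, and
# time_until_reset() returns a timedelta of about 14 days and 15.4 hours.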

# Function to create a Twitter client
def create_twitter_client():
    retries = 5  # Max retry attempts
    for attempt in range(retries):
        try:
            # Set up Twitter API v2 authentication
            twitter_api = tweepy.Client(
                bearer_token=TWITTER_BEARER_TOKEN,
                consumer_key=TWITTER_CONSUMER_KEY,
                consumer_secret=TWITTER_CONSUMER_SECRET,
                access_token=TWITTER_ACCESS_TOKEN,
                access_token_secret=TWITTER_ACCESS_TOKEN_SECRET
            )
            print("Twitter client created successfully.")
            return twitter_api
        except tweepy.errors.TooManyRequests as e:
            # Handle rate limit error (429) by sleeping until the reset time
            reset_time = int(e.response.headers.get("x-rate-limit-reset", time.time() + 15 * 60))
            wait_time = max(reset_time - int(time.time()), 0)  # Never sleep a negative duration
            print(f"Rate limit exceeded. Sleeping for {wait_time} seconds before retrying...")
            time.sleep(wait_time)
        except tweepy.TweepyException as e:
            # Handle other types of API exceptions (e.g., authentication errors)
            print(f"Error creating Twitter client: {e}")
            time.sleep(5)  # Short sleep before retrying
    print("Max retries reached. Exiting.")
    return None

# Initialize Twitter client
twitter_api = create_twitter_client()

if twitter_api:
    username = 'vydamo_'  # Replace with the target username
    output_file = "data.json"  # File to save the data
    batch_size = 10  # Number of tweets to fetch per request
    max_tweets = 1000  # Total number of tweets to scrape

    # Initialize variables
    fetched_tweets = []
    pagination_token = None  # Token to fetch the next batch of tweets
    total_scraped = 0

    def wait_for_rate_limit_reset(reset_time):
        """
        Wait until the rate limit is reset.
        """
        current_time = int(time.time())
        wait_time = reset_time - current_time
        if wait_time > 0:
            print(f"Rate limit exceeded. Sleeping for {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            print("Rate limit reset time has passed.")

    def handle_rate_limit_error(e):
        """
        Handle the 429 rate limit error and sleep until the reset time.
        """
        if e.response is not None:
            # Extract the reset time from the rate limit response headers
            reset_time = int(e.response.headers.get("x-rate-limit-reset", time.time() + 15 * 60))
            wait_for_rate_limit_reset(reset_time)
        else:
            print("Rate limit error without response headers.")
            time.sleep(15 * 60)  # Default backoff if headers are not found
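
    # Worked example (illustrative epoch values): if the x-rate-limit-reset
    # header reads 1734430500 and the current time is 1734429900,
    # wait_for_rate_limit_reset() sleeps for 1734430500 - 1734429900 = 600
    # seconds (10 minutes) before the next request is attempted.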

    try:
        # Get the user's Twitter ID
        user = twitter_api.get_user(username=username)
        if user.data:
            user_id = user.data.id
            print(f"Fetching tweets for user: {username} (User ID: {user_id})")

            # Loop to fetch tweets in batches
            while total_scraped < max_tweets:
                # If the current usage exceeds the monthly limit, pause scraping
                if current_usage >= MONTHLY_TWEET_LIMIT:
                    remaining_time = time_until_reset()
                    print(f"Tweet scrape limit reached. Sleeping for {remaining_time} until the reset.")
                    time.sleep(max(remaining_time.total_seconds(), 0))  # Sleep until the reset
                    current_usage = 0  # New quota window; reset the local counter

                try:
                    # Fetch a batch of tweets
                    response = twitter_api.get_users_tweets(
                        id=user_id,
                        max_results=batch_size,
                        pagination_token=pagination_token,
                        tweet_fields=["text", "id"]
                    )

                    # Check if there are tweets in the response
                    if response.data:
                        for tweet in response.data:
                            fetched_tweets.append({
                                "prompt": "Tweet:",  # Static prompt for fine-tuning
                                "completion": tweet.text.strip()  # Text of the tweet
                            })
                            total_scraped += 1
                            current_usage += 1  # Update the usage count

                            # Stop if we've reached the max_tweets limit
                            if total_scraped >= max_tweets:
                                break

                        # Update the pagination token for the next request
                        pagination_token = response.meta.get("next_token")
                        print(f"Scraped {total_scraped} tweets so far...")

                        # Stop if there's no next page of tweets
                        if not pagination_token:
                            print("No more tweets available to fetch.")
                            break
                    else:
                        print("No tweets found in the current batch.")
                        break
                except tweepy.errors.TooManyRequests as e:
                    # Handle the rate limit error and retry after waiting for the reset
                    print("Rate limit exceeded.")
                    handle_rate_limit_error(e)

            # Save all fetched tweets to a JSON file
            with open(output_file, "w") as f:
                json.dump(fetched_tweets, f, indent=4)
            print(f"Scraped {total_scraped} tweets. Saved to {output_file}")
        else:
            print(f"User {username} not found.")
    except tweepy.errors.TweepyException as e:
        print(f"Error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
else:
    print("Failed to create Twitter client after retries.")