Created
December 17, 2024 08:35
-
-
Save jongan69/d9a43b9177fce7c08b872926b645f126 to your computer and use it in GitHub Desktop.
scrape vydamo_.exe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tweepy | |
import os | |
import json | |
import time | |
from dotenv import load_dotenv | |
from datetime import datetime, timedelta | |
# Populate the process environment from a local .env file, then read the
# Twitter/X API credentials out of it. Any variable that is not set ends up
# as None rather than raising.
load_dotenv()

# OAuth 1.0a consumer (application) key pair.
TWITTER_CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
TWITTER_CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")

# OAuth 1.0a access token pair.
TWITTER_ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Bearer token.
TWITTER_BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN")
# Maximum number of tweets this script allows itself to pull per month.
MONTHLY_TWEET_LIMIT = 100  # Adjust based on your monthly quota

# Tweets already pulled this month before the script starts. The scrape loop
# increments this as it goes; the starting value is hard-coded and must be
# kept up to date by hand.
current_usage = 20  # Track how many tweets have been pulled (update this dynamically)

# Snapshot of the current time as a naive datetime expressed in UTC.
# NOTE(review): datetime.utcnow() is deprecated since Python 3.12; switching
# to an aware datetime would require every consumer of current_date and
# reset_date to be migrated together.
current_date = datetime.utcnow()

# The quota is assumed to reset at the end of the current calendar month
# (23:59:59 UTC). Previously this was pinned to December 31, which is only
# correct when the script runs in December.
# Adding 32 days to the first of the month always lands in the following
# month; snapping that back to day 1 gives the start of next month, and one
# second before that is the last second of the current month.
_month_start = current_date.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
_next_month_start = (_month_start + timedelta(days=32)).replace(day=1)
reset_date = _next_month_start - timedelta(seconds=1)
def time_until_reset(now=None, reset=None):
    """Return the time remaining until the quota reset as a ``timedelta``.

    The clock is read when the function is called, so the result stays
    accurate after the script has been sleeping or running for a long time
    (subtracting the import-time ``current_date`` snapshot would not).

    Args:
        now: Naive UTC datetime to measure from. Defaults to the current
            UTC time.
        reset: Naive UTC datetime of the reset. Defaults to the
            module-level ``reset_date``.

    Returns:
        ``reset - now``, clamped at zero so the value can be handed
        straight to ``time.sleep`` even when the reset is already past.
    """
    if reset is None:
        reset = reset_date
    if now is None:
        # Naive UTC, matching how reset_date and current_date are built.
        now = datetime.utcnow()
    return max(reset - now, timedelta(0))
def create_twitter_client():
    """Build a ``tweepy.Client`` from the module-level credentials.

    Construction is attempted up to five times. A rate-limit error sleeps
    until the reset time advertised in the ``x-rate-limit-reset`` header
    (15 minutes from now when the header or the response is missing); any
    other tweepy error sleeps five seconds before the next attempt.

    NOTE(review): constructing ``tweepy.Client`` does not appear to contact
    the API, so these handlers are probably never reached - confirm against
    the tweepy documentation before relying on the retry behavior.

    Returns:
        The client on success, or ``None`` once every attempt has failed.
    """
    retries = 5  # Max retry attempts
    for _ in range(retries):
        try:
            # Set up Twitter API v2 authentication
            twitter_api = tweepy.Client(
                bearer_token=TWITTER_BEARER_TOKEN,
                consumer_key=TWITTER_CONSUMER_KEY,
                consumer_secret=TWITTER_CONSUMER_SECRET,
                access_token=TWITTER_ACCESS_TOKEN,
                access_token_secret=TWITTER_ACCESS_TOKEN_SECRET,
            )
            print("Twitter client created successfully.")
            return twitter_api
        except tweepy.errors.TooManyRequests as e:
            # Handle rate limit error (429) by sleeping until the reset time.
            fallback_reset = time.time() + 15 * 60
            response = getattr(e, "response", None)
            if response is not None:
                reset_time = int(response.headers.get("x-rate-limit-reset", fallback_reset))
            else:
                reset_time = int(fallback_reset)
            # A reset timestamp that is already in the past would give a
            # negative duration, which time.sleep() rejects with ValueError.
            wait_time = max(0, reset_time - int(time.time()))
            print(f"Rate limit exceeded. Sleeping for {wait_time} seconds before retrying...")
            time.sleep(wait_time)
        except tweepy.TweepyException as e:
            # Handle other types of API exceptions (e.g., authentication errors)
            print(f"Error creating Twitter client: {e}")
            time.sleep(5)  # Short sleep before retrying
    print("Max retries reached. Exiting.")
    return None
# Script entry point: create the client, look up the target user, page
# through their tweets in batches and write them to a JSON file as
# {"prompt", "completion"} records.
twitter_api = create_twitter_client()

if twitter_api:
    username = 'vydamo_'  # Replace with the target username
    output_file = "data.json"  # File to save the data
    batch_size = 10  # Number of tweets to fetch per request
    max_tweets = 1000  # Total number of tweets to scrape

    # Scrape state
    fetched_tweets = []
    pagination_token = None  # Token to fetch the next batch of tweets
    total_scraped = 0

    def wait_for_rate_limit_reset(reset_time):
        """
        Sleep until ``reset_time`` (epoch seconds); return immediately if
        that moment has already passed.
        """
        current_time = int(time.time())
        wait_time = reset_time - current_time
        if wait_time > 0:
            print(f"Rate limit exceeded. Sleeping for {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            print("Rate limit reset time has passed.")

    def handle_rate_limit_error(e):
        """
        Handle a 429 rate limit error by sleeping until the reset time given
        in the ``x-rate-limit-reset`` response header. Falls back to a
        15-minute wait when the header or the response itself is missing.
        """
        if e.response is not None:
            # Extract reset time from the rate limit response headers
            reset_time = int(e.response.headers.get("x-rate-limit-reset", time.time() + 15 * 60))
            wait_for_rate_limit_reset(reset_time)
        else:
            print("Rate limit error without response headers.")
            time.sleep(15 * 60)  # Default backoff if headers are not found

    try:
        # Get the user's Twitter ID
        user = twitter_api.get_user(username=username)
        if user.data:
            user_id = user.data.id
            print(f"Fetching tweets for user: {username} (User ID: {user_id})")

            scrape_finished = False
            try:
                # Loop to fetch tweets in batches
                while total_scraped < max_tweets:
                    # Once the monthly allowance is used up, wait for the reset.
                    if current_usage >= MONTHLY_TWEET_LIMIT:
                        remaining_time = time_until_reset()
                        print(f"Tweet scrape limit reached. Sleeping for {remaining_time} until the reset.")
                        # Clamp at zero: time.sleep() raises on a negative value.
                        time.sleep(max(0, remaining_time.total_seconds()))
                        # Start counting afresh (assumes the quota is fully
                        # restored at the reset). Without this, every later
                        # batch would trigger another full sleep.
                        current_usage = 0

                    try:
                        # Fetch a batch of tweets
                        response = twitter_api.get_users_tweets(
                            id=user_id,
                            max_results=batch_size,
                            pagination_token=pagination_token,
                            tweet_fields=["text", "id"],
                        )
                    except tweepy.errors.TooManyRequests as e:
                        # Wait for the rate-limit window to reset, then retry
                        # the same page.
                        print("Rate limit exceeded.")
                        handle_rate_limit_error(e)
                        continue

                    # Check if there are tweets in the response
                    if not response.data:
                        print("No tweets found in the current batch.")
                        break

                    for tweet in response.data:
                        fetched_tweets.append({
                            "prompt": "Tweet:",  # Static prompt for fine-tuning
                            "completion": tweet.text.strip(),  # Text of the tweet
                        })
                        total_scraped += 1
                        current_usage += 1  # Update the usage count
                        # Stop if we've reached the max_tweets limit
                        if total_scraped >= max_tweets:
                            break

                    # Update pagination token for the next request
                    pagination_token = response.meta.get("next_token")
                    print(f"Scraped {total_scraped} tweets so far...")

                    # Stop if there's no next page of tweets
                    if not pagination_token:
                        print("No more tweets available to fetch.")
                        break

                scrape_finished = True
            finally:
                # Save on a clean finish, and also when the loop was cut short
                # by an error or Ctrl+C after collecting tweets, since every
                # fetched tweet counts against the monthly quota. An aborted
                # run with nothing collected leaves any existing file alone.
                if scrape_finished or fetched_tweets:
                    with open(output_file, "w", encoding="utf-8") as f:
                        json.dump(fetched_tweets, f, indent=4)
                    print(f"Scraped {total_scraped} tweets. Saved to {output_file}")
        else:
            print(f"User {username} not found.")
    except tweepy.errors.TweepyException as e:
        print(f"Error: {str(e)}")
    except Exception as e:
        # Top-level boundary of the script: report instead of a traceback.
        print(f"Unexpected error: {str(e)}")
else:
    print("Failed to create Twitter client after retries.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment