Last active
May 16, 2019 18:11
-
-
Save juanalonso/aa21da260754e97d46ac5c1b7b9f1881 to your computer and use it in GitHub Desktop.
Script para scrapear los tweets con un hashtag determinado.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# encoding: utf-8
"""Append the newest tweets matching a hashtag search to ``tweets.txt``.

The script keeps the id of the most recent tweet it has seen in ``index.txt``
(next to the script).  On each run it walks the search results, skips tweets
containing "@", stops at the first tweet whose id is not newer than the stored
one, and appends the cleaned text of every remaining tweet, one per line, to
``tweets.txt``.
"""
import os
import re
import sys

import tweepy

# Twitter API credentials; these must be filled in before running.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# Single-pass character clean-up: line feeds, hyphens, straight quotes and
# bullets become spaces; carriage returns and curly quotes are removed.
_CHAR_TABLE = str.maketrans({
    '\n': ' ',
    '\r': None,
    '“': None,
    '”': None,
    '-': ' ',
    '\'': ' ',
    '"': ' ',
    '•': ' ',
})
_SPACE_RUN = re.compile(' +')


def _clean_text(text):
    """Return *text* on a single line with the unwanted characters removed.

    Runs of spaces left behind by the substitutions are collapsed to one
    space and leading/trailing whitespace is stripped.
    """
    return _SPACE_RUN.sub(' ', text.translate(_CHAR_TABLE)).strip()


# NOTE(review): `wait_on_rate_limit_notify` and `api.search` are the Tweepy 3.x
# spellings; newer Tweepy releases renamed/removed them -- confirm the
# installed version before upgrading.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Both data files live in the same folder as the script.
scriptDir = os.path.dirname(sys.argv[0])
indexFilePath = os.path.join(scriptDir, 'index.txt')
tweetsFilePath = os.path.join(scriptDir, 'tweets.txt')

if os.path.exists(indexFilePath):
    with open(indexFilePath, 'r', encoding='utf8') as indexFile:
        lastTweet = indexFile.readline().strip()
else:
    lastTweet = '0'

# Ids are compared as integers: comparing the strings would be lexicographic
# and give the wrong answer for ids with a different number of digits.  An
# empty or corrupt index is treated as "nothing seen yet".
try:
    lastTweetId = int(lastTweet)
except ValueError:
    lastTweetId = 0

print("\n\n")
print(" Folder:", scriptDir)
print("Rate Limit:", api.rate_limit_status()['resources']['search'])
print(" Old index:", lastTweet)

counter = 0
firstTweet = True

# `with` guarantees the appended tweets are flushed and the file closed even
# if the search raises part-way through.
with open(tweetsFilePath, 'a', encoding='utf8') as csvFile:
    for tweet in tweepy.Cursor(api.search,
                               q='#sitges2018 -filter:retweets',
                               lang='es',
                               tweet_mode='extended').items():

        # Tweets containing "@" are ignored entirely.
        if "@" in tweet.full_text:
            continue

        # The first accepted tweet becomes the new index for the next run.
        if firstTweet:
            firstTweet = False
            with open(indexFilePath, 'w', encoding='utf8') as indexFile:
                indexFile.write(tweet.id_str + '\n')
            print(" New index:", tweet.id_str, '\n\n')

        # Stop at the first tweet that is not newer than the stored index.
        if int(tweet.id_str) <= lastTweetId:
            break

        text = _clean_text(tweet.full_text)

        counter += 1
        # Console preview only: the text is cut to 70 characters here, while
        # the complete text is written to the file below.
        print('%04d' % counter,
              tweet.id_str,
              tweet.created_at.strftime('%d-%m %H:%M'),
              text[:70])
        csvFile.write(text + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for this code! It's working, but I'm still getting truncated tweets. Am I doing something wrong?
You included "tweet_mode='extended'", so that can't be the issue.
Please help! I would really appreciate it.
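Edit: I figured it out. "tweet.full_text[:70])" in the print call above needs to be changed to "tweet.full_text[:])" to display full tweets. (The slice only shortens the console preview; the text written to tweets.txt is already the full tweet.)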