A Python script, deployable as a Google Cloud Function, that scrapes tweets for a set of keywords and saves the results as a CSV in a GCS bucket.
import csv
import datetime
import json
import logging
import os

from google.cloud import storage
from twitter_scraper import get_tweets

log = logging.getLogger()

# Configuration comes from environment variables: the target GCS bucket name
# and a writable scratch directory (on Cloud Functions, typically /tmp).
bucket_name = os.getenv("bucket")
data_path = os.getenv("data_path")

local_scraped_file = data_path + "/data.csv"
local_keywords_file = data_path + "/keywords.json"
remote_keywords_file = "keywords/keywords.json"
remote_scraped_data_path = "tweets/"
# Column order for the output CSV.
fields = [
    "tweetId",
    "word",
    "text",
    "isRetweet",
    "replies",
    "retweets",
    "likes",
    "time",
]

client = storage.Client()
bucket = client.bucket(bucket_name)
def download_and_get_keywords():
    """Download keywords.json from the bucket and return its keyword list."""
    blob = bucket.get_blob(remote_keywords_file)
    with open(local_keywords_file, "wb") as file_:
        blob.download_to_file(file_)
    with open(local_keywords_file) as file_:
        keywords_dict = json.load(file_)
    return keywords_dict["keywords"]
def scraper():
    """Scrape a few pages of tweets per keyword and write them to a local CSV."""
    keywords = download_and_get_keywords()
    # Open the file once per run (rather than once per row) and write a
    # header so the columns in `fields` are labeled in the output.
    with open(local_scraped_file, "w", newline="") as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fields, delimiter=",")
        csvwriter.writeheader()
        for word in keywords:
            log.info("Starting scrape for %s", word)
            try:
                for tweet in get_tweets(word, pages=3):
                    csvwriter.writerow(
                        {
                            "tweetId": tweet["tweetId"],
                            "word": word,
                            "text": tweet["text"],
                            "isRetweet": tweet["isRetweet"],
                            "replies": tweet["replies"],
                            "retweets": tweet["retweets"],
                            "likes": tweet["likes"],
                            "time": tweet["time"],
                        }
                    )
            except Exception:
                # A failed keyword should not abort the whole run.
                log.exception("Scrape failed for %s", word)
                continue
def upload_to_bucket():
    """Upload the scraped CSV to the bucket under a timestamped name."""
    now = datetime.datetime.now().strftime("%m-%d-%Y-%H:%M:%S")
    blob = bucket.blob(remote_scraped_data_path + now + ".csv")
    with open(local_scraped_file, "rb") as file_:
        blob.upload_from_file(file_)
    os.remove(local_scraped_file)
    log.info("Scraped file uploaded to bucket")


def run_processes(request):
    """HTTP entry point for the Cloud Function; `request` is unused."""
    scraper()
    upload_to_bucket()
    return "OK"  # HTTP-triggered functions must return a response
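
The script expects a keywords/keywords.json object to already exist in the bucket, holding a top-level "keywords" list; that shape is what download_and_get_keywords parses. A minimal seeding sketch, where the bucket name and keyword values are placeholders:

import json

from google.cloud import storage

# Placeholder bucket name and keywords; the object path and the
# {"keywords": [...]} shape are what the function above expects.
client = storage.Client()
bucket = client.bucket("my-scraper-bucket")
bucket.blob("keywords/keywords.json").upload_from_string(
    json.dumps({"keywords": ["python", "serverless"]}),
    content_type="application/json",
)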
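For a quick local smoke test before deploying, set the two environment variables the script reads and call the entry point directly. A sketch under the assumption that the script is saved as main.py and that GCP credentials are available locally; the bucket name and scratch directory are placeholders:

import os

# Must be set before importing main, since main reads them at import time.
os.environ["bucket"] = "my-scraper-bucket"
os.environ["data_path"] = "/tmp"

import main  # assumed filename for the script above

main.run_processes(None)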