Last active
December 31, 2022 17:08
-
-
Save ameerkat/0b218d3552b6be47fa3bccdf43d2001b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import logging
import os
import time
from urllib.parse import quote_plus

from scrapingbee import ScrapingBeeClient
# The ScrapingBee API key is read from the SCRAPING_BEE_API_KEY environment
# variable so that no credential is committed with the script. An unset
# variable yields an empty string.
SCRAPING_BEE_API_KEY = os.environ.get("SCRAPING_BEE_API_KEY", "")
class ScrappingBeeClientWrapper:
    """Retry wrapper around a client object exposing ``get(url, params=...)``.

    ``client_config`` is a mapping with three keys:

    * ``max_retries`` -- total number of attempts made per ``get`` call
      (the first try counts as one).
    * ``retry_delay_ms`` -- pause before the second attempt, in milliseconds.
    * ``retry_delay_growth_factor`` -- multiplier applied to the pause after
      every wait (``1`` keeps the delay constant).
    """

    def __init__(self, client, client_config):
        self.client = client
        self.client_config = client_config

    def get(self, url, params=None):
        """Request ``url`` through the wrapped client, retrying on failure.

        An attempt fails when the client raises or when the returned
        response has a falsy ``ok`` attribute. Between failed attempts the
        method sleeps, multiplying the delay by the configured growth factor
        each time; there is no sleep after the final attempt.

        Args:
            url: Target URL handed to the wrapped client.
            params: Optional dict of request parameters forwarded as
                ``params=``. A new empty dict is used when omitted.

        Returns:
            The first response whose ``ok`` is truthy, otherwise the most
            recent response received once the attempts are exhausted.

        Raises:
            ValueError: If ``max_retries`` is smaller than 1.
            Exception: The last exception raised by the client when no
                attempt produced a response at all.
        """
        max_attempts = self.client_config["max_retries"]
        if max_attempts < 1:
            raise ValueError("max_retries must be at least 1")
        growth_factor = self.client_config["retry_delay_growth_factor"]
        retry_delay = self.client_config["retry_delay_ms"] / 1000.0
        # Build the default per call; a ``{}`` default in the signature would
        # be one dict shared (and mutable) across every call.
        request_params = {} if params is None else params

        response = None
        last_error = None
        for attempt in range(max_attempts):
            try:
                response = self.client.get(url, params=request_params)
                if response.ok:
                    return response
            except Exception as e:
                last_error = e
                logging.error("Woah! That request failed with:")
                logging.error(e)
            if attempt != max_attempts - 1:
                time.sleep(retry_delay)
                retry_delay *= growth_factor

        if response is None:
            # Every attempt raised: surface the real failure instead of
            # tripping over an unbound ``response`` variable.
            raise last_error
        return response
# Module-wide client: the raw ScrapingBee client wrapped with the retry
# settings used for every request in this script.
client = ScrappingBeeClientWrapper(
    ScrapingBeeClient(api_key=SCRAPING_BEE_API_KEY),
    dict(
        max_retries=5,
        retry_delay_ms=2000,
        retry_delay_growth_factor=2,  # set to 1 to have delay be static
    ),
)
search_term = "google.com"
# URL-encode the term so spaces, '&', '#' and similar characters stay inside
# the q= value instead of corrupting the rest of the query string.
target_url = (
    "https://twitter.com/search"
    f"?q={quote_plus(search_term)}&src=typed_query&f=live"
)

# Fetch the live search page and let the extract rules pull one record per
# tweet article found in the rendered DOM.
tweet_response = client.get(target_url, params={
    'render_js': 'True',
    'window_height': 4320,
    'wait': 5000,
    # The JS scenario here is quite tricky as the site only keeps the
    # last X tweets in the DOM. You have to capture the data, then
    # scroll, then capture the next chunk almost tweet by tweet. Our
    # samples could actually be quite small though.
    # 'js_scenario': {
    #     "instructions": [
    #         # scroll and wait and scroll and wait if possible to load
    #         # latest tweets. Figuring out when to stop scrolling
    #         # can be a little tricky. We might want to use frequency
    #         # to estimate based on the sample we get.
    #     ]
    # },
    'extract_rules': {
        "tweets": {
            "selector": "article[data-testid='tweet']",
            "type": "list",
            "output": {
                "handle": "div[data-testid='User-Names'] a[tabindex='-1'] span",
                "permalink": {
                    "selector": "div[data-testid='User-Names'] a[dir='auto']",
                    "output": "@href",
                },
                "time": {
                    "selector": "div[data-testid='User-Names'] time",
                    "output": "@datetime",
                },
                "text": "div[data-testid='tweetText']",
                "replies": "div[data-testid='reply']",
                "retweets": "div[data-testid='retweet']",
                "likes": "div[data-testid='like']",
            },
        },
    },
})
# Stop on a failed request: the body of a non-200 response is not guaranteed
# to be the extract_rules JSON that the code below expects.
if tweet_response.status_code != 200:
    print(f"Failed to get twitter search page ({target_url}) with response code {tweet_response.status_code}")
    print(tweet_response.content)
    raise SystemExit(1)

json_result = json.loads(tweet_response.content)

# This is optional: if you think something is wrong with the code above, for
# example you aren't getting the output you expect, the screenshot taken here
# shows what the page looked like when it was scraped.
# .get() treats a missing "tweets" key the same as an empty list.
if not json_result.get("tweets"):
    print("Failed to find any tweets. Check screenshot to see if page loaded correctly.")
    screenshot_response = client.get(target_url, params={
        'render_js': 'True',  # they've changed it to have some redirect
        'window_height': 4320,
        'timeout': 20000,
        'wait': 5000,
        'screenshot': True,
    })
    if not screenshot_response.ok:
        logging.warning(f"Failed to get a screenshot of the target page {target_url}. {screenshot_response.content}")
    else:
        logging.warning("Writing screenshot to file.")
        target_file = "./twitter.png"
        try:
            with open(target_file, "wb") as f:
                f.write(screenshot_response.content)
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning(f"Wrote screenshot to file {target_file}")
        except OSError as e:
            logging.error(f"Failed to write screenshot due to exception {e}.")
else:
    # do something with the response
    print(json.dumps(json_result, indent=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment