Created April 15, 2024 17:32
Try several methods of getting a webpage, starting with a basic page download via requests, then falling back to lynx, then to Pyppeteer, which runs a full copy of Chrome.
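The lynx step relies on the external lynx binary being installed and on your PATH. As a minimal sketch (the shutil.which check and warning message are my own addition, not part of the gist), you could verify that up front:

import shutil

# Hypothetical pre-flight check, not part of the gist: warn early if the
# lynx binary the fallback relies on isn't installed.
if shutil.which("lynx") is None:
    print("lynx not found on PATH; the lynx fallback will always fail")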
import asyncio
import subprocess

import requests
from termcolor import cprint
from pyppeteer import launch
from pyppeteer_stealth import stealth

def page_content_valid(page_content):
    # Reject empty responses and pages that are really error or bot-check screens.
    excluded_strings = ["Page Not Found", "Human Verification", "About Lynx"]
    if page_content.strip() == "":
        return False
    return not any(excluded.lower() in page_content.lower() for excluded in excluded_strings)

def basic_pull(url):
    # Fastest option: a plain HTTP GET with requests.
    try:
        response = requests.get(url)
        page_content = response.text
        if page_content_valid(page_content):
            return page_content
        else:
            cprint("basic pull failed", "magenta")
            return False
    except Exception:
        return False

def lynx_pull(url):
    # Fall back to lynx, which fetches the page like a (text-mode) browser.
    try:
        result = subprocess.run(['lynx', '-source', url], stdout=subprocess.PIPE)
        page_content = result.stdout.decode('utf-8')
        if page_content_valid(page_content):
            return page_content
        else:
            cprint("lynx pull failed", "magenta")
            return False
    except Exception:
        return False

def pyppeteer_pull(url):
    # Last resort: drive headless Chrome via Pyppeteer, with stealth patches
    # to look less like an automated browser.
    try:
        async def get_page_raw():
            browser = await launch()
            page = await browser.newPage()
            await stealth(page)
            await page.goto(url)
            page_raw = await page.content()
            await browser.close()
            return page_raw

        try:
            page_content = asyncio.get_event_loop().run_until_complete(get_page_raw())
        except Exception:
            return False

        if page_content_valid(page_content):
            return page_content
        else:
            cprint("pyppeteer pull failed", "magenta")
            return False
    except Exception:
        return False

def get_page_content(url):
    # The order here can be swapped around depending on your preferences,
    # or if one of the methods always fails for your target sites:
    #   - basic pull is fastest
    #   - lynx is more reliable since it's a real (text-mode) browser
    #   - pyppeteer is slowest but most reliable since it's a full version of Chrome
    cprint("trying basic pull", "green")
    output = basic_pull(url)
    if output:
        return output

    cprint("trying lynx pull", "green")
    output = lynx_pull(url)
    if output:
        return output

    cprint("trying pyppeteer pull", "green")
    output = pyppeteer_pull(url)
    if output:
        return output

    return False
url="https://example.com" | |
print(get_page_content(url)) |
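As a usage sketch (the URL list and filename scheme are placeholders of my own, not part of the gist), the cascading fetcher drops easily into a small loop that saves whatever it manages to download, assuming the functions above are in scope:

# Usage sketch, not part of the gist: fetch a handful of URLs and save
# whatever succeeds to disk.
urls = [
    "https://example.com",
    "https://example.org",
]

for url in urls:
    page_content = get_page_content(url)
    if not page_content:
        cprint(f"all methods failed for {url}", "red")
        continue
    # Derive a crude filename from the URL; good enough for a quick test.
    filename = url.replace("https://", "").replace("http://", "").replace("/", "_") + ".html"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(page_content)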