Last active
May 7, 2024 18:54
-
-
Save tushortz/cba8b25f9d80f584f807b65890f37be5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.chrome.options import DesiredCapabilities | |
from selenium.webdriver.common.proxy import Proxy, ProxyType | |
import time | |
# Chrome options shared by every driver created in this script: quieter
# console logging and no visible browser window.
co = webdriver.ChromeOptions()
for _chrome_arg in ("log-level=3", "--headless"):
    co.add_argument(_chrome_arg)
def get_proxies(co=co):
    """Scrape free-proxy-list.net and return proxies as "ip:port" strings.

    Args:
        co: Chrome options used for the temporary scraping browser.

    Returns:
        A list of "ip:port" strings, one per table row whose last
        space-separated token is "yes" (presumably the HTTPS column --
        confirm against the current page layout).
    """
    # Imported locally so this function does not rely on a file-level import.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome(options=co)
    try:
        driver.get("https://free-proxy-list.net/")
        PROXIES = []
        for row in driver.find_elements(By.CSS_SELECTOR, "tr[role='row']"):
            fields = row.text.split(" ")
            # Need at least an address and a port before building "ip:port".
            if len(fields) >= 2 and fields[-1] == "yes":
                PROXIES.append(fields[0] + ":" + fields[1])
        return PROXIES
    finally:
        # quit() ends the whole session (browser and chromedriver), and runs
        # even when scraping raises, so no headless browser is left behind.
        driver.quit()


# Proxy pool fetched once at start-up; the retry loop consumes it from the end.
ALL_PROXIES = get_proxies()
def proxy_driver(PROXIES, co=co):
    """Return a Chrome driver routed through the last proxy in PROXIES.

    Args:
        PROXIES: Mutable list of "ip:port" strings. When it is empty it is
            refilled in place from get_proxies(), so the caller's list is
            replenished too.
        co: Chrome options applied to the new driver.

    Returns:
        A new webdriver.Chrome instance configured to use PROXIES[-1].

    Raises:
        IndexError: If the list is empty and get_proxies() returns nothing.
    """
    if not PROXIES:
        print("--- Proxies used up (%s)" % len(PROXIES))
        # Extend rather than rebind: rebinding would refill only a local
        # name and leave the caller's pool empty.
        PROXIES.extend(get_proxies())
    pxy = PROXIES[-1]

    prox = Proxy()
    prox.proxy_type = ProxyType.MANUAL
    # The same endpoint is registered for every scheme.
    prox.http_proxy = pxy
    prox.socks_proxy = pxy
    prox.ssl_proxy = pxy

    # DesiredCapabilities.CHROME is a shared class-level dict; work on a copy
    # so the proxy setting does not leak into other drivers.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    prox.add_to_capabilities(capabilities)

    return webdriver.Chrome(options=co, desired_capabilities=capabilities)
# --- YOU ONLY NEED TO CARE FROM THIS LINE ---
# Create the first driver; it uses the last proxy in ALL_PROXIES.
pd = proxy_driver(ALL_PROXIES)

# The code must run in a retry loop so that each failure moves on to a
# different proxy.
running = True
while running:
    try:
        mycodehere()  # placeholder: replace with your own scraping code

        # Terminate the loop once the code has worked properly;
        # `condition_met` is a placeholder you need to replace.
        if condition_met:
            running = False
    except Exception:
        # `except Exception` (not a bare except) keeps Ctrl-C working.
        # Shut down the failed browser so drivers do not pile up.
        try:
            pd.quit()
        except Exception:
            pass  # the browser may already be gone; nothing left to clean up

        # Discard the proxy that just failed and top the pool up if it ran dry.
        if ALL_PROXIES:
            ALL_PROXIES.pop()
        if not ALL_PROXIES:
            ALL_PROXIES.extend(get_proxies())

        # Reassign the driver so the next attempt goes through a new proxy.
        pd = proxy_driver(ALL_PROXIES)
        print("--- Switched proxy to: %s" % ALL_PROXIES[-1])
        time.sleep(1)
Why don't you try it the following way instead:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import requests
from bs4 import BeautifulSoup
# Page that get_content() loads and scrapes a title from.
link = 'https://stackoverflow.com/questions/tagged/web-scraping'
def filter_proxies(timeout=10):
    """Fetch sslproxies.org and return the listed proxies as "ip:port" strings.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout,
            requests can block indefinitely on an unresponsive server.

    Returns:
        A list of "ip:port" strings taken from the first two cells of each
        table row, in page order.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get('https://www.sslproxies.org/', timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    proxies = []
    for row in soup.select("table.table tbody tr"):
        cells = row.select("td")
        # Stop at the first row that lacks both an address and a port cell.
        if len(cells) < 2:
            break
        ip = cells[0].text.strip()
        port = cells[1].text.strip()
        proxies.append(f"{ip}:{port}")
    return proxies
def create_proxy_driver(PROXY):
    """Start a headless Chrome session that sends its traffic through PROXY.

    Args:
        PROXY: Proxy address passed to Chrome's --proxy-server switch.

    Returns:
        The newly created webdriver.Chrome instance.
    """
    chrome_opts = Options()
    for switch in ("--headless", f'--proxy-server={PROXY}'):
        chrome_opts.add_argument(switch)
    return webdriver.Chrome(options=chrome_opts)
def get_content(ALL_PROXIES, driver):
    """Load `link` and return a title's text, rotating proxies until it works.

    Each attempt loads `link` with the current driver and reads the text of
    the first element matching the title selector. If the attempt raises or
    yields an empty title, the driver is closed and a new one is created with
    the next proxy from the pool.

    Args:
        ALL_PROXIES: List of "ip:port" strings, consumed from the end. When
            it runs out, a fresh list is fetched with filter_proxies().
        driver: The driver to use for the first attempt; it is always quit
            by this function.

    Returns:
        The non-empty title text.

    Raises:
        RuntimeError: If the pool is empty and filter_proxies() returns no
            proxies, so there is nothing left to try.
    """
    while True:
        try:
            driver.get(link)
            title = driver.find_element(
                By.CSS_SELECTOR, "h3.s-post-summary--content-title > a"
            ).text
        except Exception:
            # Best-effort retry: any failure with this proxy just means
            # "try the next one".
            title = ""
        finally:
            # The driver is finished either way; never leave a browser open.
            driver.quit()

        if title:
            return title

        # An empty title is treated like a failure so the loop cannot spin
        # forever on the same proxy.
        if not ALL_PROXIES:
            print("Proxies used up (%s)" % len(ALL_PROXIES))
            ALL_PROXIES = filter_proxies()
            if not ALL_PROXIES:
                raise RuntimeError("no proxies available to retry with")

        new_proxy = ALL_PROXIES.pop()
        driver = create_proxy_driver(new_proxy)
        print("proxy being used: %s" % new_proxy)
        time.sleep(1)
if __name__ == '__main__':
    # Build the proxy pool, start a browser on the last proxy in it, then
    # print whatever title get_content() manages to scrape.
    ALL_PROXIES = filter_proxies()
    initial_driver = create_proxy_driver(ALL_PROXIES.pop())
    print(get_content(ALL_PROXIES, initial_driver))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
glad it helped