Created
July 21, 2021 15:19
-
-
Save websofter/96bcb9901ae65f59a0fd9f3b4470c075 to your computer and use it in GitHub Desktop.
Cross-platform Selenium parsing with Chrome/Chromedriver
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium import webdriver | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from bs4 import BeautifulSoup | |
| #from fake_useragent import UserAgent | |
| import datetime | |
| import os | |
| import platform | |
| from selenium.common.exceptions import NoSuchElementException, TimeoutException | |
| import time | |
| from selenium.webdriver.common.proxy import Proxy, ProxyType | |
# ---------------------------------------------------------------------------
# Resolve the Chromium / Chromedriver binaries for the current OS and build a
# headless browser instance. The binaries must be downloaded manually from the
# chromium-browser-snapshots bucket (per-OS links below) and unpacked under
# ../bin/<os>/.
# ---------------------------------------------------------------------------
plt = platform.system()
driver_path = None
browser_path = None
if plt == "Windows":
    # http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Win_x64/902856/
    driver_path = '../bin/windows/chromedriver_win32/chromedriver.exe'
    browser_path = '../bin/windows/chrome-win/chrome.exe'
    print(f"Your OS {plt}")
elif plt == "Linux":
    # http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/902856/
    driver_path = '../bin/linux/chromedriver_linux64/chromedriver'
    browser_path = '../bin/linux/chrome-linux/chrome'
    print(f"Your OS {plt}")
elif plt == "Darwin":
    # http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Mac/902856/
    # BUG FIX: the mac binaries were previously looked up under ../bin/linux/,
    # which can never contain the Mac_x64 snapshot layout.
    driver_path = '../bin/mac/chromedriver_mac64/chromedriver'
    browser_path = '../bin/mac/chrome-mac/chrome'
    print(f"Your OS {plt}")
else:
    # Fail fast with a clear message instead of crashing later when
    # options.binary_location is assigned None.
    raise RuntimeError(f"Unidentified {plt}")

options = webdriver.ChromeOptions()
# Optional user-agent randomization (requires fake_useragent):
#ua = UserAgent()
#userAgent = ua.random
#options.add_argument(f'user-agent={userAgent}')
#options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--test-type')
#options.add_argument('--single-process')
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
# BUG FIX: add_experimental_option() *replaces* any previous value for the same
# key, so two separate calls silently dropped "enable-logging". Pass both
# switches in a single list instead.
options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
options.binary_location = browser_path
browser = webdriver.Chrome(executable_path=driver_path, options=options)
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.021.D1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 EdgA/42.0.0.2057", "platform":"Windows"})
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", "platform":"Windows"})
browser.set_window_size(1920, 1080)
browser.maximize_window()

# Timestamp recorded once per run and attached to every scraped post.
parsing_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
baseurls = ["https://www.social-searcher.com"]  # Service for parsing
methods = ['google-social-search', 'social-buzz']  # Subservices/plans for parsing
sites = ['fb', 'tw', 'in']  # Social networks for parsing
keywords = ["מפלגת העבודה", "שרת התחבורה", "מיכאלי", "מירב מיכאלי"]  # Keywords
page_to = 10  # Total pages for parsing (max=10)
posts = []  # List for saving parsed posts
def parse_list(browser, social_name, keyword, page_num=1):
    """Scrape one result page of the social-searcher iframe loaded in *browser*
    and append every post found to the module-level ``posts`` list.

    Args:
        browser: Selenium WebDriver already switched into the result iframe.
        social_name: label of the social network being parsed (e.g. 'Facebook').
        keyword: the search keyword these results belong to.
        page_num: 1-based result-page number (informational, for logging only).
    """
    # Print current page info
    print(f"Parsing {social_name} with keyword @{keyword} on page #{page_num}")
    html = browser.page_source
    bs = BeautifulSoup(html, 'lxml')
    cards_html = bs.find('div', {'class': 'gsc-expansionArea'})
    if cards_html is None:
        # Result grid not rendered (empty result set or layout change):
        # previously this crashed with AttributeError on .find_all(None).
        print(f"No results container for {social_name} page #{page_num}")
        return
    # Iterate over the post cards on the current page.
    for card in cards_html.find_all('div', {'class': 'gsc-webResult'}):
        name_tag = card.find('a', {'class': 'gs-title'})
        text_tag = card.find('div', {'class': 'gs-snippet'})
        url_tag = card.find('div', {'class': 'gs-visibleUrl-long'})
        if name_tag is None or text_tag is None or url_tag is None:
            # Malformed / ad card missing a required node — skip it instead of
            # crashing the whole run on a None.text access.
            continue
        # The thumbnail is genuinely optional, so its absence is not an error.
        pic_tag = card.find('img', {'class': 'gs-image'})
        pic = pic_tag['src'] if pic_tag is not None and pic_tag.has_attr('src') else None
        # Creating post info set
        post = {
            'date_of_add': parsing_date,
            'post': url_tag.text,
            'pic': pic,
            'name': name_tag.text,
            'date': None,          # publish date is not exposed by this view
            'text': text_tag.text,
            'keyword': keyword,
            'sentiment': None,     # reserved for a later analysis step
            'social_name': social_name,
        }
        posts.append(post)
    time.sleep(3)  # be polite to the service between page loads
    browser.save_screenshot("./screenshot.png")
# Iterate over every keyword, visit the search page, and scrape each social
# network's result iframe. browser.quit() is guaranteed via try/finally so the
# chromedriver process is not leaked when any step raises.
try:
    for keyword in keywords:
        # Generate the search URL: ?q=<keyword>&fb=on&tw=on&in=on
        params = '?q=' + keyword + ''.join(['&' + str(p) + '=on' for p in sites])
        url = f'{baseurls[0]}/{methods[0]}/' + params
        print(url)
        # Navigate to the generated URL
        browser.get(url)
        time.sleep(2)  # give the page time to render its iframes
        browser.save_screenshot("./screenshot.png")
        # Each social network renders its results inside its own iframe column.
        for social in browser.find_elements_by_class_name('iframe-column'):
            social_name = social.find_element_by_class_name('network-name').text
            social_iframe = social.find_element_by_tag_name('iframe')
            browser.switch_to.frame(social_iframe)
            # The first page is already loaded; later pages require a click.
            parse_list(browser, social_name, keyword, page_num=1)
            for page_num in range(2, page_to + 1):
                try:
                    page_link = browser.find_element_by_xpath(
                        f"//div[@aria-label='Page {page_num}']")
                except NoSuchElementException:
                    # Fewer than page_to pages of results — previously this
                    # crashed the whole run; stop paginating instead.
                    break
                # JS click avoids "element not interactable" inside the iframe.
                browser.execute_script("arguments[0].click();", page_link)
                parse_list(browser, social_name, keyword, page_num)
            # Switch browser content from the iframe back to the main frame.
            browser.switch_to.default_content()
    print(posts)
finally:
    # Always release the browser/chromedriver processes, even on error.
    browser.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment