Last active
June 2, 2021 10:53
-
-
Save mcchae/c9323d426aba8fcde3c1b54731f6cfbe to your computer and use it in GitHub Desktop.
Some python selenium examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Scrape search results from Google Maps with Selenium.

Types SEARCH into the Google Maps search box, then opens each entry of the
result list in turn and prints one dict per entry: the first text line as
``hotel``, the remaining lines joined as ``info`` and, when the detail pane
shows them, ``address``, ``homepage`` and ``phone``.
"""
################################################################################
import sys
from time import sleep

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException

################################################################################
# Alternative query: "Hotels in California, CA, USA"
SEARCH = "언주역 부근 호텔"
# Seconds every element lookup may wait for its element to appear.
TIMEOUT = 5
# Upper bound on the number of result entries visited.
MAX_RESULTS = 100
# Class of the side pane that holds first the result list, then the details.
PANE_CLASS = 'widget-pane-content-holder'
# (output key, data-section-id) pairs of the optional detail-pane sections.
DETAIL_SECTIONS = (
    ('address', 'ad'),
    ('homepage', 'ap'),
    ('phone', 'pn0'),
)

driver = None
try:
    ############################################################################
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object -- revisit if the minimum Selenium version is raised.
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    driver.set_window_size(1200, 800)
    # implicitly_wait() does not pause; it configures how long each later
    # find_element() call polls for its element, so setting it once suffices.
    driver.implicitly_wait(TIMEOUT)

    ############################################################################
    # 1) search: fill in the search box and press the search button
    driver.get("https://www.google.co.kr/maps")
    driver.find_element(By.ID, "searchboxinput").send_keys(SEARCH)
    driver.find_element(By.ID, "searchbox-searchbutton").click()

    ############################################################################
    # 2) walk the result list
    for ndx in range(MAX_RESULTS):
        # Look the list container up again on every pass: the pane was showing
        # a detail view during the previous iteration.
        pane = driver.find_element(By.CLASS_NAME, PANE_CLASS)
        listbox = pane.find_element(By.XPATH, './/div/div[@role="listbox"]')
        rd = {}
        try:
            # The ndx-th entry of the result list.
            entry = listbox.find_element(
                By.XPATH, './/div[@data-result-index="%s"]' % ndx)
            lines = entry.text.split('\n')
            rd['hotel'] = lines[0]              # first line: the name
            rd['info'] = ','.join(lines[1:])    # remaining lines: summary
            # Open the detail view of this entry.
            entry.click()
            # NOTE(review): the original comment here said a pause of a few
            # seconds was needed because elements fetched right after the
            # click raised ElementNotVisibleException, yet the code only
            # re-set the implicit wait (which is not a pause). If that
            # failure shows up, add an explicit wait at this point.
            pane = driver.find_element(By.CLASS_NAME, PANE_CLASS)
            # Each of these sections may be absent, so a failed lookup just
            # leaves the key out. Only Selenium errors are tolerated.
            for key, section_id in DETAIL_SECTIONS:
                try:
                    section = pane.find_element(
                        By.XPATH,
                        './/div/div[@data-section-id="%s"]' % section_id)
                    rd[key] = section.text
                except WebDriverException:
                    continue
            print(rd)
            # Press the "back to search results" button.
            pane.find_element(By.XPATH, './/div/button').click()
        except NoSuchElementException:
            # Asking for an index past the last result fails the lookup,
            # which marks the end of the list.
            break
finally:
    ############################################################################
    # quit
    if driver is not None:
        driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Collect professional listings from houzz.com with Selenium.

Opens the MENU page, optionally filters by LOCATION and DISTANCE, runs the
search and then walks every result page. For each listing it opens the
profile, reads the phone number (when the "Click to Call" link works) and the
labelled lines of the "about" panel, and prints the collected dict.
"""
################################################################################
import sys

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

################################################################################
URL = "https://www.houzz.com/"
MENU = 'professionals/architect'
LOCATION = 'San Jose, CA'
DISTANCE = 10  # one of (10, 25, 50, 100)
# Seconds every element lookup may wait for its element to appear.
TIMEOUT = 20
# (line prefix, output key) pairs recognised in the profile's "about" panel.
ABOUT_FIELDS = (
    ('Contact:', 'contact'),
    ('Location:', 'location'),
    ('License Number:', 'license'),
    ('Typical Job Costs:', 'cost'),
)


def parse_about(text, record):
    """Fill *record* in place from the text of the profile "about" panel.

    The first line is stored under ``kind``. A line starting with one of the
    ABOUT_FIELDS prefixes starts that field; any other line is appended,
    space-separated, to the most recently started field and is dropped when
    no field has been started yet.
    """
    lastkey = None
    for i, line in enumerate(text.split('\n')):
        if i == 0:
            record['kind'] = line.strip()
            continue
        for prefix, key in ABOUT_FIELDS:
            if line.startswith(prefix):
                lastkey = key
                record[key] = line[len(prefix):].strip()
                break
        else:
            # Continuation line of the previous field.
            if lastkey:
                record[lastkey] += ' %s' % line.strip()


################################################################################
driver = None
try:
    ############################################################################
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object -- revisit if the minimum Selenium version is raised.
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    driver.set_window_size(1200, 1000)
    # implicitly_wait() does not pause; it configures how long each later
    # find_element() call polls for its element, so setting it once suffices.
    driver.implicitly_wait(TIMEOUT)

    ############################################################################
    # 1) open the listing page and fill in the filters
    driver.get(URL + MENU)
    if LOCATION:
        driver.find_element(By.ID, 'proLocationSearch').send_keys(LOCATION)
    if DISTANCE in (10, 25, 50, 100):
        wanted = 'Within %s miles' % DISTANCE
        distance_filter = driver.find_element(By.ID, 'proDistanceFilter')
        for option in distance_filter.find_elements(By.TAG_NAME, 'option'):
            if option.text == wanted:
                option.click()  # select() in earlier versions of webdriver
                break

    ############################################################################
    # 2) search
    driver.find_element(By.ID, 'proSearchBtn').click()

    while True:
        ########################################################################
        # 3) walk the results of the current page
        for ndx in range(100):
            # Looked up again on every pass: the previous iteration navigated
            # to a profile page and back.
            listing = driver.find_element(By.CLASS_NAME, 'browseListBody')
            page = 'results'
            rd = {}
            try:
                entry = listing.find_element(
                    By.XPATH, './/div[@posid="%s"]' % ndx)
                rd['name'] = entry.text.split('\n')[0]
                entry.find_element(By.CLASS_NAME, 'pro-title').click()
                page = 'detail'
                ################################################################
                # Press "Click to Call" to reveal the phone number. This is
                # best effort: any Selenium failure leaves 'phone' unset.
                try:
                    driver.find_element(
                        By.CLASS_NAME, 'click-to-call-link').click()
                    contact = driver.find_element(
                        By.CLASS_NAME, 'pro-contact-text')
                    rd['phone'] = contact.text
                except WebDriverException:
                    pass
                about = driver.find_element(
                    By.CLASS_NAME, 'profile-about-right')
                parse_about(about.text, rd)
                print(rd)
                driver.back()
            except NoSuchElementException:
                break  # End of list
            except TimeoutException as e:
                sys.stderr.write(
                    'TimeoutException skip for %s: %s\n' % (rd, e))
                # Return to the result list if the timeout hit on a profile.
                if page == 'detail' and rd:
                    driver.back()
                # NOTE(review): the nesting of this `break` was lost with the
                # file's indentation; it is treated as unconditional (give up
                # on the current page after a timeout) -- confirm.
                break
        ########################################################################
        # Move on when there is a "Next Page" link, otherwise stop.
        pagination = driver.find_element(By.CLASS_NAME, 'pagination-wrapper')
        if pagination.text.find('Next Page') < 0:
            break
        driver.find_element(By.LINK_TEXT, 'Next Page').click()
finally:
    ############################################################################
    # quit (driver is still None when the browser failed to start)
    if driver is not None:
        driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Print profile details of LinkedIn connections matching a keyword.

Logs in with LOGIN_ID / LOGIN_PW, searches people for SEARCH, then opens
every result on every result page and prints a dict with the profile's name,
headline and the label/value pairs of its contact-info section.
"""
################################################################################
import sys
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

################################################################################
# Placeholder credentials -- replace before running.
LOGIN_ID = '[email protected]'
LOGIN_PW = 'linkedin_password'
SEARCH = 'CEO'
# Seconds every element lookup may wait for its element to appear.
TIMEOUT = 10
# People search; per the original author the facetNetwork filter limits the
# results to first-degree connections.
SEARCH_URL = ('https://www.linkedin.com/search/results/people/'
              '?facetNetwork=%5B"F"%5D&keywords={search}&'
              'origin=FACETED_SEARCH')


def search_url(page=None):
    """Return the search URL for SEARCH, for result page *page* if given.

    The keyword is URL-encoded so that spaces and non-ASCII text survive.
    """
    url = SEARCH_URL.format(search=quote_plus(SEARCH))
    if page is not None:
        url += '&page={page}'.format(page=page)
    return url


def result_items(browser):
    """Return the current ``<li>`` elements of the search result list."""
    results = browser.find_element(By.CLASS_NAME, 'results-list')
    return results.find_elements(By.XPATH, ".//li")


def parse_contact_info(text):
    """Turn alternating label / value lines into a dict.

    Lines at even positions are labels, the line after each is its value;
    a trailing label without a value is ignored.
    """
    lines = [line.strip() for line in text.split('\n')]
    return dict(zip(lines[0::2], lines[1::2]))


driver = None
try:
    ############################################################################
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object -- revisit if the minimum Selenium version is raised.
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    driver.set_window_size(1200, 1000)
    # implicitly_wait() does not pause; it configures how long each later
    # find_element() call polls for its element, so setting it once suffices.
    driver.implicitly_wait(TIMEOUT)

    ############################################################################
    # 1) login: the browser session stays logged in afterwards
    driver.get("https://www.linkedin.com/")
    driver.find_element(By.ID, "login-email").send_keys(LOGIN_ID)
    password = driver.find_element(By.ID, "login-password")
    password.send_keys(LOGIN_PW)
    password.submit()

    ############################################################################
    # 2) search
    driver.get(search_url())

    ############################################################################
    # 3) walk the result pages
    page_num = 1
    while True:
        # Count the results once, then visit them by index: the elements
        # themselves have to be looked up again after every navigation.
        count = len(result_items(driver))
        for i in range(count):
            on_profile = False
            try:
                # Scroll down so that the later entries get rendered.
                if i >= 5:
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                item = result_items(driver)[i]
                # Skip the notice entry that shows up in the list once the
                # commercial use limit has been reached.
                if item.text.find('the commercial use limit.') > 0:
                    continue
                # Open the profile page of this result.
                item.find_elements(By.XPATH, ".//div/div/div/a")[1].click()
                on_profile = True
                # Press "Show more" in the contact section.
                driver.find_element(
                    By.CLASS_NAME, 'contact-see-more-less').click()
                # Collect the details.
                info = {}
                name = driver.find_element(
                    By.CLASS_NAME, 'pv-top-card-section__name')
                info['name'] = name.text.strip()
                job = driver.find_element(
                    By.CLASS_NAME, 'pv-top-card-section__headline')
                info['job'] = job.text.strip()
                rail = driver.find_element(
                    By.CLASS_NAME, 'right-rail__info-container')
                contact = rail.find_element(By.XPATH, './/section').\
                    find_element(By.CLASS_NAME,
                                 'pv-profile-section__section-info')
                info.update(parse_contact_info(contact.text))
                print(info)
            except Exception as err:
                sys.stdout.write('Error: %s\n' % err)
            # Return to the result list whether or not scraping succeeded;
            # otherwise one bad profile would make every remaining lookup on
            # this page fail.
            if on_profile:
                try:
                    driver.back()
                except Exception as err:
                    sys.stdout.write('Error: %s\n' % err)
        # Leave the outer loop when there is no next page.
        try:
            driver.find_element(By.CLASS_NAME, 'next-text')
        except NoSuchElementException:
            break
        page_num += 1
        driver.get(search_url(page_num))
finally:
    ############################################################################
    # 4) quit
    if driver is not None:
        driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requirements for RPA
# On macOS, install chromedriver or phantomjs with Homebrew:
# brew install chromedriver
# brew install phantomjs
selenium>=3.8.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment