Skip to content

Instantly share code, notes, and snippets.

@mcchae
Last active June 2, 2021 10:53
Show Gist options
  • Save mcchae/c9323d426aba8fcde3c1b54731f6cfbe to your computer and use it in GitHub Desktop.
Save mcchae/c9323d426aba8fcde3c1b54731f6cfbe to your computer and use it in GitHub Desktop.
Some Python Selenium examples
#!/usr/bin/env python
################################################################################
import sys
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
################################################################################
# Google Maps example: search for SEARCH, open each result in turn and print
# its name, summary and (when present) address, homepage and phone number.
#
# Alternative query: SEARCH = "Hotels in California, CA, USA"
SEARCH = "언주역 부근 호텔"
# Seconds the driver keeps polling for an element before a lookup fails.
TIMEOUT = 5
# Optional sections of the detail pane, as (result key, data-section-id).
DETAIL_SECTIONS = (
    ('address', 'ad'),
    ('homepage', 'ap'),
    ('phone', 'pn0'),
)
driver = None
try:
    ############################################################################
    # Alternative headless driver: webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    driver.set_window_size(1200, 800)
    # The implicit wait is a session-wide setting, not a pause: once set,
    # every find_element call polls for up to TIMEOUT seconds, so it only
    # needs to be configured a single time.
    driver.implicitly_wait(TIMEOUT)
    ############################################################################
    # 1) search
    driver.get("https://www.google.co.kr/maps")
    # Type the query into the search box ...
    search_box = driver.find_element_by_id("searchboxinput")
    search_box.send_keys(SEARCH)
    # ... and press the search button.
    driver.find_element_by_id("searchbox-searchbutton").click()
    ############################################################################
    # 2) get result list
    for ndx in range(100):
        # Re-locate the result list on every pass: the elements found before
        # opening a detail view are not reused after navigating back.
        pane = driver.find_element_by_class_name('widget-pane-content-holder')
        listbox = pane.find_element_by_xpath('.//div/div[@role="listbox"]')
        rd = {}
        try:
            # Element of the ndx-th search result.
            result = listbox.find_element_by_xpath(
                './/div[@data-result-index="%s"]' % ndx)
            lines = result.text.split('\n')
            # First line is the hotel name, the remaining lines are details.
            rd['hotel'] = lines[0]
            rd['info'] = ','.join(lines[1:])
            # Open the detail view of this result.
            result.click()
            # NOTE(review): the original author observed
            # ElementNotVisibleException right after this click (presumably
            # while the map finishes its data binding); if that recurs, add
            # an explicit wait here.
            pane = driver.find_element_by_class_name(
                'widget-pane-content-holder')
            # Each of these sections may be absent from the detail view.
            for key, section_id in DETAIL_SECTIONS:
                try:
                    section = pane.find_element_by_xpath(
                        './/div/div[@data-section-id="%s"]' % section_id)
                    rd[key] = section.text
                except NoSuchElementException:
                    pass
            print(rd)
            # Press "back to search results".
            pane.find_element_by_xpath('.//div/button').click()
        except NoSuchElementException:
            # Asking for an index past the last result raises this error,
            # which is how the end of the list is detected.
            break  # End of list
finally:
    ############################################################################
    # quit
    if driver is not None:
        driver.quit()
#!/usr/bin/env python
################################################################################
import sys
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
################################################################################
# Houzz example: list professionals of kind MENU near LOCATION, open every
# profile, print its details, and follow "Next Page" until there is none.
URL = "https://www.houzz.com/"
MENU = 'professionals/architect'
LOCATION = 'San Jose, CA'
DISTANCE = 10  # one of (10, 25, 50, 100)
# Seconds the driver keeps polling for an element before a lookup fails.
TIMEOUT = 20
# Labelled lines of the profile "about" panel, as (line prefix, result key).
ABOUT_FIELDS = (
    ('Contact:', 'contact'),
    ('Location:', 'location'),
    ('License Number:', 'license'),
    ('Typical Job Costs:', 'cost'),
)
################################################################################
driver = None
try:
    ############################################################################
    # Alternative headless driver: webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    driver.set_window_size(1200, 1000)
    # The implicit wait is a session-wide setting, not a pause, so it is
    # configured once for all later element lookups.
    driver.implicitly_wait(TIMEOUT)
    ############################################################################
    # 1) open the professionals page and fill in the filters
    # `page` records which stage is on screen so the timeout handler below
    # knows whether it has to navigate back.
    page = None
    driver.get(URL + MENU)
    page = 'search'
    if LOCATION:
        location_box = driver.find_element_by_id('proLocationSearch')
        location_box.send_keys(LOCATION)
    if DISTANCE in (10, 25, 50, 100):
        distance_filter = driver.find_element_by_id('proDistanceFilter')
        wanted = 'Within %s miles' % DISTANCE
        for option in distance_filter.find_elements_by_tag_name('option'):
            if option.text == wanted:
                option.click()  # select() in earlier versions of webdriver
                break
    ############################################################################
    # 2) search
    driver.find_element_by_id('proSearchBtn').click()
    page = 'find'
    while True:
        ########################################################################
        # 3) get results list
        for ndx in range(100):
            # Re-locate the list each pass; it is looked up again after every
            # return from a profile page.
            listing = driver.find_element_by_class_name('browseListBody')
            page = 'results'
            rd = {}
            try:
                entry = listing.find_element_by_xpath(
                    './/div[@posid="%s"]' % ndx)
                rd['name'] = entry.text.split('\n')[0]
                entry.find_element_by_class_name('pro-title').click()
                ################################################################
                # click "Click to Call"
                page = 'detail'
                try:
                    driver.find_element_by_class_name(
                        'click-to-call-link').click()
                    contact = driver.find_element_by_class_name(
                        'pro-contact-text')
                    rd['phone'] = contact.text
                except Exception:
                    # Deliberately broad: the phone number is optional, and
                    # any failure to reveal it just leaves it out of rd.
                    pass
                about = driver.find_element_by_class_name(
                    'profile-about-right')
                lastkey = None
                for i, line in enumerate(about.text.split('\n')):
                    if i == 0:
                        # The first line carries the kind of professional.
                        rd['kind'] = line.strip()
                        continue
                    for prefix, key in ABOUT_FIELDS:
                        if line.startswith(prefix):
                            lastkey = key
                            rd[key] = line[len(prefix):].strip()
                            break
                    else:
                        # An unlabelled line continues the previous field.
                        if lastkey:
                            rd[lastkey] += ' %s' % line.strip()
                print(rd)
                driver.back()
            except NoSuchElementException:
                break  # End of list
            except TimeoutException as e:
                sys.stderr.write(
                    'TimeoutException skip for %s: %s\n' % (rd, e))
                if page == 'detail' and rd:
                    # Return to the result list before giving up on this page.
                    driver.back()
                break
        ########################################################################
        # Move on when there is a "Next Page" link, otherwise stop.
        pager = driver.find_element_by_class_name('pagination-wrapper')
        if 'Next Page' not in pager.text:
            break
        driver.find_element_by_link_text('Next Page').click()
finally:
    ############################################################################
    # quit
    # Guarded: if webdriver.Chrome() itself failed, driver is still None and
    # an unguarded quit() would replace the real error with AttributeError.
    if driver is not None:
        driver.quit()
#!/usr/bin/env python
################################################################################
import sys
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
################################################################################
# LinkedIn example: log in, search first-degree connections for SEARCH, open
# every profile in the result pages and print its name, headline and contact
# details.
#
# NOTE(review): LOGIN_ID / LOGIN_PW are placeholders and must be replaced
# before running; the LOGIN_ID value looks like an e-mail-protection
# substitute rather than a real address -- confirm. Do not commit real
# credentials.
LOGIN_ID = '[email protected]'
LOGIN_PW = 'linkedin_password'
SEARCH = 'CEO'
# Seconds the driver keeps polling for an element before a lookup fails.
TIMEOUT = 10
# People search restricted to first-degree connections; later pages append
# '&page=N' to this URL.
SEARCH_URL = ('https://www.linkedin.com/search/results/people/'
              '?facetNetwork=%5B"F"%5D&keywords={search}&'
              'origin=FACETED_SEARCH')
driver = None
try:
    ############################################################################
    # Alternative headless driver: webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    driver.set_window_size(1200, 1000)
    # The implicit wait is a session-wide setting, not a pause, so it is
    # configured once for all later element lookups.
    driver.implicitly_wait(TIMEOUT)
    ############################################################################
    # 1) login : the session stays logged in for the following requests
    driver.get("https://www.linkedin.com/")
    driver.find_element_by_id("login-email").send_keys(LOGIN_ID)
    password_box = driver.find_element_by_id("login-password")
    password_box.send_keys(LOGIN_PW)
    password_box.submit()
    ############################################################################
    # 2) search
    driver.get(SEARCH_URL.format(search=SEARCH))
    ############################################################################
    # 3) list page
    page_num = 1
    while True:
        # Count the results on this page, then visit them one by one.
        results = driver.find_element_by_class_name('results-list')
        result_count = len(results.find_elements_by_xpath(".//li"))
        # Indexed loop on purpose: the items are looked up afresh on every
        # pass, because each pass leaves and re-enters the result page.
        for i in range(result_count):
            on_profile = False
            try:
                # Lower results only render after scrolling down.
                if i >= 5:
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                items = driver.find_element_by_class_name(
                    'results-list').find_elements_by_xpath(".//li")
                item = items[i]
                # Automated crawling can trigger a commercial-use-limit
                # notice inside the list; skip that entry.
                if 'the commercial use limit.' in item.text:
                    continue
                # Open the profile page of this result.
                item.find_elements_by_xpath(".//div/div/div/a")[1].click()
                on_profile = True
                # Press "Show more" to reveal the contact details.
                driver.find_element_by_class_name(
                    'contact-see-more-less').click()
                # Collect the profile details.
                info = {}
                name = driver.find_element_by_class_name(
                    'pv-top-card-section__name')
                info['name'] = name.text.strip()
                job = driver.find_element_by_class_name(
                    'pv-top-card-section__headline')
                info['job'] = job.text.strip()
                rail = driver.find_element_by_class_name(
                    'right-rail__info-container')
                contact = rail.find_element_by_xpath('.//section').\
                    find_element_by_class_name(
                        'pv-profile-section__section-info')
                # The contact text alternates label and value lines; a
                # trailing label without a value is dropped.
                contact_lines = contact.text.split('\n')
                for label, value in zip(contact_lines[0::2],
                                        contact_lines[1::2]):
                    info[label.strip()] = value.strip()
                print(info)
            except Exception as err:
                # Keep crawling after a single bad profile; report on stderr
                # so the printed records on stdout stay clean.
                sys.stderr.write('Error: %s\n' % err)
            finally:
                # Always return to the result list once a profile was opened,
                # otherwise every remaining result would fail as well.
                if on_profile:
                    driver.back()
        try:
            # Leave the outer loop when there is no next page.
            driver.find_element_by_class_name('next-text')
        except NoSuchElementException:
            break
        page_num += 1
        driver.get((SEARCH_URL + '&page={page}').format(
            search=SEARCH, page=page_num))
finally:
    ############################################################################
    # 4) quit
    if driver is not None:
        driver.quit()
# Requirements for RPA
# On macOS, install chromedriver or phantomjs with Homebrew:
# brew install chromedriver
# brew install phantomjs
selenium>=3.8.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment