# -*- coding: utf-8 -*-
'''
Created on May 27, 2016

@author: abgupta
'''
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
import time, sys, traceback
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup

class Scraper(object):
    '''
    Scrapes job postings from a Kenexa BrassRing careers site: opens the
    search page in Firefox, pages through the result list and appends the
    designation and qualification text of each posting to descriptions.txt.
    '''

    def __init__(self):
        '''Store the target URLs and the element ids used for navigation.'''
        self.url = 'https://sjobs.brassring.com/TGWebHost/home.aspx?partnerid=25667&siteid=5417'
        self.base_job_url = 'https://sjobs.brassring.com/TGWebHost/jobdetails.aspx?'
        self.browser = Firefox()
        self.first_page_search_opening_id = 'srchOpenLink'
        self.second_page_search_btn_id = 'ctl00_MainContent_submit2'
        self.next_link_id = 'yui-pg0-0-next-link'

    def DrawSpinner(self, counter):
        '''Draw a simple console spinner so long runs show progress.'''
        sys.stdout.write('/-\\|'[counter % 4])
        sys.stdout.flush()
        sys.stdout.write('\b')

    def first_page(self, url):
        '''Open the home page and follow the "Search openings" link.'''
        try:
            self.browser.get(url)
            link = self.browser.find_element_by_id(self.first_page_search_opening_id)
            link.click()
            # Wait for the search form on the next page to load.
            WebDriverWait(self.browser, timeout=100).until(
                lambda x: x.find_element_by_id(self.second_page_search_btn_id))
        except Exception as e:
            print 'exception= ', str(e)
            print 'stacktrace= ', traceback.format_exc()
            print 'Line Number= ' + str(sys.exc_info()[2].tb_lineno)

    def click_search_button(self):
        '''Submit the (empty) search form to list all openings.'''
        link = self.browser.find_element_by_id(self.second_page_search_btn_id)
        link.click()
        # Wait for the result table to load.
        WebDriverWait(self.browser, timeout=100).until(
            lambda x: x.find_element_by_class_name('t_full'))

    def click_next_button(self):
        '''Advance the result list to the next page.'''
        link = self.browser.find_element_by_id(self.next_link_id)
        link.click()
        # Wait for the result table to load.
        WebDriverWait(self.browser, timeout=100).until(
            lambda x: x.find_element_by_class_name('t_full'))

    def get_page_source(self):
        '''Append the current page source to myhtml.html and return it.'''
        page_source = self.browser.page_source
        with open('myhtml.html', 'a') as f:
            f.write(page_source.encode('utf-8'))
        return page_source

    def get_job_info(self, new_browser, job_url):
        '''Open a job detail page and append its designation and
        qualification text to descriptions.txt.'''
        try:
            new_browser.get(job_url)
            html = new_browser.page_source
            soup = BeautifulSoup(html, 'html.parser')
            # Find designation.
            data = soup.find('span', attrs={'id': 'Designation'})
            if data:
                with open('descriptions.txt', 'a') as f:
                    f.write(data.text.encode('utf-8') + '\n')
            # Find qualifications.
            data_ql = soup.find('span', attrs={'id': 'Qualification'})
            if data_ql:
                with open('descriptions.txt', 'a') as f:
                    f.write(data_ql.text.encode('utf-8') + '\n')
        except Exception as e:
            print 'exception= ', str(e)
            print 'Line Number= ' + str(sys.exc_info()[2].tb_lineno)

    def get_jobs(self):
        '''Collect the job links on the current result page and scrape each posting.'''
        try:
            h = HTMLParser()
            html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
            soup = BeautifulSoup(html, 'html.parser')
            # Every posting in the result table is an anchor whose id starts with 'popup'.
            data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
            counter = 0
            for a in data:
                if a.has_attr('href'):
                    counter = counter + 1
                    self.DrawSpinner(counter)
                    try:
                        self.get_job_info(self.browser, self.base_job_url + a['href'].split('?')[1])
                    except Exception:
                        continue
            print counter
        except Exception as e:
            print 'exception= ', str(e)
            print 'Line Number= ' + str(sys.exc_info()[2].tb_lineno)

    def main(self):
        self.first_page(self.url)
        self.click_search_button()
        try:
            # 5309 openings, 50 per result page (values hard-coded at the time of writing).
            for i in range(1, int(5309 / 50) + 1):
                self.get_jobs()
                with open('myhtml.html', 'a') as f:
                    f.write(self.browser.page_source.encode('utf-8'))
                # Scraping the postings navigated the browser away from the
                # result list, so reload the search and page forward again.
                self.first_page(self.url)
                self.click_search_button()
                for _ in range(1, i + 1):
                    self.click_next_button()
        except Exception as e:
            print 'exception= ', str(e)
            print 'Line Number= ' + str(sys.exc_info()[2].tb_lineno)


if __name__ == '__main__':
    start_time = time.time()
    sys.stdout.flush()
    sys.stdout.write('\b')
    Scraper().main()
    sys.stdout.flush()
    sys.stdout.write('\b')
    end_time = time.time()
    print 'Processing Time = ', str(end_time - start_time)
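
The script above targets Python 2 and the old Selenium element-lookup methods (find_element_by_id), both of which are no longer maintained. The following is only a rough sketch of how the same navigation steps might look on Python 3 with Selenium 4; the element ids are the ones hard-coded in the gist, and the BrassRing pages may have changed since 2016, so it is untested against the live site.

    # Hypothetical Python 3 / Selenium 4 sketch of the navigation in Scraper.
    from selenium.webdriver import Firefox
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    URL = 'https://sjobs.brassring.com/TGWebHost/home.aspx?partnerid=25667&siteid=5417'

    browser = Firefox()
    try:
        browser.get(URL)
        # 'Search openings' link on the home page.
        browser.find_element(By.ID, 'srchOpenLink').click()
        # Wait for the search form, then submit it to list all openings.
        WebDriverWait(browser, 100).until(
            EC.element_to_be_clickable((By.ID, 'ctl00_MainContent_submit2'))).click()
        # Wait for the result table before reading the page source.
        WebDriverWait(browser, 100).until(
            EC.presence_of_element_located((By.CLASS_NAME, 't_full')))
        html = browser.page_source
    finally:
        browser.quit()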