""" | |
Code modified from https://jessesw.com/Data-Science-Skills/ | |
Jeremy Karnowski August 30, 2015 | |
""" | |
from bs4 import BeautifulSoup # For HTML parsing | |
import urllib2 # Website connections | |
import tldextract # Extracts domain information | |
import re # Regular expressions | |
from time import sleep # To prevent overwhelming the server between connections | |
def cleanIndeedWebsite(website):
    # Check if site is from indeed.com (or formatting will be different)
    redirect = urllib2.urlopen(website)
    redirect_domain = tldextract.extract(redirect.geturl()).domain
    # If from indeed.com, grab the cleaned up job posting
    if redirect_domain == 'indeed':
        site = redirect.read()
        soup_obj = BeautifulSoup(site, 'html.parser')
        # Remove elements whose contents are never part of the posting text
        for item in soup_obj(["script",
                              "style",
                              "head",
                              "noscript",
                              "separator_top"]):
            item.extract()
        div_ids = ["g_nav", "footerWrapper", "indeed_apply", "emailApplyWrapper"]
        for item in div_ids:
            it = soup_obj.find("div", {"id": item})
            if it is not None:  # Not every posting contains every element
                it.extract()
        div_classes = ["result-link-bar-container result-link-bar-viewjob",
                       "tab-container"]
        for item in div_classes:
            it = soup_obj.find("div", {"class": item})
            if it is not None:
                it.extract()
        table_classes = ["lnav"]
        for item in table_classes:
            it = soup_obj.find("table", {"class": item})
            if it is not None:
                it.extract()
        # Render the cleaned soup as a plain ASCII string for saving
        soup_obj = unicode(soup_obj).encode('ascii', 'ignore')
        return soup_obj
    else:
        return
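
# A minimal usage sketch for cleanIndeedWebsite, with a hypothetical job URL of
# the kind indeedSearch collects (Indeed job links contain 'clk'). The function
# returns cleaned ASCII text for postings hosted on indeed.com and None for
# postings that redirect to an external site.
#
#   posting = cleanIndeedWebsite('http://www.indeed.com/rc/clk?jk=0123456789abcdef')
#   if posting:
#       print posting[:200]  # Peek at the first 200 characters
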
def indeedSearch(city=None, state=None, final_job='data scientist', save_dir=None):
    '''
    This function will take a desired city/state and job title and look for all
    new job postings on Indeed.com. It will crawl each of the job postings,
    clean them with cleanIndeedWebsite, and save the resulting text to disk.

    Inputs: The location's city and state and the desired job. City and state
    are optional; if no city/state is input, the function will assume a national
    search (this can take a while!!!). Input the city/state as strings, such as
    indeedSearch('Chicago', 'IL'). Use a two letter abbreviation for the state.
    Searching for a job requires the text to have plus signs between words; for
    example, to search for "data scientist" the search term would be
    "data+scientist". save_dir is the directory where the postings are written.

    Output: Saved files representing the text from the indeed.com job postings
    '''
    # final_job = final_job.replace(' ', '+')  # commented to get effect I want at moment
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
    if city is not None:
        final_city = city.split()
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                           '%2C+', state]  # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']
    final_site = ''.join(final_site_list)  # Merge the html address together into one string
    base_url = 'http://www.indeed.com'
    try:
        html = urllib2.urlopen(final_site).read()  # Open up the front page of our search first
    except:
        print 'That city/state combination did not have any jobs. Exiting . . .'  # In case the city is invalid
        return
    soup = BeautifulSoup(html, 'html.parser')  # Get the html from the first page
    # Now find out how many jobs there were
    num_jobs_area = soup.find(id='searchCount').string.encode('utf-8')  # Extract the total number of jobs found
    # The 'searchCount' element reads like 'Jobs 1 to 10 of 1,234'
    job_numbers = re.findall('\d+', num_jobs_area)  # Extract the total jobs found from the search result
    print job_numbers  # Debug: the numbers parsed from the search count
    if len(job_numbers) > 3:  # The total is 1,000 or more, so the count contains a comma
        total_num_jobs = (int(job_numbers[2]) * 1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2])
    city_title = city
    if city is None:
        city_title = 'Nationwide'
    print 'There were', total_num_jobs, 'jobs found,', city_title  # Display how many jobs were found
    num_pages = total_num_jobs / 10  # How many search result pages we would need to iterate over
    job_descriptions = []  # Store all our descriptions in this list
    # for i in xrange(1, num_pages+1):  # Loop through all of our search result pages
    for i in xrange(1, 100):  # Just loop through the first 100 pages. After this, posts get old and aren't from indeed.com
        print 'Getting page', i
        start_num = str(i * 10)  # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job returns, start collecting the text samples from each
        html_page = urllib2.urlopen(current_page).read()  # Get the page
        page_obj = BeautifulSoup(html_page, 'html.parser')  # Locate all of the job links
        job_link_area = page_obj.find(id='resultsCol')  # The center column on the page where the job postings exist
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')
                    if link.get('href')]  # Get the URLs for the jobs, skipping anchors with no href
        job_URLS = filter(lambda x: 'clk' in x, job_URLS)  # Now get just the job related URLs
        for j in xrange(0, len(job_URLS)):
            try:
                final_description = cleanIndeedWebsite(job_URLS[j])
            except:
                final_description = None
            if final_description:
                # For each job, save the text on our machine
                tmp_file = open(save_dir + city_title + "_" + str(i) + "_" + str(j) + '.txt', 'w')
                tmp_file.write(final_description)
                tmp_file.close()
                job_descriptions.append(final_description)  # Keep the description so we can report the final count
            sleep(1)  # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot!
    print 'Done with collecting the job postings!'
    print 'There were', len(job_descriptions), 'jobs successfully saved.'
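

# A minimal sketch of how this scraper might be driven. The save directory
# below is a hypothetical example and must already exist, since the code does
# not create it; multi-word job titles need '+' between words because the
# replace() call above is commented out.
if __name__ == '__main__':
    indeedSearch(city='San Francisco', state='CA',
                 final_job='data+scientist',
                 save_dir='/tmp/indeed_postings/')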