-
-
Save alinakhay/1d279ff59f82cafc95a1cd7478e0f6e9 to your computer and use it in GitHub Desktop.
Python 3 code to extract job listings from Glassdoor.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html, etree | |
import requests | |
import re | |
import os | |
import sys | |
import unicodecsv as csv | |
import argparse | |
import json | |
def parse(keyword, place): | |
headers = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
'accept-encoding': 'gzip, deflate, sdch, br', | |
'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6', | |
'referer': 'https://www.glassdoor.com/', | |
'upgrade-insecure-requests': '1', | |
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36', | |
'Cache-Control': 'no-cache', | |
'Connection': 'keep-alive' | |
} | |
location_headers = { | |
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.01', | |
'accept-encoding': 'gzip, deflate, sdch, br', | |
'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6', | |
'referer': 'https://www.glassdoor.com/', | |
'upgrade-insecure-requests': '1', | |
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36', | |
'Cache-Control': 'no-cache', | |
'Connection': 'keep-alive' | |
} | |
data = {"term": place, | |
"maxLocationsToReturn": 10} | |
location_url = "https://www.glassdoor.co.in/findPopularLocationAjax.htm?" | |
try: | |
# Getting location id for search location | |
print("Fetching location details") | |
location_response = requests.post(location_url, headers=location_headers, data=data).json() | |
place_id = location_response[0]['locationId'] | |
job_litsting_url = 'https://www.glassdoor.com/Job/jobs.htm' | |
# Form data to get job results | |
data = { | |
'clickSource': 'searchBtn', | |
'sc.keyword': keyword, | |
'locT': 'C', | |
'locId': place_id, | |
'jobType': '' | |
} | |
job_listings = [] | |
if place_id: | |
response = requests.post(job_litsting_url, headers=headers, data=data) | |
# extracting data from | |
# https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=andr&sc.keyword=android+developer&locT=C&locId=1146821&jobType= | |
parser = html.fromstring(response.text) | |
# Making absolute url | |
base_url = "https://www.glassdoor.com" | |
parser.make_links_absolute(base_url) | |
XPATH_ALL_JOB = '//li[@class="jl"]' | |
XPATH_NAME = './/a/text()' | |
XPATH_JOB_URL = './/a/@href' | |
XPATH_LOC = './/span[@class="subtle loc"]/text()' | |
XPATH_COMPANY = './/div[@class="flexbox empLoc"]/div/text()' | |
XPATH_SALARY = './/span[@class="green small"]/text()' | |
listings = parser.xpath(XPATH_ALL_JOB) | |
for job in listings: | |
raw_job_name = job.xpath(XPATH_NAME) | |
raw_job_url = job.xpath(XPATH_JOB_URL) | |
raw_lob_loc = job.xpath(XPATH_LOC) | |
raw_company = job.xpath(XPATH_COMPANY) | |
raw_salary = job.xpath(XPATH_SALARY) | |
# Cleaning data | |
job_name = ''.join(raw_job_name).strip('–') if raw_job_name else None | |
job_location = ''.join(raw_lob_loc) if raw_lob_loc else None | |
raw_state = re.findall(",\s?(.*)\s?", job_location) | |
state = ''.join(raw_state).strip() | |
raw_city = job_location.replace(state, '') | |
city = raw_city.replace(',', '').strip() | |
company = ''.join(raw_company).replace('–','') | |
salary = ''.join(raw_salary).strip() | |
job_url = raw_job_url[0] if raw_job_url else None | |
jobs = { | |
"Name": job_name, | |
"Company": company, | |
"State": state, | |
"City": city, | |
"Salary": salary, | |
"Location": job_location, | |
"Url": job_url | |
} | |
job_listings.append(jobs) | |
return job_listings | |
else: | |
print("location id not available") | |
except: | |
print("Failed to load locations") | |
if __name__ == "__main__": | |
''' eg-:python 1934_glassdoor.py "Android developer", "new york" ''' | |
argparser = argparse.ArgumentParser() | |
argparser.add_argument('keyword', help='job name', type=str) | |
argparser.add_argument('place', help='job location', type=str) | |
args = argparser.parse_args() | |
keyword = args.keyword | |
place = args.place | |
print("Fetching job details") | |
scraped_data = parse(keyword, place) | |
print("Writing data to output file") | |
with open('%s-%s-job-results.csv' % (keyword, place), 'wb')as csvfile: | |
fieldnames = ['Name', 'Company', 'State', | |
'City', 'Salary', 'Location', 'Url'] | |
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,quoting=csv.QUOTE_ALL) | |
writer.writeheader() | |
if scraped_data: | |
for data in scraped_data: | |
writer.writerow(data) | |
else: | |
print("Your search for %s, in %s does not match any jobs"%(keyword,place)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment