Skip to content

Instantly share code, notes, and snippets.

@scrapehero
Last active December 2, 2021 12:29
Show Gist options
  • Save scrapehero/352286d0f9dee87990cd45c3f979e7cb to your computer and use it in GitHub Desktop.
Save scrapehero/352286d0f9dee87990cd45c3f979e7cb to your computer and use it in GitHub Desktop.
Python 3 code to extract job listings from Glassdoor.com
from lxml import html, etree
import requests
import re
import os
import sys
import unicodecsv as csv
import argparse
import json
def parse(keyword, place):
headers = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, sdch, br',
'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
'referer': 'https://www.glassdoor.com/',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive'
}
location_headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.01',
'accept-encoding': 'gzip, deflate, sdch, br',
'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
'referer': 'https://www.glassdoor.com/',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive'
}
data = {"term": place,
"maxLocationsToReturn": 10}
location_url = "https://www.glassdoor.co.in/findPopularLocationAjax.htm?"
try:
# Getting location id for search location
print("Fetching location details")
location_response = requests.post(location_url, headers=location_headers, data=data).json()
place_id = location_response[0]['locationId']
job_litsting_url = 'https://www.glassdoor.com/Job/jobs.htm'
# Form data to get job results
data = {
'clickSource': 'searchBtn',
'sc.keyword': keyword,
'locT': 'C',
'locId': place_id,
'jobType': ''
}
job_listings = []
if place_id:
response = requests.post(job_litsting_url, headers=headers, data=data)
# extracting data from
# https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=andr&sc.keyword=android+developer&locT=C&locId=1146821&jobType=
parser = html.fromstring(response.text)
# Making absolute url
base_url = "https://www.glassdoor.com"
parser.make_links_absolute(base_url)
XPATH_ALL_JOB = '//li[@class="jl"]'
XPATH_NAME = './/a/text()'
XPATH_JOB_URL = './/a/@href'
XPATH_LOC = './/span[@class="subtle loc"]/text()'
XPATH_COMPANY = './/div[@class="flexbox empLoc"]/div/text()'
XPATH_SALARY = './/span[@class="green small"]/text()'
listings = parser.xpath(XPATH_ALL_JOB)
for job in listings:
raw_job_name = job.xpath(XPATH_NAME)
raw_job_url = job.xpath(XPATH_JOB_URL)
raw_lob_loc = job.xpath(XPATH_LOC)
raw_company = job.xpath(XPATH_COMPANY)
raw_salary = job.xpath(XPATH_SALARY)
# Cleaning data
job_name = ''.join(raw_job_name).strip('–') if raw_job_name else None
job_location = ''.join(raw_lob_loc) if raw_lob_loc else None
raw_state = re.findall(",\s?(.*)\s?", job_location)
state = ''.join(raw_state).strip()
raw_city = job_location.replace(state, '')
city = raw_city.replace(',', '').strip()
company = ''.join(raw_company).replace('–','')
salary = ''.join(raw_salary).strip()
job_url = raw_job_url[0] if raw_job_url else None
jobs = {
"Name": job_name,
"Company": company,
"State": state,
"City": city,
"Salary": salary,
"Location": job_location,
"Url": job_url
}
job_listings.append(jobs)
return job_listings
else:
print("location id not available")
except:
print("Failed to load locations")
if __name__ == "__main__":
''' eg-:python 1934_glassdoor.py "Android developer", "new york" '''
argparser = argparse.ArgumentParser()
argparser.add_argument('keyword', help='job name', type=str)
argparser.add_argument('place', help='job location', type=str)
args = argparser.parse_args()
keyword = args.keyword
place = args.place
print("Fetching job details")
scraped_data = parse(keyword, place)
print("Writing data to output file")
with open('%s-%s-job-results.csv' % (keyword, place), 'wb')as csvfile:
fieldnames = ['Name', 'Company', 'State',
'City', 'Salary', 'Location', 'Url']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,quoting=csv.QUOTE_ALL)
writer.writeheader()
if scraped_data:
for data in scraped_data:
writer.writerow(data)
else:
print("Your search for %s, in %s does not match any jobs"%(keyword,place))
@JoshuaDombal
Copy link

When I run this, there are a max of 31 lines in the CSV file.. How do I make it include more lines?

@teng618
Copy link

teng618 commented Apr 28, 2019

same question as Joshua. The code only download the first page of results with 31 rows of data. how to loop through the other pages as well?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment