@Passett
Created June 15, 2021 18:05
Script that scrapes salary data from PeopleFirst. I hard-code my directory for the results, but you will need to add your own.
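(A quick setup note, inferred from the imports below rather than anything pinned in this gist: you will need requests, pandas, and beautifulsoup4 installed, and pandas also needs an Excel writer engine such as openpyxl for the to_excel call at the end.)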
#Import dependencies
import requests
import pandas as pd
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup
#Create a function that scrapes PeopleFirst jobs and filters for salaries where the floor is $60k or above,
#or where $60k is at least in the salary range, and puts the results in an Excel file for us
#Make a list of possible pages that you want to scrape, so you don't scrape all of PeopleFirst.
#Include a few more than you reasonably believe will exist in case the number of job listings ever gets abnormally large
def get_links(url):
    print('Please wait while I gather your data')
    url_2=url+'&startrow=25'
    url_3=url+'&startrow=50'
    url_4=url+'&startrow=75'
    url_5=url+'&startrow=100'
    url_6=url+'&startrow=125'
    url_7=url+'&startrow=150'
    url_8=url+'&startrow=175'
    url_9=url+'&startrow=200'
    url_10=url+'&startrow=225'
    url_11=url+'&startrow=250'
    url_12=url+'&startrow=275'
    url_13=url+'&startrow=300'
    url_14=url+'&startrow=325'
    url_15=url+'&startrow=350'
    url_16=url+'&startrow=375'
    url_17=url+'&startrow=400'
    #Set up a list of your urls from above and make an empty list of links, where we will add all links on the pages from our urls list
    urls=[url,url_2,url_3,url_4,url_5,url_6,url_7,url_8,url_9,url_10,url_11,url_12,url_13,url_14,url_15,url_16,url_17]
    links=[]
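    #(For what it's worth, the same urls list could also be built in one line, assuming the 25-row page size holds:
    #urls=[url]+[url+'&startrow='+str(n) for n in range(25,401,25)]
    #The long form above is kept so individual pages are easy to see and edit)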
    #Append all links on all pages in our urls list to our links list
    for i in urls:
        website=requests.get(i)
        websiteText=website.text
        soup=BeautifulSoup(websiteText,features="html.parser")
        for link in soup.find_all('a'):
            links.append(link.get('href'))
    #Make a string version of the links list and add other necessary empty lists for things we want to scrape
    stringLinks=[str(i) for i in links]
    jobLinks=[]
    salaries=[]
    title=[]
    #Now narrow our links down to ones that contain "job" in the link, appending our filtered results to our list above titled jobLinks
    r=re.compile('/job/.*')
    firstScrape=list(filter(r.match,stringLinks))
    for i in firstScrape:
        jobLinks.append('https://jobs.myflorida.com'+i)
    #Put the salary listed in each job link in our salaries list and the job title in the title list.
    #If it doesn't list a salary, append a note to the list mentioning the lack of salary
    for i in jobLinks:
        website=requests.get(i)
        websiteText=website.text
        soup=BeautifulSoup(websiteText,features="html.parser")
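        #The selector below is an assumption about how PeopleFirst laid out its job pages at the time (the salary line sitting in the fifth <p> of the job description block); if the site layout changes, this selector is the first thing to update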
        element1=soup.select('#content > div > div.jobDisplayShell > div > div.job > span > p:nth-child(5)')
        #If the page has no salary element, record that instead of skipping it, so the salaries and title lists stay the same length
        if element1:
            salaries.append(element1[0].text.strip())
        else:
            salaries.append('Could not locate salary')
        element2=soup.select('#job-title')
        title.append(element2[0].text.strip())
    #Convert your other lists to strings
    stringSalaries=[str(i) for i in salaries]
    stringTitle=[str(i) for i in title]
    #Combine your lists into a pandas dataframe with column titles for title, salary, and links
    df=pd.DataFrame(list(zip(stringTitle,stringSalaries,jobLinks)),columns=['Title','Salary','Links'])
    #Function that we will use to try to make the links in Excel hyperlinked, without having to format in Excel
    def make_clickable(val):
        return '<a href="{}">{}</a>'.format(val,val)
    #Note that df.style.format returns a styled copy and does not change df itself, so the links still come through as plain text in the Excel file
    df.style.format({'Links': make_clickable})
    #Here we finally filter out the lower salaries using our poorly written regex. We need to account for the salary being listed as annual, monthly, hourly, bi-weekly, etc.
    #We also save our results as an Excel file with today's date in the title, in a folder on our desktop
    filt=df['Salary'].str.contains(r'[6-9]\d,\d\d\d|\$[3-4],\d\d\d.*[Bb][iI]|[5-9],\d\d\d.*[Mm][Oo][Nn][Tt][Hh]|\$[3-9]\d.*[hH][oOrR]|Could not locate salary',regex=True, na=False)
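    #For illustration, this pattern keeps rows like '$65,000.00 Annually', '$3,100.00 Bi-weekly', '$5,200.00 Monthly', and '$30.00 Hourly'
    #(made-up examples of the formats the regex is written to match), plus any row where we could not locate a salary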
    df=df.loc[filt]
    mask = '%m-%d-%Y'
    dte = dt.now().strftime(mask)
    fname = r'Hello LinkedIn members, enter your_own_path here\Desktop\PeopleFirst\PositionsAndSalaries_{}.xlsx'.format(dte)
    df.to_excel(fname, index=False)
    print(r'Success! Please see your results at your_own_path\Desktop\PeopleFirst')
#Call your function and get your results!
get_links('https://jobs.myflorida.com/search/?q=&locationsearch=tallahassee&searchby=location&d=10')
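#(The query string above searches by location around Tallahassee; the d parameter is presumably the search radius. Swapping locationsearch and d should point the scrape at a different area, though that is an assumption about PeopleFirst's search URL rather than something documented here.)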