Script that scrapes salary data from PeopleFirst. I hard-code my directory for the results, but you will need to add your own.
#Import dependencies
import requests
import pandas as pd
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup
#Create a function that scrapes PeopleFirst jobs and filters for salaries where the floor is $60k or above,
#or where $60k is at least within the salary range, and puts the results in an Excel file for us.
#Make a list of the pages that you want to scrape, so you don't scrape all of PeopleFirst.
#Include a few more than you reasonably believe will exist, in case the number of job listings ever gets abnormally large.
def get_links(url):
    print('Please wait while I gather your data')
    #Set up the list of paginated URLs programmatically instead of typing each one out by hand
    urls=[url]+[url+'&startrow='+str(n) for n in range(25,401,25)]
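    #With the default search below, this yields the base URL plus '&startrow=25',
    #'&startrow=50', ..., '&startrow=400': 17 pages of 25 results each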
    #Make an empty list of links, where we will add all links on the pages from our urls list
    links=[]
    #Append all links on all pages in our urls list to our links list
    for i in urls:
        website=requests.get(i)
        websiteText=website.text
        soup=BeautifulSoup(websiteText,features="html.parser")
        for link in soup.find_all('a'):
            links.append(link.get('href'))
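    #links now holds every href on the results pages, including navigation and footer
    #links we don't want; the next step filters it down to the actual job postings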
    #Make a string version of the links list and add other empty lists for the things we want to scrape
    stringLinks=[str(i) for i in links]
    jobLinks=[]
    salaries=[]
    title=[]
    #Now narrow our links down to ones that start with "/job/", appending the filtered results to our jobLinks list
    r=re.compile('/job/.*')
    firstScrape=list(filter(r.match,stringLinks))
    for i in firstScrape:
        jobLinks.append('https://jobs.myflorida.com'+i)
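    #Each entry in jobLinks is now a full posting URL, e.g. (hypothetical, not a real posting):
    #'https://jobs.myflorida.com/job/TALLAHASSEE-Some-Position-Title-FL-32399/1234567/'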
    #Put the salary listed in each job link in our salaries list and the job title in the title list.
    #If a posting doesn't list a salary, append a note to the list mentioning the lack of salary
    for i in jobLinks:
        website=requests.get(i)
        websiteText=website.text
        soup=BeautifulSoup(websiteText,features="html.parser")
        element1=soup.select('#content > div > div.jobDisplayShell > div > div.job > span > p:nth-child(5)')
        #select() returns an empty list when nothing matches, so check for that case
        #explicitly to keep salaries the same length as title
        if element1:
            salaries.append(element1[0].text.strip())
        else:
            salaries.append('Could not locate salary')
        element2=soup.select('#job-title')
        title.append(element2[0].text.strip())
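    #The nth-child selector above is tied to the current page layout. If PeopleFirst ever
    #changes its markup, a rougher (hypothetical) fallback would be to search the page for
    #a paragraph mentioning salary, e.g.:
    #salary_p=soup.find('p', string=re.compile('salary', re.I))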
    #Convert your other lists to strings
    stringSalaries=[str(i) for i in salaries]
    stringTitle=[str(i) for i in title]
    #Combine your lists into a pandas dataframe with column titles for title, salary, and links
    df=pd.DataFrame(list(zip(stringTitle,stringSalaries,jobLinks)),columns=['Title','Salary','Links'])
    #Function that we will use to try to make the links in Excel hyperlinked, without having to format in Excel
    def make_clickable(val):
        return '<a href="{}">{}</a>'.format(val,val)
    #Note that df.style.format returns a Styler; the plain df.to_excel call below does not apply it
    df.style.format({'Links': make_clickable})
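    #If you want cells that Excel itself treats as live links, one alternative sketch
    #(assuming the openpyxl engine, which writes strings starting with '=' as formulas)
    #is to build HYPERLINK formulas instead of HTML anchors; hypothetical, untested:
    #df['Links']=df['Links'].apply(lambda u: '=HYPERLINK("{}","{}")'.format(u,u))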
    #Here we finally filter out the lower salaries using our poorly written regex. We need to account
    #for the salary being listed as annual, monthly, hourly, bi-weekly, etc.
    #We also save our results as an Excel file with today's date in the title, in a folder on our desktop
    filt=df['Salary'].str.contains(r'[6-9]\d,\d\d\d|\$[3-4],\d\d\d.*[Bb][iI]|[5-9],\d\d\d.*[Mm][Oo][Nn][Tt][Hh]|\$[3-9]\d.*[hH][oOrR]|Could not locate salary',regex=True, na=False)
    df=df.loc[filt]
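    #Illustrative (hypothetical) strings each alternative is meant to catch:
    #  '$62,000 - $75,000 Annually'  -> [6-9]\d,\d\d\d                       (annual floor of $60k+)
    #  '$3,200.00 Bi-weekly'         -> \$[3-4],\d\d\d.*[Bb][iI]             (bi-weekly pay)
    #  '$5,500.00 per Month'         -> [5-9],\d\d\d.*[Mm][Oo][Nn][Tt][Hh]   (monthly pay)
    #  '$35.00 Hourly'               -> \$[3-9]\d.*[hH][oOrR]                (hourly pay)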
    mask = '%m-%d-%Y'
    dte = dt.now().strftime(mask)
    fname = r'Hello LinkedIn members, enter your_own_path here\Desktop\PeopleFirst\PositionsAndSalaries_{}.xlsx'.format(dte)
    df.to_excel(fname, index=False)
    print(r'Success! Please see your results at your_own_path\Desktop\PeopleFirst')
#Call your function and get your results!
get_links('https://jobs.myflorida.com/search/?q=&locationsearch=tallahassee&searchby=location&d=10')
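#To search somewhere other than Tallahassee, swap the locationsearch value in the query string,
#e.g. (hypothetical): get_links('https://jobs.myflorida.com/search/?q=&locationsearch=orlando&searchby=location&d=10')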