Script that scrapes salary data from PeopleFirst. I hard-code my directory for the results, but you will need to add your own.
#Import dependencies
import requests
import pandas as pd
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup
#Create a function that scrapes PeopleFirst jobs and filters for salaries where the floor is $60k or above,
#or where $60k is at least within the salary range, and puts the results in an Excel file for us.
#Make a list of the pages that you want to scrape, so you don't scrape all of PeopleFirst.
#Include a few more than you reasonably believe will exist, in case the number of job listings ever gets abnormally large.
def get_links(url):
    print('Please wait while I gather your data')
    #Set up the list of paginated URLs programmatically instead of typing each one out by hand
    urls=[url]+[url+'&startrow='+str(n) for n in range(25,401,25)]
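    #With the default search below, this yields the base URL plus '&startrow=25',
    #'&startrow=50', ..., '&startrow=400': 17 pages of 25 results each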
    #Make an empty list of links, where we will add all links on the pages from our urls list
    links=[]
    #Append all links on all pages in our urls list to our links list
    for i in urls:
        website=requests.get(i)
        websiteText=website.text
        soup=BeautifulSoup(websiteText,features="html.parser")
        for link in soup.find_all('a'):
            links.append(link.get('href'))
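    #links now holds every href on the results pages, including navigation and footer
    #links we don't want; the next step filters it down to the actual job postings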
    #Make a string version of the links list and add other empty lists for the things we want to scrape
    stringLinks=[str(i) for i in links]
    jobLinks=[]
    salaries=[]
    title=[]
    #Now narrow our links down to ones that start with "/job/", appending the filtered results to our jobLinks list
    r=re.compile('/job/.*')
    firstScrape=list(filter(r.match,stringLinks))
    for i in firstScrape:
        jobLinks.append('https://jobs.myflorida.com'+i)
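    #Each entry in jobLinks is now a full posting URL, e.g. (hypothetical, not a real posting):
    #'https://jobs.myflorida.com/job/TALLAHASSEE-Some-Position-Title-FL-32399/1234567/'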
    #Put the salary listed in each job link in our salaries list and the job title in the title list.
    #If a posting doesn't list a salary, append a note to the list mentioning the lack of salary
    for i in jobLinks:
        website=requests.get(i)
        websiteText=website.text
        soup=BeautifulSoup(websiteText,features="html.parser")
        element1=soup.select('#content > div > div.jobDisplayShell > div > div.job > span > p:nth-child(5)')
        #select() returns an empty list when nothing matches, so check for that case
        #explicitly to keep salaries the same length as title
        if element1:
            salaries.append(element1[0].text.strip())
        else:
            salaries.append('Could not locate salary')
        element2=soup.select('#job-title')
        title.append(element2[0].text.strip())
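    #The nth-child selector above is tied to the current page layout. If PeopleFirst ever
    #changes its markup, a rougher (hypothetical) fallback would be to search the page for
    #a paragraph mentioning salary, e.g.:
    #salary_p=soup.find('p', string=re.compile('salary', re.I))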
    #Convert your other lists to strings
    stringSalaries=[str(i) for i in salaries]
    stringTitle=[str(i) for i in title]
    #Combine your lists into a pandas dataframe with column titles for title, salary, and links
    df=pd.DataFrame(list(zip(stringTitle,stringSalaries,jobLinks)),columns=['Title','Salary','Links'])
    #Function that we will use to try to make the links in Excel hyperlinked, without having to format in Excel
    def make_clickable(val):
        return '<a href="{}">{}</a>'.format(val,val)
    #Note that df.style.format returns a Styler; the plain df.to_excel call below does not apply it
    df.style.format({'Links': make_clickable})
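    #If you want cells that Excel itself treats as live links, one alternative sketch
    #(assuming the openpyxl engine, which writes strings starting with '=' as formulas)
    #is to build HYPERLINK formulas instead of HTML anchors; hypothetical, untested:
    #df['Links']=df['Links'].apply(lambda u: '=HYPERLINK("{}","{}")'.format(u,u))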
    #Here we finally filter out the lower salaries using our poorly written regex. We need to account
    #for the salary being listed as annual, monthly, hourly, bi-weekly, etc.
    #We also save our results as an Excel file with today's date in the title, in a folder on our desktop
    filt=df['Salary'].str.contains(r'[6-9]\d,\d\d\d|\$[3-4],\d\d\d.*[Bb][iI]|[5-9],\d\d\d.*[Mm][Oo][Nn][Tt][Hh]|\$[3-9]\d.*[hH][oOrR]|Could not locate salary',regex=True, na=False)
    df=df.loc[filt]
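    #Illustrative (hypothetical) strings each alternative is meant to catch:
    #  '$62,000 - $75,000 Annually'  -> [6-9]\d,\d\d\d                       (annual floor of $60k+)
    #  '$3,200.00 Bi-weekly'         -> \$[3-4],\d\d\d.*[Bb][iI]             (bi-weekly pay)
    #  '$5,500.00 per Month'         -> [5-9],\d\d\d.*[Mm][Oo][Nn][Tt][Hh]   (monthly pay)
    #  '$35.00 Hourly'               -> \$[3-9]\d.*[hH][oOrR]                (hourly pay)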
    mask = '%m-%d-%Y'
    dte = dt.now().strftime(mask)
    fname = r'Hello LinkedIn members, enter your_own_path here\Desktop\PeopleFirst\PositionsAndSalaries_{}.xlsx'.format(dte)
    df.to_excel(fname, index=False)
    print(r'Success! Please see your results at your_own_path\Desktop\PeopleFirst')
#Call your function and get your results!
get_links('https://jobs.myflorida.com/search/?q=&locationsearch=tallahassee&searchby=location&d=10')
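#To search somewhere other than Tallahassee, swap the locationsearch value in the query string,
#e.g. (hypothetical): get_links('https://jobs.myflorida.com/search/?q=&locationsearch=orlando&searchby=location&d=10')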