LinkedIn job scraper using Scrapy
# Small script to scrape job data using the LinkedIn guest API.
# NOTE: It doesn't check for duplicates; that can be improved here, or the data
# can be cleaned up later using job_id, which is unique (see the duplicate-filter
# sketch after the script).
from urllib.parse import urlencode

from scrapy.http import HtmlResponse
import dateutil.relativedelta
import datetime
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "job_spider"

    def parse_date(self, date_str):
        # Convert a relative date string such as "3 days ago" into a dd/mm/yy
        # date by subtracting the offset from the current time.
        years = months = weeks = days = hours = minutes = seconds = 0
        date_list = date_str.split(" ")
        date_count = -int(date_list[0])
        date_ago = date_list[1].lower()
        if date_ago in ["year", "years"]:
            years = date_count
        elif date_ago in ["month", "months"]:
            months = date_count
        elif date_ago in ["week", "weeks"]:
            weeks = date_count
        elif date_ago in ["day", "days"]:
            days = date_count
        elif date_ago in ["hour", "hours"]:
            hours = date_count
        elif date_ago in ["minute", "minutes"]:
            minutes = date_count
        elif date_ago in ["second", "seconds"]:
            seconds = date_count
        else:
            print("[Error]: {0}".format(date_str))
        now = datetime.datetime.now()
        original_date = now + dateutil.relativedelta.relativedelta(
            years=years,
            months=months,
            weeks=weeks,
            days=days,
            hours=hours,
            minutes=minutes,
            seconds=seconds)
        return original_date.strftime("%d/%m/%y")

    def get_requests_url(self, start):
        BASE_URL = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
        PER_PAGE_COUNTER = 25
        # The endpoint returns a maximum of 1000 results per query.
        # "location" can be updated to a country name.
        while start < 1000:
            params = {"keywords": "software engineer", "location": "Earth",
                      "position": "1", "pageNum": "0", "start": start}
            # Uncomment the next 2 lines if you only want recently posted jobs.
            # Good to use when you run your scraper every 2 hours.
            # params["sortBy"] = "R"
            # params["f_TPR"] = "r86400"
            yield BASE_URL + "?" + urlencode(params)
            start += PER_PAGE_COUNTER

    def start_requests(self):
        for url in self.get_requests_url(0):
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        JOB_URL = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
        # Each search result is an <li> whose card carries a data-entity-urn
        # like "urn:li:jobPosting:1234567890"; the job id is the fourth
        # colon-separated field.
        job_links = []
        for card in response.css("li"):
            if card.css("div.base-card"):
                urn = card.css("div.base-card::attr(data-entity-urn)").get()
            else:
                urn = card.css("a.base-card::attr(data-entity-urn)").get()
            if urn:  # skip list items that aren't job cards
                job_links.append(JOB_URL.format(job_id=urn.split(":")[3]))
        yield from response.follow_all(job_links, self.parse_job, dont_filter=True)

    def parse_job(self, response):
        temp_num = response.css("span.num-applicants__caption::text")
        # Re-parse the description in a known encoding and do some light
        # formatting; ideally formatting should be done later, during cleanup.
        temp_body = (response.css("div.description__text")
                     .css("div.show-more-less-html__markup").get()
                     .replace("<br/>", "\n")
                     .replace("<li>", "\n- ")
                     .replace("</li>", ""))
        temp_body = HtmlResponse(url="", body=temp_body, encoding="utf-8")
        # The company name is hard to extract reliably, so fall back through
        # two selectors.
        temp_company_name = response.css("span.topcard__flavor > *::text").get()
        company_name = (temp_company_name.strip() if temp_company_name
                        else response.css("p.unify-apply-page__company-name-location > *::text").get())
        yield {
            "job_id": response.url.split("/")[-1],
            "company_id": response.css("a::attr(href)").get().split("?")[0].split("/")[4],
            "company_name": (company_name.strip() if company_name
                             else response.css("span.topcard__flavor::text")[0].extract().strip()),
            "job_url": response.css("div.top-card-layout__entity-info > a::attr(href)").get().split("?")[0],
            "job_title": response.css("h2.top-card-layout__title::text").get(),
            "job_posting": response.css("span.topcard__flavor--bullet::text").get().strip(),
            "job_date_string": response.css("span.posted-time-ago__text::text").get().strip(),
            "job_date": self.parse_date(response.css("span.posted-time-ago__text::text").get().strip()),
            "job_applicants": int(temp_num.get().strip().split(" ")[0]) if temp_num else -1,
            "job_content": " ".join(temp_body.css("div.show-more-less-html__markup *::text").extract()).strip(),
            "job_metadata": dict(zip(
                [i.get().strip() for i in response.css("h3.description__job-criteria-subheader::text")],
                [i.get().strip() for i in response.css("span.description__job-criteria-text::text")])),
            "created_at": str(datetime.datetime.now()),
        }
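The spider is self-contained, so it can be run without a full Scrapy project via scrapy runspider, a standard Scrapy command (the filename job_spider.py is just an assumed name for this file), with results exported as JSON lines:

    scrapy runspider job_spider.py -o jobs.jl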
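Per the NOTE at the top, the spider doesn't filter duplicates. A minimal sketch of how that could be added with a standard Scrapy item pipeline keyed on the unique job_id; the DuplicateJobPipeline class and its settings entry are illustrative, not part of the original gist:

# pipelines.py (illustrative): drop items whose job_id was already seen
from scrapy.exceptions import DropItem

class DuplicateJobPipeline:
    def __init__(self):
        self.seen_ids = set()  # job_ids seen during this crawl

    def process_item(self, item, spider):
        if item["job_id"] in self.seen_ids:
            raise DropItem("Duplicate job: {0}".format(item["job_id"]))
        self.seen_ids.add(item["job_id"])
        return item

# Enable it in settings.py, e.g.:
# ITEM_PIPELINES = {"pipelines.DuplicateJobPipeline": 300}

Note this only de-duplicates within a single run; de-duplicating across runs would require persisting the seen job_ids somewhere (a file or database).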