Skip to content

Instantly share code, notes, and snippets.

@akrami
Created December 21, 2018 00:51
Show Gist options
  • Save akrami/489970a64accfb75e96ce77d375a1893 to your computer and use it in GitHub Desktop.
Save akrami/489970a64accfb75e96ce77d375a1893 to your computer and use it in GitHub Desktop.
scraping stackoverflow jobs
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: alireza
"""
import requests
from bs4 import BeautifulSoup
import mysql.connector
import json
import time
# Base listing URL; the scraping loop appends a page value after ``pg=``.
url = 'https://stackoverflow.com/jobs?sort=p&pg='

# Browser-like User-Agent header passed along with each HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
}

# Parameterised INSERT for one scraped job. The eight ``%s`` placeholders
# are bound by ``cursor.execute`` in this order: title, company, location,
# salary, visa, relocation, remote, tags.
query = (
    "INSERT INTO `jobs`.`jobs` "
    "(`title`, `company`, `location`, `salary`, "
    "`visa`, `relocation`, `remote`, `tags`) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"
)

# Connection to the local ``jobs`` database and the cursor used for inserts.
mydb = mysql.connector.connect(
    host="localhost",
    user="alireza",
    passwd="********",
    database="jobs",
)
mycursor = mydb.cursor()
def _clean_text(node):
    """Return the stripped text of a parsed HTML node, or None if it is missing."""
    return node.get_text().strip() if node is not None else None


# Scrape result pages 1 through 10 and store one database row per job summary.
for i in range(1, 11):
    # Pause five seconds before every request.
    time.sleep(5)
    print("scraping page number " + str(i))

    # Request page ``i``. The previous code always appended '1', so the first
    # page was fetched (and inserted) on every iteration.
    response = requests.get(url + str(i), headers=headers)
    if not response.ok:
        # Do not parse an error page as if it were a listing; move on instead.
        print("skipping page " + str(i) + ": HTTP " + str(response.status_code))
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    for job in soup.find_all("div", {"class": "-job-summary"}):
        title = _clean_text(job.find("h2", {"class": "job-details__spaced"}))

        # Company and location both live inside the "-company" div; guard
        # against the div being absent instead of calling .find() on None.
        company = None
        location = None
        company_box = job.find("div", {"class": "-company"})
        if company_box is not None:
            company = _clean_text(company_box.find("span"))
            location = company_box.find("span", {"class": "fc-black-500"})
            if location is not None:
                # Remove the dash characters from the location text.
                location = location.get_text().replace('-', '').strip()

        salary = job.find("span", {"class": "-salary"})
        if salary is not None:
            # Collapse every run of whitespace into a single space.
            salary = " ".join(salary.get_text().split())

        visa = _clean_text(job.find("span", {"class": "-visa"}))
        relocation = _clean_text(job.find("span", {"class": "-relocation"}))
        remote = _clean_text(job.find("span", {"class": "-remote"}))

        tags = job.find("div", {"class": "-tags"})
        if tags is not None:
            tags = [link.get_text().strip() for link in tags.find_all("a")]

        # Tags are stored as a JSON array; a missing tag box is stored as "null".
        mycursor.execute(
            query,
            (title, company, location, salary, visa, relocation, remote,
             json.dumps(tags)),
        )

    # Commit once per page so rows from finished pages persist even if a
    # later page fails.
    mydb.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment