Skip to content

Instantly share code, notes, and snippets.

@nicobrenner
Created February 28, 2024 05:48
Show Gist options
  • Save nicobrenner/88cc2aaf4fde7cbb119c6ca67fd50bc2 to your computer and use it in GitHub Desktop.
Save nicobrenner/88cc2aaf4fde7cbb119c6ca67fd50bc2 to your computer and use it in GitHub Desktop.
Scrapes "Ask HN: Who is hiring? (February 2024)" page and saves job listings to local sqlite3 db
# This connects to an "Ask HN: Who is hiring?" page
# eg. https://news.ycombinator.com/item?id=39217310&p=1
# Gets every top-level job listing and saves it to a sqlite3 db (job_listings.db)
# every entry has id, listing text, listing html and source url
# (html is saved to preserve link urls, which are usually redacted within the text)
# It handles pagination (follows More link at the bottom of the page)
import requests
from bs4 import BeautifulSoup
import sqlite3
def create_database():
    """Create job_listings.db (if needed) with the job_listings table.

    Columns: id (autoincrement PK), original_text, original_html, source.
    Uses CREATE TABLE IF NOT EXISTS, so it is safe to call repeatedly.
    """
    conn = sqlite3.connect('job_listings.db')
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS job_listings
                        (id INTEGER PRIMARY KEY AUTOINCREMENT,
                         original_text TEXT,
                         original_html TEXT,
                         source TEXT)''')
        conn.commit()
    finally:
        # Close even if the DDL fails so the connection isn't leaked.
        conn.close()
def scrape_hn_jobs(url):
    """Scrape all top-level job listings from an "Ask HN: Who is hiring?" thread.

    Starting at *url*, saves each top-level comment's text and HTML via
    save_to_database(), then follows the 'More' pagination link until the
    last page is reached.

    Raises requests.HTTPError if a page request fails.
    """
    while url:
        # Timeout keeps the scraper from hanging forever on a stalled request;
        # raise_for_status surfaces HTTP errors instead of parsing an error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Every comment row (any nesting depth) carries class "athing comtr".
        for comment in soup.find_all('tr', class_='athing comtr'):
            # Nesting depth is encoded by the indent spacer image's width;
            # width "0" marks a top-level comment, i.e. a job listing.
            ind_cell = comment.find('td', class_='ind')
            img = ind_cell.find('img') if ind_cell else None
            if img and img.get('width') == "0":
                job_description = comment.find('span', class_='commtext c00')
                if job_description:
                    # Save the HTML too: link URLs are usually redacted in .text.
                    save_to_database(job_description.text,
                                     job_description.prettify(),
                                     "Hacker News")
        # Follow the 'More' link at the bottom of the page, if present.
        more_link = soup.find('a', class_='morelink')
        url = 'https://news.ycombinator.com/' + more_link['href'] if more_link else None
def save_to_database(original_text, original_html, source):
    """Insert one job listing row into job_listings.db.

    original_text: the listing's plain text.
    original_html: the listing's raw HTML (kept because link URLs are
        usually redacted from the plain text).
    source: label for where the listing came from, e.g. "Hacker News".
    """
    conn = sqlite3.connect('job_listings.db')
    try:
        # Parameterized query guards against quotes/SQL metacharacters
        # that routinely appear in listing text.
        conn.execute(
            "INSERT INTO job_listings (original_text, original_html, source) VALUES (?, ?, ?)",
            (original_text, original_html, source))
        conn.commit()
    finally:
        # Close even if the insert fails so the connection isn't leaked.
        conn.close()
if __name__ == "__main__":
    # Ensure the local sqlite3 database and its table exist before scraping.
    create_database()
    # First page of "Ask HN: Who is hiring? (February 2024)", posted Feb 1, 2024.
    # Each top-level comment in that thread is one job listing.
    scrape_hn_jobs('https://news.ycombinator.com/item?id=39217310&p=1')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment