Created
February 28, 2024 05:48
-
-
Save nicobrenner/88cc2aaf4fde7cbb119c6ca67fd50bc2 to your computer and use it in GitHub Desktop.
Scrapes "Ask HN: Who is hiring? (February 2024)" page and saves job listings to local sqlite3 db
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This connects to an "Ask HN: Who is hiring?" page | |
# eg. https://news.ycombinator.com/item?id=39217310&p=1 | |
# Gets every top-level job listing and saves it to a sqlite3 db (job_listings.db) | |
# every entry has id, listing text, listing html and source url | |
# (html is saved to preserve link urls, which are usually redacted within the text) | |
# It handles pagination (follows More link at the bottom of the page) | |
import requests | |
from bs4 import BeautifulSoup | |
import sqlite3 | |
def create_database():
    """Create the local sqlite3 database (job_listings.db) and its table.

    Creates the ``job_listings`` table if it does not already exist, with
    columns: id (autoincrement PK), original_text, original_html, source.
    Idempotent: safe to call on every run.
    """
    conn = sqlite3.connect('job_listings.db')
    try:
        # Using the connection as a context manager commits the
        # transaction on success (it does NOT close the connection).
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS job_listings
                         (id INTEGER PRIMARY KEY AUTOINCREMENT,
                          original_text TEXT,
                          original_html TEXT,
                          source TEXT)''')
    finally:
        # Ensure the connection is released even if the DDL fails.
        conn.close()
def scrape_hn_jobs(url):
    """Scrape top-level job listings from an "Ask HN: Who is hiring?" thread.

    Fetches *url*, extracts every top-level comment (each one is a job
    listing), saves text + HTML to the local db via save_to_database(),
    and follows the "More" link at the bottom of each page until there
    are no more pages.

    Args:
        url: Starting HN item URL, e.g.
             https://news.ycombinator.com/item?id=39217310&p=1

    Raises:
        requests.HTTPError: if HN returns a non-2xx response.
        requests.Timeout: if a request exceeds the timeout.
    """
    while url:
        # Timeout prevents a hung connection from blocking forever;
        # raise_for_status surfaces HTTP errors instead of silently
        # parsing an error page and finding zero listings.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Every comment row on the page, top-level or nested.
        comments = soup.find_all('tr', class_='athing comtr')
        for comment in comments:
            # Identify top-level comments by checking the indentation
            # spacer image: top-level comments have width="0".
            ind_cell = comment.find('td', class_='ind')
            img = ind_cell.find('img') if ind_cell else None
            if img and img.get('width') == "0":
                # This comment is a top-level job posting.
                job_description = comment.find('span', class_='commtext c00')
                if job_description:
                    original_text = job_description.text
                    # Keep the HTML too: link URLs are usually redacted
                    # in the plain text rendering.
                    original_html = job_description.prettify()
                    source = "Hacker News"
                    save_to_database(original_text, original_html, source)
        # Follow the 'More' link for pagination; stop when absent.
        more_link = soup.find('a', class_='morelink')
        if more_link:
            url = 'https://news.ycombinator.com/' + more_link['href']
        else:
            url = None  # No more pages to process
def save_to_database(original_text, original_html, source):
    """Insert one job listing into job_listings.db.

    Args:
        original_text: Plain-text body of the listing.
        original_html: Prettified HTML of the listing (preserves links).
        source: Label for where the listing came from (e.g. "Hacker News").
    """
    conn = sqlite3.connect('job_listings.db')
    try:
        # Context manager commits on success; parameterized query keeps
        # arbitrary listing text from breaking the SQL.
        with conn:
            conn.execute(
                "INSERT INTO job_listings (original_text, original_html, source) VALUES (?, ?, ?)",
                (original_text, original_html, source))
    finally:
        # Close even if the insert fails, so the db file isn't held open.
        conn.close()
if __name__ == "__main__":
    # Ensure the local sqlite db and table exist before scraping.
    create_database()
    # "Ask HN: Who is hiring? (February 2024)", posted Feb 1, 2024.
    # Every top-level comment in the thread is one job listing.
    seed_url = 'https://news.ycombinator.com/item?id=39217310&p=1'
    scrape_hn_jobs(seed_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment