Created
February 28, 2024 05:48
-
-
Save nicobrenner/88cc2aaf4fde7cbb119c6ca67fd50bc2 to your computer and use it in GitHub Desktop.
Scrapes "Ask HN: Who is hiring? (February 2024)" page and saves job listings to local sqlite3 db
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This connects to an "Ask HN: Who is hiring?" page | |
# eg. https://news.ycombinator.com/item?id=39217310&p=1 | |
# Gets every top-level job listing and saves it to a sqlite3 db (job_listings.db) | |
# every entry has id, listing text, listing html and source url | |
# (html is saved to preserve link urls, which are usually redacted within the text) | |
# It handles pagination (follows More link at the bottom of the page) | |
import requests | |
from bs4 import BeautifulSoup | |
import sqlite3 | |
def create_database():
    """Create the local sqlite3 database (job_listings.db) and its table.

    Creates the ``job_listings`` table if it does not already exist, with
    columns: id (autoincrement PK), original_text, original_html, source.
    Idempotent: safe to call on every run.
    """
    conn = sqlite3.connect('job_listings.db')
    try:
        # Using the connection as a context manager commits the
        # transaction on success (it does NOT close the connection).
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS job_listings
                         (id INTEGER PRIMARY KEY AUTOINCREMENT,
                          original_text TEXT,
                          original_html TEXT,
                          source TEXT)''')
    finally:
        # Ensure the connection is released even if the DDL fails.
        conn.close()
def scrape_hn_jobs(url):
    """Scrape top-level job listings from an "Ask HN: Who is hiring?" thread.

    Fetches *url*, extracts every top-level comment (each one is a job
    listing), saves text + HTML to the local db via save_to_database(),
    and follows the "More" link at the bottom of each page until there
    are no more pages.

    Args:
        url: Starting HN item URL, e.g.
             https://news.ycombinator.com/item?id=39217310&p=1

    Raises:
        requests.HTTPError: if HN returns a non-2xx response.
        requests.Timeout: if a request exceeds the timeout.
    """
    while url:
        # Timeout prevents a hung connection from blocking forever;
        # raise_for_status surfaces HTTP errors instead of silently
        # parsing an error page and finding zero listings.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Every comment row on the page, top-level or nested.
        comments = soup.find_all('tr', class_='athing comtr')
        for comment in comments:
            # Identify top-level comments by checking the indentation
            # spacer image: top-level comments have width="0".
            ind_cell = comment.find('td', class_='ind')
            img = ind_cell.find('img') if ind_cell else None
            if img and img.get('width') == "0":
                # This comment is a top-level job posting.
                job_description = comment.find('span', class_='commtext c00')
                if job_description:
                    original_text = job_description.text
                    # Keep the HTML too: link URLs are usually redacted
                    # in the plain text rendering.
                    original_html = job_description.prettify()
                    source = "Hacker News"
                    save_to_database(original_text, original_html, source)
        # Follow the 'More' link for pagination; stop when absent.
        more_link = soup.find('a', class_='morelink')
        if more_link:
            url = 'https://news.ycombinator.com/' + more_link['href']
        else:
            url = None  # No more pages to process
def save_to_database(original_text, original_html, source):
    """Insert one job listing into job_listings.db.

    Args:
        original_text: Plain-text body of the listing.
        original_html: Prettified HTML of the listing (preserves links).
        source: Label for where the listing came from (e.g. "Hacker News").
    """
    conn = sqlite3.connect('job_listings.db')
    try:
        # Context manager commits on success; parameterized query keeps
        # arbitrary listing text from breaking the SQL.
        with conn:
            conn.execute(
                "INSERT INTO job_listings (original_text, original_html, source) VALUES (?, ?, ?)",
                (original_text, original_html, source))
    finally:
        # Close even if the insert fails, so the db file isn't held open.
        conn.close()
if __name__ == "__main__":
    # Ensure the local sqlite db and table exist before scraping.
    create_database()
    # "Ask HN: Who is hiring? (February 2024)", posted Feb 1, 2024.
    # Every top-level comment in the thread is one job listing.
    seed_url = 'https://news.ycombinator.com/item?id=39217310&p=1'
    scrape_hn_jobs(seed_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment