Created September 19, 2024 21:24
-
-
Save jamesmurdza/ac57da892a2fa85982cc6b3d76417ad6 to your computer and use it in GitHub Desktop.
CrewAI Hacker News Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
***Code Interpreting... | |
!pip install requests beautifulsoup4 | |
import requests | |
from bs4 import BeautifulSoup | |
# Fetch the Hacker News homepage | |
url = 'https://news.ycombinator.com/' | |
response = requests.get(url) | |
soup = BeautifulSoup(response.text, 'html.parser') | |
# Extract headlines | |
headlines = [] | |
for item in soup.find_all('a', class_='storylink'): | |
headlines.append(item.get_text()) | |
headlines | |
==== | |
***Code Interpreting... | |
import requests
from bs4 import BeautifulSoup

# Download the Hacker News front page and parse the returned markup.
url = 'https://news.ycombinator.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Second attempt: collect the text of every anchor carrying the 'titlelink'
# class instead.
# NOTE(review): later attempts in this log change the lookup again, so this
# class name presumably did not match either -- confirm.
headlines = [
    anchor.get_text()
    for anchor in soup.find_all('a', class_='titlelink')
]
headlines
==== | |
***Code Interpreting... | |
import requests
from bs4 import BeautifulSoup

# Download the Hacker News front page and parse the returned markup.
url = 'https://news.ycombinator.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Diagnostic cell: pretty-print the parsed document and display only a
# leading slice of it, so the page structure can be inspected by eye.
raw_html = soup.prettify()
preview_length = 2000
raw_html[:preview_length]
==== | |
***Code Interpreting... | |
import requests
from bs4 import BeautifulSoup

# Download the Hacker News front page and parse the returned markup.
url = 'https://news.ycombinator.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# For each <tr class="athing"> row, look up the first anchor carrying the
# 'titlelink' class; rows without such an anchor are skipped.
rows = soup.find_all('tr', class_='athing')
candidates = (row.find('a', class_='titlelink') for row in rows)
headlines = [link.get_text() for link in candidates if link is not None]
headlines
==== | |
***Code Interpreting... | |
import requests
from bs4 import BeautifulSoup

# Download the Hacker News front page and parse the returned markup.
url = 'https://news.ycombinator.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Diagnostic cell: same inspection as the earlier preview, but with a larger
# leading slice of the pretty-printed document.
raw_html = soup.prettify()
preview_length = 4000
raw_html[:preview_length]
==== | |
***Code Interpreting... | |
import requests
from bs4 import BeautifulSoup

# Fetch the Hacker News homepage.
url = 'https://news.ycombinator.com/'
# requests applies no timeout by default; set one so a stalled server cannot
# block this cell indefinitely.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page into
# an empty headline list.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Walk every <tr class="athing"> row and take the text of the first <a>
# found inside that row's <span class="titleline">. Rows missing either
# element are skipped.
headlines = []
for item in soup.find_all('tr', class_='athing'):
    titleline = item.find('span', class_='titleline')
    if titleline is None:
        continue
    title = titleline.find('a')
    if title is not None:
        headlines.append(title.get_text())

# Bare expression: displayed as the cell result in a notebook-style run.
headlines
==== | |
['Zb: An Early-Stage Build System', 'Glass Antenna Turns windows into 5G Base Stations', 'MicroPython on Flipper Zero', 'Digital signatures and how to avoid them', 'Show HN: Numscript, a declarative language to model financial transactions', 'FTC: Vast Surveillance of Users by Social Media and Video Streaming Companies', 'Finley (YC W21) is hiring engineers to build cap markets software (US Remote)', 'Show HN: Chili. Rust port of Spice, a low-overhead parallelization library', 'The Cheating Device (ChatGPT on a TI-84) [video]', 'DirectX Adopting SPIR-V as the Interchange Format of the Future', 'GPU Debug Scopes', 'Show HN: A CLI tool I made to self-host any app with two commands on a VPS', 'Diatom Arrangements', 'Forbes Marketplace: The Parasite SEO Company Trying to Devour Its Host', 'OpenNMS: Visualize and monitor everything on your local and distributed networks', 'Drift towards danger and the normalization of deviance (2017)', 'Show HN: Selectable – mobile-friendly Postgres client', 'Biggest ever seen black hole jets; blasting plasma well beyond their own galaxy', 'We accidentally burned through 200GB of proxy bandwidth in 6 hours', 'Show HN: An Elliptic Curve-Based Secure Chat, Written Using Rust and Protobuf', 'JVM statistics cause garbage collection pauses (2015)', 'I Revived 3-Axis CNC Mill G-Code Simulator', 'Real-time Linux is officially part of the kernel', "Quantum researchers cause controlled 'wobble' in the nucleus of a single atom", 'Vapour: A typed superset of the R programming language', 'Seeing Like a Network', 'Leonard Cohen: The Man Who Saw the Angels Fall', 'Pivotal Tracker will shut down', 'Modelling the archetype of a message-passing bug with TLA+ (2022)', 'Show HN: ts-remove-unused – Remove unused code from your TypeScript project'] |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.