Skip to content

Instantly share code, notes, and snippets.

@HariharanUmapathi
Last active September 2, 2024 18:46
Show Gist options
  • Save HariharanUmapathi/91f1d5d5ffd37ba73d777e70eb099a9e to your computer and use it in GitHub Desktop.
Save HariharanUmapathi/91f1d5d5ffd37ba73d777e70eb099a9e to your computer and use it in GitHub Desktop.
LWN Calendar data extraction
import requests
from lxml import html
import re
year = "2024"
month = "09"

# Seconds to wait for lwn.net before giving up; requests.get() has no default
# timeout and would otherwise block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Absolute http:// or https:// URL. re.match() already anchors at the start of
# the string, so no "^" is needed (and no JavaScript-style "/.../" delimiters).
_EXTERNAL_URL = re.compile(r"https?://", re.IGNORECASE)


def is_external_url(href):
    """Return True when *href* is an absolute http:// or https:// URL.

    Site-relative links (e.g. "/Calendar/...") and other schemes such as
    "mailto:" return False.
    """
    return _EXTERNAL_URL.match(href) is not None


def unique_external_links(pairs):
    """Return the distinct external links from *pairs*, in first-seen order.

    Args:
        pairs: iterable of ``(href, text)`` tuples; ``href`` may be ``None``
            or empty for anchors without a usable ``href`` attribute.

    Returns:
        A list of ``(href, text)`` tuples whose ``href`` is an external URL,
        with exact duplicates removed.
    """
    # dict.fromkeys de-duplicates like a set but keeps insertion order, so the
    # output follows the order of the links in the page.
    kept = dict.fromkeys(
        (href, text)
        for href, text in pairs
        if href and is_external_url(href)
    )
    return list(kept)


def main():
    """Fetch the LWN monthly calendar page and print its external links.

    Prints one ``Link: <href>, Text: <text>`` line per unique external link
    when the response status is 200; otherwise prints the HTTP status code.
    """
    response = requests.get(
        f'https://lwn.net/Calendar/Monthly/{year}-{month}/',
        timeout=REQUEST_TIMEOUT,
    )
    if response.status_code != 200:
        print(response.status_code)
        return

    document_root = html.fromstring(response.content)
    # Find all <a> elements (links) inside the table selected by this absolute
    # path. NOTE(review): the path is tied to the page's current layout;
    # confirm it still selects the calendar table if no links are printed.
    links = document_root.xpath('/html/body/div[4]/div[1]/div[2]/div/table//a')

    pairs = ((link.get('href'), link.text_content().strip()) for link in links)
    # Keep only the external URLs from the calendar.
    for href, text in unique_external_links(pairs):
        print(f"Link: {href}, Text: {text}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment