Last active
September 2, 2024 18:46
-
-
Save HariharanUmapathi/91f1d5d5ffd37ba73d777e70eb099a9e to your computer and use it in GitHub Desktop.
LWN Calendar data extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import html | |
import re | |
# Calendar month to fetch; both values are interpolated into the request URL.
year = "2024"
month = "09"

# Seconds to wait on the HTTP request before giving up, so the script cannot
# hang forever on an unresponsive connection.
REQUEST_TIMEOUT_SECONDS = 30

# Absolute XPath selecting every <a> inside the calendar table.
# NOTE(review): this path depends on the exact page layout and will select
# nothing if the surrounding <div> nesting changes -- confirm against the page.
CALENDAR_LINKS_XPATH = '/html/body/div[4]/div[1]/div[2]/div/table//a'


def is_external_url(href):
    """Return True when *href* is an absolute ``http://`` or ``https://`` URL.

    The scheme comparison is case-insensitive. Site-relative paths
    (``/Articles/...``), other schemes (``mailto:``) and strings that merely
    begin with the letters "http" are rejected.
    """
    return href.lower().startswith(("http://", "https://"))


def external_links(pairs):
    """Return the unique external ``(href, text)`` pairs from *pairs*.

    Args:
        pairs: iterable of ``(href, text)`` tuples; ``href`` may be ``None``
            or empty for anchors without a target.

    Returns:
        A list of the pairs whose ``href`` is an external URL, with
        duplicates removed and first-seen order preserved so the output is
        stable from run to run.
    """
    # dict.fromkeys de-duplicates while keeping insertion order.
    unique_pairs = dict.fromkeys(pair for pair in pairs if pair[0])
    return [pair for pair in unique_pairs if is_external_url(pair[0])]


def main():
    """Fetch the monthly calendar page and print its external links.

    Prints one ``Link: ..., Text: ...`` line per unique external link. When
    the response status is not 200, prints the status code instead.
    """
    url = f'https://lwn.net/Calendar/Monthly/{year}-{month}/'
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        print(response.status_code)
        return

    document_root = html.fromstring(response.content)
    # Find all <a> elements (links) inside the calendar table.
    anchors = document_root.xpath(CALENDAR_LINKS_XPATH)
    pairs = (
        (anchor.get('href'), anchor.text_content().strip())
        for anchor in anchors
    )
    # Only links pointing outside the calendar site are reported.
    for href, text in external_links(pairs):
        print(f"Link: {href}, Text: {text}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment