Skip to content

Instantly share code, notes, and snippets.

@brett-miller
Last active September 1, 2018 23:52
Show Gist options
  • Save brett-miller/7bd021a2791f68d5c43f9827ebfc7481 to your computer and use it in GitHub Desktop.
Save brett-miller/7bd021a2791f68d5c43f9827ebfc7481 to your computer and use it in GitHub Desktop.
Get top news sites from Alexa
import requests
from bs4 import BeautifulSoup
import re
from datetime import timedelta
# Matches clock-style durations "[H:]M:S" anchored at the start of the string.
# The hours and minutes groups end at a literal ":" so their lazy quantifiers
# are harmless; the seconds group is the last thing in the pattern and must be
# greedy, otherwise it would stop after a single digit ("5:45" -> seconds "4").
regex = re.compile(r'((?P<hours>\d+?):)?((?P<minutes>\d+?)?:)((?P<seconds>\d+))')


def parse_duration(time_str):
    """Convert a "[H:]M:S" duration string into a whole number of seconds.

    Args:
        time_str: Text such as "5:45", "1:02:03" or ":07". Matching is
            anchored at the start of the string only, so trailing text after
            the seconds digits is ignored.

    Returns:
        The duration in seconds as an int, or None when ``time_str`` does not
        start with a recognisable duration (for example, it has no colon).
    """
    parts = regex.match(time_str)
    if not parts:
        return None
    # Groups that did not participate in the match are None (or empty) and are
    # left out so timedelta falls back to its default of zero for them.
    time_params = {
        name: int(value)
        for name, value in parts.groupdict().items()
        if value
    }
    return int(timedelta(**time_params).total_seconds())
def get_int(s):
    """Return the integer formed by the digit characters of *s*, in order.

    Every character for which ``isdigit()`` is false (commas, dots, percent
    signs, letters, whitespace, ...) is discarded before conversion, so
    "1,234" yields 1234 and "5.20%" yields 520.

    Raises:
        ValueError: If *s* contains no digit characters at all.
    """
    digits = [ch for ch in s if ch.isdigit()]
    return int("".join(digits))
def get_top_news_sites(timeout=10):
    """Scrape Alexa's "News" top-sites page into a list of per-site dicts.

    Args:
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout the request could block indefinitely on a
            stalled connection.

    Returns:
        A list with one dict per ``div.tr.site-listing`` element, in page
        order, with the keys ``site``, ``description``,
        ``avg_seconds_on_site``, ``daily_page_views``, ``search_traffic`` and
        ``backlinks``.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
        IndexError: If a listing has no ``a`` or ``.description`` element.
        ValueError: If a listing does not contain exactly four
            ``.right > p`` elements, or a numeric cell contains no digits.
    """
    html = requests.get(
        "https://www.alexa.com/topsites/category/News",
        timeout=timeout,
    ).text
    soup = BeautifulSoup(html, 'html.parser')

    top_sites = []
    for listing in soup.select("div.tr.site-listing"):
        site = listing.select("a")[0].get_text().strip()
        # Keep only the text before the first non-breaking space.
        description = listing.select(".description")[0].get_text().strip().split(u'\xa0')[0]
        # The four statistic cells are unpacked positionally, so their order
        # on the page determines which value lands in which field.
        stats = [p.get_text().strip() for p in listing.select(".right > p")]
        avg_time_on_site, daily_page_views, search_traffic, backlinks = stats
        # NOTE(review): get_int keeps digits only, so a decimal or percentage
        # cell such as "5.20%" would be stored as 520 - confirm the page's
        # number formats against what consumers of these fields expect.
        top_sites.append(dict(
            site=site,
            description=description,
            avg_seconds_on_site=parse_duration(avg_time_on_site),
            daily_page_views=get_int(daily_page_views),
            search_traffic=get_int(search_traffic),
            backlinks=get_int(backlinks),
        ))
    return top_sites
# Runs the scrape once when this module is executed or imported (this performs
# the HTTP request) and keeps the resulting list of per-site dicts.
top_news_sites = get_top_news_sites()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment