Last active
September 1, 2018 23:52
-
-
Save brett-miller/7bd021a2791f68d5c43f9827ebfc7481 to your computer and use it in GitHub Desktop.
Get top news sites from Alexa
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import re | |
from datetime import timedelta | |
# Matches colon-separated durations such as "3:45", "1:02:03" or ":07":
# an optional "<hours>:" prefix, optional minute digits followed by a
# mandatory colon, then the seconds. The quantifiers are greedy on purpose:
# a lazy `\d+?` at the end of an un-anchored pattern stops after a single
# digit, which would truncate "3:45" to 3 minutes 4 seconds.
regex = re.compile(
    r'((?P<hours>\d+):)?((?P<minutes>\d+)?:)((?P<seconds>\d+))'
)


def parse_duration(time_str):
    """Convert a colon-separated duration string into whole seconds.

    The string is matched from its start against ``regex``; any text that
    follows the seconds field is ignored.

    Args:
        time_str: Duration text such as ``"3:45"`` or ``"1:02:03"``.

    Returns:
        The duration as an ``int`` number of seconds, or ``None`` when
        *time_str* does not start with a recognisable duration.
    """
    match = regex.match(time_str)
    if match is None:
        return None
    # Groups that did not participate in the match are None (e.g. no hours
    # field); leave them out so timedelta falls back to its zero defaults.
    time_params = {
        name: int(value)
        for name, value in match.groupdict().items()
        if value
    }
    return int(timedelta(**time_params).total_seconds())
def get_int(s):
    """Return the integer formed by the decimal digits of *s*, in order.

    Every non-digit character (thousands separators, dots, percent signs,
    whitespace, ...) is discarded, so ``"1,234"`` gives ``1234`` and
    ``"6.30%"`` gives ``630``.

    Args:
        s: Text containing at least one decimal digit.

    Returns:
        The concatenated digits parsed as an ``int``.

    Raises:
        ValueError: If *s* contains no decimal digits at all.
    """
    # str.isdecimal rather than str.isdigit: isdigit also accepts characters
    # such as superscript "²" that int() refuses to parse, which would turn
    # an otherwise valid number into a ValueError.
    return int("".join(ch for ch in s if ch.isdecimal()))
def get_top_news_sites():
    """Scrape the Alexa "News" category page into a list of site records.

    Fetches the category page, selects every ``div.tr.site-listing`` element
    and builds one dict per listing.

    Returns:
        A list of dicts with the keys ``site``, ``description``,
        ``avg_seconds_on_site``, ``daily_page_views``, ``search_traffic``
        and ``backlinks``. The list is empty when the page contains no
        matching listings.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        IndexError: If a listing has no link or no ``.description`` element.
        ValueError: If a listing does not expose exactly four statistics,
            or a numeric statistic contains no digits.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    html = requests.get(
        "https://www.alexa.com/topsites/category/News",
        timeout=30,
    ).text
    soup = BeautifulSoup(html, 'html.parser')

    top_sites = []
    for listing in soup.select("div.tr.site-listing"):
        site = listing.select("a")[0].get_text().strip()
        # Keep only the text in front of the first non-breaking space.
        description_text = listing.select(".description")[0].get_text().strip()
        description = description_text.split(u'\xa0')[0]
        # The four statistics are read positionally from the ".right > p"
        # cells; the unpacking fails loudly if the count differs.
        avg_time_on_site, daily_page_views, search_traffic, backlinks = [
            p.get_text().strip() for p in listing.select(".right > p")
        ]
        top_sites.append({
            "site": site,
            "description": description,
            "avg_seconds_on_site": parse_duration(avg_time_on_site),
            "daily_page_views": get_int(daily_page_views),
            "search_traffic": get_int(search_traffic),
            "backlinks": get_int(backlinks),
        })
    return top_sites
# Runs the scrape when the module is executed or imported and keeps the
# resulting list of site records in a module-level name.
top_news_sites = get_top_news_sites()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment