Skip to content

Instantly share code, notes, and snippets.

@hex128
Last active August 26, 2016 20:13
Show Gist options
  • Save hex128/2886c2ded8e922331c7a to your computer and use it in GitHub Desktop.
#!/usr/bin/python2
# -*- coding: utf-8 -*-
from urllib2 import urlopen, HTTPError
from bs4 import BeautifulSoup
from json import dumps
from sys import stdout, exit, stdin
from codecs import getwriter, getreader
from signal import signal, SIGINT
from re import compile
def youtube(username):
    """Fetch realtime channel stats for *username* from the YouTube GData feed.

    Returns a dict with ``videos_uploaded``, ``channel_subscribers`` and
    ``total_video_views`` (all ints).  Propagates HTTPError on a failed
    feed request.
    """
    feed = urlopen("https://gdata.youtube.com/feeds/api/users/" + username).read()
    doc = BeautifulSoup(feed, "lxml")
    uploads = doc.find("gd:feedlink", {
        "rel": "http://gdata.youtube.com/schemas/2007#user.uploads"
    })
    statistics = doc.find("yt:statistics")
    return {
        "videos_uploaded": int(uploads["counthint"]),
        "channel_subscribers": int(statistics["subscribercount"]),
        "total_video_views": int(statistics["totaluploadviews"]),
    }
def main():
    """Read channel URLs from stdin, one per line, and emit JSON stats lines.

    Stops at the first empty line / EOF.  Lines that do not look like
    ``https://www.youtube.com/user/<name>`` are skipped, as are channels
    whose feed request fails with an HTTP error.
    """
    sout = getwriter("utf8")(stdout)
    sin = getreader("utf8")(stdin)
    # BUG FIX: the original pattern was a non-raw string with unescaped
    # dots, so "." matched ANY character and e.g. "wwwXyoutubeXcom" URLs
    # passed validation.  Use a raw string and escape the literal dots.
    regexp = compile(r"^https?://www\.youtube\.com/user/\w+$")
    while True:
        line = sin.readline().strip()
        if not line:
            break
        if regexp.match(line):
            # The username is the final path segment of the channel URL.
            username = line.rsplit("/", 1)[1]
            try:
                sout.write(dumps({"id": line, "realtime": youtube(username)}) + "\n")
            except HTTPError:
                # Best effort: skip channels the API refuses to serve.
                pass
if __name__ == "__main__":
    # Exit quietly on Ctrl+C instead of dumping a KeyboardInterrupt trace.
    def handle_sigint(signum, frame):
        exit(0)

    signal(SIGINT, handle_sigint)
    main()
#!/usr/bin/python2
# -*- coding: utf-8 -*-
from urllib2 import Request, urlopen, HTTPError
from bs4 import BeautifulSoup
from time import mktime, strptime
from json import dumps
from sys import stdout, exit
from codecs import getwriter
from signal import signal, SIGINT
def parse_users(url):
    """Scrape a SocialBlade top-list page and return the channel names on it."""
    # SocialBlade returns 403 if urllib is detected, so we spoof the UA.
    request = Request(url, None, {
        "User-agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    })
    page = BeautifulSoup(urlopen(request).read(), "lxml")
    stats_tables = page.find("div", {"id": "BodyContainer"}).find_all(
        "div", {"class": "TableMonthlyStats"})
    # Every anchor inside a monthly-stats table is one channel name.
    return [anchor.text
            for table in stats_tables
            for anchor in table.find_all("a")]
def unique(source):
    """Return *source*'s items with duplicates dropped, first-seen order kept."""
    seen = set()
    result = []
    for item in source:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result
def youtube_links(username):
    """Collect the external link URLs listed on the channel's About page."""
    page = urlopen("https://www.youtube.com/user/%s/about" % username).read()
    about = BeautifulSoup(page, "lxml")
    column = about.find("div", {"class": "branded-page-v2-col-container"})
    items = column.find_all("li", {"class": "channel-links-item"})
    return [item.find("a")["href"] for item in items]
def youtube(username):
    """Build a channel-profile dict for *username* from the GData feed.

    Propagates HTTPError if the feed (or the About-page scrape in
    ``youtube_links``) fails.
    """
    feed = urlopen("https://gdata.youtube.com/feeds/api/users/" + username).read()
    doc = BeautifulSoup(feed, "lxml")
    channel_url = "https://www.youtube.com/user/" + username
    statistics = doc.find("yt:statistics")
    # Keep only the 19-char "YYYY-MM-DDTHH:MM:SS" prefix of <published>,
    # dropping any fractional-second / zone suffix before parsing.
    joined = int(mktime(strptime(doc.find("published").text[:19],
                                 "%Y-%m-%dT%H:%M:%S")))
    return {
        "id": channel_url,
        "name": doc.find("title").text,
        "channel_subscribers": int(statistics["subscribercount"]),
        "total_video_views": int(statistics["totaluploadviews"]),
        "youtube_url": channel_url,
        "logo": doc.find("media:thumbnail")["url"],
        "joined_at": joined,
        "description": doc.find("content").text,
        "links": youtube_links(username),
    }
def main(country="RU"):
    """Dump stats for SocialBlade's top channels of *country* as JSON lines.

    The country code is parameterized (default "RU" preserves the original
    behavior).  Channels whose GData feed request fails with an HTTP error
    are skipped.
    """
    sout = getwriter("utf8")(stdout)
    # Generalized: build the three top-list URLs from one base instead of
    # repeating the hard-coded country code three times.
    base = "http://socialblade.com/youtube/top/country/" + country
    users = []
    for suffix in ("", "/mostsubscribed", "/mostviewed"):
        users += parse_users(base + suffix)
    for user in unique(users):
        try:
            sout.write(dumps(youtube(user), ensure_ascii=False) + "\n")
        except HTTPError:
            # Best effort: a dead or renamed channel should not abort the run.
            pass
if __name__ == "__main__":
    # Exit quietly on Ctrl+C instead of dumping a KeyboardInterrupt trace.
    def handle_sigint(signum, frame):
        exit(0)

    signal(SIGINT, handle_sigint)
    main()
#!/usr/bin/python2
# -*- coding: utf-8 -*-
from urllib2 import Request, urlopen, HTTPError
from bs4 import BeautifulSoup
from time import mktime, strptime
from json import dumps
from sys import stdout, exit
from codecs import getwriter
from signal import signal, SIGINT
def parse_users(url):
    """Return every channel name listed on a SocialBlade top-list page."""
    # urllib's default User-Agent gets a 403 from SocialBlade; send a
    # browser-like one instead.
    spoofed = Request(url, None, {
        "User-agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    })
    document = BeautifulSoup(urlopen(spoofed).read(), "lxml")
    container = document.find("div", {"id": "BodyContainer"})
    names = []
    for table in container.find_all("div", {"class": "TableMonthlyStats"}):
        for anchor in table.find_all("a"):
            names.append(anchor.text)
    return names
def unique(source):
    """De-duplicate *source*, preserving the order of first occurrence."""
    seen = set()
    out = []
    for element in source:
        if element in seen:
            continue
        seen.add(element)
        out.append(element)
    return out
def youtube_links(username):
    """Scrape the external links advertised on the channel's About page."""
    about_url = "https://www.youtube.com/user/%s/about" % username
    about_page = BeautifulSoup(urlopen(about_url).read(), "lxml")
    wrapper = about_page.find("div", {"class": "branded-page-v2-col-container"})
    hrefs = []
    for entry in wrapper.find_all("li", {"class": "channel-links-item"}):
        hrefs.append(entry.find("a")["href"])
    return hrefs
def youtube(username):
    """Assemble the channel-profile dict from the user's GData feed.

    Propagates HTTPError on a failed feed (or About-page) request.
    """
    profile_url = "https://www.youtube.com/user/" + username
    raw = urlopen("https://gdata.youtube.com/feeds/api/users/" + username).read()
    feed = BeautifulSoup(raw, "lxml")
    stats = feed.find("yt:statistics")
    # Trim <published> to its first 19 chars ("YYYY-MM-DDTHH:MM:SS") so
    # strptime is not tripped up by a fractional-second / zone suffix.
    created = feed.find("published").text[:19]
    return {
        "id": profile_url,
        "name": feed.find("title").text,
        "channel_subscribers": int(stats["subscribercount"]),
        "total_video_views": int(stats["totaluploadviews"]),
        "youtube_url": profile_url,
        "logo": feed.find("media:thumbnail")["url"],
        "joined_at": int(mktime(strptime(created, "%Y-%m-%dT%H:%M:%S"))),
        "description": feed.find("content").text,
        "links": youtube_links(username),
    }
def main(country="TR"):
    """Dump stats for SocialBlade's top channels of *country* as JSON lines.

    The country code is parameterized (default "TR" preserves the original
    behavior).  Channels whose GData feed request fails with an HTTP error
    are skipped.
    """
    sout = getwriter("utf8")(stdout)
    # Generalized: derive the three top-list URLs from one base instead of
    # repeating the hard-coded country code three times.
    base = "http://socialblade.com/youtube/top/country/" + country
    users = []
    for suffix in ("", "/mostsubscribed", "/mostviewed"):
        users += parse_users(base + suffix)
    for user in unique(users):
        try:
            sout.write(dumps(youtube(user), ensure_ascii=False) + "\n")
        except HTTPError:
            # Best effort: a dead or renamed channel should not abort the run.
            pass
if __name__ == "__main__":
    # Exit quietly on Ctrl+C instead of dumping a KeyboardInterrupt trace.
    def handle_sigint(signum, frame):
        exit(0)

    signal(SIGINT, handle_sigint)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment