Skip to content

Instantly share code, notes, and snippets.

@phaustin
Forked from MBAustin/tournament_scrape.py
Last active November 12, 2016 20:51
Show Gist options
  • Save phaustin/990c5d4c94604a0541e4d35c24f9eef3 to your computer and use it in GitHub Desktop.
Save phaustin/990c5d4c94604a0541e4d35c24f9eef3 to your computer and use it in GitHub Desktop.
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup
# Install a do-nothing Qt message handler: every message Qt hands to it is
# accepted via *args and dropped, which keeps Qt's output off the console.
qInstallMsgHandler(lambda *args: None)
class Render(QWebPage):
    """Load a URL in an off-screen QtWebKit page and wait for it to finish.

    Constructing an instance starts the load and runs the Qt event loop
    until the page emits ``loadFinished``. Once the constructor returns,
    ``self.frame`` refers to the page's main frame, whose ``toHtml()``
    gives the rendered markup.
    """

    def __init__(self, url):
        """Start loading *url* and block until the load has finished.

        Args:
            url: Address of the page to load, as a string.
        """
        # Qt allows only one QApplication per process. Render is created
        # once per scraped page, so reuse the existing application instead
        # of constructing a second one.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        super(Render, self).__init__()
        # Filled in by _loadFinished; defined here so the attribute always
        # exists on a constructed instance.
        self.frame = None
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop.

        Args:
            result: Value delivered by the signal; it is not inspected.
        """
        self.frame = self.mainFrame()
        self.app.quit()
def retreive_tournament(schedule_url, id, name, year, location):
    """Placeholder: accepts tournament details but does nothing yet.

    All arguments are currently ignored and the function returns ``None``.
    """
    return None
def make_soup(in_url):
    """Render the page at *in_url* and return it parsed as BeautifulSoup.

    The page is loaded through ``Render``; the HTML of its main frame is
    then parsed with the ``'html.parser'`` backend.
    """
    return BeautifulSoup(Render(in_url).frame.toHtml(), 'html.parser')
def recursive_scrape(in_url, seen_urls):
    """Collect match-history entries reachable from a schedule page.

    The page at *in_url* is rendered and every anchor on it is examined:

    * an href containing ``'schedule'`` is treated as another schedule
      page and scraped recursively;
    * an href containing ``'matches'`` is treated as a match page, and each
      of its anchors whose href contains ``'matchhistory'`` yields one
      result entry.

    Args:
        in_url: URL of the schedule page to scrape. The path segment right
            after ``'schedule/'`` is reported as the super round and any
            remaining path as the sub round.
        seen_urls: List of URLs already visited. It is updated in place so
            that no page is rendered twice during the whole traversal.

    Returns:
        A list of dicts with the keys ``'path'`` (the part of the
        match-history href after ``'details/'``), ``'super_round'`` and
        ``'sub_round'`` (either may be ``None``). Entries found on nested
        schedule pages are included.
    """
    # Work out the round names from the URL. partition() avoids the
    # IndexError a plain split()[1] raises when 'schedule/' is absent.
    _, has_schedule, tail = in_url.partition('schedule/')
    tail = tail.strip('/')
    if has_schedule and tail:
        super_round, _, remainder = tail.partition('/')
        sub_round = remainder.strip('/') or None
    else:
        super_round, sub_round = None, None

    # Record the page being scraped so a link back to it is not followed.
    if in_url not in seen_urls:
        seen_urls.append(in_url)

    matches = []
    soup = make_soup(in_url)
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href, and pages already handled, are skipped.
        if not href or href in seen_urls:
            continue
        if 'schedule' in href:
            # Track the URL string (not the Tag object) and keep whatever
            # the nested page yields.
            seen_urls.append(href)
            matches.extend(recursive_scrape(href, seen_urls))
        elif 'matches' in href:
            seen_urls.append(href)
            match_soup = make_soup(href)
            for match_link in match_soup.find_all('a'):
                match_href = match_link.get('href')
                if not match_href or 'matchhistory' not in match_href:
                    continue
                # Without a 'details/' part there is no path to report.
                if 'details/' not in match_href:
                    continue
                matches.append({'path': match_href.split('details/')[1],
                                'super_round': super_round,
                                'sub_round': sub_round})
    if not matches:
        print('no matches here')
    return matches
# Script body: scrape the MSI 2016 schedule starting from its default page,
# with an empty visited-URL list, and print every entry that was collected.
_start_url = 'http://www.lolesports.com/en_US/msi/msi_2016/schedule/default'
_found_matches = recursive_scrape(_start_url, [])
for match in _found_matches:
    print(match)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment