Skip to content

Instantly share code, notes, and snippets.

@mitar
Forked from brbsix/pyqt5_scraper.py
Last active December 23, 2019 16:18
Show Gist options
  • Save mitar/de00adf9aa597a60586acc83ce7a825d to your computer and use it in GitHub Desktop.
Save mitar/de00adf9aa597a60586acc83ce7a825d to your computer and use it in GitHub Desktop.
PyQt5 Scraper (Basic Example)
#!/usr/bin/env python3
# standard imports
import json
import sys
# third-party imports
from pyvirtualdisplay import Display
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Scrape(QWebEngineView):
    """Drive a web view through the court inquiry site and collect result URLs.

    Constructing an instance starts the whole scrape: it creates the
    ``QApplication``, loads ``start_url`` and runs the Qt event loop, so the
    constructor only returns once the loop has been quit. Afterwards the
    visited detail-page URLs are available in ``self.results``.

    Navigation is a small state machine keyed on the URL of the page that
    just finished loading (see ``_loadFinished``).
    """

    # Page that lists the search results (with a pager and one button per hit).
    RESULTS_URL = 'http://jis.36thdistrictcourt.org/ROAWEBINQ/ROASched.aspx'
    # Number of result buttons per results page assumed by the pager arithmetic.
    RESULTS_PER_PAGE = 25

    def __init__(self, start_url, person_name):
        """Start scraping and block until the event loop exits.

        Args:
            start_url: URL of the search form page.
            person_name: Text typed into the name field of the search form.
        """
        self.results = []
        self.start_url = QUrl(start_url)
        self.person_name = person_name
        # The application object has to exist before any widget is created.
        self.app = QApplication(sys.argv)
        super().__init__()
        self.loadFinished.connect(self._loadFinished)
        self.load(self.start_url)
        # Blocks here until _callback() quits the application.
        self.app.exec_()

    def _loadFinished(self, result):
        """Decide the next step from the URL of the page that finished loading.

        Args:
            result: Success flag delivered by the ``loadFinished`` signal
                (currently not inspected).
        """
        current_url = self.url()
        if current_url == self.start_url:
            # On the start page we submit the request.
            # json.dumps() yields a correctly quoted and escaped JavaScript
            # string literal, so names containing quotes, backslashes or
            # newlines can neither break the script nor inject code into it.
            self.page().runJavaScript("""
                document.getElementById('txtDKTNAME').value = %s;
                document.getElementById('btnGO').click();
            """ % (json.dumps(self.person_name),))
        elif current_url == QUrl(self.RESULTS_URL):
            # Page with results. The number of results collected so far is
            # the index of the next one to open; it is split into a pager
            # position and a button position on that page. The script
            # returns true when it clicked something (i.e. a new page load
            # will follow) and false when there is nothing left to visit.
            self.page().runJavaScript("""
                (function () {
                    var index = %(index)d;
                    var button = index %% %(per_page)d;
                    var page = Math.floor(index / %(per_page)d);
                    var buttons = Array.prototype.filter.call(document.getElementsByTagName('input'), function (el) {return !el.name});
                    var pages = document.getElementsByClassName('mypager')[0].querySelector('table').querySelectorAll('td');
                    if (page >= pages.length) {
                        return false;
                    }
                    if (pages[page].querySelector('a')) {
                        pages[page].querySelector('a').click();
                        return true;
                    }
                    if (button < buttons.length) {
                        buttons[button].click();
                        return true;
                    }
                    return false;
                })();
            """ % {
                'index': len(self.results),
                'per_page': self.RESULTS_PER_PAGE,
            }, self._callback)
        else:
            # Any other URL is treated as a detail page: record it and go
            # back to the results page, which triggers the next step.
            found_url = current_url.toString()
            self.results.append(found_url)
            print("Found", len(self.results), found_url)
            self.back()

    def _callback(self, result):
        """Quit the event loop unless the results-page script clicked something.

        Args:
            result: Value returned by the injected script; anything other
                than ``True`` (including a missing value after a script
                error) ends the scrape.
        """
        if result is not True:
            self.app.quit()
# Entry page of the inquiry site (the search form).
url = 'http://jis.36thdistrictcourt.org/ROAWEBINQ/'

# Run inside an invisible virtual display so no real screen is required.
with Display(visible=0, size=(800, 600)):
    # A narrower search would look like: Scrape(url, "O'Connor/Ryan")
    # Constructing the scraper runs it to completion before returning.
    scraper = Scrape(url, "/")
    results = scraper.results

print(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment