Last active
December 14, 2018 09:59
-
-
Save initbrain/6864902 to your computer and use it in GitHub Desktop.
RIPE Database - Full Text Search Scraper (using QtWebKit.QWebView from PySide)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import json
import re
import sys

from PySide import QtCore, QtGui, QtWebKit
class Browser(QtGui.QMainWindow):
    """Main window that scrapes the RIPE Database full-text search.

    The window embeds a ``QtWebKit.QWebView``.  A search loads the RIPE
    full-text search page, fills in and submits its form through
    JavaScript, then parses each result page with a regular expression
    and clicks the pagination element until it is no longer present.

    ``currentPage`` doubles as the state flag: 0 means the search form has
    not been submitted yet (or no search is running); any other value is
    the number of the result page currently being loaded/parsed.
    """

    # One search hit in the page source: (link, title, matched text).
    RESULT_REGEX = re.compile(
        r'<a href="([\S ]+?)" target="_blank" style="color:blue">'
        r'(.+?)\s+?<br>\s+?</a>\s+?'
        r'<small><small>([\S\s]+?)</small></small>',
        re.MULTILINE)

    # DOM id of the pagination element clicked to reach the next page.
    NEXT_PAGE_ID = ('resultsView:paginationViewTop:paginationForm:'
                    'main:after:repeat:0:byIndex')

    def __init__(self):
        """Initialize the browser GUI and connect the events."""
        super(Browser, self).__init__()
        self.initUI()

    def initUI(self):
        """Build the widgets, wire the signals and reset the scraper state."""
        self.resize(800, 600)
        self.center()
        self.setWindowTitle('RIPE Database - Full Text Search Scraper')
        # self.setWindowIcon(QtGui.QIcon('icone.png'))  # TODO

        # Central widget -> frame -> (search row, web view)
        self.centralWidget = QtGui.QWidget(self)
        self.mainLayout = QtGui.QHBoxLayout(self.centralWidget)
        self.mainLayout.setSpacing(0)
        # setMargin(10) for PyQT4
        self.mainLayout.setContentsMargins(10, 10, 10, 10)

        self.frame = QtGui.QFrame(self.centralWidget)
        self.gridLayout = QtGui.QVBoxLayout(self.frame)
        self.gridLayout.setContentsMargins(0, 0, 0, 0)
        self.gridLayout.setSpacing(10)

        # Search row: text field + "Search" button
        self.horizontalLayout = QtGui.QHBoxLayout()
        self.tb_url = QtGui.QLineEdit(self.frame)
        self.tb_go = QtGui.QPushButton(self.frame)
        self.tb_go.setText("Search")
        self.tb_url.setText("Type your search here...")
        self.tb_url.setToolTip('RIPE Database text search')
        self.horizontalLayout.addWidget(self.tb_url)
        self.horizontalLayout.addWidget(self.tb_go)
        self.gridLayout.addLayout(self.horizontalLayout)

        self.html = QtWebKit.QWebView()
        self.gridLayout.addWidget(self.html)
        self.mainLayout.addWidget(self.frame)
        self.setCentralWidget(self.centralWidget)

        # Both Return in the text field and the button start a search.
        self.connect(self.tb_url, QtCore.SIGNAL("returnPressed()"), self.browse)
        self.connect(self.tb_go, QtCore.SIGNAL("clicked()"), self.browse)
        self.connect(self.html, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
        self.connect(self.html, QtCore.SIGNAL('loadStarted()'), self.loadStarted)

        # Scraper state
        self.ripe_url = "https://apps.db.ripe.net/search/full-text.html"
        self.currentPage = 0
        self.result = []
        # Substrings stripped from every extracted field.
        self.eraseList = [r'(\/b)',
                          '<b>',
                          '</b>',
                          '\n']
        # Substitutions applied to extracted links only: toHtml() serializes
        # "&" inside href attributes as the entity "&amp;", and spaces are
        # percent-encoded so the link stays a single token.
        self.replaceDict = {'&amp;': '&',
                            ' ': '%20'}

        # Status bar
        self.statusBar().showMessage('Ready')
        self.statusProgressBar = QtGui.QProgressBar()
        self.statusBar().addPermanentWidget(self.statusProgressBar)
        self.statusProgressBar.setGeometry(30, 40, 200, 25)
        # A (0, 0) range makes the bar a "busy" indicator.
        self.statusProgressBar.setRange(0, 0)
        self.statusProgressBar.setVisible(False)

    def center(self):
        """Move the window to the center of the available desktop area."""
        qr = self.frameGeometry()
        cp = QtGui.QDesktopWidget().availableGeometry().center()
        qr.moveCenter(cp)
        self.move(qr.topLeft())

    def browse(self):
        """Load the RIPE full-text search page in the web view.

        The search itself is submitted from ``loadFinished`` once the
        page has been loaded.
        """
        ripe_url = QtCore.QUrl(self.ripe_url)
        self.html.load(ripe_url)
        self.html.show()

    def loadStarted(self):
        """Show the busy indicator and report progress in the status bar."""
        self.statusProgressBar.setVisible(True)
        if not self.currentPage:
            self.statusBar().showMessage('Search started : %s' % self.tb_url.text())
            # Lock the inputs while a search is running.
            self.tb_go.setEnabled(False)
            self.tb_url.setEnabled(False)
        else:
            self.statusBar().showMessage('Search : %s (parsing page %d)'
                                         % (self.tb_url.text(), self.currentPage))

    def loadFinished(self, ok):
        """Drive the scraping state machine after each page load.

        ``ok`` is the success flag emitted with ``loadFinished(bool)``.
        When the search page itself fails to load, the inputs are
        re-enabled instead of being left disabled forever.  Result pages
        are always parsed (best effort), whatever ``ok`` says.
        """
        self.statusProgressBar.setVisible(False)
        if self.currentPage:
            # Search in progress: parse the page that was just loaded.
            self._parseResultPage()
        elif ok:
            # Search form is loaded but not submitted yet.
            self._submitSearch()
        else:
            self._resetSearch('Search failed : unable to load %s' % self.ripe_url)

    def _submitSearch(self):
        """Fill in the search form and submit it through JavaScript."""
        frame = self.html.page().currentFrame()
        # json.dumps produces a quoted, fully escaped string literal, so
        # quotes or backslashes in the query cannot break (or inject into)
        # the evaluated script.
        query = json.dumps(self.tb_url.text())
        frame.evaluateJavaScript(
            'document.getElementById("home_search:searchform_q").value = %s;' % query)
        frame.evaluateJavaScript(
            'document.getElementById("home_search:doSearch").click();')
        print("[+] Query initiated")
        self.result = []
        self.currentPage += 1

    def _parseResultPage(self):
        """Extract the hits of the current page, then paginate or finish."""
        frame = self.html.page().currentFrame()
        sourceCode = "%s" % frame.toHtml()
        hits = self.RESULT_REGEX.findall(sourceCode)
        total = len(self.result) + len(hits)
        print("[+] Parsing page %d (%d bytes), %d result%s"
              % (self.currentPage, self.html.page().bytesReceived(),
                 total, self._plural(total)))

        for link, res, found in hits:
            self.result.append(self._formatHit(link, res, found))

        if self.NEXT_PAGE_ID in sourceCode:
            # Use JavaScript to go to the next page if the link is present.
            frame.evaluateJavaScript(
                'document.getElementById("%s").click();' % self.NEXT_PAGE_ID)
            self.currentPage += 1
            return

        # No next page: parsing is finished.
        count = len(self.result)
        print("[+] Research completed (%d result%s)" % (count, self._plural(count)))
        print('\n'.join(self.result))
        self.showResult()
        self._resetSearch('Research completed : %s (%s result%s)'
                          % (self.tb_url.text(), count, self._plural(count)))

    def _formatHit(self, link, res, found):
        """Clean one regex hit and return it as ``"found, res, link"``."""
        # Noise removal on all three fields.
        for pattern in self.eraseList:
            link = link.replace(pattern, '')
            res = res.replace(pattern, '')
            found = found.replace(pattern, '')
        # Entity / space fix-ups only make sense for the URL.
        for pattern, replacement in self.replaceDict.items():
            link = link.replace(pattern, replacement)
        res = re.sub(r'\s+', ' ', res)
        found = re.sub(r'\s+', ' ', found)
        return "%s, %s, %s" % (found, res, link)

    def _resetSearch(self, message):
        """Show ``message`` and return to the idle state (inputs enabled)."""
        self.statusBar().showMessage(message)
        self.currentPage = 0
        self.tb_go.setEnabled(True)
        self.tb_url.setEnabled(True)

    @staticmethod
    def _plural(count):
        """Return the plural suffix used in messages: 's' when count > 1."""
        return 's' if count > 1 else ''

    def showResult(self):
        """Open a non-modal dialog listing every collected result, one per line."""
        count = len(self.result)
        resultDialog = QtGui.QDialog(self)
        resultDialog.resize(600, 400)
        resultDialog.setWindowTitle("%s result%s : %s"
                                    % (count, self._plural(count), self.tb_url.text()))

        # The editor belongs to the dialog, not to the main window.
        text_editor = QtGui.QTextEdit(resultDialog)
        text_editor.setLineWrapMode(QtGui.QTextEdit.NoWrap)
        text_editor.setText('\n'.join(self.result))

        resultLayout = QtGui.QVBoxLayout()
        resultLayout.addWidget(text_editor)
        resultDialog.setLayout(resultLayout)
        resultDialog.show()
if __name__ == "__main__":
    # Script entry point: create the Qt application, display the scraper
    # window, run the event loop and propagate its exit status.
    app = QtGui.QApplication(sys.argv)
    window = Browser()
    window.show()
    exit_status = app.exec_()
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I extracted the search bar from this code for a personal project (browsing 3D element names in a JSON file with this kind of QLineEdit search bar). Thanks!