Skip to content

Instantly share code, notes, and snippets.

@s3rgeym
Created July 9, 2017 00:41
Show Gist options
  • Save s3rgeym/f417a5f4dbf647ed5fc346f1ad3895ff to your computer and use it in GitHub Desktop.
Save s3rgeym/f417a5f4dbf647ed5fc346f1ad3895ff to your computer and use it in GitHub Desktop.
Python Google search results grabber
import random
import re
import sys
from argparse import ArgumentParser
# pip install pyqt5
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import *
from PyQt5.QtWidgets import *
PAGES = 10
START = 1
GOOGLE_URL = 'http://www.google.com/'
ITEMS_PER_PAGE = 10
def addslashes(s):
    """Return *s* with quotes, backslashes and some control characters escaped.

    NUL, tab, newline and carriage return are replaced by their two-character
    backslash escapes; single quotes, double quotes and backslashes are
    prefixed with a backslash. Every other character is copied unchanged.
    """
    # TODO: needs testing
    escape_table = str.maketrans({
        '\0': r'\0',
        '\t': r'\t',
        '\n': r'\n',
        '\r': r'\r',
        "'": r"\'",
        '"': r'\"',
        '\\': r'\\',
    })
    return s.translate(escape_table)
class GoogleSearch(QObject):
    """Drive a QWebEngineView through a Google search and save result links.

    The object is a small state machine: each time the view emits
    ``loadFinished`` the callable stored in ``self.state`` is invoked.
    The states are ``submit_form`` (fill in and submit the search form),
    ``goto_start_page`` (only when ``start`` > 1: jump to the first requested
    result page) and ``grab_links`` (collect links and schedule the next page).
    """

    def __init__(self, fp, query, pages=PAGES, start=START):
        """Create the searcher.

        Args:
            fp: Writable text file object; every link is written on its own
                line and the file is flushed after each page.
            query: Search string entered into the search form.
            pages: Page number at which grabbing stops (equal to the number
                of pages grabbed when ``start`` is 1).
            start: 1-based number of the first result page to grab.
        """
        super().__init__(None)
        self.fp = fp
        self.query = query
        self.pages = pages
        self.start = start
        # Set here as well as in run() so the attributes always exist.
        self.page = start
        self.state = self.submit_form
        self.view = QWebEngineView()
        self.view.loadFinished.connect(self.onload)
        self.links = []

    def onload(self, ok):
        """Slot for ``loadFinished``: run the current state handler.

        Args:
            ok: False when the page failed to load; the view is then closed
                instead of continuing with a broken page.
        """
        # An explicit check instead of ``assert``: asserts are stripped under
        # ``python -O`` and an exception raised inside a Qt slot is not a
        # clean way to stop.
        if not ok:
            print('page load failed')
            self.view.close()
            return
        print('page loaded')
        self.state()

    def submit_form(self):
        """Type the query into the search box and submit the form."""
        self.view.page().runJavaScript(
            """
document.getElementById('lst-ib').value = '{}'
document.getElementById('tsf').submit()
undefined
""".format(addslashes(self.query))
        )
        # The form always lands on the first result page; when a later start
        # page was requested, jump to it before grabbing anything.
        if self.start > 1:
            self.state = self.goto_start_page
        else:
            self.state = self.grab_links

    def goto_start_page(self):
        """Load result page number ``self.start`` and grab links from there."""
        self.state = self.grab_links
        self.view.load(self._url_with_offset(ITEMS_PER_PAGE * (self.start - 1)))

    def grab_links(self):
        """Collect the result link URLs from the loaded page.

        The list of hrefs is delivered asynchronously to ``process_links``.
        """
        self.view.page().runJavaScript(
            """
[...document.querySelectorAll('h3.r > a')].map(a => a.href)
""",
            self.process_links
        )

    def process_links(self, links):
        """Write the grabbed links to the file and schedule the next page.

        Args:
            links: List of URLs returned by the page script; an empty or
                missing result stops the search and closes the view.
        """
        if not links:
            print('no links')
            self.view.close()
            return
        for link in links:
            self.fp.write(link)
            self.fp.write('\n')
        self.fp.flush()
        if self.page >= self.pages:
            print('finished')
            self.view.close()
            return
        # self.page is the 1-based number of the page just processed, so the
        # next page starts at result offset ITEMS_PER_PAGE * self.page.
        next_url = self._url_with_offset(ITEMS_PER_PAGE * self.page)
        delay_us = random.randint(999999, 2999999)
        print('sleep {} microseconds'.format(delay_us))
        # A single-shot timer waits the same random 1-3 s without blocking
        # the GUI thread (QThread.usleep froze the event loop and the view).
        QTimer.singleShot(
            delay_us // 1000, lambda: self._load_next_page(next_url)
        )

    def _url_with_offset(self, offset):
        """Return the view's current URL with its ``start`` parameter set to *offset*."""
        url = self.view.url()
        query = re.sub(r'&start=[^&]*', '', url.query())
        query += '&start={}'.format(offset)
        url.setQuery(query)
        return url

    def _load_next_page(self, url):
        """Load *url* as the next result page and advance the page counter."""
        print('load next page')
        self.view.load(url)
        self.page += 1

    def run(self):
        """Reset the state machine, load the Google home page and show the view."""
        self.page = self.start
        self.state = self.submit_form
        self.view.load(QUrl(GOOGLE_URL))
        self.view.show()
if __name__ == '__main__':
    # Example invocation:
    #   python google_search.py -q "real estate" -f "links.txt" -p 50
    cli = ArgumentParser(description='Google Search by <[email protected]>.')
    cli.add_argument(
        '-q', '--query',
        type=str,
        required=True,
        help='search string',
    )
    cli.add_argument(
        '-f', '--file',
        type=str,
        default='links.txt',
        help='output filename',
    )
    cli.add_argument(
        '-p', '--pages',
        type=int,
        default=PAGES,
        help='number of pages',
    )
    cli.add_argument(
        '-s', '--start',
        type=int,
        default=START,
        help='start page',
    )
    options = cli.parse_args()
    # The output file is opened in append mode, so links written by earlier
    # runs are kept.
    with open(options.file, 'a', encoding='utf-8', newline='') as output:
        app = QApplication(sys.argv)
        searcher = GoogleSearch(
            output, options.query, options.pages, options.start
        )
        searcher.run()
        # Blocks in the Qt event loop until it exits.
        app.exec()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment