Created
July 9, 2017 00:41
-
-
Save s3rgeym/f417a5f4dbf647ed5fc346f1ad3895ff to your computer and use it in GitHub Desktop.
Python Google search results grabber
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import re | |
import sys | |
from argparse import ArgumentParser | |
# pip install pyqt5 | |
from PyQt5.QtCore import * | |
from PyQt5.QtWebEngineWidgets import * | |
from PyQt5.QtWidgets import * | |
# Default value for the crawl's page limit (the ``--pages`` option).
PAGES = 10
# Default 1-based number of the result page the crawl starts from (``--start``).
START = 1
# Entry page that hosts the search form; HTTPS so the query is not sent in
# clear text.
GOOGLE_URL = 'https://www.google.com/'
# Number of results per page; used to compute the ``start`` offset of a page.
ITEMS_PER_PAGE = 10
# Translation table: character -> escape sequence that is safe inside a
# single- or double-quoted JavaScript string literal.
_JS_ESCAPES = str.maketrans({
    '\\': r'\\',
    "'": r"\'",
    '"': r'\"',
    # ``\x00`` instead of ``\0``: a ``\0`` followed by a digit would be read
    # by JavaScript as a legacy octal escape (e.g. ``\01`` -> U+0001).
    '\0': r'\x00',
    '\t': r'\t',
    '\n': r'\n',
    '\r': r'\r',
    # Line/paragraph separators terminate string literals in older engines.
    '\u2028': r'\u2028',
    '\u2029': r'\u2029',
})


def addslashes(s):
    """Escape *s* so it can be embedded in a quoted JavaScript string literal.

    Backslashes, both quote characters, NUL, tab, CR, LF and the Unicode
    line/paragraph separators are replaced by their escape sequences; every
    other character is passed through unchanged.

    Args:
        s: The text to escape.

    Returns:
        The escaped text (without surrounding quotes).
    """
    # TODO: test this more thoroughly
    # translate() makes a single pass, so backslashes introduced by one
    # replacement are never escaped a second time.
    return s.translate(_JS_ESCAPES)
class GoogleSearch(QObject):
    """Walk Google result pages in a QWebEngineView and save the result links.

    The object is a small state machine: every time the view finishes loading
    a page, ``onload`` calls the callable stored in ``self.state``. The states
    are ``submit_form`` (fill in and submit the search form), an optional jump
    to the requested start page, and ``grab_links`` (collect the links of the
    current result page and schedule the next one).

    NOTE(review): the element ids (``lst-ib``, ``tsf``) and the link selector
    (``h3.r > a``) are hard-coded and depend on Google's page markup; verify
    that they still match the live page.
    """

    def __init__(self, fp, query, pages=PAGES, start=START):
        """Prepare the web view; nothing is loaded until ``run`` is called.

        Args:
            fp: Writable text file object; one link per line is written to it.
            query: The search string typed into the search form.
            pages: Page number at which the crawl stops. The crawl ends after
                the page whose 1-based number is >= this value was saved.
            start: 1-based number of the first result page to save.
        """
        super().__init__(None)
        self.fp = fp
        self.query = query
        self.pages = pages
        self.start = start
        # Defined here as well so the attributes exist before run() is called.
        self.page = start
        self.state = self.submit_form
        self.view = QWebEngineView()
        self.view.loadFinished.connect(self.onload)
        self.links = []

    def onload(self, ok):
        """Slot for ``loadFinished``: run the current state or stop on failure.

        Args:
            ok: ``True`` when the page was loaded successfully.
        """
        if not ok:
            # An explicit check instead of ``assert``: asserts are stripped
            # under ``python -O`` and an exception raised inside a Qt slot
            # would abort the application instead of closing it cleanly.
            print('page load failed', file=sys.stderr)
            self.view.close()
            return
        print('page loaded')
        self.state()

    def submit_form(self):
        """Type the query into the search form and submit it."""
        self.view.page().runJavaScript(
            """
            document.getElementById('lst-ib').value = '{}'
            document.getElementById('tsf').submit()
            undefined
            """.format(addslashes(self.query))
        )
        # The form always lands on the first result page; when a later start
        # page was requested, navigate there before collecting any links.
        if self.start <= 1:
            self.state = self.grab_links
        else:
            self.state = self._seek_start_page

    def _seek_start_page(self):
        """Jump from the first result page to the requested start page."""
        self.state = self.grab_links
        self.view.load(self._page_url(self.start))

    def grab_links(self):
        """Collect the result links of the current page via JavaScript."""
        self.view.page().runJavaScript(
            """
            [...document.querySelectorAll('h3.r > a')].map(a => a.href)
            """,
            self.process_links
        )

    def process_links(self, links):
        """Save *links* and schedule the next page, or close the view.

        Args:
            links: The list of URLs returned by the script run in
                ``grab_links``; an empty or missing result ends the crawl.
        """
        if not links:
            print('no links')
            self.view.close()
            return
        self.fp.writelines(link + '\n' for link in links)
        # Flush after every page so the links collected so far reach the file
        # even if the crawl is interrupted later.
        self.fp.flush()
        if self.page >= self.pages:
            print('finished')
            self.view.close()
            return
        self.page += 1
        next_url = self._page_url(self.page)
        delay_us = random.randint(999999, 2999999)
        print('sleep {} microseconds'.format(delay_us))
        # A single-shot timer waits without blocking the GUI thread, so the
        # window and the web engine stay responsive during the pause.
        QTimer.singleShot(
            delay_us // 1000, lambda: self._load_next_page(next_url)
        )

    def _load_next_page(self, url):
        """Load *url* in the view once the pause before the next page is over."""
        print('load next page')
        self.view.load(url)

    def _page_url(self, page):
        """Return the current view URL pointed at result page *page* (1-based)."""
        url = self.view.url()
        # Drop any existing ``start`` item wherever it sits in the query
        # string, then append the offset of the requested page.
        params = [
            item for item in url.query().split('&')
            if item and not item.startswith('start=')
        ]
        params.append('start={}'.format(ITEMS_PER_PAGE * (page - 1)))
        url.setQuery('&'.join(params))
        return url

    def run(self):
        """Reset the crawl state, load the Google start page and show the view."""
        self.page = self.start
        self.state = self.submit_form
        self.view.load(QUrl(GOOGLE_URL))
        self.view.show()
if __name__ == '__main__':
    # Example:
    #   python google_search.py -q "real estate" -f "real estate.txt" -p 50
    parser = ArgumentParser(description='Google Search by <[email protected]>.')
    parser.add_argument('-q', '--query', help='search string', required=True, type=str)
    parser.add_argument('-f', '--file', default='links.txt', help='output filename', type=str)
    parser.add_argument('-p', '--pages', default=PAGES, help='number of pages', type=int)
    parser.add_argument('-s', '--start', default=START, help='start page', type=int)
    args = parser.parse_args()
    # Append mode: links from earlier runs are kept. newline='' writes the
    # '\n' separators verbatim on every platform.
    with open(args.file, 'a', encoding='utf-8', newline='') as fp:
        app = QApplication(sys.argv)
        # Keep a reference to the search object for as long as the event loop
        # runs.
        search = GoogleSearch(fp, args.query, args.pages, args.start)
        search.run()
        exit_code = app.exec()
    # Exit only after the output file has been closed, and report the event
    # loop's status to the shell instead of discarding it.
    sys.exit(exit_code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment