Created
December 18, 2017 13:16
-
-
Save vivonk/1096bd6d23b3371eb0cbcf9356587d6f to your computer and use it in GitHub Desktop.
How to scrape Google search content using Python and scraping tools like urllib, requests, and Scrapy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function
from generalized import Scraper
class Google(Scraper):
    """Scraper subclass for Google web search.

    Points the generic Scraper machinery at the Google search endpoint,
    describes its ``start``-based pagination, and parses result pages
    into ``{'title': ..., 'link': ...}`` records.
    """

    def __init__(self):
        # Google paginates organic results with a 0-based `start` query param.
        self.url = 'https://www.google.com/search'
        self.defaultStart = 0
        self.startKey = 'start'

    def nextStart(self, currentStart, prevResults):
        """Return the `start` offset for the next results page.

        Advances by the number of results the previous page yielded, so
        short pages do not skip entries.
        """
        return currentStart + len(prevResults)

    def parseResponse(self, soup):
        """Parse a Google results page into a list of result records.

        Args:
            soup: parsed HTML tree of a results page (BeautifulSoup-style
                object exposing ``findAll``/``find`` — assumed from usage;
                confirm against the Scraper base class).

        Returns:
            list: ``[{'title': ..., 'link': ...}, ...]`` — one dict per
            organic result found on the page.
        """
        urls = []
        # Google wraps each organic result heading in <h3 class="r">.
        for h3 in soup.findAll('h3', {'class': 'r'}):
            anchor = h3.find('a')
            if anchor is None:
                # Defensive: an <h3 class="r"> without an anchor would
                # otherwise raise AttributeError on getText().
                continue
            urls.append({'title': anchor.getText(), 'link': anchor.get('href')})
        print('Google parsed: ' + str(urls))
        return urls
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment