Last active
May 7, 2019 21:02
-
-
Save Phyks/a9458568ef3b66590a860f0afcc4a59c to your computer and use it in GitHub Desktop.
Fetch Google "Now" answers from the CLI. Usage: `python3 google_now.py QUERY`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import urllib.parse | |
import html2text | |
import scrapy | |
from scrapy.crawler import CrawlerProcess | |
# Module-level sink: MyPipeline appends every scraped item here, and the
# script entry point reads the first entry once the crawl is over.
results = []


class MyPipeline:
    """Item pipeline that records each scraped item in ``results``."""

    def process_item(self, item, spider):
        """Store a plain-dict copy of *item* and hand the item back.

        Args:
            item: The scraped item; anything ``dict()`` accepts.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item* object, unchanged.
        """
        results.append(dict(item))
        # Scrapy's pipeline contract is to return the item (or raise
        # DropItem). Falling off the end returned None, which would be what
        # any later pipeline component receives instead of the item.
        return item
class GoogleNowSpider(scrapy.Spider):
    """Spider that loads one Google search page and extracts a single answer.

    The page is classified by which elements it contains and one item of the
    form ``{"type": ..., "result": ...}`` is produced, where ``type`` is
    ``"calc"``, ``"col"`` or ``"other"``.
    """

    name = "googlenow"

    def __init__(self, query="", *args, **kwargs):
        """Build the single start URL from *query*.

        Args:
            query: Free-text search query; it is percent-encoded before being
                placed in the ``q`` URL parameter.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = [
            "https://www.google.fr/search?q=%s" % urllib.parse.quote(query)
        ]

    def parse(self, response):
        """Dispatch to the extractor matching the elements found on the page.

        Returns:
            The item dict built by the chosen extractor, or ``None`` when
            that extractor finds nothing to extract.
        """
        if response.css("h2.r"):
            # Google calc
            return self.parse_calc(response)
        if response.css("td#rhs_block>*"):
            return self.parse_col(response)
        return self.parse_rest(response)

    def parse_calc(self, response):
        """Return the first text node of ``h2.r`` as a ``"calc"`` item.

        Returns ``None`` when ``h2.r`` has no direct text node.
        """
        # extract_first() gives None on an empty match, where [0] would raise
        # IndexError inside the callback.
        text = response.css("h2.r::text").extract_first()
        if text is None:
            return None
        return {"type": "calc", "result": text}

    def parse_col(self, response):
        """Return the first entry of the ``rhs_block`` list as a ``"col"`` item.

        The HTML fragment is converted to plain text with ``html2text``.
        Returns ``None`` when the list entry is absent: ``parse`` only checks
        that ``td#rhs_block`` has some child, not that it holds an ``ol``.
        """
        fragment = response.xpath(
            "//td[@id='rhs_block']/ol/*[1]").extract_first()
        return self._html_item("col", fragment)

    def parse_rest(self, response):
        """Return the first entry of the ``ires`` list as an ``"other"`` item.

        The HTML fragment is converted to plain text with ``html2text``.
        Returns ``None`` when the page has no such entry.
        """
        fragment = response.xpath(
            "//div[@id='ires']/ol/*[1]").extract_first()
        return self._html_item("other", fragment)

    @staticmethod
    def _html_item(kind, fragment):
        """Build an item of type *kind* from an HTML *fragment*, or ``None``."""
        if fragment is None:
            return None
        return {"type": kind, "result": html2text.html2text(fragment)}
if __name__ == "__main__":
    # Command-line entry point: join all arguments into one query, run the
    # spider once and print the text of the first item it produced.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s QUERY" % sys.argv[0])

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        # The pipeline path goes through ``__main__`` because the class is
        # defined in this very script.
        'ITEM_PIPELINES': {'__main__.MyPipeline': 1},
        'LOG_LEVEL': 'ERROR',
    })
    process.crawl(GoogleNowSpider, query=" ".join(sys.argv[1:]))
    # Per the Scrapy docs, start() blocks until the crawl has finished.
    process.start()

    # The crawl may yield nothing (request failed, page layout not
    # recognised); exit with a message rather than an IndexError traceback.
    if not results:
        sys.exit("No answer found.")
    print(results[0]["result"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey @Phyks, I was wondering if you'd be willing to update this gist with a header comment specifying which license you intend this code to be released under, if any. Thanks!