Skip to content

Instantly share code, notes, and snippets.

@Tydus
Last active August 29, 2015 14:11
Show Gist options
  • Save Tydus/276f071ca19eb5697338 to your computer and use it in GitHub Desktop.
Save Tydus/276f071ca19eb5697338 to your computer and use it in GitHub Desktop.
go.py
#!/usr/bin/python2
import sys
import json
import urllib2
import lxml.html
def fetch(url):
    """Open *url* over HTTP while presenting a desktop Firefox User-Agent.

    Args:
        url: Absolute URL to request.

    Returns:
        The file-like response object produced by ``urllib2.urlopen``.
    """
    # NOTE(review): presumably the browser User-Agent is sent because the
    # target site rejects urllib2's default agent -- confirm.
    user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:33.0) Gecko/20100101 Firefox/33.0.2"
    request = urllib2.Request(url, headers={"User-Agent": user_agent})
    return urllib2.urlopen(request)
def main():
    """Search Google Scholar for a keyword and print the hits as JSON.

    Command line: ``<script> keyword [page]``. ``page`` is 1-based and
    defaults to 1. On success a JSON array is written to stdout; each element
    is an object with the keys ``url``, ``author`` and ``title``.

    Returns:
        0 on success, -1 when the keyword is missing or ``page`` is not a
        positive integer (the usage text is printed in that case).
    """
    # Function-scope import: quote_plus lives in urllib, which the file-level
    # imports do not bring in.
    import urllib

    args = sys.argv[1:]
    page = 1
    valid = bool(args)
    if len(args) >= 2:
        try:
            page = int(args[1])
        except ValueError:
            valid = False
    if not valid or page < 1:
        # Parenthesised single-argument form prints identically under the
        # Python 2 print statement.
        print("%s keyword [page]" % sys.argv[0])
        print("page start at 1")
        return -1

    # Encode the keyword so spaces, '&', '#', etc. cannot corrupt the query
    # string.
    kw = urllib.quote_plus(args[0])
    # Pages are 1-based on the command line and hold 10 results each, so
    # page 1 must map to offset 0 (the previous "(page + 1) * 10" skipped the
    # first two pages).
    url = "http://scholar.google.com/scholar?as_sdt=1,5&q=%s&as_vis=1&start=%d" % (kw, (page - 1) * 10)

    response = fetch(url)
    try:
        doc = lxml.html.parse(response)
    finally:
        response.close()

    # Extract the fields per result container instead of zipping three
    # independent node lists: a result lacking a title link or a byline would
    # otherwise shift every following url/author/title out of alignment.
    results = []
    for hit in doc.xpath("//div[@class='gs_r']"):
        links = hit.xpath(".//h3[@class='gs_rt']/a[@href]")
        bylines = hit.xpath(".//div[@class='gs_a']")
        if not links or not bylines:
            continue
        results.append({
            'url': links[0].get("href"),
            # Keep only the part of the byline before the first '-'.
            'author': bylines[0].text_content().split("-")[0],
            'title': links[0].text_content(),
        })

    print(json.dumps(results))
    return 0
# Script entry point: run main() and use its return value as the process
# exit status. sys.exit is used rather than the bare exit() helper, which is
# only installed by the site module and is missing under "python -S".
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment