Created
March 12, 2014 07:02
-
-
Save strogonoff/9502160 to your computer and use it in GitHub Desktop.
Search your saved HN items
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf-8
u"""
Searches the titles of your saved HN items. Expect many false positives:
every title word is expanded into all of its WordNet synonyms before matching.

..

    $ pip install mechanize nltk
    $ python hnsavedsearch.py username "space separated query"

"""
# This file is a command-line script only; refuse to be used as a module.
if __name__ != '__main__':
    raise ImportError("hnsavedsearch isn't supposed to be imported")

import argparse

# Command line: two required positionals, the HN username and the query text.
parser = argparse.ArgumentParser(
    description="Search your HN saved stories by title text.")
for positional in ('username', 'query'):
    parser.add_argument(positional, type=str)
args = parser.parse_args()
# Mechanize setup: one shared browser (`br`) with its own cookie jar (`cj`).
import mechanize
import cookielib

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Switch the response handlers on, in this order.
for enable_handler in (
        br.set_handle_equiv,
        br.set_handle_gzip,
        br.set_handle_redirect,
        br.set_handle_referer):
    enable_handler(True)

# robots.txt handling is turned off explicitly.
br.set_handle_robots(False)

# Follow refresh directives, with max_time capped at 1.
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Debugging switches, uncomment as needed:
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

# Send a browser-like User-agent header with every request.
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
    'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1',
)]
# NLTK check: make sure the WordNet corpus is usable before searching.
try:
    from nltk.corpus import wordnet
    # Probe the corpus with a throwaway lookup; a missing corpus is reported
    # as LookupError by this call.
    wordnet.synsets('cake')
except LookupError:
    print("wordnet corpus appears to be not installed, initiating download. Download to home directory!")
    import nltk
    # Runs the NLTK downloader; its return value signals success.
    result = nltk.download()
    if result:
        print("Installation hopefully successful")
    # Re-bind `wordnet` for the rest of the script. The package name was
    # previously misspelled as "ntlk", which made this path always fail with
    # ImportError right after the download.
    from nltk.corpus import wordnet
# Log in: open the HN login page, fill in the first form and submit it.
import getpass

br.open('https://news.ycombinator.com/newslogin?whence=news')
br.select_form(nr=0)

login_form = br.form
login_form['u'] = args.username
# The password is prompted for interactively, never taken from argv.
password_prompt = "Pass for %s: " % args.username
login_form['p'] = getpass.getpass(password_prompt)

br.submit()
# Prepare search | |
def lemmas(words, synonyms=False):
    """Return ``words`` together with their WordNet-derived forms.

    Args:
        words: Iterable of word strings (for example a title split on
            whitespace).
        synonyms: When true, expand every word into the lemma names of all
            of its WordNet synsets. When false (the default), only add each
            word's lemmatized form.

    Returns:
        A set containing the original words plus the derived forms.
    """
    # Materialize once: the words are traversed here and again in the union.
    words = list(words)

    if synonyms:
        derived = set()
        for word in words:
            for synset in wordnet.synsets(word):
                names = synset.lemma_names
                # NLTK 2.x exposes lemma_names as a list attribute, NLTK 3.x
                # as a method; support both.
                if callable(names):
                    names = names()
                derived.update(names)
    else:
        # Imported here so the synonym path does not need the lemmatizer.
        from nltk.stem import WordNetLemmatizer
        # Build the lemmatizer once instead of once per word.
        lemmatize = WordNetLemmatizer().lemmatize
        derived = set(lemmatize(word) for word in words)

    return derived.union(words)
# The search terms: the query words plus their lemmatized forms (no synonym
# expansion on the query side).
query = lemmas(args.query.split())
print("Original query: %s" % args.query)
print(" expanded: %s" % ', '.join(query))
# Search | |
def iterate_links(url):
    """Scan the listing at ``url`` and every following page for matches.

    Each link's text is split into words, expanded with ``lemmas(...,
    True)`` and intersected with the module-level ``query`` set. When a
    link matches, its text is remembered, and the next ``item?id=`` link on
    the same page is printed together with that text and the page number.
    A link whose text is "More" leads to the next page, which is scanned
    the same way.

    Progress is kept in the module-level counters ``_page``,
    ``_links_processed`` and ``_matches_found`` so the caller can report
    them even if the scan is interrupted.

    Args:
        url: Address of the first listing page to open with ``br``.
    """
    global _page
    global _links_processed
    global _matches_found

    # Pages are followed in a loop rather than by recursing once per page,
    # so the call stack stays flat however many pages there are.
    while True:
        br.open(url)
        next_url = None
        # Text of the most recent matching link on this page, waiting for
        # the item link that follows it.
        _match = None

        for link in br.links():
            # NOTE(review): mechanize appears to report ``text`` as None for
            # some links; fall back to '' so split() cannot fail. Confirm
            # against the mechanize Link documentation.
            text = link.text or ''

            # Internal links
            if 'news.ycombinator.com' in link.absolute_url:
                if link.url.startswith('item?id=') and _match is not None:
                    print('{:<30} "{}" on page {}'.format(
                        link.absolute_url, _match, _page))
                    _match = None
                    continue
                elif text == "More":
                    _page += 1
                    next_url = link.absolute_url
                    break

            # External link. Internal links not handled above are matched
            # here as well.
            if query.intersection(lemmas(text.split(), True)):
                _matches_found += 1
                _match = text
            _links_processed += 1

        if next_url is None:
            return
        url = next_url
# Run the search. The counters live at module level because iterate_links
# updates them through `global`, and they are reported below even when the
# scan is cut short with Ctrl-C.
try:
    _page, _matches_found, _links_processed = 1, 0, 0
    start_url = 'https://news.ycombinator.com/saved?id=%s' % args.username
    iterate_links(start_url)
except KeyboardInterrupt:
    print("\n")
    print("Interrupted on page {}".format(_page))

# Summary, printed after a complete or an interrupted scan.
print("Links processed: {}".format(_links_processed))
print("Matches found: {}".format(_matches_found))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment