Created
August 2, 2016 20:00
-
-
Save sleibrock/d7d5393c4250bf9e95fd59afa555a3b8 to your computer and use it in GitHub Desktop.
Search "Who's Hiring" HackerNews threads
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding: utf-8 -*- | |
""" | |
Scraping "Who's Hiring" Posts on Hackernews | |
Requires: BeautifulSoup4, Requests, Python 3 | |
""" | |
from sys import argv | |
from requests import get as re_get | |
from bs4 import BeautifulSoup as BS | |
# Default path of the HTML file that receives the matching comments.
DEF_OUTPUT = "output.html"

# Usage text printed by the command-line entry point when the required
# arguments are missing.
HELP_MSG = """
Search a HN "Who's Hiring" thread and export the matching comments to an HTML file.
A comment matches when a search term appears inside of its text.

Usage:
    program.py <URL> <search_terms> [output] [--and]

    * <URL> is the address of the thread to scan
    * <search_terms> is a comma-separated list of strings to look for
    * [output] is an optional file to write the results to (default: output.html)
    * [--and] is an optional flag: keep only comments that contain every search term

Ex:
    program.py https://news.ycombinator.com/item?id=12202865 "Python,New York"
    program.py https://news.ycombinator.com/item?id=12345678 "New York"
    program.py https://news.ycombinator.com/item?id=12345678 "Python,Remote" jobs.html --and
"""
def main(url, search_terms, output=DEF_OUTPUT, match_all=False):
    """
    Scan an HN thread and write every matching comment row to an HTML file.

    url          -- address of the thread to download
    search_terms -- comma-separated string of terms to look for
                    (the comparison is case-sensitive)
    output       -- path of the file the matching rows are written to
    match_all    -- when true a comment must contain every term,
                    otherwise containing a single term is enough

    Prints how many comment rows were found and how many of them matched.
    Raises ValueError when `search_terms` holds no usable term; errors from
    the HTTP request or from writing the file propagate to the caller.
    """
    # Drop blank terms (e.g. from "Python,,Go" or a trailing comma): the empty
    # string is a substring of every comment and would match all of them.
    terms = [term.strip() for term in search_terms.split(",") if term.strip()]
    if not terms:
        raise ValueError("no search terms given")

    combine = all if match_all else any

    # Fail loudly on an HTTP error instead of silently scanning an error page,
    # and do not hang forever on an unresponsive server.
    response = re_get(url, timeout=30)
    response.raise_for_status()

    soup = BS(response.text, 'html.parser')
    # A list passed as class_ selects rows carrying any of the listed classes.
    comments = soup.find_all('tr', class_=["athing", "comtr"])

    matches = []
    for comment in comments:
        text = comment.text  # extract the text once, not once per term
        # Feed one boolean per term to any()/all(); passing only the terms
        # that matched would make all() succeed for every comment.
        if combine(term in text for term in terms):
            matches.append(comment)

    print("Comment size: {}".format(len(comments)))
    print("Matches size: {}".format(len(matches)))

    # Comments can hold arbitrary Unicode, so do not rely on the platform's
    # default encoding when writing them out.
    with open(output, 'w', encoding='utf-8') as f:
        for comment in matches:
            f.write(str(comment))
            f.write("\n")


if __name__ == "__main__":
    args = argv[1:]  # skip the script name

    # --and may appear anywhere on the command line; pull it out so it is
    # not mistaken for the URL, the search terms or the output file.
    match_all = "--and" in args
    args = [arg for arg in args if arg != "--and"]

    # Exactly <URL> <search_terms> plus an optional [output] are accepted.
    if not 2 <= len(args) <= 3:
        print(HELP_MSG)
        raise SystemExit

    try:
        main(*args, match_all=match_all)
    except Exception as e:
        # Top-level boundary: report the problem and signal failure to the shell.
        print("Error: {}".format(e))
        raise SystemExit(1)

# end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment