Last active
August 29, 2015 14:03
-
-
Save Svenito/fce6a9e44eb4984c464f to your computer and use it in GitHub Desktop.
Extracts the main Phrack article and writes it to a file for easier printing (or download the tar of the issue if you are boring ;) ).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python2.7 | |
from BeautifulSoup import BeautifulSoup | |
import requests | |
import sys | |
import re | |
import os | |
import argparse | |
def validateUrl(phrack_url):
    '''
    Prefix with http if missing and check it's in the expected format of
    http://phrack.org/issues/67/9.html

    Returns a tuple (url, issue_number, article_number) where both numbers
    are the complete digit strings taken from the URL path.
    Prints a hint and exits the process with status 1 when the URL does
    not match the expected format.
    '''
    # Only add a scheme when none is present; anything else is left for the
    # pattern below to accept or reject.
    if not phrack_url.startswith(('http://', 'https://')):
        phrack_url = 'http://' + phrack_url

    # The quantifier sits inside each group so the whole number is captured.
    # With "(\d)+" a group only keeps the last digit matched, which turned
    # issue 67 into "7".
    m = re.match(r'^https?://phrack\.org/issues/(\d+)/(\d+)\.html?',
                 phrack_url)
    if m is None:
        print('%s is not a valid URL.' % phrack_url)
        print('Something like phrack.org/issues/43/42.html would be nice')
        sys.exit(1)

    issue_number = m.group(1)
    article_number = m.group(2)
    return phrack_url, issue_number, article_number
def prep_title_for_filename(title, number):
    '''
    Build the file name "<NN>_<title>.txt" for an article.

    The " : " separator and any "/" are dropped from the title and the
    remaining spaces become underscores. The number is zero-padded to at
    least two digits.
    '''
    cleaned = title
    # Order matters: " : " has to be removed before its spaces are turned
    # into underscores.
    for needle, substitute in ((' : ', ''), (' ', '_'), ('/', '')):
        cleaned = cleaned.replace(needle, substitute)
    article_index = int(number)
    return '%02d_%s.txt' % (article_index, cleaned)
def main(url, autofile, outfile=''):
    '''
    Get the requested URL and parse it to extract the article.
    Print to stdout or a file.

    url      -- article address; checked by validateUrl, which exits the
                process when the address is not accepted.
    autofile -- when true, save to phrack_<issue>/<NN>_<title>.txt under
                the current directory, creating the directory if needed.
                Takes priority over outfile.
    outfile  -- path to write to. When empty and autofile is false the
                article is printed to stdout.

    Returns 0 on success and 1 on failure, suitable as an exit status.
    '''
    phrack_url, issue_number, article_number = validateUrl(url)
    content = requests.get(phrack_url)
    if content.status_code != 200:
        print('Didn`t manage to fetch that page: %d' % content.status_code)
        return 1

    soup = BeautifulSoup(content.text)
    article_title = soup.find('div', {'id': 'article'})

    # Only follow on to the author block when the title block was found;
    # dereferencing a missing title would raise before anything is printed.
    article_author = None
    if article_title is not None:
        author_div = article_title.findNext('div')
        if author_div is not None:
            article_author = author_div.contents

    out_filename = outfile
    if autofile:
        if article_title is None:
            # Without a title there is nothing to derive the file name from.
            print('Unable to find the article title.')
            return 1
        target_dir = os.path.join(os.getcwd(), 'phrack_' + issue_number)
        try:
            if not os.path.exists(target_dir):
                os.mkdir(target_dir)
        except OSError:
            print('Unable to create directory.')
            return 1
        title_filename = prep_title_for_filename(
            article_title.contents[1].string, article_number)
        out_filename = os.path.join(target_dir, title_filename)

    out_file = None
    if out_filename:
        try:
            out_file = open(out_filename, 'w')
        except IOError as e:
            print('Error opening %s %s' % (out_filename, e))
            return 1

    # Redirect print output to the file only when one was opened, and always
    # undo the redirect so the real stdout is never closed or left replaced.
    old_stdout = sys.stdout
    if out_file is not None:
        sys.stdout = out_file
    try:
        print('This article was extracted from: %s\n\n' % phrack_url)
        if article_title:
            print(article_title.contents[0].string +
                  article_title.contents[1].string)
        if article_author:
            print(article_author[1].string + article_author[2].string)

        article = soup.find('pre')
        print(article.contents[0])
    finally:
        if out_file is not None:
            sys.stdout = old_stdout
            out_file.close()
    return 0
if __name__ == '__main__':
    # Command-line entry point: one positional URL plus two optional
    # switches that choose where the article text is written.
    arg_parser = argparse.ArgumentParser(
        description='Get a printable Phrack article')
    arg_parser.add_argument(
        'url',
        help='The URL to the Phrack article. eg: http://phrack.org/issues/67/9.html')
    arg_parser.add_argument(
        '-o', '--outfile',
        help='File to write output to. If omitted writes to stdout')
    arg_parser.add_argument(
        '-a', '--autofile',
        action='store_true',
        help='Filename is title of article and saved to issue number directory.')

    options = arg_parser.parse_args()
    # main() returns 0 or 1, which becomes the process exit status.
    exit_status = main(options.url, options.autofile, options.outfile)
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment