Created
March 22, 2016 17:55
-
-
Save mgeeky/bf0e367098047330b58b to your computer and use it in GitHub Desktop.
RFC files gathering and renaming utility.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""RFC files gathering and renaming utility.

Downloads RFC documents from www.ietf.org into a local ``RFC`` directory and
names every file after the document title and page count, e.g.
``RFC793 - TRANSMISSION CONTROL PROTOCOL (pgs 85).txt``.  Documents whose
title cannot be determined are stored as ``rfc<number>.txt``.

The script runs on both Python 2 (>= 2.6) and Python 3.
"""
try:  # Python 2
    import httplib
    from string import strip, replace  # legacy imports, not used any more
except ImportError:  # Python 3
    import http.client as httplib
from sys import exit
import os
from os.path import normpath
import re

# Highest RFC number that is requested (inclusive).
g_MAX_RFC_NUMBER = 5200
# Marker text looked for in the response body of a missing document.
g_NOT_FOUND_ERROR = "404: Page Not Found"
# Page footer such as "Postel                      [Page 85]".
g_PAGE_RE = re.compile(r'\[Page\s+([0-9]{1,5})\]')
# Characters that are not allowed in file names on common file systems.
g_FORBIDDEN_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def extract_title(lines):
    """Return the document title found in *lines*, or "" if there is none.

    The text is expected to start with a header paragraph (working group,
    RFC number, authors, date), optionally preceded by blank lines.  The
    title is taken to be the first non-blank line after the blank line(s)
    that end this header paragraph; it is returned stripped of surrounding
    whitespace.

    lines -- iterable of text lines, with or without line endings.
    """
    header_seen = False
    header_done = False
    for line in lines:
        text = line.strip()
        if not header_done:
            if text:
                header_seen = True
            elif header_seen:
                # First blank line after the header paragraph.
                header_done = True
            continue
        if text:
            return text
    return ""


def extract_page_count(lines):
    """Return the number from the last "[Page N]" footer, or 0 if none.

    lines -- sequence (list) of text lines; it is scanned from the end, so
             the first match is the footer of the last page.
    """
    for line in reversed(lines):
        found = g_PAGE_RE.search(line)
        if found is not None:
            return int(found.group(1))
    return 0


def sanitize_title(title):
    """Return *title* made safe for use inside a file name.

    Characters that are invalid in file names (such as the slash or colon)
    are removed, and runs of whitespace are collapsed to a single space.
    """
    cleaned = g_FORBIDDEN_CHARS_RE.sub('', title)
    return ' '.join(cleaned.split())


def build_filename(number, title, pages):
    """Return the descriptive file name for RFC *number*.

    title -- already sanitized title (see sanitize_title).
    pages -- page count of the document.
    """
    return normpath("RFC%d - %s (pgs %d).txt" % (number, title, pages))


def main():
    """Download every RFC from 1 to g_MAX_RFC_NUMBER into directory 'RFC'."""
    print("Simple script that downloads every RFC document, "
          "names it and\n stores on your local machin"
          "e (in dir: 'RFC')")

    if not os.path.isdir('RFC'):
        os.mkdir('RFC')
    os.chdir('RFC')

    # Creating the object does not open a socket yet; the connection is
    # established lazily by the first request.  The timeout keeps a stalled
    # server from blocking the script forever.
    # NOTE(review): plain HTTP is kept from the original script.  Answers
    # other than "200 OK" (for example redirects) are reported and skipped
    # below -- confirm whether the server requires HTTPS nowadays.
    conn = httplib.HTTPConnection('www.ietf.org', timeout=30)
    try:
        # RFC numbering starts at 1; the upper bound is inclusive.
        for number in range(1, g_MAX_RFC_NUMBER + 1):
            print("Processing RFC%d..." % number)
            try:
                conn.request("GET", "/rfc/rfc%d.txt" % number)
                resp = conn.getresponse()
                body = resp.read()
            except (httplib.HTTPException, IOError):
                print("\tError. Omitting this RFC.")
                # Reset the connection, otherwise every following request
                # would fail because of the unfinished exchange.
                conn.close()
                continue

            # Parsing works on an ASCII-only text copy, so that the derived
            # file name is printable everywhere; the file itself receives
            # the untouched bytes.
            text = body.decode('ascii', 'ignore')

            if "was never issued." in text:
                print("\t" + text.strip())
                continue
            if resp.status == 404 or g_NOT_FOUND_ERROR in text:
                print("\tThere is no any RFC%d." % number)
                continue
            if resp.status != 200:
                print("\tUnexpected HTTP status %d. Omitting this RFC."
                      % resp.status)
                continue

            lines = text.splitlines()
            title = sanitize_title(extract_title(lines))
            pages = extract_page_count(lines)

            if title:
                print('\tPages %d.' % pages)
                filename = build_filename(number, title, pages)
                print('New name: "%s"' % filename)
            else:
                filename = 'rfc%d.txt' % number

            with open(filename, 'wb') as out:
                out.write(body)
    finally:
        conn.close()

    try:
        prompt = raw_input  # Python 2
    except NameError:
        prompt = input  # Python 3
    prompt("Press ENTER to exit.")


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment