mgeeky · March 22, 2016 17:55
diff --git a/RFCGatherer.py b/RFCGatherer.py
 #!/usr/bin/python
 """Download RFC documents from www.ietf.org into a local 'RFC' directory.

 Each RFC numbered 1..g_MAX_RFC_NUMBER is requested as /rfc/rfc<N>.txt and
 saved as 'rfc<N>.txt'.  When a title is found on its first page the file
 is renamed to 'RFC<N> - <title> (pgs <pages>).txt'.

 Runs on Python 2.6+ and Python 3: print() is only ever called with a
 single argument, so it behaves the same as the old print statement.
 """

 try:
 	import httplib
 except ImportError:
 	# Python 3 renamed the module; the calls used below are the same.
 	import http.client as httplib
 from sys import exit
 # string.strip/string.replace are no longer imported: they do not exist
 # on Python 3 and the equivalent str methods are used instead.
 import os
 from os.path import normpath
 import re


 g_MAX_RFC_NUMBER = 5200
 g_NOT_FOUND_ERROR = "404: Page Not Found"

 # Page footer such as "Postel   [Page 45]"; group 1 is the page number.
 g_PAGE_FOOTER_RE = re.compile(r"\[Page\s([0-9]{1,5})\]")
 # Characters that are not allowed in file names on Windows and/or POSIX.
 g_BAD_NAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


 def extract_title(lines):
 	"""Return the document title found in `lines`, or '' if there is none.

 	The expected layout is: optional blank lines, a header block of
 	non-blank lines, at least one blank line, then the title paragraph.
 	Title lines are stripped and joined with single spaces.
 	"""
 	total = len(lines)
 	pos = 0
 	# Blank lines before the header block.
 	while pos < total and not lines[pos].strip():
 		pos += 1
 	# The header block itself (working group, RFC number, authors, date).
 	while pos < total and lines[pos].strip():
 		pos += 1
 	# Blank lines separating the header block from the title.
 	while pos < total and not lines[pos].strip():
 		pos += 1
 	parts = []
 	while pos < total and lines[pos].strip():
 		parts.append(lines[pos].strip())
 		pos += 1
 	return " ".join(parts)


 def extract_page_count(lines):
 	"""Return the number from the last '[Page N]' footer, or 0 if absent."""
 	for line in reversed(lines):
 		# search(), not match(): the footer sits at the end of the line
 		# and the text before it may contain any punctuation.
 		found = g_PAGE_FOOTER_RE.search(line)
 		if found is not None:
 			return int(found.group(1))
 	return 0


 def build_file_name(number, title, pages):
 	"""Return 'RFC<number> - <title> (pgs <pages>).txt' as a safe file name.

 	Path separators and other characters that file systems reject are
 	removed from `title` so the result is always a single path component.
 	"""
 	clean_title = g_BAD_NAME_CHARS_RE.sub('', title).strip()
 	return normpath("RFC%d - %s (pgs %d).txt" % (number, clean_title, pages))


 def main():
 	"""Fetch every RFC, store it under ./RFC and give it a descriptive name."""
 	print("Simple script that downloads every RFC document, "
 			"names it and\n stores on your local machin"
 			"e (in dir: 'RFC')")

 	if not os.path.isdir('RFC'):
 		os.mkdir('RFC')
 	os.chdir('RFC')

 	try:
 		h = httplib.HTTPConnection('www.ietf.org')
 	except Exception:
 		print('[!] Error during HTTPConnection(\'www.ietf.org\')')
 		exit(1)

 	# RFC numbering starts at 1 and the upper bound is inclusive.
 	for i in range(1, g_MAX_RFC_NUMBER + 1):
 		print("Processing RFC%d..." % i)

 		try:
 			h.request("GET", "/rfc/rfc%d.txt" % i)
 			resp = h.getresponse()
 			data = resp.read()
 		except (httplib.HTTPException, EnvironmentError):
 			print("\tError. Omitting this RFC.")
 			# Drop the possibly half-used connection; the next request
 			# re-opens it automatically.
 			h.close()
 			continue

 		# The body is bytes on Python 3.  Non-ASCII bytes are dropped for
 		# parsing so the derived title is printable and usable as a file
 		# name everywhere; the saved file keeps the untouched bytes.
 		text = data.decode('ascii', 'ignore')

 		if "was never issued." in text:
 			print("\t" + text.strip())
 			continue

 		if resp.status == 404 or g_NOT_FOUND_ERROR in text:
 			print("\tThere is no any RFC%d." % i)
 			continue

 		if resp.status != 200:
 			# Redirects and server errors must not be stored as RFC text.
 			print("\tUnexpected HTTP status %d. Omitting this RFC."
 					% resp.status)
 			continue

 		n = 'rfc%d.txt' % i
 		with open(n, 'wb') as f:
 			f.write(data)

 		lines = text.split('\n')
 		title = extract_title(lines)
 		pages = extract_page_count(lines)

 		if title:
 			print('\tPages %d.' % pages)
 			newname = build_file_name(i, title, pages)
 			print('New name: "%s"' % newname)
 			try:
 				# Both names are relative to the current directory (RFC).
 				os.rename(n, newname)
 			except OSError as err:
 				# E.g. the target already exists (Windows) or the name is
 				# too long; keep the plain 'rfc<N>.txt' in that case.
 				print('\tCould not rename "%s": %s' % (n, err))

 	h.close()
 	try:
 		prompt = raw_input
 	except NameError:
 		# Python 3: input() is what raw_input() used to be.
 		prompt = input
 	prompt("Press ENTER to exit.")


 if __name__ == '__main__':
 	main()
	#!/usr/bin/python
	"""Download RFC documents from www.ietf.org into a local 'RFC' directory.

	Each RFC numbered 1..g_MAX_RFC_NUMBER is requested as /rfc/rfc<N>.txt and
	saved as 'rfc<N>.txt'.  When a title is found on its first page the file
	is renamed to 'RFC<N> - <title> (pgs <pages>).txt'.

	Runs on Python 2.6+ and Python 3: print() is only ever called with a
	single argument, so it behaves the same as the old print statement.
	"""

	try:
		import httplib
	except ImportError:
		# Python 3 renamed the module; the calls used below are the same.
		import http.client as httplib
	from sys import exit
	# string.strip/string.replace are no longer imported: they do not exist
	# on Python 3 and the equivalent str methods are used instead.
	import os
	from os.path import normpath
	import re


	g_MAX_RFC_NUMBER = 5200
	g_NOT_FOUND_ERROR = "404: Page Not Found"

	# Page footer such as "Postel   [Page 45]"; group 1 is the page number.
	g_PAGE_FOOTER_RE = re.compile(r"\[Page\s([0-9]{1,5})\]")
	# Characters that are not allowed in file names on Windows and/or POSIX.
	g_BAD_NAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


	def extract_title(lines):
		"""Return the document title found in `lines`, or '' if there is none.

		The expected layout is: optional blank lines, a header block of
		non-blank lines, at least one blank line, then the title paragraph.
		Title lines are stripped and joined with single spaces.
		"""
		total = len(lines)
		pos = 0
		# Blank lines before the header block.
		while pos < total and not lines[pos].strip():
			pos += 1
		# The header block itself (working group, RFC number, authors, date).
		while pos < total and lines[pos].strip():
			pos += 1
		# Blank lines separating the header block from the title.
		while pos < total and not lines[pos].strip():
			pos += 1
		parts = []
		while pos < total and lines[pos].strip():
			parts.append(lines[pos].strip())
			pos += 1
		return " ".join(parts)


	def extract_page_count(lines):
		"""Return the number from the last '[Page N]' footer, or 0 if absent."""
		for line in reversed(lines):
			# search(), not match(): the footer sits at the end of the line
			# and the text before it may contain any punctuation.
			found = g_PAGE_FOOTER_RE.search(line)
			if found is not None:
				return int(found.group(1))
		return 0


	def build_file_name(number, title, pages):
		"""Return 'RFC<number> - <title> (pgs <pages>).txt' as a safe file name.

		Path separators and other characters that file systems reject are
		removed from `title` so the result is always a single path component.
		"""
		clean_title = g_BAD_NAME_CHARS_RE.sub('', title).strip()
		return normpath("RFC%d - %s (pgs %d).txt" % (number, clean_title, pages))


	def main():
		"""Fetch every RFC, store it under ./RFC and give it a descriptive name."""
		print("Simple script that downloads every RFC document, "
				"names it and\n stores on your local machin"
				"e (in dir: 'RFC')")

		if not os.path.isdir('RFC'):
			os.mkdir('RFC')
		os.chdir('RFC')

		try:
			h = httplib.HTTPConnection('www.ietf.org')
		except Exception:
			print('[!] Error during HTTPConnection(\'www.ietf.org\')')
			exit(1)

		# RFC numbering starts at 1 and the upper bound is inclusive.
		for i in range(1, g_MAX_RFC_NUMBER + 1):
			print("Processing RFC%d..." % i)

			try:
				h.request("GET", "/rfc/rfc%d.txt" % i)
				resp = h.getresponse()
				data = resp.read()
			except (httplib.HTTPException, EnvironmentError):
				print("\tError. Omitting this RFC.")
				# Drop the possibly half-used connection; the next request
				# re-opens it automatically.
				h.close()
				continue

			# The body is bytes on Python 3.  Non-ASCII bytes are dropped for
			# parsing so the derived title is printable and usable as a file
			# name everywhere; the saved file keeps the untouched bytes.
			text = data.decode('ascii', 'ignore')

			if "was never issued." in text:
				print("\t" + text.strip())
				continue

			if resp.status == 404 or g_NOT_FOUND_ERROR in text:
				print("\tThere is no any RFC%d." % i)
				continue

			if resp.status != 200:
				# Redirects and server errors must not be stored as RFC text.
				print("\tUnexpected HTTP status %d. Omitting this RFC."
						% resp.status)
				continue

			n = 'rfc%d.txt' % i
			with open(n, 'wb') as f:
				f.write(data)

			lines = text.split('\n')
			title = extract_title(lines)
			pages = extract_page_count(lines)

			if title:
				print('\tPages %d.' % pages)
				newname = build_file_name(i, title, pages)
				print('New name: "%s"' % newname)
				try:
					# Both names are relative to the current directory (RFC).
					os.rename(n, newname)
				except OSError as err:
					# E.g. the target already exists (Windows) or the name is
					# too long; keep the plain 'rfc<N>.txt' in that case.
					print('\tCould not rename "%s": %s' % (n, err))

		h.close()
		try:
			prompt = raw_input
		except NameError:
			# Python 3: input() is what raw_input() used to be.
			prompt = input
		prompt("Press ENTER to exit.")


	if __name__ == '__main__':
		main()