Last active
February 21, 2017 02:44
-
-
Save MaxMatti/b62756aa95ef3bcb35b112c6ea70addb to your computer and use it in GitHub Desktop.
Collects multiple wordlists into one file. Should deal just fine when fed weirdly encoded input but might need longer for big files with weird encoding.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import argparse | |
import glob | |
import string | |
import time | |
from chardet.universaldetector import UniversalDetector | |
from sys import stderr | |
def _parse_args():
    """Define and parse the command-line interface; returns the parsed namespace."""
    parser = argparse.ArgumentParser(description = "collect multiple wordlists into one file while handling doubles and weird input encoding")
    parser.add_argument("infiles", help = "the files you want to include into the wordlist", nargs="*")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", "--verbose", help = "print verbose output", action = "store_true")
    group.add_argument("-q", "--quiet", help = "only print warnings and errors", action = "store_true")
    parser.add_argument("-o", "--outfile", help = "write wordlist to OUTFILE", default = "")
    parser.add_argument("-t", "--timeout", help = "timeout when detecting input encoding in seconds", default = 10, type = int)
    parser.add_argument("-l", "--lowercase", help = "convert words to lowercase", action = "store_true")
    parser.add_argument("-u", "--umlaute", help = "convert german Umlaute and sharp s to their ascii representation", action = "store_true")
    parser.add_argument("-d", "--debug", help = "prints out every word collected", action = "store_true")
    return parser.parse_args()


def _build_translation_table(args):
    """Build the str.translate() mapping used to normalize each input line.

    Strips punctuation, whitespace (including the trailing newline),
    underscores and apostrophes; optionally lowercases ASCII letters and
    transliterates German umlauts / sharp s to their ASCII digraphs.
    """
    replacement = {ord(elem): "" for elem in string.punctuation + string.whitespace + "_'\n"}
    if args.lowercase:
        if args.verbose:
            print("Converting all words to lowercase.")
        for letter in string.ascii_uppercase:
            replacement[ord(letter)] = letter.lower()
    if args.umlaute:
        if args.verbose:
            print("Converting all Umlauts to ascii.")
        if args.lowercase:
            # Uppercase umlauts map to all-lowercase digraphs when -l is set,
            # so "Ä" and "ä" both become "ae".
            replacement[ord("Ä")] = "ae"
            replacement[ord("Ö")] = "oe"
            replacement[ord("Ü")] = "ue"
        else:
            replacement[ord("Ä")] = "Ae"
            replacement[ord("Ö")] = "Oe"
            replacement[ord("Ü")] = "Ue"
        replacement[ord("ä")] = "ae"
        replacement[ord("ö")] = "oe"
        replacement[ord("ü")] = "ue"
        replacement[ord("ß")] = "ss"
    return replacement


def _prompt_for_paths():
    """Interactively collect input paths until the user enters a blank line."""
    paths = []
    while True:
        path = input("Enter input paths, leave blank to start: ")
        if len(path) == 0:
            break
        paths.append(path)
    return paths


def _detect_encoding(path, timeout):
    """Guess the encoding of the file at `path` using chardet.

    `timeout` is a number of seconds after which detection is abandoned and
    the best guess so far is used, or None for no time limit.
    Returns (encoding, confidence, timed_out); `encoding` may be None when
    chardet cannot make any guess at all.
    """
    detector = UniversalDetector()
    deadline = time.time() + timeout if timeout is not None else None
    # BUGFIX: the original opened this file without ever closing it,
    # leaking one file handle per detection pass.
    with open(path, "rb") as raw:
        for line in raw:
            detector.feed(line)
            if detector.done or (deadline is not None and time.time() > deadline):
                break
    detector.close()
    # Mirrors the original's post-loop check: timed_out reflects wall-clock
    # time at the end, even if the loop stopped because detection finished.
    timed_out = deadline is not None and time.time() > deadline
    return detector.result["encoding"], detector.result["confidence"], timed_out


def _read_words(path, encoding, replacement, all_words, debug):
    """Read `path` with `encoding`, normalize each line via `replacement`,
    and add the results to the `all_words` set (duplicates collapse for free).

    Raises UnicodeDecodeError if `encoding` turns out to be wrong.
    """
    with open(path, "r", encoding = encoding) as file_contents:
        for line in file_contents:
            word = line.translate(replacement)
            if debug:
                print("\"" + word + "\"")
            all_words.add(word)


def _collect_file(path, replacement, args, all_words):
    """Detect the encoding of one input file and fold its words into `all_words`.

    If the time-limited first guess proves wrong (UnicodeDecodeError while
    reading), the encoding is re-detected without a timeout and the file is
    read again; errors on that second pass propagate to the caller.
    """
    if not args.quiet:
        print("Reading " + path, end = "", flush = True)
    encoding, confidence, timed_out = _detect_encoding(path, args.timeout)
    if not args.quiet:
        print("... ", end = "", flush = True)
    if timed_out:
        if not args.quiet:
            print("")
        # Warnings are printed even in quiet mode (-q only suppresses info).
        # str(encoding): BUGFIX — chardet may report None, which crashed the
        # original's string concatenation here.
        print("Warning: Input encoding detection timed out. Guessing " + str(encoding) + " with " + str(int(confidence * 100)) + "% confidence.")
    elif args.verbose:
        print("\nDetected encoding: " + str(encoding) + " with " + str(int(confidence * 100)) + "% confidence.")
    try:
        _read_words(path, encoding, replacement, all_words, args.debug)
    except UnicodeDecodeError:
        print("UnicodeDecodeError - detected wrong file encoding.", flush = True, file = stderr)
        if not args.quiet:
            print("Reading " + path + " again (without timeout)", end = "", flush = True)
        encoding, confidence, timed_out = _detect_encoding(path, None)
        if not args.quiet:
            print("... ", end = "", flush = True)
        _read_words(path, encoding, replacement, all_words, args.debug)


def main():
    """Collect words from one or more input files into a single deduplicated
    wordlist file, normalizing characters along the way.

    Input files may be given on the command line or entered interactively;
    their encodings are guessed with chardet (time-limited first pass, with
    an unlimited retry if the guess turns out to be wrong).
    """
    args = _parse_args()
    replacement = _build_translation_table(args)
    paths = list(args.infiles) if args.infiles else _prompt_for_paths()
    outpath = args.outfile if args.outfile else input("Enter output path: ")
    all_words = set()
    for path in paths:
        _collect_file(path, replacement, args, all_words)
        if not args.quiet:
            print("wordlist now contains " + str(len(all_words)) + " words.")
    if not args.quiet:
        print("Writing wordlist to file...", flush = True)
    with open(outpath, "w") as outfile:
        outfile.write("\n".join(all_words))
# Run the collector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment