Created
February 13, 2016 21:46
-
-
Save ryo1kato/f307014d9e106dfe2bcc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# One-line program summary; passed to argparse as the --help description.
desc = (
    "100x faster uniq(1) implementation with hash - you don't have to sort."
)
import sys | |
import argparse | |
# Command-line interface. Adjacent string literals are joined with no
# separator, so every help fragment except the last ends with an explicit
# space to keep words from running together in the --help output.
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-c', '--count', action='store_true',
                    help='Precede each output line with the count of the number of times '
                         'the line occurred in the input, followed by a single space. '
                         'With this option, output is sorted by number of duplicates')
parser.add_argument('-r', '--reverse', action='store_true',
                    help='With -c option, reverse the sort order.')
parser.add_argument('-d', '--dup', action='store_true',
                    help='only output lines with duplicates in the input')
parser.add_argument('-u', '--uniq', action='store_true',
                    help='Only output lines without duplicates in the input.')
# Optional positional input; when omitted (or given as '-') main() reads stdin.
parser.add_argument('filename', metavar='FILE', type=str, nargs='?')
def main(args):
    """Write the de-duplicated lines of the input to standard output.

    Lines are tallied in a dict keyed by the full line text (including its
    line terminator), so the input does not need to be sorted.

    Args:
        args: Parsed options exposing ``filename``, ``count``, ``reverse``,
            ``dup`` and ``uniq``. A missing ``filename`` or ``'-'`` selects
            standard input.

    Output modes:
        * no flags: each distinct line is written the first time it is seen.
        * ``dup`` (without ``count``): a line is written when it is seen for
          the second time.
        * ``uniq`` (without ``count``): after all input is read, lines that
          occurred exactly once are written.
        * ``count``: after all input is read, lines are written prefixed by
          their right-aligned occurrence count, ordered by count
          (``reverse`` flips the order); ``dup`` / ``uniq`` filter the rows.
    """
    use_stdin = not args.filename or args.filename == '-'
    infd = sys.stdin if use_stdin else open(args.filename)

    counter = dict()
    try:
        for line in infd:
            seen = counter.get(line, 0) + 1
            counter[line] = seen
            if seen == 1:
                # Plain mode streams: emit a line on first sight.
                if not (args.count or args.uniq or args.dup):
                    sys.stdout.write(line)
            elif seen == 2 and args.dup and not args.count:
                # The second sighting proves the line is duplicated; emit once.
                sys.stdout.write(line)
    finally:
        # Only close what this function opened; stdin belongs to the caller.
        if not use_stdin:
            infd.close()

    if args.count:
        if not counter:
            # Empty input: nothing to report (max() of no values would raise).
            return
        maxdigits = len(str(max(counter.values())))
        f = "{:" + str(maxdigits) + "d} {}"
        sorted_counter = sorted(counter, key=counter.get)
        if args.reverse:
            # In-place reverse (not sorted(reverse=True)) so that lines with
            # equal counts are flipped too, exactly mirroring ascending order.
            sorted_counter.reverse()
        show_all = not (args.dup or args.uniq)
        for line in sorted_counter:
            count = counter[line]
            if show_all \
                    or (args.dup and count >= 2) \
                    or (args.uniq and count == 1):
                sys.stdout.write(f.format(count, line))
    elif args.uniq:
        for line in counter:
            if counter[line] == 1:
                sys.stdout.write(line)
if __name__ == '__main__':
    # Script entry point: parse the command line and hand the options to main().
    main(parser.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment