Skip to content

Instantly share code, notes, and snippets.

@tsuchm
Created June 4, 2020 08:34
Show Gist options
  • Save tsuchm/fca11c2c4dba8c469a2f5275c744ad3d to your computer and use it in GitHub Desktop.
Save tsuchm/fca11c2c4dba8c469a2f5275c744ad3d to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
# An alternative to the shell pipeline "cat files|sort|uniq -c|sort -r -n".
import marisa
import sys
import io
# Replace sys.stdout with a new text wrapper around the same underlying binary
# buffer, so that everything this script prints is encoded as UTF-8 instead of
# whatever encoding the previous sys.stdout object was using.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
def parse_args(argv=None):
    """Parse the command-line arguments of the line-frequency counter.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse falls back to ``sys.argv[1:]``, so a
            plain ``parse_args()`` call behaves as before.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        ``inputs`` -- a list of one or more input file paths, and
        ``limit`` -- an int (default 1) used as the minimum count a line
        needs in order to be printed.
    """
    import argparse

    p = argparse.ArgumentParser(
        description='Count how many times each line occurs in the input '
                    'file(s) and print the counts in descending order.')
    p.add_argument('inputs', nargs='+', help='input file(s)')
    p.add_argument('--limit', type=int, default=1,
                   help='only print lines that occur at least this many '
                        'times (default: %(default)s)')
    return p.parse_args(argv)
def _iter_keys(paths):
    """Yield every line of every file in *paths*, in order, right-stripped.

    Files are opened as UTF-8 text. ``str.rstrip()`` with no argument removes
    all trailing whitespace, not just the newline, so lines that differ only
    in trailing whitespace yield the same key.
    """
    for path in paths:
        with open(path, encoding='utf-8') as fp:
            for line in fp:
                yield line.rstrip()


def main():
    """Count occurrences of each distinct line and print them by frequency.

    The input files are read twice: the first pass collects all lines into a
    marisa keyset from which a trie is built, the second pass looks every
    line up in that trie and increments a counter indexed by its key id.
    Finally each line whose count is at least ``--limit`` is printed as
    ``<count>\\t<line>``, highest count first.

    Raises:
        RuntimeError: if a line read in the second pass is not found in the
            trie built from the first pass.
    """
    args = parse_args()

    # Pass 1: build the trie over every line in the inputs.
    keyset = marisa.Keyset()
    for key in _iter_keys(args.inputs):
        keyset.push_back(key)
    trie = marisa.Trie()
    trie.build(keyset)

    # Pass 2: one counter slot per key id in the trie.
    freq = [0] * trie.num_keys()
    agent = marisa.Agent()
    for key in _iter_keys(args.inputs):
        agent.set_query(key)
        # The lookup must run unconditionally, because the key id read below
        # comes from it. It used to live inside an ``assert``, which
        # ``python -O`` strips, so the lookup was skipped entirely.
        if not trie.lookup(agent):
            raise RuntimeError("%s is not found" % key)
        freq[agent.key_id()] += 1

    # sorted() is stable, so lines with equal counts stay in key-id order.
    for i, f in sorted(enumerate(freq), key=lambda pair: -pair[1]):
        if f < args.limit:
            # Counts are in descending order; nothing further can qualify.
            break
        print("%d\t%s" % (f, trie.reverse_lookup(i)))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment