Skip to content

Instantly share code, notes, and snippets.

@eliasdorneles
Created May 18, 2016 22:36
Show Gist options
  • Save eliasdorneles/7a14bd494801cbaa963e1d8dafd7e676 to your computer and use it in GitHub Desktop.
Save eliasdorneles/7a14bd494801cbaa963e1d8dafd7e676 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Reservoir sampling for line-based input
"""
from __future__ import print_function
import sys
import random
def stream_input(fname):
    """Yield the lines of the given input one at a time.

    :param fname: path of the file to read, or ``'-'`` to read from
        standard input.
    :returns: a generator producing each line exactly as read, including
        its trailing newline when one is present.
    """
    if fname == '-':
        # '-' selects standard input instead of a file on disk.
        for l in sys.stdin:
            yield l
    else:
        # The file stays open only while the generator is being consumed;
        # the ``with`` block closes it once iteration finishes.
        with open(fname) as f:
            for l in f:
                yield l
def run(args):
    """Print a random sample of input lines chosen by reservoir sampling.

    Lines are read from ``args.input`` through ``stream_input`` and stripped
    of surrounding whitespace. The first ``args.sample_size`` lines fill the
    reservoir; every later line replaces a random reservoir slot with
    probability ``sample_size / (lines seen so far)``, so only the reservoir
    is kept in memory. The selected lines are printed joined by newlines.

    :param args: object with a ``sample_size`` attribute (an int, or a
        string holding one) and an ``input`` attribute (a file path, or
        ``'-'`` for standard input).
    :raises ValueError: if ``args.sample_size`` cannot be converted to int.
    """
    # The command line hands the sample size over as text; without this
    # conversion the numeric comparisons below fail on Python 3 and silently
    # keep every line on Python 2.
    size = int(args.sample_size)
    samples = []
    # ``count`` is the number of lines consumed before the current one.
    for count, line in enumerate(stream_input(args.input)):
        line = line.strip()
        if count < size:
            # Reservoir not full yet: keep the line unconditionally.
            samples.append(line)
            continue
        # randint is inclusive on both ends, so this picks one of
        # ``count + 1`` positions; only the first ``size`` of them map to a
        # reservoir slot.
        index = random.randint(0, count)
        if index < size:
            samples[index] = line
    print('\n'.join(samples))
if '__main__' == __name__:
    # Command-line entry point: parse the arguments and print the sample.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    # type=int makes argparse convert the value and reject non-integer input
    # with a usage error, instead of passing a string on to run().
    parser.add_argument('sample_size', type=int, help='Random sample size')
    parser.add_argument('--input', default='-',
                        help='Input file (default is stdin)')
    args = parser.parse_args()
    run(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment