Skip to content

Instantly share code, notes, and snippets.

@ivan-krukov
Last active August 29, 2015 14:18
Show Gist options
  • Save ivan-krukov/0e93228f022ffa18c61a to your computer and use it in GitHub Desktop.
Save ivan-krukov/0e93228f022ffa18c61a to your computer and use it in GitHub Desktop.
Shuffle and relabel observations in a csv file

#Randomization script

The idea is to do bootstrap resampling on the control dataset to make sure it is appropriate as a control dataset.

#Usage

python randomize.py <input.csv> <resample_times>

This will generate resample_times files in the current directory, named relabel_0, relabel_1, ... (one file per reshuffle).

#Operation

Start by reading a csv table, and then shuffle it N times. After each shuffle, label the first half of the rows "test" and the second half "control", and write a new csv file for each relabeling.

#Authors

This is the idea of Brian McDonald, for his metagenomics analyses. I (Ivan Kryukov) have implemented a basic framework for the randomization, and assume no responsibility in how it is used.

#!/usr/bin/python2
import sys
import random
import csv
#Generate two columns of random variables, called 'a' and 'b'
def generate_data(variables=None,
                  samples=10,
                  generator=random.gauss,
                  generator_args=None):
    """Build a synthetic table of random observations.

    Every row is a single dict that maps each name in ``variables`` to a
    fresh value drawn from ``generator(**generator_args)``.  This is the
    same row shape ``csv.DictReader`` produces, so a row can receive a
    ``'label'`` key and be written out with ``csv.DictWriter``.

    Arguments:
    variables      -- iterable of column names; defaults to ['a', 'b']
    samples        -- number of rows to generate
    generator      -- callable invoked once per cell
    generator_args -- keyword arguments passed to ``generator``;
                      defaults to {'mu': 1, 'sigma': 2}

    Returns a list of ``samples`` dicts.
    """
    # None sentinels avoid sharing one mutable default between calls.
    if variables is None:
        variables = ['a', 'b']
    if generator_args is None:
        generator_args = {'mu': 1, 'sigma': 2}
    # One dict per sample holding *all* columns; the earlier version wrapped
    # each value in its own single-key dict inside a list, which could not
    # be labelled or written as a csv row.
    return [{name: generator(**generator_args) for name in variables}
            for _ in range(samples)]
#Read data from a csv file
def read_data(input_file):
    """Load a csv file that has a header row.

    Returns a list with one mapping per data row, keyed by the column
    names taken from the file's first row.
    """
    with open(input_file) as source:
        # Materialise the rows while the file is still open.
        return list(csv.DictReader(source))
#Write data to a csv file
def write_data(data,output_file, labels = ['a','b','label']):
    """Write rows to a csv file with a header line.

    data        -- iterable of dicts, one per output row
    output_file -- path of the file to create or overwrite
    labels      -- column names, in output order; used as the header
    """
    with open(output_file, 'w') as sink:
        table = csv.DictWriter(sink, labels)
        table.writeheader()
        table.writerows(data)
#Shuffle data; label first half labels[0], second - labels[1]
def relabel_half(data, labels=('test', 'control')):
    """Shuffle the rows in place and split them into two labelled halves.

    data   -- list of dict rows; it is reordered in place and every row
              gets its 'label' key set (overwriting any existing value)
    labels -- pair of label values: labels[0] goes to the first half of the
              shuffled rows, labels[1] to the rest

    With an odd number of rows the second group receives the extra row.

    Returns the same list object that was passed in.
    """
    random.shuffle(data)
    # Floor division keeps the split point an integer on both Python 2 and
    # Python 3; plain '/' yields a float on Python 3 and shifts the boundary
    # by one row for odd-length input.
    half = len(data) // 2
    for position, row in enumerate(data):
        row['label'] = labels[0] if position < half else labels[1]
    return data
if __name__ == '__main__':
    # Command line: randomize.py <input.csv> <resample_times>
    if len(sys.argv) < 3:
        sys.exit("usage: python randomize.py <input.csv> <resample_times>")
    input_file = sys.argv[1]
    # Parse the count before doing any file work so a bad value fails early.
    resample_times = int(sys.argv[2])
    data = read_data(input_file)
    # Take the output columns from the input file's own header row so that
    # tables with column names other than 'a' and 'b' can be written, and so
    # the column order matches the input on every Python version.
    with open(input_file) as csvfile:
        header = next(csv.reader(csvfile), [])
    columns = header if 'label' in header else header + ['label']
    # Each pass reshuffles the same rows and writes one relabelled copy.
    for i in range(resample_times):
        r = relabel_half(data)
        write_data(r, "relabel_" + str(i), labels=columns)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment