Skip to content

Instantly share code, notes, and snippets.

@btseytlin
Created March 20, 2016 18:11
Show Gist options
  • Save btseytlin/63d6c9d0f4c890259fdf to your computer and use it in GitHub Desktop.
Save btseytlin/63d6c9d0f4c890259fdf to your computer and use it in GitHub Desktop.
import numpy as np
import math
import csv
input_file = 'output_5k.csv'
# Expected CSV header (semicolon-delimited, with a trailing ';'):
# N;ComparedIDs;Name;A;ADV;ADVPRO;ANUM;APRO;COM;CONJ;INTJ;NUM;PART;PR;SPRO;V;S;Duplicate;

# Pass 1: count physical lines (header included) so the disk-backed buffer can
# be sized before any row is parsed; the CSV is potentially huge.
with open(input_file, 'rb') as csvfile:
    num_lines = sum(1 for _ in csvfile)
if num_lines < 2:
    raise ValueError(input_file + ' contains no data rows below the header')

# Pass 2: parse the rows straight into an int array memory-mapped onto the
# file 'buffer' in the current working directory.
with open(input_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=";")
    # Skip exactly one header row. Calling next() from inside the row loop
    # would additionally throw away the first data row.
    next(reader, None)
    memmap = np.memmap('buffer', dtype='int', mode='w+',
                       shape=(num_lines - 1, 15))
    rows_written = 0
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        # NOTE(review): with the header above, row[2:17] covers Name..S and
        # stops before Duplicate (field 17), yet the statistics code treats
        # stored column 14 as the duplicate flag -- confirm against the real
        # file layout before trusting the duplicate/non-duplicate split.
        memmap[rows_written, :] = [int(x) for x in row[2:17]]
        rows_written += 1

memmap.flush()
# Keep only the rows that were actually filled, so no all-zero padding row is
# later counted as data, and make the array genuinely read-only (assigning to
# the .mode attribute does not change write permissions).
memmap = memmap[:rows_written]
memmap.flags.writeable = False
# num_lines deliberately stays the physical line count: the statistics code
# uses it only as an upper bound when slicing, and slices past the end of the
# array are clipped.
# Labels used when printing per-column averages; the position of a label in
# this list is the column index handed to the per-column helper functions.
repeat_indexes = [
    "Name",
    "A",
    "ADV",
    "ADVPRO",
    "ANUM",
    "APRO",
    "COM",
    "CONJ",
    "INTJ",
    "NUM",
    "PART",
    "PR",
    "SPRO",
    "V",
    "S",
]
# Number of duplicate rows found by the latest get_sum_repeats_duplicates() run.
amount_dupes = 0


def get_sum_repeats_duplicates():
    """Return the mean per-row repeat total over rows flagged as duplicates.

    Reads the module-level ``memmap`` in roughly ten partitions (bounded by
    the module-level ``num_lines``) so only one slice is handled at a time.
    A row is treated as a duplicate when its column 14 equals 1; its repeat
    total is the sum of its columns 0..12.

    Side effect: the module-level ``amount_dupes`` is set (not incremented)
    to the number of duplicate rows found, so repeated calls stay consistent.

    Returns:
        The mean as a float, or NaN when no row is flagged as a duplicate.
    """
    # NOTE(review): columns 13 and 14 are left out of the per-row total and
    # column 14 doubles as the flag -- confirm this matches the stored layout.
    global amount_dupes
    # max(1, ...) keeps the loop advancing when there are fewer than 10 rows.
    partition_size = max(1, num_lines // 10)
    dupes_found = 0
    sum_dupes = 0
    for start in range(0, num_lines, partition_size):
        # Slicing past the end is clipped, so the final partial partition is
        # read in full instead of losing its last row.
        submap = memmap[start:start + partition_size]
        duplicates = submap[submap[:, 14] == 1]
        dupes_found += len(duplicates)
        sum_dupes += int(duplicates[:, :13].sum())
    amount_dupes = dupes_found
    if dupes_found == 0:
        return float('nan')
    return sum_dupes / dupes_found
# Number of non-duplicate rows found by the latest
# get_sum_repeats_non_duplicates() run.
amount_nondupes = 0


def get_sum_repeats_non_duplicates():
    """Return the mean per-row repeat total over rows not flagged as duplicates.

    Reads the module-level ``memmap`` in roughly ten partitions (bounded by
    the module-level ``num_lines``) so only one slice is handled at a time.
    A row is treated as a non-duplicate when its column 14 differs from 1;
    its repeat total is the sum of its columns 0..12.

    Side effect: the module-level ``amount_nondupes`` is set (not
    incremented) to the number of non-duplicate rows found, so repeated
    calls stay consistent.

    Returns:
        The mean as a float, or NaN when every row is flagged as a duplicate
        (or there are no rows).
    """
    # NOTE(review): columns 13 and 14 are left out of the per-row total and
    # column 14 doubles as the flag -- confirm this matches the stored layout.
    global amount_nondupes
    # max(1, ...) keeps the loop advancing when there are fewer than 10 rows.
    partition_size = max(1, num_lines // 10)
    nondupes_found = 0
    sum_nondupes = 0
    for start in range(0, num_lines, partition_size):
        # Slicing past the end is clipped, so the final partial partition is
        # read in full instead of losing its last row.
        submap = memmap[start:start + partition_size]
        non_duplicates = submap[submap[:, 14] != 1]
        nondupes_found += len(non_duplicates)
        sum_nondupes += int(non_duplicates[:, :13].sum())
    amount_nondupes = nondupes_found
    if nondupes_found == 0:
        return float('nan')
    return sum_nondupes / nondupes_found
# Report the average per-row repeat total, first for duplicate rows and then
# for non-duplicate rows.
print('Average sum of repeats')
dupes_average = get_sum_repeats_duplicates()
print('Average amount of repeats for duplicates:', dupes_average)
nondupes_average = get_sum_repeats_non_duplicates()
print('Average amount of repeats for non duplicates:', nondupes_average)
def get_sum_repeats_index_dupes(ind):
    """Return the mean of column ``ind`` over rows flagged as duplicates.

    Reads the module-level ``memmap`` in roughly ten partitions (bounded by
    the module-level ``num_lines``). A row is treated as a duplicate when its
    column 14 equals 1. The duplicate rows are counted here, during the same
    pass, so the result does not depend on any other function having run
    beforehand.

    Args:
        ind: Index of the column to average.

    Returns:
        The mean as a float, or NaN when no row is flagged as a duplicate.
    """
    # max(1, ...) keeps the loop advancing when there are fewer than 10 rows.
    partition_size = max(1, num_lines // 10)
    dupes_found = 0
    column_total = 0
    for start in range(0, num_lines, partition_size):
        # Slicing past the end is clipped, so the final partial partition is
        # read in full instead of losing its last row.
        submap = memmap[start:start + partition_size]
        duplicates = submap[submap[:, 14] == 1]
        dupes_found += len(duplicates)
        column_total += int(duplicates[:, ind].sum())
    if dupes_found == 0:
        return float('nan')
    return column_total / dupes_found
def get_sum_repeats_index_nondupes(ind):
    """Return the mean of column ``ind`` over rows not flagged as duplicates.

    Reads the module-level ``memmap`` in roughly ten partitions (bounded by
    the module-level ``num_lines``). A row is treated as a non-duplicate when
    its column 14 differs from 1. The non-duplicate rows are counted here,
    during the same pass, so the result does not depend on any other function
    having run beforehand.

    Args:
        ind: Index of the column to average.

    Returns:
        The mean as a float, or NaN when every row is flagged as a duplicate
        (or there are no rows).
    """
    # max(1, ...) keeps the loop advancing when there are fewer than 10 rows.
    partition_size = max(1, num_lines // 10)
    nondupes_found = 0
    column_total = 0
    for start in range(0, num_lines, partition_size):
        # Slicing past the end is clipped, so the final partial partition is
        # read in full instead of losing its last row.
        submap = memmap[start:start + partition_size]
        non_duplicates = submap[submap[:, 14] != 1]
        nondupes_found += len(non_duplicates)
        column_total += int(non_duplicates[:, ind].sum())
    if nondupes_found == 0:
        return float('nan')
    return column_total / nondupes_found
# For every labelled column, print its average among duplicate rows and then
# among non-duplicate rows.
for i, index in enumerate(repeat_indexes):
    dupes_mean = get_sum_repeats_index_dupes(i)
    print("Average amount of " + index + " repeats in duplicates", dupes_mean)
    nondupes_mean = get_sum_repeats_index_nondupes(i)
    print("Average amount of " + index + " repeats in non_duplicates", nondupes_mean)
#non_duplicates = data[ data[:17] == 0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment