Testing the test of doom
from multiprocessing import Pool, Array, Manager
import functools
import ctypes

import numpy as np
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib
# Non-interactive backend so the figure can be written to disk on headless machines
matplotlib.use('Agg')
import matplotlib.pyplot as plt
| """ | |
| Prep dataframe from a given CSV file | |
| """ | |
| def get_df_set(csv_path="./results.csv"): | |
| eeg_data = pandas.read_csv(csv_path) | |
| print('Read CSV into Pandas') | |
| # Drop first and last 15 minutes of data per patient | |
| grouped = eeg_data.groupby(by=["subject_id", "cohort"]).apply( | |
| lambda df: df.drop( | |
| df.head(900).index | |
| ).drop( | |
| df.tail(900).index | |
| ) | |
| ) | |
| groups_df = [ | |
| (x[["eeg_{0}".format(i) for i in range(125)]], | |
| x['sleep_stage']) | |
| for _, x in grouped.groupby(['subject_id'], as_index=False) | |
| ] | |
| row_count = len(groups_df) | |
| shared_matrix_base = Array(ctypes.c_double, row_count * row_count) | |
| shared_matrix = np.ctypeslib.as_array(shared_matrix_base.get_obj()).reshape( | |
| (row_count, row_count) | |
| ) | |
| return groups_df, shared_matrix | |
| """ | |
| Scoring metric used | |
| """ | |
| def scoring_metric(tuplet, score_matrix, lock): | |
| (i, groups), (x_train, y_train) = tuplet | |
| lock.acquire() | |
| model.fit(x_train, y_train) | |
| lock.release() | |
| print('Trained on subject {0}'.format(i)) | |
| for j, (x_test, y_test) in enumerate(groups): | |
| score_matrix[i][j] = np.mean( | |
| 1 - (np.absolute(y_test - model.predict(x_test)) * 0.2) | |
| ) | |
| return group_score_vector | |
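# Worked example of the metric above (assuming sleep stages are coded as the
# integers 0-4, which the 0.2 step implies): an exact prediction contributes
# 1.0, a prediction one stage off contributes 1 - 1 * 0.2 = 0.8, two stages
# off contributes 0.6, and so on; the mean over a subject's rows is that
# cell's accuracy score.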
| """ | |
| Run the scoring metric in parallel and any other | |
| pre/post-scoring functionality | |
| """ | |
| def score_model(model, groups, score_matrix, lock=Manager().Lock()): | |
| pool_executor = Pool(2) | |
| print('Running parallel scoring') | |
| return np.array(pool_executor.map( | |
| functools.partial(scoring_metric, score_matrix=score_matrix, lock=lock), | |
| map( | |
| lambda tuplet: ((tuplet[0], groups), tuplet[1]), | |
| enumerate(groups) | |
| ), | |
| 1 | |
| )) | |
| """ | |
| Plot results | |
| """ | |
| def plotit(accuracy_score_matrix, img_file_path="accuracy_scores.png"): | |
| print('Plotting results') | |
| fig, ax = plt.subplots() | |
| fig.set_figheight(20) | |
| fig.set_figwidth(20) | |
| im = ax.imshow(accuracy_score_matrix) | |
| ax.set_xticks(np.arange(20)) | |
| ax.set_yticks(np.arange(20)) | |
| subject_ids = ["subject_{0}".format(i) for i in range(20)] | |
| ax.set_xticklabels(subject_ids) | |
| ax.set_yticklabels(subject_ids) | |
| plt.setp( | |
| ax.get_xticklabels(), rotation=45, ha="right", | |
| rotation_mode="anchor" | |
| ) | |
| # Loop over data dimensions and create text annotations. | |
| for i in range(20): | |
| for j in range(20): | |
| text = ax.text( | |
| j, i, round(accuracy_score_matrix[i, j], 3), | |
| ha="center", va="center", | |
| color='black' if accuracy_score_matrix[i, j] > 0.8 else 'white' | |
| ) | |
| ax.set_title("Accuracy scores") | |
| fig.tight_layout() | |
| plt.savefig( | |
| img_file_path, | |
| bbox_inches='tight', | |
| pad_inches=0.25 | |
| ) | |
if __name__ == '__main__':
    df_set, score_matrix = get_df_set()
    model = RandomForestClassifier(n_estimators=64)
    plotit(score_model(model, df_set, score_matrix))
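The script expects a results.csv with one row per second per subject and the columns subject_id, cohort, sleep_stage, and eeg_0 through eeg_124. As a rough illustration only, here is a minimal sketch of a toy CSV generator; the make_toy_csv helper, the one-row-per-second rate, the 0-4 sleep-stage coding, and the single "A" cohort are assumptions, not part of the original gist.

import numpy as np
import pandas

def make_toy_csv(csv_path="./results.csv", n_subjects=4, n_rows_per_subject=3600):
    # 3600 rows per subject leaves 1800 rows after the script drops the first
    # and last 900 rows (15 minutes at the assumed one row per second)
    rng = np.random.default_rng(0)
    frames = []
    for subject_id in range(n_subjects):
        frame = pandas.DataFrame(
            rng.normal(size=(n_rows_per_subject, 125)),
            columns=["eeg_{0}".format(i) for i in range(125)]
        )
        frame["subject_id"] = subject_id
        frame["cohort"] = "A"  # assumed single-cohort label
        frame["sleep_stage"] = rng.integers(0, 5, size=n_rows_per_subject)
        frames.append(frame)
    pandas.concat(frames, ignore_index=True).to_csv(csv_path, index=False)

Calling make_toy_csv() before running the script should produce a small subject-by-subject accuracy matrix end to end, which is enough to exercise the parallel scoring and the heatmap plot.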