shaabhishek · November 12, 2019 21:11
diff --git a/client_side.py b/client_side.py
 ## imports and other stuff

 # Boilerplate for setting things up
 query_args = {'dbname': dbname, 'host': host, 'port': port, 'user': dbusername}
 conn = psycopg2.connect(**query_args)
 cur = conn.cursor()
 cur.execute('SET search_path to ' + schema_name)

 # Setup the query- note it is ALMOST the same as the sql file's query
 query = \
   """
   with visitcnts as
   (
       select d.subject_id, count(distinct d.hadm_id) as visits
       from diagnoses_icd d
       group by d.subject_id
   ), diag as
   (
       select visitcnts.subject_id, d2.hadm_id, d2.icd9_code
       from visitcnts
       inner join diagnoses_icd d2
       on visitcnts.subject_id = d2.subject_id
       where visits > 1
   )
   select diag.*, a.admittime
   from diag
   inner join admissions a
   on diag.hadm_id = a.hadm_id
   """

 # read table into dataframe
 data_df = pd.read_sql_query(query, conn)

 # extract out only the unique subject-hospital_admission-admittime combinations
 hadm_times = data_df.drop('icd9_code', axis=1)
 hadm_times = hadm_times.drop_duplicates()

 ## group by subject - create a list of indices for each hospital visit - assign each (subject,hadm) pair an index
 # works because group keys are sorted
 visit_idxs = hadm_times.groupby(['subject_id']).apply(lambda x: pd.Series(np.arange(len(x)))).values
 hadm_times = hadm_times.sort_values(['subject_id', 'admittime'])
 hadm_times['visit_idx'] = visit_idxs

 # don't really need admittimes anymore
 hadm_times.drop('admittime', axis=1, inplace=True)
 data_df.drop('admittime', axis=1, inplace=True)

 # join the visit-ranking with the full table
 data_df = hadm_times.merge(data_df, on=['subject_id', 'hadm_id'], how='inner')
diff --git a/sql_side.sql b/sql_side.sql
 with visitcnts as 
 (
    select d.subject_id, count(distinct d.hadm_id) as visits
    from diagnoses_icd d
    group by d.subject_id
 ), diag as
 (
    select visitcnts.subject_id, d2.hadm_id, d2.icd9_code
    from visitcnts
    inner join diagnoses_icd d2
    on visitcnts.subject_id = d2.subject_id
    where visits > 1
 )
 select diag.*, a.admittime,
 dense_rank() over (
    partition by diag.subject_id
    order by a.admittime
 ) visit_idx
 from diag
 inner join admissions a
 on diag.hadm_id = a.hadm_id
	## imports and other stuff

	# Boilerplate for setting things up
	query_args = {'dbname': dbname, 'host': host, 'port': port, 'user': dbusername}
	conn = psycopg2.connect(**query_args)
	cur = conn.cursor()
	cur.execute('SET search_path to ' + schema_name)

	# Setup the query- note it is ALMOST the same as the sql file's query
	query = \
	"""
	with visitcnts as
	(
	select d.subject_id, count(distinct d.hadm_id) as visits
	from diagnoses_icd d
	group by d.subject_id
	), diag as
	(
	select visitcnts.subject_id, d2.hadm_id, d2.icd9_code
	from visitcnts
	inner join diagnoses_icd d2
	on visitcnts.subject_id = d2.subject_id
	where visits > 1
	)
	select diag.*, a.admittime
	from diag
	inner join admissions a
	on diag.hadm_id = a.hadm_id
	"""

	# read table into dataframe
	data_df = pd.read_sql_query(query, conn)

	# extract out only the unique subject-hospital_admission-admittime combinations
	hadm_times = data_df.drop('icd9_code', axis=1)
	hadm_times = hadm_times.drop_duplicates()

	## group by subject - create a list of indices for each hospital visit - assign each (subject,hadm) pair an index
	# works because group keys are sorted
	visit_idxs = hadm_times.groupby(['subject_id']).apply(lambda x: pd.Series(np.arange(len(x)))).values
	hadm_times = hadm_times.sort_values(['subject_id', 'admittime'])
	hadm_times['visit_idx'] = visit_idxs

	# don't really need admittimes anymore
	hadm_times.drop('admittime', axis=1, inplace=True)
	data_df.drop('admittime', axis=1, inplace=True)

	# join the visit-ranking with the full table
	data_df = hadm_times.merge(data_df, on=['subject_id', 'hadm_id'], how='inner')