Skip to content

Instantly share code, notes, and snippets.

@tashrifbillah
Created January 1, 2024 22:47
Show Gist options
  • Save tashrifbillah/b7cba99a4a298153bfa3d7132cdae74a to your computer and use it in GitHub Desktop.
Save tashrifbillah/b7cba99a4a298153bfa3d7132cdae74a to your computer and use it in GitHub Desktop.
import pandas as pd
df=pd.read_csv('pennebakerctq01_definitions.csv')
dfgiven=pd.read_excel('cnb_ndar_codebook_v5.xlsx')
_aliases=df['Aliases'].unique()
__aliases=[]
for e in _aliases:
if pd.isna(e):
continue
if ',' in e:
for sub_e in e.split(','):
__aliases.append(sub_e)
__aliases.append(e)
df1=df.copy()
df2=df.copy()
df1.set_index('ElementName',inplace=True)
df2.set_index('Aliases',inplace=True)
for i,row in dfgiven.iterrows():
# .strip() is necessary because some of
# Kosha's ElementName cells appear to have trailing w/ space
var=row['ElementName'].strip()
try:
# ideal case: exists in ElementName
df1.loc[var]
except:
# print('does not exist in ElementName:', var)
# check if exists in single element Aliases cell
try:
df2.loc[var]
except:
# print('does not exist in single-element Aliases cell:', var)
pass
# final check: does it exist in multiple-element Aliases cell?
# this step will take O(n) time for each var
if var not in __aliases:
print('does not exist anywhere:', var)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment