Created
January 1, 2024 22:47
-
-
Save tashrifbillah/b7cba99a4a298153bfa3d7132cdae74a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
df=pd.read_csv('pennebakerctq01_definitions.csv') | |
dfgiven=pd.read_excel('cnb_ndar_codebook_v5.xlsx') | |
_aliases=df['Aliases'].unique() | |
__aliases=[] | |
for e in _aliases: | |
if pd.isna(e): | |
continue | |
if ',' in e: | |
for sub_e in e.split(','): | |
__aliases.append(sub_e) | |
__aliases.append(e) | |
df1=df.copy() | |
df2=df.copy() | |
df1.set_index('ElementName',inplace=True) | |
df2.set_index('Aliases',inplace=True) | |
for i,row in dfgiven.iterrows(): | |
# .strip() is necessary because some of | |
# Kosha's ElementName cells appear to have trailing w/ space | |
var=row['ElementName'].strip() | |
try: | |
# ideal case: exists in ElementName | |
df1.loc[var] | |
except: | |
# print('does not exist in ElementName:', var) | |
# check if exists in single element Aliases cell | |
try: | |
df2.loc[var] | |
except: | |
# print('does not exist in single-element Aliases cell:', var) | |
pass | |
# final check: does it exist in multiple-element Aliases cell? | |
# this step will take O(n) time for each var | |
if var not in __aliases: | |
print('does not exist anywhere:', var) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment