import pandas as pd
from tqdm import tqdm
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
import os
import uuid


def trans_(df):
    # Build a single-string label key, count how often each label set occurs, and
    # record how many augmented copies would be needed to reach 50 samples per label set.
    df['y_'] = df['Full_Labels'].apply(lambda x: x.replace('|', '_'))
    freq_count = dict(df.y_.value_counts())
    aug_fr = {'y_': [], 'freq_f': []}
    for label_data, fre in freq_count.items():
        aug_fr['y_'].append(label_data)
        aug_fr['freq_f'].append(50 - fre)
    rty = pd.DataFrame(aug_fr)
    als = pd.merge(df, rty, on='y_')
    als['random_key'] = [str(uuid.uuid4()) for _ in range(len(als))]
    return als
class TextAug_base(object):
    def __init__(self, model_name, method, device):
        # Pick an nlpaug augmenter: abstractive summarisation (t5), sentence-level
        # generation (gpt2), back-translation (facebook), or contextual word
        # insertion/substitution for any other Hugging Face model name.
        if model_name == 't5':
            self.aug = nas.AbstSummAug(model_path='t5-base', device=device)
        elif model_name == 'gpt':
            self.aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', device=device)
        elif model_name == 'facebook':
            self.aug = naw.BackTranslationAug(
                from_model_name='facebook/wmt19-en-de',
                to_model_name='facebook/wmt19-de-en', device=device)
        else:
            self.aug = naw.ContextualWordEmbsAug(model_path=model_name, action=method, device=device)

    def augmention_pipeline(self, text, total_num):
        augmented_text = self.aug.augment(text, n=total_num)
        return augmented_text
tf_models = ["sentence-transformers/LaBSE", | |
"sentence-transformers/bert-base-nli-mean-tokens", | |
"sentence-transformers/paraphrase-xlm-r-multilingual-v1", | |
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2", | |
"allenai/scibert_scivocab_uncased", | |
"xlm-roberta-base","roberta-large", | |
"roberta-base","bert-large-uncased","albert-base-v2", | |
"roberta-large-mnli","deepset/sentence_bert", | |
"xlm-roberta-large","kamalkraj/bioelectra-base-discriminator-pubmed", | |
"bert-base-uncased", | |
"monologg/biobert_v1.0_pubmed_pmc", | |
"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", | |
"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", | |
"minhpqn/bio_roberta-base_pubmed", | |
"bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", | |
"bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16", | |
"sentence-transformers/roberta-large-nli-stsb-mean-tokens", | |
"sentence-transformers/bert-large-nli-mean-tokens", | |
"bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12", | |
"adamlin/NCBI_BERT_pubmed_mimic_uncased_large_transformers", | |
"TheLongSentance/MIMIC-III-t5-large-v1", | |
"adamlin/NCBI_BERT_pubmed_mimic_uncased_base_transformers", | |
"mnaylor/bigbird-base-mimic-mortality", | |
'ian/BlueBERT-Finetuned-MIMIC-III-ICD-9-Classification', 'gpt', 't5'] | |
def bulk_aug(df, model, method, chunk_size, device):
    device = f'cuda:{device}'
    if not os.path.exists('./datasets/'):
        os.mkdir('./datasets/')
    aug_pip = TextAug_base(model, method, device)
    final_dataframe = {'Patient_Id': [], 'Admission_Id': [],
                       'Chapter_Labels': [], 'Three_Character_Labels': [],
                       'Full_Labels': [], 'Text': [], 'y_': [], 'random_key': []}
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        random_key = row['random_key']
        sentences = row['Text']
        aug_no = row['freq_f']  # target number of extra copies (read but not used; two copies are generated per note)
        aug_data = []
        # Split the note into chunk_size-word pieces, augment each piece twice,
        # then stitch the pieces back together into two augmented notes.
        chunks = list(chunks_pipe(sentences.split(), chunk_size))
        for chunked_sentence in tqdm(chunks):
            ug_d = aug_pip.augmention_pipeline(chunked_sentence, 2)
            aug_data.append(ug_d)
        final_aug_data = [" ".join(k) for k in list(zip(*aug_data))]
        for cli_name, i in enumerate(final_aug_data):
            pat_name = row['Patient_Id']
            mo_name = f'{pat_name}_{cli_name}'
            ad_name = row['Admission_Id']
            mo_ad_name = f'{ad_name}_{cli_name}'
            final_dataframe['Patient_Id'].append(mo_name)
            final_dataframe['Admission_Id'].append(mo_ad_name)
            final_dataframe['Chapter_Labels'].append(row['Chapter_Labels'])
            final_dataframe['Three_Character_Labels'].append(row['Three_Character_Labels'])
            final_dataframe['Full_Labels'].append(row['Full_Labels'])
            final_dataframe['Text'].append(i)
            final_dataframe['y_'].append(row['y_'])
            final_dataframe['random_key'].append(row['random_key'])
        # Keep the original, un-augmented note as well.
        final_dataframe['Patient_Id'].append(row['Patient_Id'])
        final_dataframe['Admission_Id'].append(row['Admission_Id'])
        final_dataframe['Chapter_Labels'].append(row['Chapter_Labels'])
        final_dataframe['Three_Character_Labels'].append(row['Three_Character_Labels'])
        final_dataframe['Full_Labels'].append(row['Full_Labels'])
        final_dataframe['Text'].append(sentences)
        final_dataframe['y_'].append(row['y_'])
        final_dataframe['random_key'].append(row['random_key'])
    df_au = pd.DataFrame(final_dataframe).reset_index(drop=True)
    model_name_save = model.split('/')[-1]
    d_n_name = f'{model_name_save}_{chunk_size}_{method}'
    df_au.to_csv(f'./datasets/{d_n_name}.csv', index=False)
    return 'saved'
def chunks_pipe(lst, n):
    """Yield successive n-sized chunks of lst, re-joined into strings."""
    for i in range(0, len(lst), n):
        yield " ".join(lst[i:i + n])
# Sweep chunk sizes, augmentation methods, and models; bulk_aug writes one CSV per combination.
df = pd.read_csv('/content/drive/MyDrive/aug_data/raw_data_aug_group.csv')
device = 1
for i_no in [100, 200, 300, 400, 500]:
    for j_me in ['insert', 'substitute']:
        for k_mo in tf_models:
            print(f"currently_running {i_no}_{j_me}_{k_mo}")
            ty = bulk_aug(df, k_mo, j_me, i_no, device)
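The grouped CSV loaded above is assumed to already carry the y_, freq_f and random_key columns that trans_ produces; a hedged sketch of that preparation step, with the raw input file name as a placeholder:

# Hypothetical preparation step (the raw file name is a placeholder, not from the original gist).
raw = pd.read_csv('raw_data.csv')
trans_(raw).to_csv('/content/drive/MyDrive/aug_data/raw_data_aug_group.csv', index=False)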
import pandas as pd
from tqdm import tqdm
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
import os
import uuid


def trans_(df):
    df['y_'] = df['Full_Labels'].apply(lambda x: x.replace('|', '_'))
    freq_count = dict(df.y_.value_counts())
    aug_fr = {'y_': [], 'freq_f': []}
    for label_data, fre in freq_count.items():
        aug_fr['y_'].append(label_data)
        aug_fr['freq_f'].append(50 - fre)
    rty = pd.DataFrame(aug_fr)
    als = pd.merge(df, rty, on='y_')
    als['random_key'] = [str(uuid.uuid4()) for _ in range(len(als))]
    return als
class TextAug_base(object):
    def __init__(self, model_name, method, device):
        if model_name == 't5':
            self.aug = nas.AbstSummAug(model_path='t5-base', device=device)
        elif model_name == 'gpt':
            self.aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', device=device)
        elif model_name == 'facebook':
            self.aug = naw.BackTranslationAug(
                from_model_name='facebook/wmt19-en-de',
                to_model_name='facebook/wmt19-de-en', device=device)
        else:
            self.aug = naw.ContextualWordEmbsAug(model_path=model_name, action=method, device=device)

    def augmention_pipeline(self, text, total_num):
        augmented_text = self.aug.augment(text, n=total_num)
        return augmented_text
tf_models = ["sentence-transformers/LaBSE", | |
"sentence-transformers/bert-base-nli-mean-tokens", | |
"sentence-transformers/paraphrase-xlm-r-multilingual-v1", | |
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2", | |
"allenai/scibert_scivocab_uncased", | |
"xlm-roberta-base","roberta-large", | |
"roberta-base","bert-large-uncased","albert-base-v2", | |
"roberta-large-mnli","deepset/sentence_bert", | |
"xlm-roberta-large","kamalkraj/bioelectra-base-discriminator-pubmed", | |
"bert-base-uncased", | |
"monologg/biobert_v1.0_pubmed_pmc", | |
"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", | |
"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", | |
"minhpqn/bio_roberta-base_pubmed", | |
"bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", | |
"bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16", | |
"sentence-transformers/roberta-large-nli-stsb-mean-tokens", | |
"sentence-transformers/bert-large-nli-mean-tokens", | |
"bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12", | |
"adamlin/NCBI_BERT_pubmed_mimic_uncased_large_transformers", | |
"TheLongSentance/MIMIC-III-t5-large-v1", | |
"adamlin/NCBI_BERT_pubmed_mimic_uncased_base_transformers", | |
"mnaylor/bigbird-base-mimic-mortality", | |
'ian/BlueBERT-Finetuned-MIMIC-III-ICD-9-Classification', 'gpt', 't5'] | |
def bulk_aug(df, model, method, chunk_size, device):
    device = f'cuda:{device}'
    if not os.path.exists('./datasets/'):
        os.mkdir('./datasets/')
    aug_pip = TextAug_base(model, method, device)
    final_dataframe = {'Patient_Id': [], 'Admission_Id': [],
                       'Chapter_Labels': [], 'Three_Character_Labels': [],
                       'Full_Labels': [], 'Text': [], 'y_': [], 'random_key': []}
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        random_key = row['random_key']
        sentences = row['Text']
        aug_no = row['freq_f']
        aug_data = []
        chunks = list(chunks_pipe(sentences.split(), chunk_size))
        for chunked_sentence in tqdm(chunks):
            ug_d = aug_pip.augmention_pipeline(chunked_sentence, 2)
            aug_data.append(ug_d)
        final_aug_data = [" ".join(k) for k in list(zip(*aug_data))]
        for cli_name, i in enumerate(final_aug_data):
            pat_name = row['Patient_Id']
            mo_name = f'{pat_name}_{cli_name}'
            ad_name = row['Admission_Id']
            mo_ad_name = f'{ad_name}_{cli_name}'
            final_dataframe['Patient_Id'].append(mo_name)
            final_dataframe['Admission_Id'].append(mo_ad_name)
            final_dataframe['Chapter_Labels'].append(row['Chapter_Labels'])
            final_dataframe['Three_Character_Labels'].append(row['Three_Character_Labels'])
            final_dataframe['Full_Labels'].append(row['Full_Labels'])
            final_dataframe['Text'].append(i)
            final_dataframe['y_'].append(row['y_'])
            final_dataframe['random_key'].append(row['random_key'])
        final_dataframe['Patient_Id'].append(row['Patient_Id'])
        final_dataframe['Admission_Id'].append(row['Admission_Id'])
        final_dataframe['Chapter_Labels'].append(row['Chapter_Labels'])
        final_dataframe['Three_Character_Labels'].append(row['Three_Character_Labels'])
        final_dataframe['Full_Labels'].append(row['Full_Labels'])
        final_dataframe['Text'].append(sentences)
        final_dataframe['y_'].append(row['y_'])
        final_dataframe['random_key'].append(row['random_key'])
    df_au = pd.DataFrame(final_dataframe).reset_index(drop=True)
    model_name_save = model.split('/')[-1]
    d_n_name = f'{model_name_save}_{chunk_size}_{method}'
    df_au.to_csv(f'./datasets/{d_n_name}.csv', index=False)
    return 'saved'
def chunks_pipe(lst, n):
    """Yield successive n-sized chunks of lst, re-joined into strings."""
    for i in range(0, len(lst), n):
        yield " ".join(lst[i:i + n])
df = pd.read_csv('raw_data_aug_group.csv')
device = 1
for i_no in [100, 200, 300, 400, 500]:
    for j_me in ['insert', 'substitute']:
        for k_mo in tf_models:
            try:
                print(f"currently_running {i_no}_{j_me}_{k_mo}")
                ty = bulk_aug(df, k_mo, j_me, i_no, device)
            except Exception as e:
                print(e)
                print(f"Model not supported_{k_mo}")
import pandas as pd


def rename_df(df_new, caption):
    # Suffix the IDs of this copy with the copy index so duplicated rows stay distinguishable.
    print(caption)
    df_new['Patient_Id'] = df_new['Patient_Id'].apply(lambda x: f'{x}_{caption}')
    df_new['Admission_Id'] = df_new['Admission_Id'].apply(lambda x: f'{x}_{caption}')
    return df_new


def aug_manual(df, aug_no):
    # Naive oversampling: concatenate aug_no relabelled copies of df and shuffle.
    all_dataframes = []
    for i in range(aug_no):
        df_current = df.copy()
        all_dataframes.append(rename_df(df_current, i))
    df_aug = pd.concat(all_dataframes)
    df_aug = df_aug.sample(frac=1).reset_index(drop=True)
    return df_aug
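A short usage sketch for aug_manual, assuming df is a prepared DataFrame like the one used above; aug_no controls how many relabelled copies are concatenated:

# Hypothetical call: three copies of df, IDs suffixed with the copy index, then shuffled.
df_tripled = aug_manual(df, 3)
print(len(df_tripled))                  # 3 * len(df)
print(df_tripled['Patient_Id'].head())  # e.g. '123_0', '456_2', ...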