@monk1337
Last active December 17, 2021 04:05
import pandas as pd
from tqdm import tqdm
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
import os
import uuid
def trans_(df):
    # Collapse the pipe-separated label set into a single merge key, e.g. "A|B" -> "A_B"
    df['y_'] = df['Full_Labels'].apply(lambda x: x.replace('|', '_'))
    freq_count = dict(df.y_.value_counts())
    # For every label combination, record how many extra samples are needed to reach 50
    aug_fr = {'y_': [], 'freq_f': []}
    for label_data, fre in freq_count.items():
        aug_fr['y_'].append(label_data)
        aug_fr['freq_f'].append(50 - fre)
    rty = pd.DataFrame(aug_fr)
    als = pd.merge(df, rty, on='y_')
    # Attach a unique identifier to every row
    als['random_key'] = [str(uuid.uuid4()) for _ in range(len(als))]
    return als
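# Illustration (hypothetical values): a row whose Full_Labels is 'A|B' and whose label
# combination appears 30 times in the frame comes back with y_ = 'A_B' and freq_f = 20,
# i.e. the number of additional samples needed to reach 50.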
class TextAug_base(object):
    def __init__(self, model_name, method, device):
        # Pick an nlpaug augmenter based on the model name:
        # 't5' -> abstractive summarization, 'gpt' -> sentence-level generation,
        # 'facebook' -> back-translation, anything else -> contextual word insert/substitute.
        if model_name == 't5':
            self.aug = nas.AbstSummAug(model_path='t5-base', device=device)
        elif model_name == 'gpt':
            self.aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', device=device)
        elif model_name == 'facebook':
            self.aug = naw.BackTranslationAug(
                from_model_name='facebook/wmt19-en-de',
                to_model_name='facebook/wmt19-de-en', device=device)
        else:
            self.aug = naw.ContextualWordEmbsAug(model_path=model_name, action=method, device=device)

    def augmentation_pipeline(self, text, total_num):
        # Return `total_num` augmented variants of `text`
        augmented_text = self.aug.augment(text, n=total_num)
        return augmented_text
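# Minimal standalone usage sketch (not part of the original gist; the model name and
# sentence below are only illustrative):
# aug = TextAug_base('bert-base-uncased', 'substitute', 'cpu')
# print(aug.augmentation_pipeline('the patient was admitted with shortness of breath', 2))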
tf_models = ["sentence-transformers/LaBSE",
"sentence-transformers/bert-base-nli-mean-tokens",
"sentence-transformers/paraphrase-xlm-r-multilingual-v1",
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
"allenai/scibert_scivocab_uncased",
"xlm-roberta-base","roberta-large",
"roberta-base","bert-large-uncased","albert-base-v2",
"roberta-large-mnli","deepset/sentence_bert",
"xlm-roberta-large","kamalkraj/bioelectra-base-discriminator-pubmed",
"bert-base-uncased",
"monologg/biobert_v1.0_pubmed_pmc",
"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
"minhpqn/bio_roberta-base_pubmed",
"bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12",
"bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16",
"sentence-transformers/roberta-large-nli-stsb-mean-tokens",
"sentence-transformers/bert-large-nli-mean-tokens",
"bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12",
"adamlin/NCBI_BERT_pubmed_mimic_uncased_large_transformers",
"TheLongSentance/MIMIC-III-t5-large-v1",
"adamlin/NCBI_BERT_pubmed_mimic_uncased_base_transformers",
"mnaylor/bigbird-base-mimic-mortality",
'ian/BlueBERT-Finetuned-MIMIC-III-ICD-9-Classification', 'gpt', 't5']
def bulk_aug(df, model, method, chunk_size, device):
    device = f'cuda:{device}'
    if not os.path.exists('./datasets/'):
        os.mkdir('./datasets/')
    aug_pip = TextAug_base(model, method, device)
    final_dataframe = {'Patient_Id': [], 'Admission_Id': [],
                       'Chapter_Labels': [], 'Three_Character_Labels': [],
                       'Full_Labels': [], 'Text': [], 'y_': [], 'random_key': []}
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        random_key = row['random_key']
        sentences = row['Text']
        aug_no = row['freq_f']  # requested number of extra samples (currently unused; n is fixed to 2 below)
        aug_data = []
        # Split the note into fixed-size word chunks so long texts fit the model's input limit
        chunks = list(chunks_pipe(sentences.split(), chunk_size))
        for chunked_sentence in tqdm(chunks):
            ug_d = aug_pip.augmentation_pipeline(chunked_sentence, 2)
            aug_data.append(ug_d)
        # Stitch the i-th augmented variant of every chunk back into one full document
        final_aug_data = [" ".join(k) for k in zip(*aug_data)]
        for cli_name, i in enumerate(final_aug_data):
            pat_name = row['Patient_Id']
            mo_name = f'{pat_name}_{cli_name}'
            ad_name = row['Admission_Id']
            mo_ad_name = f'{ad_name}_{cli_name}'
            final_dataframe['Patient_Id'].append(mo_name)
            final_dataframe['Admission_Id'].append(mo_ad_name)
            final_dataframe['Chapter_Labels'].append(row['Chapter_Labels'])
            final_dataframe['Three_Character_Labels'].append(row['Three_Character_Labels'])
            final_dataframe['Full_Labels'].append(row['Full_Labels'])
            final_dataframe['Text'].append(i)
            final_dataframe['y_'].append(row['y_'])
            final_dataframe['random_key'].append(row['random_key'])
        # Keep the original (un-augmented) record as well
        final_dataframe['Patient_Id'].append(row['Patient_Id'])
        final_dataframe['Admission_Id'].append(row['Admission_Id'])
        final_dataframe['Chapter_Labels'].append(row['Chapter_Labels'])
        final_dataframe['Three_Character_Labels'].append(row['Three_Character_Labels'])
        final_dataframe['Full_Labels'].append(row['Full_Labels'])
        final_dataframe['Text'].append(sentences)
        final_dataframe['y_'].append(row['y_'])
        final_dataframe['random_key'].append(row['random_key'])
    df_au = pd.DataFrame(final_dataframe).reset_index(drop=True)
    model_name_save = model.split('/')[-1]
    d_n_name = f'{model_name_save}_{chunk_size}_{method}'
    df_au.to_csv(f'./datasets/{d_n_name}.csv', index=False)
    return 'saved'
def chunks_pipe(lst, n):
    """Yield successive n-sized chunks from lst, re-joined into strings."""
    for i in range(0, len(lst), n):
        yield " ".join(lst[i:i + n])
# The input CSV must already contain the columns bulk_aug reads ('y_', 'freq_f',
# 'random_key'), i.e. the output of trans_.
df = pd.read_csv('raw_data_aug_group.csv')
device = 1
# Run every (chunk size, augmentation method, model) combination; skip models that
# nlpaug cannot load for the requested action.
for i_no in [100, 200, 300, 400, 500]:
    for j_me in ['insert', 'substitute']:
        for k_mo in tf_models:
            try:
                print(f"currently_running {i_no}_{j_me}_{k_mo}")
                ty = bulk_aug(df, k_mo, j_me, i_no, device)
            except Exception as e:
                print(e)
                print(f"Model not supported_{k_mo}")
def rename_df(df_new, caption):
    print(caption)
    # Suffix the ids with the copy index so duplicated rows stay distinguishable
    df_new['Patient_Id'] = df_new['Patient_Id'].apply(lambda x: f'{x}_{caption}')
    df_new['Admission_Id'] = df_new['Admission_Id'].apply(lambda x: f'{x}_{caption}')
    return df_new
def aug_manual(df, aug_no):
    # Simple oversampling baseline: duplicate the dataframe aug_no times, then shuffle
    all_dataframes = []
    for i in range(aug_no):
        df_current = df.copy()
        all_dataframes.append(rename_df(df_current, i))
    df_aug = pd.concat(all_dataframes)
    df_aug = df_aug.sample(frac=1).reset_index(drop=True)
    return df_aug
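# Usage sketch for the oversampling baseline (not part of the original gist; the file
# name and aug_no value are only illustrative):
# prepared_df = trans_(pd.read_csv('raw_data_aug_group.csv'))
# baseline_df = aug_manual(prepared_df, aug_no=3)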