Created
December 4, 2021 07:23
-
-
Save seanbenhur/f7f4c44d93ab4e229c9623a8b2f390c8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import wandb | |
from datasets import load_dataset, concatenate_datasets | |
from functools import partial | |
import logging | |
logger = logging.getLogger(__name__) | |
def load_hf_format_dataset(file_path, split):
    """Load a line-delimited text file as a HuggingFace dataset.

    Args:
        file_path: Path to the text file (compressed files such as ``.xz``
            are handled transparently by the ``datasets`` text loader).
        split: Name of the split to return. ``None`` keeps the previous
            hard-coded behaviour of returning the default ``"train"`` split.

    Returns:
        A ``datasets.Dataset`` with one example per input line.
    """
    dataset = load_dataset("text", data_files=file_path)
    # Honour the caller's `split` argument instead of ignoring it; the
    # "text" loader exposes everything under "train" when no split is given.
    return dataset[split or "train"]
# Shared seen-values registry used to drop duplicate rows via Dataset.filter.
memory = set()


def is_unique(elem, column, memory):
    """Return True the first time ``elem[column]`` is seen, False afterwards.

    Side effect: unseen values are recorded in *memory* so later duplicates
    are rejected.
    """
    value = elem[column]
    if value in memory:
        return False
    memory.add(value)
    return True
# Patterns are compiled once at import time so `clean_text` does not
# recompile them for every example passed through `Dataset.map`.
_URL_PATTERN = re.compile(r"http\S+")
# Matches only when the *whole* text is a dd/dd/dddd-style date (or am/pm);
# anchored with ^...$, so dates embedded in longer text are left untouched.
_DATE_PATTERN = re.compile(r"^(?:(?:[0-9]{2}[:\/,]){2}[0-9]{2,4}|am|pm)$")
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def clean_text(example):
    """Clean one dataset example in place and return it.

    Removes URLs, whole-string date tokens, emojis, and literal
    newline/tab characters from ``example['text']``.
    """
    text = example["text"]
    text = _URL_PATTERN.sub("", text)  # strips URLs (not general HTML markup)
    text = _DATE_PATTERN.sub("", text)
    text = _EMOJI_PATTERN.sub("", text)
    text = text.replace("\n", "").replace("\t", "")
    example["text"] = text
    return example
if __name__ == "__main__":
    # Without a handler the INFO-level messages below are silently dropped
    # (root logger defaults to WARNING); configure logging and use the
    # module-level `logger` defined at the top of the file.
    logging.basicConfig(level=logging.INFO)

    # Track the preprocessing run and its output artifact in Weights & Biases.
    run = wandb.init(project="nvidia-tamil", group="format_data")
    artifact = wandb.Artifact("processed_dataset", type="dataset")

    tamil_common_crawl = load_hf_format_dataset(file_path="tamil_data/ta.txt.xz", split=None)
    logger.info("Loaded Tamil Common Crawl")

    oscar_tamil = load_dataset("oscar", "unshuffled_original_ta", ignore_verifications=True)
    oscar_tamil = oscar_tamil["train"]
    # Drop the `id` column so the schema matches the plain-text datasets
    # before concatenation.
    oscar_tamil = oscar_tamil.remove_columns(["id"])
    logger.info("Loaded Oscar Tamil")

    ai4bharat_tamil = load_hf_format_dataset(file_path="tamil_data/data/ta/ta.txt", split=None)
    logger.info("Loaded AI4BHARAT TAMIL DATASET")

    # Merge the three corpora into a single dataset.
    tamil_final_dataset = concatenate_datasets([tamil_common_crawl, oscar_tamil, ai4bharat_tamil])
    logger.info("Concatenated the datasets")

    # Drop duplicate texts; the shared `memory` set records what was seen.
    tamil_final_dataset = tamil_final_dataset.filter(partial(is_unique, column="text", memory=memory))
    logger.info("Duplicates are dropped")

    logger.info("Started cleaning the dataset")
    # Pass the function directly instead of wrapping it in a lambda.
    tamil_final_dataset = tamil_final_dataset.map(clean_text)
    logger.info("Dataset is cleaned")

    tamil_final_dataset.save_to_disk("tamil_final_processed_dataset")
    artifact.add_dir("tamil_final_processed_dataset")
    run.log_artifact(artifact)
    logger.info("Dataset is saved to disk")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment