Skip to content

Instantly share code, notes, and snippets.

@mrm8488
Forked from kabirahuja2431/iterable_dataset_v2.py
Created February 23, 2020 03:28
Show Gist options
  • Save mrm8488/1a934e958815480402184cd7453e81a3 to your computer and use it in GitHub Desktop.
Save mrm8488/1a934e958815480402184cd7453e81a3 to your computer and use it in GitHub Desktop.
class CustomIterableDatasetv2(IterableDataset):
    """Stream tokenized line pairs from two parallel text files.

    Only the two file paths are stored on the instance; the file contents
    are read line by line each time the dataset is iterated, so the data
    never has to fit in memory.

    Each item produced by iteration is a 2-tuple
    ``(tokens_from_filename_en, tokens_from_filename_gm)`` where both
    elements are lists of whitespace-separated tokens taken from the lines
    at the same position in the two files.
    """

    def __init__(self, filename_en, filename_gm):
        """Remember the paths of the two files to stream from.

        Args:
            filename_en: Path of the first text file (one example per line).
            filename_gm: Path of the second text file (one example per line).
        """
        super().__init__()
        # Store the filenames only; contents are read lazily in __iter__.
        self.filename_en = filename_en
        self.filename_gm = filename_gm

    def preprocess(self, text):
        """Return ``text`` split on whitespace into a list of tokens.

        This is the hook to extend with further text preprocessing.
        """
        ### Do something with text here
        text_pp = text.split()
        ###
        return text_pp

    def line_mapper(self, line):
        """Convert one raw line of a file into its preprocessed form."""
        # The file holds nothing but the text in this case, so the whole
        # line is handed to preprocess().
        return self.preprocess(line)

    def __iter__(self):
        """Yield ``(en_tokens, gm_tokens)`` pairs, one per pair of lines.

        Iteration stops as soon as either file runs out of lines (``zip``
        semantics), so surplus lines in the longer file are ignored.

        Both files are opened when the first item is requested and are
        closed when iteration finishes, fails, or the generator is closed
        early, so no file handle outlives the iteration.

        NOTE: this method does not consult any DataLoader worker
        information; every iterator reads both files from the beginning.
        """
        with open(self.filename_en) as en_file, open(self.filename_gm) as gm_file:
            # Map each line through line_mapper lazily, then pair them up.
            mapped_en = map(self.line_mapper, en_file)
            mapped_gm = map(self.line_mapper, gm_file)
            yield from zip(mapped_en, mapped_gm)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment