-
-
Save mrm8488/1a934e958815480402184cd7453e81a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CustomIterableDatasetv2(IterableDataset):
    """Stream two line-aligned text files as pairs of token lists.

    Only the file names are kept on the instance; the file contents are
    read lazily, one line at a time, while the dataset is being iterated.
    Line ``i`` of ``filename_en`` is paired with line ``i`` of
    ``filename_gm``.
    """

    def __init__(self, filename_en: str, filename_gm: str):
        """Remember the two file paths; nothing is read at this point.

        Args:
            filename_en: Path of the first text file (one sample per line).
            filename_gm: Path of the second text file (one sample per line).
        """
        super().__init__()
        # Store only the filenames; the contents are never held in memory.
        self.filename_en = filename_en
        self.filename_gm = filename_gm

    def preprocess(self, text: str) -> list:
        """Split ``text`` on whitespace and return the resulting tokens.

        ``str.split()`` with no arguments also drops the trailing newline
        that every line read from a file carries.
        """
        return text.split()

    def line_mapper(self, line: str) -> list:
        """Turn one raw line of a file into a preprocessed sample.

        The files contain nothing but text, so the whole line is handed to
        :meth:`preprocess`.
        """
        return self.preprocess(line)

    def __iter__(self):
        """Yield ``(en_tokens, gm_tokens)`` tuples, one per pair of lines.

        Implemented as a generator so both files are owned by a ``with``
        block: they are closed as soon as iteration finishes, or when the
        iterator is closed or garbage-collected part-way through, instead
        of leaking one pair of open handles per pass over the dataset.

        The files are opened when the first item is requested, so a missing
        file raises on the first ``next()`` rather than on ``iter()``.
        Iteration stops at the end of the shorter file (``zip`` semantics).
        """
        with open(self.filename_en) as en_file, open(self.filename_gm) as gm_file:
            # Map each raw line through line_mapper and pair the two streams.
            yield from zip(
                map(self.line_mapper, en_file),
                map(self.line_mapper, gm_file),
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment