Created
November 15, 2017 15:03
-
-
Save mataney/67cfb05b0b84e88da3e0fe04fb80cfc8 to your computer and use it in GitHub Desktop.
Convert the finished files to OpenNMT style
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import random
import struct

import tensorflow as tf
from tensorflow.core.example import example_pb2
def text_generator(example_generator):
    """Generates (article, abstract) text pairs from tf.Examples.

    Args:
        example_generator: a generator of tf.Examples from file. See data.example_generator

    Yields:
        (article_text, abstract_text) tuples of byte strings. Examples with an
        empty article are skipped (see
        https://github.com/abisee/pointer-generator/issues/1); examples
        missing either field are logged and skipped.
    """
    # Iterate with `for` rather than calling __next__() in a `while True` loop:
    # under PEP 479 (Python 3.7+) a StopIteration escaping a generator body is
    # turned into RuntimeError, so the old form could never terminate cleanly.
    for e in example_generator:  # e is a tf.Example
        try:
            # The article/abstract text was saved under these keys in the data files.
            article_text = e.features.feature['article'].bytes_list.value[0]
            abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        except (ValueError, IndexError):
            # A missing proto field yields an empty `value` list, which raises
            # IndexError on [0]; keep ValueError for backward compatibility.
            tf.logging.error('Failed to get article or abstract from example')
            continue
        if len(article_text) == 0:
            tf.logging.warning('Found an example with empty article text. Skipping it.')
        else:
            yield (article_text, abstract_text)
def example_generator(data_path, single_pass):
    """Generates tf.Examples from data files.

    Binary data format: <length><blob>. <length> is an 8-byte native int64
    (struct format 'q') giving the byte size of <blob>. <blob> is a serialized
    tf.Example proto containing the tokenized article text and summary.

    Args:
        data_path:
            Path to tf.Example data files. Can include wildcards, e.g. if you
            have several training data chunk files train_001.bin,
            train_002.bin, etc, then pass data_path=train_* to access them all.
        single_pass:
            Boolean. If True, go through the dataset exactly once, generating
            examples in the order they appear, then return. Otherwise,
            generate random examples indefinitely.

    Yields:
        Deserialized tf.Example.
    """
    while True:
        filelist = glob.glob(data_path)  # get the list of datafiles
        assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
        if single_pass:
            filelist = sorted(filelist)
        else:
            # Requires `import random` at module level (missing in some copies
            # of this script, which made this branch raise NameError).
            random.shuffle(filelist)
        for f in filelist:
            # `with` guarantees the handle is closed even if a record is
            # truncated/corrupt; the old `open` without `close` leaked a file
            # descriptor per datafile per pass.
            with open(f, 'rb') as reader:
                while True:
                    len_bytes = reader.read(8)
                    if not len_bytes:
                        break  # finished reading this file
                    str_len = struct.unpack('q', len_bytes)[0]
                    # unpack('%ds') also validates we read exactly str_len bytes.
                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                    yield example_pb2.Example.FromString(example_str)
        if single_pass:
            print("example_generator completed reading all datafiles. No more data.")
            break
def writeFile(docs, filename):
    """Write each UTF-8-encoded byte string in `docs` to `filename`, one per line.

    Args:
        docs: iterable of byte strings (article or abstract texts).
        filename: path of the output text file (created/truncated).

    Raises:
        UnicodeDecodeError: if any doc is not valid UTF-8.
    """
    # `with` closes the handle even if a decode fails; also avoids shadowing
    # the builtin name `file`.
    with open(filename, 'w', encoding="utf-8") as out:
        for doc in docs:
            out.write(doc.decode('utf-8') + '\n')
def run(d, _data_path, save_to):
    """Read every (article, abstract) pair for dataset split `d` and write
    them to OpenNMT-style text files.

    Args:
        d: split name ('train', 'val' or 'test'); used in log lines and in
            the output filenames src-<d>.txt / tgt-<d>.txt.
        _data_path: path (with optional wildcards) to the .bin tf.Example files.
        save_to: directory prefix where the two output files are written.
    """
    _single_pass = True
    input_gen = text_generator(example_generator(_data_path, _single_pass))
    articles = []
    abstracts = []
    i = 0
    while True:
        try:
            # Keep the try body minimal: only the generator call can raise
            # StopIteration. (next(gen) is the idiomatic form of gen.__next__().)
            article, abstract = next(input_gen)  # both are byte strings
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            if _single_pass:
                tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                break
            raise Exception("single_pass mode is off but the example generator is out of data; error.")
        articles.append(article)
        abstracts.append(abstract)
        if i % 1000 == 0:
            print(str(i) + ' for dataset ' + d)
        i += 1
    print('done fetching ', d, 'data with ', i, 'examples')
    writeFile(articles, save_to + 'src-' + d + '.txt')
    writeFile(abstracts, save_to + 'tgt-' + d + '.txt')
    print('done ' + d)
# Convert each dataset split's .bin chunk file into plain-text src/tgt files.
# NOTE(review): both paths below are placeholders — point them at the real
# finished_files directory and an existing output directory before running.
for d in ['train', 'val', 'test']:
    _data_path = '/path/to/finished_files/as/downloaded/from/link/' + d + '.bin'
    save_to = '/path/to/where/to/save/'
    run(d, _data_path, save_to)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment