Skip to content

Instantly share code, notes, and snippets.

@mataney
Created November 15, 2017 15:03
Show Gist options
  • Save mataney/67cfb05b0b84e88da3e0fe04fb80cfc8 to your computer and use it in GitHub Desktop.
Converting finished files to OpenNMT style
import glob
import random
import struct

import tensorflow as tf
from tensorflow.core.example import example_pb2
def text_generator(example_generator):
    """Generates (article, abstract) pairs of byte strings from tf.Examples.

    Args:
        example_generator: an iterator of tf.Examples, e.g. the one returned
            by example_generator(). See data.example_generator.

    Yields:
        (article_text, abstract_text) tuples of bytes. Examples whose article
        text is empty are skipped
        (see https://github.com/abisee/pointer-generator/issues/1).
    """
    # Iterate with a for-loop rather than calling __next__() in a `while
    # True`: under PEP 479 (Python 3.7+), a StopIteration raised inside a
    # generator body is converted to RuntimeError, so the original pattern
    # crashed at data exhaustion instead of finishing cleanly.
    for e in example_generator:
        try:
            # The texts were saved under the keys 'article' and 'abstract'
            # when the data files were written.
            article_text = e.features.feature['article'].bytes_list.value[0]
            abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        except ValueError:
            tf.logging.error('Failed to get article or abstract from example')
            continue
        if len(article_text) == 0:
            tf.logging.warning('Found an example with empty article text. Skipping it.')
        else:
            yield (article_text, abstract_text)
def example_generator(data_path, single_pass):
    """Generates tf.Examples from data files.

    Binary data format: <length><blob>. <length> is an 8-byte int64
    (struct format 'q', native byte order) giving the byte size of <blob>;
    <blob> is a serialized tf.Example proto containing the tokenized article
    text and summary.

    Args:
        data_path: Path to tf.Example data files. Can include wildcards,
            e.g. pass data_path=train_* to match train_001.bin,
            train_002.bin, etc.
        single_pass: Boolean. If True, go through the dataset exactly once,
            generating examples in the order they appear, then return.
            Otherwise, generate random examples indefinitely.

    Yields:
        Deserialized tf.Example protos.

    Raises:
        AssertionError: if data_path matches no files.
    """
    while True:
        filelist = glob.glob(data_path)  # get the list of datafiles
        assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
        if single_pass:
            filelist = sorted(filelist)
        else:
            # Requires `import random` at the top of the file; the original
            # gist omitted it, giving a NameError on this path.
            random.shuffle(filelist)
        for f in filelist:
            # `with` guarantees the handle is closed even if deserialization
            # raises; the original opened the file and never closed it.
            with open(f, 'rb') as reader:
                while True:
                    len_bytes = reader.read(8)
                    if not len_bytes:
                        break  # finished reading this file
                    str_len = struct.unpack('q', len_bytes)[0]
                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                    yield example_pb2.Example.FromString(example_str)
        if single_pass:
            print("example_generator completed reading all datafiles. No more data.")
            break
def writeFile(docs, filename):
    """Write each UTF-8 encoded byte string in `docs` to `filename`, one per line.

    Args:
        docs: iterable of bytes objects holding UTF-8 encoded text.
        filename: destination path; any existing file is overwritten.
    """
    # `with` closes the file even if .decode() raises mid-loop; the original
    # used a bare open()/close() pair and also shadowed the name `file`.
    with open(filename, 'w', encoding="utf-8") as out:
        for doc in docs:
            out.write(doc.decode('utf-8') + '\n')
def run(d, _data_path, save_to):
    """Read every (article, abstract) pair for split `d` and write OpenNMT-style files.

    Reads all examples from the tf.Example .bin files at `_data_path` and
    writes the articles to <save_to>src-<d>.txt and the abstracts to
    <save_to>tgt-<d>.txt, one document per line.

    Args:
        d: dataset split name (e.g. 'train', 'val', 'test'); used in the
            output filenames and progress messages.
        _data_path: path (may include wildcards) to the input .bin files.
        save_to: path prefix for the output files.
    """
    _single_pass = True  # read the dataset exactly once, in order
    input_gen = text_generator(example_generator(_data_path, _single_pass))
    articles = []
    abstracts = []
    i = 0
    # A for-loop replaces the original while/__next__/except-StopIteration
    # pattern: with _single_pass hardcoded to True, the generator simply
    # ends when the data is exhausted, so the "out of data" error branch
    # and the unused _finished_reading flag were dead code.
    for article, abstract in input_gen:
        articles.append(article)
        abstracts.append(abstract)
        if i % 1000 == 0:
            print(str(i) + ' for dataset ' + d)
        i += 1
    tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
    tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
    print('done fetching ', d, 'data with ', i, 'examples')
    writeFile(articles, save_to + 'src-' + d + '.txt')
    writeFile(abstracts, save_to + 'tgt-' + d + '.txt')
    print('done ' + d)
# Convert each dataset split from the downloaded finished_files .bin chunks
# into OpenNMT-style src-/tgt- text files.
for d in ('train', 'val', 'test'):
    _data_path = '/path/to/finished_files/as/downloaded/from/link/%s.bin' % d
    save_to = '/path/to/where/to/save/'
    run(d, _data_path, save_to)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment