Last active
August 23, 2019 06:14
-
-
Save seanie12/558ed7ec508f7f1fa1b7705170bd0077 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download the MRQA v2 dev-split files (.jsonl.gz) into a target directory.
#
# Usage: <this script> OUTPUT_DIR
#
# Two files are stored under shorter local names than their remote names:
#   TriviaQA-web.jsonl.gz          -> TriviaQA.jsonl.gz
#   NaturalQuestionsShort.jsonl.gz -> NaturalQuestions.jsonl.gz
set -e

# Abort with a usage message when no (or an empty) output directory is given,
# instead of failing later inside mkdir with a less helpful error.
OUTPUT=${1:?usage: $0 OUTPUT_DIR}

# Quote every expansion so directories containing spaces or glob characters work.
mkdir -p "$OUTPUT"

BASE_URL=https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev

wget "$BASE_URL/SQuAD.jsonl.gz" -O "$OUTPUT/SQuAD.jsonl.gz"
wget "$BASE_URL/NewsQA.jsonl.gz" -O "$OUTPUT/NewsQA.jsonl.gz"
wget "$BASE_URL/TriviaQA-web.jsonl.gz" -O "$OUTPUT/TriviaQA.jsonl.gz"
wget "$BASE_URL/HotpotQA.jsonl.gz" -O "$OUTPUT/HotpotQA.jsonl.gz"
wget "$BASE_URL/NaturalQuestionsShort.jsonl.gz" -O "$OUTPUT/NaturalQuestions.jsonl.gz"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download the MRQA v2 train-split files (.jsonl.gz) into a target directory.
#
# Usage: <this script> OUTPUT_DIR
#
# Two files are stored under shorter local names than their remote names:
#   TriviaQA-web.jsonl.gz          -> TriviaQA.jsonl.gz
#   NaturalQuestionsShort.jsonl.gz -> NaturalQuestions.jsonl.gz
set -e

# Abort with a usage message when no (or an empty) output directory is given,
# instead of failing later inside mkdir with a less helpful error.
OUTPUT=${1:?usage: $0 OUTPUT_DIR}

# Quote every expansion so directories containing spaces or glob characters work.
mkdir -p "$OUTPUT"

BASE_URL=https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train

wget "$BASE_URL/SQuAD.jsonl.gz" -O "$OUTPUT/SQuAD.jsonl.gz"
wget "$BASE_URL/NewsQA.jsonl.gz" -O "$OUTPUT/NewsQA.jsonl.gz"
wget "$BASE_URL/TriviaQA-web.jsonl.gz" -O "$OUTPUT/TriviaQA.jsonl.gz"
wget "$BASE_URL/HotpotQA.jsonl.gz" -O "$OUTPUT/HotpotQA.jsonl.gz"
wget "$BASE_URL/NaturalQuestionsShort.jsonl.gz" -O "$OUTPUT/NaturalQuestions.jsonl.gz"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_examples(input_file, debug=False):
    """Read a gzipped JSON Lines QA file and build ``SquadExample`` objects.

    The file is opened in text mode (UTF-8) and parsed with
    ``json_lines.reader``. The first record is treated as a header and
    dropped. Each remaining record must provide a ``"context"`` string and a
    ``"qas"`` list; each qa entry must provide ``"qid"``, ``"question"`` and
    ``"detected_answers"`` (entries with ``"text"`` and ``"char_spans"``).

    Args:
        input_file: Path to the ``.jsonl.gz`` file.
        debug: When truthy, only the first 100 records after the header are
            read and processed.

    Returns:
        A list of ``SquadExample`` instances. Questions are silently omitted
        when their record's context contains one of the skipped HTML-like
        tags, when the answer span falls outside the context, or when the
        answer text cannot be recovered from the tokenized context.
    """
    # Imported locally so this function does not depend on a file-level import.
    from itertools import islice

    # Characters that separate tokens: ASCII whitespace plus U+202F.
    whitespace_chars = frozenset(" \t\r\n\u202f")
    # Contexts containing any of these tags are excluded (per the original
    # author's note, this targets the NQ dataset).
    skip_tags = ('<Table>', '<Tr>', '<Td>', '<Ol>', '<Ul>', '<Li>')
    # Every marker is exactly as long as "[SEP]", so replacing them keeps the
    # character offsets stored in "char_spans" valid.
    boundary_markers = ("[TLE]", "[PAR]", "[DOC]")

    # Index 0 is the header. In debug mode stop after 100 data records so the
    # rest of the (potentially large) file is never decoded.
    stop = 101 if debug else None
    with gzip.open(input_file, 'rt', encoding='utf-8') as f:
        unproc_data = list(islice(json_lines.reader(f), 1, stop))

    ###################### Make Examples ######################
    examples = []
    for item in unproc_data:
        context = item["context"]
        if any(tag in context for tag in skip_tags):
            continue

        # 1. Get Context
        paragraph_text = context
        for marker in boundary_markers:
            paragraph_text = paragraph_text.replace(marker, "[SEP]")

        # Split on whitespace while recording, for every character, the index
        # of the token it belongs to (whitespace maps to the preceding token,
        # or -1 when it precedes the first token).
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if c in whitespace_chars:
                prev_is_whitespace = True
            elif prev_is_whitespace:
                doc_tokens.append(c)
                prev_is_whitespace = False
            else:
                doc_tokens[-1] += c
            char_to_word_offset.append(len(doc_tokens) - 1)

        # 2. qas
        for qa in item['qas']:
            qas_id = qa['qid']
            question_text = qa['question']
            # Only take the first answer (and its first character span).
            answer = qa['detected_answers'][0]
            orig_answer_text = answer['text']
            answer_offset = answer['char_spans'][0][0]
            answer_last_char = answer_offset + len(orig_answer_text) - 1
            # Both lookups are guarded: a span starting past the end of the
            # context is skipped just like one ending past it, rather than
            # aborting the whole run with an uncaught IndexError.
            try:
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_last_char]
            except IndexError:
                print("invalid answer span. Exclude this example")
                continue

            # Only add answers where the text can be exactly recovered from
            # the document. If this CAN'T happen it's likely due to weird
            # Unicode stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue

            examples.append(SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position))
    return examples
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment