seanie12 · August 23, 2019 06:14
diff --git a/download_dev.sh b/download_dev.sh
 #! /bin/bash
 #
 # Download the MRQA v2 dev-split datasets as gzipped JSON-lines files.
 #
 # Usage: download_dev.sh OUTPUT_DIR
 #   OUTPUT_DIR  directory the *.jsonl.gz files are written to (created if missing)

 set -e

 # Abort with a usage hint when no directory is given, instead of handing an
 # empty path to mkdir/wget.
 OUTPUT=${1:?usage: $0 OUTPUT_DIR}

 mkdir -p -- "$OUTPUT"

 BASE_URL=https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev

 # Each entry is "remote-name:local-name"; TriviaQA and NaturalQuestions are
 # saved under a shorter local name than the one used on the server.
 for pair in \
   SQuAD:SQuAD \
   NewsQA:NewsQA \
   TriviaQA-web:TriviaQA \
   HotpotQA:HotpotQA \
   NaturalQuestionsShort:NaturalQuestions
 do
   remote=${pair%%:*}
   local_name=${pair##*:}
   # Quoted so an OUTPUT path containing spaces or glob characters stays intact.
   wget "$BASE_URL/$remote.jsonl.gz" -O "$OUTPUT/$local_name.jsonl.gz"
 done
diff --git a/download_train.sh b/download_train.sh
 #! /bin/bash
 #
 # Download the MRQA v2 train-split datasets as gzipped JSON-lines files.
 #
 # Usage: download_train.sh OUTPUT_DIR
 #   OUTPUT_DIR  directory the *.jsonl.gz files are written to (created if missing)

 set -e

 # Abort with a usage hint when no directory is given, instead of handing an
 # empty path to mkdir/wget.
 OUTPUT=${1:?usage: $0 OUTPUT_DIR}

 mkdir -p -- "$OUTPUT"

 BASE_URL=https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train

 # Each entry is "remote-name:local-name"; TriviaQA and NaturalQuestions are
 # saved under a shorter local name than the one used on the server.
 for pair in \
   SQuAD:SQuAD \
   NewsQA:NewsQA \
   TriviaQA-web:TriviaQA \
   HotpotQA:HotpotQA \
   NaturalQuestionsShort:NaturalQuestions
 do
   remote=${pair%%:*}
   local_name=${pair##*:}
   # Quoted so an OUTPUT path containing spaces or glob characters stays intact.
   wget "$BASE_URL/$remote.jsonl.gz" -O "$OUTPUT/$local_name.jsonl.gz"
 done
diff --git a/read_examples.py b/read_examples.py
 def read_examples(input_file, debug=False):
    """Build SquadExample objects from a gzipped JSON-lines file.

    Each record after the first is expected to carry a "context" string and a
    "qas" list whose entries have "qid", "question" and "detected_answers"
    (each answer with "text" and "char_spans").

    Args:
        input_file: path to the gzip-compressed JSON-lines file. The first
            record is treated as a header and dropped.
        debug: when True, only the first 100 records after the header are used.

    Returns:
        A list of SquadExample, one per question whose first detected answer
        maps onto the whitespace tokens of its context. Records whose context
        contains a table/list tag, answers with an out-of-range span, and
        answers whose text cannot be recovered from the tokens are skipped.
    """
    # Read data; the file is opened in text mode ('rt') and decoded as UTF-8.
    with gzip.open(input_file, 'rt', encoding='utf-8') as f:
        unproc_data = list(json_lines.reader(f))

    # Characters that separate tokens; U+202F is the narrow no-break space.
    whitespace_chars = frozenset(" \t\r\n\u202f")

    def is_whitespace(c):
        return c in whitespace_chars

    # Delete header
    unproc_data = unproc_data[1:]
    if debug:
        unproc_data = unproc_data[:100]

    ###################### Make Examples ######################
    examples = []
    skip_tags = ['<Table>', '<Tr>', '<Td>', '<Ol>', '<Ul>', '<Li>']
    for item in unproc_data:
        # in case of NQ dataset, context containing tags is excluded
        context = item["context"]
        if any(tag in context for tag in skip_tags):
            continue

        # 1. Get Context: every section marker becomes a [SEP] token.
        paragraph_text = context
        for marker in ("[TLE]", "[PAR]", "[DOC]"):
            paragraph_text = paragraph_text.replace(marker, "[SEP]")

        # Split on whitespace while recording, for every character, the index
        # of the token it belongs to (-1 for whitespace before the first token).
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # 2. qas
        for qa in item['qas']:
            qas_id = qa['qid']
            question_text = qa['question']

            # Only take the first answer
            answer = qa['detected_answers'][0]
            orig_answer_text = answer['text']

            answer_offset = answer['char_spans'][0][0]
            answer_length = len(orig_answer_text)
            # Both lookups are guarded: a start offset past the end of the
            # context is just as invalid as an end offset past it.
            try:
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
            except IndexError:
                print("invalid answer span. Exclude this example")
                continue
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position)
            examples.append(example)

    return examples
	#! /bin/bash
	#
	# Download the MRQA v2 dev-split datasets as gzipped JSON-lines files.
	#
	# Usage: download_dev.sh OUTPUT_DIR
	#   OUTPUT_DIR  directory the *.jsonl.gz files are written to (created if missing)

	set -e

	# Abort with a usage hint when no directory is given, instead of handing an
	# empty path to mkdir/wget.
	OUTPUT=${1:?usage: $0 OUTPUT_DIR}

	mkdir -p -- "$OUTPUT"

	BASE_URL=https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev

	# Each entry is "remote-name:local-name"; TriviaQA and NaturalQuestions are
	# saved under a shorter local name than the one used on the server.
	for pair in \
	  SQuAD:SQuAD \
	  NewsQA:NewsQA \
	  TriviaQA-web:TriviaQA \
	  HotpotQA:HotpotQA \
	  NaturalQuestionsShort:NaturalQuestions
	do
	  remote=${pair%%:*}
	  local_name=${pair##*:}
	  # Quoted so an OUTPUT path containing spaces or glob characters stays intact.
	  wget "$BASE_URL/$remote.jsonl.gz" -O "$OUTPUT/$local_name.jsonl.gz"
	done
	def read_examples(input_file, debug=False):
	    """Build SquadExample objects from a gzipped JSON-lines file.

	    Each record after the first is expected to carry a "context" string and
	    a "qas" list whose entries have "qid", "question" and
	    "detected_answers" (each answer with "text" and "char_spans").

	    Args:
	        input_file: path to the gzip-compressed JSON-lines file. The first
	            record is treated as a header and dropped.
	        debug: when True, only the first 100 records after the header are
	            used.

	    Returns:
	        A list of SquadExample, one per question whose first detected
	        answer maps onto the whitespace tokens of its context. Records
	        whose context contains a table/list tag, answers with an
	        out-of-range span, and answers whose text cannot be recovered from
	        the tokens are skipped.
	    """
	    # Read data; the file is opened in text mode ('rt') and decoded as UTF-8.
	    with gzip.open(input_file, 'rt', encoding='utf-8') as f:
	        unproc_data = list(json_lines.reader(f))

	    # Characters that separate tokens; U+202F is the narrow no-break space.
	    whitespace_chars = frozenset(" \t\r\n\u202f")

	    def is_whitespace(c):
	        return c in whitespace_chars

	    # Delete header
	    unproc_data = unproc_data[1:]
	    if debug:
	        unproc_data = unproc_data[:100]

	    ###################### Make Examples ######################
	    examples = []
	    skip_tags = ['<Table>', '<Tr>', '<Td>', '<Ol>', '<Ul>', '<Li>']
	    for item in unproc_data:
	        # in case of NQ dataset, context containing tags is excluded
	        context = item["context"]
	        if any(tag in context for tag in skip_tags):
	            continue

	        # 1. Get Context: every section marker becomes a [SEP] token.
	        paragraph_text = context
	        for marker in ("[TLE]", "[PAR]", "[DOC]"):
	            paragraph_text = paragraph_text.replace(marker, "[SEP]")

	        # Split on whitespace while recording, for every character, the
	        # index of the token it belongs to (-1 for whitespace before the
	        # first token).
	        doc_tokens = []
	        char_to_word_offset = []
	        prev_is_whitespace = True
	        for c in paragraph_text:
	            if is_whitespace(c):
	                prev_is_whitespace = True
	            else:
	                if prev_is_whitespace:
	                    doc_tokens.append(c)
	                else:
	                    doc_tokens[-1] += c
	                prev_is_whitespace = False
	            char_to_word_offset.append(len(doc_tokens) - 1)

	        # 2. qas
	        for qa in item['qas']:
	            qas_id = qa['qid']
	            question_text = qa['question']

	            # Only take the first answer
	            answer = qa['detected_answers'][0]
	            orig_answer_text = answer['text']

	            answer_offset = answer['char_spans'][0][0]
	            answer_length = len(orig_answer_text)
	            # Both lookups are guarded: a start offset past the end of the
	            # context is just as invalid as an end offset past it.
	            try:
	                start_position = char_to_word_offset[answer_offset]
	                end_position = char_to_word_offset[answer_offset + answer_length - 1]
	            except IndexError:
	                print("invalid answer span. Exclude this example")
	                continue
	            # Only add answers where the text can be exactly recovered from the
	            # document. If this CAN'T happen it's likely due to weird Unicode
	            # stuff so we will just skip the example.
	            #
	            # Note that this means for training mode, every example is NOT
	            # guaranteed to be preserved.
	            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
	            cleaned_answer_text = " ".join(
	                whitespace_tokenize(orig_answer_text))
	            if actual_text.find(cleaned_answer_text) == -1:
	                continue

	            example = SquadExample(
	                qas_id=qas_id,
	                question_text=question_text,
	                doc_tokens=doc_tokens,
	                orig_answer_text=orig_answer_text,
	                start_position=start_position,
	                end_position=end_position)
	            examples.append(example)

	    return examples