Skip to content

Instantly share code, notes, and snippets.

@vgaraujov
Last active June 11, 2023 13:30
Show Gist options
  • Save vgaraujov/fd17b0c151657fbce73189a98617f1c6 to your computer and use it in GitHub Desktop.
Save vgaraujov/fd17b0c151657fbce73189a98617f1c6 to your computer and use it in GitHub Desktop.
Function to convert a SQuAD dataset from JSON format to a pandas DataFrame. Used in this tutorial: https://github.com/vgaraujov/Question-Answering-Tutorial/blob/master/Question_Answering_BERT_Spanish.ipynb
# Code forked from https://www.kaggle.com/jagannathpatta/reading-json-data-getting-dataframe
import pandas as pd
import json
def json_to_dataframe(file):
    """Flatten a SQuAD-style JSON file into a pandas DataFrame.

    The file is expected to hold a top-level ``'data'`` list. Each entry has
    a ``'title'`` and a ``'paragraphs'`` list; each paragraph has a
    ``'context'`` and a ``'qas'`` list; each qa has a ``'question'``, an
    ``'id'`` and an ``'answers'`` list whose items carry ``'answer_start'``
    and ``'text'``.

    One row is produced per answer, so a question whose ``'answers'`` list
    is empty contributes no rows. Fully identical rows are then collapsed to
    their first occurrence; the surviving rows keep their original index
    labels (the index is not reset).

    Args:
        file: Path of the JSON file to read.

    Returns:
        A DataFrame with the columns ``Id``, ``title``, ``context``,
        ``question``, ``ans_start`` and ``text``.

    Raises:
        KeyError: If any of the keys listed above is missing.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly (the JSON interchange encoding) instead of the
    # platform default, and let the context manager close the handle.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    columns = ['Id', 'title', 'context', 'question', 'ans_start', 'text']
    rows = []

    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                qa_id = qa['id']
                for answer in qa['answers']:
                    # Tuple order must match ``columns``.
                    rows.append((
                        qa_id,
                        title,
                        context,
                        question,
                        answer['answer_start'],
                        answer['text'],
                    ))

    new_df = pd.DataFrame(rows, columns=columns)

    # The same answer may be listed several times for one question; keep
    # only the first copy of each identical row.
    final_df = new_df.drop_duplicates(keep='first')
    return final_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment