Last active
June 11, 2023 13:30
-
-
Save vgaraujov/fd17b0c151657fbce73189a98617f1c6 to your computer and use it in GitHub Desktop.
Function to convert a SQuAD dataset from JSON format to a pandas DataFrame. Used in this tutorial: https://github.com/vgaraujov/Question-Answering-Tutorial/blob/master/Question_Answering_BERT_Spanish.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Code forked from https://www.kaggle.com/jagannathpatta/reading-json-data-getting-dataframe | |
| import pandas as pd | |
| import json | |
def json_to_dataframe(file) -> pd.DataFrame:
    """Flatten a SQuAD-style JSON file into a DataFrame with one row per answer.

    The JSON root is expected to hold a ``'data'`` list. Each entry has a
    ``'title'`` and a ``'paragraphs'`` list; each paragraph has a ``'context'``
    and a ``'qas'`` list; each qa has ``'question'``, ``'id'`` and an
    ``'answers'`` list whose items carry ``'answer_start'`` and ``'text'``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A DataFrame with columns ``Id``, ``title``, ``context``, ``question``,
        ``ans_start`` and ``text``. Fully identical rows are collapsed to their
        first occurrence; the surviving rows keep their original positional
        labels, so the index may contain gaps. Questions whose ``'answers'``
        list is empty contribute no rows.

    Raises:
        KeyError: If one of the expected keys is missing from the JSON.
    """
    # Use a context manager so the handle is always closed, and read as UTF-8
    # (the encoding JSON is specified to use) instead of the platform default,
    # which can fail on non-ASCII text.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    columns = ['Id', 'title', 'context', 'question', 'ans_start', 'text']
    rows = []
    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                qa_id = qa['id']
                # One output row per answer; a qa without answers yields none.
                for answer in qa['answers']:
                    rows.append(
                        (qa_id, title, context, question,
                         answer['answer_start'], answer['text'])
                    )

    new_df = pd.DataFrame(rows, columns=columns)
    # The same answer can be listed several times for one question; keep one.
    return new_df.drop_duplicates(keep='first')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment