Created
October 26, 2021 11:09
-
-
Save temmyzeus/aa3c2c1dc2f96a798e2bf2d88f098927 to your computer and use it in GitHub Desktop.
Convert the News Category Dataset (used for news classification) from JSON to CSV format => https://www.kaggle.com/rmisra/news-category-dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Convert Data from .json to .csv easily readable by Pandas""" | |
import ast
import json
import os
import sys
from pathlib import Path
from typing import Dict, Iterable, List

import pandas as pd
# Input and output locations, relative to the directory containing this script.
SOURCE_JSON = Path('../data/News_Category_Dataset_v2.json')
TARGET_CSV = Path('../data/News Category.csv')

# Maps each output CSV column to the key it is read from in every JSON record.
COLUMN_TO_KEY: Dict[str, str] = {
    'categories': 'category',
    'headlines': 'headline',
    'authors': 'authors',
    'links': 'link',
    'short_descriptions': 'short_description',
    'dates': 'date',
}

# Placeholder stored when a record does not contain one of the expected keys.
MISSING_VALUE = 'Null'


def build_columns(lines: Iterable[str]) -> Dict[str, List]:
    """Collect the fields of line-delimited JSON records into column lists.

    Args:
        lines: Iterable of strings, each holding one JSON object. Blank
            lines are skipped.

    Returns:
        A dict with one list per key of ``COLUMN_TO_KEY`` (in that order).
        Each list holds that field's value for every record, or
        ``MISSING_VALUE`` when the record has no such key.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    columns: Dict[str, List] = {column: [] for column in COLUMN_TO_KEY}
    for line in lines:
        if not line.strip():
            continue
        # json.loads (rather than ast.literal_eval) understands null/true/false
        # and joins \uXXXX surrogate pairs into real characters, so the values
        # can be encoded when the CSV is written.
        record = json.loads(line)
        for column, key in COLUMN_TO_KEY.items():
            columns[column].append(record.get(key, MISSING_VALUE))
    return columns


def main() -> None:
    """Read the JSON dataset next to this script and write it out as CSV."""
    # Work from the script's own directory so the relative data paths resolve
    # regardless of where the script is launched from. Resolving first avoids
    # os.chdir('') when the script is started as `python script.py`.
    script_dir = Path(sys.argv[0]).resolve().parent
    os.chdir(script_dir)

    # Stream the file line by line instead of loading it all with readlines().
    with open(SOURCE_JSON, mode='r', encoding='utf-8') as f:
        data_dict = build_columns(f)

    df = pd.DataFrame(data_dict)
    # The default strict error handling is kept; errors=None meant the same.
    df.to_csv(TARGET_CSV, index=False)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment