@previtus
Last active April 15, 2018 00:06
Load all JSONs and process the ones with selected keywords into txt files
import os, fnmatch, random
import json
from pprint import pprint
def load(path, keywords_find, keywords_exclude):
    # collect and shuffle all JSON files in the folder
    files = sorted(os.listdir(path))
    #print("files", len(files), files[0:10])
    files = fnmatch.filter(files, '*.json')
    random.shuffle(files)
    print("jsons:", files[0:2], "...")

    key_files = []
    key_texts = []
    key_title = []

    #subfiles = files[0:100]
    subfiles = files
    for file in subfiles:
        with open(os.path.join(path, file)) as f:
            data = json.load(f)
        text = data["body"]
        title = data["title"]

        # optional, limit length of the article - only keep those with len(text) > THR
        if text == '':
            continue
        if len(text) < 400:
            # 80 chars is roughly one sentence, let's say we want 3 or more
            continue

        keywords = data["features"]["content"]["keywords"]
        #pprint(keywords)
        #pprint(data)
        keywords = [key["keyword"] for key in keywords]

        # keep the article if it has at least one keyword from keywords_find
        # (or if keywords_find is empty) and none from keywords_exclude
        contains = False
        if len(keywords_find) == 0:
            contains = True
        else:
            for key in keywords_find:
                if key in keywords:
                    contains = True
                    break
        if len(keywords_exclude) > 0:
            for key in keywords_exclude:
                if key in keywords:
                    contains = False
                    break

        if contains:
            key_files.append(file)
            key_texts.append(text)
            key_title.append(title)
            #print(keywords)

    print("Found", len(key_files), "articles with those key(s):", keywords_find)
    return key_files, key_texts, key_title
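
# For reference, load() above assumes each article JSON roughly follows the layout
# below. This is an assumption reconstructed from the keys accessed in the function,
# not a documented schema:
#
# {
#   "title": "Some headline",
#   "body": "Full article text ...",
#   "features": {
#     "content": {
#       "keywords": [{"keyword": "korea"}, {"keyword": "politics"}]
#     }
#   }
# }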
def join_into_file(key_texts, key_title, file_texts, file_titles):
    # join texts into one string
    joined_texts = "\n".join(key_texts)
    joined_titles = "\n".join(key_title)
    print(joined_titles[0:500])
    save_to_txt_file(joined_texts, file_texts)
    save_to_txt_file(joined_titles, file_titles)
def save_to_txt_file(text, file):
    with open(file, "w") as text_file:
        text_file.write(text)
# set path to your folder
path = "./artml_fake-news/"
# filters for articles that contain at least ONE word from keywords_find and NONE of the words from keywords_exclude
keywords_find = ['bunch', 'of', 'key', 'words']
keywords_exclude = []
key_files, key_texts, key_title = load(path, keywords_find, keywords_exclude)
file_texts = "korea_texts.txt"
file_titles = "korea_titles.txt"
print("\nSaving them into files separated by newline\nSample:")
join_into_file(key_texts, key_title, file_texts, file_titles)
#key_files, key_texts, key_title = load(path, [], [])
#join_into_file(key_texts, key_title, "all_texts.txt", "all_titles.txt")
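
# Quick sanity check, kept commented out like the alternative run above. This is a
# hedged sketch based on the assumed JSON layout described after load(), not part of
# the original pipeline - uncomment to try it on a single generated article.
#sample = {"title": "Sample headline",
#          "body": "x" * 500,
#          "features": {"content": {"keywords": [{"keyword": "korea"}]}}}
#with open(os.path.join(path, "sample.json"), "w") as f:
#    json.dump(sample, f)
#load(path, ["korea"], [])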