Load all JSONs and process the ones with selected keywords into txt files
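# Note: the structure below is inferred from the fields this script reads; it is
# an assumption about the article dumps, not their full schema. Each *.json file
# is expected to look roughly like:
#   {
#     "title": "Example headline",
#     "body": "Full article text ...",
#     "features": {"content": {"keywords": [{"keyword": "example"}, ...]}}
#   }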
import os, fnmatch, random
import json
from pprint import pprint

def load(path, keywords_find, keywords_exclude):
    # Load all article JSONs in path and keep those that pass the keyword filters
    files = sorted(os.listdir(path))
    #print("files", len(files), files[0:10])
    files = fnmatch.filter(files, '*.json')
    random.shuffle(files)
    print("jsons:", files[0:2], "...")

    key_files = []
    key_texts = []
    key_title = []

    #subfiles = files[0:100]
    subfiles = files

    for file in subfiles:
        with open(path + file) as f:
            data = json.load(f)
        text = data["body"]
        title = data["title"]

        # optional: limit the length of the article - keep only those with len(text) > THR
        if text == '':
            continue
        if len(text) < 400:
            # 80 characters is roughly one sentence; let's say we want 3 and more
            continue

        keywords = data["features"]["content"]["keywords"]
        #pprint(keywords)
        #pprint(data)
        keywords = [key["keyword"] for key in keywords]

        # keep the article if it has at least one keyword from keywords_find
        # (or if keywords_find is empty) and none from keywords_exclude
        contains = False
        if len(keywords_find) == 0:
            contains = True
        else:
            for key in keywords_find:
                if key in keywords:
                    contains = True
                    break
        if len(keywords_exclude) > 0:
            for key in keywords_exclude:
                if key in keywords:
                    contains = False
                    break

        if contains:
            key_files.append(file)
            key_texts.append(text)
            key_title.append(title)
            #print(keywords)

    print("Found", len(key_files), "articles with those key(s):", keywords_find)
    return key_files, key_texts, key_title
def join_into_file(key_texts, key_title, file_texts, file_titles):
    # join texts into one string, one entry per line
    joined_texts = "\n".join(key_texts)
    joined_titles = "\n".join(key_title)
    print(joined_titles[0:500])

    save_to_txt_file(joined_texts, file_texts)
    save_to_txt_file(joined_titles, file_titles)

def save_to_txt_file(text, file):
    with open(file, "w") as text_file:
        text_file.write(text)
# set path to your folder with the article JSON dumps
path = "./artml_fake-news/"

# keep articles that contain at least ONE keyword from keywords_find and NONE of the keywords from keywords_exclude
keywords_find = ['bunch', 'of', 'key', 'words']
keywords_exclude = []
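# Example with hypothetical keywords: to keep any article tagged 'north korea'
# or 'missile' while dropping anything also tagged 'sport', one could set:
#   keywords_find = ['north korea', 'missile']
#   keywords_exclude = ['sport']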
key_files, key_texts, key_title = load(path, keywords_find, keywords_exclude)

file_texts = "korea_texts.txt"
file_titles = "korea_titles.txt"

print("\nSaving them into files separated by newlines\nSample:")
join_into_file(key_texts, key_title, file_texts, file_titles)

#key_files, key_texts, key_title = load(path, [], [])
#join_into_file(key_texts, key_title, "all_texts.txt", "all_titles.txt")
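# Optional smoke test (a sketch with hypothetical paths and keywords, not part of
# the original pipeline): it writes two tiny article JSONs in the structure assumed
# above and runs load() + join_into_file() on them. Flip RUN_SMOKE_TEST to True
# to try it; it only needs write access to ./_smoke_test/.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    test_path = "./_smoke_test/"
    os.makedirs(test_path, exist_ok=True)
    body = ("Example sentence repeated a few times so the article passes the "
            "400 character minimum enforced in load(). ") * 8
    for i, kw in enumerate(["north korea", "sport"]):
        article = {"title": "Sample %d" % i,
                   "body": body,
                   "features": {"content": {"keywords": [{"keyword": kw}]}}}
        with open(test_path + "sample_%d.json" % i, "w") as f:
            json.dump(article, f)
    # expects to keep exactly the one article tagged 'north korea'
    t_files, t_texts, t_titles = load(test_path, ["north korea"], ["sport"])
    join_into_file(t_texts, t_titles, test_path + "texts.txt", test_path + "titles.txt")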