Last active
March 2, 2023 11:22
-
-
Save adamori/448884fe8f31da59285b7515b97bcecf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import os
import sys

import ijson
import pandas as pd
# Collect the JSON files in the current directory.
# - Sorted, because os.listdir() returns entries in arbitrary order and the
#   menu (and its "default: 1" entry) should be reproducible between runs.
# - Case-insensitive extension match so "DATA.JSON" is found as well.
# - Directories that merely end in ".json" are skipped; they cannot be opened.
json_files = sorted(
    name
    for name in os.listdir('.')
    if name.lower().endswith('.json') and os.path.isfile(name)
)

# Nothing to process: report it and stop with a non-zero exit status.
# sys.exit() is used instead of the interactive-only site helper exit().
if not json_files:
    print("No JSON files found in current directory.")
    sys.exit(1)
# Show a 1-based menu of the discovered JSON files.
print("Choose a JSON file to read:")
for position, file_name in enumerate(json_files, start=1):
    print(f"{position}: {file_name}")

default_choice = 1

# Keep asking until `choice` holds a valid 1-based menu index.
# An empty answer selects the default entry.
choice = None
while choice is None:
    answer = input("Enter the number of the file you want to read (default: 1): ")
    if answer == "":
        choice = default_choice
        continue
    try:
        candidate = int(answer)
    except ValueError:
        candidate = None
    if candidate is not None and 1 <= candidate <= len(json_files):
        choice = candidate
    else:
        print("Invalid choice. Please enter a valid number.")
# Dates are typed by the user and shown in prompts as DD.MM.YYYY.
DATE_FORMAT = "%d.%m.%Y"


def _ask_date(label, default):
    """Prompt for a date until the answer parses; empty input picks *default*.

    label: word inserted into the prompt text ("start" or "end").
    default: datetime returned when the user just presses Enter.
    Returns the parsed datetime (midnight of the entered day) or *default*.
    """
    shown_default = default.strftime(DATE_FORMAT)
    while True:
        answer = input(f"Enter {label} date (default is {shown_default}): ").strip()
        if answer == "":
            return default
        try:
            return datetime.datetime.strptime(answer, DATE_FORMAT)
        except ValueError:
            # The hint must describe the format that is actually parsed above.
            print("Error: Incorrect date format. Please enter a date in the format DD.MM.YYYY.")


# Today's date, truncated to midnight. Typed dates parse to midnight too, so
# the defaults must not carry a time of day: otherwise a registration dated
# exactly on the displayed default start date would compare as "too early".
today = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

# Default start date: 90 days before today
default_start_date = today - datetime.timedelta(days=90)

# Ask the user for the inclusive date range
start_date = _ask_date("start", default_start_date)
end_date = _ask_date("end", today)
# Stream the chosen JSON file, keep the companies whose first registration
# date lies within [start_date, end_date] and whose legal form is not 'KÜ',
# then write them to company_list.csv.
chosen_file = json_files[choice - 1]
companies = []


def _store_if_matching(company, matches, collected):
    """Append *company* to *collected* when *matches* is true.

    The contact lists gathered while streaming are flattened into
    comma-separated strings first. Values are passed through str() because
    the parser may deliver numbers, which str.join() would reject.
    """
    if not matches:
        return
    for key in ('sidevahendid.liik', 'sidevahendid.sisu'):
        company[key] = ', '.join(str(item) for item in company.get(key) or [])
    collected.append(company)


try:
    # ijson is given a binary stream and does the decoding itself.
    with open(chosen_file, 'rb') as file:
        company = {}
        # Filter state of the company currently being read. The two flags are
        # tracked separately so the result does not depend on whether the
        # registration date or the legal form appears first in the record.
        in_range = False
        excluded = False
        i = 0  # number of completed company records (shown in the progress line)
        check_index = 0  # records completed since the last size check
        check_size_after = 2500
        limit_exceeded = False
        for prefix, event, value in ijson.parse(file):
            if prefix == 'item.ariregistri_kood':
                # A new registry code starts the next record, so the previous
                # record (if one was started) is complete at this point.
                if company.get('ariregistri_kood'):
                    _store_if_matching(company, in_range and not excluded, companies)
                    company = {}
                    in_range = False
                    excluded = False
                    i += 1
                    check_index += 1
                    # Only look at the result size every `check_size_after` records.
                    if check_index >= check_size_after:
                        check_index = 0
                        if len(companies) > 25000:
                            print("Внимание! Кол-во найденных компаний превышает 25.000. Уточните поиск")
                            limit_exceeded = True
                            break
                company['ariregistri_kood'] = value
            elif prefix == 'item.nimi':
                company['nimi'] = value
                print(f"Обработано\t{i}\t\tТекущая компания: {value}\t\t", end='\r')
            elif prefix == 'item.yldandmed.esmaregistreerimise_kpv':
                try:
                    registered = datetime.datetime.strptime(value, '%d.%m.%Y')
                except (TypeError, ValueError):
                    # Missing (null) or malformed date: the company cannot
                    # match the range, but the rest of the file is still usable.
                    continue
                if start_date <= registered <= end_date:
                    company['esmaregistreerimise_kpv'] = value
                    in_range = True
            elif prefix == 'item.yldandmed.oiguslik_vorm':
                if value == 'KÜ':
                    excluded = True
            elif prefix == 'item.yldandmed.oiguslik_vorm_tekstina':
                company['oiguslik_vorm_tekstina'] = value
            elif event in ('string', 'number'):
                # Contact entries: collect kind ('liik') and content ('sisu').
                if prefix == 'item.yldandmed.sidevahendid.item.liik':
                    company.setdefault('sidevahendid.liik', []).append(value)
                elif prefix == 'item.yldandmed.sidevahendid.item.sisu':
                    company.setdefault('sidevahendid.sisu', []).append(value)

        # The last record in the file has no following registry code to
        # trigger its flush, so store it here. Skipped after an early stop,
        # because the record being read at that point is incomplete.
        if not limit_exceeded and company.get('ariregistri_kood'):
            _store_if_matching(company, in_range and not excluded, companies)

    print(f"Кол-во компаний соответствующих критериям: {len(companies)}")
    print("Создание файла, подождите...")
    df = pd.DataFrame(companies)
    df.to_csv('company_list.csv', index=False)
    print("Успешно")
except Exception as e:
    # Top-level boundary of the script: report the problem instead of a traceback.
    print(f"Error reading file: {chosen_file}")
    print(e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Как использовать
Инструкция на Youtube
Первый запуск
cmd
или открыть командную строку через меню поиска в Пуск
Использование
python filter_from_large_json_file.py