Skip to content

Instantly share code, notes, and snippets.

@adamori
Last active March 2, 2023 11:22
Show Gist options
  • Save adamori/448884fe8f31da59285b7515b97bcecf to your computer and use it in GitHub Desktop.
Save adamori/448884fe8f31da59285b7515b97bcecf to your computer and use it in GitHub Desktop.
import datetime
import os
import pandas as pd
import ijson
# Interactive script: pick a JSON dump of companies from the current
# directory, ask for a registration-date window, stream the file with ijson
# and export the matching companies to company_list.csv.

# Date format used both for user input and for the
# 'esmaregistreerimise_kpv' values found in the JSON file.
DATE_FORMAT = '%d.%m.%Y'
# Default window: the last 90 days up to today.
DEFAULT_LOOKBACK_DAYS = 90
# Stop scanning once more than this many companies have matched ...
MAX_COMPANIES = 25000
# ... checking that limit only once per this many processed companies.
CHECK_EVERY = 2500
# Value of 'oiguslik_vorm' (legal form) that is always left out of the export.
EXCLUDED_LEGAL_FORM = 'KÜ'
DEFAULT_CHOICE = 1


def choose_json_file(json_files):
    """Print a numbered menu of *json_files* and return the chosen name.

    An empty answer selects the first file. The prompt repeats until the
    user enters a number between 1 and ``len(json_files)``.
    """
    print("Choose a JSON file to read:")
    for number, name in enumerate(json_files, start=1):
        print(f"{number}: {name}")

    while True:
        answer = input("Enter the number of the file you want to read (default: 1): ")
        if answer == "":
            return json_files[DEFAULT_CHOICE - 1]
        try:
            choice = int(answer)
        except ValueError:
            choice = 0  # not a number: falls through to the range check
        if 1 <= choice <= len(json_files):
            return json_files[choice - 1]
        print("Invalid choice. Please enter a valid number.")


def ask_date(label, default):
    """Prompt for a DD.MM.YYYY date and return it as a ``datetime``.

    *label* is the word inserted into the prompt ("start" or "end").
    An empty answer returns *default*; the prompt repeats until the input
    parses with ``DATE_FORMAT``.
    """
    shown_default = default.strftime(DATE_FORMAT)
    while True:
        answer = input(f"Enter {label} date (default is {shown_default}): ")
        if answer == "":
            return default
        try:
            return datetime.datetime.strptime(answer, DATE_FORMAT)
        except ValueError:
            # The message names the format that is actually accepted.
            print("Error: Incorrect date format. Please enter a date in the format DD.MM.YYYY.")


def _parse_date(value):
    """Return *value* parsed with ``DATE_FORMAT``, or ``None`` if it is not a valid date string."""
    try:
        return datetime.datetime.strptime(value, DATE_FORMAT)
    except (TypeError, ValueError):
        # TypeError covers JSON null / non-string values.
        return None


def _finalize_company(company):
    """Collapse the collected contact lists of *company* into comma-separated strings."""
    for key in ('sidevahendid.liik', 'sidevahendid.sisu'):
        # str() because numeric contact values are accepted while parsing.
        company[key] = ', '.join(str(item) for item in company.get(key) or [])
    return company


def filter_companies(events, start_date, end_date,
                     max_companies=MAX_COMPANIES, check_every=CHECK_EVERY):
    """Collect the companies registered between *start_date* and *end_date*.

    *events* is an iterable of ``(prefix, event, value)`` tuples as produced
    by ``ijson.parse`` for a top-level JSON array of company objects. A new
    company starts at every ``item.ariregistri_kood`` event.

    A company is kept when its ``esmaregistreerimise_kpv`` parses as a date
    inside the inclusive window and its ``oiguslik_vorm`` is not
    ``EXCLUDED_LEGAL_FORM``, regardless of the order in which those two
    fields appear. Companies with a missing or unparsable date are skipped.

    Every *check_every* processed companies the number of matches is compared
    with *max_companies*; if it is exceeded, a warning is printed and
    scanning stops early.

    Returns a list of dicts with the keys ``ariregistri_kood``, ``nimi``,
    ``esmaregistreerimise_kpv``, ``oiguslik_vorm_tekstina`` (when present in
    the input) and the joined ``sidevahendid.liik`` / ``sidevahendid.sisu``.
    """
    companies = []
    company = {}
    in_range = False   # registration date seen and inside the window
    excluded = False   # legal form is EXCLUDED_LEGAL_FORM
    processed = 0
    since_check = 0
    truncated = False

    for prefix, event, value in events:
        if prefix == 'item.ariregistri_kood':
            # A new registry code closes the previous company record.
            if company.get('ariregistri_kood'):
                if in_range and not excluded:
                    companies.append(_finalize_company(company))
                company = {}
                in_range = False
                excluded = False
                processed += 1
                since_check += 1
            company['ariregistri_kood'] = value
        elif prefix == 'item.nimi':
            company['nimi'] = value
            print(f"Обработано\t{processed}\t\tТекущая компания: {value}\t\t", end='\r')
        elif prefix == 'item.yldandmed.esmaregistreerimise_kpv':
            registered = _parse_date(value)
            if registered is not None and start_date <= registered <= end_date:
                company['esmaregistreerimise_kpv'] = value
                in_range = True
        elif prefix == 'item.yldandmed.oiguslik_vorm' and value == EXCLUDED_LEGAL_FORM:
            # Kept as a separate flag so a later date field cannot undo it.
            excluded = True
        elif prefix == 'item.yldandmed.oiguslik_vorm_tekstina':
            company['oiguslik_vorm_tekstina'] = value
        elif (prefix.startswith('item.yldandmed.sidevahendid.item')
                and event in ('string', 'number')):
            if prefix == 'item.yldandmed.sidevahendid.item.liik':
                company.setdefault('sidevahendid.liik', []).append(value)
            elif prefix == 'item.yldandmed.sidevahendid.item.sisu':
                company.setdefault('sidevahendid.sisu', []).append(value)

        if since_check >= check_every:
            if len(companies) > max_companies:
                print("Внимание! Кол-во найденных компаний превышает 25.000. Уточните поиск")
                truncated = True
                break
            since_check = 0

    # The last company has no following registry code to trigger its flush,
    # so it is handled here (unless scanning stopped early mid-record).
    if not truncated and company.get('ariregistri_kood') and in_range and not excluded:
        companies.append(_finalize_company(company))

    return companies


def main():
    """Run the interactive export: choose file, ask for dates, write the CSV."""
    # Get a list of all JSON files in the current directory
    json_files = [f for f in os.listdir('.') if f.endswith('.json')]
    if not json_files:
        print("No JSON files found in current directory.")
        return

    chosen_file = choose_json_file(json_files)

    # Midnight of today, so that the default window includes the whole of
    # the displayed start day (registration dates carry no time of day).
    today = datetime.datetime.combine(datetime.date.today(), datetime.time.min)
    default_start_date = today - datetime.timedelta(days=DEFAULT_LOOKBACK_DAYS)
    start_date = ask_date("start", default_start_date)
    end_date = ask_date("end", today)

    # Read the chosen file and generate .csv file
    try:
        with open(chosen_file, 'r', encoding='utf8') as file:
            companies = filter_companies(ijson.parse(file), start_date, end_date)
        print(f"Кол-во компаний соответствующих критериям: {len(companies)}")
        print("Создание файла, подождите...")
        df = pd.DataFrame(companies)
        df.to_csv('company_list.csv', index=False)
        print("Успешно")
    except Exception as e:
        # Top-level boundary of an interactive script: report and stop.
        print(f"Error reading file: {chosen_file}")
        print(e)


if __name__ == "__main__":
    main()
@adamori
Copy link
Author

adamori commented Mar 1, 2023

Как использовать

Инструкция на Youtube

Первый запуск

  1. Скачайте python
  2. Откройте терминал (командную строку): нажмите Win+R и введите cmd или найдите «Командная строка» через поиск в меню «Пуск».
  3. Установите необходимые библиотеки, введя в командной строке команду:
pip install pandas ijson
  4. Скачайте скрипт и скопируйте его в папку с JSON-файлами (базой данных компаний).

Использование

  1. Запустите скрипт из командной строки, введя команду python filter_from_large_json_file.py
  2. Либо просто запустите скрипт двойным щелчком мыши.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment