Last active
March 19, 2019 21:50
-
-
Save dmitryhd/76f5f0eaff7da58a55f4cfb00e1378b1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import json | |
import tqdm | |
import pandas as pd | |
class LabelEncoder:
    """Two-way mapping between hashable values and dense integer ids.

    Ids are handed out in first-seen order, starting at 0. ``name`` is an
    optional label that only shows up in ``repr`` output.
    """

    def __init__(self, name=None):
        self.name = name
        self._values2index = {}
        self._index2value = {}
        self._next_index = 0

    def encode(self, value) -> int:
        """Return the id of *value*, registering it first if it is unseen."""
        try:
            return self._values2index[value]
        except KeyError:
            pass
        # First occurrence: record the value in both directions.
        assigned = self._next_index
        self._values2index[value] = assigned
        self._index2value[assigned] = value
        self._next_index = assigned + 1
        return assigned

    def decode(self, index: int):
        """Return the value stored under *index*, or None if it is unknown."""
        return self._index2value.get(index)

    def encode_batch(self, values: list) -> list:
        """Encode each item of *values* in order and return the ids."""
        return list(map(self.encode, values))

    def decode_batch(self, indices: list) -> list:
        """Decode each id of *indices* in order (None for unknown ids)."""
        return list(map(self.decode, indices))

    def __len__(self):
        """Number of distinct values registered so far."""
        return len(self._values2index)

    def value_byte_size(self) -> int:
        """Sum of ``sys.getsizeof`` of every stored value, each reduced by
        the size of an empty string."""
        empty_str_size = sys.getsizeof('')
        total = 0
        for stored in self._values2index:
            total += sys.getsizeof(stored) - empty_str_size
        return total

    def __repr__(self):
        n_entries = len(self)
        byte_size = self.value_byte_size()
        return f'<LabelEncoder(name={self.name}, n_entries={n_entries}, value_byte_size={byte_size})>'
def test_label_encoder():
    """Check LabelEncoder on a small batch that contains one duplicate."""
    values = ['a', 'b', 'c', 'd', 'aaaaa', 'aaaaa']
    encoder = LabelEncoder()
    indices = encoder.encode_batch(values)
    print(encoder)
    # Ids follow first-seen order and the repeated value reuses its id.
    assert indices == [0, 1, 2, 3, 4, 4]
    assert encoder.decode_batch([0, 1, 2, 3, 0]) == ['a', 'b', 'c', 'd', 'a']
    assert len(encoder) == 5
    # getsizeof results are interpreter-specific, so only check the sign.
    assert encoder.value_byte_size() > 0
class AddressEncoder:
    """Compress address dicts into compact ``"<name_id>:<value_id>,..."`` ids.

    One LabelEncoder maps component names to ids; a separate LabelEncoder
    per component name (stored in ``component_encoders``) maps that
    component's values to ids.
    """

    def __init__(self):
        self.component_encoders = {}
        self.component_name_encoder = LabelEncoder()

    def encode(self, addr: dict) -> str:
        """Encode an address and return its id string.

        ``addr`` is either a flat ``{component_name: value}`` mapping or a
        record holding a ``'Components'`` list of dicts with ``'kind'`` and
        ``'name'`` keys; in the latter case only that list is used.

        NOTE: components sharing the same ``'kind'`` collapse to the last
        one in the list, so such records do not round-trip losslessly.

        Returns:
            Comma-separated ``name_id:value_id`` pairs sorted by name id,
            or ``''`` when the address has no components.
        """
        if 'Components' in addr:
            addr = {
                component['kind']: component['name']
                for component in addr['Components']
            }
        indices = {}
        for component, value in addr.items():
            component_id = self.component_name_encoder.encode(component)
            # Lazily create the per-component value encoder on first use.
            if component not in self.component_encoders:
                self.component_encoders[component] = LabelEncoder(component)
            indices[component_id] = self.component_encoders[component].encode(value)
        return ','.join(
            f'{component_id}:{indices[component_id]}'
            for component_id in sorted(indices)
        )

    def decode(self, addr_id: str) -> dict:
        """Turn an id string produced by ``encode`` back into a flat dict.

        Returns:
            ``{component_name: value}``; an empty dict for ``''``, which is
            what ``encode`` yields for an address without components.

        Raises:
            ValueError: if a pair is not of the form ``int:int``.
            KeyError: if a component id was never produced by this encoder.
        """
        if not addr_id:
            # ''.split(',') would yield [''] and fail on the ':' unpacking.
            return {}
        res = {}
        for pair in addr_id.split(','):
            component_id, value_id = pair.split(':')
            component_name = self.component_name_encoder.decode(int(component_id))
            res[component_name] = self.component_encoders[component_name].decode(int(value_id))
        return res

    def stat(self):
        """Return a DataFrame with one row per component encoder.

        Columns are ``name``, ``entry_number`` (distinct values) and ``mb``
        (``value_byte_size`` in MiB, rounded to 2 digits); rows are sorted
        by ``entry_number`` descending.
        """
        rows = [
            {
                'name': encoder.name,
                'entry_number': len(encoder),
                'mb': round(encoder.value_byte_size() / 1024**2, 2),
            }
            for encoder in self.component_encoders.values()
        ]
        # Explicit columns keep the sort below valid when nothing has been
        # encoded yet (an empty frame would otherwise have no columns).
        df = pd.DataFrame(rows, columns=['name', 'entry_number', 'mb'])
        return df.sort_values('entry_number', ascending=False)
def test_addr_encoder():
    """Round-trip sample geocoder records through AddressEncoder."""
    test_addrs = [
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453a"},"Coordinates":{"longitude":60.026604,"latitude":56.868954},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Уральский федеральный округ"},{"name":"Свердловская область","kind":"province"},{"kind":"area","name":"городской округ Первоуральск"},{"kind":"locality","name":"Первоуральск"},{"kind":"district","name":"микрорайон Совхоза Первоуральский"}]},
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453b"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Южный федеральный округ"},{"kind":"province","name":"Краснодарский край"},{"kind":"area","name":"Крымский район"},{"kind":"locality","name":"село Русское"},{"kind":"street","name":"улица Кашириной"}],"Coordinates":{"longitude":37.841486,"latitude":44.959272}},
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453c"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Приволжский федеральный округ"},{"name":"Республика Башкортостан","kind":"province"},{"kind":"area","name":"городской округ Стерлитамак"},{"kind":"locality","name":"Стерлитамак"},{"kind":"district","name":"микрорайон Железнодорожный"}],"Coordinates":{"longitude":55.952951,"latitude":53.63335}},
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453d"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Северо-Западный федеральный округ"},{"kind":"province","name":"Мурманская область"},{"kind":"area","name":"городской округ Город Мурманск"},{"kind":"locality","name":"Мурманск"},{"kind":"district","name":"101-й микрорайон"}],"Coordinates":{"latitude":68.958796,"longitude":33.077173}},
    ]
    addr_encoder = AddressEncoder()
    for record in test_addrs:
        # Building a dict keyed by kind keeps only the last entry per kind,
        # which mirrors how the fixtures' repeated "province" is handled.
        expected = {
            component['kind']: component['name']
            for component in record['Components']
        }
        addr_id = addr_encoder.encode(record)
        assert addr_encoder.decode(addr_id) == expected
    # One stat row per distinct component kind seen across all fixtures.
    kinds = {
        component['kind']
        for record in test_addrs
        for component in record['Components']
    }
    assert set(addr_encoder.stat()['name']) == kinds
def main(filename: str):
    """Encode every address of a JSON-lines dump and print size statistics.

    Args:
        filename: path to a text file holding one JSON address record per
            line; blank lines are skipped.
    """
    addr_encoder = AddressEncoder()
    addr_ids = []
    # Read the dump given by the caller; the encoding is explicit so the
    # platform default is not applied to the non-ASCII address names.
    with open(filename, encoding='utf-8') as f:
        # Plain tqdm works in terminals and notebooks alike.
        for line in tqdm.tqdm(f):
            if not line.strip():
                continue
            address = json.loads(line)
            addr_ids.append(addr_encoder.encode(address))
    print('addresses : {:,}'.format(len(addr_ids)))
    if not addr_ids:
        # Nothing was encoded, so there is nothing to measure or sample.
        return
    stat = addr_encoder.stat()
    print('encoder size : {:.1f} mb'.format(stat['mb'].sum()))
    print('addr_ids sizes : {:.1f} mb'.format(sum(len(addr_id) for addr_id in addr_ids) / 1024 ** 2))
    print('filesize : {:.1f} mb'.format(os.path.getsize(filename) / 1024 ** 2))
    print('-' * 80)
    # Fall back to the last entry when the dump has fewer than 1112 rows.
    sample_index = min(1111, len(addr_ids) - 1)
    print('compressed sample : {}'.format(addr_ids[sample_index]))
    print('uncompressed sample: {}'.format(addr_encoder.decode(addr_ids[sample_index])))
    print('-' * 80)
    # Show per-component dictionary sizes.
    print(stat)
# Script entry point: process the default dump when executed directly.
if __name__ == '__main__':
    main(filename='geo_dump_coords.json')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment