Skip to content

Instantly share code, notes, and snippets.

@dmitryhd
Last active March 19, 2019 21:50
Show Gist options
  • Save dmitryhd/76f5f0eaff7da58a55f4cfb00e1378b1 to your computer and use it in GitHub Desktop.
Save dmitryhd/76f5f0eaff7da58a55f4cfb00e1378b1 to your computer and use it in GitHub Desktop.
import sys
import os
import json
import tqdm
import pandas as pd
class LabelEncoder:
    """Bidirectional mapping between hashable values and dense integer ids.

    Ids are handed out in first-seen order starting at 0, so a value keeps
    the same id for the whole lifetime of the encoder.
    """

    def __init__(self, name=None):
        """Create an empty encoder.

        Args:
            name: Optional label; it is only reported by ``repr`` and kept as
                the public ``name`` attribute.
        """
        self._values2index = {}  # value -> id
        self._next_index = 0  # id the next unseen value will receive
        self._index2value = {}  # id -> value (inverse of _values2index)
        self.name = name

    def encode(self, value) -> int:
        """Return the id of *value*, registering the value if it is new."""
        # Ids are always ints, so ``None`` safely means "not registered yet"
        # and a single lookup is enough for the common (already seen) case.
        index = self._values2index.get(value)
        if index is not None:
            return index
        index = self._next_index
        self._values2index[value] = index
        self._index2value[index] = value
        self._next_index += 1
        return index

    def decode(self, index: int):
        """Return the value registered under *index*, or ``None`` if unknown."""
        return self._index2value.get(index)

    def encode_batch(self, values: list) -> list:
        """Encode every item of *values*, preserving order."""
        return [self.encode(val) for val in values]

    def decode_batch(self, indices: list) -> list:
        """Decode every id of *indices*; unknown ids become ``None``."""
        return [self.decode(i) for i in indices]

    def __len__(self):
        """Return the number of distinct values registered so far."""
        return len(self._values2index)

    def value_byte_size(self) -> int:
        """Return the summed size of the stored values, in bytes.

        Each value contributes ``sys.getsizeof(value)`` minus the size of an
        empty ``str``, i.e. the fixed overhead of an empty string is not
        counted. ``sys.getsizeof`` is shallow, so nested objects referenced by
        a value are not included.
        """
        empty_str_size = sys.getsizeof('')
        return sum(sys.getsizeof(value) - empty_str_size for value in self._values2index)

    def __repr__(self):
        return f'<LabelEncoder(name={self.name}, n_entries={len(self)}, value_byte_size={self.value_byte_size()})>'
def test_label_encoder():
    """Check id assignment, decoding and size accounting of LabelEncoder."""
    x = ['a', 'b', 'c', 'd', 'aaaaa', 'aaaaa']
    encoder = LabelEncoder()
    # A repeated value must reuse the id it received the first time.
    assert encoder.encode_batch(x) == [0, 1, 2, 3, 4, 4]
    print(encoder)
    assert encoder.decode_batch([0, 1, 2, 3, 0]) == ['a', 'b', 'c', 'd', 'a']
    # Ids that were never handed out decode to None instead of raising.
    assert encoder.decode(len(x)) is None
    assert encoder.value_byte_size() > 0
    # Five distinct values were supplied ('aaaaa' appears twice).
    assert len(encoder) == 5
class AddressEncoder:
    """Compress address dicts into compact id strings and restore them.

    Every component kind (the keys of an address, e.g. ``'country'``) gets an
    integer id from ``component_name_encoder`` and its own ``LabelEncoder``
    in ``component_encoders`` for the values seen under that kind. An address
    is encoded as ``'<kind_id>:<value_id>,...'`` ordered by kind id.
    """

    def __init__(self):
        self.component_encoders = {}  # component kind -> LabelEncoder of its values
        self.component_name_encoder = LabelEncoder()  # component kind -> kind id

    def encode(self, addr: dict) -> str:
        """Return the compact id string for *addr*.

        Args:
            addr: Either a flat ``{kind: name}`` mapping, or a dict with a
                ``'Components'`` list of ``{'kind': ..., 'name': ...}`` items,
                which is flattened first.

        Returns:
            Comma-separated ``kind_id:value_id`` pairs sorted by kind id; the
            empty string for an address without components.
        """
        if 'Components' in addr:
            # NOTE: when the same kind occurs more than once in 'Components',
            # only the last name survives this flattening.
            addr = {
                component['kind']: component['name']
                for component in addr['Components']
            }
        indices = {}
        for component, value in addr.items():
            component_id = self.component_name_encoder.encode(component)
            encoder = self.component_encoders.get(component)
            if encoder is None:
                # First time this kind is seen: give it its own value encoder.
                encoder = self.component_encoders[component] = LabelEncoder(component)
            indices[component_id] = encoder.encode(value)
        return ','.join(
            f'{component_id}:{indices[component_id]}'
            for component_id in sorted(indices)
        )

    def decode(self, addr_id: str) -> dict:
        """Rebuild the flat ``{kind: name}`` mapping from an id string.

        Args:
            addr_id: A string previously returned by ``encode``.

        Returns:
            The flat address mapping; ``{}`` for the empty string.

        Raises:
            ValueError: If a pair is not of the form ``int:int``.
            KeyError: If a kind id was never produced by this encoder.
        """
        if not addr_id:
            # encode() yields '' for an address without components; splitting
            # '' would produce one bogus empty pair, so short-circuit here.
            return {}
        res = {}
        for component in addr_id.split(','):
            component_id, value_id = component.split(':')
            component_name = self.component_name_encoder.decode(int(component_id))
            res[component_name] = self.component_encoders[component_name].decode(int(value_id))
        return res

    def stat(self):
        """Return a per-kind size report as a pandas DataFrame.

        The frame has the columns ``name``, ``entry_number`` (distinct values
        of that kind) and ``mb`` (``value_byte_size`` in mebibytes, rounded to
        two decimals), sorted by ``entry_number`` in descending order.
        """
        rows = [
            {
                'name': encoder.name,
                'entry_number': len(encoder),
                'mb': round(encoder.value_byte_size() / 1024**2, 2),
            }
            for encoder in self.component_encoders.values()
        ]
        # Explicit columns keep the frame well-formed (and sortable) even when
        # nothing has been encoded yet and ``rows`` is empty.
        df = pd.DataFrame(rows, columns=['name', 'entry_number', 'mb'])
        return df.sort_values('entry_number', ascending=False)
def test_addr_encoder():
    """Check that AddressEncoder round-trips sample addresses and reports stats."""
    test_addrs = [
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453a"},"Coordinates":{"longitude":60.026604,"latitude":56.868954},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Уральский федеральный округ"},{"name":"Свердловская область","kind":"province"},{"kind":"area","name":"городской округ Первоуральск"},{"kind":"locality","name":"Первоуральск"},{"kind":"district","name":"микрорайон Совхоза Первоуральский"}]},
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453b"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Южный федеральный округ"},{"kind":"province","name":"Краснодарский край"},{"kind":"area","name":"Крымский район"},{"kind":"locality","name":"село Русское"},{"kind":"street","name":"улица Кашириной"}],"Coordinates":{"longitude":37.841486,"latitude":44.959272}},
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453c"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Приволжский федеральный округ"},{"name":"Республика Башкортостан","kind":"province"},{"kind":"area","name":"городской округ Стерлитамак"},{"kind":"locality","name":"Стерлитамак"},{"kind":"district","name":"микрорайон Железнодорожный"}],"Coordinates":{"longitude":55.952951,"latitude":53.63335}},
        {"_id":{"$oid":"5c78f1e0fb77eef92b38453d"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Северо-Западный федеральный округ"},{"kind":"province","name":"Мурманская область"},{"kind":"area","name":"городской округ Город Мурманск"},{"kind":"locality","name":"Мурманск"},{"kind":"district","name":"101-й микрорайон"}],"Coordinates":{"latitude":68.958796,"longitude":33.077173}},
    ]
    addr_encoder = AddressEncoder()

    # The first address introduces five distinct kinds, each with its first value.
    addr_id = addr_encoder.encode(test_addrs[0])
    assert addr_id == '0:0,1:0,2:0,3:0,4:0'

    for addr in test_addrs:
        # Each sample lists 'province' twice; flattening keeps the last one,
        # which is therefore what a round trip is expected to return.
        expected = {
            component['kind']: component['name']
            for component in addr['Components']
        }
        assert addr_encoder.decode(addr_encoder.encode(addr)) == expected

    stats = addr_encoder.stat()
    all_kinds = {
        component['kind']
        for addr in test_addrs
        for component in addr['Components']
    }
    assert set(stats['name']) == all_kinds
    # The report is sorted by number of entries, largest first.
    entry_numbers = list(stats['entry_number'])
    assert entry_numbers == sorted(entry_numbers, reverse=True)
def main(filename: str):
    """Encode every address of a JSON-lines dump and print a size report.

    Args:
        filename: Path to a text file holding one JSON address object per
            line; blank lines are skipped.
    """
    # tqdm.auto selects the notebook widget when it is usable and falls back
    # to the console bar otherwise, so this works both in Jupyter and as a
    # plain script (tqdm.tqdm_notebook is deprecated and notebook-only).
    from tqdm.auto import tqdm as progress_bar

    addr_encoder = AddressEncoder()

    # Stream the dump line by line and keep only the compact ids in memory.
    addr_ids = []
    with open(filename, encoding='utf-8') as f:
        for line in progress_bar(f):
            if not line.strip():
                continue
            address = json.loads(line)
            addr_ids.append(addr_encoder.encode(address))

    print('addresses : {:,}'.format(len(addr_ids)))
    if not addr_ids:
        # Nothing was encoded: there are no statistics or samples to show.
        return

    stats = addr_encoder.stat()
    print('encoder size : {:.1f} mb'.format(stats['mb'].sum()))
    print('addr_ids sizes : {:.1f} mb'.format(sum(len(addr_id) for addr_id in addr_ids) / 1024 ** 2))
    print('filesize : {:.1f} mb'.format(os.path.getsize(filename) / 1024 ** 2))
    print('-' * 80)
    # Show entry 1111 as the sample, or the last entry for smaller inputs.
    sample_id = addr_ids[min(1111, len(addr_ids) - 1)]
    print('compressed sample : {}'.format(sample_id))
    print('uncompressed sample: {}'.format(addr_encoder.decode(sample_id)))
    print('-' * 80)
    # Per-kind dictionary sizes; a bare expression here would be discarded.
    print(stats)
# Entry point: call main() with the default dump file name.
if __name__ == '__main__':
    main('geo_dump_coords.json')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment