dmitryhd · March 19, 2019 21:50
diff --git a/simplest_address_encoder.py b/simplest_address_encoder.py
 import sys
 import os
 import json

 import tqdm
 import pandas as pd


 class LabelEncoder:
     """Bidirectional mapping between hashable values and dense integer ids.

     Ids are handed out in first-seen order starting at 0; encoding a value
     that was seen before returns the id it already has.
     """

     def __init__(self, name=None):
         self.name = name
         self._next_index = 0
         self._values2index = {}
         self._index2value = {}

     def encode(self, value) -> int:
         """Return the id of *value*, registering it first if it is new."""
         try:
             return self._values2index[value]
         except KeyError:
             pass
         # Unseen value: hand out the next free id and record both directions.
         assigned = self._next_index
         self._values2index[value] = assigned
         self._index2value[assigned] = value
         self._next_index = assigned + 1
         return assigned

     def decode(self, index: int):
         """Return the value registered under *index*, or None if unknown."""
         return self._index2value.get(index)

     def encode_batch(self, values: list) -> list:
         """Encode every item of *values*, preserving order."""
         return list(map(self.encode, values))

     def decode_batch(self, indices: list) -> list:
         """Decode every item of *indices*, preserving order."""
         return list(map(self.decode, indices))

     def __len__(self):
         """Number of distinct values registered so far."""
         return len(self._values2index)

     def value_byte_size(self) -> int:
         """Sum of ``sys.getsizeof`` of each stored value minus that of ``''``."""
         empty_size = sys.getsizeof('')
         total = 0
         for stored in self._values2index:
             total += sys.getsizeof(stored) - empty_size
         return total

     def __repr__(self):
         n_entries = len(self)
         byte_size = self.value_byte_size()
         return f'<LabelEncoder(name={self.name}, n_entries={n_entries}, value_byte_size={byte_size})>'
        
        
 def test_label_encoder():
     """Check that LabelEncoder assigns stable ids and decodes them back."""
     values = ['a', 'b', 'c', 'd', 'aaaaa', 'aaaaa']
     encoder = LabelEncoder()
     # Ids follow first-seen order; the repeated 'aaaaa' must reuse its id.
     assert encoder.encode_batch(values) == [0, 1, 2, 3, 4, 4]
     print(encoder)
     assert encoder.decode_batch([0, 1, 2, 3, 0]) == ['a', 'b', 'c', 'd', 'a']
     # Five distinct values were registered.
     assert len(encoder) == 5
     assert isinstance(encoder.value_byte_size(), int)
    

 class AddressEncoder:
     """Dictionary-encode addresses into compact ``kind_id:value_id`` strings.

     Each component kind (for example ``'country'``) gets an integer id from
     ``component_name_encoder`` and its own ``LabelEncoder`` in
     ``component_encoders`` for the values seen under that kind.
     """

     def __init__(self):
         # kind name -> LabelEncoder holding the values seen for that kind
         self.component_encoders = {}
         # kind name <-> integer kind id
         self.component_name_encoder = LabelEncoder()

     def encode(self, addr: dict) -> str:
         """Encode *addr* and return its id string.

         *addr* is either a record with a ``'Components'`` list of
         ``{'kind': ..., 'name': ...}`` dicts, or a flat ``kind -> name``
         mapping. The result is a comma-separated list of
         ``kind_id:value_id`` pairs sorted by kind id; an empty mapping
         yields ``''``.
         """
         if 'Components' in addr:
             # A kind that occurs more than once in a record keeps only its
             # last name, because the mapping is keyed by kind.
             addr = {
                 component['kind']: component['name']
                 for component in addr['Components']
             }
         indices = {}
         for component, value in addr.items():
             component_id = self.component_name_encoder.encode(component)
             if component not in self.component_encoders:
                 self.component_encoders[component] = LabelEncoder(component)
             indices[component_id] = self.component_encoders[component].encode(value)

         return ','.join(
             f'{component_id}:{indices[component_id]}'
             for component_id in sorted(indices)
         )

     def decode(self, addr_id: str) -> dict:
         """Turn an id string produced by ``encode`` back into ``kind -> name``.

         Raises ``KeyError`` if a kind id in *addr_id* was never registered
         and ``ValueError`` if a pair is malformed.
         """
         res = {}
         if not addr_id:
             # encode() returns '' for an address without components; that
             # must round-trip to an empty mapping instead of failing on split.
             return res
         for pair in addr_id.split(','):
             component_id, value_id = pair.split(':')
             component_name = self.component_name_encoder.decode(int(component_id))
             res[component_name] = self.component_encoders[component_name].decode(int(value_id))
         return res

     def stat(self):
         """Return a DataFrame with one row per kind, largest dictionaries first.

         Columns are ``name``, ``entry_number`` and ``mb`` (the encoder's
         ``value_byte_size()`` converted to MiB and rounded to 2 decimals).
         """
         rows = [
             {
                 'name': encoder.name,
                 'entry_number': len(encoder),
                 'mb': round(encoder.value_byte_size() / 1024**2, 2),
             }
             for encoder in self.component_encoders.values()
         ]
         # Explicit columns keep the frame sortable when nothing was encoded.
         df = pd.DataFrame(rows, columns=['name', 'entry_number', 'mb'])
         return df.sort_values('entry_number', ascending=False)

    
 def test_addr_encoder():
     """Round-trip sample geocoder records through AddressEncoder."""
     test_addrs = [
         {"_id":{"$oid":"5c78f1e0fb77eef92b38453a"},"Coordinates":{"longitude":60.026604,"latitude":56.868954},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Уральский федеральный округ"},{"name":"Свердловская область","kind":"province"},{"kind":"area","name":"городской округ Первоуральск"},{"kind":"locality","name":"Первоуральск"},{"kind":"district","name":"микрорайон Совхоза Первоуральский"}]},
         {"_id":{"$oid":"5c78f1e0fb77eef92b38453b"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Южный федеральный округ"},{"kind":"province","name":"Краснодарский край"},{"kind":"area","name":"Крымский район"},{"kind":"locality","name":"село Русское"},{"kind":"street","name":"улица Кашириной"}],"Coordinates":{"longitude":37.841486,"latitude":44.959272}},
         {"_id":{"$oid":"5c78f1e0fb77eef92b38453c"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Приволжский федеральный округ"},{"name":"Республика Башкортостан","kind":"province"},{"kind":"area","name":"городской округ Стерлитамак"},{"kind":"locality","name":"Стерлитамак"},{"kind":"district","name":"микрорайон Железнодорожный"}],"Coordinates":{"longitude":55.952951,"latitude":53.63335}},
         {"_id":{"$oid":"5c78f1e0fb77eef92b38453d"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Северо-Западный федеральный округ"},{"kind":"province","name":"Мурманская область"},{"kind":"area","name":"городской округ Город Мурманск"},{"kind":"locality","name":"Мурманск"},{"kind":"district","name":"101-й микрорайон"}],"Coordinates":{"latitude":68.958796,"longitude":33.077173}},
     ]

     addr_encoder = AddressEncoder()

     for addr in test_addrs:
         # The encoder keys components by kind, so a kind that repeats in a
         # record (here 'province') is expected to keep only its last name.
         expected = {c['kind']: c['name'] for c in addr['Components']}
         addr_id = addr_encoder.encode(addr)
         assert addr_encoder.decode(addr_id) == expected

     # Encoding a known address again must give the same id.
     assert addr_encoder.encode(test_addrs[0]) == '0:0,1:0,2:0,3:0,4:0'

     stats = addr_encoder.stat()
     assert set(stats['name']) == {
         'country', 'province', 'area', 'locality', 'district', 'street',
     }

 def main(filename: str):
     """Encode every address in the JSON-lines dump *filename* and print stats.

     Each non-blank line of the file is parsed with ``json.loads`` and passed
     to ``AddressEncoder.encode``. The report compares the dictionary size and
     the total length of the id strings against the size of the input file.
     """
     addr_encoder = AddressEncoder()
     addr_ids = []
     # Read the path that was passed in (it used to be ignored), and decode as
     # UTF-8 explicitly so the result does not depend on the platform locale.
     with open(filename, encoding='utf-8') as f:
         # tqdm.tqdm works in a plain terminal; tqdm_notebook is deprecated
         # and requires a Jupyter front-end.
         for line in tqdm.tqdm(f):
             if not line.strip():
                 continue  # tolerate blank lines, e.g. a trailing newline
             address = json.loads(line)
             addr_ids.append(addr_encoder.encode(address))

     print('addresses          : {:,}'.format(len(addr_ids)))
     if not addr_ids:
         # Nothing was encoded, so there are no dictionaries or samples to show.
         return

     stats = addr_encoder.stat()
     print('encoder size       : {:.1f} mb'.format(stats['mb'].sum()))
     print('addr_ids sizes     : {:.1f} mb'.format(sum(len(addr_id) for addr_id in addr_ids) / 1024 ** 2))
     print('filesize           : {:.1f} mb'.format(os.path.getsize(filename) / 1024 ** 2))
     print('-' * 80)
     # Clamp the sample position so dumps shorter than 1112 lines still work.
     sample = addr_ids[min(1111, len(addr_ids) - 1)]
     print('compressed sample  : {}'.format(sample))
     print('uncompressed sample: {}'.format(addr_encoder.decode(sample)))
     print('-' * 80)
     # Per-kind dictionary sizes; a bare expression here would show nothing.
     print(stats)
    
    
 if __name__ == '__main__':
     # Optional first CLI argument overrides the dump path; with no argument
     # the previous default file name is used.
     main(sys.argv[1] if len(sys.argv) > 1 else 'geo_dump_coords.json')
	import sys
	import os
	import json

	import tqdm
	import pandas as pd


	class LabelEncoder:
	    """Two-way mapping between hashable values and dense integer ids.

	    Ids start at 0 and grow in first-seen order; a value that was already
	    encoded keeps its id.
	    """

	    def __init__(self, name=None):
	        self._values2index = {}
	        self._next_index = 0
	        self._index2value = {}
	        self.name = name

	    def encode(self, value) -> int:
	        """Return the id of *value*, assigning a new one if it is unseen."""
	        if value in self._values2index:
	            return self._values2index[value]
	        # New value: record both directions, then advance the counter.
	        index = self._next_index
	        self._values2index[value] = index
	        self._index2value[index] = value
	        self._next_index += 1
	        return index

	    def decode(self, index: int):
	        """Return the value stored under *index*, or None if there is none."""
	        return self._index2value.get(index)

	    def encode_batch(self, values: list) -> list:
	        """Encode each element of *values* in order."""
	        return [self.encode(val) for val in values]

	    def decode_batch(self, indices: list) -> list:
	        """Decode each element of *indices* in order."""
	        return [self.decode(i) for i in indices]

	    def __len__(self):
	        """Number of distinct values registered."""
	        return len(self._values2index)

	    def value_byte_size(self) -> int:
	        """Sum of ``sys.getsizeof`` of each stored value minus that of ``''``."""
	        str_size = sys.getsizeof('')
	        return sum(sys.getsizeof(value) - str_size for value in self._values2index)

	    def __repr__(self):
	        return f'<LabelEncoder(name={self.name}, n_entries={len(self)}, value_byte_size={self.value_byte_size()})>'


	def test_label_encoder():
	    """Check that LabelEncoder assigns stable ids and decodes them back."""
	    values = ['a', 'b', 'c', 'd', 'aaaaa', 'aaaaa']
	    encoder = LabelEncoder()
	    # Ids follow first-seen order; the repeated 'aaaaa' must reuse its id.
	    assert encoder.encode_batch(values) == [0, 1, 2, 3, 4, 4]
	    print(encoder)
	    assert encoder.decode_batch([0, 1, 2, 3, 0]) == ['a', 'b', 'c', 'd', 'a']
	    # Five distinct values were registered.
	    assert len(encoder) == 5
	    assert isinstance(encoder.value_byte_size(), int)


	class AddressEncoder:
	    """Dictionary-encode addresses into compact ``kind_id:value_id`` strings.

	    Each component kind (for example ``'country'``) gets an integer id from
	    ``component_name_encoder`` and its own ``LabelEncoder`` in
	    ``component_encoders`` for the values seen under that kind.
	    """

	    def __init__(self):
	        # kind name -> LabelEncoder holding the values seen for that kind
	        self.component_encoders = {}
	        # kind name <-> integer kind id
	        self.component_name_encoder = LabelEncoder()

	    def encode(self, addr: dict) -> str:
	        """Encode *addr* and return its id string.

	        *addr* is either a record with a ``'Components'`` list of
	        ``{'kind': ..., 'name': ...}`` dicts, or a flat ``kind -> name``
	        mapping. The result is a comma-separated list of
	        ``kind_id:value_id`` pairs sorted by kind id; an empty mapping
	        yields ``''``.
	        """
	        if 'Components' in addr:
	            # A kind that occurs more than once in a record keeps only its
	            # last name, because the mapping is keyed by kind.
	            addr = {
	                component['kind']: component['name']
	                for component in addr['Components']
	            }
	        indices = {}
	        for component, value in addr.items():
	            component_id = self.component_name_encoder.encode(component)
	            if component not in self.component_encoders:
	                self.component_encoders[component] = LabelEncoder(component)
	            indices[component_id] = self.component_encoders[component].encode(value)

	        return ','.join(
	            f'{component_id}:{indices[component_id]}'
	            for component_id in sorted(indices)
	        )

	    def decode(self, addr_id: str) -> dict:
	        """Turn an id string produced by ``encode`` back into ``kind -> name``.

	        Raises ``KeyError`` if a kind id in *addr_id* was never registered
	        and ``ValueError`` if a pair is malformed.
	        """
	        res = {}
	        if not addr_id:
	            # encode() returns '' for an address without components; that
	            # must round-trip to an empty mapping instead of failing on split.
	            return res
	        for pair in addr_id.split(','):
	            component_id, value_id = pair.split(':')
	            component_name = self.component_name_encoder.decode(int(component_id))
	            res[component_name] = self.component_encoders[component_name].decode(int(value_id))
	        return res

	    def stat(self):
	        """Return a DataFrame with one row per kind, largest dictionaries first.

	        Columns are ``name``, ``entry_number`` and ``mb`` (the encoder's
	        ``value_byte_size()`` converted to MiB and rounded to 2 decimals).
	        """
	        rows = [
	            {
	                'name': encoder.name,
	                'entry_number': len(encoder),
	                'mb': round(encoder.value_byte_size() / 1024**2, 2),
	            }
	            for encoder in self.component_encoders.values()
	        ]
	        # Explicit columns keep the frame sortable when nothing was encoded.
	        df = pd.DataFrame(rows, columns=['name', 'entry_number', 'mb'])
	        return df.sort_values('entry_number', ascending=False)


	def test_addr_encoder():
	    """Round-trip sample geocoder records through AddressEncoder."""
	    test_addrs = [
	        {"_id":{"$oid":"5c78f1e0fb77eef92b38453a"},"Coordinates":{"longitude":60.026604,"latitude":56.868954},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Уральский федеральный округ"},{"name":"Свердловская область","kind":"province"},{"kind":"area","name":"городской округ Первоуральск"},{"kind":"locality","name":"Первоуральск"},{"kind":"district","name":"микрорайон Совхоза Первоуральский"}]},
	        {"_id":{"$oid":"5c78f1e0fb77eef92b38453b"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Южный федеральный округ"},{"kind":"province","name":"Краснодарский край"},{"kind":"area","name":"Крымский район"},{"kind":"locality","name":"село Русское"},{"kind":"street","name":"улица Кашириной"}],"Coordinates":{"longitude":37.841486,"latitude":44.959272}},
	        {"_id":{"$oid":"5c78f1e0fb77eef92b38453c"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Приволжский федеральный округ"},{"name":"Республика Башкортостан","kind":"province"},{"kind":"area","name":"городской округ Стерлитамак"},{"kind":"locality","name":"Стерлитамак"},{"kind":"district","name":"микрорайон Железнодорожный"}],"Coordinates":{"longitude":55.952951,"latitude":53.63335}},
	        {"_id":{"$oid":"5c78f1e0fb77eef92b38453d"},"Components":[{"kind":"country","name":"Россия"},{"kind":"province","name":"Северо-Западный федеральный округ"},{"kind":"province","name":"Мурманская область"},{"kind":"area","name":"городской округ Город Мурманск"},{"kind":"locality","name":"Мурманск"},{"kind":"district","name":"101-й микрорайон"}],"Coordinates":{"latitude":68.958796,"longitude":33.077173}},
	    ]

	    addr_encoder = AddressEncoder()

	    for addr in test_addrs:
	        # The encoder keys components by kind, so a kind that repeats in a
	        # record (here 'province') is expected to keep only its last name.
	        expected = {c['kind']: c['name'] for c in addr['Components']}
	        addr_id = addr_encoder.encode(addr)
	        assert addr_encoder.decode(addr_id) == expected

	    # Encoding a known address again must give the same id.
	    assert addr_encoder.encode(test_addrs[0]) == '0:0,1:0,2:0,3:0,4:0'

	    stats = addr_encoder.stat()
	    assert set(stats['name']) == {
	        'country', 'province', 'area', 'locality', 'district', 'street',
	    }

	def main(filename: str):
	    """Encode every address in the JSON-lines dump *filename* and print stats.

	    Each non-blank line of the file is parsed with ``json.loads`` and passed
	    to ``AddressEncoder.encode``. The report compares the dictionary size and
	    the total length of the id strings against the size of the input file.
	    """
	    addr_encoder = AddressEncoder()
	    addr_ids = []
	    # Read the path that was passed in (it used to be ignored), and decode as
	    # UTF-8 explicitly so the result does not depend on the platform locale.
	    with open(filename, encoding='utf-8') as f:
	        # tqdm.tqdm works in a plain terminal; tqdm_notebook is deprecated
	        # and requires a Jupyter front-end.
	        for line in tqdm.tqdm(f):
	            if not line.strip():
	                continue  # tolerate blank lines, e.g. a trailing newline
	            address = json.loads(line)
	            addr_ids.append(addr_encoder.encode(address))

	    print('addresses : {:,}'.format(len(addr_ids)))
	    if not addr_ids:
	        # Nothing was encoded, so there are no dictionaries or samples to show.
	        return

	    stats = addr_encoder.stat()
	    print('encoder size : {:.1f} mb'.format(stats['mb'].sum()))
	    print('addr_ids sizes : {:.1f} mb'.format(sum(len(addr_id) for addr_id in addr_ids) / 1024 ** 2))
	    print('filesize : {:.1f} mb'.format(os.path.getsize(filename) / 1024 ** 2))
	    print('-' * 80)
	    # Clamp the sample position so dumps shorter than 1112 lines still work.
	    sample = addr_ids[min(1111, len(addr_ids) - 1)]
	    print('compressed sample : {}'.format(sample))
	    print('uncompressed sample: {}'.format(addr_encoder.decode(sample)))
	    print('-' * 80)
	    # Per-kind dictionary sizes; a bare expression here would show nothing.
	    print(stats)


	if __name__ == '__main__':
	    # Optional first CLI argument overrides the dump path; with no argument
	    # the previous default file name is used.
	    main(sys.argv[1] if len(sys.argv) > 1 else 'geo_dump_coords.json')