Skip to content

Instantly share code, notes, and snippets.

@dangpzanco
Last active September 4, 2016 01:35
Show Gist options
  • Save dangpzanco/3f4762181a38cdd0fd4b921a1f3ce41d to your computer and use it in GitHub Desktop.
import pandas as pd
import os
import gc
import blaze
import numpy as np
# def generate_arrays_from_file(path):
# while 1:
# f = open(path)
# for line in f:
# # create Numpy arrays of input data
# # and labels, from each line in the file
# x, y = process_line(line)
# yield (x, y)
# f.close()
def simple_generator(hdf_filename, building_num=1, meter_num=5, batch_size=32, win_size=1024):
    """Endlessly yield (X, Y) batches of sliding windows from a NILMTK-style HDF5 file.

    Parameters
    ----------
    hdf_filename : str
        Path to the HDF5 dataset (NILMTK layout: 'buildingB/elec/meterM').
    building_num : int
        Building index used to build the HDF keys.
    meter_num : int
        Appliance meter index; meter1 is always treated as the mains.
    batch_size : int
        Number of windows per yielded batch.
    win_size : int
        Number of consecutive samples per window.

    Yields
    ------
    (X, Y) : tuple of np.ndarray, each shaped (batch_size, win_size, 1)
        Overlapping mains/appliance windows. NOTE: the same two arrays are
        reused (overwritten in place) on every yield, as in the original.
    """
    # NILMTK-like dataset keys
    key_mains = 'building' + str(building_num) + '/elec/meter1'
    key_appliance = 'building' + str(building_num) + '/elec/meter' + str(meter_num)

    # Get data size (row count of the appliance table) via blaze.
    df = blaze.data(hdf_filename)
    data_size = df['building' + str(building_num)]['elec']['meter' + str(meter_num)].table.values_block_0.shape[0]

    X_train = np.zeros(shape=(batch_size, win_size, 1))
    Y_train = np.zeros(shape=(batch_size, win_size, 1))

    # Each chunk provides batch_size overlapping windows of length win_size,
    # so it spans (batch_size + win_size - 1) rows. Floor division is required:
    # the original '/' produced a float under Python 3, breaking range().
    chunk_size = batch_size + win_size - 1
    num_iters = data_size // chunk_size

    # Open the store once instead of twice per iteration (the original leaked
    # an open HDFStore handle on every select call).
    with pd.HDFStore(hdf_filename, mode='r') as store:
        while True:
            for i in range(num_iters):  # range: xrange was Python-2-only
                # BUG FIX: the original used (i + 1) * chunk_size, which
                # skipped the first chunk and overran the end of the table
                # on the final iteration.
                start_index = i * chunk_size
                stop_index = start_index + chunk_size
                mains = store.select(key=key_mains, start=start_index, stop=stop_index)
                appliance = store.select(key=key_appliance, start=start_index, stop=stop_index)
                for j in range(batch_size):
                    X_train[j, :, 0] = np.asarray(mains.values[j:j + win_size]).ravel()
                    Y_train[j, :, 0] = np.asarray(appliance.values[j:j + win_size]).ravel()
                yield (X_train, Y_train)
# def dataset_generator(hdf_filename, building_nums, meter_nums, batch_size):
# for building in building_nums:
# for meter in meter_nums:
# hdf_filename = dataset_sync('ukdale.h5', building_num=building, meter_num=meter)
# key_mains = 'building' + str(building) + '/elec/meter1'
# key_appliance = 'building' + str(building) + '/elec/meter' + str(meter)
#
# df = blaze.data(hdf_filename)
# data_size = df['building'+str(building)]['elec']['meter'+str(meter)].table.values_block_0.shape[0]
#
# for i in xrange(data_size):
# pass
# mains = pd.HDFStore(hdf_filename).select(key=key_mains, start=0, stop=1000)
# appliance = pd.HDFStore(hdf_filename).select(key=key_appliance, start=0, stop=1000)
# pass
def dataset_sync(hdf_filename, building_num, meter_num, sample_period='6S'):
    """Build (or reuse) a per-building/meter HDF5 file in which the mains and
    appliance series share one common, resampled time index.

    Parameters
    ----------
    hdf_filename : str
        Path to the original NILMTK-style HDF5 dataset.
    building_num : int
        Building index; mains is always meter1 of that building.
    meter_num : int
        Appliance meter index within the building.
    sample_period : str
        Pandas resampling rule for the shared sampling grid (default '6S').

    Returns
    -------
    str
        Path of the synchronized dataset file.
    """
    # NILMTK-like keys for the mains (meter1) and the target appliance meter.
    mains_key = 'building' + str(building_num) + '/elec/meter1'
    appliance_key = 'building' + str(building_num) + '/elec/meter' + str(meter_num)

    # Output path: <stem>-building<B>-meter<M><ext>
    stem, ext = os.path.splitext(hdf_filename)
    out_path = stem + '-building' + str(building_num) + '-meter' + str(meter_num) + ext

    # A previous run already produced this file — reuse it as a cache.
    if os.path.isfile(out_path):
        return out_path

    # Load both series from the original dataset.
    mains = pd.read_hdf(hdf_filename, key=mains_key)
    appliance = pd.read_hdf(hdf_filename, key=appliance_key)

    # Align both series on the union of their timestamps, filling the gaps
    # introduced by reindexing with time-based interpolation.
    shared_index = mains.index.union(appliance.index)
    mains = mains.reindex(shared_index).interpolate(method='time')
    appliance = appliance.reindex(shared_index).interpolate(method='time')

    # Downsample onto the common grid, then interpolate any NaNs that the
    # resampling introduced.
    mains = mains.resample(sample_period).mean().interpolate(method='time')
    appliance = appliance.resample(sample_period).mean().interpolate(method='time')

    # Persist the synchronized pair under the same keys as the source file.
    mains.to_hdf(out_path, format='table', key=mains_key, complib='zlib')
    appliance.to_hdf(out_path, format='table', key=appliance_key, complib='zlib')
    return out_path
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment