Last active
September 4, 2016 01:35
-
-
Save dangpzanco/3f4762181a38cdd0fd4b921a1f3ce41d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import gc | |
import blaze | |
import numpy as np | |
# def generate_arrays_from_file(path): | |
# while 1: | |
# f = open(path) | |
# for line in f: | |
# # create Numpy arrays of input data | |
# # and labels, from each line in the file | |
# x, y = process_line(line) | |
# yield (x, y) | |
# f.close() | |
def simple_generator(hdf_filename, building_num=1, meter_num=5, batch_size=32, win_size=1024):
    """Endlessly yield (mains, appliance) batches of overlapping windows.

    The HDF5 file is read in consecutive, non-overlapping chunks of
    ``batch_size + win_size - 1`` rows.  Each chunk is expanded into
    ``batch_size`` windows of ``win_size`` samples, every window shifted by
    one sample relative to the previous one.  After the last full chunk the
    generator starts again from the beginning of the file.

    Args:
        hdf_filename: Path to an HDF5 file with NILMTK-like keys
            (``building<N>/elec/meter<M>``); meter 1 is read as the mains.
        building_num: Building number used to build both keys.
        meter_num: Meter number of the appliance channel.
        batch_size: Number of windows per yielded batch.
        win_size: Number of samples per window.

    Yields:
        Tuple ``(X_train, Y_train)`` of float arrays, each of shape
        ``(batch_size, win_size, 1)``; ``X_train`` holds mains windows and
        ``Y_train`` the matching appliance windows.  Fresh arrays are
        allocated for every batch.

    Raises:
        ValueError: If ``batch_size`` or ``win_size`` is not positive, if the
            tables hold fewer rows than one chunk, or if a chunk does not
            flatten to exactly one value per row (e.g. multi-column data).
            Being a generator, these are raised on the first ``next()``.
    """
    if batch_size < 1 or win_size < 1:
        raise ValueError('batch_size and win_size must be positive integers')

    # NILMTK-like dataset keys
    key_mains = 'building' + str(building_num) + '/elec/meter1'
    key_appliance = 'building' + str(building_num) + '/elec/meter' + str(meter_num)

    # Rows needed to cut batch_size windows of win_size samples with stride 1.
    chunk_size = batch_size + win_size - 1

    # window_index[j, k] == j + k, so values[window_index] builds every
    # window of a chunk in one indexing operation.
    window_index = np.arange(batch_size)[:, np.newaxis] + np.arange(win_size)

    # One read-only handle for the generator's whole lifetime; it is released
    # when the generator is closed or garbage collected.
    with pd.HDFStore(hdf_filename, mode='r') as store:
        # Only rows present in both tables can be paired.
        data_size = min(int(store.get_storer(key_mains).nrows),
                        int(store.get_storer(key_appliance).nrows))
        num_iters = data_size // chunk_size
        if num_iters < 1:
            # Without this guard the endless loop below would never yield.
            raise ValueError(
                'dataset has %d rows but one batch needs %d'
                % (data_size, chunk_size))

        while True:
            for i in range(num_iters):
                # Chunk i covers rows [i * chunk_size, (i + 1) * chunk_size),
                # which always lies inside the table.
                start_index = i * chunk_size
                stop_index = start_index + chunk_size

                mains = store.select(key_mains, start=start_index, stop=stop_index)
                appliance = store.select(key_appliance, start=start_index, stop=stop_index)

                mains_values = np.asarray(mains.values).ravel()
                appliance_values = np.asarray(appliance.values).ravel()
                if mains_values.size != chunk_size or appliance_values.size != chunk_size:
                    raise ValueError(
                        'expected %d values per chunk, got %d (mains) and %d (appliance)'
                        % (chunk_size, mains_values.size, appliance_values.size))

                # New buffers per batch so a consumer that queues batches
                # never sees them overwritten by the next iteration.
                X_train = np.empty(shape=(batch_size, win_size, 1))
                Y_train = np.empty(shape=(batch_size, win_size, 1))
                X_train[:, :, 0] = mains_values[window_index]
                Y_train[:, :, 0] = appliance_values[window_index]

                yield (X_train, Y_train)
# def dataset_generator(hdf_filename, building_nums, meter_nums, batch_size): | |
# for building in building_nums: | |
# for meter in meter_nums: | |
# hdf_filename = dataset_sync('ukdale.h5', building_num=building, meter_num=meter) | |
# key_mains = 'building' + str(building) + '/elec/meter1' | |
# key_appliance = 'building' + str(building) + '/elec/meter' + str(meter) | |
# | |
# df = blaze.data(hdf_filename) | |
# data_size = df['building'+str(building)]['elec']['meter'+str(meter)].table.values_block_0.shape[0] | |
# | |
# for i in xrange(data_size): | |
# pass | |
# mains = pd.HDFStore(hdf_filename).select(key=key_mains, start=0, stop=1000) | |
# appliance = pd.HDFStore(hdf_filename).select(key=key_appliance, start=0, stop=1000) | |
# pass | |
def dataset_sync(hdf_filename, building_num, meter_num, sample_period='6S'):
    """Write a time-aligned, resampled copy of one mains/appliance pair.

    The mains (meter 1) and the selected appliance meter of a building are
    put on a common time index, interpolated, resampled to
    ``sample_period`` and stored in a new HDF5 file next to the original,
    named ``<stem>-building<N>-meter<M><ext>``.  If that file already exists
    it is reused and nothing is read or written.

    Args:
        hdf_filename: Path to the source HDF5 file with NILMTK-like keys
            (``building<N>/elec/meter<M>``).
        building_num: Building number used to build both keys.
        meter_num: Meter number of the appliance channel.
        sample_period: Resampling rule passed to ``resample``.

    Returns:
        Path of the synchronized HDF5 file.
    """
    # NILMTK-like dataset keys
    key_mains = 'building' + str(building_num) + '/elec/meter1'
    key_appliance = 'building' + str(building_num) + '/elec/meter' + str(meter_num)

    # Set filename
    root, ext = os.path.splitext(hdf_filename)
    output_filename = root + '-building' + str(building_num) + \
        '-meter' + str(meter_num) + ext

    # Reuse a previously generated file
    if os.path.isfile(output_filename):
        return output_filename

    # Read original dataset
    mains = pd.read_hdf(hdf_filename, key=key_mains)
    appliance = pd.read_hdf(hdf_filename, key=key_appliance)

    # Reindex both channels on the union of their timestamps and
    # interpolate the gaps this introduces
    newindex = mains.index.union(appliance.index)
    mains = mains.reindex(newindex).interpolate(method='time')
    appliance = appliance.reindex(newindex).interpolate(method='time')

    # Resample to sample_period, then fill empty bins via interpolation
    mains = mains.resample(sample_period).mean().interpolate(method='time')
    appliance = appliance.resample(sample_period).mean().interpolate(method='time')

    # Store the new dataset.  Write to a temporary file and rename it once
    # both keys are stored, so a failure half-way never leaves a partial file
    # that the existence check above would later accept as complete.
    partial_filename = output_filename + '.tmp'
    try:
        # complevel must be given explicitly: with complib alone pandas
        # leaves compression disabled.  mode='w' discards any stale leftover.
        mains.to_hdf(partial_filename, key=key_mains, mode='w', format='table',
                     complevel=5, complib='zlib')
        appliance.to_hdf(partial_filename, key=key_appliance, mode='a', format='table',
                         complevel=5, complib='zlib')
        os.rename(partial_filename, output_filename)
    except Exception:
        # Do not leave a half-written temporary file behind.
        if os.path.isfile(partial_filename):
            os.remove(partial_filename)
        raise

    return output_filename
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment