import pandas as pd
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file")
    parser.add_argument("-o", "--output", help="Output file")
    parser.add_argument("-r", "--response", help="Your dependent variable")
    parser.add_argument("-g", "--groupvar", help="Your grouping variable for the random effects")
    args = parser.parse_args()

    # Column names for the fixed- and random-effect features; edit these to match your data.
    fixed_vars = ['list', 'of', 'variable', 'names', 'for', 'fixed', 'effects']
    random_vars = ['list', 'of', 'variable', 'names', 'for', 'random', 'effects']

    x = pd.read_csv(args.input)
    resp = x.loc[:, args.response]
    # Grouping column; the slice drops the first two characters and the last character of each id.
    TDL = x.loc[:, args.groupvar].astype(str).map(lambda s: s[2:(len(s) - 1)])
    x = x.loc[:, fixed_vars + random_vars]

    # Prefix column names so downstream tools can tell fixed effects (f_) from random effects (r_).
    new_cols = []
    for col in x.columns:
        if col in fixed_vars:
            col = 'f_' + col
        if col in random_vars:
            col = 'r_' + col
        new_cols.append(col)
    x.columns = new_cols

    # Write one line per example: "<label> <group> name:value name:value ...",
    # keeping only the non-zero features (libsvm-style sparse format).
    with open(args.output, 'w') as f:
        for res, row, grp in zip(resp, x.values, TDL):
            non_sparse = row[row != 0]
            cols = x.columns[row != 0]
            out_string = ' '.join([col + ':' + str(val) for col, val in zip(cols, non_sparse)])
            out_string = ' '.join([str(grp), out_string])
            out_string = ' '.join([str(res), out_string])
            f.write(out_string + '\n')
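For reference, each line this script writes is in a libsvm-style sparse format: the label, then the grouping id, then space-separated name:value pairs for the non-zero features, with the f_/r_ prefixes marking fixed versus random effects. With hypothetical column names and values, one output line could be:

0.0 store_17 f_price:3.5 f_discount:0.1 r_region:1.0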
{
  "type" : "record",
  "name" : "TrainingExample",
  "namespace" : "com.linkedin.metronome.avro.generated",
  "fields" : [ {
    "name" : "uid",
    "type" : [ "null", "string", "long", "int" ],
    "doc" : "a unique id for the training event",
    "default" : null
  }, {
    "name" : "label",
    "type" : [ "double" ],
    "doc" : "label/response of the event"
  }, {
    "name" : "grouping",
    "type" : [ "string" ],
    "doc" : "The store grouping"
  }, {
    "name" : "fixedfeatures",
    "type" : {
      "type" : "array",
      "items" : {
        "type" : "record",
        "name" : "FixedFeature",
        "doc" : "a feature is a named numeric value",
        "fields" : [ {
          "name" : "name",
          "type" : "string"
        }, {
          "name" : "value",
          "type" : "double"
        }, {
          "name" : "term",
          "type" : [ "null", "string" ],
          "default" : null
        } ]
      }
    },
    "doc" : "fixed effects features describing the event"
  }, {
    "name" : "randomfeatures",
    "type" : {
      "type" : "array",
      "items" : {
        "type" : "record",
        "name" : "RandomFeature",
        "doc" : "a feature is a named numeric value",
        "fields" : [ {
          "name" : "name",
          "type" : "string"
        }, {
          "name" : "value",
          "type" : "double"
        }, {
          "name" : "term",
          "type" : [ "null", "string" ],
          "default" : null
        } ]
      }
    },
    "doc" : "random effects features describing the event"
  }, {
    "name" : "metadataMap",
    "type" : [ "null", {
      "type" : "map",
      "values" : [ "boolean", "int", "long", "float", "double", "string" ]
    } ],
    "doc" : "optional field. allows attaching arbitrary non-feature metadata to each event",
    "default" : null
  }, {
    "name" : "weight",
    "type" : [ "null", "int", "long", "float", "double" ],
    "doc" : "optional field. specifies strength of observation",
    "default" : null
  }, {
    "name" : "offset",
    "type" : [ "null", "int", "long", "float", "double" ],
    "doc" : "optional field. when non-zero, admm will learn coefficient betas with the added offset",
    "default" : null
  } ]
}
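To make the schema concrete, a single TrainingExample datum, in the dict form the converter script below appends to the Avro writer, might look like this (field names come from the schema; the values are invented, and the optional fields are left unset):

# Hypothetical record conforming to the TrainingExample schema above.
example = {
    "uid": None,  # optional unique id, defaults to null
    "label": 1.0,
    "grouping": "store_17",  # invented grouping id
    "fixedfeatures": [{"name": "f_price", "term": "", "value": 3.5}],
    "randomfeatures": [{"name": "r_region", "term": "", "value": 1.0}],
    "metadataMap": None,  # optional non-feature metadata
    "weight": None,       # optional observation weight
    "offset": None,       # optional offset
}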
# Copyright 2017 LinkedIn Corp. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain a
# copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""
[Doc]:
This script converts a text file in libsvm format into TrainingExample avro.
For each feature, the name is set as id, and the term is empty.

[Usage]:
python libsvm_text_to_trainingexample_avro.py [input_path] [output_schema_path] [output_path] (optional: -r for regression)
"""

import avro.schema
import getopt
import os
import sys

from avro.datafile import DataFileWriter
from avro.io import DatumWriter


def main():
    if len(sys.argv) <= 1:
        print __doc__
        sys.exit(0)

    reg = False
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hr", ["help", "regression"])
    except getopt.error, msg:
        print msg
        print "for help use --help"
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            print __doc__
            sys.exit(0)
        if o in ("-r", "--regression"):
            reg = True
    # process arguments
    input_path = args[0]
    output_schema_path = args[1]
    output_path = args[2]

    if os.path.exists(output_path):
        os.remove(output_path)

    schema = avro.schema.parse(open(output_schema_path).read())
    writer = DataFileWriter(open(output_path, "w"), DatumWriter(), schema)
    with open(input_path, 'r') as f:
        count = 0
        for line in f:
            count += 1
            r = {}
            i = 0
            fixed_feature_arr = []
            random_feature_arr = []
            for token in line.strip().split(' '):
                if i == 0:
                    # first token is the label: keep it as a float for regression,
                    # otherwise binarize it to 0/1
                    if reg:
                        r['label'] = float(token)
                    else:
                        r['label'] = int(token)
                        if r['label'] <= 0:
                            r['label'] = 0
                        else:
                            r['label'] = 1
                elif i == 1:
                    # second token is the grouping id
                    r['grouping'] = str(token)
                else:
                    # remaining tokens are name:value pairs; the "f_" prefix marks fixed effects
                    t = token.split(':')
                    ft = {}
                    ft['name'] = t[0]
                    ft['term'] = ''
                    ft['value'] = float(t[1])
                    if t[0][0:2] == "f_":
                        fixed_feature_arr.append(ft)
                    else:
                        random_feature_arr.append(ft)
                i += 1
            r['fixedfeatures'] = fixed_feature_arr
            r['randomfeatures'] = random_feature_arr
            writer.append(r)
    print "converted " + str(count) + " examples"
    writer.close()


if __name__ == "__main__":
    main()
The structure of the calls might look like this:
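Assuming, hypothetically, that the first gist file is saved as csv_to_libsvm_text.py, the schema as TrainingExample.avsc, and the LinkedIn converter as libsvm_text_to_trainingexample_avro.py, a sketch of the two-step pipeline is:

python csv_to_libsvm_text.py -i my_data.csv -o training.txt -r my_label_column -g my_group_column
python libsvm_text_to_trainingexample_avro.py -r training.txt TrainingExample.avsc training.avro

The first call writes the sparse text file with prefixed feature names; the second (run under Python 2, since it uses print statements) packs each line into a TrainingExample Avro record, with -r keeping the label as a float rather than binarizing it.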