Skip to content

Instantly share code, notes, and snippets.

@wjohnson
Created June 18, 2018 16:36
Show Gist options
  • Save wjohnson/7b8b7e087414cd75ca6727b0a154ab76 to your computer and use it in GitHub Desktop.
Save wjohnson/7b8b7e087414cd75ca6727b0a154ab76 to your computer and use it in GitHub Desktop.
import pandas as pd
import argparse
def iter_libsvm_lines(frame, response, groupvar, fixed_vars, random_vars):
    """Yield one libsvm-style text line per row of *frame*.

    Each line has the form ``<label> <group> <name>:<value> ...`` where
    fixed-effect features are prefixed with ``f_`` and random-effect
    features with ``r_``. Features whose value equals 0 are omitted.

    Args:
        frame: pandas DataFrame holding the response, grouping and feature
            columns.
        response: Name of the column holding the dependent variable.
        groupvar: Name of the column holding the random-effects grouping.
        fixed_vars: Column names used as fixed effects.
        random_vars: Column names used as random effects. A name may appear
            in both lists; it is then emitted once with each prefix.

    Yields:
        str: One formatted line (without a trailing newline) per row.
    """
    labels = frame.loc[:, response]
    # Drops the first two characters and the last character of every group
    # value. NOTE(review): presumably this strips a b'...' wrapper around the
    # group id -- confirm against the input data.
    groups = frame.loc[:, groupvar].astype("str").map(
        lambda raw: raw[2:(len(raw) - 1)])
    features = frame.loc[:, list(fixed_vars) + list(random_vars)]
    # Prefix by position rather than by membership test: a column listed as
    # both a fixed and a random effect appears twice in `features`, and the
    # second copy must be labelled r_, not f_.
    names = (['f_' + name for name in fixed_vars] +
             ['r_' + name for name in random_vars])
    for label, row, group in zip(labels, features.values, groups):
        tokens = [str(label), str(group)]
        tokens.extend(name + ':' + str(value)
                      for name, value in zip(names, row) if value != 0)
        # Joining the token list (instead of pre-joined strings) avoids a
        # trailing space when a row has no non-zero features.
        yield ' '.join(tokens)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True, help="Input file")
    parser.add_argument("-o", "--output", required=True, help="Output file")
    parser.add_argument("-r", "--response", required=True,
                        help="Your dependent variable")
    parser.add_argument("-g", "--groupvar", required=True,
                        help="Your grouping variable for the random effects")
    args = parser.parse_args()

    # Placeholders: replace with the actual column names of the input CSV.
    fixed_vars = ['list', 'of', 'variable', 'names', 'for', 'fixed', 'effects']
    random_vars = ['list', 'of', 'variable', 'names', 'for', 'random', 'effects']

    x = pd.read_csv(args.input)
    with open(args.output, 'w') as f:
        for out_string in iter_libsvm_lines(x, args.response, args.groupvar,
                                            fixed_vars, random_vars):
            f.write(out_string + '\n')
{
"type" : "record",
"name" : "TrainingExample",
"namespace" : "com.linkedin.metronome.avro.generated",
"fields" : [ {
"name" : "uid",
"type" : [ "null", "string", "long", "int" ],
"doc" : "a unique id for the training event",
"default" : null
}, {
"name" : "label",
"type" : [ "double"],
"doc" : "label/response of the event"
}, {
"name" : "grouping",
"type" : [ "string" ],
"doc" : "The store grouping"
}, {
"name" : "fixedfeatures",
"type" : {
"type" : "array",
"items" : {
"type" : "record",
"name" : "FixedFeature",
"doc" : "a feature is a named numeric value",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "value",
"type" : "double"
}, {
"name" : "term",
"type" : [ "null", "string" ],
"default" : null
} ]
}
},
"doc" : "fixed effects features describing the event"
}, {
"name" : "randomfeatures",
"type" : {
"type" : "array",
"items" : {
"type" : "record",
"name" : "RandomFeature",
"doc" : "a feature is a named numeric value",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "value",
"type" : "double"
}, {
"name" : "term",
"type" : [ "null", "string" ],
"default" : null
} ]
}
},
"doc" : "random effects features describing the event"
},{
"name" : "metadataMap",
"type" : [ "null", {
"type" : "map",
"values" : [ "boolean", "int", "long", "float", "double", "string" ]
} ],
"doc" : "optional field. allows attaching arbitrary non-feature metadata to each event",
"default" : null
}, {
"name" : "weight",
"type" : [ "null", "int", "long", "float", "double" ],
"doc" : "optional field. specifies strength of observation",
"default" : null
}, {
"name" : "offset",
"type" : [ "null", "int", "long", "float", "double" ],
"doc" : "optional field. when non-zero, admm will learn coefficient betas with the added offset",
"default" : null
} ]
}
# Copyright 2017 LinkedIn Corp. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain a
# copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
[Doc]:
This script converts a text file in libsvm format into TrainingExample avro.
For each feature, the name is set as id, and the term is empty.
[Usage]:
python libsvm_text_to_trainingexample_avro.py [input_path] [output_schema_path] [output_path] (optional: -r for regression)
"""
import avro.schema
import getopt
import os
import sys
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
def parse_libsvm_line(line, reg):
    """Convert one ``<label> <group> <name>:<value> ...`` line into a record.

    Args:
        line: A single text line; tokens are separated by whitespace.
        reg: When true the label is kept as a float (regression); otherwise
            it is binarised to 1 for positive values and 0 for the rest.

    Returns:
        A dict with the keys ``label``, ``grouping``, ``fixedfeatures`` and
        ``randomfeatures``, or ``None`` when the line is blank.

    Raises:
        ValueError: If the line lacks a label/group pair, a feature token has
            no ``:`` separator, or a number cannot be parsed.
    """
    # split() without an argument tolerates repeated and trailing whitespace,
    # which split(' ') would turn into empty tokens.
    tokens = line.split()
    if not tokens:
        return None
    if len(tokens) < 2:
        raise ValueError("expected '<label> <group> [name:value ...]', got: "
                         + repr(line))

    if reg:
        label = float(tokens[0])
    else:
        # float() rather than int() so labels such as "1.0" are accepted.
        label = 1 if float(tokens[0]) > 0 else 0

    fixed_feature_arr = []
    random_feature_arr = []
    for token in tokens[2:]:
        # Split on the last colon so feature names may themselves contain ':'.
        name, sep, raw_value = token.rpartition(':')
        if not sep:
            raise ValueError("feature token without ':' separator: "
                             + repr(token))
        ft = {'name': name, 'term': '', 'value': float(raw_value)}
        # Names starting with "f_" are fixed effects; everything else is
        # treated as a random effect.
        if name.startswith("f_"):
            fixed_feature_arr.append(ft)
        else:
            random_feature_arr.append(ft)

    return {
        'label': label,
        'grouping': tokens[1],
        'fixedfeatures': fixed_feature_arr,
        'randomfeatures': random_feature_arr,
    }


def main():
    """Command-line entry point: convert a libsvm-style text file to Avro.

    Reads ``sys.argv`` for ``[input_path] [output_schema_path] [output_path]``
    plus the optional ``-r``/``--regression`` and ``-h``/``--help`` flags,
    which may appear before or after the positional arguments. Exits with
    status 0 after printing help, and with status 2 on a usage error.
    """
    if len(sys.argv) <= 1:
        print(__doc__)
        sys.exit(0)
    reg = False
    # parse command line options; gnu_getopt accepts flags placed after the
    # positional arguments, which plain getopt would silently ignore
    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], "hr",
                                       ["help", "regression"])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)
        if o in ("-r", "--regression"):
            reg = True
    # process arguments
    if len(args) < 3:
        print("expected [input_path] [output_schema_path] [output_path]")
        print("for help use --help")
        sys.exit(2)
    input_path = args[0]
    output_schema_path = args[1]
    output_path = args[2]

    if os.path.exists(output_path):
        os.remove(output_path)

    with open(output_schema_path) as schema_file:
        schema = avro.schema.parse(schema_file.read())
    # Avro container files are binary, so the output must be opened in "wb".
    writer = DataFileWriter(open(output_path, "wb"), DatumWriter(), schema)
    count = 0
    try:
        with open(input_path, 'r') as f:
            for line in f:
                r = parse_libsvm_line(line, reg)
                if r is None:
                    # blank line: nothing to convert
                    continue
                writer.append(r)
                count += 1
    finally:
        # Always flush and close the writer, even if a line fails to parse.
        writer.close()
    print("converted " + str(count) + " examples")


if __name__ == "__main__":
    main()
@wjohnson
Copy link
Author

The structure of the calls might look like this:

python csv_to_libsvm_for_avro.py -i inputfile.csv -o outputfile.libsvm -r dependentvar -g groupingvar
python libsvm_to_avro.py -r outputfile.libsvm GAME.avsc outputfile.avro

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment