import pandas as pd
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file")
    parser.add_argument("-o", "--output", help="Output file")
    parser.add_argument("-r", "--response", help="Your dependent variable")
    parser.add_argument("-g", "--groupvar", help="Your grouping variable for the random effects")
    args = parser.parse_args()

    # Column names for the fixed- and random-effect features; edit these to match your data.
    fixed_vars = ['list', 'of', 'variable', 'names', 'for', 'fixed', 'effects']
    random_vars = ['list', 'of', 'variable', 'names', 'for', 'random', 'effects']

    x = pd.read_csv(args.input)
    resp = x.loc[:, args.response]
    # Grouping column; the slice drops the first two characters and the last character of each id.
    TDL = x.loc[:, args.groupvar].astype(str).map(lambda s: s[2:(len(s) - 1)])
    x = x.loc[:, fixed_vars + random_vars]

    # Prefix column names so downstream tools can tell fixed effects (f_) from random effects (r_).
    new_cols = []
    for col in x.columns:
        if col in fixed_vars:
            col = 'f_' + col
        if col in random_vars:
            col = 'r_' + col
        new_cols.append(col)
    x.columns = new_cols

    # Write one line per example: "<label> <group> name:value name:value ...",
    # keeping only the non-zero features (libsvm-style sparse format).
    with open(args.output, 'w') as f:
        for res, row, grp in zip(resp, x.values, TDL):
            non_sparse = row[row != 0]
            cols = x.columns[row != 0]
            out_string = ' '.join([col + ':' + str(val) for col, val in zip(cols, non_sparse)])
            out_string = ' '.join([str(grp), out_string])
            out_string = ' '.join([str(res), out_string])
            f.write(out_string + '\n')
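For reference, each line this script writes is in a libsvm-style sparse format: the label, then the grouping id, then space-separated name:value pairs for the non-zero features, with the f_/r_ prefixes marking fixed versus random effects. With hypothetical column names and values, one output line could be:

0.0 store_17 f_price:3.5 f_discount:0.1 r_region:1.0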
{
  "type" : "record",
  "name" : "TrainingExample",
  "namespace" : "com.linkedin.metronome.avro.generated",
  "fields" : [ {
    "name" : "uid",
    "type" : [ "null", "string", "long", "int" ],
    "doc" : "a unique id for the training event",
    "default" : null
  }, {
    "name" : "label",
    "type" : [ "double" ],
    "doc" : "label/response of the event"
  }, {
    "name" : "grouping",
    "type" : [ "string" ],
    "doc" : "The store grouping"
  }, {
    "name" : "fixedfeatures",
    "type" : {
      "type" : "array",
      "items" : {
        "type" : "record",
        "name" : "FixedFeature",
        "doc" : "a feature is a named numeric value",
        "fields" : [ {
          "name" : "name",
          "type" : "string"
        }, {
          "name" : "value",
          "type" : "double"
        }, {
          "name" : "term",
          "type" : [ "null", "string" ],
          "default" : null
        } ]
      }
    },
    "doc" : "fixed effects features describing the event"
  }, {
    "name" : "randomfeatures",
    "type" : {
      "type" : "array",
      "items" : {
        "type" : "record",
        "name" : "RandomFeature",
        "doc" : "a feature is a named numeric value",
        "fields" : [ {
          "name" : "name",
          "type" : "string"
        }, {
          "name" : "value",
          "type" : "double"
        }, {
          "name" : "term",
          "type" : [ "null", "string" ],
          "default" : null
        } ]
      }
    },
    "doc" : "random effects features describing the event"
  }, {
    "name" : "metadataMap",
    "type" : [ "null", {
      "type" : "map",
      "values" : [ "boolean", "int", "long", "float", "double", "string" ]
    } ],
    "doc" : "optional field. allows attaching arbitrary non-feature metadata to each event",
    "default" : null
  }, {
    "name" : "weight",
    "type" : [ "null", "int", "long", "float", "double" ],
    "doc" : "optional field. specifies strength of observation",
    "default" : null
  }, {
    "name" : "offset",
    "type" : [ "null", "int", "long", "float", "double" ],
    "doc" : "optional field. when non-zero, admm will learn coefficient betas with the added offset",
    "default" : null
  } ]
}
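To make the schema concrete, a single TrainingExample datum, in the dict form the converter script below appends to the Avro writer, might look like this (field names come from the schema; the values are invented, and the optional fields are left unset):

# Hypothetical record conforming to the TrainingExample schema above.
example = {
    "uid": None,  # optional unique id, defaults to null
    "label": 1.0,
    "grouping": "store_17",  # invented grouping id
    "fixedfeatures": [{"name": "f_price", "term": "", "value": 3.5}],
    "randomfeatures": [{"name": "r_region", "term": "", "value": 1.0}],
    "metadataMap": None,  # optional non-feature metadata
    "weight": None,       # optional observation weight
    "offset": None,       # optional offset
}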
# Copyright 2017 LinkedIn Corp. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain a
# copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""
[Doc]:
This script converts a text file in libsvm format into TrainingExample avro.
For each feature, the name is set as id, and the term is empty.

[Usage]:
python libsvm_text_to_trainingexample_avro.py [input_path] [output_schema_path] [output_path] (optional: -r for regression)
"""

import avro.schema
import getopt
import os
import sys

from avro.datafile import DataFileWriter
from avro.io import DatumWriter


def main():
    if len(sys.argv) <= 1:
        print __doc__
        sys.exit(0)

    reg = False
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hr", ["help", "regression"])
    except getopt.error, msg:
        print msg
        print "for help use --help"
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            print __doc__
            sys.exit(0)
        if o in ("-r", "--regression"):
            reg = True
    # process arguments
    input_path = args[0]
    output_schema_path = args[1]
    output_path = args[2]

    if os.path.exists(output_path):
        os.remove(output_path)

    schema = avro.schema.parse(open(output_schema_path).read())
    writer = DataFileWriter(open(output_path, "w"), DatumWriter(), schema)
    with open(input_path, 'r') as f:
        count = 0
        for line in f:
            count += 1
            r = {}
            i = 0
            fixed_feature_arr = []
            random_feature_arr = []
            for token in line.strip().split(' '):
                if i == 0:
                    # first token is the label: keep it as a float for regression,
                    # otherwise binarize it to 0/1
                    if reg:
                        r['label'] = float(token)
                    else:
                        r['label'] = int(token)
                        if r['label'] <= 0:
                            r['label'] = 0
                        else:
                            r['label'] = 1
                elif i == 1:
                    # second token is the grouping id
                    r['grouping'] = str(token)
                else:
                    # remaining tokens are name:value pairs; the "f_" prefix marks fixed effects
                    t = token.split(':')
                    ft = {}
                    ft['name'] = t[0]
                    ft['term'] = ''
                    ft['value'] = float(t[1])
                    if t[0][0:2] == "f_":
                        fixed_feature_arr.append(ft)
                    else:
                        random_feature_arr.append(ft)
                i += 1
            r['fixedfeatures'] = fixed_feature_arr
            r['randomfeatures'] = random_feature_arr
            writer.append(r)
    print "converted " + str(count) + " examples"
    writer.close()


if __name__ == "__main__":
    main()
The structure of the calls might look like this:
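Assuming, hypothetically, that the first gist file is saved as csv_to_libsvm_text.py, the schema as TrainingExample.avsc, and the LinkedIn converter as libsvm_text_to_trainingexample_avro.py, a sketch of the two-step pipeline is:

python csv_to_libsvm_text.py -i my_data.csv -o training.txt -r my_label_column -g my_group_column
python libsvm_text_to_trainingexample_avro.py -r training.txt TrainingExample.avsc training.avro

The first call writes the sparse text file with prefixed feature names; the second (run under Python 2, since it uses print statements) packs each line into a TrainingExample Avro record, with -r keeping the label as a float rather than binarizing it.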