krishna209 · April 30, 2015 04:40
diff --git a/Avro Files Working b/Avro Files Working
 Step 1: Download the avro-tools jar
 wget http://mirrors.sonic.net/apache/avro/avro-1.7.7/java/avro-tools-1.7.7.jar

 Step 2: Generate the schema (getschema prints the schema embedded in an existing .avro data file)
 java -jar avro-tools-1.7.7.jar getschema /home/hdfs/genre1/part-m-00000.avro

 Step 3: Import the table from MySQL into HDFS as Avro data files
 sqoop import --connect jdbc:mysql://172.16.2.164/movielens --username hive -P --table genre --as-avrodatafile

 This imports the genre data from MySQL into HDFS as .avro files and generates the .avsc schema file in the local filesystem.
 The data files need to be in HDFS, but the .avsc file can be kept either in the local filesystem or in HDFS.

 Step 4: Create a Hive table for the .avro data files

 create external table genre row format serde 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' stored as inputformat 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' outputformat 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' location '/user/hdfs/genre' tblproperties('avro.schema.url'='hdfs:///user/hdfs/genre.avsc');

 Step 5: Clean up the table directory
 The genre directory must contain only the _SUCCESS file and the part files; nothing else should be kept there.
	Step 1: Download the avro-tools jar
	wget http://mirrors.sonic.net/apache/avro/avro-1.7.7/java/avro-tools-1.7.7.jar

	Step 2: Generate the schema (getschema prints the schema embedded in an existing .avro data file)
	java -jar avro-tools-1.7.7.jar getschema /home/hdfs/genre1/part-m-00000.avro

	Step 3: Import the table from MySQL into HDFS as Avro data files
	sqoop import --connect jdbc:mysql://172.16.2.164/movielens --username hive -P --table genre --as-avrodatafile

	This imports the genre data from MySQL into HDFS as .avro files and generates the .avsc schema file in the local filesystem.
	The data files need to be in HDFS, but the .avsc file can be kept either in the local filesystem or in HDFS.

	Step 4: Create a Hive table for the .avro data files

	create external table genre row format serde 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' stored as inputformat 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' outputformat 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' location '/user/hdfs/genre' tblproperties('avro.schema.url'='hdfs:///user/hdfs/genre.avsc');

	Step 5: Clean up the table directory
	The genre directory must contain only the _SUCCESS file and the part files; nothing else should be kept there.