- Local hive start
cd /usr/local/hive/bin
./hive
to close : exit ./hive
flights_info.txt
200K flight recordsinfo.text
contains table description
year INT,
month INT,
day INT,
..
...
late_aircraft_delay STRING
create dabase sid
use sid
show tables
NO OUTPUT- Insert the cols from
info.txt
here.
create table first (year INT,
month INT,
day INT,
day_of_week INT,
dep_time INT,
crs_dep_time INT,
arr_time INT,
crs_arr_time INT,
unique_carrier STRING,
flight_num INT,
tail_num STRING,
actual_elapsed_time INT,
crs_elapsed_time INT,
air_time INT,
arr_delay INT,
dep_delay INT,
origin STRING,
dest STRING,
distance INT,
taxi_in INT,
taxi_out INT,
cancelled INT,
cancellation_code STRING,
diverted INT,
carrier_delay STRING,
weather_delay STRING,
nas_delay STRING,
security_delay STRING,
late_aircraft_delay STRING)
row format delimited
fields terminated by ",";
and then load data
hive> load data local inpath "/home/cloudera/Downloads/flight_info.txt" into table first;
- check with
desc first;
- create second table
create table second as select flight_num , origin_dest, arr_time, dep_time,
arr_delay, dep_delay from first;
- Check
select * from second limit 50;
- table creation one
select avg(dep_delay) from second;
select avg(arr_delay) from second;
select * from flights where dest="TPA";
select * from flights where origin="IND"
select origin, count(flight_num) from second group by origin;
create index fl_index on table flights(flight_num)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
show index on second;
select count(flight_num) from flights;