Hi Experts,
I need one solution for hive in my project. Actually I have to
create a pivot table type of concept in hive. I don't know how to do the
same. But I have studied and implemented in hive for the same. But by
looking into it, I am not satisfied with the solution and I am not knowing
about the performance overhead also. You people please inform me whether
any other way better than this to implement pivot table in hive.
I have referenced the following site for understanding the basic concept of
pivot table.
http://blogs.msdn.com/b/spike/archive/2009/03/03/pivot-tables-in-sql-server-a-simple-sample.aspx
I am attaching the input file and hive implementation which I have done for
the same.
Attachments:
1. testfile_pivot.txt : This file contains the input data for creating table
for both sql server and hive. You please remove the heading and the under
line and load data into hive
2. pivottable.txt : contains sql server command for creating table,
populating table and pivot table creation and hive command for table
creation. loading data into hive and .
pivot table creation statements etc
Please reply me your suggestions.
--
Thanks and Regards
Reju. R
--
Thanks and Regards
Reju. R
VendorId IncomeDay IncomeAmount
---------- ---------- ------------
SPIKE FRI 100
SPIKE MON 300
FREDS SUN 400
SPIKE WED 500
SPIKE TUE 200
JOHNS WED 900
SPIKE FRI 100
JOHNS MON 300
SPIKE SUN 400
JOHNS FRI 300
FREDS TUE 200
SPIKE MON 900
FREDS FRI 900
FREDS MON 500
JOHNS SUN 600
SPIKE FRI 300
SPIKE FRI 300
SPIKE WED 500
SPIKE FRI 300
JOHNS THU 800
JOHNS SAT 800
SPIKE TUE 100
SPIKE THU 300
FREDS WED 500
SPIKE SAT 100
FREDS SAT 500
FREDS THU 800
JOHNS TUE 600
-- PIVOT TABLE CONCEPT IN SQL SERVER
create table DailyIncome(VendorId nvarchar(10), IncomeDay nvarchar(10),
IncomeAmount int);
--populating the sql server database
insert into DailyIncome values ('SPIKE', 'FRI', 100);
insert into DailyIncome values ('SPIKE', 'MON', 300);
insert into DailyIncome values ('FREDS', 'SUN', 400);
insert into DailyIncome values ('SPIKE', 'WED', 500);
insert into DailyIncome values ('SPIKE', 'TUE', 200);
insert into DailyIncome values ('JOHNS', 'WED', 900);
insert into DailyIncome values ('SPIKE', 'FRI', 100);
insert into DailyIncome values ('JOHNS', 'MON', 300);
insert into DailyIncome values ('SPIKE', 'SUN', 400);
insert into DailyIncome values ('JOHNS', 'FRI', 300);
insert into DailyIncome values ('FREDS', 'TUE', 500);
insert into DailyIncome values ('FREDS', 'TUE', 200);
insert into DailyIncome values ('SPIKE', 'MON', 900);
insert into DailyIncome values ('FREDS', 'FRI', 900);
insert into DailyIncome values ('FREDS', 'MON', 500);
insert into DailyIncome values ('JOHNS', 'SUN', 600);
insert into DailyIncome values ('SPIKE', 'FRI', 300);
insert into DailyIncome values ('SPIKE', 'WED', 500);
insert into DailyIncome values ('SPIKE', 'FRI', 300);
insert into DailyIncome values ('JOHNS', 'THU', 800);
insert into DailyIncome values ('JOHNS', 'SAT', 800);
insert into DailyIncome values ('SPIKE', 'TUE', 100);
insert into DailyIncome values ('SPIKE', 'THU', 300);
insert into DailyIncome values ('FREDS', 'WED', 500);
insert into DailyIncome values ('SPIKE', 'SAT', 100);
insert into DailyIncome values ('FREDS', 'SAT', 500);
insert into DailyIncome values ('FREDS', 'THU', 800);
insert into DailyIncome values ('JOHNS', 'TUE', 600);
select * from DailyIncome
pivot (avg (IncomeAmount) for IncomeDay in
([SUN],[MON],[TUE],[WED],[THU],[FRI],[SAT])) as AvgIncomePerDay
--PIVOT TABLE CONCEPT IN HIVE
CREATE TABLE pivot_source (vendorid string, incomeday string, incomeamount int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED
AS textfile;
LOAD DATA INPATH '/userdata/rejur/test/testfile_pivot.txt' INTO TABLE
pivot_source;
CREATE TABLE pivottest (vendorid string,SUN string, MON string, TUE string, WED
string,THU string, FRI string, SAT string) ROW FORMAT DELIMITED FIELDS
TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS textfile;
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT vendorid, '1', '2', '3',
'4','5', '6', '7' FROM pivot_source;
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid,
source.avgincome AS SUN, dest.MON, dest.TUE, dest.WED, dest.THU, dest.FRI,
dest.SAT FROM pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS
avgincome FROM pivot_source WHERE incomeday='SUN' group by vendorid) source ON
(dest.vendorid=source.vendorid);
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid, dest.SUN,
source.avgincome AS MON, dest.TUE, dest.WED, dest.THU, dest.FRI, dest.SAT FROM
pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS avgincome FROM
pivot_source WHERE incomeday='MON' group by vendorid) source ON
(dest.vendorid=source.vendorid);
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid, dest.SUN,
dest.MON, source.avgincome AS TUE, dest.WED, dest.THU, dest.FRI, dest.SAT FROM
pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS avgincome FROM
pivot_source WHERE incomeday='TUE' group by vendorid) source ON
(dest.vendorid=source.vendorid);
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid, dest.SUN,
dest.MON, dest.TUE, source.avgincome AS WED, dest.THU, dest.FRI, dest.SAT FROM
pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS avgincome FROM
pivot_source WHERE incomeday='WED' group by vendorid) source ON
(dest.vendorid=source.vendorid);
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid, dest.SUN,
dest.MON, dest.TUE, dest.WED, source.avgincome AS THU, dest.FRI, dest.SAT FROM
pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS avgincome FROM
pivot_source WHERE incomeday='THU' group by vendorid) source ON
(dest.vendorid=source.vendorid);
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid, dest.SUN,
dest.MON, dest.TUE, dest.WED, dest.THU, source.avgincome AS FRI, dest.SAT FROM
pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS avgincome FROM
pivot_source WHERE incomeday='FRI' group by vendorid) source ON
(dest.vendorid=source.vendorid);
INSERT OVERWRITE TABLE pivottest SELECT DISTINCT dest.vendorid, dest.SUN,
dest.MON, dest.TUE, dest.WED, dest.THU, dest.FRI, source.avgincome AS SAT FROM
pivottest dest JOIN (SELECT vendorid, avg(incomeamount) AS avgincome FROM
pivot_source WHERE incomeday='SAT' group by vendorid) source ON
(dest.vendorid=source.vendorid);
--PIVOT TABLE CONCEPT IN HIVE