Has anyone had success reading XML files in Hive?  I've been looking at the 
cloud9 XMLInputFormat to read in the top-level XML node in each file, with the 
goal to then use XPath (and/or LATERAL VIEW) to read individual records in the 
file.  There isn't much in the way of documentation or examples that I can find.

The structure of my XML is:

<?xml version="1.0" encoding="UTF-8"?>
<Report user="sample_user">
     <Session id="ID617693930">
     </Session>
     <Session id="ID617695571">
     </Session>
     <Variables>
           <var>
           </var>
           <var>
           </var>
     </Variables>
</Report>

Here are the Hive DDL statements that I've tried so far:

add jar /data1/mtucker/cloud9-1.3.2.jar;
add jar /usr/lib/mahout/mahout-examples-0.5-cdh3u3.jar;

-- Mahout XMLInputFormat
DROP TABLE IF EXISTS xmltable;
CREATE TABLE xmltable (
    xmldata STRING
)
STORED AS
    INPUTFORMAT 'org.apache.mahout.classifier.bayes.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
TBLPROPERTIES (
    "xmlinput.start"="<Report",
    "xmlinput.end"="</Report>"
);
LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable;
select COUNT(*) FROM xmltable WHERE xmldata != "" limit 1;
--FAILED: Error in semantic analysis: Line 1:21 Input format must implement 
InputFormat xmltable


--Cloud9 XMLInputFormat
DROP TABLE IF EXISTS xmltable;
CREATE TABLE xmltable (
    xmldata STRING
)
STORED AS
    INPUTFORMAT 'edu.umd.cloud9.collection.XMLInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
    "xmlinput.start"="<Report",
    "xmlinput.end"="</Report>"
);
LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable;
select COUNT(*) FROM xmltable WHERE xmldata != "" limit 1;
--FAILED: Error in semantic analysis: Line 1:21 Input format must implement 
InputFormat xmltable


--RegEx SerDe
DROP TABLE IF EXISTS xmltable;
CREATE TABLE xmltable (
    xmldata STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(<Report([\\w\\W]*)>?)(<[\\w\\W]*>?)(</Report>)*",
    "output.format.string" = "%2$s"
);
LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable;
select COUNT(*) FROM xmltable WHERE xmldata != "" limit 1;
--FAILED: Execution Error, return code 2 from 
org.apache.hadoop.hive.ql.exec.MapRedTask


Thanks

Matt Tucker

Reply via email to