Has anyone had success reading XML files in Hive? I've been looking at the cloud9 XMLInputFormat to read in the top-level XML node in each file, with the goal to then use XPath (and/or LATERAL VIEW) to read individual records in the file. There isn't much in the way of documentation or examples that I can find.
The structure of my XML is: <?xml version="1.0" encoding="UTF-8"?> <Report user="sample_user"> <Session id="ID617693930"> </Session> <Session id="ID617695571"> </Session> <Variables> <var> </var> <var> </var> </Variables> </Report> Here are the Hive DDL statements that I've tried so far: add jar /data1/mtucker/cloud9-1.3.2.jar; add jar /usr/lib/mahout/mahout-examples-0.5-cdh3u3.jar; -- Mahout XMLInputFormat DROP TABLE IF EXISTS xmltable; CREATE TABLE xmltable ( xmldata STRING ) STORED AS INPUTFORMAT 'org.apache.mahout.classifier.bayes.XmlInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' TBLPROPERTIES ( "xmlinput.start"="<Report", "xmlinput.end"="</Report>" ); LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable; select COUNT(*) FROM xmltable WHERE xmldata != "" limit 1; --FAILED: Error in semantic analysis: Line 1:21 Input format must implement InputFormat xmltable --Cloud9 XMLInputFormat DROP TABLE IF EXISTS xmltable; CREATE TABLE xmltable ( xmldata STRING ) STORED AS INPUTFORMAT 'edu.umd.cloud9.collection.XMLInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' TBLPROPERTIES ( "xmlinput.start"="<Report", "xmlinput.end"="</Report>" ); LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable; select COUNT(*) FROM xmltable WHERE xmldata != "" limit 1; --FAILED: Error in semantic analysis: Line 1:21 Input format must implement InputFormat xmltable --RegEx SerDe DROP TABLE IF EXISTS xmltable; CREATE TABLE xmltable ( xmldata STRING ) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" = "(<Report([\\w\\W]*)>?)(<[\\w\\W]*>?)(</Report>)*", "output.format.string" = "%2$s" ); LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable; select COUNT(*) FROM xmltable WHERE xmldata != "" limit 1; --FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapRedTask Thanks Matt Tucker