Accessing Hive Table Data with MapReduce

dokondr Wed, 05 Mar 2014 05:13:33 -0800

In a single-node installation of Hadoop 2.2, I am trying to run the Cloudera
example "Accessing Table Data with MapReduce", which copies data from one
table to another:


http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/cdh4ig_topic_19_6.html

The example code compiles with numerous deprecation warnings (see below).
Before running this example from Eclipse, I create the input table 'simple' in
the Hive default DB. I pass the input table 'simple' and the output table
'simpid' on the command line. Although the input table already exists in the
default DB, when I run this code I get this exception:

     java.io.IOException: NoSuchObjectException(message:default.simple
table not found)

Questions:

1) Why does "table not found" exception happen? How to solve this?

2) How do the deprecated HCatRecord, HCatSchema, and HCatBaseInputFormat in
this example translate to the latest stable API?

    package com.bigdata;

    import java.io.IOException;
    import java.util.*;
    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.util.*;
    import org.apache.hcatalog.mapreduce.*;
    import org.apache.hcatalog.data.*;
    import org.apache.hcatalog.data.schema.*;

    public class UseHCat extends Configured implements Tool {

     public static class Map extends Mapper<WritableComparable, HCatRecord,
Text, IntWritable> {
      String groupname;

        @Override
      protected void map( WritableComparable key,
                          HCatRecord value,

org.apache.hadoop.mapreduce.Mapper<WritableComparable, HCatRecord,
                          Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
            // The group table from /etc/group has name, 'x', id
            groupname = (String) value.get(0);
            int id = (Integer) value.get(1);
            // Just select and emit the name and ID
            context.write(new Text(groupname), new IntWritable(id));
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable,
                                       WritableComparable, HCatRecord> {

        protected void reduce( Text key,
                               java.lang.Iterable<IntWritable> values,
                               org.apache.hadoop.mapreduce.Reducer<Text,
IntWritable,
                               WritableComparable, HCatRecord>.Context
context)
            throws IOException, InterruptedException {
            // Only expecting one ID per group name
            Iterator<IntWritable> iter = values.iterator();
            IntWritable iw = iter.next();
            int id = iw.get();
            // Emit the group name and ID as a record
            HCatRecord record = new DefaultHCatRecord(2);
            record.set(0, key.toString());
            record.set(1, id);
            context.write(null, record);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Get the input and output table names as arguments
        String inputTableName = args[0];
        String outputTableName = args[1];
        // Assume the default database
        String dbName = null;

        Job job = new Job(conf, "UseHCat");
        HCatInputFormat.setInput(job, InputJobInfo.create(dbName,
                inputTableName, null));
        job.setJarByClass(UseHCat.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // An HCatalog record as input
        job.setInputFormatClass(HCatInputFormat.class);

        // Mapper emits a string as key and an integer as value
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Ignore the key for the reducer output; emitting an HCatalog
record as value
        job.setOutputKeyClass(WritableComparable.class);
        job.setOutputValueClass(DefaultHCatRecord.class);
        job.setOutputFormatClass(HCatOutputFormat.class);

        HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName,
                   outputTableName, null));
        HCatSchema s = HCatOutputFormat.getTableSchema(job);
        System.err.println("INFO: output schema explicitly set for
writing:" + s);
        HCatOutputFormat.setSchema(job, s);
        return (job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new UseHCat(), args);
        System.exit(exitCode);
     }
    }


When I run this on a single-node Hadoop 2.2 I get the following exception:

14/03/05 15:17:21 WARN util.NativeCodeLoader: Unable to load native-hadoop
library for your platform... using builtin-java classes where applicable
14/03/05 15:17:21 INFO Configuration.deprecation:
mapred.input.dir.recursive is deprecated. Instead, use
mapreduce.input.fileinputformat.input.dir.recursive
14/03/05 15:17:21 INFO Configuration.deprecation: mapred.max.split.size is
deprecated. Instead, use mapreduce.input.fileinputformat.split.maxsize
14/03/05 15:17:21 INFO Configuration.deprecation: mapred.min.split.size is
deprecated. Instead, use mapreduce.input.fileinputformat.split.minsize
14/03/05 15:17:21 INFO Configuration.deprecation:
mapred.min.split.size.per.rack is deprecated. Instead, use
mapreduce.input.fileinputformat.split.minsize.per.rack
14/03/05 15:17:21 INFO Configuration.deprecation:
mapred.min.split.size.per.node is deprecated. Instead, use
mapreduce.input.fileinputformat.split.minsize.per.node
14/03/05 15:17:21 INFO Configuration.deprecation: mapred.reduce.tasks is
deprecated. Instead, use mapreduce.job.reduces
14/03/05 15:17:21 INFO Configuration.deprecation:
mapred.reduce.tasks.speculative.execution is deprecated. Instead, use
mapreduce.reduce.speculative
14/03/05 15:17:22 INFO metastore.HiveMetaStore: 0: Opening raw store with
implemenation class:org.apache.hadoop.hive.metastore.ObjectStore
14/03/05 15:17:22 INFO metastore.ObjectStore: ObjectStore, initialize called
14/03/05 15:17:23 INFO DataNucleus.Persistence: Property
datanucleus.cache.level2 unknown - will be ignored
14/03/05 15:17:24 WARN bonecp.BoneCPConfig: Max Connections < 1. Setting to
20
14/03/05 15:17:25 INFO metastore.ObjectStore: Setting MetaStore object pin
classes with
hive.metastore.cache.pinobjtypes="Table,StorageDescriptor,SerDeInfo,Partition,Database,Type,FieldSchema,Order"
14/03/05 15:17:25 INFO metastore.ObjectStore: Initialized ObjectStore
14/03/05 15:17:27 WARN bonecp.BoneCPConfig: Max Connections < 1. Setting to
20
14/03/05 15:17:27 INFO metastore.HiveMetaStore: 0: get_database:
NonExistentDatabaseUsedForHealthCheck
14/03/05 15:17:27 INFO HiveMetaStore.audit: ugi=dk ip=unknown-ip-addr
cmd=get_database:
NonExistentDatabaseUsedForHealthCheck
14/03/05 15:17:27 ERROR metastore.RetryingHMSHandler:
NoSuchObjectException(message:There is no database named
nonexistentdatabaseusedforhealthcheck)

at
org.apache.hadoop.hive.metastore.ObjectStore.getMDatabase(ObjectStore.java:431)
at
org.apache.hadoop.hive.metastore.ObjectStore.getDatabase(ObjectStore.java:441)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.apache.hadoop.hive.metastore.RetryingRawStore.invoke(RetryingRawStore.java:124)
at com.sun.proxy.$Proxy6.getDatabase(Unknown Source)
at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_database(HiveMetaStore.java:628)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:103)
at com.sun.proxy.$Proxy7.get_database(Unknown Source)
at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getDatabase(HiveMetaStoreClient.java:810)
at
org.apache.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient.isOpen(HiveClientCache.java:277)
at org.apache.hcatalog.common.HiveClientCache.get(HiveClientCache.java:147)
at org.apache.hcatalog.common.HCatUtil.getHiveClient(HCatUtil.java:547)
at
org.apache.hcatalog.mapreduce.InitializeInput.getInputJobInfo(InitializeInput.java:104)
at
org.apache.hcatalog.mapreduce.InitializeInput.setInput(InitializeInput.java:86)
at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:87)
at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:56)
at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:48)
at com.bigdata.UseHCat.run(UseHCat.java:64)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
at com.bigdata.UseHCat.main(UseHCat.java:91)
 14/03/05 15:17:27 INFO metastore.HiveMetaStore: 0: get_table : db=default
tbl=simple
14/03/05 15:17:27 INFO HiveMetaStore.audit: ugi=dk ip=unknown-ip-addr
cmd=get_table
: db=default tbl=simple
14/03/05 15:17:27 INFO DataNucleus.Datastore: The class
"org.apache.hadoop.hive.metastore.model.MFieldSchema" is tagged as
"embedded-only" so does not have its own datastore table.
14/03/05 15:17:27 INFO DataNucleus.Datastore: The class
"org.apache.hadoop.hive.metastore.model.MOrder" is tagged as
"embedded-only" so does not have its own datastore table.

         Exception in thread "main" java.io.IOException:
NoSuchObjectException(message:default.simple table not found)

at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:89)
at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:56)
at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:48)
at com.bigdata.UseHCat.run(UseHCat.java:64)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
at com.bigdata.UseHCat.main(UseHCat.java:91)
        Caused by: NoSuchObjectException(message:default.simple table not
found)
at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_table(HiveMetaStore.java:1373)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:103)
at com.sun.proxy.$Proxy7.get_table(Unknown Source)
at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getTable(HiveMetaStoreClient.java:854)
at org.apache.hcatalog.common.HCatUtil.getTable(HCatUtil.java:193)
at
org.apache.hcatalog.mapreduce.InitializeInput.getInputJobInfo(InitializeInput.java:105)
at
org.apache.hcatalog.mapreduce.InitializeInput.setInput(InitializeInput.java:86)
at
org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:87)
... 6 more
14/03/05 15:17:29 INFO metastore.HiveMetaStore: 1: Shutting down the object
store...
14/03/05 15:17:29 INFO HiveMetaStore.audit: ugi=dk ip=unknown-ip-addr
cmd=Shutting
down the object store...
14/03/05 15:17:29 INFO metastore.HiveMetaStore: 1: Metastore shutdown
complete.
14/03/05 15:17:29 INFO HiveMetaStore.audit: ugi=dk ip=unknown-ip-addr
cmd=Metastore
shutdown complete.

Accessing Hive Table Data with MapReduce

Reply via email to