Sorry, the test should be following (changed extract_shas to extract_product_category): import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import org.testng.annotations.Test; import java.util.ArrayList;import java.util.List; public class TestGenericUDFExtractProductCategory{ ArrayList<String> fieldNames = new ArrayList<String>(); ArrayList<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>(); @Test public void simpleTest() throws Exception { ListObjectInspector firstInspector = new MyListObjectInspector(); ArrayList test = new ArrayList(); test.add("test"); ArrayList test2 = new ArrayList(); test2.add(test); StructObjectInspector soi = ObjectInspectorFactory.getStandardStructObjectInspector(test, test2); fieldNames.add("productCategory"); fieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); GenericUDF.DeferredObject firstDeferredObject = new MyDeferredObject(test2); GenericUDF extract_product_category = new GenericUDFExtractProductCategory(); extract_product_category.initialize(new ObjectInspector[]{firstInspector}); extract_product_category.evaluate(new DeferredObject[]{firstDeferredObject}); } public class MyDeferredObject implements DeferredObject { private Object value; public MyDeferredObject(Object value) { this.value = value; } @Override public Object get() throws HiveException { return value; } } private class MyListObjectInspector implements ListObjectInspector { @Override public ObjectInspector getListElementObjectInspector() { return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldObjectInspectors); } @Override public Object getListElement(Object data, int index) { List myList = (List) data; if (myList == null || index > myList.size()) { return null; } return myList.get(index); } @Override public int getListLength(Object data) { if (data == null) { return -1; } return ((List) data).size(); } @Override public List<?> getList(Object data) { return (List) data; } @Override public String getTypeName() { return null; //To change body of implemented methods use File | Settings | File Templates. } @Override public Category getCategory() { return Category.LIST; } }} From: pete....@outlook.com To: user@hive.apache.org Subject: A GenericUDF Function to Extract a Field From an Array of Structs Date: Thu, 28 Mar 2013 14:16:33 -0700
I am trying to write a GenericUDF function to collect all of a specific struct field(s) within an array for each record, and return them in an array as well. I wrote the UDF (as below), and it seems to work but: 1) It does not work when I am performing this on an external table, it works fine on a managed table, any idea? 2) I am having a tough time writing a test on this. I have attached the test I have so far, and it does not work, always getting 'java.util.ArrayList cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector' or cannot cast String to LazyString', my question is how do I supply a list of structs for the evalue method? Any help will be greatly appreciated. Thanks,Peter The table: CREATE EXTERNAL TABLE FOO ( TS string, customerId string, products array< struct<productCategory:string> > ) PARTITIONED BY (ds string) ROW FORMAT SERDE 'some.serde' WITH SERDEPROPERTIES ('error.ignore'='true') LOCATION 'some_locations' ; A row of record holds:1340321132000, 'some_company', [{"productCategory":"footwear"},{"productCategory":"eyewear"}] This is my code: import org.apache.hadoop.hive.ql.exec.Description;import org.apache.hadoop.hive.ql.exec.UDFArgumentException;import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.serde2.lazy.LazyString;import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.StructField;import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;import org.apache.hadoop.io.Text; import java.util.ArrayList; @Description(name = "extract_product_category", value = "_FUNC_( array< struct<productCategory:string> > ) - Collect all product category field values inside an array of struct(s), and return the results in an array<string>", extended = "Example:\n SELECT _FUNC_(array_of_structs_with_product_category_field)")public class GenericUDFExtractProductCategory extends GenericUDF{ private ArrayList ret; private ListObjectInspector listOI; private StructObjectInspector structOI; private ObjectInspector prodCatOI; @Override public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { if (args.length != 1) { throw new UDFArgumentLengthException("The function extract_product_category() requires exactly one argument."); } if (args[0].getCategory() != Category.LIST) { throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_product_category but " + args[0].getTypeName() + " is found instead"); } listOI = ((ListObjectInspector) args[0]); structOI = ((StructObjectInspector) listOI.getListElementObjectInspector()); if (structOI.getAllStructFieldRefs().size() != 1) { throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one"); } StructField productCategoryField = structOI.getStructFieldRef("productCategory"); //If not, throw exception if (productCategoryField == null) { throw new UDFArgumentTypeException(0, "NO \"productCategory\" field in input structure"); } //Are they of the correct types? //We store these object inspectors for use in the evaluate() method prodCatOI = productCategoryField.getFieldObjectInspector(); //First are they primitives if (prodCatOI.getCategory() != Category.PRIMITIVE) { throw new UDFArgumentTypeException(0, "productCategory field must be of string type"); } //Are they of the correct primitives? if (((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) { throw new UDFArgumentTypeException(0, "productCategory field must be of string type"); } ret = new ArrayList(); return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); } @Override public ArrayList evaluate(DeferredObject[] arguments) throws HiveException { ret.clear(); if (arguments.length != 1) { return null; } if (arguments[0].get() == null) { return null; } int numElements = listOI.getListLength(arguments[0].get()); for (int i = 0; i < numElements; i++) { LazyString prodCatDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("productCategory"))); Text productCategoryValue = ((StringObjectInspector) prodCatOI).getPrimitiveWritableObject(prodCatDataObject); ret.add(productCategoryValue); } return ret; } @Override public String getDisplayString(String[] strings) { assert (strings.length > 0); StringBuilder sb = new StringBuilder(); sb.append("extract_product_category("); sb.append(strings[0]); sb.append(")"); return sb.toString(); }} My Test: import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import org.testng.annotations.Test; import java.util.ArrayList;import java.util.List; public class TestGenericUDFExtractShas{ ArrayList<String> fieldNames = new ArrayList<String>(); ArrayList<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>(); @Test public void simpleTest() throws Exception { ListObjectInspector firstInspector = new MyListObjectInspector(); ArrayList test = new ArrayList(); test.add("test"); ArrayList test2 = new ArrayList(); test2.add(test); StructObjectInspector soi = ObjectInspectorFactory.getStandardStructObjectInspector(test, test2); fieldNames.add("productCategory"); fieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); GenericUDF.DeferredObject firstDeferredObject = new MyDeferredObject(test2); GenericUDF extract_shas = new GenericUDFExtractShas(); extract_shas.initialize(new ObjectInspector[]{firstInspector}); extract_shas.evaluate(new DeferredObject[]{firstDeferredObject}); } public class MyDeferredObject implements DeferredObject { private Object value; public MyDeferredObject(Object value) { this.value = value; } @Override public Object get() throws HiveException { return value; } } private class MyListObjectInspector implements ListObjectInspector { @Override public ObjectInspector getListElementObjectInspector() { return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldObjectInspectors); } @Override public Object getListElement(Object data, int index) { List myList = (List) data; if (myList == null || index > myList.size()) { return null; } return myList.get(index); } @Override public int getListLength(Object data) { if (data == null) { return -1; } return ((List) data).size(); } @Override public List<?> getList(Object data) { return (List) data; } @Override public String getTypeName() { return null; //To change body of implemented methods use File | Settings | File Templates. } @Override public Category getCategory() { return Category.LIST; } }}