pan3793 commented on code in PR #50022: URL: https://github.com/apache/spark/pull/50022#discussion_r1969188058
########## sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala: ########## @@ -1407,13 +1410,83 @@ private[hive] object HiveClientImpl extends Logging { case _ => new HiveConf(conf, classOf[HiveConf]) } - try { + val hive = try { Hive.getWithoutRegisterFns(hiveConf) } catch { // SPARK-37069: not all Hive versions have the above method (e.g., Hive 2.3.9 has it but - // 2.3.8 don't), therefore here we fallback when encountering the exception. + // 2.3.8 doesn't), therefore here we fallback when encountering the exception. case _: NoSuchMethodError => Hive.get(hiveConf) } + + // Follow behavior of HIVE-26633 (4.0.0), only apply the max message size when + // `hive.thrift.client.max.message.size` is set and the value is positive + Option(hiveConf.get("hive.thrift.client.max.message.size")) + .map(HiveConf.toSizeBytes(_).toInt).filter(_ > 0) + .foreach { maxMessageSize => + logDebug(s"Trying to set metastore client thrift max message to $maxMessageSize") + configureMaxThriftMessageSize(hiveConf, hive.getMSC, maxMessageSize) + } + + hive + } + + private def getFieldValue[T](obj: Any, fieldName: String): T = { + val field = obj.getClass.getDeclaredField(fieldName) + field.setAccessible(true) + field.get(obj).asInstanceOf[T] + } + + private def getFieldValue[T](obj: Any, clazz: Class[_], fieldName: String): T = { + val field = clazz.getDeclaredField(fieldName) + field.setAccessible(true) + field.get(obj).asInstanceOf[T] + } + + // SPARK-49489: a surgery for Hive 2.3.10 due to lack of HIVE-26633 + private def configureMaxThriftMessageSize( + hiveConf: HiveConf, msClient: IMetaStoreClient, maxMessageSize: Int): Unit = try { + msClient match { + // Hive uses Java Dynamic Proxy to enhance the MetaStoreClient to support synchronization + // and retrying, we should unwrap and access the underlying MetaStoreClient instance firstly + case proxy if JdkProxy.isProxyClass(proxy.getClass) => + JdkProxy.getInvocationHandler(proxy) match { + case syncHandler if 
syncHandler.getClass.getName.endsWith("SynchronizedHandler") => + val wrappedMsc = getFieldValue[IMetaStoreClient](syncHandler, "client") + configureMaxThriftMessageSize(hiveConf, wrappedMsc, maxMessageSize) + case retryHandler: RetryingMetaStoreClient => + val wrappedMsc = getFieldValue[IMetaStoreClient](retryHandler, "base") + configureMaxThriftMessageSize(hiveConf, wrappedMsc, maxMessageSize) + case _ => + } + case msc: HiveMetaStoreClient if !msc.isLocalMetaStore => + @tailrec + def configure(t: TTransport): Unit = t match { + // Unwrap and access the underlying TTransport when security enabled (Kerberos) + case tTransport: TFilterTransport => Review Comment: @Madhukar525722 Kerberos cases are addressed here ########## sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala: ########## @@ -1407,13 +1410,83 @@ private[hive] object HiveClientImpl extends Logging { case _ => new HiveConf(conf, classOf[HiveConf]) } - try { + val hive = try { Hive.getWithoutRegisterFns(hiveConf) } catch { // SPARK-37069: not all Hive versions have the above method (e.g., Hive 2.3.9 has it but - // 2.3.8 don't), therefore here we fallback when encountering the exception. + // 2.3.8 doesn't), therefore here we fallback when encountering the exception. 
case _: NoSuchMethodError => Hive.get(hiveConf) } + + // Follow behavior of HIVE-26633 (4.0.0), only apply the max message size when + // `hive.thrift.client.max.message.size` is set and the value is positive + Option(hiveConf.get("hive.thrift.client.max.message.size")) + .map(HiveConf.toSizeBytes(_).toInt).filter(_ > 0) + .foreach { maxMessageSize => + logDebug(s"Trying to set metastore client thrift max message to $maxMessageSize") + configureMaxThriftMessageSize(hiveConf, hive.getMSC, maxMessageSize) + } + + hive + } + + private def getFieldValue[T](obj: Any, fieldName: String): T = { + val field = obj.getClass.getDeclaredField(fieldName) + field.setAccessible(true) + field.get(obj).asInstanceOf[T] + } + + private def getFieldValue[T](obj: Any, clazz: Class[_], fieldName: String): T = { + val field = clazz.getDeclaredField(fieldName) + field.setAccessible(true) + field.get(obj).asInstanceOf[T] + } + + // SPARK-49489: a surgery for Hive 2.3.10 due to lack of HIVE-26633 + private def configureMaxThriftMessageSize( + hiveConf: HiveConf, msClient: IMetaStoreClient, maxMessageSize: Int): Unit = try { + msClient match { + // Hive uses Java Dynamic Proxy to enhance the MetaStoreClient to support synchronization + // and retrying, we should unwrap and access the underlying MetaStoreClient instance firstly + case proxy if JdkProxy.isProxyClass(proxy.getClass) => + JdkProxy.getInvocationHandler(proxy) match { + case syncHandler if syncHandler.getClass.getName.endsWith("SynchronizedHandler") => + val wrappedMsc = getFieldValue[IMetaStoreClient](syncHandler, "client") + configureMaxThriftMessageSize(hiveConf, wrappedMsc, maxMessageSize) + case retryHandler: RetryingMetaStoreClient => + val wrappedMsc = getFieldValue[IMetaStoreClient](retryHandler, "base") + configureMaxThriftMessageSize(hiveConf, wrappedMsc, maxMessageSize) + case _ => + } + case msc: HiveMetaStoreClient if !msc.isLocalMetaStore => + @tailrec + def configure(t: TTransport): Unit = t match { + // Unwrap and 
access the underlying TTransport when security enabled (Kerberos) + case tTransport: TFilterTransport => Review Comment: @Madhukar525722 Kerberos cases are addressed here -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org