[ 
https://issues.apache.org/jira/browse/IMPALA-13818?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Quanlong Huang updated IMPALA-13818:
------------------------------------
    Epic Link: IMPALA-13915

> Wasting space due to duplicated TColumnType between THdfsPartitions
> -------------------------------------------------------------------
>
>                 Key: IMPALA-13818
>                 URL: https://issues.apache.org/jira/browse/IMPALA-13818
>             Project: IMPALA
>          Issue Type: Bug
>          Components: Backend, Frontend
>            Reporter: Quanlong Huang
>            Assignee: Quanlong Huang
>            Priority: Critical
>         Attachments: TDescriptorTable.png, THdfsPartition_mem_footprint-1.png
>
>
> THdfsPartition is used in at least two places:
>  * In the legacy catalog mode, catalogd propagates metadata to coordinators.
>  * In TQueryExecRequest, it's used in the tableDescriptors sent to the 
> backend as part of query plans and fragments.
> It has a list of expressions to represent the partition values:
> {code:java}
> struct THdfsPartition {
>   // These are Literal expressions
>   7: list<Exprs.TExpr> partitionKeyExprs
>   ...
> }{code}
> E.g. here is a partition (year=2009, month=1) of the functional.alltypes 
> table:
> {noformat}
> THdfsPartition {
>           07: partitionKeyExprs (list) = list<struct>[2] {
>             [0] = TExpr {
>               01: nodes (list) = list<struct>[1] {
>                 [0] = TExprNode {
>                   01: node_type (i32) = 2,
>                   02: type (struct) = TColumnType {
>                     01: types (list) = list<struct>[1] {
>                       [0] = TTypeNode {
>                         01: type (i32) = 0,
>                         02: scalar_type (struct) = TScalarType {
>                           01: type (i32) = 5,
>                         },
>                       },
>                     },
>                   },
>                   03: num_children (i32) = 0,
>                   04: is_constant (bool) = true,
>                   11: int_literal (struct) = TIntLiteral {
>                     01: value (i64) = 2009,
>                   },
>                   23: is_codegen_disabled (bool) = false,
>                 },
>               },
>             },
>             [1] = TExpr {
>               01: nodes (list) = list<struct>[1] {
>                 [0] = TExprNode {
>                   01: node_type (i32) = 2,
>                   02: type (struct) = TColumnType {
>                     01: types (list) = list<struct>[1] {
>                       [0] = TTypeNode {
>                         01: type (i32) = 0,
>                         02: scalar_type (struct) = TScalarType {
>                           01: type (i32) = 5,
>                         },
>                       },
>                     },
>                   },
>                   03: num_children (i32) = 0,
>                   04: is_constant (bool) = true,
>                   11: int_literal (struct) = TIntLiteral {
>                     01: value (i64) = 1,
>                   },
>                   23: is_codegen_disabled (bool) = false,
>                 },
>               },
>             },
>           },
>           09: file_desc (list) = list<struct>[1] {
>             [0] = THdfsFileDesc {
>               01: file_desc_data (string) = 
> "\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00 
> \x00\x1c\x00\x10\x00\x00\x00\b\x00\x04\x00\x0e\x00\x00\x00\x1c\x00\x00\x00\xce\x80\x17\xf3\x93\x01\x00\x00\xd1O\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\f\x00\x00\x00\x01\x00\x00\x00
>  
> \x00\x00\x00\n\x00\x00\x00090101.txt\x00\x00\f\x00\x14\x00\x00\x00\f\x00\b\x00\x04\x00\f\x00\x00\x00\x10\x00\x00\x00\x18\x00\x00\x00\xd1O\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x01\x00\x02\x00\x00\x00\x03\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00",
>             },
>           },
>           10: location (struct) = THdfsPartitionLocation {
>             01: prefix_index (i32) = 0,
>             02: suffix (string) = "year=2009/month=1",
>           },
>           11: access_level (i32) = 1,
>           12: stats (struct) = TTableStats {
>             01: num_rows (i64) = 310,
>           },
>           13: is_marked_cached (bool) = false,
>           14: id (i64) = 1,
>           15: hms_parameters (map) = map<string,string>[8] {
>             "STATS_GENERATED" -> "TASK",
>             "impala.events.catalogServiceId" -> 
> "3378db64f1864d2a:afdb738fe029883c",
>             "impala.events.catalogVersion" -> "12277",
>             "numFiles" -> "1",
>             "numFilesErasureCoded" -> "0",
>             "numRows" -> "310",
>             "totalSize" -> "20433",
>             "transient_lastDdlTime" -> "1734950224",
>           },
>           16: num_blocks (i64) = 1,
>           17: total_file_size_bytes (i64) = 20433,
>           19: has_incremental_stats (bool) = false,
>           25: partition_name (string) = "year=2009/month=1",
>           26: prev_id (i64) = -1,
>           27: hdfs_storage_descriptor (struct) = THdfsStorageDescriptor {
>             01: lineDelim (byte) = 0x0a,
>             02: fieldDelim (byte) = 0x2c,
>             03: collectionDelim (byte) = 0x2c,
>             04: mapKeyDelim (byte) = 0x2c,
>             05: escapeChar (byte) = 0x5c,
>             06: quoteChar (byte) = 0x2c,
>             07: fileFormat (i32) = 0,
>             08: blockSize (i32) = 0,
>           },
>         }{noformat}
> The partitionKeyExprs list repeats the partition column types in every 
> partition, which wastes memory and network bandwidth. Here is an example from 
> a heap dump:
> !THdfsPartition_mem_footprint-1.png|width=655,height=447!
> The query reads a single table with 6M partitions and 6M files. It hits an OOM 
> by exceeding the 2GB JVM byte array limit when serializing the DescriptorTable:
> {noformat}
> java.lang.Thread @ 0x7fdf6d220768 : Thread-12
>   at java.lang.OutOfMemoryError.<init>()V (OutOfMemoryError.java:48)
>   at java.util.Arrays.copyOf([BI)[B (Arrays.java:3236)
>   at java.io.ByteArrayOutputStream.toByteArray()[B 
> (ByteArrayOutputStream.java:191)
>   at org.apache.thrift.TSerializer.serialize(Lorg/apache/thrift/TBase;)[B 
> (TSerializer.java:85)
>   at 
> org.apache.impala.common.JniUtil.serializeToThrift(Lorg/apache/thrift/TBase;)[B
>  (JniUtil.java:109)
>   at 
> org.apache.impala.analysis.DescriptorTable.toSerializedThrift()Lorg/apache/impala/thrift/TDescriptorTableSerialized;
>  (DescriptorTable.java:260)
>   at 
> org.apache.impala.service.Frontend.getPlannedExecRequest(Lorg/apache/impala/service/Frontend$PlanCtx;Lorg/apache/impala/analysis/AnalysisContext$AnalysisResult;Lorg/apache/impala/util/EventSequence;)Lorg/apache/impala/thrift/TQueryExecRequest;
>  (Frontend.java:3138)
>   at 
> org.apache.impala.service.Frontend.doCreateExecRequest(Lorg/apache/impala/service/Frontend$PlanCtx;Ljava/util/List;Lorg/apache/impala/util/EventSequence;)Lorg/apache/impala/thrift/TExecRequest;
>  (Frontend.java:2893)
>   at 
> org.apache.impala.service.Frontend.getTExecRequest(Lorg/apache/impala/service/Frontend$PlanCtx;Lorg/apache/impala/util/EventSequence;)Lorg/apache/impala/thrift/TExecRequest;
>  (Frontend.java:2410)
>   at 
> org.apache.impala.service.Frontend.createExecRequest(Lorg/apache/impala/service/Frontend$PlanCtx;)Lorg/apache/impala/thrift/TExecRequest;
>  (Frontend.java:2036)
>   at org.apache.impala.service.JniFrontend.createExecRequest([B)[B 
> (JniFrontend.java:175){noformat}
> The TDescriptorTable objects occupy 9.67GB of memory. The duplicated partition 
> column types waste around 6.7GB (6M * 2 * 600 bytes) of memory.
> !TDescriptorTable.png|width=960,height=605!



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to