This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit ef0994193758523aec0eb3398e016ec9b2bea7c6 Author: yagagagaga <zhangminkefromflyd...@gmail.com> AuthorDate: Mon Oct 16 16:47:21 2023 +0800 [Improvement](hive-udf)(doc) minimize hive-udf and add some docs. (#24786) --- docs/en/docs/ecosystem/hive-bitmap-udf.md | 56 ++++++++++++++++++++- docs/zh-CN/docs/ecosystem/hive-bitmap-udf.md | 56 ++++++++++++++++++++- fe/hive-udf/pom.xml | 75 ++++++++++++++++++++-------- 3 files changed, 163 insertions(+), 24 deletions(-) diff --git a/docs/en/docs/ecosystem/hive-bitmap-udf.md b/docs/en/docs/ecosystem/hive-bitmap-udf.md index 6629b9d6a24..7bb93bc90ba 100644 --- a/docs/en/docs/ecosystem/hive-bitmap-udf.md +++ b/docs/en/docs/ecosystem/hive-bitmap-udf.md @@ -72,7 +72,7 @@ mvn package -Dmaven.test.skip=true --You can also just package the hive-udf module mvn package -pl hive-udf -am -Dmaven.test.skip=true ``` -After packaging and compiling, enter the hive-udf directory and there will be a target directory,There will be hive-udf-jar-with-dependencies.jar package +After packaging and compiling, enter the hive-udf directory; the target directory there will contain the hive-udf.jar package ```sql -- Load the Hive Bitmap Udf jar package (Upload the compiled hive-udf jar package to HDFS) @@ -108,4 +108,58 @@ select k1,bitmap_union(uuid) from hive_bitmap_table group by k1 ## Hive Bitmap import into Doris +<version since="2.0.2"> + +### Method 1:Catalog (recommended) + +</version> + +When a Hive table is created with the TEXT format, Hive stores Binary columns as base64-encoded strings. Therefore, the binary data can be directly saved as Bitmap through the bitmap_from_base64 function by using Doris's Hive Catalog. + +Here is a full example: + +1. Creating Hive Tables in Hive + +```sql +CREATE TABLE IF NOT EXISTS `test`.`hive_bitmap_table`( +`k1` int COMMENT '', +`k2` String COMMENT '', +`k3` String COMMENT '', +`uuid` binary COMMENT 'bitmap' +) stored as textfile +``` + +2. 
[Creating a Catalog in Doris](../lakehouse/multi-catalog/hive) + +```sql +CREATE CATALOG hive PROPERTIES ( + 'type'='hms', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9083' +); +``` + +3. Create Doris internal table + +```sql +CREATE TABLE IF NOT EXISTS `test`.`doris_bitmap_table`( + `k1` int COMMENT '', + `k2` String COMMENT '', + `k3` String COMMENT '', + `uuid` BITMAP BITMAP_UNION COMMENT 'bitmap' +) +AGGREGATE KEY(k1, k2, k3) +DISTRIBUTED BY HASH(`k1`) BUCKETS 1 +PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" +); +``` + +4. Inserting data from Hive into Doris + +```sql +insert into doris_bitmap_table select k1, k2, k3, bitmap_from_base64(uuid) from hive.test.hive_bitmap_table; +``` + +### Method 2:Spark Load + see details: [Spark Load](../data-operate/import/import-way/spark-load-manual.md) -> Basic operation -> Create load(Example 3: when the upstream data source is hive binary type table) diff --git a/docs/zh-CN/docs/ecosystem/hive-bitmap-udf.md b/docs/zh-CN/docs/ecosystem/hive-bitmap-udf.md index 7f886352ed4..d10a9250283 100644 --- a/docs/zh-CN/docs/ecosystem/hive-bitmap-udf.md +++ b/docs/zh-CN/docs/ecosystem/hive-bitmap-udf.md @@ -74,7 +74,7 @@ mvn package -Dmaven.test.skip=true --也可以只打hive-udf module mvn package -pl hive-udf -am -Dmaven.test.skip=true ``` -打包编译完成进入hive-udf目录会有target目录,里面就会有打包完成的hive-udf-jar-with-dependencies.jar包 +打包编译完成进入hive-udf目录会有target目录,里面就会有打包完成的hive-udf.jar包 ```sql @@ -117,4 +117,58 @@ select k1,bitmap_union(uuid) from hive_bitmap_table group by k1 ## Hive bitmap 导入 doris +<version since="2.0.2"> + +### 方法一:Catalog (推荐) + +</version> + +创建 Hive 表指定为 TEXT 格式,此时,对于 Binary 类型,Hive 会以 base64 编码的字符串形式保存,此时可以通过 Hive Catalog 的形式,直接将位图数据通过 bitmap_from_base64 函数插入到 Doris 内部。 + +以下是一个完整的例子: + +1. 在 Hive 中创建 Hive 表 + +```sql +CREATE TABLE IF NOT EXISTS `test`.`hive_bitmap_table`( +`k1` int COMMENT '', +`k2` String COMMENT '', +`k3` String COMMENT '', +`uuid` binary COMMENT 'bitmap' +) stored as textfile +``` + +2. 
[在 Doris 中创建 Catalog](../lakehouse/multi-catalog/hive) + +```sql +CREATE CATALOG hive PROPERTIES ( + 'type'='hms', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9083' +); +``` + +3. 创建 Doris 内表 + +```sql +CREATE TABLE IF NOT EXISTS `test`.`doris_bitmap_table`( + `k1` int COMMENT '', + `k2` String COMMENT '', + `k3` String COMMENT '', + `uuid` BITMAP BITMAP_UNION COMMENT 'bitmap' +) +AGGREGATE KEY(k1, k2, k3) +DISTRIBUTED BY HASH(`k1`) BUCKETS 1 +PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" +); +``` + +4. 从 Hive 插入数据到 Doris 中 + +```sql +insert into doris_bitmap_table select k1, k2, k3, bitmap_from_base64(uuid) from hive.test.hive_bitmap_table; +``` + +### 方法二:Spark Load + 详见: [Spark Load](../data-operate/import/import-way/spark-load-manual.md) -> 基本操作 -> 创建导入 (示例3:上游数据源是hive binary类型情况) diff --git a/fe/hive-udf/pom.xml b/fe/hive-udf/pom.xml index eb970b399f9..dbb620e6596 100644 --- a/fe/hive-udf/pom.xml +++ b/fe/hive-udf/pom.xml @@ -35,22 +35,37 @@ under the License. 
</properties> <dependencies> <dependency> - <groupId>org.apache.doris</groupId> - <artifactId>hive-catalog-shade</artifactId> - </dependency> - <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient --> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient</artifactId> - </dependency> - <dependency> - <groupId>org.apache.velocity</groupId> - <artifactId>velocity-engine-core</artifactId> + <groupId>org.apache.hive</groupId> + <artifactId>hive-exec</artifactId> + <version>${hive.version}</version> + <scope>provided</scope> </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>fe-common</artifactId> <version>${project.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.hive</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>commons-codec</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.aspectj</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>*</artifactId> + </exclusion> + </exclusions> </dependency> </dependencies> <build> @@ -74,23 +89,39 @@ under the License. 
<plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-assembly-plugin</artifactId> + <artifactId>maven-shade-plugin</artifactId> <configuration> - <descriptorRefs> - <descriptorRef>jar-with-dependencies</descriptorRef> - </descriptorRefs> - <archive> - <manifest> - <mainClass></mainClass> - </manifest> - </archive> + <minimizeJar>true</minimizeJar> + <relocations> + <relocation> + <pattern>org.joda.time</pattern> + <shadedPattern>shade.doris.org.joda.time</shadedPattern> + </relocation> + <relocation> + <pattern>org.roaringbitmap</pattern> + <shadedPattern>shade.doris.org.roaringbitmap</shadedPattern> + </relocation> + </relocations> + <filters> + <filter> + <artifact>org.apache.logging.log4j:*</artifact> + <excludes> + <exclude>**</exclude> + </excludes> + </filter> + <filter> + <artifact>org.awaitility:*</artifact> + <excludes> + <exclude>**</exclude> + </excludes> + </filter> + </filters> </configuration> <executions> <execution> - <id>make-assembly</id> <phase>package</phase> <goals> - <goal>single</goal> + <goal>shade</goal> </goals> </execution> </executions> --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org