这个bug已经修复了。现在可以把left_table的bucket默认10改成32,我测了下没有这个问题了
On Tue, Jun 21, 2022 at 10:59 姜勇辉 <jian...@fun.tv> wrote: > *Bucket Shuffle Join* > > 问题: > > 当我在使用动态分区表left_table的单个分区join维表right_table时,发现每次join结果不一样。我不知道为什么会产生这种原因。 > > 我进行了一些分析,发现当左表left_table不是分区表时,join结果正常。下面为我的测试用例。 > > > > 测试Bucket Shuffle Join > > 1.创建left_table和right_table > > SQL > #创建左表 > create table left_table( > `data_date` date, > `id` int, > `name` varchar(256) > ) > duplicate key(data_date,id) > PARTITION BY RANGE(data_date)() > distributed by hash(`id`) buckets 10 > PROPERTIES > ( > "dynamic_partition.enable" = "true", > "dynamic_partition.time_unit" = "DAY", > "dynamic_partition.end" = "3", > "dynamic_partition.prefix" = "p", > "dynamic_partition.buckets" = "32", > "dynamic_partition.create_history_partition" = "true", > "dynamic_partition.history_partition_num" = "30" > ); > > #创建右表 > CREATE TABLE `right_table` ( > `id` int(11) NULL COMMENT "", > `city` varchar(500) NULL COMMENT "", > `name` varchar(256) NULL COMMENT "", > `sex` varchar(100) NULL COMMENT "" > ) ENGINE=OLAP > DUPLICATE KEY(`id`, `city`) > COMMENT "OLAP" > DISTRIBUTED BY HASH(`id`) BUCKETS 10 > PROPERTIES ( > "replication_allocation" = "tag.location.default: 3", > "in_memory" = "false", > "storage_format" = "V2" > ); > > 插入数据 > > left_table > > SQL > insert into left_table > values('2022-06-20',1,'a1'),('2022-06-20',1,'a1'),('2022-06-20',1,'a1'),('2022-06-20',2,'a2'),('2022-06-20',2,'a2'),('2022-06-20',2,null),('2022-06-20',null,'a2'),('2022-06-20',null,null),('2022-06-20',3,'a3'),('2022-06-20',3,'a3'),('2022-06-20',4,'a4'),('2022-06-20',4,'a4'),('2022-06-20',5,'a5'),('2022-06-20',6,'a6'),('2022-06-20',7,'a7'),('2022-06-20',8,'a8'),('2022-06-20',9,'a9') > > > > right_table > > SQL > insert into right_table values(1,'bj','a1','man'),(2 > <https://www.google.com/maps/search/a1','man'),(2?entry=gmail&source=g> > ,'sh','a2','man'),(3 > <https://www.google.com/maps/search/a2','man'),(3?entry=gmail&source=g> > ,'wh','a3','man'),(4,'gz','a4','man'),(5,'bj','a5','man'),(6,'tj','a6','man'),(7,'cd','a7','man'),(8,'xm','a8','woman'),(9,'nc','a9','woman') > > > > 查询结果,每次随机生成数据,join不准确 > > SQL > select data_date,city,count(*) vv, count(distinct sex) uv from left_table > a > left join right_table b on a.id = b.id > where data_date='2022-06-20' > group by data_date,city; > > 第一次运行 > > > > 第二次运行 > > > > 第三次运行 > > > > > > >