[SUPPORT] Duplicate Flink Hudi data
See original GitHub issue. Describe the problem you faced:
Duplicate Flink Hudi data
To Reproduce
Steps to reproduce the behavior:
-- Step 1: datagen source table producing 20 rows/second with ids in [1, 1000000].
-- NOTE: the original used typographic quotes (‘ ’), which Flink SQL rejects;
-- all option keys/values must use ASCII single quotes.
CREATE TABLE hudi.datagen_test3 (
    id BIGINT,
    -- 50 filler VARCHAR columns to widen the row for the reproduction
    name1 VARCHAR(10), name2 VARCHAR(10), name3 VARCHAR(10), name4 VARCHAR(10), name5 VARCHAR(10),
    name6 VARCHAR(10), name7 VARCHAR(10), name8 VARCHAR(10), name9 VARCHAR(10), name10 VARCHAR(10),
    name11 VARCHAR(10), name12 VARCHAR(10), name13 VARCHAR(10), name14 VARCHAR(10), name15 VARCHAR(10),
    name16 VARCHAR(10), name17 VARCHAR(10), name18 VARCHAR(10), name19 VARCHAR(10), name20 VARCHAR(10),
    name21 VARCHAR(10), name22 VARCHAR(10), name23 VARCHAR(10), name24 VARCHAR(10), name25 VARCHAR(10),
    name26 VARCHAR(10), name27 VARCHAR(10), name28 VARCHAR(10), name29 VARCHAR(10), name30 VARCHAR(10),
    name31 VARCHAR(10), name32 VARCHAR(10), name33 VARCHAR(10), name34 VARCHAR(10), name35 VARCHAR(10),
    name36 VARCHAR(10), name37 VARCHAR(10), name38 VARCHAR(10), name39 VARCHAR(10), name40 VARCHAR(10),
    name41 VARCHAR(10), name42 VARCHAR(10), name43 VARCHAR(10), name44 VARCHAR(10), name45 VARCHAR(10),
    name46 VARCHAR(10), name47 VARCHAR(10), name48 VARCHAR(10), name49 VARCHAR(10), name50 VARCHAR(10),
    name VARCHAR(20),
    age INT,
    birthday TIMESTAMP(3),
    ts TIMESTAMP(3)
) WITH (
    'connector' = 'datagen',
    'rows-per-second' = '20',
    'fields.id.min' = '1',
    'fields.id.max' = '1000000'
);
2.
-- Step 2: Hudi MERGE_ON_READ sink table, partitioned by partition_str and
-- synced to the Hive metastore.
-- Fixes relative to the pasted original:
--   * typographic quotes (‘ ’) replaced with ASCII single quotes (required by Flink SQL)
--   * en-dash "–" comment marker replaced with the SQL comment marker "--"
--   * Chinese inline comments translated to English
CREATE TABLE hudi.datagen_hudi8 (
    id BIGINT,
    name1 VARCHAR(10), name2 VARCHAR(10), name3 VARCHAR(10), name4 VARCHAR(10), name5 VARCHAR(10),
    name6 VARCHAR(10), name7 VARCHAR(10), name8 VARCHAR(10), name9 VARCHAR(10), name10 VARCHAR(10),
    name11 VARCHAR(10), name12 VARCHAR(10), name13 VARCHAR(10), name14 VARCHAR(10), name15 VARCHAR(10),
    name16 VARCHAR(10), name17 VARCHAR(10), name18 VARCHAR(10), name19 VARCHAR(10), name20 VARCHAR(10),
    name21 VARCHAR(10), name22 VARCHAR(10), name23 VARCHAR(10), name24 VARCHAR(10), name25 VARCHAR(10),
    name26 VARCHAR(10), name27 VARCHAR(10), name28 VARCHAR(10), name29 VARCHAR(10), name30 VARCHAR(10),
    name31 VARCHAR(10), name32 VARCHAR(10), name33 VARCHAR(10), name34 VARCHAR(10), name35 VARCHAR(10),
    name36 VARCHAR(10), name37 VARCHAR(10), name38 VARCHAR(10), name39 VARCHAR(10), name40 VARCHAR(10),
    name41 VARCHAR(10), name42 VARCHAR(10), name43 VARCHAR(10), name44 VARCHAR(10), name45 VARCHAR(10),
    name46 VARCHAR(10), name47 VARCHAR(10), name48 VARCHAR(10), name49 VARCHAR(10), name50 VARCHAR(10),
    name VARCHAR(20),
    birthday TIMESTAMP(3),
    ts TIMESTAMP(3),
    partition_str VARCHAR(20),
    PRIMARY KEY (id) NOT ENFORCED  -- the record key (primary key) must be declared
)
PARTITIONED BY (partition_str)
WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://test/user/hive/warehouse/hudi.db/datagen_hudi8',
    'hoodie.datasource.write.recordkey.field' = 'id',  -- record key field
    'write.tasks' = '1',
    'compaction.tasks' = '1',
    'write.precombine.field' = 'ts',        -- field used to pick the latest record on dedup
    'table.type' = 'MERGE_ON_READ',         -- default is COPY_ON_WRITE; MERGE_ON_READ also allowed
    'compaction.async.enabled' = 'true',    -- enable asynchronous compaction
    'compaction.trigger.strategy' = 'num_or_time',
    'compaction.delta_commits' = '2',       -- default is 5
    'compaction.delta_seconds' = '120',
    'hive_sync.enable' = 'true',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'thrift://test:53083',
    'hive_sync.table' = 'datagen_hudi8',
    'hive_sync.db' = 'hudi',
    'index.global.enabled' = 'true',
    'index.bootstrap.enabled' = 'true'
);
3.
-- Step 3: stream the datagen rows into the Hudi table, deriving the
-- partition column from the birthday timestamp.
-- Fix relative to the pasted original: DATE_FORMAT used typographic quotes
-- (‘yyyyMMdd’), which Flink SQL rejects; replaced with ASCII single quotes.
INSERT INTO hudi.datagen_hudi8
SELECT
    id,
    name1, name2, name3, name4, name5,
    name6, name7, name8, name9, name10,
    name11, name12, name13, name14, name15,
    name16, name17, name18, name19, name20,
    name21, name22, name23, name24, name25,
    name26, name27, name28, name29, name30,
    name31, name32, name33, name34, name35,
    name36, name37, name38, name39, name40,
    name41, name42, name43, name44, name45,
    name46, name47, name48, name49, name50,
    name,
    birthday,
    ts,
    DATE_FORMAT(birthday, 'yyyyMMdd') AS partition_str
FROM hudi.datagen_test3;
4.
Expected behavior
A clear and concise description of what you expected to happen.
Environment Description
-
Hudi version : 0.9.0
-
Flink version :1.12.2
-
Hadoop version : 2.7.7
select id, count(*) from hudi.datagen_hudi8 group by id having count(*) > 1; sql result:
Issue Analytics
- State:
- Created 2 years ago
- Comments:13 (11 by maintainers)
Top GitHub Comments
try to set index.global.enabled=true
Closing due to inactivity, and there is a proposed solution https://github.com/apache/hudi/issues/4508#issuecomment-1006239768