korowa commented on code in PR #13768:
URL: https://github.com/apache/datafusion/pull/13768#discussion_r1887404004
##########
datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs:
##########
@@ -228,6 +228,85 @@ struct BloomFilterStatistics {
column_sbbf: HashMap<String, (Sbbf, Type)>,
}
+impl BloomFilterStatistics {
+ /// Helper function for checking if [`Sbbf`] filter contains
[`ScalarValue`].
+ ///
+ /// In case the type of scalar is not supported, returns `true`, assuming
that the
+ /// value may be present.
+ fn check_scalar(sbbf: &Sbbf, value: &ScalarValue, parquet_type: &Type) ->
bool {
+ match value {
+ ScalarValue::Utf8(Some(v))
+ | ScalarValue::Utf8View(Some(v))
+ | ScalarValue::LargeUtf8(Some(v)) => sbbf.check(&v.as_str()),
+ ScalarValue::Binary(Some(v)) | ScalarValue::BinaryView(Some(v)) =>
{
+ sbbf.check(v)
+ }
+ ScalarValue::FixedSizeBinary(_size, Some(v)) => sbbf.check(v),
+ ScalarValue::Boolean(Some(v)) => sbbf.check(v),
+ ScalarValue::Float64(Some(v)) => sbbf.check(v),
+ ScalarValue::Float32(Some(v)) => sbbf.check(v),
+ ScalarValue::Int64(Some(v)) => sbbf.check(v),
+ ScalarValue::Int32(Some(v)) => sbbf.check(v),
+ ScalarValue::UInt64(Some(v)) => sbbf.check(v),
+ ScalarValue::UInt32(Some(v)) => sbbf.check(v),
+ ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
+ Type::INT32 => {
+
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
+ // All physical type are little-endian
+ if *p > 9 {
+ //DECIMAL can be used to annotate the following types:
+ //
+ // int32: for 1 <= precision <= 9
+ // int64: for 1 <= precision <= 18
+ return true;
+ }
+ let b = (*v as i32).to_le_bytes();
+ // Use Decimal constructor after
https://github.com/apache/arrow-rs/issues/5325
+ let decimal = Decimal::Int32 {
+ value: b,
+ precision: *p as i32,
+ scale: *s as i32,
+ };
+ sbbf.check(&decimal)
+ }
+ Type::INT64 => {
+ if *p > 18 {
+ return true;
+ }
+ let b = (*v as i64).to_le_bytes();
+ let decimal = Decimal::Int64 {
+ value: b,
+ precision: *p as i32,
+ scale: *s as i32,
+ };
+ sbbf.check(&decimal)
+ }
+ Type::FIXED_LEN_BYTE_ARRAY => {
+ // keep with from_bytes_to_i128
+ let b = v.to_be_bytes().to_vec();
+ // Use Decimal constructor after
https://github.com/apache/arrow-rs/issues/5325
+ let decimal = Decimal::Bytes {
+ value: b.into(),
+ precision: *p as i32,
+ scale: *s as i32,
+ };
+ sbbf.check(&decimal)
+ }
+ _ => true,
+ },
+ // Bloom filter pruning is performed only for Utf8 dictionary
types since
+ // pruning predicate is not created for Dictionary(Numeric/Binary)
types
+ ScalarValue::Dictionary(_, inner) => match inner.as_ref() {
Review Comment:
Indeed, after rechecking I found that it's only a matter of casting the
literal to the exact column type. I'll update the filter check function and
add additional tests for more data types.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]