jayshrivastava commented on issue #14218:
URL: https://github.com/apache/datafusion/issues/14218#issuecomment-3801143311

   I'm also getting this error when aggregating on a dictionary-encoded column:
   ```
   /// Writes `batches` to a uniquely named parquet file in the system temp
   /// directory and registers that file with `ctx` under `table_name`.
   ///
   /// On success the path of the written file is returned; this function does
   /// not remove the file itself.
   ///
   /// # Errors
   ///
   /// Returns `DataFusionError::Execution` when `batches` is empty or when any
   /// batch's schema differs from `schema`. Failures from creating the file,
   /// writing or closing the parquet writer, and registering the table are
   /// propagated with `?`.
   pub async fn register_temp_parquet_table(
       table_name: &str,
       schema: SchemaRef,
       batches: Vec<RecordBatch>,
       ctx: &SessionContext,
   ) -> Result<PathBuf> {
       // An empty batch list is rejected before any schema comparison happens.
       let Some(first_batch) = batches.first() else {
           return Err(datafusion::error::DataFusionError::Execution(
               "cannot create parquet file from empty batch list".to_string(),
           ));
       };
       // The writer is configured from the first batch's schema; grab it now so
       // the borrow of `batches` ends before the vector is consumed below.
       let writer_schema = first_batch.schema();

       if batches.iter().any(|batch| batch.schema() != schema) {
           return Err(datafusion::error::DataFusionError::Execution(
               "all batches must have the same schema".to_string(),
           ));
       }

       // A fresh UUID in the file name keeps repeated registrations of the same
       // table name from colliding in the temp directory.
       let parquet_path =
           std::env::temp_dir().join(format!("{table_name}_{}.parquet", Uuid::new_v4()));

       let parquet_file = std::fs::File::create(&parquet_path)?;
       let mut parquet_writer = ArrowWriter::try_new(parquet_file, writer_schema, None)?;
       batches
           .into_iter()
           .try_for_each(|batch| parquet_writer.write(&batch))?;
       parquet_writer.close()?;

       let location = parquet_path.to_string_lossy();
       ctx.register_parquet(table_name, location.as_ref(), ParquetReadOptions::default())
           .await?;

       Ok(parquet_path)
   }
   ```
   
   ```
   // Reproduction setup: a session context configured with two target
   // partitions and the default feature set.
   let config = SessionConfig::new().with_target_partitions(2);
   
   let state = SessionStateBuilder::new()
       .with_default_features()
       .with_config(config)
       .build();
   
   let ctx = SessionContext::from(state);
   
   // Five non-nullable columns; `company` is the dictionary-encoded one
   // (UInt16 keys, Utf8 values).
   let schema2 = Arc::new(Schema::new(vec![
       Field::new("id", DataType::Int32, false),
       Field::new("name", DataType::Utf8, false),
       Field::new("phone", DataType::Utf8, false),
       Field::new("balance", DataType::Float64, false),
       Field::new(
           "company",
           DataType::Dictionary(Box::new(DataType::UInt16), 
Box::new(DataType::Utf8)),
           false,
       ),
   ]));
   
   // A single three-row batch matching `schema2`.
   let batches2 = vec![
       RecordBatch::try_new(
           schema2.clone(),
           vec![
               Arc::new(Int32Array::from(vec![1, 2, 3])),
               Arc::new(StringArray::from(vec![
                   "customer1",
                   "customer2",
                   "customer3",
               ])),
               Arc::new(StringArray::from(vec![
                   "13-123-4567",
                   "31-456-7890",
                   "23-789-0123",
               ])),
               Arc::new(datafusion::arrow::array::Float64Array::from(vec![
                   100.5, 250.0, 50.25,
               ])),
               // Every row carries the same `company` value, collected into a
               // UInt16-keyed dictionary array.
               Arc::new(
                   vec!["company1", "company1", "company1"]
                       .into_iter()
                       .collect::<arrow::array::DictionaryArray<UInt16Type>>(),
               ),
           ],
       )
       .unwrap(),
   ];
   
   // Register the test data as parquet tables
   // NOTE(review): `schema1` and `batches1` are not defined in this excerpt, and
   // `schema2`/`batches2` above are never registered here even though the failing
   // query reads from `table2` — presumably the matching definitions and the
   // `table2` registration were trimmed from the snippet; confirm with the author.
   let _ = register_temp_parquet_table("table1", schema1, batches1, &ctx)
       .await
       .unwrap();
   ```
   
   The query `SELECT distinct company from table2` fails with 
`Shared(ArrowError(InvalidArgumentError("column types must match schema types, 
expected Dictionary(UInt16, Utf8) but found Utf8 at column index 0"), 
Some("")))`


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to