Copilot commented on code in PR #406: URL: https://github.com/apache/sedona-db/pull/406#discussion_r2590316693
########## rust/sedona-gdal/src/raster_io.rs: ########## @@ -0,0 +1,679 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::dataset::{geotransform_components, tile_size, to_banddatatype}; +use arrow::array::StructArray; +use arrow::error::ArrowError; +use gdal::raster::RasterBand; +use gdal::Dataset; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_raster::utils::{bytes_per_pixel, f64_to_bandtype_bytes}; +use sedona_schema::raster::{BandDataType, StorageType}; +use std::sync::Arc; + +// ============================================================================ +// Format-specific convenience wrappers +// ============================================================================ + +/// Reads a GeoTIFF file using GDAL and converts it into a StructArray of rasters. +/// +/// This is a convenience wrapper around [`read_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `filepath` - Path to the GeoTIFF file +/// * `tile_size_opt` - Optional tile size to override dataset metadata +pub fn read_geotiff( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result<Arc<StructArray>, ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + read_raster(filepath, tile_size_opt) +} + +/// Writes a tiled raster StructArray to a GeoTIFF file using GDAL. +/// +/// This is a convenience wrapper around [`write_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output GeoTIFF file +pub fn write_geotiff(raster_array: &StructArray, filepath: &str) -> Result<(), ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + write_raster(raster_array, filepath, "GTiff") +} + +/// Reads a raster file using GDAL and converts it into a StructArray of rasters. +/// +/// Currently only supports reading rasters into InDb storage type. +/// OutDb storage types are not yet implemented. +/// +/// # Arguments +/// * `filepath` - Path to the raster file +/// * `tile_size_opt` - Optional tile size to override dataset metadata (uses dataset metadata if None) +pub fn read_raster( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result<Arc<StructArray>, ArrowError> { + let dataset = Dataset::open(filepath).map_err(|err| ArrowError::ParseError(err.to_string()))?; + + // Extract geotransform components + let (origin_upperleft_x, origin_upperleft_y, scale_x, scale_y, skew_x, skew_y) = + geotransform_components(&dataset)?; + + let (raster_width, raster_height) = dataset.raster_size(); + + // Use provided tile dimensions or fall back to dataset metadata + let (tile_width, tile_height) = match tile_size_opt { + Some((w, h)) => (w, h), + _ => tile_size(&dataset)?, + }; + + let x_tile_count = raster_width.div_ceil(tile_width); + let y_tile_count = raster_height.div_ceil(tile_height); + + let mut raster_builder = RasterBuilder::new(x_tile_count * y_tile_count); + let band_count = dataset.raster_count(); + + for tile_y in 0..y_tile_count { + for tile_x in 0..x_tile_count { + let x_offset = tile_x * tile_width; + let y_offset = tile_y * tile_height; + + // Calculate geographic coordinates for this tile + // using the geotransform from the original raster + let tile_upperleft_x = + origin_upperleft_x + (x_offset as f64) * scale_x + (y_offset as f64) * skew_x; + let tile_upperleft_y = + origin_upperleft_y + (x_offset as f64) * skew_y + (y_offset as f64) * scale_y; + + // Create raster metadata for this tile with actual geotransform values + let tile_metadata = RasterMetadata { + width: tile_width as u64, + height: tile_height as u64, + upperleft_x: tile_upperleft_x, + upperleft_y: tile_upperleft_y, + scale_x, + scale_y, + skew_x, + skew_y, + }; + + raster_builder.start_raster(&tile_metadata, None)?; + + for band_number in 1..=band_count { + let band: RasterBand = dataset.rasterband(band_number).unwrap(); + + // The band size is always the full raster size, not the tile size + // We read a specific window from the band + let actual_tile_width = tile_width.min(raster_width - x_offset); + let actual_tile_height = tile_height.min(raster_height - y_offset); + + let data_type = to_banddatatype(band.band_type())?; + let nodata_value = band + .no_data_value() + .map(|val| f64_to_bandtype_bytes(val, data_type.clone())); + + let band_metadata = BandMetadata { + nodata_value, + storage_type: StorageType::InDb, + datatype: data_type.clone(), + outdb_url: None, + outdb_band_id: None, + }; + + raster_builder.start_band(band_metadata)?; + + // Reserve capacity and write band data directly from GDAL + let pixel_count = actual_tile_width * actual_tile_height; + let bytes_needed = pixel_count * bytes_per_pixel(data_type.clone()); + + // Pre-allocate a vector of the exact size needed + let mut band_data = vec![0u8; bytes_needed]; + + // Read directly into the pre-allocated buffer + read_band_data_into( + &band, + (x_offset as isize, y_offset as isize), + (actual_tile_width, actual_tile_height), + &data_type, + &mut band_data, + )?; + + // Write the band data + raster_builder.band_data_writer().append_value(&band_data); + + // Finalize the band + raster_builder.finish_band()?; + } + + // Finalize the raster + raster_builder.finish_raster()?; + } + } + + // Finalize the raster struct array + let raster_struct = raster_builder.finish()?; + Ok(Arc::new(raster_struct)) +} + +/// Helper function to read band data from GDAL directly into a pre-allocated byte buffer +/// Casts the buffer to the appropriate type and reads directly onto `output` +fn read_band_data_into( + band: &RasterBand, + window_origin: (isize, isize), + window_size: (usize, usize), + data_type: &BandDataType, + output: &mut [u8], +) -> Result<(), ArrowError> { + let pixel_count = window_size.0 * window_size.1; + + match data_type { + BandDataType::UInt8 => { + band.read_into_slice(window_origin, window_size, window_size, output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::UInt16 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u16, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Int16 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut i16, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::UInt32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Int32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut i32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Float32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut f32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Float64 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut f64, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + } +} + +/// Write a tiled raster StructArray to a raster file using GDAL. +/// +/// This is a generic function that works with any GDAL-supported raster format. +/// Currently only supports writing rasters with InDb storage type. +/// OutDb storage types will return a NotYetImplemented error. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output file +/// * `driver_name` - GDAL driver name (e.g., "GTiff", "Zarr") +fn write_raster( + raster_array: &StructArray, + filepath: &str, + driver_name: &str, +) -> Result<(), ArrowError> { + use gdal::{DriverManager, Metadata}; + use sedona_raster::array::RasterStructArray; + use sedona_raster::traits::RasterRef; + + let raster_struct_array = RasterStructArray::new(raster_array); + + if raster_struct_array.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Cannot write empty raster array".to_string(), + )); + } + + // Get the first raster to determine dimensions and band count + let first_raster = raster_struct_array.get(0)?; + let first_metadata = first_raster.metadata(); + let band_count = first_raster.bands().len(); + + if band_count == 0 { + return Err(ArrowError::InvalidArgumentError( + "Raster has no bands".to_string(), + )); + } + + let first_band = first_raster.bands().band(1)?; + let data_type = first_band.metadata().data_type(); + + // Calculate total raster dimensions by analyzing tile positions + let mut max_x: f64 = 0.0; + let mut max_y: f64 = 0.0; + let mut min_x: f64 = f64::MAX; + let mut min_y: f64 = f64::MAX; + + for i in 0..raster_struct_array.len() { + let raster = raster_struct_array.get(i)?; + let metadata = raster.metadata(); + + let tile_max_x = metadata.upper_left_x() + (metadata.width() as f64) * metadata.scale_x(); + let tile_max_y = metadata.upper_left_y() + (metadata.height() as f64) * metadata.scale_y(); + + max_x = max_x.max(tile_max_x); + max_y = max_y.max(tile_max_y); + min_x = min_x.min(metadata.upper_left_x()); + min_y = min_y.min(metadata.upper_left_y()); + } + + // Calculate total raster dimensions + let scale_x = first_metadata.scale_x(); + let scale_y = first_metadata.scale_y(); + let total_width = ((max_x - min_x) / scale_x).abs().round() as usize; + let total_height = ((max_y - min_y) / scale_y).abs().round() as usize; + + // Get GDAL driver by name + let driver = DriverManager::get_driver_by_name(driver_name).map_err(|e| { + ArrowError::ParseError(format!("Failed to get {} driver: {e}", driver_name)) + })?; + + // Create dataset based on data type + let mut dataset = match data_type { + BandDataType::UInt8 => { + driver.create_with_band_type::<u8, _>(filepath, total_width, total_height, band_count) + } + BandDataType::UInt16 => { + driver.create_with_band_type::<u16, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Int16 => { + driver.create_with_band_type::<i16, _>(filepath, total_width, total_height, band_count) + } + BandDataType::UInt32 => { + driver.create_with_band_type::<u32, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Int32 => { + driver.create_with_band_type::<i32, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Float32 => { + driver.create_with_band_type::<f32, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Float64 => { + driver.create_with_band_type::<f64, _>(filepath, total_width, total_height, band_count) + } + } + .map_err(|e| ArrowError::ParseError(format!("Failed to create GeoTIFF: {e}")))?; + + // Set geotransform + dataset + .set_geo_transform(&[ + min_x, + scale_x, + first_metadata.skew_x(), + min_y, + first_metadata.skew_y(), + scale_y, + ]) + .map_err(|e| ArrowError::ParseError(format!("Failed to set geotransform: {e}")))?; + + // Set tile size metadata if tiles are being used + let tile_width = first_metadata.width(); + let tile_height = first_metadata.height(); + dataset + .set_metadata_item("TILEWIDTH", &tile_width.to_string(), "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEWIDTH metadata: {e}")))?; + dataset + .set_metadata_item("TILEHEIGHT", &tile_height.to_string(), "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEHEIGHT metadata: {e}")))?; + + // Write each tile to the dataset + for i in 0..raster_struct_array.len() { + let raster = raster_struct_array.get(i)?; + let raster_metadata = raster.metadata(); + + // Calculate pixel offset for this tile + let x_offset = ((raster_metadata.upper_left_x() - min_x) / scale_x).round() as isize; + let y_offset = ((raster_metadata.upper_left_y() - min_y) / scale_y).round() as isize; + + let tile_width = raster_metadata.width() as usize; + let tile_height = raster_metadata.height() as usize; + + // Write each band + for band_index in 0..band_count { + let band = raster.bands().band(band_index + 1)?; + + // Check that storage type is InDb + if band.metadata().storage_type() != StorageType::InDb { + return Err(ArrowError::NotYetImplemented( + format!("Writing rasters with storage type {:?} is not yet implemented. Only InDb storage is currently supported.", + band.metadata().storage_type()) + )); + } + + let mut gdal_band = dataset.rasterband(band_index + 1).map_err(|e| { + ArrowError::ParseError(format!("Failed to get GDAL band {}: {e}", band_index + 1)) + })?; + + let band_data = band.data(); + let band_datatype = band.metadata().data_type(); + + // Write the band data to the appropriate location in the dataset + // Convert the byte slice to the appropriate type for GDAL + write_band_data( + &mut gdal_band, + (x_offset, y_offset), + (tile_width, tile_height), + band_data, + band_datatype, + )?; + + // Set nodata value if present + // Note: Some drivers (e.g., Zarr) don't support nodata values + if let Some(nodata_bytes) = band.metadata().nodata_value() { + if let Some(nodata_f64) = bytes_to_f64(nodata_bytes, band.metadata().data_type()) { + let _ = gdal_band.set_no_data_value(Some(nodata_f64)); + } + } + } + } + + // Flush the dataset + dataset.flush_cache().map_err(|e| { + ArrowError::ParseError(format!("Failed to flush cache when writing GeoTIFF: {e}")) + })?; + + Ok(()) +} + +/// Helper function to write band data to GDAL, converting from bytes to the appropriate type +fn write_band_data( + gdal_band: &mut RasterBand, + window_origin: (isize, isize), + window_size: (usize, usize), + band_data: &[u8], + data_type: BandDataType, +) -> Result<(), ArrowError> { + use gdal::raster::Buffer; + + let (width, height) = window_size; + let pixel_count = width * height; + + match data_type { + BandDataType::UInt8 => { + let data: Vec<u8> = band_data.to_vec(); + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::UInt16 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(2) { + data.push(u16::from_ne_bytes([chunk[0], chunk[1]])); Review Comment: The byte-to-type conversion pattern is duplicated across multiple data types (lines 456, 464, 472, 480, 488, 496). Consider extracting this into a helper function to reduce duplication and improve maintainability. ########## rust/sedona-gdal/src/dataset.rs: ########## @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::ArrowError; +use gdal::raster::GdalDataType; +use gdal::Dataset; +use gdal::Metadata; +use sedona_schema::raster::BandDataType; + +/// Extract geotransform components from a GDAL dataset +/// Returns (upper_left_x, upper_left_y, scale_x, scale_y, skew_x, skew_y) +pub fn geotransform_components( + dataset: &Dataset, +) -> Result<(f64, f64, f64, f64, f64, f64), ArrowError> { + let geotransform = dataset + .geo_transform() + .map_err(|e| ArrowError::ParseError(format!("Failed to get geotransform: {e}")))?; + Ok(( + geotransform[0], // Upper-left X coordinate + geotransform[3], // Upper-left Y coordinate + geotransform[1], // scale_x (pixel width) + geotransform[5], // scale_y (pixel height, usually negative) + geotransform[2], // skew_x (X-direction skew) + geotransform[4], // skew_y (Y-direction skew) + )) +} + +/// Extract tile size from a GDAL dataset +/// If not provided, defaults to raster size. In future, will consider +/// defaulting to an ideal tile size instead of full raster size once we know +/// what the idea tile size should be. +pub fn tile_size(dataset: &Dataset) -> Result<(usize, usize), ArrowError> { + let raster_width = dataset.raster_size().0; + let raster_height = dataset.raster_size().1; + + let tile_width = match dataset.metadata_item("TILEWIDTH", "") { + Some(val) => val.parse::<usize>().unwrap_or(raster_width), + None => raster_width, + }; + let tile_height = match dataset.metadata_item("TILEHEIGHT", "") { + Some(val) => val.parse::<usize>().unwrap_or(raster_height), Review Comment: Silent fallback to raster_width when parsing fails could hide configuration errors. Consider logging a warning when parse fails, or returning an error to make misconfiguration more visible. ```suggestion Some(val) => val.parse::<usize>().map_err(|e| { ArrowError::ParseError(format!( "Failed to parse TILEWIDTH ('{}'): {}", val, e )) })?, None => raster_width, }; let tile_height = match dataset.metadata_item("TILEHEIGHT", "") { Some(val) => val.parse::<usize>().map_err(|e| { ArrowError::ParseError(format!( "Failed to parse TILEHEIGHT ('{}'): {}", val, e )) })?, ``` ########## rust/sedona-gdal/src/raster_io.rs: ########## @@ -0,0 +1,679 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::dataset::{geotransform_components, tile_size, to_banddatatype}; +use arrow::array::StructArray; +use arrow::error::ArrowError; +use gdal::raster::RasterBand; +use gdal::Dataset; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_raster::utils::{bytes_per_pixel, f64_to_bandtype_bytes}; +use sedona_schema::raster::{BandDataType, StorageType}; +use std::sync::Arc; + +// ============================================================================ +// Format-specific convenience wrappers +// ============================================================================ + +/// Reads a GeoTIFF file using GDAL and converts it into a StructArray of rasters. +/// +/// This is a convenience wrapper around [`read_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `filepath` - Path to the GeoTIFF file +/// * `tile_size_opt` - Optional tile size to override dataset metadata +pub fn read_geotiff( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result<Arc<StructArray>, ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + read_raster(filepath, tile_size_opt) +} + +/// Writes a tiled raster StructArray to a GeoTIFF file using GDAL. +/// +/// This is a convenience wrapper around [`write_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output GeoTIFF file +pub fn write_geotiff(raster_array: &StructArray, filepath: &str) -> Result<(), ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + write_raster(raster_array, filepath, "GTiff") +} + +/// Reads a raster file using GDAL and converts it into a StructArray of rasters. +/// +/// Currently only supports reading rasters into InDb storage type. +/// OutDb storage types are not yet implemented. +/// +/// # Arguments +/// * `filepath` - Path to the raster file +/// * `tile_size_opt` - Optional tile size to override dataset metadata (uses dataset metadata if None) +pub fn read_raster( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result<Arc<StructArray>, ArrowError> { + let dataset = Dataset::open(filepath).map_err(|err| ArrowError::ParseError(err.to_string()))?; + + // Extract geotransform components + let (origin_upperleft_x, origin_upperleft_y, scale_x, scale_y, skew_x, skew_y) = + geotransform_components(&dataset)?; + + let (raster_width, raster_height) = dataset.raster_size(); + + // Use provided tile dimensions or fall back to dataset metadata + let (tile_width, tile_height) = match tile_size_opt { + Some((w, h)) => (w, h), + _ => tile_size(&dataset)?, + }; + + let x_tile_count = raster_width.div_ceil(tile_width); + let y_tile_count = raster_height.div_ceil(tile_height); + + let mut raster_builder = RasterBuilder::new(x_tile_count * y_tile_count); + let band_count = dataset.raster_count(); + + for tile_y in 0..y_tile_count { + for tile_x in 0..x_tile_count { + let x_offset = tile_x * tile_width; + let y_offset = tile_y * tile_height; + + // Calculate geographic coordinates for this tile + // using the geotransform from the original raster + let tile_upperleft_x = + origin_upperleft_x + (x_offset as f64) * scale_x + (y_offset as f64) * skew_x; + let tile_upperleft_y = + origin_upperleft_y + (x_offset as f64) * skew_y + (y_offset as f64) * scale_y; + + // Create raster metadata for this tile with actual geotransform values + let tile_metadata = RasterMetadata { + width: tile_width as u64, + height: tile_height as u64, + upperleft_x: tile_upperleft_x, + upperleft_y: tile_upperleft_y, + scale_x, + scale_y, + skew_x, + skew_y, + }; + + raster_builder.start_raster(&tile_metadata, None)?; + + for band_number in 1..=band_count { + let band: RasterBand = dataset.rasterband(band_number).unwrap(); + + // The band size is always the full raster size, not the tile size + // We read a specific window from the band + let actual_tile_width = tile_width.min(raster_width - x_offset); + let actual_tile_height = tile_height.min(raster_height - y_offset); + + let data_type = to_banddatatype(band.band_type())?; + let nodata_value = band + .no_data_value() + .map(|val| f64_to_bandtype_bytes(val, data_type.clone())); Review Comment: The `data_type` enum is cloned multiple times unnecessarily (lines 143, 148, 157). Since `BandDataType` is likely small and Copy-able, consider deriving Copy for it, or reuse the reference without cloning. ########## rust/sedona-gdal/src/raster_io.rs: ########## @@ -0,0 +1,679 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::dataset::{geotransform_components, tile_size, to_banddatatype}; +use arrow::array::StructArray; +use arrow::error::ArrowError; +use gdal::raster::RasterBand; +use gdal::Dataset; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_raster::utils::{bytes_per_pixel, f64_to_bandtype_bytes}; +use sedona_schema::raster::{BandDataType, StorageType}; +use std::sync::Arc; + +// ============================================================================ +// Format-specific convenience wrappers +// ============================================================================ + +/// Reads a GeoTIFF file using GDAL and converts it into a StructArray of rasters. +/// +/// This is a convenience wrapper around [`read_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `filepath` - Path to the GeoTIFF file +/// * `tile_size_opt` - Optional tile size to override dataset metadata +pub fn read_geotiff( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result<Arc<StructArray>, ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + read_raster(filepath, tile_size_opt) +} + +/// Writes a tiled raster StructArray to a GeoTIFF file using GDAL. +/// +/// This is a convenience wrapper around [`write_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output GeoTIFF file +pub fn write_geotiff(raster_array: &StructArray, filepath: &str) -> Result<(), ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + write_raster(raster_array, filepath, "GTiff") +} + +/// Reads a raster file using GDAL and converts it into a StructArray of rasters. +/// +/// Currently only supports reading rasters into InDb storage type. +/// OutDb storage types are not yet implemented. +/// +/// # Arguments +/// * `filepath` - Path to the raster file +/// * `tile_size_opt` - Optional tile size to override dataset metadata (uses dataset metadata if None) +pub fn read_raster( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result<Arc<StructArray>, ArrowError> { + let dataset = Dataset::open(filepath).map_err(|err| ArrowError::ParseError(err.to_string()))?; + + // Extract geotransform components + let (origin_upperleft_x, origin_upperleft_y, scale_x, scale_y, skew_x, skew_y) = + geotransform_components(&dataset)?; + + let (raster_width, raster_height) = dataset.raster_size(); + + // Use provided tile dimensions or fall back to dataset metadata + let (tile_width, tile_height) = match tile_size_opt { + Some((w, h)) => (w, h), + _ => tile_size(&dataset)?, + }; + + let x_tile_count = raster_width.div_ceil(tile_width); + let y_tile_count = raster_height.div_ceil(tile_height); + + let mut raster_builder = RasterBuilder::new(x_tile_count * y_tile_count); + let band_count = dataset.raster_count(); + + for tile_y in 0..y_tile_count { + for tile_x in 0..x_tile_count { + let x_offset = tile_x * tile_width; + let y_offset = tile_y * tile_height; + + // Calculate geographic coordinates for this tile + // using the geotransform from the original raster + let tile_upperleft_x = + origin_upperleft_x + (x_offset as f64) * scale_x + (y_offset as f64) * skew_x; + let tile_upperleft_y = + origin_upperleft_y + (x_offset as f64) * skew_y + (y_offset as f64) * scale_y; + + // Create raster metadata for this tile with actual geotransform values + let tile_metadata = RasterMetadata { + width: tile_width as u64, + height: tile_height as u64, + upperleft_x: tile_upperleft_x, + upperleft_y: tile_upperleft_y, + scale_x, + scale_y, + skew_x, + skew_y, + }; + + raster_builder.start_raster(&tile_metadata, None)?; + + for band_number in 1..=band_count { + let band: RasterBand = dataset.rasterband(band_number).unwrap(); + + // The band size is always the full raster size, not the tile size + // We read a specific window from the band + let actual_tile_width = tile_width.min(raster_width - x_offset); + let actual_tile_height = tile_height.min(raster_height - y_offset); + + let data_type = to_banddatatype(band.band_type())?; + let nodata_value = band + .no_data_value() + .map(|val| f64_to_bandtype_bytes(val, data_type.clone())); + + let band_metadata = BandMetadata { + nodata_value, + storage_type: StorageType::InDb, + datatype: data_type.clone(), + outdb_url: None, + outdb_band_id: None, + }; + + raster_builder.start_band(band_metadata)?; + + // Reserve capacity and write band data directly from GDAL + let pixel_count = actual_tile_width * actual_tile_height; + let bytes_needed = pixel_count * bytes_per_pixel(data_type.clone()); + + // Pre-allocate a vector of the exact size needed + let mut band_data = vec![0u8; bytes_needed]; + + // Read directly into the pre-allocated buffer + read_band_data_into( + &band, + (x_offset as isize, y_offset as isize), + (actual_tile_width, actual_tile_height), + &data_type, + &mut band_data, + )?; + + // Write the band data + raster_builder.band_data_writer().append_value(&band_data); + + // Finalize the band + raster_builder.finish_band()?; + } + + // Finalize the raster + raster_builder.finish_raster()?; + } + } + + // Finalize the raster struct array + let raster_struct = raster_builder.finish()?; + Ok(Arc::new(raster_struct)) +} + +/// Helper function to read band data from GDAL directly into a pre-allocated byte buffer +/// Casts the buffer to the appropriate type and reads directly onto `output` +fn read_band_data_into( + band: &RasterBand, + window_origin: (isize, isize), + window_size: (usize, usize), + data_type: &BandDataType, + output: &mut [u8], +) -> Result<(), ArrowError> { + let pixel_count = window_size.0 * window_size.1; + + match data_type { + BandDataType::UInt8 => { + band.read_into_slice(window_origin, window_size, window_size, output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::UInt16 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u16, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Int16 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut i16, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::UInt32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Int32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut i32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Float32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut f32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Float64 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut f64, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + } +} + +/// Write a tiled raster StructArray to a raster file using GDAL. +/// +/// This is a generic function that works with any GDAL-supported raster format. +/// Currently only supports writing rasters with InDb storage type. +/// OutDb storage types will return a NotYetImplemented error. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output file +/// * `driver_name` - GDAL driver name (e.g., "GTiff", "Zarr") +fn write_raster( + raster_array: &StructArray, + filepath: &str, + driver_name: &str, +) -> Result<(), ArrowError> { + use gdal::{DriverManager, Metadata}; + use sedona_raster::array::RasterStructArray; + use sedona_raster::traits::RasterRef; + + let raster_struct_array = RasterStructArray::new(raster_array); + + if raster_struct_array.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Cannot write empty raster array".to_string(), + )); + } + + // Get the first raster to determine dimensions and band count + let first_raster = raster_struct_array.get(0)?; + let first_metadata = first_raster.metadata(); + let band_count = first_raster.bands().len(); + + if band_count == 0 { + return Err(ArrowError::InvalidArgumentError( + "Raster has no bands".to_string(), + )); + } + + let first_band = first_raster.bands().band(1)?; + let data_type = first_band.metadata().data_type(); + + // Calculate total raster dimensions by analyzing tile positions + let mut max_x: f64 = 0.0; + let mut max_y: f64 = 0.0; + let mut min_x: f64 = f64::MAX; + let mut min_y: f64 = f64::MAX; + + for i in 0..raster_struct_array.len() { + let raster = raster_struct_array.get(i)?; + let metadata = raster.metadata(); + + let tile_max_x = metadata.upper_left_x() + (metadata.width() as f64) * metadata.scale_x(); + let tile_max_y = metadata.upper_left_y() + (metadata.height() as f64) * metadata.scale_y(); + + max_x = max_x.max(tile_max_x); + max_y = max_y.max(tile_max_y); + min_x = min_x.min(metadata.upper_left_x()); + min_y = min_y.min(metadata.upper_left_y()); + } + + // Calculate total raster dimensions + let scale_x = first_metadata.scale_x(); + let scale_y = first_metadata.scale_y(); + let total_width = ((max_x - min_x) / scale_x).abs().round() as usize; + let total_height = ((max_y - min_y) / scale_y).abs().round() as usize; + + // Get GDAL driver by name + let driver = DriverManager::get_driver_by_name(driver_name).map_err(|e| { + ArrowError::ParseError(format!("Failed to get {} driver: {e}", driver_name)) + })?; + + // Create dataset based on data type + let mut dataset = match data_type { + BandDataType::UInt8 => { + driver.create_with_band_type::<u8, _>(filepath, total_width, total_height, band_count) + } + BandDataType::UInt16 => { + driver.create_with_band_type::<u16, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Int16 => { + driver.create_with_band_type::<i16, _>(filepath, total_width, total_height, band_count) + } + BandDataType::UInt32 => { + driver.create_with_band_type::<u32, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Int32 => { + driver.create_with_band_type::<i32, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Float32 => { + driver.create_with_band_type::<f32, _>(filepath, total_width, total_height, band_count) + } + BandDataType::Float64 => { + driver.create_with_band_type::<f64, _>(filepath, total_width, total_height, band_count) + } + } + .map_err(|e| ArrowError::ParseError(format!("Failed to create GeoTIFF: {e}")))?; + + // Set geotransform + dataset + .set_geo_transform(&[ + min_x, + scale_x, + first_metadata.skew_x(), + min_y, + first_metadata.skew_y(), + scale_y, + ]) + .map_err(|e| ArrowError::ParseError(format!("Failed to set geotransform: {e}")))?; + + // Set tile size metadata if tiles are being used + let tile_width = first_metadata.width(); + let tile_height = first_metadata.height(); + dataset + .set_metadata_item("TILEWIDTH", &tile_width.to_string(), "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEWIDTH metadata: {e}")))?; + dataset + .set_metadata_item("TILEHEIGHT", &tile_height.to_string(), "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEHEIGHT metadata: {e}")))?; + + // Write each tile to the dataset + for i in 0..raster_struct_array.len() { + let raster = raster_struct_array.get(i)?; + let raster_metadata = raster.metadata(); + + // Calculate pixel offset for this tile + let x_offset = ((raster_metadata.upper_left_x() - min_x) / scale_x).round() as isize; + let y_offset = ((raster_metadata.upper_left_y() - min_y) / scale_y).round() as isize; + + let tile_width = raster_metadata.width() as usize; + let tile_height = raster_metadata.height() as usize; + + // Write each band + for band_index in 0..band_count { + let band = raster.bands().band(band_index + 1)?; + + // Check that storage type is InDb + if band.metadata().storage_type() != StorageType::InDb { + return Err(ArrowError::NotYetImplemented( + format!("Writing rasters with storage type {:?} is not yet implemented. Only InDb storage is currently supported.", + band.metadata().storage_type()) + )); + } + + let mut gdal_band = dataset.rasterband(band_index + 1).map_err(|e| { + ArrowError::ParseError(format!("Failed to get GDAL band {}: {e}", band_index + 1)) + })?; + + let band_data = band.data(); + let band_datatype = band.metadata().data_type(); + + // Write the band data to the appropriate location in the dataset + // Convert the byte slice to the appropriate type for GDAL + write_band_data( + &mut gdal_band, + (x_offset, y_offset), + (tile_width, tile_height), + band_data, + band_datatype, + )?; + + // Set nodata value if present + // Note: Some drivers (e.g., Zarr) don't support nodata values + if let Some(nodata_bytes) = band.metadata().nodata_value() { + if let Some(nodata_f64) = bytes_to_f64(nodata_bytes, band.metadata().data_type()) { + let _ = gdal_band.set_no_data_value(Some(nodata_f64)); + } + } + } + } + + // Flush the dataset + dataset.flush_cache().map_err(|e| { + ArrowError::ParseError(format!("Failed to flush cache when writing GeoTIFF: {e}")) + })?; + + Ok(()) +} + +/// Helper function to write band data to GDAL, converting from bytes to the appropriate type +fn write_band_data( + gdal_band: &mut RasterBand, + window_origin: (isize, isize), + window_size: (usize, usize), + band_data: &[u8], + data_type: BandDataType, +) -> Result<(), ArrowError> { + use gdal::raster::Buffer; + + let (width, height) = window_size; + let pixel_count = width * height; + + match data_type { + BandDataType::UInt8 => { + let data: Vec<u8> = band_data.to_vec(); + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::UInt16 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(2) { + data.push(u16::from_ne_bytes([chunk[0], chunk[1]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Int16 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(2) { + data.push(i16::from_ne_bytes([chunk[0], chunk[1]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::UInt32 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(4) { + data.push(u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Int32 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(4) { + data.push(i32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Float32 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(4) { + data.push(f32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Float64 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(8) { + data.push(f64::from_ne_bytes([ + chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7], + ])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + } + .map_err(|e| ArrowError::ParseError(format!("Failed to write band data: {e}"))) +} + +/// Convert nodata value bytes back to f64 +fn bytes_to_f64(bytes: &[u8], data_type: BandDataType) -> Option<f64> { + let expected_bytes = bytes_per_pixel(data_type.clone()); + if bytes.len() < expected_bytes { + return None; + } + match data_type { + BandDataType::UInt8 => Some(bytes[0] as f64), + BandDataType::UInt16 => Some(u16::from_ne_bytes([bytes[0], bytes[1]]) as f64), Review Comment: The bytes-to-type conversion pattern in `bytes_to_f64` duplicates logic from `write_band_data`. Consider extracting a common conversion function to reduce duplication. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
