diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 7b181179d3d..934a71aa67a 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result VariantToShreddedObjectVariantRowBuilder<'a> { let mut builder = StructArrayBuilder::new(); for (field_name, typed_value_builder) in self.typed_value_builders { let (value, typed_value, nulls) = typed_value_builder.finish()?; - let array = - ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); + let array = ShreddedVariantFieldArray::from_parts( + Some(Arc::new(value) as ArrayRef), + Some(typed_value), + nulls, + ); builder = builder.with_field(field_name, ArrayRef::from(array), false); } if let Some(nulls) = self.typed_value_nulls.finish() { @@ -689,6 +692,7 @@ impl VariantSchemaNode { mod tests { use super::*; use crate::VariantArrayBuilder; + use crate::variant_array::binary_array_value; use arrow::array::{ Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray, GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray, @@ -867,7 +871,8 @@ mod tests { ) { assert_eq!(array.len(), expected_len); - let fallbacks = (array.value_field().unwrap(), Some(array.metadata_field())); + let fallback_value = array.value_field().unwrap(); + let fallback_metadata = array.metadata_field(); let array = downcast_list_like_array::(array); assert_eq!( @@ -887,7 +892,7 @@ mod tests { ); assert_eq!( array.len(), - fallbacks.0.len(), + fallback_value.len(), "fallbacks value field should match array length" ); @@ -902,7 +907,7 @@ mod tests { // Successfully shredded: typed list value present, no fallback value assert!(array.is_valid(idx)); assert_eq!(array.value_size(idx), *len); - assert!(fallbacks.0.is_null(idx)); + assert!(fallback_value.is_null(idx)); } None => { // Unable to shred: typed list value absent, fallback should carry the variant @@ -910,20 +915,25 @@ mod tests { assert_eq!(array.value_size(idx), O::zero()); match expected_fallback { Some(expected_variant) => { - assert!(fallbacks.0.is_valid(idx)); - let metadata_bytes = fallbacks - .1 - .filter(|m| m.is_valid(idx)) - .map(|m| m.value(idx)) - .filter(|bytes| !bytes.is_empty()) - .unwrap_or(EMPTY_VARIANT_METADATA_BYTES); + assert!(fallback_value.is_valid(idx)); + let metadata_bytes = + binary_array_value(fallback_metadata.as_ref(), idx).unwrap(); + let metadata_bytes = + if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() { + metadata_bytes + } else { + EMPTY_VARIANT_METADATA_BYTES + }; assert_eq!( - Variant::new(metadata_bytes, fallbacks.0.value(idx)), + Variant::new( + metadata_bytes, + binary_array_value(fallback_value.as_ref(), idx).unwrap() + ), expected_variant.clone() ); } None => { - assert!(fallbacks.0.is_null(idx)); + assert!(fallback_value.is_null(idx)); } } } @@ -983,7 +993,10 @@ mod tests { Some(expected_variant) => { assert!(element_fallbacks.is_valid(idx)); assert_eq!( - Variant::new(EMPTY_VARIANT_METADATA_BYTES, element_fallbacks.value(idx)), + Variant::new( + EMPTY_VARIANT_METADATA_BYTES, + binary_array_value(element_fallbacks.as_ref(), idx).unwrap() + ), expected_variant.clone() ); } @@ -1129,7 +1142,7 @@ mod tests { #[test] fn test_all_null_input() { // Create VariantArray with no value field (all null case) - let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata + let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])) as ArrayRef; // minimal valid metadata let all_null_array = VariantArray::from_parts(metadata, None, None, None); let result = shred_variant(&all_null_array, &DataType::Int64).unwrap(); @@ -1243,7 +1256,10 @@ mod tests { assert!(!value_field.is_null(1)); // value should contain original assert!(typed_value_field.is_null(1)); // typed_value should be null assert_eq!( - Variant::new(metadata_field.value(1), value_field.value(1)), + Variant::new( + binary_array_value(metadata_field.as_ref(), 1).unwrap(), + binary_array_value(value_field.as_ref(), 1).unwrap() + ), Variant::from("hello") ); @@ -1259,7 +1275,10 @@ mod tests { assert!(!result.is_null(4)); assert!(!value_field.is_null(4)); // should contain Variant::Null assert_eq!( - Variant::new(metadata_field.value(4), value_field.value(4)), + Variant::new( + binary_array_value(metadata_field.as_ref(), 4).unwrap(), + binary_array_value(value_field.as_ref(), 4).unwrap() + ), Variant::Null ); assert!(typed_value_field.is_null(4)); @@ -1336,7 +1355,10 @@ mod tests { assert!(value.is_valid(1)); assert!(typed_value.is_null(1)); assert_eq!( - Variant::new(metadata.value(1), value.value(1)), + Variant::new( + binary_array_value(metadata.as_ref(), 1).unwrap(), + binary_array_value(value.as_ref(), 1).unwrap() + ), Variant::from(42i64) ); @@ -1350,7 +1372,10 @@ mod tests { assert!(value.is_valid(3)); assert!(typed_value.is_null(3)); assert_eq!( - Variant::new(metadata.value(3), value.value(3)), + Variant::new( + binary_array_value(metadata.as_ref(), 3).unwrap(), + binary_array_value(value.as_ref(), 3).unwrap() + ), Variant::Null ); @@ -1392,7 +1417,10 @@ mod tests { assert!(value.is_valid(1)); assert!(typed_value.is_null(1)); assert_eq!( - Variant::new(metadata.value(1), value.value(1)), + Variant::new( + binary_array_value(metadata.as_ref(), 1).unwrap(), + binary_array_value(value.as_ref(), 1).unwrap() + ), Variant::from("not_binary") ); @@ -1406,7 +1434,10 @@ mod tests { assert!(value.is_valid(3)); assert!(typed_value.is_null(3)); assert_eq!( - Variant::new(metadata.value(3), value.value(3)), + Variant::new( + binary_array_value(metadata.as_ref(), 3).unwrap(), + binary_array_value(value.as_ref(), 3).unwrap() + ), Variant::Null ); @@ -1682,14 +1713,14 @@ mod tests { .unwrap(); let outer_fallbacks = outer_elements.value_field().unwrap(); - let outer_metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n( + let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, outer_elements.len(), - )); + ))) as ArrayRef; let outer_variant = VariantArray::from_parts( outer_metadata, Some(outer_fallbacks.clone()), - Some(Arc::new(outer_values.clone())), + Some(Arc::new(outer_values.clone()) as ArrayRef), None, ); @@ -1792,7 +1823,10 @@ mod tests { // null is stored as Variant::Null in values assert!(id_values.is_valid(1)); assert_eq!( - Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)), + Variant::new( + EMPTY_VARIANT_METADATA_BYTES, + binary_array_value(id_values.as_ref(), 1).unwrap() + ), Variant::Null ); assert!(id_typed_values.is_null(1)); @@ -1866,7 +1900,6 @@ mod tests { assert_eq!(result.len(), 9); let metadata = result.metadata_field(); - let value = result.value_field().unwrap(); let typed_value = result .typed_value_field() @@ -1882,24 +1915,14 @@ mod tests { let age_field = ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap(); - let score_value = score_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let score_value = score_field.value_field().unwrap(); let score_typed_value = score_field .typed_value_field() .unwrap() .as_any() .downcast_ref::() .unwrap(); - let age_value = age_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let age_value = age_field.value_field().unwrap(); let age_typed_value = age_field .typed_value_field() .unwrap() @@ -1918,10 +1941,13 @@ mod tests { } fn get_value<'m, 'v>( i: usize, - metadata: &'m BinaryViewArray, - value: &'v BinaryViewArray, + metadata: &'m dyn Array, + value: &'v dyn Array, ) -> Variant<'m, 'v> { - Variant::new(metadata.value(i), value.value(i)) + Variant::new( + binary_array_value(metadata, i).unwrap(), + binary_array_value(value, i).unwrap(), + ) } let expect = |i, expected_result: Option>| { match expected_result { @@ -1933,7 +1959,10 @@ mod tests { match expected_value { Some(expected_value) => { assert!(value.is_valid(i)); - assert_eq!(expected_value, get_value(i, metadata, value)); + assert_eq!( + expected_value, + get_value(i, metadata.as_ref(), value.as_ref()) + ); } None => { assert!(value.is_null(i)); @@ -1952,7 +1981,7 @@ mod tests { assert!(score_value.is_valid(i)); assert_eq!( expected_score_value, - get_value(i, metadata, score_value) + get_value(i, metadata.as_ref(), score_value.as_ref()) ); } None => { @@ -1973,7 +2002,7 @@ mod tests { assert!(age_value.is_valid(i)); assert_eq!( expected_age_value, - get_value(i, metadata, age_value) + get_value(i, metadata.as_ref(), age_value.as_ref()) ); } None => { @@ -2114,7 +2143,7 @@ mod tests { // Helper to correctly create a variant object using a row's existing metadata let object_with_foo_field = |i| { use parquet_variant::{ParentState, ValueBuilder, VariantMetadata}; - let metadata = VariantMetadata::new(metadata.value(i)); + let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i).unwrap()); let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata); let mut value_builder = ValueBuilder::new(); let state = ParentState::variant(&mut value_builder, &mut metadata_builder); @@ -2213,7 +2242,10 @@ mod tests { assert!(value_field.is_null(2)); assert!(value_field.is_valid(3)); assert_eq!( - Variant::new(result.metadata_field().value(3), value_field.value(3)), + Variant::new( + binary_array_value(result.metadata_field().as_ref(), 3).unwrap(), + binary_array_value(value_field.as_ref(), 3).unwrap() + ), Variant::from("not an object") ); assert!(value_field.is_null(4)); @@ -2231,10 +2263,10 @@ mod tests { .unwrap(); assert_list_structure_and_elements::( &VariantArray::from_parts( - BinaryViewArray::from_iter_values(std::iter::repeat_n( + Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, scores_field.len(), - )), + ))) as ArrayRef, Some(scores_field.value_field().unwrap().clone()), Some(scores_field.typed_value_field().unwrap().clone()), None, @@ -2350,24 +2382,14 @@ mod tests { ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap()) .unwrap(); - let id_value = id_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let id_value = id_field.value_field().unwrap(); let id_typed_value = id_field .typed_value_field() .unwrap() .as_any() .downcast_ref::() .unwrap(); - let session_id_value = session_id_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let session_id_value = session_id_field.value_field().unwrap(); let session_id_typed_value = session_id_field .typed_value_field() .unwrap() @@ -2404,7 +2426,10 @@ mod tests { assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes()); // Verify the value field contains the name field - let row_1_variant = Variant::new(metadata.value(1), value.value(1)); + let row_1_variant = Variant::new( + binary_array_value(metadata.as_ref(), 1).unwrap(), + binary_array_value(value.as_ref(), 1).unwrap(), + ); let Variant::Object(obj) = row_1_variant else { panic!("Expected object"); }; @@ -2436,7 +2461,10 @@ mod tests { assert!(session_id_value.is_valid(3)); // type mismatch, stored in value assert!(session_id_typed_value.is_null(3)); - let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3)); + let session_id_variant = Variant::new( + binary_array_value(metadata.as_ref(), 3).unwrap(), + binary_array_value(session_id_value.as_ref(), 3).unwrap(), + ); assert_eq!(session_id_variant, Variant::from("not-a-uuid")); // Row 4: Type mismatch - id is int64, not UUID @@ -2447,7 +2475,10 @@ mod tests { assert!(id_value.is_valid(4)); // type mismatch, stored in value assert!(id_typed_value.is_null(4)); - let id_variant = Variant::new(metadata.value(4), id_value.value(4)); + let id_variant = Variant::new( + binary_array_value(metadata.as_ref(), 4).unwrap(), + binary_array_value(id_value.as_ref(), 4).unwrap(), + ); assert_eq!(id_variant, Variant::from(12345i64)); assert!(session_id_value.is_null(4)); diff --git a/parquet-variant-compute/src/unshred_variant.rs b/parquet-variant-compute/src/unshred_variant.rs index 2df36fa63f0..ecffd48bc41 100644 --- a/parquet-variant-compute/src/unshred_variant.rs +++ b/parquet-variant-compute/src/unshred_variant.rs @@ -17,11 +17,13 @@ //! Module for unshredding VariantArray by folding typed_value columns back into the value column. +use crate::variant_array::binary_array_value; use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - Array, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, - FixedSizeListArray, GenericListArray, GenericListViewArray, LargeBinaryArray, LargeStringArray, - ListLikeArray, PrimitiveArray, StringArray, StringViewArray, StructArray, + Array, ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, + FixedSizeBinaryArray, FixedSizeListArray, GenericListArray, GenericListViewArray, + LargeBinaryArray, LargeStringArray, ListLikeArray, PrimitiveArray, StringArray, + StringViewArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::datatypes::{ @@ -38,6 +40,7 @@ use parquet_variant::{ VariantDecimal16, VariantDecimalType, VariantMetadata, }; use std::marker::PhantomData; +use std::sync::Arc; use uuid::Uuid; /// Removes all (nested) typed_value columns from a VariantArray by converting them back to binary @@ -73,7 +76,12 @@ pub fn unshred_variant(array: &VariantArray) -> Result { if array.is_null(i) { value_builder.append_null(); } else { - let metadata = VariantMetadata::new(metadata.value(i)); + let metadata_bytes = binary_array_value(metadata.as_ref(), i).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "metadata field must be a binary-like array".to_string(), + ) + })?; + let metadata = VariantMetadata::new(metadata_bytes); let mut value_builder = value_builder.builder_ext(&metadata); row_builder.append_row(&mut value_builder, &metadata, i)?; } @@ -82,7 +90,7 @@ pub fn unshred_variant(array: &VariantArray) -> Result { let value = value_builder.build()?; Ok(VariantArray::from_parts( metadata.clone(), - Some(value), + Some(Arc::new(value)), None, nulls.cloned(), )) @@ -308,11 +316,11 @@ impl<'a> NullUnshredVariantBuilder<'a> { /// Builder for arrays that only have value column (already unshredded) struct ValueOnlyUnshredVariantBuilder<'a> { - value: &'a arrow::array::BinaryViewArray, + value: &'a ArrayRef, } impl<'a> ValueOnlyUnshredVariantBuilder<'a> { - fn new(value: &'a BinaryViewArray) -> Self { + fn new(value: &'a ArrayRef) -> Self { Self { value } } @@ -325,7 +333,12 @@ impl<'a> ValueOnlyUnshredVariantBuilder<'a> { if self.value.is_null(index) { builder.append_null(); } else { - let variant = Variant::new_with_metadata(metadata.clone(), self.value.value(index)); + let value_bytes = binary_array_value(self.value.as_ref(), index).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "value field must be a binary-like array".to_string(), + ) + })?; + let variant = Variant::new_with_metadata(metadata.clone(), value_bytes); builder.append_value(variant); } Ok(()) @@ -347,7 +360,17 @@ trait AppendToVariantBuilder: Array { macro_rules! handle_unshredded_case { ($self:expr, $builder:expr, $metadata:expr, $index:expr, $partial_shredding:expr) => {{ let value = $self.value.as_ref().filter(|v| v.is_valid($index)); - let value = value.map(|v| Variant::new_with_metadata($metadata.clone(), v.value($index))); + let value = value + .map(|v| { + let bytes = binary_array_value(v.as_ref(), $index).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "value field must be a binary-like array, instead got {}", + v.data_type(), + )) + })?; + Result::Ok(Variant::new_with_metadata($metadata.clone(), bytes)) + }) + .transpose()?; // If typed_value is null, handle unshredded case and return early if $self.typed_value.is_null($index) { @@ -372,12 +395,12 @@ macro_rules! handle_unshredded_case { /// Generic unshred builder that works with any Array implementing AppendToVariantBuilder struct UnshredPrimitiveRowBuilder<'a, T> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a T, } impl<'a, T: AppendToVariantBuilder> UnshredPrimitiveRowBuilder<'a, T> { - fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a T) -> Self { + fn new(value: Option<&'a ArrayRef>, typed_value: &'a T) -> Self { Self { value, typed_value } } @@ -475,17 +498,13 @@ impl TimestampType for TimestampNanosecondType { /// Generic builder for timestamp types that handles timezone-aware conversion struct TimestampUnshredRowBuilder<'a, T: TimestampType> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a PrimitiveArray, has_timezone: bool, } impl<'a, T: TimestampType> TimestampUnshredRowBuilder<'a, T> { - fn new( - value: Option<&'a BinaryViewArray>, - typed_value: &'a dyn Array, - has_timezone: bool, - ) -> Self { + fn new(value: Option<&'a ArrayRef>, typed_value: &'a dyn Array, has_timezone: bool) -> Self { Self { value, typed_value: typed_value.as_primitive(), @@ -518,7 +537,7 @@ struct DecimalUnshredRowBuilder<'a, A: DecimalType, V> where V: VariantDecimalType, { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a PrimitiveArray, scale: i8, _phantom: PhantomData, @@ -528,7 +547,7 @@ impl<'a, A: DecimalType, V> DecimalUnshredRowBuilder<'a, A, V> where V: VariantDecimalType, { - fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a dyn Array, scale: i8) -> Self { + fn new(value: Option<&'a ArrayRef>, typed_value: &'a dyn Array, scale: i8) -> Self { Self { value, typed_value: typed_value.as_primitive(), @@ -554,13 +573,13 @@ where /// Builder for unshredding struct/object types with nested fields struct StructUnshredVariantBuilder<'a> { - value: Option<&'a arrow::array::BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a arrow::array::StructArray, field_unshredders: IndexMap<&'a str, Option>>, } impl<'a> StructUnshredVariantBuilder<'a> { - fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a StructArray) -> Result { + fn try_new(value: Option<&'a ArrayRef>, typed_value: &'a StructArray) -> Result { // Create unshredders for each field in constructor let mut field_unshredders = IndexMap::new(); for (field, field_array) in typed_value.fields().iter().zip(typed_value.columns()) { @@ -626,13 +645,13 @@ impl<'a> StructUnshredVariantBuilder<'a> { /// Builder for unshredding list/array types with recursive element processing struct ListUnshredVariantBuilder<'a, L: ListLikeArray> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a L, element_unshredder: Box>, } impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> { - fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a L) -> Result { + fn try_new(value: Option<&'a ArrayRef>, typed_value: &'a L) -> Result { // Create a recursive unshredder for the list elements // The element type comes from the values array of the list let element_values = typed_value.values(); @@ -684,16 +703,18 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> { mod tests { use crate::VariantArray; use arrow::array::{ - BinaryArray, BinaryViewArray, LargeBinaryArray, LargeStringArray, StringViewArray, + ArrayRef, BinaryArray, BinaryViewArray, LargeBinaryArray, LargeStringArray, StringViewArray, }; use parquet_variant::Variant; + use std::sync::Arc; #[test] fn test_unshred_utf8view_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); - let typed_value: arrow::array::ArrayRef = std::sync::Arc::new(StringViewArray::from(vec![ + let typed_value: ArrayRef = Arc::new(StringViewArray::from(vec![ Some("hello"), Some("middle"), Some("world"), @@ -712,14 +733,14 @@ mod tests { #[test] fn test_unshred_largeutf8_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); - let typed_value: arrow::array::ArrayRef = - std::sync::Arc::new(LargeStringArray::from(vec![ - Some("hello"), - Some("middle"), - Some("world"), - ])); + let typed_value: ArrayRef = Arc::new(LargeStringArray::from(vec![ + Some("hello"), + Some("middle"), + Some("world"), + ])); let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None); @@ -734,14 +755,14 @@ mod tests { #[test] fn test_unshred_binary_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); - let typed_value: arrow::array::ArrayRef = - std::sync::Arc::new(BinaryArray::from_iter_values(vec![ - &b"\x00\x01\x02"[..], - &b"\xff\xaa"[..], - &b"\xde\xad\xbe\xef"[..], - ])); + let typed_value: ArrayRef = Arc::new(BinaryArray::from_iter_values(vec![ + &b"\x00\x01\x02"[..], + &b"\xff\xaa"[..], + &b"\xde\xad\xbe\xef"[..], + ])); let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None); @@ -756,14 +777,14 @@ mod tests { #[test] fn test_unshred_largebinary_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); - - let typed_value: arrow::array::ArrayRef = - std::sync::Arc::new(LargeBinaryArray::from_iter_values(vec![ - &b"\x00\x01\x02"[..], - &b"\xff\xaa"[..], - &b"\xde\xad\xbe\xef"[..], - ])); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); + + let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from_iter_values(vec![ + &b"\x00\x01\x02"[..], + &b"\xff\xaa"[..], + &b"\xde\xad\xbe\xef"[..], + ])); let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None); diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 145de5edfb7..5c241232891 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -22,7 +22,7 @@ use crate::type_conversion::{ generic_conversion_single_value, generic_conversion_single_value_with_result, primitive_conversion_single_value, }; -use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; +use arrow::array::{Array, ArrayRef, AsArray, StructArray}; use arrow::buffer::NullBuffer; use arrow::compute::cast; use arrow::datatypes::{ @@ -41,6 +41,31 @@ use parquet_variant::{ use std::borrow::Cow; use std::sync::Arc; +/// Returns the raw bytes at the given index from a binary-like array, return `None` if the array isn't binary-like. +pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8]> { + match array.data_type() { + DataType::Binary => Some(array.as_binary::().value(index)), + DataType::LargeBinary => Some(array.as_binary::().value(index)), + DataType::BinaryView => Some(array.as_binary_view().value(index)), + _ => None, + } +} + +/// Validates that an array has a binary-like data type. +fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> { + if matches!( + array.data_type(), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView + ) { + Ok(()) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}", + array.data_type() + ))) + } +} + /// Arrow Variant [`ExtensionType`]. /// /// Represents the canonical Arrow Extension Type for storing variants. @@ -213,13 +238,13 @@ impl ExtensionType for VariantType { /// assert_eq!(variant_array.value(0), Variant::from("such wow")); /// ``` /// -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct VariantArray { /// Reference to the underlying StructArray inner: StructArray, - /// The metadata column of this variant - metadata: BinaryViewArray, + /// The metadata column of this variant (Binary, LargeBinary, or BinaryView) + metadata: ArrayRef, /// how is this variant array shredded? shredding_state: ShreddingState, @@ -252,11 +277,9 @@ impl VariantArray { /// Dictionary-Encoded, preferably (but not required) with an index type of /// int8. /// - /// Currently, only [`BinaryViewArray`] are supported. pub fn try_new(inner: &dyn Array) -> Result { - // Workaround lack of support for Binary - // https://github.com/apache/arrow-rs/issues/8387 - let inner = cast_to_binary_view_arrays(inner)?; + // Canonicalize shredded typed_value fields (e.g. decimal narrowing) + let inner = canonicalize_shredded_types(inner)?; let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( @@ -266,37 +289,31 @@ impl VariantArray { // Note the specification allows for any order so we must search by name - // Ensure the StructArray has a metadata field of BinaryView - let Some(metadata_field) = inner.column_by_name("metadata") else { + // Ensure the StructArray has a metadata field that is a binary type + let Some(metadata_col) = inner.column_by_name("metadata") else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), )); }; - let Some(metadata) = metadata_field.as_binary_view_opt() else { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'metadata' field must be BinaryView, got {}", - metadata_field.data_type() - ))); - }; + validate_binary_array(metadata_col.as_ref(), "metadata")?; // Note these clones are cheap, they just bump the ref count Ok(Self { inner: inner.clone(), - metadata: metadata.clone(), + metadata: metadata_col.clone(), shredding_state: ShreddingState::try_from(inner)?, }) } pub(crate) fn from_parts( - metadata: BinaryViewArray, - value: Option, + metadata: ArrayRef, + value: Option, typed_value: Option, nulls: Option, ) -> Self { - let mut builder = - StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false); + let mut builder = StructArrayBuilder::new().with_field("metadata", metadata.clone(), false); if let Some(value) = value.clone() { - builder = builder.with_field("value", Arc::new(value), true); + builder = builder.with_field("value", value, true); } if let Some(typed_value) = typed_value.clone() { builder = builder.with_field("typed_value", typed_value, true); @@ -375,7 +392,20 @@ impl VariantArray { } // Otherwise fall back to value, if available (_, Some(value)) if value.is_valid(index) => { - Ok(Variant::new(self.metadata.value(index), value.value(index))) + let metadata = + binary_array_value(self.metadata.as_ref(), index).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "metadata field must be a binary-like array, instead got {}", + self.metadata.data_type(), + )) + })?; + let value = binary_array_value(value.as_ref(), index).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "value field must be a binary-like array, instead got {}", + value.data_type(), + )) + })?; + Ok(Variant::new(metadata, value)) } // It is technically invalid for neither value nor typed_value fields to be available, // but the spec specifically requires readers to return Variant::Null in this case. @@ -384,12 +414,12 @@ impl VariantArray { } /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &BinaryViewArray { + pub fn metadata_field(&self) -> &ArrayRef { &self.metadata } /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> Option<&BinaryViewArray> { + pub fn value_field(&self) -> Option<&ArrayRef> { self.shredding_state.value_field() } @@ -453,6 +483,12 @@ impl VariantArray { } } +impl PartialEq for VariantArray { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + } +} + impl From for StructArray { fn from(variant_array: VariantArray) -> Self { variant_array.into_inner() @@ -626,7 +662,6 @@ impl ShreddedVariantFieldArray { /// 2. An optional field named `typed_value` which can be any primitive type /// or be a list, large_list, list_view or struct /// - /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. pub fn try_new(inner: &dyn Array) -> Result { let Some(inner_struct) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( @@ -647,7 +682,7 @@ impl ShreddedVariantFieldArray { } /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> Option<&BinaryViewArray> { + pub fn value_field(&self) -> Option<&ArrayRef> { self.shredding_state.value_field() } @@ -662,13 +697,13 @@ impl ShreddedVariantFieldArray { } pub(crate) fn from_parts( - value: Option, + value: Option, typed_value: Option, nulls: Option, ) -> Self { let mut builder = StructArrayBuilder::new(); if let Some(value) = value.clone() { - builder = builder.with_field("value", Arc::new(value), true); + builder = builder.with_field("value", value, true); } if let Some(typed_value) = typed_value.clone() { builder = builder.with_field("typed_value", typed_value, true); @@ -766,9 +801,9 @@ impl From for StructArray { /// (partial shredding). /// /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct ShreddingState { - value: Option, + value: Option, typed_value: Option, } @@ -787,12 +822,12 @@ impl ShreddingState { /// let struct_array: StructArray = get_struct_array(); /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); /// ``` - pub fn new(value: Option, typed_value: Option) -> Self { + pub fn new(value: Option, typed_value: Option) -> Self { Self { value, typed_value } } /// Return a reference to the value field, if present - pub fn value_field(&self) -> Option<&BinaryViewArray> { + pub fn value_field(&self) -> Option<&ArrayRef> { self.value.as_ref() } @@ -822,7 +857,7 @@ impl ShreddingState { /// for avoiding clone operations when the caller does not need a self-standing shredding state. #[derive(Clone, Debug)] pub struct BorrowedShreddingState<'a> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>, } @@ -841,12 +876,12 @@ impl<'a> BorrowedShreddingState<'a> { /// let struct_array: StructArray = get_struct_array(); /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap(); /// ``` - pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self { + pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>) -> Self { Self { value, typed_value } } /// Return a reference to the value field, if present - pub fn value_field(&self) -> Option<&'a BinaryViewArray> { + pub fn value_field(&self) -> Option<&'a ArrayRef> { self.value } @@ -860,15 +895,10 @@ impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> { type Error = ArrowError; fn try_from(inner_struct: &'a StructArray) -> Result { - // The `value` column need not exist, but if it does it must be a binary view. + // The `value` column need not exist, but if it does it must be a binary type. let value = if let Some(value_col) = inner_struct.column_by_name("value") { - let Some(binary_view) = value_col.as_binary_view_opt() else { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - value_col.data_type() - ))); - }; - Some(binary_view) + validate_binary_array(value_col.as_ref(), "value")?; + Some(value_col) } else { None }; @@ -936,7 +966,7 @@ impl StructArrayBuilder { /// returns the non-null element at index as a Variant fn typed_value_to_variant<'a>( typed_value: &'a ArrayRef, - value: Option<&BinaryViewArray>, + value: Option<&'a ArrayRef>, index: usize, ) -> Result> { let data_type = typed_value.data_type(); @@ -957,6 +987,16 @@ fn typed_value_to_variant<'a>( let value = array.value(index); Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes } + DataType::Binary => { + let array = typed_value.as_binary::(); + let value = array.value(index); + Ok(Variant::from(value)) + } + DataType::LargeBinary => { + let array = typed_value.as_binary::(); + let value = array.value(index); + Ok(Variant::from(value)) + } DataType::BinaryView => { let array = typed_value.as_binary_view(); let value = array.value(index); @@ -1099,17 +1139,9 @@ fn typed_value_to_variant<'a>( } } -/// Workaround for lack of direct support for BinaryArray -/// -/// -/// The values are read as -/// * `StructArray` -/// -/// but VariantArray needs them as -/// * `StructArray` -/// -/// So cast them to get the right type. -fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { +/// Canonicalize shredded typed_value fields (e.g. decimal narrowing) and +/// verify that all data types in the struct are legal for a variant array. +fn canonicalize_shredded_types(array: &dyn Array) -> Result { let new_type = canonicalize_and_verify_data_type(array.data_type())?; if let Cow::Borrowed(_) = new_type { if let Some(array) = array.as_struct_opt() { @@ -1120,8 +1152,8 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { } /// Recursively visits a data type, ensuring that it only contains data types that can legally -/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView, -/// since that's what comes back from the parquet reader and what the variant code expects to find. +/// appear in a (possibly shredded) variant array. It also narrows decimal types to the smallest +/// valid precision (e.g. Decimal128 -> Decimal32 when the precision fits). fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result> { use DataType::*; @@ -1172,10 +1204,8 @@ fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result borrow!(), Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(), - // Binary and string are allowed. Force Binary/LargeBinary to BinaryView because that's what the parquet - // reader returns and what the rest of the variant code expects. - Binary | LargeBinary => Cow::Owned(BinaryView), - BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(), + // Binary, string, and their view counterparts are allowed. + Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(), // UUID maps to 16-byte fixed-size binary; no other width is allowed FixedSizeBinary(16) => borrow!(), @@ -1242,8 +1272,9 @@ mod test { use super::*; use arrow::array::{ - BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array, Int64Array, - LargeListArray, LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray, + BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array, + Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, ListArray, ListViewArray, + Time64MicrosecondArray, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow_schema::{Field, Fields}; @@ -1313,7 +1344,7 @@ mod test { let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32" + "Invalid argument error: VariantArray 'metadata' field must be Binary, LargeBinary, or BinaryView, got Int32" ); } @@ -1321,7 +1352,7 @@ mod test { fn invalid_value_field_type() { let fields = Fields::from(vec![ Field::new("metadata", DataType::BinaryView, true), - Field::new("value", DataType::Int32, true), // Not yet supported + Field::new("value", DataType::Int32, true), ]); let array = StructArray::new( fields, @@ -1331,7 +1362,7 @@ mod test { let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32" + "Invalid argument error: VariantArray 'value' field must be Binary, LargeBinary, or BinaryView, got Int32" ); } @@ -1445,27 +1476,28 @@ mod test { // use Parquet LIST encoding, but those fixtures do not cover Arrow-specific list container // variants (`LargeList`, `ListView`, `LargeListView`) accepted by `VariantArray::try_new`. let make_item_binary = || Arc::new(Field::new("item", DataType::Binary, true)); + let make_large_binary = || Arc::new(Field::new("item", DataType::LargeBinary, true)); let make_item_binary_view = || Arc::new(Field::new("item", DataType::BinaryView, true)); let cases = vec![ - ( - DataType::LargeList(make_item_binary()), - DataType::LargeList(make_item_binary_view()), - ), - ( - DataType::ListView(make_item_binary()), - DataType::ListView(make_item_binary_view()), - ), - ( - DataType::LargeListView(make_item_binary()), - DataType::LargeListView(make_item_binary_view()), - ), + // Binary item + DataType::LargeList(make_item_binary()), + DataType::ListView(make_item_binary()), + DataType::LargeListView(make_item_binary()), + // Large binary item + DataType::LargeList(make_large_binary()), + DataType::ListView(make_large_binary()), + DataType::LargeListView(make_large_binary()), + // Binary view item + DataType::LargeList(make_item_binary_view()), + DataType::ListView(make_item_binary_view()), + DataType::LargeListView(make_item_binary_view()), ]; - for (input, expected) in cases { + for input in cases { assert_eq!( canonicalize_and_verify_data_type(&input).unwrap().as_ref(), - &expected + &input ); } } @@ -1666,6 +1698,40 @@ mod test { } } + #[test] + fn binary_typed_value_roundtrips() { + // Verify that a shredded variant with Binary typed_value can be read back + let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([ + EMPTY_VARIANT_METADATA_BYTES, + ])); + let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![b"hello" as &[u8]])); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", metadata, false) + .with_field("typed_value", typed_value, true) + .build(); + + let variant_array = VariantArray::try_new(&struct_array).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(b"hello" as &[u8])); + } + + #[test] + fn large_binary_typed_value_roundtrips() { + // Verify that a shredded variant with LargeBinary typed_value can be read back + let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([ + EMPTY_VARIANT_METADATA_BYTES, + ])); + let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from(vec![b"world" as &[u8]])); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", metadata, false) + .with_field("typed_value", typed_value, true) + .build(); + + let variant_array = VariantArray::try_new(&struct_array).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(b"world" as &[u8])); + } + macro_rules! invalid_variant_array_test { ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => { #[test] diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 86ece001004..2ef96180c35 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -609,7 +609,7 @@ mod test { let array2 = VariantArray::from_parts( array.metadata_field().clone(), - Some(value_builder.build().unwrap()), + Some(Arc::new(value_builder.build().unwrap()) as ArrayRef), None, None, ); diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 29e28c850be..6a5c4623369 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, + array::{self, Array, ArrayRef, StructArray, make_array}, + buffer::NullBuffer, compute::CastOptions, datatypes::Field, error::Result, @@ -121,9 +122,9 @@ fn shredded_get_path( // Helper that creates a new VariantArray from the given nested value and typed_value columns, // properly accounting for accumulated nulls from path traversal let make_target_variant = - |value: Option, + |value: Option, typed_value: Option, - accumulated_nulls: Option| { + accumulated_nulls: Option| { let metadata = input.metadata_field().clone(); VariantArray::from_parts(metadata, value, typed_value, accumulated_nulls) }; @@ -168,10 +169,8 @@ fn shredded_get_path( ShreddedPathStep::Success(state) => { // Union nulls from the typed_value we just accessed if let Some(typed_value) = shredding_state.typed_value_field() { - accumulated_nulls = arrow::buffer::NullBuffer::union( - accumulated_nulls.as_ref(), - typed_value.nulls(), - ); + accumulated_nulls = + NullBuffer::union(accumulated_nulls.as_ref(), typed_value.nulls()); } shredding_state = state; path_index += 1; @@ -258,6 +257,7 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti return None; } let typed_value = variant_array.typed_value_field()?; + if typed_value.data_type() == as_field.data_type() && variant_array .value_field() @@ -268,9 +268,25 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti // 2. If every row in the `value` column is null // This is a perfect shredding, where the value is entirely shredded out, - // so we can just return the typed value. - return Some(typed_value.clone()); + // so we can just return the typed value after merging the accumulated nulls. + let parent_nulls = variant_array.nulls(); + + let target_array = if parent_nulls.is_none() { + typed_value.clone() + } else { + let merged_nulls = NullBuffer::union(parent_nulls, typed_value.nulls()); + let data = typed_value + .to_data() + .into_builder() + .nulls(merged_nulls) + .build() + .ok()?; + make_array(data) + }; + + return Some(target_array); } + None } @@ -1055,7 +1071,13 @@ mod test { EMPTY_VARIANT_METADATA_BYTES, typed_value.len(), )); - VariantArray::from_parts(metadata, None, Some(typed_value), None).into() + VariantArray::from_parts( + Arc::new(metadata) as ArrayRef, + None, + Some(typed_value), + None, + ) + .into() } }; } @@ -1696,6 +1718,41 @@ mod test { ]) ); + #[test] + fn test_variant_get_perfectly_shredded_binary_preserves_top_level_nulls() { + let metadata = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); + let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![ + Some(b"Apache" as &[u8]), + Some(b"masked-null" as &[u8]), + Some(b"Parquet-variant" as &[u8]), + ])); + let variant_array: ArrayRef = VariantArray::from_parts( + Arc::new(metadata) as _, + None, + Some(typed_value), + Some(NullBuffer::from(vec![true, false, true])), + ) + .into(); + + let result = variant_get( + &variant_array, + GetOptions::new().with_as_type(Some(FieldRef::from(Field::new( + "result", + DataType::Binary, + true, + )))), + ) + .unwrap(); + + let result = result.as_binary::(); + assert_eq!(result.len(), 3); + assert_eq!(result.null_count(), 1); + assert_eq!(result.value(0), b"Apache"); + assert!(result.is_null(1)); + assert_eq!(result.value(2), b"Parquet-variant"); + } + /// Return a VariantArray that represents an "all null" variant /// for the following example (3 null values): /// @@ -1723,7 +1780,12 @@ mod test { let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); - ArrayRef::from(VariantArray::from_parts(metadata, None, None, Some(nulls))) + ArrayRef::from(VariantArray::from_parts( + Arc::new(metadata) as ArrayRef, + None, + None, + Some(nulls), + )) } /// This test manually constructs a shredded variant array representing objects /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field @@ -1829,8 +1891,8 @@ mod test { // Create the main VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -2206,8 +2268,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -2297,7 +2359,7 @@ mod test { .unwrap(), ) as ArrayRef; let a_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(a_value_array), + Some(Arc::new(a_value_array) as ArrayRef), Some(a_inner_typed_value), None, ); @@ -2317,8 +2379,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -2399,7 +2461,7 @@ mod test { .unwrap(), ) as ArrayRef; let b_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(b_value_array), + Some(Arc::new(b_value_array) as ArrayRef), Some(b_inner_typed_value), None, ); @@ -2428,7 +2490,7 @@ mod test { .unwrap(), ) as ArrayRef; let a_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(a_value_array), + Some(Arc::new(a_value_array) as ArrayRef), Some(a_inner_typed_value), None, ); @@ -2448,8 +2510,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -3262,7 +3324,7 @@ mod test { // Build final VariantArray with top-level nulls ArrayRef::from(VariantArray::from_parts( - metadata_array, + Arc::new(metadata_array) as ArrayRef, None, Some(Arc::new(typed_value_struct)), Some(nulls), @@ -3321,7 +3383,7 @@ mod test { false, // row 3: top-level NULL ]); ArrayRef::from(VariantArray::from_parts( - metadata_array, + Arc::new(metadata_array) as ArrayRef, None, Some(Arc::new(typed_value)), Some(nulls), @@ -3390,8 +3452,8 @@ mod test { // Top-level null is encoded in the main StructArray's null mask let variant_nulls = NullBuffer::from(vec![true, true, true, false]); // Row 3 is top-level null ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), Some(variant_nulls), )) @@ -4068,9 +4130,13 @@ mod test { EMPTY_VARIANT_METADATA_BYTES, all_nulls_values.len(), )); - let variant_array: ArrayRef = - VariantArray::from_parts(metadata, None, Some(Arc::new(typed_value_struct)), None) - .into(); + let variant_array: ArrayRef = VariantArray::from_parts( + Arc::new(metadata) as ArrayRef, + None, + Some(Arc::new(typed_value_struct)), + None, + ) + .into(); // Case 1: all-null primitive column should reuse the typed_value Arc directly let all_nulls_field_ref = FieldRef::from(Field::new("result", DataType::Int32, true)); diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index dd054a5f7d8..6d1626640c1 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -26,11 +26,10 @@ use crate::type_conversion::{ use crate::variant_array::ShreddedVariantFieldArray; use crate::{VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, - BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, - GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, - OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, - StructArray, + ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewBuilder, + BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, GenericListViewArray, + LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, OffsetSizeTrait, + PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, StructArray, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::{CastOptions, DecimalCast}; @@ -119,7 +118,7 @@ fn make_typed_variant_to_arrow_row_builder<'a>( } pub(crate) fn make_variant_to_arrow_row_builder<'a>( - metadata: &BinaryViewArray, + metadata: &ArrayRef, path: VariantPath<'a>, data_type: Option<&'a DataType>, cast_options: &'a CastOptions, @@ -924,7 +923,7 @@ impl<'a> ListElementBuilder<'a> { Self::Shredded(b) => { let (value, typed_value, nulls) = b.finish()?; Ok(ArrayRef::from(ShreddedVariantFieldArray::from_parts( - Some(value), + Some(Arc::new(value)), Some(typed_value), nulls, ))) @@ -1052,13 +1051,13 @@ where /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { - metadata: BinaryViewArray, + metadata: ArrayRef, builder: VariantValueArrayBuilder, nulls: NullBufferBuilder, } impl VariantToBinaryVariantArrowRowBuilder { - fn new(metadata: BinaryViewArray, capacity: usize) -> Self { + fn new(metadata: ArrayRef, capacity: usize) -> Self { Self { metadata, builder: VariantValueArrayBuilder::new(capacity), @@ -1083,7 +1082,7 @@ impl VariantToBinaryVariantArrowRowBuilder { fn finish(mut self) -> Result { let variant_array = VariantArray::from_parts( self.metadata, - Some(self.builder.build()?), + Some(Arc::new(self.builder.build()?)), None, // no typed_value column self.nulls.finish(), );