-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Support Binary/LargeBinary children #9610
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<Variant | |
| let (value, typed_value, nulls) = builder.finish()?; | ||
| Ok(VariantArray::from_parts( | ||
| array.metadata_field().clone(), | ||
| Some(value), | ||
| Some(Arc::new(value) as ArrayRef), | ||
| Some(typed_value), | ||
| nulls, | ||
| )) | ||
|
|
@@ -443,8 +443,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { | |
| let mut builder = StructArrayBuilder::new(); | ||
| for (field_name, typed_value_builder) in self.typed_value_builders { | ||
| let (value, typed_value, nulls) = typed_value_builder.finish()?; | ||
| let array = | ||
| ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); | ||
| let array = ShreddedVariantFieldArray::from_parts( | ||
| Some(Arc::new(value) as ArrayRef), | ||
| Some(typed_value), | ||
| nulls, | ||
| ); | ||
| builder = builder.with_field(field_name, ArrayRef::from(array), false); | ||
| } | ||
| if let Some(nulls) = self.typed_value_nulls.finish() { | ||
|
|
@@ -689,6 +692,7 @@ impl VariantSchemaNode { | |
| mod tests { | ||
| use super::*; | ||
| use crate::VariantArrayBuilder; | ||
| use crate::variant_array::binary_array_value; | ||
| use arrow::array::{ | ||
| Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray, | ||
| GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray, | ||
|
|
@@ -867,7 +871,8 @@ mod tests { | |
| ) { | ||
| assert_eq!(array.len(), expected_len); | ||
|
|
||
| let fallbacks = (array.value_field().unwrap(), Some(array.metadata_field())); | ||
| let fallback_value = array.value_field().unwrap(); | ||
| let fallback_metadata = array.metadata_field(); | ||
| let array = downcast_list_like_array::<O>(array); | ||
|
|
||
| assert_eq!( | ||
|
|
@@ -887,7 +892,7 @@ mod tests { | |
| ); | ||
| assert_eq!( | ||
| array.len(), | ||
| fallbacks.0.len(), | ||
| fallback_value.len(), | ||
| "fallbacks value field should match array length" | ||
| ); | ||
|
|
||
|
|
@@ -902,28 +907,33 @@ mod tests { | |
| // Successfully shredded: typed list value present, no fallback value | ||
| assert!(array.is_valid(idx)); | ||
| assert_eq!(array.value_size(idx), *len); | ||
| assert!(fallbacks.0.is_null(idx)); | ||
| assert!(fallback_value.is_null(idx)); | ||
| } | ||
| None => { | ||
| // Unable to shred: typed list value absent, fallback should carry the variant | ||
| assert!(array.is_null(idx)); | ||
| assert_eq!(array.value_size(idx), O::zero()); | ||
| match expected_fallback { | ||
| Some(expected_variant) => { | ||
| assert!(fallbacks.0.is_valid(idx)); | ||
| let metadata_bytes = fallbacks | ||
| .1 | ||
| .filter(|m| m.is_valid(idx)) | ||
| .map(|m| m.value(idx)) | ||
| .filter(|bytes| !bytes.is_empty()) | ||
| .unwrap_or(EMPTY_VARIANT_METADATA_BYTES); | ||
| assert!(fallback_value.is_valid(idx)); | ||
| let metadata_bytes = | ||
| binary_array_value(fallback_metadata.as_ref(), idx).unwrap(); | ||
| let metadata_bytes = | ||
| if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() { | ||
| metadata_bytes | ||
| } else { | ||
| EMPTY_VARIANT_METADATA_BYTES | ||
| }; | ||
| assert_eq!( | ||
| Variant::new(metadata_bytes, fallbacks.0.value(idx)), | ||
| Variant::new( | ||
| metadata_bytes, | ||
| binary_array_value(fallback_value.as_ref(), idx).unwrap() | ||
| ), | ||
| expected_variant.clone() | ||
| ); | ||
| } | ||
| None => { | ||
| assert!(fallbacks.0.is_null(idx)); | ||
| assert!(fallback_value.is_null(idx)); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -983,7 +993,10 @@ mod tests { | |
| Some(expected_variant) => { | ||
| assert!(element_fallbacks.is_valid(idx)); | ||
| assert_eq!( | ||
| Variant::new(EMPTY_VARIANT_METADATA_BYTES, element_fallbacks.value(idx)), | ||
| Variant::new( | ||
| EMPTY_VARIANT_METADATA_BYTES, | ||
| binary_array_value(element_fallbacks.as_ref(), idx).unwrap() | ||
| ), | ||
| expected_variant.clone() | ||
| ); | ||
| } | ||
|
|
@@ -1129,7 +1142,7 @@ mod tests { | |
| #[test] | ||
| fn test_all_null_input() { | ||
| // Create VariantArray with no value field (all null case) | ||
| let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata | ||
| let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])) as ArrayRef; // minimal valid metadata | ||
| let all_null_array = VariantArray::from_parts(metadata, None, None, None); | ||
| let result = shred_variant(&all_null_array, &DataType::Int64).unwrap(); | ||
|
|
||
|
|
@@ -1243,7 +1256,10 @@ mod tests { | |
| assert!(!value_field.is_null(1)); // value should contain original | ||
| assert!(typed_value_field.is_null(1)); // typed_value should be null | ||
| assert_eq!( | ||
| Variant::new(metadata_field.value(1), value_field.value(1)), | ||
| Variant::new( | ||
| binary_array_value(metadata_field.as_ref(), 1).unwrap(), | ||
| binary_array_value(value_field.as_ref(), 1).unwrap() | ||
| ), | ||
| Variant::from("hello") | ||
| ); | ||
|
|
||
|
|
@@ -1259,7 +1275,10 @@ mod tests { | |
| assert!(!result.is_null(4)); | ||
| assert!(!value_field.is_null(4)); // should contain Variant::Null | ||
| assert_eq!( | ||
| Variant::new(metadata_field.value(4), value_field.value(4)), | ||
| Variant::new( | ||
| binary_array_value(metadata_field.as_ref(), 4).unwrap(), | ||
| binary_array_value(value_field.as_ref(), 4).unwrap() | ||
| ), | ||
| Variant::Null | ||
| ); | ||
| assert!(typed_value_field.is_null(4)); | ||
|
|
@@ -1336,7 +1355,10 @@ mod tests { | |
| assert!(value.is_valid(1)); | ||
| assert!(typed_value.is_null(1)); | ||
| assert_eq!( | ||
| Variant::new(metadata.value(1), value.value(1)), | ||
| Variant::new( | ||
| binary_array_value(metadata.as_ref(), 1).unwrap(), | ||
| binary_array_value(value.as_ref(), 1).unwrap() | ||
| ), | ||
| Variant::from(42i64) | ||
| ); | ||
|
|
||
|
|
@@ -1350,7 +1372,10 @@ mod tests { | |
| assert!(value.is_valid(3)); | ||
| assert!(typed_value.is_null(3)); | ||
| assert_eq!( | ||
| Variant::new(metadata.value(3), value.value(3)), | ||
| Variant::new( | ||
| binary_array_value(metadata.as_ref(), 3).unwrap(), | ||
| binary_array_value(value.as_ref(), 3).unwrap() | ||
| ), | ||
| Variant::Null | ||
| ); | ||
|
|
||
|
|
@@ -1392,7 +1417,10 @@ mod tests { | |
| assert!(value.is_valid(1)); | ||
| assert!(typed_value.is_null(1)); | ||
| assert_eq!( | ||
| Variant::new(metadata.value(1), value.value(1)), | ||
| Variant::new( | ||
| binary_array_value(metadata.as_ref(), 1).unwrap(), | ||
| binary_array_value(value.as_ref(), 1).unwrap() | ||
| ), | ||
| Variant::from("not_binary") | ||
| ); | ||
|
|
||
|
|
@@ -1406,7 +1434,10 @@ mod tests { | |
| assert!(value.is_valid(3)); | ||
| assert!(typed_value.is_null(3)); | ||
| assert_eq!( | ||
| Variant::new(metadata.value(3), value.value(3)), | ||
| Variant::new( | ||
| binary_array_value(metadata.as_ref(), 3).unwrap(), | ||
| binary_array_value(value.as_ref(), 3).unwrap() | ||
| ), | ||
|
Comment on lines
-1409
to
+1440
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are a lot of these, spread across several test modules. Do you think it would be helpful to define a test helper function that takes a pair of arrays (applying deref) plus an index, and encapsulates the binary array value+unwrap calls? assert_eq!(
variant_from_arrays_at(metadata, value, 3),
Ok(Variant::Null)
); |
||
| Variant::Null | ||
| ); | ||
|
|
||
|
|
@@ -1682,14 +1713,14 @@ mod tests { | |
| .unwrap(); | ||
| let outer_fallbacks = outer_elements.value_field().unwrap(); | ||
|
|
||
| let outer_metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n( | ||
| let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( | ||
| EMPTY_VARIANT_METADATA_BYTES, | ||
| outer_elements.len(), | ||
| )); | ||
| ))) as ArrayRef; | ||
| let outer_variant = VariantArray::from_parts( | ||
| outer_metadata, | ||
| Some(outer_fallbacks.clone()), | ||
| Some(Arc::new(outer_values.clone())), | ||
| Some(Arc::new(outer_values.clone()) as ArrayRef), | ||
| None, | ||
| ); | ||
|
|
||
|
|
@@ -1792,7 +1823,10 @@ mod tests { | |
| // null is stored as Variant::Null in values | ||
| assert!(id_values.is_valid(1)); | ||
| assert_eq!( | ||
| Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)), | ||
| Variant::new( | ||
| EMPTY_VARIANT_METADATA_BYTES, | ||
| binary_array_value(id_values.as_ref(), 1).unwrap() | ||
| ), | ||
| Variant::Null | ||
| ); | ||
| assert!(id_typed_values.is_null(1)); | ||
|
|
@@ -1866,7 +1900,6 @@ mod tests { | |
| assert_eq!(result.len(), 9); | ||
|
|
||
| let metadata = result.metadata_field(); | ||
|
|
||
| let value = result.value_field().unwrap(); | ||
| let typed_value = result | ||
| .typed_value_field() | ||
|
|
@@ -1882,24 +1915,14 @@ mod tests { | |
| let age_field = | ||
| ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap(); | ||
|
|
||
| let score_value = score_field | ||
| .value_field() | ||
| .unwrap() | ||
| .as_any() | ||
| .downcast_ref::<BinaryViewArray>() | ||
| .unwrap(); | ||
| let score_value = score_field.value_field().unwrap(); | ||
| let score_typed_value = score_field | ||
| .typed_value_field() | ||
| .unwrap() | ||
| .as_any() | ||
| .downcast_ref::<Float64Array>() | ||
| .unwrap(); | ||
| let age_value = age_field | ||
| .value_field() | ||
| .unwrap() | ||
| .as_any() | ||
| .downcast_ref::<BinaryViewArray>() | ||
| .unwrap(); | ||
| let age_value = age_field.value_field().unwrap(); | ||
| let age_typed_value = age_field | ||
| .typed_value_field() | ||
| .unwrap() | ||
|
|
@@ -1918,10 +1941,13 @@ mod tests { | |
| } | ||
| fn get_value<'m, 'v>( | ||
| i: usize, | ||
| metadata: &'m BinaryViewArray, | ||
| value: &'v BinaryViewArray, | ||
| metadata: &'m dyn Array, | ||
| value: &'v dyn Array, | ||
| ) -> Variant<'m, 'v> { | ||
| Variant::new(metadata.value(i), value.value(i)) | ||
| Variant::new( | ||
| binary_array_value(metadata, i).unwrap(), | ||
| binary_array_value(value, i).unwrap(), | ||
| ) | ||
| } | ||
| let expect = |i, expected_result: Option<ShreddedValue<ShreddedStruct>>| { | ||
| match expected_result { | ||
|
|
@@ -1933,7 +1959,10 @@ mod tests { | |
| match expected_value { | ||
| Some(expected_value) => { | ||
| assert!(value.is_valid(i)); | ||
| assert_eq!(expected_value, get_value(i, metadata, value)); | ||
| assert_eq!( | ||
| expected_value, | ||
| get_value(i, metadata.as_ref(), value.as_ref()) | ||
| ); | ||
| } | ||
| None => { | ||
| assert!(value.is_null(i)); | ||
|
|
@@ -1952,7 +1981,7 @@ mod tests { | |
| assert!(score_value.is_valid(i)); | ||
| assert_eq!( | ||
| expected_score_value, | ||
| get_value(i, metadata, score_value) | ||
| get_value(i, metadata.as_ref(), score_value.as_ref()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does deref coercion somehow not kick in here?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah... that only works for concrete types, not dyn trait refs. Blech. |
||
| ); | ||
| } | ||
| None => { | ||
|
|
@@ -1973,7 +2002,7 @@ mod tests { | |
| assert!(age_value.is_valid(i)); | ||
| assert_eq!( | ||
| expected_age_value, | ||
| get_value(i, metadata, age_value) | ||
| get_value(i, metadata.as_ref(), age_value.as_ref()) | ||
| ); | ||
| } | ||
| None => { | ||
|
|
@@ -2114,7 +2143,7 @@ mod tests { | |
| // Helper to correctly create a variant object using a row's existing metadata | ||
| let object_with_foo_field = |i| { | ||
| use parquet_variant::{ParentState, ValueBuilder, VariantMetadata}; | ||
| let metadata = VariantMetadata::new(metadata.value(i)); | ||
| let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i).unwrap()); | ||
| let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata); | ||
| let mut value_builder = ValueBuilder::new(); | ||
| let state = ParentState::variant(&mut value_builder, &mut metadata_builder); | ||
|
|
@@ -2213,7 +2242,10 @@ mod tests { | |
| assert!(value_field.is_null(2)); | ||
| assert!(value_field.is_valid(3)); | ||
| assert_eq!( | ||
| Variant::new(result.metadata_field().value(3), value_field.value(3)), | ||
| Variant::new( | ||
| binary_array_value(result.metadata_field().as_ref(), 3).unwrap(), | ||
| binary_array_value(value_field.as_ref(), 3).unwrap() | ||
| ), | ||
| Variant::from("not an object") | ||
| ); | ||
| assert!(value_field.is_null(4)); | ||
|
|
@@ -2231,10 +2263,10 @@ mod tests { | |
| .unwrap(); | ||
| assert_list_structure_and_elements::<Int64Type, i32>( | ||
| &VariantArray::from_parts( | ||
| BinaryViewArray::from_iter_values(std::iter::repeat_n( | ||
| Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( | ||
| EMPTY_VARIANT_METADATA_BYTES, | ||
| scores_field.len(), | ||
| )), | ||
| ))) as ArrayRef, | ||
| Some(scores_field.value_field().unwrap().clone()), | ||
| Some(scores_field.typed_value_field().unwrap().clone()), | ||
| None, | ||
|
|
@@ -2350,24 +2382,14 @@ mod tests { | |
| ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap()) | ||
| .unwrap(); | ||
|
|
||
| let id_value = id_field | ||
| .value_field() | ||
| .unwrap() | ||
| .as_any() | ||
| .downcast_ref::<BinaryViewArray>() | ||
| .unwrap(); | ||
| let id_value = id_field.value_field().unwrap(); | ||
| let id_typed_value = id_field | ||
| .typed_value_field() | ||
| .unwrap() | ||
| .as_any() | ||
| .downcast_ref::<FixedSizeBinaryArray>() | ||
| .unwrap(); | ||
| let session_id_value = session_id_field | ||
| .value_field() | ||
| .unwrap() | ||
| .as_any() | ||
| .downcast_ref::<BinaryViewArray>() | ||
| .unwrap(); | ||
| let session_id_value = session_id_field.value_field().unwrap(); | ||
| let session_id_typed_value = session_id_field | ||
| .typed_value_field() | ||
| .unwrap() | ||
|
|
@@ -2404,7 +2426,10 @@ mod tests { | |
| assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes()); | ||
|
|
||
| // Verify the value field contains the name field | ||
| let row_1_variant = Variant::new(metadata.value(1), value.value(1)); | ||
| let row_1_variant = Variant::new( | ||
| binary_array_value(metadata.as_ref(), 1).unwrap(), | ||
| binary_array_value(value.as_ref(), 1).unwrap(), | ||
| ); | ||
| let Variant::Object(obj) = row_1_variant else { | ||
| panic!("Expected object"); | ||
| }; | ||
|
|
@@ -2436,7 +2461,10 @@ mod tests { | |
|
|
||
| assert!(session_id_value.is_valid(3)); // type mismatch, stored in value | ||
| assert!(session_id_typed_value.is_null(3)); | ||
| let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3)); | ||
| let session_id_variant = Variant::new( | ||
| binary_array_value(metadata.as_ref(), 3).unwrap(), | ||
| binary_array_value(session_id_value.as_ref(), 3).unwrap(), | ||
| ); | ||
| assert_eq!(session_id_variant, Variant::from("not-a-uuid")); | ||
|
|
||
| // Row 4: Type mismatch - id is int64, not UUID | ||
|
|
@@ -2447,7 +2475,10 @@ mod tests { | |
|
|
||
| assert!(id_value.is_valid(4)); // type mismatch, stored in value | ||
| assert!(id_typed_value.is_null(4)); | ||
| let id_variant = Variant::new(metadata.value(4), id_value.value(4)); | ||
| let id_variant = Variant::new( | ||
| binary_array_value(metadata.as_ref(), 4).unwrap(), | ||
| binary_array_value(id_value.as_ref(), 4).unwrap(), | ||
| ); | ||
| assert_eq!(id_variant, Variant::from(12345i64)); | ||
|
|
||
| assert!(session_id_value.is_null(4)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I'm pretty sure the cast is unnecessary (automatic coercion by compiler)
(several more below)