diff --git a/arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs b/arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs index fa3066b7e11e..194305d3b65a 100644 --- a/arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs @@ -201,6 +201,10 @@ where fn finish_cloned(&self) -> ArrayRef { Arc::new(self.finish_cloned()) } + + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } } impl FixedSizeBinaryDictionaryBuilder diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 6eb48fc0527c..1ad2f7aeaa31 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -140,6 +140,10 @@ where fn finish_cloned(&self) -> ArrayRef { Arc::new(self.finish_cloned()) } + + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } } impl FixedSizeListBuilder @@ -211,6 +215,28 @@ where FixedSizeListArray::new(field, self.list_len, values, nulls) } + fn finish_preserve_values(&mut self) -> FixedSizeListArray { + let len = self.len(); + let values = self.values_builder.finish_preserve_values(); + let nulls = self.null_buffer_builder.finish(); + + assert_eq!( + values.len(), + len * self.list_len as usize, + "Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).", + values.len(), + self.list_len, + len, + ); + + let field = self + .field + .clone() + .unwrap_or_else(|| Arc::new(Field::new_list_field(values.data_type().clone(), true))); + + FixedSizeListArray::new(field, self.list_len, values, nulls) + } + /// Returns the current null buffer as a slice pub fn validity_slice(&self) -> Option<&[u8]> { self.null_buffer_builder.as_slice() @@ -224,7 +250,7 @@ mod tests { use crate::Array; use crate::Int32Array; - use crate::builder::Int32Builder; + use crate::builder::{Int32Builder, tests::PreserveValuesMock}; fn make_list_builder( include_null_element: bool, @@ -491,4 +517,18 @@ mod tests { builder.finish(); } + + #[test] + fn test_finish_preserve_values() { + let mut builder = FixedSizeListBuilder::new(PreserveValuesMock::default(), 2); + + builder.values().inner.append_value(0); + builder.values().inner.append_value(1); + builder.append(true); + + let arr = builder.finish_preserve_values(); + + assert_eq!(1, arr.len()); + assert_eq!(1, builder.values().called); + } } diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 35c7bfced1fd..956014a72b7d 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -256,6 +256,10 @@ where fn finish_cloned(&self) -> ArrayRef { Arc::new(self.finish_cloned()) } + + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } } impl GenericByteDictionaryBuilder diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index cabf7a514050..2f130baae945 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -166,6 +166,10 @@ where fn finish_cloned(&self) -> ArrayRef { Arc::new(self.finish_cloned()) } + + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } } impl GenericListBuilder @@ -329,6 +333,23 @@ where GenericListArray::new(field, offsets, values, nulls) } + fn finish_preserve_values(&mut self) -> GenericListArray { + let values = self.values_builder.finish_preserve_values(); + let nulls = self.null_buffer_builder.finish(); + + let offsets = Buffer::from_vec(std::mem::take(&mut self.offsets_builder)); + // Safety: Safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + self.offsets_builder.push(OffsetSize::zero()); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), + }; + + GenericListArray::new(field, offsets, values, nulls) + } + /// Returns the current offsets buffer as a slice pub fn offsets_slice(&self) -> &[OffsetSize] { self.offsets_builder.as_slice() @@ -364,7 +385,7 @@ where mod tests { use super::*; use crate::Int32Array; - use crate::builder::{Int32Builder, ListBuilder, make_builder}; + use crate::builder::{Int32Builder, ListBuilder, make_builder, tests::PreserveValuesMock}; use crate::cast::AsArray; use crate::types::Int32Type; use arrow_schema::DataType; @@ -818,4 +839,17 @@ mod tests { builder.append_value([Some(1)]); builder.finish(); } + + #[test] + fn test_finish_preserve_values() { + let mut builder = ListBuilder::new(PreserveValuesMock::default()); + + builder.values().inner.append_value(1); + builder.append(true); + + let arr = builder.finish_preserve_values(); + + assert_eq!(1, arr.len()); + assert_eq!(1, builder.values().called); + } } diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs index c13c21cb988b..4f9bd64c5db8 100644 --- a/arrow-array/src/builder/generic_list_view_builder.rs +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -71,6 +71,10 @@ impl ArrayBuilder fn finish_cloned(&self) -> ArrayRef { Arc::new(self.finish_cloned()) } + + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } } impl GenericListViewBuilder { @@ -216,6 +220,23 @@ where GenericListViewArray::new(field, offsets, sizes, values, nulls) } + fn finish_preserve_values(&mut self) -> GenericListViewArray { + let values = self.values_builder.finish_preserve_values(); + let nulls = self.null_buffer_builder.finish(); + let offsets = Buffer::from_vec(std::mem::take(&mut self.offsets_builder)); + self.current_offset = OffsetSize::zero(); + + // Safety: Safe by construction + let offsets = ScalarBuffer::from(offsets); + let sizes = Buffer::from_vec(std::mem::take(&mut self.sizes_builder)); + let sizes = ScalarBuffer::from(sizes); + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + /// Returns the current offsets buffer as a slice pub fn offsets_slice(&self) -> &[OffsetSize] { self.offsets_builder.as_slice() @@ -245,7 +266,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::builder::{Int32Builder, ListViewBuilder, make_builder}; + use crate::builder::{Int32Builder, ListViewBuilder, make_builder, tests::PreserveValuesMock}; use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Array, Int32Array}; @@ -703,4 +724,17 @@ mod tests { builder.append_value([Some(1)]); builder.finish(); } + + #[test] + fn test_finish_preserve_values() { + let mut builder = ListViewBuilder::new(PreserveValuesMock::default()); + + builder.values().inner.append_value(1); + builder.append(true); + + let arr = builder.finish_preserve_values(); + + assert_eq!(1, arr.len()); + assert_eq!(1, builder.values().called); + } } diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 5ff1625b4992..a6d49181762b 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -214,6 +214,18 @@ impl MapBuilder { self.finish_helper(keys_arr, values_arr, offset_buffer, nulls, len) } + fn finish_preserve_values(&mut self) -> MapArray { + let len = self.len(); + // Build the keys + let keys_arr = self.key_builder.finish_preserve_values(); + let values_arr = self.value_builder.finish_preserve_values(); + let offset_buffer = Buffer::from_vec(std::mem::take(&mut self.offsets_builder)); + self.offsets_builder.push(0); + let null_bit_buffer = self.null_buffer_builder.finish(); + + self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len) + } + fn finish_helper( &self, keys_arr: Arc, @@ -287,6 +299,10 @@ impl ArrayBuilder for MapBuilder { Arc::new(self.finish_cloned()) } + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } + fn as_any(&self) -> &dyn Any { self } @@ -303,7 +319,7 @@ impl ArrayBuilder for MapBuilder { #[cfg(test)] mod tests { use super::*; - use crate::builder::{Int32Builder, StringBuilder, make_builder}; + use crate::builder::{Int32Builder, StringBuilder, make_builder, tests::PreserveValuesMock}; use crate::{Int32Array, StringArray}; use std::collections::HashMap; @@ -516,4 +532,23 @@ mod tests { builder.finish(); } + + #[test] + fn test_finish_preserve_values() { + let mut builder = MapBuilder::new( + None, + PreserveValuesMock::default(), + PreserveValuesMock::default(), + ); + + builder.keys().inner.append_value(1); + builder.values().inner.append_value(2); + builder.append(true).unwrap(); + + let map = builder.finish_preserve_values(); + + assert_eq!(1, map.len()); + assert_eq!(1, builder.keys().called); + assert_eq!(1, builder.values().called); + } } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 02c6df453b6c..7feb1b4d2ba6 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -341,6 +341,18 @@ pub trait ArrayBuilder: Any + Send + Sync { /// Builds the array without resetting the underlying builder. fn finish_cloned(&self) -> ArrayRef; + /// Builds the array without resetting the values builder. + /// + /// This is relevant for dictionary builders but also for composite builders. + /// Those are not affected directly, but will call the corresponding method + /// on their constituent builders. + /// + /// The default implementation just calls [`finish`][Self::finish] which is sufficient + /// for all but the above mentioned builders. + fn finish_preserve_values(&mut self) -> ArrayRef { + self.finish() + } + /// Returns the builder as a non-mutable `Any` reference. /// /// This is most useful when one wants to call non-mutable APIs on a specific builder @@ -376,6 +388,10 @@ impl ArrayBuilder for Box { (**self).finish_cloned() } + fn finish_preserve_values(&mut self) -> ArrayRef { + (**self).finish_preserve_values() + } + fn as_any(&self) -> &dyn Any { (**self).as_any() } @@ -613,3 +629,45 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box unimplemented!("Data type {t} is not currently supported"), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Default)] + pub struct PreserveValuesMock { + pub called: usize, + pub inner: Int32Builder, + } + + impl ArrayBuilder for PreserveValuesMock { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box) -> Box { + self + } + + fn len(&self) -> usize { + self.inner.len() + } + + fn finish(&mut self) -> ArrayRef { + panic!("finish should never be called on PreserveValuesMock") + } + + fn finish_cloned(&self) -> ArrayRef { + panic!("finish_cloned should never be called on PreserveValuesMock") + } + + fn finish_preserve_values(&mut self) -> ArrayRef { + self.called += 1; + self.inner.finish_preserve_values() + } + } +} diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index d9544aec3b9d..4f2187197494 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -268,6 +268,10 @@ where fn finish_cloned(&self) -> ArrayRef { Arc::new(self.finish_cloned()) } + + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } } impl PrimitiveDictionaryBuilder diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 795593c98a8a..d299f7bf9d60 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -135,6 +135,10 @@ impl ArrayBuilder for StructBuilder { Arc::new(self.finish_cloned()) } + fn finish_preserve_values(&mut self) -> ArrayRef { + Arc::new(self.finish_preserve_values()) + } + /// Returns the builder as a non-mutable `Any` reference. /// /// This is most useful when one wants to call non-mutable APIs on a specific builder @@ -265,6 +269,23 @@ impl StructBuilder { StructArray::new(self.fields.clone(), arrays, nulls) } + fn finish_preserve_values(&mut self) -> StructArray { + self.validate_content(); + if self.fields.is_empty() { + return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish()); + } + + let arrays = self + .field_builders + .iter_mut() + .map(|f| f.finish_preserve_values()) + .collect(); + + let nulls = self.null_buffer_builder.finish(); + + StructArray::new(self.fields.clone(), arrays, nulls) + } + /// Constructs and validates contents in the builder to ensure that /// - fields and field_builders are of equal length /// - the number of items in individual field_builders are equal to self.len() @@ -304,7 +325,7 @@ mod tests { use arrow_data::ArrayData; use arrow_schema::Field; - use crate::{array::Array, types::ArrowDictionaryKeyType}; + use crate::{array::Array, builder::tests::PreserveValuesMock, types::ArrowDictionaryKeyType}; #[test] fn test_struct_array_builder() { @@ -523,6 +544,33 @@ mod tests { assert_eq!(0, builder.len()); } + #[test] + fn test_struct_array_builder_finish_preserve_values() { + let fields = vec![Field::new("mock", DataType::Int32, false)]; + let field_builders = vec![Box::new(PreserveValuesMock::default()) as Box]; + + let mut builder = StructBuilder::new(fields, field_builders); + builder + .field_builder::(0) + .unwrap() + .inner + .append_value(1); + builder.append(true); + + assert_eq!(1, builder.len()); + + let arr = builder.finish_preserve_values(); + + assert_eq!(1, arr.len()); + assert_eq!( + 1, + builder + .field_builder::(0) + .unwrap() + .called + ); + } + #[test] fn test_struct_array_builder_from_schema() { let mut fields = vec![