Skip to content

Commit b7839ff

Browse files
chore: address CodeRabbit review feedback
1. Remove fork-specific issue URL placeholders (upstream-ready)
2. Add consistency assertions to LargeList test:
   - offset == 0 check
   - content verification (bytes embedded)
3. Add offset == 0 check to Nifti test for consistency
1 parent 37728d0 commit b7839ff

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

src/datasets/table.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2125,7 +2125,6 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_
21252125
# When ds.shard() or ds.select() creates a sliced view, array.values returns
21262126
# values with internal offset references that can cause PyArrow's C++ layer
21272127
# to crash when processing nested types like Sequence(Nifti()).
2128-
# See: https://github.com/huggingface/datasets/issues/XXXX
21292128
if pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
21302129
if hasattr(array, "offset") and array.offset > 0:
21312130
array = pa.concat_arrays([array])

tests/features/test_embed_storage_sliced.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Tests for embed_array_storage with sliced/sharded arrays.
22
3-
Regression tests for https://github.com/huggingface/datasets/issues/XXXX
4-
(SIGKILL in embed_array_storage when processing sliced/sharded Arrow tables)
3+
Regression tests for SIGKILL crash when processing sliced/sharded Arrow tables
4+
with nested types like Sequence(Nifti()) or Sequence(Image()).
55
"""
66

77
import pyarrow as pa
@@ -81,6 +81,8 @@ def test_embed_array_storage_sliced_list_nifti(self, shared_datadir):
8181
# This should NOT crash with SIGKILL
8282
embedded = embed_array_storage(sliced, List(Nifti()))
8383

84+
# The fix should make the result contiguous (offset = 0)
85+
assert embedded.offset == 0, "Result should be contiguous after fix"
8486
assert len(embedded) == 2
8587
# Verify bytes were embedded
8688
assert embedded[0].as_py()[0]["bytes"] is not None
@@ -111,4 +113,10 @@ def test_embed_array_storage_sliced_large_list(self, shared_datadir):
111113
# This should NOT crash with SIGKILL
112114
embedded = embed_array_storage(sliced, LargeList(Image()))
113115

116+
# The fix should make the result contiguous (offset = 0)
117+
assert embedded.offset == 0, "Result should be contiguous after fix"
114118
assert len(embedded) == 2
119+
# Item 0 of sliced = Item 1 of original (has 2 images)
120+
assert len(embedded[0].as_py()) == 2
121+
# Verify bytes were embedded
122+
assert embedded[0].as_py()[0]["bytes"] is not None

0 commit comments

Comments
 (0)