diff --git a/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java b/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java
index 3146a3763cce..0366ed316544 100644
--- a/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java
+++ b/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java
@@ -82,6 +82,87 @@ public static ByteBuffer convertToByteBuffer(UUID value, ByteBuffer reuse) {
     return buffer;
   }
 
+  /**
+   * Convert the ASCII bytes of a UUID string directly to a 16-byte {@link ByteBuffer}, without
+   * creating an intermediate {@link UUID} object.
+   *
+   * @param uuidBytes ASCII bytes of a UUID in canonical form ({@code
+   *     xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}, 36 bytes)
+   * @return a newly allocated ByteBuffer containing the 16-byte UUID
+   */
+  public static ByteBuffer convertToByteBuffer(byte[] uuidBytes) {
+    return convertToByteBuffer(uuidBytes, null);
+  }
+
+  /**
+   * Convert the ASCII bytes of a UUID string directly to a 16-byte {@link ByteBuffer}, without
+   * creating an intermediate {@link UUID} object.
+   *
+   * @param uuidBytes ASCII bytes of a UUID in canonical form ({@code
+   *     xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}, 36 bytes)
+   * @param reuse a ByteBuffer to reuse, or null to allocate a new one
+   * @return the reuse buffer (or a new one) containing the 16-byte UUID
+   */
+  public static ByteBuffer convertToByteBuffer(byte[] uuidBytes, ByteBuffer reuse) {
+    Preconditions.checkArgument(
+        uuidBytes.length == 36, "Invalid UUID string: expected 36 bytes, got %s", uuidBytes.length);
+
+    // UUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+    // MSB from first 3 groups (8+4+4 hex digits = 16 hex = 64 bits)
+    long msb = parseHexLong(uuidBytes, 0, 8);
+    checkHyphen(uuidBytes, 8);
+    msb = (msb << 16) | parseHexLong(uuidBytes, 9, 4);
+    checkHyphen(uuidBytes, 13);
+    msb = (msb << 16) | parseHexLong(uuidBytes, 14, 4);
+
+    // LSB from last 2 groups (4+12 hex digits = 16 hex = 64 bits)
+    checkHyphen(uuidBytes, 18);
+    long lsb = parseHexLong(uuidBytes, 19, 4);
+    checkHyphen(uuidBytes, 23);
+    lsb = (lsb << 48) | parseHexLong(uuidBytes, 24, 12);
+
+    ByteBuffer buffer;
+    if (reuse != null) {
+      buffer = reuse;
+    } else {
+      buffer = ByteBuffer.allocate(16);
+    }
+
+    buffer.order(ByteOrder.BIG_ENDIAN);
+    buffer.putLong(0, msb);
+    buffer.putLong(8, lsb);
+    return buffer;
+  }
+
+  private static long parseHexLong(byte[] bytes, int start, int length) {
+    long result = 0;
+    for (int i = start; i < start + length; i++) {
+      int digit = hexValue(bytes[i]);
+      result = (result << 4) | digit;
+    }
+    return result;
+  }
+
+  private static int hexValue(byte hexByte) {
+    if (hexByte >= '0' && hexByte <= '9') {
+      return hexByte - '0';
+    } else if (hexByte >= 'a' && hexByte <= 'f') {
+      return hexByte - 'a' + 10;
+    } else if (hexByte >= 'A' && hexByte <= 'F') {
+      return hexByte - 'A' + 10;
+    }
+    throw new IllegalArgumentException(
+        "Invalid hex character '" + (char) hexByte + "' in UUID string");
+  }
+
+  private static void checkHyphen(byte[] bytes, int pos) {
+    Preconditions.checkArgument(
+        bytes[pos] == '-',
+        "Expected '-' at position %s in UUID string, got '%s'",
+        pos,
+        (char) bytes[pos]);
+  }
+
   /**
    * Generate a RFC 9562 UUIDv7.
    *
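For reference while reviewing, here is a minimal usage sketch of the new overload. It is illustrative only and not part of the patch: the class name UuidBytesExample is made up, while the UUIDUtil calls mirror the overloads added above and their tests. It shows the existing UUID.fromString path and the new direct byte[] path producing the same 16-byte encoding.

// Illustrative sketch only (not part of the patch); UuidBytesExample is a hypothetical name.
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.UUID;
import org.apache.iceberg.util.UUIDUtil;

public class UuidBytesExample {
  public static void main(String[] args) {
    String uuidStr = "f79c3e09-677c-4bbd-a479-3f349cb785e7";

    // existing path: materialize a java.util.UUID, then serialize it
    ByteBuffer viaUuid = UUIDUtil.convertToByteBuffer(UUID.fromString(uuidStr));

    // new path: parse the canonical 36-byte ASCII form directly
    ByteBuffer viaBytes =
        UUIDUtil.convertToByteBuffer(uuidStr.getBytes(StandardCharsets.US_ASCII));

    // both produce the same big-endian 16-byte encoding
    System.out.println(Arrays.equals(viaUuid.array(), viaBytes.array())); // true
  }
}

As with the existing UUID overload, callers that write many values can pass a pre-allocated 16-byte buffer as the second argument to avoid a per-value allocation.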
diff --git a/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java b/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java
index c5f85c2f20b3..8e09712552da 100644
--- a/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java
+++ b/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java
@@ -19,7 +19,11 @@
 package org.apache.iceberg.util;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
 import java.util.UUID;
 import org.junit.jupiter.api.Test;
 
@@ -31,4 +35,123 @@ public void uuidV7HasVersionAndVariant() {
     assertThat(uuid.version()).isEqualTo(7);
     assertThat(uuid.variant()).isEqualTo(2);
   }
+
+  @Test
+  public void convertFromBytesRoundTrip() {
+    // the byte[] overload must produce the same result as the UUID overload
+    String uuidStr = "f79c3e09-677c-4bbd-a479-3f349cb785e7";
+    UUID uuid = UUID.fromString(uuidStr);
+
+    ByteBuffer expected = UUIDUtil.convertToByteBuffer(uuid);
+    ByteBuffer actual = UUIDUtil.convertToByteBuffer(uuidStr.getBytes(StandardCharsets.US_ASCII));
+
+    assertThat(actual.array()).isEqualTo(expected.array());
+  }
+
+  @Test
+  public void convertFromBytesKnownVector() {
+    // known test vector from TestConversions
+    byte[] uuidBytes = "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
+    ByteBuffer result = UUIDUtil.convertToByteBuffer(uuidBytes);
+
+    byte[] expectedBytes = {
+      -9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25
+    };
+    assertThat(result.array()).isEqualTo(expectedBytes);
+  }
+
+  @Test
+  public void convertFromBytesAllZeros() {
+    byte[] uuidBytes = "00000000-0000-0000-0000-000000000000".getBytes(StandardCharsets.US_ASCII);
+    ByteBuffer result = UUIDUtil.convertToByteBuffer(uuidBytes);
+    assertThat(result.array()).isEqualTo(new byte[16]);
+  }
+
+  @Test
+  public void convertFromBytesAllFs() {
+    byte[] uuidBytes = "ffffffff-ffff-ffff-ffff-ffffffffffff".getBytes(StandardCharsets.US_ASCII);
+    ByteBuffer result = UUIDUtil.convertToByteBuffer(uuidBytes);
+
+    byte[] expected = new byte[16];
+    java.util.Arrays.fill(expected, (byte) 0xFF);
+    assertThat(result.array()).isEqualTo(expected);
+  }
+
+  @Test
+  public void convertFromBytesBufferReuse() {
+    byte[] uuidBytes = "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
+    ByteBuffer reuse = ByteBuffer.allocate(16);
+    reuse.order(ByteOrder.BIG_ENDIAN);
+
+    ByteBuffer result = UUIDUtil.convertToByteBuffer(uuidBytes, reuse);
+    assertThat(result).isSameAs(reuse);
+
+    ByteBuffer expected =
+        UUIDUtil.convertToByteBuffer(UUID.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7"));
+    assertThat(result.array()).isEqualTo(expected.array());
+  }
+
+  @Test
+  public void convertFromBytesMixedCase() {
+    byte[] lower = "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
+    byte[] upper = "F79C3E09-677C-4BBD-A479-3F349CB785E7".getBytes(StandardCharsets.US_ASCII);
+    byte[] mixed = "f79C3e09-677c-4Bbd-A479-3f349cB785E7".getBytes(StandardCharsets.US_ASCII);
+
+    byte[] expected = UUIDUtil.convertToByteBuffer(lower).array();
+    assertThat(UUIDUtil.convertToByteBuffer(upper).array()).isEqualTo(expected);
+    assertThat(UUIDUtil.convertToByteBuffer(mixed).array()).isEqualTo(expected);
+  }
+
+  @Test
+  public void convertFromBytesRandomRoundTrip() {
+    for (int i = 0; i < 100; i++) {
+      UUID uuid = UUID.randomUUID();
+      String str = uuid.toString();
+      byte[] asciiBytes = str.getBytes(StandardCharsets.US_ASCII);
+
+      ByteBuffer fromUuid = UUIDUtil.convertToByteBuffer(uuid);
+      ByteBuffer fromBytes = UUIDUtil.convertToByteBuffer(asciiBytes);
+
+      assertThat(fromBytes.array())
+          .as("Round-trip mismatch for UUID %s", str)
+          .isEqualTo(fromUuid.array());
+    }
+  }
+
+  @Test
+  public void convertFromBytesRejectsWrongLength() {
+    assertThatThrownBy(
+            () ->
+                UUIDUtil.convertToByteBuffer(
+                    "f79c3e09-677c-4bbd-a479-3f349cb785e".getBytes(StandardCharsets.US_ASCII)))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("expected 36 bytes");
+
+    assertThatThrownBy(
+            () ->
+                UUIDUtil.convertToByteBuffer(
+                    "f79c3e09-677c-4bbd-a479-3f349cb785e7a".getBytes(StandardCharsets.US_ASCII)))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("expected 36 bytes");
+  }
+
+  @Test
+  public void convertFromBytesRejectsMissingHyphen() {
+    assertThatThrownBy(
+            () ->
+                UUIDUtil.convertToByteBuffer(
+                    "f79c3e09x677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII)))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Expected '-'");
+  }
+
+  @Test
+  public void convertFromBytesRejectsInvalidHex() {
+    assertThatThrownBy(
+            () ->
+                UUIDUtil.convertToByteBuffer(
+                    "g79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII)))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Invalid hex character");
+  }
 }
diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
index 7f9810e4c60c..eef41cbf7a4e 100644
--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
+++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
@@ -20,7 +20,6 @@
 
 import java.nio.ByteBuffer;
 import java.util.List;
-import java.util.UUID;
 import java.util.stream.Stream;
 import org.apache.iceberg.FieldMetrics;
 import org.apache.iceberg.orc.OrcValueWriter;
@@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
       // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
       // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
       // would then overwrite previous values
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
       ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
     }
   }
diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
index f4ae6114c8ab..d3e25e48365a 100644
--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
+++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
@@ -25,7 +25,6 @@
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Optional;
-import java.util.UUID;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
 import org.apache.iceberg.parquet.ParquetValueWriter;
@@ -431,8 +430,7 @@ private UUIDWriter(ColumnDescriptor desc) {
 
     @Override
     public void write(int repetitionLevel, UTF8String string) {
-      UUID uuid = UUID.fromString(string.toString());
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
       column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
     }
   }
diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
index bb8218bd83df..6f6b29628601 100644
--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
+++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
@@ -23,7 +23,6 @@
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.List;
-import java.util.UUID;
 import org.apache.avro.io.Encoder;
 import org.apache.avro.util.Utf8;
 import org.apache.iceberg.avro.ValueWriter;
@@ -101,10 +100,11 @@ private UUIDWriter() {}
     @Override
     @SuppressWarnings("ByteBufferBackingArray")
     public void write(UTF8String s, Encoder encoder) throws IOException {
-      // TODO: direct conversion from string to byte buffer
-      UUID uuid = UUID.fromString(s.toString());
+      // use getBytes because it may return the backing byte array if available.
+      // otherwise, it copies to a new byte array, which is still cheaper than
+      // calling toString, which incurs encoding costs.
       // calling array() is safe because the buffer is always allocated by the thread-local
-      encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
+      encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
     }
   }
 
diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
index 7f9810e4c60c..eef41cbf7a4e 100644
--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
@@ -20,7 +20,6 @@
 
 import java.nio.ByteBuffer;
 import java.util.List;
-import java.util.UUID;
 import java.util.stream.Stream;
 import org.apache.iceberg.FieldMetrics;
 import org.apache.iceberg.orc.OrcValueWriter;
@@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
       // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
       // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
       // would then overwrite previous values
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
       ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
     }
   }
diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
index 58be7f610c81..2f25c290072d 100644
--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
@@ -25,7 +25,6 @@
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Optional;
-import java.util.UUID;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
 import org.apache.iceberg.parquet.ParquetValueWriter;
@@ -430,8 +429,7 @@ private UUIDWriter(ColumnDescriptor desc) {
 
     @Override
     public void write(int repetitionLevel, UTF8String string) {
-      UUID uuid = UUID.fromString(string.toString());
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
       column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
     }
   }
diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
index bb8218bd83df..6f6b29628601 100644
--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
@@ -23,7 +23,6 @@
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.List;
-import java.util.UUID;
 import org.apache.avro.io.Encoder;
 import org.apache.avro.util.Utf8;
 import org.apache.iceberg.avro.ValueWriter;
@@ -101,10 +100,11 @@ private UUIDWriter() {}
     @Override
     @SuppressWarnings("ByteBufferBackingArray")
     public void write(UTF8String s, Encoder encoder) throws IOException {
-      // TODO: direct conversion from string to byte buffer
-      UUID uuid = UUID.fromString(s.toString());
+      // use getBytes because it may return the backing byte array if available.
+      // otherwise, it copies to a new byte array, which is still cheaper than
+      // calling toString, which incurs encoding costs.
       // calling array() is safe because the buffer is always allocated by the thread-local
-      encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
+      encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
     }
   }
 
diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
index 7f9810e4c60c..eef41cbf7a4e 100644
--- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
+++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
@@ -20,7 +20,6 @@
 
 import java.nio.ByteBuffer;
 import java.util.List;
-import java.util.UUID;
 import java.util.stream.Stream;
 import org.apache.iceberg.FieldMetrics;
 import org.apache.iceberg.orc.OrcValueWriter;
@@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
       // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
       // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
       // would then overwrite previous values
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
       ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
     }
   }
diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
index ba816efc0ac8..c6411cd88472 100644
--- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
+++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
@@ -26,7 +26,6 @@
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Optional;
-import java.util.UUID;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
 import org.apache.iceberg.FieldMetrics;
@@ -456,8 +455,7 @@ private UUIDWriter(ColumnDescriptor desc) {
 
     @Override
     public void write(int repetitionLevel, UTF8String string) {
-      UUID uuid = UUID.fromString(string.toString());
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
       column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
     }
   }
diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
index d0d3483a7690..59eaae3c98c2 100644
--- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
+++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
@@ -23,7 +23,6 @@
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.List;
-import java.util.UUID;
 import org.apache.avro.io.Encoder;
 import org.apache.avro.util.Utf8;
 import org.apache.iceberg.avro.ValueWriter;
@@ -107,10 +106,11 @@ private UUIDWriter() {}
     @Override
     @SuppressWarnings("ByteBufferBackingArray")
    public void write(UTF8String s, Encoder encoder) throws IOException {
-      // TODO: direct conversion from string to byte buffer
-      UUID uuid = UUID.fromString(s.toString());
+      // use getBytes because it may return the backing byte array if available.
+      // otherwise, it copies to a new byte array, which is still cheaper than
+      // calling toString, which incurs encoding costs.
       // calling array() is safe because the buffer is always allocated by the thread-local
-      encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
+      encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
     }
   }
 
diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
index 7f9810e4c60c..eef41cbf7a4e 100644
--- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
+++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java
@@ -20,7 +20,6 @@
 
 import java.nio.ByteBuffer;
 import java.util.List;
-import java.util.UUID;
 import java.util.stream.Stream;
 import org.apache.iceberg.FieldMetrics;
 import org.apache.iceberg.orc.OrcValueWriter;
@@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
       // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
       // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
       // would then overwrite previous values
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
       ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
     }
   }
diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
index ba816efc0ac8..c6411cd88472 100644
--- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
+++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java
@@ -26,7 +26,6 @@
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Optional;
-import java.util.UUID;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
 import org.apache.iceberg.FieldMetrics;
@@ -456,8 +455,7 @@ private UUIDWriter(ColumnDescriptor desc) {
 
     @Override
     public void write(int repetitionLevel, UTF8String string) {
-      UUID uuid = UUID.fromString(string.toString());
-      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
+      ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
       column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
     }
   }
diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
index d0d3483a7690..59eaae3c98c2 100644
--- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
+++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java
@@ -23,7 +23,6 @@
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.List;
-import java.util.UUID;
 import org.apache.avro.io.Encoder;
 import org.apache.avro.util.Utf8;
 import org.apache.iceberg.avro.ValueWriter;
@@ -107,10 +106,11 @@ private UUIDWriter() {}
     @Override
     @SuppressWarnings("ByteBufferBackingArray")
     public void write(UTF8String s, Encoder encoder) throws IOException {
-      // TODO: direct conversion from string to byte buffer
-      UUID uuid = UUID.fromString(s.toString());
+      // use getBytes because it may return the backing byte array if available.
+      // otherwise, it copies to a new byte array, which is still cheaper than
+      // calling toString, which incurs encoding costs.
       // calling array() is safe because the buffer is always allocated by the thread-local
-      encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
+      encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
     }
   }
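To illustrate the write-side change across all the Spark writers above, here is a minimal, hypothetical sketch of the hot path (not part of the patch): the class name UuidWriterPathExample and its BUFFER field are stand-ins for the thread-local buffers used by SparkParquetWriters and SparkValueWriters. UTF8String.getBytes() may hand back the backing UTF-8 array (plain ASCII for a canonical UUID), so the text feeds the new byte[] overload without building an intermediate String or java.util.UUID.

// Illustrative sketch only (not part of the patch); names here are hypothetical.
import java.nio.ByteBuffer;
import org.apache.iceberg.util.UUIDUtil;
import org.apache.spark.unsafe.types.UTF8String;

public class UuidWriterPathExample {
  // mirrors the per-thread reuse pattern in the Parquet/Avro writers above
  private static final ThreadLocal<ByteBuffer> BUFFER =
      ThreadLocal.withInitial(() -> ByteBuffer.allocate(16));

  static byte[] toUuidBytes(UTF8String value) {
    // getBytes() skips the toString() decode; the reused buffer is safe as long as the
    // caller copies or writes out its contents before the next call on this thread
    return UUIDUtil.convertToByteBuffer(value.getBytes(), BUFFER.get()).array();
  }

  public static void main(String[] args) {
    UTF8String uuid = UTF8String.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7");
    System.out.println(toUuidBytes(uuid).length); // 16
  }
}

The ORC writer intentionally does not use this reuse pattern, as its comment above explains: setRef stores a reference to the array, so each value needs its own freshly allocated buffer.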