Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions api/src/main/java/org/apache/iceberg/util/UUIDUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,87 @@ public static ByteBuffer convertToByteBuffer(UUID value, ByteBuffer reuse) {
return buffer;
}

/**
 * Parses the canonical ASCII representation of a UUID ({@code
 * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}, 36 bytes) into a newly allocated 16-byte {@link
 * ByteBuffer}, skipping the intermediate {@link UUID} object entirely.
 *
 * @param uuidBytes the 36 ASCII bytes of a UUID string in canonical form
 * @return a new ByteBuffer holding the 16 raw UUID bytes
 */
public static ByteBuffer convertToByteBuffer(byte[] uuidBytes) {
  return convertToByteBuffer(uuidBytes, null);
}

/**
* Convert the ASCII bytes of a UUID string directly to a 16-byte {@link ByteBuffer}, without
* creating an intermediate {@link UUID} object.
*
* @param uuidBytes ASCII bytes of a UUID in canonical form ({@code
* xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}, 36 bytes)
* @param reuse a ByteBuffer to reuse, or null to allocate a new one
* @return the reuse buffer (or a new one) containing the 16-byte UUID
*/
public static ByteBuffer convertToByteBuffer(byte[] uuidBytes, ByteBuffer reuse) {
Preconditions.checkArgument(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you think a null check is worth it here? or just wasteful overkill?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@steveloughran I think it's overkill, because null here would immediately NPE on uuidBytes.length, which is a clear signal. The existing methods in this class also follow this pattern of not adding null checks, so adding here would be inconsistent.

uuidBytes.length == 36, "Invalid UUID string: expected 36 bytes, got %s", uuidBytes.length);

// UUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
// MSB from first 3 groups (8+4+4 hex digits = 16 hex = 64 bits)
long msb = parseHexLong(uuidBytes, 0, 8);
checkHyphen(uuidBytes, 8);
msb = (msb << 16) | parseHexLong(uuidBytes, 9, 4);
checkHyphen(uuidBytes, 13);
msb = (msb << 16) | parseHexLong(uuidBytes, 14, 4);

// LSB from last 2 groups (4+12 hex digits = 16 hex = 64 bits)
checkHyphen(uuidBytes, 18);
long lsb = parseHexLong(uuidBytes, 19, 4);
checkHyphen(uuidBytes, 23);
lsb = (lsb << 48) | parseHexLong(uuidBytes, 24, 12);

ByteBuffer buffer;
if (reuse != null) {
buffer = reuse;
} else {
buffer = ByteBuffer.allocate(16);
}

buffer.order(ByteOrder.BIG_ENDIAN);
buffer.putLong(0, msb);
buffer.putLong(8, lsb);
return buffer;
}

/**
 * Parses {@code length} hex digits from {@code bytes} starting at {@code start} into an unsigned
 * long, most significant digit first.
 */
private static long parseHexLong(byte[] bytes, int start, int length) {
  long value = 0;
  int end = start + length;
  for (int pos = start; pos < end; pos++) {
    value = (value << 4) | hexValue(bytes[pos]);
  }
  return value;
}

/**
 * Returns the numeric value (0-15) of a single ASCII hex digit, accepting both cases.
 *
 * @throws IllegalArgumentException if the byte is not a hex digit
 */
private static int hexValue(byte hexByte) {
  if (hexByte >= '0' && hexByte <= '9') {
    return hexByte - '0';
  }

  if (hexByte >= 'a' && hexByte <= 'f') {
    return 10 + (hexByte - 'a');
  }

  if (hexByte >= 'A' && hexByte <= 'F') {
    return 10 + (hexByte - 'A');
  }

  throw new IllegalArgumentException(
      "Invalid hex character '" + (char) hexByte + "' in UUID string");
}

/** Validates that the byte at {@code pos} is the ASCII '-' separator of the canonical form. */
private static void checkHyphen(byte[] bytes, int pos) {
  char actual = (char) bytes[pos];
  Preconditions.checkArgument(
      actual == '-', "Expected '-' at position %s in UUID string, got '%s'", pos, actual);
}

/**
* Generate a RFC 9562 UUIDv7.
*
Expand Down
123 changes: 123 additions & 0 deletions api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
package org.apache.iceberg.util;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.UUID;
import org.junit.jupiter.api.Test;

Expand All @@ -31,4 +35,123 @@ public void uuidV7HasVersionAndVariant() {
assertThat(uuid.version()).isEqualTo(7);
assertThat(uuid.variant()).isEqualTo(2);
}

@Test
public void convertFromBytesRoundTrip() {
  // the byte[] overload must produce the same result as the UUID overload
  String uuidStr = "f79c3e09-677c-4bbd-a479-3f349cb785e7";
  byte[] ascii = uuidStr.getBytes(StandardCharsets.US_ASCII);

  ByteBuffer fromUuid = UUIDUtil.convertToByteBuffer(UUID.fromString(uuidStr));
  ByteBuffer fromAscii = UUIDUtil.convertToByteBuffer(ascii);

  assertThat(fromAscii.array()).isEqualTo(fromUuid.array());
}

@Test
public void convertFromBytesKnownVector() {
  // known test vector from TestConversions
  ByteBuffer result =
      UUIDUtil.convertToByteBuffer(
          "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII));

  assertThat(result.array())
      .isEqualTo(
          new byte[] {
            -9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25
          });
}

@Test
public void convertFromBytesAllZeros() {
  // the nil UUID maps to 16 zero bytes
  byte[] ascii = "00000000-0000-0000-0000-000000000000".getBytes(StandardCharsets.US_ASCII);
  assertThat(UUIDUtil.convertToByteBuffer(ascii).array()).isEqualTo(new byte[16]);
}

@Test
public void convertFromBytesAllFs() {
  // the max UUID maps to 16 0xFF bytes
  byte[] uuidBytes = "ffffffff-ffff-ffff-ffff-ffffffffffff".getBytes(StandardCharsets.US_ASCII);
  ByteBuffer result = UUIDUtil.convertToByteBuffer(uuidBytes);

  // use the imported Arrays rather than a fully-qualified java.util.Arrays reference,
  // consistent with the rest of the file's import style
  byte[] expected = new byte[16];
  Arrays.fill(expected, (byte) 0xFF);
  assertThat(result.array()).isEqualTo(expected);
}

@Test
public void convertFromBytesBufferReuse() {
  // when a buffer is supplied, the same instance must be returned, filled in place
  ByteBuffer target = ByteBuffer.allocate(16);
  target.order(ByteOrder.BIG_ENDIAN);

  byte[] ascii = "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
  ByteBuffer returned = UUIDUtil.convertToByteBuffer(ascii, target);
  assertThat(returned).isSameAs(target);

  ByteBuffer expected =
      UUIDUtil.convertToByteBuffer(UUID.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7"));
  assertThat(returned.array()).isEqualTo(expected.array());
}

@Test
public void convertFromBytesMixedCase() {
  // hex parsing must accept lower, upper, and mixed case equally
  byte[] expected =
      UUIDUtil.convertToByteBuffer(
              "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII))
          .array();

  byte[] upper = "F79C3E09-677C-4BBD-A479-3F349CB785E7".getBytes(StandardCharsets.US_ASCII);
  byte[] mixed = "f79C3e09-677c-4Bbd-A479-3f349cB785E7".getBytes(StandardCharsets.US_ASCII);
  assertThat(UUIDUtil.convertToByteBuffer(upper).array()).isEqualTo(expected);
  assertThat(UUIDUtil.convertToByteBuffer(mixed).array()).isEqualTo(expected);
}

@Test
public void convertFromBytesRandomRoundTrip() {
  // random UUIDs must convert identically through both overloads
  for (int attempt = 0; attempt < 100; attempt++) {
    UUID uuid = UUID.randomUUID();
    String str = uuid.toString();

    ByteBuffer fromUuid = UUIDUtil.convertToByteBuffer(uuid);
    ByteBuffer fromBytes =
        UUIDUtil.convertToByteBuffer(str.getBytes(StandardCharsets.US_ASCII));

    assertThat(fromBytes.array())
        .as("Round-trip mismatch for UUID %s", str)
        .isEqualTo(fromUuid.array());
  }
}

@Test
public void convertFromBytesRejectsWrongLength() {
  // 35 bytes (one short) and 37 bytes (one long) must both be rejected
  byte[] tooShort = "f79c3e09-677c-4bbd-a479-3f349cb785e".getBytes(StandardCharsets.US_ASCII);
  byte[] tooLong = "f79c3e09-677c-4bbd-a479-3f349cb785e7a".getBytes(StandardCharsets.US_ASCII);

  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(tooShort))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected 36 bytes");

  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(tooLong))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected 36 bytes");
}

@Test
public void convertFromBytesRejectsMissingHyphen() {
  // a non-hyphen byte at a separator position must be rejected
  byte[] ascii = "f79c3e09x677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(ascii))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("Expected '-'");
}

@Test
public void convertFromBytesRejectsInvalidHex() {
  // a byte outside [0-9a-fA-F] inside a hex group must be rejected
  byte[] ascii = "g79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(ascii))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("Invalid hex character");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
import org.apache.iceberg.parquet.ParquetValueWriter;
Expand Down Expand Up @@ -431,8 +430,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -101,10 +100,11 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// use getBytes because it may return the backing byte array if available.
// otherwise, it copies to a new byte array, which is still cheaper than
// calling toString, which incurs encoding costs.
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
import org.apache.iceberg.parquet.ParquetValueWriter;
Expand Down Expand Up @@ -430,8 +429,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -101,10 +100,11 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// use getBytes because it may return the backing byte array if available.
// otherwise, it copies to a new byte array, which is still cheaper than
// calling toString, which incurs encoding costs.
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
Expand Down Expand Up @@ -456,8 +455,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -107,10 +106,11 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// use getBytes because it may return the backing byte array if available.
// otherwise, it copies to a new byte array, which is still cheaper than
// calling toString, which incurs encoding costs.
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Loading
Loading