Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions api/src/main/java/org/apache/iceberg/util/UUIDUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,87 @@ public static ByteBuffer convertToByteBuffer(UUID value, ByteBuffer reuse) {
return buffer;
}

/**
 * Parses the canonical ASCII representation of a UUID ({@code
 * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}, 36 bytes) into a newly allocated 16-byte {@link
 * ByteBuffer}, skipping the intermediate {@link UUID} object entirely.
 *
 * @param uuidBytes the 36 ASCII bytes of a UUID string in canonical form
 * @return a new ByteBuffer holding the 16 raw UUID bytes
 */
public static ByteBuffer convertToByteBuffer(byte[] uuidBytes) {
  return convertToByteBuffer(uuidBytes, null);
}

/**
* Convert the ASCII bytes of a UUID string directly to a 16-byte {@link ByteBuffer}, without
* creating an intermediate {@link UUID} object.
*
* @param uuidBytes ASCII bytes of a UUID in canonical form ({@code
* xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}, 36 bytes)
* @param reuse a ByteBuffer to reuse, or null to allocate a new one
* @return the reuse buffer (or a new one) containing the 16-byte UUID
*/
public static ByteBuffer convertToByteBuffer(byte[] uuidBytes, ByteBuffer reuse) {
Preconditions.checkArgument(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you think a null check is worth it here? or just wasteful overkill?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@steveloughran I think it's overkill, because null here would immediately NPE on uuidBytes.length, which is a clear signal. The existing methods in this class also follow this pattern of not adding null checks, so adding here would be inconsistent.

uuidBytes.length == 36, "Invalid UUID string: expected 36 bytes, got %s", uuidBytes.length);

// UUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
// MSB from first 3 groups (8+4+4 hex digits = 16 hex = 64 bits)
long msb = parseHexLong(uuidBytes, 0, 8);
checkHyphen(uuidBytes, 8);
msb = (msb << 16) | parseHexLong(uuidBytes, 9, 4);
checkHyphen(uuidBytes, 13);
msb = (msb << 16) | parseHexLong(uuidBytes, 14, 4);

// LSB from last 2 groups (4+12 hex digits = 16 hex = 64 bits)
checkHyphen(uuidBytes, 18);
long lsb = parseHexLong(uuidBytes, 19, 4);
checkHyphen(uuidBytes, 23);
lsb = (lsb << 48) | parseHexLong(uuidBytes, 24, 12);

ByteBuffer buffer;
if (reuse != null) {
buffer = reuse;
} else {
buffer = ByteBuffer.allocate(16);
}

buffer.order(ByteOrder.BIG_ENDIAN);
buffer.putLong(0, msb);
buffer.putLong(8, lsb);
return buffer;
}

/**
 * Parses {@code length} hex digits from {@code bytes} starting at {@code start} into an unsigned
 * long, most significant digit first.
 */
private static long parseHexLong(byte[] bytes, int start, int length) {
  long value = 0;
  int end = start + length;
  for (int pos = start; pos < end; pos++) {
    value = (value << 4) | hexValue(bytes[pos]);
  }
  return value;
}

/**
 * Returns the numeric value (0-15) of a single ASCII hex digit, accepting both cases.
 *
 * @throws IllegalArgumentException if the byte is not a hex digit
 */
private static int hexValue(byte hexByte) {
  if (hexByte >= '0' && hexByte <= '9') {
    return hexByte - '0';
  }

  if (hexByte >= 'a' && hexByte <= 'f') {
    return 10 + (hexByte - 'a');
  }

  if (hexByte >= 'A' && hexByte <= 'F') {
    return 10 + (hexByte - 'A');
  }

  throw new IllegalArgumentException(
      "Invalid hex character '" + (char) hexByte + "' in UUID string");
}

/** Validates that the byte at {@code pos} is the ASCII '-' separator of the canonical form. */
private static void checkHyphen(byte[] bytes, int pos) {
  char actual = (char) bytes[pos];
  Preconditions.checkArgument(
      actual == '-', "Expected '-' at position %s in UUID string, got '%s'", pos, actual);
}

/**
* Generate a RFC 9562 UUIDv7.
*
Expand Down
123 changes: 123 additions & 0 deletions api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
package org.apache.iceberg.util;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.UUID;
import org.junit.jupiter.api.Test;

Expand All @@ -31,4 +35,123 @@ public void uuidV7HasVersionAndVariant() {
assertThat(uuid.version()).isEqualTo(7);
assertThat(uuid.variant()).isEqualTo(2);
}

@Test
public void convertFromBytesRoundTrip() {
  // the byte[] overload must produce the same result as the UUID overload
  String uuidStr = "f79c3e09-677c-4bbd-a479-3f349cb785e7";
  byte[] ascii = uuidStr.getBytes(StandardCharsets.US_ASCII);

  ByteBuffer fromUuid = UUIDUtil.convertToByteBuffer(UUID.fromString(uuidStr));
  ByteBuffer fromAscii = UUIDUtil.convertToByteBuffer(ascii);

  assertThat(fromAscii.array()).isEqualTo(fromUuid.array());
}

@Test
public void convertFromBytesKnownVector() {
  // known test vector from TestConversions
  ByteBuffer result =
      UUIDUtil.convertToByteBuffer(
          "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII));

  assertThat(result.array())
      .isEqualTo(
          new byte[] {
            -9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25
          });
}

@Test
public void convertFromBytesAllZeros() {
  // the nil UUID maps to 16 zero bytes
  byte[] ascii = "00000000-0000-0000-0000-000000000000".getBytes(StandardCharsets.US_ASCII);
  assertThat(UUIDUtil.convertToByteBuffer(ascii).array()).isEqualTo(new byte[16]);
}

@Test
public void convertFromBytesAllFs() {
  // the max UUID maps to 16 0xFF bytes
  byte[] uuidBytes = "ffffffff-ffff-ffff-ffff-ffffffffffff".getBytes(StandardCharsets.US_ASCII);
  ByteBuffer result = UUIDUtil.convertToByteBuffer(uuidBytes);

  // use the imported Arrays rather than a fully-qualified java.util.Arrays reference,
  // consistent with the rest of the file's import style
  byte[] expected = new byte[16];
  Arrays.fill(expected, (byte) 0xFF);
  assertThat(result.array()).isEqualTo(expected);
}

@Test
public void convertFromBytesBufferReuse() {
  // when a buffer is supplied, the same instance must be returned, filled in place
  ByteBuffer target = ByteBuffer.allocate(16);
  target.order(ByteOrder.BIG_ENDIAN);

  byte[] ascii = "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
  ByteBuffer returned = UUIDUtil.convertToByteBuffer(ascii, target);
  assertThat(returned).isSameAs(target);

  ByteBuffer expected =
      UUIDUtil.convertToByteBuffer(UUID.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7"));
  assertThat(returned.array()).isEqualTo(expected.array());
}

@Test
public void convertFromBytesMixedCase() {
  // hex parsing must accept lower, upper, and mixed case equally
  byte[] expected =
      UUIDUtil.convertToByteBuffer(
              "f79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII))
          .array();

  byte[] upper = "F79C3E09-677C-4BBD-A479-3F349CB785E7".getBytes(StandardCharsets.US_ASCII);
  byte[] mixed = "f79C3e09-677c-4Bbd-A479-3f349cB785E7".getBytes(StandardCharsets.US_ASCII);
  assertThat(UUIDUtil.convertToByteBuffer(upper).array()).isEqualTo(expected);
  assertThat(UUIDUtil.convertToByteBuffer(mixed).array()).isEqualTo(expected);
}

@Test
public void convertFromBytesRandomRoundTrip() {
  // random UUIDs must convert identically through both overloads
  for (int attempt = 0; attempt < 100; attempt++) {
    UUID uuid = UUID.randomUUID();
    String str = uuid.toString();

    ByteBuffer fromUuid = UUIDUtil.convertToByteBuffer(uuid);
    ByteBuffer fromBytes =
        UUIDUtil.convertToByteBuffer(str.getBytes(StandardCharsets.US_ASCII));

    assertThat(fromBytes.array())
        .as("Round-trip mismatch for UUID %s", str)
        .isEqualTo(fromUuid.array());
  }
}

@Test
public void convertFromBytesRejectsWrongLength() {
  // 35 bytes (one short) and 37 bytes (one long) must both be rejected
  byte[] tooShort = "f79c3e09-677c-4bbd-a479-3f349cb785e".getBytes(StandardCharsets.US_ASCII);
  byte[] tooLong = "f79c3e09-677c-4bbd-a479-3f349cb785e7a".getBytes(StandardCharsets.US_ASCII);

  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(tooShort))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected 36 bytes");

  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(tooLong))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected 36 bytes");
}

@Test
public void convertFromBytesRejectsMissingHyphen() {
  // a non-hyphen byte at a separator position must be rejected
  byte[] ascii = "f79c3e09x677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(ascii))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("Expected '-'");
}

@Test
public void convertFromBytesRejectsInvalidHex() {
  // a byte outside [0-9a-fA-F] inside a hex group must be rejected
  byte[] ascii = "g79c3e09-677c-4bbd-a479-3f349cb785e7".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(ascii))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("Invalid hex character");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
import org.apache.iceberg.parquet.ParquetValueWriter;
Expand Down Expand Up @@ -431,8 +430,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -101,10 +100,11 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// use getBytes because it may return the backing byte array if available.
// otherwise, it copies to a new byte array, which is still cheaper than
// calling toString, which incurs encoding costs.
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
import org.apache.iceberg.parquet.ParquetValueWriter;
Expand Down Expand Up @@ -430,8 +429,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -101,10 +100,11 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// use getBytes because it may return the backing byte array if available.
// otherwise, it copies to a new byte array, which is still cheaper than
// calling toString, which incurs encoding costs.
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes());
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
Expand Down Expand Up @@ -456,8 +455,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -107,10 +106,11 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// use getBytes because it may return the backing byte array if available.
// otherwise, it copies to a new byte array, which is still cheaper than
// calling toString, which incurs encoding costs.
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Loading
Loading