Skip to content

Commit 782504c

Browse files
authored
ICU: per-item zstd compression of libicudata (#237)
* ICU: per-item zstd compression of libicudata with udata.cpp lookup hook
* Compress only display-name trees; leave coll/brkitr/locale patterns raw
* Delegate package read to icupkg, use util.parseArgs, type everything
* Assert writePackage round-trips the input byte-for-byte before compressing
* Train zstd dict on cold items only; hot-items comment tweak
* Simplify hot-list to */pool.res — the only display-name exclusion that matters
* Use --train-cover, --no-check, --no-dictID
* Expand to A4-ko config; rename hot-items.txt → keep-raw.txt
* Name the compress-threshold constants per review
* Pin zstd 1.5.7 in Dockerfiles to match Bun's vendored decoder
1 parent 0d85951 commit 782504c

5 files changed

Lines changed: 477 additions & 13 deletions

File tree

Dockerfile

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ RUN ( apt-get update || \
4141
git \
4242
python3 \
4343
python3-pip \
44+
xz-utils \
4445
ninja-build \
4546
software-properties-common \
4647
apt-transport-https \
@@ -49,6 +50,21 @@ RUN ( apt-get update || \
4950
lsb-release \
5051
&& rm -rf /var/lib/apt/lists/*
5152

53+
# Install zstd (for icu/compress-data.ts). Pinned: focal's apt has 1.4.4 which
54+
# compresses meaningfully worse than 1.5.x; the pinned version matches Bun's vendored decoder.
55+
ARG ZSTD_VERSION=1.5.7
56+
RUN curl -fsSL "https://github.com/facebook/zstd/releases/download/v${ZSTD_VERSION}/zstd-${ZSTD_VERSION}.tar.gz" | tar xz -C /tmp \
57+
&& make -C /tmp/zstd-${ZSTD_VERSION}/programs zstd -j$(nproc) \
58+
&& cp /tmp/zstd-${ZSTD_VERSION}/programs/zstd /usr/local/bin/ \
59+
&& rm -rf /tmp/zstd-${ZSTD_VERSION} \
60+
&& zstd --version
61+
62+
# Install Node (for icu/compress-data.ts; needs >=23.6 for default type stripping)
63+
ARG NODE_VERSION=24.16.0
64+
RUN curl -fsSL "https://nodejs.org/dist/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-$(uname -m | sed 's/x86_64/x64/;s/aarch64/arm64/').tar.xz" \
65+
| tar -xJ -C /usr/local --strip-components=1 \
66+
&& node --version
67+
5268
# Install modern CMake for Ubuntu
5369
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null \
5470
&& apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" \
@@ -177,12 +193,17 @@ RUN echo "#include <iostream>\n#include <numbers>\nint main() { std::cout << std
177193

178194
# Download and build ICU.
179195
#
196+
# After tar, patch udata.cpp with a per-item decompression hook (a weak extern
197+
# Bun defines; null in ICU's own tools).
198+
#
180199
# After the first `make` (which produces bin/icupkg), filter data/in/icudt75l.dat
181-
# to drop converters/translit/rbnf/stringprep/confusables/unames, then rebuild
182-
# the data target. Bun has zero ucnv_/utrans_/usprep_/uspoof_ consumers
183-
# (TextCodecICU is removed in src/bun.js/bindings/TextEncodingRegistry.cpp and
184-
# UCONFIG_NO_LEGACY_CONVERSION=1 is set below), so this is unreachable data.
185-
# Cuts libicudata.a by ~7.4 MB with no observable change.
200+
# to drop converters/translit/rbnf/stringprep/confusables/unames — Bun has zero
201+
# ucnv_/utrans_/usprep_/uspoof_ consumers — then rebuild.
202+
#
203+
# Finally, repack the filtered .dat with per-item zstd (icu/compress-data.ts).
204+
# Items matching icu/keep-raw.txt stay uncompressed (too expensive to decode lazily).
205+
# The repacked libicudata.a also embeds the trained zstd dictionary.
206+
COPY icu/ /icu-bun/
186207
ADD https://github.com/unicode-org/icu/releases/download/release-75-1/icu4c-75_1-src.tgz /icu.tgz
187208
RUN --mount=type=tmpfs,target=/icu \
188209
export CFLAGS="$CFLAGS -Os -std=c17 $LTO_FLAG" && \
@@ -191,14 +212,16 @@ RUN --mount=type=tmpfs,target=/icu \
191212
cd /icu && \
192213
tar -xf /icu.tgz --strip-components=1 && \
193214
rm /icu.tgz && \
215+
patch -p1 < /icu-bun/udata-decompress-hook.patch && \
194216
cd source && \
195217
./configure --enable-static --disable-shared --disable-layoutex --disable-layout --with-data-packaging=static --disable-samples --disable-debug --disable-tests --disable-extras --disable-icuio && \
196218
make -j$(nproc) && \
197219
bin/icupkg -l data/in/icudt75l.dat | grep -E '\.(cnv|spp|cfu)$|^cnvalias\.icu$|^translit/|^rbnf/|^unames\.icu$' > data/in/rm.lst && \
198220
bin/icupkg --auto_toc_prefix -r data/in/rm.lst data/in/icudt75l.dat data/in/icudt75l_filtered.dat && \
199221
mv -f data/in/icudt75l_filtered.dat data/in/icudt75l.dat && \
200222
rm -rf data/out lib/libicudata.a && make -j$(nproc) && \
201-
make install && cp -r /icu/source/lib/* /output/lib && cp -r /icu/source/i18n/unicode/* /icu/source/common/unicode/* /output/include/unicode
223+
make install && cp -r /icu/source/lib/* /output/lib && cp -r /icu/source/i18n/unicode/* /icu/source/common/unicode/* /output/include/unicode && \
224+
node --experimental-strip-types /icu-bun/compress-data.ts data/in/icudt75l.dat /output/lib/libicudata.a --skip /icu-bun/keep-raw.txt --icupkg bin/icupkg
202225

203226
# Copy WebKit source and build
204227
COPY . /webkit

Dockerfile.musl

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,28 @@ ARG LTO_FLAG
4242
ARG LLVM_VERSION
4343
ARG DEFAULT_CFLAGS
4444

45-
RUN apk add --no-cache cpio curl tar
46-
45+
RUN apk add --no-cache cpio curl tar nodejs patch
46+
47+
# Install zstd (for icu/compress-data.ts). Pinned to match Bun's vendored
48+
# decoder so glibc/musl prebuilts compress identically.
49+
ARG ZSTD_VERSION=1.5.7
50+
RUN curl -fsSL "https://github.com/facebook/zstd/releases/download/v${ZSTD_VERSION}/zstd-${ZSTD_VERSION}.tar.gz" | tar xz -C /tmp \
51+
&& make -C /tmp/zstd-${ZSTD_VERSION}/programs zstd -j$(nproc) \
52+
&& cp /tmp/zstd-${ZSTD_VERSION}/programs/zstd /usr/local/bin/ \
53+
&& rm -rf /tmp/zstd-${ZSTD_VERSION} \
54+
&& zstd --version
55+
56+
# After tar, patch udata.cpp with a per-item decompression hook (a weak extern
57+
# Bun defines; null in ICU's own tools).
58+
#
4759
# After the first `make` (which produces bin/icupkg), filter data/in/icudt75l.dat
48-
# to drop converters/translit/rbnf/stringprep/confusables/unames, then rebuild
49-
# the data target. Bun has zero ucnv_/utrans_/usprep_/uspoof_ consumers
50-
# (TextCodecICU is removed in src/bun.js/bindings/TextEncodingRegistry.cpp).
51-
# Cuts libicudata.a by ~7.4 MB with no observable change.
60+
# to drop converters/translit/rbnf/stringprep/confusables/unames — Bun has zero
61+
# ucnv_/utrans_/usprep_/uspoof_ consumers — then rebuild.
62+
#
63+
# Finally, repack the filtered .dat with per-item zstd (icu/compress-data.ts).
64+
# Items matching icu/keep-raw.txt stay uncompressed (too expensive to decode lazily).
65+
# The repacked libicudata.a also embeds the trained zstd dictionary.
66+
COPY icu/ /icu-bun/
5267
ADD https://github.com/unicode-org/icu/releases/download/release-75-1/icu4c-75_1-src.tgz /icu.tgz
5368
RUN --mount=type=tmpfs,target=/icu \
5469
export CFLAGS="${DEFAULT_CFLAGS} ${MARCH_FLAG} $CFLAGS -Os -std=c17 $LTO_FLAG" && \
@@ -57,14 +72,16 @@ RUN --mount=type=tmpfs,target=/icu \
5772
cd /icu && \
5873
tar -xf /icu.tgz --strip-components=1 && \
5974
rm /icu.tgz && \
75+
patch -p1 < /icu-bun/udata-decompress-hook.patch && \
6076
cd source && \
6177
./configure --enable-static --disable-shared --with-data-packaging=static --disable-samples --disable-debug --disable-tests && \
6278
make -j$(nproc) && \
6379
bin/icupkg -l data/in/icudt75l.dat | grep -E '\.(cnv|spp|cfu)$|^cnvalias\.icu$|^translit/|^rbnf/|^unames\.icu$' > data/in/rm.lst && \
6480
bin/icupkg --auto_toc_prefix -r data/in/rm.lst data/in/icudt75l.dat data/in/icudt75l_filtered.dat && \
6581
mv -f data/in/icudt75l_filtered.dat data/in/icudt75l.dat && \
6682
rm -rf data/out lib/libicudata.a && make -j$(nproc) && \
67-
make install && cp -r /icu/source/lib/* /output/lib && cp -r /icu/source/i18n/unicode/* /icu/source/common/unicode/* /output/include/unicode
83+
make install && cp -r /icu/source/lib/* /output/lib && cp -r /icu/source/i18n/unicode/* /icu/source/common/unicode/* /output/include/unicode && \
84+
node --experimental-strip-types /icu-bun/compress-data.ts data/in/icudt75l.dat /output/lib/libicudata.a --skip /icu-bun/keep-raw.txt --icupkg bin/icupkg
6885

6986
FROM base as build_webkit
7087

0 commit comments

Comments
 (0)