Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.jni;

import org.apache.gluten.backendsapi.BackendsApiManager;
import org.apache.gluten.runtime.Runtime;
import org.apache.gluten.runtime.Runtimes;
import org.apache.gluten.test.VeloxBackendTestBase;
import org.apache.gluten.vectorized.ColumnarBatchInIterator;

import org.apache.spark.sql.vectorized.ColumnarBatch;
import org.apache.spark.task.TaskResources$;
import org.junit.Assert;
import org.junit.Test;

import java.util.Collections;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

/**
* Regression test for SIGSEGV in CPUThreadPool threads during HDFS scan.
*
* <p>Root cause: JniColumnarBatchIterator destructor called DetachCurrentThread(), which poisoned
* libhdfs.so's TLS-cached JNIEnv*. The next HDFS call on the same thread used the stale env,
* causing SIGSEGV in jni_NewStringUTF.
*
* <p>This test reproduces the exact crash: on a native std::thread (simulating CPUThreadPool), it
* saves the JNIEnv* (like libhdfs caches in TLS), destroys a real JniColumnarBatchIterator, then
* reuses the saved env for a JNI call. With the buggy code, this triggers SIGSEGV and the JVM
* crashes. With the fix, it works normally.
*/
public class JniThreadDetachTest extends VeloxBackendTestBase {

/**
* Native helper in JniTestHelper.cc. Spawns a std::thread and reproduces:
*
* <ol>
* <li>Attach thread, save env (simulates libhdfs TLS cache)
* <li>Create/destroy real JniColumnarBatchIterator (destructor under test)
* <li>Reuse saved env for FindClass (simulates libhdfs's next HDFS call)
* </ol>
*
* With the fix: returns true. With the bug: SIGSEGV crashes the JVM at step 3.
*/
private static native boolean nativeTestIteratorDestructorKeepsThreadAttached(
long runtimeHandle, Object jColumnarBatchItr);

@Test
public void testIteratorDestructorDoesNotDetachThread() {
AtomicBoolean result = new AtomicBoolean(false);
AtomicReference<Throwable> thrown = new AtomicReference<>(null);

TaskResources$.MODULE$.runUnsafe(
() -> {
try {
String backendName = BackendsApiManager.getBackendName();
Runtime runtime = Runtimes.contextInstance(backendName, "JniThreadDetachTest");
long runtimeHandle = runtime.getHandle();

Iterator<ColumnarBatch> emptyIter = Collections.emptyIterator();
ColumnarBatchInIterator batchItr = new ColumnarBatchInIterator(backendName, emptyIter);

boolean ok = nativeTestIteratorDestructorKeepsThreadAttached(runtimeHandle, batchItr);
result.set(ok);
} catch (Throwable t) {
thrown.set(t);
}
return null;
});

if (thrown.get() != null) {
Assert.fail(
"Test setup failed (exception in TaskResources scope): " + thrown.get().getMessage());
}
Assert.assertTrue(
"JNI call on native thread failed after JniColumnarBatchIterator destructor.",
result.get());
}
}
6 changes: 5 additions & 1 deletion cpp/core/jni/JniCommon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@ gluten::JniColumnarBatchIterator::~JniColumnarBatchIterator() {
attachCurrentThreadAsDaemonOrThrow(vm_, &env);
env->DeleteGlobalRef(jColumnarBatchItr_);
env->DeleteGlobalRef(serializedColumnarBatchIteratorClass_);
vm_->DetachCurrentThread();
// Do NOT call DetachCurrentThread() here.
// libhdfs.so caches JNIEnv* in thread-local storage after AttachCurrentThread.
// If we detach, libhdfs's TLS cache becomes stale — the next HDFS call via
// libhdfs returns the stale env, causing SIGSEGV in jni_NewStringUTF.
// Daemon-attached threads are safe to leave attached; they won't block JVM shutdown.
}

std::shared_ptr<gluten::ColumnarBatch> gluten::JniColumnarBatchIterator::next() {
Expand Down
6 changes: 5 additions & 1 deletion cpp/core/jni/JniWrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,11 @@ class JavaInputStreamAdaptor final : public arrow::io::InputStream {
env->CallVoidMethod(jniIn_, jniByteInputStreamClose);
checkException(env);
env->DeleteGlobalRef(jniIn_);
vm_->DetachCurrentThread();
// Do NOT call DetachCurrentThread() here.
// libhdfs.so caches JNIEnv* in thread-local storage after AttachCurrentThread.
// If we detach, libhdfs's TLS cache becomes stale — the next HDFS call via
// libhdfs returns the stale env, causing SIGSEGV in jni_NewStringUTF.
// Daemon-attached threads are safe to leave attached; they won't block JVM shutdown.
closed_ = true;
return arrow::Status::OK();
}
Expand Down
1 change: 1 addition & 0 deletions cpp/velox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ set(VELOX_SRCS
jni/JniFileSystem.cc
jni/JniUdf.cc
jni/VeloxJniWrapper.cc
jni/JniTestHelper.cc
jni/JniHashTable.cc
memory/BufferOutputStream.cc
memory/VeloxColumnarBatch.cc
Expand Down
96 changes: 96 additions & 0 deletions cpp/velox/jni/JniTestHelper.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Native JNI helpers for Java integration tests (backends-velox/src/test/java).
// Separated from VeloxJniWrapper.cc to keep production code clean.

#include <jni.h>

#include <atomic>
#include <memory>
#include <thread>

#include <jni/JniCommon.h>
#include <jni/JniError.h>

#include "compute/Runtime.h"

#ifdef __cplusplus
extern "C" {
#endif

// Regression test helper for JniThreadDetachTest.
//
// Reproduces the exact production crash on a native std::thread (CPUThreadPool):
// 1. Attach thread, save env (simulates libhdfs caching JNIEnv in TLS)
// 2. Create and destroy a real JniColumnarBatchIterator (destructor under test)
// 3. Reuse saved env for FindClass (simulates libhdfs's next hdfsGetPathInfo)
//
// With the fix (no DetachCurrentThread): step 3 succeeds, returns true.
// With the bug (DetachCurrentThread present): step 3 triggers SIGSEGV — JVM crashes.
JNIEXPORT jboolean JNICALL
Java_org_apache_gluten_jni_JniThreadDetachTest_nativeTestIteratorDestructorKeepsThreadAttached( // NOLINT
JNIEnv* env,
jclass,
jlong runtimeHandle,
jobject jColumnarBatchItr) {
JNI_METHOD_START
JavaVM* vm;
if (env->GetJavaVM(&vm) != JNI_OK) {
throw gluten::GlutenException("Unable to get JavaVM instance");
}
auto* runtime = reinterpret_cast<gluten::Runtime*>(runtimeHandle);

// Convert local ref to global ref so the native thread can use it.
jobject globalItr = env->NewGlobalRef(jColumnarBatchItr);

std::atomic<bool> succeeded{false};

// Spawn a native thread (simulates CPUThreadPool).
std::thread t([vm, runtime, globalItr, &succeeded]() {
// Step 1: Attach and save env.
// In production, libhdfs does this and caches env in __thread TLS.
JNIEnv* savedEnv = nullptr;
attachCurrentThreadAsDaemonOrThrow(vm, &savedEnv);

// Step 2: Create and destroy a real JniColumnarBatchIterator.
// The destructor previously called DetachCurrentThread, invalidating savedEnv.
{
auto iter = std::make_unique<gluten::JniColumnarBatchIterator>(savedEnv, globalItr, runtime);
// Real destructor runs here.
}

// Step 3: Reuse savedEnv — simulates libhdfs returning TLS-cached env.
// With the bug: savedEnv is stale, FindClass triggers SIGSEGV (JVM crashes).
// With the fix: savedEnv is valid, FindClass succeeds.
jclass cls = savedEnv->FindClass("java/lang/String");
if (cls != nullptr) {
savedEnv->DeleteLocalRef(cls);
}

succeeded.store(true);
});
t.join();

env->DeleteGlobalRef(globalItr);
return succeeded.load() ? JNI_TRUE : JNI_FALSE;
JNI_METHOD_END(JNI_FALSE)
}

#ifdef __cplusplus
}
#endif
Loading