Skip to content

Commit a962aba

Browse files
committed
Fix SIGSEGV on IOThreadPool threads during HDFS scan
1 parent 0322f43 commit a962aba

5 files changed

Lines changed: 201 additions & 2 deletions

File tree

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.gluten.jni;
18+
19+
import org.apache.gluten.backendsapi.BackendsApiManager;
20+
import org.apache.gluten.runtime.Runtime;
21+
import org.apache.gluten.runtime.Runtimes;
22+
import org.apache.gluten.test.VeloxBackendTestBase;
23+
import org.apache.gluten.vectorized.ColumnarBatchInIterator;
24+
25+
import org.apache.spark.sql.vectorized.ColumnarBatch;
26+
import org.apache.spark.task.TaskResources$;
27+
import org.junit.Assert;
28+
import org.junit.Test;
29+
30+
import java.util.Collections;
31+
import java.util.Iterator;
32+
import java.util.concurrent.atomic.AtomicBoolean;
33+
import java.util.concurrent.atomic.AtomicReference;
34+
35+
/**
36+
* Regression test for SIGSEGV in CPUThreadPool threads during HDFS scan.
37+
*
38+
* <p>Root cause: JniColumnarBatchIterator destructor called DetachCurrentThread(), which poisoned
39+
* libhdfs.so's TLS-cached JNIEnv*. The next HDFS call on the same thread used the stale env,
40+
* causing SIGSEGV in jni_NewStringUTF.
41+
*
42+
* <p>This test reproduces the exact crash: on a native std::thread (simulating CPUThreadPool), it
43+
* saves the JNIEnv* (like libhdfs caches in TLS), destroys a real JniColumnarBatchIterator, then
44+
* reuses the saved env for a JNI call. With the buggy code, this triggers SIGSEGV and the JVM
45+
* crashes. With the fix, it works normally.
46+
*/
47+
public class JniThreadDetachTest extends VeloxBackendTestBase {
48+
49+
/**
50+
* Native helper in JniTestHelper.cc. Spawns a std::thread and reproduces:
51+
*
52+
* <ol>
53+
* <li>Attach thread, save env (simulates libhdfs TLS cache)
54+
* <li>Create/destroy real JniColumnarBatchIterator (destructor under test)
55+
* <li>Reuse saved env for FindClass (simulates libhdfs's next HDFS call)
56+
* </ol>
57+
*
58+
* With the fix: returns true. With the bug: SIGSEGV crashes the JVM at step 3.
59+
*/
60+
private static native boolean nativeTestIteratorDestructorKeepsThreadAttached(
61+
long runtimeHandle, Object jColumnarBatchItr);
62+
63+
@Test
64+
public void testIteratorDestructorDoesNotDetachThread() {
65+
AtomicBoolean result = new AtomicBoolean(false);
66+
AtomicReference<Throwable> thrown = new AtomicReference<>(null);
67+
68+
TaskResources$.MODULE$.runUnsafe(
69+
() -> {
70+
try {
71+
String backendName = BackendsApiManager.getBackendName();
72+
Runtime runtime = Runtimes.contextInstance(backendName, "JniThreadDetachTest");
73+
long runtimeHandle = runtime.getHandle();
74+
75+
Iterator<ColumnarBatch> emptyIter = Collections.emptyIterator();
76+
ColumnarBatchInIterator batchItr = new ColumnarBatchInIterator(backendName, emptyIter);
77+
78+
boolean ok = nativeTestIteratorDestructorKeepsThreadAttached(runtimeHandle, batchItr);
79+
result.set(ok);
80+
} catch (Throwable t) {
81+
thrown.set(t);
82+
}
83+
return null;
84+
});
85+
86+
if (thrown.get() != null) {
87+
Assert.fail(
88+
"Test setup failed (exception in TaskResources scope): " + thrown.get().getMessage());
89+
}
90+
Assert.assertTrue(
91+
"JNI call on native thread failed after JniColumnarBatchIterator destructor.",
92+
result.get());
93+
}
94+
}

cpp/core/jni/JniCommon.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,11 @@ gluten::JniColumnarBatchIterator::~JniColumnarBatchIterator() {
9595
attachCurrentThreadAsDaemonOrThrow(vm_, &env);
9696
env->DeleteGlobalRef(jColumnarBatchItr_);
9797
env->DeleteGlobalRef(serializedColumnarBatchIteratorClass_);
98-
vm_->DetachCurrentThread();
98+
// Do NOT call DetachCurrentThread() here.
99+
// libhdfs.so caches JNIEnv* in thread-local storage after AttachCurrentThread.
100+
// If we detach, libhdfs's TLS cache becomes stale — the next HDFS call via
101+
// libhdfs returns the stale env, causing SIGSEGV in jni_NewStringUTF.
102+
// Daemon-attached threads are safe to leave attached; they won't block JVM shutdown.
99103
}
100104

101105
std::shared_ptr<gluten::ColumnarBatch> gluten::JniColumnarBatchIterator::next() {

cpp/core/jni/JniWrapper.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,11 @@ class JavaInputStreamAdaptor final : public arrow::io::InputStream {
103103
env->CallVoidMethod(jniIn_, jniByteInputStreamClose);
104104
checkException(env);
105105
env->DeleteGlobalRef(jniIn_);
106-
vm_->DetachCurrentThread();
106+
// Do NOT call DetachCurrentThread() here.
107+
// libhdfs.so caches JNIEnv* in thread-local storage after AttachCurrentThread.
108+
// If we detach, libhdfs's TLS cache becomes stale — the next HDFS call via
109+
// libhdfs returns the stale env, causing SIGSEGV in jni_NewStringUTF.
110+
// Daemon-attached threads are safe to leave attached; they won't block JVM shutdown.
107111
closed_ = true;
108112
return arrow::Status::OK();
109113
}

cpp/velox/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ set(VELOX_SRCS
157157
jni/JniFileSystem.cc
158158
jni/JniUdf.cc
159159
jni/VeloxJniWrapper.cc
160+
jni/JniTestHelper.cc
160161
jni/JniHashTable.cc
161162
memory/BufferOutputStream.cc
162163
memory/VeloxColumnarBatch.cc

cpp/velox/jni/JniTestHelper.cc

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
// Native JNI helpers for Java integration tests (backends-velox/src/test/java).
19+
// Separated from VeloxJniWrapper.cc to keep production code clean.
20+
21+
#include <jni.h>
22+
23+
#include <atomic>
24+
#include <memory>
25+
#include <thread>
26+
27+
#include <jni/JniCommon.h>
28+
#include <jni/JniError.h>
29+
30+
#include "compute/Runtime.h"
31+
32+
#ifdef __cplusplus
33+
extern "C" {
34+
#endif
35+
36+
// Regression test helper for JniThreadDetachTest.
37+
//
38+
// Reproduces the exact production crash on a native std::thread (CPUThreadPool):
39+
// 1. Attach thread, save env (simulates libhdfs caching JNIEnv in TLS)
40+
// 2. Create and destroy a real JniColumnarBatchIterator (destructor under test)
41+
// 3. Reuse saved env for FindClass (simulates libhdfs's next hdfsGetPathInfo)
42+
//
43+
// With the fix (no DetachCurrentThread): step 3 succeeds, returns true.
44+
// With the bug (DetachCurrentThread present): step 3 triggers SIGSEGV — JVM crashes.
45+
JNIEXPORT jboolean JNICALL
46+
Java_org_apache_gluten_jni_JniThreadDetachTest_nativeTestIteratorDestructorKeepsThreadAttached( // NOLINT
47+
JNIEnv* env,
48+
jclass,
49+
jlong runtimeHandle,
50+
jobject jColumnarBatchItr) {
51+
JNI_METHOD_START
52+
JavaVM* vm;
53+
if (env->GetJavaVM(&vm) != JNI_OK) {
54+
throw gluten::GlutenException("Unable to get JavaVM instance");
55+
}
56+
auto* runtime = reinterpret_cast<gluten::Runtime*>(runtimeHandle);
57+
58+
// Convert local ref to global ref so the native thread can use it.
59+
jobject globalItr = env->NewGlobalRef(jColumnarBatchItr);
60+
61+
std::atomic<bool> succeeded{false};
62+
63+
// Spawn a native thread (simulates CPUThreadPool).
64+
std::thread t([vm, runtime, globalItr, &succeeded]() {
65+
// Step 1: Attach and save env.
66+
// In production, libhdfs does this and caches env in __thread TLS.
67+
JNIEnv* savedEnv = nullptr;
68+
attachCurrentThreadAsDaemonOrThrow(vm, &savedEnv);
69+
70+
// Step 2: Create and destroy a real JniColumnarBatchIterator.
71+
// The destructor previously called DetachCurrentThread, invalidating savedEnv.
72+
{
73+
auto iter = std::make_unique<gluten::JniColumnarBatchIterator>(savedEnv, globalItr, runtime);
74+
// Real destructor runs here.
75+
}
76+
77+
// Step 3: Reuse savedEnv — simulates libhdfs returning TLS-cached env.
78+
// With the bug: savedEnv is stale, FindClass triggers SIGSEGV (JVM crashes).
79+
// With the fix: savedEnv is valid, FindClass succeeds.
80+
jclass cls = savedEnv->FindClass("java/lang/String");
81+
if (cls != nullptr) {
82+
savedEnv->DeleteLocalRef(cls);
83+
}
84+
85+
succeeded.store(true);
86+
});
87+
t.join();
88+
89+
env->DeleteGlobalRef(globalItr);
90+
return succeeded.load() ? JNI_TRUE : JNI_FALSE;
91+
JNI_METHOD_END(JNI_FALSE)
92+
}
93+
94+
#ifdef __cplusplus
95+
}
96+
#endif

0 commit comments

Comments
 (0)