|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | +package org.apache.gluten.jni; |
| 18 | + |
| 19 | +import org.apache.gluten.backendsapi.BackendsApiManager; |
| 20 | +import org.apache.gluten.runtime.Runtime; |
| 21 | +import org.apache.gluten.runtime.Runtimes; |
| 22 | +import org.apache.gluten.test.VeloxBackendTestBase; |
| 23 | +import org.apache.gluten.vectorized.ColumnarBatchInIterator; |
| 24 | + |
| 25 | +import org.apache.spark.sql.vectorized.ColumnarBatch; |
| 26 | +import org.apache.spark.task.TaskResources$; |
| 27 | +import org.junit.Assert; |
| 28 | +import org.junit.Test; |
| 29 | + |
| 30 | +import java.util.Collections; |
| 31 | +import java.util.Iterator; |
| 32 | +import java.util.concurrent.atomic.AtomicBoolean; |
| 33 | +import java.util.concurrent.atomic.AtomicReference; |
| 34 | + |
| 35 | +/** |
| 36 | + * Regression test for SIGSEGV in CPUThreadPool threads during HDFS scan. |
| 37 | + * |
| 38 | + * <p>Root cause: JniColumnarBatchIterator destructor called DetachCurrentThread(), which poisoned |
| 39 | + * libhdfs.so's TLS-cached JNIEnv*. The next HDFS call on the same thread used the stale env, |
| 40 | + * causing SIGSEGV in jni_NewStringUTF. |
| 41 | + * |
| 42 | + * <p>This test reproduces the exact crash: on a native std::thread (simulating CPUThreadPool), it |
| 43 | + * saves the JNIEnv* (like libhdfs caches in TLS), destroys a real JniColumnarBatchIterator, then |
| 44 | + * reuses the saved env for a JNI call. With the buggy code, this triggers SIGSEGV and the JVM |
| 45 | + * crashes. With the fix, it works normally. |
| 46 | + */ |
| 47 | +public class JniThreadDetachTest extends VeloxBackendTestBase { |
| 48 | + |
| 49 | + /** |
| 50 | + * Native helper in JniTestHelper.cc. Spawns a std::thread and reproduces: |
| 51 | + * |
| 52 | + * <ol> |
| 53 | + * <li>Attach thread, save env (simulates libhdfs TLS cache) |
| 54 | + * <li>Create/destroy real JniColumnarBatchIterator (destructor under test) |
| 55 | + * <li>Reuse saved env for FindClass (simulates libhdfs's next HDFS call) |
| 56 | + * </ol> |
| 57 | + * |
| 58 | + * With the fix: returns true. With the bug: SIGSEGV crashes the JVM at step 3. |
| 59 | + */ |
| 60 | + private static native boolean nativeTestIteratorDestructorKeepsThreadAttached( |
| 61 | + long runtimeHandle, Object jColumnarBatchItr); |
| 62 | + |
| 63 | + @Test |
| 64 | + public void testIteratorDestructorDoesNotDetachThread() { |
| 65 | + AtomicBoolean result = new AtomicBoolean(false); |
| 66 | + AtomicReference<Throwable> thrown = new AtomicReference<>(null); |
| 67 | + |
| 68 | + TaskResources$.MODULE$.runUnsafe( |
| 69 | + () -> { |
| 70 | + try { |
| 71 | + String backendName = BackendsApiManager.getBackendName(); |
| 72 | + Runtime runtime = Runtimes.contextInstance(backendName, "JniThreadDetachTest"); |
| 73 | + long runtimeHandle = runtime.getHandle(); |
| 74 | + |
| 75 | + Iterator<ColumnarBatch> emptyIter = Collections.emptyIterator(); |
| 76 | + ColumnarBatchInIterator batchItr = new ColumnarBatchInIterator(backendName, emptyIter); |
| 77 | + |
| 78 | + boolean ok = nativeTestIteratorDestructorKeepsThreadAttached(runtimeHandle, batchItr); |
| 79 | + result.set(ok); |
| 80 | + } catch (Throwable t) { |
| 81 | + thrown.set(t); |
| 82 | + } |
| 83 | + return null; |
| 84 | + }); |
| 85 | + |
| 86 | + if (thrown.get() != null) { |
| 87 | + Assert.fail( |
| 88 | + "Test setup failed (exception in TaskResources scope): " + thrown.get().getMessage()); |
| 89 | + } |
| 90 | + Assert.assertTrue( |
| 91 | + "JNI call on native thread failed after JniColumnarBatchIterator destructor.", |
| 92 | + result.get()); |
| 93 | + } |
| 94 | +} |
0 commit comments