Skip to content

Commit 99e6175

Browse files
committed
Add backtrace to crash logs
Send crash information from the signal handler to traffic_crashlog. This also adds a 30-second timeout around backtrace collection to prevent the crashlog helper from hanging indefinitely if ptrace/waitpid blocks.
1 parent fcd0827 commit 99e6175

8 files changed

Lines changed: 355 additions & 30 deletions

File tree

src/traffic_crashlog/backtrace.cc

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -105,57 +105,70 @@ backtrace_for_thread(pid_t threadid, TextBuffer &text)
105105
void *ap = nullptr;
106106
pid_t target = -1;
107107
unsigned level = 0;
108+
int step_result;
108109

109110
// First, attach to the child, causing it to stop.
110111
status = ptrace(PTRACE_ATTACH, threadid, 0, 0);
111112
if (status < 0) {
113+
text.format(" [ptrace ATTACH failed: %s (%d)]\n", strerror(errno), errno);
112114
Dbg(dbg_ctl_backtrace, "ptrace(ATTACH, %ld) -> %s (%d)\n", (long)threadid, strerror(errno), errno);
113115
return;
114116
}
115117

116-
// Wait for it to stop (XXX should be a timed wait ...)
118+
// Wait for it to stop. The caller uses alarm() to enforce a timeout.
117119
target = waitpid(threadid, &status, __WALL | WUNTRACED);
118120
Dbg(dbg_ctl_backtrace, "waited for target %ld, found PID %ld, %s\n", (long)threadid, (long)target,
119121
WIFSTOPPED(status) ? "STOPPED" : "???");
120122
if (target < 0) {
123+
text.format(" [waitpid failed: %s (%d)]\n", strerror(errno), errno);
121124
goto done;
122125
}
123126

124127
ap = _UPT_create(threadid);
125128
Dbg(dbg_ctl_backtrace, "created UPT %p", ap);
126129
if (ap == nullptr) {
130+
text.format(" [_UPT_create failed]\n");
127131
goto done;
128132
}
129133

130134
addr_space = unw_create_addr_space(&_UPT_accessors, 0 /* byteorder */);
131135
Dbg(dbg_ctl_backtrace, "created address space %p\n", addr_space);
132136
if (addr_space == nullptr) {
137+
text.format(" [unw_create_addr_space failed]\n");
133138
goto done;
134139
}
135140

136141
status = unw_init_remote(&cursor, addr_space, ap);
137142
Dbg(dbg_ctl_backtrace, "unw_init_remote(...) -> %d\n", status);
138143
if (status != 0) {
144+
text.format(" [unw_init_remote failed: %d]\n", status);
139145
goto done;
140146
}
141147

142-
while (unw_step(&cursor) > 0) {
148+
step_result = unw_step(&cursor);
149+
if (step_result <= 0) {
150+
text.format(" [unw_step returned %d on first call]\n", step_result);
151+
}
152+
153+
while (step_result > 0) {
143154
unw_word_t ip;
144-
unw_word_t offset;
155+
unw_word_t offset = 0;
145156
char buf[256];
146157

147158
unw_get_reg(&cursor, UNW_REG_IP, &ip);
148159

149160
if (unw_get_proc_name(&cursor, buf, sizeof(buf), &offset) == 0) {
150-
int status;
151-
char *name = abi::__cxa_demangle(buf, nullptr, nullptr, &status);
152-
text.format("%-4u 0x%016llx %s + %p\n", level, static_cast<unsigned long long>(ip), name ? name : buf, (void *)offset);
161+
int demangle_status;
162+
char *name = abi::__cxa_demangle(buf, nullptr, nullptr, &demangle_status);
163+
text.format("%-4u 0x%016llx %s + 0x%lx\n", level, static_cast<unsigned long long>(ip), name ? name : buf,
164+
static_cast<unsigned long>(offset));
153165
free(name);
154166
} else {
155-
text.format("%-4u 0x%016llx 0x0 + %p\n", level, static_cast<unsigned long long>(ip), (void *)offset);
167+
text.format("%-4u 0x%016llx <unknown>\n", level, static_cast<unsigned long long>(ip));
156168
}
157169

158170
++level;
171+
step_result = unw_step(&cursor);
159172
}
160173

161174
done:
@@ -167,8 +180,10 @@ backtrace_for_thread(pid_t threadid, TextBuffer &text)
167180
_UPT_destroy(ap);
168181
}
169182

170-
status = ptrace(PTRACE_DETACH, target, NULL, DATA_NULL);
171-
Dbg(dbg_ctl_backtrace, "ptrace(DETACH, %ld) -> %d (errno %d)\n", (long)target, status, errno);
183+
if (target > 0) {
184+
status = ptrace(PTRACE_DETACH, target, NULL, DATA_NULL);
185+
Dbg(dbg_ctl_backtrace, "ptrace(DETACH, %ld) -> %d (errno %d)\n", (long)target, status, errno);
186+
}
172187
}
173188
} // namespace
174189
int

src/traffic_crashlog/traffic_crashlog.cc

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,24 @@
3232
#include "tscore/BaseLogFile.h"
3333
#include "tscore/runroot.h"
3434
#include "iocore/eventsystem/RecProcess.h"
35+
#include <csignal>
3536
#include <unistd.h>
3637

38+
namespace
39+
{
40+
// Timeout in seconds for backtrace collection. If ptrace/waitpid hangs, this
41+
// prevents the crashlog helper from blocking indefinitely.
42+
constexpr unsigned BACKTRACE_TIMEOUT_SECS = 10;
43+
44+
volatile sig_atomic_t backtrace_timed_out = 0;
45+
46+
void
47+
backtrace_alarm_handler(int /* sig */)
48+
{
49+
backtrace_timed_out = 1;
50+
}
51+
} // namespace
52+
3753
static int syslog_mode = false;
3854
static int debug_mode = false;
3955
static int wait_mode = false;
@@ -94,29 +110,58 @@ crashlog_open(const char *path)
94110
extern int ServerBacktrace(unsigned /* options */, int pid, char **trace);
95111

96112
bool
97-
crashlog_write_backtrace(FILE *fp, pid_t pid, const crashlog_target &)
113+
crashlog_write_backtrace(FILE *fp, pid_t pid, const crashlog_target &target)
98114
{
99-
char *trace = nullptr;
100-
int mgmterr;
115+
char *trace = nullptr;
116+
int mgmterr = -1;
117+
118+
// Set up a timeout to prevent indefinite hangs in ptrace/waitpid.
119+
backtrace_timed_out = 0;
120+
struct sigaction new_action;
121+
struct sigaction old_action;
122+
new_action.sa_handler = backtrace_alarm_handler;
123+
sigemptyset(&new_action.sa_mask);
124+
new_action.sa_flags = 0;
125+
sigaction(SIGALRM, &new_action, &old_action);
126+
alarm(BACKTRACE_TIMEOUT_SECS);
127+
128+
mgmterr = ServerBacktrace(0, static_cast<int>(pid), &trace);
129+
130+
// Cancel the alarm and restore the old handler.
131+
alarm(0);
132+
sigaction(SIGALRM, &old_action, nullptr);
133+
134+
if (backtrace_timed_out) {
135+
fprintf(fp, "Backtrace collection timed out after %u seconds\n", BACKTRACE_TIMEOUT_SECS);
136+
free(trace);
137+
return false;
138+
}
101139

102140
// NOTE: sometimes we can't get a backtrace because the ptrace attach will fail with
103141
// EPERM. I've seen this happen when a debugger is attached, which makes sense, but it
104142
// can also happen without a debugger. Possibly in that case, there is a race with the
105143
// kernel locking the process information?
106144

107-
if ((mgmterr = ServerBacktrace(0, static_cast<int>(pid), &trace)) != 0) {
108-
fprintf(fp, "Unable to retrieve backtrace: %d\n", mgmterr);
109-
return false;
145+
if (mgmterr == 0 && trace != nullptr) {
146+
// ServerBacktrace succeeded - this gives us all threads' backtraces.
147+
fprintf(fp, "%s", trace);
148+
free(trace);
149+
return true;
110150
}
111151

112-
if (trace == nullptr) {
113-
fprintf(fp, "Unable to retrieve backtrace: trace is null\n");
114-
return false;
152+
// ServerBacktrace failed. Fall back to the in-process backtrace from the crashing thread.
153+
if ((target.flags & CRASHLOG_HAVE_BACKTRACE) && !target.backtrace.empty()) {
154+
fprintf(fp, "Crashing Thread Backtrace:\n%s", target.backtrace.c_str());
155+
return true;
115156
}
116157

117-
fprintf(fp, "%s", trace);
118-
free(trace);
119-
return true;
158+
// No backtrace available from either source.
159+
if (mgmterr != 0) {
160+
fprintf(fp, "Unable to retrieve backtrace: ServerBacktrace returned %d\n", mgmterr);
161+
} else {
162+
fprintf(fp, "Unable to retrieve backtrace: no backtrace data available\n");
163+
}
164+
return false;
120165
}
121166

122167
void
@@ -200,11 +245,12 @@ main(int /* argc ATS_UNUSED */, const char **argv)
200245
Note("crashlog started, target=%ld, debug=%s syslog=%s, uid=%ld euid=%ld", static_cast<long>(target_pid),
201246
debug_mode ? "true" : "false", syslog_mode ? "true" : "false", (long)getuid(), (long)geteuid());
202247

203-
ink_zero(target);
204248
target.pid = static_cast<pid_t>(target_pid);
205249
target.timestamp = timestamp();
206250

207-
if (host_triplet && strncmp(host_triplet, "x86_64-unknown-linux", sizeof("x86_64-unknown-linux") - 1) == 0) {
251+
// Read crash context on Linux platforms. The siginfo_t and ucontext_t
252+
// structures are platform-specific but defined for all Linux architectures.
253+
if (host_triplet && (strstr(host_triplet, "linux") != nullptr || strstr(host_triplet, "Linux") != nullptr)) {
208254
ssize_t nbytes;
209255
target.flags |= CRASHLOG_HAVE_THREADINFO;
210256

@@ -219,6 +265,21 @@ main(int /* argc ATS_UNUSED */, const char **argv)
219265
Warning("received %zd of %zu expected thread context bytes", nbytes, sizeof(target.ucontext));
220266
target.flags &= ~CRASHLOG_HAVE_THREADINFO;
221267
}
268+
269+
// Read the in-process backtrace from the crashing thread.
270+
uint32_t bt_len = 0;
271+
nbytes = read(STDIN_FILENO, &bt_len, sizeof(bt_len));
272+
if (nbytes == static_cast<ssize_t>(sizeof(bt_len)) && bt_len > 0 && bt_len < 1024 * 1024) {
273+
target.backtrace.resize(bt_len);
274+
nbytes = read(STDIN_FILENO, target.backtrace.data(), bt_len);
275+
if (nbytes == static_cast<ssize_t>(bt_len)) {
276+
target.flags |= CRASHLOG_HAVE_BACKTRACE;
277+
Note("received %u bytes of in-process backtrace", bt_len);
278+
} else {
279+
Warning("received %zd of %u expected backtrace bytes", nbytes, bt_len);
280+
target.backtrace.clear();
281+
}
282+
}
222283
}
223284

224285
logname = crashlog_name();

src/traffic_crashlog/traffic_crashlog.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include "tscore/Diags.h"
2929
#include "tscore/TextBuffer.h"
3030

31+
#include <string>
32+
3133
// ucontext.h is deprecated on Darwin, and we really only need it on Linux, so only
3234
// include it if we are planning to use it.
3335
#if defined(__linux__)
@@ -49,20 +51,25 @@
4951
#endif
5052

5153
#define CRASHLOG_HAVE_THREADINFO 0x1u
54+
#define CRASHLOG_HAVE_BACKTRACE 0x2u
5255

5356
struct crashlog_target {
54-
pid_t pid;
55-
siginfo_t siginfo;
57+
pid_t pid{0};
58+
siginfo_t siginfo{};
5659
#if defined(__linux__)
57-
ucontext_t ucontext;
60+
ucontext_t ucontext{};
5861
#else
59-
char ucontext; // just a placeholder ...
62+
char ucontext{}; // just a placeholder ...
6063
#endif
61-
struct tm timestamp;
62-
unsigned flags;
64+
struct tm timestamp {
65+
};
66+
unsigned flags{0};
67+
68+
// In-process backtrace from the crashing thread.
69+
std::string backtrace;
6370
};
6471

65-
bool crashlog_write_backtrace(FILE *, const crashlog_target &);
72+
bool crashlog_write_backtrace(FILE *, pid_t /* parent pid */, const crashlog_target &);
6673
bool crashlog_write_datime(FILE *, const crashlog_target &);
6774
bool crashlog_write_exename(FILE *, const crashlog_target &);
6875
bool crashlog_write_proclimits(FILE *, const crashlog_target &);

src/traffic_server/Crash.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
#include "tscore/Version.h"
2929
#include "tscore/signals.h"
3030

31+
#include <string>
32+
#include <unistd.h>
33+
3134
// ucontext.h is deprecated on Darwin, and we really only need it on Linux, so only
3235
// include it if we are planning to use it.
3336
#if defined(__linux__)
@@ -167,6 +170,13 @@ crash_logger_invoke(int signo, siginfo_t *info, void *ctx)
167170
// a single memory block that we can just puke out.
168171
ATS_UNUSED_RETURN(write(crash_logger_fd, info, sizeof(siginfo_t)));
169172
ATS_UNUSED_RETURN(write(crash_logger_fd, static_cast<ucontext_t *>(ctx), sizeof(ucontext_t)));
173+
174+
// Send zero-length backtrace. We cannot safely generate a backtrace here
175+
// because backtrace() can acquire locks (e.g., in the dynamic linker) that
176+
// the crashing thread might be holding, causing a deadlock. The crash
177+
// logger will get the backtrace via ptrace from a separate process instead.
178+
uint32_t bt_len = 0;
179+
ATS_UNUSED_RETURN(write(crash_logger_fd, &bt_len, sizeof(bt_len)));
170180
#endif
171181

172182
close(crash_logger_fd);

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ add_subdirectory(tools/plugins)
3939
add_subdirectory(gold_tests/chunked_encoding)
4040
add_subdirectory(gold_tests/continuations/plugins)
4141
add_subdirectory(gold_tests/jsonrpc/plugins)
42+
add_subdirectory(gold_tests/pluginTest/crash_test)
4243
add_subdirectory(gold_tests/pluginTest/polite_hook_wait)
4344
add_subdirectory(gold_tests/pluginTest/tsapi)
4445
add_subdirectory(gold_tests/pluginTest/TSVConnFd)
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#######################
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one or more contributor license
4+
# agreements. See the NOTICE file distributed with this work for additional information regarding
5+
# copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software distributed under the License
12+
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13+
# or implied. See the License for the specific language governing permissions and limitations under
14+
# the License.
15+
#
16+
#######################
17+
18+
add_autest_plugin(crash_test crash_test.cc)

0 commit comments

Comments
 (0)