Bug#1031804: nvtop: Crashes on multi-gpu system
Jesse Rhodes
jesse at sney.ca
Thu Feb 23 01:28:52 GMT 2023
Package: nvtop
Version: 3.0.1-1
Severity: important
X-Debbugs-Cc: jesse at sney.ca
Dear Maintainer,
This system has an integrated AMD GPU (Ryzen 7 7700X/"Raphael") and a discrete NVIDIA GPU (GeForce GTX 1660 Super), configured for PRIME offloading with the proprietary nvidia driver, and with the monitors connected to the integrated GPU.
I tried to run nvtop to verify that the Geforce was being used for a jitsi video conference, and it immediately crashed with the following output:
nvtop: ./src/extract_gpuinfo_amdgpu.c:946: parse_drm_fdinfo_amd: Assertion `!cache_entry_check && "We should not be processing a client id twice per update"' failed.
Aborted (core dumped)
I also got a backtrace (attached).
The message "processing a client id twice" suggests that nvtop may not be expecting to see statistics from two separate video devices, though the man page at least implies that it should support that.
Including the PRIME environment variables '__NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia' when running nvtop makes no difference.
This is not an Optimus laptop but rather a desktop that I'm using the same way. I would be interested to know if actual Optimus devices have the same issue, and if not, what the difference is.
Please let me know if you need any more information, and thanks for your work!
sney
-- System Information:
Debian Release: bookworm/sid
APT prefers testing
APT policy: (990, 'testing'), (500, 'testing-debug'), (500, 'unstable'), (1, 'experimental')
Architecture: amd64 (x86_64)
Foreign Architectures: i386
Kernel: Linux 6.1.0-3-amd64 (SMP w/16 CPU threads; PREEMPT)
Kernel taint flags: TAINT_PROPRIETARY_MODULE, TAINT_OOT_MODULE, TAINT_UNSIGNED_MODULE
Locale: LANG=en_CA.UTF-8, LC_CTYPE=en_CA.UTF-8 (charmap=UTF-8), LANGUAGE not set
Shell: /bin/sh linked to /usr/bin/dash
Init: systemd (via /run/systemd/system)
LSM: AppArmor: enabled
Versions of packages nvtop depends on:
ii libc6 2.36-8
ii libncursesw6 6.4-2
ii libsystemd0 252.5-2
ii libtinfo6 6.4-2
nvtop recommends no packages.
nvtop suggests no packages.
-- no debconf information
-------------- next part --------------
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
nvtop: ./src/extract_gpuinfo_amdgpu.c:946: parse_drm_fdinfo_amd: Assertion `!cache_entry_check && "We should not be processing a client id twice per update"' failed.
Program received signal SIGABRT, Aborted.
__pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
44 ./nptl/pthread_kill.c: No such file or directory.
#0 __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
#1 0x00007ffff7c2ed2f in __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
#2 0x00007ffff7bdfef2 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#3 0x00007ffff7bca472 in __GI_abort () at ./stdlib/abort.c:79
#4 0x00007ffff7bca395 in __assert_fail_base (fmt=0x7ffff7d3ea70 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55555556d180 "!cache_entry_check && \"We should not be processing a client id twice per update\"", file=0x55555556d138 "./src/extract_gpuinfo_amdgpu.c", line=946, function=<optimized out>) at ./assert/assert.c:92
#5 0x00007ffff7bd8df2 in __GI___assert_fail (assertion=assertion@entry=0x55555556d180 "!cache_entry_check && \"We should not be processing a client id twice per update\"", file=file@entry=0x55555556d138 "./src/extract_gpuinfo_amdgpu.c", line=line@entry=946, function=function@entry=0x55555556d410 <__PRETTY_FUNCTION__.2> "parse_drm_fdinfo_amd") at ./assert/assert.c:101
#6 0x0000555555567bce in parse_drm_fdinfo_amd (info=0x55555558a2a0, fdinfo_file=0x55555558ceb0, process_info=0x7fffffffd960) at ./src/extract_gpuinfo_amdgpu.c:946
#7 0x000055555556456b in processinfo_sweep_fdinfos () at ./src/extract_processinfo_fdinfo.c:178
#8 0x000055555556301e in gpuinfo_refresh_processes (devices=devices@entry=0x7fffffffdbb0) at ./src/extract_gpuinfo.c:247
#9 0x0000555555558fba in main (argc=<optimized out>, argv=<optimized out>) at ./src/nvtop.c:265
#0 __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
tid = <optimized out>
ret = 0
pd = <optimized out>
old_mask = {__val = {93824992365256}}
ret = <optimized out>
#1 0x00007ffff7c2ed2f in __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
No locals.
#2 0x00007ffff7bdfef2 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
ret = <optimized out>
#3 0x00007ffff7bca472 in __GI_abort () at ./stdlib/abort.c:79
save_stage = 1
act = {__sigaction_handler = {sa_handler = 0x20, sa_sigaction = 0x20}, sa_mask = {__val = {946, 93824993866816, 7, 140737351478368, 93824993866992, 55834574848, 0, 140737488345000, 6025323250437744384, 5, 18446744073709551344, 13, 93824992334136, 946, 93824992334208, 93824992365256}}, sa_flags = -138162897, sa_restorer = 0x7ffff7d3b47c <_nl_C_name>}
#4 0x00007ffff7bca395 in __assert_fail_base (fmt=0x7ffff7d3ea70 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55555556d180 "!cache_entry_check && \"We should not be processing a client id twice per update\"", file=0x55555556d138 "./src/extract_gpuinfo_amdgpu.c", line=946, function=<optimized out>) at ./assert/assert.c:92
str = 0x5555556e3440 "#\264\r"
total = 4096
#5 0x00007ffff7bd8df2 in __GI___assert_fail (assertion=assertion@entry=0x55555556d180 "!cache_entry_check && \"We should not be processing a client id twice per update\"", file=file@entry=0x55555556d138 "./src/extract_gpuinfo_amdgpu.c", line=line@entry=946, function=function@entry=0x55555556d410 <__PRETTY_FUNCTION__.2> "parse_drm_fdinfo_amd") at ./assert/assert.c:101
No locals.
#6 0x0000555555567bce in parse_drm_fdinfo_amd (info=0x55555558a2a0, fdinfo_file=0x55555558ceb0, process_info=0x7fffffffd960) at ./src/extract_gpuinfo_amdgpu.c:946
cache_entry = 0x5555556e3350
ucid = <optimized out>
cache_entry_check = <optimized out>
gpu_info = 0x55555558a2a0
line = 0x5555556d29c0 "drm-memory-cpu"
line_buf_size = 120
count = <optimized out>
client_id_set = true
cid = 1207423
current_time = {tv_sec = 31137, tv_nsec = 88068515}
parse_fdinfo_exit = <optimized out>
__PRETTY_FUNCTION__ = "parse_drm_fdinfo_amd"
__ptr = <optimized out>
#7 0x000055555556456b in processinfo_sweep_fdinfos () at ./src/extract_processinfo_fdinfo.c:178
callback_idx = 0
processes_info_local = {type = gpu_process_unknown, pid = 888435, cmdline = 0x0, user_name = 0x0, gfx_engine_used = 0, compute_engine_used = 0, enc_engine_used = 0, dec_engine_used = 0, gpu_usage = 0, encode_usage = 0, decode_usage = 0, gpu_memory_usage = 4096, gpu_memory_percentage = 0, cpu_usage = 0, cpu_memory_virt = 0, cpu_memory_res = 0, valid = "\000\002"}
fdinfo_fd = <optimized out>
fdinfo_file = 0x55555558ceb0
process_info = <optimized out>
fd_num = <optimized out>
callback_success = false
current_callback = 0x55555558ca40
process_index = <optimized out>
pid_dir_fd = <optimized out>
fd_dir_fd = <optimized out>
fdinfo_dir_fd = <optimized out>
fdinfo_dir = <optimized out>
seen_fds_len = <optimized out>
fdinfo_dent = <optimized out>
client_pid = 888435
proc_dir = 0x5555556d2dc0
seen_fds_capacity = 8
seen_fds = 0x55555558bad0
proc_dent = <optimized out>
next_fd = <optimized out>
#8 0x000055555556301e in gpuinfo_refresh_processes (devices=devices@entry=0x7fffffffdbb0) at ./src/extract_gpuinfo.c:247
device = <optimized out>
#9 0x0000555555558fba in main (argc=<optimized out>, argv=<optimized out>) at ./src/nvtop.c:265
time_before_sleep = {tv_sec = 0, tv_nsec = 0}
time_after_sleep = {tv_sec = 0, tv_nsec = 0}
input_char = <optimized out>
update_interval_option_set = <optimized out>
update_interval_option = -134229984
no_color_option = false
use_fahrenheit_option = false
hide_plot_option = false
reverse_plot_direction_option = false
encode_decode_timer_option_set = false
encode_decode_hide_time = -1
custom_config_file_path = 0x0
siga = {__sigaction_handler = {sa_handler = 0x555555559230 <resize_handler>, sa_sigaction = 0x555555559230 <resize_handler>}, sa_mask = {__val = {0 <repeats 16 times>}}, sa_flags = 0, sa_restorer = 0x0}
allDevCount = 2
monitoredGpus = {next = 0x55555558a2a0, prev = 0x55555558b930}
nonMonitoredGpus = {next = 0x7fffffffdbc0, prev = 0x7fffffffdbc0}
numWarningMessages = 0
warningMessages = 0x5555555749b0 <message_array>
allDevicesOptions = {plot_left_to_right = false, temperature_in_fahrenheit = false, use_color = true, encode_decode_hiding_timer = 30, gpu_specific_opts = 0x555555588f70, config_file_location = 0x5555555893f0 "/home/jesse/.config/nvtop/interface.ini", sort_processes_by = process_memory, sort_descending_order = true, update_interval = 1000, process_fields_displayed = 1951, show_startup_messages = true, filter_nvtop_pid = true, has_monitored_set_changed = false}
numMonitoredGpus = 2
interface = 0x555555576630
time_slept = 1000
Thread 1 (Thread 0x7ffff7915fc0 (LWP 896980) "nvtop"):
#0 __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
tid = <optimized out>
ret = 0
pd = <optimized out>
old_mask = {__val = {93824992365256}}
ret = <optimized out>
#1 0x00007ffff7c2ed2f in __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
No locals.
#2 0x00007ffff7bdfef2 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
ret = <optimized out>
#3 0x00007ffff7bca472 in __GI_abort () at ./stdlib/abort.c:79
save_stage = 1
act = {__sigaction_handler = {sa_handler = 0x20, sa_sigaction = 0x20}, sa_mask = {__val = {946, 93824993866816, 7, 140737351478368, 93824993866992, 55834574848, 0, 140737488345000, 6025323250437744384, 5, 18446744073709551344, 13, 93824992334136, 946, 93824992334208, 93824992365256}}, sa_flags = -138162897, sa_restorer = 0x7ffff7d3b47c <_nl_C_name>}
#4 0x00007ffff7bca395 in __assert_fail_base (fmt=0x7ffff7d3ea70 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55555556d180 "!cache_entry_check && \"We should not be processing a client id twice per update\"", file=0x55555556d138 "./src/extract_gpuinfo_amdgpu.c", line=946, function=<optimized out>) at ./assert/assert.c:92
str = 0x5555556e3440 "#\264\r"
total = 4096
#5 0x00007ffff7bd8df2 in __GI___assert_fail (assertion=assertion@entry=0x55555556d180 "!cache_entry_check && \"We should not be processing a client id twice per update\"", file=file@entry=0x55555556d138 "./src/extract_gpuinfo_amdgpu.c", line=line@entry=946, function=function@entry=0x55555556d410 <__PRETTY_FUNCTION__.2> "parse_drm_fdinfo_amd") at ./assert/assert.c:101
No locals.
#6 0x0000555555567bce in parse_drm_fdinfo_amd (info=0x55555558a2a0, fdinfo_file=0x55555558ceb0, process_info=0x7fffffffd960) at ./src/extract_gpuinfo_amdgpu.c:946
cache_entry = 0x5555556e3350
ucid = <optimized out>
cache_entry_check = <optimized out>
gpu_info = 0x55555558a2a0
line = 0x5555556d29c0 "drm-memory-cpu"
line_buf_size = 120
count = <optimized out>
client_id_set = true
cid = 1207423
current_time = {tv_sec = 31137, tv_nsec = 88068515}
parse_fdinfo_exit = <optimized out>
__PRETTY_FUNCTION__ = "parse_drm_fdinfo_amd"
__ptr = <optimized out>
#7 0x000055555556456b in processinfo_sweep_fdinfos () at ./src/extract_processinfo_fdinfo.c:178
callback_idx = 0
processes_info_local = {type = gpu_process_unknown, pid = 888435, cmdline = 0x0, user_name = 0x0, gfx_engine_used = 0, compute_engine_used = 0, enc_engine_used = 0, dec_engine_used = 0, gpu_usage = 0, encode_usage = 0, decode_usage = 0, gpu_memory_usage = 4096, gpu_memory_percentage = 0, cpu_usage = 0, cpu_memory_virt = 0, cpu_memory_res = 0, valid = "\000\002"}
fdinfo_fd = <optimized out>
fdinfo_file = 0x55555558ceb0
process_info = <optimized out>
fd_num = <optimized out>
callback_success = false
current_callback = 0x55555558ca40
process_index = <optimized out>
pid_dir_fd = <optimized out>
fd_dir_fd = <optimized out>
fdinfo_dir_fd = <optimized out>
fdinfo_dir = <optimized out>
seen_fds_len = <optimized out>
fdinfo_dent = <optimized out>
client_pid = 888435
proc_dir = 0x5555556d2dc0
seen_fds_capacity = 8
seen_fds = 0x55555558bad0
proc_dent = <optimized out>
next_fd = <optimized out>
#8 0x000055555556301e in gpuinfo_refresh_processes (devices=devices@entry=0x7fffffffdbb0) at ./src/extract_gpuinfo.c:247
device = <optimized out>
#9 0x0000555555558fba in main (argc=<optimized out>, argv=<optimized out>) at ./src/nvtop.c:265
time_before_sleep = {tv_sec = 0, tv_nsec = 0}
time_after_sleep = {tv_sec = 0, tv_nsec = 0}
input_char = <optimized out>
update_interval_option_set = <optimized out>
update_interval_option = -134229984
no_color_option = false
use_fahrenheit_option = false
hide_plot_option = false
reverse_plot_direction_option = false
encode_decode_timer_option_set = false
encode_decode_hide_time = -1
custom_config_file_path = 0x0
siga = {__sigaction_handler = {sa_handler = 0x555555559230 <resize_handler>, sa_sigaction = 0x555555559230 <resize_handler>}, sa_mask = {__val = {0 <repeats 16 times>}}, sa_flags = 0, sa_restorer = 0x0}
allDevCount = 2
monitoredGpus = {next = 0x55555558a2a0, prev = 0x55555558b930}
nonMonitoredGpus = {next = 0x7fffffffdbc0, prev = 0x7fffffffdbc0}
numWarningMessages = 0
warningMessages = 0x5555555749b0 <message_array>
allDevicesOptions = {plot_left_to_right = false, temperature_in_fahrenheit = false, use_color = true, encode_decode_hiding_timer = 30, gpu_specific_opts = 0x555555588f70, config_file_location = 0x5555555893f0 "/home/jesse/.config/nvtop/interface.ini", sort_processes_by = process_memory, sort_descending_order = true, update_interval = 1000, process_fields_displayed = 1951, show_startup_messages = true, filter_nvtop_pid = true, has_monitored_set_changed = false}
numMonitoredGpus = 2
interface = 0x555555576630
time_slept = 1000
More information about the pkg-nvidia-devel
mailing list