From 612140e7ddc39eee8816ac397bf2f8302a174fc4 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 7 May 2026 10:37:00 +0200 Subject: [PATCH 1/3] gh-149202: Fix frame pointer unwinding on ppc64le and armv7/clang (#149409) - ppc64's backchain format is also different from x86 - On 32-bit ARM, clang needs `-mno-thumb`, not `-marm` like GCC --- Doc/using/configure.rst | 5 ++--- Modules/_testinternalcapi.c | 6 +++++ configure | 45 ++++++++++++++++++++++++++++++++++++- configure.ac | 4 ++++ 4 files changed, 56 insertions(+), 4 deletions(-) diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index 62c53c283825c8..a0e46ff4e375b6 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -789,14 +789,13 @@ also be used to improve performance. - ``-fno-omit-frame-pointer`` and/or ``-mno-omit-leaf-frame-pointer`` are added when the compiler supports them. - - ``-marm`` is added on 32-bit ARM when supported, + - ``-marm`` and/or ``-mno-thumb`` is added on 32-bit ARM when supported, - on s390x platforms, when supported, ``-mbackchain`` is added *instead*. of the above frame pointer flags. Frame pointers enable profilers, debuggers, and system tracing tools (``perf``, ``eBPF``, ``dtrace``, ``gdb``) to walk the C call stack - without DWARF metadata. - The flags propagate to third-party C + without DWARF metadata. The flags propagate to third-party C extensions through :mod:`sysconfig`. On compilers that do not understand them, the build silently skips them. 
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index c0a7680388e4a7..b8a22c439e853d 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -92,6 +92,12 @@ static const uintptr_t min_frame_pointer_addr = 0x1000; # define FRAME_POINTER_NEXT_OFFSET 0 # define FRAME_POINTER_RETURN_OFFSET \ (S390X_FRAME_RETURN_ADDRESS_OFFSET / (Py_ssize_t)sizeof(uintptr_t)) +#elif defined(__powerpc64__) || defined(__ppc64__) +// ppc64le puts the return address at fp[2]; it saves the Condition Register +// in fp[1]. See: +// https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#STACK +# define FRAME_POINTER_NEXT_OFFSET 0 +# define FRAME_POINTER_RETURN_OFFSET 2 #else # define FRAME_POINTER_NEXT_OFFSET 0 # define FRAME_POINTER_RETURN_OFFSET 1 diff --git a/configure b/configure index f970bf9b7ba3c7..3377bf7516ebd2 100755 --- a/configure +++ b/configure @@ -10346,7 +10346,7 @@ fi case $host_cpu in #( arm|armv*) : - { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -marm" >&5 + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -marm" >&5 printf %s "checking whether C compiler accepts -marm... " >&6; } if test ${ax_cv_check_cflags__Werror__marm+y} then : @@ -10384,6 +10384,49 @@ then : frame_pointer_cflags="$frame_pointer_cflags -marm" +else case e in #( + e) : ;; +esac +fi + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mno-thumb" >&5 +printf %s "checking whether C compiler accepts -mno-thumb... " >&6; } +if test ${ax_cv_check_cflags__Werror__mno_thumb+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -Werror -mno-thumb" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + ax_cv_check_cflags__Werror__mno_thumb=yes +else case e in #( + e) ax_cv_check_cflags__Werror__mno_thumb=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ax_check_save_flags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags__Werror__mno_thumb" >&5 +printf "%s\n" "$ax_cv_check_cflags__Werror__mno_thumb" >&6; } +if test "x$ax_cv_check_cflags__Werror__mno_thumb" = xyes +then : + + frame_pointer_cflags="$frame_pointer_cflags -mno-thumb" + else case e in #( e) : ;; esac diff --git a/configure.ac b/configure.ac index 9f91a10c2918cf..fc2db4d5e8aebb 100644 --- a/configure.ac +++ b/configure.ac @@ -2549,9 +2549,13 @@ AS_VAR_IF([ac_cv_gcc_compat], [yes], [ frame_pointer_cflags="$frame_pointer_cflags -mno-omit-leaf-frame-pointer" ], [], [-Werror]) AS_CASE([$host_cpu], [arm|armv*], [ + dnl GCC uses "-marm"; clang uses "-mno-thumb" AX_CHECK_COMPILE_FLAG([-marm], [ frame_pointer_cflags="$frame_pointer_cflags -marm" ], [], [-Werror]) + AX_CHECK_COMPILE_FLAG([-mno-thumb], [ + frame_pointer_cflags="$frame_pointer_cflags -mno-thumb" + ], [], [-Werror]) ]) AS_CASE([$host_cpu], [s390*], [ AX_CHECK_COMPILE_FLAG([-mbackchain], [ From b3cfd36ba4f17c19d587427b90ca9378ad009978 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 7 May 2026 11:47:08 +0200 Subject: [PATCH 2/3] gh-149202: Don't use -fno-omit-frame-pointer on ppc64le (#149485) The power ABI specification requires that compilers maintain a back chain by default, so unwinding already works without a dedicated frame pointer. Don't use -fno-omit-frame-pointer on ppc64le. 
--- Doc/using/configure.rst | 2 ++ Lib/test/test_frame_pointer_unwind.py | 6 ++++++ configure | 8 ++++++++ configure.ac | 3 +++ 4 files changed, 19 insertions(+) diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index a0e46ff4e375b6..77771bb70e1a30 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -792,6 +792,8 @@ also be used to improve performance. - ``-marm`` and/or ``-mno-thumb`` is added on 32-bit ARM when supported, - on s390x platforms, when supported, ``-mbackchain`` is added *instead*. of the above frame pointer flags. + - on ppc64le platforms, no compiler flags are needed since the power ABI + requires that compilers maintain a back chain by default. Frame pointers enable profilers, debuggers, and system tracing tools (``perf``, ``eBPF``, ``dtrace``, ``gdb``) to walk the C call stack diff --git a/Lib/test/test_frame_pointer_unwind.py b/Lib/test/test_frame_pointer_unwind.py index 5cd94e5b27f394..faa012c9c00d8f 100644 --- a/Lib/test/test_frame_pointer_unwind.py +++ b/Lib/test/test_frame_pointer_unwind.py @@ -56,6 +56,12 @@ def _frame_pointers_expected(machine): if sys.maxsize < 2**32: return None return True + if machine == "ppc64le": + # The power ABI specification requires that compilers maintain a + # back chain by default, so unwinding already works without a + # dedicated frame pointer. 
+ # https://openpowerfoundation.org/specifications/64bitelfabi/ + return True if machine == "x86_64": final_opt = "" for opt in cflags.split(): diff --git a/configure b/configure index 3377bf7516ebd2..cff7dfbfba8b9a 100755 --- a/configure +++ b/configure @@ -10435,6 +10435,14 @@ fi ;; #( *) : ;; +esac + case $host_cpu in #( + powerpc64le) : + + frame_pointer_cflags="" + ;; #( + *) : + ;; esac case $host_cpu in #( s390*) : diff --git a/configure.ac b/configure.ac index fc2db4d5e8aebb..ac3269ab765c0d 100644 --- a/configure.ac +++ b/configure.ac @@ -2557,6 +2557,9 @@ AS_VAR_IF([ac_cv_gcc_compat], [yes], [ frame_pointer_cflags="$frame_pointer_cflags -mno-thumb" ], [], [-Werror]) ]) + AS_CASE([$host_cpu], [powerpc64le], [ + frame_pointer_cflags="" + ]) AS_CASE([$host_cpu], [s390*], [ AX_CHECK_COMPILE_FLAG([-mbackchain], [ dnl Do not use no-omit-frame-pointer; see gh-149362 From 13188dbf85cde2e35a5dda09758fb4765ff7bc86 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 7 May 2026 04:32:14 -0700 Subject: [PATCH 3/3] gh-148937: revert process RSS based GC deferral (#149475) --- Doc/library/gc.rst | 5 - Include/internal/pycore_interp_structs.h | 10 - ...-05-06-15-57-28.gh-issue-148940.dRIXiY.rst | 6 + Python/gc_free_threading.c | 211 +----------------- 4 files changed, 7 insertions(+), 225 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-05-06-15-57-28.gh-issue-148940.dRIXiY.rst diff --git a/Doc/library/gc.rst b/Doc/library/gc.rst index 701af579453ce3..65533e7e57adc3 100644 --- a/Doc/library/gc.rst +++ b/Doc/library/gc.rst @@ -133,11 +133,6 @@ The :mod:`!gc` module provides the following functions: With the third generation, things are a bit more complicated, see `Collecting the oldest generation `_ for more information. - In the free-threaded build, the increase in process memory usage is also - checked before running the collector. 
If the memory usage has not increased - by 10% since the last collection and the net number of object allocations - has not exceeded 40 times *threshold0*, the collection is not run. - See `Garbage collector design `_ for more information. .. versionchanged:: 3.14 diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 02a10e87b7e15c..f13bc2178b1e7e 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -263,16 +263,6 @@ struct _gc_runtime_state { #ifdef Py_GIL_DISABLED /* True if gc.freeze() has been used. */ int freeze_active; - - /* Memory usage of the process (RSS + swap) after last GC. */ - Py_ssize_t last_mem; - - /* This accumulates the new object count whenever collection is deferred - due to the RSS increase condition not being meet. Reset on collection. */ - Py_ssize_t deferred_count; - - /* Mutex held for gc_should_collect_mem_usage(). */ - PyMutex mutex; #else PyGC_Head *generation0; #endif diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-05-06-15-57-28.gh-issue-148940.dRIXiY.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-06-15-57-28.gh-issue-148940.dRIXiY.rst new file mode 100644 index 00000000000000..9c5f1c77ca9da2 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-06-15-57-28.gh-issue-148940.dRIXiY.rst @@ -0,0 +1,6 @@ +Revert the process size based deferral of garbage collection (GH-133464). +The performance issue this change resolves is also fixed by GH-142562. This +approach has the problem that process size as seen by the OS (e.g. the +resident size or RSS) does not immediately decrease after cyclic garbage is +freed since mimalloc defers returning memory to the OS. This change applies +to the free-threaded GC only. 
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index b4fcd365592aa5..4e36189580bbf8 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -17,30 +17,6 @@ #include "pydtrace.h" -// Platform-specific includes for get_process_mem_usage(). -#ifdef _WIN32 - #include - #include // For GetProcessMemoryInfo -#elif defined(__linux__) - #include // For sysconf, getpid -#elif defined(__APPLE__) - #include - #include // Required for TASK_VM_INFO - #include // For sysconf, getpid -#elif defined(__FreeBSD__) - #include - #include - #include // Requires sys/user.h for kinfo_proc definition - #include - #include // For sysconf, getpid - #include // For O_RDONLY - #include // For _POSIX2_LINE_MAX -#elif defined(__OpenBSD__) - #include - #include - #include // For kinfo_proc - #include // For sysconf, getpid -#endif // enable the "mark alive" pass of GC #define GC_ENABLE_MARK_ALIVE 1 @@ -2016,185 +1992,6 @@ cleanup_worklist(struct worklist *worklist) } } -// Return the memory usage (typically RSS + swap) of the process, in units of -// KB. Returns -1 if this operation is not supported or on failure. -static Py_ssize_t -get_process_mem_usage(void) -{ -#ifdef _WIN32 - // Windows implementation using GetProcessMemoryInfo - // Returns WorkingSetSize + PagefileUsage - PROCESS_MEMORY_COUNTERS pmc; - HANDLE hProcess = GetCurrentProcess(); - if (NULL == hProcess) { - // Should not happen for the current process - return -1; - } - - // GetProcessMemoryInfo returns non-zero on success - if (GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc))) { - // Values are in bytes, convert to KB. 
- return (Py_ssize_t)((pmc.WorkingSetSize + pmc.PagefileUsage) / 1024); - } - else { - return -1; - } - -#elif __linux__ - FILE* fp = fopen("/proc/self/status", "r"); - if (fp == NULL) { - return -1; - } - - char line_buffer[256]; - long long rss_kb = -1; - long long swap_kb = -1; - - while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) { - if (rss_kb == -1 && strncmp(line_buffer, "VmRSS:", 6) == 0) { - sscanf(line_buffer + 6, "%lld", &rss_kb); - } - else if (swap_kb == -1 && strncmp(line_buffer, "VmSwap:", 7) == 0) { - sscanf(line_buffer + 7, "%lld", &swap_kb); - } - if (rss_kb != -1 && swap_kb != -1) { - break; // Found both - } - } - fclose(fp); - - if (rss_kb != -1 && swap_kb != -1) { - return (Py_ssize_t)(rss_kb + swap_kb); - } - return -1; - -#elif defined(__APPLE__) - // --- MacOS (Darwin) --- - // Returns phys_footprint (RAM + compressed memory) - task_vm_info_data_t vm_info; - mach_msg_type_number_t count = TASK_VM_INFO_COUNT; - kern_return_t kerr; - - kerr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); - if (kerr != KERN_SUCCESS) { - return -1; - } - // phys_footprint is in bytes. Convert to KB. - return (Py_ssize_t)(vm_info.phys_footprint / 1024); - -#elif defined(__FreeBSD__) - // NOTE: Returns RSS only. Per-process swap usage isn't readily available - long page_size_kb = sysconf(_SC_PAGESIZE) / 1024; - if (page_size_kb <= 0) { - return -1; - } - - // Using /dev/null for vmcore avoids needing dump file. - // NULL for kernel file uses running kernel. 
- char errbuf[_POSIX2_LINE_MAX]; // For kvm error messages - kvm_t *kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf); - if (kd == NULL) { - return -1; - } - - // KERN_PROC_PID filters for the specific process ID - // n_procs will contain the number of processes returned (should be 1 or 0) - pid_t pid = getpid(); - int n_procs; - struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &n_procs); - if (kp == NULL) { - kvm_close(kd); - return -1; - } - - Py_ssize_t rss_kb = -1; - if (n_procs > 0) { - // kp[0] contains the info for our process - // ki_rssize is in pages. Convert to KB. - rss_kb = (Py_ssize_t)kp->ki_rssize * page_size_kb; - } - else { - // Process with PID not found, shouldn't happen for self. - rss_kb = -1; - } - - kvm_close(kd); - return rss_kb; - -#elif defined(__OpenBSD__) - // NOTE: Returns RSS only. Per-process swap usage isn't readily available - long page_size_kb = sysconf(_SC_PAGESIZE) / 1024; - if (page_size_kb <= 0) { - return -1; - } - - struct kinfo_proc kp; - pid_t pid = getpid(); - int mib[6]; - size_t len = sizeof(kp); - - mib[0] = CTL_KERN; - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PID; - mib[3] = pid; - mib[4] = sizeof(struct kinfo_proc); // size of the structure we want - mib[5] = 1; // want 1 structure back - if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1) { - return -1; - } - - if (len > 0) { - // p_vm_rssize is in pages on OpenBSD. Convert to KB. - return (Py_ssize_t)kp.p_vm_rssize * page_size_kb; - } - else { - // Process info not returned - return -1; - } -#else - // Unsupported platform - return -1; -#endif -} - -static bool -gc_should_collect_mem_usage(GCState *gcstate) -{ - Py_ssize_t mem = get_process_mem_usage(); - if (mem < 0) { - // Reading process memory usage is not support or failed. 
- return true; - } - int threshold = gcstate->young.threshold; - Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count); - if (deferred > threshold * 40) { - // Too many new container objects since last GC, even though memory use - // might not have increased much. This is intended to avoid resource - // exhaustion if some objects consume resources but don't result in a - // memory usage increase. We use 40x as the factor here because older - // versions of Python would do full collections after roughly every - // 70,000 new container objects. - return true; - } - Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem); - Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128); - if ((mem - last_mem) > mem_threshold) { - // The process memory usage has increased too much, do a collection. - return true; - } - else { - // The memory usage has not increased enough, defer the collection and - // clear the young object count so we don't check memory usage again - // on the next call to gc_should_collect(). - PyMutex_Lock(&gcstate->mutex); - int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0); - _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count, - gcstate->deferred_count + young_count); - PyMutex_Unlock(&gcstate->mutex); - return false; - } -} - static bool gc_should_collect(GCState *gcstate) { @@ -2214,7 +2011,7 @@ gc_should_collect(GCState *gcstate) // objects. return false; } - return gc_should_collect_mem_usage(gcstate); + return true; } static void @@ -2275,7 +2072,6 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, } state->gcstate->young.count = 0; - state->gcstate->deferred_count = 0; for (int i = 1; i <= generation; ++i) { state->gcstate->old[i-1].count = 0; } @@ -2379,11 +2175,6 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, // to be freed. 
delete_garbage(state); - // Store the current memory usage, can be smaller now if breaking cycles - // freed some memory. - Py_ssize_t last_mem = get_process_mem_usage(); - _Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem); - // Append objects with legacy finalizers to the "gc.garbage" list. handle_legacy_finalizers(state); }