@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
377377 */
378378static int posix_cpu_timer_create (struct k_itimer * new_timer )
379379{
380+ static struct lock_class_key posix_cpu_timers_key ;
380381 struct pid * pid ;
381382
382383 rcu_read_lock ();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
386387 return - EINVAL ;
387388 }
388389
390+ /*
391+ * If posix timer expiry is handled in task work context then
392+ * timer::it_lock can be taken without disabling interrupts as all
393+ * other locking happens in task context. This requires a separate
394+ * lock class key otherwise regular posix timer expiry would record
395+ * the lock class being taken in interrupt context and generate a
396+ * false positive warning.
397+ */
398+ if (IS_ENABLED (CONFIG_POSIX_CPU_TIMERS_TASK_WORK ))
399+ lockdep_set_class (& new_timer -> it_lock , & posix_cpu_timers_key );
400+
389401 new_timer -> kclock = & clock_posix_cpu ;
390402 timerqueue_init (& new_timer -> it .cpu .node );
391403 new_timer -> it .cpu .pid = get_pid (pid );
@@ -1080,26 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
10801092 return false;
10811093}
10821094
1083- static void __run_posix_cpu_timers (struct task_struct * tsk )
1095+ static void handle_posix_cpu_timers (struct task_struct * tsk );
1096+
1097+ #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
1098+ static void posix_cpu_timers_work (struct callback_head * work )
1099+ {
1100+ handle_posix_cpu_timers (current );
1101+ }
1102+
1103+ /*
1104+ * Initialize posix CPU timers task work in init task. Out of line to
1105+ * keep the callback static and to avoid header recursion hell.
1106+ */
1107+ void __init posix_cputimers_init_work (void )
1108+ {
1109+ init_task_work (& current -> posix_cputimers_work .work ,
1110+ posix_cpu_timers_work );
1111+ }
1112+
1113+ /*
1114+ * Note: All operations on tsk->posix_cputimers_work.scheduled happen either
1115+ * in hard interrupt context or in task context with interrupts
1116+ * disabled. Aside from that the writer/reader interaction is always in the
1117+ * context of the current task, which means they are strictly per CPU.
1118+ */
1119+ static inline bool posix_cpu_timers_work_scheduled (struct task_struct * tsk )
1120+ {
1121+ return tsk -> posix_cputimers_work .scheduled ;
1122+ }
1123+
1124+ static inline void __run_posix_cpu_timers (struct task_struct * tsk )
1125+ {
1126+ if (WARN_ON_ONCE (tsk -> posix_cputimers_work .scheduled ))
1127+ return ;
1128+
1129+ /* Schedule task work to actually expire the timers */
1130+ tsk -> posix_cputimers_work .scheduled = true;
1131+ task_work_add (tsk , & tsk -> posix_cputimers_work .work , TWA_RESUME );
1132+ }
1133+
1134+ static inline bool posix_cpu_timers_enable_work (struct task_struct * tsk ,
1135+ unsigned long start )
1136+ {
1137+ bool ret = true;
1138+
1139+ /*
1140+ * On !RT kernels interrupts are disabled while collecting expired
1141+ * timers, so no tick can happen and the fast path check can be
1142+ * reenabled without further checks.
1143+ */
1144+ if (!IS_ENABLED (CONFIG_PREEMPT_RT )) {
1145+ tsk -> posix_cputimers_work .scheduled = false;
1146+ return true;
1147+ }
1148+
1149+ /*
1150+ * On RT enabled kernels ticks can happen while the expired timers
1151+ * are collected under sighand lock. But any tick which observes
1152+ * posix_cputimers_work.scheduled set does not run the fastpath
1153+ * checks. So reenabling the tick work has to be done carefully:
1154+ *
1155+ * Disable interrupts and run the fast path check if jiffies have
1156+ * advanced since the collecting of expired timers started. If
1157+ * jiffies have not advanced or the fast path check did not find
1158+ * newly expired timers, reenable the fast path check in the timer
1159+ * interrupt. If there are newly expired timers, return false and
1160+ * let the collection loop repeat.
1161+ */
1162+ local_irq_disable ();
1163+ if (start != jiffies && fastpath_timer_check (tsk ))
1164+ ret = false;
1165+ else
1166+ tsk -> posix_cputimers_work .scheduled = false;
1167+ local_irq_enable ();
1168+
1169+ return ret ;
1170+ }
1171+ #else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1172+ static inline void __run_posix_cpu_timers (struct task_struct * tsk )
1173+ {
1174+ lockdep_posixtimer_enter ();
1175+ handle_posix_cpu_timers (tsk );
1176+ lockdep_posixtimer_exit ();
1177+ }
1178+
1179+ static inline bool posix_cpu_timers_work_scheduled (struct task_struct * tsk )
1180+ {
1181+ return false;
1182+ }
1183+
1184+ static inline bool posix_cpu_timers_enable_work (struct task_struct * tsk ,
1185+ unsigned long start )
1186+ {
1187+ return true;
1188+ }
1189+ #endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1190+
1191+ static void handle_posix_cpu_timers (struct task_struct * tsk )
10841192{
10851193 struct k_itimer * timer , * next ;
1086- unsigned long flags ;
1194+ unsigned long flags , start ;
10871195 LIST_HEAD (firing );
10881196
10891197 if (!lock_task_sighand (tsk , & flags ))
10901198 return ;
10911199
1092- /*
1093- * Here we take off tsk->signal->cpu_timers[N] and
1094- * tsk->cpu_timers[N] all the timers that are firing, and
1095- * put them on the firing list.
1096- */
1097- check_thread_timers (tsk , & firing );
1200+ do {
1201+ /*
1202+ * On RT locking sighand lock does not disable interrupts,
1203+ * so this needs to be careful vs. ticks. Store the current
1204+ * jiffies value.
1205+ */
1206+ start = READ_ONCE (jiffies );
1207+ barrier ();
10981208
1099- check_process_timers (tsk , & firing );
1209+ /*
1210+ * Here we take off tsk->signal->cpu_timers[N] and
1211+ * tsk->cpu_timers[N] all the timers that are firing, and
1212+ * put them on the firing list.
1213+ */
1214+ check_thread_timers (tsk , & firing );
1215+
1216+ check_process_timers (tsk , & firing );
1217+
1218+ /*
1219+ * The above timer checks have updated the expiry cache and
1220+ * because nothing can have queued or modified timers after
1221+ * sighand lock was taken above it is guaranteed to be
1222+ * consistent. So the next timer interrupt fastpath check
1223+ * will find valid data.
1224+ *
1225+ * If timer expiry runs in the timer interrupt context then
1226+ * the loop is not relevant as timers will be directly
1227+ * expired in interrupt context. The stub version of
1228+ * posix_cpu_timers_enable_work() always returns true, which
1229+ * allows the compiler to optimize the loop out.
1230+ *
1231+ * If timer expiry is deferred to task work context then
1232+ * the following rules apply:
1233+ *
1234+ * - On !RT kernels no tick can have happened on this CPU
1235+ * after sighand lock was acquired because interrupts are
1236+ * disabled. So reenabling task work before dropping
1237+ * sighand lock and reenabling interrupts is race free.
1238+ *
1239+ * - On RT kernels ticks might have happened but the tick
1240+ * work ignored posix CPU timer handling because the
1241+ * posix_cputimers_work.scheduled is set. Reenabling work
1242+ * must be done very carefully including a check whether
1243+ * ticks have happened since the start of the timer
1244+ * expiry checks. posix_cpu_timers_enable_work() takes
1245+ * care of that and eventually lets the expiry checks
1246+ * run again.
1247+ */
1248+ } while (!posix_cpu_timers_enable_work (tsk , start ));
11001249
11011250 /*
1102- * We must release these locks before taking any timer's lock.
1251+ * We must release sighand lock before taking any timer's lock.
11031252 * There is a potential race with timer deletion here, as the
11041253 * siglock now protects our private firing list. We have set
11051254 * the firing flag in each timer, so that a deletion attempt
@@ -1117,6 +1266,13 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
11171266 list_for_each_entry_safe (timer , next , & firing , it .cpu .elist ) {
11181267 int cpu_firing ;
11191268
1269+ /*
1270+ * spin_lock() is sufficient here even independent of the
1271+ * expiry context. If expiry happens in hard interrupt
1272+ * context it's obvious. For task work context it's safe
1273+ * because all other operations on timer::it_lock happen in
1274+ * task context (syscall or exit).
1275+ */
11201276 spin_lock (& timer -> it_lock );
11211277 list_del_init (& timer -> it .cpu .elist );
11221278 cpu_firing = timer -> it .cpu .firing ;
@@ -1143,16 +1299,21 @@ void run_posix_cpu_timers(void)
11431299
11441300 lockdep_assert_irqs_disabled ();
11451301
1302+ /*
1303+ * If the actual expiry is deferred to task work context and the
1304+ * work is already scheduled there is no point to do anything here.
1305+ */
1306+ if (posix_cpu_timers_work_scheduled (tsk ))
1307+ return ;
1308+
11461309 /*
11471310 * The fast path checks that there are no expired thread or thread
11481311 * group timers. If that's so, just return.
11491312 */
11501313 if (!fastpath_timer_check (tsk ))
11511314 return ;
11521315
1153- lockdep_posixtimer_enter ();
11541316 __run_posix_cpu_timers (tsk );
1155- lockdep_posixtimer_exit ();
11561317}
11571318
11581319/*
0 commit comments