| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * Infrastructure for migratable timers |
| 4 | * |
| 5 | * Copyright(C) 2022 linutronix GmbH |
| 6 | */ |
| 7 | #include <linux/cpuhotplug.h> |
| 8 | #include <linux/slab.h> |
| 9 | #include <linux/smp.h> |
| 10 | #include <linux/spinlock.h> |
| 11 | #include <linux/timerqueue.h> |
| 12 | #include <trace/events/ipi.h> |
| 13 | |
| 14 | #include "timer_migration.h" |
| 15 | #include "tick-internal.h" |
| 16 | |
| 17 | #define CREATE_TRACE_POINTS |
| 18 | #include <trace/events/timer_migration.h> |
| 19 | |
| 20 | /* |
| 21 | * The timer migration mechanism is built on a hierarchy of groups. The |
| 22 | * lowest level group contains CPUs, the next level groups of CPU groups |
| 23 | * and so forth. The CPU groups are kept per node so for the normal case |
| 24 | * lock contention won't happen across nodes. Depending on the number of |
| 25 | * CPUs per node even the next level might be kept as groups of CPU groups |
| 26 | * per node and only the levels above cross the node topology. |
| 27 | * |
| 28 | * Example topology for a two node system with 24 CPUs each. |
| 29 | * |
| 30 | * LVL 2 [GRP2:0] |
| 31 | * GRP1:0 - GRP1:1 |
| 32 | * |
| 33 | * LVL 1 [GRP1:0] [GRP1:1] |
| 34 | * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5 |
| 35 | * |
| 36 | * LVL 0 [GRP0:0] [GRP0:1] [GRP0:2] [GRP0:3] [GRP0:4] [GRP0:5] |
| 37 | * CPUS 0-7 8-15 16-23 24-31 32-39 40-47 |
| 38 | * |
| 39 | * The groups hold a timer queue of events sorted by expiry time. These |
| 40 | * queues are updated when CPUs go idle. When they come out of idle, the |
| 41 | * ignore flag of their queued events is set. |
| 42 | * |
| 43 | * Each group has a designated migrator CPU/group as long as a CPU/group is |
| 44 | * active in the group. This designated role is necessary to prevent all |
| 45 | * active CPUs in a group from trying to migrate expired timers from other CPUs, |
| 46 | * which would result in massive lock bouncing. |
| 47 | * |
| 48 | * When a CPU is awake, it checks in its own timer tick the group |
| 49 | * hierarchy up to the point where it is assigned the migrator role or if |
| 50 | * no CPU is active, it also checks the groups where no migrator is set |
| 51 | * (TMIGR_NONE). |
| 52 | * |
| 53 | * If it finds expired timers in one of the group queues it pulls them over |
| 54 | * from the idle CPU and runs the timer function. After that it updates the |
| 55 | * group and the parent groups if required. |
| 56 | * |
| 57 | * CPUs which go idle arm their CPU local timer hardware for the next local |
| 58 | * (pinned) timer event. If the next migratable timer expires after the |
| 59 | * next local timer or the CPU has no migratable timer pending then the |
| 60 | * CPU does not queue an event in the LVL0 group. If the next migratable |
| 61 | * timer expires before the next local timer then the CPU queues that timer |
| 62 | * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0 |
| 63 | * group. |
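|  | * |
|  | * As a rough illustration only (the real decision is made by the timer code |
|  | * and its result is handed into tmigr_cpu_deactivate(), see below; next_global |
|  | * and next_local are placeholders for the next migratable and next local |
|  | * (pinned) timer expiry), the idle path boils down to:: |
|  | * |
|  | *	if (next_global < next_local) |
|  | *		queue the global event in the LVL0 group |
|  | *	// in both cases: |
|  | *	mark the CPU idle in the LVL0 group |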
| 64 | * |
| 65 | * When a CPU comes out of idle and when a group has at least a single active |
| 66 | * child, the ignore flag of the tmigr_event is set. This indicates that |
| 67 | * the event is ignored even if it is still enqueued in the parent group's |
| 68 | * timer queue. It will be removed when touching the timer queue the next |
| 69 | * time. This spares locking in the active path as the lock protects (after |
| 70 | * setup) only event information. For more information about locking, |
| 71 | * please read the section "Locking rules". |
| 72 | * |
| 73 | * If the CPU is the migrator of the group then it delegates that role to |
| 74 | * the next active CPU in the group or sets migrator to TMIGR_NONE when |
| 75 | * there is no active CPU in the group. This delegation needs to be |
| 76 | * propagated up the hierarchy so hand over from other leaves can happen at |
| 77 | * all hierarchy levels w/o doing a search. |
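|  | * |
|  | * Condensed sketch of that handover, drawn from tmigr_inactive_up() further |
|  | * down in this file (illustrative only; the cmpxchg retry loop and tracing |
|  | * are omitted):: |
|  | * |
|  | *	unsigned long active; |
|  | * |
|  | *	newstate.active &= ~childmask; |
|  | *	active = newstate.active; |
|  | *	new_migr_bit = find_first_bit(&active, BIT_CNT); |
|  | *	if (new_migr_bit != BIT_CNT) |
|  | *		newstate.migrator = BIT(new_migr_bit); |
|  | *	else |
|  | *		newstate.migrator = TMIGR_NONE;	// propagate further up |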
| 78 | * |
| 79 | * When the last CPU in the system goes idle, then it drops all migrator |
| 80 | * duties up to the top level of the hierarchy (LVL2 in the example). It |
| 81 | * then has to make sure that it arms its own local hardware timer for |
| 82 | * the earliest event in the system. |
| 83 | * |
| 84 | * |
| 85 | * Lifetime rules: |
| 86 | * --------------- |
| 87 | * |
| 88 | * The groups are built up at init time or when CPUs come online. They are |
| 89 | * not destroyed when a group becomes empty due to offlining. The group |
| 90 | * just won't participate in the hierarchy management anymore. Destroying |
| 91 | * groups would result in interesting race conditions which would just make |
| 92 | * the whole mechanism slow and complex. |
| 93 | * |
| 94 | * |
| 95 | * Locking rules: |
| 96 | * -------------- |
| 97 | * |
| 98 | * For setting up new groups and handling events it's required to lock both |
| 99 | * child and parent group. The lock ordering is always bottom up. This also |
| 100 | * includes the per CPU locks in struct tmigr_cpu. For updating the migrator and |
| 101 | * active CPU/group information atomic_try_cmpxchg() is used instead and only |
| 102 | * the per CPU tmigr_cpu->lock is held. |
| 103 | * |
| 104 | * During the setup of groups tmigr_level_list is required. It is protected by |
| 105 | * @tmigr_mutex. |
| 106 | * |
| 107 | * When @timer_base->lock as well as tmigr related locks are required, the lock |
| 108 | * ordering is: first @timer_base->lock, afterwards tmigr related locks. |
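|  | * |
|  | * As an illustrative sketch, the bottom up ordering as used by |
|  | * tmigr_update_events() (see below) looks like:: |
|  | * |
|  | *	raw_spin_lock(&child->lock); |
|  | *	raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); |
|  | *	... update child and group events ... |
|  | *	raw_spin_unlock(&group->lock); |
|  | *	raw_spin_unlock(&child->lock); |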
| 109 | * |
| 110 | * |
| 111 | * Protection of the tmigr group state information: |
| 112 | * ------------------------------------------------ |
| 113 | * |
| 114 | * The state information with the list of active children and migrator needs to |
| 115 | * be protected by a sequence counter. It prevents a race when updates in child |
| 116 | * groups are propagated in changed order. The state update is performed |
| 117 | * lockless and group wise. The following scenario describes what happens |
| 118 | * without updating the sequence counter: |
| 119 | * |
| 120 | * Therefore, let's take three groups and four CPUs (CPU2 and CPU3 as well |
| 121 | * as GRP0:1 will not change during the scenario): |
| 122 | * |
| 123 | * LVL 1 [GRP1:0] |
| 124 | * migrator = GRP0:1 |
| 125 | * active = GRP0:0, GRP0:1 |
| 126 | * / \ |
| 127 | * LVL 0 [GRP0:0] [GRP0:1] |
| 128 | * migrator = CPU0 migrator = CPU2 |
| 129 | * active = CPU0 active = CPU2 |
| 130 | * / \ / \ |
| 131 | * CPUs 0 1 2 3 |
| 132 | * active idle active idle |
| 133 | * |
| 134 | * |
| 135 | * 1. CPU0 goes idle. As the update is performed group wise, in the first step |
| 136 | * only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to |
| 137 | * walk the hierarchy. |
| 138 | * |
| 139 | * LVL 1 [GRP1:0] |
| 140 | * migrator = GRP0:1 |
| 141 | * active = GRP0:0, GRP0:1 |
| 142 | * / \ |
| 143 | * LVL 0 [GRP0:0] [GRP0:1] |
| 144 | * --> migrator = TMIGR_NONE migrator = CPU2 |
| 145 | * --> active = active = CPU2 |
| 146 | * / \ / \ |
| 147 | * CPUs 0 1 2 3 |
| 148 | * --> idle idle active idle |
| 149 | * |
| 150 | * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of |
| 151 | * idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also |
| 152 | * has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the |
| 153 | * hierarchy to perform the needed update from their point of view. The |
| 154 | * currently visible state looks the following: |
| 155 | * |
| 156 | * LVL 1 [GRP1:0] |
| 157 | * migrator = GRP0:1 |
| 158 | * active = GRP0:0, GRP0:1 |
| 159 | * / \ |
| 160 | * LVL 0 [GRP0:0] [GRP0:1] |
| 161 | * --> migrator = CPU1 migrator = CPU2 |
| 162 | * --> active = CPU1 active = CPU2 |
| 163 | * / \ / \ |
| 164 | * CPUs 0 1 2 3 |
| 165 | * idle --> active active idle |
| 166 | * |
| 167 | * 3. Here is the race condition: CPU1 managed to propagate its changes (from |
| 168 | * step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. The |
| 169 | * active members of GRP1:0 remain unchanged after the update since it is |
| 170 | * still valid from CPU1's current point of view: |
| 171 | * |
| 172 | * LVL 1 [GRP1:0] |
| 173 | * --> migrator = GRP0:1 |
| 174 | * --> active = GRP0:0, GRP0:1 |
| 175 | * / \ |
| 176 | * LVL 0 [GRP0:0] [GRP0:1] |
| 177 | * migrator = CPU1 migrator = CPU2 |
| 178 | * active = CPU1 active = CPU2 |
| 179 | * / \ / \ |
| 180 | * CPUs 0 1 2 3 |
| 181 | * idle active active idle |
| 182 | * |
| 183 | * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0. |
| 184 | * |
| 185 | * LVL 1 [GRP1:0] |
| 186 | * --> migrator = GRP0:1 |
| 187 | * --> active = GRP0:1 |
| 188 | * / \ |
| 189 | * LVL 0 [GRP0:0] [GRP0:1] |
| 190 | * migrator = CPU1 migrator = CPU2 |
| 191 | * active = CPU1 active = CPU2 |
| 192 | * / \ / \ |
| 193 | * CPUs 0 1 2 3 |
| 194 | * idle active active idle |
| 195 | * |
| 196 | * |
| 197 | * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is |
| 198 | * active and is correctly listed as active in GRP0:0. However GRP1:0 does not |
| 199 | * have GRP0:0 listed as active, which is wrong. The sequence counter has been |
| 200 | * added to avoid inconsistent states during updates. The state is updated |
| 201 | * atomically only if all members, including the sequence counter, match the |
| 202 | * expected value (compare-and-exchange). |
| 203 | * |
| 204 | * Looking back at the previous example with the addition of the sequence |
| 205 | * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed |
| 206 | * the sequence number during the update in step 3 so the expected old value (as |
| 207 | * seen by CPU0 before starting the walk) does not match. |
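|  | * |
|  | * Condensed sketch of such a state update (see tmigr_active_up() and |
|  | * tmigr_inactive_up() below for the real implementations):: |
|  | * |
|  | *	curstate.state = atomic_read(&group->migr_state); |
|  | *	do { |
|  | *		newstate = curstate; |
|  | *		// update migrator and active members as needed |
|  | *		newstate.seq++; |
|  | *	} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, |
|  | *				     newstate.state)); |
|  | * |
|  | * A stale @curstate, like the one CPU0 read before starting its walk in the |
|  | * example above, makes the compare-and-exchange fail and forces a retry |
|  | * against the current group state. |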
| 208 | * |
| 209 | * Prevent race between new event and last CPU going inactive: |
| 210 | * ---------------------------------------------------------- |
| 211 | * |
| 212 | * When the last CPU is going idle and there is a concurrent update of a new |
| 213 | * first global timer of an idle CPU, the group and child states have to be read |
| 214 | * while holding the lock in tmigr_update_events(). The following scenario shows |
| 215 | * what happens, when this is not done. |
| 216 | * |
| 217 | * 1. Only CPU2 is active: |
| 218 | * |
| 219 | * LVL 1 [GRP1:0] |
| 220 | * migrator = GRP0:1 |
| 221 | * active = GRP0:1 |
| 222 | * next_expiry = KTIME_MAX |
| 223 | * / \ |
| 224 | * LVL 0 [GRP0:0] [GRP0:1] |
| 225 | * migrator = TMIGR_NONE migrator = CPU2 |
| 226 | * active = active = CPU2 |
| 227 | * next_expiry = KTIME_MAX next_expiry = KTIME_MAX |
| 228 | * / \ / \ |
| 229 | * CPUs 0 1 2 3 |
| 230 | * idle idle active idle |
| 231 | * |
| 232 | * 2. Now CPU2 goes idle (and has no global timer that has to be handled) and |
| 233 | * propagates that to GRP0:1: |
| 234 | * |
| 235 | * LVL 1 [GRP1:0] |
| 236 | * migrator = GRP0:1 |
| 237 | * active = GRP0:1 |
| 238 | * next_expiry = KTIME_MAX |
| 239 | * / \ |
| 240 | * LVL 0 [GRP0:0] [GRP0:1] |
| 241 | * migrator = TMIGR_NONE --> migrator = TMIGR_NONE |
| 242 | * active = --> active = |
| 243 | * next_expiry = KTIME_MAX next_expiry = KTIME_MAX |
| 244 | * / \ / \ |
| 245 | * CPUs 0 1 2 3 |
| 246 | * idle idle --> idle idle |
| 247 | * |
| 248 | * 3. Now the idle state is propagated up to GRP1:0. As this is now the last |
| 249 | * child going idle in the top level group, the expiry of the next group event |
| 250 | * has to be handed back to make sure no event is lost. As there is no event |
| 251 | * enqueued, KTIME_MAX is handed back to CPU2. |
| 252 | * |
| 253 | * LVL 1 [GRP1:0] |
| 254 | * --> migrator = TMIGR_NONE |
| 255 | * --> active = |
| 256 | * next_expiry = KTIME_MAX |
| 257 | * / \ |
| 258 | * LVL 0 [GRP0:0] [GRP0:1] |
| 259 | * migrator = TMIGR_NONE migrator = TMIGR_NONE |
| 260 | * active = active = |
| 261 | * next_expiry = KTIME_MAX next_expiry = KTIME_MAX |
| 262 | * / \ / \ |
| 263 | * CPUs 0 1 2 3 |
| 264 | * idle idle --> idle idle |
| 265 | * |
| 266 | * 4. CPU0 has a new timer queued from idle and it expires at TIMER0. CPU0 |
| 267 | * propagates that to GRP0:0: |
| 268 | * |
| 269 | * LVL 1 [GRP1:0] |
| 270 | * migrator = TMIGR_NONE |
| 271 | * active = |
| 272 | * next_expiry = KTIME_MAX |
| 273 | * / \ |
| 274 | * LVL 0 [GRP0:0] [GRP0:1] |
| 275 | * migrator = TMIGR_NONE migrator = TMIGR_NONE |
| 276 | * active = active = |
| 277 | * --> next_expiry = TIMER0 next_expiry = KTIME_MAX |
| 278 | * / \ / \ |
| 279 | * CPUs 0 1 2 3 |
| 280 | * idle idle idle idle |
| 281 | * |
| 282 | * 5. GRP0:0 is not active, so the new timer has to be propagated to |
| 283 | * GRP1:0. Therefore the GRP1:0 state has to be read. When the stale value |
| 284 | * (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is |
| 285 | * handed back to CPU0, as it seems that there is still an active child in |
| 286 | * the top level group. |
| 287 | * |
| 288 | * LVL 1 [GRP1:0] |
| 289 | * migrator = TMIGR_NONE |
| 290 | * active = |
| 291 | * --> next_expiry = TIMER0 |
| 292 | * / \ |
| 293 | * LVL 0 [GRP0:0] [GRP0:1] |
| 294 | * migrator = TMIGR_NONE migrator = TMIGR_NONE |
| 295 | * active = active = |
| 296 | * next_expiry = TIMER0 next_expiry = KTIME_MAX |
| 297 | * / \ / \ |
| 298 | * CPUs 0 1 2 3 |
| 299 | * idle idle idle idle |
| 300 | * |
| 301 | * This is prevented by reading the state when holding the lock (when a new |
| 302 | * timer has to be propagated from the idle path):: |
| 303 | * |
| 304 | * CPU2 (tmigr_inactive_up()) CPU0 (tmigr_new_timer_up()) |
| 305 | * -------------------------- --------------------------- |
| 306 | * // step 3: |
| 307 | * cmpxchg(&GRP1:0->state); |
| 308 | * tmigr_update_events() { |
| 309 | * spin_lock(&GRP1:0->lock); |
| 310 | * // ... update events ... |
| 311 | * // hand back first expiry when GRP1:0 is idle |
| 312 | * spin_unlock(&GRP1:0->lock); |
| 313 | * // ^^^ release state modification |
| 314 | * } |
| 315 | * tmigr_update_events() { |
| 316 | * spin_lock(&GRP1:0->lock) |
| 317 | * // ^^^ acquire state modification |
| 318 | * group_state = atomic_read(&GRP1:0->state) |
| 319 | * // .... update events ... |
| 320 | * // hand back first expiry when GRP1:0 is idle |
| 321 | * spin_unlock(&GRP1:0->lock) |
| 322 | * // ^^^ makes state visible for other |
| 323 | * // callers of tmigr_new_timer_up() |
| 324 | * } |
| 325 | * |
| 326 | * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported |
| 327 | * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent |
| 328 | * update of the group state from active path is no problem, as the upcoming CPU |
| 329 | * will take care of the group events. |
| 330 | * |
| 331 | * Required event and timerqueue update after a remote expiry: |
| 332 | * ----------------------------------------------------------- |
| 333 | * |
| 334 | * After expiring timers of a remote CPU, a walk through the hierarchy and |
| 335 | * update of events and timerqueues is required. It is obviously needed if there |
| 336 | * is a 'new' global timer but also if there is no new global timer but the |
| 337 | * remote CPU is still idle. |
| 338 | * |
| 339 | * 1. CPU0 and CPU1 are idle and have both a global timer expiring at the same |
| 340 | * time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is |
| 341 | * also idle and has no global timer pending. CPU2 is the only active CPU and |
| 342 | * thus also the migrator: |
| 343 | * |
| 344 | * LVL 1 [GRP1:0] |
| 345 | * migrator = GRP0:1 |
| 346 | * active = GRP0:1 |
| 347 | * --> timerqueue = evt-GRP0:0 |
| 348 | * / \ |
| 349 | * LVL 0 [GRP0:0] [GRP0:1] |
| 350 | * migrator = TMIGR_NONE migrator = CPU2 |
| 351 | * active = active = CPU2 |
| 352 | * groupevt.ignore = false groupevt.ignore = true |
| 353 | * groupevt.cpu = CPU0 groupevt.cpu = |
| 354 | * timerqueue = evt-CPU0, timerqueue = |
| 355 | * evt-CPU1 |
| 356 | * / \ / \ |
| 357 | * CPUs 0 1 2 3 |
| 358 | * idle idle active idle |
| 359 | * |
| 360 | * 2. CPU2 starts to expire remote timers. It starts with LVL0 group |
| 361 | * GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with |
| 362 | * the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. It |
| 363 | * looks at tmigr_event::cpu struct member and expires the pending timer(s) |
| 364 | * of CPU0. |
| 365 | * |
| 366 | * LVL 1 [GRP1:0] |
| 367 | * migrator = GRP0:1 |
| 368 | * active = GRP0:1 |
| 369 | * --> timerqueue = |
| 370 | * / \ |
| 371 | * LVL 0 [GRP0:0] [GRP0:1] |
| 372 | * migrator = TMIGR_NONE migrator = CPU2 |
| 373 | * active = active = CPU2 |
| 374 | * groupevt.ignore = false groupevt.ignore = true |
| 375 | * --> groupevt.cpu = CPU0 groupevt.cpu = |
| 376 | * timerqueue = evt-CPU0, timerqueue = |
| 377 | * evt-CPU1 |
| 378 | * / \ / \ |
| 379 | * CPUs 0 1 2 3 |
| 380 | * idle idle active idle |
| 381 | * |
| 382 | * 3. Some work has to be done after expiring the timers of CPU0. If we stop |
| 383 | * here, then CPU1's pending global timer(s) will not expire in time and the |
| 384 | * timerqueue of GRP0:0 has still an event for CPU0 enqueued which has just |
| 385 | * been processed. So it is required to walk the hierarchy from CPU0's point |
| 386 | * of view and update it accordingly. CPU0's event will be removed from the |
| 387 | * timerqueue because it has no pending timer. If CPU0 would have a timer |
| 388 | * pending then it has to expire after CPU1's first timer because all timers |
| 389 | * from this period were just expired. Either way CPU1's event will be first |
| 390 | * in GRP0:0's timerqueue and therefore set in the CPU field of the group |
| 391 | * event which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still not |
| 392 | * active: |
| 393 | * |
| 394 | * LVL 1 [GRP1:0] |
| 395 | * migrator = GRP0:1 |
| 396 | * active = GRP0:1 |
| 397 | * --> timerqueue = evt-GRP0:0 |
| 398 | * / \ |
| 399 | * LVL 0 [GRP0:0] [GRP0:1] |
| 400 | * migrator = TMIGR_NONE migrator = CPU2 |
| 401 | * active = active = CPU2 |
| 402 | * groupevt.ignore = false groupevt.ignore = true |
| 403 | * --> groupevt.cpu = CPU1 groupevt.cpu = |
| 404 | * --> timerqueue = evt-CPU1 timerqueue = |
| 405 | * / \ / \ |
| 406 | * CPUs 0 1 2 3 |
| 407 | * idle idle active idle |
| 408 | * |
| 409 | * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the |
| 410 | * timer(s) of CPU1. |
| 411 | * |
| 412 | * The hierarchy walk in step 3 can be skipped if the migrator notices that a |
| 413 | * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care |
| 414 | * of the group as migrator and any needed updates within the hierarchy. |
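|  | * |
|  | * Condensed sketch of the update walk in step 3, as done by |
|  | * tmigr_handle_remote_cpu() below after expiring CPU0's timers (note that |
|  | * @remote is set, so the walk is not cut short even if the CPU event could |
|  | * be ignored):: |
|  | * |
|  | *	data.nextexp = tevt.global; |
|  | *	data.firstexp = KTIME_MAX; |
|  | *	data.evt = &tmc->cpuevt; |
|  | *	data.remote = true; |
|  | *	walk_groups(&tmigr_new_timer_up, &data, tmc); |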
| 415 | */ |
| 416 | |
| 417 | static DEFINE_MUTEX(tmigr_mutex); |
| 418 | static struct list_head *tmigr_level_list __read_mostly; |
| 419 | |
| 420 | static unsigned int tmigr_hierarchy_levels __read_mostly; |
| 421 | static unsigned int tmigr_crossnode_level __read_mostly; |
| 422 | |
| 423 | static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); |
| 424 | |
| 425 | #define TMIGR_NONE 0xFF |
| 426 | #define BIT_CNT 8 |
| 427 | |
| 428 | static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) |
| 429 | { |
| 430 | return !(tmc->tmgroup && tmc->online); |
| 431 | } |
| 432 | |
| 433 | /* |
| 434 | * Returns true, when @childmask corresponds to the group migrator or when the |
| 435 | * group is not active - so no migrator is set. |
| 436 | */ |
| 437 | static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask) |
| 438 | { |
| 439 | union tmigr_state s; |
| 440 | |
| 441 | s.state = atomic_read(&group->migr_state); |
| 442 | |
| 443 | if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) |
| 444 | return true; |
| 445 | |
| 446 | return false; |
| 447 | } |
| 448 | |
| 449 | static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask) |
| 450 | { |
| 451 | bool lonely, migrator = false; |
| 452 | unsigned long active; |
| 453 | union tmigr_state s; |
| 454 | |
| 455 | s.state = atomic_read(&group->migr_state); |
| 456 | |
| 457 | if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) |
| 458 | migrator = true; |
| 459 | |
| 460 | active = s.active; |
| 461 | lonely = bitmap_weight(&active, BIT_CNT) <= 1; |
| 462 | |
| 463 | return (migrator && lonely); |
| 464 | } |
| 465 | |
| 466 | static bool tmigr_check_lonely(struct tmigr_group *group) |
| 467 | { |
| 468 | unsigned long active; |
| 469 | union tmigr_state s; |
| 470 | |
| 471 | s.state = atomic_read(&group->migr_state); |
| 472 | |
| 473 | active = s.active; |
| 474 | |
| 475 | return bitmap_weight(&active, BIT_CNT) <= 1; |
| 476 | } |
| 477 | |
| 478 | /** |
| 479 | * struct tmigr_walk - data required for walking the hierarchy |
| 480 | * @nextexp: Next CPU event expiry information which is handed into |
| 481 | * the timer migration code by the timer code |
| 482 | * (get_next_timer_interrupt()) |
| 483 | * @firstexp: Contains the first event expiry information when |
| 484 | * the hierarchy is completely idle. When the CPU itself was the |
| 485 | * last one going idle, this information makes sure that the CPU |
| 486 | * will be back in time. When using this value in the remote |
| 487 | * expiry case, firstexp is stored in the per CPU tmigr_cpu |
| 488 | * struct of the CPU which expires remote timers. It is updated |
| 489 | * in the top level group only. Be aware that a new top level |
| 490 | * of the hierarchy could appear between the 'top level |
| 491 | * call' in tmigr_update_events() and the check for the |
| 492 | * parent group in walk_groups(). Then @firstexp might |
| 493 | * contain a value != KTIME_MAX even if it was not the |
| 494 | * final top level. This is not a problem, as the worst |
| 495 | * outcome is a CPU which might wake up a little early. |
| 496 | * @evt: Pointer to tmigr_event which needs to be queued (of idle |
| 497 | * child group) |
| 498 | * @childmask: groupmask of child group |
| 499 | * @remote: Is set, when the new timer path is executed in |
| 500 | * tmigr_handle_remote_cpu() |
| 501 | * @basej: timer base in jiffies |
| 502 | * @now: timer base monotonic |
| 503 | * @check: is set if there is the need to handle remote timers; |
| 504 | * required in tmigr_requires_handle_remote() only |
| 505 | * @tmc_active: this flag indicates, whether the CPU which triggers |
| 506 | * the hierarchy walk is !idle in the timer migration |
| 507 | * hierarchy. When the CPU is idle and the whole hierarchy is |
| 508 | * idle, only the first event of the top level has to be |
| 509 | * considered. |
| 510 | */ |
| 511 | struct tmigr_walk { |
| 512 | u64 nextexp; |
| 513 | u64 firstexp; |
| 514 | struct tmigr_event *evt; |
| 515 | u8 childmask; |
| 516 | bool remote; |
| 517 | unsigned long basej; |
| 518 | u64 now; |
| 519 | bool check; |
| 520 | bool tmc_active; |
| 521 | }; |
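|  |  |
|  | /* |
|  |  * Illustrative only: a walk is set up by filling a struct tmigr_walk and |
|  |  * handing it together with the per CPU tmigr_cpu to walk_groups(), e.g. as |
|  |  * done by tmigr_new_timer() below: |
|  |  * |
|  |  *	struct tmigr_walk data = { .nextexp = nextexp, |
|  |  *				   .firstexp = KTIME_MAX, |
|  |  *				   .evt = &tmc->cpuevt }; |
|  |  * |
|  |  *	walk_groups(&tmigr_new_timer_up, &data, tmc); |
|  |  */ |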
| 522 | |
| 523 | typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); |
| 524 | |
| 525 | static void __walk_groups(up_f up, struct tmigr_walk *data, |
| 526 | struct tmigr_cpu *tmc) |
| 527 | { |
| 528 | struct tmigr_group *child = NULL, *group = tmc->tmgroup; |
| 529 | |
| 530 | do { |
| 531 | WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); |
| 532 | |
| 533 | if (up(group, child, data)) |
| 534 | break; |
| 535 | |
| 536 | child = group; |
| 537 | /* |
| 538 | * Pairs with the store release on group connection |
| 539 | * to make sure group initialization is visible. |
| 540 | */ |
| 541 | group = READ_ONCE(group->parent); |
| 542 | data->childmask = child->groupmask; |
| 543 | WARN_ON_ONCE(!data->childmask); |
| 544 | } while (group); |
| 545 | } |
| 546 | |
| 547 | static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) |
| 548 | { |
| 549 | lockdep_assert_held(&tmc->lock); |
| 550 | |
| 551 | __walk_groups(up, data, tmc); |
| 552 | } |
| 553 | |
| 554 | /* |
| 555 | * Returns the next event of the timerqueue @group->events |
| 556 | * |
| 557 | * Removes timers with the ignore flag set and updates next_expiry of the group. Values |
| 558 | * of the group event are updated in tmigr_update_events() only. |
| 559 | */ |
| 560 | static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) |
| 561 | { |
| 562 | struct timerqueue_node *node = NULL; |
| 563 | struct tmigr_event *evt = NULL; |
| 564 | |
| 565 | lockdep_assert_held(&group->lock); |
| 566 | |
| 567 | WRITE_ONCE(group->next_expiry, KTIME_MAX); |
| 568 | |
| 569 | while ((node = timerqueue_getnext(&group->events))) { |
| 570 | evt = container_of(node, struct tmigr_event, nextevt); |
| 571 | |
| 572 | if (!READ_ONCE(evt->ignore)) { |
| 573 | WRITE_ONCE(group->next_expiry, evt->nextevt.expires); |
| 574 | return evt; |
| 575 | } |
| 576 | |
| 577 | /* |
| 578 | * Remove next timers with ignore flag, because the group lock |
| 579 | * is held anyway |
| 580 | */ |
| 581 | if (!timerqueue_del(&group->events, node)) |
| 582 | break; |
| 583 | } |
| 584 | |
| 585 | return NULL; |
| 586 | } |
| 587 | |
| 588 | /* |
| 589 | * Return the next event (with an expiry equal to or before @now) |
| 590 | * |
| 591 | * The returned event is also removed from the queue. |
| 592 | */ |
| 593 | static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group, |
| 594 | u64 now) |
| 595 | { |
| 596 | struct tmigr_event *evt = tmigr_next_groupevt(group); |
| 597 | |
| 598 | if (!evt || now < evt->nextevt.expires) |
| 599 | return NULL; |
| 600 | |
| 601 | /* |
| 602 | * The event is ready to expire. Remove it and update next group event. |
| 603 | */ |
| 604 | timerqueue_del(&group->events, &evt->nextevt); |
| 605 | tmigr_next_groupevt(group); |
| 606 | |
| 607 | return evt; |
| 608 | } |
| 609 | |
| 610 | static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) |
| 611 | { |
| 612 | struct tmigr_event *evt; |
| 613 | |
| 614 | evt = tmigr_next_groupevt(group); |
| 615 | |
| 616 | if (!evt) |
| 617 | return KTIME_MAX; |
| 618 | else |
| 619 | return evt->nextevt.expires; |
| 620 | } |
| 621 | |
| 622 | static bool tmigr_active_up(struct tmigr_group *group, |
| 623 | struct tmigr_group *child, |
| 624 | struct tmigr_walk *data) |
| 625 | { |
| 626 | union tmigr_state curstate, newstate; |
| 627 | bool walk_done; |
| 628 | u8 childmask; |
| 629 | |
| 630 | childmask = data->childmask; |
| 631 | /* |
| 632 | * No memory barrier is required here in contrast to |
| 633 | * tmigr_inactive_up(), as the group state change does not depend on the |
| 634 | * child state. |
| 635 | */ |
| 636 | curstate.state = atomic_read(&group->migr_state); |
| 637 | |
| 638 | do { |
| 639 | newstate = curstate; |
| 640 | walk_done = true; |
| 641 | |
| 642 | if (newstate.migrator == TMIGR_NONE) { |
| 643 | newstate.migrator = childmask; |
| 644 | |
| 645 | /* Changes need to be propagated */ |
| 646 | walk_done = false; |
| 647 | } |
| 648 | |
| 649 | newstate.active |= childmask; |
| 650 | newstate.seq++; |
| 651 | |
| 652 | } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); |
| 653 | |
| 654 | trace_tmigr_group_set_cpu_active(group, newstate, childmask); |
| 655 | |
| 656 | /* |
| 657 | * The group is active (again). The group event might be still queued |
| 658 | * into the parent group's timerqueue but can now be handled by the |
| 659 | * migrator of this group. Therefore the ignore flag for the group event |
| 660 | * is updated to reflect this. |
| 661 | * |
| 662 | * The update of the ignore flag in the active path is done lockless. In |
| 663 | * worst case the migrator of the parent group observes the change too |
| 664 | * late and expires remotely all events belonging to this group. The |
| 665 | * lock is held while updating the ignore flag in idle path. So this |
| 666 | * state change will not be lost. |
| 667 | */ |
| 668 | WRITE_ONCE(group->groupevt.ignore, true); |
| 669 | |
| 670 | return walk_done; |
| 671 | } |
| 672 | |
| 673 | static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) |
| 674 | { |
| 675 | struct tmigr_walk data; |
| 676 | |
| 677 | data.childmask = tmc->groupmask; |
| 678 | |
| 679 | trace_tmigr_cpu_active(tmc); |
| 680 | |
| 681 | tmc->cpuevt.ignore = true; |
| 682 | WRITE_ONCE(tmc->wakeup, KTIME_MAX); |
| 683 | |
| 684 | walk_groups(&tmigr_active_up, &data, tmc); |
| 685 | } |
| 686 | |
| 687 | /** |
| 688 | * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy |
| 689 | * |
| 690 | * Call site timer_clear_idle() is called with interrupts disabled. |
| 691 | */ |
| 692 | void tmigr_cpu_activate(void) |
| 693 | { |
| 694 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 695 | |
| 696 | if (tmigr_is_not_available(tmc)) |
| 697 | return; |
| 698 | |
| 699 | if (WARN_ON_ONCE(!tmc->idle)) |
| 700 | return; |
| 701 | |
| 702 | raw_spin_lock(&tmc->lock); |
| 703 | tmc->idle = false; |
| 704 | __tmigr_cpu_activate(tmc); |
| 705 | raw_spin_unlock(&tmc->lock); |
| 706 | } |
| 707 | |
| 708 | /* |
| 709 | * Returns true, if there is nothing to be propagated to the next level |
| 710 | * |
| 711 | * @data->firstexp is set to the expiry of the first global event of the (top |
| 712 | * level of the) hierarchy, but only when the hierarchy is completely idle. |
| 713 | * |
| 714 | * The child and group states need to be read under the lock, to prevent a race |
| 715 | * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See |
| 716 | * also section "Prevent race between new event and last CPU going inactive" in |
| 717 | * the documentation at the top. |
| 718 | * |
| 719 | * This is the only place where the group event expiry value is set. |
| 720 | */ |
| 721 | static |
| 722 | bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, |
| 723 | struct tmigr_walk *data) |
| 724 | { |
| 725 | struct tmigr_event *evt, *first_childevt; |
| 726 | union tmigr_state childstate, groupstate; |
| 727 | bool remote = data->remote; |
| 728 | bool walk_done = false; |
| 729 | bool ignore; |
| 730 | u64 nextexp; |
| 731 | |
| 732 | if (child) { |
| 733 | raw_spin_lock(&child->lock); |
| 734 | raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); |
| 735 | |
| 736 | childstate.state = atomic_read(&child->migr_state); |
| 737 | groupstate.state = atomic_read(&group->migr_state); |
| 738 | |
| 739 | if (childstate.active) { |
| 740 | walk_done = true; |
| 741 | goto unlock; |
| 742 | } |
| 743 | |
| 744 | first_childevt = tmigr_next_groupevt(child); |
| 745 | nextexp = child->next_expiry; |
| 746 | evt = &child->groupevt; |
| 747 | |
| 748 | /* |
| 749 | * This can race with concurrent idle exit (activate). |
| 750 | * If the current writer wins, a useless remote expiration may |
| 751 | * be scheduled. If the activate wins, the event is properly |
| 752 | * ignored. |
| 753 | */ |
| 754 | ignore = (nextexp == KTIME_MAX) ? true : false; |
| 755 | WRITE_ONCE(evt->ignore, ignore); |
| 756 | } else { |
| 757 | nextexp = data->nextexp; |
| 758 | |
| 759 | first_childevt = evt = data->evt; |
| 760 | ignore = evt->ignore; |
| 761 | |
| 762 | /* |
| 763 | * Walking the hierarchy is required in any case when a |
| 764 | * remote expiry was done before. This ensures that already |
| 765 | * queued events in non active groups are not lost (see section |
| 766 | * "Required event and timerqueue update after a remote |
| 767 | * expiry" in the documentation at the top). |
| 768 | * |
| 769 | * The two call sites which are executed without a remote expiry |
| 770 | * before, are not prevented from propagating changes through |
| 771 | * the hierarchy by the return: |
| 772 | * - When entering this path by tmigr_new_timer(), @evt->ignore |
| 773 | * is never set. |
| 774 | * - tmigr_inactive_up() takes care of the propagation by |
| 775 | * itself and ignores the return value. But an immediate |
| 776 | * return is possible if there is a parent, sparing group |
| 777 | * locking at this level, because the upper walking call to |
| 778 | * the parent will take care about removing this event from |
| 779 | * within the group and update next_expiry accordingly. |
| 780 | * |
| 781 | * However if there is no parent, ie: the hierarchy has only a |
| 782 | * single level so @group is the top level group, make sure the |
| 783 | * first event information of the group is updated properly and |
| 784 | * also handled properly, so skip this fast return path. |
| 785 | */ |
| 786 | if (ignore && !remote && group->parent) |
| 787 | return true; |
| 788 | |
| 789 | raw_spin_lock(&group->lock); |
| 790 | |
| 791 | childstate.state = 0; |
| 792 | groupstate.state = atomic_read(&group->migr_state); |
| 793 | } |
| 794 | |
| 795 | /* |
| 796 | * If the child event is already queued in the group, remove it from the |
| 797 | * queue when the expiry time changed only or when it could be ignored. |
| 798 | */ |
| 799 | if (timerqueue_node_queued(&evt->nextevt)) { |
| 800 | if ((evt->nextevt.expires == nextexp) && !ignore) { |
| 801 | /* Make sure not to miss a new CPU event with the same expiry */ |
| 802 | evt->cpu = first_childevt->cpu; |
| 803 | goto check_toplvl; |
| 804 | } |
| 805 | |
| 806 | if (!timerqueue_del(&group->events, &evt->nextevt)) |
| 807 | WRITE_ONCE(group->next_expiry, KTIME_MAX); |
| 808 | } |
| 809 | |
| 810 | if (ignore) { |
| 811 | /* |
| 812 | * When the next child event could be ignored (nextexp is |
| 813 | * KTIME_MAX) and there was no remote timer handling before or |
| 814 | * the group is already active, there is no need to walk the |
| 815 | * hierarchy even if there is a parent group. |
| 816 | * |
| 817 | * The other way round: even if the event could be ignored, but |
| 818 | * if a remote timer handling was executed before and the group |
| 819 | * is not active, walking the hierarchy is required to not miss |
| 820 | * an enqueued timer in the non active group. The enqueued timer |
| 821 | * of the group needs to be propagated to a higher level to |
| 822 | * ensure it is handled. |
| 823 | */ |
| 824 | if (!remote || groupstate.active) |
| 825 | walk_done = true; |
| 826 | } else { |
| 827 | evt->nextevt.expires = nextexp; |
| 828 | evt->cpu = first_childevt->cpu; |
| 829 | |
| 830 | if (timerqueue_add(&group->events, &evt->nextevt)) |
| 831 | WRITE_ONCE(group->next_expiry, nextexp); |
| 832 | } |
| 833 | |
| 834 | check_toplvl: |
| 835 | if (!group->parent && (groupstate.migrator == TMIGR_NONE)) { |
| 836 | walk_done = true; |
| 837 | |
| 838 | /* |
| 839 | * Nothing to do when update was done during remote timer |
| 840 | * handling. First timer in top level group which needs to be |
| 841 | * handled when top level group is not active, is calculated |
| 842 | * directly in tmigr_handle_remote_up(). |
| 843 | */ |
| 844 | if (remote) |
| 845 | goto unlock; |
| 846 | |
| 847 | /* |
| 848 | * The top level group is idle and it has to be ensured the |
| 849 | * global timers are handled in time. (This could be optimized |
| 850 | * by keeping track of the last global scheduled event and only |
| 851 | * arming it on the CPU if the new event is earlier. Not sure if |
| 852 | * its worth the complexity.) |
| 853 | */ |
| 854 | data->firstexp = tmigr_next_groupevt_expires(group); |
| 855 | } |
| 856 | |
| 857 | trace_tmigr_update_events(child, group, childstate, groupstate, |
| 858 | nextexp); |
| 859 | |
| 860 | unlock: |
| 861 | raw_spin_unlock(&group->lock); |
| 862 | |
| 863 | if (child) |
| 864 | raw_spin_unlock(&child->lock); |
| 865 | |
| 866 | return walk_done; |
| 867 | } |
| 868 | |
| 869 | static bool tmigr_new_timer_up(struct tmigr_group *group, |
| 870 | struct tmigr_group *child, |
| 871 | struct tmigr_walk *data) |
| 872 | { |
| 873 | return tmigr_update_events(group, child, data); |
| 874 | } |
| 875 | |
| 876 | /* |
| 877 | * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is |
| 878 | * returned, if an active CPU will handle all the timer migration hierarchy |
| 879 | * timers. |
| 880 | */ |
| 881 | static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp) |
| 882 | { |
| 883 | struct tmigr_walk data = { .nextexp = nextexp, |
| 884 | .firstexp = KTIME_MAX, |
| 885 | .evt = &tmc->cpuevt }; |
| 886 | |
| 887 | lockdep_assert_held(&tmc->lock); |
| 888 | |
| 889 | if (tmc->remote) |
| 890 | return KTIME_MAX; |
| 891 | |
| 892 | trace_tmigr_cpu_new_timer(tmc); |
| 893 | |
| 894 | tmc->cpuevt.ignore = false; |
| 895 | data.remote = false; |
| 896 | |
| 897 | walk_groups(&tmigr_new_timer_up, &data, tmc); |
| 898 | |
| 899 | /* If there is a new first global event, make sure it is handled */ |
| 900 | return data.firstexp; |
| 901 | } |
| 902 | |
| 903 | static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, |
| 904 | unsigned long jif) |
| 905 | { |
| 906 | struct timer_events tevt; |
| 907 | struct tmigr_walk data; |
| 908 | struct tmigr_cpu *tmc; |
| 909 | |
| 910 | tmc = per_cpu_ptr(&tmigr_cpu, cpu); |
| 911 | |
| 912 | raw_spin_lock_irq(&tmc->lock); |
| 913 | |
| 914 | /* |
| 915 | * If the remote CPU is offline then the timers have been migrated to |
| 916 | * another CPU. |
| 917 | * |
| 918 | * If tmigr_cpu::remote is set, at the moment another CPU already |
| 919 | * expires the timers of the remote CPU. |
| 920 | * |
| 921 | * If tmigr_event::ignore is set, then the CPU returns from idle and |
| 922 | * takes care of its timers. |
| 923 | * |
| 924 | * If the next event expires in the future, then the event has been |
| 925 | * updated and there are no timers to expire right now. The CPU which |
| 926 | * updated the event takes care when hierarchy is completely |
| 927 | * idle. Otherwise the migrator does it as the event is enqueued. |
| 928 | */ |
| 929 | if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || |
| 930 | now < tmc->cpuevt.nextevt.expires) { |
| 931 | raw_spin_unlock_irq(&tmc->lock); |
| 932 | return; |
| 933 | } |
| 934 | |
| 935 | trace_tmigr_handle_remote_cpu(tmc); |
| 936 | |
| 937 | tmc->remote = true; |
| 938 | WRITE_ONCE(tmc->wakeup, KTIME_MAX); |
| 939 | |
| 940 | /* Drop the lock to allow the remote CPU to exit idle */ |
| 941 | raw_spin_unlock_irq(&tmc->lock); |
| 942 | |
| 943 | if (cpu != smp_processor_id()) |
| 944 | timer_expire_remote(cpu); |
| 945 | |
| 946 | /* |
| 947 | * Lock ordering needs to be preserved - timer_base locks before tmigr |
| 948 | * related locks (see section "Locking rules" in the documentation at |
| 949 | * the top). During fetching the next timer interrupt, also tmc->lock |
| 950 | * needs to be held. Otherwise there is a possible race window against |
| 951 | * the CPU itself when it comes out of idle, updates the first timer in |
| 952 | * the hierarchy and goes back to idle. |
| 953 | * |
| 954 | * timer base locks are dropped as fast as possible: After checking |
| 955 | * whether the remote CPU went offline in the meantime and after |
| 956 | * fetching the next remote timer interrupt. Dropping the locks as fast |
| 957 | * as possible keeps the locking region small and prevents holding |
| 958 | * several (unnecessary) locks during walking the hierarchy for updating |
| 959 | * the timerqueue and group events. |
| 960 | */ |
| 961 | local_irq_disable(); |
| 962 | timer_lock_remote_bases(cpu); |
| 963 | raw_spin_lock(&tmc->lock); |
| 964 | |
| 965 | /* |
| 966 | * When the CPU went offline in the meantime, no hierarchy walk has to |
| 967 | * be done for updating the queued events, because the walk was |
| 968 | * already done during marking the CPU offline in the hierarchy. |
| 969 | * |
| 970 | * When the CPU is no longer idle, the CPU takes care of the timers and |
| 971 | * also of the timers in the hierarchy. |
| 972 | * |
| 973 | * (See also section "Required event and timerqueue update after a |
| 974 | * remote expiry" in the documentation at the top) |
| 975 | */ |
| 976 | if (!tmc->online || !tmc->idle) { |
| 977 | timer_unlock_remote_bases(cpu); |
| 978 | goto unlock; |
| 979 | } |
| 980 | |
| 981 | /* next event of CPU */ |
| 982 | fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu); |
| 983 | timer_unlock_remote_bases(cpu); |
| 984 | |
| 985 | data.nextexp = tevt.global; |
| 986 | data.firstexp = KTIME_MAX; |
| 987 | data.evt = &tmc->cpuevt; |
| 988 | data.remote = true; |
| 989 | |
| 990 | /* |
| 991 | * The update is done even when there is no 'new' global timer pending |
| 992 | * on the remote CPU (see section "Required event and timerqueue update |
| 993 | * after a remote expiry" in the documentation at the top) |
| 994 | */ |
| 995 | walk_groups(&tmigr_new_timer_up, &data, tmc); |
| 996 | |
| 997 | unlock: |
| 998 | tmc->remote = false; |
| 999 | raw_spin_unlock_irq(&tmc->lock); |
| 1000 | } |
| 1001 | |
| 1002 | static bool tmigr_handle_remote_up(struct tmigr_group *group, |
| 1003 | struct tmigr_group *child, |
| 1004 | struct tmigr_walk *data) |
| 1005 | { |
| 1006 | struct tmigr_event *evt; |
| 1007 | unsigned long jif; |
| 1008 | u8 childmask; |
| 1009 | u64 now; |
| 1010 | |
| 1011 | jif = data->basej; |
| 1012 | now = data->now; |
| 1013 | |
| 1014 | childmask = data->childmask; |
| 1015 | |
| 1016 | trace_tmigr_handle_remote(group); |
| 1017 | again: |
| 1018 | /* |
| 1019 | * Handle the group only if @childmask is the migrator or if the |
| 1020 | * group has no migrator. Otherwise the group is active and is |
| 1021 | * handled by its own migrator. |
| 1022 | */ |
| 1023 | if (!tmigr_check_migrator(group, childmask)) |
| 1024 | return true; |
| 1025 | |
| 1026 | raw_spin_lock_irq(&group->lock); |
| 1027 | |
| 1028 | evt = tmigr_next_expired_groupevt(group, now); |
| 1029 | |
| 1030 | if (evt) { |
| 1031 | unsigned int remote_cpu = evt->cpu; |
| 1032 | |
| 1033 | raw_spin_unlock_irq(&group->lock); |
| 1034 | |
| 1035 | tmigr_handle_remote_cpu(remote_cpu, now, jif); |
| 1036 | |
| 1037 | /* check if there is another event, that needs to be handled */ |
| 1038 | goto again; |
| 1039 | } |
| 1040 | |
| 1041 | /* |
| 1042 | * Keep track of the expiry of the first event that needs to be handled |
| 1043 | * (group->next_expiry was updated by tmigr_next_expired_groupevt(), |
| 1044 | * next was set by tmigr_handle_remote_cpu()). |
| 1045 | */ |
| 1046 | data->firstexp = group->next_expiry; |
| 1047 | |
| 1048 | raw_spin_unlock_irq(&group->lock); |
| 1049 | |
| 1050 | return false; |
| 1051 | } |
| 1052 | |
| 1053 | /** |
| 1054 | * tmigr_handle_remote() - Handle global timers of remote idle CPUs |
| 1055 | * |
| 1056 | * Called from the timer soft interrupt with interrupts enabled. |
| 1057 | */ |
| 1058 | void tmigr_handle_remote(void) |
| 1059 | { |
| 1060 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1061 | struct tmigr_walk data; |
| 1062 | |
| 1063 | if (tmigr_is_not_available(tmc)) |
| 1064 | return; |
| 1065 | |
| 1066 | data.childmask = tmc->groupmask; |
| 1067 | data.firstexp = KTIME_MAX; |
| 1068 | |
| 1069 | /* |
| 1070 | * NOTE: This is a doubled check because the migrator test will be done |
| 1071 | * in tmigr_handle_remote_up() anyway. Keep this check to speed up the |
| 1072 | * return when nothing has to be done. |
| 1073 | */ |
| 1074 | if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { |
| 1075 | /* |
| 1076 | * If this CPU was an idle migrator, make sure to clear its wakeup |
| 1077 | * value so it won't chase timers that have already expired elsewhere. |
| 1078 | * This avoids endless requeue from tmigr_new_timer(). |
| 1079 | */ |
| 1080 | if (READ_ONCE(tmc->wakeup) == KTIME_MAX) |
| 1081 | return; |
| 1082 | } |
| 1083 | |
| 1084 | data.now = get_jiffies_update(&data.basej); |
| 1085 | |
| 1086 | /* |
| 1087 | * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to |
| 1088 | * KTIME_MAX. Even if tmc->lock is not held during the whole remote |
| 1089 | * handling, tmc->wakeup is fine to be stale as it is called in |
| 1090 | * interrupt context and tick_nohz_next_event() is executed in interrupt |
| 1091 | * exit path only after processing the last pending interrupt. |
| 1092 | */ |
| 1093 | |
| 1094 | __walk_groups(&tmigr_handle_remote_up, &data, tmc); |
| 1095 | |
| 1096 | raw_spin_lock_irq(&tmc->lock); |
| 1097 | WRITE_ONCE(tmc->wakeup, data.firstexp); |
| 1098 | raw_spin_unlock_irq(&tmc->lock); |
| 1099 | } |
| 1100 | |
| 1101 | static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, |
| 1102 | struct tmigr_group *child, |
| 1103 | struct tmigr_walk *data) |
| 1104 | { |
| 1105 | u8 childmask; |
| 1106 | |
| 1107 | childmask = data->childmask; |
| 1108 | |
| 1109 | /* |
| 1110 | * Handle the group only if the child is the migrator or if the group |
| 1111 | * has no migrator. Otherwise the group is active and is handled by its |
| 1112 | * own migrator. |
| 1113 | */ |
| 1114 | if (!tmigr_check_migrator(group, childmask)) |
| 1115 | return true; |
| 1116 | |
| 1117 | /* |
| 1118 | * When there is a parent group and the CPU which triggered the |
| 1119 | * hierarchy walk is not active, proceed the walk to reach the top level |
| 1120 | * group before reading the next_expiry value. |
| 1121 | */ |
| 1122 | if (group->parent && !data->tmc_active) |
| 1123 | return false; |
| 1124 | |
| 1125 | /* |
| 1126 | * The lock is required on 32bit architectures to read the variable |
| 1127 | * consistently with a concurrent writer. On 64bit the lock is not |
| 1128 | * required because the read operation is not split and so it is always |
| 1129 | * consistent. |
| 1130 | */ |
| 1131 | if (IS_ENABLED(CONFIG_64BIT)) { |
| 1132 | data->firstexp = READ_ONCE(group->next_expiry); |
| 1133 | if (data->now >= data->firstexp) { |
| 1134 | data->check = true; |
| 1135 | return true; |
| 1136 | } |
| 1137 | } else { |
| 1138 | raw_spin_lock(&group->lock); |
| 1139 | data->firstexp = group->next_expiry; |
| 1140 | if (data->now >= group->next_expiry) { |
| 1141 | data->check = true; |
| 1142 | raw_spin_unlock(&group->lock); |
| 1143 | return true; |
| 1144 | } |
| 1145 | raw_spin_unlock(&group->lock); |
| 1146 | } |
| 1147 | |
| 1148 | return false; |
| 1149 | } |
| 1150 | |
| 1151 | /** |
| 1152 | * tmigr_requires_handle_remote() - Check the need of remote timer handling |
| 1153 | * |
| 1154 | * Must be called with interrupts disabled. |
| 1155 | */ |
| 1156 | bool tmigr_requires_handle_remote(void) |
| 1157 | { |
| 1158 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1159 | struct tmigr_walk data; |
| 1160 | unsigned long jif; |
| 1161 | bool ret = false; |
| 1162 | |
| 1163 | if (tmigr_is_not_available(tmc)) |
| 1164 | return ret; |
| 1165 | |
| 1166 | data.now = get_jiffies_update(&jif); |
| 1167 | data.childmask = tmc->groupmask; |
| 1168 | data.firstexp = KTIME_MAX; |
| 1169 | data.tmc_active = !tmc->idle; |
| 1170 | data.check = false; |
| 1171 | |
| 1172 | /* |
| 1173 | * If the CPU is active, walk the hierarchy to check whether a remote |
| 1174 | * expiry is required. |
| 1175 | * |
| 1176 | * Check is done lockless as interrupts are disabled and @tmc->idle is |
| 1177 | * set only by the local CPU. |
| 1178 | */ |
| 1179 | if (!tmc->idle) { |
| 1180 | __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc); |
| 1181 | |
| 1182 | return data.check; |
| 1183 | } |
| 1184 | |
| 1185 | /* |
| 1186 | * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock |
| 1187 | * is required on 32bit architectures to read the variable consistently |
| 1188 | * with a concurrent writer. On 64bit the lock is not required because |
| 1189 | * the read operation is not split and so it is always consistent. |
| 1190 | */ |
| 1191 | if (IS_ENABLED(CONFIG_64BIT)) { |
| 1192 | if (data.now >= READ_ONCE(tmc->wakeup)) |
| 1193 | return true; |
| 1194 | } else { |
| 1195 | raw_spin_lock(&tmc->lock); |
| 1196 | if (data.now >= tmc->wakeup) |
| 1197 | ret = true; |
| 1198 | raw_spin_unlock(&tmc->lock); |
| 1199 | } |
| 1200 | |
| 1201 | return ret; |
| 1202 | } |
| 1203 | |
| 1204 | /** |
| 1205 | * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc) |
| 1206 | * @nextexp: Next expiry of global timer (or KTIME_MAX if not) |
| 1207 | * |
| 1208 | * The CPU is already deactivated in the timer migration |
| 1209 | * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event() |
| 1210 | * and thereby the timer idle path is executed once more. @tmc->wakeup |
| 1211 | * holds the first timer, when the timer migration hierarchy is |
| 1212 | * completely idle. |
| 1213 | * |
| 1214 | * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if |
| 1215 | * nothing needs to be done. |
| 1216 | */ |
| 1217 | u64 tmigr_cpu_new_timer(u64 nextexp) |
| 1218 | { |
| 1219 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1220 | u64 ret; |
| 1221 | |
| 1222 | if (tmigr_is_not_available(tmc)) |
| 1223 | return nextexp; |
| 1224 | |
| 1225 | raw_spin_lock(&tmc->lock); |
| 1226 | |
| 1227 | ret = READ_ONCE(tmc->wakeup); |
| 1228 | if (nextexp != KTIME_MAX) { |
| 1229 | if (nextexp != tmc->cpuevt.nextevt.expires || |
| 1230 | tmc->cpuevt.ignore) { |
| 1231 | ret = tmigr_new_timer(tmc, nextexp); |
| 1232 | /* |
| 1233 | * Make sure the reevaluation of timers in idle path |
| 1234 | * will not miss an event. |
| 1235 | */ |
| 1236 | WRITE_ONCE(tmc->wakeup, ret); |
| 1237 | } |
| 1238 | } |
| 1239 | trace_tmigr_cpu_new_timer_idle(tmc, nextexp); |
| 1240 | raw_spin_unlock(&tmc->lock); |
| 1241 | return ret; |
| 1242 | } |
| 1243 | |
| 1244 | static bool tmigr_inactive_up(struct tmigr_group *group, |
| 1245 | struct tmigr_group *child, |
| 1246 | struct tmigr_walk *data) |
| 1247 | { |
| 1248 | union tmigr_state curstate, newstate, childstate; |
| 1249 | bool walk_done; |
| 1250 | u8 childmask; |
| 1251 | |
| 1252 | childmask = data->childmask; |
| 1253 | childstate.state = 0; |
| 1254 | |
| 1255 | /* |
| 1256 | * The memory barrier is paired with the cmpxchg() in tmigr_active_up() |
| 1257 | * to make sure the updates of child and group states are ordered. The |
| 1258 | * ordering is mandatory, as the group state change depends on the child |
| 1259 | * state. |
| 1260 | */ |
| 1261 | curstate.state = atomic_read_acquire(&group->migr_state); |
| 1262 | |
| 1263 | for (;;) { |
| 1264 | if (child) |
| 1265 | childstate.state = atomic_read(&child->migr_state); |
| 1266 | |
| 1267 | newstate = curstate; |
| 1268 | walk_done = true; |
| 1269 | |
| 1270 | /* Reset active bit when the child is no longer active */ |
| 1271 | if (!childstate.active) |
| 1272 | newstate.active &= ~childmask; |
| 1273 | |
| 1274 | if (newstate.migrator == childmask) { |
| 1275 | /* |
| 1276 | * Find a new migrator for the group, because the child |
| 1277 | * group is idle! |
| 1278 | */ |
| 1279 | if (!childstate.active) { |
| 1280 | unsigned long new_migr_bit, active = newstate.active; |
| 1281 | |
| 1282 | new_migr_bit = find_first_bit(&active, BIT_CNT); |
| 1283 | |
| 1284 | if (new_migr_bit != BIT_CNT) { |
| 1285 | newstate.migrator = BIT(new_migr_bit); |
| 1286 | } else { |
| 1287 | newstate.migrator = TMIGR_NONE; |
| 1288 | |
| 1289 | /* Changes need to be propagated */ |
| 1290 | walk_done = false; |
| 1291 | } |
| 1292 | } |
| 1293 | } |
| 1294 | |
| 1295 | newstate.seq++; |
| 1296 | |
| 1297 | WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); |
| 1298 | |
| 1299 | if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { |
| 1300 | trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); |
| 1301 | break; |
| 1302 | } |
| 1303 | |
| 1304 | /* |
| 1305 | * The memory barrier is paired with the cmpxchg() in |
| 1306 | * tmigr_active_up() to make sure the updates of child and group |
| 1307 | * states are ordered. It is required only when the above |
| 1308 | * try_cmpxchg() fails. |
| 1309 | */ |
| 1310 | smp_mb__after_atomic(); |
| 1311 | } |
| 1312 | |
| 1313 | data->remote = false; |
| 1314 | |
| 1315 | /* Event Handling */ |
| 1316 | tmigr_update_events(group, child, data); |
| 1317 | |
| 1318 | return walk_done; |
| 1319 | } |
| 1320 | |
| 1321 | static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) |
| 1322 | { |
| 1323 | struct tmigr_walk data = { .nextexp = nextexp, |
| 1324 | .firstexp = KTIME_MAX, |
| 1325 | .evt = &tmc->cpuevt, |
| 1326 | .childmask = tmc->groupmask }; |
| 1327 | |
| 1328 | /* |
| 1329 | * If nextexp is KTIME_MAX, the CPU event will be ignored because the |
| 1330 | * local timer expires before the global timer, no global timer is set |
| 1331 | * or CPU goes offline. |
| 1332 | */ |
| 1333 | if (nextexp != KTIME_MAX) |
| 1334 | tmc->cpuevt.ignore = false; |
| 1335 | |
| 1336 | walk_groups(&tmigr_inactive_up, &data, tmc); |
| 1337 | return data.firstexp; |
| 1338 | } |
| 1339 | |
| 1340 | /** |
| 1341 | * tmigr_cpu_deactivate() - Put current CPU into inactive state |
| 1342 | * @nextexp: The next global timer expiry of the current CPU |
| 1343 | * |
| 1344 | * Must be called with interrupts disabled. |
| 1345 | * |
| 1346 | * Return: the next event expiry of the current CPU or the next event expiry |
| 1347 | * from the hierarchy if this CPU is the top level migrator or the hierarchy is |
| 1348 | * completely idle. |
| 1349 | */ |
| 1350 | u64 tmigr_cpu_deactivate(u64 nextexp) |
| 1351 | { |
| 1352 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1353 | u64 ret; |
| 1354 | |
| 1355 | if (tmigr_is_not_available(tmc)) |
| 1356 | return nextexp; |
| 1357 | |
| 1358 | raw_spin_lock(&tmc->lock); |
| 1359 | |
| 1360 | ret = __tmigr_cpu_deactivate(tmc, nextexp); |
| 1361 | |
| 1362 | tmc->idle = true; |
| 1363 | |
| 1364 | /* |
| 1365 | * Make sure the reevaluation of timers in idle path will not miss an |
| 1366 | * event. |
| 1367 | */ |
| 1368 | WRITE_ONCE(tmc->wakeup, ret); |
| 1369 | |
| 1370 | trace_tmigr_cpu_idle(tmc, nextexp); |
| 1371 | raw_spin_unlock(&tmc->lock); |
| 1372 | return ret; |
| 1373 | } |
| 1374 | |
| 1375 | /** |
| 1376 | * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to |
| 1377 | * go idle |
| 1378 | * @nextevt: The next global timer expiry of the current CPU |
| 1379 | * |
| 1380 | * Return: |
| 1381 | * * KTIME_MAX - when it is probable that nothing has to be done (not |
| 1382 | * the only one in the level 0 group; and if it is the |
| 1383 | * only one in level 0 group, but there are more than a |
| 1384 | * single group active on the way to top level) |
| 1385 | * * nextevt - when CPU is offline and has to handle timer on its own |
| 1386 | * or when on the way to top in every group only a single |
| 1387 | * child is active but @nextevt is before the lowest |
| 1388 | * next_expiry encountered while walking up to top level. |
| 1389 | * * next_expiry - value of lowest expiry encountered while walking groups |
| 1390 | * if only a single child is active on each and @nextevt |
| 1391 | * is after this lowest expiry. |
| 1392 | */ |
| 1393 | u64 tmigr_quick_check(u64 nextevt) |
| 1394 | { |
| 1395 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1396 | struct tmigr_group *group = tmc->tmgroup; |
| 1397 | |
| 1398 | if (tmigr_is_not_available(tmc)) |
| 1399 | return nextevt; |
| 1400 | |
| 1401 | if (WARN_ON_ONCE(tmc->idle)) |
| 1402 | return nextevt; |
| 1403 | |
| 1404 | if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) |
| 1405 | return KTIME_MAX; |
| 1406 | |
| 1407 | do { |
| 1408 | if (!tmigr_check_lonely(group)) { |
| 1409 | return KTIME_MAX; |
| 1410 | } else { |
| 1411 | /* |
| 1412 | * Since current CPU is active, events may not be sorted |
| 1413 | * from bottom to the top because the CPU's event is ignored |
| 1414 | * up to the top and its sibling's events not propagated upwards. |
| 1415 | * Thus keep track of the lowest observed expiry. |
| 1416 | */ |
| 1417 | nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); |
| 1418 | if (!group->parent) |
| 1419 | return nextevt; |
| 1420 | } |
| 1421 | group = group->parent; |
| 1422 | } while (group); |
| 1423 | |
| 1424 | return KTIME_MAX; |
| 1425 | } |
| 1426 | |
| 1427 | /* |
| 1428 | * tmigr_trigger_active() - trigger a CPU to become active again |
| 1429 | * |
| 1430 | * This function is executed on a CPU which is part of cpu_online_mask, when the |
| 1431 | * last active CPU in the hierarchy is offlining. With this, it is ensured that |
| 1432 | * the other CPU is active and takes over the migrator duty. |
| 1433 | */ |
| 1434 | static long tmigr_trigger_active(void *unused) |
| 1435 | { |
| 1436 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1437 | |
| 1438 | WARN_ON_ONCE(!tmc->online || tmc->idle); |
| 1439 | |
| 1440 | return 0; |
| 1441 | } |
| 1442 | |
| 1443 | static int tmigr_cpu_offline(unsigned int cpu) |
| 1444 | { |
| 1445 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1446 | int migrator; |
| 1447 | u64 firstexp; |
| 1448 | |
| 1449 | raw_spin_lock_irq(&tmc->lock); |
| 1450 | tmc->online = false; |
| 1451 | WRITE_ONCE(tmc->wakeup, KTIME_MAX); |
| 1452 | |
| 1453 | /* |
| 1454 | * The CPU has to handle its local events on its own when it is on the |
| 1455 | * way to offline; therefore the nextevt value is set to KTIME_MAX. |
| 1456 | */ |
| 1457 | firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); |
| 1458 | trace_tmigr_cpu_offline(tmc); |
| 1459 | raw_spin_unlock_irq(&tmc->lock); |
| 1460 | |
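| | /* |
| | * A returned firstexp != KTIME_MAX means this CPU was the last active |
| | * one in the hierarchy while a global timer is still queued. Another |
| | * online CPU has to become active and take over the migrator duty |
| | * (see tmigr_trigger_active()). |
| | */ |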
| 1461 | if (firstexp != KTIME_MAX) { |
| 1462 | migrator = cpumask_any_but(cpu_online_mask, cpu); |
| 1463 | work_on_cpu(migrator, tmigr_trigger_active, NULL); |
| 1464 | } |
| 1465 | |
| 1466 | return 0; |
| 1467 | } |
| 1468 | |
| 1469 | static int tmigr_cpu_online(unsigned int cpu) |
| 1470 | { |
| 1471 | struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); |
| 1472 | |
| 1473 | /* Check whether CPU data was successfully initialized */ |
| 1474 | if (WARN_ON_ONCE(!tmc->tmgroup)) |
| 1475 | return -EINVAL; |
| 1476 | |
| 1477 | raw_spin_lock_irq(&tmc->lock); |
| 1478 | trace_tmigr_cpu_online(tmc); |
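| | /* |
| | * Mirror the timer base state: only a CPU whose timer base is not |
| | * idle marks itself active in the hierarchy; an idle CPU activates |
| | * itself when it leaves idle. |
| | */ |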
| 1479 | tmc->idle = timer_base_is_idle(); |
| 1480 | if (!tmc->idle) |
| 1481 | __tmigr_cpu_activate(tmc); |
| 1482 | tmc->online = true; |
| 1483 | raw_spin_unlock_irq(&tmc->lock); |
| 1484 | return 0; |
| 1485 | } |
| 1486 | |
| 1487 | static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, |
| 1488 | int node) |
| 1489 | { |
| 1490 | union tmigr_state s; |
| 1491 | |
| 1492 | raw_spin_lock_init(&group->lock); |
| 1493 | |
| 1494 | group->level = lvl; |
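| | /* Groups at or above the crossnode level span NUMA nodes */ |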
| 1495 | group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE; |
| 1496 | |
| 1497 | group->num_children = 0; |
| 1498 | |
| 1499 | s.migrator = TMIGR_NONE; |
| 1500 | s.active = 0; |
| 1501 | s.seq = 0; |
| 1502 | atomic_set(&group->migr_state, s.state); |
| 1503 | |
| 1504 | /* |
| 1505 | * If this is a new top-level, prepare its groupmask in advance. |
| 1506 | * This avoids accidents where yet another new top-level is |
| 1507 | * created in the future and made visible before the current groupmask. |
| 1508 | */ |
| 1509 | if (list_empty(&tmigr_level_list[lvl])) { |
| 1510 | group->groupmask = BIT(0); |
| 1511 | /* |
| 1512 | * The previous top level has prepared its groupmask already, |
| 1513 | * simply account it as the first child. |
| 1514 | */ |
| 1515 | if (lvl > 0) |
| 1516 | group->num_children = 1; |
| 1517 | } |
| 1518 | |
| 1519 | timerqueue_init_head(&group->events); |
| 1520 | timerqueue_init(&group->groupevt.nextevt); |
| 1521 | group->groupevt.nextevt.expires = KTIME_MAX; |
| 1522 | WRITE_ONCE(group->next_expiry, KTIME_MAX); |
| 1523 | group->groupevt.ignore = true; |
| 1524 | } |
| 1525 | |
| 1526 | static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, |
| 1527 | unsigned int lvl) |
| 1528 | { |
| 1529 | struct tmigr_group *tmp, *group = NULL; |
| 1530 | |
| 1531 | lockdep_assert_held(&tmigr_mutex); |
| 1532 | |
| 1533 | /* Try to attach to an existing group first */ |
| 1534 | list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { |
| 1535 | /* |
| 1536 | * If @lvl is below the cross NUMA node level, check whether |
| 1537 | * this group belongs to the same NUMA node. |
| 1538 | */ |
| 1539 | if (lvl < tmigr_crossnode_level && tmp->numa_node != node) |
| 1540 | continue; |
| 1541 | |
| 1542 | /* Capacity left? */ |
| 1543 | if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP) |
| 1544 | continue; |
| 1545 | |
| 1546 | /* |
| 1547 | * TODO: A possible further improvement: Make sure that all CPU |
| 1548 | * siblings end up in the same group of the lowest level of the |
| 1549 | * hierarchy. Relying on the topology sibling mask would be a |
| 1550 | * reasonable solution. |
| 1551 | */ |
| 1552 | |
| 1553 | group = tmp; |
| 1554 | break; |
| 1555 | } |
| 1556 | |
| 1557 | if (group) |
| 1558 | return group; |
| 1559 | |
| 1560 | /* Allocate and set up a new group */ |
| 1561 | group = kzalloc_node(sizeof(*group), GFP_KERNEL, node); |
| 1562 | if (!group) |
| 1563 | return ERR_PTR(-ENOMEM); |
| 1564 | |
| 1565 | tmigr_init_group(group, lvl, node); |
| 1566 | |
| 1567 | /* Setup successful. Add it to the hierarchy */ |
| 1568 | list_add(&group->list, &tmigr_level_list[lvl]); |
| 1569 | trace_tmigr_group_set(group); |
| 1570 | return group; |
| 1571 | } |
| 1572 | |
| 1573 | static void tmigr_connect_child_parent(struct tmigr_group *child, |
| 1574 | struct tmigr_group *parent, |
| 1575 | bool activate) |
| 1576 | { |
| 1577 | struct tmigr_walk data; |
| 1578 | |
| 1579 | raw_spin_lock_irq(&child->lock); |
| 1580 | raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); |
| 1581 | |
| 1582 | if (activate) { |
| 1583 | /* |
| 1584 | * @child is the old top and @parent the new one. In this |
| 1585 | * case groupmask is pre-initialized and @child already |
| 1586 | * accounted, along with its new sibling corresponding to the |
| 1587 | * CPU going up. |
| 1588 | */ |
| 1589 | WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); |
| 1590 | } else { |
| 1591 | /* Adding @child for the CPU going up to @parent. */ |
| 1592 | child->groupmask = BIT(parent->num_children++); |
| 1593 | } |
| 1594 | |
| 1595 | /* |
| 1596 | * Make sure parent initialization is visible before publishing it to a |
| 1597 | * racing CPU entering/exiting idle. This RELEASE barrier enforces an |
| 1598 | * address dependency that pairs with the READ_ONCE() in __walk_groups(). |
| 1599 | */ |
| 1600 | smp_store_release(&child->parent, parent); |
| 1601 | |
| 1602 | raw_spin_unlock(&parent->lock); |
| 1603 | raw_spin_unlock_irq(&child->lock); |
| 1604 | |
| 1605 | trace_tmigr_connect_child_parent(child); |
| 1606 | |
| 1607 | if (!activate) |
| 1608 | return; |
| 1609 | |
| 1610 | /* |
| 1611 | * To prevent inconsistent states, active children need to be active in |
| 1612 | * the new parent as well. Inactive children are already marked inactive |
| 1613 | * in the parent group: |
| 1614 | * |
| 1615 | * * When new groups were created by tmigr_setup_groups() starting from |
| 1616 | * the lowest level (and not higher than one level below the current |
| 1617 | * top level), then they are not active. They will be set active when |
| 1618 | * the new online CPU comes active. |
| 1619 | * |
| 1620 | * * But if a new group above the current top level is required, it is |
| 1621 | * mandatory to propagate the active state of the already existing |
| 1622 | * child to the new parent. So tmigr_connect_child_parent() is |
| 1623 | * executed with the formerly top level group (child) and the newly |
| 1624 | * created group (parent). |
| 1625 | * |
| 1626 | * * It is ensured that the child is active, as this setup path is |
| 1627 | * executed in the hotplug prepare callback. This is executed by an |
| 1628 | * already connected and !idle CPU. Even if all other CPUs go idle, |
| 1629 | * the CPU executing the setup will be responsible up to the current top |
| 1630 | * level group. And the next time it goes inactive, it will release |
| 1631 | * the new childmask and parent to subsequent walkers through this |
| 1632 | * @child. Therefore propagate active state unconditionally. |
| 1633 | */ |
| 1634 | data.childmask = child->groupmask; |
| 1635 | |
| 1636 | /* |
| 1637 | * There is only one new level at a time (which is protected by |
| 1638 | * tmigr_mutex). When connecting the child and the parent and setting |
| 1639 | * the child active while the parent is inactive, the parent needs to be |
| 1640 | * the uppermost level. Otherwise something went wrong! |
| 1641 | */ |
| 1642 | WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); |
| 1643 | } |
| 1644 | |
| 1645 | static int tmigr_setup_groups(unsigned int cpu, unsigned int node) |
| 1646 | { |
| 1647 | struct tmigr_group *group, *child, **stack; |
| 1648 | int top = 0, err = 0, i = 0; |
| 1649 | struct list_head *lvllist; |
| 1650 | |
| 1651 | stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); |
| 1652 | if (!stack) |
| 1653 | return -ENOMEM; |
| 1654 | |
| 1655 | do { |
| 1656 | group = tmigr_get_group(cpu, node, i); |
| 1657 | if (IS_ERR(group)) { |
| 1658 | err = PTR_ERR(group); |
| 1659 | break; |
| 1660 | } |
| 1661 | |
| 1662 | top = i; |
| 1663 | stack[i++] = group; |
| 1664 | |
| 1665 | /* |
| 1666 | * When fewer CPUs are brought up at boot than are available in the |
| 1667 | * system, not all calculated hierarchy levels are required. |
| 1668 | * |
| 1669 | * The loop is aborted as soon as the highest level, which might |
| 1670 | * be different from tmigr_hierarchy_levels, contains only a |
| 1671 | * single group. |
| 1672 | */ |
| 1673 | if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) |
| 1674 | break; |
| 1675 | |
| 1676 | } while (i < tmigr_hierarchy_levels); |
| 1677 | |
| 1678 | /* Assert single root */ |
| 1679 | WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); |
| 1680 | |
| 1681 | while (i > 0) { |
| 1682 | group = stack[--i]; |
| 1683 | |
| 1684 | if (err < 0) { |
| 1685 | list_del(&group->list); |
| 1686 | kfree(group); |
| 1687 | continue; |
| 1688 | } |
| 1689 | |
| 1690 | WARN_ON_ONCE(i != group->level); |
| 1691 | |
| 1692 | /* |
| 1693 | * Update tmc -> group / child -> group connection |
| 1694 | */ |
| 1695 | if (i == 0) { |
| 1696 | struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); |
| 1697 | |
| 1698 | raw_spin_lock_irq(&group->lock); |
| 1699 | |
| 1700 | tmc->tmgroup = group; |
| 1701 | tmc->groupmask = BIT(group->num_children++); |
| 1702 | |
| 1703 | raw_spin_unlock_irq(&group->lock); |
| 1704 | |
| 1705 | trace_tmigr_connect_cpu_parent(tmc); |
| 1706 | |
| 1707 | /* There are no children that need to be connected */ |
| 1708 | continue; |
| 1709 | } else { |
| 1710 | child = stack[i - 1]; |
| 1711 | /* Will be activated at online time */ |
| 1712 | tmigr_connect_child_parent(child, group, false); |
| 1713 | } |
| 1714 | |
| 1715 | /* check if uppermost level was newly created */ |
| 1716 | if (top != i) |
| 1717 | continue; |
| 1718 | |
| 1719 | WARN_ON_ONCE(top == 0); |
| 1720 | |
| 1721 | lvllist = &tmigr_level_list[top]; |
| 1722 | |
| 1723 | /* |
| 1724 | * Newly created root level should have accounted the upcoming |
| 1725 | * CPU's child group and pre-accounted the old root. |
| 1726 | */ |
| 1727 | if (group->num_children == 2 && list_is_singular(lvllist)) { |
| 1728 | /* |
| 1729 | * The target CPU must never do the prepare work, except |
| 1730 | * on early boot when the boot CPU is the target. Otherwise |
| 1731 | * it may spuriously activate the old top level group inside |
| 1732 | * the new one (regardless of whether the old top level group is |
| 1733 | * active or not) and/or release an uninitialized childmask. |
| 1734 | */ |
| 1735 | WARN_ON_ONCE(cpu == raw_smp_processor_id()); |
| 1736 | |
| 1737 | lvllist = &tmigr_level_list[top - 1]; |
| 1738 | list_for_each_entry(child, lvllist, list) { |
| 1739 | if (child->parent) |
| 1740 | continue; |
| 1741 | |
| 1742 | tmigr_connect_child_parent(child, group, true); |
| 1743 | } |
| 1744 | } |
| 1745 | } |
| 1746 | |
| 1747 | kfree(stack); |
| 1748 | |
| 1749 | return err; |
| 1750 | } |
| 1751 | |
| 1752 | static int tmigr_add_cpu(unsigned int cpu) |
| 1753 | { |
| 1754 | int node = cpu_to_node(cpu); |
| 1755 | int ret; |
| 1756 | |
| 1757 | mutex_lock(&tmigr_mutex); |
| 1758 | ret = tmigr_setup_groups(cpu, node); |
| 1759 | mutex_unlock(&tmigr_mutex); |
| 1760 | |
| 1761 | return ret; |
| 1762 | } |
| 1763 | |
| 1764 | static int tmigr_cpu_prepare(unsigned int cpu) |
| 1765 | { |
| 1766 | struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); |
| 1767 | int ret = 0; |
| 1768 | |
| 1769 | /* Not first online attempt? */ |
| 1770 | if (tmc->tmgroup) |
| 1771 | return ret; |
| 1772 | |
| 1773 | raw_spin_lock_init(&tmc->lock); |
| 1774 | timerqueue_init(&tmc->cpuevt.nextevt); |
| 1775 | tmc->cpuevt.nextevt.expires = KTIME_MAX; |
| 1776 | tmc->cpuevt.ignore = true; |
| 1777 | tmc->cpuevt.cpu = cpu; |
| 1778 | tmc->remote = false; |
| 1779 | WRITE_ONCE(tmc->wakeup, KTIME_MAX); |
| 1780 | |
| 1781 | ret = tmigr_add_cpu(cpu); |
| 1782 | if (ret < 0) |
| 1783 | return ret; |
| 1784 | |
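| | /* tmigr_setup_groups() must have assigned a non-zero groupmask */ |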
| 1785 | if (tmc->groupmask == 0) |
| 1786 | return -EINVAL; |
| 1787 | |
| 1788 | return ret; |
| 1789 | } |
| 1790 | |
| 1791 | static int __init tmigr_init(void) |
| 1792 | { |
| 1793 | unsigned int cpulvl, nodelvl, cpus_per_node, i; |
| 1794 | unsigned int nnodes = num_possible_nodes(); |
| 1795 | unsigned int ncpus = num_possible_cpus(); |
| 1796 | int ret = -ENOMEM; |
| 1797 | |
| 1798 | BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP); |
| 1799 | |
| 1800 | /* Nothing to do if running on UP */ |
| 1801 | if (ncpus == 1) |
| 1802 | return 0; |
| 1803 | |
| 1804 | /* |
| 1805 | * Calculate the required hierarchy levels. Unfortunately there is no |
| 1806 | * reliable information available, unless all possible CPUs have been |
| 1807 | * brought up and all NUMA nodes are populated. |
| 1808 | * |
| 1809 | * Estimate the number of levels with the number of possible nodes and |
| 1810 | * the number of possible CPUs. Assume CPUs are spread evenly across |
| 1811 | * nodes. We cannot rely on cpumask_of_node() because it only works for |
| 1812 | * online CPUs. |
| 1813 | */ |
| 1814 | cpus_per_node = DIV_ROUND_UP(ncpus, nnodes); |
| 1815 | |
| 1816 | /* Calc the hierarchy levels required to hold the CPUs of a node */ |
| 1817 | cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node), |
| 1818 | ilog2(TMIGR_CHILDREN_PER_GROUP)); |
| 1819 | |
| 1820 | /* Calculate the extra levels to connect all nodes */ |
| 1821 | nodelvl = DIV_ROUND_UP(order_base_2(nnodes), |
| 1822 | ilog2(TMIGR_CHILDREN_PER_GROUP)); |
| 1823 | |
| 1824 | tmigr_hierarchy_levels = cpulvl + nodelvl; |
| 1825 | |
| 1826 | /* |
| 1827 | * If a NUMA node spawns more than one CPU level group then the next |
| 1828 | * level(s) of the hierarchy contains groups which handle all CPU groups |
| 1829 | * of the same NUMA node. The level above goes across NUMA nodes. Store |
| 1830 | * this information for the setup code to decide in which level node |
| 1831 | * matching is no longer required. |
| 1832 | */ |
| 1833 | tmigr_crossnode_level = cpulvl; |
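| | /* |
| | * Worked example (illustrative numbers only, assuming |
| | * TMIGR_CHILDREN_PER_GROUP == 8): 256 possible CPUs spread over 4 |
| | * nodes give cpus_per_node = 64, cpulvl = DIV_ROUND_UP(6, 3) = 2 and |
| | * nodelvl = DIV_ROUND_UP(2, 3) = 1, i.e. three hierarchy levels in |
| | * total with node matching only required in the two lowest levels. |
| | */ |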
| 1834 | |
| 1835 | tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL); |
| 1836 | if (!tmigr_level_list) |
| 1837 | goto err; |
| 1838 | |
| 1839 | for (i = 0; i < tmigr_hierarchy_levels; i++) |
| 1840 | INIT_LIST_HEAD(&tmigr_level_list[i]); |
| 1841 | |
| 1842 | pr_info("Timer migration: %d hierarchy levels; %d children per group;" |
| 1843 | " %d crossnode level\n" , |
| 1844 | tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, |
| 1845 | tmigr_crossnode_level); |
| 1846 | |
| 1847 | ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", |
| 1848 | tmigr_cpu_prepare, NULL); |
| 1849 | if (ret) |
| 1850 | goto err; |
| 1851 | |
| 1852 | ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", |
| 1853 | tmigr_cpu_online, tmigr_cpu_offline); |
| 1854 | if (ret) |
| 1855 | goto err; |
| 1856 | |
| 1857 | return 0; |
| 1858 | |
| 1859 | err: |
| 1860 | pr_err("Timer migration setup failed\n" ); |
| 1861 | return ret; |
| 1862 | } |
| 1863 | early_initcall(tmigr_init); |
| 1864 | |