intel_execlists_submission.c source code [Linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c]

1	// SPDX-License-Identifier: MIT
2	/*
3	* Copyright © 2014 Intel Corporation
4	*/
5
6	/**
7	* DOC: Logical Rings, Logical Ring Contexts and Execlists
8	*
9	* Motivation:
10	* GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
11	* These expanded contexts enable a number of new abilities, especially
12	* "Execlists" (also implemented in this file).
13	*
14	* One of the main differences with the legacy HW contexts is that logical
15	* ring contexts incorporate many more things to the context's state, like
16	* PDPs or ringbuffer control registers:
17	*
18	* The reason why PDPs are included in the context is straightforward: as
19	* PPGTTs (per-process GTTs) are actually per-context, having the PDPs
20	* contained there mean you don't need to do a ppgtt->switch_mm yourself,
21	* instead, the GPU will do it for you on the context switch.
22	*
23	* But, what about the ringbuffer control registers (head, tail, etc..)?
24	* shouldn't we just need a set of those per engine command streamer? This is
25	* where the name "Logical Rings" starts to make sense: by virtualizing the
26	* rings, the engine cs shifts to a new "ring buffer" with every context
27	* switch. When you want to submit a workload to the GPU you: A) choose your
28	* context, B) find its appropriate virtualized ring, C) write commands to it
29	* and then, finally, D) tell the GPU to switch to that context.
30	*
31	* Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
32	* to a context is via a context execution list, ergo "Execlists".
33	*
34	* LRC implementation:
35	* Regarding the creation of contexts, we have:
36	*
37	* - One global default context.
38	* - One local default context for each opened fd.
39	* - One local extra context for each context create ioctl call.
40	*
41	* Now that ringbuffers belong per-context (and not per-engine, like before)
42	* and that contexts are uniquely tied to a given engine (and not reusable,
43	* like before) we need:
44	*
45	* - One ringbuffer per-engine inside each context.
46	* - One backing object per-engine inside each context.
47	*
48	* The global default context starts its life with these new objects fully
49	* allocated and populated. The local default context for each opened fd is
50	* more complex, because we don't know at creation time which engine is going
51	* to use them. To handle this, we have implemented a deferred creation of LR
52	* contexts:
53	*
54	* The local context starts its life as a hollow or blank holder, that only
55	* gets populated for a given engine once we receive an execbuffer. If later
56	* on we receive another execbuffer ioctl for the same context but a different
57	* engine, we allocate/populate a new ringbuffer and context backing object and
58	* so on.
59	*
60	* Finally, regarding local contexts created using the ioctl call: as they are
61	* only allowed with the render ring, we can allocate & populate them right
62	* away (no need to defer anything, at least for now).
63	*
64	* Execlists implementation:
65	* Execlists are the new method by which, on gen8+ hardware, workloads are
66	* submitted for execution (as opposed to the legacy, ringbuffer-based, method).
67	* This method works as follows:
68	*
69	* When a request is committed, its commands (the BB start and any leading or
70	* trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
71	* for the appropriate context. The tail pointer in the hardware context is not
72	* updated at this time, but instead, kept by the driver in the ringbuffer
73	* structure. A structure representing this request is added to a request queue
74	* for the appropriate engine: this structure contains a copy of the context's
75	* tail after the request was written to the ring buffer and a pointer to the
76	* context itself.
77	*
78	* If the engine's request queue was empty before the request was added, the
79	* queue is processed immediately. Otherwise the queue will be processed during
80	* a context switch interrupt. In any case, elements on the queue will get sent
81	* (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
82	* globally unique 20-bits submission ID.
83	*
84	* When execution of a request completes, the GPU updates the context status
85	* buffer with a context complete event and generates a context switch interrupt.
86	* During the interrupt handling, the driver examines the events in the buffer:
87	* for each context complete event, if the announced ID matches that on the head
88	* of the request queue, then that request is retired and removed from the queue.
89	*
90	* After processing, if any requests were retired and the queue is not empty
91	* then a new execution list can be submitted. The two requests at the front of
92	* the queue are next to be submitted but since a context may not occur twice in
93	* an execution list, if subsequent requests have the same ID as the first then
94	* the two requests must be combined. This is done simply by discarding requests
95	* at the head of the queue until either only one request is left (in which case
96	* we use a NULL second context) or the first two requests have unique IDs.
97	*
98	* By always executing the first two requests in the queue the driver ensures
99	* that the GPU is kept as busy as possible. In the case where a single context
100	* completes but a second context is still executing, the request for this second
101	* context will be at the head of the queue when we remove the first one. This
102	* request will then be resubmitted along with a new request for a different context,
103	* which will cause the hardware to continue executing the second request and queue
104	* the new request (the GPU detects the condition of a context getting preempted
105	* with the same context and optimizes the context switch flow by not doing
106	* preemption, but just sampling the new tail pointer).
107	*
108	*/
109
110	#include <linux/interrupt.h>
111	#include <linux/string_helpers.h>
112
113	#include "gen8_engine_cs.h"
114	#include "i915_drv.h"
115	#include "i915_list_util.h"
116	#include "i915_reg.h"
117	#include "i915_timer_util.h"
118	#include "i915_trace.h"
119	#include "i915_vgpu.h"
120	#include "i915_wait_util.h"
121	#include "intel_breadcrumbs.h"
122	#include "intel_context.h"
123	#include "intel_engine_heartbeat.h"
124	#include "intel_engine_pm.h"
125	#include "intel_engine_regs.h"
126	#include "intel_engine_stats.h"
127	#include "intel_execlists_submission.h"
128	#include "intel_gt.h"
129	#include "intel_gt_irq.h"
130	#include "intel_gt_pm.h"
131	#include "intel_gt_regs.h"
132	#include "intel_gt_requests.h"
133	#include "intel_lrc.h"
134	#include "intel_lrc_reg.h"
135	#include "intel_mocs.h"
136	#include "intel_reset.h"
137	#include "intel_ring.h"
138	#include "intel_workarounds.h"
139	#include "shmem_utils.h"
140
/* RING_EXECLIST_* status bits */
#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

/* Gen8 context status event flags */
#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

/* Either of these events counts as "completed" */
#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
/* True when the SW context ID field of @csb_dw is not the idle ID */
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

#define XEHP_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	BIT(1) /* upper csb dword */
#define XEHP_CSB_SW_CTX_ID_MASK			GENMASK(31, 10)
#define XEHP_IDLE_CTX_ID			0xFFFF
/* True when the SW context ID field of @csb_dw is not the idle ID */
#define XEHP_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(XEHP_CSB_SW_CTX_ID_MASK, csb_dw) != XEHP_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
173
174	struct virtual_engine {
175	struct intel_engine_cs base;
176	struct intel_context context;
177	struct rcu_work rcu;
178
179	/*
180	* We allow only a single request through the virtual engine at a time
181	* (each request in the timeline waits for the completion fence of
182	* the previous before being submitted). By restricting ourselves to
183	* only submitting a single request, each request is placed on to a
184	* physical to maximise load spreading (by virtue of the late greedy
185	* scheduling -- each real engine takes the next available request
186	* upon idling).
187	*/
188	struct i915_request *request;
189
190	/*
191	* We keep a rbtree of available virtual engines inside each physical
192	* engine, sorted by priority. Here we preallocate the nodes we need
193	* for the virtual engine, indexed by physical_engine->id.
194	*/
195	struct ve_node {
196	struct rb_node rb;
197	int prio;
198	} nodes[I915_NUM_ENGINES];
199
200	/ And finally, which physical engines this virtual engine maps onto. /
201	unsigned int num_siblings;
202	struct intel_engine_cs *siblings[];
203	};
204
205	static struct virtual_engine to_virtual_engine(struct* intel_engine_cs *engine)
206	{
207	GEM_BUG_ON(!intel_engine_is_virtual(engine));
208	return container_of(engine, struct virtual_engine, base);
209	}
210
/* Forward declaration; the definition is not in this part of the file. */
static struct intel_context *
execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count,
			 unsigned long flags);
214
215	static struct i915_request *
216	__active_request(const struct intel_timeline * const tl,
217	struct i915_request *rq,
218	int error)
219	{
220	struct i915_request *active = rq;
221
222	list_for_each_entry_from_reverse(rq, &tl->requests, link) {
223	if (__i915_request_is_complete(rq))
224	break;
225
226	if (error) {
227	i915_request_set_error_once(rq, error);
228	__i915_request_skip(rq);
229	}
230	active = rq;
231	}
232
233	return active;
234	}
235
/*
 * Find the earliest incomplete request at or before @rq on timeline @tl,
 * without flagging any error on the requests visited.
 */
static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	return __active_request(tl, rq, 0);
}
241
242	static void ring_set_paused(const struct intel_engine_cs engine, int* state)
243	{
244	/*
245	* We inspect HWS_PREEMPT with a semaphore inside
246	* engine->emit_fini_breadcrumb. If the dword is true,
247	* the ring is paused as the semaphore will busywait
248	* until the dword is false.
249	*/
250	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
251	if (state)
252	wmb();
253	}
254
255	static struct i915_priolist to_priolist(struct* rb_node *rb)
256	{
257	return rb_entry(rb, struct i915_priolist, node);
258	}
259
260	static int rq_prio(const struct i915_request *rq)
261	{
262	return READ_ONCE(rq->sched.attr.priority);
263	}
264
265	static int effective_prio(const struct i915_request *rq)
266	{
267	int prio = rq_prio(rq);
268
269	/*
270	* If this request is special and must not be interrupted at any
271	* cost, so be it. Note we are only checking the most recent request
272	* in the context and so may be masking an earlier vip request. It
273	* is hoped that under the conditions where nopreempt is used, this
274	* will not matter (i.e. all requests to that context will be
275	* nopreempt for as long as desired).
276	*/
277	if (i915_request_has_nopreempt(rq))
278	prio = I915_PRIORITY_UNPREEMPTABLE;
279
280	return prio;
281	}
282
283	static int queue_prio(const struct i915_sched_engine *sched_engine)
284	{
285	struct rb_node *rb;
286
287	rb = rb_first_cached(&sched_engine->queue);
288	if (!rb)
289	return INT_MIN;
290
291	return to_priolist(rb)->priority;
292	}
293
294	static int virtual_prio(const struct intel_engine_execlists *el)
295	{
296	struct rb_node *rb = rb_first_cached(&el->virtual);
297
298	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
299	}
300
301	static bool need_preempt(const struct intel_engine_cs *engine,
302	const struct i915_request *rq)
303	{
304	int last_prio;
305
306	if (!intel_engine_has_semaphores(engine))
307	return false;
308
309	/*
310	* Check if the current priority hint merits a preemption attempt.
311	*
312	* We record the highest value priority we saw during rescheduling
313	* prior to this dequeue, therefore we know that if it is strictly
314	* less than the current tail of ESLP[0], we do not need to force
315	* a preempt-to-idle cycle.
316	*
317	* However, the priority hint is a mere hint that we may need to
318	* preempt. If that hint is stale or we may be trying to preempt
319	* ourselves, ignore the request.
320	*
321	* More naturally we would write
322	* prio >= max(0, last);
323	* except that we wish to prevent triggering preemption at the same
324	* priority level: the task that is running should remain running
325	* to preserve FIFO ordering of dependencies.
326	*/
327	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - `1`);
328	if (engine->sched_engine->queue_priority_hint <= last_prio)
329	return false;
330
331	/*
332	* Check against the first request in ELSP[1], it will, thanks to the
333	* power of PI, be the highest priority of that context.
334	*/
335	if (!list_is_last(list: &rq->sched.link, head: &engine->sched_engine->requests) &&
336	rq_prio(list_next_entry(rq, sched.link)) > last_prio)
337	return true;
338
339	/*
340	* If the inflight context did not trigger the preemption, then maybe
341	* it was the set of queued requests? Pick the highest priority in
342	* the queue (the first active priolist) and see if it deserves to be
343	* running instead of ELSP[0].
344	*
345	* The highest priority request in the queue can not be either
346	* ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
347	* context, it's priority would not exceed ELSP[0] aka last_prio.
348	*/
349	return max(virtual_prio(&engine->execlists),
350	queue_prio(engine->sched_engine)) > last_prio;
351	}
352
353	__maybe_unused static bool
354	assert_priority_queue(const struct i915_request *prev,
355	const struct i915_request *next)
356	{
357	/*
358	* Without preemption, the prev may refer to the still active element
359	* which we refuse to let go.
360	*
361	* Even with preemption, there are times when we think it is better not
362	* to preempt and leave an ostensibly lower priority request in flight.
363	*/
364	if (i915_request_is_active(rq: prev))
365	return true;
366
367	return rq_prio(rq: prev) >= rq_prio(rq: next);
368	}
369
370	static struct i915_request *
371	__unwind_incomplete_requests(struct intel_engine_cs *engine)
372	{
373	struct i915_request rq, rn, *active = NULL;
374	struct list_head *pl;
375	int prio = I915_PRIORITY_INVALID;
376
377	lockdep_assert_held(&engine->sched_engine->lock);
378
379	list_for_each_entry_safe_reverse(rq, rn,
380	&engine->sched_engine->requests,
381	sched.link) {
382	if (__i915_request_is_complete(rq)) {
383	list_del_init(entry: &rq->sched.link);
384	continue;
385	}
386
387	__i915_request_unsubmit(request: rq);
388
389	GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
390	if (rq_prio(rq) != prio) {
391	prio = rq_prio(rq);
392	pl = i915_sched_lookup_priolist(sched_engine: engine->sched_engine,
393	prio);
394	}
395	GEM_BUG_ON(i915_sched_engine_is_empty(engine->sched_engine));
396
397	list_move(list: &rq->sched.link, head: pl);
398	set_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
399
400	/ Check in case we rollback so far we wrap [size/2] /
401	if (intel_ring_direction(ring: rq->ring,
402	next: rq->tail,
403	prev: rq->ring->tail + `8`) > `0`)
404	rq->context->lrc.desc \|= CTX_DESC_FORCE_RESTORE;
405
406	active = rq;
407	}
408
409	return active;
410	}
411
412	static void
413	execlists_context_status_change(struct i915_request rq, unsigned* long status)
414	{
415	/*
416	* Only used when GVT-g is enabled now. When GVT-g is disabled,
417	* The compiler should eliminate this function as dead-code.
418	*/
419	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
420	return;
421
422	atomic_notifier_call_chain(nh: &rq->engine->context_status_notifier,
423	val: status, v: rq);
424	}
425
426	static void reset_active(struct i915_request *rq,
427	struct intel_engine_cs *engine)
428	{
429	struct intel_context * const ce = rq->context;
430	u32 head;
431
432	/*
433	* The executing context has been cancelled. We want to prevent
434	* further execution along this context and propagate the error on
435	* to anything depending on its results.
436	*
437	* In __i915_request_submit(), we apply the -EIO and remove the
438	* requests' payloads for any banned requests. But first, we must
439	* rewind the context back to the start of the incomplete request so
440	* that we do not jump back into the middle of the batch.
441	*
442	* We preserve the breadcrumbs and semaphores of the incomplete
443	* requests so that inter-timeline dependencies (i.e other timelines)
444	* remain correctly ordered. And we defer to __i915_request_submit()
445	* so that all asynchronous waits are correctly handled.
446	*/
447	ENGINE_TRACE(engine, "{ reset rq=%llx:%lld }\n",
448	rq->fence.context, rq->fence.seqno);
449
450	/ On resubmission of the active request, payload will be scrubbed /
451	if (__i915_request_is_complete(rq))
452	head = rq->tail;
453	else
454	head = __active_request(tl: ce->timeline, rq, error: -EIO)->head;
455	head = intel_ring_wrap(ring: ce->ring, pos: head);
456
457	/ Scrub the context image to prevent replaying the previous batch /
458	lrc_init_regs(ce, engine, clear: true);
459
460	/ We've switched away, so this should be a no-op, but intent matters /
461	ce->lrc.lrca = lrc_update_regs(ce, engine, head);
462	}
463
464	static bool bad_request(const struct i915_request *rq)
465	{
466	return rq->fence.error && i915_request_started(rq);
467	}
468
469	static struct intel_engine_cs *
470	__execlists_schedule_in(struct i915_request *rq)
471	{
472	struct intel_engine_cs * const engine = rq->engine;
473	struct intel_context * const ce = rq->context;
474
475	intel_context_get(ce);
476
477	if (unlikely(intel_context_is_closed(ce) &&
478	!intel_engine_has_heartbeat(engine)))
479	intel_context_set_exiting(ce);
480
481	if (unlikely(!intel_context_is_schedulable(ce) \|\| bad_request(rq)))
482	reset_active(rq, engine);
483
484	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
485	lrc_check_regs(ce, engine, when: "before");
486
487	if (ce->tag) {
488	/ Use a fixed tag for OA and friends /
489	GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
490	ce->lrc.ccid = ce->tag;
491	} else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `55`)) {
492	/ We don't need a strict matching tag, just different values /
493	unsigned int tag = ffs(READ_ONCE(engine->context_tag));
494
495	GEM_BUG_ON(tag == `0` \|\| tag >= BITS_PER_LONG);
496	clear_bit(nr: tag - `1`, addr: &engine->context_tag);
497	ce->lrc.ccid = tag << (XEHP_SW_CTX_ID_SHIFT - `32`);
498
499	BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
500
501	} else {
502	/ We don't need a strict matching tag, just different values /
503	unsigned int tag = __ffs(engine->context_tag);
504
505	GEM_BUG_ON(tag >= BITS_PER_LONG);
506	__clear_bit(tag, &engine->context_tag);
507	ce->lrc.ccid = (`1` + tag) << (GEN11_SW_CTX_ID_SHIFT - `32`);
508
509	BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
510	}
511
512	ce->lrc.ccid \|= engine->execlists.ccid;
513
514	__intel_gt_pm_get(gt: engine->gt);
515	if (engine->fw_domain && !engine->fw_active++)
516	intel_uncore_forcewake_get(uncore: engine->uncore, domains: engine->fw_domain);
517	execlists_context_status_change(rq, status: INTEL_CONTEXT_SCHEDULE_IN);
518	intel_engine_context_in(engine);
519
520	CE_TRACE(ce, "schedule-in, ccid:%x\n", ce->lrc.ccid);
521
522	return engine;
523	}
524
525	static void execlists_schedule_in(struct i915_request rq, int* idx)
526	{
527	struct intel_context * const ce = rq->context;
528	struct intel_engine_cs *old;
529
530	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
531	trace_i915_request_in(rq, port: idx);
532
533	old = ce->inflight;
534	if (!old)
535	old = __execlists_schedule_in(rq);
536	WRITE_ONCE(ce->inflight, ptr_inc(old));
537
538	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
539	}
540
541	static void
542	resubmit_virtual_request(struct i915_request rq, struct* virtual_engine *ve)
543	{
544	struct intel_engine_cs *engine = rq->engine;
545
546	spin_lock_irq(lock: &engine->sched_engine->lock);
547
548	clear_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
549	WRITE_ONCE(rq->engine, &ve->base);
550	ve->base.submit_request(rq);
551
552	spin_unlock_irq(lock: &engine->sched_engine->lock);
553	}
554
555	static void kick_siblings(struct i915_request rq, struct* intel_context *ce)
556	{
557	struct virtual_engine ve = container_of(ce, typeof(ve), context);
558	struct intel_engine_cs *engine = rq->engine;
559
560	/*
561	* After this point, the rq may be transferred to a new sibling, so
562	* before we clear ce->inflight make sure that the context has been
563	* removed from the b->signalers and furthermore we need to make sure
564	* that the concurrent iterator in signal_irq_work is no longer
565	* following ce->signal_link.
566	*/
567	if (!list_empty(head: &ce->signals))
568	intel_context_remove_breadcrumbs(ce, b: engine->breadcrumbs);
569
570	/*
571	* This engine is now too busy to run this virtual request, so
572	* see if we can find an alternative engine for it to execute on.
573	* Once a request has become bonded to this engine, we treat it the
574	* same as other native request.
575	*/
576	if (i915_request_in_priority_queue(rq) &&
577	rq->execution_mask != engine->mask)
578	resubmit_virtual_request(rq, ve);
579
580	if (READ_ONCE(ve->request))
581	tasklet_hi_schedule(t: &ve->base.sched_engine->tasklet);
582	}
583
584	static void __execlists_schedule_out(struct i915_request * const rq,
585	struct intel_context * const ce)
586	{
587	struct intel_engine_cs * const engine = rq->engine;
588	unsigned int ccid;
589
590	/*
591	* NB process_csb() is not under the engine->sched_engine->lock and hence
592	* schedule_out can race with schedule_in meaning that we should
593	* refrain from doing non-trivial work here.
594	*/
595
596	CE_TRACE(ce, "schedule-out, ccid:%x\n", ce->lrc.ccid);
597	GEM_BUG_ON(ce->inflight != engine);
598
599	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
600	lrc_check_regs(ce, engine, when: "after");
601
602	/*
603	* If we have just completed this context, the engine may now be
604	* idle and we want to re-enter powersaving.
605	*/
606	if (intel_timeline_is_last(tl: ce->timeline, rq) &&
607	__i915_request_is_complete(rq))
608	intel_engine_add_retire(engine, tl: ce->timeline);
609
610	ccid = ce->lrc.ccid;
611	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `55`)) {
612	ccid >>= XEHP_SW_CTX_ID_SHIFT - `32`;
613	ccid &= XEHP_MAX_CONTEXT_HW_ID;
614	} else {
615	ccid >>= GEN11_SW_CTX_ID_SHIFT - `32`;
616	ccid &= GEN12_MAX_CONTEXT_HW_ID;
617	}
618
619	if (ccid < BITS_PER_LONG) {
620	GEM_BUG_ON(ccid == `0`);
621	GEM_BUG_ON(test_bit(ccid - `1`, &engine->context_tag));
622	__set_bit(ccid - `1`, &engine->context_tag);
623	}
624	intel_engine_context_out(engine);
625	execlists_context_status_change(rq, status: INTEL_CONTEXT_SCHEDULE_OUT);
626	if (engine->fw_domain && !--engine->fw_active)
627	intel_uncore_forcewake_put(uncore: engine->uncore, domains: engine->fw_domain);
628	intel_gt_pm_put_async_untracked(gt: engine->gt);
629
630	/*
631	* If this is part of a virtual engine, its next request may
632	* have been blocked waiting for access to the active context.
633	* We have to kick all the siblings again in case we need to
634	* switch (e.g. the next request is not runnable on this
635	* engine). Hopefully, we will already have submitted the next
636	* request before the tasklet runs and do not need to rebuild
637	* each virtual tree and kick everyone again.
638	*/
639	if (ce->engine != engine)
640	kick_siblings(rq, ce);
641
642	WRITE_ONCE(ce->inflight, NULL);
643	intel_context_put(ce);
644	}
645
646	static inline void execlists_schedule_out(struct i915_request *rq)
647	{
648	struct intel_context * const ce = rq->context;
649
650	trace_i915_request_out(rq);
651
652	GEM_BUG_ON(!ce->inflight);
653	ce->inflight = ptr_dec(ce->inflight);
654	if (!__intel_context_inflight_count(ce->inflight))
655	__execlists_schedule_out(rq, ce);
656
657	i915_request_put(rq);
658	}
659
660	static u32 map_i915_prio_to_lrc_desc_prio(int prio)
661	{
662	if (prio > I915_PRIORITY_NORMAL)
663	return GEN12_CTX_PRIORITY_HIGH;
664	else if (prio < I915_PRIORITY_NORMAL)
665	return GEN12_CTX_PRIORITY_LOW;
666	else
667	return GEN12_CTX_PRIORITY_NORMAL;
668	}
669
670	static u64 execlists_update_context(struct i915_request *rq)
671	{
672	struct intel_context *ce = rq->context;
673	u64 desc;
674	u32 tail, prev;
675
676	desc = ce->lrc.desc;
677	if (rq->engine->flags & I915_ENGINE_HAS_EU_PRIORITY)
678	desc \|= map_i915_prio_to_lrc_desc_prio(prio: rq_prio(rq));
679
680	/*
681	* WaIdleLiteRestore:bdw,skl
682	*
683	* We should never submit the context with the same RING_TAIL twice
684	* just in case we submit an empty ring, which confuses the HW.
685	*
686	* We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
687	* the normal request to be able to always advance the RING_TAIL on
688	* subsequent resubmissions (for lite restore). Should that fail us,
689	* and we try and submit the same tail again, force the context
690	* reload.
691	*
692	* If we need to return to a preempted context, we need to skip the
693	* lite-restore and force it to reload the RING_TAIL. Otherwise, the
694	* HW has a tendency to ignore us rewinding the TAIL to the end of
695	* an earlier request.
696	*/
697	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
698	prev = rq->ring->tail;
699	tail = intel_ring_set_tail(ring: rq->ring, tail: rq->tail);
700	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= `0`))
701	desc \|= CTX_DESC_FORCE_RESTORE;
702	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
703	rq->tail = rq->wa_tail;
704
705	/*
706	* Make sure the context image is complete before we submit it to HW.
707	*
708	* Ostensibly, writes (including the WCB) should be flushed prior to
709	* an uncached write such as our mmio register access, the empirical
710	* evidence (esp. on Braswell) suggests that the WC write into memory
711	* may not be visible to the HW prior to the completion of the UC
712	* register write and that we may begin execution from the context
713	* before its image is complete leading to invalid PD chasing.
714	*/
715	wmb();
716
717	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
718	return desc;
719	}
720
721	static void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
722	{
723	if (execlists->ctrl_reg) {
724	writel(lower_32_bits(desc), addr: execlists->submit_reg + port * `2`);
725	writel(upper_32_bits(desc), addr: execlists->submit_reg + port * `2` + `1`);
726	} else {
727	writel(upper_32_bits(desc), addr: execlists->submit_reg);
728	writel(lower_32_bits(desc), addr: execlists->submit_reg);
729	}
730	}
731
732	static __maybe_unused char *
733	dump_port(char buf, int* buflen, const char prefix, struct* i915_request *rq)
734	{
735	if (!rq)
736	return "";
737
738	snprintf(buf, size: buflen, fmt: "%sccid:%x %llx:%lld%s prio %d",
739	prefix,
740	rq->context->lrc.ccid,
741	rq->fence.context, rq->fence.seqno,
742	__i915_request_is_complete(rq) ? "!" :
743	__i915_request_has_started(rq) ? "*" :
744	"",
745	rq_prio(rq));
746
747	return buf;
748	}
749
750	static __maybe_unused noinline void
751	trace_ports(const struct intel_engine_execlists *execlists,
752	const char *msg,
753	struct i915_request * const *ports)
754	{
755	const struct intel_engine_cs *engine =
756	container_of(execlists, typeof(*engine), execlists);
757	char __maybe_unused p0[`40`], p1[`40`];
758
759	if (!ports[`0`])
760	return;
761
762	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
763	dump_port(p0, sizeof(p0), "", ports[`0`]),
764	dump_port(p1, sizeof(p1), ", ", ports[`1`]));
765	}
766
767	static bool
768	reset_in_progress(const struct intel_engine_cs *engine)
769	{
770	return unlikely(!__tasklet_is_enabled(&engine->sched_engine->tasklet));
771	}
772
773	static __maybe_unused noinline bool
774	assert_pending_valid(const struct intel_engine_execlists *execlists,
775	const char *msg)
776	{
777	struct intel_engine_cs *engine =
778	container_of(execlists, typeof(*engine), execlists);
779	struct i915_request * const port, rq, *prev = NULL;
780	struct intel_context *ce = NULL;
781	u32 ccid = -`1`;
782
783	trace_ports(execlists, msg, ports: execlists->pending);
784
785	/ We may be messing around with the lists during reset, lalala /
786	if (reset_in_progress(engine))
787	return true;
788
789	if (!execlists->pending[`0`]) {
790	GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
791	engine->name);
792	return false;
793	}
794
795	if (execlists->pending[execlists_num_ports(execlists)]) {
796	GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
797	engine->name, execlists_num_ports(execlists));
798	return false;
799	}
800
801	for (port = execlists->pending; (rq = *port); port++) {
802	unsigned long flags;
803	bool ok = true;
804
805	GEM_BUG_ON(!kref_read(&rq->fence.refcount));
806	GEM_BUG_ON(!i915_request_is_active(rq));
807
808	if (ce == rq->context) {
809	GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
810	engine->name,
811	ce->timeline->fence_context,
812	port - execlists->pending);
813	return false;
814	}
815	ce = rq->context;
816
817	if (ccid == ce->lrc.ccid) {
818	GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
819	engine->name,
820	ccid, ce->timeline->fence_context,
821	port - execlists->pending);
822	return false;
823	}
824	ccid = ce->lrc.ccid;
825
826	/*
827	* Sentinels are supposed to be the last request so they flush
828	* the current execution off the HW. Check that they are the only
829	* request in the pending submission.
830	*
831	* NB: Due to the async nature of preempt-to-busy and request
832	* cancellation we need to handle the case where request
833	* becomes a sentinel in parallel to CSB processing.
834	*/
835	if (prev && i915_request_has_sentinel(rq: prev) &&
836	!READ_ONCE(prev->fence.error)) {
837	GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
838	engine->name,
839	ce->timeline->fence_context,
840	port - execlists->pending);
841	return false;
842	}
843	prev = rq;
844
845	/*
846	* We want virtual requests to only be in the first slot so
847	* that they are never stuck behind a hog and can be immediately
848	* transferred onto the next idle engine.
849	*/
850	if (rq->execution_mask != engine->mask &&
851	port != execlists->pending) {
852	GEM_TRACE_ERR("%s: virtual engine:%llx not in prime position[%zd]\n",
853	engine->name,
854	ce->timeline->fence_context,
855	port - execlists->pending);
856	return false;
857	}
858
859	/ Hold tightly onto the lock to prevent concurrent retires! /
860	if (!spin_trylock_irqsave(&rq->lock, flags))
861	continue;
862
863	if (__i915_request_is_complete(rq))
864	goto unlock;
865
866	if (i915_active_is_idle(ref: &ce->active) &&
867	!intel_context_is_barrier(ce)) {
868	GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
869	engine->name,
870	ce->timeline->fence_context,
871	port - execlists->pending);
872	ok = false;
873	goto unlock;
874	}
875
876	if (!i915_vma_is_pinned(vma: ce->state)) {
877	GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
878	engine->name,
879	ce->timeline->fence_context,
880	port - execlists->pending);
881	ok = false;
882	goto unlock;
883	}
884
885	if (!i915_vma_is_pinned(vma: ce->ring->vma)) {
886	GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
887	engine->name,
888	ce->timeline->fence_context,
889	port - execlists->pending);
890	ok = false;
891	goto unlock;
892	}
893
894	unlock:
895	spin_unlock_irqrestore(lock: &rq->lock, flags);
896	if (!ok)
897	return false;
898	}
899
900	return ce;
901	}
902
903	static void execlists_submit_ports(struct intel_engine_cs *engine)
904	{
905	struct intel_engine_execlists *execlists = &engine->execlists;
906	unsigned int n;
907
908	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
909
910	/*
911	* We can skip acquiring intel_runtime_pm_get() here as it was taken
912	* on our behalf by the request (see i915_gem_mark_busy()) and it will
913	* not be relinquished until the device is idle (see
914	* i915_gem_idle_work_handler()). As a precaution, we make sure
915	* that all ELSP are drained i.e. we have processed the CSB,
916	* before allowing ourselves to idle and calling intel_runtime_pm_put().
917	*/
918	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
919
920	/*
921	* ELSQ note: the submit queue is not cleared after being submitted
922	* to the HW so we need to make sure we always clean it up. This is
923	* currently ensured by the fact that we always write the same number
924	* of elsq entries, keep this in mind before changing the loop below.
925	*/
926	for (n = execlists_num_ports(execlists); n--; ) {
927	struct i915_request *rq = execlists->pending[n];
928
929	write_desc(execlists,
930	desc: rq ? execlists_update_context(rq) : `0`,
931	port: n);
932	}
933
934	/ we need to manually load the submit queue /
935	if (execlists->ctrl_reg)
936	writel(EL_CTRL_LOAD, addr: execlists->ctrl_reg);
937	}
938
939	static bool ctx_single_port_submission(const struct intel_context *ce)
940	{
941	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
942	intel_context_force_single_submission(ce));
943	}
944
945	static bool can_merge_ctx(const struct intel_context *prev,
946	const struct intel_context *next)
947	{
948	if (prev != next)
949	return false;
950
951	if (ctx_single_port_submission(ce: prev))
952	return false;
953
954	return true;
955	}
956
957	static unsigned long i915_request_flags(const struct i915_request *rq)
958	{
959	return READ_ONCE(rq->fence.flags);
960	}
961
962	static bool can_merge_rq(const struct i915_request *prev,
963	const struct i915_request *next)
964	{
965	GEM_BUG_ON(prev == next);
966	GEM_BUG_ON(!assert_priority_queue(prev, next));
967
968	/*
969	* We do not submit known completed requests. Therefore if the next
970	* request is already completed, we can pretend to merge it in
971	* with the previous context (and we will skip updating the ELSP
972	* and tracking). Thus hopefully keeping the ELSP full with active
973	* contexts, despite the best efforts of preempt-to-busy to confuse
974	* us.
975	*/
976	if (__i915_request_is_complete(rq: next))
977	return true;
978
979	if (unlikely((i915_request_flags(prev) \| i915_request_flags(next)) &
980	(BIT(I915_FENCE_FLAG_NOPREEMPT) \|
981	BIT(I915_FENCE_FLAG_SENTINEL))))
982	return false;
983
984	if (!can_merge_ctx(prev: prev->context, next: next->context))
985	return false;
986
987	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
988	return true;
989	}
990
991	static bool virtual_matches(const struct virtual_engine *ve,
992	const struct i915_request *rq,
993	const struct intel_engine_cs *engine)
994	{
995	const struct intel_engine_cs *inflight;
996
997	if (!rq)
998	return false;
999
1000	if (!(rq->execution_mask & engine->mask)) / We peeked too soon! /
1001	return false;
1002
1003	/*
1004	* We track when the HW has completed saving the context image
1005	* (i.e. when we have seen the final CS event switching out of
1006	* the context) and must not overwrite the context image before
1007	* then. This restricts us to only using the active engine
1008	* while the previous virtualized request is inflight (so
1009	* we reuse the register offsets). This is a very small
1010	* hystersis on the greedy seelction algorithm.
1011	*/
1012	inflight = intel_context_inflight(&ve->context);
1013	if (inflight && inflight != engine)
1014	return false;
1015
1016	return true;
1017	}
1018
1019	static struct virtual_engine *
1020	first_virtual_engine(struct intel_engine_cs *engine)
1021	{
1022	struct intel_engine_execlists *el = &engine->execlists;
1023	struct rb_node *rb = rb_first_cached(&el->virtual);
1024
1025	while (rb) {
1026	struct virtual_engine *ve =
1027	rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1028	struct i915_request *rq = READ_ONCE(ve->request);
1029
1030	/ lazily cleanup after another engine handled rq /
1031	if (!rq \|\| !virtual_matches(ve, rq, engine)) {
1032	rb_erase_cached(node: rb, root: &el->virtual);
1033	RB_CLEAR_NODE(rb);
1034	rb = rb_first_cached(&el->virtual);
1035	continue;
1036	}
1037
1038	return ve;
1039	}
1040
1041	return NULL;
1042	}
1043
1044	static void virtual_xfer_context(struct virtual_engine *ve,
1045	struct intel_engine_cs *engine)
1046	{
1047	unsigned int n;
1048
1049	if (likely(engine == ve->siblings[`0`]))
1050	return;
1051
1052	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1053	if (!intel_engine_has_relative_mmio(engine))
1054	lrc_update_offsets(ce: &ve->context, engine);
1055
1056	/*
1057	* Move the bound engine to the top of the list for
1058	* future execution. We then kick this tasklet first
1059	* before checking others, so that we preferentially
1060	* reuse this set of bound registers.
1061	*/
1062	for (n = `1`; n < ve->num_siblings; n++) {
1063	if (ve->siblings[n] == engine) {
1064	swap(ve->siblings[n], ve->siblings[`0`]);
1065	break;
1066	}
1067	}
1068	}
1069
1070	static void defer_request(struct i915_request rq, struct* list_head * const pl)
1071	{
1072	LIST_HEAD(list);
1073
1074	/*
1075	* We want to move the interrupted request to the back of
1076	* the round-robin list (i.e. its priority level), but
1077	* in doing so, we must then move all requests that were in
1078	* flight and were waiting for the interrupted request to
1079	* be run after it again.
1080	*/
1081	do {
1082	struct i915_dependency *p;
1083
1084	GEM_BUG_ON(i915_request_is_active(rq));
1085	list_move_tail(list: &rq->sched.link, head: pl);
1086
1087	for_each_waiter(p, rq) {
1088	struct i915_request *w =
1089	container_of(p->waiter, typeof(*w), sched);
1090
1091	if (p->flags & I915_DEPENDENCY_WEAK)
1092	continue;
1093
1094	/ Leave semaphores spinning on the other engines /
1095	if (w->engine != rq->engine)
1096	continue;
1097
1098	/ No waiter should start before its signaler /
1099	GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1100	__i915_request_has_started(w) &&
1101	!__i915_request_is_complete(rq));
1102
1103	if (!i915_request_is_ready(rq: w))
1104	continue;
1105
1106	if (rq_prio(rq: w) < rq_prio(rq))
1107	continue;
1108
1109	GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1110	GEM_BUG_ON(i915_request_is_active(w));
1111	list_move_tail(list: &w->sched.link, head: &list);
1112	}
1113
1114	rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1115	} while (rq);
1116	}
1117
1118	static void defer_active(struct intel_engine_cs *engine)
1119	{
1120	struct i915_request *rq;
1121
1122	rq = __unwind_incomplete_requests(engine);
1123	if (!rq)
1124	return;
1125
1126	defer_request(rq, pl: i915_sched_lookup_priolist(sched_engine: engine->sched_engine,
1127	prio: rq_prio(rq)));
1128	}
1129
1130	static bool
1131	timeslice_yield(const struct intel_engine_execlists *el,
1132	const struct i915_request *rq)
1133	{
1134	/*
1135	* Once bitten, forever smitten!
1136	*
1137	* If the active context ever busy-waited on a semaphore,
1138	* it will be treated as a hog until the end of its timeslice (i.e.
1139	* until it is scheduled out and replaced by a new submission,
1140	* possibly even its own lite-restore). The HW only sends an interrupt
1141	* on the first miss, and we do know if that semaphore has been
1142	* signaled, or even if it is now stuck on another semaphore. Play
1143	* safe, yield if it might be stuck -- it will be given a fresh
1144	* timeslice in the near future.
1145	*/
1146	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1147	}
1148
1149	static bool needs_timeslice(const struct intel_engine_cs *engine,
1150	const struct i915_request *rq)
1151	{
1152	if (!intel_engine_has_timeslices(engine))
1153	return false;
1154
1155	/ If not currently active, or about to switch, wait for next event /
1156	if (!rq \|\| __i915_request_is_complete(rq))
1157	return false;
1158
1159	/ We do not need to start the timeslice until after the ACK /
1160	if (READ_ONCE(engine->execlists.pending[`0`]))
1161	return false;
1162
1163	/ If ELSP[1] is occupied, always check to see if worth slicing /
1164	if (!list_is_last_rcu(list: &rq->sched.link,
1165	head: &engine->sched_engine->requests)) {
1166	ENGINE_TRACE(engine, "timeslice required for second inflight context\n");
1167	return true;
1168	}
1169
1170	/ Otherwise, ELSP[0] is by itself, but may be waiting in the queue /
1171	if (!i915_sched_engine_is_empty(sched_engine: engine->sched_engine)) {
1172	ENGINE_TRACE(engine, "timeslice required for queue\n");
1173	return true;
1174	}
1175
1176	if (!RB_EMPTY_ROOT(&engine->execlists.virtual.rb_root)) {
1177	ENGINE_TRACE(engine, "timeslice required for virtual\n");
1178	return true;
1179	}
1180
1181	return false;
1182	}
1183
1184	static bool
1185	timeslice_expired(struct intel_engine_cs engine, const* struct i915_request *rq)
1186	{
1187	const struct intel_engine_execlists *el = &engine->execlists;
1188
1189	if (i915_request_has_nopreempt(rq) && __i915_request_has_started(rq))
1190	return false;
1191
1192	if (!needs_timeslice(engine, rq))
1193	return false;
1194
1195	return timer_expired(t: &el->timer) \|\| timeslice_yield(el, rq);
1196	}
1197
1198	static unsigned long timeslice(const struct intel_engine_cs *engine)
1199	{
1200	return READ_ONCE(engine->props.timeslice_duration_ms);
1201	}
1202
1203	static void start_timeslice(struct intel_engine_cs *engine)
1204	{
1205	struct intel_engine_execlists *el = &engine->execlists;
1206	unsigned long duration;
1207
1208	/ Disable the timer if there is nothing to switch to /
1209	duration = `0`;
1210	if (needs_timeslice(engine, rq: *el->active)) {
1211	/ Avoid continually prolonging an active timeslice /
1212	if (timer_active(t: &el->timer)) {
1213	/*
1214	* If we just submitted a new ELSP after an old
1215	* context, that context may have already consumed
1216	* its timeslice, so recheck.
1217	*/
1218	if (!timer_pending(timer: &el->timer))
1219	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
1220	return;
1221	}
1222
1223	duration = timeslice(engine);
1224	}
1225
1226	set_timer_ms(t: &el->timer, timeout: duration);
1227	}
1228
/*
 * Count a preemption event in execlists->preempt_hang.count.
 * NOTE(review): the increment is wrapped in I915_SELFTEST_ONLY(), so it
 * presumably only takes effect in selftest builds -- confirm against the
 * macro definition.
 */
static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}
1233
1234	static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1235	const struct i915_request *rq)
1236	{
1237	if (!rq)
1238	return `0`;
1239
1240	/ Only allow ourselves to force reset the currently active context /
1241	engine->execlists.preempt_target = rq;
1242
1243	/ Force a fast reset for terminated contexts (ignoring sysfs!) /
1244	if (unlikely(intel_context_is_banned(rq->context) \|\| bad_request(rq)))
1245	return INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS;
1246
1247	return READ_ONCE(engine->props.preempt_timeout_ms);
1248	}
1249
1250	static void set_preempt_timeout(struct intel_engine_cs *engine,
1251	const struct i915_request *rq)
1252	{
1253	if (!intel_engine_has_preempt_reset(engine))
1254	return;
1255
1256	set_timer_ms(t: &engine->execlists.preempt,
1257	timeout: active_preempt_timeout(engine, rq));
1258	}
1259
1260	static bool completed(const struct i915_request *rq)
1261	{
1262	if (i915_request_has_sentinel(rq))
1263	return false;
1264
1265	return __i915_request_is_complete(rq);
1266	}
1267
1268	static void execlists_dequeue(struct intel_engine_cs *engine)
1269	{
1270	struct intel_engine_execlists * const execlists = &engine->execlists;
1271	struct i915_sched_engine * const sched_engine = engine->sched_engine;
1272	struct i915_request **port = execlists->pending;
1273	struct i915_request ** const last_port = port + execlists->port_mask;
1274	struct i915_request last, const *active;
1275	struct virtual_engine *ve;
1276	struct rb_node *rb;
1277	bool submit = false;
1278
1279	/*
1280	* Hardware submission is through 2 ports. Conceptually each port
1281	* has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1282	* static for a context, and unique to each, so we only execute
1283	* requests belonging to a single context from each ring. RING_HEAD
1284	* is maintained by the CS in the context image, it marks the place
1285	* where it got up to last time, and through RING_TAIL we tell the CS
1286	* where we want to execute up to this time.
1287	*
1288	* In this list the requests are in order of execution. Consecutive
1289	* requests from the same context are adjacent in the ringbuffer. We
1290	* can combine these requests into a single RING_TAIL update:
1291	*
1292	* RING_HEAD...req1...req2
1293	* ^- RING_TAIL
1294	* since to execute req2 the CS must first execute req1.
1295	*
1296	* Our goal then is to point each port to the end of a consecutive
1297	* sequence of requests as being the most optimal (fewest wake ups
1298	* and context switches) submission.
1299	*/
1300
1301	spin_lock(lock: &sched_engine->lock);
1302
1303	/*
1304	* If the queue is higher priority than the last
1305	* request in the currently active context, submit afresh.
1306	* We will resubmit again afterwards in case we need to split
1307	* the active context to interject the preemption request,
1308	* i.e. we will retrigger preemption following the ack in case
1309	* of trouble.
1310	*
1311	*/
1312	active = execlists->active;
1313	while ((last = *active) && completed(rq: last))
1314	active++;
1315
1316	if (last) {
1317	if (need_preempt(engine, rq: last)) {
1318	ENGINE_TRACE(engine,
1319	"preempting last=%llx:%lld, prio=%d, hint=%d\n",
1320	last->fence.context,
1321	last->fence.seqno,
1322	last->sched.attr.priority,
1323	sched_engine->queue_priority_hint);
1324	record_preemption(execlists);
1325
1326	/*
1327	* Don't let the RING_HEAD advance past the breadcrumb
1328	* as we unwind (and until we resubmit) so that we do
1329	* not accidentally tell it to go backwards.
1330	*/
1331	ring_set_paused(engine, state: `1`);
1332
1333	/*
1334	* Note that we have not stopped the GPU at this point,
1335	* so we are unwinding the incomplete requests as they
1336	* remain inflight and so by the time we do complete
1337	* the preemption, some of the unwound requests may
1338	* complete!
1339	*/
1340	__unwind_incomplete_requests(engine);
1341
1342	last = NULL;
1343	} else if (timeslice_expired(engine, rq: last)) {
1344	ENGINE_TRACE(engine,
1345	"expired:%s last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
1346	str_yes_no(timer_expired(&execlists->timer)),
1347	last->fence.context, last->fence.seqno,
1348	rq_prio(last),
1349	sched_engine->queue_priority_hint,
1350	str_yes_no(timeslice_yield(execlists, last)));
1351
1352	/*
1353	* Consume this timeslice; ensure we start a new one.
1354	*
1355	* The timeslice expired, and we will unwind the
1356	* running contexts and recompute the next ELSP.
1357	* If that submit will be the same pair of contexts
1358	* (due to dependency ordering), we will skip the
1359	* submission. If we don't cancel the timer now,
1360	* we will see that the timer has expired and
1361	* reschedule the tasklet; continually until the
1362	* next context switch or other preemption event.
1363	*
1364	* Since we have decided to reschedule based on
1365	* consumption of this timeslice, if we submit the
1366	* same context again, grant it a full timeslice.
1367	*/
1368	cancel_timer(t: &execlists->timer);
1369	ring_set_paused(engine, state: `1`);
1370	defer_active(engine);
1371
1372	/*
1373	* Unlike for preemption, if we rewind and continue
1374	* executing the same context as previously active,
1375	* the order of execution will remain the same and
1376	* the tail will only advance. We do not need to
1377	* force a full context restore, as a lite-restore
1378	* is sufficient to resample the monotonic TAIL.
1379	*
1380	* If we switch to any other context, similarly we
1381	* will not rewind TAIL of current context, and
1382	* normal save/restore will preserve state and allow
1383	* us to later continue executing the same request.
1384	*/
1385	last = NULL;
1386	} else {
1387	/*
1388	* Otherwise if we already have a request pending
1389	* for execution after the current one, we can
1390	* just wait until the next CS event before
1391	* queuing more. In either case we will force a
1392	* lite-restore preemption event, but if we wait
1393	* we hopefully coalesce several updates into a single
1394	* submission.
1395	*/
1396	if (active[`1`]) {
1397	/*
1398	* Even if ELSP[1] is occupied and not worthy
1399	* of timeslices, our queue might be.
1400	*/
1401	spin_unlock(lock: &sched_engine->lock);
1402	return;
1403	}
1404	}
1405	}
1406
1407	/ XXX virtual is always taking precedence /
1408	while ((ve = first_virtual_engine(engine))) {
1409	struct i915_request *rq;
1410
1411	spin_lock(lock: &ve->base.sched_engine->lock);
1412
1413	rq = ve->request;
1414	if (unlikely(!virtual_matches(ve, rq, engine)))
1415	goto unlock; / lost the race to a sibling /
1416
1417	GEM_BUG_ON(rq->engine != &ve->base);
1418	GEM_BUG_ON(rq->context != &ve->context);
1419
1420	if (unlikely(rq_prio(rq) < queue_prio(sched_engine))) {
1421	spin_unlock(lock: &ve->base.sched_engine->lock);
1422	break;
1423	}
1424
1425	if (last && !can_merge_rq(prev: last, next: rq)) {
1426	spin_unlock(lock: &ve->base.sched_engine->lock);
1427	spin_unlock(lock: &engine->sched_engine->lock);
1428	return; / leave this for another sibling /
1429	}
1430
1431	ENGINE_TRACE(engine,
1432	"virtual rq=%llx:%lld%s, new engine? %s\n",
1433	rq->fence.context,
1434	rq->fence.seqno,
1435	__i915_request_is_complete(rq) ? "!" :
1436	__i915_request_has_started(rq) ? "*" :
1437	"",
1438	str_yes_no(engine != ve->siblings[`0`]));
1439
1440	WRITE_ONCE(ve->request, NULL);
1441	WRITE_ONCE(ve->base.sched_engine->queue_priority_hint, INT_MIN);
1442
1443	rb = &ve->nodes[engine->id].rb;
1444	rb_erase_cached(node: rb, root: &execlists->virtual);
1445	RB_CLEAR_NODE(rb);
1446
1447	GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1448	WRITE_ONCE(rq->engine, engine);
1449
1450	if (__i915_request_submit(request: rq)) {
1451	/*
1452	* Only after we confirm that we will submit
1453	* this request (i.e. it has not already
1454	* completed), do we want to update the context.
1455	*
1456	* This serves two purposes. It avoids
1457	* unnecessary work if we are resubmitting an
1458	* already completed request after timeslicing.
1459	* But more importantly, it prevents us altering
1460	* ve->siblings[] on an idle context, where
1461	* we may be using ve->siblings[] in
1462	* virtual_context_enter / virtual_context_exit.
1463	*/
1464	virtual_xfer_context(ve, engine);
1465	GEM_BUG_ON(ve->siblings[`0`] != engine);
1466
1467	submit = true;
1468	last = rq;
1469	}
1470
1471	i915_request_put(rq);
1472	unlock:
1473	spin_unlock(lock: &ve->base.sched_engine->lock);
1474
1475	/*
1476	* Hmm, we have a bunch of virtual engine requests,
1477	* but the first one was already completed (thanks
1478	* preempt-to-busy!). Keep looking at the veng queue
1479	* until we have no more relevant requests (i.e.
1480	* the normal submit queue has higher priority).
1481	*/
1482	if (submit)
1483	break;
1484	}
1485
1486	while ((rb = rb_first_cached(&sched_engine->queue))) {
1487	struct i915_priolist *p = to_priolist(rb);
1488	struct i915_request rq, rn;
1489
1490	priolist_for_each_request_consume(rq, rn, p) {
1491	bool merge = true;
1492
1493	/*
1494	* Can we combine this request with the current port?
1495	* It has to be the same context/ringbuffer and not
1496	* have any exceptions (e.g. GVT saying never to
1497	* combine contexts).
1498	*
1499	* If we can combine the requests, we can execute both
1500	* by updating the RING_TAIL to point to the end of the
1501	* second request, and so we never need to tell the
1502	* hardware about the first.
1503	*/
1504	if (last && !can_merge_rq(prev: last, next: rq)) {
1505	/*
1506	* If we are on the second port and cannot
1507	* combine this request with the last, then we
1508	* are done.
1509	*/
1510	if (port == last_port)
1511	goto done;
1512
1513	/*
1514	* We must not populate both ELSP[] with the
1515	* same LRCA, i.e. we must submit 2 different
1516	* contexts if we submit 2 ELSP.
1517	*/
1518	if (last->context == rq->context)
1519	goto done;
1520
1521	if (i915_request_has_sentinel(rq: last))
1522	goto done;
1523
1524	/*
1525	* We avoid submitting virtual requests into
1526	* the secondary ports so that we can migrate
1527	* the request immediately to another engine
1528	* rather than wait for the primary request.
1529	*/
1530	if (rq->execution_mask != engine->mask)
1531	goto done;
1532
1533	/*
1534	* If GVT overrides us we only ever submit
1535	* port[0], leaving port[1] empty. Note that we
1536	* also have to be careful that we don't queue
1537	* the same context (even though a different
1538	* request) to the second port.
1539	*/
1540	if (ctx_single_port_submission(ce: last->context) \|\|
1541	ctx_single_port_submission(ce: rq->context))
1542	goto done;
1543
1544	merge = false;
1545	}
1546
1547	if (__i915_request_submit(request: rq)) {
1548	if (!merge) {
1549	*port++ = i915_request_get(rq: last);
1550	last = NULL;
1551	}
1552
1553	GEM_BUG_ON(last &&
1554	!can_merge_ctx(last->context,
1555	rq->context));
1556	GEM_BUG_ON(last &&
1557	i915_seqno_passed(last->fence.seqno,
1558	rq->fence.seqno));
1559
1560	submit = true;
1561	last = rq;
1562	}
1563	}
1564
1565	rb_erase_cached(node: &p->node, root: &sched_engine->queue);
1566	i915_priolist_free(p);
1567	}
1568	done:
1569	*port++ = i915_request_get(rq: last);
1570
1571	/*
1572	* Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1573	*
1574	* We choose the priority hint such that if we add a request of greater
1575	* priority than this, we kick the submission tasklet to decide on
1576	* the right order of submitting the requests to hardware. We must
1577	* also be prepared to reorder requests as they are in-flight on the
1578	* HW. We derive the priority hint then as the first "hole" in
1579	* the HW submission ports and if there are no available slots,
1580	* the priority of the lowest executing request, i.e. last.
1581	*
1582	* When we do receive a higher priority request ready to run from the
1583	* user, see queue_request(), the priority hint is bumped to that
1584	* request triggering preemption on the next dequeue (or subsequent
1585	* interrupt for secondary ports).
1586	*/
1587	sched_engine->queue_priority_hint = queue_prio(sched_engine);
1588	i915_sched_engine_reset_on_empty(sched_engine);
1589	spin_unlock(lock: &sched_engine->lock);
1590
1591	/*
1592	* We can skip poking the HW if we ended up with exactly the same set
1593	* of requests as currently running, e.g. trying to timeslice a pair
1594	* of ordered contexts.
1595	*/
1596	if (submit &&
1597	memcmp(active,
1598	execlists->pending,
1599	(port - execlists->pending) * sizeof(*port))) {
1600	*port = NULL;
1601	while (port-- != execlists->pending)
1602	execlists_schedule_in(rq: *port, idx: port - execlists->pending);
1603
1604	WRITE_ONCE(execlists->yield, -`1`);
1605	set_preempt_timeout(engine, rq: *active);
1606	execlists_submit_ports(engine);
1607	} else {
1608	ring_set_paused(engine, state: `0`);
1609	while (port-- != execlists->pending)
1610	i915_request_put(rq: *port);
1611	*execlists->pending = NULL;
1612	}
1613	}
1614
/* Run execlists_dequeue() with local interrupts disabled. */
static void execlists_dequeue_irq(struct intel_engine_cs *engine)
{
	local_irq_disable(); /* Suspend interrupts across request submission */
	execlists_dequeue(engine);
	local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */
}
1621
1622	static void clear_ports(struct i915_request *ports, int* count)
1623	{
1624	memset_p(p: (void **)ports, NULL, n: count);
1625	}
1626
/*
 * Copy @count request pointers from @src to @dst, one WRITE_ONCE() per
 * slot so that each pointer store cannot be torn.
 */
static void
copy_ports(struct i915_request **dst, struct i915_request **src, int count)
{
	/* A memcpy_p() would be very useful here! */
	while (count--)
		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
}
1634
1635	static struct i915_request **
1636	cancel_port_requests(struct intel_engine_execlists * const execlists,
1637	struct i915_request **inactive)
1638	{
1639	struct i915_request * const *port;
1640
1641	for (port = execlists->pending; *port; port++)
1642	inactive++ = port;
1643	clear_ports(ports: execlists->pending, ARRAY_SIZE(execlists->pending));
1644
1645	/ Mark the end of active before we overwrite active /*
1646	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
1647	inactive++ = port;
1648	clear_ports(ports: execlists->inflight, ARRAY_SIZE(execlists->inflight));
1649
1650	smp_wmb(); / complete the seqlock for execlists_active() /
1651	WRITE_ONCE(execlists->active, execlists->inflight);
1652
1653	/ Having cancelled all outstanding process_csb(), stop their timers /
1654	GEM_BUG_ON(execlists->pending[`0`]);
1655	cancel_timer(t: &execlists->timer);
1656	cancel_timer(t: &execlists->preempt);
1657
1658	return inactive;
1659	}
1660
1661	/*
1662	* Starting with Gen12, the status has a new format:
1663	*
1664	* bit 0: switched to new queue
1665	* bit 1: reserved
1666	* bit 2: semaphore wait mode (poll or signal), only valid when
1667	* switch detail is set to "wait on semaphore"
1668	* bits 3-5: engine class
1669	* bits 6-11: engine instance
1670	* bits 12-14: reserved
1671	* bits 15-25: sw context id of the lrc the GT switched to
1672	* bits 26-31: sw counter of the lrc the GT switched to
1673	* bits 32-35: context switch detail
1674	* - 0: ctx complete
1675	* - 1: wait on sync flip
1676	* - 2: wait on vblank
1677	* - 3: wait on scanline
1678	* - 4: wait on semaphore
1679	* - 5: context preempted (not on SEMAPHORE_WAIT or
1680	* WAIT_FOR_EVENT)
1681	* bit 36: reserved
1682	* bits 37-43: wait detail (for switch detail 1 to 4)
1683	* bits 44-46: reserved
1684	* bits 47-57: sw context id of the lrc the GT switched away from
1685	* bits 58-63: sw counter of the lrc the GT switched away from
1686	*
1687	* Xe_HP csb shuffles things around compared to TGL:
1688	*
1689	* bits 0-3: context switch detail (same possible values as TGL)
1690	* bits 4-9: engine instance
1691	* bits 10-25: sw context id of the lrc the GT switched to
1692	* bits 26-31: sw counter of the lrc the GT switched to
1693	* bit 32: semaphore wait mode (poll or signal), Only valid when
1694	* switch detail is set to "wait on semaphore"
1695	* bit 33: switched to new queue
1696	* bits 34-41: wait detail (for switch detail 1 to 4)
1697	* bits 42-57: sw context id of the lrc the GT switched away from
1698	* bits 58-63: sw counter of the lrc the GT switched away from
1699	*/
1700	static inline bool
1701	__gen12_csb_parse(bool ctx_to_valid, bool ctx_away_valid, bool new_queue,
1702	u8 switch_detail)
1703	{
1704	/*
1705	* The context switch detail is not guaranteed to be 5 when a preemption
1706	* occurs, so we can't just check for that. The check below works for
1707	* all the cases we care about, including preemptions of WAIT
1708	* instructions and lite-restore. Preempt-to-idle via the CTRL register
1709	* would require some extra handling, but we don't support that.
1710	*/
1711	if (!ctx_away_valid \|\| new_queue) {
1712	GEM_BUG_ON(!ctx_to_valid);
1713	return true;
1714	}
1715
1716	/*
1717	* switch detail = 5 is covered by the case above and we do not expect a
1718	* context switch on an unsuccessful wait instruction since we always
1719	* use polling mode.
1720	*/
1721	GEM_BUG_ON(switch_detail);
1722	return false;
1723	}
1724
1725	static bool xehp_csb_parse(const u64 csb)
1726	{
1727	return __gen12_csb_parse(XEHP_CSB_CTX_VALID(lower_32_bits(csb)), / cxt to /
1728	XEHP_CSB_CTX_VALID(upper_32_bits(csb)), / cxt away /
1729	upper_32_bits(csb) & XEHP_CTX_STATUS_SWITCHED_TO_NEW_QUEUE,
1730	GEN12_CTX_SWITCH_DETAIL(lower_32_bits(csb)));
1731	}
1732
1733	static bool gen12_csb_parse(const u64 csb)
1734	{
1735	return __gen12_csb_parse(GEN12_CSB_CTX_VALID(lower_32_bits(csb)), / cxt to /
1736	GEN12_CSB_CTX_VALID(upper_32_bits(csb)), / cxt away /
1737	lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE,
1738	GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
1739	}
1740
1741	static bool gen8_csb_parse(const u64 csb)
1742	{
1743	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE \| GEN8_CTX_STATUS_PREEMPTED);
1744	}
1745
1746	static noinline u64
1747	wa_csb_read(const struct intel_engine_cs engine, u64 const csb)
1748	{
1749	u64 entry;
1750
1751	/*
1752	* Reading from the HWSP has one particular advantage: we can detect
1753	* a stale entry. Since the write into HWSP is broken, we have no reason
1754	* to trust the HW at all, the mmio entry may equally be unordered, so
1755	* we prefer the path that is self-checking and as a last resort,
1756	* return the mmio value.
1757	*
1758	* tgl,dg1:HSDES#22011327657
1759	*/
1760	preempt_disable();
1761	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -`1`, `10`)) {
1762	int idx = csb - engine->execlists.csb_status;
1763	int status;
1764
1765	status = GEN8_EXECLISTS_STATUS_BUF;
1766	if (idx >= `6`) {
1767	status = GEN11_EXECLISTS_STATUS_BUF2;
1768	idx -= `6`;
1769	}
1770	status += sizeof(u64) * idx;
1771
1772	entry = intel_uncore_read64(uncore: engine->uncore,
1773	_MMIO(engine->mmio_base + status));
1774	}
1775	preempt_enable();
1776
1777	return entry;
1778	}
1779
1780	static u64 csb_read(const struct intel_engine_cs engine, u64 const csb)
1781	{
1782	u64 entry = READ_ONCE(*csb);
1783
1784	/*
1785	* Unfortunately, the GPU does not always serialise its write
1786	* of the CSB entries before its write of the CSB pointer, at least
1787	* from the perspective of the CPU, using what is known as a Global
1788	* Observation Point. We may read a new CSB tail pointer, but then
1789	* read the stale CSB entries, causing us to misinterpret the
1790	* context-switch events, and eventually declare the GPU hung.
1791	*
1792	* icl:HSDES#1806554093
1793	* tgl:HSDES#22011248461
1794	*/
1795	if (unlikely(entry == -`1`))
1796	entry = wa_csb_read(engine, csb);
1797
1798	/ Consume this entry so that we can spot its future reuse. /
1799	WRITE_ONCE(*csb, -`1`);
1800
1801	/ ELSP is an implicit wmb() before the GPU wraps and overwrites csb /
1802	return entry;
1803	}
1804
1805	static void new_timeslice(struct intel_engine_execlists *el)
1806	{
1807	/ By cancelling, we will start afresh in start_timeslice() /
1808	cancel_timer(t: &el->timer);
1809	}
1810
1811	static struct i915_request **
1812	process_csb(struct intel_engine_cs engine, struct* i915_request **inactive)
1813	{
1814	struct intel_engine_execlists * const execlists = &engine->execlists;
1815	u64 * const buf = execlists->csb_status;
1816	const u8 num_entries = execlists->csb_size;
1817	struct i915_request **prev;
1818	u8 head, tail;
1819
1820	/*
1821	* As we modify our execlists state tracking we require exclusive
1822	* access. Either we are inside the tasklet, or the tasklet is disabled
1823	* and we assume that is only inside the reset paths and so serialised.
1824	*/
1825	GEM_BUG_ON(!tasklet_is_locked(&engine->sched_engine->tasklet) &&
1826	!reset_in_progress(engine));
1827
1828	/*
1829	* Note that csb_write, csb_status may be either in HWSP or mmio.
1830	* When reading from the csb_write mmio register, we have to be
1831	* careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1832	* the low 4bits. As it happens we know the next 4bits are always
1833	* zero and so we can simply masked off the low u8 of the register
1834	* and treat it identically to reading from the HWSP (without having
1835	* to use explicit shifting and masking, and probably bifurcating
1836	* the code to handle the legacy mmio read).
1837	*/
1838	head = execlists->csb_head;
1839	tail = READ_ONCE(*execlists->csb_write);
1840	if (unlikely(head == tail))
1841	return inactive;
1842
1843	/*
1844	* We will consume all events from HW, or at least pretend to.
1845	*
1846	* The sequence of events from the HW is deterministic, and derived
1847	* from our writes to the ELSP, with a smidgen of variability for
1848	* the arrival of the asynchronous requests wrt to the inflight
1849	* execution. If the HW sends an event that does not correspond with
1850	* the one we are expecting, we have to abandon all hope as we lose
1851	* all tracking of what the engine is actually executing. We will
1852	* only detect we are out of sequence with the HW when we get an
1853	* 'impossible' event because we have already drained our own
1854	* preemption/promotion queue. If this occurs, we know that we likely
1855	* lost track of execution earlier and must unwind and restart, the
1856	* simplest way is by stop processing the event queue and force the
1857	* engine to reset.
1858	*/
1859	execlists->csb_head = tail;
1860	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
1861
1862	/*
1863	* Hopefully paired with a wmb() in HW!
1864	*
1865	* We must complete the read of the write pointer before any reads
1866	* from the CSB, so that we do not see stale values. Without an rmb
1867	* (lfence) the HW may speculatively perform the CSB[] reads before
1868	* we perform the READ_ONCE(*csb_write).
1869	*/
1870	rmb();
1871
1872	/ Remember who was last running under the timer /
1873	prev = inactive;
1874	*prev = NULL;
1875
1876	do {
1877	bool promote;
1878	u64 csb;
1879
1880	if (++head == num_entries)
1881	head = `0`;
1882
1883	/*
1884	* We are flying near dragons again.
1885	*
1886	* We hold a reference to the request in execlist_port[]
1887	* but no more than that. We are operating in softirq
1888	* context and so cannot hold any mutex or sleep. That
1889	* prevents us stopping the requests we are processing
1890	* in port[] from being retired simultaneously (the
1891	* breadcrumb will be complete before we see the
1892	* context-switch). As we only hold the reference to the
1893	* request, any pointer chasing underneath the request
1894	* is subject to a potential use-after-free. Thus we
1895	* store all of the bookkeeping within port[] as
1896	* required, and avoid using unguarded pointers beneath
1897	* request itself. The same applies to the atomic
1898	* status notifier.
1899	*/
1900
1901	csb = csb_read(engine, csb: buf + head);
1902	ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
1903	head, upper_32_bits(csb), lower_32_bits(csb));
1904
1905	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `55`))
1906	promote = xehp_csb_parse(csb);
1907	else if (GRAPHICS_VER(engine->i915) >= `12`)
1908	promote = gen12_csb_parse(csb);
1909	else
1910	promote = gen8_csb_parse(csb);
1911	if (promote) {
1912	struct i915_request * const *old = execlists->active;
1913
1914	if (GEM_WARN_ON(!*execlists->pending)) {
1915	execlists->error_interrupt \|= ERROR_CSB;
1916	break;
1917	}
1918
1919	ring_set_paused(engine, state: `0`);
1920
1921	/ Point active to the new ELSP; prevent overwriting /
1922	WRITE_ONCE(execlists->active, execlists->pending);
1923	smp_wmb(); / notify execlists_active() /
1924
1925	/ cancel old inflight, prepare for switch /
1926	trace_ports(execlists, msg: "preempted", ports: old);
1927	while (*old)
1928	inactive++ = old++;
1929
1930	/ switch pending to inflight /
1931	GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1932	copy_ports(dst: execlists->inflight,
1933	src: execlists->pending,
1934	count: execlists_num_ports(execlists));
1935	smp_wmb(); / complete the seqlock /
1936	WRITE_ONCE(execlists->active, execlists->inflight);
1937
1938	/ XXX Magic delay for tgl /
1939	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
1940
1941	WRITE_ONCE(execlists->pending[`0`], NULL);
1942	} else {
1943	if (GEM_WARN_ON(!*execlists->active)) {
1944	execlists->error_interrupt \|= ERROR_CSB;
1945	break;
1946	}
1947
1948	/ port0 completed, advanced to port1 /
1949	trace_ports(execlists, msg: "completed", ports: execlists->active);
1950
1951	/*
1952	* We rely on the hardware being strongly
1953	* ordered, that the breadcrumb write is
1954	* coherent (visible from the CPU) before the
1955	* user interrupt is processed. One might assume
1956	* that the breadcrumb write being before the
1957	* user interrupt and the CS event for the context
1958	* switch would therefore be before the CS event
1959	* itself...
1960	*/
1961	if (GEM_SHOW_DEBUG() &&
1962	!__i915_request_is_complete(rq: *execlists->active)) {
1963	struct i915_request rq = execlists->active;
1964	const u32 *regs __maybe_unused =
1965	rq->context->lrc_reg_state;
1966
1967	ENGINE_TRACE(engine,
1968	"context completed before request!\n");
1969	ENGINE_TRACE(engine,
1970	"ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
1971	ENGINE_READ(engine, RING_START),
1972	ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
1973	ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
1974	ENGINE_READ(engine, RING_CTL),
1975	ENGINE_READ(engine, RING_MI_MODE));
1976	ENGINE_TRACE(engine,
1977	"rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
1978	i915_ggtt_offset(rq->ring->vma),
1979	rq->head, rq->tail,
1980	rq->fence.context,
1981	lower_32_bits(rq->fence.seqno),
1982	hwsp_seqno(rq));
1983	ENGINE_TRACE(engine,
1984	"ctx:{start:%08x, head:%04x, tail:%04x}, ",
1985	regs[CTX_RING_START],
1986	regs[CTX_RING_HEAD],
1987	regs[CTX_RING_TAIL]);
1988	}
1989
1990	inactive++ = execlists->active++;
1991
1992	GEM_BUG_ON(execlists->active - execlists->inflight >
1993	execlists_num_ports(execlists));
1994	}
1995	} while (head != tail);
1996
1997	/*
1998	* Gen11 has proven to fail wrt global observation point between
1999	* entry and tail update, failing on the ordering and thus
2000	* we see an old entry in the context status buffer.
2001	*
2002	* Forcibly evict out entries for the next gpu csb update,
2003	* to increase the odds that we get a fresh entries with non
2004	* working hardware. The cost for doing so comes out mostly with
2005	* the wash as hardware, working or not, will need to do the
2006	* invalidation before.
2007	*/
2008	drm_clflush_virt_range(addr: &buf[`0`], length: num_entries * sizeof(buf[`0`]));
2009
2010	/*
2011	* We assume that any event reflects a change in context flow
2012	* and merits a fresh timeslice. We reinstall the timer after
2013	* inspecting the queue to see if we need to resumbit.
2014	*/
2015	if (prev != execlists->active) { / elide lite-restores /
2016	struct intel_context prev_ce = NULL, active_ce = NULL;
2017
2018	/*
2019	* Note the inherent discrepancy between the HW runtime,
2020	* recorded as part of the context switch, and the CPU
2021	* adjustment for active contexts. We have to hope that
2022	* the delay in processing the CS event is very small
2023	* and consistent. It works to our advantage to have
2024	* the CPU adjustment _undershoot_ (i.e. start later than)
2025	* the CS timestamp so we never overreport the runtime
2026	* and correct overselves later when updating from HW.
2027	*/
2028	if (*prev)
2029	prev_ce = (*prev)->context;
2030	if (*execlists->active)
2031	active_ce = (*execlists->active)->context;
2032	if (prev_ce != active_ce) {
2033	if (prev_ce)
2034	lrc_runtime_stop(ce: prev_ce);
2035	if (active_ce)
2036	lrc_runtime_start(ce: active_ce);
2037	}
2038	new_timeslice(el: execlists);
2039	}
2040
2041	return inactive;
2042	}
2043
/*
 * Pass every request in the half-open range [port, last) to
 * execlists_schedule_out(), in array order.
 */
static void post_process_csb(struct i915_request **port,
			     struct i915_request **last)
{
	for (; port != last; port++)
		execlists_schedule_out(*port);
}
2050
2051	static void __execlists_hold(struct i915_request *rq)
2052	{
2053	LIST_HEAD(list);
2054
2055	do {
2056	struct i915_dependency *p;
2057
2058	if (i915_request_is_active(rq))
2059	__i915_request_unsubmit(request: rq);
2060
2061	clear_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
2062	list_move_tail(list: &rq->sched.link,
2063	head: &rq->engine->sched_engine->hold);
2064	i915_request_set_hold(rq);
2065	RQ_TRACE(rq, "on hold\n");
2066
2067	for_each_waiter(p, rq) {
2068	struct i915_request *w =
2069	container_of(p->waiter, typeof(*w), sched);
2070
2071	if (p->flags & I915_DEPENDENCY_WEAK)
2072	continue;
2073
2074	/ Leave semaphores spinning on the other engines /
2075	if (w->engine != rq->engine)
2076	continue;
2077
2078	if (!i915_request_is_ready(rq: w))
2079	continue;
2080
2081	if (__i915_request_is_complete(rq: w))
2082	continue;
2083
2084	if (i915_request_on_hold(rq: w))
2085	continue;
2086
2087	list_move_tail(list: &w->sched.link, head: &list);
2088	}
2089
2090	rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2091	} while (rq);
2092	}
2093
2094	static bool execlists_hold(struct intel_engine_cs *engine,
2095	struct i915_request *rq)
2096	{
2097	if (i915_request_on_hold(rq))
2098	return false;
2099
2100	spin_lock_irq(lock: &engine->sched_engine->lock);
2101
2102	if (__i915_request_is_complete(rq)) { / too late! /
2103	rq = NULL;
2104	goto unlock;
2105	}
2106
2107	/*
2108	* Transfer this request onto the hold queue to prevent it
2109	* being resumbitted to HW (and potentially completed) before we have
2110	* released it. Since we may have already submitted following
2111	* requests, we need to remove those as well.
2112	*/
2113	GEM_BUG_ON(i915_request_on_hold(rq));
2114	GEM_BUG_ON(rq->engine != engine);
2115	__execlists_hold(rq);
2116	GEM_BUG_ON(list_empty(&engine->sched_engine->hold));
2117
2118	unlock:
2119	spin_unlock_irq(lock: &engine->sched_engine->lock);
2120	return rq;
2121	}
2122
2123	static bool hold_request(const struct i915_request *rq)
2124	{
2125	struct i915_dependency *p;
2126	bool result = false;
2127
2128	/*
2129	* If one of our ancestors is on hold, we must also be on hold,
2130	* otherwise we will bypass it and execute before it.
2131	*/
2132	rcu_read_lock();
2133	for_each_signaler(p, rq) {
2134	const struct i915_request *s =
2135	container_of(p->signaler, typeof(*s), sched);
2136
2137	if (s->engine != rq->engine)
2138	continue;
2139
2140	result = i915_request_on_hold(rq: s);
2141	if (result)
2142	break;
2143	}
2144	rcu_read_unlock();
2145
2146	return result;
2147	}
2148
2149	static void __execlists_unhold(struct i915_request *rq)
2150	{
2151	LIST_HEAD(list);
2152
2153	do {
2154	struct i915_dependency *p;
2155
2156	RQ_TRACE(rq, "hold release\n");
2157
2158	GEM_BUG_ON(!i915_request_on_hold(rq));
2159	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2160
2161	i915_request_clear_hold(rq);
2162	list_move_tail(list: &rq->sched.link,
2163	head: i915_sched_lookup_priolist(sched_engine: rq->engine->sched_engine,
2164	prio: rq_prio(rq)));
2165	set_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
2166
2167	/ Also release any children on this engine that are ready /
2168	for_each_waiter(p, rq) {
2169	struct i915_request *w =
2170	container_of(p->waiter, typeof(*w), sched);
2171
2172	if (p->flags & I915_DEPENDENCY_WEAK)
2173	continue;
2174
2175	if (w->engine != rq->engine)
2176	continue;
2177
2178	if (!i915_request_on_hold(rq: w))
2179	continue;
2180
2181	/ Check that no other parents are also on hold /
2182	if (hold_request(rq: w))
2183	continue;
2184
2185	list_move_tail(list: &w->sched.link, head: &list);
2186	}
2187
2188	rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2189	} while (rq);
2190	}
2191
2192	static void execlists_unhold(struct intel_engine_cs *engine,
2193	struct i915_request *rq)
2194	{
2195	spin_lock_irq(lock: &engine->sched_engine->lock);
2196
2197	/*
2198	* Move this request back to the priority queue, and all of its
2199	* children and grandchildren that were suspended along with it.
2200	*/
2201	__execlists_unhold(rq);
2202
2203	if (rq_prio(rq) > engine->sched_engine->queue_priority_hint) {
2204	engine->sched_engine->queue_priority_hint = rq_prio(rq);
2205	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
2206	}
2207
2208	spin_unlock_irq(lock: &engine->sched_engine->lock);
2209	}
2210
2211	struct execlists_capture {
2212	struct work_struct work;
2213	struct i915_request *rq;
2214	struct i915_gpu_coredump *error;
2215	};
2216
2217	static void execlists_capture_work(struct work_struct *work)
2218	{
2219	struct execlists_capture cap = container_of(work, typeof(cap), work);
2220	const gfp_t gfp = __GFP_KSWAPD_RECLAIM \| __GFP_RETRY_MAYFAIL \|
2221	__GFP_NOWARN;
2222	struct intel_engine_cs *engine = cap->rq->engine;
2223	struct intel_gt_coredump *gt = cap->error->gt;
2224	struct intel_engine_capture_vma *vma;
2225
2226	/ Compress all the objects attached to the request, slow! /
2227	vma = intel_engine_coredump_add_request(ee: gt->engine, rq: cap->rq, gfp);
2228	if (vma) {
2229	struct i915_vma_compress *compress =
2230	i915_vma_capture_prepare(gt);
2231
2232	intel_engine_coredump_add_vma(ee: gt->engine, capture: vma, compress);
2233	i915_vma_capture_finish(gt, compress);
2234	}
2235
2236	gt->simulated = gt->engine->simulated;
2237	cap->error->simulated = gt->simulated;
2238
2239	/ Publish the error state, and announce it to the world /
2240	i915_error_state_store(error: cap->error);
2241	i915_gpu_coredump_put(gpu: cap->error);
2242
2243	/ Return this request and all that depend upon it for signaling /
2244	execlists_unhold(engine, rq: cap->rq);
2245	i915_request_put(rq: cap->rq);
2246
2247	kfree(objp: cap);
2248	}
2249
2250	static struct execlists_capture capture_regs(struct* intel_engine_cs *engine)
2251	{
2252	const gfp_t gfp = GFP_ATOMIC \| __GFP_NOWARN;
2253	struct execlists_capture *cap;
2254
2255	cap = kmalloc(sizeof(*cap), gfp);
2256	if (!cap)
2257	return NULL;
2258
2259	cap->error = i915_gpu_coredump_alloc(i915: engine->i915, gfp);
2260	if (!cap->error)
2261	goto err_cap;
2262
2263	cap->error->gt = intel_gt_coredump_alloc(gt: engine->gt, gfp, CORE_DUMP_FLAG_NONE);
2264	if (!cap->error->gt)
2265	goto err_gpu;
2266
2267	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp, CORE_DUMP_FLAG_NONE);
2268	if (!cap->error->gt->engine)
2269	goto err_gt;
2270
2271	cap->error->gt->engine->hung = true;
2272
2273	return cap;
2274
2275	err_gt:
2276	kfree(objp: cap->error->gt);
2277	err_gpu:
2278	kfree(objp: cap->error);
2279	err_cap:
2280	kfree(objp: cap);
2281	return NULL;
2282	}
2283
2284	static struct i915_request *
2285	active_context(struct intel_engine_cs *engine, u32 ccid)
2286	{
2287	const struct intel_engine_execlists * const el = &engine->execlists;
2288	struct i915_request * const port, rq;
2289
2290	/*
2291	* Use the most recent result from process_csb(), but just in case
2292	* we trigger an error (via interrupt) before the first CS event has
2293	* been written, peek at the next submission.
2294	*/
2295
2296	for (port = el->active; (rq = *port); port++) {
2297	if (rq->context->lrc.ccid == ccid) {
2298	ENGINE_TRACE(engine,
2299	"ccid:%x found at active:%zd\n",
2300	ccid, port - el->active);
2301	return rq;
2302	}
2303	}
2304
2305	for (port = el->pending; (rq = *port); port++) {
2306	if (rq->context->lrc.ccid == ccid) {
2307	ENGINE_TRACE(engine,
2308	"ccid:%x found at pending:%zd\n",
2309	ccid, port - el->pending);
2310	return rq;
2311	}
2312	}
2313
2314	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
2315	return NULL;
2316	}
2317
2318	static u32 active_ccid(struct intel_engine_cs *engine)
2319	{
2320	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
2321	}
2322
2323	static void execlists_capture(struct intel_engine_cs *engine)
2324	{
2325	struct drm_i915_private *i915 = engine->i915;
2326	struct execlists_capture *cap;
2327
2328	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2329	return;
2330
2331	/*
2332	* We need to _quickly_ capture the engine state before we reset.
2333	* We are inside an atomic section (softirq) here and we are delaying
2334	* the forced preemption event.
2335	*/
2336	cap = capture_regs(engine);
2337	if (!cap)
2338	return;
2339
2340	spin_lock_irq(lock: &engine->sched_engine->lock);
2341	cap->rq = active_context(engine, ccid: active_ccid(engine));
2342	if (cap->rq) {
2343	cap->rq = active_request(tl: cap->rq->context->timeline, rq: cap->rq);
2344	cap->rq = i915_request_get_rcu(rq: cap->rq);
2345	}
2346	spin_unlock_irq(lock: &engine->sched_engine->lock);
2347	if (!cap->rq)
2348	goto err_free;
2349
2350	/*
2351	* Remove the request from the execlists queue, and take ownership
2352	* of the request. We pass it to our worker who will _slowly_ compress
2353	* all the pages the _user_ requested for debugging their batch, after
2354	* which we return it to the queue for signaling.
2355	*
2356	* By removing them from the execlists queue, we also remove the
2357	* requests from being processed by __unwind_incomplete_requests()
2358	* during the intel_engine_reset(), and so they will not be replayed
2359	* afterwards.
2360	*
2361	* Note that because we have not yet reset the engine at this point,
2362	* it is possible for the request that we have identified as being
2363	* guilty, did in fact complete and we will then hit an arbitration
2364	* point allowing the outstanding preemption to succeed. The likelihood
2365	* of that is very low (as capturing of the engine registers should be
2366	* fast enough to run inside an irq-off atomic section!), so we will
2367	* simply hold that request accountable for being non-preemptible
2368	* long enough to force the reset.
2369	*/
2370	if (!execlists_hold(engine, rq: cap->rq))
2371	goto err_rq;
2372
2373	INIT_WORK(&cap->work, execlists_capture_work);
2374	queue_work(wq: i915->unordered_wq, work: &cap->work);
2375	return;
2376
2377	err_rq:
2378	i915_request_put(rq: cap->rq);
2379	err_free:
2380	i915_gpu_coredump_put(gpu: cap->error);
2381	kfree(objp: cap);
2382	}
2383
2384	static void execlists_reset(struct intel_engine_cs engine, const* char *msg)
2385	{
2386	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2387	unsigned long *lock = &engine->gt->reset.flags;
2388
2389	if (!intel_has_reset_engine(gt: engine->gt))
2390	return;
2391
2392	if (test_and_set_bit(nr: bit, addr: lock))
2393	return;
2394
2395	ENGINE_TRACE(engine, "reset for %s\n", msg);
2396
2397	/ Mark this tasklet as disabled to avoid waiting for it to complete /
2398	tasklet_disable_nosync(t: &engine->sched_engine->tasklet);
2399
2400	ring_set_paused(engine, state: `1`); / Freeze the current request in place /
2401	execlists_capture(engine);
2402	intel_engine_reset(engine, reason: msg);
2403
2404	tasklet_enable(t: &engine->sched_engine->tasklet);
2405	clear_and_wake_up_bit(bit, word: lock);
2406	}
2407
2408	static bool preempt_timeout(const struct intel_engine_cs *const engine)
2409	{
2410	const struct timer_list *t = &engine->execlists.preempt;
2411
2412	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2413	return false;
2414
2415	if (!timer_expired(t))
2416	return false;
2417
2418	return engine->execlists.pending[`0`];
2419	}
2420
2421	/*
2422	* Check the unread Context Status Buffers and manage the submission of new
2423	* contexts to the ELSP accordingly.
2424	*/
2425	static void execlists_submission_tasklet(struct tasklet_struct *t)
2426	{
2427	struct i915_sched_engine *sched_engine =
2428	from_tasklet(sched_engine, t, tasklet);
2429	struct intel_engine_cs * const engine = sched_engine->private_data;
2430	struct i915_request post[`2` EXECLIST_MAX_PORTS];
2431	struct i915_request **inactive;
2432
2433	rcu_read_lock();
2434	inactive = process_csb(engine, inactive: post);
2435	GEM_BUG_ON(inactive - post > ARRAY_SIZE(post));
2436
2437	if (unlikely(preempt_timeout(engine))) {
2438	const struct i915_request rq = engine->execlists.active;
2439
2440	/*
2441	* If after the preempt-timeout expired, we are still on the
2442	* same active request/context as before we initiated the
2443	* preemption, reset the engine.
2444	*
2445	* However, if we have processed a CS event to switch contexts,
2446	* but not yet processed the CS event for the pending
2447	* preemption, reset the timer allowing the new context to
2448	* gracefully exit.
2449	*/
2450	cancel_timer(t: &engine->execlists.preempt);
2451	if (rq == engine->execlists.preempt_target)
2452	engine->execlists.error_interrupt \|= ERROR_PREEMPT;
2453	else
2454	set_timer_ms(t: &engine->execlists.preempt,
2455	timeout: active_preempt_timeout(engine, rq));
2456	}
2457
2458	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2459	const char *msg;
2460
2461	/ Generate the error message in priority wrt to the user! /
2462	if (engine->execlists.error_interrupt & GENMASK(`15`, `0`))
2463	msg = "CS error"; / thrown by a user payload /
2464	else if (engine->execlists.error_interrupt & ERROR_CSB)
2465	msg = "invalid CSB event";
2466	else if (engine->execlists.error_interrupt & ERROR_PREEMPT)
2467	msg = "preemption time out";
2468	else
2469	msg = "internal error";
2470
2471	engine->execlists.error_interrupt = `0`;
2472	execlists_reset(engine, msg);
2473	}
2474
2475	if (!engine->execlists.pending[`0`]) {
2476	execlists_dequeue_irq(engine);
2477	start_timeslice(engine);
2478	}
2479
2480	post_process_csb(port: post, last: inactive);
2481	rcu_read_unlock();
2482	}
2483
2484	static void execlists_irq_handler(struct intel_engine_cs *engine, u16 iir)
2485	{
2486	bool tasklet = false;
2487
2488	if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
2489	u32 eir;
2490
2491	/ Upper 16b are the enabling mask, rsvd for internal errors /
2492	eir = ENGINE_READ(engine, RING_EIR) & GENMASK(`15`, `0`);
2493	ENGINE_TRACE(engine, "CS error: %x\n", eir);
2494
2495	/ Disable the error interrupt until after the reset /
2496	if (likely(eir)) {
2497	ENGINE_WRITE(engine, RING_EMR, ~`0u`);
2498	ENGINE_WRITE(engine, RING_EIR, eir);
2499	WRITE_ONCE(engine->execlists.error_interrupt, eir);
2500	tasklet = true;
2501	}
2502	}
2503
2504	if (iir & GT_WAIT_SEMAPHORE_INTERRUPT) {
2505	WRITE_ONCE(engine->execlists.yield,
2506	ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI));
2507	ENGINE_TRACE(engine, "semaphore yield: %08x\n",
2508	engine->execlists.yield);
2509	if (timer_delete(timer: &engine->execlists.timer))
2510	tasklet = true;
2511	}
2512
2513	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
2514	tasklet = true;
2515
2516	if (iir & GT_RENDER_USER_INTERRUPT)
2517	intel_engine_signal_breadcrumbs(engine);
2518
2519	if (tasklet)
2520	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
2521	}
2522
2523	static void __execlists_kick(struct intel_engine_execlists *execlists)
2524	{
2525	struct intel_engine_cs *engine =
2526	container_of(execlists, typeof(*engine), execlists);
2527
2528	/ Kick the tasklet for some interrupt coalescing and reset handling /
2529	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
2530	}
2531
/*
 * Kick the tasklet from a timer callback: @t is the timer_list pointer and
 * @member names the struct intel_engine_execlists field that embeds it.
 */
#define execlists_kick(t, member) \
	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2534
/* Timer callback for execlists.timer: kick the submission tasklet. */
static void execlists_timeslice(struct timer_list *timer)
{
	execlists_kick(timer, timer);
}
2539
2540	static void execlists_preempt(struct timer_list *timer)
2541	{
2542	execlists_kick(timer, preempt);
2543	}
2544
2545	static void queue_request(struct intel_engine_cs *engine,
2546	struct i915_request *rq)
2547	{
2548	GEM_BUG_ON(!list_empty(&rq->sched.link));
2549	list_add_tail(new: &rq->sched.link,
2550	head: i915_sched_lookup_priolist(sched_engine: engine->sched_engine,
2551	prio: rq_prio(rq)));
2552	set_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
2553	}
2554
2555	static bool submit_queue(struct intel_engine_cs *engine,
2556	const struct i915_request *rq)
2557	{
2558	struct i915_sched_engine *sched_engine = engine->sched_engine;
2559
2560	if (rq_prio(rq) <= sched_engine->queue_priority_hint)
2561	return false;
2562
2563	sched_engine->queue_priority_hint = rq_prio(rq);
2564	return true;
2565	}
2566
2567	static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2568	const struct i915_request *rq)
2569	{
2570	GEM_BUG_ON(i915_request_on_hold(rq));
2571	return !list_empty(head: &engine->sched_engine->hold) && hold_request(rq);
2572	}
2573
2574	static void execlists_submit_request(struct i915_request *request)
2575	{
2576	struct intel_engine_cs *engine = request->engine;
2577	unsigned long flags;
2578
2579	/ Will be called from irq-context when using foreign fences. /
2580	spin_lock_irqsave(&engine->sched_engine->lock, flags);
2581
2582	if (unlikely(ancestor_on_hold(engine, request))) {
2583	RQ_TRACE(request, "ancestor on hold\n");
2584	list_add_tail(new: &request->sched.link,
2585	head: &engine->sched_engine->hold);
2586	i915_request_set_hold(rq: request);
2587	} else {
2588	queue_request(engine, rq: request);
2589
2590	GEM_BUG_ON(i915_sched_engine_is_empty(engine->sched_engine));
2591	GEM_BUG_ON(list_empty(&request->sched.link));
2592
2593	if (submit_queue(engine, rq: request))
2594	__execlists_kick(execlists: &engine->execlists);
2595	}
2596
2597	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
2598	}
2599
2600	static int
2601	__execlists_context_pre_pin(struct intel_context *ce,
2602	struct intel_engine_cs *engine,
2603	struct i915_gem_ww_ctx ww, void* **vaddr)
2604	{
2605	int err;
2606
2607	err = lrc_pre_pin(ce, engine, ww, vaddr);
2608	if (err)
2609	return err;
2610
2611	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags)) {
2612	lrc_init_state(ce, engine, state: *vaddr);
2613
2614	__i915_gem_object_flush_map(obj: ce->state->obj, offset: `0`, size: engine->context_size);
2615	}
2616
2617	return `0`;
2618	}
2619
2620	static int execlists_context_pre_pin(struct intel_context *ce,
2621	struct i915_gem_ww_ctx *ww,
2622	void **vaddr)
2623	{
2624	return __execlists_context_pre_pin(ce, engine: ce->engine, ww, vaddr);
2625	}
2626
2627	static int execlists_context_pin(struct intel_context ce, void* *vaddr)
2628	{
2629	return lrc_pin(ce, engine: ce->engine, vaddr);
2630	}
2631
2632	static int execlists_context_alloc(struct intel_context *ce)
2633	{
2634	return lrc_alloc(ce, engine: ce->engine);
2635	}
2636
2637	static void execlists_context_cancel_request(struct intel_context *ce,
2638	struct i915_request *rq)
2639	{
2640	struct intel_engine_cs *engine = NULL;
2641
2642	i915_request_active_engine(rq, active: &engine);
2643
2644	if (engine && intel_engine_pulse(engine))
2645	intel_gt_handle_error(gt: engine->gt, engine_mask: engine->mask, flags: `0`,
2646	fmt: "request cancellation by %s",
2647	current->comm);
2648	}
2649
2650	static struct intel_context *
2651	execlists_create_parallel(struct intel_engine_cs **engines,
2652	unsigned int num_siblings,
2653	unsigned int width)
2654	{
2655	struct intel_context parent = NULL, ce, *err;
2656	int i;
2657
2658	GEM_BUG_ON(num_siblings != `1`);
2659
2660	for (i = `0`; i < width; ++i) {
2661	ce = intel_context_create(engine: engines[i]);
2662	if (IS_ERR(ptr: ce)) {
2663	err = ce;
2664	goto unwind;
2665	}
2666
2667	if (i == `0`)
2668	parent = ce;
2669	else
2670	intel_context_bind_parent_child(parent, child: ce);
2671	}
2672
2673	parent->parallel.fence_context = dma_fence_context_alloc(num: `1`);
2674
2675	intel_context_set_nopreempt(ce: parent);
2676	for_each_child(parent, ce)
2677	intel_context_set_nopreempt(ce);
2678
2679	return parent;
2680
2681	unwind:
2682	if (parent)
2683	intel_context_put(ce: parent);
2684	return err;
2685	}
2686
2687	static const struct intel_context_ops execlists_context_ops = {
2688	.flags = COPS_HAS_INFLIGHT \| COPS_RUNTIME_CYCLES,
2689
2690	.alloc = execlists_context_alloc,
2691
2692	.cancel_request = execlists_context_cancel_request,
2693
2694	.pre_pin = execlists_context_pre_pin,
2695	.pin = execlists_context_pin,
2696	.unpin = lrc_unpin,
2697	.post_unpin = lrc_post_unpin,
2698
2699	.enter = intel_context_enter_engine,
2700	.exit = intel_context_exit_engine,
2701
2702	.reset = lrc_reset,
2703	.destroy = lrc_destroy,
2704
2705	.create_parallel = execlists_create_parallel,
2706	.create_virtual = execlists_create_virtual,
2707	};
2708
2709	static int emit_pdps(struct i915_request *rq)
2710	{
2711	const struct intel_engine_cs * const engine = rq->engine;
2712	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(vm: rq->context->vm);
2713	int err, i;
2714	u32 *cs;
2715
2716	GEM_BUG_ON(intel_vgpu_active(rq->i915));
2717
2718	/*
2719	* Beware ye of the dragons, this sequence is magic!
2720	*
2721	* Small changes to this sequence can cause anything from
2722	* GPU hangs to forcewake errors and machine lockups!
2723	*/
2724
2725	cs = intel_ring_begin(rq, num_dwords: `2`);
2726	if (IS_ERR(ptr: cs))
2727	return PTR_ERR(ptr: cs);
2728
2729	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
2730	*cs++ = MI_NOOP;
2731	intel_ring_advance(rq, cs);
2732
2733	/ Flush any residual operations from the context load /
2734	err = engine->emit_flush(rq, EMIT_FLUSH);
2735	if (err)
2736	return err;
2737
2738	/ Magic required to prevent forcewake errors! /
2739	err = engine->emit_flush(rq, EMIT_INVALIDATE);
2740	if (err)
2741	return err;
2742
2743	cs = intel_ring_begin(rq, num_dwords: `4` * GEN8_3LVL_PDPES + `2`);
2744	if (IS_ERR(ptr: cs))
2745	return PTR_ERR(ptr: cs);
2746
2747	/ Ensure the LRI have landed before we invalidate & continue /
2748	cs++ = MI_LOAD_REGISTER_IMM(`2` GEN8_3LVL_PDPES) \| MI_LRI_FORCE_POSTED;
2749	for (i = GEN8_3LVL_PDPES; i--; ) {
2750	const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, n: i);
2751	u32 base = engine->mmio_base;
2752
2753	*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
2754	*cs++ = upper_32_bits(pd_daddr);
2755	*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
2756	*cs++ = lower_32_bits(pd_daddr);
2757	}
2758	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
2759	intel_ring_advance(rq, cs);
2760
2761	intel_ring_advance(rq, cs);
2762
2763	return `0`;
2764	}
2765
2766	static int execlists_request_alloc(struct i915_request *request)
2767	{
2768	int ret;
2769
2770	GEM_BUG_ON(!intel_context_is_pinned(request->context));
2771
2772	/*
2773	* Flush enough space to reduce the likelihood of waiting after
2774	* we start building the request - in which case we will just
2775	* have to repeat work.
2776	*/
2777	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2778
2779	/*
2780	* Note that after this point, we have committed to using
2781	* this request as it is being used to both track the
2782	* state of engine initialisation and liveness of the
2783	* golden renderstate above. Think twice before you try
2784	* to cancel/unwind this request now.
2785	*/
2786
2787	if (!i915_vm_is_4lvl(vm: request->context->vm)) {
2788	ret = emit_pdps(rq: request);
2789	if (ret)
2790	return ret;
2791	}
2792
2793	/ Unconditionally invalidate GPU caches and TLBs. /
2794	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2795	if (ret)
2796	return ret;
2797
2798	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2799	return `0`;
2800	}
2801
2802	static void reset_csb_pointers(struct intel_engine_cs *engine)
2803	{
2804	struct intel_engine_execlists * const execlists = &engine->execlists;
2805	const unsigned int reset_value = execlists->csb_size - `1`;
2806
2807	ring_set_paused(engine, state: `0`);
2808
2809	/*
2810	* Sometimes Icelake forgets to reset its pointers on a GPU reset.
2811	* Bludgeon them with a mmio update to be sure.
2812	*/
2813	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
2814	`0xffff` << `16` \| reset_value << `8` \| reset_value);
2815	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2816
2817	/*
2818	* After a reset, the HW starts writing into CSB entry [0]. We
2819	* therefore have to set our HEAD pointer back one entry so that
2820	* the first entry we check is entry 0. To complicate this further,
2821	* as we don't wait for the first interrupt after reset, we have to
2822	* fake the HW write to point back to the last entry so that our
2823	* inline comparison of our cached head position against the last HW
2824	* write works even before the first interrupt.
2825	*/
2826	execlists->csb_head = reset_value;
2827	WRITE_ONCE(*execlists->csb_write, reset_value);
2828	wmb(); / Make sure this is visible to HW (paranoia?) /
2829
2830	/ Check that the GPU does indeed update the CSB entries! /
2831	memset(s: execlists->csb_status, c: -`1`, n: (reset_value + `1`) * sizeof(u64));
2832	drm_clflush_virt_range(addr: execlists->csb_status,
2833	length: execlists->csb_size *
2834	sizeof(execlists->csb_status));
2835
2836	/ Once more for luck and our trusty paranoia /
2837	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
2838	`0xffff` << `16` \| reset_value << `8` \| reset_value);
2839	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2840
2841	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
2842	}
2843
2844	static void sanitize_hwsp(struct intel_engine_cs *engine)
2845	{
2846	struct intel_timeline *tl;
2847
2848	list_for_each_entry(tl, &engine->status_page.timelines, engine_link)
2849	intel_timeline_reset_seqno(tl);
2850	}
2851
2852	static void execlists_sanitize(struct intel_engine_cs *engine)
2853	{
2854	GEM_BUG_ON(execlists_active(&engine->execlists));
2855
2856	/*
2857	* Poison residual state on resume, in case the suspend didn't!
2858	*
2859	* We have to assume that across suspend/resume (or other loss
2860	* of control) that the contents of our pinned buffers has been
2861	* lost, replaced by garbage. Since this doesn't always happen,
2862	* let's poison such state so that we more quickly spot when
2863	* we falsely assume it has been preserved.
2864	*/
2865	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2866	memset(s: engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
2867
2868	reset_csb_pointers(engine);
2869
2870	/*
2871	* The kernel_context HWSP is stored in the status_page. As above,
2872	* that may be lost on resume/initialisation, and so we need to
2873	* reset the value in the HWSP.
2874	*/
2875	sanitize_hwsp(engine);
2876
2877	/ And scrub the dirty cachelines for the HWSP /
2878	drm_clflush_virt_range(addr: engine->status_page.addr, PAGE_SIZE);
2879
2880	intel_engine_reset_pinned_contexts(engine);
2881	}
2882
2883	static void enable_error_interrupt(struct intel_engine_cs *engine)
2884	{
2885	u32 status;
2886
2887	engine->execlists.error_interrupt = `0`;
2888	ENGINE_WRITE(engine, RING_EMR, ~`0u`);
2889	ENGINE_WRITE(engine, RING_EIR, ~`0u`); / clear all existing errors /
2890
2891	status = ENGINE_READ(engine, RING_ESR);
2892	if (unlikely(status)) {
2893	drm_err(&engine->i915->drm,
2894	"engine '%s' resumed still in error: %08x\n",
2895	engine->name, status);
2896	intel_gt_reset_engine(engine);
2897	}
2898
2899	/*
2900	* On current gen8+, we have 2 signals to play with
2901	*
2902	* - I915_ERROR_INSTUCTION (bit 0)
2903	*
2904	* Generate an error if the command parser encounters an invalid
2905	* instruction
2906	*
2907	* This is a fatal error.
2908	*
2909	* - CP_PRIV (bit 2)
2910	*
2911	* Generate an error on privilege violation (where the CP replaces
2912	* the instruction with a no-op). This also fires for writes into
2913	* read-only scratch pages.
2914	*
2915	* This is a non-fatal error, parsing continues.
2916	*
2917	* * there are a few others defined for odd HW that we do not use
2918	*
2919	* Since CP_PRIV fires for cases where we have chosen to ignore the
2920	* error (as the HW is validating and suppressing the mistakes), we
2921	* only unmask the instruction error bit.
2922	*/
2923	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
2924	}
2925
2926	static void enable_execlists(struct intel_engine_cs *engine)
2927	{
2928	u32 mode;
2929
2930	assert_forcewakes_active(uncore: engine->uncore, fw_domains: FORCEWAKE_ALL);
2931
2932	intel_engine_set_hwsp_writemask(engine, mask: ~`0u`); / HWSTAM /
2933
2934	if (GRAPHICS_VER(engine->i915) >= `11`)
2935	mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2936	else
2937	mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2938	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2939
2940	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2941
2942	ENGINE_WRITE_FW(engine,
2943	RING_HWS_PGA,
2944	i915_ggtt_offset(engine->status_page.vma));
2945	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2946
2947	enable_error_interrupt(engine);
2948	}
2949
2950	static int execlists_resume(struct intel_engine_cs *engine)
2951	{
2952	intel_mocs_init_engine(engine);
2953	intel_breadcrumbs_reset(b: engine->breadcrumbs);
2954
2955	enable_execlists(engine);
2956
2957	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
2958	xehp_enable_ccs_engines(engine);
2959
2960	return `0`;
2961	}
2962
2963	static void execlists_reset_prepare(struct intel_engine_cs *engine)
2964	{
2965	ENGINE_TRACE(engine, "depth<-%d\n",
2966	atomic_read(&engine->sched_engine->tasklet.count));
2967
2968	/*
2969	* Prevent request submission to the hardware until we have
2970	* completed the reset in i915_gem_reset_finish(). If a request
2971	* is completed by one engine, it may then queue a request
2972	* to a second via its execlists->tasklet just as we are
2973	* calling engine->resume() and also writing the ELSP.
2974	* Turning off the execlists->tasklet until the reset is over
2975	* prevents the race.
2976	*/
2977	__tasklet_disable_sync_once(t: &engine->sched_engine->tasklet);
2978	GEM_BUG_ON(!reset_in_progress(engine));
2979
2980	/*
2981	* We stop engines, otherwise we might get failed reset and a
2982	* dead gpu (on elk). Also as modern gpu as kbl can suffer
2983	* from system hang if batchbuffer is progressing when
2984	* the reset is issued, regardless of READY_TO_RESET ack.
2985	* Thus assume it is best to stop engines on all gens
2986	* where we have a gpu reset.
2987	*
2988	* WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2989	*
2990	* FIXME: Wa for more modern gens needs to be validated
2991	*/
2992	ring_set_paused(engine, state: `1`);
2993	intel_engine_stop_cs(engine);
2994
2995	/*
2996	* Wa_22011802037: In addition to stopping the cs, we need
2997	* to wait for any pending mi force wakeups
2998	*/
2999	if (intel_engine_reset_needs_wa_22011802037(gt: engine->gt))
3000	intel_engine_wait_for_pending_mi_fw(engine);
3001
3002	engine->execlists.reset_ccid = active_ccid(engine);
3003	}
3004
3005	static struct i915_request **
3006	reset_csb(struct intel_engine_cs engine, struct* i915_request **inactive)
3007	{
3008	struct intel_engine_execlists * const execlists = &engine->execlists;
3009
3010	drm_clflush_virt_range(addr: execlists->csb_write,
3011	length: sizeof(execlists->csb_write[`0`]));
3012
3013	inactive = process_csb(engine, inactive); / drain preemption events /
3014
3015	/ Following the reset, we need to reload the CSB read/write pointers /
3016	reset_csb_pointers(engine);
3017
3018	return inactive;
3019	}
3020
3021	static void
3022	execlists_reset_active(struct intel_engine_cs *engine, bool stalled)
3023	{
3024	struct intel_context *ce;
3025	struct i915_request *rq;
3026	u32 head;
3027
3028	/*
3029	* Save the currently executing context, even if we completed
3030	* its request, it was still running at the time of the
3031	* reset and will have been clobbered.
3032	*/
3033	rq = active_context(engine, ccid: engine->execlists.reset_ccid);
3034	if (!rq)
3035	return;
3036
3037	ce = rq->context;
3038	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3039
3040	if (__i915_request_is_complete(rq)) {
3041	/ Idle context; tidy up the ring so we can restart afresh /
3042	head = intel_ring_wrap(ring: ce->ring, pos: rq->tail);
3043	goto out_replay;
3044	}
3045
3046	/ We still have requests in-flight; the engine should be active /
3047	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3048
3049	/ Context has requests still in-flight; it should not be idle! /
3050	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3051
3052	rq = active_request(tl: ce->timeline, rq);
3053	head = intel_ring_wrap(ring: ce->ring, pos: rq->head);
3054	GEM_BUG_ON(head == ce->ring->tail);
3055
3056	/*
3057	* If this request hasn't started yet, e.g. it is waiting on a
3058	* semaphore, we need to avoid skipping the request or else we
3059	* break the signaling chain. However, if the context is corrupt
3060	* the request will not restart and we will be stuck with a wedged
3061	* device. It is quite often the case that if we issue a reset
3062	* while the GPU is loading the context image, that the context
3063	* image becomes corrupt.
3064	*
3065	* Otherwise, if we have not started yet, the request should replay
3066	* perfectly and we do not need to flag the result as being erroneous.
3067	*/
3068	if (!__i915_request_has_started(rq))
3069	goto out_replay;
3070
3071	/*
3072	* If the request was innocent, we leave the request in the ELSP
3073	* and will try to replay it on restarting. The context image may
3074	* have been corrupted by the reset, in which case we may have
3075	* to service a new GPU hang, but more likely we can continue on
3076	* without impact.
3077	*
3078	* If the request was guilty, we presume the context is corrupt
3079	* and have to at least restore the RING register in the context
3080	* image back to the expected values to skip over the guilty request.
3081	*/
3082	__i915_request_reset(rq, guilty: stalled);
3083
3084	/*
3085	* We want a simple context + ring to execute the breadcrumb update.
3086	* We cannot rely on the context being intact across the GPU hang,
3087	* so clear it and rebuild just what we need for the breadcrumb.
3088	* All pending requests for this context will be zapped, and any
3089	* future request will be after userspace has had the opportunity
3090	* to recreate its own state.
3091	*/
3092	out_replay:
3093	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3094	head, ce->ring->tail);
3095	lrc_reset_regs(ce, engine);
3096	ce->lrc.lrca = lrc_update_regs(ce, engine, head);
3097	}
3098
3099	static void execlists_reset_csb(struct intel_engine_cs *engine, bool stalled)
3100	{
3101	struct intel_engine_execlists * const execlists = &engine->execlists;
3102	struct i915_request post[`2` EXECLIST_MAX_PORTS];
3103	struct i915_request **inactive;
3104
3105	rcu_read_lock();
3106	inactive = reset_csb(engine, inactive: post);
3107
3108	execlists_reset_active(engine, stalled: true);
3109
3110	inactive = cancel_port_requests(execlists, inactive);
3111	post_process_csb(port: post, last: inactive);
3112	rcu_read_unlock();
3113	}
3114
3115	static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3116	{
3117	unsigned long flags;
3118
3119	ENGINE_TRACE(engine, "\n");
3120
3121	/ Process the csb, find the guilty context and throw away /
3122	execlists_reset_csb(engine, stalled);
3123
3124	/ Push back any incomplete requests for replay after the reset. /
3125	rcu_read_lock();
3126	spin_lock_irqsave(&engine->sched_engine->lock, flags);
3127	__unwind_incomplete_requests(engine);
3128	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
3129	rcu_read_unlock();
3130	}
3131
3132	static void nop_submission_tasklet(struct tasklet_struct *t)
3133	{
3134	struct i915_sched_engine *sched_engine =
3135	from_tasklet(sched_engine, t, tasklet);
3136	struct intel_engine_cs * const engine = sched_engine->private_data;
3137
3138	/ The driver is wedged; don't process any more events. /
3139	WRITE_ONCE(engine->sched_engine->queue_priority_hint, INT_MIN);
3140	}
3141
3142	static void execlists_reset_cancel(struct intel_engine_cs *engine)
3143	{
3144	struct intel_engine_execlists * const execlists = &engine->execlists;
3145	struct i915_sched_engine * const sched_engine = engine->sched_engine;
3146	struct i915_request rq, rn;
3147	struct rb_node *rb;
3148	unsigned long flags;
3149
3150	ENGINE_TRACE(engine, "\n");
3151
3152	/*
3153	* Before we call engine->cancel_requests(), we should have exclusive
3154	* access to the submission state. This is arranged for us by the
3155	* caller disabling the interrupt generation, the tasklet and other
3156	* threads that may then access the same state, giving us a free hand
3157	* to reset state. However, we still need to let lockdep be aware that
3158	* we know this state may be accessed in hardirq context, so we
3159	* disable the irq around this manipulation and we want to keep
3160	* the spinlock focused on its duties and not accidentally conflate
3161	* coverage to the submission's irq state. (Similarly, although we
3162	* shouldn't need to disable irq around the manipulation of the
3163	* submission's irq state, we also wish to remind ourselves that
3164	* it is irq state.)
3165	*/
3166	execlists_reset_csb(engine, stalled: true);
3167
3168	rcu_read_lock();
3169	spin_lock_irqsave(&engine->sched_engine->lock, flags);
3170
3171	/ Mark all executing requests as skipped. /
3172	list_for_each_entry(rq, &engine->sched_engine->requests, sched.link)
3173	i915_request_put(rq: i915_request_mark_eio(rq));
3174	intel_engine_signal_breadcrumbs(engine);
3175
3176	/ Flush the queued requests to the timeline list (for retiring). /
3177	while ((rb = rb_first_cached(&sched_engine->queue))) {
3178	struct i915_priolist *p = to_priolist(rb);
3179
3180	priolist_for_each_request_consume(rq, rn, p) {
3181	if (i915_request_mark_eio(rq)) {
3182	__i915_request_submit(request: rq);
3183	i915_request_put(rq);
3184	}
3185	}
3186
3187	rb_erase_cached(node: &p->node, root: &sched_engine->queue);
3188	i915_priolist_free(p);
3189	}
3190
3191	/ On-hold requests will be flushed to timeline upon their release /
3192	list_for_each_entry(rq, &sched_engine->hold, sched.link)
3193	i915_request_put(rq: i915_request_mark_eio(rq));
3194
3195	/ Cancel all attached virtual engines /
3196	while ((rb = rb_first_cached(&execlists->virtual))) {
3197	struct virtual_engine *ve =
3198	rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3199
3200	rb_erase_cached(node: rb, root: &execlists->virtual);
3201	RB_CLEAR_NODE(rb);
3202
3203	spin_lock(lock: &ve->base.sched_engine->lock);
3204	rq = fetch_and_zero(&ve->request);
3205	if (rq) {
3206	if (i915_request_mark_eio(rq)) {
3207	rq->engine = engine;
3208	__i915_request_submit(request: rq);
3209	i915_request_put(rq);
3210	}
3211	i915_request_put(rq);
3212
3213	ve->base.sched_engine->queue_priority_hint = INT_MIN;
3214	}
3215	spin_unlock(lock: &ve->base.sched_engine->lock);
3216	}
3217
3218	/ Remaining _unready_ requests will be nop'ed when submitted /
3219
3220	sched_engine->queue_priority_hint = INT_MIN;
3221	sched_engine->queue = RB_ROOT_CACHED;
3222
3223	GEM_BUG_ON(__tasklet_is_enabled(&engine->sched_engine->tasklet));
3224	engine->sched_engine->tasklet.callback = nop_submission_tasklet;
3225
3226	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
3227	rcu_read_unlock();
3228	}
3229
3230	static void execlists_reset_finish(struct intel_engine_cs *engine)
3231	{
3232	struct intel_engine_execlists * const execlists = &engine->execlists;
3233
3234	/*
3235	* After a GPU reset, we may have requests to replay. Do so now while
3236	* we still have the forcewake to be sure that the GPU is not allowed
3237	* to sleep before we restart and reload a context.
3238	*
3239	* If the GPU reset fails, the engine may still be alive with requests
3240	* inflight. We expect those to complete, or for the device to be
3241	* reset as the next level of recovery, and as a final resort we
3242	* will declare the device wedged.
3243	*/
3244	GEM_BUG_ON(!reset_in_progress(engine));
3245
3246	/ And kick in case we missed a new request submission. /
3247	if (__tasklet_enable(t: &engine->sched_engine->tasklet))
3248	__execlists_kick(execlists);
3249
3250	ENGINE_TRACE(engine, "depth->%d\n",
3251	atomic_read(&engine->sched_engine->tasklet.count));
3252	}
3253
3254	static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3255	{
3256	ENGINE_WRITE(engine, RING_IMR,
3257	~(engine->irq_enable_mask \| engine->irq_keep_mask));
3258	ENGINE_POSTING_READ(engine, RING_IMR);
3259	}
3260
3261	static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3262	{
3263	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3264	}
3265
3266	static void execlists_park(struct intel_engine_cs *engine)
3267	{
3268	cancel_timer(t: &engine->execlists.timer);
3269	cancel_timer(t: &engine->execlists.preempt);
3270
3271	/ Reset upon idling, or we may delay the busy wakeup. /
3272	WRITE_ONCE(engine->sched_engine->queue_priority_hint, INT_MIN);
3273	}
3274
3275	static void add_to_engine(struct i915_request *rq)
3276	{
3277	lockdep_assert_held(&rq->engine->sched_engine->lock);
3278	list_move_tail(list: &rq->sched.link, head: &rq->engine->sched_engine->requests);
3279	}
3280
3281	static void remove_from_engine(struct i915_request *rq)
3282	{
3283	struct intel_engine_cs engine, locked;
3284
3285	/*
3286	* Virtual engines complicate acquiring the engine timeline lock,
3287	* as their rq->engine pointer is not stable until under that
3288	* engine lock. The simple ploy we use is to take the lock then
3289	* check that the rq still belongs to the newly locked engine.
3290	*/
3291	locked = READ_ONCE(rq->engine);
3292	spin_lock_irq(lock: &locked->sched_engine->lock);
3293	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
3294	spin_unlock(lock: &locked->sched_engine->lock);
3295	spin_lock(lock: &engine->sched_engine->lock);
3296	locked = engine;
3297	}
3298	list_del_init(entry: &rq->sched.link);
3299
3300	clear_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
3301	clear_bit(nr: I915_FENCE_FLAG_HOLD, addr: &rq->fence.flags);
3302
3303	/ Prevent further __await_execution() registering a cb, then flush /
3304	set_bit(nr: I915_FENCE_FLAG_ACTIVE, addr: &rq->fence.flags);
3305
3306	spin_unlock_irq(lock: &locked->sched_engine->lock);
3307
3308	i915_request_notify_execute_cb_imm(rq);
3309	}
3310
3311	static bool can_preempt(struct intel_engine_cs *engine)
3312	{
3313	return GRAPHICS_VER(engine->i915) > `8`;
3314	}
3315
3316	static void kick_execlists(const struct i915_request rq, int* prio)
3317	{
3318	struct intel_engine_cs *engine = rq->engine;
3319	struct i915_sched_engine *sched_engine = engine->sched_engine;
3320	const struct i915_request *inflight;
3321
3322	/*
3323	* We only need to kick the tasklet once for the high priority
3324	* new context we add into the queue.
3325	*/
3326	if (prio <= sched_engine->queue_priority_hint)
3327	return;
3328
3329	rcu_read_lock();
3330
3331	/ Nothing currently active? We're overdue for a submission! /
3332	inflight = execlists_active(execlists: &engine->execlists);
3333	if (!inflight)
3334	goto unlock;
3335
3336	/*
3337	* If we are already the currently executing context, don't
3338	* bother evaluating if we should preempt ourselves.
3339	*/
3340	if (inflight->context == rq->context)
3341	goto unlock;
3342
3343	ENGINE_TRACE(engine,
3344	"bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n",
3345	prio,
3346	rq->fence.context, rq->fence.seqno,
3347	inflight->fence.context, inflight->fence.seqno,
3348	inflight->sched.attr.priority);
3349
3350	sched_engine->queue_priority_hint = prio;
3351
3352	/*
3353	* Allow preemption of low -> normal -> high, but we do
3354	* not allow low priority tasks to preempt other low priority
3355	* tasks under the impression that latency for low priority
3356	* tasks does not matter (as much as background throughput),
3357	* so kiss.
3358	*/
3359	if (prio >= max(I915_PRIORITY_NORMAL, rq_prio(inflight)))
3360	tasklet_hi_schedule(t: &sched_engine->tasklet);
3361
3362	unlock:
3363	rcu_read_unlock();
3364	}
3365
3366	static void execlists_set_default_submission(struct intel_engine_cs *engine)
3367	{
3368	engine->submit_request = execlists_submit_request;
3369	engine->sched_engine->schedule = i915_schedule;
3370	engine->sched_engine->kick_backend = kick_execlists;
3371	engine->sched_engine->tasklet.callback = execlists_submission_tasklet;
3372	}
3373
3374	static void execlists_shutdown(struct intel_engine_cs *engine)
3375	{
3376	/ Synchronise with residual timers and any softirq they raise /
3377	timer_delete_sync(timer: &engine->execlists.timer);
3378	timer_delete_sync(timer: &engine->execlists.preempt);
3379	tasklet_kill(t: &engine->sched_engine->tasklet);
3380	}
3381
3382	static void execlists_release(struct intel_engine_cs *engine)
3383	{
3384	engine->sanitize = NULL; / no longer in control, nothing to sanitize /
3385
3386	execlists_shutdown(engine);
3387
3388	intel_engine_cleanup_common(engine);
3389	lrc_fini_wa_ctx(engine);
3390	}
3391
3392	static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
3393	ktime_t *now)
3394	{
3395	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
3396	ktime_t total = stats->total;
3397
3398	/*
3399	* If the engine is executing something at the moment
3400	* add it to the total.
3401	*/
3402	*now = ktime_get();
3403	if (READ_ONCE(stats->active))
3404	total = ktime_add(total, ktime_sub(*now, stats->start));
3405
3406	return total;
3407	}
3408
3409	static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
3410	ktime_t *now)
3411	{
3412	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
3413	unsigned int seq;
3414	ktime_t total;
3415
3416	do {
3417	seq = read_seqcount_begin(&stats->lock);
3418	total = __execlists_engine_busyness(engine, now);
3419	} while (read_seqcount_retry(&stats->lock, seq));
3420
3421	return total;
3422	}
3423
3424	static void
3425	logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3426	{
3427	/ Default vfuncs which can be overridden by each engine. /
3428
3429	engine->resume = execlists_resume;
3430
3431	engine->cops = &execlists_context_ops;
3432	engine->request_alloc = execlists_request_alloc;
3433	engine->add_active_request = add_to_engine;
3434	engine->remove_active_request = remove_from_engine;
3435
3436	engine->reset.prepare = execlists_reset_prepare;
3437	engine->reset.rewind = execlists_reset_rewind;
3438	engine->reset.cancel = execlists_reset_cancel;
3439	engine->reset.finish = execlists_reset_finish;
3440
3441	engine->park = execlists_park;
3442	engine->unpark = NULL;
3443
3444	engine->emit_flush = gen8_emit_flush_xcs;
3445	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3446	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_xcs;
3447	if (GRAPHICS_VER(engine->i915) >= `12`) {
3448	engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_xcs;
3449	engine->emit_flush = gen12_emit_flush_xcs;
3450	}
3451	engine->set_default_submission = execlists_set_default_submission;
3452
3453	if (GRAPHICS_VER(engine->i915) < `11`) {
3454	engine->irq_enable = gen8_logical_ring_enable_irq;
3455	engine->irq_disable = gen8_logical_ring_disable_irq;
3456	} else {
3457	/*
3458	* TODO: On Gen11 interrupt masks need to be clear
3459	* to allow C6 entry. Keep interrupts enabled at
3460	* and take the hit of generating extra interrupts
3461	* until a more refined solution exists.
3462	*/
3463	}
3464	intel_engine_set_irq_handler(engine, fn: execlists_irq_handler);
3465
3466	engine->flags \|= I915_ENGINE_SUPPORTS_STATS;
3467	if (!intel_vgpu_active(i915: engine->i915)) {
3468	engine->flags \|= I915_ENGINE_HAS_SEMAPHORES;
3469	if (can_preempt(engine)) {
3470	engine->flags \|= I915_ENGINE_HAS_PREEMPTION;
3471	if (CONFIG_DRM_I915_TIMESLICE_DURATION)
3472	engine->flags \|= I915_ENGINE_HAS_TIMESLICES;
3473	}
3474	}
3475
3476	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `55`)) {
3477	if (intel_engine_has_preemption(engine))
3478	engine->emit_bb_start = xehp_emit_bb_start;
3479	else
3480	engine->emit_bb_start = xehp_emit_bb_start_noarb;
3481	} else {
3482	if (intel_engine_has_preemption(engine))
3483	engine->emit_bb_start = gen8_emit_bb_start;
3484	else
3485	engine->emit_bb_start = gen8_emit_bb_start_noarb;
3486	}
3487
3488	engine->busyness = execlists_engine_busyness;
3489	}
3490
3491	static void logical_ring_default_irqs(struct intel_engine_cs *engine)
3492	{
3493	unsigned int shift = `0`;
3494
3495	if (GRAPHICS_VER(engine->i915) < `11`) {
3496	const u8 irq_shifts[] = {
3497	[RCS0] = GEN8_RCS_IRQ_SHIFT,
3498	[BCS0] = GEN8_BCS_IRQ_SHIFT,
3499	[VCS0] = GEN8_VCS0_IRQ_SHIFT,
3500	[VCS1] = GEN8_VCS1_IRQ_SHIFT,
3501	[VECS0] = GEN8_VECS_IRQ_SHIFT,
3502	};
3503
3504	shift = irq_shifts[engine->id];
3505	}
3506
3507	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3508	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3509	engine->irq_keep_mask \|= GT_CS_MASTER_ERROR_INTERRUPT << shift;
3510	engine->irq_keep_mask \|= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
3511	}
3512
3513	static void rcs_submission_override(struct intel_engine_cs *engine)
3514	{
3515	switch (GRAPHICS_VER(engine->i915)) {
3516	case `12`:
3517	engine->emit_flush = gen12_emit_flush_rcs;
3518	engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3519	break;
3520	case `11`:
3521	engine->emit_flush = gen11_emit_flush_rcs;
3522	engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3523	break;
3524	default:
3525	engine->emit_flush = gen8_emit_flush_rcs;
3526	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3527	break;
3528	}
3529	}
3530
3531	int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3532	{
3533	struct intel_engine_execlists * const execlists = &engine->execlists;
3534	struct drm_i915_private *i915 = engine->i915;
3535	struct intel_uncore *uncore = engine->uncore;
3536	u32 base = engine->mmio_base;
3537
3538	tasklet_setup(t: &engine->sched_engine->tasklet, callback: execlists_submission_tasklet);
3539	timer_setup(&engine->execlists.timer, execlists_timeslice, `0`);
3540	timer_setup(&engine->execlists.preempt, execlists_preempt, `0`);
3541
3542	logical_ring_default_vfuncs(engine);
3543	logical_ring_default_irqs(engine);
3544
3545	seqcount_init(&engine->stats.execlists.lock);
3546
3547	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)
3548	rcs_submission_override(engine);
3549
3550	lrc_init_wa_ctx(engine);
3551
3552	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3553	execlists->submit_reg = intel_uncore_regs(uncore) +
3554	i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3555	execlists->ctrl_reg = intel_uncore_regs(uncore) +
3556	i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3557
3558	engine->fw_domain = intel_uncore_forcewake_for_reg(uncore: engine->uncore,
3559	RING_EXECLIST_CONTROL(engine->mmio_base),
3560	FW_REG_WRITE);
3561	} else {
3562	execlists->submit_reg = intel_uncore_regs(uncore) +
3563	i915_mmio_reg_offset(RING_ELSP(base));
3564	}
3565
3566	execlists->csb_status =
3567	(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3568
3569	execlists->csb_write =
3570	&engine->status_page.addr[INTEL_HWS_CSB_WRITE_INDEX(i915)];
3571
3572	if (GRAPHICS_VER(i915) < `11`)
3573	execlists->csb_size = GEN8_CSB_ENTRIES;
3574	else
3575	execlists->csb_size = GEN11_CSB_ENTRIES;
3576
3577	engine->context_tag = GENMASK(BITS_PER_LONG - `2`, `0`);
3578	if (GRAPHICS_VER(engine->i915) >= `11` &&
3579	GRAPHICS_VER_FULL(engine->i915) < IP_VER(`12`, `55`)) {
3580	execlists->ccid \|= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - `32`);
3581	execlists->ccid \|= engine->class << (GEN11_ENGINE_CLASS_SHIFT - `32`);
3582	}
3583
3584	/ Finally, take ownership and responsibility for cleanup! /
3585	engine->sanitize = execlists_sanitize;
3586	engine->release = execlists_release;
3587
3588	return `0`;
3589	}
3590
3591	static struct list_head virtual_queue(struct* virtual_engine *ve)
3592	{
3593	return &ve->base.sched_engine->default_priolist.requests;
3594	}
3595
3596	static void rcu_virtual_context_destroy(struct work_struct *wrk)
3597	{
3598	struct virtual_engine *ve =
3599	container_of(wrk, typeof(*ve), rcu.work);
3600	unsigned int n;
3601
3602	GEM_BUG_ON(ve->context.inflight);
3603
3604	/ Preempt-to-busy may leave a stale request behind. /
3605	if (unlikely(ve->request)) {
3606	struct i915_request *old;
3607
3608	spin_lock_irq(lock: &ve->base.sched_engine->lock);
3609
3610	old = fetch_and_zero(&ve->request);
3611	if (old) {
3612	GEM_BUG_ON(!__i915_request_is_complete(old));
3613	__i915_request_submit(request: old);
3614	i915_request_put(rq: old);
3615	}
3616
3617	spin_unlock_irq(lock: &ve->base.sched_engine->lock);
3618	}
3619
3620	/*
3621	* Flush the tasklet in case it is still running on another core.
3622	*
3623	* This needs to be done before we remove ourselves from the siblings'
3624	* rbtrees as in the case it is running in parallel, it may reinsert
3625	* the rb_node into a sibling.
3626	*/
3627	tasklet_kill(t: &ve->base.sched_engine->tasklet);
3628
3629	/ Decouple ourselves from the siblings, no more access allowed. /
3630	for (n = `0`; n < ve->num_siblings; n++) {
3631	struct intel_engine_cs *sibling = ve->siblings[n];
3632	struct rb_node *node = &ve->nodes[sibling->id].rb;
3633
3634	if (RB_EMPTY_NODE(node))
3635	continue;
3636
3637	spin_lock_irq(lock: &sibling->sched_engine->lock);
3638
3639	/ Detachment is lazily performed in the sched_engine->tasklet /
3640	if (!RB_EMPTY_NODE(node))
3641	rb_erase_cached(node, root: &sibling->execlists.virtual);
3642
3643	spin_unlock_irq(lock: &sibling->sched_engine->lock);
3644	}
3645	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.sched_engine->tasklet));
3646	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3647
3648	lrc_fini(ce: &ve->context);
3649	intel_context_fini(ce: &ve->context);
3650
3651	if (ve->base.breadcrumbs)
3652	intel_breadcrumbs_put(b: ve->base.breadcrumbs);
3653	if (ve->base.sched_engine)
3654	i915_sched_engine_put(sched_engine: ve->base.sched_engine);
3655	intel_engine_free_request_pool(engine: &ve->base);
3656
3657	kfree(objp: ve);
3658	}
3659
3660	static void virtual_context_destroy(struct kref *kref)
3661	{
3662	struct virtual_engine *ve =
3663	container_of(kref, typeof(*ve), context.ref);
3664
3665	GEM_BUG_ON(!list_empty(&ve->context.signals));
3666
3667	/*
3668	* When destroying the virtual engine, we have to be aware that
3669	* it may still be in use from an hardirq/softirq context causing
3670	* the resubmission of a completed request (background completion
3671	* due to preempt-to-busy). Before we can free the engine, we need
3672	* to flush the submission code and tasklets that are still potentially
3673	* accessing the engine. Flushing the tasklets requires process context,
3674	* and since we can guard the resubmit onto the engine with an RCU read
3675	* lock, we can delegate the free of the engine to an RCU worker.
3676	*/
3677	INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
3678	queue_rcu_work(wq: ve->context.engine->i915->unordered_wq, rwork: &ve->rcu);
3679	}
3680
3681	static void virtual_engine_initial_hint(struct virtual_engine *ve)
3682	{
3683	int swp;
3684
3685	/*
3686	* Pick a random sibling on starting to help spread the load around.
3687	*
3688	* New contexts are typically created with exactly the same order
3689	* of siblings, and often started in batches. Due to the way we iterate
3690	* the array of sibling when submitting requests, sibling[0] is
3691	* prioritised for dequeuing. If we make sure that sibling[0] is fairly
3692	* randomised across the system, we also help spread the load by the
3693	* first engine we inspect being different each time.
3694	*
3695	* NB This does not force us to execute on this engine, it will just
3696	* typically be the first we inspect for submission.
3697	*/
3698	swp = get_random_u32_below(ceil: ve->num_siblings);
3699	if (swp)
3700	swap(ve->siblings[swp], ve->siblings[`0`]);
3701	}
3702
3703	static int virtual_context_alloc(struct intel_context *ce)
3704	{
3705	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3706
3707	return lrc_alloc(ce, engine: ve->siblings[`0`]);
3708	}
3709
3710	static int virtual_context_pre_pin(struct intel_context *ce,
3711	struct i915_gem_ww_ctx *ww,
3712	void **vaddr)
3713	{
3714	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3715
3716	/ Note: we must use a real engine class for setting up reg state /
3717	return __execlists_context_pre_pin(ce, engine: ve->siblings[`0`], ww, vaddr);
3718	}
3719
3720	static int virtual_context_pin(struct intel_context ce, void* *vaddr)
3721	{
3722	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3723
3724	return lrc_pin(ce, engine: ve->siblings[`0`], vaddr);
3725	}
3726
3727	static void virtual_context_enter(struct intel_context *ce)
3728	{
3729	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3730	unsigned int n;
3731
3732	for (n = `0`; n < ve->num_siblings; n++)
3733	intel_engine_pm_get(engine: ve->siblings[n]);
3734
3735	intel_timeline_enter(tl: ce->timeline);
3736	}
3737
3738	static void virtual_context_exit(struct intel_context *ce)
3739	{
3740	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3741	unsigned int n;
3742
3743	intel_timeline_exit(tl: ce->timeline);
3744
3745	for (n = `0`; n < ve->num_siblings; n++)
3746	intel_engine_pm_put(engine: ve->siblings[n]);
3747	}
3748
3749	static struct intel_engine_cs *
3750	virtual_get_sibling(struct intel_engine_cs engine, unsigned* int sibling)
3751	{
3752	struct virtual_engine *ve = to_virtual_engine(engine);
3753
3754	if (sibling >= ve->num_siblings)
3755	return NULL;
3756
3757	return ve->siblings[sibling];
3758	}
3759
3760	static const struct intel_context_ops virtual_context_ops = {
3761	.flags = COPS_HAS_INFLIGHT \| COPS_RUNTIME_CYCLES,
3762
3763	.alloc = virtual_context_alloc,
3764
3765	.cancel_request = execlists_context_cancel_request,
3766
3767	.pre_pin = virtual_context_pre_pin,
3768	.pin = virtual_context_pin,
3769	.unpin = lrc_unpin,
3770	.post_unpin = lrc_post_unpin,
3771
3772	.enter = virtual_context_enter,
3773	.exit = virtual_context_exit,
3774
3775	.destroy = virtual_context_destroy,
3776
3777	.get_sibling = virtual_get_sibling,
3778	};
3779
3780	static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3781	{
3782	struct i915_request *rq;
3783	intel_engine_mask_t mask;
3784
3785	rq = READ_ONCE(ve->request);
3786	if (!rq)
3787	return `0`;
3788
3789	/ The rq is ready for submission; rq->execution_mask is now stable. /
3790	mask = rq->execution_mask;
3791	if (unlikely(!mask)) {
3792	/ Invalid selection, submit to a random engine in error /
3793	i915_request_set_error_once(rq, error: -ENODEV);
3794	mask = ve->siblings[`0`]->mask;
3795	}
3796
3797	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
3798	rq->fence.context, rq->fence.seqno,
3799	mask, ve->base.sched_engine->queue_priority_hint);
3800
3801	return mask;
3802	}
3803
3804	static void virtual_submission_tasklet(struct tasklet_struct *t)
3805	{
3806	struct i915_sched_engine *sched_engine =
3807	from_tasklet(sched_engine, t, tasklet);
3808	struct virtual_engine * const ve =
3809	(struct virtual_engine *)sched_engine->private_data;
3810	const int prio = READ_ONCE(sched_engine->queue_priority_hint);
3811	intel_engine_mask_t mask;
3812	unsigned int n;
3813
3814	rcu_read_lock();
3815	mask = virtual_submission_mask(ve);
3816	rcu_read_unlock();
3817	if (unlikely(!mask))
3818	return;
3819
3820	for (n = `0`; n < ve->num_siblings; n++) {
3821	struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
3822	struct ve_node * const node = &ve->nodes[sibling->id];
3823	struct rb_node *parent, rb;
3824	bool first;
3825
3826	if (!READ_ONCE(ve->request))
3827	break; / already handled by a sibling's tasklet /
3828
3829	spin_lock_irq(lock: &sibling->sched_engine->lock);
3830
3831	if (unlikely(!(mask & sibling->mask))) {
3832	if (!RB_EMPTY_NODE(&node->rb)) {
3833	rb_erase_cached(node: &node->rb,
3834	root: &sibling->execlists.virtual);
3835	RB_CLEAR_NODE(&node->rb);
3836	}
3837
3838	goto unlock_engine;
3839	}
3840
3841	if (unlikely(!RB_EMPTY_NODE(&node->rb))) {
3842	/*
3843	* Cheat and avoid rebalancing the tree if we can
3844	* reuse this node in situ.
3845	*/
3846	first = rb_first_cached(&sibling->execlists.virtual) ==
3847	&node->rb;
3848	if (prio == node->prio \|\| (prio > node->prio && first))
3849	goto submit_engine;
3850
3851	rb_erase_cached(node: &node->rb, root: &sibling->execlists.virtual);
3852	}
3853
3854	rb = NULL;
3855	first = true;
3856	parent = &sibling->execlists.virtual.rb_root.rb_node;
3857	while (*parent) {
3858	struct ve_node *other;
3859
3860	rb = *parent;
3861	other = rb_entry(rb, typeof(*other), rb);
3862	if (prio > other->prio) {
3863	parent = &rb->rb_left;
3864	} else {
3865	parent = &rb->rb_right;
3866	first = false;
3867	}
3868	}
3869
3870	rb_link_node(node: &node->rb, parent: rb, rb_link: parent);
3871	rb_insert_color_cached(node: &node->rb,
3872	root: &sibling->execlists.virtual,
3873	leftmost: first);
3874
3875	submit_engine:
3876	GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
3877	node->prio = prio;
3878	if (first && prio > sibling->sched_engine->queue_priority_hint)
3879	tasklet_hi_schedule(t: &sibling->sched_engine->tasklet);
3880
3881	unlock_engine:
3882	spin_unlock_irq(lock: &sibling->sched_engine->lock);
3883
3884	if (intel_context_inflight(&ve->context))
3885	break;
3886	}
3887	}
3888
3889	static void virtual_submit_request(struct i915_request *rq)
3890	{
3891	struct virtual_engine *ve = to_virtual_engine(engine: rq->engine);
3892	unsigned long flags;
3893
3894	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
3895	rq->fence.context,
3896	rq->fence.seqno);
3897
3898	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
3899
3900	spin_lock_irqsave(&ve->base.sched_engine->lock, flags);
3901
3902	/ By the time we resubmit a request, it may be completed /
3903	if (__i915_request_is_complete(rq)) {
3904	__i915_request_submit(request: rq);
3905	goto unlock;
3906	}
3907
3908	if (ve->request) { / background completion from preempt-to-busy /
3909	GEM_BUG_ON(!__i915_request_is_complete(ve->request));
3910	__i915_request_submit(request: ve->request);
3911	i915_request_put(rq: ve->request);
3912	}
3913
3914	ve->base.sched_engine->queue_priority_hint = rq_prio(rq);
3915	ve->request = i915_request_get(rq);
3916
3917	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3918	list_move_tail(list: &rq->sched.link, head: virtual_queue(ve));
3919
3920	tasklet_hi_schedule(t: &ve->base.sched_engine->tasklet);
3921
3922	unlock:
3923	spin_unlock_irqrestore(lock: &ve->base.sched_engine->lock, flags);
3924	}
3925
3926	static struct intel_context *
3927	execlists_create_virtual(struct intel_engine_cs *siblings, unsigned* int count,
3928	unsigned long flags)
3929	{
3930	struct drm_i915_private *i915 = siblings[`0`]->i915;
3931	struct virtual_engine *ve;
3932	unsigned int n;
3933	int err;
3934
3935	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
3936	if (!ve)
3937	return ERR_PTR(error: -ENOMEM);
3938
3939	ve->base.i915 = i915;
3940	ve->base.gt = siblings[`0`]->gt;
3941	ve->base.uncore = siblings[`0`]->uncore;
3942	ve->base.id = -`1`;
3943
3944	ve->base.class = OTHER_CLASS;
3945	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
3946	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3947	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3948
3949	/*
3950	* The decision on whether to submit a request using semaphores
3951	* depends on the saturated state of the engine. We only compute
3952	* this during HW submission of the request, and we need for this
3953	* state to be globally applied to all requests being submitted
3954	* to this engine. Virtual engines encompass more than one physical
3955	* engine and so we cannot accurately tell in advance if one of those
3956	* engines is already saturated and so cannot afford to use a semaphore
3957	* and be pessimized in priority for doing so -- if we are the only
3958	* context using semaphores after all other clients have stopped, we
3959	* will be starved on the saturated system. Such a global switch for
3960	* semaphores is less than ideal, but alas is the current compromise.
3961	*/
3962	ve->base.saturated = ALL_ENGINES;
3963
3964	snprintf(buf: ve->base.name, size: sizeof(ve->base.name), fmt: "virtual");
3965
3966	intel_engine_init_execlists(engine: &ve->base);
3967
3968	ve->base.sched_engine = i915_sched_engine_create(ENGINE_VIRTUAL);
3969	if (!ve->base.sched_engine) {
3970	err = -ENOMEM;
3971	goto err_put;
3972	}
3973	ve->base.sched_engine->private_data = &ve->base;
3974
3975	ve->base.cops = &virtual_context_ops;
3976	ve->base.request_alloc = execlists_request_alloc;
3977
3978	ve->base.sched_engine->schedule = i915_schedule;
3979	ve->base.sched_engine->kick_backend = kick_execlists;
3980	ve->base.submit_request = virtual_submit_request;
3981
3982	INIT_LIST_HEAD(list: virtual_queue(ve));
3983	tasklet_setup(t: &ve->base.sched_engine->tasklet, callback: virtual_submission_tasklet);
3984
3985	intel_context_init(ce: &ve->context, engine: &ve->base);
3986
3987	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
3988	if (!ve->base.breadcrumbs) {
3989	err = -ENOMEM;
3990	goto err_put;
3991	}
3992
3993	for (n = `0`; n < count; n++) {
3994	struct intel_engine_cs *sibling = siblings[n];
3995
3996	GEM_BUG_ON(!is_power_of_2(sibling->mask));
3997	if (sibling->mask & ve->base.mask) {
3998	drm_dbg(&i915->drm,
3999	"duplicate %s entry in load balancer\n",
4000	sibling->name);
4001	err = -EINVAL;
4002	goto err_put;
4003	}
4004
4005	/*
4006	* The virtual engine implementation is tightly coupled to
4007	* the execlists backend -- we push out request directly
4008	* into a tree inside each physical engine. We could support
4009	* layering if we handle cloning of the requests and
4010	* submitting a copy into each backend.
4011	*/
4012	if (sibling->sched_engine->tasklet.callback !=
4013	execlists_submission_tasklet) {
4014	err = -ENODEV;
4015	goto err_put;
4016	}
4017
4018	GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4019	RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4020
4021	ve->siblings[ve->num_siblings++] = sibling;
4022	ve->base.mask \|= sibling->mask;
4023	ve->base.logical_mask \|= sibling->logical_mask;
4024
4025	/*
4026	* All physical engines must be compatible for their emission
4027	* functions (as we build the instructions during request
4028	* construction and do not alter them before submission
4029	* on the physical engine). We use the engine class as a guide
4030	* here, although that could be refined.
4031	*/
4032	if (ve->base.class != OTHER_CLASS) {
4033	if (ve->base.class != sibling->class) {
4034	drm_dbg(&i915->drm,
4035	"invalid mixing of engine class, sibling %d, already %d\n",
4036	sibling->class, ve->base.class);
4037	err = -EINVAL;
4038	goto err_put;
4039	}
4040	continue;
4041	}
4042
4043	ve->base.class = sibling->class;
4044	ve->base.uabi_class = sibling->uabi_class;
4045	snprintf(buf: ve->base.name, size: sizeof(ve->base.name),
4046	fmt: "v%dx%d", ve->base.class, count);
4047	ve->base.context_size = sibling->context_size;
4048
4049	ve->base.add_active_request = sibling->add_active_request;
4050	ve->base.remove_active_request = sibling->remove_active_request;
4051	ve->base.emit_bb_start = sibling->emit_bb_start;
4052	ve->base.emit_flush = sibling->emit_flush;
4053	ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4054	ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4055	ve->base.emit_fini_breadcrumb_dw =
4056	sibling->emit_fini_breadcrumb_dw;
4057
4058	ve->base.flags = sibling->flags;
4059	}
4060
4061	ve->base.flags \|= I915_ENGINE_IS_VIRTUAL;
4062
4063	virtual_engine_initial_hint(ve);
4064	return &ve->context;
4065
4066	err_put:
4067	intel_context_put(ce: &ve->context);
4068	return ERR_PTR(error: err);
4069	}
4070
4071	void intel_execlists_show_requests(struct intel_engine_cs *engine,
4072	struct drm_printer *m,
4073	void (show_request)(struct* drm_printer *m,
4074	const struct i915_request *rq,
4075	const char *prefix,
4076	int indent),
4077	unsigned int max)
4078	{
4079	const struct intel_engine_execlists *execlists = &engine->execlists;
4080	struct i915_sched_engine *sched_engine = engine->sched_engine;
4081	struct i915_request rq, last;
4082	unsigned long flags;
4083	unsigned int count;
4084	struct rb_node *rb;
4085
4086	spin_lock_irqsave(&sched_engine->lock, flags);
4087
4088	last = NULL;
4089	count = `0`;
4090	list_for_each_entry(rq, &sched_engine->requests, sched.link) {
4091	if (count++ < max - `1`)
4092	show_request(m, rq, "\t\t", `0`);
4093	else
4094	last = rq;
4095	}
4096	if (last) {
4097	if (count > max) {
4098	drm_printf(p: m,
4099	f: "\t\t...skipping %d executing requests...\n",
4100	count - max);
4101	}
4102	show_request(m, last, "\t\t", `0`);
4103	}
4104
4105	if (sched_engine->queue_priority_hint != INT_MIN)
4106	drm_printf(p: m, f: "\t\tQueue priority hint: %d\n",
4107	READ_ONCE(sched_engine->queue_priority_hint));
4108
4109	last = NULL;
4110	count = `0`;
4111	for (rb = rb_first_cached(&sched_engine->queue); rb; rb = rb_next(rb)) {
4112	struct i915_priolist p = rb_entry(rb, typeof(p), node);
4113
4114	priolist_for_each_request(rq, p) {
4115	if (count++ < max - `1`)
4116	show_request(m, rq, "\t\t", `0`);
4117	else
4118	last = rq;
4119	}
4120	}
4121	if (last) {
4122	if (count > max) {
4123	drm_printf(p: m,
4124	f: "\t\t...skipping %d queued requests...\n",
4125	count - max);
4126	}
4127	show_request(m, last, "\t\t", `0`);
4128	}
4129
4130	last = NULL;
4131	count = `0`;
4132	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4133	struct virtual_engine *ve =
4134	rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4135	struct i915_request *rq = READ_ONCE(ve->request);
4136
4137	if (rq) {
4138	if (count++ < max - `1`)
4139	show_request(m, rq, "\t\t", `0`);
4140	else
4141	last = rq;
4142	}
4143	}
4144	if (last) {
4145	if (count > max) {
4146	drm_printf(p: m,
4147	f: "\t\t...skipping %d virtual requests...\n",
4148	count - max);
4149	}
4150	show_request(m, last, "\t\t", `0`);
4151	}
4152
4153	spin_unlock_irqrestore(lock: &sched_engine->lock, flags);
4154	}
4155
4156	void intel_execlists_dump_active_requests(struct intel_engine_cs *engine,
4157	struct i915_request *hung_rq,
4158	struct drm_printer *m)
4159	{
4160	unsigned long flags;
4161
4162	spin_lock_irqsave(&engine->sched_engine->lock, flags);
4163
4164	intel_engine_dump_active_requests(requests: &engine->sched_engine->requests, hung_rq, m);
4165
4166	drm_printf(p: m, f: "\tOn hold?: %zu\n",
4167	list_count_nodes(head: &engine->sched_engine->hold));
4168
4169	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
4170	}
4171
4172	#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4173	#include "selftest_execlists.c"
4174	#endif
4175

Browse the source code of Linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c