// SPDX-License-Identifier: MIT
/*
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>
#include <linux/string_helpers.h>

#include "display/intel_display_reset.h"
#include "display/intel_overlay.h"
#include "gem/i915_gem_context.h"
#include "gt/intel_gt_regs.h"
#include "gt/uc/intel_gsc_fw.h"
#include "uc/intel_guc.h"

#include "i915_drv.h"
#include "i915_file_private.h"
#include "i915_gpu_error.h"
#include "i915_irq.h"
#include "i915_reg.h"
#include "i915_wait_util.h"
#include "intel_breadcrumbs.h"
#include "intel_engine_pm.h"
#include "intel_engine_regs.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_print.h"
#include "intel_gt_requests.h"
#include "intel_mchbar_regs.h"
#include "intel_pci_config.h"
#include "intel_reset.h"

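/* Number of times a full-GT reset is retried (see __intel_gt_reset() and do_reset()). */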
#define RESET_MAX_RETRIES 3

static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
{
	struct drm_i915_file_private *file_priv = ctx->file_priv;
	unsigned long prev_hang;
	unsigned int score;

	if (IS_ERR_OR_NULL(file_priv))
		return;

	score = 0;
	if (banned)
		score = I915_CLIENT_SCORE_CONTEXT_BAN;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		drm_dbg(&ctx->i915->drm,
			"client %s: gained %u ban score, now %u\n",
			ctx->name, score,
			atomic_read(&file_priv->ban_score));
	}
}

static bool mark_guilty(struct i915_request *rq)
{
	struct i915_gem_context *ctx;
	unsigned long prev_hang;
	bool banned;
	int i;

	if (intel_context_is_closed(rq->context))
		return true;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return intel_context_is_banned(rq->context);

	atomic_inc(&ctx->guilty_count);

	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
	if (!i915_gem_context_is_bannable(ctx)) {
		banned = false;
		goto out;
	}

	drm_notice(&ctx->i915->drm,
		   "%s context reset due to GPU hang\n",
		   ctx->name);

	/* Record the timestamp for the last N hangs */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* If we have hung N+1 times in rapid succession, we ban the context! */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned)
		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
			ctx->name, atomic_read(&ctx->guilty_count));

	client_mark_guilty(ctx, banned);

out:
	i915_gem_context_put(ctx);
	return banned;
}

static void mark_innocent(struct i915_request *rq)
{
	struct i915_gem_context *ctx;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx)
		atomic_inc(&ctx->active_count);
	rcu_read_unlock();
}

void __i915_request_reset(struct i915_request *rq, bool guilty)
{
	bool banned = false;

	RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty));
	GEM_BUG_ON(__i915_request_is_complete(rq));

	rcu_read_lock(); /* protect the GEM context */
	if (guilty) {
		i915_request_set_error_once(rq, -EIO);
		__i915_request_skip(rq);
		banned = mark_guilty(rq);
	} else {
		i915_request_set_error_once(rq, -EAGAIN);
		mark_innocent(rq);
	}
	rcu_read_unlock();

	if (banned)
		intel_context_ban(rq->context, rq);
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

static int i915_do_reset(struct intel_gt *gt,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
	int err;

	/* Assert reset for at least 50 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = _wait_for_atomic(i915_in_reset(pdev), 50000, 0);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = _wait_for_atomic(!i915_in_reset(pdev), 50000, 0);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return _wait_for_atomic(g4x_reset_complete(pdev), 50000, 0);
}

static int g4x_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, 0, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = _wait_for_atomic(g4x_reset_complete(pdev), 50000, 0);
	if (ret) {
		GT_TRACE(gt, "Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = _wait_for_atomic(g4x_reset_complete(pdev), 50000, 0);
	if (ret) {
		GT_TRACE(gt, "Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE, 0);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}

static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		GT_TRACE(gt, "Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		GT_TRACE(gt, "Wait for media reset failed\n");
		goto out;
	}

out:
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
	struct intel_uncore *uncore = gt->uncore;
	int loops;
	int err;

	/*
	 * On some platforms, e.g. Jasperlake, we see that the engine register
	 * state is not cleared until shortly after GDRST reports completion,
	 * causing a failure as we try to immediately resume while the internal
	 * state is still in flux. If we immediately repeat the reset, the
	 * second reset appears to serialise with the first, and since it is a
	 * no-op, the registers should retain their reset value. However, there
	 * is still a concern that upon leaving the second reset, the internal
	 * engine state is still in flux and not ready for resuming.
	 *
	 * Starting on MTL, some engines require additional prep steps every
	 * time we write to GEN6_GDRST. As those are time consuming (tens of
	 * ms), we don't want to perform them twice, so, since the Jasperlake
	 * issue hasn't been observed on MTL, we avoid repeating the reset on
	 * newer platforms.
	 */
	loops = GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 70) ? 2 : 1;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	do {
		intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

		/* Wait for the device to ack the reset requests. */
		err = __intel_wait_for_register_fw(uncore, GEN6_GDRST,
						   hw_domain_mask, 0,
						   2000, 0,
						   NULL);
	} while (err == 0 && --loops);
	if (err)
		GT_TRACE(gt,
			 "Wait for 0x%08x engines reset failed\n",
			 hw_domain_mask);

	/*
	 * As we have observed that the engine state is still volatile
	 * after GDRST is acked, impose a small delay to let everything settle.
	 */
	udelay(50);

	return err;
}

static int __gen6_reset_engines(struct intel_gt *gt,
				intel_engine_mask_t engine_mask,
				unsigned int retry)
{
	struct intel_engine_cs *engine;
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		intel_engine_mask_t tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp)
			hw_mask |= engine->reset_domain;
	}

	return gen6_hw_domain_reset(gt, hw_mask);
}

static int gen6_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&gt->uncore->lock, flags);
	ret = __gen6_reset_engines(gt, engine_mask, retry);
	spin_unlock_irqrestore(&gt->uncore->lock, flags);

	return ret;
}

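/*
 * Each SFC is shared between a pair of video decode (VCS) engines and the
 * video enhancement (VECS) engine of the same index, hence the instance / 2
 * lookup below: VCS0/VCS1 pair with VECS0, VCS2/VCS3 with VECS1, and so on.
 */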
static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
{
	int vecs_id;

	GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);

	vecs_id = _VECS((engine->instance) / 2);

	return engine->gt->engine[vecs_id];
}

struct sfc_lock_data {
	i915_reg_t lock_reg;
	i915_reg_t ack_reg;
	i915_reg_t usage_reg;
	u32 lock_bit;
	u32 ack_bit;
	u32 usage_bit;
	u32 reset_bit;
};

static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
				     struct sfc_lock_data *sfc_lock)
{
	switch (engine->class) {
	default:
		MISSING_CASE(engine->class);
		fallthrough;
	case VIDEO_DECODE_CLASS:
		sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base);
		sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
		sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
		sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);

		break;
	case VIDEO_ENHANCEMENT_CLASS:
		sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base);
		sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base);
		sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base);
		sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);

		break;
	}
}

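/*
 * Force-lock the SFC used by @engine ahead of its reset: on success the SFC
 * reset bit is added to @reset_mask, and @unlock_mask records which engine
 * needs gen11_unlock_sfc() once the reset sequence has completed.
 */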
static int gen11_lock_sfc(struct intel_engine_cs *engine,
			  u32 *reset_mask,
			  u32 *unlock_mask)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	struct sfc_lock_data sfc_lock;
	bool lock_obtained, lock_to_other = false;
	int ret;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		fallthrough;
	case VIDEO_ENHANCEMENT_CLASS:
		get_sfc_forced_lock_data(engine, &sfc_lock);

		break;
	default:
		return 0;
	}

	if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
		struct intel_engine_cs *paired_vecs;

		if (engine->class != VIDEO_DECODE_CLASS ||
		    GRAPHICS_VER(engine->i915) != 12)
			return 0;

		/*
		 * Wa_14010733141
		 *
		 * If the VCS-MFX isn't using the SFC, we also need to check
		 * whether VCS-HCP is using it. If so, we need to issue a *VE*
		 * forced lock on the VE engine that shares the same SFC.
		 */
		if (!(intel_uncore_read_fw(uncore,
					   GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) &
		      GEN12_HCP_SFC_USAGE_BIT))
			return 0;

		paired_vecs = find_sfc_paired_vecs_engine(engine);
		get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
		lock_to_other = true;
		*unlock_mask |= paired_vecs->mask;
	} else {
		*unlock_mask |= engine->mask;
	}

	/*
	 * If the engine is using an SFC, tell the engine that a software reset
	 * is going to happen. The engine will then try to force lock the SFC.
	 * If SFC ends up being locked to the engine we want to reset, we have
	 * to reset it as well (we will unlock it once the reset sequence is
	 * completed).
	 */
	intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, 0, sfc_lock.lock_bit);

	ret = __intel_wait_for_register_fw(uncore,
					   sfc_lock.ack_reg,
					   sfc_lock.ack_bit,
					   sfc_lock.ack_bit,
					   1000, 0, NULL);

	/*
	 * Was the SFC released while we were trying to lock it?
	 *
	 * We should reset both the engine and the SFC if:
	 *  - We were locking the SFC to this engine and the lock succeeded
	 *    OR
	 *  - We were locking the SFC to a different engine (Wa_14010733141)
	 *    but the SFC was released before the lock was obtained.
	 *
	 * Otherwise we need only reset the engine by itself and we can
	 * leave the SFC alone.
	 */
	lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
			 sfc_lock.usage_bit) != 0;
	if (lock_obtained == lock_to_other)
		return 0;

	if (ret) {
		ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
		return ret;
	}

	*reset_mask |= sfc_lock.reset_bit;
	return 0;
}

static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	struct sfc_lock_data sfc_lock = {};

	if (engine->class != VIDEO_DECODE_CLASS &&
	    engine->class != VIDEO_ENHANCEMENT_CLASS)
		return;

	if (engine->class == VIDEO_DECODE_CLASS &&
	    (BIT(engine->instance) & vdbox_sfc_access) == 0)
		return;

	get_sfc_forced_lock_data(engine, &sfc_lock);

	intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit, 0);
}

static int __gen11_reset_engines(struct intel_gt *gt,
				 intel_engine_mask_t engine_mask,
				 unsigned int retry)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 reset_mask, unlock_mask = 0;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		reset_mask = GEN11_GRDOM_FULL;
	} else {
		reset_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			reset_mask |= engine->reset_domain;
			ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
			if (ret)
				goto sfc_unlock;
		}
	}

	ret = gen6_hw_domain_reset(gt, reset_mask);

sfc_unlock:
	/*
	 * We unlock the SFC based on the lock status and not the result of
	 * gen11_lock_sfc to make sure that we clean properly if something
	 * wrong happened during the lock (e.g. lock acquired after timeout
	 * expiration).
	 *
	 * Due to Wa_14010733141, we may have locked an SFC to an engine that
	 * wasn't being reset. So instead of calling gen11_unlock_sfc()
	 * on engine_mask, we instead call it on the mask of engines that our
	 * gen11_lock_sfc() calls told us actually had locks attempted.
	 */
	for_each_engine_masked(engine, gt, unlock_mask, tmp)
		gen11_unlock_sfc(engine);

	return ret;
}

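/*
 * Quiesce the engine via RING_RESET_CTL: request a reset and wait for the
 * ready-for-reset ack, except on catastrophic errors where the handshake is
 * bypassed (HAS#396813) and we instead wait for HW to clear the error bit.
 */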
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
		return -ETIMEDOUT;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
		/*
		 * For catastrophic errors, ready-for-reset sequence
		 * needs to be bypassed: HAS#396813
		 */
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* Catastrophic errors need to be cleared by HW */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		return 0;
	}

	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		gt_err(engine->gt,
		       "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
		       engine->name, request,
		       intel_uncore_read_fw(uncore, reg));

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	intel_uncore_write_fw(engine->uncore,
			      RING_RESET_CTL(engine->mmio_base),
			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&gt->uncore->lock, flags);

	for_each_engine_masked(engine, gt, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and, on
		 * some gens (kbl), a possible system hang if the reset
		 * happens during active bb execution.
		 *
		 * We would rather take context corruption than a
		 * failed reset with a wedged driver/gpu. The active
		 * bb execution case should be covered by the
		 * stop_engines() we have before the reset.
		 */
	}

	/*
	 * Wa_22011100796:dg2, whenever Full soft reset is required,
	 * reset all individual engines firstly, and then do a full soft reset.
	 *
	 * This is best effort, so ignore any error from the initial reset.
	 */
	if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES)
		__gen11_reset_engines(gt, gt->info.engine_mask, 0);

	if (GRAPHICS_VER(gt->i915) >= 11)
		ret = __gen11_reset_engines(gt, engine_mask, retry);
	else
		ret = __gen6_reset_engines(gt, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, gt, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	spin_unlock_irqrestore(&gt->uncore->lock, flags);

	return ret;
}

static int mock_reset(struct intel_gt *gt,
		      intel_engine_mask_t mask,
		      unsigned int retry)
{
	return 0;
}

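/*
 * Per-platform reset backend: selected once by intel_get_gpu_reset() and
 * invoked with forcewake held by __intel_gt_reset() below.
 */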
typedef int (*reset_func)(struct intel_gt *,
			  intel_engine_mask_t engine_mask,
			  unsigned int retry);

static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;

	if (is_mock_gt(gt))
		return mock_reset;
	else if (GRAPHICS_VER(i915) >= 8)
		return gen8_reset_engines;
	else if (GRAPHICS_VER(i915) >= 6)
		return gen6_reset_engines;
	else if (GRAPHICS_VER(i915) >= 5)
		return ilk_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (GRAPHICS_VER(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

static int __reset_guc(struct intel_gt *gt)
{
	u32 guc_domain =
		GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;

	return gen6_hw_domain_reset(gt, guc_domain);
}

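/*
 * Wa_14015076503: on media 13.00 parts with a GSC engine and a loaded GSC FW,
 * the FW must be told about an impending GSC engine reset (and given time to
 * prepare), unless we can avoid touching the GSC engine altogether.
 */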
static bool needs_wa_14015076503(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	if (MEDIA_VER_FULL(gt->i915) != IP_VER(13, 0) || !HAS_ENGINE(gt, GSC0))
		return false;

	if (!__HAS_ENGINE(engine_mask, GSC0))
		return false;

	return intel_gsc_uc_fw_init_done(&gt->uc.gsc);
}

static intel_engine_mask_t
wa_14015076503_start(struct intel_gt *gt, intel_engine_mask_t engine_mask, bool first)
{
	if (!needs_wa_14015076503(gt, engine_mask))
		return engine_mask;

	/*
	 * wa_14015076503: if the GSC FW is loaded, we need to alert it that
	 * we're going to do a GSC engine reset and then wait for 200ms for the
	 * FW to get ready for it. However, if this is the first ALL_ENGINES
	 * reset attempt and the GSC is not busy, we can try to instead reset
	 * the GuC and all the other engines individually to avoid the 200ms
	 * wait.
	 * Skipping the GSC engine is safe because, differently from other
	 * engines, the GSCCS's only role is to forward the commands to the
	 * GSC FW, so it doesn't have any HW outside of the CS itself and
	 * therefore it has no state that we don't explicitly re-init on
	 * resume or on context switch (either the LRC or power context). The
	 * HW for the GSC uC is managed by the GSC FW so we don't need to care
	 * about that.
	 */
	if (engine_mask == ALL_ENGINES && first && intel_engine_is_idle(gt->engine[GSC0])) {
		__reset_guc(gt);
		engine_mask = gt->info.engine_mask & ~BIT(GSC0);
	} else {
		intel_uncore_rmw(gt->uncore,
				 HECI_H_GS1(MTL_GSC_HECI2_BASE),
				 0, HECI_H_GS1_ER_PREP);

		/* make sure the reset bit is clear when writing the CSR reg */
		intel_uncore_rmw(gt->uncore,
				 HECI_H_CSR(MTL_GSC_HECI2_BASE),
				 HECI_H_CSR_RST, HECI_H_CSR_IG);
		msleep(200);
	}

	return engine_mask;
}

static void
wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	if (!needs_wa_14015076503(gt, engine_mask))
		return;

	intel_uncore_rmw(gt->uncore,
			 HECI_H_GS1(MTL_GSC_HECI2_BASE),
			 HECI_H_GS1_ER_PREP, 0);
}

static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(gt);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		intel_engine_mask_t reset_mask;

		reset_mask = wa_14015076503_start(gt, engine_mask, !retry);

		GT_TRACE(gt, "engine_mask=%x\n", reset_mask);
		ret = reset(gt, reset_mask, retry);

		wa_14015076503_end(gt, reset_mask);
	}
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(const struct intel_gt *gt)
{
	if (!gt->i915->params.reset)
		return false;

	return intel_get_gpu_reset(gt);
}

bool intel_has_reset_engine(const struct intel_gt *gt)
{
	if (gt->i915->params.reset < 2)
		return false;

	return INTEL_INFO(gt->i915)->has_reset_engine;
}

int intel_reset_guc(struct intel_gt *gt)
{
	int ret;

	GEM_BUG_ON(!HAS_GT_UC(gt->i915));

	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	ret = __reset_guc(gt);
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure the irq handler finishes and is not run again.
 * Also return the active request so that we only search for it once.
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}

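/*
 * Zap any userspace PTEs pointing at fenced GGTT mmaps so that the next CPU
 * access faults in again after the reset has restored the fence registers
 * (see intel_ggtt_restore_fences() in gt_reset()).
 */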
static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		vma_offset = vma->gtt_view.partial.offset << PAGE_SHIFT;

		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	/*
	 * For GuC mode with submission enabled, ensure submission
	 * is disabled before stopping ring.
	 *
	 * For GuC mode with submission disabled, do not sanitize the GuC
	 * here; that is done after the engine reset. reset_prepare()
	 * is followed by an engine reset, which in this mode requires the
	 * GuC to process any CSB FIFO entries generated by the resets.
	 */
	if (intel_uc_uses_guc_submission(&gt->uc))
		intel_uc_reset_prepare(&gt->uc);

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	return awake;
}

static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}

static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	local_bh_disable();
	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);
	local_bh_enable();

	intel_uc_reset(&gt->uc, ALL_ENGINES);

	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	intel_engine_signal_breadcrumbs(engine);
}

static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}

	intel_uc_reset_finish(&gt->uc);
}

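/*
 * Installed as engine->submit_request when the GT is wedged: every request is
 * immediately completed with -EIO instead of being sent to the hardware.
 */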
static void nop_submit_request(struct i915_request *request)
{
	RQ_TRACE(request, "-EIO\n");

	request = i915_request_mark_eio(request);
	if (request) {
		i915_request_submit(request);
		intel_engine_signal_breadcrumbs(request->engine);

		i915_request_put(request);
	}
}

static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!intel_gt_gpu_reset_clobbers_display(gt))
		intel_gt_reset_all_engines(gt);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	local_bh_disable();
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);
	intel_uc_cancel_requests(&gt->uc);
	local_bh_enable();

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

static void set_wedged_work(struct work_struct *w)
{
	struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
	intel_wakeref_t wf;

	with_intel_runtime_pm(gt->uncore->rpm, wf)
		__intel_gt_set_wedged(gt);
}

void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_dbg_printer(&gt->i915->drm,
						       DRM_UT_DRIVER, NULL);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO; for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!intel_gt_gpu_reset_clobbers_display(gt))
		ok = intel_gt_reset_all_engines(gt) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}

bool intel_gt_unset_wedged(struct intel_gt *gt)
{
	bool result;

	mutex_lock(&gt->reset.mutex);
	result = __intel_gt_unset_wedged(gt);
	mutex_unlock(&gt->reset.mutex);

	return result;
}

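/* Retry the chip reset with increasing back-off, then rebuild GT state via gt_reset(). */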
static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	int err, i;

	err = intel_gt_reset_all_engines(gt);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = intel_gt_reset_all_engines(gt);
	}
	if (err)
		return err;

	return gt_reset(gt, stalled_mask);
}

static int resume(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int ret;

	for_each_engine(engine, gt, id) {
		ret = intel_engine_resume(engine);
		if (ret)
			return ret;
	}

	return 0;
}

bool intel_gt_gpu_reset_clobbers_display(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;

	return INTEL_INFO(i915)->gpu_reset_clobbers_display;
}

/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *  - reset the chip using the reset reg
 *  - re-init context state
 *  - re-init hardware status page
 *  - re-init ring buffer
 *  - re-init interrupt state
 *  - re-init display
 */
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	struct intel_display *display = gt->i915->display;
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));

	/*
	 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence
	 * critical section like gpu reset.
	 */
	gt_revoke(gt);

	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		gt_notice(gt, "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			gt_err(gt, "GPU reset not supported\n");
		else
			gt_dbg(gt, "GPU reset disabled\n");
		goto error;
	}

	if (intel_gt_gpu_reset_clobbers_display(gt))
		intel_irq_suspend(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		gt_err(gt, "Failed to reset chip\n");
		goto taint;
	}

	if (intel_gt_gpu_reset_clobbers_display(gt))
		intel_irq_resume(gt->i915);

	intel_overlay_reset(display);

	/* sanitize uC after engine reset */
	if (!intel_uc_uses_guc_submission(&gt->uc))
		intel_uc_reset_prepare(&gt->uc);
	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = intel_gt_init_hw(gt);
	if (ret) {
		gt_err(gt, "Failed to initialise HW following reset (%d)\n", ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}

/**
 * intel_gt_reset_all_engines() - Reset all engines in the given gt.
 * @gt: the GT to reset all engines for.
 *
 * This function resets all engines within the given gt.
 *
 * Returns:
 * Zero on success, negative error code on failure.
 */
int intel_gt_reset_all_engines(struct intel_gt *gt)
{
	return __intel_gt_reset(gt, ALL_ENGINES);
}

/**
 * intel_gt_reset_engine() - Reset a specific engine within a gt.
 * @engine: engine to be reset.
 *
 * This function resets the specified engine within a gt.
 *
 * Returns:
 * Zero on success, negative error code on failure.
 */
int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}

int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	if (intel_engine_uses_guc(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);

	ret = intel_gt_reset_engine(engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}

/**
 * intel_engine_reset - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no drm_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identifies the request that caused the hang and it is dropped
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	int err;

	local_bh_disable();
	err = __intel_engine_reset_bh(engine, msg);
	local_bh_enable();

	return err;
}

static void display_reset_modeset_stuck(void *gt)
{
	intel_gt_set_wedged(gt);
}

static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 60 * HZ) {
		struct drm_i915_private *i915 = gt->i915;
		struct intel_display *display = i915->display;
		bool need_display_reset;
		bool reset_display;

		need_display_reset = intel_gt_gpu_reset_clobbers_display(gt) &&
				     intel_has_gpu_reset(gt);

		reset_display = intel_display_reset_test(display) ||
				need_display_reset;

		if (reset_display)
			reset_display = intel_display_reset_prepare(display,
								    display_reset_modeset_stuck,
								    gt);

		intel_gt_reset(gt, engine_mask, reason);

		if (reset_display)
			intel_display_reset_finish(display, !need_display_reset);
	}

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
	else
		drm_dev_wedged_event(&gt->i915->drm,
				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
				     NULL);
}

/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (!intel_uc_uses_guc_submission(&gt->uc) &&
	    intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		local_bh_disable();
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_BACKOFF >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			if (__intel_engine_reset_bh(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
		local_bh_enable();
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		wait_event(gt->reset.queue,
			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
		goto out; /* piggy-back on the other reset */
	}

	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/*
	 * Prevent any other reset-engine attempt. We don't do this for GuC
	 * submission, as the GuC owns the per-engine reset, not the i915.
	 */
	if (!intel_uc_uses_guc_submission(&gt->uc)) {
		for_each_engine(engine, gt, tmp) {
			while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
						&gt->reset.flags))
				wait_on_bit(&gt->reset.flags,
					    I915_RESET_ENGINE + engine->id,
					    TASK_UNINTERRUPTIBLE);
		}
	}

	/* Flush everyone using a resource about to be clobbered */
	synchronize_srcu_expedited(&gt->reset.backoff_srcu);

	intel_gt_reset_global(gt, engine_mask, msg);

	if (!intel_uc_uses_guc_submission(&gt->uc)) {
		for_each_engine(engine, gt, tmp)
			clear_bit_unlock(I915_RESET_ENGINE + engine->id,
					 &gt->reset.flags);
	}
	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
	smp_mb__after_atomic();
	wake_up_all(&gt->reset.queue);

out:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

static int _intel_gt_reset_lock(struct intel_gt *gt, int *srcu, bool retry)
{
	might_lock(&gt->reset.backoff_srcu);
	if (retry)
		might_sleep();

	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		rcu_read_unlock();

		if (!retry)
			return -EBUSY;

		if (wait_event_interruptible(gt->reset.queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &gt->reset.flags)))
			return -EINTR;

		rcu_read_lock();
	}
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	return 0;
}

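/*
 * Callers that poke at hardware which a reset would clobber are expected to
 * bracket the access with the reset backoff SRCU, roughly as follows
 * (illustrative sketch only, not a real caller):
 *
 *	int srcu, err;
 *
 *	err = intel_gt_reset_trylock(gt, &srcu);
 *	if (err)
 *		return err;
 *	... touch the GGTT / registers ...
 *	intel_gt_reset_unlock(gt, srcu);
 */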
int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
{
	return _intel_gt_reset_lock(gt, srcu, false);
}

int intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu)
{
	return _intel_gt_reset_lock(gt, srcu, true);
}

void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
__releases(&gt->reset.backoff_srcu)
{
	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
}

int intel_gt_terminally_wedged(struct intel_gt *gt)
{
	might_sleep();

	if (!intel_gt_is_wedged(gt))
		return 0;

	if (intel_gt_has_unrecoverable_error(gt))
		return -EIO;

	/* Reset still in progress? Maybe we will recover? */
	if (wait_event_interruptible(gt->reset.queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &gt->reset.flags)))
		return -EINTR;

	return intel_gt_is_wedged(gt) ? -EIO : 0;
}

void intel_gt_set_wedged_on_init(struct intel_gt *gt)
{
	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
		     I915_WEDGED_ON_INIT);
	intel_gt_set_wedged(gt);
	i915_disable_error_state(gt->i915, -ENODEV);
	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);

	/* Wedged on init is non-recoverable */
	add_taint_for_CI(gt->i915, TAINT_WARN);
}

void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
{
	intel_gt_set_wedged(gt);
	i915_disable_error_state(gt->i915, -ENODEV);
	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
	intel_gt_retire_requests(gt); /* cleanup any wedged requests */
}

void intel_gt_init_reset(struct intel_gt *gt)
{
	init_waitqueue_head(&gt->reset.queue);
	mutex_init(&gt->reset.mutex);
	init_srcu_struct(&gt->reset.backoff_srcu);
	INIT_WORK(&gt->wedge, set_wedged_work);

	/*
	 * While undesirable to wait inside the shrinker, complain anyway.
	 *
	 * If we have to wait during shrinking, we guarantee forward progress
	 * by forcing the reset. Therefore during the reset we must not
	 * re-enter the shrinker. By declaring that we take the reset mutex
	 * within the shrinker, we forbid ourselves from performing any
	 * fs-reclaim or taking related locks during reset.
	 */
	i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);

	/* no GPU until we are ready! */
	__set_bit(I915_WEDGED, &gt->reset.flags);
}

void intel_gt_fini_reset(struct intel_gt *gt)
{
	cleanup_srcu_struct(&gt->reset.backoff_srcu);
}

static void intel_wedge_me(struct work_struct *work)
{
	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);

	gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
	set_wedged_work(&w->gt->wedge);
}

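/*
 * __intel_init_wedge()/__intel_fini_wedge() back the intel_wedge_on_timeout()
 * helper (see intel_gt_reset_global() above): if the guarded section does not
 * finish within the given timeout, the delayed work fires and wedges the GT
 * rather than leaving the reset stuck forever.
 */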
void __intel_init_wedge(struct intel_wedge_me *w,
			struct intel_gt *gt,
			long timeout,
			const char *name)
{
	w->gt = gt;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
	queue_delayed_work(gt->i915->unordered_wq, &w->work, timeout);
}

void __intel_fini_wedge(struct intel_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->gt = NULL;
}

/*
 * Wa_22011802037 requires that we (or the GuC) ensure that no command
 * streamers are executing MI_FORCE_WAKE while an engine reset is initiated.
 */
bool intel_engine_reset_needs_wa_22011802037(struct intel_gt *gt)
{
	if (GRAPHICS_VER(gt->i915) < 11)
		return false;

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0))
		return true;

	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
		return false;

	return true;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_reset.c"
#include "selftest_hangcheck.c"
#endif