| 1 | // SPDX-License-Identifier: MIT | 
|---|
| 2 | /* | 
|---|
| 3 | * Copyright © 2019 Intel Corporation | 
|---|
| 4 | */ | 
|---|
| 5 |  | 
|---|
| 6 | #include "i915_drv.h" | 
|---|
| 7 | #include "i915_request.h" | 
|---|
| 8 |  | 
|---|
| 9 | #include "intel_context.h" | 
|---|
| 10 | #include "intel_engine_heartbeat.h" | 
|---|
| 11 | #include "intel_engine_pm.h" | 
|---|
| 12 | #include "intel_engine.h" | 
|---|
| 13 | #include "intel_gt.h" | 
|---|
| 14 | #include "intel_reset.h" | 
|---|
| 15 |  | 
|---|
| 16 | /* | 
|---|
| 17 | * While the engine is active, we send a periodic pulse along the engine | 
|---|
| 18 | * to check on its health and to flush any idle-barriers. If that request | 
|---|
| 19 | * is stuck, and we fail to preempt it, we declare the engine hung and | 
|---|
| 20 | * issue a reset -- in the hope that restores progress. | 
|---|
| 21 | */ | 
|---|
| 22 |  | 
|---|
| 23 | static bool next_heartbeat(struct intel_engine_cs *engine) | 
|---|
| 24 | { | 
|---|
| 25 | struct i915_request *rq; | 
|---|
| 26 | long delay; | 
|---|
| 27 |  | 
|---|
| 28 | delay = READ_ONCE(engine->props.heartbeat_interval_ms); | 
|---|
| 29 |  | 
|---|
| 30 | rq = engine->heartbeat.systole; | 
|---|
| 31 |  | 
|---|
| 32 | /* | 
|---|
| 33 | * FIXME: The final period extension is disabled if the period has been | 
|---|
| 34 | * modified from the default. This is to prevent issues with certain | 
|---|
| 35 | * selftests which override the value and expect specific behaviour. | 
|---|
| 36 | * Once the selftests have been updated to either cope with variable | 
|---|
| 37 | * heartbeat periods (or to override the pre-emption timeout as well, | 
|---|
| 38 | * or just to add a selftest specific override of the extension), the | 
|---|
| 39 | * generic override can be removed. | 
|---|
| 40 | */ | 
|---|
| 41 | if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER && | 
|---|
| 42 | delay == engine->defaults.heartbeat_interval_ms) { | 
|---|
| 43 | long longer; | 
|---|
| 44 |  | 
|---|
| 45 | /* | 
|---|
| 46 | * The final try is at the highest priority possible. Up until now | 
|---|
| 47 | * a pre-emption might not even have been attempted. So make sure | 
|---|
| 48 | * this last attempt allows enough time for a pre-emption to occur. | 
|---|
| 49 | */ | 
|---|
| 50 | longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2; | 
|---|
| 51 | longer = intel_clamp_heartbeat_interval_ms(engine, value: longer); | 
|---|
| 52 | if (longer > delay) | 
|---|
| 53 | delay = longer; | 
|---|
| 54 | } | 
|---|
| 55 |  | 
|---|
| 56 | if (!delay) | 
|---|
| 57 | return false; | 
|---|
| 58 |  | 
|---|
| 59 | delay = msecs_to_jiffies_timeout(m: delay); | 
|---|
| 60 | if (delay >= HZ) | 
|---|
| 61 | delay = round_jiffies_up_relative(j: delay); | 
|---|
| 62 | mod_delayed_work(wq: system_highpri_wq, dwork: &engine->heartbeat.work, delay: delay + 1); | 
|---|
| 63 |  | 
|---|
| 64 | return true; | 
|---|
| 65 | } | 
|---|
| 66 |  | 
|---|
| 67 | static struct i915_request * | 
|---|
| 68 | heartbeat_create(struct intel_context *ce, gfp_t gfp) | 
|---|
| 69 | { | 
|---|
| 70 | struct i915_request *rq; | 
|---|
| 71 |  | 
|---|
| 72 | intel_context_enter(ce); | 
|---|
| 73 | rq = __i915_request_create(ce, gfp); | 
|---|
| 74 | intel_context_exit(ce); | 
|---|
| 75 |  | 
|---|
| 76 | return rq; | 
|---|
| 77 | } | 
|---|
| 78 |  | 
|---|
| 79 | static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) | 
|---|
| 80 | { | 
|---|
| 81 | engine->wakeref_serial = READ_ONCE(engine->serial) + 1; | 
|---|
| 82 | i915_request_add_active_barriers(rq); | 
|---|
| 83 | if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine)) | 
|---|
| 84 | engine->heartbeat.systole = i915_request_get(rq); | 
|---|
| 85 | } | 
|---|
| 86 |  | 
|---|
| 87 | static void heartbeat_commit(struct i915_request *rq, | 
|---|
| 88 | const struct i915_sched_attr *attr) | 
|---|
| 89 | { | 
|---|
| 90 | idle_pulse(engine: rq->engine, rq); | 
|---|
| 91 |  | 
|---|
| 92 | __i915_request_commit(request: rq); | 
|---|
| 93 | __i915_request_queue(rq, attr); | 
|---|
| 94 | } | 
|---|
| 95 |  | 
|---|
| 96 | static void show_heartbeat(const struct i915_request *rq, | 
|---|
| 97 | struct intel_engine_cs *engine) | 
|---|
| 98 | { | 
|---|
| 99 | struct drm_printer p = | 
|---|
| 100 | drm_dbg_printer(drm: &engine->i915->drm, category: DRM_UT_DRIVER, prefix: "heartbeat"); | 
|---|
| 101 |  | 
|---|
| 102 | if (!rq) { | 
|---|
| 103 | intel_engine_dump(engine, m: &p, | 
|---|
| 104 | header: "%s heartbeat not ticking\n", | 
|---|
| 105 | engine->name); | 
|---|
| 106 | } else { | 
|---|
| 107 | intel_engine_dump(engine, m: &p, | 
|---|
| 108 | header: "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n", | 
|---|
| 109 | engine->name, | 
|---|
| 110 | rq->fence.context, | 
|---|
| 111 | rq->fence.seqno, | 
|---|
| 112 | rq->sched.attr.priority); | 
|---|
| 113 | } | 
|---|
| 114 | } | 
|---|
| 115 |  | 
|---|
| 116 | static void | 
|---|
| 117 | reset_engine(struct intel_engine_cs *engine, struct i915_request *rq) | 
|---|
| 118 | { | 
|---|
| 119 | if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) | 
|---|
| 120 | show_heartbeat(rq, engine); | 
|---|
| 121 |  | 
|---|
| 122 | if (intel_engine_uses_guc(engine)) | 
|---|
| 123 | /* | 
|---|
| 124 | * GuC itself is toast or GuC's hang detection | 
|---|
| 125 | * is disabled. Either way, need to find the | 
|---|
| 126 | * hang culprit manually. | 
|---|
| 127 | */ | 
|---|
| 128 | intel_guc_find_hung_context(engine); | 
|---|
| 129 |  | 
|---|
| 130 | intel_gt_handle_error(gt: engine->gt, engine_mask: engine->mask, | 
|---|
| 131 | I915_ERROR_CAPTURE, | 
|---|
| 132 | fmt: "stopped heartbeat on %s", | 
|---|
| 133 | engine->name); | 
|---|
| 134 | } | 
|---|
| 135 |  | 
|---|
| 136 | static void heartbeat(struct work_struct *wrk) | 
|---|
| 137 | { | 
|---|
| 138 | struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN }; | 
|---|
| 139 | struct intel_engine_cs *engine = | 
|---|
| 140 | container_of(wrk, typeof(*engine), heartbeat.work.work); | 
|---|
| 141 | struct intel_context *ce = engine->kernel_context; | 
|---|
| 142 | struct i915_request *rq; | 
|---|
| 143 | unsigned long serial; | 
|---|
| 144 |  | 
|---|
| 145 | /* Just in case everything has gone horribly wrong, give it a kick */ | 
|---|
| 146 | intel_engine_flush_submission(engine); | 
|---|
| 147 |  | 
|---|
| 148 | rq = engine->heartbeat.systole; | 
|---|
| 149 | if (rq && i915_request_completed(rq)) { | 
|---|
| 150 | i915_request_put(rq); | 
|---|
| 151 | engine->heartbeat.systole = NULL; | 
|---|
| 152 | } | 
|---|
| 153 |  | 
|---|
| 154 | if (!intel_engine_pm_get_if_awake(engine)) | 
|---|
| 155 | return; | 
|---|
| 156 |  | 
|---|
| 157 | if (intel_gt_is_wedged(gt: engine->gt)) | 
|---|
| 158 | goto out; | 
|---|
| 159 |  | 
|---|
| 160 | if (i915_sched_engine_disabled(sched_engine: engine->sched_engine)) { | 
|---|
| 161 | reset_engine(engine, rq: engine->heartbeat.systole); | 
|---|
| 162 | goto out; | 
|---|
| 163 | } | 
|---|
| 164 |  | 
|---|
| 165 | if (engine->heartbeat.systole) { | 
|---|
| 166 | long delay = READ_ONCE(engine->props.heartbeat_interval_ms); | 
|---|
| 167 |  | 
|---|
| 168 | /* Safeguard against too-fast worker invocations */ | 
|---|
| 169 | if (!time_after(jiffies, | 
|---|
| 170 | rq->emitted_jiffies + msecs_to_jiffies(delay))) | 
|---|
| 171 | goto out; | 
|---|
| 172 |  | 
|---|
| 173 | if (!i915_sw_fence_signaled(fence: &rq->submit)) { | 
|---|
| 174 | /* | 
|---|
| 175 | * Not yet submitted, system is stalled. | 
|---|
| 176 | * | 
|---|
| 177 | * This more often happens for ring submission, | 
|---|
| 178 | * where all contexts are funnelled into a common | 
|---|
| 179 | * ringbuffer. If one context is blocked on an | 
|---|
| 180 | * external fence, not only is it not submitted, | 
|---|
| 181 | * but all other contexts, including the kernel | 
|---|
| 182 | * context are stuck waiting for the signal. | 
|---|
| 183 | */ | 
|---|
| 184 | } else if (engine->sched_engine->schedule && | 
|---|
| 185 | rq->sched.attr.priority < I915_PRIORITY_BARRIER) { | 
|---|
| 186 | /* | 
|---|
| 187 | * Gradually raise the priority of the heartbeat to | 
|---|
| 188 | * give high priority work [which presumably desires | 
|---|
| 189 | * low latency and no jitter] the chance to naturally | 
|---|
| 190 | * complete before being preempted. | 
|---|
| 191 | */ | 
|---|
| 192 | attr.priority = I915_PRIORITY_NORMAL; | 
|---|
| 193 | if (rq->sched.attr.priority >= attr.priority) | 
|---|
| 194 | attr.priority = I915_PRIORITY_HEARTBEAT; | 
|---|
| 195 | if (rq->sched.attr.priority >= attr.priority) | 
|---|
| 196 | attr.priority = I915_PRIORITY_BARRIER; | 
|---|
| 197 |  | 
|---|
| 198 | local_bh_disable(); | 
|---|
| 199 | engine->sched_engine->schedule(rq, &attr); | 
|---|
| 200 | local_bh_enable(); | 
|---|
| 201 | } else { | 
|---|
| 202 | reset_engine(engine, rq); | 
|---|
| 203 | } | 
|---|
| 204 |  | 
|---|
| 205 | rq->emitted_jiffies = jiffies; | 
|---|
| 206 | goto out; | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | serial = READ_ONCE(engine->serial); | 
|---|
| 210 | if (engine->wakeref_serial == serial) | 
|---|
| 211 | goto out; | 
|---|
| 212 |  | 
|---|
| 213 | if (!mutex_trylock(lock: &ce->timeline->mutex)) { | 
|---|
| 214 | /* Unable to lock the kernel timeline, is the engine stuck? */ | 
|---|
| 215 | if (xchg(&engine->heartbeat.blocked, serial) == serial) | 
|---|
| 216 | intel_gt_handle_error(gt: engine->gt, engine_mask: engine->mask, | 
|---|
| 217 | I915_ERROR_CAPTURE, | 
|---|
| 218 | fmt: "no heartbeat on %s", | 
|---|
| 219 | engine->name); | 
|---|
| 220 | goto out; | 
|---|
| 221 | } | 
|---|
| 222 |  | 
|---|
| 223 | rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); | 
|---|
| 224 | if (IS_ERR(ptr: rq)) | 
|---|
| 225 | goto unlock; | 
|---|
| 226 |  | 
|---|
| 227 | heartbeat_commit(rq, attr: &attr); | 
|---|
| 228 |  | 
|---|
| 229 | unlock: | 
|---|
| 230 | mutex_unlock(lock: &ce->timeline->mutex); | 
|---|
| 231 | out: | 
|---|
| 232 | if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) | 
|---|
| 233 | i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); | 
|---|
| 234 | intel_engine_pm_put(engine); | 
|---|
| 235 | } | 
|---|
| 236 |  | 
|---|
| 237 | void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) | 
|---|
| 238 | { | 
|---|
| 239 | if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL) | 
|---|
| 240 | return; | 
|---|
| 241 |  | 
|---|
| 242 | next_heartbeat(engine); | 
|---|
| 243 | } | 
|---|
| 244 |  | 
|---|
| 245 | void intel_engine_park_heartbeat(struct intel_engine_cs *engine) | 
|---|
| 246 | { | 
|---|
| 247 | if (cancel_delayed_work(dwork: &engine->heartbeat.work)) | 
|---|
| 248 | i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); | 
|---|
| 249 | } | 
|---|
| 250 |  | 
|---|
| 251 | void intel_gt_unpark_heartbeats(struct intel_gt *gt) | 
|---|
| 252 | { | 
|---|
| 253 | struct intel_engine_cs *engine; | 
|---|
| 254 | enum intel_engine_id id; | 
|---|
| 255 |  | 
|---|
| 256 | for_each_engine(engine, gt, id) | 
|---|
| 257 | if (intel_engine_pm_is_awake(engine)) | 
|---|
| 258 | intel_engine_unpark_heartbeat(engine); | 
|---|
| 259 | } | 
|---|
| 260 |  | 
|---|
| 261 | void intel_gt_park_heartbeats(struct intel_gt *gt) | 
|---|
| 262 | { | 
|---|
| 263 | struct intel_engine_cs *engine; | 
|---|
| 264 | enum intel_engine_id id; | 
|---|
| 265 |  | 
|---|
| 266 | for_each_engine(engine, gt, id) | 
|---|
| 267 | intel_engine_park_heartbeat(engine); | 
|---|
| 268 | } | 
|---|
| 269 |  | 
|---|
| 270 | void intel_engine_init_heartbeat(struct intel_engine_cs *engine) | 
|---|
| 271 | { | 
|---|
| 272 | INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); | 
|---|
| 273 | } | 
|---|
| 274 |  | 
|---|
| 275 | static int __intel_engine_pulse(struct intel_engine_cs *engine) | 
|---|
| 276 | { | 
|---|
| 277 | struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; | 
|---|
| 278 | struct intel_context *ce = engine->kernel_context; | 
|---|
| 279 | struct i915_request *rq; | 
|---|
| 280 |  | 
|---|
| 281 | lockdep_assert_held(&ce->timeline->mutex); | 
|---|
| 282 | GEM_BUG_ON(!intel_engine_has_preemption(engine)); | 
|---|
| 283 | GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); | 
|---|
| 284 |  | 
|---|
| 285 | rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); | 
|---|
| 286 | if (IS_ERR(ptr: rq)) | 
|---|
| 287 | return PTR_ERR(ptr: rq); | 
|---|
| 288 |  | 
|---|
| 289 | __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); | 
|---|
| 290 |  | 
|---|
| 291 | heartbeat_commit(rq, attr: &attr); | 
|---|
| 292 | GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); | 
|---|
| 293 |  | 
|---|
| 294 | /* Ensure the forced pulse gets a full period to execute */ | 
|---|
| 295 | next_heartbeat(engine); | 
|---|
| 296 |  | 
|---|
| 297 | return 0; | 
|---|
| 298 | } | 
|---|
| 299 |  | 
|---|
| 300 | static unsigned long set_heartbeat(struct intel_engine_cs *engine, | 
|---|
| 301 | unsigned long delay) | 
|---|
| 302 | { | 
|---|
| 303 | unsigned long old; | 
|---|
| 304 |  | 
|---|
| 305 | old = xchg(&engine->props.heartbeat_interval_ms, delay); | 
|---|
| 306 | if (delay) | 
|---|
| 307 | intel_engine_unpark_heartbeat(engine); | 
|---|
| 308 | else | 
|---|
| 309 | intel_engine_park_heartbeat(engine); | 
|---|
| 310 |  | 
|---|
| 311 | return old; | 
|---|
| 312 | } | 
|---|
| 313 |  | 
|---|
| 314 | int intel_engine_set_heartbeat(struct intel_engine_cs *engine, | 
|---|
| 315 | unsigned long delay) | 
|---|
| 316 | { | 
|---|
| 317 | struct intel_context *ce = engine->kernel_context; | 
|---|
| 318 | int err = 0; | 
|---|
| 319 |  | 
|---|
| 320 | if (!delay && !intel_engine_has_preempt_reset(engine)) | 
|---|
| 321 | return -ENODEV; | 
|---|
| 322 |  | 
|---|
| 323 | /* FIXME: Remove together with equally marked hack in next_heartbeat. */ | 
|---|
| 324 | if (delay != engine->defaults.heartbeat_interval_ms && | 
|---|
| 325 | delay < 2 * engine->props.preempt_timeout_ms) { | 
|---|
| 326 | if (intel_engine_uses_guc(engine)) | 
|---|
| 327 | drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n", | 
|---|
| 328 | engine->name); | 
|---|
| 329 | else | 
|---|
| 330 | drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n", | 
|---|
| 331 | engine->name); | 
|---|
| 332 | } | 
|---|
| 333 |  | 
|---|
| 334 | intel_engine_pm_get(engine); | 
|---|
| 335 |  | 
|---|
| 336 | err = mutex_lock_interruptible(lock: &ce->timeline->mutex); | 
|---|
| 337 | if (err) | 
|---|
| 338 | goto out_rpm; | 
|---|
| 339 |  | 
|---|
| 340 | if (delay != engine->props.heartbeat_interval_ms) { | 
|---|
| 341 | unsigned long saved = set_heartbeat(engine, delay); | 
|---|
| 342 |  | 
|---|
| 343 | /* recheck current execution */ | 
|---|
| 344 | if (intel_engine_has_preemption(engine)) { | 
|---|
| 345 | err = __intel_engine_pulse(engine); | 
|---|
| 346 | if (err) | 
|---|
| 347 | set_heartbeat(engine, delay: saved); | 
|---|
| 348 | } | 
|---|
| 349 | } | 
|---|
| 350 |  | 
|---|
| 351 | mutex_unlock(lock: &ce->timeline->mutex); | 
|---|
| 352 |  | 
|---|
| 353 | out_rpm: | 
|---|
| 354 | intel_engine_pm_put(engine); | 
|---|
| 355 | return err; | 
|---|
| 356 | } | 
|---|
| 357 |  | 
|---|
| 358 | int intel_engine_pulse(struct intel_engine_cs *engine) | 
|---|
| 359 | { | 
|---|
| 360 | struct intel_context *ce = engine->kernel_context; | 
|---|
| 361 | int err; | 
|---|
| 362 |  | 
|---|
| 363 | if (!intel_engine_has_preemption(engine)) | 
|---|
| 364 | return -ENODEV; | 
|---|
| 365 |  | 
|---|
| 366 | if (!intel_engine_pm_get_if_awake(engine)) | 
|---|
| 367 | return 0; | 
|---|
| 368 |  | 
|---|
| 369 | err = -EINTR; | 
|---|
| 370 | if (!mutex_lock_interruptible(lock: &ce->timeline->mutex)) { | 
|---|
| 371 | err = __intel_engine_pulse(engine); | 
|---|
| 372 | mutex_unlock(lock: &ce->timeline->mutex); | 
|---|
| 373 | } | 
|---|
| 374 |  | 
|---|
| 375 | intel_engine_flush_submission(engine); | 
|---|
| 376 | intel_engine_pm_put(engine); | 
|---|
| 377 | return err; | 
|---|
| 378 | } | 
|---|
| 379 |  | 
|---|
| 380 | int intel_engine_flush_barriers(struct intel_engine_cs *engine) | 
|---|
| 381 | { | 
|---|
| 382 | struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN }; | 
|---|
| 383 | struct intel_context *ce = engine->kernel_context; | 
|---|
| 384 | struct i915_request *rq; | 
|---|
| 385 | int err; | 
|---|
| 386 |  | 
|---|
| 387 | if (llist_empty(head: &engine->barrier_tasks)) | 
|---|
| 388 | return 0; | 
|---|
| 389 |  | 
|---|
| 390 | if (!intel_engine_pm_get_if_awake(engine)) | 
|---|
| 391 | return 0; | 
|---|
| 392 |  | 
|---|
| 393 | if (mutex_lock_interruptible(lock: &ce->timeline->mutex)) { | 
|---|
| 394 | err = -EINTR; | 
|---|
| 395 | goto out_rpm; | 
|---|
| 396 | } | 
|---|
| 397 |  | 
|---|
| 398 | rq = heartbeat_create(ce, GFP_KERNEL); | 
|---|
| 399 | if (IS_ERR(ptr: rq)) { | 
|---|
| 400 | err = PTR_ERR(ptr: rq); | 
|---|
| 401 | goto out_unlock; | 
|---|
| 402 | } | 
|---|
| 403 |  | 
|---|
| 404 | heartbeat_commit(rq, attr: &attr); | 
|---|
| 405 |  | 
|---|
| 406 | err = 0; | 
|---|
| 407 | out_unlock: | 
|---|
| 408 | mutex_unlock(lock: &ce->timeline->mutex); | 
|---|
| 409 | out_rpm: | 
|---|
| 410 | intel_engine_pm_put(engine); | 
|---|
| 411 | return err; | 
|---|
| 412 | } | 
|---|
| 413 |  | 
|---|
| 414 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) | 
|---|
| 415 | #include "selftest_engine_heartbeat.c" | 
|---|
| 416 | #endif | 
|---|
| 417 |  | 
|---|