1// SPDX-License-Identifier: MIT
2/*
3 * Copyright © 2014-2018 Intel Corporation
4 */
5
6#include "i915_drv.h"
7#include "i915_reg.h"
8#include "intel_context.h"
9#include "intel_engine_pm.h"
10#include "intel_engine_regs.h"
11#include "intel_gpu_commands.h"
12#include "intel_gt.h"
13#include "intel_gt_ccs_mode.h"
14#include "intel_gt_mcr.h"
15#include "intel_gt_print.h"
16#include "intel_gt_regs.h"
17#include "intel_ring.h"
18#include "intel_workarounds.h"
19
20#include "display/intel_fbc_regs.h"
21
22/**
23 * DOC: Hardware workarounds
24 *
25 * Hardware workarounds are register programming documented to be executed in
26 * the driver that fall outside of the normal programming sequences for a
27 * platform. There are some basic categories of workarounds, depending on
28 * how/when they are applied:
29 *
30 * - Context workarounds: workarounds that touch registers that are
31 * saved/restored to/from the HW context image. The list is emitted (via Load
32 * Register Immediate commands) once when initializing the device and saved in
33 * the default context. That default context is then used on every context
34 * creation to have a "primed golden context", i.e. a context image that
35 * already contains the changes needed to all the registers.
36 *
37 * Context workarounds should be implemented in the \*_ctx_workarounds_init()
38 * variants respective to the targeted platforms.
39 *
40 * - Engine workarounds: the list of these WAs is applied whenever the specific
41 * engine is reset. It's also possible that a set of engine classes share a
 * common power domain and they are reset together. This happens on some
 * platforms with render and compute engines. In this case (at least) one of
 * them needs to keep the workaround programming: the approach taken in the
 * driver is to tie those workarounds to the first compute/render engine that
 * is registered. When executing with GuC submission, engine resets are
 * outside of kernel driver control, hence the list of registers involved is
 * written once, on engine initialization, and then passed to GuC, which
 * saves/restores their values before/after the reset takes place. See
 * ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
51 *
52 * Workarounds for registers specific to RCS and CCS should be implemented in
53 * rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
54 * registers belonging to BCS, VCS or VECS should be implemented in
55 * xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
 * engine's MMIO range but that are part of the common RCS/CCS reset domain
 * should be implemented in general_render_compute_wa_init(). The settings
 * related to CCS load balancing should be added in ccs_engine_wa_mode().
59 *
60 * - GT workarounds: the list of these WAs is applied whenever these registers
61 * revert to their default values: on GPU reset, suspend/resume [1]_, etc.
62 *
63 * GT workarounds should be implemented in the \*_gt_workarounds_init()
64 * variants respective to the targeted platforms.
65 *
66 * - Register whitelist: some workarounds need to be implemented in userspace,
67 * but need to touch privileged registers. The whitelist in the kernel
68 * instructs the hardware to allow the access to happen. From the kernel side,
 * this is just a special case of an MMIO workaround (as we write the list of
 * these to-be-whitelisted registers to some special HW registers).
71 *
72 * Register whitelisting should be done in the \*_whitelist_build() variants
73 * respective to the targeted platforms.
74 *
75 * - Workaround batchbuffers: buffers that get executed automatically by the
 * hardware on every HW context restore. These buffers are created and
 * programmed in the default context so the hardware always goes through
 * those programming sequences when switching contexts. Support for
 * workaround batchbuffers is enabled via these hardware mechanisms:
80 *
81 * #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
82 * context, pointing the hardware to jump to that location when that offset
 * is reached in the context restore. The workaround batchbuffer in the
 * driver currently uses this mechanism on all platforms.
85 *
86 * #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
87 * pointing the hardware to a buffer to continue executing after the
88 * engine registers are restored in a context restore sequence. This is
89 * currently not used in the driver.
90 *
91 * - Other: There are WAs that, due to their nature, cannot be applied from a
92 * central place. Those are peppered around the rest of the code, as needed.
93 * Workarounds related to the display IP are the main example.
94 *
95 * .. [1] Technically, some registers are powercontext saved & restored, so they
96 * survive a suspend/resume. In practice, writing them again is not too
97 * costly and simplifies things, so it's the approach taken in the driver.
98 */
99
100static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
101 const char *name, const char *engine_name)
102{
103 wal->gt = gt;
104 wal->name = name;
105 wal->engine_name = engine_name;
106}
107
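/* Workaround lists are allocated, grown and trimmed in chunks of this size. */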
108#define WA_LIST_CHUNK (1 << 4)
109
110static void wa_init_finish(struct i915_wa_list *wal)
111{
112 /* Trim unused entries. */
113 if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
		struct i915_wa *list = kmemdup_array(wal->list, wal->count,
						     sizeof(*list), GFP_KERNEL);
116
117 if (list) {
			kfree(wal->list);
119 wal->list = list;
120 }
121 }
122
123 if (!wal->count)
124 return;
125
126 gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n",
127 wal->wa_count, wal->name, wal->engine_name);
128}
129
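/*
 * Collect the union of forcewake domains needed to read and write every
 * register in the list, so that a single forcewake get covers the whole
 * read-modify-write sequence.
 */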
130static enum forcewake_domains
131wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
132{
133 enum forcewake_domains fw = 0;
134 struct i915_wa *wa;
135 unsigned int i;
136
137 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
138 fw |= intel_uncore_forcewake_for_reg(uncore,
						     wa->reg,
140 FW_REG_READ |
141 FW_REG_WRITE);
142
143 return fw;
144}
145
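/*
 * Add a workaround to the list, keeping the list sorted by register offset:
 * grow the array in chunks of WA_LIST_CHUNK when needed, binary search for an
 * existing entry for the same register (merging with it, and warning if
 * previously-set bits would be overwritten), otherwise append the new entry
 * and bubble it back into its sorted position.
 */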
146static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
147{
148 unsigned int addr = i915_mmio_reg_offset(wa->reg);
149 struct drm_i915_private *i915 = wal->gt->i915;
150 unsigned int start = 0, end = wal->count;
151 const unsigned int grow = WA_LIST_CHUNK;
152 struct i915_wa *wa_;
153
154 GEM_BUG_ON(!is_power_of_2(grow));
155
156 if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
157 struct i915_wa *list;
158
159 list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*list),
160 GFP_KERNEL);
161 if (!list) {
162 drm_err(&i915->drm, "No space for workaround init!\n");
163 return;
164 }
165
166 if (wal->list) {
			memcpy(list, wal->list, sizeof(*wa) * wal->count);
			kfree(wal->list);
169 }
170
171 wal->list = list;
172 }
173
174 while (start < end) {
175 unsigned int mid = start + (end - start) / 2;
176
177 if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
178 start = mid + 1;
179 } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
180 end = mid;
181 } else {
182 wa_ = &wal->list[mid];
183
184 if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
185 drm_err(&i915->drm,
186 "Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
187 i915_mmio_reg_offset(wa_->reg),
188 wa_->clr, wa_->set);
189
190 wa_->set &= ~wa->clr;
191 }
192
193 wal->wa_count++;
194 wa_->set |= wa->set;
195 wa_->clr |= wa->clr;
196 wa_->read |= wa->read;
197 return;
198 }
199 }
200
201 wal->wa_count++;
202 wa_ = &wal->list[wal->count++];
203 *wa_ = *wa;
204
205 while (wa_-- > wal->list) {
206 GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
207 i915_mmio_reg_offset(wa_[1].reg));
208 if (i915_mmio_reg_offset(wa_[1].reg) >
209 i915_mmio_reg_offset(wa_[0].reg))
210 break;
211
212 swap(wa_[1], wa_[0]);
213 }
214}
215
216static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
217 u32 clear, u32 set, u32 read_mask, bool masked_reg)
218{
219 struct i915_wa wa = {
220 .reg = reg,
221 .clr = clear,
222 .set = set,
223 .read = read_mask,
224 .masked_reg = masked_reg,
225 };
226
	_wa_add(wal, &wa);
228}
229
230static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
231 u32 clear, u32 set, u32 read_mask, bool masked_reg)
232{
233 struct i915_wa wa = {
234 .mcr_reg = reg,
235 .clr = clear,
236 .set = set,
237 .read = read_mask,
238 .masked_reg = masked_reg,
239 .is_mcr = 1,
240 };
241
	_wa_add(wal, &wa);
243}
244
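/*
 * Convenience wrappers around wa_add()/wa_mcr_add(): wa_write() overwrites
 * the whole register, wa_write_or() only sets bits, wa_write_clr() only
 * clears bits, and wa_write_clr_set() does an explicit clear+set. The _mcr_
 * variants record the same operations for multicast/replicated (MCR)
 * registers.
 */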
245static void
246wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
247{
	wa_add(wal, reg, clear, set, clear | set, false);
249}
250
251static void
252wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
253{
	wa_mcr_add(wal, reg, clear, set, clear | set, false);
255}
256
257static void
258wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
259{
	wa_write_clr_set(wal, reg, ~0, set);
261}
262
263static void
264wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
265{
	wa_write_clr_set(wal, reg, set, set);
267}
268
269static void
270wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
271{
	wa_mcr_write_clr_set(wal, reg, set, set);
273}
274
275static void
276wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
277{
	wa_write_clr_set(wal, reg, clr, 0);
279}
280
281static void
282wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
283{
	wa_mcr_write_clr_set(wal, reg, clr, 0);
285}
286
287/*
288 * WA operations on "masked register". A masked register has the upper 16 bits
289 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
290 * portion of the register without a rmw: you simply write in the upper 16 bits
291 * the mask of bits you are going to modify.
292 *
293 * The wa_masked_* family of functions already does the necessary operations to
294 * calculate the mask based on the parameters passed, so user only has to
295 * provide the lower 16 bits of that register.
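 *
 * For example, wa_masked_en(wal, reg, BIT(5)) records a write of
 * _MASKED_BIT_ENABLE(BIT(5)), i.e. (BIT(5) << 16) | BIT(5): the upper 16
 * bits select which bit to update and the lower 16 bits provide its new
 * value, so no read-modify-write of the register is needed.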
296 */
297
298static void
299wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
300{
	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
302}
303
304static void
305wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
306{
	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
308}
309
310static void
311wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
312{
	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
314}
315
316static void
317wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
318{
	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
320}
321
322static void
323wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
324 u32 mask, u32 val)
325{
	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
327}
328
329static void
330wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
331 u32 mask, u32 val)
332{
	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
334}
335
336static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
337 struct i915_wa_list *wal)
338{
339 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
340
341 /* WaDisable_RenderCache_OperationalFlush:snb */
342 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
343}
344
345static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
346 struct i915_wa_list *wal)
347{
348 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
349 /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
350 wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
351
352 /*
353 * BSpec says this must be set, even though
354 * WaDisable4x2SubspanOptimization:ivb,hsw
355 * WaDisable4x2SubspanOptimization isn't listed for VLV.
356 */
357 wa_masked_en(wal,
358 CACHE_MODE_1,
359 PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
360}
361
362static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
363 struct i915_wa_list *wal)
364{
365 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
366
367 /* WaDisableAsyncFlipPerfMode:bdw,chv */
368 wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
369
370 /* WaDisablePartialInstShootdown:bdw,chv */
371 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
372 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
373
374 /* Use Force Non-Coherent whenever executing a 3D context. This is a
375 * workaround for a possible hang in the unlikely event a TLB
376 * invalidation occurs during a PSD flush.
377 */
378 /* WaForceEnableNonCoherent:bdw,chv */
379 /* WaHdcDisableFetchWhenMasked:bdw,chv */
380 wa_masked_en(wal, HDC_CHICKEN0,
381 HDC_DONOT_FETCH_MEM_WHEN_MASKED |
382 HDC_FORCE_NON_COHERENT);
383
384 /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
385 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
386 * polygons in the same 8x4 pixel/sample area to be processed without
387 * stalling waiting for the earlier ones to write to Hierarchical Z
388 * buffer."
389 *
390 * This optimization is off by default for BDW and CHV; turn it on.
391 */
392 wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
393
394 /* Wa4x4STCOptimizationDisable:bdw,chv */
395 wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
396
397 /*
398 * BSpec recommends 8x4 when MSAA is used,
399 * however in practice 16x4 seems fastest.
400 *
401 * Note that PS/WM thread counts depend on the WIZ hashing
402 * disable bit, which we don't touch here, but it's good
403 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
404 */
405 wa_masked_field_set(wal, GEN7_GT_MODE,
406 GEN6_WIZ_HASHING_MASK,
407 GEN6_WIZ_HASHING_16x4);
408}
409
410static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
411 struct i915_wa_list *wal)
412{
413 struct drm_i915_private *i915 = engine->i915;
414
415 gen8_ctx_workarounds_init(engine, wal);
416
417 /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
418 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
419
420 /* WaDisableDopClockGating:bdw
421 *
422 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
423 * to disable EUTC clock gating.
424 */
425 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
426 DOP_CLOCK_GATING_DISABLE);
427
428 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
429 GEN8_SAMPLER_POWER_BYPASS_DIS);
430
431 wa_masked_en(wal, HDC_CHICKEN0,
432 /* WaForceContextSaveRestoreNonCoherent:bdw */
433 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
434 /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
435 (INTEL_INFO(i915)->gt == 3 ? HDC_FENCE_DEST_SLM_DISABLE : 0));
436}
437
438static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
439 struct i915_wa_list *wal)
440{
441 gen8_ctx_workarounds_init(engine, wal);
442
443 /* WaDisableThreadStallDopClockGating:chv */
444 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
445
446 /* Improve HiZ throughput on CHV. */
447 wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
448}
449
450static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
451 struct i915_wa_list *wal)
452{
453 struct drm_i915_private *i915 = engine->i915;
454
455 if (HAS_LLC(i915)) {
456 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
457 *
458 * Must match Display Engine. See
459 * WaCompressedResourceDisplayNewHashMode.
460 */
461 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
462 GEN9_PBE_COMPRESSED_HASH_SELECTION);
463 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
464 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
465 }
466
467 /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
468 /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
469 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
470 FLOW_CONTROL_ENABLE |
471 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
472
473 /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
474 /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
475 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
476 GEN9_ENABLE_YV12_BUGFIX |
477 GEN9_ENABLE_GPGPU_PREEMPTION);
478
479 /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
480 /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
481 wa_masked_en(wal, CACHE_MODE_1,
482 GEN8_4x4_STC_OPTIMIZATION_DISABLE |
483 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
484
485 /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
486 wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
487 GEN9_CCS_TLB_PREFETCH_ENABLE);
488
489 /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
490 wa_masked_en(wal, HDC_CHICKEN0,
491 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
492 HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
493
494 /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
495 * both tied to WaForceContextSaveRestoreNonCoherent
496 * in some hsds for skl. We keep the tie for all gen9. The
497 * documentation is a bit hazy and so we want to get common behaviour,
498 * even though there is no clear evidence we would need both on kbl/bxt.
	 * This area has been a source of system hangs so we play it safe
500 * and mimic the skl regardless of what bspec says.
501 *
502 * Use Force Non-Coherent whenever executing a 3D context. This
503 * is a workaround for a possible hang in the unlikely event
504 * a TLB invalidation occurs during a PSD flush.
505 */
506
507 /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
508 wa_masked_en(wal, HDC_CHICKEN0,
509 HDC_FORCE_NON_COHERENT);
510
511 /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
512 if (IS_SKYLAKE(i915) ||
513 IS_KABYLAKE(i915) ||
514 IS_COFFEELAKE(i915) ||
515 IS_COMETLAKE(i915))
516 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
517 GEN8_SAMPLER_POWER_BYPASS_DIS);
518
519 /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
520 wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
521
522 /*
523 * Supporting preemption with fine-granularity requires changes in the
524 * batch buffer programming. Since we can't break old userspace, we
525 * need to set our default preemption level to safe value. Userspace is
526 * still able to use more fine-grained preemption levels, since in
527 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
528 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
529 * not real HW workarounds, but merely a way to start using preemption
530 * while maintaining old contract with userspace.
531 */
532
533 /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
534 wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
535
	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
537 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
538 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
539 GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
540
541 /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
542 if (IS_GEN9_LP(i915))
543 wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
544}
545
546static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
547 struct i915_wa_list *wal)
548{
549 struct intel_gt *gt = engine->gt;
550 u8 vals[3] = { 0, 0, 0 };
551 unsigned int i;
552
553 for (i = 0; i < 3; i++) {
554 u8 ss;
555
556 /*
557 * Only consider slices where one, and only one, subslice has 7
558 * EUs
559 */
		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
561 continue;
562
563 /*
564 * subslice_7eu[i] != 0 (because of the check above) and
565 * ss_max == 4 (maximum number of subslices possible per slice)
566 *
567 * -> 0 <= ss <= 3;
568 */
569 ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
570 vals[i] = 3 - ss;
571 }
572
573 if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
574 return;
575
576 /* Tune IZ hashing. See intel_device_info_runtime_init() */
577 wa_masked_field_set(wal, GEN7_GT_MODE,
578 GEN9_IZ_HASHING_MASK(2) |
579 GEN9_IZ_HASHING_MASK(1) |
580 GEN9_IZ_HASHING_MASK(0),
581 GEN9_IZ_HASHING(2, vals[2]) |
582 GEN9_IZ_HASHING(1, vals[1]) |
583 GEN9_IZ_HASHING(0, vals[0]));
584}
585
586static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
587 struct i915_wa_list *wal)
588{
589 gen9_ctx_workarounds_init(engine, wal);
590 skl_tune_iz_hashing(engine, wal);
591}
592
593static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
594 struct i915_wa_list *wal)
595{
596 gen9_ctx_workarounds_init(engine, wal);
597
598 /* WaDisableThreadStallDopClockGating:bxt */
599 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
600 STALL_DOP_GATING_DISABLE);
601
602 /* WaToEnableHwFixForPushConstHWBug:bxt */
603 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
604 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
605}
606
607static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
608 struct i915_wa_list *wal)
609{
610 struct drm_i915_private *i915 = engine->i915;
611
612 gen9_ctx_workarounds_init(engine, wal);
613
614 /* WaToEnableHwFixForPushConstHWBug:kbl */
615 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
616 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
617 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
618
619 /* WaDisableSbeCacheDispatchPortSharing:kbl */
620 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
621 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
622}
623
624static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
625 struct i915_wa_list *wal)
626{
627 gen9_ctx_workarounds_init(engine, wal);
628
629 /* WaToEnableHwFixForPushConstHWBug:glk */
630 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
631 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
632}
633
634static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
635 struct i915_wa_list *wal)
636{
637 gen9_ctx_workarounds_init(engine, wal);
638
639 /* WaToEnableHwFixForPushConstHWBug:cfl */
640 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
641 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
642
643 /* WaDisableSbeCacheDispatchPortSharing:cfl */
644 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
645 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
646}
647
648static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
649 struct i915_wa_list *wal)
650{
651 struct drm_i915_private *i915 = engine->i915;
652
653 /* Wa_1406697149 (WaDisableBankHangMode:icl) */
654 wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
655
656 /* WaForceEnableNonCoherent:icl
657 * This is not the same workaround as in early Gen9 platforms, where
658 * lacking this could cause system hangs, but coherency performance
659 * overhead is high and only a few compute workloads really need it
660 * (the register is whitelisted in hardware now, so UMDs can opt in
661 * for coherency if they have a good reason).
662 */
663 wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
664
665 /* WaEnableFloatBlendOptimization:icl */
	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
		   0 /* write-only, so skip validation */,
		   true);
670
671 /* WaDisableGPGPUMidThreadPreemption:icl */
672 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
673 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
674 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
675
676 /* allow headerless messages for preemptible GPGPU context */
677 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
678 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
679
680 /* Wa_1604278689:icl,ehl */
	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
			 0,
			 0xFFFFFFFF);
685
686 /* Wa_1406306137:icl,ehl */
687 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
688
689 if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) {
690 /*
691 * Disable Repacking for Compression (masked R/W access)
692 * before rendering compressed surfaces for display.
693 */
694 wa_masked_en(wal, CACHE_MODE_0_GEN7,
695 DISABLE_REPACKING_FOR_COMPRESSION);
696 }
697}
698
699/*
700 * These settings aren't actually workarounds, but general tuning settings that
701 * need to be programmed on dg2 platform.
702 */
703static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
704 struct i915_wa_list *wal)
705{
706 wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
707 wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
708 REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
709 wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
710 FF_MODE2_TDS_TIMER_128);
711}
712
713static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
714 struct i915_wa_list *wal)
715{
716 struct drm_i915_private *i915 = engine->i915;
717
718 /*
719 * Wa_1409142259:tgl,dg1,adl-p,adl-n
720 * Wa_1409347922:tgl,dg1,adl-p
721 * Wa_1409252684:tgl,dg1,adl-p
722 * Wa_1409217633:tgl,dg1,adl-p
723 * Wa_1409207793:tgl,dg1,adl-p
724 * Wa_1409178076:tgl,dg1,adl-p,adl-n
725 * Wa_1408979724:tgl,dg1,adl-p,adl-n
726 * Wa_14010443199:tgl,rkl,dg1,adl-p,adl-n
727 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p,adl-n
728 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p,adl-n
729 * Wa_22010465259:tgl,rkl,dg1,adl-s,adl-p,adl-n
730 */
731 wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
732 GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
733
734 /* WaDisableGPGPUMidThreadPreemption:gen12 */
735 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
736 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
737 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
738
739 /*
740 * Wa_16011163337 - GS_TIMER
741 *
742 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
743 * need to program it even on those that don't explicitly list that
744 * workaround.
745 *
746 * Note that the programming of GEN12_FF_MODE2 is further modified
747 * according to the FF_MODE2 guidance given by Wa_1608008084.
748 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
749 * value when read from the CPU.
750 *
751 * The default value for this register is zero for all fields.
752 * So instead of doing a RMW we should just write the desired values
753 * for TDS and GS timers. Note that since the readback can't be trusted,
754 * the clear mask is just set to ~0 to make sure other bits are not
755 * inadvertently set. For the same reason read verification is ignored.
756 */
	wa_add(wal,
	       GEN12_FF_MODE2,
	       ~0,
	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
	       0, false);
762
763 if (!IS_DG1(i915)) {
764 /* Wa_1806527549 */
765 wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
766
767 /* Wa_1606376872 */
768 wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
769 }
770
771 /*
772 * This bit must be set to enable performance optimization for fast
773 * clears.
774 */
775 wa_mcr_write_or(wal, GEN8_WM_CHICKEN2, WAIT_ON_DEPTH_STALL_DONE_DISABLE);
776}
777
778static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
779 struct i915_wa_list *wal)
780{
781 gen12_ctx_workarounds_init(engine, wal);
782
783 /* Wa_1409044764 */
784 wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
785 DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
786
787 /* Wa_22010493298 */
788 wa_masked_en(wal, HIZ_CHICKEN,
789 DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
790}
791
792static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
793 struct i915_wa_list *wal)
794{
795 dg2_ctx_gt_tuning_init(engine, wal);
796
797 /* Wa_16013271637:dg2 */
798 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
799 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
800
801 /* Wa_14014947963:dg2 */
	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
803
804 /* Wa_18018764978:dg2 */
805 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
806
807 /* Wa_18019271663:dg2 */
808 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
809
810 /* Wa_14019877138:dg2 */
811 wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
812}
813
814static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
815 struct i915_wa_list *wal)
816{
817 struct intel_gt *gt = engine->gt;
818
819 dg2_ctx_gt_tuning_init(engine, wal);
820
821 /*
822 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
823 * gen12_emit_indirect_ctx_rcs() rather than here on some early
824 * steppings.
825 */
826 if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
827 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
829}
830
831static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
832 struct i915_wa_list *wal)
833{
834 struct intel_gt *gt = engine->gt;
835
836 xelpg_ctx_gt_tuning_init(engine, wal);
837
838 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
839 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
840 /* Wa_14014947963 */
841 wa_masked_field_set(wal, VF_PREEMPTION,
				    PREEMPTION_VERTEX_COUNT, 0x4000);
843
844 /* Wa_16013271637 */
845 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
846 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
847
848 /* Wa_18019627453 */
849 wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
850
851 /* Wa_18018764978 */
852 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
853 }
854
855 /* Wa_18019271663 */
856 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
857
858 /* Wa_14019877138 */
859 wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
860}
861
862static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
863 struct i915_wa_list *wal)
864{
865 /*
866 * This is a "fake" workaround defined by software to ensure we
867 * maintain reliable, backward-compatible behavior for userspace with
868 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
869 *
870 * The per-context setting of MI_MODE[12] determines whether the bits
871 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
872 * in the traditional manner or whether they should instead use a new
873 * tgl+ meaning that breaks backward compatibility, but allows nesting
874 * into 3rd-level batchbuffers. When this new capability was first
875 * added in TGL, it remained off by default unless a context
876 * intentionally opted in to the new behavior. However Xe_HPG now
877 * flips this on by default and requires that we explicitly opt out if
878 * we don't want the new behavior.
879 *
880 * From a SW perspective, we want to maintain the backward-compatible
881 * behavior for userspace, so we'll apply a fake workaround to set it
882 * back to the legacy behavior on platforms where the hardware default
883 * is to break compatibility. At the moment there is no Linux
	 * userspace that utilizes third-level batchbuffers, so this avoids
	 * userspace needing to make any changes; using the legacy
	 * meaning is the correct thing to do. If/when we have userspace
887 * consumers that want to utilize third-level batch nesting, we can
888 * provide a context parameter to allow them to opt-in.
889 */
890 wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
891}
892
893static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
894 struct i915_wa_list *wal)
895{
896 u8 mocs;
897
898 /*
	 * Some blitter commands do not have a field for MOCS; those
	 * commands will use the MOCS index pointed to by BLIT_CCTL.
	 * BLIT_CCTL registers need to be programmed to un-cached.
902 */
903 if (engine->class == COPY_ENGINE_CLASS) {
904 mocs = engine->gt->mocs.uc_index;
905 wa_write_clr_set(wal,
906 BLIT_CCTL(engine->mmio_base),
907 BLIT_CCTL_MASK,
908 BLIT_CCTL_MOCS(mocs, mocs));
909 }
910}
911
912/*
 * gen12_ctx_gt_fake_wa_init() doesn't program an official workaround defined
 * by the hardware team, but it does program general context registers. Adding
 * that context register programming to the context workaround list allows us
 * to use the wa framework for proper application and validation.
917 */
918static void
919gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
920 struct i915_wa_list *wal)
921{
922 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
923 fakewa_disable_nestedbb_mode(engine, wal);
924
925 gen12_ctx_gt_mocs_init(engine, wal);
926}
927
928static void
929__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
930 struct i915_wa_list *wal,
931 const char *name)
932{
933 struct drm_i915_private *i915 = engine->i915;
934
	wa_init_start(wal, engine->gt, name, engine->name);
936
937 /* Applies to all engines */
938 /*
	 * Fake workarounds are not actual workarounds, but programming
	 * of context registers using the workaround framework.
941 */
942 if (GRAPHICS_VER(i915) >= 12)
943 gen12_ctx_gt_fake_wa_init(engine, wal);
944
945 if (engine->class != RENDER_CLASS)
946 goto done;
947
948 if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
949 xelpg_ctx_workarounds_init(engine, wal);
950 else if (IS_DG2(i915))
951 dg2_ctx_workarounds_init(engine, wal);
952 else if (IS_DG1(i915))
953 dg1_ctx_workarounds_init(engine, wal);
954 else if (GRAPHICS_VER(i915) == 12)
955 gen12_ctx_workarounds_init(engine, wal);
956 else if (GRAPHICS_VER(i915) == 11)
957 icl_ctx_workarounds_init(engine, wal);
958 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
959 cfl_ctx_workarounds_init(engine, wal);
960 else if (IS_GEMINILAKE(i915))
961 glk_ctx_workarounds_init(engine, wal);
962 else if (IS_KABYLAKE(i915))
963 kbl_ctx_workarounds_init(engine, wal);
964 else if (IS_BROXTON(i915))
965 bxt_ctx_workarounds_init(engine, wal);
966 else if (IS_SKYLAKE(i915))
967 skl_ctx_workarounds_init(engine, wal);
968 else if (IS_CHERRYVIEW(i915))
969 chv_ctx_workarounds_init(engine, wal);
970 else if (IS_BROADWELL(i915))
971 bdw_ctx_workarounds_init(engine, wal);
972 else if (GRAPHICS_VER(i915) == 7)
973 gen7_ctx_workarounds_init(engine, wal);
974 else if (GRAPHICS_VER(i915) == 6)
975 gen6_ctx_workarounds_init(engine, wal);
976 else if (GRAPHICS_VER(i915) < 8)
977 ;
978 else
979 MISSING_CASE(GRAPHICS_VER(i915));
980
981done:
982 wa_init_finish(wal);
983}
984
985void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
986{
	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
988}
989
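/*
 * Emit the engine's context workaround list into the request as a single
 * MI_LOAD_REGISTER_IMM so the values end up in the context image being built.
 * Non-masked registers are read back first (under forcewake and the MCR lock)
 * so that only the relevant bits are modified.
 */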
990int intel_engine_emit_ctx_wa(struct i915_request *rq)
991{
992 struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
993 struct intel_uncore *uncore = rq->engine->uncore;
994 enum forcewake_domains fw;
995 unsigned long flags;
996 struct i915_wa *wa;
997 unsigned int i;
998 u32 *cs;
999 int ret;
1000
1001 if (wal->count == 0)
1002 return 0;
1003
1004 ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1005 if (ret)
1006 return ret;
1007
1008 if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
1009 IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS)
		cs = intel_ring_begin(rq, (wal->count * 2 + 6));
	else
		cs = intel_ring_begin(rq, (wal->count * 2 + 2));

	if (IS_ERR(cs))
		return PTR_ERR(cs);

	fw = wal_get_fw_for_rmw(uncore, wal);

	intel_gt_mcr_lock(wal->gt, &flags);
	spin_lock(&uncore->lock);
	intel_uncore_forcewake_get__locked(uncore, fw);
1022
1023 *cs++ = MI_LOAD_REGISTER_IMM(wal->count);
1024 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1025 u32 val;
1026
1027 /* Skip reading the register if it's not really needed */
1028 if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
1029 val = wa->set;
1030 } else {
1031 val = wa->is_mcr ?
				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1033 intel_uncore_read_fw(uncore, wa->reg);
1034 val &= ~wa->clr;
1035 val |= wa->set;
1036 }
1037
1038 *cs++ = i915_mmio_reg_offset(wa->reg);
1039 *cs++ = val;
1040 }
1041 *cs++ = MI_NOOP;
1042
1043 /* Wa_14019789679 */
1044 if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
1045 IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS) {
1046 *cs++ = CMD_3DSTATE_MESH_CONTROL;
1047 *cs++ = 0;
1048 *cs++ = 0;
1049 *cs++ = MI_NOOP;
1050 }
1051
	intel_uncore_forcewake_put__locked(uncore, fw);
	spin_unlock(&uncore->lock);
	intel_gt_mcr_unlock(wal->gt, flags);
1055
1056 intel_ring_advance(rq, cs);
1057
1058 ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1059 if (ret)
1060 return ret;
1061
1062 return 0;
1063}
1064
1065static void
1066gen4_gt_workarounds_init(struct intel_gt *gt,
1067 struct i915_wa_list *wal)
1068{
1069 /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1070 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1071}
1072
1073static void
1074g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1075{
1076 gen4_gt_workarounds_init(gt, wal);
1077
1078 /* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1079 wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1080}
1081
1082static void
1083ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1084{
1085 g4x_gt_workarounds_init(gt, wal);
1086
1087 wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1088}
1089
1090static void
1091snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1092{
1093}
1094
1095static void
1096ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1097{
1098 /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1099 wa_masked_dis(wal,
1100 GEN7_COMMON_SLICE_CHICKEN1,
1101 GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1102
1103 /* WaApplyL3ControlAndL3ChickenMode:ivb */
1104 wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1105 wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1106
1107 /* WaForceL3Serialization:ivb */
1108 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1109}
1110
1111static void
1112vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1113{
1114 /* WaForceL3Serialization:vlv */
1115 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1116
1117 /*
1118 * WaIncreaseL3CreditsForVLVB0:vlv
1119 * This is the hardware default actually.
1120 */
1121 wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1122}
1123
1124static void
1125hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1126{
1127 /* L3 caching of data atomics doesn't work -- disable it. */
1128 wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1129
	wa_add(wal,
	       HSW_ROW_CHICKEN3, 0,
	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
	       0 /* XXX does this reg exist? */, true);
1134
1135 /* WaVSRefCountFullforceMissDisable:hsw */
1136 wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1137}
1138
1139static void
1140gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1141{
1142 const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1143 unsigned int slice, subslice;
1144 u32 mcr, mcr_mask;
1145
1146 GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1147
1148 /*
1149 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1150 * Before any MMIO read into slice/subslice specific registers, MCR
1151 * packet control register needs to be programmed to point to any
1152 * enabled s/ss pair. Otherwise, incorrect values will be returned.
	 * This means each subsequent MMIO read will be forwarded to a
1154 * specific s/ss combination, but this is OK since these registers
1155 * are consistent across s/ss in almost all cases. In the rare
1156 * occasions, such as INSTDONE, where this value is dependent
1157 * on s/ss combo, the read should be done with read_subslice_reg.
1158 */
1159 slice = ffs(sseu->slice_mask) - 1;
1160 GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1161 subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1162 GEM_BUG_ON(!subslice);
1163 subslice--;
1164
1165 /*
1166 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1167 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1168 */
1169 mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1170 mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1171
1172 drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1173
	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1175}
1176
1177static void
1178gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1179{
1180 struct drm_i915_private *i915 = gt->i915;
1181
1182 /* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1183 gen9_wa_init_mcr(i915, wal);
1184
1185 /* WaDisableKillLogic:bxt,skl,kbl */
1186 if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1187 wa_write_or(wal,
1188 GAM_ECOCHK,
1189 ECOCHK_DIS_TLB);
1190
1191 if (HAS_LLC(i915)) {
1192 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1193 *
1194 * Must match Display Engine. See
1195 * WaCompressedResourceDisplayNewHashMode.
1196 */
1197 wa_write_or(wal,
1198 MMCD_MISC_CTRL,
1199 MMCD_PCLA | MMCD_HOTSPOT_EN);
1200 }
1201
1202 /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1203 wa_write_or(wal,
1204 GAM_ECOCHK,
1205 BDW_DISABLE_HDC_INVALIDATION);
1206}
1207
1208static void
1209skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1210{
1211 gen9_gt_workarounds_init(gt, wal);
1212
1213 /* WaDisableGafsUnitClkGating:skl */
1214 wa_write_or(wal,
1215 GEN7_UCGCTL4,
1216 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1217
1218 /* WaInPlaceDecompressionHang:skl */
1219 if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1220 wa_write_or(wal,
1221 GEN9_GAMT_ECO_REG_RW_IA,
1222 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1223}
1224
1225static void
1226kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1227{
1228 gen9_gt_workarounds_init(gt, wal);
1229
1230 /* WaDisableDynamicCreditSharing:kbl */
1231 if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1232 wa_write_or(wal,
1233 GAMT_CHKN_BIT_REG,
1234 GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1235
1236 /* WaDisableGafsUnitClkGating:kbl */
1237 wa_write_or(wal,
1238 GEN7_UCGCTL4,
1239 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1240
1241 /* WaInPlaceDecompressionHang:kbl */
1242 wa_write_or(wal,
1243 GEN9_GAMT_ECO_REG_RW_IA,
1244 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1245}
1246
1247static void
1248glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1249{
1250 gen9_gt_workarounds_init(gt, wal);
1251}
1252
1253static void
1254cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1255{
1256 gen9_gt_workarounds_init(gt, wal);
1257
1258 /* WaDisableGafsUnitClkGating:cfl */
1259 wa_write_or(wal,
1260 GEN7_UCGCTL4,
1261 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1262
1263 /* WaInPlaceDecompressionHang:cfl */
1264 wa_write_or(wal,
1265 GEN9_GAMT_ECO_REG_RW_IA,
1266 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1267}
1268
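/*
 * Record a write of the given group (slice) and instance (subslice) IDs into
 * an MCR steering register, so that unicast reads of steered registers are
 * serviced by a valid (non-fused-off) hardware instance.
 */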
1269static void __set_mcr_steering(struct i915_wa_list *wal,
1270 i915_reg_t steering_reg,
1271 unsigned int slice, unsigned int subslice)
1272{
1273 u32 mcr, mcr_mask;
1274
1275 mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1276 mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1277
	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1279}
1280
1281static void debug_dump_steering(struct intel_gt *gt)
1282{
	struct drm_printer p = drm_dbg_printer(&gt->i915->drm, DRM_UT_DRIVER,
					       "MCR Steering:");

	if (drm_debug_enabled(DRM_UT_DRIVER))
		intel_gt_mcr_report_steering(&p, gt, false);
1288}
1289
1290static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1291 unsigned int slice, unsigned int subslice)
1292{
1293 __set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1294
1295 gt->default_steering.groupid = slice;
1296 gt->default_steering.instanceid = subslice;
1297
1298 debug_dump_steering(gt);
1299}
1300
1301static void
1302icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1303{
1304 const struct sseu_dev_info *sseu = &gt->info.sseu;
1305 unsigned int subslice;
1306
1307 GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1308 GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1309
1310 /*
1311 * Although a platform may have subslices, we need to always steer
1312 * reads to the lowest instance that isn't fused off. When Render
1313 * Power Gating is enabled, grabbing forcewake will only power up a
1314 * single subslice (the "minconfig") if there isn't a real workload
1315 * that needs to be run; this means that if we steer register reads to
1316 * one of the higher subslices, we run the risk of reading back 0's or
1317 * random garbage.
1318 */
1319 subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1320
1321 /*
1322 * If the subslice we picked above also steers us to a valid L3 bank,
1323 * then we can just rely on the default steering and won't need to
1324 * worry about explicitly re-steering L3BANK reads later.
1325 */
1326 if (gt->info.l3bank_mask & BIT(subslice))
1327 gt->steering_table[L3BANK] = NULL;
1328
	__add_mcr_wa(gt, wal, 0, subslice);
1330}
1331
1332static void
1333xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1334{
1335 const struct sseu_dev_info *sseu = &gt->info.sseu;
1336 unsigned long slice, subslice = 0, slice_mask = 0;
1337 u32 lncf_mask = 0;
1338 int i;
1339
1340 /*
1341 * On Xe_HP the steering increases in complexity. There are now several
1342 * more units that require steering and we're not guaranteed to be able
1343 * to find a common setting for all of them. These are:
1344 * - GSLICE (fusable)
1345 * - DSS (sub-unit within gslice; fusable)
1346 * - L3 Bank (fusable)
1347 * - MSLICE (fusable)
1348 * - LNCF (sub-unit within mslice; always present if mslice is present)
1349 *
1350 * We'll do our default/implicit steering based on GSLICE (in the
1351 * sliceid field) and DSS (in the subsliceid field). If we can
1352 * find overlap between the valid MSLICE and/or LNCF values with
1353 * a suitable GSLICE, then we can just reuse the default value and
	 * skip any explicit steering at runtime.
1355 *
1356 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1357 * a valid sliceid value. DSS steering is the only type of steering
1358 * that utilizes the 'subsliceid' bits.
1359 *
1360 * Also note that, even though the steering domain is called "GSlice"
1361 * and it is encoded in the register using the gslice format, the spec
1362 * says that the combined (geometry | compute) fuse should be used to
1363 * select the steering.
1364 */
1365
1366 /* Find the potential gslice candidates */
	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
						       GEN_DSS_PER_GSLICE);
1369
1370 /*
1371 * Find the potential LNCF candidates. Either LNCF within a valid
1372 * mslice is fine.
1373 */
1374 for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1375 lncf_mask |= (0x3 << (i * 2));
1376
1377 /*
1378 * Are there any sliceid values that work for both GSLICE and LNCF
1379 * steering?
1380 */
1381 if (slice_mask & lncf_mask) {
1382 slice_mask &= lncf_mask;
1383 gt->steering_table[LNCF] = NULL;
1384 }
1385
1386 /* How about sliceid values that also work for MSLICE steering? */
1387 if (slice_mask & gt->info.mslice_mask) {
1388 slice_mask &= gt->info.mslice_mask;
1389 gt->steering_table[MSLICE] = NULL;
1390 }
1391
1392 slice = __ffs(slice_mask);
	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1394 GEN_DSS_PER_GSLICE;
1395
1396 __add_mcr_wa(gt, wal, slice, subslice);
1397
1398 /*
1399 * SQIDI ranges are special because they use different steering
1400 * registers than everything else we work with. On XeHP SDV and
1401 * DG2-G10, any value in the steering registers will work fine since
1402 * all instances are present, but DG2-G11 only has SQIDI instances at
1403 * ID's 2 and 3, so we need to steer to one of those. For simplicity
1404 * we'll just steer to a hardcoded "2" since that value will work
1405 * everywhere.
1406 */
	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1409
1410 /*
1411 * On DG2, GAM registers have a dedicated steering control register
1412 * and must always be programmed to a hardcoded groupid of "1."
1413 */
1414 if (IS_DG2(gt->i915))
		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1416}
1417
1418static void
1419icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1420{
1421 struct drm_i915_private *i915 = gt->i915;
1422
1423 icl_wa_init_mcr(gt, wal);
1424
1425 /* WaModifyGamTlbPartitioning:icl */
1426 wa_write_clr_set(wal,
1427 GEN11_GACB_PERF_CTRL,
1428 GEN11_HASH_CTRL_MASK,
1429 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1430
1431 /* Wa_1405766107:icl
1432 * Formerly known as WaCL2SFHalfMaxAlloc
1433 */
1434 wa_write_or(wal,
1435 GEN11_LSN_UNSLCVC,
1436 GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1437 GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1438
1439 /* Wa_220166154:icl
1440 * Formerly known as WaDisCtxReload
1441 */
1442 wa_write_or(wal,
1443 GEN8_GAMW_ECO_DEV_RW_IA,
1444 GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1445
1446 /* Wa_1406463099:icl
1447 * Formerly known as WaGamTlbPendError
1448 */
1449 wa_write_or(wal,
1450 GAMT_CHKN_BIT_REG,
1451 GAMT_CHKN_DISABLE_L3_COH_PIPE);
1452
1453 /*
1454 * Wa_1408615072:icl,ehl (vsunit)
1455 * Wa_1407596294:icl,ehl (hsunit)
1456 */
1457 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1458 VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1459
1460 /* Wa_1407352427:icl,ehl */
1461 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1462 PSDUNIT_CLKGATE_DIS);
1463
1464 /* Wa_1406680159:icl,ehl */
1465 wa_mcr_write_or(wal,
1466 GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1467 GWUNIT_CLKGATE_DIS);
1468
1469 /* Wa_1607087056:icl,ehl,jsl */
1470 if (IS_ICELAKE(i915) ||
1471 ((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1472 IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1473 wa_write_or(wal,
1474 GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1475 L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1476
1477 /*
1478 * This is not a documented workaround, but rather an optimization
1479 * to reduce sampler power.
1480 */
1481 wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1482}
1483
1484/*
1485 * Though there are per-engine instances of these registers,
1486 * they retain their value through engine resets and should
1487 * only be provided on the GT workaround list rather than
1488 * the engine-specific workaround list.
1489 */
1490static void
1491wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1492{
1493 struct intel_engine_cs *engine;
1494 int id;
1495
1496 for_each_engine(engine, gt, id) {
1497 if (engine->class != VIDEO_DECODE_CLASS ||
1498 (engine->instance % 2))
1499 continue;
1500
1501 wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1502 IECPUNIT_CLKGATE_DIS);
1503 }
1504}
1505
1506static void
1507gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1508{
1509 icl_wa_init_mcr(gt, wal);
1510
1511 /* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1512 wa_14011060649(gt, wal);
1513
1514 /* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1515 wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1516
1517 /*
1518 * Wa_14015795083
1519 *
1520 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1521 * preventing i915 from modifying it for this workaround. Skip the
1522 * readback verification for this workaround on debug builds; if the
1523 * workaround doesn't stick due to firmware behavior, it's not an error
1524 * that we want CI to flag.
1525 */
	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
	       0, 0, false);
1528}
1529
1530static void
1531dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1532{
1533 gen12_gt_workarounds_init(gt, wal);
1534
1535 /* Wa_1409420604:dg1 */
1536 wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1537 CPSSUNIT_CLKGATE_DIS);
1538
1539 /* Wa_1408615072:dg1 */
1540 /* Empirical testing shows this register is unaffected by engine reset. */
1541 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1542}
1543
1544static void
1545dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1546{
1547 xehp_init_mcr(gt, wal);
1548
1549 /* Wa_14011060649:dg2 */
1550 wa_14011060649(gt, wal);
1551
1552 if (IS_DG2_G10(gt->i915)) {
1553 /* Wa_22010523718:dg2 */
1554 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1555 CG3DDISCFEG_CLKGATE_DIS);
1556
1557 /* Wa_14011006942:dg2 */
1558 wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1559 DSS_ROUTER_CLKGATE_DIS);
1560 }
1561
1562 /* Wa_14014830051:dg2 */
1563 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1564
1565 /*
1566 * Wa_14015795083
1567 * Skip verification for possibly locked register.
1568 */
	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
	       0, 0, false);
1571
1572 /* Wa_18018781329 */
1573 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1574 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1575 wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1576 wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1577
1578 /* Wa_1509235366:dg2 */
1579 wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1580 INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1581
1582 /* Wa_14010648519:dg2 */
1583 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1584}
1585
1586static void
1587xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1588{
1589 /* Wa_14018575942 / Wa_18018781329 */
1590 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1591 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1592
1593 /* Wa_22016670082 */
1594 wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1595
1596 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1597 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1598 /* Wa_14014830051 */
1599 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1600
1601 /* Wa_14015795083 */
1602 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1603 }
1604
1605 /*
	 * Unlike older platforms, we no longer set up implicit steering here;
1607 * all MCR accesses are explicitly steered.
1608 */
1609 debug_dump_steering(gt);
1610}
1611
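/*
 * Wa_16021867713: disable MFX pipe clock gating (MFXPIPE_CLKGATE_DIS) on
 * every video decode engine.
 */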
1612static void
1613wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal)
1614{
1615 struct intel_engine_cs *engine;
1616 int id;
1617
1618 for_each_engine(engine, gt, id)
1619 if (engine->class == VIDEO_DECODE_CLASS)
1620 wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base),
1621 MFXPIPE_CLKGATE_DIS);
1622}
1623
1624static void
1625xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1626{
1627 wa_16021867713(gt, wal);
1628
1629 /*
1630 * Wa_14018778641
1631 * Wa_18018781329
1632 *
1633 * Note that although these registers are MCR on the primary
1634 * GT, the media GT's versions are regular singleton registers.
1635 */
1636 wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1637
1638 /*
1639 * Wa_14018575942
1640 *
	 * The issue is seen on media KPI tests running on the VDBOX engine,
	 * especially VP9 encoding workloads.
1643 */
1644 wa_write_or(wal, XELPMP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1645
1646 /* Wa_22016670082 */
1647 wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1648
1649 debug_dump_steering(gt);
1650}
1651
1652/*
1653 * The bspec performance guide has recommended MMIO tuning settings. These
1654 * aren't truly "workarounds" but we want to program them through the
1655 * workaround infrastructure to make sure they're (re)applied at the proper
1656 * times.
1657 *
1658 * The programming in this function is for settings that persist through
1659 * engine resets and also are not part of any engine's register state context.
1660 * I.e., settings that only need to be re-applied in the event of a full GT
1661 * reset.
1662 */
1663static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1664{
1665 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1666 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1667 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1668 }
1669
1670 if (IS_DG2(gt->i915)) {
1671 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1672 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1673 }
1674}
1675
1676static void
1677gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1678{
1679 struct drm_i915_private *i915 = gt->i915;
1680
1681 gt_tuning_settings(gt, wal);
1682
1683 if (gt->type == GT_MEDIA) {
1684 if (MEDIA_VER_FULL(i915) == IP_VER(13, 0))
1685 xelpmp_gt_workarounds_init(gt, wal);
1686 else
1687 MISSING_CASE(MEDIA_VER_FULL(i915));
1688
1689 return;
1690 }
1691
1692 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1693 xelpg_gt_workarounds_init(gt, wal);
1694 else if (IS_DG2(i915))
1695 dg2_gt_workarounds_init(gt, wal);
1696 else if (IS_DG1(i915))
1697 dg1_gt_workarounds_init(gt, wal);
1698 else if (GRAPHICS_VER(i915) == 12)
1699 gen12_gt_workarounds_init(gt, wal);
1700 else if (GRAPHICS_VER(i915) == 11)
1701 icl_gt_workarounds_init(gt, wal);
1702 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1703 cfl_gt_workarounds_init(gt, wal);
1704 else if (IS_GEMINILAKE(i915))
1705 glk_gt_workarounds_init(gt, wal);
1706 else if (IS_KABYLAKE(i915))
1707 kbl_gt_workarounds_init(gt, wal);
1708 else if (IS_BROXTON(i915))
1709 gen9_gt_workarounds_init(gt, wal);
1710 else if (IS_SKYLAKE(i915))
1711 skl_gt_workarounds_init(gt, wal);
1712 else if (IS_HASWELL(i915))
1713 hsw_gt_workarounds_init(gt, wal);
1714 else if (IS_VALLEYVIEW(i915))
1715 vlv_gt_workarounds_init(gt, wal);
1716 else if (IS_IVYBRIDGE(i915))
1717 ivb_gt_workarounds_init(gt, wal);
1718 else if (GRAPHICS_VER(i915) == 6)
1719 snb_gt_workarounds_init(gt, wal);
1720 else if (GRAPHICS_VER(i915) == 5)
1721 ilk_gt_workarounds_init(gt, wal);
1722 else if (IS_G4X(i915))
1723 g4x_gt_workarounds_init(gt, wal);
1724 else if (GRAPHICS_VER(i915) == 4)
1725 gen4_gt_workarounds_init(gt, wal);
1726 else if (GRAPHICS_VER(i915) <= 8)
1727 ;
1728 else
1729 MISSING_CASE(GRAPHICS_VER(i915));
1730}
1731
1732void intel_gt_init_workarounds(struct intel_gt *gt)
1733{
1734 struct i915_wa_list *wal = &gt->wa_list;
1735
1736	wa_init_start(wal, gt, "GT", "global");
1737 gt_init_workarounds(gt, wal);
1738 wa_init_finish(wal);
1739}
1740
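/*
 * A workaround is reported as lost when any bit covered by wa->read
 * differs between the current value and wa->set. For example, with
 * wa->set = BIT(5) and wa->read = BIT(5), a readback of 0x0 or
 * 0xffffffdf (bit 5 clear) fails while 0x20 passes; bits outside
 * wa->read are ignored entirely.
 */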
1741static bool
1742wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1743 const char *name, const char *from)
1744{
1745 if ((cur ^ wa->set) & wa->read) {
1746 gt_err(gt,
1747 "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1748 name, from, i915_mmio_reg_offset(wa->reg),
1749 cur, cur & wa->read, wa->set & wa->read);
1750
1751 return false;
1752 }
1753
1754 return true;
1755}
1756
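/*
 * Apply the whole list inside a single forcewake/MCR-lock critical
 * section: wal_get_fw_for_rmw() collects the forcewake domains needed by
 * every register up front, and the read-modify-write below is open coded
 * because MCR registers have to be read from one steered instance but
 * written via multicast.
 */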
1757static void wa_list_apply(const struct i915_wa_list *wal)
1758{
1759 struct intel_gt *gt = wal->gt;
1760 struct intel_uncore *uncore = gt->uncore;
1761 enum forcewake_domains fw;
1762 unsigned long flags;
1763 struct i915_wa *wa;
1764 unsigned int i;
1765
1766 if (!wal->count)
1767 return;
1768
1769 fw = wal_get_fw_for_rmw(uncore, wal);
1770
1771	intel_gt_mcr_lock(gt, &flags);
1772	spin_lock(&uncore->lock);
1773	intel_uncore_forcewake_get__locked(uncore, fw);
1774
1775 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1776 u32 val, old = 0;
1777
1778 /* open-coded rmw due to steering */
1779 if (wa->clr)
1780 old = wa->is_mcr ?
1781	intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1782 intel_uncore_read_fw(uncore, wa->reg);
1783 val = (old & ~wa->clr) | wa->set;
1784 if (val != old || !wa->clr) {
1785 if (wa->is_mcr)
1786	intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1787 else
1788 intel_uncore_write_fw(uncore, wa->reg, val);
1789 }
1790
1791 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1792 u32 val = wa->is_mcr ?
1793	intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1794 intel_uncore_read_fw(uncore, wa->reg);
1795
1796	wa_verify(gt, wa, val, wal->name, "application");
1797 }
1798 }
1799
1800	intel_uncore_forcewake_put__locked(uncore, fw);
1801	spin_unlock(&uncore->lock);
1802 intel_gt_mcr_unlock(gt, flags);
1803}
1804
1805void intel_gt_apply_workarounds(struct intel_gt *gt)
1806{
1807	wa_list_apply(&gt->wa_list);
1808}
1809
1810static bool wa_list_verify(struct intel_gt *gt,
1811 const struct i915_wa_list *wal,
1812 const char *from)
1813{
1814 struct intel_uncore *uncore = gt->uncore;
1815 struct i915_wa *wa;
1816 enum forcewake_domains fw;
1817 unsigned long flags;
1818 unsigned int i;
1819 bool ok = true;
1820
1821 fw = wal_get_fw_for_rmw(uncore, wal);
1822
1823	intel_gt_mcr_lock(gt, &flags);
1824	spin_lock(&uncore->lock);
1825	intel_uncore_forcewake_get__locked(uncore, fw);
1826
1827 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1828	ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1829	intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1830	intel_uncore_read_fw(uncore, wa->reg),
1831	wal->name, from);
1832
1833	intel_uncore_forcewake_put__locked(uncore, fw);
1834	spin_unlock(&uncore->lock);
1835 intel_gt_mcr_unlock(gt, flags);
1836
1837 return ok;
1838}
1839
1840bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1841{
1842	return wa_list_verify(gt, &gt->wa_list, from);
1843}
1844
1845__maybe_unused
1846static bool is_nonpriv_flags_valid(u32 flags)
1847{
1848 /* Check only valid flag bits are set */
1849 if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1850 return false;
1851
1852 /* NB: Only 3 out of 4 enum values are valid for access field */
1853 if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1854 RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1855 return false;
1856
1857 return true;
1858}
1859
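/*
 * The RING_FORCE_TO_NONPRIV slots hold the target register offset and the
 * access/range controls in a single dword: MMIO offsets are dword aligned,
 * so the bottom bits carry the RING_FORCE_TO_NONPRIV_RANGE_* encoding and
 * the top bits the _ACCESS_* mode. That is why the flags below are simply
 * OR'ed into the stored register offset before the entry is added.
 */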
1860static void
1861whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1862{
1863 struct i915_wa wa = {
1864 .reg = reg
1865 };
1866
1867 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1868 return;
1869
1870 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1871 return;
1872
1873 wa.reg.reg |= flags;
1874	_wa_add(wal, &wa);
1875}
1876
1877static void
1878whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1879{
1880 struct i915_wa wa = {
1881 .mcr_reg = reg,
1882 .is_mcr = 1,
1883 };
1884
1885 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1886 return;
1887
1888 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1889 return;
1890
1891 wa.mcr_reg.reg |= flags;
1892	_wa_add(wal, &wa);
1893}
1894
1895static void
1896whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1897{
1898 whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1899}
1900
1901static void
1902whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1903{
1904 whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1905}
1906
1907static void gen9_whitelist_build(struct i915_wa_list *w)
1908{
1909 /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1910	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1911
1912 /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1913	whitelist_reg(w, GEN8_CS_CHICKEN1);
1914
1915 /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1916	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1917
1918 /* WaSendPushConstantsFromMMIO:skl,bxt */
1919	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1920}
1921
1922static void skl_whitelist_build(struct intel_engine_cs *engine)
1923{
1924 struct i915_wa_list *w = &engine->whitelist;
1925
1926 if (engine->class != RENDER_CLASS)
1927 return;
1928
1929 gen9_whitelist_build(w);
1930
1931 /* WaDisableLSQCROPERFforOCL:skl */
1932	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1933}
1934
1935static void bxt_whitelist_build(struct intel_engine_cs *engine)
1936{
1937 if (engine->class != RENDER_CLASS)
1938 return;
1939
1940	gen9_whitelist_build(&engine->whitelist);
1941}
1942
1943static void kbl_whitelist_build(struct intel_engine_cs *engine)
1944{
1945 struct i915_wa_list *w = &engine->whitelist;
1946
1947 if (engine->class != RENDER_CLASS)
1948 return;
1949
1950 gen9_whitelist_build(w);
1951
1952 /* WaDisableLSQCROPERFforOCL:kbl */
1953	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1954}
1955
1956static void glk_whitelist_build(struct intel_engine_cs *engine)
1957{
1958 struct i915_wa_list *w = &engine->whitelist;
1959
1960 if (engine->class != RENDER_CLASS)
1961 return;
1962
1963 gen9_whitelist_build(w);
1964
1965 /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1966	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1967}
1968
1969static void cfl_whitelist_build(struct intel_engine_cs *engine)
1970{
1971 struct i915_wa_list *w = &engine->whitelist;
1972
1973 if (engine->class != RENDER_CLASS)
1974 return;
1975
1976 gen9_whitelist_build(w);
1977
1978 /*
1979 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1980 *
1981	 * This covers 4 registers which are next to one another:
1982 * - PS_INVOCATION_COUNT
1983 * - PS_INVOCATION_COUNT_UDW
1984 * - PS_DEPTH_COUNT
1985 * - PS_DEPTH_COUNT_UDW
1986 */
1987	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1988 RING_FORCE_TO_NONPRIV_ACCESS_RD |
1989 RING_FORCE_TO_NONPRIV_RANGE_4);
1990}
1991
1992static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1993{
1994 struct i915_wa_list *w = &engine->whitelist;
1995
1996 if (engine->class != RENDER_CLASS)
1997	whitelist_reg_ext(w,
1998 RING_CTX_TIMESTAMP(engine->mmio_base),
1999 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2000}
2001
2002static void cml_whitelist_build(struct intel_engine_cs *engine)
2003{
2004 allow_read_ctx_timestamp(engine);
2005
2006 cfl_whitelist_build(engine);
2007}
2008
2009static void icl_whitelist_build(struct intel_engine_cs *engine)
2010{
2011 struct i915_wa_list *w = &engine->whitelist;
2012
2013 allow_read_ctx_timestamp(engine);
2014
2015 switch (engine->class) {
2016 case RENDER_CLASS:
2017 /* WaAllowUMDToModifyHalfSliceChicken7:icl */
2018	whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
2019
2020 /* WaAllowUMDToModifySamplerMode:icl */
2021	whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
2022
2023 /* WaEnableStateCacheRedirectToCS:icl */
2024	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2025
2026 /*
2027 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
2028 *
2029	 * This covers 4 registers which are next to one another:
2030 * - PS_INVOCATION_COUNT
2031 * - PS_INVOCATION_COUNT_UDW
2032 * - PS_DEPTH_COUNT
2033 * - PS_DEPTH_COUNT_UDW
2034 */
2035	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2036 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2037 RING_FORCE_TO_NONPRIV_RANGE_4);
2038 break;
2039
2040 case VIDEO_DECODE_CLASS:
2041 /* hucStatusRegOffset */
2042	whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2043 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2044 /* hucUKernelHdrInfoRegOffset */
2045	whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2046 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2047 /* hucStatus2RegOffset */
2048	whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2049 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2050 break;
2051
2052 default:
2053 break;
2054 }
2055}
2056
2057static void tgl_whitelist_build(struct intel_engine_cs *engine)
2058{
2059 struct i915_wa_list *w = &engine->whitelist;
2060
2061 allow_read_ctx_timestamp(engine);
2062
2063 switch (engine->class) {
2064 case RENDER_CLASS:
2065 /*
2066 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2067 * Wa_1408556865:tgl
2068 *
2069	 * This covers 4 registers which are next to one another:
2070 * - PS_INVOCATION_COUNT
2071 * - PS_INVOCATION_COUNT_UDW
2072 * - PS_DEPTH_COUNT
2073 * - PS_DEPTH_COUNT_UDW
2074 */
2075	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2076 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2077 RING_FORCE_TO_NONPRIV_RANGE_4);
2078
2079 /*
2080 * Wa_1808121037:tgl
2081 * Wa_14012131227:dg1
2082 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2083 */
2084	whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2085
2086 /* Wa_1806527549:tgl */
2087	whitelist_reg(w, HIZ_CHICKEN);
2088
2089 /* Required by recommended tuning setting (not a workaround) */
2090	whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2091
2092 break;
2093 default:
2094 break;
2095 }
2096}
2097
2098static void dg2_whitelist_build(struct intel_engine_cs *engine)
2099{
2100 struct i915_wa_list *w = &engine->whitelist;
2101
2102 switch (engine->class) {
2103 case RENDER_CLASS:
2104 /* Required by recommended tuning setting (not a workaround) */
2105	whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2106	whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2107 break;
2108 default:
2109 break;
2110 }
2111}
2112
2113static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2114{
2115 struct i915_wa_list *w = &engine->whitelist;
2116
2117 switch (engine->class) {
2118 case RENDER_CLASS:
2119 /* Required by recommended tuning setting (not a workaround) */
2120	whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2121	whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2122 break;
2123 default:
2124 break;
2125 }
2126}
2127
2128void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2129{
2130 struct drm_i915_private *i915 = engine->i915;
2131 struct i915_wa_list *w = &engine->whitelist;
2132
2133	wa_init_start(w, engine->gt, "whitelist", engine->name);
2134
2135 if (engine->gt->type == GT_MEDIA)
2136 ; /* none yet */
2137 else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2138 xelpg_whitelist_build(engine);
2139 else if (IS_DG2(i915))
2140 dg2_whitelist_build(engine);
2141 else if (GRAPHICS_VER(i915) == 12)
2142 tgl_whitelist_build(engine);
2143 else if (GRAPHICS_VER(i915) == 11)
2144 icl_whitelist_build(engine);
2145 else if (IS_COMETLAKE(i915))
2146 cml_whitelist_build(engine);
2147 else if (IS_COFFEELAKE(i915))
2148 cfl_whitelist_build(engine);
2149 else if (IS_GEMINILAKE(i915))
2150 glk_whitelist_build(engine);
2151 else if (IS_KABYLAKE(i915))
2152 kbl_whitelist_build(engine);
2153 else if (IS_BROXTON(i915))
2154 bxt_whitelist_build(engine);
2155 else if (IS_SKYLAKE(i915))
2156 skl_whitelist_build(engine);
2157 else if (GRAPHICS_VER(i915) <= 8)
2158 ;
2159 else
2160 MISSING_CASE(GRAPHICS_VER(i915));
2161
2162	wa_init_finish(w);
2163}
2164
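/*
 * Program the engine's RING_FORCE_TO_NONPRIV slots from the whitelist.
 * Every slot past wal->count is pointed back at the engine's own
 * RING_NOPID register as a harmless default, so a stale entry from a
 * previous configuration can never leave a privileged register exposed
 * to userspace.
 */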
2165void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2166{
2167 const struct i915_wa_list *wal = &engine->whitelist;
2168 struct intel_uncore *uncore = engine->uncore;
2169 const u32 base = engine->mmio_base;
2170 struct i915_wa *wa;
2171 unsigned int i;
2172
2173 if (!wal->count)
2174 return;
2175
2176 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2177 intel_uncore_write(uncore,
2178 RING_FORCE_TO_NONPRIV(base, i),
2179 i915_mmio_reg_offset(wa->reg));
2180
2181 /* And clear the rest just in case of garbage */
2182 for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2183 intel_uncore_write(uncore,
2184 RING_FORCE_TO_NONPRIV(base, i),
2185 i915_mmio_reg_offset(RING_NOPID(base)));
2186}
2187
2188/*
2189	 * engine_fake_wa_init(), a placeholder to program registers that are
2190	 * not part of an official workaround defined by the hardware team.
2191	 * Adding the programming of those registers to the workaround list
2192	 * allows us to utilize the wa framework for proper application and
2193	 * verification.
2194 */
2195static void
2196engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2197{
2198 u8 mocs_w, mocs_r;
2199
2200 /*
2201 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2202 * by the command streamer when executing commands that don't have
2203 * a way to explicitly specify a MOCS setting. The default should
2204 * usually reference whichever MOCS entry corresponds to uncached
2205 * behavior, although use of a WB cached entry is recommended by the
2206 * spec in certain circumstances on specific platforms.
2207 */
2208 if (GRAPHICS_VER(engine->i915) >= 12) {
2209 mocs_r = engine->gt->mocs.uc_index;
2210 mocs_w = engine->gt->mocs.uc_index;
2211
2212 if (HAS_L3_CCS_READ(engine->i915) &&
2213 engine->class == COMPUTE_CLASS) {
2214 mocs_r = engine->gt->mocs.wb_index;
2215
2216 /*
2217 * Even on the few platforms where MOCS 0 is a
2218 * legitimate table entry, it's never the correct
2219 * setting to use here; we can assume the MOCS init
2220 * just forgot to initialize wb_index.
2221 */
2222 drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2223 }
2224
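		/*
		 * RING_CMD_CCTL is treated as a masked register here:
		 * wa_masked_field_set() records CMD_CCTL_MOCS_MASK in the
		 * upper half of the write, so only the default MOCS index
		 * fields are modified whenever this entry is (re)applied.
		 */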
2225 wa_masked_field_set(wal,
2226 RING_CMD_CCTL(engine->mmio_base),
2227 CMD_CCTL_MOCS_MASK,
2228 CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2229 }
2230}
2231
2232static void
2233rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2234{
2235 struct drm_i915_private *i915 = engine->i915;
2236 struct intel_gt *gt = engine->gt;
2237
2238 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2239 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2240 /* Wa_22014600077 */
2241 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2242 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2243 }
2244
2245 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2246 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2247 IS_DG2(i915)) {
2248 /* Wa_1509727124 */
2249 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2250 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2251 }
2252
2253 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2254 IS_DG2(i915)) {
2255 /* Wa_22012856258 */
2256 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2257 GEN12_DISABLE_READ_SUPPRESSION);
2258 }
2259
2260 if (IS_DG2(i915)) {
2261 /*
2262 * Wa_22010960976:dg2
2263 * Wa_14013347512:dg2
2264 */
2265 wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2266 LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2267 }
2268
2269 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
2270 IS_DG2(i915)) {
2271 /* Wa_14015150844 */
2272	wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
2273	_MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
2274	0, true);
2275 }
2276
2277 if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2278 IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2279 /*
2280 * Wa_1606700617:tgl,dg1,adl-p
2281 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2282 * Wa_14010826681:tgl,dg1,rkl,adl-p
2283 * Wa_18019627453:dg2
2284 */
2285 wa_masked_en(wal,
2286 GEN9_CS_DEBUG_MODE1,
2287 FF_DOP_CLOCK_GATE_DISABLE);
2288 }
2289
2290 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2291 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2292 /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2293 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2294
2295 /*
2296 * Wa_1407928979:tgl A*
2297 * Wa_18011464164:tgl[B0+],dg1[B0+]
2298 * Wa_22010931296:tgl[B0+],dg1[B0+]
2299 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2300 */
2301 wa_write_or(wal, GEN7_FF_THREAD_MODE,
2302 GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2303
2304 /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2305 wa_mcr_masked_en(wal,
2306 GEN10_SAMPLER_MODE,
2307 ENABLE_SMALLPL);
2308 }
2309
2310 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2311 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2312 /* Wa_1409804808 */
2313 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2314 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2315
2316 /* Wa_14010229206 */
2317 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2318 }
2319
2320 if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2321 /*
2322 * Wa_1607297627
2323 *
2324 * On TGL and RKL there are multiple entries for this WA in the
2325 * BSpec; some indicate this is an A0-only WA, others indicate
2326 * it applies to all steppings so we trust the "all steppings."
2327 */
2328 wa_masked_en(wal,
2329 RING_PSMI_CTL(RENDER_RING_BASE),
2330 GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2331 GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2332 }
2333
2334 if (GRAPHICS_VER(i915) == 11) {
2335	/* This is not a Wa. Enable for better image quality */
2336 wa_masked_en(wal,
2337 _3D_CHICKEN3,
2338 _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2339
2340 /*
2341 * Wa_1405543622:icl
2342 * Formerly known as WaGAPZPriorityScheme
2343 */
2344 wa_write_or(wal,
2345 GEN8_GARBCNTL,
2346 GEN11_ARBITRATION_PRIO_ORDER_MASK);
2347
2348 /*
2349 * Wa_1604223664:icl
2350 * Formerly known as WaL3BankAddressHashing
2351 */
2352 wa_write_clr_set(wal,
2353 GEN8_GARBCNTL,
2354 GEN11_HASH_CTRL_EXCL_MASK,
2355 GEN11_HASH_CTRL_EXCL_BIT0);
2356 wa_write_clr_set(wal,
2357 GEN11_GLBLINVL,
2358 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2359 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2360
2361 /*
2362 * Wa_1405733216:icl
2363 * Formerly known as WaDisableCleanEvicts
2364 */
2365 wa_mcr_write_or(wal,
2366 GEN8_L3SQCREG4,
2367 GEN11_LQSC_CLEAN_EVICT_DISABLE);
2368
2369 /* Wa_1606682166:icl */
2370 wa_write_or(wal,
2371 GEN7_SARCHKMD,
2372 GEN7_DISABLE_SAMPLER_PREFETCH);
2373
2374 /* Wa_1409178092:icl */
2375 wa_mcr_write_clr_set(wal,
2376 GEN11_SCRATCH2,
2377 GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2378	0);
2379
2380 /* WaEnable32PlaneMode:icl */
2381 wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2382 GEN11_ENABLE_32_PLANE_MODE);
2383
2384 /*
2385 * Wa_1408767742:icl[a2..forever],ehl[all]
2386 * Wa_1605460711:icl[a0..c0]
2387 */
2388 wa_write_or(wal,
2389 GEN7_FF_THREAD_MODE,
2390 GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2391
2392 /* Wa_22010271021 */
2393 wa_masked_en(wal,
2394 GEN9_CS_DEBUG_MODE1,
2395 FF_DOP_CLOCK_GATE_DISABLE);
2396 }
2397
2398 /*
2399 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2400 * beyond) allow the kernel-mode driver to choose between two different
2401 * options for controlling preemption granularity and behavior.
2402 *
2403 * Option 1 (hardware default):
2404 * Preemption settings are controlled in a global manner via
2405 * kernel-only register CS_DEBUG_MODE1 (0x20EC). Any granularity
2406 * and settings chosen by the kernel-mode driver will apply to all
2407 * userspace clients.
2408 *
2409 * Option 2:
2410 * Preemption settings are controlled on a per-context basis via
2411 * register CS_CHICKEN1 (0x2580). CS_CHICKEN1 is saved/restored on
2412 * context switch and is writable by userspace (e.g., via
2413 * MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2414 * which allows different userspace drivers/clients to select
2415 * different settings, or to change those settings on the fly in
2416 * response to runtime needs. This option was known by name
2417 * "FtrPerCtxtPreemptionGranularityControl" at one time, although
2418 * that name is somewhat misleading as other non-granularity
2419 * preemption settings are also impacted by this decision.
2420 *
2421 * On Linux, our policy has always been to let userspace drivers
2422 * control preemption granularity/settings (Option 2). This was
2423 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2424 * userspace developed before object-level preemption was enabled would
2425 * not behave well if i915 were to go with Option 1 and enable that
2426 * preemption in a global manner). On gen9 each context would have
2427 * object-level preemption disabled by default (see
2428 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2429 * userspace drivers could opt-in to object-level preemption as they
2430 * saw fit. For post-gen9 platforms, we continue to utilize Option 2;
2431 * even though it is no longer necessary for ABI compatibility when
2432 * enabling a new platform, it does ensure that userspace will be able
2433 * to implement any workarounds that show up requiring temporary
2434 * adjustments to preemption behavior at runtime.
2435 *
2436 * Notes/Workarounds:
2437 * - Wa_14015141709: On DG2 and early steppings of MTL,
2438 * CS_CHICKEN1[0] does not disable object-level preemption as
2439 * it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2440 * using Option 1). Effectively this means userspace is unable
2441 * to disable object-level preemption on these platforms/steppings
2442 * despite the setting here.
2443 *
2444 * - Wa_16013994831: May require that userspace program
2445 * CS_CHICKEN1[10] when certain runtime conditions are true.
2446 * Userspace requires Option 2 to be in effect for their update of
2447 * CS_CHICKEN1[10] to be effective.
2448 *
2449 * Other workarounds may appear in the future that will also require
2450 * Option 2 behavior to allow proper userspace implementation.
2451 */
2452 if (GRAPHICS_VER(i915) >= 9)
2453 wa_masked_en(wal,
2454 GEN7_FF_SLICE_CS_CHICKEN1,
2455 GEN9_FFSC_PERCTX_PREEMPT_CTRL);
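	/*
	 * Illustrative only (not something i915 emits): with the per-context
	 * option selected above, a userspace batch may adjust its own
	 * context's preemption behaviour with a sequence along the lines of
	 *
	 *   MI_LOAD_REGISTER_IMM(1)
	 *   <CS_CHICKEN1 offset (0x2580)>
	 *   <masked value, i.e. _MASKED_BIT_ENABLE(<chicken bit>)>
	 *
	 * relying on CS_CHICKEN1 being saved/restored with the context image.
	 */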
2456
2457 if (IS_SKYLAKE(i915) ||
2458 IS_KABYLAKE(i915) ||
2459 IS_COFFEELAKE(i915) ||
2460 IS_COMETLAKE(i915)) {
2461 /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2462 wa_write_or(wal,
2463 GEN8_GARBCNTL,
2464 GEN9_GAPS_TSV_CREDIT_DISABLE);
2465 }
2466
2467 if (IS_BROXTON(i915)) {
2468 /* WaDisablePooledEuLoadBalancingFix:bxt */
2469 wa_masked_en(wal,
2470 FF_SLICE_CS_CHICKEN2,
2471 GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2472 }
2473
2474 if (GRAPHICS_VER(i915) == 9) {
2475 /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2476 wa_masked_en(wal,
2477 GEN9_CSFE_CHICKEN1_RCS,
2478 GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2479
2480 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2481 wa_mcr_write_or(wal,
2482 BDW_SCRATCH1,
2483 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2484
2485 /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2486 if (IS_GEN9_LP(i915))
2487 wa_mcr_write_clr_set(wal,
2488 GEN8_L3SQCREG1,
2489 L3_PRIO_CREDITS_MASK,
2490 L3_GENERAL_PRIO_CREDITS(62) |
2491 L3_HIGH_PRIO_CREDITS(2));
2492
2493 /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2494 wa_mcr_write_or(wal,
2495 GEN8_L3SQCREG4,
2496 GEN8_LQSC_FLUSH_COHERENT_LINES);
2497
2498 /* Disable atomics in L3 to prevent unrecoverable hangs */
2499 wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2500	GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2501	wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2502	GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2503	wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2504	EVICTION_PERF_FIX_ENABLE, 0);
2505 }
2506
2507 if (IS_HASWELL(i915)) {
2508 /* WaSampleCChickenBitEnable:hsw */
2509 wa_masked_en(wal,
2510 HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2511
2512 wa_masked_dis(wal,
2513 CACHE_MODE_0_GEN7,
2514 /* enable HiZ Raw Stall Optimization */
2515 HIZ_RAW_STALL_OPT_DISABLE);
2516 }
2517
2518 if (IS_VALLEYVIEW(i915)) {
2519 /* WaDisableEarlyCull:vlv */
2520 wa_masked_en(wal,
2521 _3D_CHICKEN3,
2522 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2523
2524 /*
2525 * WaVSThreadDispatchOverride:ivb,vlv
2526 *
2527 * This actually overrides the dispatch
2528 * mode for all thread types.
2529 */
2530 wa_write_clr_set(wal,
2531 GEN7_FF_THREAD_MODE,
2532 GEN7_FF_SCHED_MASK,
2533 GEN7_FF_TS_SCHED_HW |
2534 GEN7_FF_VS_SCHED_HW |
2535 GEN7_FF_DS_SCHED_HW);
2536
2537 /* WaPsdDispatchEnable:vlv */
2538 /* WaDisablePSDDualDispatchEnable:vlv */
2539 wa_masked_en(wal,
2540 GEN7_HALF_SLICE_CHICKEN1,
2541 GEN7_MAX_PS_THREAD_DEP |
2542 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2543 }
2544
2545 if (IS_IVYBRIDGE(i915)) {
2546 /* WaDisableEarlyCull:ivb */
2547 wa_masked_en(wal,
2548 _3D_CHICKEN3,
2549 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2550
2551 if (0) { /* causes HiZ corruption on ivb:gt1 */
2552 /* enable HiZ Raw Stall Optimization */
2553 wa_masked_dis(wal,
2554 CACHE_MODE_0_GEN7,
2555 HIZ_RAW_STALL_OPT_DISABLE);
2556 }
2557
2558 /*
2559 * WaVSThreadDispatchOverride:ivb,vlv
2560 *
2561 * This actually overrides the dispatch
2562 * mode for all thread types.
2563 */
2564 wa_write_clr_set(wal,
2565 GEN7_FF_THREAD_MODE,
2566 GEN7_FF_SCHED_MASK,
2567 GEN7_FF_TS_SCHED_HW |
2568 GEN7_FF_VS_SCHED_HW |
2569 GEN7_FF_DS_SCHED_HW);
2570
2571 /* WaDisablePSDDualDispatchEnable:ivb */
2572 if (INTEL_INFO(i915)->gt == 1)
2573 wa_masked_en(wal,
2574 GEN7_HALF_SLICE_CHICKEN1,
2575 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2576 }
2577
2578 if (GRAPHICS_VER(i915) == 7) {
2579 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2580 wa_masked_en(wal,
2581 RING_MODE_GEN7(RENDER_RING_BASE),
2582 GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2583
2584 /*
2585 * BSpec recommends 8x4 when MSAA is used,
2586 * however in practice 16x4 seems fastest.
2587 *
2588 * Note that PS/WM thread counts depend on the WIZ hashing
2589 * disable bit, which we don't touch here, but it's good
2590 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2591 */
2592 wa_masked_field_set(wal,
2593 GEN7_GT_MODE,
2594 GEN6_WIZ_HASHING_MASK,
2595 GEN6_WIZ_HASHING_16x4);
2596 }
2597
2598 if (IS_GRAPHICS_VER(i915, 6, 7))
2599 /*
2600 * We need to disable the AsyncFlip performance optimisations in
2601 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2602 * already be programmed to '1' on all products.
2603 *
2604 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2605 */
2606 wa_masked_en(wal,
2607 RING_MI_MODE(RENDER_RING_BASE),
2608 ASYNC_FLIP_PERF_DISABLE);
2609
2610 if (GRAPHICS_VER(i915) == 6) {
2611 /*
2612 * Required for the hardware to program scanline values for
2613 * waiting
2614 * WaEnableFlushTlbInvalidationMode:snb
2615 */
2616 wa_masked_en(wal,
2617 GFX_MODE,
2618 GFX_TLB_INVALIDATE_EXPLICIT);
2619
2620 /* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2621 wa_masked_en(wal,
2622 _3D_CHICKEN,
2623 _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2624
2625 wa_masked_en(wal,
2626 _3D_CHICKEN3,
2627 /* WaStripsFansDisableFastClipPerformanceFix:snb */
2628 _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2629 /*
2630 * Bspec says:
2631 * "This bit must be set if 3DSTATE_CLIP clip mode is set
2632 * to normal and 3DSTATE_SF number of SF output attributes
2633 * is more than 16."
2634 */
2635 _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2636
2637 /*
2638 * BSpec recommends 8x4 when MSAA is used,
2639 * however in practice 16x4 seems fastest.
2640 *
2641 * Note that PS/WM thread counts depend on the WIZ hashing
2642 * disable bit, which we don't touch here, but it's good
2643 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2644 */
2645 wa_masked_field_set(wal,
2646 GEN6_GT_MODE,
2647 GEN6_WIZ_HASHING_MASK,
2648 GEN6_WIZ_HASHING_16x4);
2649
2650 /*
2651 * From the Sandybridge PRM, volume 1 part 3, page 24:
2652 * "If this bit is set, STCunit will have LRA as replacement
2653 * policy. [...] This bit must be reset. LRA replacement
2654 * policy is not supported."
2655 */
2656 wa_masked_dis(wal,
2657 CACHE_MODE_0,
2658 CM0_STC_EVICT_DISABLE_LRA_SNB);
2659 }
2660
2661 if (IS_GRAPHICS_VER(i915, 4, 6))
2662 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2663 wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2664	0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2665	/* XXX bit doesn't stick on Broadwater */
2666	IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2667
2668 if (GRAPHICS_VER(i915) == 4)
2669 /*
2670 * Disable CONSTANT_BUFFER before it is loaded from the context
2671	 * image. As soon as it is loaded, it is executed and the stored
2672 * address may no longer be valid, leading to a GPU hang.
2673 *
2674 * This imposes the requirement that userspace reload their
2675 * CONSTANT_BUFFER on every batch, fortunately a requirement
2676 * they are already accustomed to from before contexts were
2677 * enabled.
2678 */
2679 wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2680	0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2681	0 /* XXX bit doesn't stick on Broadwater */,
2682	true);
2683}
2684
2685static void
2686xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2687{
2688 struct drm_i915_private *i915 = engine->i915;
2689
2690 /* WaKBLVECSSemaphoreWaitPoll:kbl */
2691 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2692 wa_write(wal,
2693 RING_SEMA_WAIT_POLL(engine->mmio_base),
2694	1);
2695 }
2696 /* Wa_16018031267, Wa_16018063123 */
2697 if (NEEDS_FASTCOLOR_BLT_WABB(engine))
2698 wa_masked_field_set(wal, ECOSKPD(engine->mmio_base),
2699 XEHP_BLITTER_SCHEDULING_MODE_MASK,
2700 XEHP_BLITTER_ROUND_ROBIN_MODE);
2701}
2702
2703static void
2704ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2705{
2706 /* boilerplate for any CCS engine workaround */
2707}
2708
2709/*
2710 * The bspec performance guide has recommended MMIO tuning settings. These
2711 * aren't truly "workarounds" but we want to program them with the same
2712 * workaround infrastructure to ensure that they're automatically added to
2713 * the GuC save/restore lists, re-applied at the right times, and checked for
2714 * any conflicting programming requested by real workarounds.
2715 *
2716 * Programming settings should be added here only if their registers are not
2717 * part of an engine's register state context. If a register is part of a
2718 * context, then any tuning settings should be programmed in an appropriate
2719 * function invoked by __intel_engine_init_ctx_wa().
2720 */
2721static void
2722add_render_compute_tuning_settings(struct intel_gt *gt,
2723 struct i915_wa_list *wal)
2724{
2725 struct drm_i915_private *i915 = gt->i915;
2726
2727 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2728 wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2729
2730 /*
2731 * This tuning setting proves beneficial only on ATS-M designs; the
2732 * default "age based" setting is optimal on regular DG2 and other
2733 * platforms.
2734 */
2735 if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2736 wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2737 THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2738
2739 if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 55))
2740 wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2741}
2742
2743static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2744{
2745 struct intel_gt *gt = engine->gt;
2746 u32 mode;
2747
2748 if (!IS_DG2(gt->i915))
2749 return;
2750
2751 /*
2752 * Wa_14019159160: This workaround, along with others, leads to
2753 * significant challenges in utilizing load balancing among the
2754 * CCS slices. Consequently, an architectural decision has been
2755 * made to completely disable automatic CCS load balancing.
2756 */
2757 wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2758
2759 /*
2760 * After having disabled automatic load balancing we need to
2761 * assign all slices to a single CCS. We will call it CCS mode 1
2762 */
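	/*
	 * intel_gt_apply_ccs_mode() (see intel_gt_ccs_mode.c) returns the
	 * XEHP_CCS_MODE value that routes all compute slices to the first
	 * available CCS engine; recording it here keeps the fixed mapping
	 * across engine resets.
	 */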
2763 mode = intel_gt_apply_ccs_mode(gt);
2764	wa_masked_en(wal, XEHP_CCS_MODE, mode);
2765}
2766
2767/*
2768 * The workarounds in this function apply to shared registers in
2769 * the general render reset domain that aren't tied to a
2770 * specific engine. Since all render+compute engines get reset
2771 * together, and the contents of these registers are lost during
2772 * the shared render domain reset, we'll define such workarounds
2773 * here and then add them to just a single RCS or CCS engine's
2774	 * workaround list (whichever engine has the I915_ENGINE_FIRST_RENDER_COMPUTE flag).
2775 */
2776static void
2777general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2778{
2779 struct drm_i915_private *i915 = engine->i915;
2780 struct intel_gt *gt = engine->gt;
2781
2782 add_render_compute_tuning_settings(gt, wal);
2783
2784 if (GRAPHICS_VER(i915) >= 11) {
2785 /* This is not a Wa (although referred to as
2786 * WaSetInidrectStateOverride in places), this allows
2787 * applications that reference sampler states through
2788 * the BindlessSamplerStateBaseAddress to have their
2789 * border color relative to DynamicStateBaseAddress
2790 * rather than BindlessSamplerStateBaseAddress.
2791 *
2792 * Otherwise SAMPLER_STATE border colors have to be
2793 * copied in multiple heaps (DynamicStateBaseAddress &
2794 * BindlessSamplerStateBaseAddress)
2795 *
2796 * BSpec: 46052
2797 */
2798 wa_mcr_masked_en(wal,
2799 GEN10_SAMPLER_MODE,
2800 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2801 }
2802
2803 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2804 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2805 IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) {
2806 /* Wa_14017856879 */
2807 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2808
2809 /* Wa_14020495402 */
2810 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING);
2811 }
2812
2813 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2814 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2815 /*
2816 * Wa_14017066071
2817 * Wa_14017654203
2818 */
2819 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2820 MTL_DISABLE_SAMPLER_SC_OOO);
2821
2822 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2823 /* Wa_22015279794 */
2824 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2825 DISABLE_PREFETCH_INTO_IC);
2826
2827 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2828 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2829 IS_DG2(i915)) {
2830 /* Wa_22013037850 */
2831 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2832 DISABLE_128B_EVICTION_COMMAND_UDW);
2833
2834 /* Wa_18017747507 */
2835 wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2836 }
2837
2838 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2839 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2840 IS_DG2(i915)) {
2841 /* Wa_22014226127 */
2842 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2843 }
2844
2845 if (IS_DG2(i915)) {
2846 /* Wa_14015227452:dg2,pvc */
2847 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2848
2849 /*
2850 * Wa_16011620976:dg2_g11
2851 * Wa_22015475538:dg2
2852 */
2853 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2854
2855 /* Wa_18028616096 */
2856 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3);
2857 }
2858
2859 if (IS_DG2_G11(i915)) {
2860 /*
2861 * Wa_22012826095:dg2
2862 * Wa_22013059131:dg2
2863 */
2864 wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2865 MAXREQS_PER_BANK,
2866 REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2867
2868 /* Wa_22013059131:dg2 */
2869 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2870 FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2871
2872 /*
2873 * Wa_22012654132
2874 *
2875 * Note that register 0xE420 is write-only and cannot be read
2876 * back for verification on DG2 (due to Wa_14012342262), so
2877 * we need to explicitly skip the readback.
2878 */
2879	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2880	_MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2881	0 /* write-only, so skip validation */,
2882	true);
2883 }
2884}
2885
2886static void
2887engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2888{
2889 if (GRAPHICS_VER(engine->i915) < 4)
2890 return;
2891
2892 engine_fake_wa_init(engine, wal);
2893
2894 /*
2895	 * These are common workarounds that just need to be applied
2896 * to a single RCS/CCS engine's workaround list since
2897 * they're reset as part of the general render domain reset.
2898 */
2899 if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
2900 general_render_compute_wa_init(engine, wal);
2901 ccs_engine_wa_mode(engine, wal);
2902 }
2903
2904 if (engine->class == COMPUTE_CLASS)
2905 ccs_engine_wa_init(engine, wal);
2906 else if (engine->class == RENDER_CLASS)
2907 rcs_engine_wa_init(engine, wal);
2908 else
2909 xcs_engine_wa_init(engine, wal);
2910}
2911
2912void intel_engine_init_workarounds(struct intel_engine_cs *engine)
2913{
2914 struct i915_wa_list *wal = &engine->wa_list;
2915
2916	wa_init_start(wal, engine->gt, "engine", engine->name);
2917 engine_init_workarounds(engine, wal);
2918 wa_init_finish(wal);
2919}
2920
2921void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
2922{
2923	wa_list_apply(&engine->wa_list);
2924}
2925
2926static const struct i915_range mcr_ranges_gen8[] = {
2927 { .start = 0x5500, .end = 0x55ff },
2928 { .start = 0x7000, .end = 0x7fff },
2929 { .start = 0x9400, .end = 0x97ff },
2930 { .start = 0xb000, .end = 0xb3ff },
2931 { .start = 0xe000, .end = 0xe7ff },
2932 {},
2933};
2934
2935static const struct i915_range mcr_ranges_gen12[] = {
2936 { .start = 0x8150, .end = 0x815f },
2937 { .start = 0x9520, .end = 0x955f },
2938 { .start = 0xb100, .end = 0xb3ff },
2939 { .start = 0xde80, .end = 0xe8ff },
2940 { .start = 0x24a00, .end = 0x24a7f },
2941 {},
2942};
2943
2944static const struct i915_range mcr_ranges_xehp[] = {
2945 { .start = 0x4000, .end = 0x4aff },
2946 { .start = 0x5200, .end = 0x52ff },
2947 { .start = 0x5400, .end = 0x7fff },
2948 { .start = 0x8140, .end = 0x815f },
2949 { .start = 0x8c80, .end = 0x8dff },
2950 { .start = 0x94d0, .end = 0x955f },
2951 { .start = 0x9680, .end = 0x96ff },
2952 { .start = 0xb000, .end = 0xb3ff },
2953 { .start = 0xc800, .end = 0xcfff },
2954 { .start = 0xd800, .end = 0xd8ff },
2955 { .start = 0xdc00, .end = 0xffff },
2956 { .start = 0x17000, .end = 0x17fff },
2957 { .start = 0x24a00, .end = 0x24a7f },
2958 {},
2959};
2960
2961static bool mcr_range(struct drm_i915_private *i915, u32 offset)
2962{
2963 const struct i915_range *mcr_ranges;
2964 int i;
2965
2966 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55))
2967 mcr_ranges = mcr_ranges_xehp;
2968 else if (GRAPHICS_VER(i915) >= 12)
2969 mcr_ranges = mcr_ranges_gen12;
2970 else if (GRAPHICS_VER(i915) >= 8)
2971 mcr_ranges = mcr_ranges_gen8;
2972 else
2973 return false;
2974
2975 /*
2976 * Registers in these ranges are affected by the MCR selector
2977 * which only controls CPU initiated MMIO. Routing does not
2978 * work for CS access so we cannot verify them on this path.
2979 */
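	/*
	 * For example, GEN8_L3SQCREG4 (offset 0xb118) falls inside the
	 * 0xb000/0xb100 - 0xb3ff range in all of the tables above, so it is
	 * skipped by the CS-based verification even though it is still
	 * applied (and, with CONFIG_DRM_I915_DEBUG_GEM, checked) over MMIO
	 * by wa_list_apply().
	 */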
2980 for (i = 0; mcr_ranges[i].start; i++)
2981 if (offset >= mcr_ranges[i].start &&
2982 offset <= mcr_ranges[i].end)
2983 return true;
2984
2985 return false;
2986}
2987
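/*
 * Emit one MI_STORE_REGISTER_MEM per non-MCR workaround register so the
 * command streamer itself snapshots each value into the scratch buffer.
 * Every entry takes four dwords: the SRM opcode, the register offset and
 * the GGTT address of results[i]; on gen8+ the extra opcode length covers
 * the two-dword address, while on older platforms the trailing zero dword
 * is just an MI_NOOP. Registers skipped by mcr_range() leave their slot
 * in the buffer untouched.
 */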
2988static int
2989wa_list_srm(struct i915_request *rq,
2990 const struct i915_wa_list *wal,
2991 struct i915_vma *vma)
2992{
2993 struct drm_i915_private *i915 = rq->i915;
2994 unsigned int i, count = 0;
2995 const struct i915_wa *wa;
2996 u32 srm, *cs;
2997
2998 srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2999 if (GRAPHICS_VER(i915) >= 8)
3000 srm++;
3001
3002 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3003 if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3004 count++;
3005 }
3006
3007	cs = intel_ring_begin(rq, 4 * count);
3008	if (IS_ERR(cs))
3009	return PTR_ERR(cs);
3010
3011 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3012 u32 offset = i915_mmio_reg_offset(wa->reg);
3013
3014 if (mcr_range(i915, offset))
3015 continue;
3016
3017 *cs++ = srm;
3018 *cs++ = offset;
3019 *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3020 *cs++ = 0;
3021 }
3022 intel_ring_advance(rq, cs);
3023
3024 return 0;
3025}
3026
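/*
 * Unlike wa_list_verify(), which reads the registers back over MMIO,
 * this path submits a request on the given context that stores the
 * registers from the command streamer into a scratch buffer and compares
 * the results, i.e. it checks what the engine itself observes. This is
 * also why mcr_range() registers must be skipped here.
 */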
3027static int engine_wa_list_verify(struct intel_context *ce,
3028 const struct i915_wa_list * const wal,
3029 const char *from)
3030{
3031 const struct i915_wa *wa;
3032 struct i915_request *rq;
3033 struct i915_vma *vma;
3034 struct i915_gem_ww_ctx ww;
3035 unsigned int i;
3036 u32 *results;
3037 int err;
3038
3039 if (!wal->count)
3040 return 0;
3041
3042	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3043	wal->count * sizeof(u32));
3044	if (IS_ERR(vma))
3045	return PTR_ERR(vma);
3046
3047	intel_engine_pm_get(ce->engine);
3048	i915_gem_ww_ctx_init(&ww, false);
3049retry:
3050	err = i915_gem_object_lock(vma->obj, &ww);
3051	if (err == 0)
3052	err = intel_context_pin_ww(ce, &ww);
3053 if (err)
3054 goto err_pm;
3055
3056	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3057	i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3058 if (err)
3059 goto err_unpin;
3060
3061 rq = i915_request_create(ce);
3062	if (IS_ERR(rq)) {
3063	err = PTR_ERR(rq);
3064 goto err_vma;
3065 }
3066
3067 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3068 if (err == 0)
3069 err = wa_list_srm(rq, wal, vma);
3070
3071 i915_request_get(rq);
3072 if (err)
3073	i915_request_set_error_once(rq, err);
3074 i915_request_add(rq);
3075
3076 if (err)
3077 goto err_rq;
3078
3079	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3080 err = -ETIME;
3081 goto err_rq;
3082 }
3083
3084	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3085	if (IS_ERR(results)) {
3086	err = PTR_ERR(results);
3087 goto err_rq;
3088 }
3089
3090 err = 0;
3091 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3092	if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3093 continue;
3094
3095	if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3096 err = -ENXIO;
3097 }
3098
3099	i915_gem_object_unpin_map(vma->obj);
3100
3101err_rq:
3102 i915_request_put(rq);
3103err_vma:
3104 i915_vma_unpin(vma);
3105err_unpin:
3106 intel_context_unpin(ce);
3107err_pm:
3108 if (err == -EDEADLK) {
3109	err = i915_gem_ww_ctx_backoff(&ww);
3110 if (!err)
3111 goto retry;
3112 }
3113	i915_gem_ww_ctx_fini(&ww);
3114	intel_engine_pm_put(ce->engine);
3115 i915_vma_put(vma);
3116 return err;
3117}
3118
3119int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3120 const char *from)
3121{
3122	return engine_wa_list_verify(engine->kernel_context,
3123	&engine->wa_list,
3124 from);
3125}
3126
3127#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3128#include "selftest_workarounds.c"
3129#endif
3130