// SPDX-License-Identifier: MIT
/*
 * Copyright © 2008-2015 Intel Corporation
 */

#include <linux/highmem.h>

#include "display/intel_display.h"
#include "i915_drv.h"
#include "i915_reg.h"
#include "i915_scatterlist.h"
#include "i915_pvinfo.h"
#include "i915_vgpu.h"
#include "intel_gt_regs.h"
#include "intel_mchbar_regs.h"

/**
 * DOC: fence register handling
 *
 * Important to avoid confusion: "fences" in the i915 driver are not execution
 * fences used to track command completion but hardware detiler objects which
 * wrap a given range of the global GTT. Each platform has only a fairly limited
 * set of these objects.
 *
 * Fences are used to detile GTT memory mappings. They're also connected to the
 * hardware frontbuffer render tracking and hence interact with frontbuffer
 * compression. Furthermore, on older platforms fences are required for tiled
 * objects used by the display engine. They can also be used by the render
 * engine - they're required for blitter commands and are optional for render
 * commands. But on gen4+ both display (with the exception of fbc) and rendering
 * have their own tiling state bits and don't need fences.
 *
 * Also note that fences only support X and Y tiling and hence can't be used for
 * the fancier new tiling formats like W, Ys and Yf.
 *
 * Finally note that because fences are such a restricted resource they're
 * dynamically associated with objects. Furthermore, fence state is committed to
 * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
 * explicitly call i915_gem_object_get_fence() to synchronize fencing status
 * for CPU access. Also note that some code wants an unfenced view; for those
 * cases the fence can be removed forcefully with i915_gem_object_put_fence().
 *
 * Internally these functions will synchronize with userspace access by removing
 * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
 */
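
/*
 * Illustrative usage sketch (not part of the driver): a kernel-internal user
 * that wants detiled CPU access through the GGTT typically pins a fence for
 * the vma around the access. The surrounding vma pinning, runtime-pm handling
 * and error paths are elided, and example_fenced_access() is a made-up name:
 *
 *	int example_fenced_access(struct i915_vma *vma)
 *	{
 *		int err;
 *
 *		err = i915_vma_pin_fence(vma);
 *		if (err)
 *			return err;
 *
 *		if (vma->fence) {
 *			// CPU reads/writes through the GTT mmap of vma are
 *			// detiled by the fence register here.
 *		}
 *
 *		i915_vma_unpin_fence(vma);
 *		return 0;
 *	}
 */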

#define pipelined 0

static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.i915;
}

static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.gt->uncore;
}

static void i965_write_fence_reg(struct i915_fence_reg *fence)
{
	i915_reg_t fence_reg_lo, fence_reg_hi;
	int fence_pitch_shift;
	u64 val;

	if (GRAPHICS_VER(fence_to_i915(fence)) >= 6) {
		fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
		fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
		fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;

	} else {
		fence_reg_lo = FENCE_REG_965_LO(fence->id);
		fence_reg_hi = FENCE_REG_965_HI(fence->id);
		fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
	}

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		GEM_BUG_ON(!IS_ALIGNED(stride, 128));

		val = fence->start + fence->size - I965_FENCE_PAGE;
		val <<= 32;
		val |= fence->start;
		val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I965_FENCE_TILING_Y_SHIFT);
		val |= I965_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);

		/*
		 * To w/a incoherency with non-atomic 64-bit register updates,
		 * we split the 64-bit update into two 32-bit writes. In order
		 * for a partial fence not to be evaluated between writes, we
		 * precede the update with a write to turn off the fence
		 * register, and only enable the fence as the last step.
		 *
		 * For extra levels of paranoia, we make sure each step lands
		 * before applying the next step.
		 */
		intel_uncore_write_fw(uncore, fence_reg_lo, 0);
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);

		intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val));
		intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val));
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);
	}
}

static void i915_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;
		unsigned int tiling = fence->tiling;
		bool is_y_tiled = tiling == I915_TILING_Y;

		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence)))
			stride /= 128;
		else
			stride /= 512;
		GEM_BUG_ON(!is_power_of_2(stride));

		val = fence->start;
		if (is_y_tiled)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I915_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;

		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}
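
/*
 * Worked example for the pitch encoding above (illustrative stride values):
 * a gen3 X-tiled surface with a 2048-byte stride uses 512-byte tile rows, so
 * stride /= 512 gives 4 and ilog2(4) = 2 is written to the pitch field. A
 * Y-tiled surface on hardware with 128-byte Y tile rows and the same
 * 2048-byte stride would instead encode ilog2(2048 / 128) = ilog2(16) = 4.
 */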

static void i830_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		val = fence->start;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I830_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void fence_write(struct i915_fence_reg *fence)
{
	struct drm_i915_private *i915 = fence_to_i915(fence);

	/*
	 * Previous access through the fence register is marshalled by
	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
	 * and explicitly managed for internal users.
	 */

	if (GRAPHICS_VER(i915) == 2)
		i830_write_fence_reg(fence);
	else if (GRAPHICS_VER(i915) == 3)
		i915_write_fence_reg(fence);
	else
		i965_write_fence_reg(fence);

	/*
	 * Access through the fenced region afterwards is
	 * ordered by the posting reads whilst writing the registers.
	 */
}

static bool gpu_uses_fence_registers(struct i915_fence_reg *fence)
{
	return GRAPHICS_VER(fence_to_i915(fence)) < 4;
}

static int fence_update(struct i915_fence_reg *fence,
			struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = fence->ggtt;
	struct intel_uncore *uncore = fence_to_uncore(fence);
	intel_wakeref_t wakeref;
	struct i915_vma *old;
	int ret;

	fence->tiling = 0;
	if (vma) {
		GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) ||
			   !i915_gem_object_get_tiling(vma->obj));

		if (!i915_vma_is_map_and_fenceable(vma))
			return -EINVAL;

		if (gpu_uses_fence_registers(fence)) {
			/* implicit 'unfenced' GPU blits */
			ret = i915_vma_sync(vma);
			if (ret)
				return ret;
		}

		GEM_BUG_ON(vma->fence_size > i915_vma_size(vma));
		fence->start = i915_ggtt_offset(vma);
		fence->size = vma->fence_size;
		fence->stride = i915_gem_object_get_stride(vma->obj);
		fence->tiling = i915_gem_object_get_tiling(vma->obj);
	}
	WRITE_ONCE(fence->dirty, false);

	old = xchg(&fence->vma, NULL);
	if (old) {
		/* XXX Ideally we would move the waiting to outside the mutex */
		ret = i915_active_wait(&fence->active);
		if (ret) {
			fence->vma = old;
			return ret;
		}

		i915_vma_flush_writes(old);

		/*
		 * Ensure that all userspace CPU access is completed before
		 * stealing the fence.
		 */
		if (old != vma) {
			GEM_BUG_ON(old->fence != fence);
			i915_vma_revoke_mmap(old);
			old->fence = NULL;
		}

		list_move(&fence->link, &ggtt->fence_list);
	}

	/*
	 * We only need to update the register itself if the device is awake.
	 * If the device is currently powered down, we will defer the write
	 * to the runtime resume, see intel_ggtt_restore_fences().
	 *
	 * This only works for removing the fence register, on acquisition
	 * the caller must hold the rpm wakeref. The fence register must
	 * be cleared before we can use any other fences to ensure that
	 * the new fences do not overlap the elided clears, confusing HW.
	 */
	wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
	if (!wakeref) {
		GEM_BUG_ON(vma);
		return 0;
	}

	WRITE_ONCE(fence->vma, vma);
	fence_write(fence);

	if (vma) {
		vma->fence = fence;
		list_move_tail(&fence->link, &ggtt->fence_list);
	}

	intel_runtime_pm_put(uncore->rpm, wakeref);
	return 0;
}

/**
 * i915_vma_revoke_fence - force-remove fence for a VMA
 * @vma: vma to map linearly (not through a fence reg)
 *
 * This function force-removes any fence from the given object, which is useful
 * if the kernel wants to do untiled GTT access.
 */
void i915_vma_revoke_fence(struct i915_vma *vma)
{
	struct i915_fence_reg *fence = vma->fence;
	intel_wakeref_t wakeref;

	lockdep_assert_held(&vma->vm->mutex);
	if (!fence)
		return;

	GEM_BUG_ON(fence->vma != vma);
	i915_active_wait(&fence->active);
	GEM_BUG_ON(!i915_active_is_idle(&fence->active));
	GEM_BUG_ON(atomic_read(&fence->pin_count));

	fence->tiling = 0;
	WRITE_ONCE(fence->vma, NULL);
	vma->fence = NULL;

	/*
	 * Skip the write to HW if and only if the device is currently
	 * suspended.
	 *
	 * If the driver does not currently hold a wakeref (if_in_use == 0),
	 * the device may currently be runtime suspended, or it may be woken
	 * up before the suspend takes place. If the device is not suspended
	 * (powered down) and we skip clearing the fence register, the HW is
	 * left in an undefined state where we may end up with multiple
	 * registers overlapping.
	 */
	with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref)
		fence_write(fence);
}

static bool fence_is_active(const struct i915_fence_reg *fence)
{
	return fence->vma && i915_vma_is_active(fence->vma);
}

static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
{
	struct intel_display *display = ggtt->vm.i915->display;
	struct i915_fence_reg *active = NULL;
	struct i915_fence_reg *fence, *fn;

	list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) {
		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);

		if (fence == active) /* now seen this fence twice */
			active = ERR_PTR(-EAGAIN);

		/* Prefer idle fences so we do not have to wait on the GPU */
		if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) {
			if (!active)
				active = fence;

			list_move_tail(&fence->link, &ggtt->fence_list);
			continue;
		}

		if (atomic_read(&fence->pin_count))
			continue;

		return fence;
	}

	/* Wait for completion of pending flips which consume fences */
	if (intel_has_pending_fb_unpin(display))
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-ENOBUFS);
}

int __i915_vma_pin_fence(struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
	struct i915_fence_reg *fence;
	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
	int err;

	lockdep_assert_held(&vma->vm->mutex);

	/* Just update our place in the LRU if our fence is getting reused. */
	if (vma->fence) {
		fence = vma->fence;
		GEM_BUG_ON(fence->vma != vma);
		atomic_inc(&fence->pin_count);
		if (!fence->dirty) {
			list_move_tail(&fence->link, &ggtt->fence_list);
			return 0;
		}
	} else if (set) {
		fence = fence_find(ggtt);
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		GEM_BUG_ON(atomic_read(&fence->pin_count));
		atomic_inc(&fence->pin_count);
	} else {
		return 0;
	}

	err = fence_update(fence, set);
	if (err)
		goto out_unpin;

	GEM_BUG_ON(fence->vma != set);
	GEM_BUG_ON(vma->fence != (set ? fence : NULL));

	if (set)
		return 0;

out_unpin:
	atomic_dec(&fence->pin_count);
	return err;
}

/**
 * i915_vma_pin_fence - set up fencing for a vma
 * @vma: vma to map through a fence reg
 *
 * When mapping objects through the GTT, userspace wants to be able to write
 * to them without having to worry about swizzling if the object is tiled.
 * This function walks the fence regs looking for a free one for @vma,
 * stealing one if it can't find any.
 *
 * It then sets up the reg based on the object's properties: address, pitch
 * and tiling format.
 *
 * For an untiled surface, this removes any existing fence.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
int i915_vma_pin_fence(struct i915_vma *vma)
{
	int err;

	if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
		return 0;

	/*
	 * Note that we revoke fences on runtime suspend. Therefore the user
	 * must keep the device awake whilst using the fence.
	 */
	assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
	GEM_BUG_ON(!i915_vma_is_ggtt(vma));

	err = mutex_lock_interruptible(&vma->vm->mutex);
	if (err)
		return err;

	err = __i915_vma_pin_fence(vma);
	mutex_unlock(&vma->vm->mutex);

	return err;
}
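
/*
 * Illustrative sketch (not part of the driver): because fences are revoked on
 * runtime suspend, a caller is expected to hold a runtime-pm wakeref for as
 * long as it relies on the fence, matching the assert above.
 * example_pin_with_rpm() is a made-up name; the wakeref handling uses the
 * standard intel_runtime_pm helpers.
 *
 *	int example_pin_with_rpm(struct i915_vma *vma)
 *	{
 *		struct intel_runtime_pm *rpm = vma->vm->gt->uncore->rpm;
 *		intel_wakeref_t wakeref;
 *		int err = 0;
 *
 *		with_intel_runtime_pm(rpm, wakeref) {
 *			err = i915_vma_pin_fence(vma);
 *			if (!err) {
 *				// fenced GTT access must happen here, while
 *				// both the wakeref and the fence pin are held
 *				i915_vma_unpin_fence(vma);
 *			}
 *		}
 *
 *		return err;
 *	}
 */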

/**
 * i915_reserve_fence - Reserve a fence for vGPU
 * @ggtt: Global GTT
 *
 * This function walks the fence regs looking for a free one, removes it
 * from the fence_list and returns it. It is used to reserve a fence
 * register for vGPU use.
 */
struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt)
{
	struct i915_fence_reg *fence;
	int count;
	int ret;

	lockdep_assert_held(&ggtt->vm.mutex);

	/* Keep at least one fence available for the display engine. */
	count = 0;
	list_for_each_entry(fence, &ggtt->fence_list, link)
		count += !atomic_read(&fence->pin_count);
	if (count <= 1)
		return ERR_PTR(-ENOSPC);

	fence = fence_find(ggtt);
	if (IS_ERR(fence))
		return fence;

	if (fence->vma) {
		/* Force-remove fence from VMA */
		ret = fence_update(fence, NULL);
		if (ret)
			return ERR_PTR(ret);
	}

	list_del(&fence->link);

	return fence;
}

/**
 * i915_unreserve_fence - Reclaim a reserved fence
 * @fence: the fence reg
 *
 * This function adds a fence register previously reserved for vGPU back to
 * the fence_list.
 */
void i915_unreserve_fence(struct i915_fence_reg *fence)
{
	struct i915_ggtt *ggtt = fence->ggtt;

	lockdep_assert_held(&ggtt->vm.mutex);

	list_add(&fence->link, &ggtt->fence_list);
}
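
/*
 * Illustrative sketch (not part of the driver): reserving and later returning
 * a fence register for vGPU use. Both helpers assert that the GGTT vm mutex is
 * held, so the caller provides the locking shown here.
 * example_reserve_for_vgpu() is a made-up name.
 *
 *	struct i915_fence_reg *example_reserve_for_vgpu(struct i915_ggtt *ggtt)
 *	{
 *		struct i915_fence_reg *fence;
 *
 *		mutex_lock(&ggtt->vm.mutex);
 *		fence = i915_reserve_fence(ggtt);
 *		mutex_unlock(&ggtt->vm.mutex);
 *
 *		return fence;	// may be an ERR_PTR(), check with IS_ERR()
 *	}
 *
 * and symmetrically, once the reservation is no longer needed:
 *
 *	mutex_lock(&ggtt->vm.mutex);
 *	i915_unreserve_fence(fence);
 *	mutex_unlock(&ggtt->vm.mutex);
 */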

/**
 * intel_ggtt_restore_fences - restore fence state
 * @ggtt: Global GTT
 *
 * Restore the hw fence state to match the software tracking again, to be called
 * after a gpu reset and on resume. Note that on runtime suspend we only cancel
 * the fences, to be reacquired by the user later.
 */
void intel_ggtt_restore_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++)
		fence_write(&ggtt->fence_regs[i]);
}

/**
 * DOC: tiling swizzling details
 *
 * The idea behind tiling is to increase cache hit rates by rearranging
 * pixel data so that a group of pixel accesses are in the same cacheline.
 * Performance improvement from doing this on the back/depth buffer is on
 * the order of 30%.
 *
 * Intel architectures make this somewhat more complicated, though, by
 * adjustments made to addressing of data when the memory is in interleaved
 * mode (matched pairs of DIMMS) to improve memory bandwidth.
 * For interleaved memory, the CPU sends every sequential 64 bytes
 * to an alternate memory channel so it can get the bandwidth from both.
 *
 * The GPU also rearranges its accesses for increased bandwidth to interleaved
 * memory, and it matches what the CPU does for non-tiled.  However, when tiled
 * it does it a little differently, since one walks addresses not just in the
 * X direction but also Y.  So, along with alternating channels when bit
 * 6 of the address flips, it also alternates when other bits flip --  Bits 9
 * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
 * are common to both the 915 and 965-class hardware.
 *
 * The CPU also sometimes XORs in higher bits as well, to improve
 * bandwidth doing strided access like we do so frequently in graphics.  This
 * is called "Channel XOR Randomization" in the MCH documentation.  The result
 * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
 * decode.
 *
 * All of this bit 6 XORing has an effect on our memory management,
 * as we need to make sure that the 3d driver can correctly address object
 * contents.
 *
 * If we don't have interleaved memory, all tiling is safe and no swizzling is
 * required.
 *
 * When bit 17 is XORed in, we simply refuse to tile at all.  Bit
 * 17 is not just a page offset, so as we page an object out and back in,
 * individual pages in it will have different bit 17 addresses, resulting in
 * each 64 bytes being swapped with its neighbor!
 *
 * Otherwise, if interleaved, we have to tell the 3d driver what address
 * swizzling it needs to do, since it's writing with the CPU to the pages
 * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
 * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
 * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
 * to match what the GPU expects.
 */
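
/*
 * Illustrative sketch (not part of the driver): for the common
 * I915_BIT_6_SWIZZLE_9_10 case described above, bit 6 of the swizzled
 * address is bit6 ^ bit9 ^ bit10, i.e. bit 6 flips whenever bits 9 and 10
 * of the address XOR to 1. A software helper doing that translation would
 * look roughly like this (example_swizzle_bit_6() is a made-up name):
 *
 *	static unsigned long example_swizzle_bit_6(unsigned long addr)
 *	{
 *		// flip bit 6 whenever bit9 ^ bit10 of the address is set
 *		return addr ^ ((((addr >> 9) ^ (addr >> 10)) & 1) << 6);
 *	}
 *
 * The bit17-based variants cannot be fixed up this cheaply across swap-out,
 * which is why the code below either tracks bit 17 per page (see
 * i915_gem_object_save_bit_17_swizzle()) or reports the swizzling as unknown.
 */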

/**
 * detect_bit_6_swizzle - detect bit 6 swizzling pattern
 * @ggtt: Global GGTT
 *
 * Detects bit 6 swizzling of address lookup between IGD access and CPU
 * access through main memory.
 */
static void detect_bit_6_swizzle(struct i915_ggtt *ggtt)
{
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	struct drm_i915_private *i915 = ggtt->vm.i915;
	u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
	u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;

	if (GRAPHICS_VER(i915) >= 8 || IS_VALLEYVIEW(i915)) {
		/*
		 * On BDW+, swizzling is not used. We leave the CPU memory
		 * controller in charge of optimizing memory accesses without
		 * the extra address manipulation GPU side.
		 *
		 * VLV and CHV don't have GPU swizzling.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (GRAPHICS_VER(i915) >= 6) {
		if (i915->preserve_bios_swizzle) {
			if (intel_uncore_read(uncore, DISP_ARB_CTL) &
			    DISP_TILE_SURFACE_SWIZZLING) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		} else {
			u32 dimm_c0, dimm_c1;

			dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0);
			dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1);
			dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			/*
			 * Enable swizzling when the channels are populated
			 * with identically sized dimms. We don't need to check
			 * the 3rd channel because no cpu with gpu attached
			 * ships in that configuration. Also, swizzling only
			 * makes sense for 2 channels anyway.
			 */
			if (dimm_c0 == dimm_c1) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		}
	} else if (GRAPHICS_VER(i915) == 5) {
		/*
		 * On Ironlake, whatever the DRAM config, the GPU always uses
		 * the same swizzling setup.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_9_10;
		swizzle_y = I915_BIT_6_SWIZZLE_9;
	} else if (GRAPHICS_VER(i915) == 2) {
		/*
		 * As far as we know, the 865 doesn't have these bit 6
		 * swizzling issues.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) {
		/*
		 * The 965, G33, and newer, have a very flexible memory
		 * configuration.  It will enable dual-channel mode
		 * (interleaving) on as much memory as it can, and the GPU
		 * will additionally sometimes enable different bit 6
		 * swizzling for tiled objects from the CPU.
		 *
		 * Here's what I found on the G965:
		 *    slot fill         memory size  swizzling
		 * 0A   0B   1A   1B    1-ch   2-ch
		 * 512  0    0    0     512    0     O
		 * 512  0    512  0     16     1008  X
		 * 512  0    0    512   16     1008  X
		 * 0    512  0    512   16     1008  X
		 * 1024 1024 1024 0     2048   1024  O
		 *
		 * We could probably detect this based on either the DRB
		 * matching, which was the case for the swizzling required in
		 * the table above, or from the 1-ch value being less than
		 * the minimum size of a rank.
		 *
		 * Reports indicate that the swizzling actually
		 * varies depending upon page placement inside the
		 * channels, i.e. we see swizzled pages where the
		 * banks of memory are paired and unswizzled on the
		 * uneven portion, so leave that as unknown.
		 */
		if (intel_uncore_read16(uncore, C0DRB3_BW) ==
		    intel_uncore_read16(uncore, C1DRB3_BW)) {
			swizzle_x = I915_BIT_6_SWIZZLE_9_10;
			swizzle_y = I915_BIT_6_SWIZZLE_9;
		}
	} else {
		u32 dcc = intel_uncore_read(uncore, DCC);

		/*
		 * On 9xx chipsets, channel interleave by the CPU is
		 * determined by DCC.  For single-channel, neither the CPU
		 * nor the GPU do swizzling.  For dual channel interleaved,
		 * the GPU's interleave is bit 9 and 10 for X tiled, and bit
		 * 9 for Y tiled.  The CPU's interleave is independent, and
		 * can be based on either bit 11 (haven't seen this yet) or
		 * bit 17 (common).
		 */
		switch (dcc & DCC_ADDRESSING_MODE_MASK) {
		case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
			swizzle_x = I915_BIT_6_SWIZZLE_NONE;
			swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			break;
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
			if (dcc & DCC_CHANNEL_XOR_DISABLE) {
				/*
				 * This is the base swizzling by the GPU for
				 * tiled buffers.
				 */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
				/* Bit 11 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
				swizzle_y = I915_BIT_6_SWIZZLE_9_11;
			} else {
				/* Bit 17 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
				swizzle_y = I915_BIT_6_SWIZZLE_9_17;
			}
			break;
		}

		/* check for L-shaped memory aka modified enhanced addressing */
		if (GRAPHICS_VER(i915) == 4 &&
		    !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}

		if (dcc == 0xffffffff) {
			drm_err(&i915->drm, "Couldn't read from MCHBAR.  "
				"Disabling tiling.\n");
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}
	}

	if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
	    swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
		/*
		 * Userspace likes to explode if it sees unknown swizzling,
		 * so lie. We will finish the lie when reporting through
		 * the get-tiling-ioctl by reporting the physical swizzle
		 * mode as unknown instead.
		 *
		 * As we don't strictly know what the swizzling is, it may be
		 * bit17 dependent, and so we need to also prevent the pages
		 * from being moved.
		 */
		i915->gem_quirks |= GEM_QUIRK_PIN_SWIZZLED_PAGES;
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	}

	to_gt(i915)->ggtt->bit_6_swizzle_x = swizzle_x;
	to_gt(i915)->ggtt->bit_6_swizzle_y = swizzle_y;
}

/*
 * Swap every 64 bytes of this page around, to account for it having a new
 * bit 17 of its physical address and therefore being interpreted differently
 * by the GPU.
 */
static void swizzle_page(struct page *page)
{
	char temp[64];
	char *vaddr;
	int i;

	vaddr = kmap_local_page(page);

	for (i = 0; i < PAGE_SIZE; i += 128) {
		memcpy(temp, &vaddr[i], 64);
		memcpy(&vaddr[i], &vaddr[i + 64], 64);
		memcpy(&vaddr[i + 64], temp, 64);
	}

	kunmap_local(vaddr);
}

/**
 * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function fixes up the swizzling in case any page frame number for this
 * object has changed in bit 17 since that state has been saved with
 * i915_gem_object_save_bit_17_swizzle().
 *
 * This is called when pinning backing storage again, since the kernel is free
 * to move unpinned backing storage around (either by directly moving pages or
 * by swapping them out and back in again).
 */
void
i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
				  struct sg_table *pages)
{
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL)
		return;

	i = 0;
	for_each_sgt_page(page, sgt_iter, pages) {
		char new_bit_17 = page_to_phys(page) >> 17;

		if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
			swizzle_page(page);
			set_page_dirty(page);
		}

		i++;
	}
}

/**
 * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function saves the bit 17 of each page frame number so that swizzling
 * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
 * be called before the backing storage can be unpinned.
 */
void
i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
				    struct sg_table *pages)
{
	const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL) {
		obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
		if (obj->bit_17 == NULL) {
			drm_err(obj->base.dev,
				"Failed to allocate memory for bit 17 record\n");
			return;
		}
	}

	i = 0;

	for_each_sgt_page(page, sgt_iter, pages) {
		if (page_to_phys(page) & (1 << 17))
			__set_bit(i, obj->bit_17);
		else
			__clear_bit(i, obj->bit_17);
		i++;
	}
}
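
/*
 * Illustrative sketch (not part of the driver): the two helpers above are
 * used as a pair around any point where the backing store may be relocated;
 * the real call sites live in the shmem backing-store get/put pages paths.
 *
 *	// before unpinning/swapping out the pages of a tiled object whose
 *	// swizzle mode is bit-17 dependent:
 *	i915_gem_object_save_bit_17_swizzle(obj, pages);
 *
 *	// ... pages may be swapped out and come back at physical
 *	// addresses with a different bit 17 ...
 *
 *	// after re-pinning the (possibly relocated) pages:
 *	i915_gem_object_do_bit_17_swizzle(obj, pages);
 */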

void intel_ggtt_init_fences(struct i915_ggtt *ggtt)
{
	struct drm_i915_private *i915 = ggtt->vm.i915;
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	int num_fences;
	int i;

	INIT_LIST_HEAD(&ggtt->fence_list);
	INIT_LIST_HEAD(&ggtt->userfault_list);

	detect_bit_6_swizzle(ggtt);

	if (!i915_ggtt_has_aperture(ggtt))
		num_fences = 0;
	else if (GRAPHICS_VER(i915) >= 7 &&
		 !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)))
		num_fences = 32;
	else if (GRAPHICS_VER(i915) >= 4 ||
		 IS_I945G(i915) || IS_I945GM(i915) ||
		 IS_G33(i915) || IS_PINEVIEW(i915))
		num_fences = 16;
	else
		num_fences = 8;

	if (intel_vgpu_active(i915))
		num_fences = intel_uncore_read(uncore,
					       vgtif_reg(avail_rs.fence_num));
	ggtt->fence_regs = kcalloc(num_fences,
				   sizeof(*ggtt->fence_regs),
				   GFP_KERNEL);
	if (!ggtt->fence_regs)
		num_fences = 0;

	/* Initialize fence registers to zero */
	for (i = 0; i < num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_init(&fence->active, NULL, NULL, 0);
		fence->ggtt = ggtt;
		fence->id = i;
		list_add_tail(&fence->link, &ggtt->fence_list);
	}
	ggtt->num_fences = num_fences;

	intel_ggtt_restore_fences(ggtt);
}

void intel_ggtt_fini_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_fini(&fence->active);
	}

	kfree(ggtt->fence_regs);
}

void intel_gt_init_swizzling(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	struct intel_uncore *uncore = gt->uncore;

	if (GRAPHICS_VER(i915) < 5 ||
	    to_gt(i915)->ggtt->bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
		return;

	intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);

	if (GRAPHICS_VER(i915) == 5)
		return;

	intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);

	if (GRAPHICS_VER(i915) == 6)
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
	else if (GRAPHICS_VER(i915) == 7)
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
	else if (GRAPHICS_VER(i915) == 8)
		intel_uncore_write(uncore,
				   GAMTARBMODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
	else
		MISSING_CASE(GRAPHICS_VER(i915));
}