/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"
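
/*
 * This lives in .noinstr.text because memcpy can be called from
 * noinstr code, where instrumentation (tracing, sanitizers) must be
 * avoided.
 */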

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
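	/*
	 * On CPUs with X86_FEATURE_FSRM the 'jmp memcpy_orig' above is
	 * patched out at boot (replaced with NOPs), so execution falls
	 * through to the 'rep movsb' fast path below.
	 */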

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)
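
/*
 * For reference, an illustrative C-level sketch of the fast path above
 * (a reading aid only, not the actual implementation; the function
 * name is made up):
 *
 *	static void *memcpy_fsrm_sketch(void *dest, const void *src, size_t len)
 *	{
 *		void *ret = dest;
 *
 *		asm volatile("rep movsb"
 *			     : "+D" (dest), "+S" (src), "+c" (len)
 *			     : : "memory");
 *		return ret;
 *	}
 */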

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a forward copy could create memory false
	 * dependences (only the low address bytes are compared, as a
	 * cheap aliasing heuristic), and pick the copy direction
	 * accordingly.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
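	/*
	 * The count is biased by 0x20 so that the subq at the top of
	 * the loop both decrements the count and, via CF, signals when
	 * the biased count underflows; movq and leaq do not modify
	 * flags, so the jae (= jnc) at the bottom still tests that
	 * result.  The backward loop below relies on the same trick.
	 */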
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail
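
	/*
	 * Illustrative C-level sketch of the loop above (reading aid
	 * only; copy_32_bytes() is a hypothetical helper standing in
	 * for the 4x8-byte movq sequence):
	 *
	 *	len -= 32;			// bias
	 *	do {
	 *		bool more = len >= 32;	// CF from the subq
	 *		len -= 32;
	 *		copy_32_bytes(dest, src);
	 *		src += 32;
	 *		dest += 32;
	 *	} while (more);
	 *	len += 32;			// undo the bias
	 */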

.Lcopy_backward:
	/*
	 * Copy backward: point the source and destination at the tail
	 * of the buffers first.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations can issue per cycle, so pad with
	 * NOPs to keep the loop within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Undo the count bias and rewind the pointers to the head of
	 * the remaining bytes for the common tail handling below.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes as two possibly-overlapping 16-byte
	 * windows: the first 16 bytes and the last 16 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
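
	/*
	 * Illustrative sketch of the overlapping-window trick above,
	 * also used (with narrower windows) by the 8..15 and 4..7 byte
	 * cases below.  load8()/store8() are hypothetical unaligned
	 * 8-byte access helpers; all loads happen before any store, so
	 * the overlap is harmless:
	 *
	 *	u64 a = load8(src),            b = load8(src + 8);
	 *	u64 c = load8(src + len - 16), d = load8(src + len - 8);
	 *	store8(dest, a);            store8(dest + 8, b);
	 *	store8(dest + len - 16, c); store8(dest + len - 8, d);
	 */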
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes: first 8 and last 8, possibly overlapping.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes: first 4 and last 4, possibly overlapping.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy 1 to 3 bytes.  The subl above set ZF iff the count was
	 * exactly 1; movzbl does not clobber flags, so the jz below
	 * still tests that result.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
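
	/*
	 * Illustrative sketch of the 1..3 byte case above (reading aid;
	 * dst/src/len name the original arguments):
	 *
	 *	u8 first = src[0];
	 *	if (len > 1) {
	 *		dst[1] = src[1];
	 *		dst[len - 1] = src[len - 1];	// may redo dst[1]
	 *	}
	 *	dst[0] = first;
	 */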
SYM_FUNC_END(memcpy_orig)