// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Christoph Hellwig
 */
#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"

struct phys_vec {
	phys_addr_t paddr;
	u32 len;
};

static bool __blk_map_iter_next(struct blk_map_iter *iter)
{
	if (iter->iter.bi_size)
		return true;
	if (!iter->bio || !iter->bio->bi_next)
		return false;

	iter->bio = iter->bio->bi_next;
	if (iter->is_integrity) {
		iter->iter = bio_integrity(iter->bio)->bip_iter;
		iter->bvecs = bio_integrity(iter->bio)->bip_vec;
	} else {
		iter->iter = iter->bio->bi_iter;
		iter->bvecs = iter->bio->bi_io_vec;
	}
	return true;
}

static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
		struct phys_vec *vec)
{
	unsigned int max_size;
	struct bio_vec bv;

	if (!iter->iter.bi_size)
		return false;

	bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
	vec->paddr = bvec_phys(&bv);
	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
	bv.bv_len = min(bv.bv_len, max_size);
	bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);

	/*
	 * If we are entirely done with this bi_io_vec entry, check if the next
	 * one could be merged into it. This typically happens when moving to
	 * the next bio, but some callers also don't pack bvecs tight.
	 */
	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
		struct bio_vec next;

		if (!__blk_map_iter_next(iter))
			break;

		next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
		if (bv.bv_len + next.bv_len > max_size ||
		    !biovec_phys_mergeable(req->q, &bv, &next))
			break;

		bv.bv_len += next.bv_len;
		bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
	}

	vec->len = bv.bv_len;
	return true;
}

/*
 * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
 * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
 * we need to ensure our segments are aligned to this as well.
 *
 * Note that there is no point in using the slightly more complicated IOVA based
 * path for single segment mappings.
 */
static inline bool blk_can_dma_map_iova(struct request *req,
		struct device *dma_dev)
{
	return !((queue_virt_boundary(req->q) + 1) &
		dma_get_merge_boundary(dma_dev));
}
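
/*
 * Worked example for the check above (illustrative numbers only): with a 4k
 * IOMMU granule, dma_get_merge_boundary() returns 0xfff, so a queue with a 4k
 * virt_boundary mask (0xfff) yields (0xfff + 1) & 0xfff == 0 and may use the
 * IOVA path, while a queue without a virt_boundary yields
 * (0 + 1) & 0xfff != 0 and falls back to per-segment mappings.
 */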

static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
	iter->len = vec->len;
	return true;
}

static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
		struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
	if (dma_mapping_error(dma_dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return false;
	}
	iter->len = vec->len;
	return true;
}

static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		struct phys_vec *vec)
{
	enum dma_data_direction dir = rq_dma_dir(req);
	unsigned int mapped = 0;
	int error;

	iter->addr = state->addr;
	iter->len = dma_iova_size(state);

	do {
		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
				vec->len, dir, 0);
		if (error)
			break;
		mapped += vec->len;
	} while (blk_map_iter_next(req, &iter->iter, vec));

	error = dma_iova_sync(dma_dev, state, 0, mapped);
	if (error) {
		iter->status = errno_to_blk_status(error);
		return false;
	}

	return true;
}

static inline void blk_rq_map_iter_init(struct request *rq,
		struct blk_map_iter *iter)
{
	struct bio *bio = rq->bio;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		*iter = (struct blk_map_iter) {
			.bvecs = &rq->special_vec,
			.iter = {
				.bi_size = rq->special_vec.bv_len,
			}
		};
	} else if (bio) {
		*iter = (struct blk_map_iter) {
			.bio = bio,
			.bvecs = bio->bi_io_vec,
			.iter = bio->bi_iter,
		};
	} else {
		/* the internal flush request may not have a bio attached */
		*iter = (struct blk_map_iter) {};
	}
}

static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		unsigned int total_len)
{
	struct phys_vec vec;

	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
	iter->status = BLK_STS_OK;

	/*
	 * Grab the first segment ASAP because we'll need it to check for P2P
	 * transfers.
	 */
	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
				 phys_to_page(vec.paddr))) {
	case PCI_P2PDMA_MAP_BUS_ADDR:
		if (iter->iter.is_integrity)
			bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA;
		else
			req->cmd_flags |= REQ_P2PDMA;
		return blk_dma_map_bus(iter, &vec);
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
		/*
		 * P2P transfers through the host bridge are treated the
		 * same as non-P2P transfers below and during unmap.
		 */
	case PCI_P2PDMA_MAP_NONE:
		break;
	default:
		iter->status = BLK_STS_INVAL;
		return false;
	}

	if (blk_can_dma_map_iova(req, dma_dev) &&
	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}

/**
 * blk_rq_dma_map_iter_start - map the first DMA segment for a request
 * @req: request to map
 * @dma_dev: device to map to
 * @state: DMA IOVA state
 * @iter: block layer DMA iterator
 *
 * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
 * caller and don't need to be initialized. @state needs to be stored for use
 * at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	blk_rq_map_iter_init(req, &iter->iter);
	return blk_dma_map_iter_start(req, dma_dev, state, iter,
			blk_rq_payload_bytes(req));
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);

/**
 * blk_rq_dma_map_iter_next - map the next DMA segment for a request
 * @req: request to map
 * @dma_dev: device to map to
 * @state: DMA IOVA state
 * @iter: block layer DMA iterator
 *
 * Iterate to the next mapping after a previous call to
 * blk_rq_dma_map_iter_start(). See there for a detailed description of the
 * arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
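
/*
 * Example usage (illustrative sketch only, not part of this file; the pdu
 * fields and my_queue_segment() are hypothetical placeholders): a driver
 * typically maps a request with a start/next loop and keeps the
 * dma_iova_state around for unmapping at completion time:
 *
 *	struct blk_dma_iter iter;
 *
 *	if (!blk_rq_dma_map_iter_start(req, dma_dev, &pdu->dma_state, &iter))
 *		return iter.status;	// error, or BLK_STS_OK for no payload
 *	do {
 *		my_queue_segment(pdu, iter.addr, iter.len);
 *	} while (!blk_rq_dma_map_coalesce(&pdu->dma_state) &&
 *		 blk_rq_dma_map_iter_next(req, dma_dev, &pdu->dma_state,
 *					  &iter));
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;	// a later segment failed to map
 *
 * If blk_rq_dma_map_coalesce() returns true the whole payload was coalesced
 * into the single IOVA range returned by the first call, so no further
 * iteration is needed.
 */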

static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}

/*
 * Map a request to scatterlist, return number of sg entries setup. Caller
 * must make sure sg can hold rq->nr_phys_segments entries.
 */
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		struct scatterlist **last_sg)
{
	struct blk_map_iter iter;
	struct phys_vec vec;
	int nsegs = 0;

	blk_rq_map_iter_init(rq, &iter);
	while (blk_map_iter_next(rq, &iter, &vec)) {
		*last_sg = blk_next_sg(last_sg, sglist);
		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		nsegs++;
	}

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must have gone wrong if the computed number of segments
	 * is bigger than the number of the request's physical segments.
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
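
/*
 * Example usage (illustrative sketch only; the pdu scatterlist and dma_dev
 * are hypothetical placeholders): a scatterlist-based driver sizes the table
 * for blk_rq_nr_phys_segments() entries and hands the result to the DMA API:
 *
 *	struct scatterlist *last_sg = NULL;
 *	int nsegs;
 *
 *	nsegs = __blk_rq_map_sg(rq, pdu->sgl, &last_sg);
 *	if (nsegs && !dma_map_sg(dma_dev, pdu->sgl, nsegs, rq_dma_dir(rq)))
 *		return BLK_STS_RESOURCE;
 */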

#ifdef CONFIG_BLK_DEV_INTEGRITY
/**
 * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
 * for a request
 * @req: request to map
 * @dma_dev: device to map to
 * @state: DMA IOVA state
 * @iter: block layer DMA iterator
 *
 * Start DMA mapping the integrity data of @req to @dma_dev. @state and @iter
 * are provided by the caller and don't need to be initialized. @state needs
 * to be stored for use at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr
 * and the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to
 * blk_rq_integrity_dma_map_iter_next() to try to map the following segments.
 */
bool blk_rq_integrity_dma_map_iter_start(struct request *req,
		struct device *dma_dev, struct dma_iova_state *state,
		struct blk_dma_iter *iter)
{
	unsigned int len = bio_integrity_bytes(&req->q->limits.integrity,
					       blk_rq_sectors(req));
	struct bio *bio = req->bio;

	iter->iter = (struct blk_map_iter) {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};
	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);

/**
 * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for
 * a request
 * @req: request to map
 * @dma_dev: device to map to
 * @iter: block layer DMA iterator
 *
 * Iterate to the next integrity mapping after a previous call to
 * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description
 * of the arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_integrity_dma_map_iter_next(struct request *req,
		struct device *dma_dev, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
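
/*
 * Example usage (illustrative sketch only; my_queue_meta_segment() and the
 * pdu fields are hypothetical): integrity metadata is mapped with the same
 * start/next pattern as the data payload above, just with the _integrity_
 * variants and its own dma_iova_state; the blk_rq_dma_map_coalesce() shortcut
 * shown for the data path applies here as well:
 *
 *	if (blk_rq_integrity_dma_map_iter_start(req, dma_dev,
 *			&pdu->meta_dma_state, &iter)) {
 *		do {
 *			my_queue_meta_segment(pdu, iter.addr, iter.len);
 *		} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev,
 *				&iter));
 *	}
 *	if (iter.status != BLK_STS_OK)
 *		goto unmap_data;
 */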

/**
 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
 * @rq: request to map
 * @sglist: target scatterlist
 *
 * Description: Map the integrity vectors in request into a
 * scatterlist. The scatterlist must be big enough to hold all
 * elements, i.e. sized using blk_rq_count_integrity_sg() or
 * rq->nr_integrity_segments.
 */
int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
{
	struct request_queue *q = rq->q;
	struct scatterlist *sg = NULL;
	struct bio *bio = rq->bio;
	unsigned int segments = 0;
	struct phys_vec vec;

	struct blk_map_iter iter = {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};

	while (blk_map_iter_next(rq, &iter, &vec)) {
		sg = blk_next_sg(&sg, sglist);
		sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		segments++;
	}

	if (sg)
		sg_mark_end(sg);

	/*
	 * Something must have gone wrong if the computed number of segments
	 * is bigger than the number of the request's physical integrity
	 * segments.
	 */
	BUG_ON(segments > rq->nr_integrity_segments);
	BUG_ON(segments > queue_max_integrity_segments(q));
	return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
#endif