VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp@ 98962

Last change on this file since 98962 was 98103, checked in by vboxsync, 2 years ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 35.2 KB
Line 
1/* $Id: ioqueue-iouringfile-provider.cpp 98103 2023-01-17 14:15:46Z vboxsync $ */
2/** @file
3 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
4 */
5
6/*
7 * Copyright (C) 2019-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
38 * @internal
39 *
40 * The io_uring interface is the most recent interface added to the Linux kernel
41 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
42 * thus not available on most systems as of writing this backend (July 2019).
43 * It supersedes the old async I/O interface and cleans up with some restrictions like
44 * having to disable caching for the file.
45 * The interface is centered around a submission and completion queue to queue multiple new
46 * requests for the kernel to process and get notified about completions to reduce the amount
47 * of context switches to an absolute minimum. It also offers advanced features like
48 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
49 * even more.
50 *
51 * The first implementation will only make use of the basic features and more advanced features
52 * will be added later.
53 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
54 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
55 * while still keeping a consistent platform independent API which allows efficient implementations on
56 * other hosts when they come up.
57 *
58 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
59 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
61 * * http://kernel.dk/io_uring.pdf
62 * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
63 */
64
65
66/*********************************************************************************************************************************
67* Header Files *
68*********************************************************************************************************************************/
69#define LOG_GROUP RTLOGGROUP_IOQUEUE
70#include <iprt/ioqueue.h>
71
72#include <iprt/assertcompile.h>
73#include <iprt/asm.h>
74#include <iprt/errcore.h>
75#include <iprt/file.h>
76#include <iprt/log.h>
77#include <iprt/mem.h>
78#include <iprt/string.h>
79
80#include <errno.h>
81#include <unistd.h>
82#include <signal.h>
83#include <sys/mman.h>
84#include <sys/syscall.h>
85#include <sys/uio.h>
86
87#include "internal/ioqueue.h"
88
89
90/*********************************************************************************************************************************
91* Defined Constants And Macros *
92*********************************************************************************************************************************/
93
94/** The syscall number of io_uring_setup(). */
95#define LNX_IOURING_SYSCALL_SETUP 425
96/** The syscall number of io_uring_enter(). */
97#define LNX_IOURING_SYSCALL_ENTER 426
98/** The syscall number of io_uring_register(). */
99#define LNX_IOURING_SYSCALL_REGISTER 427
100/** eventfd2() syscall not associated with io_uring but used for kicking waiters. */
101#define LNX_SYSCALL_EVENTFD2 290
102
103
104/*********************************************************************************************************************************
105* Structures and Typedefs *
106*********************************************************************************************************************************/
107
/**
 * Linux io_uring completion event.
 */
typedef struct LNXIOURINGCQE
{
    /** Opaque user data associated with the completed request. */
    uint64_t                    u64User;
    /** The status code of the request: negated errno on failure, otherwise the
     * (non-negative) result, e.g. number of bytes transferred. */
    int32_t                     rcLnx;
    /** Some flags which are not used as of now. */
    uint32_t                    fFlags;
} LNXIOURINGCQE;
AssertCompileSize(LNXIOURINGCQE, 16);
/** Pointer to a Linux io_uring completion event. */
typedef LNXIOURINGCQE *PLNXIOURINGCQE;
/** Pointer to a constant Linux io_uring completion event. */
typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
125
126
/**
 * Linux io_uring submission queue entry.
 */
typedef struct LNXIOURINGSQE
{
    /** The opcode for the request, see LNX_IOURING_OPC_*. */
    uint8_t                     u8Opc;
    /** Common flags for the request, see LNX_IOURING_SQE_F_*. */
    uint8_t                     u8Flags;
    /** Assigned I/O priority. */
    uint16_t                    u16IoPrio;
    /** The file descriptor the request is for. */
    int32_t                     i32Fd;
    /** The start offset into the file for the request. */
    uint64_t                    u64OffStart;
    /** Buffer pointer or pointer to io vector array depending on opcode. */
    uint64_t                    u64AddrBufIoVec;
    /** Size of the buffer in bytes or number of io vectors. */
    uint32_t                    u32BufIoVecSz;
    /** Opcode dependent data. */
    union
    {
        /** Flags for read/write requests. */
        uint32_t                u32KrnlRwFlags;
        /** Flags for fsync() like requests, see LNX_IOURING_OPC_FSYNC_*. */
        uint32_t                u32FsyncFlags;
        /** Flags for poll() like requests. */
        uint16_t                u16PollFlags;
        /** Flags for sync_file_range() like requests. */
        uint32_t                u32SyncFileRangeFlags;
        /** Flags for requests requiring a msg structure. */
        uint32_t                u32MsgFlags;
    } uOpc;
    /** Opaque user data associated with the request and returned during completion. */
    uint64_t                    u64User;
    /** Request type dependent data. */
    union
    {
        /** Fixed buffer index if indicated by the request flags. */
        uint16_t                u16FixedBufIdx;
        /** Padding to align the structure to 64 bytes. */
        uint64_t                au64Padding[3];
    } uReq;
} LNXIOURINGSQE;
AssertCompileSize(LNXIOURINGSQE, 64);
/** Pointer to a Linux io_uring submission queue entry. */
typedef LNXIOURINGSQE *PLNXIOURINGSQE;
/** Pointer to a constant Linux io_uring submission queue entry. */
typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
176
177
/**
 * Linux io_uring SQ ring header structure to maintain the queue.
 *
 * @note The members are byte offsets into the SQ ring mapping, filled in by
 *       io_uring_setup(); they are resolved to pointers/values when the ring
 *       is mapped into userspace (see rtIoQueueLnxIoURingFileProv_QueueInit).
 */
typedef struct LNXIOURINGSQ
{
    /** Offset where the current head counter is stored (consumed by the kernel). */
    uint32_t                    u32OffHead;
    /** Offset where the current tail counter is stored (advanced by userland when
     * submitting new requests). */
    uint32_t                    u32OffTail;
    /** Offset of the mask applied to the head and tail counters to retrieve the ring index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the SQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the flags set asynchronously by the kernel. */
    uint32_t                    u32OffFlags;
    /** Offset of the counter of dropped requests. */
    uint32_t                    u32OffDroppedReqs;
    /** Offset where to find the array of SQ entry indices. */
    uint32_t                    u32OffArray;
    /** Reserved. */
    uint32_t                    u32Rsvd0;
    /** Reserved. */
    uint64_t                    u64Rsvd1;
} LNXIOURINGSQ;
AssertCompileSize(LNXIOURINGSQ, 40);
/** Pointer to a Linux io_uring SQ ring header. */
typedef LNXIOURINGSQ *PLNXIOURINGSQ;
/** Pointer to a constant Linux io_uring SQ ring header. */
typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
207
208
/**
 * Linux io_uring CQ ring header structure to maintain the queue.
 *
 * @note Like LNXIOURINGSQ the members are byte offsets into the CQ ring
 *       mapping returned by io_uring_setup().
 */
typedef struct LNXIOURINGCQ
{
    /** Offset of the head counter, advanced by userland when reading completions. */
    uint32_t                    u32OffHead;
    /** Offset of the tail counter, modified by the kernel when completion events happen. */
    uint32_t                    u32OffTail;
    /** Offset of the mask applied to the head and tail counters to retrieve the ring index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the CQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the counter of CQ overflows that happened. */
    uint32_t                    u32OffOverflowCnt;
    /** Offset where the array of CQEs starts within the CQ ring mapping. */
    uint32_t                    u32OffCqes;
    /** Reserved. */
    uint64_t                    au64Rsvd0[2];
} LNXIOURINGCQ;
AssertCompileSize(LNXIOURINGCQ, 40);
/** Pointer to a Linux io_uring CQ ring header. */
typedef LNXIOURINGCQ *PLNXIOURINGCQ;
/** Pointer to a constant Linux io_uring CQ ring header. */
typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
234
235
/**
 * Linux io_uring parameters passed to io_uring_setup().
 */
typedef struct LNXIOURINGPARAMS
{
    /** Number of SQ entries requested, must be power of 2. */
    uint32_t                    u32SqEntriesCnt;
    /** Number of CQ entries requested, must be power of 2. */
    uint32_t                    u32CqEntriesCnt;
    /** Flags for the ring, see LNX_IOURING_SETUP_F_*. */
    uint32_t                    u32Flags;
    /** Affinity of the kernel side SQ polling thread if enabled. */
    uint32_t                    u32SqPollCpu;
    /** Milliseconds after which the kernel side SQ polling thread goes to sleep
     * if there are no requests to process. */
    uint32_t                    u32SqPollIdleMs;
    /** Reserved. */
    uint32_t                    au32Rsvd0[5];
    /** Offsets returned for the submission queue. */
    LNXIOURINGSQ                SqOffsets;
    /** Offsets returned for the completion queue. */
    LNXIOURINGCQ                CqOffsets;
} LNXIOURINGPARAMS;
/** Pointer to Linux io_uring parameters. */
typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
/** Pointer to constant Linux io_uring parameters. */
typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
263
264
265/** @name LNXIOURINGSQE::u8Opc defined opcodes.
266 * @{ */
267/** Opcode to profile the interface, does nothing. */
268#define LNX_IOURING_OPC_NOP 0
269/** preadv() like request. */
270#define LNX_IOURING_OPC_READV 1
271/** pwritev() like request. */
272#define LNX_IOURING_OPC_WRITEV 2
273/** fsync() like request. */
274#define LNX_IOURING_OPC_FSYNC 3
275/** Read request using a fixed preset buffer. */
276#define LNX_IOURING_OPC_READ_FIXED 4
277/** Write request using a fixed preset buffer. */
278#define LNX_IOURING_OPC_WRITE_FIXED 5
279/** Add file descriptor to pollset. */
280#define LNX_IOURING_OPC_POLL_ADD 6
281/** Remove file descriptor from pollset. */
282#define LNX_IOURING_OPC_POLL_REMOVE 7
283/** sync_file_range() like request. */
284#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
285/** sendmsg() like request. */
286#define LNX_IOURING_OPC_SENDMSG 9
287/** recvmsg() like request. */
288#define LNX_IOURING_OPC_RECVMSG 10
289/** @} */
290
291
292/** @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
293 * @{ */
294/** Sync userdata as well instead of metadata only. */
295#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0)
296/** @} */
297
298
299/** @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
300 * @{ */
301/** The I/O context is polled. */
302#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0)
303/** The kernel should poll the submission queue. */
304#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1)
305/** Sets the CPU affinity of the kernel thread polling the submission queue. */
306#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2)
307/** @} */
308
309
310/** @name Flags for LNXIOURINGSQE::u8Flags.
311 * @{ */
312/** The file descriptor was registered before use. */
313#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0)
314/** Complete all active requests before issuing the request with the flag set. */
315#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1)
316/** Links the request with the flag set to the next one. */
317#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2)
318/** @} */
319
320
321/** @name Magic mmap offsets to map submission and completion queues.
322 * @{ */
323/** Used to map the submission queue. */
324#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0)
325/** Used to map the completion queue. */
326#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000)
327/** Used to map the submission queue entries array. */
328#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000)
329/** @} */
330
331
332/** @name Flags used for the SQ ring structure.
333 * @{ */
334/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
335#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0)
336/** @} */
337
338
339/** @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
340 * @{ */
341/** Retrieve completion events for the completion queue. */
342#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0)
343/** Wakes the suspended kernel thread processing the requests. */
344#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1)
345/** @} */
346
347
348/** @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
349 * @{ */
350/** Register a fixed set of buffers. */
351#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0
352/** Unregisters a fixed set of buffers registered previously. */
353#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
354/** Register a fixed set of files. */
355#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2
356/** Unregisters a fixed set of files registered previously. */
357#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3
358/** Register an eventfd associated with the I/O ring. */
359#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4
360/** Unregisters an eventfd registered previously. */
361#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
362/** @} */
363
364
/**
 * SQ ring structure.
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUESQ
{
    /** Pointer to the head counter (kernel consumes entries from here). */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter (userland publishes new entries here). */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Pointer to the global flags. */
    volatile uint32_t           *pfFlags;
    /** Pointer to the indirection array used for indexing the real SQ entries. */
    volatile uint32_t           *paidxSqes;
} RTIOQUEUESQ;
386
387
/**
 * CQ ring structure.
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUECQ
{
    /** Pointer to the head counter (userland consumes completions from here). */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter (kernel publishes completions here). */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Pointer to the completion entry ring. */
    volatile LNXIOURINGCQE      *paCqes;
} RTIOQUEUECQ;
407
408
/**
 * Internal I/O queue provider instance data.
 */
typedef struct RTIOQUEUEPROVINT
{
    /** The io_uring file descriptor. */
    int                         iFdIoCtx;
    /** The eventfd file descriptor registered with the ring, used for
     * interruptible waits in pfnEvtWait/pfnEvtWaitWakeup. */
    int                         iFdEvt;
    /** The submission queue. */
    RTIOQUEUESQ                 Sq;
    /** The currently uncommitted tail for the SQ (published to the shared
     * ring tail by pfnCommit). */
    uint32_t                    idxSqTail;
    /** Number of uncommitted SQEs. */
    uint32_t                    cSqesToCommit;
    /** The completion queue. */
    RTIOQUEUECQ                 Cq;
    /** Pointer to the mapped SQE entries. */
    PLNXIOURINGSQE              paSqes;
    /** Pointer to the iovec structure used for non S/G requests (one per SQE slot). */
    struct iovec                *paIoVecs;
    /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
    void                        *pvMMapSqRing;
    /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
    void                        *pvMMapCqRing;
    /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
    void                        *pvMMapSqes;
    /** Size of the mapped SQ ring, used for unmapping. */
    size_t                      cbMMapSqRing;
    /** Size of the mapped CQ ring, used for unmapping. */
    size_t                      cbMMapCqRing;
    /** Size of the mapped SQ entries array, used for unmapping. */
    size_t                      cbMMapSqes;
    /** Flag whether the waiter was woken up externally. */
    volatile bool               fExtIntr;
} RTIOQUEUEPROVINT;
/** Pointer to the internal I/O queue provider instance data. */
typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
447
448
449/*********************************************************************************************************************************
450* Internal Functions *
451*********************************************************************************************************************************/
452
453/**
454 * Syscall wrapper for io_uring_setup().
455 *
456 * @returns IPRT status code.
457 * @param cEntries Number of entries for submission and completion queues.
458 * @param pParams Additional parameters for the I/O ring and updated return values
459 * on success.
460 * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
461 */
462DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
463{
464 int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
465 if (RT_UNLIKELY(rcLnx == -1))
466 return RTErrConvertFromErrno(errno);
467
468 *piFdIoCtx = rcLnx;
469 return VINF_SUCCESS;
470}
471
472
473/**
474 * Syscall wrapper for io_uring_enter().
475 *
476 * @returns IPRT status code.
477 * @param iFdIoCtx The I/O ring file descriptor.
478 * @param cToSubmit Maximum number of requests waiting for processing.
479 * @param cMinComplete Minimum number of completion events to accumulate before returning.
480 * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
481 */
482DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
483 uint32_t fFlags)
484{
485 int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
486 NULL, 0);
487 if (RT_UNLIKELY(rcLnx == -1))
488 return RTErrConvertFromErrno(errno);
489
490 return VINF_SUCCESS;
491}
492
493
494/**
495 * Syscall wrapper for io_uring_register().
496 *
497 * @returns IPRT status code.
498 * @param iFdIoCtx The I/O ring file descriptor.
499 * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
500 * @param pvArg Opaque arguments.
501 * @param cArgs Number of arguments.
502 */
503DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
504 uint32_t cArgs)
505{
506 int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
507 if (RT_UNLIKELY(rcLnx == -1))
508 return RTErrConvertFromErrno(errno);
509
510 return VINF_SUCCESS;
511}
512
513
514/**
515 * mmap() wrapper for the common bits and returning an IPRT status code.
516 *
517 * @returns IPRT status code.
518 * @param iFdIoCtx The I/O ring file descriptor.
519 * @param offMmap The mmap() offset.
520 * @param cbMmap How much to map.
521 * @param ppv Where to store the pointer to the mapping on success.
522 */
523DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
524{
525 void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
526 if (pv != MAP_FAILED)
527 {
528 *ppv = pv;
529 return VINF_SUCCESS;
530 }
531
532 return RTErrConvertFromErrno(errno);
533}
534
535
536/**
537 * eventfd2() syscall wrapper.
538 *
539 * @returns IPRT status code.
540 * @param uValInit The initial value of the maintained counter.
541 * @param fFlags Flags controlling the eventfd behavior.
542 * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
543 */
544DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
545{
546 int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
547 if (RT_UNLIKELY(rcLnx == -1))
548 return RTErrConvertFromErrno(errno);
549
550 *piFdEvt = rcLnx;
551 return VINF_SUCCESS;
552}
553
554
/**
 * Checks the completion event queue for pending events.
 *
 * @returns nothing.
 * @param   pThis       The provider instance.
 * @param   paCEvt      Pointer to the array of completion events.
 * @param   cCEvt       Maximum number of completion events the array can hold.
 * @param   pcCEvtSeen  Where to store the number of completion events processed.
 */
static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
                                               uint32_t cCEvt, uint32_t *pcCEvtSeen)
{
    /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
    ASMReadFence();
    uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
    uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
    ASMReadFence();

    uint32_t cCEvtSeen = 0;

    /* Drain CQEs until the ring is empty or the caller's array is full. */
    while (   idxCqTail != idxCqHead
           && cCEvtSeen < cCEvt)
    {
        /* Get the index. */
        uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
        volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];

        /* Translate the kernel CQE into an IPRT completion event: a negative
           rcLnx is a negated errno, otherwise it is the number of bytes transferred. */
        paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
        if (pCqe->rcLnx >= 0)
        {
            paCEvt->rcReq    = VINF_SUCCESS;
            paCEvt->cbXfered = (size_t)pCqe->rcLnx;
        }
        else
            paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);

#ifdef RT_STRICT /* poison */
        memset((void *)pCqe, 0xff, sizeof(*pCqe));
#endif

        paCEvt++;
        cCEvtSeen++;
        idxCqHead++;
    }

    *pcCEvtSeen = cCEvtSeen;

    /* Publish the new head so the kernel can reuse the consumed CQE slots.
       Paranoia strikes again. */
    ASMWriteFence();
    ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
    ASMWriteFence();
}
607
608
609/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
610static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
611{
612 /*
613 * Try to create a simple I/O ring and close it again.
614 * The common code/public API already checked for the proper handle type.
615 */
616 int iFdIoCtx = 0;
617 bool fSupp = false;
618 LNXIOURINGPARAMS Params;
619 RT_ZERO(Params);
620
621 int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
622 if (RT_SUCCESS(rc))
623 {
624 /*
625 * Check that we can register an eventfd descriptor to get notified about
626 * completion events while being able to kick the waiter externally out of the wait.
627 */
628 int iFdEvt = 0;
629 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
630 if (RT_SUCCESS(rc))
631 {
632 rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
633 &iFdEvt, 1 /*cArgs*/);
634 if (RT_SUCCESS(rc))
635 fSupp = true;
636
637 int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
638 }
639 int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
640 }
641
642 return fSupp;
643}
644
645
646/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
647static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
648 uint32_t cSqEntries, uint32_t cCqEntries)
649{
650 RT_NOREF(fFlags, cCqEntries);
651
652 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
653 LNXIOURINGPARAMS Params;
654 RT_ZERO(Params);
655
656 pThis->cSqesToCommit = 0;
657 pThis->fExtIntr = false;
658
659 int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
660 if (RT_SUCCESS(rc))
661 {
662 /* Map the rings into userspace. */
663 pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
664 pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
665 pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
666
667 pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
668 if (RT_LIKELY(pThis->paIoVecs))
669 {
670 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
671 if (RT_SUCCESS(rc))
672 {
673 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
674 if (RT_SUCCESS(rc))
675 {
676 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
677 if (RT_SUCCESS(rc))
678 {
679 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
680 if (RT_SUCCESS(rc))
681 {
682 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
683 if (RT_SUCCESS(rc))
684 {
685 uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
686
687 pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
688 pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
689 pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
690 pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
691 pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
692 pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
693 pThis->idxSqTail = *pThis->Sq.pidxTail;
694
695 pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
696
697 pbTmp = (uint8_t *)pThis->pvMMapCqRing;
698
699 pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
700 pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
701 pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
702 pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
703 pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
704 return VINF_SUCCESS;
705 }
706
707 munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
708 }
709
710 munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
711 }
712
713 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
714 AssertRC(rc);
715 }
716
717 close(pThis->iFdEvt);
718 }
719
720 RTMemFree(pThis->paIoVecs);
721 }
722
723 int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
724 }
725
726 return rc;
727}
728
729
730/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
731static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
732{
733 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
734
735 int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
736 rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
737 rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
738
739 int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
740 AssertRC(rc);
741
742 close(pThis->iFdEvt);
743 close(pThis->iFdIoCtx);
744 RTMemFree(pThis->paIoVecs);
745
746 RT_ZERO(pThis);
747}
748
749
750/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
751static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
752{
753 RT_NOREF(hIoQueueProv, pHandle);
754 /** @todo Add support for fixed file sets later. */
755 return VINF_SUCCESS;
756}
757
758
759/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
760static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
761{
762 RT_NOREF(hIoQueueProv, pHandle);
763 /** @todo Add support for fixed file sets later. */
764 return VINF_SUCCESS;
765}
766
767
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare}
 *
 * Fills in the next SQE slot and the matching iovec; the request only becomes
 * visible to the kernel when pfnCommit publishes the new tail.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
                                                                uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
                                                                void *pvUser)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    RT_NOREF(fReqFlags);

    /* NOTE(review): there is no SQ-full check here — this assumes the caller
       never prepares more requests than the ring holds before committing;
       confirm against the common I/O queue code. */
    uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
    PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
    struct iovec *pIoVec = &pThis->paIoVecs[idx];

    /* Even single-buffer requests go through a one-element iovec, as the
       READV/WRITEV opcodes are used below. */
    pIoVec->iov_base = pvBuf;
    pIoVec->iov_len  = cbBuf;

    pSqe->u8Flags         = 0;
    pSqe->u16IoPrio       = 0;
    pSqe->i32Fd           = (int32_t)RTFileToNative(pHandle->u.hFile);
    pSqe->u64OffStart     = off;
    pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
    pSqe->u32BufIoVecSz   = 1;
    pSqe->u64User         = (uint64_t)(uintptr_t)pvUser;

    /* Map the IPRT operation onto the matching io_uring opcode. */
    switch (enmOp)
    {
        case RTIOQUEUEOP_READ:
            pSqe->u8Opc = LNX_IOURING_OPC_READV;
            pSqe->uOpc.u32KrnlRwFlags = 0;
            break;
        case RTIOQUEUEOP_WRITE:
            pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
            pSqe->uOpc.u32KrnlRwFlags = 0;
            break;
        case RTIOQUEUEOP_SYNC:
            pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
            pSqe->uOpc.u32FsyncFlags = 0;
            break;
        default:
            AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
                                  VERR_INVALID_PARAMETER);
    }

    /* Record the SQE index locally; it only reaches the kernel when pfnCommit
       updates the shared tail counter. */
    pThis->Sq.paidxSqes[idx] = idx;
    pThis->idxSqTail++;
    pThis->cSqesToCommit++;
    return VINF_SUCCESS;
}
815
816
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit}
 *
 * Publishes all prepared SQEs to the kernel and submits them.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;

    /* Publish the new tail first so the kernel sees every prepared SQE before
       io_uring_enter() is invoked. */
    ASMWriteFence();
    ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
    ASMWriteFence();

    /* Submit everything accumulated since the last commit; cMinComplete is 0,
       i.e. we do not wait for completions here. */
    int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
    if (RT_SUCCESS(rc))
    {
        *pcReqsCommitted = pThis->cSqesToCommit;
        pThis->cSqesToCommit = 0;
    }

    return rc;
}
835
836
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait}
 *
 * Harvests completion events, blocking on the registered eventfd until at
 * least cMinWait events were gathered, an error occurred, or the wait was
 * interrupted via pfnEvtWaitWakeup (returns VERR_INTERRUPTED then).
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    int rc = VINF_SUCCESS;
    uint32_t cCEvtSeen = 0;

    RT_NOREF(fFlags);

    /*
     * Check the completion queue first for any completed events which might save us a
     * context switch later on.
     */
    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);

    while (   cCEvtSeen < cMinWait
           && RT_SUCCESS(rc))
    {
        /*
         * We can employ a blocking read on the event file descriptor, it will return
         * either when woken up externally or when there are completion events pending.
         */
        uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
        ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
        if (rcLnx == sizeof(uCnt))
        {
            /* Append newly arrived completions behind the ones gathered so far. */
            uint32_t cCEvtThisSeen = 0;
            rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
            cCEvtSeen += cCEvtThisSeen;

            /* Whether we got woken up externally (clears the flag atomically). */
            if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
                rc = VERR_INTERRUPTED;
        }
        else if (rcLnx == -1)
            rc = RTErrConvertFromErrno(errno);
        else
            AssertMsgFailed(("Unexpected read() -> 0\n"));
    }

    *pcCEvt = cCEvtSeen;
    return rc;
}
881
882
883/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
884static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
885{
886 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
887 int rc = VINF_SUCCESS;
888
889 if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
890 {
891 const uint64_t uValAdd = 1;
892 ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
893
894 Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
895 if (rcLnx == -1)
896 rc = RTErrConvertFromErrno(errno);
897 }
898
899 return rc;
900}
901
902
/**
 * Async file I/O queue provider virtual method table.
 *
 * Exported to the common I/O queue code which selects a provider via
 * pfnIsSupported.
 */
RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
{
    /** uVersion */
    RTIOQUEUEPROVVTABLE_VERSION,
    /** pszId */
    "LnxIoURingFile",
    /** cbIoQueueProv */
    sizeof(RTIOQUEUEPROVINT),
    /** enmHnd */
    RTHANDLETYPE_FILE,
    /** fFlags */
    0,
    /** pfnIsSupported */
    rtIoQueueLnxIoURingFileProv_IsSupported,
    /** pfnQueueInit */
    rtIoQueueLnxIoURingFileProv_QueueInit,
    /** pfnQueueDestroy */
    rtIoQueueLnxIoURingFileProv_QueueDestroy,
    /** pfnHandleRegister */
    rtIoQueueLnxIoURingFileProv_HandleRegister,
    /** pfnHandleDeregister */
    rtIoQueueLnxIoURingFileProv_HandleDeregister,
    /** pfnReqPrepare */
    rtIoQueueLnxIoURingFileProv_ReqPrepare,
    /** pfnReqPrepareSg (scatter/gather prepare not implemented yet) */
    NULL,
    /** pfnCommit */
    rtIoQueueLnxIoURingFileProv_Commit,
    /** pfnEvtWait */
    rtIoQueueLnxIoURingFileProv_EvtWait,
    /** pfnEvtWaitWakeup */
    rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
    /** uEndMarker */
    RTIOQUEUEPROVVTABLE_VERSION
};
941
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette