VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp@ 93115

Last change on this file since 93115 was 93115, checked in by vboxsync, 3 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 34.9 KB
Line 
1/* $Id: ioqueue-iouringfile-provider.cpp 93115 2022-01-01 11:31:46Z vboxsync $ */
2/** @file
3 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
4 */
5
6/*
7 * Copyright (C) 2019-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
28 * @internal
29 *
30 * The io_uring interface is the most recent interface added to the Linux kernel
31 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
32 * thus not available on most systems as of writing this backend (July 2019).
33 * It supersedes the old async I/O interface and cleans up with some restrictions like
34 * having to disable caching for the file.
35 * The interface is centered around a submission and completion queue to queue multiple new
36 * requests for the kernel to process and get notified about completions to reduce the amount
37 * of context switches to an absolute minimum. It also offers advanced features like
38 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
39 * even more.
40 *
41 * The first implementation will only make use of the basic features and more advanced features
42 * will be added later.
43 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
44 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
45 * while still keeping a consistent platform independent API which allows efficient implementations on
46 * other hosts when they come up.
47 *
48 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
49 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
51 * * http://kernel.dk/io_uring.pdf
52 * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
53 */
54
55
56/*********************************************************************************************************************************
57* Header Files *
58*********************************************************************************************************************************/
59#define LOG_GROUP RTLOGGROUP_IOQUEUE
60#include <iprt/ioqueue.h>
61
62#include <iprt/assertcompile.h>
63#include <iprt/asm.h>
64#include <iprt/errcore.h>
65#include <iprt/file.h>
66#include <iprt/log.h>
67#include <iprt/mem.h>
68#include <iprt/string.h>
69
70#include <errno.h>
71#include <unistd.h>
72#include <signal.h>
73#include <sys/mman.h>
74#include <sys/syscall.h>
75#include <sys/uio.h>
76
77#include "internal/ioqueue.h"
78
79
80/*********************************************************************************************************************************
81* Defined Constants And Macros *
82*********************************************************************************************************************************/
83
/** The syscall number of io_uring_setup(). */
#define LNX_IOURING_SYSCALL_SETUP 425
/** The syscall number of io_uring_enter(). */
#define LNX_IOURING_SYSCALL_ENTER 426
/** The syscall number of io_uring_register(). */
#define LNX_IOURING_SYSCALL_REGISTER 427
/** The eventfd2() syscall number; not part of io_uring but used for kicking waiters out of their wait. */
#define LNX_SYSCALL_EVENTFD2 290
92
93
94/*********************************************************************************************************************************
95* Structures and Typedefs *
96*********************************************************************************************************************************/
97
/**
 * Linux io_uring completion event (CQE).
 */
typedef struct LNXIOURINGCQE
{
    /** Opaque user data associated with the completed request. */
    uint64_t u64User;
    /** The status code of the request: negative errno value on failure,
     * otherwise the number of bytes transferred. */
    int32_t rcLnx;
    /** Some flags which are not used as of now. */
    uint32_t fFlags;
} LNXIOURINGCQE;
AssertCompileSize(LNXIOURINGCQE, 16);
/** Pointer to a Linux io_uring completion event. */
typedef LNXIOURINGCQE *PLNXIOURINGCQE;
/** Pointer to a constant Linux io_uring completion event. */
typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
115
116
/**
 * Linux io_uring submission queue entry (SQE).
 */
typedef struct LNXIOURINGSQE
{
    /** The opcode for the request, see LNX_IOURING_OPC_*. */
    uint8_t u8Opc;
    /** Common flags for the request, see LNX_IOURING_SQE_F_*. */
    uint8_t u8Flags;
    /** Assigned I/O priority. */
    uint16_t u16IoPrio;
    /** The file descriptor the request is for. */
    int32_t i32Fd;
    /** The start offset into the file for the request. */
    uint64_t u64OffStart;
    /** Buffer pointer or pointer to io vector array depending on opcode. */
    uint64_t u64AddrBufIoVec;
    /** Size of the buffer in bytes or number of io vectors. */
    uint32_t u32BufIoVecSz;
    /** Opcode dependent data. */
    union
    {
        /** Flags for read/write requests. */
        uint32_t u32KrnlRwFlags;
        /** Flags for fsync() like requests, see LNX_IOURING_OPC_FSYNC_*. */
        uint32_t u32FsyncFlags;
        /** Flags for poll() like requests. */
        uint16_t u16PollFlags;
        /** Flags for sync_file_range() like requests. */
        uint32_t u32SyncFileRangeFlags;
        /** Flags for requests requiring a msg structure. */
        uint32_t u32MsgFlags;
    } uOpc;
    /** Opaque user data associated with the request and returned during completion. */
    uint64_t u64User;
    /** Request type dependent data. */
    union
    {
        /** Fixed buffer index if indicated by the request flags. */
        uint16_t u16FixedBufIdx;
        /** Padding to align the structure to 64 bytes. */
        uint64_t au64Padding[3];
    } uReq;
} LNXIOURINGSQE;
AssertCompileSize(LNXIOURINGSQE, 64);
/** Pointer to a Linux io_uring submission queue entry. */
typedef LNXIOURINGSQE *PLNXIOURINGSQE;
/** Pointer to a constant Linux io_uring submission queue entry. */
typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
166
167
/**
 * Linux io_uring SQ ring offsets structure.
 *
 * Filled in by io_uring_setup(); the members are byte offsets into the
 * mmap()ed SQ ring region used to locate the individual ring fields
 * (see rtIoQueueLnxIoURingFileProv_QueueInit()).
 */
typedef struct LNXIOURINGSQ
{
    /** Offset of the head counter (position the kernel starts processing from). */
    uint32_t u32OffHead;
    /** Offset of the tail counter (position where new requests are filled in). */
    uint32_t u32OffTail;
    /** Offset of the mask applied to the head and tail counters to retrieve the index. */
    uint32_t u32OffRingMask;
    /** Offset of the number of entries in the SQ ring. */
    uint32_t u32OffRingEntries;
    /** Offset of the flags set asynchronously by the kernel. */
    uint32_t u32OffFlags;
    /** Offset of the counter of dropped requests. */
    uint32_t u32OffDroppedReqs;
    /** Offset where to find the array of SQ entries. */
    uint32_t u32OffArray;
    /** Reserved. */
    uint32_t u32Rsvd0;
    /** Reserved. */
    uint64_t u64Rsvd1;
} LNXIOURINGSQ;
AssertCompileSize(LNXIOURINGSQ, 40);
/** Pointer to a Linux io_uring SQ ring header. */
typedef LNXIOURINGSQ *PLNXIOURINGSQ;
/** Pointer to a constant Linux io_uring SQ ring header. */
typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
197
198
/**
 * Linux io_uring CQ ring offsets structure.
 *
 * Filled in by io_uring_setup(); the members are byte offsets into the
 * mmap()ed CQ ring region used to locate the individual ring fields
 * (see rtIoQueueLnxIoURingFileProv_QueueInit()).
 */
typedef struct LNXIOURINGCQ
{
    /** Offset of the head counter (position completion events are read from). */
    uint32_t u32OffHead;
    /** Offset of the tail counter (position the kernel writes new completion events to). */
    uint32_t u32OffTail;
    /** Offset of the mask applied to the head and tail counters to retrieve the index. */
    uint32_t u32OffRingMask;
    /** Offset of the number of entries in the CQ ring. */
    uint32_t u32OffRingEntries;
    /** Offset of the CQ overflow counter. */
    uint32_t u32OffOverflowCnt;
    /** Offset where to find the array of CQ entries. */
    uint32_t u32OffCqes;
    /** Reserved. */
    uint64_t au64Rsvd0[2];
} LNXIOURINGCQ;
AssertCompileSize(LNXIOURINGCQ, 40);
/** Pointer to a Linux io_uring CQ ring header. */
typedef LNXIOURINGCQ *PLNXIOURINGCQ;
/** Pointer to a constant Linux io_uring CQ ring header. */
typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
224
225
/**
 * Linux io_uring parameters passed to io_uring_setup().
 */
typedef struct LNXIOURINGPARAMS
{
    /** Number of SQ entries requested, must be power of 2. */
    uint32_t u32SqEntriesCnt;
    /** Number of CQ entries requested, must be power of 2. */
    uint32_t u32CqEntriesCnt;
    /** Flags for the ring, see LNX_IOURING_SETUP_F_*. */
    uint32_t u32Flags;
    /** Affinity of the kernel side SQ polling thread if enabled. */
    uint32_t u32SqPollCpu;
    /** Milliseconds after the kernel side SQ polling thread goes to sleep
     * if there are no requests to process. */
    uint32_t u32SqPollIdleMs;
    /** Reserved. */
    uint32_t au32Rsvd0[5];
    /** Offsets returned for the submission queue. */
    LNXIOURINGSQ SqOffsets;
    /** Offsets returned for the completion queue. */
    LNXIOURINGCQ CqOffsets;
} LNXIOURINGPARAMS;
/** Pointer to Linux io_uring parameters. */
typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
/** Pointer to constant Linux io_uring parameters. */
typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
253
254
/** @name LNXIOURINGSQE::u8Opc defined opcodes.
 * @{ */
/** Opcode to profile the interface, does nothing. */
#define LNX_IOURING_OPC_NOP 0
/** preadv() like request. */
#define LNX_IOURING_OPC_READV 1
/** pwritev() like request. */
#define LNX_IOURING_OPC_WRITEV 2
/** fsync() like request. */
#define LNX_IOURING_OPC_FSYNC 3
/** Read request using a fixed preset buffer. */
#define LNX_IOURING_OPC_READ_FIXED 4
/** Write request using a fixed preset buffer. */
#define LNX_IOURING_OPC_WRITE_FIXED 5
/** Add file descriptor to pollset. */
#define LNX_IOURING_OPC_POLL_ADD 6
/** Remove file descriptor from pollset. */
#define LNX_IOURING_OPC_POLL_REMOVE 7
/** sync_file_range() like request. */
#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
/** sendmsg() like request. */
#define LNX_IOURING_OPC_SENDMSG 9
/** recvmsg() like request. */
#define LNX_IOURING_OPC_RECVMSG 10
/** @} */


/** @name Additional flags for LNX_IOURING_OPC_FSYNC requests (LNXIOURINGSQE::uOpc::u32FsyncFlags).
 * @{ */
/** Sync user data as well instead of metadata only (fdatasync() semantics). */
#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0)
/** @} */


/** @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall (LNXIOURINGPARAMS::u32Flags).
 * @{ */
/** The I/O context is polled. */
#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0)
/** The kernel should poll the submission queue. */
#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1)
/** Sets the CPU affinity of the kernel thread polling the submission queue. */
#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2)
/** @} */


/** @name Flags for LNXIOURINGSQE::u8Flags.
 * @{ */
/** The file descriptor was registered before use. */
#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0)
/** Complete all active requests before issuing the request with the flag set. */
#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1)
/** Links the request with the flag set to the next one. */
#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2)
/** @} */


/** @name Magic mmap offsets to map submission and completion queues.
 * @{ */
/** Used to map the submission queue. */
#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0)
/** Used to map the completion queue. */
#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000)
/** Used to map the submission queue entries array. */
#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000)
/** @} */


/** @name Flags used for the SQ ring structure.
 * @{ */
/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0)
/** @} */


/** @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
 * @{ */
/** Retrieve completion events for the completion queue. */
#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0)
/** Wakes the suspended kernel thread processing the requests. */
#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1)
/** @} */


/** @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
 * @{ */
/** Register a fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0
/** Unregisters a fixed set of buffers registered previously. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
/** Register a fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2
/** Unregisters a fixed set of files registered previously. */
#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3
/** Register an eventfd associated with the I/O ring. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4
/** Unregisters an eventfd registered previously. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
/** @} */
353
354
/**
 * SQ ring structure (resolved userspace view of the mapped SQ ring).
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUESQ
{
    /** Pointer to the head counter. */
    volatile uint32_t *pidxHead;
    /** Pointer to the tail counter. */
    volatile uint32_t *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t fRingMask;
    /** Number of entries in the ring. */
    uint32_t cEntries;
    /** Pointer to the global flags. */
    volatile uint32_t *pfFlags;
    /** Pointer to the indirection array used for indexing the real SQ entries. */
    volatile uint32_t *paidxSqes;
} RTIOQUEUESQ;
376
377
/**
 * CQ ring structure (resolved userspace view of the mapped CQ ring).
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUECQ
{
    /** Pointer to the head counter. */
    volatile uint32_t *pidxHead;
    /** Pointer to the tail counter. */
    volatile uint32_t *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t fRingMask;
    /** Number of entries in the ring. */
    uint32_t cEntries;
    /** Pointer to the completion entry ring. */
    volatile LNXIOURINGCQE *paCqes;
} RTIOQUEUECQ;
397
398
/**
 * Internal I/O queue provider instance data.
 */
typedef struct RTIOQUEUEPROVINT
{
    /** The io_uring file descriptor. */
    int iFdIoCtx;
    /** The eventfd file descriptor registered with the ring. */
    int iFdEvt;
    /** The submission queue. */
    RTIOQUEUESQ Sq;
    /** The currently uncommitted tail for the SQ. */
    uint32_t idxSqTail;
    /** Number of uncommitted SQEs. */
    uint32_t cSqesToCommit;
    /** The completion queue. */
    RTIOQUEUECQ Cq;
    /** Pointer to the mapped SQE entries. */
    PLNXIOURINGSQE paSqes;
    /** Pointer to the iovec structure used for non S/G requests (one entry per SQE slot). */
    struct iovec *paIoVecs;
    /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
    void *pvMMapSqRing;
    /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
    void *pvMMapCqRing;
    /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
    void *pvMMapSqes;
    /** Size of the mapped SQ ring, used for unmapping. */
    size_t cbMMapSqRing;
    /** Size of the mapped CQ ring, used for unmapping. */
    size_t cbMMapCqRing;
    /** Size of the mapped SQ entries array, used for unmapping. */
    size_t cbMMapSqes;
    /** Flag whether the waiter was woken up externally. */
    volatile bool fExtIntr;
} RTIOQUEUEPROVINT;
/** Pointer to the internal I/O queue provider instance data. */
typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
437
438
439/*********************************************************************************************************************************
440* Internal Functions *
441*********************************************************************************************************************************/
442
443/**
444 * Syscall wrapper for io_uring_setup().
445 *
446 * @returns IPRT status code.
447 * @param cEntries Number of entries for submission and completion queues.
448 * @param pParams Additional parameters for the I/O ring and updated return values
449 * on success.
450 * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
451 */
452DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
453{
454 int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
455 if (RT_UNLIKELY(rcLnx == -1))
456 return RTErrConvertFromErrno(errno);
457
458 *piFdIoCtx = rcLnx;
459 return VINF_SUCCESS;
460}
461
462
463/**
464 * Syscall wrapper for io_uring_enter().
465 *
466 * @returns IPRT status code.
467 * @param iFdIoCtx The I/O ring file descriptor.
468 * @param cToSubmit Maximum number of requests waiting for processing.
469 * @param cMinComplete Minimum number of completion events to accumulate before returning.
470 * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
471 */
472DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
473 uint32_t fFlags)
474{
475 int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
476 NULL, 0);
477 if (RT_UNLIKELY(rcLnx == -1))
478 return RTErrConvertFromErrno(errno);
479
480 return VINF_SUCCESS;
481}
482
483
484/**
485 * Syscall wrapper for io_uring_register().
486 *
487 * @returns IPRT status code.
488 * @param iFdIoCtx The I/O ring file descriptor.
489 * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
490 * @param pvArg Opaque arguments.
491 * @param cArgs Number of arguments.
492 */
493DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
494 uint32_t cArgs)
495{
496 int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
497 if (RT_UNLIKELY(rcLnx == -1))
498 return RTErrConvertFromErrno(errno);
499
500 return VINF_SUCCESS;
501}
502
503
504/**
505 * mmap() wrapper for the common bits and returning an IPRT status code.
506 *
507 * @returns IPRT status code.
508 * @param iFdIoCtx The I/O ring file descriptor.
509 * @param offMmap The mmap() offset.
510 * @param cbMmap How much to map.
511 * @param ppv Where to store the pointer to the mapping on success.
512 */
513DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
514{
515 void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
516 if (pv != MAP_FAILED)
517 {
518 *ppv = pv;
519 return VINF_SUCCESS;
520 }
521
522 return RTErrConvertFromErrno(errno);
523}
524
525
526/**
527 * eventfd2() syscall wrapper.
528 *
529 * @returns IPRT status code.
530 * @param uValInit The initial value of the maintained counter.
531 * @param fFlags Flags controlling the eventfd behavior.
532 * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
533 */
534DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
535{
536 int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
537 if (RT_UNLIKELY(rcLnx == -1))
538 return RTErrConvertFromErrno(errno);
539
540 *piFdEvt = rcLnx;
541 return VINF_SUCCESS;
542}
543
544
/**
 * Checks the completion event queue for pending events.
 *
 * Drains everything between the CQ head (our read position) and the CQ tail
 * (the kernel's write position), up to the capacity of the given array, and
 * publishes the new head back to the kernel afterwards.
 *
 * @returns nothing.
 * @param   pThis           The provider instance.
 * @param   paCEvt          Pointer to the array of completion events.
 * @param   cCEvt           Maximum number of completion events the array can hold.
 * @param   pcCEvtSeen      Where to store the number of completion events processed.
 */
static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
                                               uint32_t cCEvt, uint32_t *pcCEvtSeen)
{
    /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
    ASMReadFence();
    uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
    uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
    ASMReadFence();

    uint32_t cCEvtSeen = 0;

    while (   idxCqTail != idxCqHead
           && cCEvtSeen < cCEvt)
    {
        /* Get the index. */
        uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
        volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];

        paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
        if (pCqe->rcLnx >= 0)
        {
            /* A non-negative kernel status is the number of bytes transferred. */
            paCEvt->rcReq    = VINF_SUCCESS;
            paCEvt->cbXfered = (size_t)pCqe->rcLnx;
        }
        else
            paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);

#ifdef RT_STRICT /* poison */
        memset((void *)pCqe, 0xff, sizeof(*pCqe));
#endif

        paCEvt++;
        cCEvtSeen++;
        idxCqHead++;
    }

    *pcCEvtSeen = cCEvtSeen;

    /* Paranoia strikes again. */
    ASMWriteFence();
    ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
    ASMWriteFence();
}
597
598
599/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
600static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
601{
602 /*
603 * Try to create a simple I/O ring and close it again.
604 * The common code/public API already checked for the proper handle type.
605 */
606 int iFdIoCtx = 0;
607 bool fSupp = false;
608 LNXIOURINGPARAMS Params;
609 RT_ZERO(Params);
610
611 int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
612 if (RT_SUCCESS(rc))
613 {
614 /*
615 * Check that we can register an eventfd descriptor to get notified about
616 * completion events while being able to kick the waiter externally out of the wait.
617 */
618 int iFdEvt = 0;
619 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
620 if (RT_SUCCESS(rc))
621 {
622 rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
623 &iFdEvt, 1 /*cArgs*/);
624 if (RT_SUCCESS(rc))
625 fSupp = true;
626
627 int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
628 }
629 int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
630 }
631
632 return fSupp;
633}
634
635
636/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
637static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
638 uint32_t cSqEntries, uint32_t cCqEntries)
639{
640 RT_NOREF(fFlags, cCqEntries);
641
642 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
643 LNXIOURINGPARAMS Params;
644 RT_ZERO(Params);
645
646 pThis->cSqesToCommit = 0;
647 pThis->fExtIntr = false;
648
649 int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
650 if (RT_SUCCESS(rc))
651 {
652 /* Map the rings into userspace. */
653 pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
654 pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
655 pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
656
657 pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
658 if (RT_LIKELY(pThis->paIoVecs))
659 {
660 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
661 if (RT_SUCCESS(rc))
662 {
663 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
664 if (RT_SUCCESS(rc))
665 {
666 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
667 if (RT_SUCCESS(rc))
668 {
669 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
670 if (RT_SUCCESS(rc))
671 {
672 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
673 if (RT_SUCCESS(rc))
674 {
675 uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
676
677 pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
678 pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
679 pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
680 pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
681 pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
682 pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
683 pThis->idxSqTail = *pThis->Sq.pidxTail;
684
685 pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
686
687 pbTmp = (uint8_t *)pThis->pvMMapCqRing;
688
689 pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
690 pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
691 pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
692 pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
693 pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
694 return VINF_SUCCESS;
695 }
696
697 munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
698 }
699
700 munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
701 }
702
703 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
704 AssertRC(rc);
705 }
706
707 close(pThis->iFdEvt);
708 }
709
710 RTMemFree(pThis->paIoVecs);
711 }
712
713 int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
714 }
715
716 return rc;
717}
718
719
720/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
721static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
722{
723 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
724
725 int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
726 rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
727 rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
728
729 int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
730 AssertRC(rc);
731
732 close(pThis->iFdEvt);
733 close(pThis->iFdIoCtx);
734 RTMemFree(pThis->paIoVecs);
735
736 RT_ZERO(pThis);
737}
738
739
740/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
741static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
742{
743 RT_NOREF(hIoQueueProv, pHandle);
744 /** @todo Add support for fixed file sets later. */
745 return VINF_SUCCESS;
746}
747
748
749/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
750static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
751{
752 RT_NOREF(hIoQueueProv, pHandle);
753 /** @todo Add support for fixed file sets later. */
754 return VINF_SUCCESS;
755}
756
757
758/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
759static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
760 uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
761 void *pvUser)
762{
763 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
764 RT_NOREF(fReqFlags);
765
766 uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
767 PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
768 struct iovec *pIoVec = &pThis->paIoVecs[idx];
769
770 pIoVec->iov_base = pvBuf;
771 pIoVec->iov_len = cbBuf;
772
773 pSqe->u8Flags = 0;
774 pSqe->u16IoPrio = 0;
775 pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile);
776 pSqe->u64OffStart = off;
777 pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
778 pSqe->u32BufIoVecSz = 1;
779 pSqe->u64User = (uint64_t)(uintptr_t)pvUser;
780
781 switch (enmOp)
782 {
783 case RTIOQUEUEOP_READ:
784 pSqe->u8Opc = LNX_IOURING_OPC_READV;
785 pSqe->uOpc.u32KrnlRwFlags = 0;
786 break;
787 case RTIOQUEUEOP_WRITE:
788 pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
789 pSqe->uOpc.u32KrnlRwFlags = 0;
790 break;
791 case RTIOQUEUEOP_SYNC:
792 pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
793 pSqe->uOpc.u32FsyncFlags = 0;
794 break;
795 default:
796 AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
797 VERR_INVALID_PARAMETER);
798 }
799
800 pThis->Sq.paidxSqes[idx] = idx;
801 pThis->idxSqTail++;
802 pThis->cSqesToCommit++;
803 return VINF_SUCCESS;
804}
805
806
807/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
808static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
809{
810 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
811
812 ASMWriteFence();
813 ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
814 ASMWriteFence();
815
816 int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
817 if (RT_SUCCESS(rc))
818 {
819 *pcReqsCommitted = pThis->cSqesToCommit;
820 pThis->cSqesToCommit = 0;
821 }
822
823 return rc;
824}
825
826
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait}
 *
 * Harvests completion events, blocking on the registered eventfd until at
 * least cMinWait events were collected or the wait got interrupted via
 * pfnEvtWaitWakeup (in which case VERR_INTERRUPTED is returned alongside
 * whatever events were gathered so far).
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    int rc = VINF_SUCCESS;
    uint32_t cCEvtSeen = 0;

    RT_NOREF(fFlags);

    /*
     * Check the completion queue first for any completed events which might save us a
     * context switch later on.
     */
    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);

    while (   cCEvtSeen < cMinWait
           && RT_SUCCESS(rc))
    {
        /*
         * We can employ a blocking read on the event file descriptor, it will return
         * either when woken up externally or when there are completion events pending.
         */
        uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
        ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
        if (rcLnx == sizeof(uCnt))
        {
            uint32_t cCEvtThisSeen = 0;
            rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
            cCEvtSeen += cCEvtThisSeen;

            /* Whether we got woken up externally; VERR_INTERRUPTED also terminates the loop. */
            if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
                rc = VERR_INTERRUPTED;
        }
        else if (rcLnx == -1)
            rc = RTErrConvertFromErrno(errno);
        else
            AssertMsgFailed(("Unexpected read() -> 0\n"));
    }

    *pcCEvt = cCEvtSeen;
    return rc;
}
871
872
873/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
874static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
875{
876 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
877 int rc = VINF_SUCCESS;
878
879 if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
880 {
881 const uint64_t uValAdd = 1;
882 ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
883
884 Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
885 if (rcLnx == -1)
886 rc = RTErrConvertFromErrno(errno);
887 }
888
889 return rc;
890}
891
892
/**
 * Async file I/O queue provider virtual method table (Linux io_uring variant).
 */
RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
{
    /** uVersion */
    RTIOQUEUEPROVVTABLE_VERSION,
    /** pszId */
    "LnxIoURingFile",
    /** cbIoQueueProv */
    sizeof(RTIOQUEUEPROVINT),
    /** enmHnd */
    RTHANDLETYPE_FILE,
    /** fFlags */
    0,
    /** pfnIsSupported */
    rtIoQueueLnxIoURingFileProv_IsSupported,
    /** pfnQueueInit  */
    rtIoQueueLnxIoURingFileProv_QueueInit,
    /** pfnQueueDestroy */
    rtIoQueueLnxIoURingFileProv_QueueDestroy,
    /** pfnHandleRegister */
    rtIoQueueLnxIoURingFileProv_HandleRegister,
    /** pfnHandleDeregister */
    rtIoQueueLnxIoURingFileProv_HandleDeregister,
    /** pfnReqPrepare */
    rtIoQueueLnxIoURingFileProv_ReqPrepare,
    /** pfnReqPrepareSg - scatter/gather requests not implemented yet. */
    NULL,
    /** pfnCommit */
    rtIoQueueLnxIoURingFileProv_Commit,
    /** pfnEvtWait */
    rtIoQueueLnxIoURingFileProv_EvtWait,
    /** pfnEvtWaitWakeup */
    rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
    /** uEndMarker */
    RTIOQUEUEPROVVTABLE_VERSION
};
931
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette