VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp@ 84333

Last change on this file since 84333 was 84333, checked in by vboxsync, 5 years ago

Runtime/ioqueue-iouringfile-provider: Doxygen

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 34.8 KB
Line 
1/* $Id: ioqueue-iouringfile-provider.cpp 84333 2020-05-18 15:46:46Z vboxsync $ */
2/** @file
3 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
4 */
5
6/*
7 * Copyright (C) 2019-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
28 * @internal
29 *
30 * The io_uring interface is the most recent interface added to the Linux kernel
31 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
32 * thus not available on most systems as of writing this backend (July 2019).
33 * It supersedes the old async I/O interface and cleans up with some restrictions like
34 * having to disable caching for the file.
35 * The interface is centered around a submission and completion queue to queue multiple new
36 * requests for the kernel to process and get notified about completions to reduce the amount
37 * of context switches to an absolute minimum. It also offers advanced features like
38 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
39 * even more.
40 *
41 * The first implementation will only make use of the basic features and more advanced features
42 * will be added later.
43 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
44 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
45 * while still keeping a consistent platform independent API which allows efficient implementations on
46 * other hosts when they come up.
47 *
48 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
49 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
51 * * http://kernel.dk/io_uring.pdf
52 * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
53 */
54
55
56/*********************************************************************************************************************************
57* Header Files *
58*********************************************************************************************************************************/
59#define LOG_GROUP RTLOGGROUP_IOQUEUE
60#include <iprt/ioqueue.h>
61
62#include <iprt/assertcompile.h>
63#include <iprt/asm.h>
64#include <iprt/errcore.h>
65#include <iprt/file.h>
66#include <iprt/log.h>
67#include <iprt/mem.h>
68#include <iprt/string.h>
69
70#include <errno.h>
71#include <unistd.h>
72#include <signal.h>
73#include <sys/mman.h>
74#include <sys/syscall.h>
75#include <sys/uio.h>
76
77#include "internal/ioqueue.h"
78
79
80/*********************************************************************************************************************************
81* Defined Constants And Macros *
82*********************************************************************************************************************************/
83
84/** The syscall number of io_uring_setup(). */
85#define LNX_IOURING_SYSCALL_SETUP 425
86/** The syscall number of io_uring_enter(). */
87#define LNX_IOURING_SYSCALL_ENTER 426
88/** The syscall number of io_uring_register(). */
89#define LNX_IOURING_SYSCALL_REGISTER 427
90/** eventfd2() syscall not associated with io_uring but used for kicking waiters. */
91#define LNX_SYSCALL_EVENTFD2 19
92
93
94/*********************************************************************************************************************************
95* Structures and Typedefs *
96*********************************************************************************************************************************/
97
/**
 * Linux io_uring completion event.
 */
typedef struct LNXIOURINGCQE
{
    /** Opaque user data associated with the completed request. */
    uint64_t                    u64User;
    /** The status code of the request (negative errno value on failure,
     * number of bytes transferred otherwise). */
    int32_t                     rcLnx;
    /** Some flags which are not used as of now. */
    uint32_t                    fFlags;
} LNXIOURINGCQE;
AssertCompileSize(LNXIOURINGCQE, 16);
/** Pointer to a Linux io_uring completion event. */
typedef LNXIOURINGCQE *PLNXIOURINGCQE;
/** Pointer to a constant Linux io_uring completion event. */
typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
115
116
/**
 * Linux io_uring submission queue entry.
 */
typedef struct LNXIOURINGSQE
{
    /** The opcode for the request, see LNX_IOURING_OPC_XXX. */
    uint8_t                     u8Opc;
    /** Common flags for the request, see LNX_IOURING_SQE_F_XXX. */
    uint8_t                     u8Flags;
    /** Assigned I/O priority. */
    uint16_t                    u16IoPrio;
    /** The file descriptor the request is for. */
    int32_t                     i32Fd;
    /** The start offset into the file for the request. */
    uint64_t                    u64OffStart;
    /** Buffer pointer or Pointer to io vector array depending on opcode. */
    uint64_t                    u64AddrBufIoVec;
    /** Size of the buffer in bytes or number of io vectors. */
    uint32_t                    u32BufIoVecSz;
    /** Opcode dependent data. */
    union
    {
        /** Flags for read/write requests. */
        uint32_t                u32KrnlRwFlags;
        /** Flags for fsync() like requests. */
        uint32_t                u32FsyncFlags;
        /** Flags for poll() like requests. */
        uint16_t                u16PollFlags;
        /** Flags for sync_file_range() like requests. */
        uint32_t                u32SyncFileRangeFlags;
        /** Flags for requests requiring a msg structure. */
        uint32_t                u32MsgFlags;
    } uOpc;
    /** Opaque user data associated with the request and returned during completion. */
    uint64_t                    u64User;
    /** Request type dependent data. */
    union
    {
        /** Fixed buffer index if indicated by the request flags. */
        uint16_t                u16FixedBufIdx;
        /** Padding to align the structure to 64 bytes. */
        uint64_t                au64Padding[3];
    } uReq;
} LNXIOURINGSQE;
AssertCompileSize(LNXIOURINGSQE, 64);
/** Pointer to a Linux io_uring submission queue entry. */
typedef LNXIOURINGSQE *PLNXIOURINGSQE;
/** Pointer to a constant Linux io_uring submission queue entry. */
typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
166
167
/**
 * Linux io_uring SQ ring header structure to maintain the queue.
 *
 * @note The members hold byte offsets into the mmap()'ed SQ ring region where
 *       the respective counter/value lives (see how pfnQueueInit resolves them
 *       against the mapping base).
 */
typedef struct LNXIOURINGSQ
{
    /** Offset of the head counter (the kernel consumes submitted entries starting here). */
    uint32_t                    u32OffHead;
    /** Offset of the tail counter (userland appends new requests here, see pfnCommit). */
    uint32_t                    u32OffTail;
    /** Offset of the mask for the head and tail counters to apply to retrieve the index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the SQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the flags set asynchronously by the kernel. */
    uint32_t                    u32OffFlags;
    /** Offset of the counter of dropped requests. */
    uint32_t                    u32OffDroppedReqs;
    /** Offset where to find the array of SQ entries. */
    uint32_t                    u32OffArray;
    /** Reserved. */
    uint32_t                    u32Rsvd0;
    /** Reserved. */
    uint64_t                    u64Rsvd1;
} LNXIOURINGSQ;
AssertCompileSize(LNXIOURINGSQ, 40);
/** Pointer to a Linux io_uring SQ ring header. */
typedef LNXIOURINGSQ *PLNXIOURINGSQ;
/** Pointer to a constant Linux io_uring SQ ring header. */
typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
197
198
/**
 * Linux io_uring CQ ring header structure to maintain the queue.
 *
 * @note The members hold byte offsets into the mmap()'ed CQ ring region where
 *       the respective counter/value lives (see pfnQueueInit).
 */
typedef struct LNXIOURINGCQ
{
    /** Offset of the head counter (userland consumes completion events starting here,
     * see rtIoQueueLnxIoURingFileProvCqCheck). */
    uint32_t                    u32OffHead;
    /** Offset of the tail counter (the kernel posts new completion events here). */
    uint32_t                    u32OffTail;
    /** Offset of the mask for the head and tail counters to apply to retrieve the index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the CQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the counter of CQ overflows which happened. */
    uint32_t                    u32OffOverflowCnt;
    /** Offset where the array of completion events (CQEs) starts. */
    uint32_t                    u32OffCqes;
    /** Reserved. */
    uint64_t                    au64Rsvd0[2];
} LNXIOURINGCQ;
AssertCompileSize(LNXIOURINGCQ, 40);
/** Pointer to a Linux io_uring CQ ring header. */
typedef LNXIOURINGCQ *PLNXIOURINGCQ;
/** Pointer to a constant Linux io_uring CQ ring header. */
typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
224
225
/**
 * Linux io_uring parameters passed to io_uring_setup().
 */
typedef struct LNXIOURINGPARAMS
{
    /** Number of SQ entries requested, must be power of 2. */
    uint32_t                    u32SqEntriesCnt;
    /** Number of CQ entries requested, must be power of 2. */
    uint32_t                    u32CqEntriesCnt;
    /** Flags for the ring, see LNX_IOURING_SETUP_F_XXX. */
    uint32_t                    u32Flags;
    /** Affinity of the kernel side SQ polling thread if enabled. */
    uint32_t                    u32SqPollCpu;
    /** Milliseconds after the kernel side SQ polling thread goes to sleep
     * if there are no requests to process. */
    uint32_t                    u32SqPollIdleMs;
    /** Reserved. */
    uint32_t                    au32Rsvd0[5];
    /** Offsets returned for the submission queue. */
    LNXIOURINGSQ                SqOffsets;
    /** Offsets returned for the completion queue. */
    LNXIOURINGCQ                CqOffsets;
} LNXIOURINGPARAMS;
/** Pointer to Linux io_uring parameters. */
typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
/** Pointer to constant Linux io_uring parameters. */
typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
253
254
255/** @name LNXIOURINGSQE::u8Opc defined opcodes.
256 * @{ */
257/** Opcode to profile the interface, does nothing. */
258#define LNX_IOURING_OPC_NOP 0
259/** preadv() like request. */
260#define LNX_IOURING_OPC_READV 1
261/** pwritev() like request. */
262#define LNX_IOURING_OPC_WRITEV 2
263/** fsync() like request. */
264#define LNX_IOURING_OPC_FSYNC 3
265/** Read request using a fixed preset buffer. */
266#define LNX_IOURING_OPC_READ_FIXED 4
267/** Write request using a fixed preset buffer. */
268#define LNX_IOURING_OPC_WRITE_FIXED 5
269/** Add file descriptor to pollset. */
270#define LNX_IOURING_OPC_POLL_ADD 6
271/** Remove file descriptor from pollset. */
272#define LNX_IOURING_OPC_POLL_REMOVE 7
273/** sync_file_range() like request. */
274#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
275/** sendmsg() like request. */
276#define LNX_IOURING_OPC_SENDMSG 9
277/** recvmsg() like request. */
278#define LNX_IOURING_OPC_RECVMSG 10
279/** @} */
280
281
282/** @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
283 * @{ */
284/** Sync userdata as well instead of metadata only. */
285#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0)
286/** @} */
287
288
289/** @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
290 * @{ */
291/** The I/O context is polled. */
292#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0)
293/** The kernel should poll the submission queue. */
294#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1)
295/** Sets the CPU affinity of the kernel thread polling the submission queue. */
296#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2)
297/** @} */
298
299
300/** @name Flags for LNXIOURINGSQE::u8Flags.
301 * @{ */
302/** The file descriptor was registered before use. */
303#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0)
304/** Complete all active requests before issuing the request with the flag set. */
305#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1)
306/** Links the request with the flag set to the next one. */
307#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2)
308/** @} */
309
310
311/** @name Magic mmap offsets to map submission and completion queues.
312 * @{ */
313/** Used to map the submission queue. */
314#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0)
315/** Used to map the completion queue. */
316#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000)
317/** Used to map the submission queue entries array. */
318#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000)
319/** @} */
320
321
322/** @name Flags used for the SQ ring structure.
323 * @{ */
324/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
325#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0)
326/** @} */
327
328
329/** @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
330 * @{ */
331/** Retrieve completion events for the completion queue. */
332#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0)
333/** Wakes the suspended kernel thread processing the requests. */
334#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1)
335/** @} */
336
337
338/** @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
339 * @{ */
340/** Register a fixed set of buffers. */
341#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0
342/** Unregisters a fixed set of buffers registered previously. */
343#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
344/** Register a fixed set of files. */
345#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2
346/** Unregisters a fixed set of files registered previously. */
347#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3
348/** Register an eventfd associated with the I/O ring. */
349#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4
350/** Unregisters an eventfd registered previously. */
351#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
352/** @} */
353
354
/**
 * SQ ring structure.
 *
 * @note Some members of this structure point to memory shared with the kernel
 *       (the mmap()'ed SQ ring), hence the volatile keyword.
 */
typedef struct RTIOQUEUESQ
{
    /** Pointer to the head counter (advanced by the kernel as it consumes entries). */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter (advanced by us when committing new entries). */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Pointer to the global flags. */
    volatile uint32_t           *pfFlags;
    /** Pointer to the indirection array used for indexing the real SQ entries. */
    volatile uint32_t           *paidxSqes;
} RTIOQUEUESQ;
376
377
/**
 * CQ ring structure.
 *
 * @note Some members of this structure point to memory shared with the kernel
 *       (the mmap()'ed CQ ring), hence the volatile keyword.
 */
typedef struct RTIOQUEUECQ
{
    /** Pointer to the head counter (advanced by us as completion events are consumed). */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter (advanced by the kernel when posting events). */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Pointer to the completion entry ring. */
    volatile LNXIOURINGCQE      *paCqes;
} RTIOQUEUECQ;
397
398
/**
 * Internal I/O queue provider instance data.
 */
typedef struct RTIOQUEUEPROVINT
{
    /** The io_uring file descriptor. */
    int                         iFdIoCtx;
    /** The eventfd file descriptor registered with the ring. */
    int                         iFdEvt;
    /** The submission queue. */
    RTIOQUEUESQ                 Sq;
    /** The currently uncommitted tail for the SQ. */
    uint32_t                    idxSqTail;
    /** Number of uncommitted SQEs. */
    uint32_t                    cSqesToCommit;
    /** The completion queue. */
    RTIOQUEUECQ                 Cq;
    /** Pointer to the mapped SQES entries. */
    PLNXIOURINGSQE              paSqes;
    /** Pointer to the iovec structure used for non S/G requests
     * (one iovec per SQE slot, see pfnReqPrepare). */
    struct iovec                *paIoVecs;
    /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
    void                        *pvMMapSqRing;
    /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
    void                        *pvMMapCqRing;
    /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
    void                        *pvMMapSqes;
    /** Size of the mapped SQ ring, used for unmapping. */
    size_t                      cbMMapSqRing;
    /** Size of the mapped CQ ring, used for unmapping. */
    size_t                      cbMMapCqRing;
    /** Size of the mapped SQ entries array, used for unmapping. */
    size_t                      cbMMapSqes;
    /** Flag whether the waiter was woken up externally. */
    volatile bool               fExtIntr;
} RTIOQUEUEPROVINT;
/** Pointer to the internal I/O queue provider instance data. */
typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
437
438
439/*********************************************************************************************************************************
440* Internal Functions *
441*********************************************************************************************************************************/
442
443/**
444 * Syscall wrapper for io_uring_setup().
445 *
446 * @returns IPRT status code.
447 * @param cEntries Number of entries for submission and completion queues.
448 * @param pParams Additional parameters for the I/O ring and updated return values
449 * on success.
450 * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
451 */
452DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
453{
454 int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
455 if (RT_UNLIKELY(rcLnx == -1))
456 return RTErrConvertFromErrno(errno);
457
458 *piFdIoCtx = rcLnx;
459 return VINF_SUCCESS;
460}
461
462
463/**
464 * Syscall wrapper for io_uring_enter().
465 *
466 * @returns IPRT status code.
467 * @param iFdIoCtx The I/O ring file descriptor.
468 * @param cToSubmit Maximum number of requests waiting for processing.
469 * @param cMinComplete Minimum number of completion events to accumulate before returning.
470 * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
471 */
472DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
473 uint32_t fFlags)
474{
475 int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
476 NULL, 0);
477 if (RT_UNLIKELY(rcLnx == -1))
478 return RTErrConvertFromErrno(errno);
479
480 return VINF_SUCCESS;
481}
482
483
484/**
485 * Syscall wrapper for io_uring_register().
486 *
487 * @returns IPRT status code.
488 * @param iFdIoCtx The I/O ring file descriptor.
489 * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
490 * @param pvArg Opaque arguments.
491 * @param cArgs Number of arguments.
492 */
493DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
494 uint32_t cArgs)
495{
496 int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
497 if (RT_UNLIKELY(rcLnx == -1))
498 return RTErrConvertFromErrno(errno);
499
500 return VINF_SUCCESS;
501}
502
503
504/**
505 * mmap() wrapper for the common bits and returning an IPRT status code.
506 *
507 * @returns IPRT status code.
508 * @param iFdIoCtx The I/O ring file descriptor.
509 * @param offMmap The mmap() offset.
510 * @param cbMmap How much to map.
511 * @param ppv Where to store the pointer to the mapping on success.
512 */
513DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
514{
515 void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
516 if (pv != MAP_FAILED)
517 {
518 *ppv = pv;
519 return VINF_SUCCESS;
520 }
521
522 return RTErrConvertFromErrno(errno);
523}
524
525
526/**
527 * eventfd2() syscall wrapper.
528 *
529 * @returns IPRT status code.
530 * @param uValInit The initial value of the maintained counter.
531 * @param fFlags Flags controlling the eventfd behavior.
532 * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
533 */
534DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
535{
536 int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
537 if (RT_UNLIKELY(rcLnx == -1))
538 return RTErrConvertFromErrno(errno);
539
540 *piFdEvt = rcLnx;
541 return VINF_SUCCESS;
542}
543
544
545/**
546 * Checks the completion event queue for pending events.
547 *
548 * @returns nothing.
549 * @param pThis The provider instance.
550 * @param paCEvt Pointer to the array of completion events.
551 * @param cCEvt Maximum number of completion events the array can hold.
552 * @param pcCEvtSeen Where to store the number of completion events processed.
553 */
554static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
555 uint32_t cCEvt, uint32_t *pcCEvtSeen)
556{
557 /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
558 ASMReadFence();
559 uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
560 uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
561 ASMReadFence();
562
563 uint32_t cCEvtSeen = 0;
564
565 while ( idxCqTail != idxCqHead
566 && cCEvtSeen < cCEvt)
567 {
568 /* Get the index. */
569 uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
570 volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];
571
572 paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
573 if (pCqe->rcLnx >= 0)
574 {
575 paCEvt->rcReq = VINF_SUCCESS;
576 paCEvt->cbXfered = (size_t)pCqe->rcLnx;
577 }
578 else
579 paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);
580
581 paCEvt++;
582 cCEvtSeen++;
583 idxCqHead++;
584 }
585
586 *pcCEvtSeen = cCEvtSeen;
587
588 /* Paranoia strikes again. */
589 ASMWriteFence();
590 ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
591 ASMWriteFence();
592}
593
594
595/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
596static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
597{
598 /*
599 * Try to create a simple I/O ring and close it again.
600 * The common code/public API already checked for the proper handle type.
601 */
602 int iFdIoCtx = 0;
603 bool fSupp = false;
604 LNXIOURINGPARAMS Params;
605 RT_ZERO(Params);
606
607 int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
608 if (RT_SUCCESS(rc))
609 {
610 /*
611 * Check that we can register an eventfd descriptor to get notified about
612 * completion events while being able to kick the waiter externally out of the wait.
613 */
614 int iFdEvt = 0;
615 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
616 if (RT_SUCCESS(rc))
617 {
618 rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
619 &iFdEvt, 1 /*cArgs*/);
620 if (RT_SUCCESS(rc))
621 fSupp = true;
622
623 int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
624 }
625 int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
626 }
627
628 return fSupp;
629}
630
631
632/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
633static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
634 uint32_t cSqEntries, uint32_t cCqEntries)
635{
636 RT_NOREF(fFlags, cCqEntries);
637
638 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
639 LNXIOURINGPARAMS Params;
640 RT_ZERO(Params);
641
642 pThis->cSqesToCommit = 0;
643 pThis->fExtIntr = false;
644
645 int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
646 if (RT_SUCCESS(rc))
647 {
648 /* Map the rings into userspace. */
649 pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
650 pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
651 pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
652
653 pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
654 if (RT_LIKELY(pThis->paIoVecs))
655 {
656 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
657 if (RT_SUCCESS(rc))
658 {
659 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
660 if (RT_SUCCESS(rc))
661 {
662 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
663 if (RT_SUCCESS(rc))
664 {
665 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
666 if (RT_SUCCESS(rc))
667 {
668 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
669 if (RT_SUCCESS(rc))
670 {
671 uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
672
673 pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
674 pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
675 pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
676 pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
677 pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
678 pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
679 pThis->idxSqTail = *pThis->Sq.pidxTail;
680
681 pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
682
683 pbTmp = (uint8_t *)pThis->pvMMapCqRing;
684
685 pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
686 pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
687 pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
688 pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
689 pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
690 return VINF_SUCCESS;
691 }
692
693 munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
694 }
695
696 munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
697 }
698
699 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
700 AssertRC(rc);
701 }
702
703 close(pThis->iFdEvt);
704 }
705
706 RTMemFree(pThis->paIoVecs);
707 }
708
709 int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
710 }
711
712 return rc;
713}
714
715
716/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
717static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
718{
719 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
720
721 int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
722 rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
723 rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
724
725 int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
726 AssertRC(rc);
727
728 close(pThis->iFdEvt);
729 close(pThis->iFdIoCtx);
730 RTMemFree(pThis->paIoVecs);
731
732 RT_ZERO(pThis);
733}
734
735
736/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
737static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
738{
739 RT_NOREF(hIoQueueProv, pHandle);
740 /** @todo Add support for fixed file sets later. */
741 return VINF_SUCCESS;
742}
743
744
745/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
746static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
747{
748 RT_NOREF(hIoQueueProv, pHandle);
749 /** @todo Add support for fixed file sets later. */
750 return VINF_SUCCESS;
751}
752
753
754/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
755static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
756 uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
757 void *pvUser)
758{
759 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
760 RT_NOREF(fReqFlags);
761
762 uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
763 PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
764 struct iovec *pIoVec = &pThis->paIoVecs[idx];
765
766 pIoVec->iov_base = pvBuf;
767 pIoVec->iov_len = cbBuf;
768
769 pSqe->u8Flags = 0;
770 pSqe->u16IoPrio = 0;
771 pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile);
772 pSqe->u64OffStart = off;
773 pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
774 pSqe->u64User = (uint64_t)(uintptr_t)pvUser;
775
776 switch (enmOp)
777 {
778 case RTIOQUEUEOP_READ:
779 pSqe->u8Opc = LNX_IOURING_OPC_READV;
780 pSqe->uOpc.u32KrnlRwFlags = 0;
781 break;
782 case RTIOQUEUEOP_WRITE:
783 pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
784 pSqe->uOpc.u32KrnlRwFlags = 0;
785 break;
786 case RTIOQUEUEOP_SYNC:
787 pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
788 pSqe->uOpc.u32FsyncFlags = 0;
789 break;
790 default:
791 AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
792 VERR_INVALID_PARAMETER);
793 }
794
795 pThis->idxSqTail++;
796 pThis->cSqesToCommit++;
797 return VINF_SUCCESS;
798}
799
800
801/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
802static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
803{
804 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
805 RT_NOREF(pThis, pcReqsCommitted);
806
807 ASMWriteFence();
808 ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
809 ASMWriteFence();
810
811 int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
812 if (RT_SUCCESS(rc))
813 {
814 *pcReqsCommitted = pThis->cSqesToCommit;
815 pThis->cSqesToCommit = 0;
816 }
817
818 return rc;
819}
820
821
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait}
 *
 * Harvests completion events, blocking on the registered eventfd until at
 * least cMinWait events were seen, the wait failed or it was interrupted via
 * pfnEvtWaitWakeup (VERR_INTERRUPTED).
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    int rc = VINF_SUCCESS;
    uint32_t cCEvtSeen = 0;

    RT_NOREF(fFlags);

    /*
     * Check the completion queue first for any completed events which might save us a
     * context switch later on.
     */
    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);

    while (   cCEvtSeen < cMinWait
           && RT_SUCCESS(rc))
    {
        /*
         * We can employ a blocking read on the event file descriptor, it will return
         * either when woken up externally or when there are completion events pending.
         */
        uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
        ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
        if (rcLnx == sizeof(uCnt))
        {
            /* Harvest whatever arrived, appending after the events already seen. */
            uint32_t cCEvtThisSeen = 0;
            rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
            cCEvtSeen += cCEvtThisSeen;

            /* Whether we got woken up externally (flag is consumed/reset by the exchange). */
            if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
                rc = VERR_INTERRUPTED;
        }
        else if (rcLnx == -1)
            rc = RTErrConvertFromErrno(errno);
        else
            AssertMsgFailed(("Unexpected read() -> 0\n"));
    }

    *pcCEvt = cCEvtSeen;
    return rc;
}
866
867
868/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
869static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
870{
871 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
872 int rc = VINF_SUCCESS;
873
874 if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
875 {
876 const uint64_t uValAdd = 1;
877 ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
878
879 Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
880 if (rcLnx == -1)
881 rc = RTErrConvertFromErrno(errno);
882 }
883
884 return rc;
885}
886
887
/**
 * Async file I/O queue provider virtual method table (Linux io_uring backend).
 */
RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
{
    /** uVersion */
    RTIOQUEUEPROVVTABLE_VERSION,
    /** pszId */
    "LnxIoURingFile",
    /** cbIoQueueProv */
    sizeof(RTIOQUEUEPROVINT),
    /** enmHnd */
    RTHANDLETYPE_FILE,
    /** fFlags */
    0,
    /** pfnIsSupported */
    rtIoQueueLnxIoURingFileProv_IsSupported,
    /** pfnQueueInit */
    rtIoQueueLnxIoURingFileProv_QueueInit,
    /** pfnQueueDestroy */
    rtIoQueueLnxIoURingFileProv_QueueDestroy,
    /** pfnHandleRegister */
    rtIoQueueLnxIoURingFileProv_HandleRegister,
    /** pfnHandleDeregister */
    rtIoQueueLnxIoURingFileProv_HandleDeregister,
    /** pfnReqPrepare */
    rtIoQueueLnxIoURingFileProv_ReqPrepare,
    /** pfnReqPrepareSg - scatter/gather requests not implemented yet. */
    NULL,
    /** pfnCommit */
    rtIoQueueLnxIoURingFileProv_Commit,
    /** pfnEvtWait */
    rtIoQueueLnxIoURingFileProv_EvtWait,
    /** pfnEvtWaitWakeup */
    rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
    /** uEndMarker */
    RTIOQUEUEPROVVTABLE_VERSION
};
926
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette