VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp@ 81137

Last change on this file since 81137 was 79984, checked in by vboxsync, 5 years ago

Runtime/RTIoQueue: Updates [scm fix]

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 34.8 KB
Line 
1/* $Id: ioqueue-iouringfile-provider.cpp 79984 2019-07-25 17:25:41Z vboxsync $ */
2/** @file
3 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
4 */
5
6/*
7 * Copyright (C) 2019 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
28 * @internal
29 *
30 * The io_uring interface is the most recent interface added to the Linux kernel
31 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
32 * thus not available on most systems as of writing this backend (July 2019).
33 * It supersedes the old async I/O interface and cleans up with some restrictions like
34 * having to disable caching for the file.
35 * The interface is centered around a submission and completion queue to queue multiple new
36 * requests for the kernel to process and get notified about completions to reduce the amount
37 * of context switches to an absolute minimum. It also offers advanced features like
38 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
39 * even more.
40 *
41 * The first implementation will only make use of the basic features and more advanced features
42 * will be added later.
43 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
44 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
45 * while still keeping a consistent platform independent API which allows efficient implementations on
46 * other hosts when they come up.
47 *
48 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
49 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
50 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
51 * * http://kernel.dk/io_uring.pdf
52 * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
53 */
54
55
56/*********************************************************************************************************************************
57* Header Files *
58*********************************************************************************************************************************/
59#define LOG_GROUP RTLOGGROUP_IOQUEUE
60#include <iprt/ioqueue.h>
61
62#include <iprt/assertcompile.h>
63#include <iprt/asm.h>
64#include <iprt/errcore.h>
65#include <iprt/file.h>
66#include <iprt/log.h>
67#include <iprt/mem.h>
68#include <iprt/string.h>
69
70#include <errno.h>
71#include <unistd.h>
72#include <signal.h>
73#include <sys/mman.h>
74#include <sys/syscall.h>
75#include <sys/uio.h>
76
77#include "internal/ioqueue.h"
78
79
80/*********************************************************************************************************************************
81* Defined Constants And Macros *
82*********************************************************************************************************************************/
83
/** The syscall number of io_uring_setup(). */
#define LNX_IOURING_SYSCALL_SETUP       425
/** The syscall number of io_uring_enter(). */
#define LNX_IOURING_SYSCALL_ENTER       426
/** The syscall number of io_uring_register(). */
#define LNX_IOURING_SYSCALL_REGISTER    427
/** eventfd2() syscall not associated with io_uring but used for kicking waiters.
 * @note Unlike the io_uring syscalls above (which were assigned the same number
 *       on all architectures), eventfd2() predates the unified syscall table and
 *       its number is architecture specific: 19 is only valid for the
 *       asm-generic table (arm64, riscv, ...); on AMD64 it is 290 (19 would be
 *       readv()) and on x86 it is 328. */
#if defined(RT_ARCH_AMD64)
# define LNX_SYSCALL_EVENTFD2           290
#elif defined(RT_ARCH_X86)
# define LNX_SYSCALL_EVENTFD2           328
#else /* asm-generic syscall table. */
# define LNX_SYSCALL_EVENTFD2           19
#endif
92
93
94/*********************************************************************************************************************************
95* Structures and Typedefs *
96*********************************************************************************************************************************/
97
/**
 * Linux io_uring completion event.
 *
 * @note Mirrors the kernel ABI (struct io_uring_cqe); the layout must not be
 *       changed, see the AssertCompileSize() below.
 */
typedef struct LNXIOURINGCQE
{
    /** Opaque user data associated with the completed request. */
    uint64_t                    u64User;
    /** The status code of the request: a negative errno value on failure,
     * otherwise the result (e.g. number of bytes transferred). */
    int32_t                     rcLnx;
    /** Some flags which are not used as of now. */
    uint32_t                    fFlags;
} LNXIOURINGCQE;
AssertCompileSize(LNXIOURINGCQE, 16);
/** Pointer to a Linux io_uring completion event. */
typedef LNXIOURINGCQE *PLNXIOURINGCQE;
/** Pointer to a constant Linux io_uring completion event. */
typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
115
116
/**
 * Linux io_uring submission queue entry.
 *
 * @note Mirrors the kernel ABI (struct io_uring_sqe); the layout must not be
 *       changed, see the AssertCompileSize() below.
 */
typedef struct LNXIOURINGSQE
{
    /** The opcode for the request, see LNX_IOURING_OPC_XXX. */
    uint8_t                     u8Opc;
    /** Common flags for the request, see LNX_IOURING_SQE_F_XXX. */
    uint8_t                     u8Flags;
    /** Assigned I/O priority. */
    uint16_t                    u16IoPrio;
    /** The file descriptor the request is for. */
    int32_t                     i32Fd;
    /** The start offset into the file for the request. */
    uint64_t                    u64OffStart;
    /** Buffer pointer or pointer to io vector array depending on opcode. */
    uint64_t                    u64AddrBufIoVec;
    /** Size of the buffer in bytes or number of io vectors. */
    uint32_t                    u32BufIoVecSz;
    /** Opcode dependent data. */
    union
    {
        /** Flags for read/write requests. */
        uint32_t                u32KrnlRwFlags;
        /** Flags for fsync() like requests. */
        uint32_t                u32FsyncFlags;
        /** Flags for poll() like requests. */
        uint16_t                u16PollFlags;
        /** Flags for sync_file_range() like requests. */
        uint32_t                u32SyncFileRangeFlags;
        /** Flags for requests requiring a msg structure. */
        uint32_t                u32MsgFlags;
    } uOpc;
    /** Opaque user data associated with the request and returned during completion
     * (in LNXIOURINGCQE::u64User). */
    uint64_t                    u64User;
    /** Request type dependent data. */
    union
    {
        /** Fixed buffer index if indicated by the request flags. */
        uint16_t                u16FixedBufIdx;
        /** Padding to align the structure to 64 bytes. */
        uint64_t                au64Padding[3];
    } uReq;
} LNXIOURINGSQE;
AssertCompileSize(LNXIOURINGSQE, 64);
/** Pointer to a Linux io_uring submission queue entry. */
typedef LNXIOURINGSQE *PLNXIOURINGSQE;
/** Pointer to a constant Linux io_uring submission queue entry. */
typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
166
167
/**
 * Linux io_uring SQ ring header structure to maintain the queue.
 *
 * @note Returned by io_uring_setup() in LNXIOURINGPARAMS::SqOffsets.  Each
 *       u32Off* member is a byte offset into the mmap'ed SQ ring area, not the
 *       value itself (see rtIoQueueLnxIoURingFileProv_QueueInit which resolves
 *       them to pointers).
 */
typedef struct LNXIOURINGSQ
{
    /** Offset of the ring head counter (the end requests are consumed from). */
    uint32_t                    u32OffHead;
    /** Offset of the ring tail counter (the end new requests are filled in at,
     * see rtIoQueueLnxIoURingFileProv_ReqPrepare/_Commit). */
    uint32_t                    u32OffTail;
    /** Offset of the mask to apply to the head and tail counters to retrieve the index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the SQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the flags set asynchronously by the kernel. */
    uint32_t                    u32OffFlags;
    /** Offset of the counter of dropped requests. */
    uint32_t                    u32OffDroppedReqs;
    /** Offset where to find the array of SQ entries. */
    uint32_t                    u32OffArray;
    /** Reserved. */
    uint32_t                    u32Rsvd0;
    /** Reserved. */
    uint64_t                    u64Rsvd1;
} LNXIOURINGSQ;
AssertCompileSize(LNXIOURINGSQ, 40);
/** Pointer to a Linux u_ioring SQ ring header. */
typedef LNXIOURINGSQ *PLNXIOURINGSQ;
/** Pointer to a constant Linux u_ioring SQ ring header. */
typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
197
198
/**
 * Linux io_uring CQ ring header structure to maintain the queue.
 *
 * @note Returned by io_uring_setup() in LNXIOURINGPARAMS::CqOffsets.  Each
 *       u32Off* member is a byte offset into the mmap'ed CQ ring area, not the
 *       value itself (see rtIoQueueLnxIoURingFileProv_QueueInit).
 */
typedef struct LNXIOURINGCQ
{
    /** Offset of the ring head counter (the end completion events are read from,
     * see rtIoQueueLnxIoURingFileProvCqCheck). */
    uint32_t                    u32OffHead;
    /** Offset of the ring tail counter (the end the kernel posts new completion
     * events at). */
    uint32_t                    u32OffTail;
    /** Offset of the mask to apply to the head and tail counters to retrieve the index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the CQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the count of CQ overflows that happened. */
    uint32_t                    u32OffOverflowCnt;
    /** Offset where to find the array of completion queue entries (LNXIOURINGCQE). */
    uint32_t                    u32OffCqes;
    /** Reserved. */
    uint64_t                    au64Rsvd0[2];
} LNXIOURINGCQ;
AssertCompileSize(LNXIOURINGCQ, 40);
/** Pointer to a Linux u_ioring CQ ring header. */
typedef LNXIOURINGCQ *PLNXIOURINGCQ;
/** Pointer to a constant Linux u_ioring CQ ring header. */
typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
224
225
/**
 * Linux io_uring parameters passed to io_uring_setup().
 *
 * @note Mirrors the kernel ABI (struct io_uring_params); entry counts and the
 *       offset structures are filled in by the kernel on return.
 */
typedef struct LNXIOURINGPARAMS
{
    /** Number of SQ entries requested, must be power of 2. */
    uint32_t                    u32SqEntriesCnt;
    /** Number of CQ entries requested, must be power of 2. */
    uint32_t                    u32CqEntriesCnt;
    /** Flags for the ring, see LNX_IOURING_SETUP_F_XXX. */
    uint32_t                    u32Flags;
    /** Affinity of the kernel side SQ polling thread if enabled. */
    uint32_t                    u32SqPollCpu;
    /** Milliseconds after the kernel side SQ polling thread goes to sleep
     * if there are no requests to process. */
    uint32_t                    u32SqPollIdleMs;
    /** Reserved. */
    uint32_t                    au32Rsvd0[5];
    /** Offsets returned for the submission queue. */
    LNXIOURINGSQ                SqOffsets;
    /** Offsets returned for the completion queue. */
    LNXIOURINGCQ                CqOffsets;
} LNXIOURINGPARAMS;
/** Pointer to Linux io_uring parameters. */
typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
/** Pointer to constant Linux io_uring parameters. */
typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
253
254
/**
 * @name LNXIOURINGSQE::u8Opc defined opcodes.
 * @{ */
/** Opcode to profile the interface, does nothing. */
#define LNX_IOURING_OPC_NOP                         0
/** preadv() like request. */
#define LNX_IOURING_OPC_READV                       1
/** pwritev() like request. */
#define LNX_IOURING_OPC_WRITEV                      2
/** fsync() like request. */
#define LNX_IOURING_OPC_FSYNC                       3
/** Read request using a fixed preset buffer. */
#define LNX_IOURING_OPC_READ_FIXED                  4
/** Write request using a fixed preset buffer. */
#define LNX_IOURING_OPC_WRITE_FIXED                 5
/** Add file descriptor to pollset. */
#define LNX_IOURING_OPC_POLL_ADD                    6
/** Remove file descriptor from pollset. */
#define LNX_IOURING_OPC_POLL_REMOVE                 7
/** sync_file_range() like request. */
#define LNX_IOURING_OPC_SYNC_FILE_RANGE             8
/** sendmsg() like request. */
#define LNX_IOURING_OPC_SENDMSG                     9
/** recvmsg() like request. */
#define LNX_IOURING_OPC_RECVMSG                     10
/** @} */


/**
 * @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
 * @{ */
/** Sync userdata as well instead of metadata only. */
#define LNX_IOURING_OPC_FSYNC_DATASYNC              RT_BIT_32(0)
/** @} */


/**
 * @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
 * @{ */
/** The I/O context is polled. */
#define LNX_IOURING_SETUP_F_IOPOLL                  RT_BIT_32(0)
/** The kernel should poll the submission queue. */
#define LNX_IOURING_SETUP_F_SQPOLL                  RT_BIT_32(1)
/** Sets the CPU affinity of the kernel thread polling the submission queue. */
#define LNX_IOURING_SETUP_F_SQAFF                   RT_BIT_32(2)
/** @} */


/**
 * @name Flags for LNXIOURINGSQE::u8Flags.
 * @{ */
/** The file descriptor was registered before use. */
#define LNX_IOURING_SQE_F_FIXED_FILE                RT_BIT(0)
/** Complete all active requests before issuing the request with the flag set. */
#define LNX_IOURING_SQE_F_IO_DRAIN                  RT_BIT(1)
/** Links the request with the flag set to the next one. */
#define LNX_IOURING_SQE_F_IO_LINK                   RT_BIT(2)
/** @} */


/**
 * @name Magic mmap offsets to map submission and completion queues.
 * @{ */
/** Used to map the submission queue. */
#define LNX_IOURING_MMAP_OFF_SQ                     UINT64_C(0)
/** Used to map the completion queue. */
#define LNX_IOURING_MMAP_OFF_CQ                     UINT64_C(0x8000000)
/** Used to map the submission queue entries array. */
#define LNX_IOURING_MMAP_OFF_SQES                   UINT64_C(0x10000000)
/** @} */


/**
 * @name Flags used for the SQ ring structure.
 * @{ */
/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP           RT_BIT_32(0)
/** @} */


/**
 * @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
 * @{ */
/** Retrieve completion events for the completion queue. */
#define LNX_IOURING_ENTER_F_GETEVENTS               RT_BIT_32(0)
/** Wakes the suspended kernel thread processing the requests. */
#define LNX_IOURING_ENTER_F_SQ_WAKEUP               RT_BIT_32(1)
/** @} */


/**
 * @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
 * @{ */
/** Register a fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER   0
/** Unregisters a fixed set of buffers registered previously. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
/** Register a fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER     2
/** Unregisters a fixed set of files registered previously. */
#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER   3
/** Register an eventfd associated with the I/O ring. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER   4
/** Unregisters an eventfd registered previously. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
/** @} */
361
362
/**
 * SQ ring structure.
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUESQ
{
    /** Pointer to the head counter (shared with the kernel). */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter (shared with the kernel). */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Pointer to the global flags (shared with the kernel). */
    volatile uint32_t           *pfFlags;
    /** Pointer to the indirection array used for indexing the real SQ entries. */
    volatile uint32_t           *paidxSqes;
} RTIOQUEUESQ;
384
385
/**
 * CQ ring structure.
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUECQ
{
    /** Pointer to the head counter (shared with the kernel). */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter (shared with the kernel). */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Pointer to the completion entry ring (shared with the kernel). */
    volatile LNXIOURINGCQE      *paCqes;
} RTIOQUEUECQ;
405
406
/**
 * Internal I/O queue provider instance data.
 */
typedef struct RTIOQUEUEPROVINT
{
    /** The io_uring file descriptor. */
    int                         iFdIoCtx;
    /** The eventfd file descriptor registered with the ring, used for waiting
     * on completions and for external wakeup (see pfnEvtWait/pfnEvtWaitWakeup). */
    int                         iFdEvt;
    /** The submission queue. */
    RTIOQUEUESQ                 Sq;
    /** The currently uncommitted tail for the SQ, published to the kernel by
     * pfnCommit. */
    uint32_t                    idxSqTail;
    /** Number of uncommitted SQEs. */
    uint32_t                    cSqesToCommit;
    /** The completion queue. */
    RTIOQUEUECQ                 Cq;
    /** Pointer to the mapped SQES entries. */
    PLNXIOURINGSQE              paSqes;
    /** Pointer to the iovec structure used for non S/G requests (one entry per
     * SQ slot, see pfnReqPrepare). */
    struct iovec                *paIoVecs;
    /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
    void                        *pvMMapSqRing;
    /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
    void                        *pvMMapCqRing;
    /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
    void                        *pvMMapSqes;
    /** Size of the mapped SQ ring, used for unmapping. */
    size_t                      cbMMapSqRing;
    /** Size of the mapped CQ ring, used for unmapping. */
    size_t                      cbMMapCqRing;
    /** Size of the mapped SQ entries array, used for unmapping. */
    size_t                      cbMMapSqes;
    /** Flag whether the waiter was woken up externally. */
    volatile bool               fExtIntr;
} RTIOQUEUEPROVINT;
/** Pointer to the internal I/O queue provider instance data. */
typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
445
446
447/*********************************************************************************************************************************
448* Internal Functions *
449*********************************************************************************************************************************/
450
451/**
452 * Syscall wrapper for io_uring_setup().
453 *
454 * @returns IPRT status code.
455 * @param cEntries Number of entries for submission and completion queues.
456 * @param pParams Additional parameters for the I/O ring and updated return values
457 * on success.
458 * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
459 */
460DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
461{
462 int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
463 if (RT_UNLIKELY(rcLnx == -1))
464 return RTErrConvertFromErrno(errno);
465
466 *piFdIoCtx = rcLnx;
467 return VINF_SUCCESS;
468}
469
470
471/**
472 * Syscall wrapper for io_uring_enter().
473 *
474 * @returns IPRT status code.
475 * @param iFdIoCtx The I/O ring file descriptor.
476 * @param cToSubmit Maximum number of requests waiting for processing.
477 * @param cMinComplete Minimum number of completion events to accumulate before returning.
478 * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
479 */
480DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
481 uint32_t fFlags)
482{
483 int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
484 NULL, 0);
485 if (RT_UNLIKELY(rcLnx == -1))
486 return RTErrConvertFromErrno(errno);
487
488 return VINF_SUCCESS;
489}
490
491
492/**
493 * Syscall wrapper for io_uring_register().
494 *
495 * @returns IPRT status code.
496 * @param iFdIoCtx The I/O ring file descriptor.
497 * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
498 * @param pvArg Opaque arguments.
499 * @param cArgs Number of arguments.
500 */
501DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
502 uint32_t cArgs)
503{
504 int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
505 if (RT_UNLIKELY(rcLnx == -1))
506 return RTErrConvertFromErrno(errno);
507
508 return VINF_SUCCESS;
509}
510
511
512/**
513 * mmap() wrapper for the common bits and returning an IPRT status code.
514 *
515 * @returns IPRT status code.
516 * @param iFdIoCtx The I/O ring file descriptor.
517 * @param offMmap The mmap() offset.
518 * @param cbMmap How much to map.
519 * @param ppv Where to store the pointer to the mapping on success.
520 */
521DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
522{
523 void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
524 if (pv != MAP_FAILED)
525 {
526 *ppv = pv;
527 return VINF_SUCCESS;
528 }
529
530 return RTErrConvertFromErrno(errno);
531}
532
533
534/**
535 * eventfd2() syscall wrapper.
536 *
537 * @returns IPRT status code.
538 * @param uValInit The initial value of the maintained counter.
539 * @param fFlags Flags controlling the eventfd behavior.
540 * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
541 */
542DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
543{
544 int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
545 if (RT_UNLIKELY(rcLnx == -1))
546 return RTErrConvertFromErrno(errno);
547
548 *piFdEvt = rcLnx;
549 return VINF_SUCCESS;
550}
551
552
553/**
554 * Checks the completion event queue for pending events.
555 *
556 * @returns nothing.
557 * @param pThis The provider instance.
558 * @param paCEvt Pointer to the array of completion events.
559 * @param cCEvt Maximum number of completion events the array can hold.
560 * @param pcCEvtSeen Where to store the number of completion events processed.
561 */
562static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
563 uint32_t cCEvt, uint32_t *pcCEvtSeen)
564{
565 /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
566 ASMReadFence();
567 uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
568 uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
569 ASMReadFence();
570
571 uint32_t cCEvtSeen = 0;
572
573 while ( idxCqTail != idxCqHead
574 && cCEvtSeen < cCEvt)
575 {
576 /* Get the index. */
577 uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
578 volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];
579
580 paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
581 if (pCqe->rcLnx >= 0)
582 {
583 paCEvt->rcReq = VINF_SUCCESS;
584 paCEvt->cbXfered = (size_t)pCqe->rcLnx;
585 }
586 else
587 paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);
588
589 paCEvt++;
590 cCEvtSeen++;
591 idxCqHead++;
592 }
593
594 *pcCEvtSeen = cCEvtSeen;
595
596 /* Paranoia strikes again. */
597 ASMWriteFence();
598 ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
599 ASMWriteFence();
600}
601
602
603/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
604static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
605{
606 /*
607 * Try to create a simple I/O ring and close it again.
608 * The common code/public API already checked for the proper handle type.
609 */
610 int iFdIoCtx = 0;
611 bool fSupp = false;
612 LNXIOURINGPARAMS Params;
613 RT_ZERO(Params);
614
615 int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
616 if (RT_SUCCESS(rc))
617 {
618 /*
619 * Check that we can register an eventfd descriptor to get notified about
620 * completion events while being able to kick the waiter externally out of the wait.
621 */
622 int iFdEvt = 0;
623 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
624 if (RT_SUCCESS(rc))
625 {
626 rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
627 &iFdEvt, 1 /*cArgs*/);
628 if (RT_SUCCESS(rc))
629 fSupp = true;
630
631 int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
632 }
633 int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
634 }
635
636 return fSupp;
637}
638
639
640/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
641static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
642 uint32_t cSqEntries, uint32_t cCqEntries)
643{
644 RT_NOREF(fFlags, cCqEntries);
645
646 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
647 LNXIOURINGPARAMS Params;
648 RT_ZERO(Params);
649
650 pThis->cSqesToCommit = 0;
651 pThis->fExtIntr = false;
652
653 int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
654 if (RT_SUCCESS(rc))
655 {
656 /* Map the rings into userspace. */
657 pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
658 pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
659 pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
660
661 pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
662 if (RT_LIKELY(pThis->paIoVecs))
663 {
664 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
665 if (RT_SUCCESS(rc))
666 {
667 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
668 if (RT_SUCCESS(rc))
669 {
670 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
671 if (RT_SUCCESS(rc))
672 {
673 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
674 if (RT_SUCCESS(rc))
675 {
676 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
677 if (RT_SUCCESS(rc))
678 {
679 uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
680
681 pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
682 pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
683 pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
684 pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
685 pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
686 pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
687 pThis->idxSqTail = *pThis->Sq.pidxTail;
688
689 pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
690
691 pbTmp = (uint8_t *)pThis->pvMMapCqRing;
692
693 pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
694 pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
695 pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
696 pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
697 pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
698 return VINF_SUCCESS;
699 }
700
701 munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
702 }
703
704 munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
705 }
706
707 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
708 AssertRC(rc);
709 }
710
711 close(pThis->iFdEvt);
712 }
713
714 RTMemFree(pThis->paIoVecs);
715 }
716
717 int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
718 }
719
720 return rc;
721}
722
723
724/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
725static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
726{
727 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
728
729 int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
730 rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
731 rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
732
733 int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
734 AssertRC(rc);
735
736 close(pThis->iFdEvt);
737 close(pThis->iFdIoCtx);
738 RTMemFree(pThis->paIoVecs);
739
740 RT_ZERO(pThis);
741}
742
743
744/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
745static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
746{
747 RT_NOREF(hIoQueueProv, pHandle);
748 /** @todo Add support for fixed file sets later. */
749 return VINF_SUCCESS;
750}
751
752
753/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
754static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
755{
756 RT_NOREF(hIoQueueProv, pHandle);
757 /** @todo Add support for fixed file sets later. */
758 return VINF_SUCCESS;
759}
760
761
762/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
763static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
764 uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
765 void *pvUser)
766{
767 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
768 RT_NOREF(fReqFlags);
769
770 uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
771 PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
772 struct iovec *pIoVec = &pThis->paIoVecs[idx];
773
774 pIoVec->iov_base = pvBuf;
775 pIoVec->iov_len = cbBuf;
776
777 pSqe->u8Flags = 0;
778 pSqe->u16IoPrio = 0;
779 pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile);
780 pSqe->u64OffStart = off;
781 pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
782 pSqe->u64User = (uint64_t)(uintptr_t)pvUser;
783
784 switch (enmOp)
785 {
786 case RTIOQUEUEOP_READ:
787 pSqe->u8Opc = LNX_IOURING_OPC_READV;
788 pSqe->uOpc.u32KrnlRwFlags = 0;
789 break;
790 case RTIOQUEUEOP_WRITE:
791 pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
792 pSqe->uOpc.u32KrnlRwFlags = 0;
793 break;
794 case RTIOQUEUEOP_SYNC:
795 pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
796 pSqe->uOpc.u32FsyncFlags = 0;
797 break;
798 default:
799 AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
800 VERR_INVALID_PARAMETER);
801 }
802
803 pThis->idxSqTail++;
804 pThis->cSqesToCommit++;
805 return VINF_SUCCESS;
806}
807
808
809/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
810static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
811{
812 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
813 RT_NOREF(pThis, pcReqsCommitted);
814
815 ASMWriteFence();
816 ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
817 ASMWriteFence();
818
819 int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
820 if (RT_SUCCESS(rc))
821 {
822 *pcReqsCommitted = pThis->cSqesToCommit;
823 pThis->cSqesToCommit = 0;
824 }
825
826 return rc;
827}
828
829
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait} */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    int rc = VINF_SUCCESS;
    uint32_t cCEvtSeen = 0;

    /* No wait flags are supported yet. */
    RT_NOREF(fFlags);

    /*
     * Check the completion queue first for any completed events which might save us a
     * context switch later on.
     */
    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);

    /* Keep blocking until at least cMinWait events were gathered or an error
       (including VERR_INTERRUPTED from an external wakeup) occurred. */
    while (   cCEvtSeen < cMinWait
           && RT_SUCCESS(rc))
    {
        /*
         * We can employ a blocking read on the event file descriptor, it will return
         * either when woken up externally or when there are completion events pending
         * (the eventfd was registered with the ring in pfnQueueInit).
         */
        uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
        ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
        if (rcLnx == sizeof(uCnt))
        {
            /* Gather whatever completions are available, appending after the ones seen so far. */
            uint32_t cCEvtThisSeen = 0;
            rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
            cCEvtSeen += cCEvtThisSeen;

            /* Whether we got woken up externally (flag set by pfnEvtWaitWakeup). */
            if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
                rc = VERR_INTERRUPTED;
        }
        else if (rcLnx == -1)
            rc = RTErrConvertFromErrno(errno);
        else
            /* NOTE(review): a short read should not be possible for an eventfd. */
            AssertMsgFailed(("Unexpected read() -> 0\n"));
    }

    *pcCEvt = cCEvtSeen;
    return rc;
}
874
875
876/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
877static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
878{
879 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
880 int rc = VINF_SUCCESS;
881
882 if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
883 {
884 const uint64_t uValAdd = 1;
885 ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
886
887 Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
888 if (rcLnx == -1)
889 rc = RTErrConvertFromErrno(errno);
890 }
891
892 return rc;
893}
894
895
/**
 * Async file I/O queue provider virtual method table.
 *
 * @note The initializer order must match the RTIOQUEUEPROVVTABLE member order.
 */
RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
{
    /** uVersion */
    RTIOQUEUEPROVVTABLE_VERSION,
    /** pszId */
    "LnxIoURingFile",
    /** cbIoQueueProv */
    sizeof(RTIOQUEUEPROVINT),
    /** enmHnd */
    RTHANDLETYPE_FILE,
    /** fFlags */
    0,
    /** pfnIsSupported */
    rtIoQueueLnxIoURingFileProv_IsSupported,
    /** pfnQueueInit  */
    rtIoQueueLnxIoURingFileProv_QueueInit,
    /** pfnQueueDestroy */
    rtIoQueueLnxIoURingFileProv_QueueDestroy,
    /** pfnHandleRegister */
    rtIoQueueLnxIoURingFileProv_HandleRegister,
    /** pfnHandleDeregister */
    rtIoQueueLnxIoURingFileProv_HandleDeregister,
    /** pfnReqPrepare */
    rtIoQueueLnxIoURingFileProv_ReqPrepare,
    /** pfnReqPrepareSg - scatter/gather requests are not implemented yet. */
    NULL,
    /** pfnCommit */
    rtIoQueueLnxIoURingFileProv_Commit,
    /** pfnEvtWait */
    rtIoQueueLnxIoURingFileProv_EvtWait,
    /** pfnEvtWaitWakeup */
    rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
    /** uEndMarker */
    RTIOQUEUEPROVVTABLE_VERSION
};
934
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette