VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp@ 79983

Last change on this file since 79983 was 79983, checked in by vboxsync, 6 years ago

Runtime/RTIoQueue: Updates

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 34.8 KB
Line 
1/* $Id: ioqueue-iouringfile-provider.cpp 79983 2019-07-25 17:21:24Z vboxsync $ */
2/** @file
3 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
4 */
5
6/*
7 * Copyright (C) 2019 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
28 * @internal
29 *
30 * The io_uring interface is the most recent interface added to the Linux kernel
31 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
32 * thus not available on most systems as of writing this backend (July 2019).
33 * It supersedes the old async I/O interface and cleans up with some restrictions like
34 * having to disable caching for the file.
35 * The interface is centered around a submission and completion queue to queue multiple new
36 * requests for the kernel to process and get notified about completions to reduce the amount
37 * of context switches to an absolute minimum. It also offers advanced features like
38 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
39 * even more.
40 *
41 * The first implementation will only make use of the basic features and more advanced features
42 * will be added later.
43 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
44 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
45 * while still keeping a consistent platform independent API which allows efficient implementations on
46 * other hosts when they come up.
47 *
48 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
49 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
51 * * http://kernel.dk/io_uring.pdf
52 * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
53 */
54
55
56/*********************************************************************************************************************************
57* Header Files *
58*********************************************************************************************************************************/
59#define LOG_GROUP RTLOGGROUP_IOQUEUE
60#include <iprt/ioqueue.h>
61
62#include <iprt/assertcompile.h>
63#include <iprt/asm.h>
64#include <iprt/errcore.h>
65#include <iprt/file.h>
66#include <iprt/log.h>
67#include <iprt/mem.h>
68#include <iprt/string.h>
69
70#include <errno.h>
71#include <unistd.h>
72#include <signal.h>
73#include <sys/mman.h>
74#include <sys/syscall.h>
75#include <sys/uio.h>
76
77#include "internal/ioqueue.h"
78
79
80/*********************************************************************************************************************************
81* Defined Constants And Macros *
82*********************************************************************************************************************************/
83
/** The syscall number of io_uring_setup(). */
#define LNX_IOURING_SYSCALL_SETUP    425
/** The syscall number of io_uring_enter(). */
#define LNX_IOURING_SYSCALL_ENTER    426
/** The syscall number of io_uring_register(). */
#define LNX_IOURING_SYSCALL_REGISTER 427
/** eventfd2() syscall not associated with io_uring but used for kicking waiters.
 * @note Unlike the io_uring syscalls (which use the unified numbering for new
 *       syscalls on all architectures) eventfd2 has per-architecture numbers:
 *       290 on amd64 and 328 on x86.  The previous value 19 is eventfd2 only
 *       in the asm-generic table and maps to readv() on amd64, which made the
 *       eventfd2 wrapper "succeed" and hand back file descriptor 0. */
#define LNX_SYSCALL_EVENTFD2         290
92
93/*********************************************************************************************************************************
94* Structures and Typedefs *
95*********************************************************************************************************************************/
96
/**
 * Linux io_uring completion queue event (CQE).
 *
 * Layout matches struct io_uring_cqe from the Linux UAPI header.
 */
typedef struct LNXIOURINGCQE
{
    /** Opaque user data associated with the completed request
     * (copied back verbatim from LNXIOURINGSQE::u64User). */
    uint64_t u64User;
    /** The status code of the request: >= 0 is the number of bytes
     * transferred, negative values are -errno. */
    int32_t rcLnx;
    /** Some flags which are not used as of now. */
    uint32_t fFlags;
} LNXIOURINGCQE;
AssertCompileSize(LNXIOURINGCQE, 16);
/** Pointer to a Linux io_uring completion event. */
typedef LNXIOURINGCQE *PLNXIOURINGCQE;
/** Pointer to a constant Linux io_uring completion event. */
typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
114
115
/**
 * Linux io_uring submission queue entry (SQE).
 *
 * Layout matches struct io_uring_sqe from the Linux UAPI header.
 */
typedef struct LNXIOURINGSQE
{
    /** The opcode for the request, see LNX_IOURING_OPC_*. */
    uint8_t u8Opc;
    /** Common flags for the request, see LNX_IOURING_SQE_F_*. */
    uint8_t u8Flags;
    /** Assigned I/O priority. */
    uint16_t u16IoPrio;
    /** The file descriptor the request is for. */
    int32_t i32Fd;
    /** The start offset into the file for the request. */
    uint64_t u64OffStart;
    /** Buffer pointer or pointer to io vector array depending on opcode. */
    uint64_t u64AddrBufIoVec;
    /** Size of the buffer in bytes or number of io vectors. */
    uint32_t u32BufIoVecSz;
    /** Opcode dependent data. */
    union
    {
        /** Flags for read/write requests. */
        uint32_t u32KrnlRwFlags;
        /** Flags for fsync() like requests, see LNX_IOURING_OPC_FSYNC_*. */
        uint32_t u32FsyncFlags;
        /** Flags for poll() like requests. */
        uint16_t u16PollFlags;
        /** Flags for sync_file_range() like requests. */
        uint32_t u32SyncFileRangeFlags;
        /** Flags for requests requiring a msg structure. */
        uint32_t u32MsgFlags;
    } uOpc;
    /** Opaque user data associated with the request and returned during completion
     * (see LNXIOURINGCQE::u64User). */
    uint64_t u64User;
    /** Request type dependent data. */
    union
    {
        /** Fixed buffer index if indicated by the request flags. */
        uint16_t u16FixedBufIdx;
        /** Padding to align the structure to 64 bytes. */
        uint64_t au64Padding[3];
    } uReq;
} LNXIOURINGSQE;
AssertCompileSize(LNXIOURINGSQE, 64);
/** Pointer to a Linux io_uring submission queue entry. */
typedef LNXIOURINGSQE *PLNXIOURINGSQE;
/** Pointer to a constant Linux io_uring submission queue entry. */
typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
165
166
/**
 * Linux io_uring SQ ring offsets, filled in by the kernel via io_uring_setup().
 *
 * @note The u32Off* members are byte offsets into the mmap'ed SQ ring region
 *       (see LNX_IOURING_MMAP_OFF_SQ); they are resolved to pointers/values in
 *       rtIoQueueLnxIoURingFileProv_QueueInit.
 */
typedef struct LNXIOURINGSQ
{
    /** Offset of the head index the kernel consumes submissions from. */
    uint32_t u32OffHead;
    /** Offset of the tail index userspace fills new submissions in at. */
    uint32_t u32OffTail;
    /** Offset of the mask applied to head and tail to get the ring index. */
    uint32_t u32OffRingMask;
    /** Offset of the number of entries in the SQ ring. */
    uint32_t u32OffRingEntries;
    /** Offset of the flags set asynchronously by the kernel. */
    uint32_t u32OffFlags;
    /** Offset of the counter of dropped requests. */
    uint32_t u32OffDroppedReqs;
    /** Offset where to find the array of SQ entry indices. */
    uint32_t u32OffArray;
    /** Reserved. */
    uint32_t u32Rsvd0;
    /** Reserved. */
    uint64_t u64Rsvd1;
} LNXIOURINGSQ;
AssertCompileSize(LNXIOURINGSQ, 40);
/** Pointer to a Linux io_uring SQ ring offsets structure. */
typedef LNXIOURINGSQ *PLNXIOURINGSQ;
/** Pointer to a constant Linux io_uring SQ ring offsets structure. */
typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
196
197
/**
 * Linux io_uring CQ ring offsets, filled in by the kernel via io_uring_setup().
 *
 * @note The u32Off* members are byte offsets into the mmap'ed CQ ring region
 *       (see LNX_IOURING_MMAP_OFF_CQ); they are resolved to pointers/values in
 *       rtIoQueueLnxIoURingFileProv_QueueInit.
 */
typedef struct LNXIOURINGCQ
{
    /** Offset of the head index userspace reads completion events from. */
    uint32_t u32OffHead;
    /** Offset of the tail index the kernel advances when completions happen. */
    uint32_t u32OffTail;
    /** Offset of the mask applied to head and tail to get the ring index. */
    uint32_t u32OffRingMask;
    /** Offset of the number of entries in the CQ ring. */
    uint32_t u32OffRingEntries;
    /** Offset of the CQ overflow counter. */
    uint32_t u32OffOverflowCnt;
    /** Offset where to find the array of CQEs (LNXIOURINGCQE). */
    uint32_t u32OffCqes;
    /** Reserved. */
    uint64_t au64Rsvd0[2];
} LNXIOURINGCQ;
AssertCompileSize(LNXIOURINGCQ, 40);
/** Pointer to a Linux io_uring CQ ring offsets structure. */
typedef LNXIOURINGCQ *PLNXIOURINGCQ;
/** Pointer to a constant Linux io_uring CQ ring offsets structure. */
typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
223
224
/**
 * Linux io_uring parameters passed to io_uring_setup().
 *
 * On success the kernel updates the entry counts and fills in the
 * SqOffsets/CqOffsets members used for mapping the rings into userspace.
 */
typedef struct LNXIOURINGPARAMS
{
    /** Number of SQ entries requested, must be power of 2. */
    uint32_t u32SqEntriesCnt;
    /** Number of CQ entries requested, must be power of 2. */
    uint32_t u32CqEntriesCnt;
    /** Flags for the ring, see LNX_IOURING_SETUP_F_*. */
    uint32_t u32Flags;
    /** Affinity of the kernel side SQ polling thread if enabled. */
    uint32_t u32SqPollCpu;
    /** Milliseconds after which the kernel side SQ polling thread goes to
     * sleep if there are no requests to process. */
    uint32_t u32SqPollIdleMs;
    /** Reserved. */
    uint32_t au32Rsvd0[5];
    /** Offsets returned for the submission queue (filled in by the kernel). */
    LNXIOURINGSQ SqOffsets;
    /** Offsets returned for the completion queue (filled in by the kernel). */
    LNXIOURINGCQ CqOffsets;
} LNXIOURINGPARAMS;
/** Pointer to Linux io_uring parameters. */
typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
/** Pointer to constant Linux io_uring parameters. */
typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
252
253
254/**
255 * @name LNXIOURINGSQE::u8Opc defined opcodes.
256 * @{ */
257/** Opcode to profile the interface, does nothing. */
258#define LNX_IOURING_OPC_NOP 0
259/** preadv() like request. */
260#define LNX_IOURING_OPC_READV 1
261/** pwritev() like request. */
262#define LNX_IOURING_OPC_WRITEV 2
263/** fsync() like request. */
264#define LNX_IOURING_OPC_FSYNC 3
265/** Read request using a fixed preset buffer. */
266#define LNX_IOURING_OPC_READ_FIXED 4
267/** Write request using a fixed preset buffer. */
268#define LNX_IOURING_OPC_WRITE_FIXED 5
269/** Add file descriptor to pollset. */
270#define LNX_IOURING_OPC_POLL_ADD 6
271/** Remove file descriptor from pollset. */
272#define LNX_IOURING_OPC_POLL_REMOVE 7
273/** sync_file_range() like request. */
274#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
275/** sendmsg() like request. */
276#define LNX_IOURING_OPC_SENDMSG 9
277/** recvmsg() like request. */
278#define LNX_IOURING_OPC_RECVMSG 10
279/** @} */
280
281
282/**
283 * @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
284 * @{ */
285/** Sync userdata as well instead of metadata only. */
286#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0)
287/** @} */
288
289
290/**
291 * @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
292 * @{ */
293/** The I/O context is polled. */
294#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0)
295/** The kernel should poll the submission queue. */
296#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1)
297/** Sets the CPU affinity of the kernel thread polling the submission queue. */
298#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2)
299/** @} */
300
301
302/**
303 * @name Flags for LNXIOURINGSQE::u8Flags.
304 * @{ */
305/** The file descriptor was registered before use. */
306#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0)
307/** Complete all active requests before issuing the request with the flag set. */
308#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1)
309/** Links the request with the flag set to the next one. */
310#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2)
311/** @} */
312
313
314/**
315 * @name Magic mmap offsets to map submission and completion queues.
316 * @{ */
317/** Used to map the submission queue. */
318#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0)
319/** Used to map the completion queue. */
320#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000)
321/** Used to map the submission queue entries array. */
322#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000)
323/** @} */
324
325
326/**
327 * @name Flags used for the SQ ring structure.
328 * @{ */
329/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
330#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0)
331/** @} */
332
333
334/**
335 * @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
 * @{ */
337/** Retrieve completion events for the completion queue. */
338#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0)
339/** Wakes the suspended kernel thread processing the requests. */
340#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1)
341/** @} */
342
343
344/**
345 * @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
 * @{ */
347/** Register a fixed set of buffers. */
348#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0
349/** Unregisters a fixed set of buffers registered previously. */
350#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
351/** Register a fixed set of files. */
352#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2
353/** Unregisters a fixed set of files registered previously. */
354#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3
355/** Register an eventfd associated with the I/O ring. */
356#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4
357/** Unregisters an eventfd registered previously. */
358#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
359/** @} */
360
361
/**
 * SQ ring structure (userspace view of the mapped submission ring).
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUESQ
{
    /** Pointer to the head counter (advanced by the kernel as it consumes SQEs). */
    volatile uint32_t *pidxHead;
    /** Pointer to the tail counter (written by us in pfnCommit to publish SQEs). */
    volatile uint32_t *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t fRingMask;
    /** Number of entries in the ring. */
    uint32_t cEntries;
    /** Pointer to the global flags (see LNX_IOURING_SQ_RING_F_*). */
    volatile uint32_t *pfFlags;
    /** Pointer to the indirection array used for indexing the real SQ entries. */
    volatile uint32_t *paidxSqes;
} RTIOQUEUESQ;
383
384
/**
 * CQ ring structure (userspace view of the mapped completion ring).
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUECQ
{
    /** Pointer to the head counter (advanced by us as we harvest CQEs). */
    volatile uint32_t *pidxHead;
    /** Pointer to the tail counter (advanced by the kernel on completions). */
    volatile uint32_t *pidxTail;
    /** Mask to apply for the counters to get to the index. */
    uint32_t fRingMask;
    /** Number of entries in the ring. */
    uint32_t cEntries;
    /** Pointer to the completion entry ring. */
    volatile LNXIOURINGCQE *paCqes;
} RTIOQUEUECQ;
404
405
/**
 * Internal I/O queue provider instance data.
 */
typedef struct RTIOQUEUEPROVINT
{
    /** The io_uring file descriptor. */
    int iFdIoCtx;
    /** The eventfd file descriptor registered with the ring
     * (signalled on completions, also used for external wakeups). */
    int iFdEvt;
    /** The submission queue. */
    RTIOQUEUESQ Sq;
    /** The currently uncommitted tail for the SQ (published to the kernel
     * in pfnCommit). */
    uint32_t idxSqTail;
    /** Number of uncommitted SQEs. */
    uint32_t cSqesToCommit;
    /** The completion queue. */
    RTIOQUEUECQ Cq;
    /** Pointer to the mapped SQES entries. */
    PLNXIOURINGSQE paSqes;
    /** Pointer to the iovec structure used for non S/G requests
     * (one iovec per SQE slot, allocated in pfnQueueInit). */
    struct iovec *paIoVecs;
    /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
    void *pvMMapSqRing;
    /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
    void *pvMMapCqRing;
    /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
    void *pvMMapSqes;
    /** Size of the mapped SQ ring, used for unmapping. */
    size_t cbMMapSqRing;
    /** Size of the mapped CQ ring, used for unmapping. */
    size_t cbMMapCqRing;
    /** Size of the mapped SQ entries array, used for unmapping. */
    size_t cbMMapSqes;
    /** Flag whether the waiter was woken up externally. */
    volatile bool fExtIntr;
} RTIOQUEUEPROVINT;
/** Pointer to the internal I/O queue provider instance data. */
typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
444
445
446/*********************************************************************************************************************************
447* Internal Functions *
448*********************************************************************************************************************************/
449
450/**
451 * Syscall wrapper for io_uring_setup().
452 *
453 * @returns IPRT status code.
454 * @param cEntries Number of entries for submission and completion queues.
455 * @param pParams Additional parameters for the I/O ring and updated return values
456 * on success.
457 * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
458 */
459DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
460{
461 int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
462 if (RT_UNLIKELY(rcLnx == -1))
463 return RTErrConvertFromErrno(errno);
464
465 *piFdIoCtx = rcLnx;
466 return VINF_SUCCESS;
467}
468
469
470/**
471 * Syscall wrapper for io_uring_enter().
472 *
473 * @returns IPRT status code.
474 * @param iFdIoCtx The I/O ring file descriptor.
475 * @param cToSubmit Maximum number of requests waiting for processing.
476 * @param cMinComplete Minimum number of completion events to accumulate before returning.
477 * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
478 */
479DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
480 uint32_t fFlags)
481{
482 int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
483 NULL, 0);
484 if (RT_UNLIKELY(rcLnx == -1))
485 return RTErrConvertFromErrno(errno);
486
487 return VINF_SUCCESS;
488}
489
490
491/**
492 * Syscall wrapper for io_uring_register().
493 *
494 * @returns IPRT status code.
495 * @param iFdIoCtx The I/O ring file descriptor.
496 * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
497 * @param pvArg Opaque arguments.
498 * @param cArgs Number of arguments.
499 */
500DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
501 uint32_t cArgs)
502{
503 int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
504 if (RT_UNLIKELY(rcLnx == -1))
505 return RTErrConvertFromErrno(errno);
506
507 return VINF_SUCCESS;
508}
509
510
511/**
512 * mmap() wrapper for the common bits and returning an IPRT status code.
513 *
514 * @returns IPRT status code.
515 * @param iFdIoCtx The I/O ring file descriptor.
516 * @param offMmap The mmap() offset.
517 * @param cbMmap How much to map.
518 * @param ppv Where to store the pointer to the mapping on success.
519 */
520DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
521{
522 void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
523 if (pv != MAP_FAILED)
524 {
525 *ppv = pv;
526 return VINF_SUCCESS;
527 }
528
529 return RTErrConvertFromErrno(errno);
530}
531
532
533/**
534 * eventfd2() syscall wrapper.
535 *
536 * @returns IPRT status code.
537 * @param uValInit The initial value of the maintained counter.
538 * @param fFlags Flags controlling the eventfd behavior.
539 * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
540 */
541DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
542{
543 int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
544 if (RT_UNLIKELY(rcLnx == -1))
545 return RTErrConvertFromErrno(errno);
546
547 *piFdEvt = rcLnx;
548 return VINF_SUCCESS;
549}
550
551
/**
 * Checks the completion event queue for pending events.
 *
 * Harvests up to cCEvt CQEs from the shared completion ring into the caller's
 * array and advances the published CQ head so the kernel can reuse the slots.
 * Does not block.
 *
 * @returns nothing.
 * @param   pThis       The provider instance.
 * @param   paCEvt      Pointer to the array of completion events.
 * @param   cCEvt       Maximum number of completion events the array can hold.
 * @param   pcCEvtSeen  Where to store the number of completion events processed.
 */
static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
                                               uint32_t cCEvt, uint32_t *pcCEvtSeen)
{
    /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
    ASMReadFence();
    uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
    uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
    ASMReadFence();

    uint32_t cCEvtSeen = 0;

    /* Drain CQEs until we catch up with the kernel's tail or run out of room in the caller's array. */
    while (   idxCqTail != idxCqHead
           && cCEvtSeen < cCEvt)
    {
        /* Get the index (counters are free running, the mask folds them into the ring). */
        uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
        volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];

        paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
        /* rcLnx >= 0 is the number of bytes transferred, negative values are -errno. */
        if (pCqe->rcLnx >= 0)
        {
            paCEvt->rcReq = VINF_SUCCESS;
            paCEvt->cbXfered = (size_t)pCqe->rcLnx;
        }
        else
            paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);

        paCEvt++;
        cCEvtSeen++;
        idxCqHead++;
    }

    *pcCEvtSeen = cCEvtSeen;

    /* Publish the new head so the kernel can reuse the consumed CQE slots. (Paranoia strikes again.) */
    ASMWriteFence();
    ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
    ASMWriteFence();
}
600
601
602/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
603static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
604{
605 /*
606 * Try to create a simple I/O ring and close it again.
607 * The common code/public API already checked for the proper handle type.
608 */
609 int iFdIoCtx = 0;
610 bool fSupp = false;
611 LNXIOURINGPARAMS Params;
612 RT_ZERO(Params);
613
614 int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
615 if (RT_SUCCESS(rc))
616 {
617 /*
618 * Check that we can register an eventfd descriptor to get notified about
619 * completion events while being able to kick the waiter externally out of the wait.
620 */
621 int iFdEvt = 0;
622 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
623 if (RT_SUCCESS(rc))
624 {
625 rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
626 &iFdEvt, 1 /*cArgs*/);
627 if (RT_SUCCESS(rc))
628 fSupp = true;
629
630 int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
631 }
632 int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
633 }
634
635 return fSupp;
636}
637
638
639/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
640static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
641 uint32_t cSqEntries, uint32_t cCqEntries)
642{
643 RT_NOREF(fFlags, cCqEntries);
644
645 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
646 LNXIOURINGPARAMS Params;
647 RT_ZERO(Params);
648
649 pThis->cSqesToCommit = 0;
650 pThis->fExtIntr = false;
651
652 int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
653 if (RT_SUCCESS(rc))
654 {
655 /* Map the rings into userspace. */
656 pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
657 pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
658 pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
659
660 pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
661 if (RT_LIKELY(pThis->paIoVecs))
662 {
663 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
664 if (RT_SUCCESS(rc))
665 {
666 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
667 if (RT_SUCCESS(rc))
668 {
669 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
670 if (RT_SUCCESS(rc))
671 {
672 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
673 if (RT_SUCCESS(rc))
674 {
675 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
676 if (RT_SUCCESS(rc))
677 {
678 uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
679
680 pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
681 pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
682 pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
683 pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
684 pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
685 pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
686 pThis->idxSqTail = *pThis->Sq.pidxTail;
687
688 pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
689
690 pbTmp = (uint8_t *)pThis->pvMMapCqRing;
691
692 pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
693 pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
694 pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
695 pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
696 pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
697 return VINF_SUCCESS;
698 }
699
700 munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
701 }
702
703 munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
704 }
705
706 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
707 AssertRC(rc);
708 }
709
710 close(pThis->iFdEvt);
711 }
712
713 RTMemFree(pThis->paIoVecs);
714 }
715
716 int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
717 }
718
719 return rc;
720}
721
722
723/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
724static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
725{
726 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
727
728 int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
729 rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
730 rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
731
732 int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
733 AssertRC(rc);
734
735 close(pThis->iFdEvt);
736 close(pThis->iFdIoCtx);
737 RTMemFree(pThis->paIoVecs);
738
739 RT_ZERO(pThis);
740}
741
742
743/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
744static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
745{
746 RT_NOREF(hIoQueueProv, pHandle);
747 /** @todo Add support for fixed file sets later. */
748 return VINF_SUCCESS;
749}
750
751
752/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
753static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
754{
755 RT_NOREF(hIoQueueProv, pHandle);
756 /** @todo Add support for fixed file sets later. */
757 return VINF_SUCCESS;
758}
759
760
761/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
762static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
763 uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
764 void *pvUser)
765{
766 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
767 RT_NOREF(fReqFlags);
768
769 uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
770 PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
771 struct iovec *pIoVec = &pThis->paIoVecs[idx];
772
773 pIoVec->iov_base = pvBuf;
774 pIoVec->iov_len = cbBuf;
775
776 pSqe->u8Flags = 0;
777 pSqe->u16IoPrio = 0;
778 pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile);
779 pSqe->u64OffStart = off;
780 pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
781 pSqe->u64User = (uint64_t)(uintptr_t)pvUser;
782
783 switch (enmOp)
784 {
785 case RTIOQUEUEOP_READ:
786 pSqe->u8Opc = LNX_IOURING_OPC_READV;
787 pSqe->uOpc.u32KrnlRwFlags = 0;
788 break;
789 case RTIOQUEUEOP_WRITE:
790 pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
791 pSqe->uOpc.u32KrnlRwFlags = 0;
792 break;
793 case RTIOQUEUEOP_SYNC:
794 pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
795 pSqe->uOpc.u32FsyncFlags = 0;
796 break;
797 default:
798 AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
799 VERR_INVALID_PARAMETER);
800 }
801
802 pThis->idxSqTail++;
803 pThis->cSqesToCommit++;
804 return VINF_SUCCESS;
805}
806
807
808/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
809static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
810{
811 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
812 RT_NOREF(pThis, pcReqsCommitted);
813
814 ASMWriteFence();
815 ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
816 ASMWriteFence();
817
818 int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
819 if (RT_SUCCESS(rc))
820 {
821 *pcReqsCommitted = pThis->cSqesToCommit;
822 pThis->cSqesToCommit = 0;
823 }
824
825 return rc;
826}
827
828
/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait}
 *
 * Gathers completion events, blocking on the registered eventfd until at
 * least cMinWait events were collected, an error occurred or the wait was
 * interrupted via pfnEvtWaitWakeup (VERR_INTERRUPTED).
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    int rc = VINF_SUCCESS;
    uint32_t cCEvtSeen = 0;

    RT_NOREF(fFlags);

    /*
     * Check the completion queue first for any completed events which might save us a
     * context switch later on.
     */
    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);

    /* Keep blocking until the requested minimum was gathered or an error/interruption occurred. */
    while (   cCEvtSeen < cMinWait
           && RT_SUCCESS(rc))
    {
        /*
         * We can employ a blocking read on the event file descriptor, it will return
         * either when woken up externally or when there are completion events pending
         * (the eventfd registered with the ring in QueueInit is signalled on completions;
         * a successful read() also resets its counter, see eventfd(2)).
         */
        uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
        ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
        if (rcLnx == sizeof(uCnt))
        {
            /* Harvest whatever arrived, appending after the events already gathered. */
            uint32_t cCEvtThisSeen = 0;
            rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
            cCEvtSeen += cCEvtThisSeen;

            /* Whether we got woken up externally (flag set by pfnEvtWaitWakeup). */
            if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
                rc = VERR_INTERRUPTED;
        }
        else if (rcLnx == -1)
            rc = RTErrConvertFromErrno(errno);
        else
            AssertMsgFailed(("Unexpected read() -> 0\n"));
    }

    /* Events gathered so far are reported even when rc indicates a failure/interruption. */
    *pcCEvt = cCEvtSeen;
    return rc;
}
873
874
875/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
876static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
877{
878 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
879 int rc = VINF_SUCCESS;
880
881 if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
882 {
883 const uint64_t uValAdd = 1;
884 ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
885
886 Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
887 if (rcLnx == -1)
888 rc = RTErrConvertFromErrno(errno);
889 }
890
891 return rc;
892}
893
894
/**
 * Async file I/O queue provider virtual method table (io_uring backend).
 */
RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
{
    /** uVersion */
    RTIOQUEUEPROVVTABLE_VERSION,
    /** pszId */
    "LnxIoURingFile",
    /** cbIoQueueProv */
    sizeof(RTIOQUEUEPROVINT),
    /** enmHnd - only file handles are supported by this provider. */
    RTHANDLETYPE_FILE,
    /** fFlags */
    0,
    /** pfnIsSupported */
    rtIoQueueLnxIoURingFileProv_IsSupported,
    /** pfnQueueInit */
    rtIoQueueLnxIoURingFileProv_QueueInit,
    /** pfnQueueDestroy */
    rtIoQueueLnxIoURingFileProv_QueueDestroy,
    /** pfnHandleRegister */
    rtIoQueueLnxIoURingFileProv_HandleRegister,
    /** pfnHandleDeregister */
    rtIoQueueLnxIoURingFileProv_HandleDeregister,
    /** pfnReqPrepare */
    rtIoQueueLnxIoURingFileProv_ReqPrepare,
    /** pfnReqPrepareSg - scatter/gather requests not implemented yet. */
    NULL,
    /** pfnCommit */
    rtIoQueueLnxIoURingFileProv_Commit,
    /** pfnEvtWait */
    rtIoQueueLnxIoURingFileProv_EvtWait,
    /** pfnEvtWaitWakeup */
    rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
    /** uEndMarker */
    RTIOQUEUEPROVVTABLE_VERSION
};
933
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette