1 | /* $Id: ioqueue-iouringfile-provider.cpp 79953 2019-07-24 11:24:43Z vboxsync $ */
|
---|
2 | /** @file
|
---|
3 | * IPRT - I/O queue, Linux io_uring interface I/O file provider.
|
---|
4 | */
|
---|
5 |
|
---|
6 | /*
|
---|
7 | * Copyright (C) 2019 Oracle Corporation
|
---|
8 | *
|
---|
9 | * This file is part of VirtualBox Open Source Edition (OSE), as
|
---|
10 | * available from http://www.virtualbox.org. This file is free software;
|
---|
11 | * you can redistribute it and/or modify it under the terms of the GNU
|
---|
12 | * General Public License (GPL) as published by the Free Software
|
---|
13 | * Foundation, in version 2 as it comes in the "COPYING" file of the
|
---|
14 | * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
|
---|
15 | * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
|
---|
16 | *
|
---|
17 | * The contents of this file may alternatively be used under the terms
|
---|
18 | * of the Common Development and Distribution License Version 1.0
|
---|
19 | * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
|
---|
20 | * VirtualBox OSE distribution, in which case the provisions of the
|
---|
21 | * CDDL are applicable instead of those of the GPL.
|
---|
22 | *
|
---|
23 | * You may elect to license modified versions of this file under the
|
---|
24 | * terms and conditions of either the GPL or the CDDL or both.
|
---|
25 | */
|
---|
26 |
|
---|
27 | /** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
|
---|
28 | * @internal
|
---|
29 | *
|
---|
30 | * The io_uring interface is the most recent interface added to the Linux kernel
|
---|
31 | * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
|
---|
32 | * thus not available on most systems as of writing this backend (July 2019).
|
---|
33 | * It supersedes the old async I/O interface and does away with some restrictions like
|
---|
34 | * having to disable caching for the file.
|
---|
35 | * The interface is centered around a submission and completion queue to queue multiple new
|
---|
36 | * requests for the kernel to process and get notified about completions to reduce the amount
|
---|
37 | * of context switches to an absolute minimum. It also offers advanced features like
|
---|
38 | * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
|
---|
39 | * even more.
|
---|
40 | *
|
---|
41 | * The first implementation will only make use of the basic features and more advanced features
|
---|
42 | * will be added later.
|
---|
43 | * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
|
---|
44 | * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
|
---|
45 | * while still keeping a consistent platform independent API which allows efficient implementations on
|
---|
46 | * other hosts when they come up.
|
---|
47 | *
|
---|
48 | * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
|
---|
49 | * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
|
---|
50 | * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
|
---|
51 | * * http://kernel.dk/io_uring.pdf
|
---|
52 | * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
|
---|
53 | */
|
---|
54 |
|
---|
55 |
|
---|
56 | /*********************************************************************************************************************************
|
---|
57 | * Header Files *
|
---|
58 | *********************************************************************************************************************************/
|
---|
59 | #define LOG_GROUP RTLOGGROUP_IOQUEUE
|
---|
60 | #include <iprt/ioqueue.h>
|
---|
61 |
|
---|
62 | #include <iprt/assertcompile.h>
|
---|
63 | #include <iprt/asm.h>
|
---|
64 | #include <iprt/errcore.h>
|
---|
65 | #include <iprt/file.h>
|
---|
66 | #include <iprt/log.h>
|
---|
67 | #include <iprt/mem.h>
|
---|
68 | #include <iprt/string.h>
|
---|
69 |
|
---|
70 | #include <sys/mman.h>
|
---|
71 | #include <unistd.h>
|
---|
72 | #include <sys/syscall.h>
|
---|
73 | #include <errno.h>
|
---|
74 | #include <signal.h>
|
---|
75 |
|
---|
76 | #include "internal/ioqueue.h"
|
---|
77 |
|
---|
78 |
|
---|
79 | /*********************************************************************************************************************************
|
---|
80 | * Defined Constants And Macros *
|
---|
81 | *********************************************************************************************************************************/
|
---|
82 |
|
---|
/** Syscall number used to invoke io_uring_setup(). */
#define LNX_IOURING_SYSCALL_SETUP       425
/** Syscall number used to invoke io_uring_enter(). */
#define LNX_IOURING_SYSCALL_ENTER       426
/** Syscall number used to invoke io_uring_register(). */
#define LNX_IOURING_SYSCALL_REGISTER    427
89 |
|
---|
90 |
|
---|
91 | /*********************************************************************************************************************************
|
---|
92 | * Structures and Typedefs *
|
---|
93 | *********************************************************************************************************************************/
|
---|
94 |
|
---|
95 | /**
|
---|
96 | * Linux io_uring completion event.
|
---|
97 | */
|
---|
98 | typedef struct LNXIOURINGCQE
|
---|
99 | {
|
---|
100 | /** Opaque user data associated with the completed request. */
|
---|
101 | uint64_t u64User;
|
---|
102 | /** The status code of the request. */
|
---|
103 | int32_t rcLnx;
|
---|
104 | /** Some flags which are not used as of now. */
|
---|
105 | uint32_t fFlags;
|
---|
106 | } LNXIOURINGCQE;
|
---|
107 | AssertCompileSize(LNXIOURINGCQE, 16);
|
---|
108 | /** Pointer to a Linux io_uring completion event. */
|
---|
109 | typedef LNXIOURINGCQE *PLNXIOURINGCQE;
|
---|
110 | /** Pointer to a constant linux io_uring completion event. */
|
---|
111 | typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
|
---|
112 |
|
---|
113 |
|
---|
114 | /**
|
---|
115 | * Linux io_uring submission queue entry.
|
---|
116 | */
|
---|
117 | typedef struct LNXIOURINGSQE
|
---|
118 | {
|
---|
119 | /** The opcode for the request. */
|
---|
120 | uint8_t u8Opc;
|
---|
121 | /** Common flags for the request. */
|
---|
122 | uint8_t u8Flags;
|
---|
123 | /** Assigned I/O priority. */
|
---|
124 | uint16_t u16IoPrio;
|
---|
125 | /** The file descriptor the request is for. */
|
---|
126 | int32_t i32Fd;
|
---|
127 | /** The start offset into the file for the request. */
|
---|
128 | uint64_t u64OffStart;
|
---|
129 | /** Buffer pointer or Pointer to io vector array depending on opcode. */
|
---|
130 | uint64_t u64AddrBufIoVec;
|
---|
131 | /** Size of the buffer in bytes or number of io vectors. */
|
---|
132 | uint32_t u32BufIoVecSz;
|
---|
133 | /** Opcode dependent data. */
|
---|
134 | union
|
---|
135 | {
|
---|
136 | /** Flags for read/write requests. */
|
---|
137 | uint32_t u32KrnlRwFlags;
|
---|
138 | /** Flags for fsync() like requests. */
|
---|
139 | uint32_t u32FsyncFlags;
|
---|
140 | /** Flags for poll() like requests. */
|
---|
141 | uint16_t u16PollFlags;
|
---|
142 | /** Flags for sync_file_range() like requests. */
|
---|
143 | uint32_t u32SyncFileRangeFlags;
|
---|
144 | /** Flags for requests requiring a msg structure. */
|
---|
145 | uint32_t u32MsgFlags;
|
---|
146 | } uOpc;
|
---|
147 | /** Opaque user data associated with the request and returned durign completion. */
|
---|
148 | uint64_t u64User;
|
---|
149 | /** Request type dependent data. */
|
---|
150 | union
|
---|
151 | {
|
---|
152 | /** Fixed buffer index if indicated by the request flags. */
|
---|
153 | uint16_t u16FixedBufIdx;
|
---|
154 | /** Padding to align the structure to 64 bytes. */
|
---|
155 | uint64_t au64Padding[3];
|
---|
156 | } uReq;
|
---|
157 | } LNXIOURINGSQE;
|
---|
158 | AssertCompileSize(LNXIOURINGSQE, 64);
|
---|
159 | /** Pointer to a Linux io_uring submission queue entry. */
|
---|
160 | typedef LNXIOURINGSQE *PLNXIOURINGSQE;
|
---|
161 | /** Pointer to a constant Linux io_uring submission queue entry. */
|
---|
162 | typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
|
---|
163 |
|
---|
164 |
|
---|
165 | /**
|
---|
166 | * Linux u_ioring SQ ring header structure to maintain the queue.
|
---|
167 | */
|
---|
168 | typedef struct LNXIOURINGSQ
|
---|
169 | {
|
---|
170 | /** The current head position to fill in new requests. */
|
---|
171 | uint32_t u32OffHead;
|
---|
172 | /** The current tail position the kernel starts processing from. */
|
---|
173 | uint32_t u32OffTail;
|
---|
174 | /** The mask for the head and tail counters to apply to retrieve the index. */
|
---|
175 | uint32_t u32OffRingMask;
|
---|
176 | /** Number of entries in the SQ ring. */
|
---|
177 | uint32_t u32OffRingEntries;
|
---|
178 | /** Flags set asychronously by the kernel. */
|
---|
179 | uint32_t u32OffFlags;
|
---|
180 | /** Counter of dropped requests. */
|
---|
181 | uint32_t u32OffDroppedReqs;
|
---|
182 | /** Offset where to find the array of SQ entries. */
|
---|
183 | uint32_t u32OffArray;
|
---|
184 | /** Reserved. */
|
---|
185 | uint32_t u32Rsvd0;
|
---|
186 | /** Reserved. */
|
---|
187 | uint64_t u64Rsvd1;
|
---|
188 | } LNXIOURINGSQ;
|
---|
189 | AssertCompileSize(LNXIOURINGSQ, 40);
|
---|
190 | /** Pointer to a Linux u_ioring SQ ring header. */
|
---|
191 | typedef LNXIOURINGSQ *PLNXIOURINGSQ;
|
---|
192 | /** Pointer to a constant Linux u_ioring SQ ring header. */
|
---|
193 | typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
|
---|
194 |
|
---|
195 |
|
---|
196 | /**
|
---|
197 | * Linux io_uring CQ ring header structure to maintain the queue.
|
---|
198 | */
|
---|
199 | typedef struct LNXIOURINGCQ
|
---|
200 | {
|
---|
201 | /** The current head position the kernel modifies when completion events happen. */
|
---|
202 | uint32_t u32OffHead;
|
---|
203 | /** The current tail position to read completion events from. */
|
---|
204 | uint32_t u32OffTail;
|
---|
205 | /** The mask for the head and tail counters to apply to retrieve the index. */
|
---|
206 | uint32_t u32OffRingMask;
|
---|
207 | /** Number of entries in the CQ ring. */
|
---|
208 | uint32_t u32OffRingEntries;
|
---|
209 | /** Number of CQ overflows happened. */
|
---|
210 | uint32_t u32OffOverflowCnt;
|
---|
211 | /** */
|
---|
212 | uint32_t u32OffCqes;
|
---|
213 | /** Reserved. */
|
---|
214 | uint64_t au64Rsvd0[2];
|
---|
215 | } LNXIOURINGCQ;
|
---|
216 | AssertCompileSize(LNXIOURINGCQ, 40);
|
---|
217 | /** Pointer to a Linux u_ioring CQ ring header. */
|
---|
218 | typedef LNXIOURINGCQ *PLNXIOURINGCQ;
|
---|
219 | /** Pointer to a constant Linux u_ioring CQ ring header. */
|
---|
220 | typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
|
---|
221 |
|
---|
222 |
|
---|
223 | /**
|
---|
224 | * Linux io_uring parameters passed to io_uring_setup().
|
---|
225 | */
|
---|
226 | typedef struct LNXIOURINGPARAMS
|
---|
227 | {
|
---|
228 | /** Number of SQ entries requested, must be power of 2. */
|
---|
229 | uint32_t u32SqEntriesCnt;
|
---|
230 | /** Number of CQ entries requested, must be power of 2. */
|
---|
231 | uint32_t u32CqEntriesCnt;
|
---|
232 | /** Flags for the ring, , see LNX_IOURING_SETUP_F_*. */
|
---|
233 | uint32_t u32Flags;
|
---|
234 | /** Affinity of the kernel side SQ polling thread if enabled. */
|
---|
235 | uint32_t u32SqPollCpu;
|
---|
236 | /** Milliseconds after the kernel side SQ polling thread goes to sleep
|
---|
237 | * if there is are no requests to process. */
|
---|
238 | uint32_t u32SqPollIdleMs;
|
---|
239 | /** Reserved. */
|
---|
240 | uint32_t au32Rsvd0[5];
|
---|
241 | /** Offsets returned for the submission queue. */
|
---|
242 | LNXIOURINGSQ SqOffsets;
|
---|
243 | /** Offsets returned for the completion queue. */
|
---|
244 | LNXIOURINGCQ CqOffsets;
|
---|
245 | } LNXIOURINGPARAMS;
|
---|
246 | /** Pointer to Linux io_uring parameters. */
|
---|
247 | typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
|
---|
248 | /** Pointer to constant Linux io_uring parameters. */
|
---|
249 | typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
|
---|
250 |
|
---|
251 |
|
---|
/**
 * @name Opcodes defined for LNXIOURINGSQE::u8Opc.
 * @{ */
/** No-op request, useful for profiling the interface. */
#define LNX_IOURING_OPC_NOP                 0
/** Request behaving like preadv(). */
#define LNX_IOURING_OPC_READV               1
/** Request behaving like pwritev(). */
#define LNX_IOURING_OPC_WRITEV              2
/** Request behaving like fsync(). */
#define LNX_IOURING_OPC_FSYNC               3
/** Read into a fixed, previously set up buffer. */
#define LNX_IOURING_OPC_READ_FIXED          4
/** Write from a fixed, previously set up buffer. */
#define LNX_IOURING_OPC_WRITE_FIXED         5
/** Adds a file descriptor to the pollset. */
#define LNX_IOURING_OPC_POLL_ADD            6
/** Removes a file descriptor from the pollset. */
#define LNX_IOURING_OPC_POLL_REMOVE         7
/** Request behaving like sync_file_range(). */
#define LNX_IOURING_OPC_SYNC_FILE_RANGE     8
/** Request behaving like sendmsg(). */
#define LNX_IOURING_OPC_SENDMSG             9
/** Request behaving like recvmsg(). */
#define LNX_IOURING_OPC_RECVMSG             10
/** @} */
278 |
|
---|
279 |
|
---|
/**
 * @name Extra flags accepted by LNX_IOURING_OPC_FSYNC requests.
 * @{ */
/** Selects data-sync behaviour for the request.
 * @note (review) Presumably this gives fdatasync() like semantics, confirm
 *       against the io_uring documentation. */
#define LNX_IOURING_OPC_FSYNC_DATASYNC      RT_BIT_32(0)
/** @} */
286 |
|
---|
287 |
|
---|
/**
 * @name Flags accepted by the LNX_IOURING_SYSCALL_SETUP syscall.
 * @{ */
/** Use polling for the I/O context. */
#define LNX_IOURING_SETUP_F_IOPOLL          RT_BIT_32(0)
/** Let the kernel poll the submission queue. */
#define LNX_IOURING_SETUP_F_SQPOLL          RT_BIT_32(1)
/** Apply the given CPU affinity to the kernel thread polling the submission queue. */
#define LNX_IOURING_SETUP_F_SQAFF           RT_BIT_32(2)
/** @} */
298 |
|
---|
299 |
|
---|
/**
 * @name Flags defined for LNXIOURINGSQE::u8Flags.
 * @{ */
/** The file descriptor of the request was registered beforehand. */
#define LNX_IOURING_SQE_F_FIXED_FILE        RT_BIT(0)
/** All active requests must complete before the request carrying this flag is issued. */
#define LNX_IOURING_SQE_F_IO_DRAIN          RT_BIT(1)
/** Link the request carrying this flag to the following one. */
#define LNX_IOURING_SQE_F_IO_LINK           RT_BIT(2)
/** @} */
310 |
|
---|
311 |
|
---|
/**
 * @name Magic mmap() offsets selecting which io_uring region gets mapped.
 * @{ */
/** Maps the submission queue ring. */
#define LNX_IOURING_MMAP_OFF_SQ             UINT64_C(0)
/** Maps the completion queue ring. */
#define LNX_IOURING_MMAP_OFF_CQ             UINT64_C(0x8000000)
/** Maps the array of submission queue entries. */
#define LNX_IOURING_MMAP_OFF_SQES           UINT64_C(0x10000000)
/** @} */
322 |
|
---|
323 |
|
---|
/**
 * @name Flags found in the SQ ring structure.
 * @{ */
/** The kernel thread requires an io_uring_enter() wakeup before it continues processing requests. */
#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP   RT_BIT_32(0)
/** @} */
330 |
|
---|
331 |
|
---|
/**
 * @name Flags accepted by the LNX_IOURING_SYSCALL_ENTER syscall.
 * @{ */
/** Fetch completion events for the completion queue. */
#define LNX_IOURING_ENTER_F_GETEVENTS       RT_BIT_32(0)
/** Wake up the suspended kernel thread processing the requests. */
#define LNX_IOURING_ENTER_F_SQ_WAKEUP       RT_BIT_32(1)
/** @} */
340 |
|
---|
341 |
|
---|
/**
 * @name Operation codes accepted by the LNX_IOURING_SYSCALL_REGISTER syscall.
 * @{ */
/** Registers a fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER       0
/** Unregisters a previously registered fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER     1
/** Registers a fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER         2
/** Unregisters a previously registered fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER       3
/** Registers an eventfd associated with the I/O ring. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER       4
/** Unregisters a previously registered eventfd. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER     5
/** @} */
358 |
|
---|
359 |
|
---|
/**
 * Userspace view of the submission queue ring.
 *
 * @note The pointer members reference memory which is shared with the kernel,
 *       which is why they are volatile.
 */
typedef struct RTIOQUEUESQ
{
    /** Where the head counter lives. */
    volatile uint32_t           *pidxHead;
    /** Where the tail counter lives. */
    volatile uint32_t           *pidxTail;
    /** Mask turning a counter value into a ring index. */
    uint32_t                    fRingMask;
    /** Size of the ring in entries. */
    uint32_t                    cEntries;
    /** Where the global flags live. */
    volatile uint32_t           *pfFlags;
    /** The indirection array used to index the actual SQ entries. */
    volatile uint32_t           *paidxSqes;
} RTIOQUEUESQ;
381 |
|
---|
382 |
|
---|
383 | /**
|
---|
384 | * CQ ring structure.
|
---|
385 | *
|
---|
386 | * @note Some members of this structure point to memory shared with the kernel,
|
---|
387 | * hence the volatile keyword.
|
---|
388 | */
|
---|
389 | typedef struct RTIOQUEUECQ
|
---|
390 | {
|
---|
391 | /** Pointer to the head counter. */
|
---|
392 | volatile uint32_t *pidxHead;
|
---|
393 | /** Pointer to the tail counter. */
|
---|
394 | volatile uint32_t *pidxTail;
|
---|
395 | /** Mask to apply for the counters to get to the index. */
|
---|
396 | uint32_t fRingMask;
|
---|
397 | /** Number of entries in the ring. */
|
---|
398 | uint32_t cEntries;
|
---|
399 | /** Pointer to the completion entry ring. */
|
---|
400 | volatile LNXIOURINGCQE *paCqes;
|
---|
401 | } RTIOQUEUECQ;
|
---|
402 |
|
---|
403 |
|
---|
404 | /**
|
---|
405 | * Internal I/O queue provider instance data.
|
---|
406 | */
|
---|
407 | typedef struct RTIOQUEUEPROVINT
|
---|
408 | {
|
---|
409 | /** The io_uring file descriptor. */
|
---|
410 | int iFdIoCtx;
|
---|
411 | /** The submission queue. */
|
---|
412 | RTIOQUEUESQ Sq;
|
---|
413 | /** The completion queue. */
|
---|
414 | RTIOQUEUECQ Cq;
|
---|
415 | /** Pointer to the mapped SQES entries. */
|
---|
416 | PLNXIOURINGSQE paSqes;
|
---|
417 | /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
|
---|
418 | void *pvMMapSqRing;
|
---|
419 | /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
|
---|
420 | void *pvMMapCqRing;
|
---|
421 | /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
|
---|
422 | void *pvMMapSqes;
|
---|
423 | /** Size of the mapped SQ ring, used for unmapping. */
|
---|
424 | size_t cbMMapSqRing;
|
---|
425 | /** Size of the mapped CQ ring, used for unmapping. */
|
---|
426 | size_t cbMMapCqRing;
|
---|
427 | /** Size of the mapped SQ entries array, used for unmapping. */
|
---|
428 | size_t cbMMapSqes;
|
---|
429 | } RTIOQUEUEPROVINT;
|
---|
430 | /** Pointer to the internal I/O queue provider instance data. */
|
---|
431 | typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
|
---|
432 |
|
---|
433 |
|
---|
434 | /*********************************************************************************************************************************
|
---|
435 | * Internal Functions *
|
---|
436 | *********************************************************************************************************************************/
|
---|
437 |
|
---|
438 | /**
|
---|
439 | * Syscall wrapper for io_uring_setup().
|
---|
440 | *
|
---|
441 | * @returns IPRT status code.
|
---|
442 | * @param cEntries Number of entries for submission and completion queues.
|
---|
443 | * @param pParams Additional parameters for the I/O ring and updated return values
|
---|
444 | * on success.
|
---|
445 | * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
|
---|
446 | */
|
---|
447 | DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
|
---|
448 | {
|
---|
449 | int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
|
---|
450 | if (RT_UNLIKELY(rcLnx == -1))
|
---|
451 | return RTErrConvertFromErrno(errno);
|
---|
452 |
|
---|
453 | *piFdIoCtx = rcLnx;
|
---|
454 | return VINF_SUCCESS;
|
---|
455 | }
|
---|
456 |
|
---|
457 |
|
---|
458 | /**
|
---|
459 | * Syscall wrapper for io_uring_enter().
|
---|
460 | *
|
---|
461 | * @returns IPRT status code.
|
---|
462 | * @param iFdIoCtx The I/O ring file descriptor.
|
---|
463 | * @param cToSubmit Maximum number of requests waiting for processing.
|
---|
464 | * @param cMinComplete Minimum number of completion events to accumulate before returning.
|
---|
465 | * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
|
---|
466 | */
|
---|
467 | DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
|
---|
468 | uint32_t fFlags)
|
---|
469 | {
|
---|
470 | int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
|
---|
471 | NULL, 0);
|
---|
472 | if (RT_UNLIKELY(rcLnx == -1))
|
---|
473 | return RTErrConvertFromErrno(errno);
|
---|
474 |
|
---|
475 | return VINF_SUCCESS;
|
---|
476 | }
|
---|
477 |
|
---|
478 |
|
---|
479 | /**
|
---|
480 | * Syscall wrapper for io_uring_register().
|
---|
481 | *
|
---|
482 | * @returns IPRT status code.
|
---|
483 | * @param iFdIoCtx The I/O ring file descriptor.
|
---|
484 | * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
|
---|
485 | * @param pvArg Opaque arguments.
|
---|
486 | * @param cArgs Number of arguments.
|
---|
487 | */
|
---|
488 | DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
|
---|
489 | uint32_t cArgs)
|
---|
490 | {
|
---|
491 | int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
|
---|
492 | if (RT_UNLIKELY(rcLnx == -1))
|
---|
493 | return RTErrConvertFromErrno(errno);
|
---|
494 |
|
---|
495 | return VINF_SUCCESS;
|
---|
496 | }
|
---|
497 |
|
---|
498 |
|
---|
499 | /**
|
---|
500 | * mmap() wrapper for the common bits and returning an IPRT status code.
|
---|
501 | *
|
---|
502 | * @returns IPRT status code.
|
---|
503 | * @param iFdIoCtx The I/O ring file descriptor.
|
---|
504 | * @param offMmap The mmap() offset.
|
---|
505 | * @param cbMmap How much to map.
|
---|
506 | * @param ppv Where to store the pointer to the mapping on success.
|
---|
507 | */
|
---|
508 | DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
|
---|
509 | {
|
---|
510 | void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
|
---|
511 | if (pv != MAP_FAILED)
|
---|
512 | {
|
---|
513 | *ppv = pv;
|
---|
514 | return VINF_SUCCESS;
|
---|
515 | }
|
---|
516 |
|
---|
517 | return RTErrConvertFromErrno(errno);
|
---|
518 | }
|
---|
519 |
|
---|
520 |
|
---|
521 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
|
---|
522 | static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
|
---|
523 | {
|
---|
524 | /*
|
---|
525 | * Try to create a simple I/O ring and close it again.
|
---|
526 | * The common code/public API already checked for the proper handle type.
|
---|
527 | */
|
---|
528 | int iFdIoCtx = 0;
|
---|
529 | LNXIOURINGPARAMS Params;
|
---|
530 | RT_ZERO(Params);
|
---|
531 |
|
---|
532 | int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
|
---|
533 | if (RT_SUCCESS(rc))
|
---|
534 | {
|
---|
535 | int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
|
---|
536 | return true;
|
---|
537 | }
|
---|
538 |
|
---|
539 | return false;
|
---|
540 | }
|
---|
541 |
|
---|
542 |
|
---|
543 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
|
---|
544 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
|
---|
545 | size_t cSqEntries, size_t cCqEntries)
|
---|
546 | {
|
---|
547 | RT_NOREF(fFlags, cCqEntries);
|
---|
548 |
|
---|
549 | PRTIOQUEUEPROVINT pThis = hIoQueueProv;
|
---|
550 | LNXIOURINGPARAMS Params;
|
---|
551 | RT_ZERO(Params);
|
---|
552 |
|
---|
553 | int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
|
---|
554 | if (RT_SUCCESS(rc))
|
---|
555 | {
|
---|
556 | /* Map the rings into userspace. */
|
---|
557 | pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
|
---|
558 | pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
|
---|
559 | pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
|
---|
560 |
|
---|
561 | rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
|
---|
562 | if (RT_SUCCESS(rc))
|
---|
563 | {
|
---|
564 | rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
|
---|
565 | if (RT_SUCCESS(rc))
|
---|
566 | {
|
---|
567 | rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
|
---|
568 | if (RT_SUCCESS(rc))
|
---|
569 | {
|
---|
570 | uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
|
---|
571 |
|
---|
572 | pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
|
---|
573 | pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
|
---|
574 | pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
|
---|
575 | pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
|
---|
576 | pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
|
---|
577 | pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
|
---|
578 |
|
---|
579 | pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
|
---|
580 |
|
---|
581 | pbTmp = (uint8_t *)pThis->pvMMapCqRing;
|
---|
582 |
|
---|
583 | pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
|
---|
584 | pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
|
---|
585 | pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
|
---|
586 | pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
|
---|
587 | pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
|
---|
588 | return VINF_SUCCESS;
|
---|
589 | }
|
---|
590 |
|
---|
591 | munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
|
---|
592 | }
|
---|
593 |
|
---|
594 | munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
|
---|
595 | }
|
---|
596 | }
|
---|
597 |
|
---|
598 | return rc;
|
---|
599 | }
|
---|
600 |
|
---|
601 |
|
---|
602 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
|
---|
603 | static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
|
---|
604 | {
|
---|
605 | PRTIOQUEUEPROVINT pThis = hIoQueueProv;
|
---|
606 |
|
---|
607 | int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
|
---|
608 | rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
|
---|
609 | rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
|
---|
610 | close(pThis->iFdIoCtx);
|
---|
611 |
|
---|
612 | RT_ZERO(pThis);
|
---|
613 | }
|
---|
614 |
|
---|
615 |
|
---|
616 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
|
---|
617 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
|
---|
618 | {
|
---|
619 | RT_NOREF(hIoQueueProv, pHandle);
|
---|
620 | /** @todo Add support for fixed file sets later. */
|
---|
621 | return VINF_SUCCESS;
|
---|
622 | }
|
---|
623 |
|
---|
624 |
|
---|
625 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
|
---|
626 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
|
---|
627 | {
|
---|
628 | RT_NOREF(hIoQueueProv, pHandle);
|
---|
629 | /** @todo Add support for fixed file sets later. */
|
---|
630 | return VINF_SUCCESS;
|
---|
631 | }
|
---|
632 |
|
---|
633 |
|
---|
634 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
|
---|
635 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
|
---|
636 | uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
|
---|
637 | void *pvUser)
|
---|
638 | {
|
---|
639 | PRTIOQUEUEPROVINT pThis = hIoQueueProv;
|
---|
640 | RT_NOREF(pThis, pHandle, enmOp, off, pvBuf, cbBuf, fReqFlags, pvUser);
|
---|
641 |
|
---|
642 | return VERR_NOT_IMPLEMENTED;
|
---|
643 | }
|
---|
644 |
|
---|
645 |
|
---|
646 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
|
---|
647 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
|
---|
648 | {
|
---|
649 | PRTIOQUEUEPROVINT pThis = hIoQueueProv;
|
---|
650 | RT_NOREF(pThis, pcReqsCommitted);
|
---|
651 |
|
---|
652 | return VERR_NOT_IMPLEMENTED;
|
---|
653 | }
|
---|
654 |
|
---|
655 |
|
---|
656 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait} */
|
---|
657 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
|
---|
658 | uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
|
---|
659 | {
|
---|
660 | PRTIOQUEUEPROVINT pThis = hIoQueueProv;
|
---|
661 | RT_NOREF(pThis, paCEvt, cCEvt, cMinWait, pcCEvt, fFlags);
|
---|
662 |
|
---|
663 | return VERR_NOT_IMPLEMENTED;;
|
---|
664 | }
|
---|
665 |
|
---|
666 |
|
---|
667 | /** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
|
---|
668 | static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
|
---|
669 | {
|
---|
670 | PRTIOQUEUEPROVINT pThis = hIoQueueProv;
|
---|
671 | RT_NOREF(pThis);
|
---|
672 |
|
---|
673 | return VERR_NOT_IMPLEMENTED;
|
---|
674 | }
|
---|
675 |
|
---|
676 |
|
---|
677 | /**
|
---|
678 | * Async file I/O queue provider virtual method table.
|
---|
679 | */
|
---|
680 | RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
|
---|
681 | {
|
---|
682 | /** uVersion */
|
---|
683 | RTIOQUEUEPROVVTABLE_VERSION,
|
---|
684 | /** pszId */
|
---|
685 | "LnxIoURingFile",
|
---|
686 | /** cbIoQueueProv */
|
---|
687 | sizeof(RTIOQUEUEPROVINT),
|
---|
688 | /** enmHnd */
|
---|
689 | RTHANDLETYPE_FILE,
|
---|
690 | /** fFlags */
|
---|
691 | 0,
|
---|
692 | /** pfnIsSupported */
|
---|
693 | rtIoQueueLnxIoURingFileProv_IsSupported,
|
---|
694 | /** pfnQueueInit */
|
---|
695 | rtIoQueueLnxIoURingFileProv_QueueInit,
|
---|
696 | /** pfnQueueDestroy */
|
---|
697 | rtIoQueueLnxIoURingFileProv_QueueDestroy,
|
---|
698 | /** pfnHandleRegister */
|
---|
699 | rtIoQueueLnxIoURingFileProv_HandleRegister,
|
---|
700 | /** pfnHandleDeregister */
|
---|
701 | rtIoQueueLnxIoURingFileProv_HandleDeregister,
|
---|
702 | /** pfnReqPrepare */
|
---|
703 | rtIoQueueLnxIoURingFileProv_ReqPrepare,
|
---|
704 | /** pfnReqPrepareSg */
|
---|
705 | NULL,
|
---|
706 | /** pfnCommit */
|
---|
707 | rtIoQueueLnxIoURingFileProv_Commit,
|
---|
708 | /** pfnEvtWait */
|
---|
709 | rtIoQueueLnxIoURingFileProv_EvtWait,
|
---|
710 | /** pfnEvtWaitWakeup */
|
---|
711 | rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
|
---|
712 | /** uEndMarker */
|
---|
713 | RTIOQUEUEPROVVTABLE_VERSION
|
---|
714 | };
|
---|
715 |
|
---|