VirtualBox

source: vbox/trunk/src/VBox/Additions/linux/sharedfolders/regops.c@ 77853

Last change on this file since 77853 was 77853, checked in by vboxsync, 6 years ago

linux/vboxsf: Implemented the copy_file_range method and fixed a recently introduced bug in vbsf_reg_write_iter_locking where vbsf_reg_write_sync_page_cache was called with the wrong range length. bugref:9172

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 106.7 KB
Line 
1/* $Id: regops.c 77853 2019-03-22 20:54:14Z vboxsync $ */
2/** @file
3 * vboxsf - VBox Linux Shared Folders VFS, regular file inode and file operations.
4 */
5
6/*
7 * Copyright (C) 2006-2019 Oracle Corporation
8 *
9 * Permission is hereby granted, free of charge, to any person
10 * obtaining a copy of this software and associated documentation
11 * files (the "Software"), to deal in the Software without
12 * restriction, including without limitation the rights to use,
13 * copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following
16 * conditions:
17 *
18 * The above copyright notice and this permission notice shall be
19 * included in all copies or substantial portions of the Software.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
23 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
25 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
26 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 * OTHER DEALINGS IN THE SOFTWARE.
29 */
30
31
32/*********************************************************************************************************************************
33* Header Files *
34*********************************************************************************************************************************/
35#include "vfsmod.h"
36#include <linux/uio.h>
37#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 32)
38# include <linux/aio.h> /* struct kiocb before 4.1 */
39#endif
40#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
41# include <linux/buffer_head.h>
42#endif
43#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31) \
44 && LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
45# include <linux/writeback.h>
46#endif
47#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23) \
48 && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
49# include <linux/splice.h>
50#endif
51#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
52# include <linux/swap.h> /* for mark_page_accessed */
53#endif
54#include <iprt/err.h>
55
56#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18)
57# define SEEK_END 2
58#endif
59
60#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
61# define iter_is_iovec(a_pIter) ( !((a_pIter)->type & (ITER_KVEC | ITER_BVEC)) )
62#endif
63
64#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)
65# define vm_fault_t int
66#endif
67
68#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 5, 20)
69# define pgoff_t unsigned long
70#endif
71
72
73/*********************************************************************************************************************************
74* Structures and Typedefs *
75*********************************************************************************************************************************/
76#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
77/** Used by vbsf_iter_lock_pages() to keep the first page of the next segment. */
78struct vbsf_iter_stash {
79 struct page *pPage;
80 size_t off;
81 size_t cb;
82# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
83 size_t offFromEnd;
84 struct iov_iter Copy;
85# endif
86};
87#endif /* >= 3.16.0 */
88/** Initializer for struct vbsf_iter_stash. */
89#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
90# define VBSF_ITER_STASH_INITIALIZER { NULL, 0 }
91#else
92# define VBSF_ITER_STASH_INITIALIZER { NULL, 0, ~(size_t)0 }
93#endif
94
95
96
97/**
98 * Called when an inode is released to unlink all handles that might impossibly
99 * still be associated with it.
100 *
101 * @param pInodeInfo The inode which handles to drop.
102 */
103void vbsf_handle_drop_chain(struct vbsf_inode_info *pInodeInfo)
104{
105 struct vbsf_handle *pCur, *pNext;
106 unsigned long fSavedFlags;
107 SFLOGFLOW(("vbsf_handle_drop_chain: %p\n", pInodeInfo));
108 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
109
110 RTListForEachSafe(&pInodeInfo->HandleList, pCur, pNext, struct vbsf_handle, Entry) {
111 AssertMsg( (pCur->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST))
112 == (VBSF_HANDLE_F_MAGIC | VBSF_HANDLE_F_ON_LIST), ("%p %#x\n", pCur, pCur->fFlags));
113 pCur->fFlags |= VBSF_HANDLE_F_ON_LIST;
114 RTListNodeRemove(&pCur->Entry);
115 }
116
117 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
118}
119
120
121/**
122 * Locates a handle that matches all the flags in @a fFlags.
123 *
124 * @returns Pointer to handle on success (retained), use vbsf_handle_release() to
125 * release it. NULL if no suitable handle was found.
126 * @param pInodeInfo The inode info to search.
127 * @param fFlagsSet The flags that must be set.
128 * @param fFlagsClear The flags that must be clear.
129 */
130struct vbsf_handle *vbsf_handle_find(struct vbsf_inode_info *pInodeInfo, uint32_t fFlagsSet, uint32_t fFlagsClear)
131{
132 struct vbsf_handle *pCur;
133 unsigned long fSavedFlags;
134 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
135
136 RTListForEach(&pInodeInfo->HandleList, pCur, struct vbsf_handle, Entry) {
137 AssertMsg( (pCur->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST))
138 == (VBSF_HANDLE_F_MAGIC | VBSF_HANDLE_F_ON_LIST), ("%p %#x\n", pCur, pCur->fFlags));
139 if ((pCur->fFlags & (fFlagsSet | fFlagsClear)) == fFlagsSet) {
140 uint32_t cRefs = ASMAtomicIncU32(&pCur->cRefs);
141 if (cRefs > 1) {
142 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
143 SFLOGFLOW(("vbsf_handle_find: returns %p\n", pCur));
144 return pCur;
145 }
146 /* Oops, already being closed (safe as it's only ever increased here). */
147 ASMAtomicDecU32(&pCur->cRefs);
148 }
149 }
150
151 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
152 SFLOGFLOW(("vbsf_handle_find: returns NULL!\n"));
153 return NULL;
154}
155
156
157/**
158 * Slow worker for vbsf_handle_release() that does the freeing.
159 *
160 * @returns 0 (ref count).
161 * @param pHandle The handle to release.
162 * @param sf_g The info structure for the shared folder associated
163 * with the handle.
164 * @param pszCaller The caller name (for logging failures).
165 */
166uint32_t vbsf_handle_release_slow(struct vbsf_handle *pHandle, struct vbsf_super_info *sf_g, const char *pszCaller)
167{
168 int rc;
169 unsigned long fSavedFlags;
170
171 SFLOGFLOW(("vbsf_handle_release_slow: %p (%s)\n", pHandle, pszCaller));
172
173 /*
174 * Remove from the list.
175 */
176 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
177
178 AssertMsg((pHandle->fFlags & VBSF_HANDLE_F_MAGIC_MASK) == VBSF_HANDLE_F_MAGIC, ("%p %#x\n", pHandle, pHandle->fFlags));
179 Assert(pHandle->pInodeInfo);
180 Assert(pHandle->pInodeInfo && pHandle->pInodeInfo->u32Magic == SF_INODE_INFO_MAGIC);
181
182 if (pHandle->fFlags & VBSF_HANDLE_F_ON_LIST) {
183 pHandle->fFlags &= ~VBSF_HANDLE_F_ON_LIST;
184 RTListNodeRemove(&pHandle->Entry);
185 }
186
187 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
188
189 /*
190 * Actually destroy it.
191 */
192 rc = VbglR0SfHostReqCloseSimple(sf_g->map.root, pHandle->hHost);
193 if (RT_FAILURE(rc))
194 LogFunc(("Caller %s: VbglR0SfHostReqCloseSimple %#RX64 failed with rc=%Rrc\n", pszCaller, pHandle->hHost, rc));
195 pHandle->hHost = SHFL_HANDLE_NIL;
196 pHandle->fFlags = VBSF_HANDLE_F_MAGIC_DEAD;
197 kfree(pHandle);
198 return 0;
199}
200
201
202/**
203 * Appends a handle to a handle list.
204 *
205 * @param pInodeInfo The inode to add it to.
206 * @param pHandle The handle to add.
207 */
208void vbsf_handle_append(struct vbsf_inode_info *pInodeInfo, struct vbsf_handle *pHandle)
209{
210#ifdef VBOX_STRICT
211 struct vbsf_handle *pCur;
212#endif
213 unsigned long fSavedFlags;
214
215 SFLOGFLOW(("vbsf_handle_append: %p (to %p)\n", pHandle, pInodeInfo));
216 AssertMsg((pHandle->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST)) == VBSF_HANDLE_F_MAGIC,
217 ("%p %#x\n", pHandle, pHandle->fFlags));
218 Assert(pInodeInfo->u32Magic == SF_INODE_INFO_MAGIC);
219
220 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
221
222 AssertMsg((pHandle->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST)) == VBSF_HANDLE_F_MAGIC,
223 ("%p %#x\n", pHandle, pHandle->fFlags));
224#ifdef VBOX_STRICT
225 RTListForEach(&pInodeInfo->HandleList, pCur, struct vbsf_handle, Entry) {
226 Assert(pCur != pHandle);
227 AssertMsg( (pCur->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST))
228 == (VBSF_HANDLE_F_MAGIC | VBSF_HANDLE_F_ON_LIST), ("%p %#x\n", pCur, pCur->fFlags));
229 }
230 pHandle->pInodeInfo = pInodeInfo;
231#endif
232
233 pHandle->fFlags |= VBSF_HANDLE_F_ON_LIST;
234 RTListAppend(&pInodeInfo->HandleList, &pHandle->Entry);
235
236 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
237}
238
239
240#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23) \
241 && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
242
243/*
244 * Some pipe stuff we apparently need for 2.6.23-2.6.30.
245 */
246
247static void vbsf_free_pipebuf(struct page *kpage)
248{
249 kunmap(kpage);
250 __free_pages(kpage, 0);
251}
252
253static void *vbsf_pipe_buf_map(struct pipe_inode_info *pipe, struct pipe_buffer *pipe_buf, int atomic)
254{
255 return 0;
256}
257
258static void vbsf_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *pipe_buf)
259{
260}
261
262static void vbsf_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *pipe_buf, void *map_data)
263{
264}
265
266static int vbsf_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *pipe_buf)
267{
268 return 0;
269}
270
271static void vbsf_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *pipe_buf)
272{
273 vbsf_free_pipebuf(pipe_buf->page);
274}
275
276static int vbsf_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *pipe_buf)
277{
278 return 0;
279}
280
281static struct pipe_buf_operations vbsf_pipe_buf_ops = {
282 .can_merge = 0,
283 .map = vbsf_pipe_buf_map,
284 .unmap = vbsf_pipe_buf_unmap,
285 .confirm = vbsf_pipe_buf_confirm,
286 .release = vbsf_pipe_buf_release,
287 .steal = vbsf_pipe_buf_steal,
288 .get = vbsf_pipe_buf_get,
289};
290
291static int vbsf_reg_read_aux(const char *caller, struct vbsf_super_info *sf_g, struct vbsf_reg_info *sf_r,
292 void *buf, uint32_t *nread, uint64_t pos)
293{
294 int rc = VbglR0SfRead(&g_SfClient, &sf_g->map, sf_r->Handle.hHost, pos, nread, buf, false /* already locked? */ );
295 if (RT_FAILURE(rc)) {
296 LogFunc(("VbglR0SfRead failed. caller=%s, rc=%Rrc\n", caller,
297 rc));
298 return -EPROTO;
299 }
300 return 0;
301}
302
303# define LOCK_PIPE(pipe) do { if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); } while (0)
304# define UNLOCK_PIPE(pipe) do { if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); } while (0)
305
306ssize_t vbsf_splice_read(struct file *in, loff_t * poffset, struct pipe_inode_info *pipe, size_t len, unsigned int flags)
307{
308 size_t bytes_remaining = len;
309 loff_t orig_offset = *poffset;
310 loff_t offset = orig_offset;
311 struct inode *inode = VBSF_GET_F_DENTRY(in)->d_inode;
312 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
313 struct vbsf_reg_info *sf_r = in->private_data;
314 ssize_t retval;
315 struct page *kpage = 0;
316 size_t nsent = 0;
317
318/** @todo rig up a FsPerf test for this code */
319 TRACE();
320 if (!S_ISREG(inode->i_mode)) {
321 LogFunc(("read from non regular file %d\n", inode->i_mode));
322 return -EINVAL;
323 }
324 if (!len) {
325 return 0;
326 }
327
328 LOCK_PIPE(pipe);
329
330 uint32_t req_size = 0;
331 while (bytes_remaining > 0) {
332 kpage = alloc_page(GFP_KERNEL);
333 if (unlikely(kpage == NULL)) {
334 UNLOCK_PIPE(pipe);
335 return -ENOMEM;
336 }
337 req_size = 0;
338 uint32_t nread = req_size = (uint32_t) min(bytes_remaining, (size_t) PAGE_SIZE);
339 uint32_t chunk = 0;
340 void *kbuf = kmap(kpage);
341 while (chunk < req_size) {
342 retval = vbsf_reg_read_aux(__func__, sf_g, sf_r, kbuf + chunk, &nread, offset);
343 if (retval < 0)
344 goto err;
345 if (nread == 0)
346 break;
347 chunk += nread;
348 offset += nread;
349 nread = req_size - chunk;
350 }
351 if (!pipe->readers) {
352 send_sig(SIGPIPE, current, 0);
353 retval = -EPIPE;
354 goto err;
355 }
356 if (pipe->nrbufs < PIPE_BUFFERS) {
357 struct pipe_buffer *pipebuf = pipe->bufs + ((pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1));
358 pipebuf->page = kpage;
359 pipebuf->ops = &vbsf_pipe_buf_ops;
360 pipebuf->len = req_size;
361 pipebuf->offset = 0;
362 pipebuf->private = 0;
363 pipebuf->flags = 0;
364 pipe->nrbufs++;
365 nsent += req_size;
366 bytes_remaining -= req_size;
367 if (signal_pending(current))
368 break;
369 } else { /* pipe full */
370
371 if (flags & SPLICE_F_NONBLOCK) {
372 retval = -EAGAIN;
373 goto err;
374 }
375 vbsf_free_pipebuf(kpage);
376 break;
377 }
378 }
379 UNLOCK_PIPE(pipe);
380 if (!nsent && signal_pending(current))
381 return -ERESTARTSYS;
382 *poffset += nsent;
383 return offset - orig_offset;
384
385 err:
386 UNLOCK_PIPE(pipe);
387 vbsf_free_pipebuf(kpage);
388 return retval;
389}
390
391#endif /* 2.6.23 <= LINUX_VERSION_CODE < 2.6.31 */
392
393/**
394 * Helper for deciding wheter we should do a read via the page cache or not.
395 *
396 * By default we will only use the page cache if there is a writable memory
397 * mapping of the file with a chance that it may have modified any of the pages
398 * already.
399 */
400DECLINLINE(bool) vbsf_should_use_cached_read(struct file *file, struct address_space *mapping, struct vbsf_super_info *sf_g)
401{
402 return mapping
403 && mapping->nrpages > 0
404 && mapping_writably_mapped(mapping)
405 && !(file->f_flags & O_DIRECT)
406 && 1 /** @todo make this behaviour configurable at mount time (sf_g) */;
407}
408
409/** Wrapper around put_page / page_cache_release. */
410DECLINLINE(void) vbsf_put_page(struct page *pPage)
411{
412#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
413 put_page(pPage);
414#else
415 page_cache_release(pPage);
416#endif
417}
418
419
420/** Wrapper around get_page / page_cache_get. */
421DECLINLINE(void) vbsf_get_page(struct page *pPage)
422{
423#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
424 get_page(pPage);
425#else
426 page_cache_get(pPage);
427#endif
428}
429
430
431/** Companion to vbsf_lock_user_pages(). */
432DECLINLINE(void) vbsf_unlock_user_pages(struct page **papPages, size_t cPages, bool fSetDirty, bool fLockPgHack)
433{
434 /* We don't mark kernel pages dirty: */
435 if (fLockPgHack)
436 fSetDirty = false;
437
438 while (cPages-- > 0)
439 {
440 struct page *pPage = papPages[cPages];
441 if (fSetDirty && !PageReserved(pPage))
442 SetPageDirty(pPage);
443 vbsf_put_page(pPage);
444 }
445}
446
447
448/**
449 * Worker for vbsf_lock_user_pages_failed_check_kernel() and
450 * vbsf_iter_lock_pages().
451 */
452static int vbsf_lock_kernel_pages(uint8_t *pbStart, bool fWrite, size_t cPages, struct page **papPages)
453{
454 uintptr_t const uPtrFrom = (uintptr_t)pbStart;
455 uintptr_t const uPtrLast = (uPtrFrom & ~(uintptr_t)PAGE_OFFSET_MASK) + (cPages << PAGE_SHIFT) - 1;
456 uint8_t *pbPage = (uint8_t *)uPtrLast;
457 size_t iPage = cPages;
458
459 /*
460 * Touch the pages first (paranoia^2).
461 */
462 if (fWrite) {
463 uint8_t volatile *pbProbe = (uint8_t volatile *)uPtrFrom;
464 while (iPage-- > 0) {
465 *pbProbe = *pbProbe;
466 pbProbe += PAGE_SIZE;
467 }
468 } else {
469 uint8_t const *pbProbe = (uint8_t const *)uPtrFrom;
470 while (iPage-- > 0) {
471 ASMProbeReadByte(pbProbe);
472 pbProbe += PAGE_SIZE;
473 }
474 }
475
476 /*
477 * Get the pages.
478 * Note! Fixes here probably applies to rtR0MemObjNativeLockKernel as well.
479 */
480 iPage = cPages;
481 if ( uPtrFrom >= (unsigned long)__va(0)
482 && uPtrLast < (unsigned long)high_memory) {
483 /* The physical page mapping area: */
484 while (iPage-- > 0) {
485 struct page *pPage = papPages[iPage] = virt_to_page(pbPage);
486 vbsf_get_page(pPage);
487 pbPage -= PAGE_SIZE;
488 }
489 } else {
490 /* This is vmalloc or some such thing, so go thru page tables: */
491 while (iPage-- > 0) {
492 struct page *pPage = rtR0MemObjLinuxVirtToPage(pbPage);
493 if (pPage) {
494 papPages[iPage] = pPage;
495 vbsf_get_page(pPage);
496 pbPage -= PAGE_SIZE;
497 } else {
498 while (++iPage < cPages) {
499 pPage = papPages[iPage];
500 vbsf_put_page(pPage);
501 }
502 return -EFAULT;
503 }
504 }
505 }
506 return 0;
507}
508
509
510/**
511 * Catches kernel_read() and kernel_write() calls and works around them.
512 *
513 * The file_operations::read and file_operations::write callbacks supposedly
514 * hands us the user buffers to read into and write out of. To allow the kernel
515 * to read and write without allocating buffers in userland, they kernel_read()
516 * and kernel_write() increases the user space address limit before calling us
517 * so that copyin/copyout won't reject it. Our problem is that get_user_pages()
518 * works on the userspace address space structures and will not be fooled by an
519 * increased addr_limit.
520 *
521 * This code tries to detect this situation and fake get_user_lock() for the
522 * kernel buffer.
523 */
524static int vbsf_lock_user_pages_failed_check_kernel(uintptr_t uPtrFrom, size_t cPages, bool fWrite, int rcFailed,
525 struct page **papPages, bool *pfLockPgHack)
526{
527 /*
528 * Check that this is valid user memory that is actually in the kernel range.
529 */
530#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
531 if ( access_ok((void *)uPtrFrom, cPages << PAGE_SHIFT)
532 && uPtrFrom >= USER_DS.seg)
533#else
534 if ( access_ok(fWrite ? VERIFY_WRITE : VERIFY_READ, (void *)uPtrFrom, cPages << PAGE_SHIFT)
535 && uPtrFrom >= USER_DS.seg)
536#endif
537 {
538 int rc = vbsf_lock_kernel_pages((uint8_t *)uPtrFrom, fWrite, cPages, papPages);
539 if (rc == 0) {
540 *pfLockPgHack = true;
541 return 0;
542 }
543 }
544
545 return rcFailed;
546}
547
548
549/** Wrapper around get_user_pages. */
550DECLINLINE(int) vbsf_lock_user_pages(uintptr_t uPtrFrom, size_t cPages, bool fWrite, struct page **papPages, bool *pfLockPgHack)
551{
552# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
553 ssize_t cPagesLocked = get_user_pages_unlocked(uPtrFrom, cPages, papPages,
554 fWrite ? FOLL_WRITE | FOLL_FORCE : FOLL_FORCE);
555# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
556 ssize_t cPagesLocked = get_user_pages_unlocked(uPtrFrom, cPages, fWrite, 1 /*force*/, papPages);
557# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
558 ssize_t cPagesLocked = get_user_pages_unlocked(current, current->mm, uPtrFrom, cPages, fWrite, 1 /*force*/, papPages);
559# else
560 struct task_struct *pTask = current;
561 size_t cPagesLocked;
562 down_read(&pTask->mm->mmap_sem);
563 cPagesLocked = get_user_pages(current, current->mm, uPtrFrom, cPages, fWrite, 1 /*force*/, papPages, NULL);
564 up_read(&pTask->mm->mmap_sem);
565# endif
566 *pfLockPgHack = false;
567 if (cPagesLocked == cPages)
568 return 0;
569
570 /*
571 * It failed.
572 */
573 if (cPagesLocked < 0)
574 return vbsf_lock_user_pages_failed_check_kernel(uPtrFrom, cPages, fWrite, (int)cPagesLocked, papPages, pfLockPgHack);
575
576 vbsf_unlock_user_pages(papPages, cPagesLocked, false /*fSetDirty*/, false /*fLockPgHack*/);
577
578 /* We could use uPtrFrom + cPagesLocked to get the correct status here... */
579 return -EFAULT;
580}
581
582
583/**
584 * Read function used when accessing files that are memory mapped.
585 *
586 * We read from the page cache here to present the a cohertent picture of the
587 * the file content.
588 */
589static ssize_t vbsf_reg_read_mapped(struct file *file, char /*__user*/ *buf, size_t size, loff_t *off)
590{
591#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
592 struct iovec iov = { .iov_base = buf, .iov_len = size };
593 struct iov_iter iter;
594 struct kiocb kiocb;
595 ssize_t cbRet;
596
597 init_sync_kiocb(&kiocb, file);
598 kiocb.ki_pos = *off;
599 iov_iter_init(&iter, READ, &iov, 1, size);
600
601 cbRet = generic_file_read_iter(&kiocb, &iter);
602
603 *off = kiocb.ki_pos;
604 return cbRet;
605
606#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
607 struct iovec iov = { .iov_base = buf, .iov_len = size };
608 struct kiocb kiocb;
609 ssize_t cbRet;
610
611 init_sync_kiocb(&kiocb, file);
612 kiocb.ki_pos = *off;
613
614 cbRet = generic_file_aio_read(&kiocb, &iov, 1, *off);
615 if (cbRet == -EIOCBQUEUED)
616 cbRet = wait_on_sync_kiocb(&kiocb);
617
618 *off = kiocb.ki_pos;
619 return cbRet;
620
621#else /* 2.6.18 or earlier: */
622 return generic_file_read(file, buf, size, off);
623#endif
624}
625
626
627/**
628 * Fallback case of vbsf_reg_read() that locks the user buffers and let the host
629 * write directly to them.
630 */
631static ssize_t vbsf_reg_read_locking(struct file *file, char /*__user*/ *buf, size_t size, loff_t *off,
632 struct vbsf_super_info *sf_g, struct vbsf_reg_info *sf_r)
633{
634 /*
635 * Lock pages and execute the read, taking care not to pass the host
636 * more than it can handle in one go or more than we care to allocate
637 * page arrays for. The latter limit is set at just short of 32KB due
638 * to how the physical heap works.
639 */
640 struct page *apPagesStack[16];
641 struct page **papPages = &apPagesStack[0];
642 struct page **papPagesFree = NULL;
643 VBOXSFREADPGLSTREQ *pReq;
644 loff_t offFile = *off;
645 ssize_t cbRet = -ENOMEM;
646 size_t cPages = (((uintptr_t)buf & PAGE_OFFSET_MASK) + size + PAGE_OFFSET_MASK) >> PAGE_SHIFT;
647 size_t cMaxPages = RT_MIN(RT_MAX(sf_g->cMaxIoPages, 1), cPages);
648 bool fLockPgHack;
649
650 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
651 while (!pReq && cMaxPages > 4) {
652 cMaxPages /= 2;
653 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
654 }
655 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
656 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(sizeof(papPages[0])), GFP_KERNEL);
657 if (pReq && papPages) {
658 cbRet = 0;
659 for (;;) {
660 /*
661 * Figure out how much to process now and lock the user pages.
662 */
663 int rc;
664 size_t cbChunk = (uintptr_t)buf & PAGE_OFFSET_MASK;
665 pReq->PgLst.offFirstPage = (uint16_t)cbChunk;
666 cPages = RT_ALIGN_Z(cbChunk + size, PAGE_SIZE) >> PAGE_SHIFT;
667 if (cPages <= cMaxPages)
668 cbChunk = size;
669 else {
670 cPages = cMaxPages;
671 cbChunk = (cMaxPages << PAGE_SHIFT) - cbChunk;
672 }
673
674 rc = vbsf_lock_user_pages((uintptr_t)buf, cPages, true /*fWrite*/, papPages, &fLockPgHack);
675 if (rc == 0) {
676 size_t iPage = cPages;
677 while (iPage-- > 0)
678 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
679 } else {
680 cbRet = rc;
681 break;
682 }
683
684 /*
685 * Issue the request and unlock the pages.
686 */
687 rc = VbglR0SfHostReqReadPgLst(sf_g->map.root, pReq, sf_r->Handle.hHost, offFile, cbChunk, cPages);
688
689 vbsf_unlock_user_pages(papPages, cPages, true /*fSetDirty*/, fLockPgHack);
690
691 if (RT_SUCCESS(rc)) {
692 /*
693 * Success, advance position and buffer.
694 */
695 uint32_t cbActual = pReq->Parms.cb32Read.u.value32;
696 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
697 cbRet += cbActual;
698 offFile += cbActual;
699 buf = (uint8_t *)buf + cbActual;
700 size -= cbActual;
701
702 /*
703 * Are we done already? If so commit the new file offset.
704 */
705 if (!size || cbActual < cbChunk) {
706 *off = offFile;
707 break;
708 }
709 } else if (rc == VERR_NO_MEMORY && cMaxPages > 4) {
710 /*
711 * The host probably doesn't have enough heap to handle the
712 * request, reduce the page count and retry.
713 */
714 cMaxPages /= 4;
715 Assert(cMaxPages > 0);
716 } else {
717 /*
718 * If we've successfully read stuff, return it rather than
719 * the error. (Not sure if this is such a great idea...)
720 */
721 if (cbRet > 0)
722 *off = offFile;
723 else
724 cbRet = -EPROTO;
725 break;
726 }
727 }
728 }
729 if (papPagesFree)
730 kfree(papPages);
731 if (pReq)
732 VbglR0PhysHeapFree(pReq);
733 return cbRet;
734}
735
736
737/**
738 * Read from a regular file.
739 *
740 * @param file the file
741 * @param buf the buffer
742 * @param size length of the buffer
743 * @param off offset within the file (in/out).
744 * @returns the number of read bytes on success, Linux error code otherwise
745 */
746static ssize_t vbsf_reg_read(struct file *file, char /*__user*/ *buf, size_t size, loff_t *off)
747{
748 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
749 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
750 struct vbsf_reg_info *sf_r = file->private_data;
751 struct address_space *mapping = inode->i_mapping;
752
753 SFLOGFLOW(("vbsf_reg_read: inode=%p file=%p buf=%p size=%#zx off=%#llx\n", inode, file, buf, size, *off));
754
755 if (!S_ISREG(inode->i_mode)) {
756 LogFunc(("read from non regular file %d\n", inode->i_mode));
757 return -EINVAL;
758 }
759
760 /** @todo XXX Check read permission according to inode->i_mode! */
761
762 if (!size)
763 return 0;
764
765 /*
766 * If there is a mapping and O_DIRECT isn't in effect, we must at a
767 * heed dirty pages in the mapping and read from them. For simplicity
768 * though, we just do page cache reading when there are writable
769 * mappings around with any kind of pages loaded.
770 */
771 if (vbsf_should_use_cached_read(file, mapping, sf_g))
772 return vbsf_reg_read_mapped(file, buf, size, off);
773
774 /*
775 * For small requests, try use an embedded buffer provided we get a heap block
776 * that does not cross page boundraries (see host code).
777 */
778 if (size <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) /* see allocator */) {
779 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) + size;
780 VBOXSFREADEMBEDDEDREQ *pReq = (VBOXSFREADEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
781 if (pReq) {
782 if ((PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
783 ssize_t cbRet;
784 int vrc = VbglR0SfHostReqReadEmbedded(sf_g->map.root, pReq, sf_r->Handle.hHost, *off, (uint32_t)size);
785 if (RT_SUCCESS(vrc)) {
786 cbRet = pReq->Parms.cb32Read.u.value32;
787 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
788 if (copy_to_user(buf, pReq->abData, cbRet) == 0)
789 *off += cbRet;
790 else
791 cbRet = -EFAULT;
792 } else
793 cbRet = -EPROTO;
794 VbglR0PhysHeapFree(pReq);
795 return cbRet;
796 }
797 VbglR0PhysHeapFree(pReq);
798 }
799 }
800
801#if 0 /* Turns out this is slightly slower than locking the pages even for 4KB reads (4.19/amd64). */
802 /*
803 * For medium sized requests try use a bounce buffer.
804 */
805 if (size <= _64K /** @todo make this configurable? */) {
806 void *pvBounce = kmalloc(size, GFP_KERNEL);
807 if (pvBounce) {
808 VBOXSFREADPGLSTREQ *pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
809 if (pReq) {
810 ssize_t cbRet;
811 int vrc = VbglR0SfHostReqReadContig(sf_g->map.root, pReq, sf_r->Handle.hHost, *off,
812 (uint32_t)size, pvBounce, virt_to_phys(pvBounce));
813 if (RT_SUCCESS(vrc)) {
814 cbRet = pReq->Parms.cb32Read.u.value32;
815 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
816 if (copy_to_user(buf, pvBounce, cbRet) == 0)
817 *off += cbRet;
818 else
819 cbRet = -EFAULT;
820 } else
821 cbRet = -EPROTO;
822 VbglR0PhysHeapFree(pReq);
823 kfree(pvBounce);
824 return cbRet;
825 }
826 kfree(pvBounce);
827 }
828 }
829#endif
830
831 return vbsf_reg_read_locking(file, buf, size, off, sf_g, sf_r);
832}
833
834
835/**
836 * Helper the synchronizes the page cache content with something we just wrote
837 * to the host.
838 */
839void vbsf_reg_write_sync_page_cache(struct address_space *mapping, loff_t offFile, uint32_t cbRange,
840 uint8_t const *pbSrcBuf, struct page **papSrcPages, uint32_t offSrcPage, size_t cSrcPages)
841{
842 Assert(offSrcPage < PAGE_SIZE);
843 if (mapping && mapping->nrpages > 0) {
844 /*
845 * Work the pages in the write range.
846 */
847 while (cbRange > 0) {
848 /*
849 * Lookup the page at offFile. We're fine if there aren't
850 * any there. We're skip if it's dirty or is being written
851 * back, at least for now.
852 */
853 size_t const offDstPage = offFile & PAGE_OFFSET_MASK;
854 size_t const cbToCopy = RT_MIN(PAGE_SIZE - offDstPage, cbRange);
855 pgoff_t const idxPage = offFile >> PAGE_SHIFT;
856 struct page *pDstPage = find_lock_page(mapping, idxPage);
857 if (pDstPage) {
858 if ( pDstPage->mapping == mapping /* ignore if re-purposed (paranoia) */
859 && pDstPage->index == idxPage
860 && !PageDirty(pDstPage) /* ignore if dirty */
861 && !PageWriteback(pDstPage) /* ignore if being written back */ ) {
862 /*
863 * Map the page and do the copying.
864 */
865 uint8_t *pbDst = (uint8_t *)kmap(pDstPage);
866 if (pbSrcBuf)
867 memcpy(&pbDst[offDstPage], pbSrcBuf, cbToCopy);
868 else {
869 uint32_t const cbSrc0 = PAGE_SIZE - offSrcPage;
870 uint8_t const *pbSrc = (uint8_t const *)kmap(papSrcPages[0]);
871 AssertMsg(cSrcPages >= 1, ("offFile=%#llx cbRange=%#zx cbToCopy=%#zx\n", offFile, cbRange, cbToCopy));
872 memcpy(&pbDst[offDstPage], &pbSrc[offSrcPage], RT_MIN(cbToCopy, cbSrc0));
873 kunmap(papSrcPages[0]);
874 if (cbToCopy > cbSrc0) {
875 AssertMsg(cSrcPages >= 2, ("offFile=%#llx cbRange=%#zx cbToCopy=%#zx\n", offFile, cbRange, cbToCopy));
876 pbSrc = (uint8_t const *)kmap(papSrcPages[1]);
877 memcpy(&pbDst[offDstPage + cbSrc0], pbSrc, cbToCopy - cbSrc0);
878 kunmap(papSrcPages[1]);
879 }
880 }
881 kunmap(pDstPage);
882 flush_dcache_page(pDstPage);
883 if (cbToCopy == PAGE_SIZE)
884 SetPageUptodate(pDstPage);
885# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
886 mark_page_accessed(pDstPage);
887# endif
888 } else
889 SFLOGFLOW(("vbsf_reg_write_sync_page_cache: Skipping page %p: mapping=%p (vs %p) writeback=%d offset=%#lx (vs%#lx)\n",
890 pDstPage, pDstPage->mapping, mapping, PageWriteback(pDstPage), pDstPage->index, idxPage));
891 unlock_page(pDstPage);
892 vbsf_put_page(pDstPage);
893 }
894
895 /*
896 * Advance.
897 */
898 if (pbSrcBuf)
899 pbSrcBuf += cbToCopy;
900 else
901 {
902 offSrcPage += cbToCopy;
903 Assert(offSrcPage < PAGE_SIZE * 2);
904 if (offSrcPage >= PAGE_SIZE) {
905 offSrcPage &= PAGE_OFFSET_MASK;
906 papSrcPages++;
907# ifdef VBOX_STRICT
908 Assert(cSrcPages > 0);
909 cSrcPages--;
910# endif
911 }
912 }
913 offFile += cbToCopy;
914 cbRange -= cbToCopy;
915 }
916 }
917 RT_NOREF(cSrcPages);
918}
919
920
921/**
922 * Fallback case of vbsf_reg_write() that locks the user buffers and let the host
923 * write directly to them.
924 */
925static ssize_t vbsf_reg_write_locking(struct file *file, const char /*__user*/ *buf, size_t size, loff_t *off, loff_t offFile,
926 struct inode *inode, struct vbsf_inode_info *sf_i,
927 struct vbsf_super_info *sf_g, struct vbsf_reg_info *sf_r)
928{
929 /*
930 * Lock pages and execute the write, taking care not to pass the host
931 * more than it can handle in one go or more than we care to allocate
932 * page arrays for. The latter limit is set at just short of 32KB due
933 * to how the physical heap works.
934 */
935 struct page *apPagesStack[16];
936 struct page **papPages = &apPagesStack[0];
937 struct page **papPagesFree = NULL;
938 VBOXSFWRITEPGLSTREQ *pReq;
939 ssize_t cbRet = -ENOMEM;
940 size_t cPages = (((uintptr_t)buf & PAGE_OFFSET_MASK) + size + PAGE_OFFSET_MASK) >> PAGE_SHIFT;
941 size_t cMaxPages = RT_MIN(RT_MAX(sf_g->cMaxIoPages, 1), cPages);
942 bool fLockPgHack;
943
944 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
945 while (!pReq && cMaxPages > 4) {
946 cMaxPages /= 2;
947 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
948 }
949 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
950 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(sizeof(papPages[0])), GFP_KERNEL);
951 if (pReq && papPages) {
952 cbRet = 0;
953 for (;;) {
954 /*
955 * Figure out how much to process now and lock the user pages.
956 */
957 int rc;
958 size_t cbChunk = (uintptr_t)buf & PAGE_OFFSET_MASK;
959 pReq->PgLst.offFirstPage = (uint16_t)cbChunk;
960 cPages = RT_ALIGN_Z(cbChunk + size, PAGE_SIZE) >> PAGE_SHIFT;
961 if (cPages <= cMaxPages)
962 cbChunk = size;
963 else {
964 cPages = cMaxPages;
965 cbChunk = (cMaxPages << PAGE_SHIFT) - cbChunk;
966 }
967
968 rc = vbsf_lock_user_pages((uintptr_t)buf, cPages, false /*fWrite*/, papPages, &fLockPgHack);
969 if (rc == 0) {
970 size_t iPage = cPages;
971 while (iPage-- > 0)
972 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
973 } else {
974 cbRet = rc;
975 break;
976 }
977
978 /*
979 * Issue the request and unlock the pages.
980 */
981 rc = VbglR0SfHostReqWritePgLst(sf_g->map.root, pReq, sf_r->Handle.hHost, offFile, cbChunk, cPages);
982 if (RT_SUCCESS(rc)) {
983 /*
984 * Success, advance position and buffer.
985 */
986 uint32_t cbActual = pReq->Parms.cb32Write.u.value32;
987 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
988
989 vbsf_reg_write_sync_page_cache(inode->i_mapping, offFile, cbActual, NULL /*pbKrnlBuf*/,
990 papPages, (uintptr_t)buf & PAGE_OFFSET_MASK, cPages);
991 vbsf_unlock_user_pages(papPages, cPages, false /*fSetDirty*/, fLockPgHack);
992
993 cbRet += cbActual;
994 offFile += cbActual;
995 buf = (uint8_t *)buf + cbActual;
996 size -= cbActual;
997 if (offFile > i_size_read(inode))
998 i_size_write(inode, offFile);
999 sf_i->force_restat = 1; /* mtime (and size) may have changed */
1000
1001 /*
1002 * Are we done already? If so commit the new file offset.
1003 */
1004 if (!size || cbActual < cbChunk) {
1005 *off = offFile;
1006 break;
1007 }
1008 } else {
1009 vbsf_unlock_user_pages(papPages, cPages, false /*fSetDirty*/, fLockPgHack);
1010 if (rc == VERR_NO_MEMORY && cMaxPages > 4) {
1011 /*
1012 * The host probably doesn't have enough heap to handle the
1013 * request, reduce the page count and retry.
1014 */
1015 cMaxPages /= 4;
1016 Assert(cMaxPages > 0);
1017 } else {
1018 /*
1019 * If we've successfully written stuff, return it rather than
1020 * the error. (Not sure if this is such a great idea...)
1021 */
1022 if (cbRet > 0)
1023 *off = offFile;
1024 else
1025 cbRet = -EPROTO;
1026 break;
1027 }
1028 }
1029 }
1030 }
1031 if (papPagesFree)
1032 kfree(papPages);
1033 if (pReq)
1034 VbglR0PhysHeapFree(pReq);
1035 return cbRet;
1036}
1037
1038
1039/**
1040 * Write to a regular file.
1041 *
1042 * @param file the file
1043 * @param buf the buffer
1044 * @param size length of the buffer
1045 * @param off offset within the file
1046 * @returns the number of written bytes on success, Linux error code otherwise
1047 */
1048static ssize_t vbsf_reg_write(struct file *file, const char *buf, size_t size, loff_t * off)
1049{
1050 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
1051 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
1052 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
1053 struct vbsf_reg_info *sf_r = file->private_data;
1054 struct address_space *mapping = inode->i_mapping;
1055 loff_t pos;
1056
1057 SFLOGFLOW(("vbsf_reg_write: inode=%p file=%p buf=%p size=%#zx off=%#llx\n", inode, file, buf, size, *off));
1058 BUG_ON(!sf_i);
1059 BUG_ON(!sf_g);
1060 BUG_ON(!sf_r);
1061 AssertReturn(S_ISREG(inode->i_mode), -EINVAL);
1062
1063 pos = *off;
1064 /** @todo This should be handled by the host, it returning the new file
1065 * offset when appending. We may have an outdated i_size value here! */
1066 if (file->f_flags & O_APPEND)
1067 pos = i_size_read(inode);
1068
1069 /** @todo XXX Check write permission according to inode->i_mode! */
1070
1071 if (!size) {
1072 if (file->f_flags & O_APPEND) /** @todo check if this is the consensus behavior... */
1073 *off = pos;
1074 return 0;
1075 }
1076
1077 /*
1078 * If there are active writable mappings, coordinate with any
1079 * pending writes via those.
1080 */
1081 if ( mapping
1082 && mapping->nrpages > 0
1083 && mapping_writably_mapped(mapping)) {
1084#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
1085 int err = filemap_fdatawait_range(mapping, pos, pos + size - 1);
1086 if (err)
1087 return err;
1088#else
1089 /** @todo ... */
1090#endif
1091 }
1092
1093 /*
1094 * For small requests, try use an embedded buffer provided we get a heap block
1095 * that does not cross page boundraries (see host code).
1096 */
1097 if (size <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) /* see allocator */) {
1098 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) + size;
1099 VBOXSFWRITEEMBEDDEDREQ *pReq = (VBOXSFWRITEEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
1100 if ( pReq
1101 && (PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
1102 ssize_t cbRet;
1103 if (copy_from_user(pReq->abData, buf, size) == 0) {
1104 int vrc = VbglR0SfHostReqWriteEmbedded(sf_g->map.root, pReq, sf_r->Handle.hHost,
1105 pos, (uint32_t)size);
1106 if (RT_SUCCESS(vrc)) {
1107 cbRet = pReq->Parms.cb32Write.u.value32;
1108 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
1109 vbsf_reg_write_sync_page_cache(mapping, pos, (uint32_t)cbRet, pReq->abData,
1110 NULL /*papSrcPages*/, 0 /*offSrcPage0*/, 0 /*cSrcPages*/);
1111 pos += cbRet;
1112 *off = pos;
1113 if (pos > i_size_read(inode))
1114 i_size_write(inode, pos);
1115 } else
1116 cbRet = -EPROTO;
1117 sf_i->force_restat = 1; /* mtime (and size) may have changed */
1118 } else
1119 cbRet = -EFAULT;
1120
1121 VbglR0PhysHeapFree(pReq);
1122 return cbRet;
1123 }
1124 if (pReq)
1125 VbglR0PhysHeapFree(pReq);
1126 }
1127
1128#if 0 /* Turns out this is slightly slower than locking the pages even for 4KB reads (4.19/amd64). */
1129 /*
1130 * For medium sized requests try use a bounce buffer.
1131 */
1132 if (size <= _64K /** @todo make this configurable? */) {
1133 void *pvBounce = kmalloc(size, GFP_KERNEL);
1134 if (pvBounce) {
1135 if (copy_from_user(pvBounce, buf, size) == 0) {
1136 VBOXSFWRITEPGLSTREQ *pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
1137 if (pReq) {
1138 ssize_t cbRet;
1139 int vrc = VbglR0SfHostReqWriteContig(sf_g->map.root, pReq, sf_r->handle, pos,
1140 (uint32_t)size, pvBounce, virt_to_phys(pvBounce));
1141 if (RT_SUCCESS(vrc)) {
1142 cbRet = pReq->Parms.cb32Write.u.value32;
1143 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
1144 vbsf_reg_write_sync_page_cache(mapping, pos, (uint32_t)cbRet, (uint8_t const *)pvBounce,
1145 NULL /*papSrcPages*/, 0 /*offSrcPage0*/, 0 /*cSrcPages*/);
1146 pos += cbRet;
1147 *off = pos;
1148 if (pos > i_size_read(inode))
1149 i_size_write(inode, pos);
1150 } else
1151 cbRet = -EPROTO;
1152 sf_i->force_restat = 1; /* mtime (and size) may have changed */
1153 VbglR0PhysHeapFree(pReq);
1154 kfree(pvBounce);
1155 return cbRet;
1156 }
1157 kfree(pvBounce);
1158 } else {
1159 kfree(pvBounce);
1160 return -EFAULT;
1161 }
1162 }
1163 }
1164#endif
1165
1166 return vbsf_reg_write_locking(file, buf, size, off, pos, inode, sf_i, sf_g, sf_r);
1167}
1168
1169#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)
1170/*
1171 * Hide missing uio.h functionality in older kernsl.
1172 */
1173
1174static size_t copy_from_iter(uint8_t *pbDst, size_t cbToCopy, struct iov_iter *pSrcIter)
1175{
1176 size_t const cbTotal = cbToCopy;
1177 Assert(iov_iter_count(pSrcIter) >= cbToCopy);
1178 if (pSrcIter->type & ITER_BVEC) {
1179 while (cbToCopy > 0) {
1180 size_t const offPage = (uintptr_t)pbDst & PAGE_OFFSET_MASK;
1181 size_t const cbThisCopy = RT_MIN(PAGE_SIZE - offPage, cbToCopy);
1182 struct page *pPage = rtR0MemObjLinuxVirtToPage(pbDst);
1183 size_t cbCopied = copy_page_from_iter(pPage, offPage, cbThisCopy, pSrcIter);
1184 AssertStmt(cbCopied <= cbThisCopy, cbCopied = cbThisCopy);
1185 pbDst += cbCopied;
1186 cbToCopy -= cbCopied;
1187 if (cbCopied != cbToCopy)
1188 break;
1189 }
1190 } else {
1191 while (cbToCopy > 0) {
1192 size_t cbThisCopy = iov_iter_single_seg_count(pSrcIter);
1193 if (cbThisCopy > 0) {
1194 if (cbThisCopy > cbToCopy)
1195 cbThisCopy = cbToCopy;
1196 if (pSrcIter->type & ITER_KVEC)
1197 memcpy(pbDst, (void *)pSrcIter->iov->iov_base + pSrcIter->iov_offset, cbThisCopy);
1198 else if (!copy_from_user(pbDst, pSrcIter->iov->iov_base + pSrcIter->iov_offset, cbThisCopy))
1199 break;
1200 pbDst += cbThisCopy;
1201 cbToCopy -= cbThisCopy;
1202 }
1203 iov_iter_advance(pSrcIter, cbThisCopy);
1204 }
1205 }
1206 return cbTotal - cbToCopy;
1207}
1208
1209static size_t copy_to_iter(uint8_t const *pbSrc, size_t cbToCopy, struct iov_iter *pDstIter)
1210{
1211 size_t const cbTotal = cbToCopy;
1212 Assert(iov_iter_count(pDstIter) >= cbToCopy);
1213 if (pDstIter->type & ITER_BVEC) {
1214 while (cbToCopy > 0) {
1215 size_t const offPage = (uintptr_t)pbSrc & PAGE_OFFSET_MASK;
1216 size_t const cbThisCopy = RT_MIN(PAGE_SIZE - offPage, cbToCopy);
1217 struct page *pPage = rtR0MemObjLinuxVirtToPage((void *)pbSrc);
1218 size_t cbCopied = copy_page_to_iter(pPage, offPage, cbThisCopy, pDstIter);
1219 AssertStmt(cbCopied <= cbThisCopy, cbCopied = cbThisCopy);
1220 pbSrc += cbCopied;
1221 cbToCopy -= cbCopied;
1222 if (cbCopied != cbToCopy)
1223 break;
1224 }
1225 } else {
1226 while (cbToCopy > 0) {
1227 size_t cbThisCopy = iov_iter_single_seg_count(pDstIter);
1228 if (cbThisCopy > 0) {
1229 if (cbThisCopy > cbToCopy)
1230 cbThisCopy = cbToCopy;
1231 if (pDstIter->type & ITER_KVEC)
1232 memcpy((void *)pDstIter->iov->iov_base + pDstIter->iov_offset, pbSrc, cbThisCopy);
1233 else if (!copy_to_user(pDstIter->iov->iov_base + pDstIter->iov_offset, pbSrc, cbThisCopy)) {
1234 break;
1235 }
1236 pbSrc += cbThisCopy;
1237 cbToCopy -= cbThisCopy;
1238 }
1239 iov_iter_advance(pDstIter, cbThisCopy);
1240 }
1241 }
1242 return cbTotal - cbToCopy;
1243}
1244
1245#endif /* 3.16.0 >= linux < 3.18.0 */
1246#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
1247
1248/**
1249 * Companion to vbsf_iter_lock_pages().
1250 */
1251DECLINLINE(void) vbsf_iter_unlock_pages(struct iov_iter *iter, struct page **papPages, size_t cPages, bool fSetDirty)
1252{
1253 /* We don't mark kernel pages dirty: */
1254 if (iter->type & ITER_KVEC)
1255 fSetDirty = false;
1256
1257 while (cPages-- > 0)
1258 {
1259 struct page *pPage = papPages[cPages];
1260 if (fSetDirty && !PageReserved(pPage))
1261 SetPageDirty(pPage);
1262 vbsf_put_page(pPage);
1263 }
1264}
1265
1266
1267/**
1268 * Locks up to @a cMaxPages from the I/O vector iterator, advancing the
1269 * iterator.
1270 *
1271 * @returns 0 on success, negative errno value on failure.
1272 * @param iter The iterator to lock pages from.
1273 * @param fWrite Whether to write (true) or read (false) lock the pages.
1274 * @param pStash Where we stash peek results.
1275 * @param cMaxPages The maximum number of pages to get.
1276 * @param papPages Where to return the locked pages.
1277 * @param pcPages Where to return the number of pages.
1278 * @param poffPage0 Where to return the offset into the first page.
1279 * @param pcbChunk Where to return the number of bytes covered.
1280 */
1281static int vbsf_iter_lock_pages(struct iov_iter *iter, bool fWrite, struct vbsf_iter_stash *pStash, size_t cMaxPages,
1282 struct page **papPages, size_t *pcPages, size_t *poffPage0, size_t *pcbChunk)
1283{
1284 size_t cbChunk = 0;
1285 size_t cPages = 0;
1286 size_t offPage0 = 0;
1287 int rc = 0;
1288
1289 Assert(iov_iter_count(iter) + pStash->cb > 0);
1290 if (!(iter->type & ITER_KVEC)) {
1291 /*
1292 * Do we have a stashed page?
1293 */
1294 if (pStash->pPage) {
1295 papPages[0] = pStash->pPage;
1296 offPage0 = pStash->off;
1297 cbChunk = pStash->cb;
1298 cPages = 1;
1299 pStash->pPage = NULL;
1300 pStash->off = 0;
1301 pStash->cb = 0;
1302 if ( offPage0 + cbChunk < PAGE_SIZE
1303 || iov_iter_count(iter) == 0) {
1304 *poffPage0 = offPage0;
1305 *pcbChunk = cbChunk;
1306 *pcPages = cPages;
1307 SFLOGFLOW(("vbsf_iter_lock_pages: returns %d - cPages=%#zx offPage0=%#zx cbChunk=%zx (stashed)\n",
1308 rc, cPages, offPage0, cbChunk));
1309 return 0;
1310 }
1311 cMaxPages -= 1;
1312 SFLOG3(("vbsf_iter_lock_pages: Picked up stashed page: %#zx LB %#zx\n", offPage0, cbChunk));
1313 } else {
1314# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
1315 /*
1316 * Copy out our starting point to assist rewinding.
1317 */
1318 pStash->offFromEnd = iov_iter_count(iter);
1319 pStash->Copy = *iter;
1320# endif
1321 }
1322
1323 /*
1324 * Get pages segment by segment.
1325 */
1326 do {
1327 /*
1328 * Make a special case of the first time thru here, since that's
1329 * the most typical scenario.
1330 */
1331 ssize_t cbSegRet;
1332 if (cPages == 0) {
1333# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
1334 while (!iov_iter_single_seg_count(iter)) /* Old code didn't skip empty segments which caused EFAULTs. */
1335 iov_iter_advance(iter, 0);
1336# endif
1337 cbSegRet = iov_iter_get_pages(iter, papPages, iov_iter_count(iter), cMaxPages, &offPage0);
1338 if (cbSegRet > 0) {
1339 iov_iter_advance(iter, cbSegRet);
1340 cbChunk = (size_t)cbSegRet;
1341 cPages = RT_ALIGN_Z(offPage0 + cbSegRet, PAGE_SIZE) >> PAGE_SHIFT;
1342 cMaxPages -= cPages;
1343 SFLOG3(("vbsf_iter_lock_pages: iov_iter_get_pages -> %#zx @ %#zx; %#zx pages [first]\n", cbSegRet, offPage0, cPages));
1344 if ( cMaxPages == 0
1345 || ((offPage0 + (size_t)cbSegRet) & PAGE_OFFSET_MASK))
1346 break;
1347 } else {
1348 AssertStmt(cbSegRet < 0, cbSegRet = -EFAULT);
1349 rc = (int)cbSegRet;
1350 break;
1351 }
1352 } else {
1353 /*
1354 * Probe first page of new segment to check that we've got a zero offset and
1355 * can continue on the current chunk. Stash the page if the offset isn't zero.
1356 */
1357 size_t offPgProbe;
1358 size_t cbSeg = iov_iter_single_seg_count(iter);
1359 while (!cbSeg) {
1360 iov_iter_advance(iter, 0);
1361 cbSeg = iov_iter_single_seg_count(iter);
1362 }
1363 cbSegRet = iov_iter_get_pages(iter, &papPages[cPages], iov_iter_count(iter), 1, &offPgProbe);
1364 if (cbSegRet > 0) {
1365 iov_iter_advance(iter, cbSegRet); /** @todo maybe not do this if we stash the page? */
1366 Assert(offPgProbe + cbSegRet <= PAGE_SIZE);
1367 if (offPgProbe == 0) {
1368 cbChunk += cbSegRet;
1369 cPages += 1;
1370 cMaxPages -= 1;
1371 SFLOG3(("vbsf_iter_lock_pages: iov_iter_get_pages(1) -> %#zx @ %#zx\n", cbSegRet, offPgProbe));
1372 if ( cMaxPages == 0
1373 || cbSegRet != PAGE_SIZE)
1374 break;
1375
1376 /*
1377 * Get the rest of the segment (if anything remaining).
1378 */
1379 cbSeg -= cbSegRet;
1380 if (cbSeg > 0) {
1381 cbSegRet = iov_iter_get_pages(iter, &papPages[cPages], iov_iter_count(iter), cMaxPages, &offPgProbe);
1382 if (cbSegRet > 0) {
1383 size_t const cPgRet = RT_ALIGN_Z((size_t)cbSegRet, PAGE_SIZE) >> PAGE_SHIFT;
1384 Assert(offPgProbe == 0);
1385 iov_iter_advance(iter, cbSegRet);
1386 SFLOG3(("vbsf_iter_lock_pages: iov_iter_get_pages() -> %#zx; %#zx pages\n", cbSegRet, cPgRet));
1387 cPages += cPgRet;
1388 cMaxPages -= cPgRet;
1389 cbChunk += cbSegRet;
1390 if ( cMaxPages == 0
1391 || ((size_t)cbSegRet & PAGE_OFFSET_MASK))
1392 break;
1393 } else {
1394 AssertStmt(cbSegRet < 0, cbSegRet = -EFAULT);
1395 rc = (int)cbSegRet;
1396 break;
1397 }
1398 }
1399 } else {
1400 /* The segment didn't start at a page boundrary, so stash it for
1401 the next round: */
1402 SFLOGFLOW(("vbsf_iter_lock_pages: iov_iter_get_pages(1) -> %#zx @ %#zx; stashed\n", cbSegRet, offPgProbe));
1403 Assert(papPages[cPages]);
1404 pStash->pPage = papPages[cPages];
1405 pStash->off = offPgProbe;
1406 pStash->cb = cbSegRet;
1407 break;
1408 }
1409 } else {
1410 AssertStmt(cbSegRet < 0, cbSegRet = -EFAULT);
1411 rc = (int)cbSegRet;
1412 break;
1413 }
1414 }
1415 Assert(cMaxPages > 0);
1416 } while (iov_iter_count(iter) > 0);
1417
1418 } else {
1419 /*
1420 * The silly iov_iter_get_pages_alloc() function doesn't handle KVECs,
1421 * so everyone needs to do that by themselves.
1422 *
1423 * Note! Fixes here may apply to rtR0MemObjNativeLockKernel()
1424 * and vbsf_lock_user_pages_failed_check_kernel() as well.
1425 */
1426# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
1427 pStash->offFromEnd = iov_iter_count(iter);
1428 pStash->Copy = *iter;
1429# endif
1430 do {
1431 uint8_t *pbBuf;
1432 size_t offStart;
1433 size_t cPgSeg;
1434
1435 size_t cbSeg = iov_iter_single_seg_count(iter);
1436 while (!cbSeg) {
1437 iov_iter_advance(iter, 0);
1438 cbSeg = iov_iter_single_seg_count(iter);
1439 }
1440
1441# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0)
1442 pbBuf = iter->kvec->iov_base + iter->iov_offset;
1443# else
1444 pbBuf = iter->iov->iov_base + iter->iov_offset;
1445# endif
1446 offStart = (uintptr_t)pbBuf & PAGE_OFFSET_MASK;
1447 if (!cPages)
1448 offPage0 = offStart;
1449 else if (offStart)
1450 break;
1451
1452 cPgSeg = RT_ALIGN_Z(cbSeg, PAGE_SIZE) >> PAGE_SHIFT;
1453 if (cPgSeg > cMaxPages) {
1454 cPgSeg = cMaxPages;
1455 cbSeg = (cPgSeg << PAGE_SHIFT) - offStart;
1456 }
1457
1458 rc = vbsf_lock_kernel_pages(pbBuf, fWrite, cPgSeg, &papPages[cPages]);
1459 if (rc == 0) {
1460 iov_iter_advance(iter, cbSeg);
1461 cbChunk += cbSeg;
1462 cPages += cPgSeg;
1463 cMaxPages -= cPgSeg;
1464 if ( cMaxPages == 0
1465 || ((offStart + cbSeg) & PAGE_OFFSET_MASK) != 0)
1466 break;
1467 } else
1468 break;
1469 } while (iov_iter_count(iter) > 0);
1470 }
1471
1472 /*
1473 * Clean up if we failed; set return values.
1474 */
1475 if (rc == 0) {
1476 /* likely */
1477 } else {
1478 if (cPages > 0)
1479 vbsf_iter_unlock_pages(iter, papPages, cPages, false /*fSetDirty*/);
1480 offPage0 = cbChunk = cPages = 0;
1481 }
1482 *poffPage0 = offPage0;
1483 *pcbChunk = cbChunk;
1484 *pcPages = cPages;
1485 SFLOGFLOW(("vbsf_iter_lock_pages: returns %d - cPages=%#zx offPage0=%#zx cbChunk=%zx\n", rc, cPages, offPage0, cbChunk));
1486 return rc;
1487}
1488
1489
1490/**
1491 * Rewinds the I/O vector.
1492 */
1493static bool vbsf_iter_rewind(struct iov_iter *iter, struct vbsf_iter_stash *pStash, size_t cbToRewind, size_t cbChunk)
1494{
1495 size_t cbExtra;
1496 if (!pStash->pPage) {
1497 cbExtra = 0;
1498 } else {
1499 cbExtra = pStash->cb;
1500 vbsf_put_page(pStash->pPage);
1501 pStash->pPage = NULL;
1502 pStash->cb = 0;
1503 pStash->off = 0;
1504 }
1505
1506# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
1507 iov_iter_revert(iter, cbToRewind + cbExtra);
1508 return true;
1509# else
1510 /** @todo impl this */
1511 return false;
1512# endif
1513}
1514
1515
1516/**
1517 * Cleans up the page locking stash.
1518 */
1519DECLINLINE(void) vbsf_iter_cleanup_stash(struct iov_iter *iter, struct vbsf_iter_stash *pStash)
1520{
1521 if (pStash->pPage)
1522 vbsf_iter_rewind(iter, pStash, 0, 0);
1523}
1524
1525
1526/**
1527 * Calculates the longest span of pages we could transfer to the host in a
1528 * single request.
1529 *
1530 * @returns Page count, non-zero.
1531 * @param iter The I/O vector iterator to inspect.
1532 */
1533static size_t vbsf_iter_max_span_of_pages(struct iov_iter *iter)
1534{
1535 size_t cPages;
1536 if (iter_is_iovec(iter) || (iter->type & ITER_KVEC)) {
1537 const struct iovec *pCurIov = iter->iov;
1538 size_t cLeft = iter->nr_segs;
1539 size_t cPagesSpan = 0;
1540
1541 /* iovect and kvec are identical, except for the __user tagging of iov_base. */
1542 AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, struct kvec, iov_base);
1543 AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len, struct kvec, iov_len);
1544 AssertCompile(sizeof(struct iovec) == sizeof(struct kvec));
1545
1546 cPages = 1;
1547 AssertReturn(cLeft > 0, cPages);
1548
1549 /* Special case: segment offset. */
1550 if (iter->iov_offset > 0) {
1551 if (iter->iov_offset < pCurIov->iov_len) {
1552 size_t const cbSegLeft = pCurIov->iov_len - iter->iov_offset;
1553 size_t const offPage0 = ((uintptr_t)pCurIov->iov_base + iter->iov_offset) & PAGE_OFFSET_MASK;
1554 cPages = cPagesSpan = RT_ALIGN_Z(offPage0 + cbSegLeft, PAGE_SIZE) >> PAGE_SHIFT;
1555 if ((offPage0 + cbSegLeft) & PAGE_OFFSET_MASK)
1556 cPagesSpan = 0;
1557 }
1558 SFLOGFLOW(("vbsf_iter: seg[0]= %p LB %#zx\n", pCurIov->iov_base, pCurIov->iov_len));
1559 pCurIov++;
1560 cLeft--;
1561 }
1562
1563 /* Full segments. */
1564 while (cLeft-- > 0) {
1565 if (pCurIov->iov_len > 0) {
1566 size_t const offPage0 = (uintptr_t)pCurIov->iov_base & PAGE_OFFSET_MASK;
1567 if (offPage0 == 0) {
1568 if (!(pCurIov->iov_len & PAGE_OFFSET_MASK)) {
1569 cPagesSpan += pCurIov->iov_len >> PAGE_SHIFT;
1570 } else {
1571 cPagesSpan += RT_ALIGN_Z(pCurIov->iov_len, PAGE_SIZE) >> PAGE_SHIFT;
1572 if (cPagesSpan > cPages)
1573 cPages = cPagesSpan;
1574 cPagesSpan = 0;
1575 }
1576 } else {
1577 if (cPagesSpan > cPages)
1578 cPages = cPagesSpan;
1579 if (!((offPage0 + pCurIov->iov_len) & PAGE_OFFSET_MASK)) {
1580 cPagesSpan = pCurIov->iov_len >> PAGE_SHIFT;
1581 } else {
1582 cPagesSpan += RT_ALIGN_Z(offPage0 + pCurIov->iov_len, PAGE_SIZE) >> PAGE_SHIFT;
1583 if (cPagesSpan > cPages)
1584 cPages = cPagesSpan;
1585 cPagesSpan = 0;
1586 }
1587 }
1588 }
1589 SFLOGFLOW(("vbsf_iter: seg[%u]= %p LB %#zx\n", iter->nr_segs - cLeft, pCurIov->iov_base, pCurIov->iov_len));
1590 pCurIov++;
1591 }
1592 if (cPagesSpan > cPages)
1593 cPages = cPagesSpan;
1594 } else {
1595 /* Won't bother with accurate counts for the next two types, just make
1596 some rough estimates (does pipes have segments?): */
1597 size_t cSegs = iter->type & ITER_BVEC ? RT_MAX(1, iter->nr_segs) : 1;
1598 cPages = (iov_iter_count(iter) + (PAGE_SIZE * 2 - 2) * cSegs) >> PAGE_SHIFT;
1599 }
1600 SFLOGFLOW(("vbsf_iter_max_span_of_pages: returns %#zx\n", cPages));
1601 return cPages;
1602}
1603
1604
1605/**
1606 * Worker for vbsf_reg_read_iter() that deals with larger reads using page
1607 * locking.
1608 */
1609static ssize_t vbsf_reg_read_iter_locking(struct kiocb *kio, struct iov_iter *iter, size_t cbToRead,
1610 struct vbsf_super_info *sf_g, struct vbsf_reg_info *sf_r)
1611{
1612 /*
1613 * Estimate how many pages we may possible submit in a single request so
1614 * that we can allocate matching request buffer and page array.
1615 */
1616 struct page *apPagesStack[16];
1617 struct page **papPages = &apPagesStack[0];
1618 struct page **papPagesFree = NULL;
1619 VBOXSFREADPGLSTREQ *pReq;
1620 ssize_t cbRet = 0;
1621 size_t cMaxPages = vbsf_iter_max_span_of_pages(iter);
1622 cMaxPages = RT_MIN(RT_MAX(sf_g->cMaxIoPages, 2), cMaxPages);
1623
1624 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
1625 while (!pReq && cMaxPages > 4) {
1626 cMaxPages /= 2;
1627 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
1628 }
1629 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
1630 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(sizeof(papPages[0])), GFP_KERNEL);
1631 if (pReq && papPages) {
1632
1633 /*
1634 * The read loop.
1635 */
1636 struct vbsf_iter_stash Stash = VBSF_ITER_STASH_INITIALIZER;
1637 do {
1638 /*
1639 * Grab as many pages as we can. This means that if adjacent
1640 * segments both starts and ends at a page boundrary, we can
1641 * do them both in the same transfer from the host.
1642 */
1643 size_t cPages = 0;
1644 size_t cbChunk = 0;
1645 size_t offPage0 = 0;
1646 int rc = vbsf_iter_lock_pages(iter, true /*fWrite*/, &Stash, cMaxPages, papPages, &cPages, &offPage0, &cbChunk);
1647 if (rc == 0) {
1648 size_t iPage = cPages;
1649 while (iPage-- > 0)
1650 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
1651 pReq->PgLst.offFirstPage = (uint16_t)offPage0;
1652 AssertStmt(cbChunk <= cbToRead, cbChunk = cbToRead);
1653 } else {
1654 cbRet = rc;
1655 break;
1656 }
1657
1658 /*
1659 * Issue the request and unlock the pages.
1660 */
1661 rc = VbglR0SfHostReqReadPgLst(sf_g->map.root, pReq, sf_r->Handle.hHost, kio->ki_pos, cbChunk, cPages);
1662 SFLOGFLOW(("vbsf_reg_read_iter_locking: VbglR0SfHostReqReadPgLst -> %d (cbActual=%#x cbChunk=%#zx of %#zx cPages=%#zx offPage0=%#x\n",
1663 rc, pReq->Parms.cb32Read.u.value32, cbChunk, cbToRead, cPages, offPage0));
1664
1665 vbsf_iter_unlock_pages(iter, papPages, cPages, true /*fSetDirty*/);
1666
1667 if (RT_SUCCESS(rc)) {
1668 /*
1669 * Success, advance position and buffer.
1670 */
1671 uint32_t cbActual = pReq->Parms.cb32Read.u.value32;
1672 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
1673 cbRet += cbActual;
1674 kio->ki_pos += cbActual;
1675 cbToRead -= cbActual;
1676
1677 /*
1678 * Are we done already?
1679 */
1680 if (!cbToRead)
1681 break;
1682 if (cbActual < cbChunk) { /* We ASSUME end-of-file here. */
1683 if (vbsf_iter_rewind(iter, &Stash, cbChunk - cbActual, cbActual))
1684 iov_iter_truncate(iter, 0);
1685 break;
1686 }
1687 } else {
1688 /*
1689 * Try rewind the iter structure.
1690 */
1691 bool const fRewindOkay = vbsf_iter_rewind(iter, &Stash, cbChunk, cbChunk);
1692 if (rc == VERR_NO_MEMORY && cMaxPages > 4 && fRewindOkay) {
1693 /*
1694 * The host probably doesn't have enough heap to handle the
1695 * request, reduce the page count and retry.
1696 */
1697 cMaxPages /= 4;
1698 Assert(cMaxPages > 0);
1699 } else {
1700 /*
1701 * If we've successfully read stuff, return it rather than
1702 * the error. (Not sure if this is such a great idea...)
1703 */
1704 if (cbRet <= 0)
1705 cbRet = -EPROTO;
1706 break;
1707 }
1708 }
1709 } while (cbToRead > 0);
1710
1711 vbsf_iter_cleanup_stash(iter, &Stash);
1712 }
1713 else
1714 cbRet = -ENOMEM;
1715 if (papPagesFree)
1716 kfree(papPages);
1717 if (pReq)
1718 VbglR0PhysHeapFree(pReq);
1719 SFLOGFLOW(("vbsf_reg_read_iter_locking: returns %#zx (%zd)\n", cbRet, cbRet));
1720 return cbRet;
1721}
1722
1723
1724/**
1725 * Read into I/O vector iterator.
1726 *
1727 * @returns Number of bytes read on success, negative errno on error.
1728 * @param kio The kernel I/O control block (or something like that).
1729 * @param iter The I/O vector iterator describing the buffer.
1730 */
1731static ssize_t vbsf_reg_read_iter(struct kiocb *kio, struct iov_iter *iter)
1732{
1733 size_t cbToRead = iov_iter_count(iter);
1734 struct inode *inode = VBSF_GET_F_DENTRY(kio->ki_filp)->d_inode;
1735 struct address_space *mapping = inode->i_mapping;
1736
1737 struct vbsf_reg_info *sf_r = kio->ki_filp->private_data;
1738 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
1739
1740 SFLOGFLOW(("vbsf_reg_read_iter: inode=%p file=%p size=%#zx off=%#llx type=%#x\n",
1741 inode, kio->ki_filp, cbToRead, kio->ki_pos, iter->type));
1742 AssertReturn(S_ISREG(inode->i_mode), -EINVAL);
1743
1744 /*
1745 * Do we have anything at all to do here?
1746 */
1747 if (!cbToRead)
1748 return 0;
1749
1750 /*
1751 * If there is a mapping and O_DIRECT isn't in effect, we must at a
1752 * heed dirty pages in the mapping and read from them. For simplicity
1753 * though, we just do page cache reading when there are writable
1754 * mappings around with any kind of pages loaded.
1755 */
1756 if (vbsf_should_use_cached_read(kio->ki_filp, mapping, sf_g))
1757 return generic_file_read_iter(kio, iter);
1758
1759 /*
1760 * Now now we reject async I/O requests.
1761 */
1762 if (!is_sync_kiocb(kio)) {
1763 SFLOGFLOW(("vbsf_reg_read_iter: async I/O not yet supported\n")); /** @todo extend FsPerf with AIO tests. */
1764 return -EOPNOTSUPP;
1765 }
1766
1767 /*
1768 * For small requests, try use an embedded buffer provided we get a heap block
1769 * that does not cross page boundraries (see host code).
1770 */
1771 if (cbToRead <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) /* see allocator */) {
1772 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) + cbToRead;
1773 VBOXSFREADEMBEDDEDREQ *pReq = (VBOXSFREADEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
1774 if (pReq) {
1775 if ((PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
1776 ssize_t cbRet;
1777 int vrc = VbglR0SfHostReqReadEmbedded(sf_g->map.root, pReq, sf_r->Handle.hHost, kio->ki_pos, (uint32_t)cbToRead);
1778 if (RT_SUCCESS(vrc)) {
1779 cbRet = pReq->Parms.cb32Read.u.value32;
1780 AssertStmt(cbRet <= (ssize_t)cbToRead, cbRet = cbToRead);
1781 if (copy_to_iter(pReq->abData, cbRet, iter) == cbRet) {
1782 kio->ki_pos += cbRet;
1783 if (cbRet < cbToRead)
1784 iov_iter_truncate(iter, 0);
1785 } else
1786 cbRet = -EFAULT;
1787 } else
1788 cbRet = -EPROTO;
1789 VbglR0PhysHeapFree(pReq);
1790 SFLOGFLOW(("vbsf_reg_read_iter: returns %#zx (%zd)\n", cbRet, cbRet));
1791 return cbRet;
1792 }
1793 VbglR0PhysHeapFree(pReq);
1794 }
1795 }
1796
1797 /*
1798 * Otherwise do the page locking thing.
1799 */
1800 return vbsf_reg_read_iter_locking(kio, iter, cbToRead, sf_g, sf_r);
1801}
1802
1803
1804/**
1805 * Worker for vbsf_reg_write_iter() that deals with larger writes using page
1806 * locking.
1807 */
1808static ssize_t vbsf_reg_write_iter_locking(struct kiocb *kio, struct iov_iter *iter, size_t cbToWrite, loff_t offFile,
1809 struct vbsf_super_info *sf_g, struct vbsf_reg_info *sf_r,
1810 struct inode *inode, struct vbsf_inode_info *sf_i, struct address_space *mapping)
1811{
1812 /*
1813 * Estimate how many pages we may possible submit in a single request so
1814 * that we can allocate matching request buffer and page array.
1815 */
1816 struct page *apPagesStack[16];
1817 struct page **papPages = &apPagesStack[0];
1818 struct page **papPagesFree = NULL;
1819 VBOXSFWRITEPGLSTREQ *pReq;
1820 ssize_t cbRet = 0;
1821 size_t cMaxPages = vbsf_iter_max_span_of_pages(iter);
1822 cMaxPages = RT_MIN(RT_MAX(sf_g->cMaxIoPages, 2), cMaxPages);
1823
1824 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
1825 while (!pReq && cMaxPages > 4) {
1826 cMaxPages /= 2;
1827 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
1828 }
1829 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
1830 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(sizeof(papPages[0])), GFP_KERNEL);
1831 if (pReq && papPages) {
1832
1833 /*
1834 * The write loop.
1835 */
1836 struct vbsf_iter_stash Stash = VBSF_ITER_STASH_INITIALIZER;
1837 do {
1838 /*
1839 * Grab as many pages as we can. This means that if adjacent
1840 * segments both starts and ends at a page boundrary, we can
1841 * do them both in the same transfer from the host.
1842 */
1843 size_t cPages = 0;
1844 size_t cbChunk = 0;
1845 size_t offPage0 = 0;
1846 int rc = vbsf_iter_lock_pages(iter, false /*fWrite*/, &Stash, cMaxPages, papPages, &cPages, &offPage0, &cbChunk);
1847 if (rc == 0) {
1848 size_t iPage = cPages;
1849 while (iPage-- > 0)
1850 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
1851 pReq->PgLst.offFirstPage = (uint16_t)offPage0;
1852 AssertStmt(cbChunk <= cbToWrite, cbChunk = cbToWrite);
1853 } else {
1854 cbRet = rc;
1855 break;
1856 }
1857
1858 /*
1859 * Issue the request and unlock the pages.
1860 */
1861 rc = VbglR0SfHostReqWritePgLst(sf_g->map.root, pReq, sf_r->Handle.hHost, offFile, cbChunk, cPages);
1862 SFLOGFLOW(("vbsf_reg_write_iter_locking: VbglR0SfHostReqWritePgLst -> %d (cbActual=%#x cbChunk=%#zx of %#zx cPages=%#zx offPage0=%#x\n",
1863 rc, pReq->Parms.cb32Write.u.value32, cbChunk, cbToWrite, cPages, offPage0));
1864 if (RT_SUCCESS(rc)) {
1865 /*
1866 * Success, advance position and buffer.
1867 */
1868 uint32_t cbActual = pReq->Parms.cb32Write.u.value32;
1869 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
1870
1871 vbsf_reg_write_sync_page_cache(mapping, offFile, cbActual, NULL /*pbSrcBuf*/, papPages, offPage0, cPages);
1872 vbsf_iter_unlock_pages(iter, papPages, cPages, false /*fSetDirty*/);
1873
1874 cbRet += cbActual;
1875 offFile += cbActual;
1876 kio->ki_pos = offFile;
1877 cbToWrite -= cbActual;
1878 if (offFile > i_size_read(inode))
1879 i_size_write(inode, offFile);
1880 sf_i->force_restat = 1; /* mtime (and size) may have changed */
1881
1882 /*
1883 * Are we done already?
1884 */
1885 if (!cbToWrite)
1886 break;
1887 if (cbActual < cbChunk) { /* We ASSUME end-of-file here. */
1888 if (vbsf_iter_rewind(iter, &Stash, cbChunk - cbActual, cbActual))
1889 iov_iter_truncate(iter, 0);
1890 break;
1891 }
1892 } else {
1893 /*
1894 * Try rewind the iter structure.
1895 */
1896 bool fRewindOkay;
1897 vbsf_iter_unlock_pages(iter, papPages, cPages, false /*fSetDirty*/);
1898 fRewindOkay = vbsf_iter_rewind(iter, &Stash, cbChunk, cbChunk);
1899 if (rc == VERR_NO_MEMORY && cMaxPages > 4 && fRewindOkay) {
1900 /*
1901 * The host probably doesn't have enough heap to handle the
1902 * request, reduce the page count and retry.
1903 */
1904 cMaxPages /= 4;
1905 Assert(cMaxPages > 0);
1906 } else {
1907 /*
1908 * If we've successfully written stuff, return it rather than
1909 * the error. (Not sure if this is such a great idea...)
1910 */
1911 if (cbRet <= 0)
1912 cbRet = -EPROTO;
1913 break;
1914 }
1915 }
1916 } while (cbToWrite > 0);
1917
1918 vbsf_iter_cleanup_stash(iter, &Stash);
1919 }
1920 else
1921 cbRet = -ENOMEM;
1922 if (papPagesFree)
1923 kfree(papPages);
1924 if (pReq)
1925 VbglR0PhysHeapFree(pReq);
1926 SFLOGFLOW(("vbsf_reg_write_iter_locking: returns %#zx (%zd)\n", cbRet, cbRet));
1927 return cbRet;
1928}
1929
1930
1931
1932/**
1933 * Write from I/O vector iterator.
1934 *
1935 * @returns Number of bytes written on success, negative errno on error.
1936 * @param kio The kernel I/O control block (or something like that).
1937 * @param iter The I/O vector iterator describing the buffer.
1938 */
1939static ssize_t vbsf_reg_write_iter(struct kiocb *kio, struct iov_iter *iter)
1940{
1941 size_t cbToWrite = iov_iter_count(iter);
1942 struct inode *inode = VBSF_GET_F_DENTRY(kio->ki_filp)->d_inode;
1943 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
1944 struct address_space *mapping = inode->i_mapping;
1945
1946 struct vbsf_reg_info *sf_r = kio->ki_filp->private_data;
1947 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
1948 loff_t offFile = kio->ki_pos;
1949
1950 SFLOGFLOW(("vbsf_reg_write_iter: inode=%p file=%p size=%#zx off=%#llx type=%#x\n",
1951 inode, kio->ki_filp, cbToWrite, offFile, iter->type));
1952 AssertReturn(S_ISREG(inode->i_mode), -EINVAL);
1953
1954 /*
1955 * Enforce APPEND flag.
1956 */
1957 /** @todo This should be handled by the host, it returning the new file
1958 * offset when appending. We may have an outdated i_size value here! */
1959#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)
1960 if (kio->ki_flags & IOCB_APPEND)
1961#else
1962 if (kio->ki_filp->f_flags & O_APPEND)
1963#endif
1964 kio->ki_pos = offFile = i_size_read(inode);
1965
1966 /*
1967 * Do we have anything at all to do here?
1968 */
1969 if (!cbToWrite)
1970 return 0;
1971
1972 /*
1973 * Now now we reject async I/O requests.
1974 */
1975 if (!is_sync_kiocb(kio)) {
1976 SFLOGFLOW(("vbsf_reg_write_iter: async I/O not yet supported\n")); /** @todo extend FsPerf with AIO tests. */
1977 return -EOPNOTSUPP;
1978 }
1979
1980 /*
1981 * If there are active writable mappings, coordinate with any
1982 * pending writes via those.
1983 */
1984 if ( mapping
1985 && mapping->nrpages > 0
1986 && mapping_writably_mapped(mapping)) {
1987#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
1988 int err = filemap_fdatawait_range(mapping, offFile, offFile + cbToWrite - 1);
1989 if (err)
1990 return err;
1991#else
1992 /** @todo ... */
1993#endif
1994 }
1995
1996 /*
1997 * For small requests, try use an embedded buffer provided we get a heap block
1998 * that does not cross page boundraries (see host code).
1999 */
2000 if (cbToWrite <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) /* see allocator */) {
2001 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) + cbToWrite;
2002 VBOXSFWRITEEMBEDDEDREQ *pReq = (VBOXSFWRITEEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
2003 if (pReq) {
2004 if ((PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
2005 ssize_t cbRet;
2006 if (copy_from_iter(pReq->abData, cbToWrite, iter) == cbToWrite) {
2007 int vrc = VbglR0SfHostReqWriteEmbedded(sf_g->map.root, pReq, sf_r->Handle.hHost,
2008 offFile, (uint32_t)cbToWrite);
2009 if (RT_SUCCESS(vrc)) {
2010 cbRet = pReq->Parms.cb32Write.u.value32;
2011 AssertStmt(cbRet <= (ssize_t)cbToWrite, cbRet = cbToWrite);
2012 vbsf_reg_write_sync_page_cache(mapping, offFile, (uint32_t)cbRet, pReq->abData,
2013 NULL /*papSrcPages*/, 0 /*offSrcPage0*/, 0 /*cSrcPages*/);
2014 kio->ki_pos = offFile += cbRet;
2015 if (offFile > i_size_read(inode))
2016 i_size_write(inode, offFile);
2017# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
2018 if ((size_t)cbRet < cbToWrite)
2019 iov_iter_revert(iter, cbToWrite - cbRet);
2020# endif
2021 } else
2022 cbRet = -EPROTO;
2023 sf_i->force_restat = 1; /* mtime (and size) may have changed */
2024 } else
2025 cbRet = -EFAULT;
2026 VbglR0PhysHeapFree(pReq);
2027 SFLOGFLOW(("vbsf_reg_write_iter: returns %#zx (%zd)\n", cbRet, cbRet));
2028 return cbRet;
2029 }
2030 VbglR0PhysHeapFree(pReq);
2031 }
2032 }
2033
2034 /*
2035 * Otherwise do the page locking thing.
2036 */
2037 return vbsf_reg_write_iter_locking(kio, iter, cbToWrite, offFile, sf_g, sf_r, inode, sf_i, mapping);
2038}
2039
2040#endif /* >= 3.16.0 */
2041
2042/**
2043 * Used by vbsf_reg_open() and vbsf_inode_atomic_open() to
2044 *
2045 * @returns shared folders create flags.
2046 * @param fLnxOpen The linux O_XXX flags to convert.
2047 * @param pfHandle Pointer to vbsf_handle::fFlags.
2048 * @param pszCaller Caller, for logging purposes.
2049 */
2050uint32_t vbsf_linux_oflags_to_vbox(unsigned fLnxOpen, uint32_t *pfHandle, const char *pszCaller)
2051{
2052 uint32_t fVBoxFlags = SHFL_CF_ACCESS_DENYNONE;
2053
2054 /*
2055 * Disposition.
2056 */
2057 if (fLnxOpen & O_CREAT) {
2058 Log(("%s: O_CREAT set\n", pszCaller));
2059 fVBoxFlags |= SHFL_CF_ACT_CREATE_IF_NEW;
2060 if (fLnxOpen & O_EXCL) {
2061 Log(("%s: O_EXCL set\n", pszCaller));
2062 fVBoxFlags |= SHFL_CF_ACT_FAIL_IF_EXISTS;
2063 } else if (fLnxOpen & O_TRUNC) {
2064 Log(("%s: O_TRUNC set\n", pszCaller));
2065 fVBoxFlags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS;
2066 } else
2067 fVBoxFlags |= SHFL_CF_ACT_OPEN_IF_EXISTS;
2068 } else {
2069 fVBoxFlags |= SHFL_CF_ACT_FAIL_IF_NEW;
2070 if (fLnxOpen & O_TRUNC) {
2071 Log(("%s: O_TRUNC set\n", pszCaller));
2072 fVBoxFlags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS;
2073 }
2074 }
2075
2076 /*
2077 * Access.
2078 */
2079 switch (fLnxOpen & O_ACCMODE) {
2080 case O_RDONLY:
2081 fVBoxFlags |= SHFL_CF_ACCESS_READ;
2082 *pfHandle |= VBSF_HANDLE_F_READ;
2083 break;
2084
2085 case O_WRONLY:
2086 fVBoxFlags |= SHFL_CF_ACCESS_WRITE;
2087 *pfHandle |= VBSF_HANDLE_F_WRITE;
2088 break;
2089
2090 case O_RDWR:
2091 fVBoxFlags |= SHFL_CF_ACCESS_READWRITE;
2092 *pfHandle |= VBSF_HANDLE_F_READ | VBSF_HANDLE_F_WRITE;
2093 break;
2094
2095 default:
2096 BUG();
2097 }
2098
2099 if (fLnxOpen & O_APPEND) {
2100 Log(("%s: O_APPEND set\n", pszCaller));
2101 fVBoxFlags |= SHFL_CF_ACCESS_APPEND;
2102 *pfHandle |= VBSF_HANDLE_F_APPEND;
2103 }
2104
2105 /*
2106 * Only directories?
2107 */
2108 if (fLnxOpen & O_DIRECTORY) {
2109 Log(("%s: O_DIRECTORY set\n", pszCaller));
2110 fVBoxFlags |= SHFL_CF_DIRECTORY;
2111 }
2112
2113 return fVBoxFlags;
2114}
2115
2116
2117/**
2118 * Open a regular file.
2119 *
2120 * @param inode the inode
2121 * @param file the file
2122 * @returns 0 on success, Linux error code otherwise
2123 */
2124static int vbsf_reg_open(struct inode *inode, struct file *file)
2125{
2126 int rc, rc_linux = 0;
2127 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
2128 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
2129 struct vbsf_reg_info *sf_r;
2130 struct dentry *dentry = VBSF_GET_F_DENTRY(file);
2131 VBOXSFCREATEREQ *pReq;
2132
2133 SFLOGFLOW(("vbsf_reg_open: inode=%p file=%p flags=%#x %s\n", inode, file, file->f_flags, sf_i ? sf_i->path->String.ach : NULL));
2134 BUG_ON(!sf_g);
2135 BUG_ON(!sf_i);
2136
2137 sf_r = kmalloc(sizeof(*sf_r), GFP_KERNEL);
2138 if (!sf_r) {
2139 LogRelFunc(("could not allocate reg info\n"));
2140 return -ENOMEM;
2141 }
2142
2143 RTListInit(&sf_r->Handle.Entry);
2144 sf_r->Handle.cRefs = 1;
2145 sf_r->Handle.fFlags = VBSF_HANDLE_F_FILE | VBSF_HANDLE_F_MAGIC;
2146 sf_r->Handle.hHost = SHFL_HANDLE_NIL;
2147
2148 /* Already open? */
2149 if (sf_i->handle != SHFL_HANDLE_NIL) {
2150 /*
2151 * This inode was created with vbsf_create_worker(). Check the CreateFlags:
2152 * O_CREAT, O_TRUNC: inherent true (file was just created). Not sure
2153 * about the access flags (SHFL_CF_ACCESS_*).
2154 */
2155 sf_i->force_restat = 1;
2156 sf_r->Handle.hHost = sf_i->handle;
2157 sf_i->handle = SHFL_HANDLE_NIL;
2158 file->private_data = sf_r;
2159
2160 sf_r->Handle.fFlags |= VBSF_HANDLE_F_READ | VBSF_HANDLE_F_WRITE; /** @todo fix */
2161 vbsf_handle_append(sf_i, &sf_r->Handle);
2162 SFLOGFLOW(("vbsf_reg_open: returns 0 (#1) - sf_i=%p hHost=%#llx\n", sf_i, sf_r->Handle.hHost));
2163 return 0;
2164 }
2165
2166 pReq = (VBOXSFCREATEREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq) + sf_i->path->u16Size);
2167 if (!pReq) {
2168 kfree(sf_r);
2169 LogRelFunc(("Failed to allocate a VBOXSFCREATEREQ buffer!\n"));
2170 return -ENOMEM;
2171 }
2172 memcpy(&pReq->StrPath, sf_i->path, SHFLSTRING_HEADER_SIZE + sf_i->path->u16Size);
2173 RT_ZERO(pReq->CreateParms);
2174 pReq->CreateParms.Handle = SHFL_HANDLE_NIL;
2175
2176 /* We check the value of pReq->CreateParms.Handle afterwards to
2177 * find out if the call succeeded or failed, as the API does not seem
2178 * to cleanly distinguish error and informational messages.
2179 *
2180 * Furthermore, we must set pReq->CreateParms.Handle to SHFL_HANDLE_NIL
2181 * to make the shared folders host service use our fMode parameter */
2182
2183 /* We ignore O_EXCL, as the Linux kernel seems to call create
2184 beforehand itself, so O_EXCL should always fail. */
2185 pReq->CreateParms.CreateFlags = vbsf_linux_oflags_to_vbox(file->f_flags & ~O_EXCL, &sf_r->Handle.fFlags, __FUNCTION__);
2186 pReq->CreateParms.Info.Attr.fMode = inode->i_mode;
2187 LogFunc(("vbsf_reg_open: calling VbglR0SfHostReqCreate, file %s, flags=%#x, %#x\n",
2188 sf_i->path->String.utf8, file->f_flags, pReq->CreateParms.CreateFlags));
2189 rc = VbglR0SfHostReqCreate(sf_g->map.root, pReq);
2190 if (RT_FAILURE(rc)) {
2191 LogFunc(("VbglR0SfHostReqCreate failed flags=%d,%#x rc=%Rrc\n", file->f_flags, pReq->CreateParms.CreateFlags, rc));
2192 kfree(sf_r);
2193 VbglR0PhysHeapFree(pReq);
2194 return -RTErrConvertToErrno(rc);
2195 }
2196
2197 if (pReq->CreateParms.Handle != SHFL_HANDLE_NIL) {
2198 vbsf_dentry_chain_increase_ttl(dentry);
2199 rc_linux = 0;
2200 } else {
2201 switch (pReq->CreateParms.Result) {
2202 case SHFL_PATH_NOT_FOUND:
2203 rc_linux = -ENOENT;
2204 break;
2205 case SHFL_FILE_NOT_FOUND:
2206 /** @todo sf_dentry_increase_parent_ttl(file->f_dentry); if we can trust it. */
2207 rc_linux = -ENOENT;
2208 break;
2209 case SHFL_FILE_EXISTS:
2210 vbsf_dentry_chain_increase_ttl(dentry);
2211 rc_linux = -EEXIST;
2212 break;
2213 default:
2214 vbsf_dentry_chain_increase_parent_ttl(dentry);
2215 rc_linux = 0;
2216 break;
2217 }
2218 }
2219
2220 sf_i->force_restat = 1; /** @todo Why?!? */
2221 sf_r->Handle.hHost = pReq->CreateParms.Handle;
2222 file->private_data = sf_r;
2223 vbsf_handle_append(sf_i, &sf_r->Handle);
2224 VbglR0PhysHeapFree(pReq);
2225 SFLOGFLOW(("vbsf_reg_open: returns 0 (#2) - sf_i=%p hHost=%#llx\n", sf_i, sf_r->Handle.hHost));
2226 return rc_linux;
2227}
2228
2229
2230/**
2231 * Close a regular file.
2232 *
2233 * @param inode the inode
2234 * @param file the file
2235 * @returns 0 on success, Linux error code otherwise
2236 */
2237static int vbsf_reg_release(struct inode *inode, struct file *file)
2238{
2239 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
2240 struct vbsf_reg_info *sf_r = file->private_data;
2241
2242 SFLOGFLOW(("vbsf_reg_release: inode=%p file=%p\n", inode, file));
2243 if (sf_r) {
2244 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
2245 Assert(sf_g);
2246
2247#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 25)
2248 /* See the smbfs source (file.c). mmap in particular can cause data to be
2249 * written to the file after it is closed, which we can't cope with. We
2250 * copy and paste the body of filemap_write_and_wait() here as it was not
2251 * defined before 2.6.6 and not exported until quite a bit later. */
2252 /* filemap_write_and_wait(inode->i_mapping); */
2253 if (inode->i_mapping->nrpages
2254 && filemap_fdatawrite(inode->i_mapping) != -EIO)
2255 filemap_fdatawait(inode->i_mapping);
2256#endif
2257
2258 /* Release sf_r, closing the handle if we're the last user. */
2259 file->private_data = NULL;
2260 vbsf_handle_release(&sf_r->Handle, sf_g, "vbsf_reg_release");
2261
2262 sf_i->handle = SHFL_HANDLE_NIL;
2263 }
2264 return 0;
2265}
2266
2267/**
2268 * Wrapper around generic/default seek function that ensures that we've got
2269 * the up-to-date file size when doing anything relative to EOF.
2270 *
2271 * The issue is that the host may extend the file while we weren't looking and
2272 * if the caller wishes to append data, it may end up overwriting existing data
2273 * if we operate with a stale size. So, we always retrieve the file size on EOF
2274 * relative seeks.
2275 */
2276static loff_t vbsf_reg_llseek(struct file *file, loff_t off, int whence)
2277{
2278 SFLOGFLOW(("vbsf_reg_llseek: file=%p off=%lld whence=%d\n", file, off, whence));
2279
2280 switch (whence) {
2281#ifdef SEEK_HOLE
2282 case SEEK_HOLE:
2283 case SEEK_DATA:
2284#endif
2285 case SEEK_END: {
2286 struct vbsf_reg_info *sf_r = file->private_data;
2287 int rc = vbsf_inode_revalidate_with_handle(VBSF_GET_F_DENTRY(file), sf_r->Handle.hHost,
2288 true /*fForce*/, false /*fInodeLocked*/);
2289 if (rc == 0)
2290 break;
2291 return rc;
2292 }
2293 }
2294
2295#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 8)
2296 return generic_file_llseek(file, off, whence);
2297#else
2298 return default_llseek(file, off, whence);
2299#endif
2300}
2301
2302/**
2303 * Flush region of file - chiefly mmap/msync.
2304 *
2305 * We cannot use the noop_fsync / simple_sync_file here as that means
2306 * msync(,,MS_SYNC) will return before the data hits the host, thereby
2307 * causing coherency issues with O_DIRECT access to the same file as
2308 * well as any host interaction with the file.
2309 */
2310#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
2311static int vbsf_reg_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2312{
2313# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2314 return __generic_file_fsync(file, start, end, datasync);
2315# else
2316 return generic_file_fsync(file, start, end, datasync);
2317# endif
2318}
2319#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35)
2320static int vbsf_reg_fsync(struct file *file, int datasync)
2321{
2322 return generic_file_fsync(file, datasync);
2323}
2324#else /* < 2.6.35 */
2325static int vbsf_reg_fsync(struct file *file, struct dentry *dentry, int datasync)
2326{
2327# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 31)
2328 return simple_fsync(file, dentry, datasync);
2329# else
2330 int rc;
2331 struct inode *inode = dentry->d_inode;
2332 AssertReturn(inode, -EINVAL);
2333
2334 /** @todo What about file_fsync()? (<= 2.5.11) */
2335
2336# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
2337 rc = sync_mapping_buffers(inode->i_mapping);
2338 if ( rc == 0
2339 && (inode->i_state & I_DIRTY)
2340 && ((inode->i_state & I_DIRTY_DATASYNC) || !datasync)
2341 ) {
2342 struct writeback_control wbc = {
2343 .sync_mode = WB_SYNC_ALL,
2344 .nr_to_write = 0
2345 };
2346 rc = sync_inode(inode, &wbc);
2347 }
2348# else /* < 2.5.12 */
2349 rc = fsync_inode_buffers(inode);
2350# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
2351 rc |= fsync_inode_data_buffers(inode);
2352# endif
2353 /** @todo probably need to do more here... */
2354# endif /* < 2.5.12 */
2355 return rc;
2356# endif
2357}
2358#endif /* < 2.6.35 */
2359
2360
2361#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
2362/**
2363 * Copy a datablock from one file to another on the host side.
2364 */
2365static ssize_t vbsf_reg_copy_file_range(struct file *pFileSrc, loff_t offSrc, struct file *pFileDst, loff_t offDst,
2366 size_t cbRange, unsigned int fFlags)
2367{
2368 ssize_t cbRet;
2369 if (g_uSfLastFunction >= SHFL_FN_COPY_FILE_PART) {
2370 struct inode *pInodeSrc = pFileSrc->f_inode;
2371 struct vbsf_inode_info *pInodeInfoSrc = VBSF_GET_INODE_INFO(pInodeSrc);
2372 struct vbsf_super_info *pSuperInfoSrc = VBSF_GET_SUPER_INFO(pInodeSrc->i_sb);
2373 struct vbsf_reg_info *pFileInfoSrc = (struct vbsf_reg_info *)pFileSrc->private_data;
2374 struct inode *pInodeDst = pInodeSrc;
2375 struct vbsf_inode_info *pInodeInfoDst = VBSF_GET_INODE_INFO(pInodeDst);
2376 struct vbsf_super_info *pSuperInfoDst = VBSF_GET_SUPER_INFO(pInodeDst->i_sb);
2377 struct vbsf_reg_info *pFileInfoDst = (struct vbsf_reg_info *)pFileDst->private_data;
2378 VBOXSFCOPYFILEPARTREQ *pReq;
2379
2380 /*
2381 * Some extra validation.
2382 */
2383 AssertPtrReturn(pInodeInfoSrc, -EOPNOTSUPP);
2384 Assert(pInodeInfoSrc->u32Magic == SF_INODE_INFO_MAGIC);
2385 AssertPtrReturn(pInodeInfoDst, -EOPNOTSUPP);
2386 Assert(pInodeInfoDst->u32Magic == SF_INODE_INFO_MAGIC);
2387
2388# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
2389 if (!S_ISREG(pInodeSrc->i_mode) || !S_ISREG(pInodeDst->i_mode))
2390 return S_ISDIR(pInodeSrc->i_mode) || S_ISDIR(pInodeDst->i_mode) ? -EISDIR : -EINVAL;
2391# endif
2392
2393 /*
2394 * Allocate the request and issue it.
2395 */
2396 pReq = (VBOXSFCOPYFILEPARTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
2397 if (pReq) {
2398 int vrc = VbglR0SfHostReqCopyFilePart(pSuperInfoSrc->map.root, pFileInfoSrc->Handle.hHost, offSrc,
2399 pSuperInfoDst->map.root, pFileInfoDst->Handle.hHost, offDst,
2400 cbRange, 0 /*fFlags*/, pReq);
2401 if (RT_SUCCESS(vrc))
2402 cbRet = pReq->Parms.cb64ToCopy.u.value64;
2403 else if (vrc == VERR_NOT_IMPLEMENTED)
2404 cbRet = -EOPNOTSUPP;
2405 else
2406 cbRet = -RTErrConvertToErrno(vrc);
2407
2408 VbglR0PhysHeapFree(pReq);
2409 } else
2410 cbRet = -ENOMEM;
2411 } else {
2412 cbRet = -EOPNOTSUPP;
2413 }
2414 SFLOGFLOW(("vbsf_reg_copy_file_range: returns %zd\n", cbRet));
2415 return cbRet;
2416}
2417#endif /* > 4.5 */
2418
2419#ifdef SFLOG_ENABLED
2420/*
2421 * This is just for logging page faults and such.
2422 */
2423
2424/** Pointer to the ops generic_file_mmap returns the first time it's called. */
2425static struct vm_operations_struct const *g_pGenericFileVmOps = NULL;
2426/** Merge of g_LoggingVmOpsTemplate and g_pGenericFileVmOps. */
2427static struct vm_operations_struct g_LoggingVmOps;
2428
2429
2430/* Generic page fault callback: */
2431# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
2432static vm_fault_t vbsf_vmlog_fault(struct vm_fault *vmf)
2433{
2434 vm_fault_t rc;
2435 SFLOGFLOW(("vbsf_vmlog_fault: vmf=%p flags=%#x addr=%p\n", vmf, vmf->flags, vmf->address));
2436 rc = g_pGenericFileVmOps->fault(vmf);
2437 SFLOGFLOW(("vbsf_vmlog_fault: returns %d\n", rc));
2438 return rc;
2439}
2440# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
2441static int vbsf_vmlog_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2442{
2443 int rc;
2444# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
2445 SFLOGFLOW(("vbsf_vmlog_fault: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->address));
2446# else
2447 SFLOGFLOW(("vbsf_vmlog_fault: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->virtual_address));
2448# endif
2449 rc = g_pGenericFileVmOps->fault(vma, vmf);
2450 SFLOGFLOW(("vbsf_vmlog_fault: returns %d\n", rc));
2451 return rc;
2452}
2453# endif
2454
2455
2456/* Special/generic page fault handler: */
2457# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 26)
2458# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 1)
2459static struct page *vbsf_vmlog_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
2460{
2461 struct page *page;
2462 SFLOGFLOW(("vbsf_vmlog_nopage: vma=%p address=%p type=%p:{%#x}\n", vma, address, type, type ? *type : 0));
2463 page = g_pGenericFileVmOps->nopage(vma, address, type);
2464 SFLOGFLOW(("vbsf_vmlog_nopage: returns %p\n", page));
2465 return page;
2466}
2467# else
2468static struct page *vbsf_vmlog_nopage(struct vm_area_struct *vma, unsigned long address, int write_access_or_unused)
2469{
2470 struct page *page;
2471 SFLOGFLOW(("vbsf_vmlog_nopage: vma=%p address=%p wau=%d\n", vma, address, write_access_or_unused));
2472 page = g_pGenericFileVmOps->nopage(vma, address, write_access_or_unused);
2473 SFLOGFLOW(("vbsf_vmlog_nopage: returns %p\n", page));
2474 return page;
2475}
2476# endif /* < 2.6.26 */
2477
2478
2479/* Special page fault callback for making something writable: */
2480# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
2481static vm_fault_t vbsf_vmlog_page_mkwrite(struct vm_fault *vmf)
2482{
2483 vm_fault_t rc;
2484# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
2485 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vmf=%p flags=%#x addr=%p\n", vmf, vmf->flags, vmf->address));
2486# else
2487 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vmf=%p flags=%#x addr=%p\n", vmf, vmf->flags, vmf->virtual_address));
2488# endif
2489 rc = g_pGenericFileVmOps->page_mkwrite(vmf);
2490 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: returns %d\n", rc));
2491 return rc;
2492}
2493# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 30)
2494static int vbsf_vmlog_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2495{
2496 int rc;
2497 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->virtual_address));
2498 rc = g_pGenericFileVmOps->page_mkwrite(vma, vmf);
2499 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: returns %d\n", rc));
2500 return rc;
2501}
2502# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
2503static int vbsf_vmlog_page_mkwrite(struct vm_area_struct *vma, struct page *page)
2504{
2505 int rc;
2506 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vma=%p page=%p\n", vma, page));
2507 rc = g_pGenericFileVmOps->page_mkwrite(vma, page);
2508 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: returns %d\n", rc));
2509 return rc;
2510}
2511# endif
2512
2513
2514/* Special page fault callback for mapping pages: */
2515# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
2516static void vbsf_vmlog_map_pages(struct vm_fault *vmf, pgoff_t start, pgoff_t end)
2517{
2518 SFLOGFLOW(("vbsf_vmlog_map_pages: vmf=%p (flags=%#x addr=%p) start=%p end=%p\n", vmf, vmf->flags, vmf->address, start, end));
2519 g_pGenericFileVmOps->map_pages(vmf, start, end);
2520 SFLOGFLOW(("vbsf_vmlog_map_pages: returns\n"));
2521}
2522# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
2523static void vbsf_vmlog_map_pages(struct fault_env *fenv, pgoff_t start, pgoff_t end)
2524{
2525 SFLOGFLOW(("vbsf_vmlog_map_pages: fenv=%p (flags=%#x addr=%p) start=%p end=%p\n", fenv, fenv->flags, fenv->address, start, end));
2526 g_pGenericFileVmOps->map_pages(fenv, start, end);
2527 SFLOGFLOW(("vbsf_vmlog_map_pages: returns\n"));
2528}
2529# elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0)
2530static void vbsf_vmlog_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
2531{
2532 SFLOGFLOW(("vbsf_vmlog_map_pages: vma=%p vmf=%p (flags=%#x addr=%p)\n", vma, vmf, vmf->flags, vmf->virtual_address));
2533 g_pGenericFileVmOps->map_pages(vma, vmf);
2534 SFLOGFLOW(("vbsf_vmlog_map_pages: returns\n"));
2535}
2536# endif
2537
2538
2539/** Overload template. */
2540static struct vm_operations_struct const g_LoggingVmOpsTemplate = {
2541# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
2542 .fault = vbsf_vmlog_fault,
2543# endif
2544# if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 25)
2545 .nopage = vbsf_vmlog_nopage,
2546# endif
2547# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
2548 .page_mkwrite = vbsf_vmlog_page_mkwrite,
2549# endif
2550# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0)
2551 .map_pages = vbsf_vmlog_map_pages,
2552# endif
2553};
2554
2555/** file_operations::mmap wrapper for logging purposes. */
2556extern int vbsf_reg_mmap(struct file *file, struct vm_area_struct *vma)
2557{
2558 int rc;
2559 SFLOGFLOW(("vbsf_reg_mmap: file=%p vma=%p\n", file, vma));
2560 rc = generic_file_mmap(file, vma);
2561 if (rc == 0) {
2562 /* Merge the ops and template the first time thru (there's a race here). */
2563 if (g_pGenericFileVmOps == NULL) {
2564 uintptr_t const *puSrc1 = (uintptr_t *)vma->vm_ops;
2565 uintptr_t const *puSrc2 = (uintptr_t *)&g_LoggingVmOpsTemplate;
2566 uintptr_t volatile *puDst = (uintptr_t *)&g_LoggingVmOps;
2567 size_t cbLeft = sizeof(g_LoggingVmOps) / sizeof(*puDst);
2568 while (cbLeft-- > 0) {
2569 *puDst = *puSrc2 && *puSrc1 ? *puSrc2 : *puSrc1;
2570 puSrc1++;
2571 puSrc2++;
2572 puDst++;
2573 }
2574 g_pGenericFileVmOps = vma->vm_ops;
2575 vma->vm_ops = &g_LoggingVmOps;
2576 } else if (g_pGenericFileVmOps == vma->vm_ops)
2577 vma->vm_ops = &g_LoggingVmOps;
2578 else
2579 SFLOGFLOW(("vbsf_reg_mmap: Warning: vm_ops=%p, expected %p!\n", vma->vm_ops, g_pGenericFileVmOps));
2580 }
2581 SFLOGFLOW(("vbsf_reg_mmap: returns %d\n", rc));
2582 return rc;
2583}
2584
2585#endif /* SFLOG_ENABLED */
2586
2587
2588/**
2589 * File operations for regular files.
2590 */
2591struct file_operations vbsf_reg_fops = {
2592 .open = vbsf_reg_open,
2593 .read = vbsf_reg_read,
2594 .write = vbsf_reg_write,
2595#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2596 .read_iter = vbsf_reg_read_iter,
2597 .write_iter = vbsf_reg_write_iter,
2598#endif
2599 .release = vbsf_reg_release,
2600#ifdef SFLOG_ENABLED
2601 .mmap = vbsf_reg_mmap,
2602#else
2603 .mmap = generic_file_mmap,
2604#endif
2605#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
2606# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
2607/** @todo This code is known to cause caching of data which should not be
2608 * cached. Investigate. */
2609# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
2610 .splice_read = vbsf_splice_read,
2611# else
2612 .sendfile = generic_file_sendfile,
2613# endif
2614 .aio_read = generic_file_aio_read,
2615 .aio_write = generic_file_aio_write,
2616# endif
2617#endif
2618 .llseek = vbsf_reg_llseek,
2619 .fsync = vbsf_reg_fsync,
2620#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
2621 .copy_file_range = vbsf_reg_copy_file_range,
2622#endif
2623};
2624
2625struct inode_operations vbsf_reg_iops = {
2626#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 18)
2627 .getattr = vbsf_inode_getattr,
2628#else
2629 .revalidate = vbsf_inode_revalidate,
2630#endif
2631 .setattr = vbsf_inode_setattr,
2632};
2633
2634
2635#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
2636
2637/**
2638 * Used to read the content of a page into the page cache.
2639 *
2640 * Needed for mmap and reads+writes when the file is mmapped in a
2641 * shared+writeable fashion.
2642 */
2643static int vbsf_readpage(struct file *file, struct page *page)
2644{
2645 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
2646 int err;
2647
2648 SFLOGFLOW(("vbsf_readpage: inode=%p file=%p page=%p off=%#llx\n", inode, file, page, (uint64_t)page->index << PAGE_SHIFT));
2649 Assert(PageLocked(page));
2650
2651 if (PageUptodate(page)) {
2652 unlock_page(page);
2653 return 0;
2654 }
2655
2656 if (!is_bad_inode(inode)) {
2657 VBOXSFREADPGLSTREQ *pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
2658 if (pReq) {
2659 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
2660 struct vbsf_reg_info *sf_r = file->private_data;
2661 uint32_t cbRead;
2662 int vrc;
2663
2664 pReq->PgLst.offFirstPage = 0;
2665 pReq->PgLst.aPages[0] = page_to_phys(page);
2666 vrc = VbglR0SfHostReqReadPgLst(sf_g->map.root,
2667 pReq,
2668 sf_r->Handle.hHost,
2669 (uint64_t)page->index << PAGE_SHIFT,
2670 PAGE_SIZE,
2671 1 /*cPages*/);
2672
2673 cbRead = pReq->Parms.cb32Read.u.value32;
2674 AssertStmt(cbRead <= PAGE_SIZE, cbRead = PAGE_SIZE);
2675 VbglR0PhysHeapFree(pReq);
2676
2677 if (RT_SUCCESS(vrc)) {
2678 if (cbRead == PAGE_SIZE) {
2679 /* likely */
2680 } else {
2681 uint8_t *pbMapped = (uint8_t *)kmap(page);
2682 RT_BZERO(&pbMapped[cbRead], PAGE_SIZE - cbRead);
2683 kunmap(page);
2684 /** @todo truncate the inode file size? */
2685 }
2686
2687 flush_dcache_page(page);
2688 SetPageUptodate(page);
2689 unlock_page(page);
2690 return 0;
2691 }
2692 err = -RTErrConvertToErrno(vrc);
2693 } else
2694 err = -ENOMEM;
2695 } else
2696 err = -EIO;
2697 SetPageError(page);
2698 unlock_page(page);
2699 return err;
2700}
2701
2702
2703/**
2704 * Used to write out the content of a dirty page cache page to the host file.
2705 *
2706 * Needed for mmap and writes when the file is mmapped in a shared+writeable
2707 * fashion.
2708 */
2709static int vbsf_writepage(struct page *page, struct writeback_control *wbc)
2710{
2711 struct address_space *mapping = page->mapping;
2712 struct inode *inode = mapping->host;
2713 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
2714 struct vbsf_handle *pHandle = vbsf_handle_find(sf_i, VBSF_HANDLE_F_WRITE, VBSF_HANDLE_F_APPEND);
2715 int err;
2716
2717 SFLOGFLOW(("vbsf_writepage: inode=%p page=%p off=%#llx pHandle=%p (%#llx)\n",
2718 inode, page,(uint64_t)page->index << PAGE_SHIFT, pHandle, pHandle->hHost));
2719
2720 if (pHandle) {
2721 struct vbsf_super_info *sf_g = VBSF_GET_SUPER_INFO(inode->i_sb);
2722 VBOXSFWRITEPGLSTREQ *pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
2723 if (pReq) {
2724 uint64_t const cbFile = i_size_read(inode);
2725 uint64_t const offInFile = (uint64_t)page->index << PAGE_SHIFT;
2726 uint32_t const cbToWrite = page->index != (cbFile >> PAGE_SHIFT) ? PAGE_SIZE
2727 : (uint32_t)cbFile & (uint32_t)PAGE_OFFSET_MASK;
2728 int vrc;
2729
2730 pReq->PgLst.offFirstPage = 0;
2731 pReq->PgLst.aPages[0] = page_to_phys(page);
2732 vrc = VbglR0SfHostReqWritePgLst(sf_g->map.root,
2733 pReq,
2734 pHandle->hHost,
2735 offInFile,
2736 cbToWrite,
2737 1 /*cPages*/);
2738 AssertMsgStmt(pReq->Parms.cb32Write.u.value32 == cbToWrite || RT_FAILURE(vrc), /* lazy bird */
2739 ("%#x vs %#x\n", pReq->Parms.cb32Write, cbToWrite),
2740 vrc = VERR_WRITE_ERROR);
2741 VbglR0PhysHeapFree(pReq);
2742
2743 if (RT_SUCCESS(vrc)) {
2744 /* Update the inode if we've extended the file. */
2745 /** @todo is this necessary given the cbToWrite calc above? */
2746 uint64_t const offEndOfWrite = offInFile + cbToWrite;
2747 if ( offEndOfWrite > cbFile
2748 && offEndOfWrite > i_size_read(inode))
2749 i_size_write(inode, offEndOfWrite);
2750
2751 if (PageError(page))
2752 ClearPageError(page);
2753
2754 err = 0;
2755 } else {
2756 ClearPageUptodate(page);
2757 err = -EPROTO;
2758 }
2759 } else
2760 err = -ENOMEM;
2761 vbsf_handle_release(pHandle, sf_g, "vbsf_writepage");
2762 } else {
2763 static uint64_t volatile s_cCalls = 0;
2764 if (s_cCalls++ < 16)
2765 printk("vbsf_writepage: no writable handle for %s..\n", sf_i->path->String.ach);
2766 err = -EPROTO;
2767 }
2768 unlock_page(page);
2769 return err;
2770}
2771
2772# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
2773/**
2774 * Called when writing thru the page cache (which we shouldn't be doing).
2775 */
2776int vbsf_write_begin(struct file *file, struct address_space *mapping, loff_t pos,
2777 unsigned len, unsigned flags, struct page **pagep, void **fsdata)
2778{
2779 /** @todo r=bird: We shouldn't ever get here, should we? Because we don't use
2780 * the page cache for any writes AFAIK. We could just as well use
2781 * simple_write_begin & simple_write_end here if we think we really
2782 * need to have non-NULL function pointers in the table... */
2783 static uint64_t volatile s_cCalls = 0;
2784 if (s_cCalls++ < 16) {
2785 printk("vboxsf: Unexpected call to vbsf_write_begin(pos=%#llx len=%#x flags=%#x)! Please report.\n",
2786 (unsigned long long)pos, len, flags);
2787 RTLogBackdoorPrintf("vboxsf: Unexpected call to vbsf_write_begin(pos=%#llx len=%#x flags=%#x)! Please report.\n",
2788 (unsigned long long)pos, len, flags);
2789# ifdef WARN_ON
2790 WARN_ON(1);
2791# endif
2792 }
2793 return simple_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
2794}
2795# endif /* KERNEL_VERSION >= 2.6.24 */
2796
2797# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
2798/**
2799 * This is needed to make open accept O_DIRECT as well as dealing with direct
2800 * I/O requests if we don't intercept them earlier.
2801 */
2802# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
2803static ssize_t vbsf_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
2804# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)
2805static ssize_t vbsf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2806# elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2807static ssize_t vbsf_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2808# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 6)
2809static ssize_t vbsf_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2810# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 55)
2811static int vbsf_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2812# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 41)
2813static int vbsf_direct_IO(int rw, struct file *file, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2814# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 35)
2815static int vbsf_direct_IO(int rw, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2816# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 26)
2817static int vbsf_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset, size_t count)
2818# else
2819static int vbsf_direct_IO(int rw, struct inode *inode, struct kiobuf *, unsigned long, int)
2820# endif
2821{
2822 TRACE();
2823 return -EINVAL;
2824}
2825# endif
2826
2827/**
2828 * Address space (for the page cache) operations for regular files.
2829 *
2830 * @todo the FsPerf touch/flush (mmap) test fails on 4.4.0 (ubuntu 16.04 lts).
2831 */
2832struct address_space_operations vbsf_reg_aops = {
2833 .readpage = vbsf_readpage,
2834 .writepage = vbsf_writepage,
2835 /** @todo Need .writepages if we want msync performance... */
2836# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
2837 .set_page_dirty = __set_page_dirty_buffers,
2838# endif
2839# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
2840 .write_begin = vbsf_write_begin,
2841 .write_end = simple_write_end,
2842# else
2843 .prepare_write = simple_prepare_write,
2844 .commit_write = simple_commit_write,
2845# endif
2846# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
2847 .direct_IO = vbsf_direct_IO,
2848# endif
2849};
2850
2851#endif /* LINUX_VERSION_CODE >= 2.6.0 */
2852
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette