VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/VBoxNetFlt/linux/VBoxNetFlt-linux.c@33141

Last change on this file since 33141 was 32856, checked in by vboxsync, 14 years ago

vboxnetflt: qdisc compilation fix for newer kernels (#4814)

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 82.9 KB
1/* $Id: VBoxNetFlt-linux.c 32856 2010-10-01 07:57:17Z vboxsync $ */
2/** @file
3 * VBoxNetFlt - Network Filter Driver (Host), Linux Specific Code.
4 */
5
6/*
7 * Copyright (C) 2006-2008 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*******************************************************************************
19* Header Files *
20*******************************************************************************/
21#define LOG_GROUP LOG_GROUP_NET_FLT_DRV
22#define VBOXNETFLT_LINUX_NO_XMIT_QUEUE
23#include "the-linux-kernel.h"
24#include "version-generated.h"
25#include "product-generated.h"
26#include <linux/netdevice.h>
27#include <linux/etherdevice.h>
28#include <linux/rtnetlink.h>
29#include <linux/miscdevice.h>
30#include <linux/ip.h>
31
32#include <VBox/log.h>
33#include <VBox/err.h>
34#include <VBox/intnetinline.h>
35#include <VBox/pdmnetinline.h>
36#include <VBox/param.h>
37#include <iprt/alloca.h>
38#include <iprt/assert.h>
39#include <iprt/spinlock.h>
40#include <iprt/semaphore.h>
41#include <iprt/initterm.h>
42#include <iprt/process.h>
43#include <iprt/mem.h>
44#include <iprt/net.h>
45#include <iprt/log.h>
46#include <iprt/mp.h>
47#include <iprt/mem.h>
48#include <iprt/time.h>
49
50#define VBOXNETFLT_OS_SPECFIC 1
51#include "../VBoxNetFltInternal.h"
52
53#ifdef CONFIG_NET_SCHED
54# define VBOXNETFLT_WITH_QDISC /* Comment this out to disable qdisc support */
55# ifdef VBOXNETFLT_WITH_QDISC
56# include <net/pkt_sched.h>
57# endif /* VBOXNETFLT_WITH_QDISC */
58#endif
59
60
61/*******************************************************************************
62* Defined Constants And Macros *
63*******************************************************************************/
64#define VBOX_FLT_NB_TO_INST(pNB) RT_FROM_MEMBER(pNB, VBOXNETFLTINS, u.s.Notifier)
65#define VBOX_FLT_PT_TO_INST(pPT) RT_FROM_MEMBER(pPT, VBOXNETFLTINS, u.s.PacketType)
66#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
67# define VBOX_FLT_XT_TO_INST(pXT) RT_FROM_MEMBER(pXT, VBOXNETFLTINS, u.s.XmitTask)
68#endif
69
70#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
71# define VBOX_SKB_RESET_NETWORK_HDR(skb) skb_reset_network_header(skb)
72# define VBOX_SKB_RESET_MAC_HDR(skb) skb_reset_mac_header(skb)
73#else
74# define VBOX_SKB_RESET_NETWORK_HDR(skb) skb->nh.raw = skb->data
75# define VBOX_SKB_RESET_MAC_HDR(skb) skb->mac.raw = skb->data
76#endif
77
78#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
79# define VBOX_SKB_CHECKSUM_HELP(skb) skb_checksum_help(skb)
80#else
81# define CHECKSUM_PARTIAL CHECKSUM_HW
82# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 10)
83# define VBOX_SKB_CHECKSUM_HELP(skb) skb_checksum_help(skb, 0)
84# else
85# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 7)
86# define VBOX_SKB_CHECKSUM_HELP(skb) skb_checksum_help(&skb, 0)
87# else
88# define VBOX_SKB_CHECKSUM_HELP(skb) (!skb_checksum_help(skb))
89# endif
90/* Versions prior to 2.6.10 use 'stats' for both 'bstats' and 'qstats'. */
91# define bstats stats
92# define qstats stats
93# endif
94#endif
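/*
 * Editor's sketch (not part of the original driver): how the
 * VBOX_SKB_CHECKSUM_HELP compatibility macro above is meant to be used.
 * The call site stays the same on every supported kernel; the macro
 * absorbs the signature differences. The transmit path later in this
 * file does essentially this. The helper name is illustrative only.
 */
#if 0 /* example only, not compiled */
static int vboxExampleFinalizeChecksum(struct sk_buff *pBuf)
{
    /* Complete a partial checksum in software before handing the frame
       to a consumer that cannot do checksum offloading. */
    if (pBuf->ip_summed == CHECKSUM_PARTIAL)
        return VBOX_SKB_CHECKSUM_HELP(pBuf);
    return 0;
}
#endif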
95
96#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 13)
97static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
98{
99 kfree_skb(skb);
100 sch->stats.drops++;
101
102 return NET_XMIT_DROP;
103}
104#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 13) */
105
106#ifndef NET_IP_ALIGN
107# define NET_IP_ALIGN 2
108#endif
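/*
 * Editor's note: NET_IP_ALIGN is the padding reserved at the head of a
 * freshly allocated sk_buff so that the 14-byte Ethernet header leaves
 * the IP header 16-byte aligned; vboxNetFltLinuxSkBufFromSG below calls
 * skb_reserve(pPkt, NET_IP_ALIGN) for exactly this reason.
 */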
109
110#if 0
111/** Create scatter / gather segments for fragments. When not used, we will
112 * linearize the socket buffer before creating the internal networking SG. */
113# define VBOXNETFLT_SG_SUPPORT 1
114#endif
115
116#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
117/** Indicates that the Linux kernel may send us GSO frames. */
118# define VBOXNETFLT_WITH_GSO 1
119
120/** This enables or disables the transmission of GSO frames from the internal
121 * network to the host. */
122# define VBOXNETFLT_WITH_GSO_XMIT_HOST 1
123
124# if 0 /** @todo This is currently disabled because it causes a performance loss of 5-10%. */
125/** This enables or disables the transmission of GSO frames from the internal
126 * network to the wire. */
127# define VBOXNETFLT_WITH_GSO_XMIT_WIRE 1
128# endif
129
130/** This enables or disables the forwarding/flooding of GSO frames from the host
131 * to the internal network. */
132# define VBOXNETFLT_WITH_GSO_RECV 1
133
134#endif
135
136#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29)
137/** This enables or disables handling of GSO frames coming from the wire (GRO). */
138# define VBOXNETFLT_WITH_GRO 1
139#endif
140/*
141 * GRO support was backported to RHEL 5.4
142 */
143#ifdef RHEL_RELEASE_CODE
144# if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 4)
145# define VBOXNETFLT_WITH_GRO 1
146# endif
147#endif
148
149/*******************************************************************************
150* Internal Functions *
151*******************************************************************************/
152static int VBoxNetFltLinuxInit(void);
153static void VBoxNetFltLinuxUnload(void);
154static void vboxNetFltLinuxForwardToIntNet(PVBOXNETFLTINS pThis, struct sk_buff *pBuf);
155
156
157/*******************************************************************************
158* Global Variables *
159*******************************************************************************/
160/**
161 * The (common) global data.
162 */
163static VBOXNETFLTGLOBALS g_VBoxNetFltGlobals;
164
165module_init(VBoxNetFltLinuxInit);
166module_exit(VBoxNetFltLinuxUnload);
167
168MODULE_AUTHOR(VBOX_VENDOR);
169MODULE_DESCRIPTION(VBOX_PRODUCT " Network Filter Driver");
170MODULE_LICENSE("GPL");
171#ifdef MODULE_VERSION
172MODULE_VERSION(VBOX_VERSION_STRING " (" RT_XSTR(INTNETTRUNKIFPORT_VERSION) ")");
173#endif
174
175
176#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 12) && defined(LOG_ENABLED)
177unsigned dev_get_flags(const struct net_device *dev)
178{
179 unsigned flags;
180
181 flags = (dev->flags & ~(IFF_PROMISC |
182 IFF_ALLMULTI |
183 IFF_RUNNING)) |
184 (dev->gflags & (IFF_PROMISC |
185 IFF_ALLMULTI));
186
187 if (netif_running(dev) && netif_carrier_ok(dev))
188 flags |= IFF_RUNNING;
189
190 return flags;
191}
192#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 12) */
193
194
195#ifdef VBOXNETFLT_WITH_QDISC
196//#define QDISC_LOG(x) printk x
197# define QDISC_LOG(x) do { } while (0)
198
199# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
200# define QDISC_CREATE(dev, queue, ops, parent) qdisc_create_dflt(dev, ops)
201# elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
202# define QDISC_CREATE(dev, queue, ops, parent) qdisc_create_dflt(dev, ops, parent)
203# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
204# define QDISC_CREATE(dev, queue, ops, parent) qdisc_create_dflt(dev, queue, ops, parent)
205# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
206
207# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
208# define qdisc_dev(qdisc) (qdisc->dev)
209# define qdisc_pkt_len(skb) (skb->len)
210# define QDISC_GET(dev) (dev->qdisc_sleeping)
211# else
212# define QDISC_GET(dev) (netdev_get_tx_queue(dev, 0)->qdisc_sleeping)
213# endif
214
215# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
216# define QDISC_SAVED_NUM(dev) 1
217# elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
218# define QDISC_SAVED_NUM(dev) dev->num_tx_queues
219# else
220# define QDISC_SAVED_NUM(dev) dev->num_tx_queues+1
221# endif
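/*
 * Editor's note: the extra slot on 2.6.32+ is for the root qdisc pointer
 * that those kernels keep in struct net_device itself, in addition to the
 * per-TX-queue pointers; see the install/remove code further down, which
 * saves and restores that pointer separately.
 */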
222
223# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
224# define QDISC_IS_BUSY(dev, qdisc) test_bit(__LINK_STATE_SCHED, &dev->state)
225# elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36)
226# define QDISC_IS_BUSY(dev, qdisc) (test_bit(__QDISC_STATE_RUNNING, &qdisc->state) || \
227 test_bit(__QDISC_STATE_SCHED, &qdisc->state))
228# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 36) */
229# define QDISC_IS_BUSY(dev, qdisc) (qdisc_is_running(qdisc) || \
230 test_bit(__QDISC_STATE_SCHED, &qdisc->state))
231# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 36) */
232
233struct VBoxNetQDiscPriv
234{
235 /** Pointer to the single child qdisc. */
236 struct Qdisc *pChild;
237 /*
238 * Technically it is possible to have different qdiscs for different TX
239 * queues so we have to save them all.
240 */
241 /** Pointer to the array of saved qdiscs. */
242 struct Qdisc **ppSaved;
243 /** Pointer to the net filter instance. */
244 PVBOXNETFLTINS pVBoxNetFlt;
245};
246typedef struct VBoxNetQDiscPriv *PVBOXNETQDISCPRIV;
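/*
 * Editor's sketch (illustrative, not part of the driver): the kernel
 * allocates the private area directly behind struct Qdisc, sized by the
 * priv_size member of the Qdisc_ops (see g_VBoxNetFltQDiscOps below), and
 * qdisc_priv() is plain pointer arithmetic into it. Every handler in this
 * file starts with this retrieval:
 */
#if 0 /* example only, not compiled */
static void vboxExampleShowPriv(struct Qdisc *sch)
{
    PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch); /* no allocation involved */
    printk("child=%p flt=%p\n", pPriv->pChild, pPriv->pVBoxNetFlt);
}
#endif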
247
248//#define VBOXNETFLT_QDISC_ENQUEUE
249static int vboxNetFltQdiscEnqueue(struct sk_buff *skb, struct Qdisc *sch)
250{
251 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
252 int rc;
253
254# ifdef VBOXNETFLT_QDISC_ENQUEUE
255 if (VALID_PTR(pPriv->pVBoxNetFlt))
256 {
257 uint8_t abHdrBuf[sizeof(RTNETETHERHDR) + sizeof(uint32_t) + RTNETIPV4_MIN_LEN];
258 PCRTNETETHERHDR pEtherHdr;
259 PINTNETTRUNKSWPORT pSwitchPort;
260 uint32_t cbHdrs = skb_headlen(skb);
261
262 cbHdrs = RT_MIN(cbHdrs, sizeof(abHdrBuf));
263 pEtherHdr = (PCRTNETETHERHDR)skb_header_pointer(skb, 0, cbHdrs, &abHdrBuf[0]);
264 if ( pEtherHdr
265 && (pSwitchPort = pPriv->pVBoxNetFlt->pSwitchPort) != NULL
266 && VALID_PTR(pSwitchPort)
267 && cbHdrs >= 6)
268 {
269 /** @todo consider reference counting, etc. */
270 INTNETSWDECISION enmDecision = pSwitchPort->pfnPreRecv(pSwitchPort, pEtherHdr, cbHdrs, INTNETTRUNKDIR_HOST);
271 if (enmDecision == INTNETSWDECISION_INTNET)
272 {
273 struct sk_buff *pBuf = skb_copy(skb, GFP_ATOMIC);
274 if (pBuf) pBuf->pkt_type = PACKET_OUTGOING;
275 if (pBuf) vboxNetFltLinuxForwardToIntNet(pPriv->pVBoxNetFlt, pBuf);
276 qdisc_drop(skb, sch);
277 ++sch->bstats.packets;
278 sch->bstats.bytes += qdisc_pkt_len(skb);
279 return NET_XMIT_SUCCESS;
280 }
281 }
282 }
283# endif /* VBOXNETFLT_QDISC_ENQUEUE */
284 rc = pPriv->pChild->enqueue(skb, pPriv->pChild);
285 if (rc == NET_XMIT_SUCCESS)
286 {
287 ++sch->q.qlen;
288 ++sch->bstats.packets;
289 sch->bstats.bytes += qdisc_pkt_len(skb);
290 }
291 else
292 ++sch->qstats.drops;
293 return rc;
294}
295
296static struct sk_buff *vboxNetFltQdiscDequeue(struct Qdisc *sch)
297{
298 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
299# ifdef VBOXNETFLT_QDISC_ENQUEUE
300 --sch->q.qlen;
301 return pPriv->pChild->dequeue(pPriv->pChild);
302# else /* VBOXNETFLT_QDISC_ENQUEUE */
303 uint8_t abHdrBuf[sizeof(RTNETETHERHDR) + sizeof(uint32_t) + RTNETIPV4_MIN_LEN];
304 PCRTNETETHERHDR pEtherHdr;
305 PINTNETTRUNKSWPORT pSwitchPort;
306 struct sk_buff *pSkb;
307
308 QDISC_LOG(("vboxNetFltDequeue: Enter pThis=%p\n", pPriv->pVBoxNetFlt));
309
310 while ((pSkb = pPriv->pChild->dequeue(pPriv->pChild)) != NULL)
311 {
312 struct sk_buff *pBuf;
313 INTNETSWDECISION enmDecision;
314 uint32_t cbHdrs;
315
316 --sch->q.qlen;
317
318 if (!VALID_PTR(pPriv->pVBoxNetFlt))
319 break;
320
321 cbHdrs = skb_headlen(pSkb);
322 cbHdrs = RT_MIN(cbHdrs, sizeof(abHdrBuf));
323 pEtherHdr = (PCRTNETETHERHDR)skb_header_pointer(pSkb, 0, cbHdrs, &abHdrBuf[0]);
324 if ( !pEtherHdr
325 || (pSwitchPort = pPriv->pVBoxNetFlt->pSwitchPort) == NULL
326 || !VALID_PTR(pSwitchPort)
327 || cbHdrs < 6)
328 break;
329
330 /** @todo consider reference counting, etc. */
331 enmDecision = pSwitchPort->pfnPreRecv(pSwitchPort, pEtherHdr, cbHdrs, INTNETTRUNKDIR_HOST);
332 if (enmDecision != INTNETSWDECISION_INTNET)
333 break;
334
335 pBuf = skb_copy(pSkb, GFP_ATOMIC);
336 if (pBuf) pBuf->pkt_type = PACKET_OUTGOING;
337 QDISC_LOG(("vboxNetFltDequeue: pThis=%p\n", pPriv->pVBoxNetFlt));
338 if (pBuf) vboxNetFltLinuxForwardToIntNet(pPriv->pVBoxNetFlt, pBuf);
339 qdisc_drop(pSkb, sch);
340 QDISC_LOG(("VBoxNetFlt: Packet for %02x:%02x:%02x:%02x:%02x:%02x dropped\n",
341 pSkb->data[0], pSkb->data[1], pSkb->data[2],
342 pSkb->data[3], pSkb->data[4], pSkb->data[5]));
343 }
344
345 return pSkb;
346# endif /* VBOXNETFLT_QDISC_ENQUEUE */
347}
348
349# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 29)
350static int vboxNetFltQdiscRequeue(struct sk_buff *skb, struct Qdisc *sch)
351{
352 int rc;
353 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
354
355 rc = pPriv->pChild->ops->requeue(skb, pPriv->pChild);
356 if (rc == 0)
357 {
358 sch->q.qlen++;
359# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 10)
360 sch->qstats.requeues++;
361# endif
362 }
363
364 return rc;
365}
366# endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 29) */
367
368static unsigned int vboxNetFltQdiscDrop(struct Qdisc *sch)
369{
370 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
371 unsigned int cbLen;
372
373 if (pPriv->pChild->ops->drop)
374 {
375 cbLen = pPriv->pChild->ops->drop(pPriv->pChild);
376 if (cbLen != 0)
377 {
378 ++sch->qstats.drops;
379 --sch->q.qlen;
380 return cbLen;
381 }
382 }
383
384 return 0;
385}
386
387# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 25)
388static int vboxNetFltQdiscInit(struct Qdisc *sch, struct rtattr *opt)
389# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
390static int vboxNetFltQdiscInit(struct Qdisc *sch, struct nlattr *opt)
391# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
392{
393 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
394 struct net_device *pDev = qdisc_dev(sch);
395
396 pPriv->pVBoxNetFlt = NULL;
397
398 pPriv->ppSaved = kcalloc(QDISC_SAVED_NUM(pDev), sizeof(pPriv->ppSaved[0]),
399 GFP_KERNEL);
400 if (!pPriv->ppSaved)
401 return -ENOMEM;
402
403 pPriv->pChild = QDISC_CREATE(pDev, netdev_get_tx_queue(pDev, 0),
404 &pfifo_qdisc_ops,
405 TC_H_MAKE(TC_H_MAJ(sch->handle),
406 TC_H_MIN(1)));
407 if (!pPriv->pChild)
408 {
409 kfree(pPriv->ppSaved);
410 pPriv->ppSaved = NULL;
411 return -ENOMEM;
412 }
413
414 return 0;
415}
416
417static void vboxNetFltQdiscReset(struct Qdisc *sch)
418{
419 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
420
421 qdisc_reset(pPriv->pChild);
422 sch->q.qlen = 0;
423 sch->qstats.backlog = 0;
424}
425
426static void vboxNetFltQdiscDestroy(struct Qdisc* sch)
427{
428 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
429 struct net_device *pDev = qdisc_dev(sch);
430
431 qdisc_destroy(pPriv->pChild);
432 pPriv->pChild = NULL;
433
434 if (pPriv->ppSaved)
435 {
436 int i;
437 for (i = 0; i < QDISC_SAVED_NUM(pDev); i++)
438 if (pPriv->ppSaved[i])
439 qdisc_destroy(pPriv->ppSaved[i]);
440 kfree(pPriv->ppSaved);
441 pPriv->ppSaved = NULL;
442 }
443}
444
445static int vboxNetFltClassGraft(struct Qdisc *sch, unsigned long arg, struct Qdisc *pNew,
446 struct Qdisc **ppOld)
447{
448 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
449
450 if (pNew == NULL)
451 pNew = &noop_qdisc;
452
453 sch_tree_lock(sch);
454 *ppOld = pPriv->pChild;
455 pPriv->pChild = pNew;
456# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
457 sch->q.qlen = 0;
458# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20) */
459 qdisc_tree_decrease_qlen(*ppOld, (*ppOld)->q.qlen);
460# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20) */
461 qdisc_reset(*ppOld);
462 sch_tree_unlock(sch);
463
464 return 0;
465}
466
467static struct Qdisc *vboxNetFltClassLeaf(struct Qdisc *sch, unsigned long arg)
468{
469 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
470 return pPriv->pChild;
471}
472
473static unsigned long vboxNetFltClassGet(struct Qdisc *sch, u32 classid)
474{
475 return 1;
476}
477
478static void vboxNetFltClassPut(struct Qdisc *sch, unsigned long arg)
479{
480}
481
482# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 25)
483static int vboxNetFltClassChange(struct Qdisc *sch, u32 classid, u32 parentid,
484 struct rtattr **tca, unsigned long *arg)
485# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
486static int vboxNetFltClassChange(struct Qdisc *sch, u32 classid, u32 parentid,
487 struct nlattr **tca, unsigned long *arg)
488# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
489{
490 return -ENOSYS;
491}
492
493static int vboxNetFltClassDelete(struct Qdisc *sch, unsigned long arg)
494{
495 return -ENOSYS;
496}
497
498static void vboxNetFltClassWalk(struct Qdisc *sch, struct qdisc_walker *walker)
499{
500 if (!walker->stop) {
501 if (walker->count >= walker->skip)
502 if (walker->fn(sch, 1, walker) < 0) {
503 walker->stop = 1;
504 return;
505 }
506 walker->count++;
507 }
508}
509
510# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
511static struct tcf_proto **vboxNetFltClassFindTcf(struct Qdisc *sch, unsigned long cl)
512{
513 return NULL;
514}
515# endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32) */
516
517static int vboxNetFltClassDump(struct Qdisc *sch, unsigned long cl,
518 struct sk_buff *skb, struct tcmsg *tcm)
519{
520 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
521
522 if (cl != 1)
523 return -ENOENT;
524
525 tcm->tcm_handle |= TC_H_MIN(1);
526 tcm->tcm_info = pPriv->pChild->handle;
527
528 return 0;
529}
530
531
532static struct Qdisc_class_ops g_VBoxNetFltClassOps =
533{
534 .graft = vboxNetFltClassGraft,
535 .leaf = vboxNetFltClassLeaf,
536 .get = vboxNetFltClassGet,
537 .put = vboxNetFltClassPut,
538 .change = vboxNetFltClassChange,
539 .delete = vboxNetFltClassDelete,
540 .walk = vboxNetFltClassWalk,
541# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
542 .tcf_chain = vboxNetFltClassFindTcf,
543# endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32) */
544 .dump = vboxNetFltClassDump,
545};
546
547
548static struct Qdisc_ops g_VBoxNetFltQDiscOps = {
549 .cl_ops = &g_VBoxNetFltClassOps,
550 .id = "vboxnetflt",
551 .priv_size = sizeof(struct VBoxNetQDiscPriv),
552 .enqueue = vboxNetFltQdiscEnqueue,
553 .dequeue = vboxNetFltQdiscDequeue,
554# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 29)
555 .requeue = vboxNetFltQdiscRequeue,
556# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29) */
557 .peek = qdisc_peek_dequeued,
558# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29) */
559 .drop = vboxNetFltQdiscDrop,
560 .init = vboxNetFltQdiscInit,
561 .reset = vboxNetFltQdiscReset,
562 .destroy = vboxNetFltQdiscDestroy,
563 .owner = THIS_MODULE
564};
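/*
 * Editor's note: once the ops table above is registered (see
 * VBoxNetFltLinuxInit below), the qdisc can also be attached manually from
 * user space; the id string "vboxnetflt" is what tc matches on, roughly:
 *
 *     tc qdisc add dev eth0 root vboxnetflt
 *
 * (the interface name is only an example). vboxNetFltLinuxQdiscInstall
 * below detects that case and reuses the already attached instance.
 */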
565
566/*
567 * If our qdisc is already attached to the device (that means the user
568 * installed it from the command line with the 'tc' command) we simply update
569 * the pointer to the vboxnetflt instance in the qdisc's private structure.
570 * Otherwise we need to take some additional steps:
571 * - Create our qdisc;
572 * - Save all references to qdiscs;
573 * - Replace our child with the first qdisc reference;
574 * - Replace all references so they point to our qdisc.
575 */
576static void vboxNetFltLinuxQdiscInstall(PVBOXNETFLTINS pThis, struct net_device *pDev)
577{
578# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
579 int i;
580# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
581 PVBOXNETQDISCPRIV pPriv;
582
583 struct Qdisc *pExisting = QDISC_GET(pDev);
584 if (strcmp(pExisting->ops->id, "vboxnetflt"))
585 {
586 /* The existing qdisc is different from ours, let's create new one. */
587 struct Qdisc *pNew = QDISC_CREATE(pDev, netdev_get_tx_queue(pDev, 0),
588 &g_VBoxNetFltQDiscOps, TC_H_ROOT);
589 if (!pNew)
590 return; // TODO: Error?
591
592 if (!try_module_get(THIS_MODULE))
593 {
594 /*
595 * This may cause a memory leak but calling qdisc_destroy()
596 * is not an option as it will call module_put().
597 */
598 return;
599 }
600 pPriv = qdisc_priv(pNew);
601
602 qdisc_destroy(pPriv->pChild);
603 pPriv->pChild = QDISC_GET(pDev);
604 atomic_inc(&pPriv->pChild->refcnt);
605 /*
606 * There is no need to deactivate the device or acquire any locks
607 * prior to changing qdiscs since we do not destroy the old qdisc.
608 * Atomic replacement of pointers is enough.
609 */
610 /*
611 * No need to change reference counters here as we merely move
612 * the pointer and the reference counter of the newly allocated
613 * qdisc is already 1.
614 */
615# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
616 pPriv->ppSaved[0] = pDev->qdisc_sleeping;
617 ASMAtomicWritePtr(&pDev->qdisc_sleeping, pNew);
618 ASMAtomicWritePtr(&pDev->qdisc, pNew);
619# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
620 for (i = 0; i < pDev->num_tx_queues; i++)
621 {
622 struct netdev_queue *pQueue = netdev_get_tx_queue(pDev, i);
623
624 pPriv->ppSaved[i] = pQueue->qdisc_sleeping;
625 ASMAtomicWritePtr(&pQueue->qdisc_sleeping, pNew);
626 ASMAtomicWritePtr(&pQueue->qdisc, pNew);
627 if (i)
628 atomic_inc(&pNew->refcnt);
629 }
630 /* Newer kernels store root qdisc in netdev structure as well. */
631# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
632 pPriv->ppSaved[pDev->num_tx_queues] = pDev->qdisc;
633 ASMAtomicWritePtr(&pDev->qdisc, pNew);
634 atomic_inc(&pNew->refcnt);
635# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32) */
636# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
637 /* Sync the queue length with our child. */
638 pNew->q.qlen = pPriv->pChild->q.qlen;
639 }
640 else
641 {
642 /* We already have the vboxnetflt qdisc, let's use it. */
643 pPriv = qdisc_priv(pExisting);
644 }
645 ASMAtomicWritePtr(&pPriv->pVBoxNetFlt, pThis);
646 QDISC_LOG(("vboxNetFltLinuxInstallQdisc: pThis=%p\n", pPriv->pVBoxNetFlt));
647}
648
649static void vboxNetFltLinuxQdiscRemove(PVBOXNETFLTINS pThis, struct net_device *pDev)
650{
651# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
652 int i;
653# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
654 PVBOXNETQDISCPRIV pPriv;
655 struct Qdisc *pQdisc, *pChild;
656 if (!pDev)
657 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
658 if (!VALID_PTR(pDev))
659 {
660 printk("VBoxNetFlt: Failed to detach qdisc, invalid device pointer: %p\n",
661 pDev);
662 return; // TODO: Consider returning an error
663 }
664
665
666 pQdisc = QDISC_GET(pDev);
667 if (strcmp(pQdisc->ops->id, "vboxnetflt"))
668 {
669 /* Looks like the user has replaced our qdisc manually. */
670 printk("VBoxNetFlt: Failed to detach qdisc, wrong qdisc: %s\n",
671 pQdisc->ops->id);
672 return; // TODO: Consider returning an error
673 }
674
675 pPriv = qdisc_priv(pQdisc);
676 Assert(pPriv->pVBoxNetFlt == pThis);
677 ASMAtomicWriteNullPtr(&pPriv->pVBoxNetFlt);
678 pChild = ASMAtomicXchgPtrT(&pPriv->pChild, &noop_qdisc, struct Qdisc *);
679 qdisc_destroy(pChild); /* It won't be the last reference. */
680
681 QDISC_LOG(("vboxNetFltLinuxQdiscRemove: refcnt=%d num_tx_queues=%d\n",
682 atomic_read(&pQdisc->refcnt), pDev->num_tx_queues));
683# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
684 /* Play it safe, make sure the qdisc is not being used. */
685 if (pPriv->ppSaved[0])
686 {
687 ASMAtomicWritePtr(&pDev->qdisc_sleeping, pPriv->ppSaved[0]);
688 ASMAtomicWritePtr(&pDev->qdisc, pPriv->ppSaved[0]);
689 pPriv->ppSaved[0] = NULL;
690 while (QDISC_IS_BUSY(pDev, pQdisc))
691 yield();
692 qdisc_destroy(pQdisc); /* Destroy reference */
693 }
694# else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
695 for (i = 0; i < pDev->num_tx_queues; i++)
696 {
697 struct netdev_queue *pQueue = netdev_get_tx_queue(pDev, i);
698 if (pPriv->ppSaved[i])
699 {
700 Assert(pQueue->qdisc_sleeping == pQdisc);
701 ASMAtomicWritePtr(&pQueue->qdisc_sleeping, pPriv->ppSaved[i]);
702 ASMAtomicWritePtr(&pQueue->qdisc, pPriv->ppSaved[i]);
703 pPriv->ppSaved[i] = NULL;
704 while (QDISC_IS_BUSY(pDev, pQdisc))
705 yield();
706 qdisc_destroy(pQdisc); /* Destroy reference */
707 }
708 }
709 /* Newer kernels store root qdisc in netdev structure as well. */
710# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
711 ASMAtomicWritePtr(&pDev->qdisc, pPriv->ppSaved[pDev->num_tx_queues]);
712 pPriv->ppSaved[pDev->num_tx_queues] = NULL;
713 while (QDISC_IS_BUSY(pDev, pQdisc))
714 yield();
715 qdisc_destroy(pQdisc); /* Destroy reference */
716# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32) */
717# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
718
719 /*
720 * At this point all references to our qdisc should be gone
721 * unless the user had installed it manually.
722 */
723 QDISC_LOG(("vboxNetFltLinuxRemoveQdisc: pThis=%p\n", pPriv->pVBoxNetFlt));
724}
725
726#endif /* VBOXNETFLT_WITH_QDISC */
727
728
729/**
730 * Initialize module.
731 *
732 * @returns appropriate status code.
733 */
734static int __init VBoxNetFltLinuxInit(void)
735{
736 int rc;
737 /*
738 * Initialize IPRT.
739 */
740 rc = RTR0Init(0);
741 if (RT_SUCCESS(rc))
742 {
743 Log(("VBoxNetFltLinuxInit\n"));
744
745 /*
746 * Initialize the globals and connect to the support driver.
747 *
748 * This will call back vboxNetFltOsOpenSupDrv (and maybe vboxNetFltOsCloseSupDrv)
749 * for establishing the connection to the support driver.
750 */
751 memset(&g_VBoxNetFltGlobals, 0, sizeof(g_VBoxNetFltGlobals));
752 rc = vboxNetFltInitGlobalsAndIdc(&g_VBoxNetFltGlobals);
753 if (RT_SUCCESS(rc))
754 {
755#ifdef VBOXNETFLT_WITH_QDISC
756 /*memcpy(&g_VBoxNetFltQDiscOps, &pfifo_qdisc_ops, sizeof(g_VBoxNetFltQDiscOps));
757 strcpy(g_VBoxNetFltQDiscOps.id, "vboxnetflt");
758 g_VBoxNetFltQDiscOps.owner = THIS_MODULE;*/
759 rc = register_qdisc(&g_VBoxNetFltQDiscOps);
760 if (rc)
761 {
762 LogRel(("VBoxNetFlt: Failed to register qdisc: %d\n", rc));
763 return rc;
764 }
765#endif /* VBOXNETFLT_WITH_QDISC */
766 LogRel(("VBoxNetFlt: Successfully started.\n"));
767 return 0;
768 }
769
770 LogRel(("VBoxNetFlt: failed to initialize device extension (rc=%d)\n", rc));
771 RTR0Term();
772 }
773 else
774 LogRel(("VBoxNetFlt: failed to initialize IPRT (rc=%d)\n", rc));
775
776 memset(&g_VBoxNetFltGlobals, 0, sizeof(g_VBoxNetFltGlobals));
777 return -RTErrConvertToErrno(rc);
778}
779
780
781/**
782 * Unload the module.
783 *
784 * @todo We have to prevent this if we're busy!
785 */
786static void __exit VBoxNetFltLinuxUnload(void)
787{
788 int rc;
789 Log(("VBoxNetFltLinuxUnload\n"));
790 Assert(vboxNetFltCanUnload(&g_VBoxNetFltGlobals));
791
792#ifdef VBOXNETFLT_WITH_QDISC
793 unregister_qdisc(&g_VBoxNetFltQDiscOps);
794#endif /* VBOXNETFLT_WITH_QDISC */
795 /*
796 * Undo the work done during start (in reverse order).
797 */
798 rc = vboxNetFltTryDeleteIdcAndGlobals(&g_VBoxNetFltGlobals);
799 AssertRC(rc); NOREF(rc);
800
801 RTR0Term();
802
803 memset(&g_VBoxNetFltGlobals, 0, sizeof(g_VBoxNetFltGlobals));
804
805 Log(("VBoxNetFltLinuxUnload - done\n"));
806}
807
808
809/**
810 * Experiment where we filter traffic from the host to the internal network
811 * before it reaches the NIC driver.
812 *
813 * The current code uses a very ugly hack and only works on kernels using the
814 * net_device_ops (>= 2.6.29). It has been shown to give us a
815 * performance boost of 60-100% though. So, we have to find some less hacky way
816 * of getting this job done eventually.
817 *
818 * #define VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
819 */
820#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
821
822/**
823 * The overridden net_device_ops of the device we're attached to.
824 *
825 * Requires Linux 2.6.29 or later.
826 *
827 * This is a very dirty hack that was created to explore how much we can improve
828 * host-to-guest transfers by not CC'ing the NIC.
829 */
830typedef struct VBoxNetDeviceOpsOverride
831{
832 /** Our overridden ops. */
833 struct net_device_ops Ops;
834 /** Magic word. */
835 uint32_t u32Magic;
836 /** Pointer to the original ops. */
837 struct net_device_ops const *pOrgOps;
838 /** Pointer to the net filter instance. */
839 PVBOXNETFLTINS pVBoxNetFlt;
840 /** The number of filtered packets. */
841 uint64_t cFiltered;
842 /** The total number of packets */
843 uint64_t cTotal;
844} VBOXNETDEVICEOPSOVERRIDE, *PVBOXNETDEVICEOPSOVERRIDE;
845/** VBOXNETDEVICEOPSOVERRIDE::u32Magic value. */
846#define VBOXNETDEVICEOPSOVERRIDE_MAGIC UINT32_C(0x00c0ffee)
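/*
 * Editor's sketch of the override trick implemented below: the kernel
 * reaches ndo_start_xmit through pDev->netdev_ops, so atomically swapping
 * that pointer to a copy of the table with one patched member hooks
 * transmission without touching the NIC driver. Schematically (helper
 * name is illustrative; vboxNetFltLinuxHookDev is the real thing):
 */
#if 0 /* example only, not compiled */
static void vboxExampleHookXmit(struct net_device *pDev, PVBOXNETDEVICEOPSOVERRIDE pOverride)
{
    pOverride->pOrgOps            = pDev->netdev_ops;  /* remember the original table */
    pOverride->Ops                = *pDev->netdev_ops; /* copy it wholesale */
    pOverride->Ops.ndo_start_xmit = vboxNetFltLinuxStartXmitFilter; /* patch one entry */
    ASMAtomicWritePtr((void * volatile *)&pDev->netdev_ops, pOverride); /* swap in */
}
#endif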
847
848/**
849 * ndo_start_xmit wrapper that drops packets that shouldn't go to the wire
850 * because they belong on the internal network.
851 *
852 * @returns NETDEV_TX_XXX.
853 * @param pSkb The socket buffer to transmit.
854 * @param pDev The net device.
855 */
856static int vboxNetFltLinuxStartXmitFilter(struct sk_buff *pSkb, struct net_device *pDev)
857{
858 PVBOXNETDEVICEOPSOVERRIDE pOverride = (PVBOXNETDEVICEOPSOVERRIDE)pDev->netdev_ops;
859 uint8_t abHdrBuf[sizeof(RTNETETHERHDR) + sizeof(uint32_t) + RTNETIPV4_MIN_LEN];
860 PCRTNETETHERHDR pEtherHdr;
861 PINTNETTRUNKSWPORT pSwitchPort;
862 uint32_t cbHdrs;
863
864
865 /*
866 * Validate the override structure.
867 *
868 * Note! We're racing vboxNetFltLinuxUnhookDev here. If this were supposed
869 * to be production quality code, we would have to be much more
870 * careful here and avoid the race.
871 */
872 if ( !VALID_PTR(pOverride)
873 || pOverride->u32Magic != VBOXNETDEVICEOPSOVERRIDE_MAGIC
874 || !VALID_PTR(pOverride->pOrgOps))
875 {
876 printk("vboxNetFltLinuxStartXmitFilter: bad override %p\n", pOverride);
877 dev_kfree_skb(pSkb);
878 return NETDEV_TX_OK;
879 }
880 pOverride->cTotal++;
881
882 /*
883 * Do the filtering based on the default OUI of our virtual NICs.
884 *
885 * Note! In a real solution, we would ask the switch whether the
886 * destination MAC is guaranteed to be on the internal network and only
887 * then drop it.
888 */
889 cbHdrs = skb_headlen(pSkb);
890 cbHdrs = RT_MIN(cbHdrs, sizeof(abHdrBuf));
891 pEtherHdr = (PCRTNETETHERHDR)skb_header_pointer(pSkb, 0, cbHdrs, &abHdrBuf[0]);
892 if ( pEtherHdr
893 && VALID_PTR(pOverride->pVBoxNetFlt)
894 && (pSwitchPort = pOverride->pVBoxNetFlt->pSwitchPort) != NULL
895 && VALID_PTR(pSwitchPort)
896 && cbHdrs >= 6)
897 {
898 INTNETSWDECISION enmDecision;
899
900 /** @todo consider reference counting, etc. */
901 enmDecision = pSwitchPort->pfnPreRecv(pSwitchPort, pEtherHdr, cbHdrs, INTNETTRUNKDIR_HOST);
902 if (enmDecision == INTNETSWDECISION_INTNET)
903 {
904 dev_kfree_skb(pSkb);
905 pOverride->cFiltered++;
906 return NETDEV_TX_OK;
907 }
908 }
909
910 return pOverride->pOrgOps->ndo_start_xmit(pSkb, pDev);
911}
912
913/**
914 * Hooks the ndo_start_xmit operation of the device.
915 *
916 * @param pThis The net filter instance.
917 * @param pDev The net device.
918 */
919static void vboxNetFltLinuxHookDev(PVBOXNETFLTINS pThis, struct net_device *pDev)
920{
921 PVBOXNETDEVICEOPSOVERRIDE pOverride;
922 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
923
924 pOverride = RTMemAlloc(sizeof(*pOverride));
925 if (!pOverride)
926 return;
927 pOverride->pOrgOps = pDev->netdev_ops;
928 pOverride->Ops = *pDev->netdev_ops;
929 pOverride->Ops.ndo_start_xmit = vboxNetFltLinuxStartXmitFilter;
930 pOverride->u32Magic = VBOXNETDEVICEOPSOVERRIDE_MAGIC;
931 pOverride->cTotal = 0;
932 pOverride->cFiltered = 0;
933 pOverride->pVBoxNetFlt = pThis;
934
935 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp); /* (this isn't necessary, but so what) */
936 ASMAtomicWritePtr((void * volatile *)&pDev->netdev_ops, pOverride);
937 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
938}
939
940/**
941 * Undoes what vboxNetFltLinuxHookDev did.
942 *
943 * @param pThis The net filter instance.
944 * @param pDev The net device. Can be NULL, in which case
945 * we'll try retrieve it from @a pThis.
946 */
947static void vboxNetFltLinuxUnhookDev(PVBOXNETFLTINS pThis, struct net_device *pDev)
948{
949 PVBOXNETDEVICEOPSOVERRIDE pOverride;
950 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
951
952 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
953 if (!pDev)
954 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
955 if (VALID_PTR(pDev))
956 {
957 pOverride = (PVBOXNETDEVICEOPSOVERRIDE)pDev->netdev_ops;
958 if ( VALID_PTR(pOverride)
959 && pOverride->u32Magic == VBOXNETDEVICEOPSOVERRIDE_MAGIC
960 && VALID_PTR(pOverride->pOrgOps)
961 )
962 {
963 ASMAtomicWritePtr((void * volatile *)&pDev->netdev_ops, pOverride->pOrgOps);
964 ASMAtomicWriteU32(&pOverride->u32Magic, 0);
965 }
966 else
967 pOverride = NULL;
968 }
969 else
970 pOverride = NULL;
971 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
972
973 if (pOverride)
974 {
975 printk("vboxnetflt: dropped %llu out of %llu packets\n", pOverride->cFiltered, pOverride->cTotal);
976 RTMemFree(pOverride);
977 }
978}
979
980#endif /* VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT */
981
982
983/**
984 * Reads and retains the host interface handle.
985 *
986 * @returns The handle, NULL if detached.
987 * @param pThis The instance.
988 */
989DECLINLINE(struct net_device *) vboxNetFltLinuxRetainNetDev(PVBOXNETFLTINS pThis)
990{
991#if 0
992 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
993 struct net_device *pDev = NULL;
994
995 Log(("vboxNetFltLinuxRetainNetDev\n"));
996 /*
997 * Be careful here to avoid problems racing the detached callback.
998 */
999 RTSpinlockAcquire(pThis->hSpinlock, &Tmp);
1000 if (!ASMAtomicUoReadBool(&pThis->fDisconnectedFromHost))
1001 {
1002 pDev = (struct net_device *)ASMAtomicUoReadPtr((void * volatile *)&pThis->u.s.pDev);
1003 if (pDev)
1004 {
1005 dev_hold(pDev);
1006 Log(("vboxNetFltLinuxRetainNetDev: Device %p(%s) retained. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1007 }
1008 }
1009 RTSpinlockRelease(pThis->hSpinlock, &Tmp);
1010
1011 Log(("vboxNetFltLinuxRetainNetDev - done\n"));
1012 return pDev;
1013#else
1014 return ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1015#endif
1016}
1017
1018
1019/**
1020 * Release the host interface handle previously retained
1021 * by vboxNetFltLinuxRetainNetDev.
1022 *
1023 * @param pThis The instance.
1024 * @param pDev The vboxNetFltLinuxRetainNetDev
1025 * return value, NULL is fine.
1026 */
1027DECLINLINE(void) vboxNetFltLinuxReleaseNetDev(PVBOXNETFLTINS pThis, struct net_device *pDev)
1028{
1029#if 0
1030 Log(("vboxNetFltLinuxReleaseNetDev\n"));
1031 NOREF(pThis);
1032 if (pDev)
1033 {
1034 dev_put(pDev);
1035 Log(("vboxNetFltLinuxReleaseNetDev: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1036 }
1037 Log(("vboxNetFltLinuxReleaseNetDev - done\n"));
1038#endif
1039}
1040
1041#define VBOXNETFLT_CB_TAG(skb) (0xA1C90000 | (skb->dev->ifindex & 0xFFFF))
1042#define VBOXNETFLT_SKB_TAG(skb) (*(uint32_t*)&((skb)->cb[sizeof((skb)->cb)-sizeof(uint32_t)]))
1043
1044/**
1045 * Checks whether this is an sk_buff created by vboxNetFltLinuxSkBufFromSG,
1046 * i.e. a buffer which we're pushing and should be ignored by the filter callbacks.
1047 *
1048 * @returns true / false accordingly.
1049 * @param pBuf The sk_buff.
1050 */
1051DECLINLINE(bool) vboxNetFltLinuxSkBufIsOur(struct sk_buff *pBuf)
1052{
1053 return VBOXNETFLT_SKB_TAG(pBuf) == VBOXNETFLT_CB_TAG(pBuf);
1054}
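/*
 * Editor's sketch: a frame we inject is marked by writing the tag into the
 * last four bytes of the skb control buffer, exactly as
 * vboxNetFltLinuxSkBufFromSG does below, so the packet handler can
 * recognize and drop its own frames instead of looping them:
 */
#if 0 /* example only, not compiled */
static void vboxExampleTagSkb(struct sk_buff *pPkt)
{
    VBOXNETFLT_SKB_TAG(pPkt) = VBOXNETFLT_CB_TAG(pPkt);
    Assert(vboxNetFltLinuxSkBufIsOur(pPkt));
}
#endif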
1055
1056
1057/**
1058 * Internal worker that creates a Linux sk_buff for a
1059 * (scatter/)gather list.
1060 *
1061 * @returns Pointer to the sk_buff.
1062 * @param pThis The instance.
1063 * @param pSG The (scatter/)gather list.
1064 * @param fDstWire Set if the destination is the wire.
1065 */
1066static struct sk_buff *vboxNetFltLinuxSkBufFromSG(PVBOXNETFLTINS pThis, PINTNETSG pSG, bool fDstWire)
1067{
1068 struct sk_buff *pPkt;
1069 struct net_device *pDev;
1070 unsigned fGsoType = 0;
1071
1072 if (pSG->cbTotal == 0)
1073 {
1074 LogRel(("VBoxNetFlt: Dropped empty packet coming from internal network.\n"));
1075 return NULL;
1076 }
1077
1078 /** @todo We should use fragments mapping the SG buffers with large packets.
1079 * 256 bytes seems to be a commonly used threshold for this. It
1080 * requires some nasty work on the intnet side though... */
1081 /*
1082 * Allocate a packet and copy over the data.
1083 */
1084 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1085 pPkt = dev_alloc_skb(pSG->cbTotal + NET_IP_ALIGN);
1086 if (RT_UNLIKELY(!pPkt))
1087 {
1088 Log(("vboxNetFltLinuxSkBufFromSG: Failed to allocate sk_buff(%u).\n", pSG->cbTotal));
1089 pSG->pvUserData = NULL;
1090 return NULL;
1091 }
1092 pPkt->dev = pDev;
1093 pPkt->ip_summed = CHECKSUM_NONE;
1094
1095 /* Align IP header on 16-byte boundary: 2 + 14 (ethernet hdr size). */
1096 skb_reserve(pPkt, NET_IP_ALIGN);
1097
1098 /* Copy the segments. */
1099 skb_put(pPkt, pSG->cbTotal);
1100 IntNetSgRead(pSG, pPkt->data);
1101
1102#if defined(VBOXNETFLT_WITH_GSO_XMIT_WIRE) || defined(VBOXNETFLT_WITH_GSO_XMIT_HOST)
1103 /*
1104 * Setup GSO if used by this packet.
1105 */
1106 switch ((PDMNETWORKGSOTYPE)pSG->GsoCtx.u8Type)
1107 {
1108 default:
1109 AssertMsgFailed(("%u (%s)\n", pSG->GsoCtx.u8Type, PDMNetGsoTypeName((PDMNETWORKGSOTYPE)pSG->GsoCtx.u8Type) ));
1110 /* fall thru */
1111 case PDMNETWORKGSOTYPE_INVALID:
1112 fGsoType = 0;
1113 break;
1114 case PDMNETWORKGSOTYPE_IPV4_TCP:
1115 fGsoType = SKB_GSO_TCPV4;
1116 break;
1117 case PDMNETWORKGSOTYPE_IPV4_UDP:
1118 fGsoType = SKB_GSO_UDP;
1119 break;
1120 case PDMNETWORKGSOTYPE_IPV6_TCP:
1121 fGsoType = SKB_GSO_TCPV6;
1122 break;
1123 }
1124 if (fGsoType)
1125 {
1126 struct skb_shared_info *pShInfo = skb_shinfo(pPkt);
1127
1128 pShInfo->gso_type = fGsoType | SKB_GSO_DODGY;
1129 pShInfo->gso_size = pSG->GsoCtx.cbMaxSeg;
1130 pShInfo->gso_segs = PDMNetGsoCalcSegmentCount(&pSG->GsoCtx, pSG->cbTotal);
1131
1132 /*
1133 * We need to set checksum fields even if the packet goes to the host
1134 * directly as it may be immediately forwarded by the IP layer @bugref{5020}.
1135 */
1136 Assert(skb_headlen(pPkt) >= pSG->GsoCtx.cbHdrs);
1137 pPkt->ip_summed = CHECKSUM_PARTIAL;
1138# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
1139 pPkt->csum_start = skb_headroom(pPkt) + pSG->GsoCtx.offHdr2;
1140 if (fGsoType & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
1141 pPkt->csum_offset = RT_OFFSETOF(RTNETTCP, th_sum);
1142 else
1143 pPkt->csum_offset = RT_OFFSETOF(RTNETUDP, uh_sum);
1144# else
1145 pPkt->h.raw = pPkt->data + pSG->GsoCtx.offHdr2;
1146 if (fGsoType & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
1147 pPkt->csum = RT_OFFSETOF(RTNETTCP, th_sum);
1148 else
1149 pPkt->csum = RT_OFFSETOF(RTNETUDP, uh_sum);
1150# endif
1151 if (!fDstWire)
1152 PDMNetGsoPrepForDirectUse(&pSG->GsoCtx, pPkt->data, pSG->cbTotal, PDMNETCSUMTYPE_PSEUDO);
1153 }
1154#endif /* VBOXNETFLT_WITH_GSO_XMIT_WIRE || VBOXNETFLT_WITH_GSO_XMIT_HOST */
1155
1156 /*
1157 * Finish up the socket buffer.
1158 */
1159 pPkt->protocol = eth_type_trans(pPkt, pDev);
1160 if (fDstWire)
1161 {
1162 VBOX_SKB_RESET_NETWORK_HDR(pPkt);
1163
1164 /* Restore ethernet header back. */
1165 skb_push(pPkt, ETH_HLEN); /** @todo VLAN: +4 if VLAN? */
1166 VBOX_SKB_RESET_MAC_HDR(pPkt);
1167 }
1168 VBOXNETFLT_SKB_TAG(pPkt) = VBOXNETFLT_CB_TAG(pPkt);
1169
1170 return pPkt;
1171}
1172
1173
1174/**
1175 * Initializes a SG list from an sk_buff.
1176 *
1177 * @param pThis The instance.
1178 * @param pBuf The sk_buff.
1179 * @param pSG The SG to initialize; filled in place.
1180 * @param cSegs The number of segments allocated for the SG.
1181 * This should match the count returned by
1182 * vboxNetFltLinuxCalcSGSegments exactly!
1183 * @param fSrc The source of the frame
1184 * (INTNETTRUNKDIR_WIRE or INTNETTRUNKDIR_HOST).
1185 * @param pGsoCtx Pointer to the GSO context if it's a GSO
1186 * internal network frame. NULL if regular frame.
1187 */
1188DECLINLINE(void) vboxNetFltLinuxSkBufToSG(PVBOXNETFLTINS pThis, struct sk_buff *pBuf, PINTNETSG pSG,
1189 unsigned cSegs, uint32_t fSrc, PCPDMNETWORKGSO pGsoCtx)
1190{
1191 int i;
1192 NOREF(pThis);
1193
1194 Assert(!skb_shinfo(pBuf)->frag_list);
1195
1196 if (!pGsoCtx)
1197 IntNetSgInitTempSegs(pSG, pBuf->len, cSegs, 0 /*cSegsUsed*/);
1198 else
1199 IntNetSgInitTempSegsGso(pSG, pBuf->len, cSegs, 0 /*cSegsUsed*/, pGsoCtx);
1200
1201#ifdef VBOXNETFLT_SG_SUPPORT
1202 pSG->aSegs[0].cb = skb_headlen(pBuf);
1203 pSG->aSegs[0].pv = pBuf->data;
1204 pSG->aSegs[0].Phys = NIL_RTHCPHYS;
1205
1206 for (i = 0; i < skb_shinfo(pBuf)->nr_frags; i++)
1207 {
1208 skb_frag_t *pFrag = &skb_shinfo(pBuf)->frags[i];
1209 pSG->aSegs[i+1].cb = pFrag->size;
1210 pSG->aSegs[i+1].pv = kmap(pFrag->page);
1211 printk("%p = kmap()\n", pSG->aSegs[i+1].pv);
1212 pSG->aSegs[i+1].Phys = NIL_RTHCPHYS;
1213 }
1214 ++i;
1215
1216#else
1217 pSG->aSegs[0].cb = pBuf->len;
1218 pSG->aSegs[0].pv = pBuf->data;
1219 pSG->aSegs[0].Phys = NIL_RTHCPHYS;
1220 i = 1;
1221#endif
1222
1223 pSG->cSegsUsed = i;
1224
1225#ifdef PADD_RUNT_FRAMES_FROM_HOST
1226 /*
1227 * Add a trailer if the frame is too small.
1228 *
1229 * Since we're getting to the packet before it is framed, it has not
1230 * yet been padded. The current solution is to add a segment pointing
1231 * to a buffer containing all zeros and pray that works for all frames...
1232 */
1233 if (pSG->cbTotal < 60 && (fSrc & INTNETTRUNKDIR_HOST))
1234 {
1235 static uint8_t const s_abZero[128] = {0};
1236
1237 AssertReturnVoid(i < cSegs);
1238
1239 pSG->aSegs[i].Phys = NIL_RTHCPHYS;
1240 pSG->aSegs[i].pv = (void *)&s_abZero[0];
1241 pSG->aSegs[i].cb = 60 - pSG->cbTotal;
1242 pSG->cbTotal = 60;
1243 pSG->cSegsUsed++;
1244 Assert(i + 1 <= pSG->cSegsAlloc);
1245 }
1246#endif
1247
1248 Log4(("vboxNetFltLinuxSkBufToSG: allocated=%d, segments=%d frags=%d next=%p frag_list=%p pkt_type=%x fSrc=%x\n",
1249 pSG->cSegsAlloc, pSG->cSegsUsed, skb_shinfo(pBuf)->nr_frags, pBuf->next, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type, fSrc));
1250 for (i = 0; i < pSG->cSegsUsed; i++)
1251 Log4(("vboxNetFltLinuxSkBufToSG: #%d: cb=%d pv=%p\n",
1252 i, pSG->aSegs[i].cb, pSG->aSegs[i].pv));
1253}
1254
1255/**
1256 * Packet handler: called by the kernel for every frame seen on the device.
1257 *
1258 * @returns 0 (the return value is ignored by the kernel, see below).
1259 * @param pBuf The sk_buff.
1260 * @param pSkbDev The device the packet arrived on.
1261 * @param pPacketType Our packet type structure, used to look up the
1262 * filter instance.
1263 * @param pOrigDev The original device (kernels 2.6.14 and later).
1264 */
1265#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 14)
1266static int vboxNetFltLinuxPacketHandler(struct sk_buff *pBuf,
1267 struct net_device *pSkbDev,
1268 struct packet_type *pPacketType,
1269 struct net_device *pOrigDev)
1270#else
1271static int vboxNetFltLinuxPacketHandler(struct sk_buff *pBuf,
1272 struct net_device *pSkbDev,
1273 struct packet_type *pPacketType)
1274#endif
1275{
1276 PVBOXNETFLTINS pThis;
1277 struct net_device *pDev;
1278 LogFlow(("vboxNetFltLinuxPacketHandler: pBuf=%p pSkbDev=%p pPacketType=%p\n",
1279 pBuf, pSkbDev, pPacketType));
1280#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
1281 Log3(("vboxNetFltLinuxPacketHandler: skb len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_segs=%u gso_type=%x frag_list=%p pkt_type=%x\n",
1282 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->gso_size, skb_shinfo(pBuf)->gso_segs, skb_shinfo(pBuf)->gso_type, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1283 Log4(("vboxNetFltLinuxPacketHandler: packet dump follows:\n%.*Rhxd\n", pBuf->len-pBuf->data_len, skb_mac_header(pBuf)));
1284#else
1285 Log3(("vboxNetFltLinuxPacketHandler: skb len=%u data_len=%u truesize=%u next=%p nr_frags=%u tso_size=%u tso_segs=%u frag_list=%p pkt_type=%x\n",
1286 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->tso_size, skb_shinfo(pBuf)->tso_segs, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1287#endif
1288 /*
1289 * Drop it immediately?
1290 */
1291 if (!pBuf)
1292 return 0;
1293
1294 pThis = VBOX_FLT_PT_TO_INST(pPacketType);
1295 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1296 if (pDev != pSkbDev)
1297 {
1298 Log(("vboxNetFltLinuxPacketHandler: Devices do not match, pThis may be wrong! pThis=%p\n", pThis));
1299 return 0;
1300 }
1301
1302 Log4(("vboxNetFltLinuxPacketHandler: pBuf->cb dump:\n%.*Rhxd\n", sizeof(pBuf->cb), pBuf->cb));
1303 if (vboxNetFltLinuxSkBufIsOur(pBuf))
1304 {
1305 Log2(("vboxNetFltLinuxPacketHandler: got our own sk_buff, drop it.\n"));
1306 dev_kfree_skb(pBuf);
1307 return 0;
1308 }
1309
1310#ifndef VBOXNETFLT_SG_SUPPORT
1311 {
1312 /*
1313 * Get rid of fragmented packets, they cause too much trouble.
1314 */
1315 struct sk_buff *pCopy = skb_copy(pBuf, GFP_ATOMIC);
1316 kfree_skb(pBuf);
1317 if (!pCopy)
1318 {
1319 LogRel(("VBoxNetFlt: Failed to allocate packet buffer, dropping the packet.\n"));
1320 return 0;
1321 }
1322 pBuf = pCopy;
1323# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
1324 Log3(("vboxNetFltLinuxPacketHandler: skb copy len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_segs=%u gso_type=%x frag_list=%p pkt_type=%x\n",
1325 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->gso_size, skb_shinfo(pBuf)->gso_segs, skb_shinfo(pBuf)->gso_type, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1326 Log4(("vboxNetFltLinuxPacketHandler: packet dump follows:\n%.*Rhxd\n", pBuf->len-pBuf->data_len, skb_mac_header(pBuf)));
1327# else
1328 Log3(("vboxNetFltLinuxPacketHandler: skb copy len=%u data_len=%u truesize=%u next=%p nr_frags=%u tso_size=%u tso_segs=%u frag_list=%p pkt_type=%x\n",
1329 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->tso_size, skb_shinfo(pBuf)->tso_segs, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1330# endif
1331 }
1332#endif
1333
1334#ifdef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
1335 /* Forward it to the internal network. */
1336 vboxNetFltLinuxForwardToIntNet(pThis, pBuf);
1337#else
1338 /* Add the packet to transmit queue and schedule the bottom half. */
1339 skb_queue_tail(&pThis->u.s.XmitQueue, pBuf);
1340 schedule_work(&pThis->u.s.XmitTask);
1341 Log4(("vboxNetFltLinuxPacketHandler: scheduled work %p for sk_buff %p\n",
1342 &pThis->u.s.XmitTask, pBuf));
1343#endif
1344
1345 /* It does not really matter what we return, it is ignored by the kernel. */
1346 return 0;
1347}
1348
1349/**
1350 * Calculate the number of INTNETSEG segments the socket buffer will need.
1351 *
1352 * @returns Segment count.
1353 * @param pBuf The socket buffer.
1354 */
1355DECLINLINE(unsigned) vboxNetFltLinuxCalcSGSegments(struct sk_buff *pBuf)
1356{
1357#ifdef VBOXNETFLT_SG_SUPPORT
1358 unsigned cSegs = 1 + skb_shinfo(pBuf)->nr_frags;
1359#else
1360 unsigned cSegs = 1;
1361#endif
1362#ifdef PADD_RUNT_FRAMES_FROM_HOST
1363 /* vboxNetFltLinuxSkBufToSG adds a padding segment if it's a runt. */
1364 if (pBuf->len < 60)
1365 cSegs++;
1366#endif
1367 return cSegs;
1368}
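/*
 * Editor's example: a linear sk_buff yields 1 segment (plus 1 for runt
 * padding when the frame is shorter than 60 bytes); with
 * VBOXNETFLT_SG_SUPPORT an sk_buff carrying, say, 2 page fragments would
 * yield 1 + 2 = 3 segments.
 */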
1369
1370/**
1371 * Destroy the intnet scatter / gather buffer created by
1372 * vboxNetFltLinuxSkBufToSG, unmapping any kmapped fragments of pBuf.
1373 */
1374static void vboxNetFltLinuxDestroySG(PINTNETSG pSG, struct sk_buff *pBuf)
1375{
1376#ifdef VBOXNETFLT_SG_SUPPORT
1377 int i;
1378
1379 for (i = 0; i < skb_shinfo(pBuf)->nr_frags; i++)
1380 {
1381 printk("kunmap(%p)\n", pSG->aSegs[i+1].pv);
1382 kunmap(skb_shinfo(pBuf)->frags[i].page); /* kunmap() takes the page, not the mapped address. */
1383 }
1384#endif
1385 NOREF(pSG); NOREF(pBuf);
1386}
1387
1388#ifdef LOG_ENABLED
1389/**
1390 * Logging helper.
1391 */
1392static void vboxNetFltDumpPacket(PINTNETSG pSG, bool fEgress, const char *pszWhere, int iIncrement)
1393{
1394 uint8_t *pInt, *pExt;
1395 static int iPacketNo = 1;
1396 iPacketNo += iIncrement;
1397 if (fEgress)
1398 {
1399 pExt = pSG->aSegs[0].pv;
1400 pInt = pExt + 6;
1401 }
1402 else
1403 {
1404 pInt = pSG->aSegs[0].pv;
1405 pExt = pInt + 6;
1406 }
1407 Log(("VBoxNetFlt: (int)%02x:%02x:%02x:%02x:%02x:%02x"
1408 " %s (%s)%02x:%02x:%02x:%02x:%02x:%02x (%u bytes) packet #%u\n",
1409 pInt[0], pInt[1], pInt[2], pInt[3], pInt[4], pInt[5],
1410 fEgress ? "-->" : "<--", pszWhere,
1411 pExt[0], pExt[1], pExt[2], pExt[3], pExt[4], pExt[5],
1412 pSG->cbTotal, iPacketNo));
1413 Log3(("%.*Rhxd\n", pSG->aSegs[0].cb, pSG->aSegs[0].pv));
1414}
1415#else
1416# define vboxNetFltDumpPacket(a, b, c, d) do {} while (0)
1417#endif
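/*
 * Editor's example of a resulting log line (addresses and size made up):
 *
 *   VBoxNetFlt: (int)08:00:27:11:22:33 --> (wire)00:24:1d:aa:bb:cc (1514 bytes) packet #42
 */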
1418
1419#ifdef VBOXNETFLT_WITH_GSO_RECV
1420
1421/**
1422 * Worker for vboxNetFltLinuxForwardToIntNet that checks whether we can forward a
1423 * GSO socket buffer without having to segment it.
1424 *
1425 * @returns true on success, false if needs segmenting.
1426 * @param pThis The net filter instance.
1427 * @param pSkb The GSO socket buffer.
1428 * @param fSrc The source.
1429 * @param pGsoCtx Where to return the GSO context on success.
1430 */
1431static bool vboxNetFltLinuxCanForwardAsGso(PVBOXNETFLTINS pThis, struct sk_buff *pSkb, uint32_t fSrc,
1432 PPDMNETWORKGSO pGsoCtx)
1433{
1434 PDMNETWORKGSOTYPE enmGsoType;
1435 uint16_t uEtherType;
1436 unsigned int cbTransport;
1437 unsigned int offTransport;
1438 unsigned int cbTransportHdr;
1439 unsigned uProtocol;
1440 union
1441 {
1442 RTNETIPV4 IPv4;
1443 RTNETIPV6 IPv6;
1444 RTNETTCP Tcp;
1445 uint8_t ab[40];
1446 uint16_t au16[40/2];
1447 uint32_t au32[40/4];
1448 } Buf;
1449
1450 /*
1451 * Check the GSO properties of the socket buffer and make sure it fits.
1452 */
1453 /** @todo Figure out how to handle SKB_GSO_TCP_ECN! */
1454 if (RT_UNLIKELY( skb_shinfo(pSkb)->gso_type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCPV6 | SKB_GSO_TCPV4) ))
1455 {
1456 Log5(("vboxNetFltLinuxCanForwardAsGso: gso_type=%#x\n", skb_shinfo(pSkb)->gso_type));
1457 return false;
1458 }
1459 if (RT_UNLIKELY( skb_shinfo(pSkb)->gso_size < 1
1460 || pSkb->len > VBOX_MAX_GSO_SIZE ))
1461 {
1462 Log5(("vboxNetFltLinuxCanForwardAsGso: gso_size=%#x skb_len=%#x (max=%#x)\n", skb_shinfo(pSkb)->gso_size, pSkb->len, VBOX_MAX_GSO_SIZE));
1463 return false;
1464 }
1465 /*
1466 * It is possible to receive GSO packets from the wire if GRO is enabled.
1467 */
1468 if (RT_UNLIKELY(fSrc & INTNETTRUNKDIR_WIRE))
1469 {
1470 Log5(("vboxNetFltLinuxCanForwardAsGso: fSrc=wire\n"));
1471#ifdef VBOXNETFLT_WITH_GRO
1472 /*
1473 * The packet came from the wire and the driver has already consumed the
1474 * MAC header. We need to restore it.
1475 */
1476 pSkb->mac_len = skb_network_header(pSkb) - skb_mac_header(pSkb);
1477 skb_push(pSkb, pSkb->mac_len);
1478 Log5(("vboxNetFltLinuxCanForwardAsGso: mac_len=%d data=%p mac_header=%p network_header=%p\n",
1479 pSkb->mac_len, pSkb->data, skb_mac_header(pSkb), skb_network_header(pSkb)));
1480#else /* !VBOXNETFLT_WITH_GRO */
1481 /* Older kernels didn't have GRO. */
1482 return false;
1483#endif /* !VBOXNETFLT_WITH_GRO */
1484 }
1485 else
1486 {
1487 /*
1488 * skb_gso_segment does the following. Do we need to do it as well?
1489 */
1490#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
1491 skb_reset_mac_header(pSkb);
1492 pSkb->mac_len = pSkb->network_header - pSkb->mac_header;
1493#else
1494 pSkb->mac.raw = pSkb->data;
1495 pSkb->mac_len = pSkb->nh.raw - pSkb->data;
1496#endif
1497 }
1498
1499 /*
1500 * Switch on the ethertype.
1501 */
1502 uEtherType = pSkb->protocol;
1503 if ( uEtherType == RT_H2N_U16_C(RTNET_ETHERTYPE_VLAN)
1504 && pSkb->mac_len == sizeof(RTNETETHERHDR) + sizeof(uint32_t))
1505 {
1506 uint16_t const *puEtherType = skb_header_pointer(pSkb, sizeof(RTNETETHERHDR) + sizeof(uint16_t), sizeof(uint16_t), &Buf);
1507 if (puEtherType)
1508 uEtherType = *puEtherType;
1509 }
1510 switch (uEtherType)
1511 {
1512 case RT_H2N_U16_C(RTNET_ETHERTYPE_IPV4):
1513 {
1514 unsigned int cbHdr;
1515 PCRTNETIPV4 pIPv4 = (PCRTNETIPV4)skb_header_pointer(pSkb, pSkb->mac_len, sizeof(Buf.IPv4), &Buf);
1516 if (RT_UNLIKELY(!pIPv4))
1517 {
1518 Log5(("vboxNetFltLinuxCanForwardAsGso: failed to access IPv4 hdr\n"));
1519 return false;
1520 }
1521
1522 cbHdr = pIPv4->ip_hl * 4;
1523 cbTransport = RT_N2H_U16(pIPv4->ip_len);
1524 if (RT_UNLIKELY( cbHdr < RTNETIPV4_MIN_LEN
1525 || cbHdr > cbTransport ))
1526 {
1527 Log5(("vboxNetFltLinuxCanForwardAsGso: invalid IPv4 lengths: ip_hl=%u ip_len=%u\n", pIPv4->ip_hl, RT_N2H_U16(pIPv4->ip_len)));
1528 return false;
1529 }
1530 cbTransport -= cbHdr;
1531 offTransport = pSkb->mac_len + cbHdr;
1532 uProtocol = pIPv4->ip_p;
1533 if (uProtocol == RTNETIPV4_PROT_TCP)
1534 enmGsoType = PDMNETWORKGSOTYPE_IPV4_TCP;
1535 else if (uProtocol == RTNETIPV4_PROT_UDP)
1536 enmGsoType = PDMNETWORKGSOTYPE_IPV4_UDP;
1537 else /** @todo IPv6: 4to6 tunneling */
1538 enmGsoType = PDMNETWORKGSOTYPE_INVALID;
1539 break;
1540 }
1541
1542 case RT_H2N_U16_C(RTNET_ETHERTYPE_IPV6):
1543 {
1544 PCRTNETIPV6 pIPv6 = (PCRTNETIPV6)skb_header_pointer(pSkb, pSkb->mac_len, sizeof(Buf.IPv6), &Buf);
1545 if (RT_UNLIKELY(!pIPv6))
1546 {
1547 Log5(("vboxNetFltLinuxCanForwardAsGso: failed to access IPv6 hdr\n"));
1548 return false;
1549 }
1550
1551 cbTransport = RT_N2H_U16(pIPv6->ip6_plen);
1552 offTransport = pSkb->mac_len + sizeof(RTNETIPV6);
1553 uProtocol = pIPv6->ip6_nxt;
1554 /** @todo IPv6: Dig our way out of the other headers. */
1555 if (uProtocol == RTNETIPV4_PROT_TCP)
1556 enmGsoType = PDMNETWORKGSOTYPE_IPV6_TCP;
1557 else if (uProtocol == RTNETIPV4_PROT_UDP)
1558 enmGsoType = PDMNETWORKGSOTYPE_IPV6_UDP;
1559 else
1560 enmGsoType = PDMNETWORKGSOTYPE_INVALID;
1561 break;
1562 }
1563
1564 default:
1565 Log5(("vboxNetFltLinuxCanForwardAsGso: uEtherType=%#x\n", RT_H2N_U16(uEtherType)));
1566 return false;
1567 }
1568
1569 if (enmGsoType == PDMNETWORKGSOTYPE_INVALID)
1570 {
1571 Log5(("vboxNetFltLinuxCanForwardAsGso: Unsupported protocol %d\n", uProtocol));
1572 return false;
1573 }
1574
1575 if (RT_UNLIKELY( offTransport + cbTransport <= offTransport
1576 || offTransport + cbTransport > pSkb->len
1577 || cbTransport < (uProtocol == RTNETIPV4_PROT_TCP ? RTNETTCP_MIN_LEN : RTNETUDP_MIN_LEN)) )
1578 {
1579 Log5(("vboxNetFltLinuxCanForwardAsGso: Bad transport length; off=%#x + cb=%#x => %#x; skb_len=%#x (%s)\n",
1580 offTransport, cbTransport, offTransport + cbTransport, pSkb->len, PDMNetGsoTypeName(enmGsoType) ));
1581 return false;
1582 }
1583
1584 /*
1585 * Check the TCP/UDP bits.
1586 */
1587 if (uProtocol == RTNETIPV4_PROT_TCP)
1588 {
1589 PCRTNETTCP pTcp = (PCRTNETTCP)skb_header_pointer(pSkb, offTransport, sizeof(Buf.Tcp), &Buf);
1590 if (RT_UNLIKELY(!pTcp))
1591 {
1592 Log5(("vboxNetFltLinuxCanForwardAsGso: failed to access TCP hdr\n"));
1593 return false;
1594 }
1595
1596 cbTransportHdr = pTcp->th_off * 4;
1597 if (RT_UNLIKELY( cbTransportHdr < RTNETTCP_MIN_LEN
1598 || cbTransportHdr > cbTransport
1599 || offTransport + cbTransportHdr >= UINT8_MAX
1600 || offTransport + cbTransportHdr >= pSkb->len ))
1601 {
1602 Log5(("vboxNetFltLinuxCanForwardAsGso: No space for TCP header; off=%#x cb=%#x skb_len=%#x\n", offTransport, cbTransportHdr, pSkb->len));
1603 return false;
1604 }
1605
1606 }
1607 else
1608 {
1609 Assert(uProtocol == RTNETIPV4_PROT_UDP);
1610 cbTransportHdr = sizeof(RTNETUDP);
1611 if (RT_UNLIKELY( offTransport + cbTransportHdr >= UINT8_MAX
1612 || offTransport + cbTransportHdr >= pSkb->len ))
1613 {
1614 Log5(("vboxNetFltLinuxCanForwardAsGso: No space for UDP header; off=%#x skb_len=%#x\n", offTransport, pSkb->len));
1615 return false;
1616 }
1617 }
1618
1619 /*
1620 * We're good, init the GSO context.
1621 */
1622 pGsoCtx->u8Type = enmGsoType;
1623 pGsoCtx->cbHdrs = offTransport + cbTransportHdr;
1624 pGsoCtx->cbMaxSeg = skb_shinfo(pSkb)->gso_size;
1625 pGsoCtx->offHdr1 = pSkb->mac_len;
1626 pGsoCtx->offHdr2 = offTransport;
1627 pGsoCtx->au8Unused[0] = 0;
1628 pGsoCtx->au8Unused[1] = 0;
1629
1630 return true;
1631}
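/*
 * Editor's example: for a plain TCP/IPv4 frame with a 14-byte Ethernet
 * header, a 20-byte IPv4 header and a 20-byte TCP header, the code above
 * yields offHdr1 = 14 (mac_len), offHdr2 = offTransport = 34 and
 * cbHdrs = 34 + 20 = 54, with cbMaxSeg taken from gso_size.
 */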
1632
1633/**
1634 * Forward the socket buffer as a GSO internal network frame.
1635 *
1636 * @returns IPRT status code.
1637 * @param pThis The net filter instance.
1638 * @param pSkb The GSO socket buffer.
1639 * @param fSrc The source.
1640 * @param pGsoCtx The GSO context produced by vboxNetFltLinuxCanForwardAsGso.
1641 */
1642static int vboxNetFltLinuxForwardAsGso(PVBOXNETFLTINS pThis, struct sk_buff *pSkb, uint32_t fSrc, PCPDMNETWORKGSO pGsoCtx)
1643{
1644 int rc;
1645 unsigned cSegs = vboxNetFltLinuxCalcSGSegments(pSkb);
1646 if (RT_LIKELY(cSegs <= MAX_SKB_FRAGS + 1))
1647 {
1648 PINTNETSG pSG = (PINTNETSG)alloca(RT_OFFSETOF(INTNETSG, aSegs[cSegs]));
1649 if (RT_LIKELY(pSG))
1650 {
1651 vboxNetFltLinuxSkBufToSG(pThis, pSkb, pSG, cSegs, fSrc, pGsoCtx);
1652
1653 vboxNetFltDumpPacket(pSG, false, (fSrc & INTNETTRUNKDIR_HOST) ? "host" : "wire", 1);
1654 pThis->pSwitchPort->pfnRecv(pThis->pSwitchPort, NULL /* pvIf */, pSG, fSrc);
1655
1656 vboxNetFltLinuxDestroySG(pSG, pSkb);
1657 rc = VINF_SUCCESS;
1658 }
1659 else
1660 {
1661 Log(("VBoxNetFlt: Dropping the sk_buff (failure case).\n"));
1662 rc = VERR_NO_MEMORY;
1663 }
1664 }
1665 else
1666 {
1667 Log(("VBoxNetFlt: Bad sk_buff? cSegs=%#x.\n", cSegs));
1668 rc = VERR_INTERNAL_ERROR_3;
1669 }
1670
1671 Log4(("VBoxNetFlt: Dropping the sk_buff.\n"));
1672 dev_kfree_skb(pSkb);
1673 return rc;
1674}
1675
1676#endif /* VBOXNETFLT_WITH_GSO_RECV */
1677
1678/**
1679 * Worker for vboxNetFltLinuxForwardToIntNet.
1680 *
1681 * @returns IPRT status code.
1682 * @param pThis The net filter instance.
1683 * @param pBuf The socket buffer.
1684 * @param fSrc The source.
1685 */
1686static int vboxNetFltLinuxForwardSegment(PVBOXNETFLTINS pThis, struct sk_buff *pBuf, uint32_t fSrc)
1687{
1688 int rc;
1689 unsigned cSegs = vboxNetFltLinuxCalcSGSegments(pBuf);
1690 if (cSegs <= MAX_SKB_FRAGS + 1)
1691 {
1692 PINTNETSG pSG = (PINTNETSG)alloca(RT_OFFSETOF(INTNETSG, aSegs[cSegs]));
1693 if (RT_LIKELY(pSG))
1694 {
1695 if (fSrc & INTNETTRUNKDIR_WIRE)
1696 {
1697 /*
1698 * The packet came from the wire; the device driver has already
1699 * stripped the Ethernet header. Restore it.
1700 */
1701 skb_push(pBuf, ETH_HLEN);
1702 }
1703
1704 vboxNetFltLinuxSkBufToSG(pThis, pBuf, pSG, cSegs, fSrc, NULL /*pGsoCtx*/);
1705
1706 vboxNetFltDumpPacket(pSG, false, (fSrc & INTNETTRUNKDIR_HOST) ? "host" : "wire", 1);
1707 pThis->pSwitchPort->pfnRecv(pThis->pSwitchPort, NULL /* pvIf */, pSG, fSrc);
1708
1709 vboxNetFltLinuxDestroySG(pSG);
1710 rc = VINF_SUCCESS;
1711 }
1712 else
1713 {
1714 Log(("VBoxNetFlt: Failed to allocate SG buffer.\n"));
1715 rc = VERR_NO_MEMORY;
1716 }
1717 }
1718 else
1719 {
1720 Log(("VBoxNetFlt: Bad sk_buff? cSegs=%#x.\n", cSegs));
1721 rc = VERR_INTERNAL_ERROR_3;
1722 }
1723
1724 Log4(("VBoxNetFlt: Dropping the sk_buff.\n"));
1725 dev_kfree_skb(pBuf);
1726 return rc;
1727}
1728
1729/**
1730 * Forwards the socket buffer to the internal network.
1731 * @param pThis The net filter instance.
1732 * @param pBuf The socket buffer. This is consumed by this function. */
1733static void vboxNetFltLinuxForwardToIntNet(PVBOXNETFLTINS pThis, struct sk_buff *pBuf)
1734{
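    /* PACKET_OUTGOING means the host stack is transmitting the buffer;
       any other packet type means the buffer arrived from the wire. */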
1735 uint32_t fSrc = pBuf->pkt_type == PACKET_OUTGOING ? INTNETTRUNKDIR_HOST : INTNETTRUNKDIR_WIRE;
1736
1737#ifdef VBOXNETFLT_WITH_GSO
1738 if (skb_is_gso(pBuf))
1739 {
1740 PDMNETWORKGSO GsoCtx;
1741 Log3(("vboxNetFltLinuxForwardToIntNet: skb len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_segs=%u gso_type=%x frag_list=%p pkt_type=%x ip_summed=%d\n",
1742 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->gso_size, skb_shinfo(pBuf)->gso_segs, skb_shinfo(pBuf)->gso_type, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type, pBuf->ip_summed));
1743# ifdef VBOXNETFLT_WITH_GSO_RECV
1744 if ( (skb_shinfo(pBuf)->gso_type & (SKB_GSO_UDP | SKB_GSO_TCPV6 | SKB_GSO_TCPV4))
1745 && vboxNetFltLinuxCanForwardAsGso(pThis, pBuf, fSrc, &GsoCtx) )
1746 vboxNetFltLinuxForwardAsGso(pThis, pBuf, fSrc, &GsoCtx);
1747 else
1748# endif
1749 {
1750 /* Need to segment the packet */
1751 struct sk_buff *pNext;
1752 struct sk_buff *pSegment = skb_gso_segment(pBuf, 0 /*supported features*/);
1753 if (IS_ERR(pSegment))
1754 {
1755 dev_kfree_skb(pBuf);
1756 LogRel(("VBoxNetFlt: Failed to segment a packet (%d).\n", PTR_ERR(pSegment)));
1757 return;
1758 }
1759
1760 for (; pSegment; pSegment = pNext)
1761 {
1762 Log3(("vboxNetFltLinuxForwardToIntNet: segment len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_segs=%u gso_type=%x frag_list=%p pkt_type=%x\n",
1763 pSegment->len, pSegment->data_len, pSegment->truesize, pSegment->next, skb_shinfo(pSegment)->nr_frags, skb_shinfo(pSegment)->gso_size, skb_shinfo(pSegment)->gso_segs, skb_shinfo(pSegment)->gso_type, skb_shinfo(pSegment)->frag_list, pSegment->pkt_type));
1764 pNext = pSegment->next;
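                /* Unlink the segment from the chain so that
                   vboxNetFltLinuxForwardSegment() can consume (free) it
                   independently of the remaining segments. */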
1765 pSegment->next = 0;
1766 vboxNetFltLinuxForwardSegment(pThis, pSegment, fSrc);
1767 }
1768 dev_kfree_skb(pBuf);
1769 }
1770 }
1771 else
1772#endif /* VBOXNETFLT_WITH_GSO */
1773 {
1774 if (pBuf->ip_summed == CHECKSUM_PARTIAL && pBuf->pkt_type == PACKET_OUTGOING)
1775 {
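            /* CHECKSUM_PARTIAL means the host stack deferred checksumming to
               the NIC; since we divert the packet before it reaches any
               hardware, the checksum must be computed in software here. */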
1776#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
1777 /*
1778 * Try to work around the problem with CentOS 4.7 and 5.2 (2.6.9
1779 * and 2.6.18 kernels): they pass the wrong 'h' pointer down. We take
1780 * the IP header length from the header itself and reconstruct the 'h'
1781 * pointer to the TCP (or whatever) header.
1782 */
1783 unsigned char *tmp = pBuf->h.raw;
1784 if (pBuf->h.raw == pBuf->nh.raw && pBuf->protocol == htons(ETH_P_IP))
1785 pBuf->h.raw = pBuf->nh.raw + pBuf->nh.iph->ihl * 4;
1786#endif /* LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18) */
1787 if (VBOX_SKB_CHECKSUM_HELP(pBuf))
1788 {
1789 LogRel(("VBoxNetFlt: Failed to compute checksum, dropping the packet.\n"));
1790 dev_kfree_skb(pBuf);
1791 return;
1792 }
1793#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
1794 /* Restore the original (wrong) pointer. */
1795 pBuf->h.raw = tmp;
1796#endif /* LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18) */
1797 }
1798 vboxNetFltLinuxForwardSegment(pThis, pBuf, fSrc);
1799 }
1800}
1801
1802#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
1803/**
1804 * Work queue handler that forwards the socket buffers queued by
1805 * vboxNetFltLinuxPacketHandler to the internal network.
1806 *
1807 * @param pWork Pointer to the XmitTask member of our instance data.
1808 */
1809# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20)
1810static void vboxNetFltLinuxXmitTask(struct work_struct *pWork)
1811# else
1812static void vboxNetFltLinuxXmitTask(void *pWork)
1813# endif
1814{
1815 PVBOXNETFLTINS pThis = VBOX_FLT_XT_TO_INST(pWork);
1816 struct sk_buff *pBuf;
1817
1818 Log4(("vboxNetFltLinuxXmitTask: Got work %p.\n", pWork));
1819
1820 /*
1821 * Active? Retain the instance and increment the busy counter.
1822 */
1823 if (vboxNetFltTryRetainBusyActive(pThis))
1824 {
1825 while ((pBuf = skb_dequeue(&pThis->u.s.XmitQueue)) != NULL)
1826 vboxNetFltLinuxForwardToIntNet(pThis, pBuf);
1827
1828 vboxNetFltRelease(pThis, true /* fBusy */);
1829 }
1830 else
1831 {
1832 /** @todo Shouldn't we just drop the packets here? There is little point in
1833 * making them accumulate when the VM is paused and it'll only waste
1834 * kernel memory anyway... Hmm, maybe wait a short while (2-5 secs)
1835 * before starting to drain the packets (goes for the intnet ring buf
1836 * too)? */
1837 }
1838}
1839#endif /* !VBOXNETFLT_LINUX_NO_XMIT_QUEUE */
1840
1841/**
1842 * Reports the GSO capabilities of the hardware NIC.
1843 *
1844 * @param pThis The net filter instance. The caller holds a
1845 * reference to this.
1846 */
1847static void vboxNetFltLinuxReportNicGsoCapabilities(PVBOXNETFLTINS pThis)
1848{
1849#ifdef VBOXNETFLT_WITH_GSO_XMIT_WIRE
1850 if (vboxNetFltTryRetainBusyNotDisconnected(pThis))
1851 {
1852 struct net_device *pDev;
1853 PINTNETTRUNKSWPORT pSwitchPort;
1854 unsigned int fFeatures;
1855 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
1856
1857 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1858
1859 pSwitchPort = pThis->pSwitchPort; /* this doesn't need to be here, but it doesn't harm. */
1860 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1861 if (pDev)
1862 fFeatures = pDev->features;
1863 else
1864 fFeatures = 0;
1865
1866 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1867
1868 if (pThis->pSwitchPort)
1869 {
1870 /* Set/update the GSO capabilities of the NIC. */
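            /* Example (assumed feature set): a NIC advertising NETIF_F_TSO
               and NETIF_F_TSO6 gets the IPV4_TCP and IPV6_TCP bits reported,
               allowing the switch to pass such GSO frames through for
               transmission on the wire without segmenting them first. */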
1871 uint32_t fGsoCapabilities = 0;
1872 if (fFeatures & NETIF_F_TSO)
1873 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_TCP);
1874 if (fFeatures & NETIF_F_TSO6)
1875 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_TCP);
1876# if 0 /** @todo GSO: Test UDP offloading (UFO) on linux. */
1877 if (fFeatures & NETIF_F_UFO)
1878 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_UDP);
1879 if (fFeatures & NETIF_F_UFO)
1880 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_UDP);
1881# endif
1882 pThis->pSwitchPort->pfnReportGsoCapabilities(pThis->pSwitchPort, fGsoCapabilities, INTNETTRUNKDIR_WIRE);
1883 }
1884
1885 vboxNetFltRelease(pThis, true /*fBusy*/);
1886 }
1887#endif /* VBOXNETFLT_WITH_GSO_XMIT_WIRE */
1888}
1889
1890/**
1891 * Helper that determines whether the host (ignoring us) is operating the
1892 * interface in promiscuous mode or not.
1893 */
1894static bool vboxNetFltLinuxPromiscuous(PVBOXNETFLTINS pThis)
1895{
1896 bool fRc = false;
1897 struct net_device * pDev = vboxNetFltLinuxRetainNetDev(pThis);
1898 if (pDev)
1899 {
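        /* Subtract our own contribution (0 or 1) from the device's
           promiscuity reference count; anything left means some other party
           has put the interface into promiscuous mode. */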
1900 fRc = !!(pDev->promiscuity - (ASMAtomicUoReadBool(&pThis->u.s.fPromiscuousSet) & 1));
1901 LogFlow(("vboxNetFltPortOsIsPromiscuous: returns %d, pDev->promiscuity=%d, fPromiscuousSet=%d\n",
1902 fRc, pDev->promiscuity, pThis->u.s.fPromiscuousSet));
1903 vboxNetFltLinuxReleaseNetDev(pThis, pDev);
1904 }
1905 return fRc;
1906}
1907
1908/**
1909 * Internal worker for vboxNetFltLinuxNotifierCallback.
1910 *
1911 * @returns VBox status code.
1912 * @param pThis The instance.
1913 * @param pDev The net device identified by the NETDEV_REGISTER
1914 * notification as the interface we should attach to.
1915 */
1916static int vboxNetFltLinuxAttachToInterface(PVBOXNETFLTINS pThis, struct net_device *pDev)
1917{
1918 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
1919 LogFlow(("vboxNetFltLinuxAttachToInterface: pThis=%p (%s)\n", pThis, pThis->szName));
1920
1921 /*
1922 * Retain and store the device.
1923 */
1924 dev_hold(pDev);
1925
1926 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1927 ASMAtomicUoWritePtr(&pThis->u.s.pDev, pDev);
1928 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1929
1930 Log(("vboxNetFltLinuxAttachToInterface: Device %p(%s) retained. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1931 Log(("vboxNetFltLinuxAttachToInterface: Got pDev=%p pThis=%p pThis->u.s.pDev=%p\n", pDev, pThis, ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *)));
1932
1933 /* Get the mac address while we still have a valid net_device reference. */
1934 memcpy(&pThis->u.s.MacAddr, pDev->dev_addr, sizeof(pThis->u.s.MacAddr));
1935
1936 /*
1937 * Install a packet filter for this device with a protocol wildcard (ETH_P_ALL).
1938 */
1939 pThis->u.s.PacketType.type = __constant_htons(ETH_P_ALL);
1940 pThis->u.s.PacketType.dev = pDev;
1941 pThis->u.s.PacketType.func = vboxNetFltLinuxPacketHandler;
1942 dev_add_pack(&pThis->u.s.PacketType);
1943
1944#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
1945 vboxNetFltLinuxHookDev(pThis, pDev);
1946#endif
1947#ifdef VBOXNETFLT_WITH_QDISC
1948 vboxNetFltLinuxQdiscInstall(pThis, pDev);
1949#endif /* VBOXNETFLT_WITH_QDISC */
1950
1951 /*
1952 * Set indicators that require the spinlock. Be a bit paranoid about
1953 * racing the device notification handler.
1954 */
1955 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1956 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1957 if (pDev)
1958 {
1959 ASMAtomicUoWriteBool(&pThis->fDisconnectedFromHost, false);
1960 ASMAtomicUoWriteBool(&pThis->u.s.fRegistered, true);
1961 pDev = NULL; /* don't dereference it */
1962 }
1963 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1964 Log(("vboxNetFltLinuxAttachToInterface: this=%p: Packet handler installed.\n", pThis));
1965
1966 /*
1967 * If the above succeeded, report the GSO capabilities; if not,
1968 * undo everything and release the device.
1969 */
1970 if (!pDev)
1971 {
1972 Assert(pThis->pSwitchPort);
1973 if (vboxNetFltTryRetainBusyNotDisconnected(pThis))
1974 {
1975 vboxNetFltLinuxReportNicGsoCapabilities(pThis);
1976 pThis->pSwitchPort->pfnReportMacAddress(pThis->pSwitchPort, &pThis->u.s.MacAddr);
1977 pThis->pSwitchPort->pfnReportPromiscuousMode(pThis->pSwitchPort, vboxNetFltLinuxPromiscuous(pThis));
1978 pThis->pSwitchPort->pfnReportNoPreemptDsts(pThis->pSwitchPort, INTNETTRUNKDIR_WIRE | INTNETTRUNKDIR_HOST);
1979 vboxNetFltRelease(pThis, true /*fBusy*/);
1980 }
1981 }
1982 else
1983 {
1984#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
1985 vboxNetFltLinuxUnhookDev(pThis, pDev);
1986#endif
1987#ifdef VBOXNETFLT_WITH_QDISC
1988 vboxNetFltLinuxQdiscRemove(pThis, pDev);
1989#endif /* VBOXNETFLT_WITH_QDISC */
1990 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1991 ASMAtomicUoWriteNullPtr(&pThis->u.s.pDev);
1992 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1993 dev_put(pDev);
1994 Log(("vboxNetFltLinuxAttachToInterface: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1995 }
1996
1997 LogRel(("VBoxNetFlt: attached to '%s' / %.*Rhxs\n", pThis->szName, sizeof(pThis->u.s.MacAddr), &pThis->u.s.MacAddr));
1998 return VINF_SUCCESS;
1999}
2000
2001
2002static int vboxNetFltLinuxUnregisterDevice(PVBOXNETFLTINS pThis, struct net_device *pDev)
2003{
2004 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
2005
2006 Assert(!pThis->fDisconnectedFromHost);
2007
2008#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
2009 vboxNetFltLinuxUnhookDev(pThis, pDev);
2010#endif
2011#ifdef VBOXNETFLT_WITH_QDISC
2012 vboxNetFltLinuxQdiscRemove(pThis, pDev);
2013#endif /* VBOXNETFLT_WITH_QDISC */
2014
2015 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
2016 ASMAtomicWriteBool(&pThis->u.s.fRegistered, false);
2017 ASMAtomicWriteBool(&pThis->fDisconnectedFromHost, true);
2018 ASMAtomicUoWriteNullPtr(&pThis->u.s.pDev);
2019 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
2020
2021 dev_remove_pack(&pThis->u.s.PacketType);
2022#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
2023 skb_queue_purge(&pThis->u.s.XmitQueue);
2024#endif
2025 Log(("vboxNetFltLinuxUnregisterDevice: this=%p: Packet handler removed, xmit queue purged.\n", pThis));
2026 Log(("vboxNetFltLinuxUnregisterDevice: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
2027 dev_put(pDev);
2028
2029 return NOTIFY_OK;
2030}
2031
2032static int vboxNetFltLinuxDeviceIsUp(PVBOXNETFLTINS pThis, struct net_device *pDev)
2033{
2034 /* Only enable promiscuous mode if the trunk is active and we have not set it already. */
2035 if ( pThis->enmTrunkState == INTNETTRUNKIFSTATE_ACTIVE
2036 && !ASMAtomicUoReadBool(&pThis->u.s.fPromiscuousSet))
2037 {
2038 /* No extra locking is needed here; the netdevice notifier chain is invoked with the rtnl lock held. */
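        /* dev_set_promiscuity() maintains a per-device reference count, so
           the +1 here and the matching -1 in the going-down path nest
           correctly with other users of promiscuous mode. */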
2039 dev_set_promiscuity(pDev, 1);
2040 ASMAtomicWriteBool(&pThis->u.s.fPromiscuousSet, true);
2041 Log(("vboxNetFltLinuxDeviceIsUp: enabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2042 }
2043 else
2044 Log(("vboxNetFltLinuxDeviceIsUp: no need to enable promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2045 return NOTIFY_OK;
2046}
2047
2048static int vboxNetFltLinuxDeviceGoingDown(PVBOXNETFLTINS pThis, struct net_device *pDev)
2049{
2050 /* Undo promiscuous mode if we have set it. */
2051 if (ASMAtomicUoReadBool(&pThis->u.s.fPromiscuousSet))
2052 {
2053 /* No extra locking is needed here; the netdevice notifier chain is invoked with the rtnl lock held. */
2054 dev_set_promiscuity(pDev, -1);
2055 ASMAtomicWriteBool(&pThis->u.s.fPromiscuousSet, false);
2056 Log(("vboxNetFltLinuxDeviceGoingDown: disabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2057 }
2058 else
2059 Log(("vboxNetFltLinuxDeviceGoingDown: no need to disable promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2060 return NOTIFY_OK;
2061}
2062
2063#ifdef LOG_ENABLED
2064/** Stringify the NETDEV_XXX constants. */
2065static const char *vboxNetFltLinuxGetNetDevEventName(unsigned long ulEventType)
2066{
2067 const char *pszEvent = "NETDEV_<unknown>";
2068 switch (ulEventType)
2069 {
2070 case NETDEV_REGISTER: pszEvent = "NETDEV_REGISTER"; break;
2071 case NETDEV_UNREGISTER: pszEvent = "NETDEV_UNREGISTER"; break;
2072 case NETDEV_UP: pszEvent = "NETDEV_UP"; break;
2073 case NETDEV_DOWN: pszEvent = "NETDEV_DOWN"; break;
2074 case NETDEV_REBOOT: pszEvent = "NETDEV_REBOOT"; break;
2075 case NETDEV_CHANGENAME: pszEvent = "NETDEV_CHANGENAME"; break;
2076 case NETDEV_CHANGE: pszEvent = "NETDEV_CHANGE"; break;
2077 case NETDEV_CHANGEMTU: pszEvent = "NETDEV_CHANGEMTU"; break;
2078 case NETDEV_CHANGEADDR: pszEvent = "NETDEV_CHANGEADDR"; break;
2079 case NETDEV_GOING_DOWN: pszEvent = "NETDEV_GOING_DOWN"; break;
2080# ifdef NETDEV_FEAT_CHANGE
2081 case NETDEV_FEAT_CHANGE: pszEvent = "NETDEV_FEAT_CHANGE"; break;
2082# endif
2083 }
2084 return pszEvent;
2085}
2086#endif /* LOG_ENABLED */
2087
2088/**
2089 * Callback for listening to netdevice events.
2090 *
2091 * This works the rediscovery, clean up on unregistration, promiscuity on
2092 * up/down, and GSO feature changes from ethtool.
2093 *
2094 * @returns NOTIFY_OK
2095 * @param self Pointer to our notifier registration block.
2096 * @param ulEventType The event.
2097 * @param ptr Event specific, but it is usually the device it
2098 * relates to.
2099 */
2100static int vboxNetFltLinuxNotifierCallback(struct notifier_block *self, unsigned long ulEventType, void *ptr)
2101{
2103 PVBOXNETFLTINS pThis = VBOX_FLT_NB_TO_INST(self);
2104 struct net_device *pDev = (struct net_device *)ptr;
2105 int rc = NOTIFY_OK;
2106
2107 Log(("VBoxNetFlt: got event %s(0x%lx) on %s, pDev=%p pThis=%p pThis->u.s.pDev=%p\n",
2108 vboxNetFltLinuxGetNetDevEventName(ulEventType), ulEventType, pDev->name, pDev, pThis, ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *)));
2109 if ( ulEventType == NETDEV_REGISTER
2110 && !strcmp(pDev->name, pThis->szName))
2111 {
2112 vboxNetFltLinuxAttachToInterface(pThis, pDev);
2113 }
2114 else
2115 {
2116 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
2117 if (pDev == ptr)
2118 {
2119 switch (ulEventType)
2120 {
2121 case NETDEV_UNREGISTER:
2122 rc = vboxNetFltLinuxUnregisterDevice(pThis, pDev);
2123 break;
2124 case NETDEV_UP:
2125 rc = vboxNetFltLinuxDeviceIsUp(pThis, pDev);
2126 break;
2127 case NETDEV_GOING_DOWN:
2128 rc = vboxNetFltLinuxDeviceGoingDown(pThis, pDev);
2129 break;
2130 case NETDEV_CHANGENAME:
2131 break;
2132#ifdef NETDEV_FEAT_CHANGE
2133 case NETDEV_FEAT_CHANGE:
2134 vboxNetFltLinuxReportNicGsoCapabilities(pThis);
2135 break;
2136#endif
2137 }
2138 }
2139 }
2140
2141 return rc;
2142}
2143
2144bool vboxNetFltOsMaybeRediscovered(PVBOXNETFLTINS pThis)
2145{
2146 return !ASMAtomicUoReadBool(&pThis->fDisconnectedFromHost);
2147}
2148
2149int vboxNetFltPortOsXmit(PVBOXNETFLTINS pThis, void *pvIfData, PINTNETSG pSG, uint32_t fDst)
2150{
2151 struct net_device * pDev;
2152 int err;
2153 int rc = VINF_SUCCESS;
2154 NOREF(pvIfData);
2155
2156 LogFlow(("vboxNetFltPortOsXmit: pThis=%p (%s)\n", pThis, pThis->szName));
2157
2158 pDev = vboxNetFltLinuxRetainNetDev(pThis);
2159 if (pDev)
2160 {
2161 /*
2162 * Create a sk_buff for the gather list and push it onto the wire.
2163 */
2164 if (fDst & INTNETTRUNKDIR_WIRE)
2165 {
2166 struct sk_buff *pBuf = vboxNetFltLinuxSkBufFromSG(pThis, pSG, true);
2167 if (pBuf)
2168 {
2169 vboxNetFltDumpPacket(pSG, true, "wire", 1);
2170 Log4(("vboxNetFltPortOsXmit: pBuf->cb dump:\n%.*Rhxd\n", sizeof(pBuf->cb), pBuf->cb));
2171 Log4(("vboxNetFltPortOsXmit: dev_queue_xmit(%p)\n", pBuf));
2172 err = dev_queue_xmit(pBuf);
2173 if (err)
2174 rc = RTErrConvertFromErrno(err);
2175 }
2176 else
2177 rc = VERR_NO_MEMORY;
2178 }
2179
2180 /*
2181 * Create a sk_buff for the gather list and push it onto the host stack.
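 * Note: netif_rx_ni() is the process context variant of netif_rx(); it
 * queues the buffer and makes sure any pending receive softirq is run
 * before returning.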
2182 */
2183 if (fDst & INTNETTRUNKDIR_HOST)
2184 {
2185 struct sk_buff *pBuf = vboxNetFltLinuxSkBufFromSG(pThis, pSG, false);
2186 if (pBuf)
2187 {
2188 vboxNetFltDumpPacket(pSG, true, "host", (fDst & INTNETTRUNKDIR_WIRE) ? 0 : 1);
2189 Log4(("vboxNetFltPortOsXmit: pBuf->cb dump:\n%.*Rhxd\n", sizeof(pBuf->cb), pBuf->cb));
2190 Log4(("vboxNetFltPortOsXmit: netif_rx_ni(%p)\n", pBuf));
2191 err = netif_rx_ni(pBuf);
2192 if (err)
2193 rc = RTErrConvertFromErrno(err);
2194 }
2195 else
2196 rc = VERR_NO_MEMORY;
2197 }
2198
2199 vboxNetFltLinuxReleaseNetDev(pThis, pDev);
2200 }
2201
2202 return rc;
2203}
2204
2205
2206void vboxNetFltPortOsSetActive(PVBOXNETFLTINS pThis, bool fActive)
2207{
2208 struct net_device * pDev;
2209
2210 LogFlow(("vboxNetFltPortOsSetActive: pThis=%p (%s), fActive=%s, fDisablePromiscuous=%s\n",
2211 pThis, pThis->szName, fActive?"true":"false",
2212 pThis->fDisablePromiscuous?"true":"false"));
2213
2214 if (pThis->fDisablePromiscuous)
2215 return;
2216
2217 pDev = vboxNetFltLinuxRetainNetDev(pThis);
2218 if (pDev)
2219 {
2220 /*
2221 * This api is a bit weird, the best reference is the code.
2222 *
2223 * Also, we have a bit or race conditions wrt the maintance of
2224 * host the interface promiscuity for vboxNetFltPortOsIsPromiscuous.
2225 */
2226#ifdef LOG_ENABLED
2227 u_int16_t fIf;
2228 unsigned const cPromiscBefore = pDev->promiscuity;
2229#endif
2230 if (fActive)
2231 {
2232 Assert(!pThis->u.s.fPromiscuousSet);
2233
2234 rtnl_lock();
2235 dev_set_promiscuity(pDev, 1);
2236 rtnl_unlock();
2237 pThis->u.s.fPromiscuousSet = true;
2238 Log(("vboxNetFltPortOsSetActive: enabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2239 }
2240 else
2241 {
2242 if (pThis->u.s.fPromiscuousSet)
2243 {
2244 rtnl_lock();
2245 dev_set_promiscuity(pDev, -1);
2246 rtnl_unlock();
2247 Log(("vboxNetFltPortOsSetActive: disabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2248 }
2249 pThis->u.s.fPromiscuousSet = false;
2250
2251#ifdef LOG_ENABLED
2252 fIf = dev_get_flags(pDev);
2253 Log(("VBoxNetFlt: fIf=%#x; %d->%d\n", fIf, cPromiscBefore, pDev->promiscuity));
2254#endif
2255 }
2256
2257 vboxNetFltLinuxReleaseNetDev(pThis, pDev);
2258 }
2259}
2260
2261
2262int vboxNetFltOsDisconnectIt(PVBOXNETFLTINS pThis)
2263{
2264#ifdef VBOXNETFLT_WITH_QDISC
2265 vboxNetFltLinuxQdiscRemove(pThis, NULL);
2266#endif /* VBOXNETFLT_WITH_QDISC */
2267 /*
2268 * Remove the packet handler when we get disconnected from the internal
2269 * switch, as we don't want the handler to forward packets to a disconnected switch.
2270 */
2271 dev_remove_pack(&pThis->u.s.PacketType);
2272 return VINF_SUCCESS;
2273}
2274
2275
2276int vboxNetFltOsConnectIt(PVBOXNETFLTINS pThis)
2277{
2278 /*
2279 * Report the GSO capabilities of the host and device (if connected).
2280 * Note! No need to mark ourselves busy here.
2281 */
2282 /** @todo duplicate work here now? Attach */
2283#if defined(VBOXNETFLT_WITH_GSO_XMIT_HOST)
2284 pThis->pSwitchPort->pfnReportGsoCapabilities(pThis->pSwitchPort,
2285 0
2286 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_TCP)
2287 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_TCP)
2288# if 0 /** @todo GSO: Test UDP offloading (UFO) on linux. */
2289 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_UDP)
2290 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_UDP)
2291# endif
2292 , INTNETTRUNKDIR_HOST);
2293
2294#endif
2295 vboxNetFltLinuxReportNicGsoCapabilities(pThis);
2296
2297 return VINF_SUCCESS;
2298}
2299
2300
2301void vboxNetFltOsDeleteInstance(PVBOXNETFLTINS pThis)
2302{
2303 struct net_device *pDev;
2304 bool fRegistered;
2305 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
2306
2307#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
2308 vboxNetFltLinuxUnhookDev(pThis, NULL);
2309#endif
2310
2311 /** @todo This code may race vboxNetFltLinuxUnregisterDevice (very, very
2312 * unlikely, but possible nonetheless). Since it doesn't actually update the
2313 * state (just reads it), it is likely to panic in some interesting
2314 * ways. */
2315
2316 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
2317 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
2318 fRegistered = ASMAtomicUoReadBool(&pThis->u.s.fRegistered);
2319 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
2320
2321 if (fRegistered)
2322 {
2323#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
2324 skb_queue_purge(&pThis->u.s.XmitQueue);
2325#endif
2326 Log(("vboxNetFltOsDeleteInstance: this=%p: Packet handler removed, xmit queue purged.\n", pThis));
2327 Log(("vboxNetFltOsDeleteInstance: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
2328 dev_put(pDev);
2329 }
2330 Log(("vboxNetFltOsDeleteInstance: this=%p: Notifier removed.\n", pThis));
2331 unregister_netdevice_notifier(&pThis->u.s.Notifier);
2332 module_put(THIS_MODULE);
2333}
2334
2335
2336int vboxNetFltOsInitInstance(PVBOXNETFLTINS pThis, void *pvContext)
2337{
2338 int err;
2339 NOREF(pvContext);
2340
2341 pThis->u.s.Notifier.notifier_call = vboxNetFltLinuxNotifierCallback;
2342 err = register_netdevice_notifier(&pThis->u.s.Notifier);
2343 if (err)
2344 return VERR_INTNET_FLT_IF_FAILED;
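    /*
     * register_netdevice_notifier() replays NETDEV_REGISTER (and NETDEV_UP)
     * events for all devices that already exist, so if the named interface
     * is present our callback has already attached and set fRegistered.
     */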
2345 if (!pThis->u.s.fRegistered)
2346 {
2347 unregister_netdevice_notifier(&pThis->u.s.Notifier);
2348 LogRel(("VBoxNetFlt: failed to find %s.\n", pThis->szName));
2349 return VERR_INTNET_FLT_IF_NOT_FOUND;
2350 }
2351
2352 Log(("vboxNetFltOsInitInstance: this=%p: Notifier installed.\n", pThis));
2353 if ( pThis->fDisconnectedFromHost
2354 || !try_module_get(THIS_MODULE))
2355 return VERR_INTNET_FLT_IF_FAILED;
2356
2357 return VINF_SUCCESS;
2358}
2359
2360int vboxNetFltOsPreInitInstance(PVBOXNETFLTINS pThis)
2361{
2362 /*
2363 * Init the linux specific members.
2364 */
2365 pThis->u.s.pDev = NULL;
2366 pThis->u.s.fRegistered = false;
2367 pThis->u.s.fPromiscuousSet = false;
2368 memset(&pThis->u.s.PacketType, 0, sizeof(pThis->u.s.PacketType));
2369#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
2370 skb_queue_head_init(&pThis->u.s.XmitQueue);
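    /* The workqueue API changed in 2.6.20: INIT_WORK() lost its data
       argument and handlers receive the work_struct pointer instead. */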
2371# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20)
2372 INIT_WORK(&pThis->u.s.XmitTask, vboxNetFltLinuxXmitTask);
2373# else
2374 INIT_WORK(&pThis->u.s.XmitTask, vboxNetFltLinuxXmitTask, &pThis->u.s.XmitTask);
2375# endif
2376#endif
2377
2378 return VINF_SUCCESS;
2379}
2380
2381
2382void vboxNetFltPortOsNotifyMacAddress(PVBOXNETFLTINS pThis, void *pvIfData, PCRTMAC pMac)
2383{
2384 NOREF(pThis); NOREF(pvIfData); NOREF(pMac);
2385}
2386
2387
2388int vboxNetFltPortOsConnectInterface(PVBOXNETFLTINS pThis, void *pvIf, void **pvIfData)
2389{
2390 /* Nothing to do */
2391 NOREF(pThis); NOREF(pvIf); NOREF(pvIfData);
2392 return VINF_SUCCESS;
2393}
2394
2395
2396int vboxNetFltPortOsDisconnectInterface(PVBOXNETFLTINS pThis, void *pvIfData)
2397{
2398 /* Nothing to do */
2399 NOREF(pThis); NOREF(pvIfData);
2400 return VINF_SUCCESS;
2401}
2402