VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/VBoxNetFlt/linux/VBoxNetFlt-linux.c@31680

Last change on this file since 31680 was 31680, checked in by vboxsync, 14 years ago

don't enable qdisc support if CONFIG_NET_SCHED is disabled (Linux 2.6.0 already has this config variable)

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 82.4 KB
1/* $Id: VBoxNetFlt-linux.c 31680 2010-08-16 07:32:57Z vboxsync $ */
2/** @file
3 * VBoxNetFlt - Network Filter Driver (Host), Linux Specific Code.
4 */
5
6/*
7 * Copyright (C) 2006-2008 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*******************************************************************************
19* Header Files *
20*******************************************************************************/
21#define LOG_GROUP LOG_GROUP_NET_FLT_DRV
22#define VBOXNETFLT_LINUX_NO_XMIT_QUEUE
23#include "the-linux-kernel.h"
24#include "version-generated.h"
25#include "product-generated.h"
26#include <linux/netdevice.h>
27#include <linux/etherdevice.h>
28#include <linux/rtnetlink.h>
29#include <linux/miscdevice.h>
30#include <linux/ip.h>
31
32#include <VBox/log.h>
33#include <VBox/err.h>
34#include <VBox/intnetinline.h>
35#include <VBox/pdmnetinline.h>
36#include <VBox/param.h>
37#include <iprt/alloca.h>
38#include <iprt/assert.h>
39#include <iprt/spinlock.h>
40#include <iprt/semaphore.h>
41#include <iprt/initterm.h>
42#include <iprt/process.h>
43#include <iprt/mem.h>
44#include <iprt/net.h>
45#include <iprt/log.h>
46#include <iprt/mp.h>
47#include <iprt/mem.h>
48#include <iprt/time.h>
49
50#define VBOXNETFLT_OS_SPECFIC 1
51#include "../VBoxNetFltInternal.h"
52
53#ifdef CONFIG_NET_SCHED
54# define VBOXNETFLT_WITH_QDISC /* Comment this out to disable qdisc support */
55# ifdef VBOXNETFLT_WITH_QDISC
56# include <net/pkt_sched.h>
57# endif /* VBOXNETFLT_WITH_QDISC */
58#endif
59
60
61/*******************************************************************************
62* Defined Constants And Macros *
63*******************************************************************************/
64#define VBOX_FLT_NB_TO_INST(pNB) RT_FROM_MEMBER(pNB, VBOXNETFLTINS, u.s.Notifier)
65#define VBOX_FLT_PT_TO_INST(pPT) RT_FROM_MEMBER(pPT, VBOXNETFLTINS, u.s.PacketType)
66#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
67# define VBOX_FLT_XT_TO_INST(pXT) RT_FROM_MEMBER(pXT, VBOXNETFLTINS, u.s.XmitTask)
68#endif
69
70#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
71# define VBOX_SKB_RESET_NETWORK_HDR(skb) skb_reset_network_header(skb)
72# define VBOX_SKB_RESET_MAC_HDR(skb) skb_reset_mac_header(skb)
73#else
74# define VBOX_SKB_RESET_NETWORK_HDR(skb) skb->nh.raw = skb->data
75# define VBOX_SKB_RESET_MAC_HDR(skb) skb->mac.raw = skb->data
76#endif
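/*
 * Note: kernel 2.6.22 replaced the mac/nh/h pointer unions in struct sk_buff
 * with offsets and accessor functions such as skb_reset_network_header(),
 * hence the two variants above.
 */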
77
78#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
79# define VBOX_SKB_CHECKSUM_HELP(skb) skb_checksum_help(skb)
80#else
81# define CHECKSUM_PARTIAL CHECKSUM_HW
82# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 10)
83# define VBOX_SKB_CHECKSUM_HELP(skb) skb_checksum_help(skb, 0)
84# else
85# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 7)
86# define VBOX_SKB_CHECKSUM_HELP(skb) skb_checksum_help(&skb, 0)
87# else
88# define VBOX_SKB_CHECKSUM_HELP(skb) (!skb_checksum_help(skb))
89# endif
90/* Versions prior to 2.6.10 use stats for both bstats and qstats */
91# define bstats stats
92# define qstats stats
93# endif
94#endif
95
96#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 13)
97static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
98{
99 kfree_skb(skb);
100 sch->stats.drops++;
101
102 return NET_XMIT_DROP;
103}
104#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 13) */
105
106#ifndef NET_IP_ALIGN
107# define NET_IP_ALIGN 2
108#endif
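/*
 * NET_IP_ALIGN is the padding reserved in front of the Ethernet header so
 * that the IP header, which follows the 14-byte Ethernet header, lands on a
 * 16-byte boundary (2 + 14 = 16). Old kernels lack the define, hence the
 * fallback above.
 */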
109
110#if 0
111/** Create scatter / gather segments for fragments. When not used, we will
112 * linearize the socket buffer before creating the internal networking SG. */
113# define VBOXNETFLT_SG_SUPPORT 1
114#endif
115
116#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
117/** Indicates that the linux kernel may send us GSO frames. */
118# define VBOXNETFLT_WITH_GSO 1
119
120/** This enables or disables the transmitting of GSO frames from the internal
121 * network to the host. */
122# define VBOXNETFLT_WITH_GSO_XMIT_HOST 1
123
124# if 0 /** @todo This is currently disabled because it causes a performance loss of 5-10%. */
125/** This enables or disables the transmitting of GSO frames from the internal
126 * network to the wire. */
127# define VBOXNETFLT_WITH_GSO_XMIT_WIRE 1
128# endif
129
130/** This enables or disables the forwarding/flooding of GSO frames from the host
131 * to the internal network. */
132# define VBOXNETFLT_WITH_GSO_RECV 1
133
134#endif
135
136#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29)
137/** This enables or disables handling of GSO frames coming from the wire (GRO). */
138# define VBOXNETFLT_WITH_GRO 1
139#endif
140/*
141 * GRO support was backported to RHEL 5.4
142 */
143#ifdef RHEL_RELEASE_CODE
144# if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 4)
145# define VBOXNETFLT_WITH_GRO 1
146# endif
147#endif
148
149/*******************************************************************************
150* Internal Functions *
151*******************************************************************************/
152static int VBoxNetFltLinuxInit(void);
153static void VBoxNetFltLinuxUnload(void);
154static void vboxNetFltLinuxForwardToIntNet(PVBOXNETFLTINS pThis, struct sk_buff *pBuf);
155
156
157/*******************************************************************************
158* Global Variables *
159*******************************************************************************/
160/**
161 * The (common) global data.
162 */
163static VBOXNETFLTGLOBALS g_VBoxNetFltGlobals;
164
165module_init(VBoxNetFltLinuxInit);
166module_exit(VBoxNetFltLinuxUnload);
167
168MODULE_AUTHOR(VBOX_VENDOR);
169MODULE_DESCRIPTION(VBOX_PRODUCT " Network Filter Driver");
170MODULE_LICENSE("GPL");
171#ifdef MODULE_VERSION
172MODULE_VERSION(VBOX_VERSION_STRING " (" RT_XSTR(INTNETTRUNKIFPORT_VERSION) ")");
173#endif
174
175
176#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 12) && defined(LOG_ENABLED)
177unsigned dev_get_flags(const struct net_device *dev)
178{
179 unsigned flags;
180
181 flags = (dev->flags & ~(IFF_PROMISC |
182 IFF_ALLMULTI |
183 IFF_RUNNING)) |
184 (dev->gflags & (IFF_PROMISC |
185 IFF_ALLMULTI));
186
187 if (netif_running(dev) && netif_carrier_ok(dev))
188 flags |= IFF_RUNNING;
189
190 return flags;
191}
192#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 12) */
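/*
 * The above is a local copy of the dev_get_flags() helper, which older
 * kernels lack (hence the version guard); only the logging code needs it,
 * as the LOG_ENABLED condition indicates.
 */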
193
194
195#ifdef VBOXNETFLT_WITH_QDISC
196//#define QDISC_LOG(x) printk x
197#define QDISC_LOG(x) do { } while (0)
198
199#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
200#define QDISC_CREATE(dev, queue, ops, parent) qdisc_create_dflt(dev, ops)
201#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
202#define QDISC_CREATE(dev, queue, ops, parent) qdisc_create_dflt(dev, ops, parent)
203#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
204#define QDISC_CREATE(dev, queue, ops, parent) qdisc_create_dflt(dev, queue, ops, parent)
205#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
206
207#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
208#define qdisc_dev(qdisc) (qdisc->dev)
209#define qdisc_pkt_len(skb) (skb->len)
210#define QDISC_GET(dev) (dev->qdisc_sleeping)
211#else
212#define QDISC_GET(dev) (netdev_get_tx_queue(dev, 0)->qdisc_sleeping)
213#endif
214
215#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
216#define QDISC_SAVED_NUM(dev) 1
217#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
218#define QDISC_SAVED_NUM(dev) dev->num_tx_queues
219#else
220#define QDISC_SAVED_NUM(dev) dev->num_tx_queues+1
221#endif
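/*
 * The extra slot on 2.6.32+ holds the root qdisc pointer that newer kernels
 * keep in struct net_device itself, in addition to the per-queue pointers;
 * see vboxNetFltLinuxQdiscInstall() and vboxNetFltLinuxQdiscRemove() below.
 */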
222
223#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
224#define QDISC_IS_BUSY(dev, qdisc) test_bit(__LINK_STATE_SCHED, &dev->state)
225#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
226#define QDISC_IS_BUSY(dev, qdisc) (test_bit(__QDISC_STATE_RUNNING, &qdisc->state) || \
227 test_bit(__QDISC_STATE_SCHED, &qdisc->state))
228#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
229
230struct VBoxNetQDiscPriv
231{
232 /** Pointer to the single child qdisc. */
233 struct Qdisc *pChild;
234 /*
235 * Technically it is possible to have different qdiscs for different TX
236 * queues so we have to save them all.
237 */
238 /** Pointer to the array of saved qdiscs. */
239 struct Qdisc **ppSaved;
240 /** Pointer to the net filter instance. */
241 PVBOXNETFLTINS pVBoxNetFlt;
242};
243typedef struct VBoxNetQDiscPriv *PVBOXNETQDISCPRIV;
244
245//#define VBOXNETFLT_QDISC_ENQUEUE
246static int vboxNetFltQdiscEnqueue(struct sk_buff *skb, struct Qdisc *sch)
247{
248 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
249 int rc;
250
251#ifdef VBOXNETFLT_QDISC_ENQUEUE
252 if (VALID_PTR(pPriv->pVBoxNetFlt))
253 {
254 uint8_t abHdrBuf[sizeof(RTNETETHERHDR) + sizeof(uint32_t) + RTNETIPV4_MIN_LEN];
255 PCRTNETETHERHDR pEtherHdr;
256 PINTNETTRUNKSWPORT pSwitchPort;
257 uint32_t cbHdrs = skb_headlen(skb);
258
259 cbHdrs = RT_MIN(cbHdrs, sizeof(abHdrBuf));
260 pEtherHdr = (PCRTNETETHERHDR)skb_header_pointer(skb, 0, cbHdrs, &abHdrBuf[0]);
261 if ( pEtherHdr
262 && (pSwitchPort = pPriv->pVBoxNetFlt->pSwitchPort) != NULL
263 && VALID_PTR(pSwitchPort)
264 && cbHdrs >= 6)
265 {
266 /** @todo consider reference counting, etc. */
267 INTNETSWDECISION enmDecision = pSwitchPort->pfnPreRecv(pSwitchPort, pEtherHdr, cbHdrs, INTNETTRUNKDIR_HOST);
268 if (enmDecision == INTNETSWDECISION_INTNET)
269 {
270 struct sk_buff *pBuf = skb_copy(skb, GFP_ATOMIC);
271 pBuf->pkt_type = PACKET_OUTGOING;
272 vboxNetFltLinuxForwardToIntNet(pPriv->pVBoxNetFlt, pBuf);
273 qdisc_drop(skb, sch);
274 ++sch->bstats.packets;
275 sch->bstats.bytes += qdisc_pkt_len(skb);
276 return NET_XMIT_SUCCESS;
277 }
278 }
279 }
280#endif /* VBOXNETFLT_QDISC_ENQUEUE */
281 rc = pPriv->pChild->enqueue(skb, pPriv->pChild);
282 if (rc == NET_XMIT_SUCCESS)
283 {
284 ++sch->q.qlen;
285 ++sch->bstats.packets;
286 sch->bstats.bytes += qdisc_pkt_len(skb);
287 }
288 else
289 ++sch->qstats.drops;
290 return rc;
291}
292
293static struct sk_buff *vboxNetFltQdiscDequeue(struct Qdisc *sch)
294{
295 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
296#ifdef VBOXNETFLT_QDISC_ENQUEUE
297 --sch->q.qlen;
298 return pPriv->pChild->dequeue(pPriv->pChild);
299#else /* VBOXNETFLT_QDISC_ENQUEUE */
300 uint8_t abHdrBuf[sizeof(RTNETETHERHDR) + sizeof(uint32_t) + RTNETIPV4_MIN_LEN];
301 PCRTNETETHERHDR pEtherHdr;
302 PINTNETTRUNKSWPORT pSwitchPort;
303 struct sk_buff *pSkb;
304
305 QDISC_LOG(("vboxNetFltDequeue: Enter pThis=%p\n", pPriv->pVBoxNetFlt));
306
307 while ((pSkb = pPriv->pChild->dequeue(pPriv->pChild)) != NULL)
308 {
309 struct sk_buff *pBuf;
310 INTNETSWDECISION enmDecision;
311 uint32_t cbHdrs;
312
313 --sch->q.qlen;
314
315 if (!VALID_PTR(pPriv->pVBoxNetFlt))
316 break;
317
318 cbHdrs = skb_headlen(pSkb);
319 cbHdrs = RT_MIN(cbHdrs, sizeof(abHdrBuf));
320 pEtherHdr = (PCRTNETETHERHDR)skb_header_pointer(pSkb, 0, cbHdrs, &abHdrBuf[0]);
321 if ( !pEtherHdr
322 || (pSwitchPort = pPriv->pVBoxNetFlt->pSwitchPort) == NULL
323 || !VALID_PTR(pSwitchPort)
324 || cbHdrs < 6)
325 break;
326
327 /** @todo consider reference counting, etc. */
328 enmDecision = pSwitchPort->pfnPreRecv(pSwitchPort, pEtherHdr, cbHdrs, INTNETTRUNKDIR_HOST);
329 if (enmDecision != INTNETSWDECISION_INTNET)
330 break;
331
332 pBuf = skb_copy(pSkb, GFP_ATOMIC);
333 pBuf->pkt_type = PACKET_OUTGOING;
334 QDISC_LOG(("vboxNetFltDequeue: pThis=%p\n", pPriv->pVBoxNetFlt));
335 vboxNetFltLinuxForwardToIntNet(pPriv->pVBoxNetFlt, pBuf);
336 qdisc_drop(pSkb, sch);
337 QDISC_LOG(("VBoxNetFlt: Packet for %02x:%02x:%02x:%02x:%02x:%02x dropped\n",
338 pSkb->data[0], pSkb->data[1], pSkb->data[2],
339 pSkb->data[3], pSkb->data[4], pSkb->data[5]));
340 }
341
342 return pSkb;
343#endif /* VBOXNETFLT_QDISC_ENQUEUE */
344}
345
346#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 29)
347static int vboxNetFltQdiscRequeue(struct sk_buff *skb, struct Qdisc *sch)
348{
349 int rc;
350 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
351
352 rc = pPriv->pChild->ops->requeue(skb, pPriv->pChild);
353 if (rc == 0)
354 {
355 sch->q.qlen++;
356#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 10)
357 sch->qstats.requeues++;
358#endif
359 }
360
361 return rc;
362}
363#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 29) */
364
365static unsigned int vboxNetFltQdiscDrop(struct Qdisc *sch)
366{
367 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
368 unsigned int cbLen;
369
370 if (pPriv->pChild->ops->drop)
371 {
372 cbLen = pPriv->pChild->ops->drop(pPriv->pChild);
373 if (cbLen != 0)
374 {
375 ++sch->qstats.drops;
376 --sch->q.qlen;
377 return cbLen;
378 }
379 }
380
381 return 0;
382}
383
384#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 25)
385static int vboxNetFltQdiscInit(struct Qdisc *sch, struct rtattr *opt)
386#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
387static int vboxNetFltQdiscInit(struct Qdisc *sch, struct nlattr *opt)
388#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
389{
390 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
391 struct net_device *pDev = qdisc_dev(sch);
392
393 pPriv->pVBoxNetFlt = NULL;
394
395 pPriv->ppSaved = kcalloc(QDISC_SAVED_NUM(pDev), sizeof(pPriv->ppSaved[0]),
396 GFP_KERNEL);
397 if (!pPriv->ppSaved)
398 return -ENOMEM;
399
400 pPriv->pChild = QDISC_CREATE(pDev, netdev_get_tx_queue(pDev, 0),
401 &pfifo_qdisc_ops,
402 TC_H_MAKE(TC_H_MAJ(sch->handle),
403 TC_H_MIN(1)));
404 if (!pPriv->pChild)
405 {
406 kfree(pPriv->ppSaved);
407 pPriv->ppSaved = NULL;
408 return -ENOMEM;
409 }
410
411 return 0;
412}
413
414static void vboxNetFltQdiscReset(struct Qdisc *sch)
415{
416 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
417
418 qdisc_reset(pPriv->pChild);
419 sch->q.qlen = 0;
420 sch->qstats.backlog = 0;
421}
422
423static void vboxNetFltQdiscDestroy(struct Qdisc* sch)
424{
425 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
426 struct net_device *pDev = qdisc_dev(sch);
427
428 qdisc_destroy(pPriv->pChild);
429 pPriv->pChild = NULL;
430
431 if (pPriv->ppSaved)
432 {
433 int i;
434 for (i = 0; i < QDISC_SAVED_NUM(pDev); i++)
435 if (pPriv->ppSaved[i])
436 qdisc_destroy(pPriv->ppSaved[i]);
437 kfree(pPriv->ppSaved);
438 pPriv->ppSaved = NULL;
439 }
440}
441
442static int vboxNetFltClassGraft(struct Qdisc *sch, unsigned long arg, struct Qdisc *pNew,
443 struct Qdisc **ppOld)
444{
445 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
446
447 if (pNew == NULL)
448 pNew = &noop_qdisc;
449
450 sch_tree_lock(sch);
451 *ppOld = pPriv->pChild;
452 pPriv->pChild = pNew;
453#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
454 sch->q.qlen = 0;
455#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20) */
456 qdisc_tree_decrease_qlen(*ppOld, (*ppOld)->q.qlen);
457#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20) */
458 qdisc_reset(*ppOld);
459 sch_tree_unlock(sch);
460
461 return 0;
462}
463
464static struct Qdisc *vboxNetFltClassLeaf(struct Qdisc *sch, unsigned long arg)
465{
466 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
467 return pPriv->pChild;
468}
469
470static unsigned long vboxNetFltClassGet(struct Qdisc *sch, u32 classid)
471{
472 return 1;
473}
474
475static void vboxNetFltClassPut(struct Qdisc *sch, unsigned long arg)
476{
477}
478
479#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 25)
480static int vboxNetFltClassChange(struct Qdisc *sch, u32 classid, u32 parentid,
481 struct rtattr **tca, unsigned long *arg)
482#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
483static int vboxNetFltClassChange(struct Qdisc *sch, u32 classid, u32 parentid,
484 struct nlattr **tca, unsigned long *arg)
485#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) */
486{
487 return -ENOSYS;
488}
489
490static int vboxNetFltClassDelete(struct Qdisc *sch, unsigned long arg)
491{
492 return -ENOSYS;
493}
494
495static void vboxNetFltClassWalk(struct Qdisc *sch, struct qdisc_walker *walker)
496{
497 if (!walker->stop) {
498 if (walker->count >= walker->skip)
499 if (walker->fn(sch, 1, walker) < 0) {
500 walker->stop = 1;
501 return;
502 }
503 walker->count++;
504 }
505}
506
507static struct tcf_proto **vboxNetFltClassFindTcf(struct Qdisc *sch, unsigned long cl)
508{
509 return NULL;
510}
511
512static int vboxNetFltClassDump(struct Qdisc *sch, unsigned long cl,
513 struct sk_buff *skb, struct tcmsg *tcm)
514{
515 PVBOXNETQDISCPRIV pPriv = qdisc_priv(sch);
516
517 if (cl != 1)
518 return -ENOENT;
519
520 tcm->tcm_handle |= TC_H_MIN(1);
521 tcm->tcm_info = pPriv->pChild->handle;
522
523 return 0;
524}
525
526
527static struct Qdisc_class_ops g_VBoxNetFltClassOps =
528{
529 .graft = vboxNetFltClassGraft,
530 .leaf = vboxNetFltClassLeaf,
531 .get = vboxNetFltClassGet,
532 .put = vboxNetFltClassPut,
533 .change = vboxNetFltClassChange,
534 .delete = vboxNetFltClassDelete,
535 .walk = vboxNetFltClassWalk,
536 .tcf_chain = vboxNetFltClassFindTcf,
537 .dump = vboxNetFltClassDump,
538};
539
540
541static struct Qdisc_ops g_VBoxNetFltQDiscOps = {
542 .cl_ops = &g_VBoxNetFltClassOps,
543 .id = "vboxnetflt",
544 .priv_size = sizeof(struct VBoxNetQDiscPriv),
545 .enqueue = vboxNetFltQdiscEnqueue,
546 .dequeue = vboxNetFltQdiscDequeue,
547#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 29)
548 .requeue = vboxNetFltQdiscRequeue,
549#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29) */
550 .peek = qdisc_peek_dequeued,
551#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29) */
552 .drop = vboxNetFltQdiscDrop,
553 .init = vboxNetFltQdiscInit,
554 .reset = vboxNetFltQdiscReset,
555 .destroy = vboxNetFltQdiscDestroy,
556 .owner = THIS_MODULE
557};
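/*
 * These ops are registered with register_qdisc() in VBoxNetFltLinuxInit()
 * below, making "vboxnetflt" available as a queueing discipline.
 */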
558
559/*
560 * If our qdisc is already attached to the device (the user installed it
561 * with the 'tc' command; see the example after this function) we simply
562 * update the pointer to the vboxnetflt instance in the qdisc's private
563 * structure. Otherwise we need to take some additional steps:
564 * - Create our qdisc;
565 * - Save all references to qdiscs;
566 * - Replace our child with the first qdisc reference;
567 * - Replace all references so they point to our qdisc.
568 */
569static void vboxNetFltLinuxQdiscInstall(PVBOXNETFLTINS pThis, struct net_device *pDev)
570{
571#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
572 int i;
573#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
574 PVBOXNETQDISCPRIV pPriv;
575
576 struct Qdisc *pExisting = QDISC_GET(pDev);
577 if (strcmp(pExisting->ops->id, "vboxnetflt"))
578 {
579 /* The existing qdisc is different from ours, let's create new one. */
580 struct Qdisc *pNew = QDISC_CREATE(pDev, netdev_get_tx_queue(pDev, 0),
581 &g_VBoxNetFltQDiscOps, TC_H_ROOT);
582 if (!pNew)
583 return; // TODO: Error?
584
585 if (!try_module_get(THIS_MODULE))
586 {
587 /*
588 * This may cause a memory leak but calling qdisc_destroy()
589 * is not an option as it will call module_put().
590 */
591 return;
592 }
593 pPriv = qdisc_priv(pNew);
594
595 qdisc_destroy(pPriv->pChild);
596 pPriv->pChild = QDISC_GET(pDev);
597 atomic_inc(&pPriv->pChild->refcnt);
598 /*
599 * There is no need in deactivating the device or acquiring any locks
600 * prior changing qdiscs since we do not destroy the old qdisc.
601 * Atomic replacement of pointers is enough.
602 */
603 /*
604 * No need to change reference counters here as we merely move
605 * the pointer and the reference counter of the newly allocated
606 * qdisc is already 1.
607 */
608#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
609 pPriv->ppSaved[0] = pDev->qdisc_sleeping;
610 ASMAtomicWritePtr(&pDev->qdisc_sleeping, pNew);
611 ASMAtomicWritePtr(&pDev->qdisc, pNew);
612#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
613 for (i = 0; i < pDev->num_tx_queues; i++)
614 {
615 struct netdev_queue *pQueue = netdev_get_tx_queue(pDev, i);
616
617 pPriv->ppSaved[i] = pQueue->qdisc_sleeping;
618 ASMAtomicWritePtr(&pQueue->qdisc_sleeping, pNew);
619 ASMAtomicWritePtr(&pQueue->qdisc, pNew);
620 if (i)
621 atomic_inc(&pNew->refcnt);
622 }
623 /* Newer kernels store root qdisc in netdev structure as well. */
624# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
625 pPriv->ppSaved[pDev->num_tx_queues] = pDev->qdisc;
626 ASMAtomicWritePtr(&pDev->qdisc, pNew);
627 atomic_inc(&pNew->refcnt);
628# endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32) */
629#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
630 /* Sync the queue length with our child. */
631 pNew->q.qlen = pPriv->pChild->q.qlen;
632 }
633 else
634 {
635 /* We already have vboxnetflt qdisc, let's use it. */
636 pPriv = qdisc_priv(pExisting);
637 }
638 ASMAtomicWritePtr(&pPriv->pVBoxNetFlt, pThis);
639 QDISC_LOG(("vboxNetFltLinuxInstallQdisc: pThis=%p\n", pPriv->pVBoxNetFlt));
640}
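/*
 * For reference, attaching the qdisc manually (the already-attached case
 * handled above) would presumably be done with something like the following
 * command, using a hypothetical device name:
 *
 *   tc qdisc add dev eth0 root vboxnetflt
 *
 * in which case this function merely stores pThis in the qdisc's private
 * data instead of grafting a new root qdisc.
 */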
641
642static void vboxNetFltLinuxQdiscRemove(PVBOXNETFLTINS pThis, struct net_device *pDev)
643{
644#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
645 int i;
646#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
647 PVBOXNETQDISCPRIV pPriv;
648 struct Qdisc *pQdisc, *pChild;
649 if (!pDev)
650 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
651 if (!VALID_PTR(pDev))
652 {
653 printk("VBoxNetFlt: Failed to detach qdisc, invalid device pointer: %p\n",
654 pDev);
655 return; // TODO: Consider returning an error
656 }
657
658
659 pQdisc = QDISC_GET(pDev);
660 if (strcmp(pQdisc->ops->id, "vboxnetflt"))
661 {
662 /* Looks like the user has replaced our qdisc manually. */
663 printk("VBoxNetFlt: Failed to detach qdisc, wrong qdisc: %s\n",
664 pQdisc->ops->id);
665 return; // TODO: Consider returning an error
666 }
667
668 pPriv = qdisc_priv(pQdisc);
669 Assert(pPriv->pVBoxNetFlt == pThis);
670 ASMAtomicWriteNullPtr(&pPriv->pVBoxNetFlt);
671 pChild = ASMAtomicXchgPtrT(&pPriv->pChild, &noop_qdisc, struct Qdisc *);
672 qdisc_destroy(pChild); /* It won't be the last reference. */
673
674 QDISC_LOG(("vboxNetFltLinuxQdiscRemove: refcnt=%d num_tx_queues=%d\n",
675 atomic_read(&pQdisc->refcnt), pDev->num_tx_queues));
676#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)
677 /* Play it safe, make sure the qdisc is not being used. */
678 if (pPriv->ppSaved[0])
679 {
680 ASMAtomicWritePtr(&pDev->qdisc_sleeping, pPriv->ppSaved[0]);
681 ASMAtomicWritePtr(&pDev->qdisc, pPriv->ppSaved[0]);
682 pPriv->ppSaved[0] = NULL;
683 while (QDISC_IS_BUSY(pDev, pQdisc))
684 yield();
685 qdisc_destroy(pQdisc); /* Destroy reference */
686 }
687#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
688 for (i = 0; i < pDev->num_tx_queues; i++)
689 {
690 struct netdev_queue *pQueue = netdev_get_tx_queue(pDev, i);
691 if (pPriv->ppSaved[i])
692 {
693 Assert(pQueue->qdisc_sleeping == pQdisc);
694 ASMAtomicWritePtr(&pQueue->qdisc_sleeping, pPriv->ppSaved[i]);
695 ASMAtomicWritePtr(&pQueue->qdisc, pPriv->ppSaved[i]);
696 pPriv->ppSaved[i] = NULL;
697 while (QDISC_IS_BUSY(pDev, pQdisc))
698 yield();
699 qdisc_destroy(pQdisc); /* Destroy reference */
700 }
701 }
702 /* Newer kernels store root qdisc in netdev structure as well. */
703#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
704 ASMAtomicWritePtr(&pDev->qdisc, pPriv->ppSaved[pDev->num_tx_queues]);
705 pPriv->ppSaved[pDev->num_tx_queues] = NULL;
706 while (QDISC_IS_BUSY(pDev, pQdisc))
707 yield();
708 qdisc_destroy(pQdisc); /* Destroy reference */
709#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32) */
710#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) */
711
712 /*
713 * At this point all references to our qdisc should be gone
714 * unless the user had installed it manually.
715 */
716 QDISC_LOG(("vboxNetFltLinuxRemoveQdisc: pThis=%p\n", pPriv->pVBoxNetFlt));
717}
718
719#endif /* VBOXNETFLT_WITH_QDISC */
720
721
722/**
723 * Initialize module.
724 *
725 * @returns appropriate status code.
726 */
727static int __init VBoxNetFltLinuxInit(void)
728{
729 int rc;
730 /*
731 * Initialize IPRT.
732 */
733 rc = RTR0Init(0);
734 if (RT_SUCCESS(rc))
735 {
736 Log(("VBoxNetFltLinuxInit\n"));
737
738 /*
739 * Initialize the globals and connect to the support driver.
740 *
741 * This will call back vboxNetFltOsOpenSupDrv (and maybe vboxNetFltOsCloseSupDrv)
742 * for establishing the connection to the support driver.
743 */
744 memset(&g_VBoxNetFltGlobals, 0, sizeof(g_VBoxNetFltGlobals));
745 rc = vboxNetFltInitGlobalsAndIdc(&g_VBoxNetFltGlobals);
746 if (RT_SUCCESS(rc))
747 {
748#ifdef VBOXNETFLT_WITH_QDISC
749 /*memcpy(&g_VBoxNetFltQDiscOps, &pfifo_qdisc_ops, sizeof(g_VBoxNetFltQDiscOps));
750 strcpy(g_VBoxNetFltQDiscOps.id, "vboxnetflt");
751 g_VBoxNetFltQDiscOps.owner = THIS_MODULE;*/
752 rc = register_qdisc(&g_VBoxNetFltQDiscOps);
753 if (rc)
754 {
755 LogRel(("VBoxNetFlt: Failed to register qdisc: %d\n", rc));
756 return rc;
757 }
758#endif /* VBOXNETFLT_WITH_QDISC */
759 LogRel(("VBoxNetFlt: Successfully started.\n"));
760 return 0;
761 }
762
763 LogRel(("VBoxNetFlt: failed to initialize device extension (rc=%d)\n", rc));
764 RTR0Term();
765 }
766 else
767 LogRel(("VBoxNetFlt: failed to initialize IPRT (rc=%d)\n", rc));
768
769 memset(&g_VBoxNetFltGlobals, 0, sizeof(g_VBoxNetFltGlobals));
770 return -RTErrConvertToErrno(rc);
771}
772
773
774/**
775 * Unload the module.
776 *
777 * @todo We have to prevent this if we're busy!
778 */
779static void __exit VBoxNetFltLinuxUnload(void)
780{
781 int rc;
782 Log(("VBoxNetFltLinuxUnload\n"));
783 Assert(vboxNetFltCanUnload(&g_VBoxNetFltGlobals));
784
785#ifdef VBOXNETFLT_WITH_QDISC
786 unregister_qdisc(&g_VBoxNetFltQDiscOps);
787#endif /* VBOXNETFLT_WITH_QDISC */
788 /*
789 * Undo the work done during start (in reverse order).
790 */
791 rc = vboxNetFltTryDeleteIdcAndGlobals(&g_VBoxNetFltGlobals);
792 AssertRC(rc); NOREF(rc);
793
794 RTR0Term();
795
796 memset(&g_VBoxNetFltGlobals, 0, sizeof(g_VBoxNetFltGlobals));
797
798 Log(("VBoxNetFltLinuxUnload - done\n"));
799}
800
801
802/**
803 * Experiment where we filter traffic from the host to the internal network
804 * before it reaches the NIC driver.
805 *
806 * The current code uses a very ugly hack and only works on kernels using the
807 * net_device_ops (>= 2.6.29). It has been shown to give us a
808 * performance boost of 60-100% though. So, we have to find some less hacky way
809 * of getting this job done eventually.
810 *
811 * #define VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
812 */
813#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
814
815/**
816 * The overridden net_device_ops of the device we're attached to.
817 *
818 * Requires Linux 2.6.29 or later.
819 *
820 * This is a very dirty hack that was created to explore how much we can improve
821 * the host to guest transfers by not CC'ing the NIC.
822 */
823typedef struct VBoxNetDeviceOpsOverride
824{
825 /** Our overridden ops. */
826 struct net_device_ops Ops;
827 /** Magic word. */
828 uint32_t u32Magic;
829 /** Pointer to the original ops. */
830 struct net_device_ops const *pOrgOps;
831 /** Pointer to the net filter instance. */
832 PVBOXNETFLTINS pVBoxNetFlt;
833 /** The number of filtered packets. */
834 uint64_t cFiltered;
835 /** The total number of packets. */
836 uint64_t cTotal;
837} VBOXNETDEVICEOPSOVERRIDE, *PVBOXNETDEVICEOPSOVERRIDE;
838/** VBOXNETDEVICEOPSOVERRIDE::u32Magic value. */
839#define VBOXNETDEVICEOPSOVERRIDE_MAGIC UINT32_C(0x00c0ffee)
840
841/**
842 * ndo_start_xmit wrapper that drops packets that shouldn't go to the wire
843 * because they belong on the internal network.
844 *
845 * @returns NETDEV_TX_XXX.
846 * @param pSkb The socket buffer to transmit.
847 * @param pDev The net device.
848 */
849static int vboxNetFltLinuxStartXmitFilter(struct sk_buff *pSkb, struct net_device *pDev)
850{
851 PVBOXNETDEVICEOPSOVERRIDE pOverride = (PVBOXNETDEVICEOPSOVERRIDE)pDev->netdev_ops;
852 uint8_t abHdrBuf[sizeof(RTNETETHERHDR) + sizeof(uint32_t) + RTNETIPV4_MIN_LEN];
853 PCRTNETETHERHDR pEtherHdr;
854 PINTNETTRUNKSWPORT pSwitchPort;
855 uint32_t cbHdrs;
856
857
858 /*
859 * Validate the override structure.
860 *
861 * Note! We're racing vboxNetFltLinuxUnhookDev here. If this was supposed
862 * to be production quality code, we would have to be much more
863 * careful here and avoid the race.
864 */
865 if ( !VALID_PTR(pOverride)
866 || pOverride->u32Magic != VBOXNETDEVICEOPSOVERRIDE_MAGIC
867 || !VALID_PTR(pOverride->pOrgOps))
868 {
869 printk("vboxNetFltLinuxStartXmitFilter: bad override %p\n", pOverride);
870 dev_kfree_skb(pSkb);
871 return NETDEV_TX_OK;
872 }
873 pOverride->cTotal++;
874
875 /*
876 * Do the filtering based on the default OUI of our virtual NICs.
877 *
878 * Note! In a real solution, we would ask the switch whether the
879 * destination MAC is 100% certain to be on the internal network and
880 * then drop it.
881 */
882 cbHdrs = skb_headlen(pSkb);
883 cbHdrs = RT_MIN(cbHdrs, sizeof(abHdrBuf));
884 pEtherHdr = (PCRTNETETHERHDR)skb_header_pointer(pSkb, 0, cbHdrs, &abHdrBuf[0]);
885 if ( pEtherHdr
886 && VALID_PTR(pOverride->pVBoxNetFlt)
887 && (pSwitchPort = pOverride->pVBoxNetFlt->pSwitchPort) != NULL
888 && VALID_PTR(pSwitchPort)
889 && cbHdrs >= 6)
890 {
891 INTNETSWDECISION enmDecision;
892
893 /** @todo consider reference counting, etc. */
894 enmDecision = pSwitchPort->pfnPreRecv(pSwitchPort, pEtherHdr, cbHdrs, INTNETTRUNKDIR_HOST);
895 if (enmDecision == INTNETSWDECISION_INTNET)
896 {
897 dev_kfree_skb(pSkb);
898 pOverride->cFiltered++;
899 return NETDEV_TX_OK;
900 }
901 }
902
903 return pOverride->pOrgOps->ndo_start_xmit(pSkb, pDev);
904}
905
906/**
907 * Hooks the ndo_start_xmit operation of the device.
908 *
909 * @param pThis The net filter instance.
910 * @param pDev The net device.
911 */
912static void vboxNetFltLinuxHookDev(PVBOXNETFLTINS pThis, struct net_device *pDev)
913{
914 PVBOXNETDEVICEOPSOVERRIDE pOverride;
915 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
916
917 pOverride = RTMemAlloc(sizeof(*pOverride));
918 if (!pOverride)
919 return;
920 pOverride->pOrgOps = pDev->netdev_ops;
921 pOverride->Ops = *pDev->netdev_ops;
922 pOverride->Ops.ndo_start_xmit = vboxNetFltLinuxStartXmitFilter;
923 pOverride->u32Magic = VBOXNETDEVICEOPSOVERRIDE_MAGIC;
924 pOverride->cTotal = 0;
925 pOverride->cFiltered = 0;
926 pOverride->pVBoxNetFlt = pThis;
927
928 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp); /* (this isn't necessary, but so what) */
929 ASMAtomicWritePtr((void * volatile *)&pDev->netdev_ops, pOverride);
930 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
931}
932
933/**
934 * Undoes what vboxNetFltLinuxHookDev did.
935 *
936 * @param pThis The net filter instance.
937 * @param pDev The net device. Can be NULL, in which case
938 * we'll try retrieve it from @a pThis.
939 */
940static void vboxNetFltLinuxUnhookDev(PVBOXNETFLTINS pThis, struct net_device *pDev)
941{
942 PVBOXNETDEVICEOPSOVERRIDE pOverride;
943 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
944
945 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
946 if (!pDev)
947 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
948 if (VALID_PTR(pDev))
949 {
950 pOverride = (PVBOXNETDEVICEOPSOVERRIDE)pDev->netdev_ops;
951 if ( VALID_PTR(pOverride)
952 && pOverride->u32Magic == VBOXNETDEVICEOPSOVERRIDE_MAGIC
953 && VALID_PTR(pOverride->pOrgOps)
954 )
955 {
956 ASMAtomicWritePtr((void * volatile *)&pDev->netdev_ops, pOverride->pOrgOps);
957 ASMAtomicWriteU32(&pOverride->u32Magic, 0);
958 }
959 else
960 pOverride = NULL;
961 }
962 else
963 pOverride = NULL;
964 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
965
966 if (pOverride)
967 {
968 printk("vboxnetflt: dropped %llu out of %llu packets\n", pOverride->cFiltered, pOverride->cTotal);
969 RTMemFree(pOverride);
970 }
971}
972
973#endif /* VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT */
974
975
976/**
977 * Reads and retains the host interface handle.
978 *
979 * @returns The handle, NULL if detached.
980 * @param pThis The instance.
981 */
982DECLINLINE(struct net_device *) vboxNetFltLinuxRetainNetDev(PVBOXNETFLTINS pThis)
983{
984#if 0
985 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
986 struct net_device *pDev = NULL;
987
988 Log(("vboxNetFltLinuxRetainNetDev\n"));
989 /*
990 * Be careful here to avoid problems racing the detached callback.
991 */
992 RTSpinlockAcquire(pThis->hSpinlock, &Tmp);
993 if (!ASMAtomicUoReadBool(&pThis->fDisconnectedFromHost))
994 {
995 pDev = (struct net_device *)ASMAtomicUoReadPtr((void * volatile *)&pThis->u.s.pDev);
996 if (pDev)
997 {
998 dev_hold(pDev);
999 Log(("vboxNetFltLinuxRetainNetDev: Device %p(%s) retained. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1000 }
1001 }
1002 RTSpinlockRelease(pThis->hSpinlock, &Tmp);
1003
1004 Log(("vboxNetFltLinuxRetainNetDev - done\n"));
1005 return pDev;
1006#else
1007 return ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1008#endif
1009}
1010
1011
1012/**
1013 * Release the host interface handle previously retained
1014 * by vboxNetFltLinuxRetainNetDev.
1015 *
1016 * @param pThis The instance.
1017 * @param pDev The vboxNetFltLinuxRetainNetDev
1018 * return value, NULL is fine.
1019 */
1020DECLINLINE(void) vboxNetFltLinuxReleaseNetDev(PVBOXNETFLTINS pThis, struct net_device *pDev)
1021{
1022#if 0
1023 Log(("vboxNetFltLinuxReleaseNetDev\n"));
1024 NOREF(pThis);
1025 if (pDev)
1026 {
1027 dev_put(pDev);
1028 Log(("vboxNetFltLinuxReleaseNetDev: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1029 }
1030 Log(("vboxNetFltLinuxReleaseNetDev - done\n"));
1031#endif
1032}
1033
1034#define VBOXNETFLT_CB_TAG(skb) (0xA1C90000 | (skb->dev->ifindex & 0xFFFF))
1035#define VBOXNETFLT_SKB_TAG(skb) (*(uint32_t*)&((skb)->cb[sizeof((skb)->cb)-sizeof(uint32_t)]))
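/*
 * The tag lives in the last four bytes of the sk_buff control buffer (cb)
 * and combines a magic value with the interface index, e.g. 0xA1C90003 for
 * ifindex 3. vboxNetFltLinuxSkBufFromSG stamps every buffer it injects and
 * vboxNetFltLinuxSkBufIsOur below recognizes the stamp, so we do not loop
 * our own packets back into the internal network.
 */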
1036
1037/**
1038 * Checks whether this is an sk_buff created by vboxNetFltLinuxSkBufFromSG,
1039 * i.e. a buffer which we're pushing and should be ignored by the filter callbacks.
1040 *
1041 * @returns true / false accordingly.
1042 * @param pBuf The sk_buff.
1043 */
1044DECLINLINE(bool) vboxNetFltLinuxSkBufIsOur(struct sk_buff *pBuf)
1045{
1046 return VBOXNETFLT_SKB_TAG(pBuf) == VBOXNETFLT_CB_TAG(pBuf);
1047}
1048
1049
1050/**
1051 * Internal worker that creates a Linux sk_buff for a
1052 * (scatter/)gather list.
1053 *
1054 * @returns Pointer to the sk_buff.
1055 * @param pThis The instance.
1056 * @param pSG The (scatter/)gather list.
1057 * @param fDstWire Set if the destination is the wire.
1058 */
1059static struct sk_buff *vboxNetFltLinuxSkBufFromSG(PVBOXNETFLTINS pThis, PINTNETSG pSG, bool fDstWire)
1060{
1061 struct sk_buff *pPkt;
1062 struct net_device *pDev;
1063 unsigned fGsoType = 0;
1064
1065 if (pSG->cbTotal == 0)
1066 {
1067 LogRel(("VBoxNetFlt: Dropped empty packet coming from internal network.\n"));
1068 return NULL;
1069 }
1070
1071 /** @todo We should use fragments mapping the SG buffers with large packets.
1072 * 256 bytes seems to be a threshold used a lot for this. It
1073 * requires some nasty work on the intnet side though... */
1074 /*
1075 * Allocate a packet and copy over the data.
1076 */
1077 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1078 pPkt = dev_alloc_skb(pSG->cbTotal + NET_IP_ALIGN);
1079 if (RT_UNLIKELY(!pPkt))
1080 {
1081 Log(("vboxNetFltLinuxSkBufFromSG: Failed to allocate sk_buff(%u).\n", pSG->cbTotal));
1082 pSG->pvUserData = NULL;
1083 return NULL;
1084 }
1085 pPkt->dev = pDev;
1086 pPkt->ip_summed = CHECKSUM_NONE;
1087
1088 /* Align IP header on 16-byte boundary: 2 + 14 (ethernet hdr size). */
1089 skb_reserve(pPkt, NET_IP_ALIGN);
1090
1091 /* Copy the segments. */
1092 skb_put(pPkt, pSG->cbTotal);
1093 IntNetSgRead(pSG, pPkt->data);
1094
1095#if defined(VBOXNETFLT_WITH_GSO_XMIT_WIRE) || defined(VBOXNETFLT_WITH_GSO_XMIT_HOST)
1096 /*
1097 * Setup GSO if used by this packet.
1098 */
1099 switch ((PDMNETWORKGSOTYPE)pSG->GsoCtx.u8Type)
1100 {
1101 default:
1102 AssertMsgFailed(("%u (%s)\n", pSG->GsoCtx.u8Type, PDMNetGsoTypeName((PDMNETWORKGSOTYPE)pSG->GsoCtx.u8Type) ));
1103 /* fall thru */
1104 case PDMNETWORKGSOTYPE_INVALID:
1105 fGsoType = 0;
1106 break;
1107 case PDMNETWORKGSOTYPE_IPV4_TCP:
1108 fGsoType = SKB_GSO_TCPV4;
1109 break;
1110 case PDMNETWORKGSOTYPE_IPV4_UDP:
1111 fGsoType = SKB_GSO_UDP;
1112 break;
1113 case PDMNETWORKGSOTYPE_IPV6_TCP:
1114 fGsoType = SKB_GSO_TCPV6;
1115 break;
1116 }
1117 if (fGsoType)
1118 {
1119 struct skb_shared_info *pShInfo = skb_shinfo(pPkt);
1120
1121 pShInfo->gso_type = fGsoType | SKB_GSO_DODGY;
1122 pShInfo->gso_size = pSG->GsoCtx.cbMaxSeg;
1123 pShInfo->gso_segs = PDMNetGsoCalcSegmentCount(&pSG->GsoCtx, pSG->cbTotal);
1124
1125 /*
1126 * We need to set checksum fields even if the packet goes to the host
1127 * directly as it may be immediately forwarded by the IP layer (@bugref{5020}).
1128 */
1129 Assert(skb_headlen(pPkt) >= pSG->GsoCtx.cbHdrs);
1130 pPkt->ip_summed = CHECKSUM_PARTIAL;
1131# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
1132 pPkt->csum_start = skb_headroom(pPkt) + pSG->GsoCtx.offHdr2;
1133 if (fGsoType & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
1134 pPkt->csum_offset = RT_OFFSETOF(RTNETTCP, th_sum);
1135 else
1136 pPkt->csum_offset = RT_OFFSETOF(RTNETUDP, uh_sum);
1137# else
1138 pPkt->h.raw = pPkt->data + pSG->GsoCtx.offHdr2;
1139 if (fGsoType & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
1140 pPkt->csum = RT_OFFSETOF(RTNETTCP, th_sum);
1141 else
1142 pPkt->csum = RT_OFFSETOF(RTNETUDP, uh_sum);
1143# endif
1144 if (!fDstWire)
1145 PDMNetGsoPrepForDirectUse(&pSG->GsoCtx, pPkt->data, pSG->cbTotal, PDMNETCSUMTYPE_PSEUDO);
1146 }
1147#endif /* VBOXNETFLT_WITH_GSO_XMIT_WIRE || VBOXNETFLT_WITH_GSO_XMIT_HOST */
1148
1149 /*
1150 * Finish up the socket buffer.
1151 */
1152 pPkt->protocol = eth_type_trans(pPkt, pDev);
1153 if (fDstWire)
1154 {
1155 VBOX_SKB_RESET_NETWORK_HDR(pPkt);
1156
1157 /* Restore ethernet header back. */
1158 skb_push(pPkt, ETH_HLEN); /** @todo VLAN: +4 if VLAN? */
1159 VBOX_SKB_RESET_MAC_HDR(pPkt);
1160 }
1161 VBOXNETFLT_SKB_TAG(pPkt) = VBOXNETFLT_CB_TAG(pPkt);
1162
1163 return pPkt;
1164}
1165
1166
1167/**
1168 * Initializes a SG list from an sk_buff.
1169 *
1170 * @param pThis The instance.
1171 * @param pBuf The sk_buff.
1172 * @param pSG The SG.
1173 * @param cSegs The number of segments allocated for the SG.
1174 * This should match the number of segments in
1175 * the sk_buff exactly!
1176 * @param fSrc The source of the frame.
1177 * @param pGsoCtx Pointer to the GSO context if it's a GSO
1178 * internal network frame, NULL for a regular
1179 * frame.
1180 */
1181DECLINLINE(void) vboxNetFltLinuxSkBufToSG(PVBOXNETFLTINS pThis, struct sk_buff *pBuf, PINTNETSG pSG,
1182 unsigned cSegs, uint32_t fSrc, PCPDMNETWORKGSO pGsoCtx)
1183{
1184 int i;
1185 NOREF(pThis);
1186
1187 Assert(!skb_shinfo(pBuf)->frag_list);
1188
1189 if (!pGsoCtx)
1190 IntNetSgInitTempSegs(pSG, pBuf->len, cSegs, 0 /*cSegsUsed*/);
1191 else
1192 IntNetSgInitTempSegsGso(pSG, pBuf->len, cSegs, 0 /*cSegsUsed*/, pGsoCtx);
1193
1194#ifdef VBOXNETFLT_SG_SUPPORT
1195 pSG->aSegs[0].cb = skb_headlen(pBuf);
1196 pSG->aSegs[0].pv = pBuf->data;
1197 pSG->aSegs[0].Phys = NIL_RTHCPHYS;
1198
1199 for (i = 0; i < skb_shinfo(pBuf)->nr_frags; i++)
1200 {
1201 skb_frag_t *pFrag = &skb_shinfo(pBuf)->frags[i];
1202 pSG->aSegs[i+1].cb = pFrag->size;
1203 pSG->aSegs[i+1].pv = kmap(pFrag->page);
1204 printk("%p = kmap()\n", pSG->aSegs[i+1].pv);
1205 pSG->aSegs[i+1].Phys = NIL_RTHCPHYS;
1206 }
1207 ++i;
1208
1209#else
1210 pSG->aSegs[0].cb = pBuf->len;
1211 pSG->aSegs[0].pv = pBuf->data;
1212 pSG->aSegs[0].Phys = NIL_RTHCPHYS;
1213 i = 1;
1214#endif
1215
1216 pSG->cSegsUsed = i;
1217
1218#ifdef PADD_RUNT_FRAMES_FROM_HOST
1219 /*
1220 * Add a trailer if the frame is too small.
1221 *
1222 * Since we're getting to the packet before it is framed, it has not
1223 * yet been padded. The current solution is to add a segment pointing
1224 * to a buffer containing all zeros and pray that works for all frames...
1225 */
1226 if (pSG->cbTotal < 60 && (fSrc & INTNETTRUNKDIR_HOST))
1227 {
1228 static uint8_t const s_abZero[128] = {0};
1229
1230 AssertReturnVoid(i < cSegs);
1231
1232 pSG->aSegs[i].Phys = NIL_RTHCPHYS;
1233 pSG->aSegs[i].pv = (void *)&s_abZero[0];
1234 pSG->aSegs[i].cb = 60 - pSG->cbTotal;
1235 pSG->cbTotal = 60;
1236 pSG->cSegsUsed++;
1237 Assert(i + 1 <= pSG->cSegsAlloc);
1238 }
1239#endif
1240
1241 Log4(("vboxNetFltLinuxSkBufToSG: allocated=%d, segments=%d frags=%d next=%p frag_list=%p pkt_type=%x fSrc=%x\n",
1242 pSG->cSegsAlloc, pSG->cSegsUsed, skb_shinfo(pBuf)->nr_frags, pBuf->next, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type, fSrc));
1243 for (i = 0; i < pSG->cSegsUsed; i++)
1244 Log4(("vboxNetFltLinuxSkBufToSG: #%d: cb=%d pv=%p\n",
1245 i, pSG->aSegs[i].cb, pSG->aSegs[i].pv));
1246}
1247
1248/**
1249 * Packet handler.
1250 *
1251 * @returns 0 (the return value is ignored by the kernel).
1252 * @param pBuf The socket buffer.
1253 * @param pSkbDev The device the packet (allegedly) arrived on.
1254 * @param pPacketType Our packet type structure, used to locate the
1255 * net filter instance.
1256 * @param pOrigDev The original device (Linux 2.6.14 and later).
1257 */
1258#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 14)
1259static int vboxNetFltLinuxPacketHandler(struct sk_buff *pBuf,
1260 struct net_device *pSkbDev,
1261 struct packet_type *pPacketType,
1262 struct net_device *pOrigDev)
1263#else
1264static int vboxNetFltLinuxPacketHandler(struct sk_buff *pBuf,
1265 struct net_device *pSkbDev,
1266 struct packet_type *pPacketType)
1267#endif
1268{
1269 PVBOXNETFLTINS pThis;
1270 struct net_device *pDev;
1271 LogFlow(("vboxNetFltLinuxPacketHandler: pBuf=%p pSkbDev=%p pPacketType=%p\n",
1272 pBuf, pSkbDev, pPacketType));
1273#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
1274 Log3(("vboxNetFltLinuxPacketHandler: skb len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_seqs=%u gso_type=%x frag_list=%p pkt_type=%x\n",
1275 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->gso_size, skb_shinfo(pBuf)->gso_segs, skb_shinfo(pBuf)->gso_type, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1276 Log4(("vboxNetFltLinuxPacketHandler: packet dump follows:\n%.*Rhxd\n", pBuf->len-pBuf->data_len, skb_mac_header(pBuf)));
1277#else
1278 Log3(("vboxNetFltLinuxPacketHandler: skb len=%u data_len=%u truesize=%u next=%p nr_frags=%u tso_size=%u tso_seqs=%u frag_list=%p pkt_type=%x\n",
1279 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->tso_size, skb_shinfo(pBuf)->tso_segs, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1280#endif
1281 /*
1282 * Drop it immediately?
1283 */
1284 if (!pBuf)
1285 return 0;
1286
1287 pThis = VBOX_FLT_PT_TO_INST(pPacketType);
1288 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1289 if (pThis->u.s.pDev != pSkbDev)
1290 {
1291 Log(("vboxNetFltLinuxPacketHandler: Devices do not match, pThis may be wrong! pThis=%p\n", pThis));
1292 return 0;
1293 }
1294
1295 Log4(("vboxNetFltLinuxPacketHandler: pBuf->cb dump:\n%.*Rhxd\n", sizeof(pBuf->cb), pBuf->cb));
1296 if (vboxNetFltLinuxSkBufIsOur(pBuf))
1297 {
1298 Log2(("vboxNetFltLinuxPacketHandler: got our own sk_buff, drop it.\n"));
1299 dev_kfree_skb(pBuf);
1300 return 0;
1301 }
1302
1303#ifndef VBOXNETFLT_SG_SUPPORT
1304 {
1305 /*
1306 * Get rid of fragmented packets, they cause too much trouble.
1307 */
1308 struct sk_buff *pCopy = skb_copy(pBuf, GFP_ATOMIC);
1309 kfree_skb(pBuf);
1310 if (!pCopy)
1311 {
1312 LogRel(("VBoxNetFlt: Failed to allocate packet buffer, dropping the packet.\n"));
1313 return 0;
1314 }
1315 pBuf = pCopy;
1316# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
1317 Log3(("vboxNetFltLinuxPacketHandler: skb copy len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_seqs=%u gso_type=%x frag_list=%p pkt_type=%x\n",
1318 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->gso_size, skb_shinfo(pBuf)->gso_segs, skb_shinfo(pBuf)->gso_type, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1319 Log4(("vboxNetFltLinuxPacketHandler: packet dump follows:\n%.*Rhxd\n", pBuf->len-pBuf->data_len, skb_mac_header(pBuf)));
1320# else
1321 Log3(("vboxNetFltLinuxPacketHandler: skb copy len=%u data_len=%u truesize=%u next=%p nr_frags=%u tso_size=%u tso_seqs=%u frag_list=%p pkt_type=%x\n",
1322 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->tso_size, skb_shinfo(pBuf)->tso_segs, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type));
1323# endif
1324 }
1325#endif
1326
1327#ifdef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
1328 /* Forward it to the internal network. */
1329 vboxNetFltLinuxForwardToIntNet(pThis, pBuf);
1330#else
1331 /* Add the packet to transmit queue and schedule the bottom half. */
1332 skb_queue_tail(&pThis->u.s.XmitQueue, pBuf);
1333 schedule_work(&pThis->u.s.XmitTask);
1334 Log4(("vboxNetFltLinuxPacketHandler: scheduled work %p for sk_buff %p\n",
1335 &pThis->u.s.XmitTask, pBuf));
1336#endif
1337
1338 /* It does not really matter what we return, it is ignored by the kernel. */
1339 return 0;
1340}
1341
1342/**
1343 * Calculate the number of INTNETSEG segments the socket buffer will need.
1344 *
1345 * @returns Segment count.
1346 * @param pBuf The socket buffer.
1347 */
1348DECLINLINE(unsigned) vboxNetFltLinuxCalcSGSegments(struct sk_buff *pBuf)
1349{
1350#ifdef VBOXNETFLT_SG_SUPPORT
1351 unsigned cSegs = 1 + skb_shinfo(pBuf)->nr_frags;
1352#else
1353 unsigned cSegs = 1;
1354#endif
1355#ifdef PADD_RUNT_FRAMES_FROM_HOST
1356 /* vboxNetFltLinuxSkBufToSG adds a padding segment if it's a runt. */
1357 if (pBuf->len < 60)
1358 cSegs++;
1359#endif
1360 return cSegs;
1361}
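/*
 * Example: without VBOXNETFLT_SG_SUPPORT, a linear 42-byte ARP frame yields
 * cSegs = 2 (assuming PADD_RUNT_FRAMES_FROM_HOST is defined) -- one segment
 * for the data plus one for the zero padding vboxNetFltLinuxSkBufToSG
 * appends to host-sourced frames to reach the 60-byte minimum frame size.
 */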
1362
1363/**
1364 * Destroy the intnet scatter / gather buffer created by
1365 * vboxNetFltLinuxSkBufToSG.
1366 */
1367static void vboxNetFltLinuxDestroySG(PINTNETSG pSG)
1368{
1369#ifdef VBOXNETFLT_SG_SUPPORT
1370 int i;
1371
1372 for (i = 1; i < pSG->cSegsUsed; i++)
1373 {
1374 printk("kunmap(%p)\n", pSG->aSegs[i].pv);
1375 kunmap(pSG->aSegs[i].pv);
1376 }
1377#endif
1378 NOREF(pSG);
1379}
1380
1381#ifdef LOG_ENABLED
1382/**
1383 * Logging helper.
1384 */
1385static void vboxNetFltDumpPacket(PINTNETSG pSG, bool fEgress, const char *pszWhere, int iIncrement)
1386{
1387 uint8_t *pInt, *pExt;
1388 static int iPacketNo = 1;
1389 iPacketNo += iIncrement;
1390 if (fEgress)
1391 {
1392 pExt = pSG->aSegs[0].pv;
1393 pInt = pExt + 6;
1394 }
1395 else
1396 {
1397 pInt = pSG->aSegs[0].pv;
1398 pExt = pInt + 6;
1399 }
1400 Log(("VBoxNetFlt: (int)%02x:%02x:%02x:%02x:%02x:%02x"
1401 " %s (%s)%02x:%02x:%02x:%02x:%02x:%02x (%u bytes) packet #%u\n",
1402 pInt[0], pInt[1], pInt[2], pInt[3], pInt[4], pInt[5],
1403 fEgress ? "-->" : "<--", pszWhere,
1404 pExt[0], pExt[1], pExt[2], pExt[3], pExt[4], pExt[5],
1405 pSG->cbTotal, iPacketNo));
1406 Log3(("%.*Rhxd\n", pSG->aSegs[0].cb, pSG->aSegs[0].pv));
1407}
1408#else
1409# define vboxNetFltDumpPacket(a, b, c, d) do {} while (0)
1410#endif
1411
1412#ifdef VBOXNETFLT_WITH_GSO_RECV
1413
1414/**
1415 * Worker for vboxNetFltLinuxForwardToIntNet that checks if we can forward a
1416 * GSO socket buffer without having to segment it.
1417 *
1418 * @returns true on success, false if needs segmenting.
1419 * @param pThis The net filter instance.
1420 * @param pSkb The GSO socket buffer.
1421 * @param fSrc The source.
1422 * @param pGsoCtx Where to return the GSO context on success.
1423 */
1424static bool vboxNetFltLinuxCanForwardAsGso(PVBOXNETFLTINS pThis, struct sk_buff *pSkb, uint32_t fSrc,
1425 PPDMNETWORKGSO pGsoCtx)
1426{
1427 PDMNETWORKGSOTYPE enmGsoType;
1428 uint16_t uEtherType;
1429 unsigned int cbTransport;
1430 unsigned int offTransport;
1431 unsigned int cbTransportHdr;
1432 unsigned uProtocol;
1433 union
1434 {
1435 RTNETIPV4 IPv4;
1436 RTNETIPV6 IPv6;
1437 RTNETTCP Tcp;
1438 uint8_t ab[40];
1439 uint16_t au16[40/2];
1440 uint32_t au32[40/4];
1441 } Buf;
1442
1443 /*
1444 * Check the GSO properties of the socket buffer and make sure it fits.
1445 */
1446 /** @todo Figure out how to handle SKB_GSO_TCP_ECN! */
1447 if (RT_UNLIKELY( skb_shinfo(pSkb)->gso_type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCPV6 | SKB_GSO_TCPV4) ))
1448 {
1449 Log5(("vboxNetFltLinuxCanForwardAsGso: gso_type=%#x\n", skb_shinfo(pSkb)->gso_type));
1450 return false;
1451 }
1452 if (RT_UNLIKELY( skb_shinfo(pSkb)->gso_size < 1
1453 || pSkb->len > VBOX_MAX_GSO_SIZE ))
1454 {
1455 Log5(("vboxNetFltLinuxCanForwardAsGso: gso_size=%#x skb_len=%#x (max=%#x)\n", skb_shinfo(pSkb)->gso_size, pSkb->len, VBOX_MAX_GSO_SIZE));
1456 return false;
1457 }
1458 /*
1459 * It is possible to receive GSO packets from wire if GRO is enabled.
1460 */
1461 if (RT_UNLIKELY(fSrc & INTNETTRUNKDIR_WIRE))
1462 {
1463 Log5(("vboxNetFltLinuxCanForwardAsGso: fSrc=wire\n"));
1464#ifdef VBOXNETFLT_WITH_GRO
1465 /*
1466 * The packet came from the wire and the driver has already consumed
1467 * the MAC header. We need to restore it.
1468 */
1469 pSkb->mac_len = skb_network_header(pSkb) - skb_mac_header(pSkb);
1470 skb_push(pSkb, pSkb->mac_len);
1471 Log5(("vboxNetFltLinuxCanForwardAsGso: mac_len=%d data=%p mac_header=%p network_header=%p\n",
1472 pSkb->mac_len, pSkb->data, skb_mac_header(pSkb), skb_network_header(pSkb)));
1473#else /* !VBOXNETFLT_WITH_GRO */
1474 /* Older kernels didn't have GRO. */
1475 return false;
1476#endif /* !VBOXNETFLT_WITH_GRO */
1477 }
1478 else
1479 {
1480 /*
1481 * skb_gso_segment does the following. Do we need to do it as well?
1482 */
1483#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
1484 skb_reset_mac_header(pSkb);
1485 pSkb->mac_len = pSkb->network_header - pSkb->mac_header;
1486#else
1487 pSkb->mac.raw = pSkb->data;
1488 pSkb->mac_len = pSkb->nh.raw - pSkb->data;
1489#endif
1490 }
1491
1492 /*
1493 * Switch on the ethertype.
1494 */
1495 uEtherType = pSkb->protocol;
1496 if ( uEtherType == RT_H2N_U16_C(RTNET_ETHERTYPE_VLAN)
1497 && pSkb->mac_len == sizeof(RTNETETHERHDR) + sizeof(uint32_t))
1498 {
1499 uint16_t const *puEtherType = skb_header_pointer(pSkb, sizeof(RTNETETHERHDR) + sizeof(uint16_t), sizeof(uint16_t), &Buf);
1500 if (puEtherType)
1501 uEtherType = *puEtherType;
1502 }
1503 switch (uEtherType)
1504 {
1505 case RT_H2N_U16_C(RTNET_ETHERTYPE_IPV4):
1506 {
1507 unsigned int cbHdr;
1508 PCRTNETIPV4 pIPv4 = (PCRTNETIPV4)skb_header_pointer(pSkb, pSkb->mac_len, sizeof(Buf.IPv4), &Buf);
1509 if (RT_UNLIKELY(!pIPv4))
1510 {
1511 Log5(("vboxNetFltLinuxCanForwardAsGso: failed to access IPv4 hdr\n"));
1512 return false;
1513 }
1514
1515 cbHdr = pIPv4->ip_hl * 4;
1516 cbTransport = RT_N2H_U16(pIPv4->ip_len);
1517 if (RT_UNLIKELY( cbHdr < RTNETIPV4_MIN_LEN
1518 || cbHdr > cbTransport ))
1519 {
1520 Log5(("vboxNetFltLinuxCanForwardAsGso: invalid IPv4 lengths: ip_hl=%u ip_len=%u\n", pIPv4->ip_hl, RT_N2H_U16(pIPv4->ip_len)));
1521 return false;
1522 }
1523 cbTransport -= cbHdr;
1524 offTransport = pSkb->mac_len + cbHdr;
1525 uProtocol = pIPv4->ip_p;
1526 if (uProtocol == RTNETIPV4_PROT_TCP)
1527 enmGsoType = PDMNETWORKGSOTYPE_IPV4_TCP;
1528 else if (uProtocol == RTNETIPV4_PROT_UDP)
1529 enmGsoType = PDMNETWORKGSOTYPE_IPV4_UDP;
1530 else /** @todo IPv6: 4to6 tunneling */
1531 enmGsoType = PDMNETWORKGSOTYPE_INVALID;
1532 break;
1533 }
1534
1535 case RT_H2N_U16_C(RTNET_ETHERTYPE_IPV6):
1536 {
1537 PCRTNETIPV6 pIPv6 = (PCRTNETIPV6)skb_header_pointer(pSkb, pSkb->mac_len, sizeof(Buf.IPv6), &Buf);
1538 if (RT_UNLIKELY(!pIPv6))
1539 {
1540 Log5(("vboxNetFltLinuxCanForwardAsGso: failed to access IPv6 hdr\n"));
1541 return false;
1542 }
1543
1544 cbTransport = RT_N2H_U16(pIPv6->ip6_plen);
1545 offTransport = pSkb->mac_len + sizeof(RTNETIPV6);
1546 uProtocol = pIPv6->ip6_nxt;
1547 /** @todo IPv6: Dig our way out of the other headers. */
1548 if (uProtocol == RTNETIPV4_PROT_TCP)
1549 enmGsoType = PDMNETWORKGSOTYPE_IPV6_TCP;
1550 else if (uProtocol == RTNETIPV4_PROT_UDP)
1551 enmGsoType = PDMNETWORKGSOTYPE_IPV6_UDP;
1552 else
1553 enmGsoType = PDMNETWORKGSOTYPE_INVALID;
1554 break;
1555 }
1556
1557 default:
1558 Log5(("vboxNetFltLinuxCanForwardAsGso: uEtherType=%#x\n", RT_H2N_U16(uEtherType)));
1559 return false;
1560 }
1561
1562 if (enmGsoType == PDMNETWORKGSOTYPE_INVALID)
1563 {
1564 Log5(("vboxNetFltLinuxCanForwardAsGso: Unsupported protocol %d\n", uProtocol));
1565 return false;
1566 }
1567
1568 if (RT_UNLIKELY( offTransport + cbTransport <= offTransport
1569 || offTransport + cbTransport > pSkb->len
1570 || cbTransport < (uProtocol == RTNETIPV4_PROT_TCP ? RTNETTCP_MIN_LEN : RTNETUDP_MIN_LEN)) )
1571 {
1572 Log5(("vboxNetFltLinuxCanForwardAsGso: Bad transport length; off=%#x + cb=%#x => %#x; skb_len=%#x (%s)\n",
1573 offTransport, cbTransport, offTransport + cbTransport, pSkb->len, PDMNetGsoTypeName(enmGsoType) ));
1574 return false;
1575 }
1576
1577 /*
1578 * Check the TCP/UDP bits.
1579 */
1580 if (uProtocol == RTNETIPV4_PROT_TCP)
1581 {
1582 PCRTNETTCP pTcp = (PCRTNETTCP)skb_header_pointer(pSkb, offTransport, sizeof(Buf.Tcp), &Buf);
1583 if (RT_UNLIKELY(!pTcp))
1584 {
1585 Log5(("vboxNetFltLinuxCanForwardAsGso: failed to access TCP hdr\n"));
1586 return false;
1587 }
1588
1589 cbTransportHdr = pTcp->th_off * 4;
1590 if (RT_UNLIKELY( cbTransportHdr < RTNETTCP_MIN_LEN
1591 || cbTransportHdr > cbTransport
1592 || offTransport + cbTransportHdr >= UINT8_MAX
1593 || offTransport + cbTransportHdr >= pSkb->len ))
1594 {
1595 Log5(("vboxNetFltLinuxCanForwardAsGso: No space for TCP header; off=%#x cb=%#x skb_len=%#x\n", offTransport, cbTransportHdr, pSkb->len));
1596 return false;
1597 }
1598
1599 }
1600 else
1601 {
1602 Assert(uProtocol == RTNETIPV4_PROT_UDP);
1603 cbTransportHdr = sizeof(RTNETUDP);
1604 if (RT_UNLIKELY( offTransport + cbTransportHdr >= UINT8_MAX
1605 || offTransport + cbTransportHdr >= pSkb->len ))
1606 {
1607 Log5(("vboxNetFltLinuxCanForwardAsGso: No space for UDP header; off=%#x skb_len=%#x\n", offTransport, pSkb->len));
1608 return false;
1609 }
1610 }
1611
1612 /*
1613 * We're good, init the GSO context.
1614 */
1615 pGsoCtx->u8Type = enmGsoType;
1616 pGsoCtx->cbHdrs = offTransport + cbTransportHdr;
1617 pGsoCtx->cbMaxSeg = skb_shinfo(pSkb)->gso_size;
1618 pGsoCtx->offHdr1 = pSkb->mac_len;
1619 pGsoCtx->offHdr2 = offTransport;
1620 pGsoCtx->au8Unused[0] = 0;
1621 pGsoCtx->au8Unused[1] = 0;
1622
1623 return true;
1624}
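/*
 * Worked example: for a TCP/IPv4 GSO buffer with a plain 14-byte Ethernet
 * header, a 20-byte IPv4 header and a 20-byte TCP header, the code above
 * yields offHdr1 = 14, offHdr2 = 14 + 20 = 34 and cbHdrs = 34 + 20 = 54,
 * while cbMaxSeg is taken straight from gso_size.
 */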
1625
1626/**
1627 * Forward the socket buffer as a GSO internal network frame.
1628 *
1629 * @returns IPRT status code.
1630 * @param pThis The net filter instance.
1631 * @param pSkb The GSO socket buffer.
1632 * @param fSrc The source.
1633 * @param pGsoCtx The GSO context to use.
1634 */
1635static int vboxNetFltLinuxForwardAsGso(PVBOXNETFLTINS pThis, struct sk_buff *pSkb, uint32_t fSrc, PCPDMNETWORKGSO pGsoCtx)
1636{
1637 int rc;
1638 unsigned cSegs = vboxNetFltLinuxCalcSGSegments(pSkb);
1639 if (RT_LIKELY(cSegs <= MAX_SKB_FRAGS + 1))
1640 {
1641 PINTNETSG pSG = (PINTNETSG)alloca(RT_OFFSETOF(INTNETSG, aSegs[cSegs]));
1642 if (RT_LIKELY(pSG))
1643 {
1644 vboxNetFltLinuxSkBufToSG(pThis, pSkb, pSG, cSegs, fSrc, pGsoCtx);
1645
1646 vboxNetFltDumpPacket(pSG, false, (fSrc & INTNETTRUNKDIR_HOST) ? "host" : "wire", 1);
1647 pThis->pSwitchPort->pfnRecv(pThis->pSwitchPort, NULL /* pvIf */, pSG, fSrc);
1648
1649 vboxNetFltLinuxDestroySG(pSG);
1650 rc = VINF_SUCCESS;
1651 }
1652 else
1653 {
1654 Log(("VBoxNetFlt: Dropping the sk_buff (failure case).\n"));
1655 rc = VERR_NO_MEMORY;
1656 }
1657 }
1658 else
1659 {
1660 Log(("VBoxNetFlt: Bad sk_buff? cSegs=%#x.\n", cSegs));
1661 rc = VERR_INTERNAL_ERROR_3;
1662 }
1663
1664 Log4(("VBoxNetFlt: Dropping the sk_buff.\n"));
1665 dev_kfree_skb(pSkb);
1666 return rc;
1667}
1668
1669#endif /* VBOXNETFLT_WITH_GSO_RECV */
1670
1671/**
1672 * Worker for vboxNetFltLinuxForwardToIntNet.
1673 *
1674 * @returns VINF_SUCCESS, VERR_NO_MEMORY or VERR_INTERNAL_ERROR_3.
1675 * @param pThis The net filter instance.
1676 * @param pBuf The socket buffer.
1677 * @param fSrc The source.
1678 */
1679static int vboxNetFltLinuxForwardSegment(PVBOXNETFLTINS pThis, struct sk_buff *pBuf, uint32_t fSrc)
1680{
1681 int rc;
1682 unsigned cSegs = vboxNetFltLinuxCalcSGSegments(pBuf);
1683 if (cSegs <= MAX_SKB_FRAGS + 1)
1684 {
1685 PINTNETSG pSG = (PINTNETSG)alloca(RT_OFFSETOF(INTNETSG, aSegs[cSegs]));
1686 if (RT_LIKELY(pSG))
1687 {
1688 if (fSrc & INTNETTRUNKDIR_WIRE)
1689 {
1690 /*
1691 * The packet came from the wire; the Ethernet header was removed by
1692 * the device driver. Restore it.
1693 */
1694 skb_push(pBuf, ETH_HLEN);
1695 }
1696
1697 vboxNetFltLinuxSkBufToSG(pThis, pBuf, pSG, cSegs, fSrc, NULL /*pGsoCtx*/);
1698
1699 vboxNetFltDumpPacket(pSG, false, (fSrc & INTNETTRUNKDIR_HOST) ? "host" : "wire", 1);
1700 pThis->pSwitchPort->pfnRecv(pThis->pSwitchPort, NULL /* pvIf */, pSG, fSrc);
1701
1702 vboxNetFltLinuxDestroySG(pSG);
1703 rc = VINF_SUCCESS;
1704 }
1705 else
1706 {
1707 Log(("VBoxNetFlt: Failed to allocate SG buffer.\n"));
1708 rc = VERR_NO_MEMORY;
1709 }
1710 }
1711 else
1712 {
1713 Log(("VBoxNetFlt: Bad sk_buff? cSegs=%#x.\n", cSegs));
1714 rc = VERR_INTERNAL_ERROR_3;
1715 }
1716
1717 Log4(("VBoxNetFlt: Freeing the sk_buff.\n"));
1718 dev_kfree_skb(pBuf);
1719 return rc;
1720}
1721
1722/** Forward the socket buffer to the internal network, segmenting it first if necessary.
1723 * @param pThis The net filter instance.
1724 * @param pBuf  The socket buffer. This is consumed by this function.
1725 */
1726static void vboxNetFltLinuxForwardToIntNet(PVBOXNETFLTINS pThis, struct sk_buff *pBuf)
1727{
1728 uint32_t fSrc = pBuf->pkt_type == PACKET_OUTGOING ? INTNETTRUNKDIR_HOST : INTNETTRUNKDIR_WIRE;
1729
1730#ifdef VBOXNETFLT_WITH_GSO
1731 if (skb_is_gso(pBuf))
1732 {
1733 PDMNETWORKGSO GsoCtx;
1734 Log3(("vboxNetFltLinuxForwardToIntNet: skb len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_segs=%u gso_type=%x frag_list=%p pkt_type=%x ip_summed=%d\n",
1735 pBuf->len, pBuf->data_len, pBuf->truesize, pBuf->next, skb_shinfo(pBuf)->nr_frags, skb_shinfo(pBuf)->gso_size, skb_shinfo(pBuf)->gso_segs, skb_shinfo(pBuf)->gso_type, skb_shinfo(pBuf)->frag_list, pBuf->pkt_type, pBuf->ip_summed));
1736# ifdef VBOXNETFLT_WITH_GSO_RECV
1737 if ( (skb_shinfo(pBuf)->gso_type & (SKB_GSO_UDP | SKB_GSO_TCPV6 | SKB_GSO_TCPV4))
1738 && vboxNetFltLinuxCanForwardAsGso(pThis, pBuf, fSrc, &GsoCtx) )
1739 vboxNetFltLinuxForwardAsGso(pThis, pBuf, fSrc, &GsoCtx);
1740 else
1741# endif
1742 {
1743 /* Need to segment the packet */
1744 struct sk_buff *pNext;
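            /* skb_gso_segment() returns a ->next linked list of segments (or an
               ERR_PTR on failure); it does not free the original skb, which is
               why pBuf is freed separately once the list has been forwarded. */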
1745 struct sk_buff *pSegment = skb_gso_segment(pBuf, 0 /*supported features*/);
1746 if (IS_ERR(pSegment))
1747 {
1748 dev_kfree_skb(pBuf);
1749 LogRel(("VBoxNetFlt: Failed to segment a packet (%d).\n", PTR_ERR(pSegment)));
1750 return;
1751 }
1752
1753 for (; pSegment; pSegment = pNext)
1754 {
1755 Log3(("vboxNetFltLinuxForwardToIntNet: segment len=%u data_len=%u truesize=%u next=%p nr_frags=%u gso_size=%u gso_segs=%u gso_type=%x frag_list=%p pkt_type=%x\n",
1756 pSegment->len, pSegment->data_len, pSegment->truesize, pSegment->next, skb_shinfo(pSegment)->nr_frags, skb_shinfo(pSegment)->gso_size, skb_shinfo(pSegment)->gso_segs, skb_shinfo(pSegment)->gso_type, skb_shinfo(pSegment)->frag_list, pSegment->pkt_type));
1757 pNext = pSegment->next;
1758 pSegment->next = NULL;
1759 vboxNetFltLinuxForwardSegment(pThis, pSegment, fSrc);
1760 }
1761 dev_kfree_skb(pBuf);
1762 }
1763 }
1764 else
1765#endif /* VBOXNETFLT_WITH_GSO */
1766 {
1767 if (pBuf->ip_summed == CHECKSUM_PARTIAL && pBuf->pkt_type == PACKET_OUTGOING)
1768 {
1769#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
1770 /*
1771 * Try to work around the problem with CentOS 4.7 and 5.2 (2.6.9
1772 * and 2.6.18 kernels): they pass a wrong 'h' pointer down. We take the
1773 * IP header length from the header itself and reconstruct the 'h'
1774 * pointer to the TCP (or whatever) header.
1775 */
1776 unsigned char *tmp = pBuf->h.raw;
1777 if (pBuf->h.raw == pBuf->nh.raw && pBuf->protocol == htons(ETH_P_IP))
1778 pBuf->h.raw = pBuf->nh.raw + pBuf->nh.iph->ihl * 4;
1779#endif /* LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18) */
1780 if (VBOX_SKB_CHECKSUM_HELP(pBuf))
1781 {
1782 LogRel(("VBoxNetFlt: Failed to compute checksum, dropping the packet.\n"));
1783 dev_kfree_skb(pBuf);
1784 return;
1785 }
1786#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
1787 /* Restore the original (wrong) pointer. */
1788 pBuf->h.raw = tmp;
1789#endif /* LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18) */
1790 }
1791 vboxNetFltLinuxForwardSegment(pThis, pBuf, fSrc);
1792 }
1793}
1794
1795#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
1796/**
1797 * Work queue handler that forwards the socket buffers queued by
1798 * vboxNetFltLinuxPacketHandler to the internal network.
1799 *
1800 * @param pWork The work queue.
1801 */
1802# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20)
1803static void vboxNetFltLinuxXmitTask(struct work_struct *pWork)
1804# else
1805static void vboxNetFltLinuxXmitTask(void *pWork)
1806# endif
1807{
1808 PVBOXNETFLTINS pThis = VBOX_FLT_XT_TO_INST(pWork);
1809 struct sk_buff *pBuf;
1810
1811 Log4(("vboxNetFltLinuxXmitTask: Got work %p.\n", pWork));
1812
1813 /*
1814 * Active? Retain the instance and increment the busy counter.
1815 */
1816 if (vboxNetFltTryRetainBusyActive(pThis))
1817 {
1818 while ((pBuf = skb_dequeue(&pThis->u.s.XmitQueue)) != NULL)
1819 vboxNetFltLinuxForwardToIntNet(pThis, pBuf);
1820
1821 vboxNetFltRelease(pThis, true /* fBusy */);
1822 }
1823 else
1824 {
1825 /** @todo Shouldn't we just drop the packets here? There is little point in
1826 * making them accumulate when the VM is paused and it'll only waste
1827 * kernel memory anyway... Hmm, maybe wait a short while (2-5 secs)
1828 * before starting to drain the packets (goes for the intnet ring buf
1829 * too)? */
1830 }
1831}
1832#endif /* !VBOXNETFLT_LINUX_NO_XMIT_QUEUE */
1833
1834/**
1835 * Reports the GSO capabilities of the hardware NIC.
1836 *
1837 * @param pThis The net filter instance. The caller holds a
1838 * reference to this.
1839 */
1840static void vboxNetFltLinuxReportNicGsoCapabilities(PVBOXNETFLTINS pThis)
1841{
1842#ifdef VBOXNETFLT_WITH_GSO_XMIT_WIRE
1843 if (vboxNetFltTryRetainBusyNotDisconnected(pThis))
1844 {
1845 struct net_device *pDev;
1846 PINTNETTRUNKSWPORT pSwitchPort;
1847 unsigned int fFeatures;
1848 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
1849
1850 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1851
1852 pSwitchPort = pThis->pSwitchPort; /* this snapshot isn't strictly needed here, but it does no harm. */
1853 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1854 if (pDev)
1855 fFeatures = pDev->features;
1856 else
1857 fFeatures = 0;
1858
1859 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1860
1861 if (pThis->pSwitchPort)
1862 {
1863 /* Set/update the GSO capabilities of the NIC. */
1864 uint32_t fGsoCapabilities = 0;
1865 if (fFeatures & NETIF_F_TSO)
1866 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_TCP);
1867 if (fFeatures & NETIF_F_TSO6)
1868 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_TCP);
1869 # if 0 /** @todo GSO: Test UDP offloading (UFO) on linux. */
1870 if (fFeatures & NETIF_F_UFO)
1871 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_UDP);
1872 if (fFeatures & NETIF_F_UFO)
1873 fGsoCapabilities |= RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_UDP);
1874 # endif
1875 pThis->pSwitchPort->pfnReportGsoCapabilities(pThis->pSwitchPort, fGsoCapabilities, INTNETTRUNKDIR_WIRE);
1876 }
1877
1878 vboxNetFltRelease(pThis, true /*fBusy*/);
1879 }
1880#endif /* VBOXNETFLT_WITH_GSO_XMIT_WIRE */
1881}
1882
1883/**
1884 * Helper that determines whether the host (ignoring us) is operating the
1885 * interface in promiscuous mode.
1886 */
1887static bool vboxNetFltLinuxPromiscuous(PVBOXNETFLTINS pThis)
1888{
1889 bool fRc = false;
1890 struct net_device * pDev = vboxNetFltLinuxRetainNetDev(pThis);
1891 if (pDev)
1892 {
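        /* pDev->promiscuity is a reference count of promiscuous-mode requests;
           subtracting our own contribution (1 when fPromiscuousSet) leaves the
           count held by everyone else, and a non-zero remainder means the host
           itself has the interface in promiscuous mode. */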
1893 fRc = !!(pDev->promiscuity - (ASMAtomicUoReadBool(&pThis->u.s.fPromiscuousSet) & 1));
1894 LogFlow(("vboxNetFltPortOsIsPromiscuous: returns %d, pDev->promiscuity=%d, fPromiscuousSet=%d\n",
1895 fRc, pDev->promiscuity, pThis->u.s.fPromiscuousSet));
1896 vboxNetFltLinuxReleaseNetDev(pThis, pDev);
1897 }
1898 return fRc;
1899}
1900
1901/**
1902 * Internal worker for vboxNetFltLinuxNotifierCallback.
1903 *
1904 * @returns VBox status code.
1905 * @param pThis The instance.
1906 * @param pDev  The net_device to attach to; this function retains its
1907 *              own reference via dev_hold().
1908 */
1909static int vboxNetFltLinuxAttachToInterface(PVBOXNETFLTINS pThis, struct net_device *pDev)
1910{
1911 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
1912 LogFlow(("vboxNetFltLinuxAttachToInterface: pThis=%p (%s)\n", pThis, pThis->szName));
1913
1914 /*
1915 * Retain and store the device.
1916 */
1917 dev_hold(pDev);
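    /* dev_hold() bumps the net_device reference count, keeping the structure
       from being freed while we use it; each dev_hold() is paired with a
       dev_put() on the detach/error paths below. */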
1918
1919 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1920 ASMAtomicUoWritePtr(&pThis->u.s.pDev, pDev);
1921 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1922
1923 Log(("vboxNetFltLinuxAttachToInterface: Device %p(%s) retained. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1924 Log(("vboxNetFltLinuxAttachToInterface: Got pDev=%p pThis=%p pThis->u.s.pDev=%p\n", pDev, pThis, ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *)));
1925
1926 /* Get the mac address while we still have a valid net_device reference. */
1927 memcpy(&pThis->u.s.MacAddr, pDev->dev_addr, sizeof(pThis->u.s.MacAddr));
1928
1929 /*
1930 * Install a packet filter for this device with a protocol wildcard (ETH_P_ALL).
1931 */
1932 pThis->u.s.PacketType.type = __constant_htons(ETH_P_ALL);
1933 pThis->u.s.PacketType.dev = pDev;
1934 pThis->u.s.PacketType.func = vboxNetFltLinuxPacketHandler;
1935 dev_add_pack(&pThis->u.s.PacketType);
1936
1937#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
1938 vboxNetFltLinuxHookDev(pThis, pDev);
1939#endif
1940#ifdef VBOXNETFLT_WITH_QDISC
1941 vboxNetFltLinuxQdiscInstall(pThis, pDev);
1942#endif /* VBOXNETFLT_WITH_QDISC */
1943
1944 /*
1945 * Set the indicators that require the spinlock. Be a bit paranoid about
1946 * racing the device notification handler.
1947 */
1948 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1949 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
1950 if (pDev)
1951 {
1952 ASMAtomicUoWriteBool(&pThis->fDisconnectedFromHost, false);
1953 ASMAtomicUoWriteBool(&pThis->u.s.fRegistered, true);
1954 pDev = NULL; /* don't dereference it */
1955 }
1956 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1957 Log(("vboxNetFltLinuxAttachToInterface: this=%p: Packet handler installed.\n", pThis));
1958
1959 /*
1960 * If the above succeeded, report the GSO capabilities; if not, undo
1961 * everything and release the device.
1962 */
1963 if (!pDev)
1964 {
1965 Assert(pThis->pSwitchPort);
1966 if (vboxNetFltTryRetainBusyNotDisconnected(pThis))
1967 {
1968 vboxNetFltLinuxReportNicGsoCapabilities(pThis);
1969 pThis->pSwitchPort->pfnReportMacAddress(pThis->pSwitchPort, &pThis->u.s.MacAddr);
1970 pThis->pSwitchPort->pfnReportPromiscuousMode(pThis->pSwitchPort, vboxNetFltLinuxPromiscuous(pThis));
1971 pThis->pSwitchPort->pfnReportNoPreemptDsts(pThis->pSwitchPort, INTNETTRUNKDIR_WIRE | INTNETTRUNKDIR_HOST);
1972 vboxNetFltRelease(pThis, true /*fBusy*/);
1973 }
1974 }
1975 else
1976 {
1977#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
1978 vboxNetFltLinuxUnhookDev(pThis, pDev);
1979#endif
1980#ifdef VBOXNETFLT_WITH_QDISC
1981 vboxNetFltLinuxQdiscRemove(pThis, pDev);
1982#endif /* VBOXNETFLT_WITH_QDISC */
1983 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
1984 ASMAtomicUoWriteNullPtr(&pThis->u.s.pDev);
1985 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
1986 dev_put(pDev);
1987 Log(("vboxNetFltLinuxAttachToInterface: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
1988 }
1989
1990 LogRel(("VBoxNetFlt: attached to '%s' / %.*Rhxs\n", pThis->szName, sizeof(pThis->u.s.MacAddr), &pThis->u.s.MacAddr));
1991 return VINF_SUCCESS;
1992}
1993
1994
1995static int vboxNetFltLinuxUnregisterDevice(PVBOXNETFLTINS pThis, struct net_device *pDev)
1996{
1997 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
1998
1999 Assert(!pThis->fDisconnectedFromHost);
2000
2001#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
2002 vboxNetFltLinuxUnhookDev(pThis, pDev);
2003#endif
2004#ifdef VBOXNETFLT_WITH_QDISC
2005 vboxNetFltLinuxQdiscRemove(pThis, pDev);
2006#endif /* VBOXNETFLT_WITH_QDISC */
2007
2008 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
2009 ASMAtomicWriteBool(&pThis->u.s.fRegistered, false);
2010 ASMAtomicWriteBool(&pThis->fDisconnectedFromHost, true);
2011 ASMAtomicUoWriteNullPtr(&pThis->u.s.pDev);
2012 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
2013
2014 dev_remove_pack(&pThis->u.s.PacketType);
2015#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
2016 skb_queue_purge(&pThis->u.s.XmitQueue);
2017#endif
2018 Log(("vboxNetFltLinuxUnregisterDevice: this=%p: Packet handler removed, xmit queue purged.\n", pThis));
2019 Log(("vboxNetFltLinuxUnregisterDevice: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
2020 dev_put(pDev);
2021
2022 return NOTIFY_OK;
2023}
2024
2025static int vboxNetFltLinuxDeviceIsUp(PVBOXNETFLTINS pThis, struct net_device *pDev)
2026{
2027 /* Only enable promiscuous mode if we aren't suspended and haven't already set it. */
2028 if ( pThis->enmTrunkState == INTNETTRUNKIFSTATE_ACTIVE
2029 && !ASMAtomicUoReadBool(&pThis->u.s.fPromiscuousSet))
2030 {
2031 /* No locking needed here; the kernel holds the rtnl lock while running notifier callbacks. */
2032 dev_set_promiscuity(pDev, 1);
2033 ASMAtomicWriteBool(&pThis->u.s.fPromiscuousSet, true);
2034 Log(("vboxNetFltLinuxDeviceIsUp: enabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2035 }
2036 else
2037 Log(("vboxNetFltLinuxDeviceIsUp: no need to enable promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2038 return NOTIFY_OK;
2039}
2040
2041static int vboxNetFltLinuxDeviceGoingDown(PVBOXNETFLTINS pThis, struct net_device *pDev)
2042{
2043 /* Undo promiscuous mode if we have set it. */
2044 if (ASMAtomicUoReadBool(&pThis->u.s.fPromiscuousSet))
2045 {
2046 /* No locking needed here; the kernel holds the rtnl lock while running notifier callbacks. */
2047 dev_set_promiscuity(pDev, -1);
2048 ASMAtomicWriteBool(&pThis->u.s.fPromiscuousSet, false);
2049 Log(("vboxNetFltLinuxDeviceGoingDown: disabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2050 }
2051 else
2052 Log(("vboxNetFltLinuxDeviceGoingDown: no need to disable promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2053 return NOTIFY_OK;
2054}
2055
2056#ifdef LOG_ENABLED
2057/** Stringify the NETDEV_XXX constants. */
2058static const char *vboxNetFltLinuxGetNetDevEventName(unsigned long ulEventType)
2059{
2060 const char *pszEvent = "NETDEV_<unknown>";
2061 switch (ulEventType)
2062 {
2063 case NETDEV_REGISTER: pszEvent = "NETDEV_REGISTER"; break;
2064 case NETDEV_UNREGISTER: pszEvent = "NETDEV_UNREGISTER"; break;
2065 case NETDEV_UP: pszEvent = "NETDEV_UP"; break;
2066 case NETDEV_DOWN: pszEvent = "NETDEV_DOWN"; break;
2067 case NETDEV_REBOOT: pszEvent = "NETDEV_REBOOT"; break;
2068 case NETDEV_CHANGENAME: pszEvent = "NETDEV_CHANGENAME"; break;
2069 case NETDEV_CHANGE: pszEvent = "NETDEV_CHANGE"; break;
2070 case NETDEV_CHANGEMTU: pszEvent = "NETDEV_CHANGEMTU"; break;
2071 case NETDEV_CHANGEADDR: pszEvent = "NETDEV_CHANGEADDR"; break;
2072 case NETDEV_GOING_DOWN: pszEvent = "NETDEV_GOING_DOWN"; break;
2073# ifdef NETDEV_FEAT_CHANGE
2074 case NETDEV_FEAT_CHANGE: pszEvent = "NETDEV_FEAT_CHANGE"; break;
2075# endif
2076 }
2077 return pszEvent;
2078}
2079#endif /* LOG_ENABLED */
2080
2081/**
2082 * Callback for listening to netdevice events.
2083 *
2084 * This handles rediscovery, clean-up on unregistration, promiscuity on
2085 * up/down, and GSO feature changes signalled via ethtool.
2086 *
2087 * @returns NOTIFY_OK
2088 * @param self Pointer to our notifier registration block.
2089 * @param ulEventType The event.
2090 * @param ptr Event specific, but it is usually the device it
2091 * relates to.
2092 */
2093static int vboxNetFltLinuxNotifierCallback(struct notifier_block *self, unsigned long ulEventType, void *ptr)
2094
2095{
2096 PVBOXNETFLTINS pThis = VBOX_FLT_NB_TO_INST(self);
2097 struct net_device *pDev = (struct net_device *)ptr;
2098 int rc = NOTIFY_OK;
2099
2100 Log(("VBoxNetFlt: got event %s(0x%lx) on %s, pDev=%p pThis=%p pThis->u.s.pDev=%p\n",
2101 vboxNetFltLinuxGetNetDevEventName(ulEventType), ulEventType, pDev->name, pDev, pThis, ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *)));
2102 if ( ulEventType == NETDEV_REGISTER
2103 && !strcmp(pDev->name, pThis->szName))
2104 {
2105 vboxNetFltLinuxAttachToInterface(pThis, pDev);
2106 }
2107 else
2108 {
2109 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
2110 if (pDev == ptr)
2111 {
2112 switch (ulEventType)
2113 {
2114 case NETDEV_UNREGISTER:
2115 rc = vboxNetFltLinuxUnregisterDevice(pThis, pDev);
2116 break;
2117 case NETDEV_UP:
2118 rc = vboxNetFltLinuxDeviceIsUp(pThis, pDev);
2119 break;
2120 case NETDEV_GOING_DOWN:
2121 rc = vboxNetFltLinuxDeviceGoingDown(pThis, pDev);
2122 break;
2123 case NETDEV_CHANGENAME:
2124 break;
2125#ifdef NETDEV_FEAT_CHANGE
2126 case NETDEV_FEAT_CHANGE:
2127 vboxNetFltLinuxReportNicGsoCapabilities(pThis);
2128 break;
2129#endif
2130 }
2131 }
2132 }
2133
2134 return rc;
2135}
2136
2137bool vboxNetFltOsMaybeRediscovered(PVBOXNETFLTINS pThis)
2138{
2139 return !ASMAtomicUoReadBool(&pThis->fDisconnectedFromHost);
2140}
2141
2142int vboxNetFltPortOsXmit(PVBOXNETFLTINS pThis, void *pvIfData, PINTNETSG pSG, uint32_t fDst)
2143{
2144 struct net_device * pDev;
2145 int err;
2146 int rc = VINF_SUCCESS;
2147 NOREF(pvIfData);
2148
2149 LogFlow(("vboxNetFltPortOsXmit: pThis=%p (%s)\n", pThis, pThis->szName));
2150
2151 pDev = vboxNetFltLinuxRetainNetDev(pThis);
2152 if (pDev)
2153 {
2154 /*
2155 * Create a sk_buff for the gather list and push it onto the wire.
2156 */
2157 if (fDst & INTNETTRUNKDIR_WIRE)
2158 {
2159 struct sk_buff *pBuf = vboxNetFltLinuxSkBufFromSG(pThis, pSG, true);
2160 if (pBuf)
2161 {
2162 vboxNetFltDumpPacket(pSG, true, "wire", 1);
2163 Log4(("vboxNetFltPortOsXmit: pBuf->cb dump:\n%.*Rhxd\n", sizeof(pBuf->cb), pBuf->cb));
2164 Log4(("vboxNetFltPortOsXmit: dev_queue_xmit(%p)\n", pBuf));
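                /* dev_queue_xmit() hands the skb to the device's qdisc/driver
                   for transmission on the wire; it consumes the buffer even on
                   error, so no cleanup is needed here. */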
2165 err = dev_queue_xmit(pBuf);
2166 if (err)
2167 rc = RTErrConvertFromErrno(err);
2168 }
2169 else
2170 rc = VERR_NO_MEMORY;
2171 }
2172
2173 /*
2174 * Create a sk_buff for the gather list and push it onto the host stack.
2175 */
2176 if (fDst & INTNETTRUNKDIR_HOST)
2177 {
2178 struct sk_buff *pBuf = vboxNetFltLinuxSkBufFromSG(pThis, pSG, false);
2179 if (pBuf)
2180 {
2181 vboxNetFltDumpPacket(pSG, true, "host", (fDst & INTNETTRUNKDIR_WIRE) ? 0 : 1);
2182 Log4(("vboxNetFltPortOsXmit: pBuf->cb dump:\n%.*Rhxd\n", sizeof(pBuf->cb), pBuf->cb));
2183 Log4(("vboxNetFltPortOsXmit: netif_rx_ni(%p)\n", pBuf));
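                /* netif_rx_ni() is the process-context variant of netif_rx();
                   it injects the skb into the host's receive path as if it had
                   arrived from the wire. */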
2184 err = netif_rx_ni(pBuf);
2185 if (err)
2186 rc = RTErrConvertFromErrno(err);
2187 }
2188 else
2189 rc = VERR_NO_MEMORY;
2190 }
2191
2192 vboxNetFltLinuxReleaseNetDev(pThis, pDev);
2193 }
2194
2195 return rc;
2196}
2197
2198
2199void vboxNetFltPortOsSetActive(PVBOXNETFLTINS pThis, bool fActive)
2200{
2201 struct net_device * pDev;
2202
2203 LogFlow(("vboxNetFltPortOsSetActive: pThis=%p (%s), fActive=%s, fDisablePromiscuous=%s\n",
2204 pThis, pThis->szName, fActive?"true":"false",
2205 pThis->fDisablePromiscuous?"true":"false"));
2206
2207 if (pThis->fDisablePromiscuous)
2208 return;
2209
2210 pDev = vboxNetFltLinuxRetainNetDev(pThis);
2211 if (pDev)
2212 {
2213 /*
2214 * This API is a bit weird; the best reference is the code.
2215 *
2216 * Also, we have a bit of a race condition wrt the maintenance of the
2217 * host interface's promiscuity for vboxNetFltPortOsIsPromiscuous.
2218 */
2219#ifdef LOG_ENABLED
2220 u_int16_t fIf;
2221 unsigned const cPromiscBefore = pDev->promiscuity;
2222#endif
2223 if (fActive)
2224 {
2225 Assert(!pThis->u.s.fPromiscuousSet);
2226
2227 rtnl_lock();
2228 dev_set_promiscuity(pDev, 1);
2229 rtnl_unlock();
2230 pThis->u.s.fPromiscuousSet = true;
2231 Log(("vboxNetFltPortOsSetActive: enabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2232 }
2233 else
2234 {
2235 if (pThis->u.s.fPromiscuousSet)
2236 {
2237 rtnl_lock();
2238 dev_set_promiscuity(pDev, -1);
2239 rtnl_unlock();
2240 Log(("vboxNetFltPortOsSetActive: disabled promiscuous mode on %s (%d)\n", pThis->szName, pDev->promiscuity));
2241 }
2242 pThis->u.s.fPromiscuousSet = false;
2243
2244#ifdef LOG_ENABLED
2245 fIf = dev_get_flags(pDev);
2246 Log(("VBoxNetFlt: fIf=%#x; %d->%d\n", fIf, cPromiscBefore, pDev->promiscuity));
2247#endif
2248 }
2249
2250 vboxNetFltLinuxReleaseNetDev(pThis, pDev);
2251 }
2252}
2253
2254
2255int vboxNetFltOsDisconnectIt(PVBOXNETFLTINS pThis)
2256{
2257#ifdef VBOXNETFLT_WITH_QDISC
2258 vboxNetFltLinuxQdiscRemove(pThis, NULL);
2259#endif /* VBOXNETFLT_WITH_QDISC */
2260 /*
2261 * Remove the packet handler when we get disconnected from the internal
2262 * switch, as we don't want it to keep forwarding packets to a dead switch.
2263 */
2264 dev_remove_pack(&pThis->u.s.PacketType);
2265 return VINF_SUCCESS;
2266}
2267
2268
2269int vboxNetFltOsConnectIt(PVBOXNETFLTINS pThis)
2270{
2271 /*
2272 * Report the GSO capabilities of the host and device (if connected).
2273 * Note! No need to mark ourselves busy here.
2274 */
2275 /** @todo duplicate work here now? Attach */
2276#if defined(VBOXNETFLT_WITH_GSO_XMIT_HOST)
2277 pThis->pSwitchPort->pfnReportGsoCapabilities(pThis->pSwitchPort,
2278 0
2279 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_TCP)
2280 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_TCP)
2281# if 0 /** @todo GSO: Test UDP offloading (UFO) on linux. */
2282 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV4_UDP)
2283 | RT_BIT_32(PDMNETWORKGSOTYPE_IPV6_UDP)
2284# endif
2285 , INTNETTRUNKDIR_HOST);
2286
2287#endif
2288 vboxNetFltLinuxReportNicGsoCapabilities(pThis);
2289
2290 return VINF_SUCCESS;
2291}
2292
2293
2294void vboxNetFltOsDeleteInstance(PVBOXNETFLTINS pThis)
2295{
2296 struct net_device *pDev;
2297 bool fRegistered;
2298 RTSPINLOCKTMP Tmp = RTSPINLOCKTMP_INITIALIZER;
2299
2300#ifdef VBOXNETFLT_WITH_FILTER_HOST2GUEST_SKBS_EXPERIMENT
2301 vboxNetFltLinuxUnhookDev(pThis, NULL);
2302#endif
2303
2304 /** @todo This code may race vboxNetFltLinuxUnregisterDevice (very
2305 * unlikely, but nonetheless possible). Since it doesn't actually update
2306 * the state (it only reads it), it is likely to panic in some
2307 * interesting ways. */
2308
2309 RTSpinlockAcquireNoInts(pThis->hSpinlock, &Tmp);
2310 pDev = ASMAtomicUoReadPtrT(&pThis->u.s.pDev, struct net_device *);
2311 fRegistered = ASMAtomicUoReadBool(&pThis->u.s.fRegistered);
2312 RTSpinlockReleaseNoInts(pThis->hSpinlock, &Tmp);
2313
2314 if (fRegistered)
2315 {
2316#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
2317 skb_queue_purge(&pThis->u.s.XmitQueue);
2318#endif
2319 Log(("vboxNetFltOsDeleteInstance: this=%p: Packet handler removed, xmit queue purged.\n", pThis));
2320 Log(("vboxNetFltOsDeleteInstance: Device %p(%s) released. ref=%d\n", pDev, pDev->name, atomic_read(&pDev->refcnt)));
2321 dev_put(pDev);
2322 }
2323 Log(("vboxNetFltOsDeleteInstance: this=%p: Notifier removed.\n", pThis));
2324 unregister_netdevice_notifier(&pThis->u.s.Notifier);
2325 module_put(THIS_MODULE);
2326}
2327
2328
2329int vboxNetFltOsInitInstance(PVBOXNETFLTINS pThis, void *pvContext)
2330{
2331 int err;
2332 NOREF(pvContext);
2333
2334 pThis->u.s.Notifier.notifier_call = vboxNetFltLinuxNotifierCallback;
2335 err = register_netdevice_notifier(&pThis->u.s.Notifier);
2336 if (err)
2337 return VERR_INTNET_FLT_IF_FAILED;
2338 if (!pThis->u.s.fRegistered)
2339 {
2340 unregister_netdevice_notifier(&pThis->u.s.Notifier);
2341 LogRel(("VBoxNetFlt: failed to find %s.\n", pThis->szName));
2342 return VERR_INTNET_FLT_IF_NOT_FOUND;
2343 }
2344
2345 Log(("vboxNetFltOsInitInstance: this=%p: Notifier installed.\n", pThis));
2346 if ( pThis->fDisconnectedFromHost
2347 || !try_module_get(THIS_MODULE))
2348 return VERR_INTNET_FLT_IF_FAILED;
2349
2350 return VINF_SUCCESS;
2351}
2352
2353int vboxNetFltOsPreInitInstance(PVBOXNETFLTINS pThis)
2354{
2355 /*
2356 * Init the linux specific members.
2357 */
2358 pThis->u.s.pDev = NULL;
2359 pThis->u.s.fRegistered = false;
2360 pThis->u.s.fPromiscuousSet = false;
2361 memset(&pThis->u.s.PacketType, 0, sizeof(pThis->u.s.PacketType));
2362#ifndef VBOXNETFLT_LINUX_NO_XMIT_QUEUE
2363 skb_queue_head_init(&pThis->u.s.XmitQueue);
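    /* The work-item API changed in 2.6.20: INIT_WORK() lost its data argument
       and the handler now receives the work_struct pointer itself, from which
       the instance is recovered (see VBOX_FLT_XT_TO_INST). */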
2364# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20)
2365 INIT_WORK(&pThis->u.s.XmitTask, vboxNetFltLinuxXmitTask);
2366# else
2367 INIT_WORK(&pThis->u.s.XmitTask, vboxNetFltLinuxXmitTask, &pThis->u.s.XmitTask);
2368# endif
2369#endif
2370
2371 return VINF_SUCCESS;
2372}
2373
2374
2375void vboxNetFltPortOsNotifyMacAddress(PVBOXNETFLTINS pThis, void *pvIfData, PCRTMAC pMac)
2376{
2377 NOREF(pThis); NOREF(pvIfData); NOREF(pMac);
2378}
2379
2380
2381int vboxNetFltPortOsConnectInterface(PVBOXNETFLTINS pThis, void *pvIf, void **pvIfData)
2382{
2383 /* Nothing to do */
2384 NOREF(pThis); NOREF(pvIf); NOREF(pvIfData);
2385 return VINF_SUCCESS;
2386}
2387
2388
2389int vboxNetFltPortOsDisconnectInterface(PVBOXNETFLTINS pThis, void *pvIfData)
2390{
2391 /* Nothing to do */
2392 NOREF(pThis); NOREF(pvIfData);
2393 return VINF_SUCCESS;
2394}
2395