VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 46072

Last change on this file since 46072 was 45261, checked in by vboxsync, 12 years ago

NAT: dnsproxy crash fix attempt.

on return from timeout operation, references to timeout should be cleared.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 50.6 KB
Line 
1/* $Id: socket.c 45261 2013-03-31 02:03:24Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iphlpapi.h>
36#include <icmpapi.h>
37#endif
38
39#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
40/**
41 *
42 */
43struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
44{
45 struct socket *pNewSocket = NULL;
46 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
47 pNewSocket = socreate();
48 if (!pNewSocket)
49 {
50 LogFunc(("Can't create socket\n"));
51 LogFlowFunc(("Leave: NULL\n"));
52 return NULL;
53 }
54 if (fBindSocket)
55 {
56 if (udp_attach(pData, pNewSocket, 0) <= 0)
57 {
58 sofree(pData, pNewSocket);
59 LogFunc(("Can't attach fresh created socket\n"));
60 return NULL;
61 }
62 }
63 else
64 {
65 pNewSocket->so_cloneOf = (struct socket *)pSo;
66 pNewSocket->s = pSo->s;
67 insque(pData, pNewSocket, &udb);
68 }
69 pNewSocket->so_laddr = pSo->so_laddr;
70 pNewSocket->so_lport = pSo->so_lport;
71 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
72 pNewSocket->so_fport = pSo->so_fport;
73 pSo->so_cCloneCounter++;
74 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
75 return pNewSocket;
76}
77
78struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
79{
80 struct socket *pSoClone = NULL;
81 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
82 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
83 {
84 if ( pSoClone->so_cloneOf
85 && pSoClone->so_cloneOf == pcSo
86 && pSoClone->so_lport == pcSo->so_lport
87 && pSoClone->so_fport == pcSo->so_fport
88 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
89 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
90 goto done;
91 }
92 pSoClone = NULL;
93done:
94 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
95 return pSoClone;
96}
97#endif
98
99#ifdef VBOX_WITH_NAT_SEND2HOME
100DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
101{
102 int idxAddr;
103 int ret = 0;
104 bool fSendDone = false;
105 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
106 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
107 {
108
109 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
110 AssertReturn((pNewSocket, false));
111 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
112 /* @todo: more verbose on errors,
113 * @note: we shouldn't care if this send fail or not (we're in broadcast).
114 */
115 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
116 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
117 if (ret < 0)
118 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
119 fSendDone |= ret > 0;
120 }
121 LogFlowFunc(("Leave %RTbool\n", fSendDone));
122 return fSendDone;
123}
#endif /* VBOX_WITH_NAT_SEND2HOME */
125static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
126#ifdef RT_OS_WINDOWS
127static void sorecvfrom_icmp_win(PNATState, struct socket *);
128#else /* RT_OS_WINDOWS */
129static void sorecvfrom_icmp_unix(PNATState, struct socket *);
130#endif /* !RT_OS_WINDOWS */
131
/*
 * Socket layer initialisation hook; there is currently nothing to set up.
 */
void
so_init()
{
    /* intentionally empty */
}
136
137struct socket *
138solookup(struct socket *head, struct in_addr laddr,
139 u_int lport, struct in_addr faddr, u_int fport)
140{
141 struct socket *so;
142
143 for (so = head->so_next; so != head; so = so->so_next)
144 {
145 if ( so->so_lport == lport
146 && so->so_laddr.s_addr == laddr.s_addr
147 && so->so_faddr.s_addr == faddr.s_addr
148 && so->so_fport == fport)
149 return so;
150 }
151
152 return (struct socket *)NULL;
153}
154
155/*
156 * Create a new socket, initialise the fields
157 * It is the responsibility of the caller to
158 * insque() it into the correct linked-list
159 */
160struct socket *
161socreate()
162{
163 struct socket *so;
164
165 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
166 if (so)
167 {
168 so->so_state = SS_NOFDREF;
169 so->s = -1;
170#if !defined(RT_OS_WINDOWS)
171 so->so_poll_index = -1;
172#endif
173 }
174 return so;
175}
176
177/*
178 * remque and free a socket, clobber cache
179 */
180void
181sofree(PNATState pData, struct socket *so)
182{
183 LogFlowFunc(("ENTER:%R[natsock]\n", so));
184 /*
185 * We should not remove socket when polling routine do the polling
186 * instead we mark it for deletion.
187 */
188 if (so->fUnderPolling)
189 {
190 so->fShouldBeRemoved = 1;
191 LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so));
192 return;
193 }
194 /**
195 * Check that we don't freeng socket with tcbcb
196 */
197 Assert(!sototcpcb(so));
198 /* udp checks */
199 Assert(!so->so_timeout);
200 Assert(!so->so_timeout_arg);
201 if (so == tcp_last_so)
202 tcp_last_so = &tcb;
203 else if (so == udp_last_so)
204 udp_last_so = &udb;
205
206 /* libalias notification */
207 if (so->so_pvLnk)
208 slirpDeleteLinkSocket(so->so_pvLnk);
209 /* check if mbuf haven't been already freed */
210 if (so->so_m != NULL)
211 {
212 m_freem(pData, so->so_m);
213 so->so_m = NULL;
214 }
215
216 if (so->so_next && so->so_prev)
217 {
218 remque(pData, so); /* crashes if so is not in a queue */
219 NSOCK_DEC();
220 }
221
222 RTMemFree(so);
223 LogFlowFuncLeave();
224}
225
226/*
227 * Read from so's socket into sb_snd, updating all relevant sbuf fields
228 * NOTE: This will only be called if it is select()ed for reading, so
229 * a read() of 0 (or less) means it's disconnected
230 */
231#ifndef VBOX_WITH_SLIRP_BSD_SBUF
232int
233soread(PNATState pData, struct socket *so)
234{
235 int n, nn, lss, total;
236 struct sbuf *sb = &so->so_snd;
237 size_t len = sb->sb_datalen - sb->sb_cc;
238 struct iovec iov[2];
239 int mss = so->so_tcpcb->t_maxseg;
240
241 STAM_PROFILE_START(&pData->StatIOread, a);
242 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
243 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
244
245 QSOCKET_LOCK(tcb);
246 SOCKET_LOCK(so);
247 QSOCKET_UNLOCK(tcb);
248
249 LogFlow(("soread: so = %R[natsock]\n", so));
250 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
251
252 /*
253 * No need to check if there's enough room to read.
254 * soread wouldn't have been called if there weren't
255 */
256
257 len = sb->sb_datalen - sb->sb_cc;
258
259 iov[0].iov_base = sb->sb_wptr;
260 iov[1].iov_base = 0;
261 iov[1].iov_len = 0;
262 if (sb->sb_wptr < sb->sb_rptr)
263 {
264 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
265 /* Should never succeed, but... */
266 if (iov[0].iov_len > len)
267 iov[0].iov_len = len;
268 if (iov[0].iov_len > mss)
269 iov[0].iov_len -= iov[0].iov_len%mss;
270 n = 1;
271 }
272 else
273 {
274 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
275 /* Should never succeed, but... */
276 if (iov[0].iov_len > len)
277 iov[0].iov_len = len;
278 len -= iov[0].iov_len;
279 if (len)
280 {
281 iov[1].iov_base = sb->sb_data;
282 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
283 if (iov[1].iov_len > len)
284 iov[1].iov_len = len;
285 total = iov[0].iov_len + iov[1].iov_len;
286 if (total > mss)
287 {
288 lss = total % mss;
289 if (iov[1].iov_len > lss)
290 {
291 iov[1].iov_len -= lss;
292 n = 2;
293 }
294 else
295 {
296 lss -= iov[1].iov_len;
297 iov[0].iov_len -= lss;
298 n = 1;
299 }
300 }
301 else
302 n = 2;
303 }
304 else
305 {
306 if (iov[0].iov_len > mss)
307 iov[0].iov_len -= iov[0].iov_len%mss;
308 n = 1;
309 }
310 }
311
312#ifdef HAVE_READV
313 nn = readv(so->s, (struct iovec *)iov, n);
314#else
315 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
316#endif
317 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
318 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
319 if (nn <= 0)
320 {
321 /*
322 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
323 * _could_ mean that the connection is closed. But we will receive an
324 * FD_CLOSE event later if the connection was _really_ closed. With
325 * www.youtube.com I see this very often. Closing the socket too early
326 * would be dangerous.
327 */
328 int status;
329 unsigned long pending = 0;
330 status = ioctlsocket(so->s, FIONREAD, &pending);
331 if (status < 0)
332 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
333 if (nn == 0 && (pending != 0))
334 {
335 SOCKET_UNLOCK(so);
336 STAM_PROFILE_STOP(&pData->StatIOread, a);
337 return 0;
338 }
339 if ( nn < 0
340 && soIgnorableErrorCode(errno))
341 {
342 SOCKET_UNLOCK(so);
343 STAM_PROFILE_STOP(&pData->StatIOread, a);
344 return 0;
345 }
346 else
347 {
348 int fUninitiolizedTemplate = 0;
349 fUninitiolizedTemplate = RT_BOOL(( sototcpcb(so)
350 && ( sototcpcb(so)->t_template.ti_src.s_addr == INADDR_ANY
351 || sototcpcb(so)->t_template.ti_dst.s_addr == INADDR_ANY)));
352 /* nn == 0 means peer has performed an orderly shutdown */
353 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
354 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
355 sofcantrcvmore(so);
356 if (!fUninitiolizedTemplate)
357 tcp_sockclosed(pData, sototcpcb(so));
358 else
359 tcp_drop(pData, sototcpcb(so), errno);
360 SOCKET_UNLOCK(so);
361 STAM_PROFILE_STOP(&pData->StatIOread, a);
362 return -1;
363 }
364 }
365 STAM_STATS(
366 if (n == 1)
367 {
368 STAM_COUNTER_INC(&pData->StatIORead_in_1);
369 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
370 }
371 else
372 {
373 STAM_COUNTER_INC(&pData->StatIORead_in_2);
374 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
375 }
376 );
377
378#ifndef HAVE_READV
379 /*
380 * If there was no error, try and read the second time round
381 * We read again if n = 2 (ie, there's another part of the buffer)
382 * and we read as much as we could in the first read
383 * We don't test for <= 0 this time, because there legitimately
384 * might not be any more data (since the socket is non-blocking),
385 * a close will be detected on next iteration.
386 * A return of -1 wont (shouldn't) happen, since it didn't happen above
387 */
388 if (n == 2 && nn == iov[0].iov_len)
389 {
390 int ret;
391 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
392 if (ret > 0)
393 nn += ret;
394 STAM_STATS(
395 if (ret > 0)
396 {
397 STAM_COUNTER_INC(&pData->StatIORead_in_2);
398 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
399 }
400 );
401 }
402
403 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
404#endif
405
406 /* Update fields */
407 sb->sb_cc += nn;
408 sb->sb_wptr += nn;
409 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
410 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
411 {
412 sb->sb_wptr -= sb->sb_datalen;
413 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
414 }
415 STAM_PROFILE_STOP(&pData->StatIOread, a);
416 SOCKET_UNLOCK(so);
417 return nn;
418}
419#else /* VBOX_WITH_SLIRP_BSD_SBUF */
420int
421soread(PNATState pData, struct socket *so)
422{
423 int n;
424 char *buf;
425 struct sbuf *sb = &so->so_snd;
426 size_t len = sbspace(sb);
427 int mss = so->so_tcpcb->t_maxseg;
428
429 STAM_PROFILE_START(&pData->StatIOread, a);
430 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
431 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
432
433 QSOCKET_LOCK(tcb);
434 SOCKET_LOCK(so);
435 QSOCKET_UNLOCK(tcb);
436
437 LogFlowFunc(("soread: so = %lx\n", (long)so));
438
439 if (len > mss)
440 len -= len % mss;
441 buf = RTMemAlloc(len);
442 if (buf == NULL)
443 {
444 Log(("NAT: can't alloc enough memory\n"));
445 return -1;
446 }
447
448 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
449 if (n <= 0)
450 {
451 /*
452 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
453 * _could_ mean that the connection is closed. But we will receive an
454 * FD_CLOSE event later if the connection was _really_ closed. With
455 * www.youtube.com I see this very often. Closing the socket too early
456 * would be dangerous.
457 */
458 int status;
459 unsigned long pending = 0;
460 status = ioctlsocket(so->s, FIONREAD, &pending);
461 if (status < 0)
462 Log(("NAT:error in WSAIoctl: %d\n", errno));
463 if (n == 0 && (pending != 0))
464 {
465 SOCKET_UNLOCK(so);
466 STAM_PROFILE_STOP(&pData->StatIOread, a);
467 RTMemFree(buf);
468 return 0;
469 }
470 if ( n < 0
471 && soIgnorableErrorCode(errno))
472 {
473 SOCKET_UNLOCK(so);
474 STAM_PROFILE_STOP(&pData->StatIOread, a);
475 RTMemFree(buf);
476 return 0;
477 }
478 else
479 {
480 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
481 n, errno, strerror(errno)));
482 sofcantrcvmore(so);
483 tcp_sockclosed(pData, sototcpcb(so));
484 SOCKET_UNLOCK(so);
485 STAM_PROFILE_STOP(&pData->StatIOread, a);
486 RTMemFree(buf);
487 return -1;
488 }
489 }
490
491 sbuf_bcat(sb, buf, n);
492 RTMemFree(buf);
493 return n;
494}
495#endif
496
497/*
498 * Get urgent data
499 *
500 * When the socket is created, we set it SO_OOBINLINE,
501 * so when OOB data arrives, we soread() it and everything
502 * in the send buffer is sent as urgent data
503 */
504void
505sorecvoob(PNATState pData, struct socket *so)
506{
507 struct tcpcb *tp = sototcpcb(so);
508 ssize_t ret;
509
510 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
511
512 /*
513 * We take a guess at how much urgent data has arrived.
514 * In most situations, when urgent data arrives, the next
515 * read() should get all the urgent data. This guess will
516 * be wrong however if more data arrives just after the
517 * urgent data, or the read() doesn't return all the
518 * urgent data.
519 */
520 ret = soread(pData, so);
521 if (RT_LIKELY(ret > 0))
522 {
523 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
524 tp->t_force = 1;
525 tcp_output(pData, tp);
526 tp->t_force = 0;
527 }
528}
529#ifndef VBOX_WITH_SLIRP_BSD_SBUF
530/*
531 * Send urgent data
532 * There's a lot duplicated code here, but...
533 */
534int
535sosendoob(struct socket *so)
536{
537 struct sbuf *sb = &so->so_rcv;
538 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
539
540 int n, len;
541
542 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
543
544 if (so->so_urgc > sizeof(buff))
545 so->so_urgc = sizeof(buff); /* XXX */
546
547 if (sb->sb_rptr < sb->sb_wptr)
548 {
549 /* We can send it directly */
550 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
551 so->so_urgc -= n;
552
553 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
554 n, so->so_urgc));
555 }
556 else
557 {
558 /*
559 * Since there's no sendv or sendtov like writev,
560 * we must copy all data to a linear buffer then
561 * send it all
562 */
563 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
564 if (len > so->so_urgc)
565 len = so->so_urgc;
566 memcpy(buff, sb->sb_rptr, len);
567 so->so_urgc -= len;
568 if (so->so_urgc)
569 {
570 n = sb->sb_wptr - sb->sb_data;
571 if (n > so->so_urgc)
572 n = so->so_urgc;
573 memcpy(buff + len, sb->sb_data, n);
574 so->so_urgc -= n;
575 len += n;
576 }
577 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
578#ifdef DEBUG
579 if (n != len)
580 Log(("Didn't send all data urgently XXXXX\n"));
581#endif
582 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
583 n, so->so_urgc));
584 }
585
586 sb->sb_cc -= n;
587 sb->sb_rptr += n;
588 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
589 sb->sb_rptr -= sb->sb_datalen;
590
591 return n;
592}
593
594/*
595 * Write data from so_rcv to so's socket,
596 * updating all sbuf field as necessary
597 */
598int
599sowrite(PNATState pData, struct socket *so)
600{
601 int n, nn;
602 struct sbuf *sb = &so->so_rcv;
603 size_t len = sb->sb_cc;
604 struct iovec iov[2];
605
606 STAM_PROFILE_START(&pData->StatIOwrite, a);
607 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
608 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
609 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
610 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
611 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
612 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
613 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
614 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
615 LogFlowFunc(("so = %R[natsock]\n", so));
616 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
617 QSOCKET_LOCK(tcb);
618 SOCKET_LOCK(so);
619 QSOCKET_UNLOCK(tcb);
620 if (so->so_urgc)
621 {
622 sosendoob(so);
623 if (sb->sb_cc == 0)
624 {
625 SOCKET_UNLOCK(so);
626 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
627 return 0;
628 }
629 }
630
631 /*
632 * No need to check if there's something to write,
633 * sowrite wouldn't have been called otherwise
634 */
635
636 len = sb->sb_cc;
637
638 iov[0].iov_base = sb->sb_rptr;
639 iov[1].iov_base = 0;
640 iov[1].iov_len = 0;
641 if (sb->sb_rptr < sb->sb_wptr)
642 {
643 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
644 /* Should never succeed, but... */
645 if (iov[0].iov_len > len)
646 iov[0].iov_len = len;
647 n = 1;
648 }
649 else
650 {
651 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
652 if (iov[0].iov_len > len)
653 iov[0].iov_len = len;
654 len -= iov[0].iov_len;
655 if (len)
656 {
657 iov[1].iov_base = sb->sb_data;
658 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
659 if (iov[1].iov_len > len)
660 iov[1].iov_len = len;
661 n = 2;
662 }
663 else
664 n = 1;
665 }
666 STAM_STATS({
667 if (n == 1)
668 {
669 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
670 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
671 }
672 else
673 {
674 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
675 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
676 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
677 }
678 });
679 /* Check if there's urgent data to send, and if so, send it */
680#ifdef HAVE_READV
681 nn = writev(so->s, (const struct iovec *)iov, n);
682#else
683 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
684#endif
685 Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
686 /* This should never happen, but people tell me it does *shrug* */
687 if ( nn < 0
688 && soIgnorableErrorCode(errno))
689 {
690 SOCKET_UNLOCK(so);
691 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
692 return 0;
693 }
694
695 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
696 {
697 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
698 __PRETTY_FUNCTION__, so->so_state, errno));
699 sofcantsendmore(so);
700 tcp_sockclosed(pData, sototcpcb(so));
701 SOCKET_UNLOCK(so);
702 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
703 return -1;
704 }
705
706#ifndef HAVE_READV
707 if (n == 2 && nn == iov[0].iov_len)
708 {
709 int ret;
710 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
711 if (ret > 0)
712 nn += ret;
713 STAM_STATS({
714 if (ret > 0 && ret != iov[1].iov_len)
715 {
716 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
717 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
718 }
719 });
720 }
721 Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
722#endif
723
724 /* Update sbuf */
725 sb->sb_cc -= nn;
726 sb->sb_rptr += nn;
727 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
728 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
729 {
730 sb->sb_rptr -= sb->sb_datalen;
731 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
732 }
733
734 /*
735 * If in DRAIN mode, and there's no more data, set
736 * it CANTSENDMORE
737 */
738 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
739 sofcantsendmore(so);
740
741 SOCKET_UNLOCK(so);
742 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
743 return nn;
744}
745#else /* VBOX_WITH_SLIRP_BSD_SBUF */
746static int
747do_sosend(struct socket *so, int fUrg)
748{
749 struct sbuf *sb = &so->so_rcv;
750
751 int n, len;
752
753 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
754
755 len = sbuf_len(sb);
756
757 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
758 if (n < 0)
759 Log(("NAT: Can't sent sbuf via socket.\n"));
760 if (fUrg)
761 so->so_urgc -= n;
762 if (n > 0 && n < len)
763 {
764 char *ptr;
765 char *buff;
766 buff = RTMemAlloc(len);
767 if (buff == NULL)
768 {
769 Log(("NAT: No space to allocate temporal buffer\n"));
770 return -1;
771 }
772 ptr = sbuf_data(sb);
773 memcpy(buff, &ptr[n], len - n);
774 sbuf_bcpy(sb, buff, len - n);
775 RTMemFree(buff);
776 return n;
777 }
778 sbuf_clear(sb);
779 return n;
780}
/*
 * Send urgent data: do_sosend() with the urgent flag set.
 */
int
sosendoob(struct socket *so)
{
    int cbSent = do_sosend(so, 1 /* fUrg */);
    return cbSent;
}
786
787/*
788 * Write data from so_rcv to so's socket,
789 * updating all sbuf field as necessary
790 */
791int
792sowrite(PNATState pData, struct socket *so)
793{
794 return do_sosend(so, 0);
795}
796#endif
797
798/*
799 * recvfrom() a UDP socket
800 */
801void
802sorecvfrom(PNATState pData, struct socket *so)
803{
804 ssize_t ret = 0;
805 struct sockaddr_in addr;
806 socklen_t addrlen = sizeof(struct sockaddr_in);
807
808 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
809
810 if (so->so_type == IPPROTO_ICMP)
811 {
812 /* This is a "ping" reply */
813#ifdef RT_OS_WINDOWS
814 sorecvfrom_icmp_win(pData, so);
815#else /* RT_OS_WINDOWS */
816 sorecvfrom_icmp_unix(pData, so);
817#endif /* !RT_OS_WINDOWS */
818 udp_detach(pData, so);
819 }
820 else
821 {
822 /* A "normal" UDP packet */
823 struct mbuf *m;
824 ssize_t len;
825 u_long n = 0;
826 int rc = 0;
827 static int signalled = 0;
828 char *pchBuffer = NULL;
829 bool fWithTemporalBuffer = false;
830
831 QSOCKET_LOCK(udb);
832 SOCKET_LOCK(so);
833 QSOCKET_UNLOCK(udb);
834
835 /*How many data has been received ?*/
836 /*
837 * 1. calculate how much we can read
838 * 2. read as much as possible
839 * 3. attach buffer to allocated header mbuf
840 */
841 rc = ioctlsocket(so->s, FIONREAD, &n);
842 if (rc == -1)
843 {
844 if ( soIgnorableErrorCode(errno)
845 || errno == ENOTCONN)
846 return;
847 else if (signalled == 0)
848 {
849 LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
850 signalled = 1;
851 }
852 return;
853 }
854
855 len = sizeof(struct udpiphdr);
856 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
857 if (m == NULL)
858 return;
859
860 len += n;
861 m->m_data += ETH_HLEN;
862 m->m_pkthdr.header = mtod(m, void *);
863 m->m_data += sizeof(struct udpiphdr);
864
865 pchBuffer = mtod(m, char *);
866 fWithTemporalBuffer = false;
867 /*
868 * Even if amounts of bytes on socket is greater than MTU value
869 * Slirp will able fragment it, but we won't create temporal location
870 * here.
871 */
872 if (n > (slirp_size(pData) - sizeof(struct udpiphdr)))
873 {
874 pchBuffer = RTMemAlloc((n) * sizeof(char));
875 if (!pchBuffer)
876 {
877 m_freem(pData, m);
878 return;
879 }
880 fWithTemporalBuffer = true;
881 }
882 ret = recvfrom(so->s, pchBuffer, n, 0,
883 (struct sockaddr *)&addr, &addrlen);
884 if (fWithTemporalBuffer)
885 {
886 if (ret > 0)
887 {
888 m_copyback(pData, m, 0, ret, pchBuffer);
889 /*
890 * If we've met comporison below our size prediction was failed
891 * it's not fatal just we've allocated for nothing. (@todo add counter here
892 * to calculate how rare we here)
893 */
894 if(ret < slirp_size(pData) && !m->m_next)
895 Log(("NAT:udp: Expected size(%d) lesser than real(%d) and less minimal mbuf size(%d)\n",
896 n, ret, slirp_size(pData)));
897 }
898 /* we're freeing buffer anyway */
899 RTMemFree(pchBuffer);
900 }
901 else
902 m->m_len = ret;
903
904 if (ret < 0)
905 {
906 u_char code = ICMP_UNREACH_PORT;
907
908 if (errno == EHOSTUNREACH)
909 code = ICMP_UNREACH_HOST;
910 else if (errno == ENETUNREACH)
911 code = ICMP_UNREACH_NET;
912
913 m_freem(pData, m);
914 if ( soIgnorableErrorCode(errno)
915 || errno == ENOTCONN)
916 {
917 return;
918 }
919
920 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
921 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
922 so->so_m = NULL;
923 }
924 else
925 {
926 Assert((m_length(m,NULL) == ret));
927 /*
928 * Hack: domain name lookup will be used the most for UDP,
929 * and since they'll only be used once there's no need
930 * for the 4 minute (or whatever) timeout... So we time them
931 * out much quicker (10 seconds for now...)
932 */
933 if (so->so_expire)
934 {
935 if (so->so_fport != RT_H2N_U16_C(53))
936 so->so_expire = curtime + SO_EXPIRE;
937 }
938 /*
939 * last argument should be changed if Slirp will inject IP attributes
940 * Note: Here we can't check if dnsproxy's sent initial request
941 */
942 if ( pData->fUseDnsProxy
943 && so->so_fport == RT_H2N_U16_C(53))
944 dnsproxy_answer(pData, so, m);
945
946#if 0
947 if (m->m_len == len)
948 {
949 m_inc(m, MINCSIZE);
950 m->m_len = 0;
951 }
952#endif
953
954 /* packets definetly will be fragmented, could confuse receiver peer. */
955 if (m_length(m, NULL) > if_mtu)
956 m->m_flags |= M_SKIP_FIREWALL;
957 /*
958 * If this packet was destined for CTL_ADDR,
959 * make it look like that's where it came from, done by udp_output
960 */
961 udp_output(pData, so, m, &addr);
962 SOCKET_UNLOCK(so);
963 } /* rx error */
964 } /* if ping packet */
965}
966
967/*
968 * sendto() a socket
969 */
970int
971sosendto(PNATState pData, struct socket *so, struct mbuf *m)
972{
973 int ret;
974 struct sockaddr_in *paddr;
975 struct sockaddr addr;
976#if 0
977 struct sockaddr_in host_addr;
978#endif
979 caddr_t buf = 0;
980 int mlen;
981
982 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
983
984 memset(&addr, 0, sizeof(struct sockaddr));
985#ifdef RT_OS_DARWIN
986 addr.sa_len = sizeof(struct sockaddr_in);
987#endif
988 paddr = (struct sockaddr_in *)&addr;
989 paddr->sin_family = AF_INET;
990 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
991 {
992 /* It's an alias */
993 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
994 switch(last_byte)
995 {
996#if 0
997 /* handle this case at 'default:' */
998 case CTL_BROADCAST:
999 addr.sin_addr.s_addr = INADDR_BROADCAST;
1000 /* Send the packet to host to fully emulate broadcast */
1001 /** @todo r=klaus: on Linux host this causes the host to receive
1002 * the packet twice for some reason. And I cannot find any place
1003 * in the man pages which states that sending a broadcast does not
1004 * reach the host itself. */
1005 host_addr.sin_family = AF_INET;
1006 host_addr.sin_port = so->so_fport;
1007 host_addr.sin_addr = our_addr;
1008 sendto(so->s, m->m_data, m->m_len, 0,
1009 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
1010 break;
1011#endif
1012 case CTL_DNS:
1013 case CTL_ALIAS:
1014 default:
1015 if (last_byte == ~pData->netmask)
1016 paddr->sin_addr.s_addr = INADDR_BROADCAST;
1017 else
1018 paddr->sin_addr = loopback_addr;
1019 break;
1020 }
1021 }
1022 else
1023 paddr->sin_addr = so->so_faddr;
1024 paddr->sin_port = so->so_fport;
1025
1026 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1027 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1028
1029 /* Don't care what port we get */
1030 /*
1031 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1032 * generates bodyless messages, annoying memmory management system.
1033 */
1034 mlen = m_length(m, NULL);
1035 if (mlen > 0)
1036 {
1037 buf = RTMemAlloc(mlen);
1038 if (buf == NULL)
1039 {
1040 return -1;
1041 }
1042 m_copydata(m, 0, mlen, buf);
1043 }
1044 ret = sendto(so->s, buf, mlen, 0,
1045 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1046#ifdef VBOX_WITH_NAT_SEND2HOME
1047 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1048 {
1049 slirpSend2Home(pData, so, buf, mlen, 0);
1050 }
1051#endif
1052 if (buf)
1053 RTMemFree(buf);
1054 if (ret < 0)
1055 {
1056 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1057 return -1;
1058 }
1059
1060 /*
1061 * Kill the socket if there's no reply in 4 minutes,
1062 * but only if it's an expirable socket
1063 */
1064 if (so->so_expire)
1065 so->so_expire = curtime + SO_EXPIRE;
1066 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1067 return 0;
1068}
1069
1070/*
1071 * XXX This should really be tcp_listen
1072 */
1073struct socket *
1074solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
1075{
1076 struct sockaddr_in addr;
1077 struct socket *so;
1078 socklen_t addrlen = sizeof(addr);
1079 int s, opt = 1;
1080 int status;
1081
1082 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1083
1084 if ((so = socreate()) == NULL)
1085 {
1086 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1087 return NULL;
1088 }
1089
1090 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1091 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1092 {
1093 RTMemFree(so);
1094 return NULL;
1095 }
1096
1097 SOCKET_LOCK_CREATE(so);
1098 SOCKET_LOCK(so);
1099 QSOCKET_LOCK(tcb);
1100 insque(pData, so,&tcb);
1101 NSOCK_INC();
1102 QSOCKET_UNLOCK(tcb);
1103
1104 /*
1105 * SS_FACCEPTONCE sockets must time out.
1106 */
1107 if (flags & SS_FACCEPTONCE)
1108 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1109
1110 so->so_state = (SS_FACCEPTCONN|flags);
1111 so->so_lport = lport; /* Kept in network format */
1112 so->so_laddr.s_addr = laddr; /* Ditto */
1113
1114 memset(&addr, 0, sizeof(addr));
1115#ifdef RT_OS_DARWIN
1116 addr.sin_len = sizeof(addr);
1117#endif
1118 addr.sin_family = AF_INET;
1119 addr.sin_addr.s_addr = bind_addr;
1120 addr.sin_port = port;
1121
1122 /**
1123 * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack,
1124 * kernel will choose the optimal value for requests queue length.
1125 * @note: MSDN recommends low (2-4) values for bluetooth networking devices.
1126 */
1127 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1128 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1129 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1130 || (listen(s, pData->soMaxConn) < 0))
1131 {
1132#ifdef RT_OS_WINDOWS
1133 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1134 closesocket(s);
1135 QSOCKET_LOCK(tcb);
1136 sofree(pData, so);
1137 QSOCKET_UNLOCK(tcb);
1138 /* Restore the real errno */
1139 WSASetLastError(tmperrno);
1140#else
1141 int tmperrno = errno; /* Don't clobber the real reason we failed */
1142 close(s);
1143 if (sototcpcb(so))
1144 tcp_close(pData, sototcpcb(so));
1145 else
1146 sofree(pData, so);
1147 /* Restore the real errno */
1148 errno = tmperrno;
1149#endif
1150 return NULL;
1151 }
1152 fd_nonblock(s);
1153 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
1154
1155 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1156 so->so_fport = addr.sin_port;
1157 /* set socket buffers */
1158 opt = pData->socket_rcv;
1159 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1160 if (status < 0)
1161 {
1162 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1163 goto no_sockopt;
1164 }
1165 opt = pData->socket_snd;
1166 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1167 if (status < 0)
1168 {
1169 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1170 goto no_sockopt;
1171 }
1172no_sockopt:
1173 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1174 so->so_faddr = alias_addr;
1175 else
1176 so->so_faddr = addr.sin_addr;
1177
1178 so->s = s;
1179 SOCKET_UNLOCK(so);
1180 return so;
1181}
1182
/*
 * Notification hook: data is available in so_rcv.
 * The idea is to simply write() that data to the socket, but this is
 * not done yet (XXX) - the function currently does nothing and the
 * candidate body stays compiled out.
 * @todo do we really need this function, what it's intended to do?
 */
void
sorwakeup(struct socket *so)
{
#if 0
    sowrite(so);
    FD_CLR(so->s,&writefds);
#endif
    NOREF(so);
}
1198
/*
 * Notification hook: data has been freed in so_snd, so there is room
 * for a read() if we want one.
 * Nothing is read here for now; that is done in the main loop.
 */
void sowwakeup(struct socket *so)
{
    NOREF(so);
}
1209
1210/*
1211 * Various session state calls
1212 * XXX Should be #define's
1213 * The socket state stuff needs work, these often get call 2 or 3
1214 * times each when only 1 was needed
1215 */
1216void
1217soisfconnecting(struct socket *so)
1218{
1219 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1220 SS_FCANTSENDMORE|SS_FWDRAIN);
1221 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1222}
1223
1224void
1225soisfconnected(struct socket *so)
1226{
1227 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1228 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1229 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1230 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1231}
1232
1233void
1234sofcantrcvmore(struct socket *so)
1235{
1236 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1237 if ((so->so_state & SS_NOFDREF) == 0)
1238 {
1239 shutdown(so->s, 0);
1240 }
1241 so->so_state &= ~(SS_ISFCONNECTING);
1242 if (so->so_state & SS_FCANTSENDMORE)
1243 so->so_state = SS_NOFDREF; /* Don't select it */
1244 /* XXX close() here as well? */
1245 else
1246 so->so_state |= SS_FCANTRCVMORE;
1247 LogFlowFuncLeave();
1248}
1249
1250void
1251sofcantsendmore(struct socket *so)
1252{
1253 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1254 if ((so->so_state & SS_NOFDREF) == 0)
1255 shutdown(so->s, 1); /* send FIN to fhost */
1256
1257 so->so_state &= ~(SS_ISFCONNECTING);
1258 if (so->so_state & SS_FCANTRCVMORE)
1259 so->so_state = SS_NOFDREF; /* as above */
1260 else
1261 so->so_state |= SS_FCANTSENDMORE;
1262 LogFlowFuncLeave();
1263}
1264
/*
 * Disconnect notification.  Currently does nothing: the state update
 * and close() below are compiled out.
 */
void soisfdisconnected(struct socket *so)
{
#if 0
    so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
    close(so->s);
    so->so_state = SS_ISFDISCONNECTED;
    /*
     * XXX Do nothing ... ?
     */
#endif
    NOREF(so);
}
1278
1279/*
1280 * Set write drain mode
1281 * Set CANTSENDMORE once all data has been write()n
1282 */
1283void
1284sofwdrain(struct socket *so)
1285{
1286 if (SBUF_LEN(&so->so_rcv))
1287 so->so_state |= SS_FWDRAIN;
1288 else
1289 sofcantsendmore(so);
1290}
1291
1292static void
1293send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1294{
1295 struct ip *ip;
1296 uint32_t dst, src;
1297 char ip_copy[256];
1298 struct icmp *icp;
1299 int old_ip_len = 0;
1300 int hlen, original_hlen = 0;
1301 struct mbuf *m;
1302 struct icmp_msg *icm;
1303 uint8_t proto;
1304 int type = 0;
1305
1306 ip = (struct ip *)buff;
1307 /* Fix ip->ip_len to contain the total packet length including the header
1308 * in _host_ byte order for all OSes. On Darwin, that value already is in
1309 * host byte order. Solaris and Darwin report only the payload. */
1310#ifndef RT_OS_DARWIN
1311 ip->ip_len = RT_N2H_U16(ip->ip_len);
1312#endif
1313 hlen = (ip->ip_hl << 2);
1314#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1315 ip->ip_len += hlen;
1316#endif
1317 if (ip->ip_len < hlen + ICMP_MINLEN)
1318 {
1319 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1320 return;
1321 }
1322 icp = (struct icmp *)((char *)ip + hlen);
1323
1324 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1325 if ( icp->icmp_type != ICMP_ECHOREPLY
1326 && icp->icmp_type != ICMP_TIMXCEED
1327 && icp->icmp_type != ICMP_UNREACH)
1328 {
1329 return;
1330 }
1331
1332 /*
1333 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1334 * ICMP_ECHOREPLY assuming data 0
1335 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1336 */
1337 if (ip->ip_len < hlen + 8)
1338 {
1339 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1340 return;
1341 }
1342
1343 type = icp->icmp_type;
1344 if ( type == ICMP_TIMXCEED
1345 || type == ICMP_UNREACH)
1346 {
1347 /*
1348 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1349 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1350 */
1351 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1352 {
1353 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1354 return;
1355 }
1356 ip = &icp->icmp_ip;
1357 }
1358
1359 icm = icmp_find_original_mbuf(pData, ip);
1360 if (icm == NULL)
1361 {
1362 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1363 return;
1364 }
1365
1366 m = icm->im_m;
1367 if (!m)
1368 {
1369 LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so));
1370 LIST_REMOVE(icm, im_list);
1371 RTMemFree(icm);
1372 return;
1373 }
1374
1375 src = addr->sin_addr.s_addr;
1376 if (type == ICMP_ECHOREPLY)
1377 {
1378 struct ip *ip0 = mtod(m, struct ip *);
1379 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1380 if (icp0->icmp_type != ICMP_ECHO)
1381 {
1382 Log(("NAT: we haven't found echo for this reply\n"));
1383 return;
1384 }
1385 /*
1386 * while combining buffer to send (see ip_icmp.c) we control ICMP header only,
1387 * IP header combined by OS network stack, our local copy of IP header contians values
1388 * in host byte order so no byte order conversion is required. IP headers fields are converting
1389 * in ip_output0 routine only.
1390 */
1391 if ( (ip->ip_len - hlen)
1392 != (ip0->ip_len - (ip0->ip_hl << 2)))
1393 {
1394 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1395 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1396 return;
1397 }
1398 }
1399
1400 /* ip points on origianal ip header */
1401 ip = mtod(m, struct ip *);
1402 proto = ip->ip_p;
1403 /* Now ip is pointing on header we've sent from guest */
1404 if ( icp->icmp_type == ICMP_TIMXCEED
1405 || icp->icmp_type == ICMP_UNREACH)
1406 {
1407 old_ip_len = (ip->ip_hl << 2) + 64;
1408 if (old_ip_len > sizeof(ip_copy))
1409 old_ip_len = sizeof(ip_copy);
1410 memcpy(ip_copy, ip, old_ip_len);
1411 }
1412
1413 /* source address from original IP packet*/
1414 dst = ip->ip_src.s_addr;
1415
1416 /* overide ther tail of old packet */
1417 ip = mtod(m, struct ip *); /* ip is from mbuf we've overrided */
1418 original_hlen = ip->ip_hl << 2;
1419 /* saves original ip header and options */
1420 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1421 ip->ip_len = m_length(m, NULL);
1422 ip->ip_p = IPPROTO_ICMP; /* the original package could be whatever, but we're response via ICMP*/
1423
1424 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1425 type = icp->icmp_type;
1426 if ( type == ICMP_TIMXCEED
1427 || type == ICMP_UNREACH)
1428 {
1429 /* according RFC 793 error messages required copy of initial IP header + 64 bit */
1430 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1431 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1432 }
1433
1434 ip->ip_src.s_addr = src;
1435 ip->ip_dst.s_addr = dst;
1436 icmp_reflect(pData, m);
1437 LIST_REMOVE(icm, im_list);
1438 pData->cIcmpCacheSize--;
1439 /* Don't call m_free here*/
1440
1441 if ( type == ICMP_TIMXCEED
1442 || type == ICMP_UNREACH)
1443 {
1444 icm->im_so->so_m = NULL;
1445 switch (proto)
1446 {
1447 case IPPROTO_UDP:
1448 /*XXX: so->so_m already freed so we shouldn't call sofree */
1449 udp_detach(pData, icm->im_so);
1450 break;
1451 case IPPROTO_TCP:
1452 /*close tcp should be here */
1453 break;
1454 default:
1455 /* do nothing */
1456 break;
1457 }
1458 }
1459 RTMemFree(icm);
1460}
1461
1462#ifdef RT_OS_WINDOWS
1463static void
1464sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1465{
1466 int len;
1467 int i;
1468 struct ip *ip;
1469 struct mbuf *m;
1470 struct icmp *icp;
1471 struct icmp_msg *icm;
1472 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1473 uint32_t src;
1474 ICMP_ECHO_REPLY *icr;
1475 int hlen = 0;
1476 int nbytes = 0;
1477 u_char code = ~0;
1478 int out_len;
1479 int size;
1480
1481 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->szIcmpBuffer);
1482 if (len < 0)
1483 {
1484 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1485 return;
1486 }
1487 if (len == 0)
1488 return; /* no error */
1489
1490 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1491 for (i = 0; i < len; ++i)
1492 {
1493 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1494 i, icr[i].Data, icr[i].DataSize));
1495 switch(icr[i].Status)
1496 {
1497 case IP_DEST_HOST_UNREACHABLE:
1498 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1499 case IP_DEST_NET_UNREACHABLE:
1500 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1501 case IP_DEST_PROT_UNREACHABLE:
1502 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1503 /* UNREACH error inject here */
1504 case IP_DEST_PORT_UNREACHABLE:
1505 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1506 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1507 so->so_m = NULL;
1508 break;
1509 case IP_SUCCESS: /* echo replied */
1510 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1511 size;
1512 size = MCLBYTES;
1513 if (out_len < MSIZE)
1514 size = MCLBYTES;
1515 else if (out_len < MCLBYTES)
1516 size = MCLBYTES;
1517 else if (out_len < MJUM9BYTES)
1518 size = MJUM9BYTES;
1519 else if (out_len < MJUM16BYTES)
1520 size = MJUM16BYTES;
1521 else
1522 AssertMsgFailed(("Unsupported size"));
1523
1524 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1525 LogFunc(("m_getjcl returns m: %p\n", m));
1526 if (m == NULL)
1527 return;
1528 m->m_len = 0;
1529 m->m_data += if_maxlinkhdr;
1530 m->m_pkthdr.header = mtod(m, void *);
1531
1532 ip = mtod(m, struct ip *);
1533 ip->ip_src.s_addr = icr[i].Address;
1534 ip->ip_p = IPPROTO_ICMP;
1535 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1536 ip->ip_hl = sizeof(struct ip) >> 2; /* requiered for icmp_reflect, no IP options */
1537 ip->ip_ttl = icr[i].Options.Ttl;
1538
1539 icp = (struct icmp *)&ip[1]; /* no options */
1540 icp->icmp_type = ICMP_ECHOREPLY;
1541 icp->icmp_code = 0;
1542 icp->icmp_id = so->so_icmp_id;
1543 icp->icmp_seq = so->so_icmp_seq;
1544
1545 icm = icmp_find_original_mbuf(pData, ip);
1546 if (icm)
1547 {
1548 /* on this branch we don't need stored variant */
1549 m_freem(pData, icm->im_m);
1550 LIST_REMOVE(icm, im_list);
1551 pData->cIcmpCacheSize--;
1552 RTMemFree(icm);
1553 }
1554
1555
1556 hlen = (ip->ip_hl << 2);
1557 Assert((hlen >= sizeof(struct ip)));
1558
1559 m->m_data += hlen + ICMP_MINLEN;
1560 if (!RT_VALID_PTR(icr[i].Data))
1561 {
1562 m_freem(pData, m);
1563 break;
1564 }
1565 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1566 m->m_data -= hlen + ICMP_MINLEN;
1567 m->m_len += hlen + ICMP_MINLEN;
1568
1569
1570 ip->ip_len = m_length(m, NULL);
1571 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1572
1573 icmp_reflect(pData, m);
1574 break;
1575 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1576
1577 ip_broken = icr[i].Data;
1578 icm = icmp_find_original_mbuf(pData, ip_broken);
1579 if (icm == NULL) {
1580 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1581 return;
1582 }
1583 m = icm->im_m;
1584 ip = mtod(m, struct ip *);
1585 Assert(((ip_broken->ip_hl >> 2) >= sizeof(struct ip)));
1586 ip->ip_ttl = icr[i].Options.Ttl;
1587 src = ip->ip_src.s_addr;
1588 ip->ip_dst.s_addr = src;
1589 ip->ip_dst.s_addr = icr[i].Address;
1590
1591 hlen = (ip->ip_hl << 2);
1592 icp = (struct icmp *)((char *)ip + hlen);
1593 ip_broken->ip_src.s_addr = src; /*it packet sent from host not from guest*/
1594
1595 m->m_len = (ip_broken->ip_hl << 2) + 64;
1596 m->m_pkthdr.header = mtod(m, void *);
1597 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1598 icmp_reflect(pData, m);
1599 /* Here is different situation from Unix world, where we can receive icmp in response on TCP/UDP */
1600 LIST_REMOVE(icm, im_list);
1601 pData->cIcmpCacheSize--;
1602 RTMemFree(icm);
1603 break;
1604 default:
1605 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1606 break;
1607 }
1608 }
1609}
1610#else /* !RT_OS_WINDOWS */
1611static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1612{
1613 struct sockaddr_in addr;
1614 socklen_t addrlen = sizeof(struct sockaddr_in);
1615 struct ip ip;
1616 char *buff;
1617 int len = 0;
1618
1619 /* 1- step: read the ip header */
1620 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1621 (struct sockaddr *)&addr, &addrlen);
1622 if ( len < 0
1623 && ( soIgnorableErrorCode(errno)
1624 || errno == ENOTCONN))
1625 {
1626 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1627 return;
1628 }
1629
1630 if ( len < sizeof(struct ip)
1631 || len < 0
1632 || len == 0)
1633 {
1634 u_char code;
1635 code = ICMP_UNREACH_PORT;
1636
1637 if (errno == EHOSTUNREACH)
1638 code = ICMP_UNREACH_HOST;
1639 else if (errno == ENETUNREACH)
1640 code = ICMP_UNREACH_NET;
1641
1642 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1643 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1644 so->so_m = NULL;
1645 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1646 return;
1647 }
1648 /* basic check of IP header */
1649 if ( ip.ip_v != IPVERSION
1650# ifndef RT_OS_DARWIN
1651 || ip.ip_p != IPPROTO_ICMP
1652# endif
1653 )
1654 {
1655 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1656 return;
1657 }
1658# ifndef RT_OS_DARWIN
1659 /* Darwin reports the IP length already in host byte order. */
1660 ip.ip_len = RT_N2H_U16(ip.ip_len);
1661# endif
1662# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1663 /* Solaris and Darwin report the payload only */
1664 ip.ip_len += (ip.ip_hl << 2);
1665# endif
1666 /* Note: ip->ip_len in host byte order (all OS) */
1667 len = ip.ip_len;
1668 buff = RTMemAlloc(len);
1669 if (buff == NULL)
1670 {
1671 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1672 return;
1673 }
1674 /* 2 - step: we're reading rest of the datagramm to the buffer */
1675 addrlen = sizeof(struct sockaddr_in);
1676 memset(&addr, 0, addrlen);
1677 len = recvfrom(so->s, buff, len, 0,
1678 (struct sockaddr *)&addr, &addrlen);
1679 if ( len < 0
1680 && ( soIgnorableErrorCode(errno)
1681 || errno == ENOTCONN))
1682 {
1683 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1684 ip.ip_len));
1685 RTMemFree(buff);
1686 return;
1687 }
1688 if ( len < 0
1689 || len == 0)
1690 {
1691 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1692 errno, len, (ip.ip_len - sizeof(struct ip))));
1693 RTMemFree(buff);
1694 return;
1695 }
1696 /* len is modified in 2nd read, when the rest of the datagramm was read */
1697 send_icmp_to_guest(pData, buff, len, &addr);
1698 RTMemFree(buff);
1699}
1700#endif /* !RT_OS_WINDOWS */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette