VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 40621

Last change on this file since 40621 was 40621, checked in by vboxsync, 13 years ago

NAT: libalias closes its own sockets, and Slirp informs libalias when it closes its socket.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 50.1 KB
1/* $Id: socket.c 40621 2012-03-26 01:18:08Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iphlpapi.h>
36#include <icmpapi.h>
37#endif
38
39#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
40/**
41 * Clone pSo for the given foreign address: either attach a freshly created host socket (fBindSocket) or share pSo's descriptor, and copy the endpoint fields over.
42 */
43struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
44{
45 struct socket *pNewSocket = NULL;
46 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
47 pNewSocket = socreate();
48 if (!pNewSocket)
49 {
50 LogFunc(("Can't create socket\n"));
51 LogFlowFunc(("Leave: NULL\n"));
52 return NULL;
53 }
54 if (fBindSocket)
55 {
56 if (udp_attach(pData, pNewSocket, 0) <= 0)
57 {
58 sofree(pData, pNewSocket);
59 LogFunc(("Can't attach fresh created socket\n"));
60 return NULL;
61 }
62 }
63 else
64 {
65 pNewSocket->so_cloneOf = (struct socket *)pSo;
66 pNewSocket->s = pSo->s;
67 insque(pData, pNewSocket, &udb);
68 }
69 pNewSocket->so_laddr = pSo->so_laddr;
70 pNewSocket->so_lport = pSo->so_lport;
71 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
72 pNewSocket->so_fport = pSo->so_fport;
73 pSo->so_cCloneCounter++;
74 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
75 return pNewSocket;
76}
77
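/** Searches the UDP socket list for an existing clone of pcSo bound to the given foreign address; returns NULL if no matching clone exists. */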
78struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
79{
80 struct socket *pSoClone = NULL;
81 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
82 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
83 {
84 if ( pSoClone->so_cloneOf
85 && pSoClone->so_cloneOf == pcSo
86 && pSoClone->so_lport == pcSo->so_lport
87 && pSoClone->so_fport == pcSo->so_fport
88 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
89 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
90 goto done;
91 }
92 pSoClone = NULL;
93done:
94 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
95 return pSoClone;
96}
97#endif
98
99#ifdef VBOX_WITH_NAT_SEND2HOME
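/** Best-effort "send to home" helper: clones the UDP socket for every configured home address, sends the buffer to each of them, and reports whether at least one send succeeded. */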
100DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
101{
102 int idxAddr;
103 int ret = 0;
104 bool fSendDone = false;
105 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
106 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
107 {
108
109 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, true /* fBindSocket (assumed) */, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr);
110 AssertReturn(pNewSocket, false);
111 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
112 /* @todo be more verbose on errors,
113 * @note we shouldn't care whether this send fails or not (we're broadcasting).
114 */
115 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
116 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
117 if (ret < 0)
118 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
119 fSendDone |= ret > 0;
120 }
121 LogFlowFunc(("Leave %RTbool\n", fSendDone));
122 return fSendDone;
123}
124#endif /* !VBOX_WITH_NAT_SEND2HOME */
125static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
126#ifdef RT_OS_WINDOWS
127static void sorecvfrom_icmp_win(PNATState, struct socket *);
128#else /* RT_OS_WINDOWS */
129static void sorecvfrom_icmp_unix(PNATState, struct socket *);
130#endif /* !RT_OS_WINDOWS */
131
132void
133so_init()
134{
135}
136
137struct socket *
138solookup(struct socket *head, struct in_addr laddr,
139 u_int lport, struct in_addr faddr, u_int fport)
140{
141 struct socket *so;
142
143 for (so = head->so_next; so != head; so = so->so_next)
144 {
145 if ( so->so_lport == lport
146 && so->so_laddr.s_addr == laddr.s_addr
147 && so->so_faddr.s_addr == faddr.s_addr
148 && so->so_fport == fport)
149 return so;
150 }
151
152 return (struct socket *)NULL;
153}
154
155/*
156 * Create a new socket, initialise the fields
157 * It is the responsibility of the caller to
158 * insque() it into the correct linked-list
159 */
160struct socket *
161socreate()
162{
163 struct socket *so;
164
165 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
166 if (so)
167 {
168 so->so_state = SS_NOFDREF;
169 so->s = -1;
170#if !defined(RT_OS_WINDOWS)
171 so->so_poll_index = -1;
172#endif
173 }
174 return so;
175}
176
177/*
178 * remque and free a socket, clobber cache
179 * VBOX_WITH_SLIRP_MT: the queue should be locked before calling sofree, because
180 * inside sofree we don't know which queue the item is being removed from.
181 */
182void
183sofree(PNATState pData, struct socket *so)
184{
185 if (so == tcp_last_so)
186 tcp_last_so = &tcb;
187 else if (so == udp_last_so)
188 udp_last_so = &udb;
189 /* libalias notification */
190 if (so->so_pvLnk)
191 slirpDeleteLinkSocket(so->so_pvLnk);
192
193 /* check that the mbuf hasn't already been freed */
194 if (so->so_m != NULL)
195 m_freem(pData, so->so_m);
196#ifndef VBOX_WITH_SLIRP_MT
197 if (so->so_next && so->so_prev)
198 {
199 remque(pData, so); /* crashes if so is not in a queue */
200 NSOCK_DEC();
201 }
202
203 RTMemFree(so);
204#else
205 so->so_deleted = 1;
206#endif
207}
208
209#ifdef VBOX_WITH_SLIRP_MT
210void
211soread_queue(PNATState pData, struct socket *so, int *ret)
212{
213 *ret = soread(pData, so);
214}
215#endif
216
217/*
218 * Read from so's socket into sb_snd, updating all relevant sbuf fields
219 * NOTE: This will only be called if it is select()ed for reading, so
220 * a read() of 0 (or less) means it's disconnected
221 */
222#ifndef VBOX_WITH_SLIRP_BSD_SBUF
223int
224soread(PNATState pData, struct socket *so)
225{
226 int n, nn, lss, total;
227 struct sbuf *sb = &so->so_snd;
228 size_t len = sb->sb_datalen - sb->sb_cc;
229 struct iovec iov[2];
230 int mss = so->so_tcpcb->t_maxseg;
231
232 STAM_PROFILE_START(&pData->StatIOread, a);
233 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
234 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
235
236 QSOCKET_LOCK(tcb);
237 SOCKET_LOCK(so);
238 QSOCKET_UNLOCK(tcb);
239
240 LogFlow(("soread: so = %R[natsock]\n", so));
241 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
242
243 /*
244 * No need to check if there's enough room to read.
245 * soread wouldn't have been called if there weren't
246 */
247
248 len = sb->sb_datalen - sb->sb_cc;
249
250 iov[0].iov_base = sb->sb_wptr;
251 iov[1].iov_base = 0;
252 iov[1].iov_len = 0;
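    /* The send sbuf is a ring buffer, so the free region may wrap around the end of sb_data; build one or two iovecs accordingly and trim them to whole-MSS multiples. */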
253 if (sb->sb_wptr < sb->sb_rptr)
254 {
255 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
256 /* Should never succeed, but... */
257 if (iov[0].iov_len > len)
258 iov[0].iov_len = len;
259 if (iov[0].iov_len > mss)
260 iov[0].iov_len -= iov[0].iov_len%mss;
261 n = 1;
262 }
263 else
264 {
265 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
266 /* Should never succeed, but... */
267 if (iov[0].iov_len > len)
268 iov[0].iov_len = len;
269 len -= iov[0].iov_len;
270 if (len)
271 {
272 iov[1].iov_base = sb->sb_data;
273 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
274 if (iov[1].iov_len > len)
275 iov[1].iov_len = len;
276 total = iov[0].iov_len + iov[1].iov_len;
277 if (total > mss)
278 {
279 lss = total % mss;
280 if (iov[1].iov_len > lss)
281 {
282 iov[1].iov_len -= lss;
283 n = 2;
284 }
285 else
286 {
287 lss -= iov[1].iov_len;
288 iov[0].iov_len -= lss;
289 n = 1;
290 }
291 }
292 else
293 n = 2;
294 }
295 else
296 {
297 if (iov[0].iov_len > mss)
298 iov[0].iov_len -= iov[0].iov_len%mss;
299 n = 1;
300 }
301 }
302
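    /* With readv() both segments are filled in a single call; otherwise only the first segment is read here and the second one is picked up further down. */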
303#ifdef HAVE_READV
304 nn = readv(so->s, (struct iovec *)iov, n);
305#else
306 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
307#endif
308 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
309 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
310 if (nn <= 0)
311 {
312 /*
313 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
314 * _could_ mean that the connection is closed. But we will receive an
315 * FD_CLOSE event later if the connection was _really_ closed. With
316 * www.youtube.com I see this very often. Closing the socket too early
317 * would be dangerous.
318 */
319 int status;
320 unsigned long pending = 0;
321 status = ioctlsocket(so->s, FIONREAD, &pending);
322 if (status < 0)
323 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
324 if (nn == 0 && (pending != 0))
325 {
326 SOCKET_UNLOCK(so);
327 STAM_PROFILE_STOP(&pData->StatIOread, a);
328 return 0;
329 }
330 if ( nn < 0
331 && ( errno == EINTR
332 || errno == EAGAIN
333 || errno == EWOULDBLOCK))
334 {
335 SOCKET_UNLOCK(so);
336 STAM_PROFILE_STOP(&pData->StatIOread, a);
337 return 0;
338 }
339 else
340 {
341 /* nn == 0 means peer has performed an orderly shutdown */
342 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
343 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
344 sofcantrcvmore(so);
345 tcp_sockclosed(pData, sototcpcb(so));
346 SOCKET_UNLOCK(so);
347 STAM_PROFILE_STOP(&pData->StatIOread, a);
348 return -1;
349 }
350 }
351 STAM_STATS(
352 if (n == 1)
353 {
354 STAM_COUNTER_INC(&pData->StatIORead_in_1);
355 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
356 }
357 else
358 {
359 STAM_COUNTER_INC(&pData->StatIORead_in_2);
360 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
361 }
362 );
363
364#ifndef HAVE_READV
365 /*
366 * If there was no error, try and read the second time round
367 * We read again if n = 2 (ie, there's another part of the buffer)
368 * and we read as much as we could in the first read
369 * We don't test for <= 0 this time, because there legitimately
370 * might not be any more data (since the socket is non-blocking),
371 * a close will be detected on next iteration.
372 * A return of -1 won't (shouldn't) happen, since it didn't happen above
373 */
374 if (n == 2 && nn == iov[0].iov_len)
375 {
376 int ret;
377 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
378 if (ret > 0)
379 nn += ret;
380 STAM_STATS(
381 if (ret > 0)
382 {
383 STAM_COUNTER_INC(&pData->StatIORead_in_2);
384 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
385 }
386 );
387 }
388
389 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
390#endif
391
392 /* Update fields */
393 sb->sb_cc += nn;
394 sb->sb_wptr += nn;
395 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
396 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
397 {
398 sb->sb_wptr -= sb->sb_datalen;
399 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
400 }
401 STAM_PROFILE_STOP(&pData->StatIOread, a);
402 SOCKET_UNLOCK(so);
403 return nn;
404}
405#else /* VBOX_WITH_SLIRP_BSD_SBUF */
406int
407soread(PNATState pData, struct socket *so)
408{
409 int n;
410 char *buf;
411 struct sbuf *sb = &so->so_snd;
412 size_t len = sbspace(sb);
413 int mss = so->so_tcpcb->t_maxseg;
414
415 STAM_PROFILE_START(&pData->StatIOread, a);
416 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
417 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
418
419 QSOCKET_LOCK(tcb);
420 SOCKET_LOCK(so);
421 QSOCKET_UNLOCK(tcb);
422
423 LogFlowFunc(("soread: so = %lx\n", (long)so));
424
425 if (len > mss)
426 len -= len % mss;
427 buf = RTMemAlloc(len);
428 if (buf == NULL)
429 {
430 Log(("NAT: can't alloc enough memory\n"));
431 return -1;
432 }
433
434 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
435 if (n <= 0)
436 {
437 /*
438 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
439 * _could_ mean that the connection is closed. But we will receive an
440 * FD_CLOSE event later if the connection was _really_ closed. With
441 * www.youtube.com I see this very often. Closing the socket too early
442 * would be dangerous.
443 */
444 int status;
445 unsigned long pending = 0;
446 status = ioctlsocket(so->s, FIONREAD, &pending);
447 if (status < 0)
448 Log(("NAT:error in WSAIoctl: %d\n", errno));
449 if (n == 0 && (pending != 0))
450 {
451 SOCKET_UNLOCK(so);
452 STAM_PROFILE_STOP(&pData->StatIOread, a);
453 RTMemFree(buf);
454 return 0;
455 }
456 if ( n < 0
457 && ( errno == EINTR
458 || errno == EAGAIN
459 || errno == EWOULDBLOCK))
460 {
461 SOCKET_UNLOCK(so);
462 STAM_PROFILE_STOP(&pData->StatIOread, a);
463 RTMemFree(buf);
464 return 0;
465 }
466 else
467 {
468 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
469 n, errno, strerror(errno)));
470 sofcantrcvmore(so);
471 tcp_sockclosed(pData, sototcpcb(so));
472 SOCKET_UNLOCK(so);
473 STAM_PROFILE_STOP(&pData->StatIOread, a);
474 RTMemFree(buf);
475 return -1;
476 }
477 }
478
479 sbuf_bcat(sb, buf, n);
480 RTMemFree(buf);
481 return n;
482}
483#endif
484
485/*
486 * Get urgent data
487 *
488 * When the socket is created, we set it SO_OOBINLINE,
489 * so when OOB data arrives, we soread() it and everything
490 * in the send buffer is sent as urgent data
491 */
492void
493sorecvoob(PNATState pData, struct socket *so)
494{
495 struct tcpcb *tp = sototcpcb(so);
496 ssize_t ret;
497
498 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
499
500 /*
501 * We take a guess at how much urgent data has arrived.
502 * In most situations, when urgent data arrives, the next
503 * read() should get all the urgent data. This guess will
504 * be wrong however if more data arrives just after the
505 * urgent data, or the read() doesn't return all the
506 * urgent data.
507 */
508 ret = soread(pData, so);
509 if (RT_LIKELY(ret > 0))
510 {
511 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
512 tp->t_force = 1;
513 tcp_output(pData, tp);
514 tp->t_force = 0;
515 }
516}
517#ifndef VBOX_WITH_SLIRP_BSD_SBUF
518/*
519 * Send urgent data
520 * There's a lot duplicated code here, but...
521 */
522int
523sosendoob(struct socket *so)
524{
525 struct sbuf *sb = &so->so_rcv;
526 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
527
528 int n, len;
529
530 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
531
532 if (so->so_urgc > sizeof(buff))
533 so->so_urgc = sizeof(buff); /* XXX */
534
535 if (sb->sb_rptr < sb->sb_wptr)
536 {
537 /* We can send it directly */
538 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
539 so->so_urgc -= n;
540
541 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
542 n, so->so_urgc));
543 }
544 else
545 {
546 /*
547 * Since there's no sendv or sendtov like writev,
548 * we must copy all data to a linear buffer then
549 * send it all
550 */
551 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
552 if (len > so->so_urgc)
553 len = so->so_urgc;
554 memcpy(buff, sb->sb_rptr, len);
555 so->so_urgc -= len;
556 if (so->so_urgc)
557 {
558 n = sb->sb_wptr - sb->sb_data;
559 if (n > so->so_urgc)
560 n = so->so_urgc;
561 memcpy(buff + len, sb->sb_data, n);
562 so->so_urgc -= n;
563 len += n;
564 }
565 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
566#ifdef DEBUG
567 if (n != len)
568 Log(("Didn't send all data urgently XXXXX\n"));
569#endif
570 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
571 n, so->so_urgc));
572 }
573
574 sb->sb_cc -= n;
575 sb->sb_rptr += n;
576 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
577 sb->sb_rptr -= sb->sb_datalen;
578
579 return n;
580}
581
582/*
583 * Write data from so_rcv to so's socket,
584 * updating all sbuf fields as necessary
585 */
586int
587sowrite(PNATState pData, struct socket *so)
588{
589 int n, nn;
590 struct sbuf *sb = &so->so_rcv;
591 size_t len = sb->sb_cc;
592 struct iovec iov[2];
593
594 STAM_PROFILE_START(&pData->StatIOwrite, a);
595 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
596 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
597 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
598 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
599 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
600 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
601 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
602 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
603 LogFlowFunc(("so = %R[natsock]\n", so));
604 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
605 QSOCKET_LOCK(tcb);
606 SOCKET_LOCK(so);
607 QSOCKET_UNLOCK(tcb);
608 if (so->so_urgc)
609 {
610 sosendoob(so);
611 if (sb->sb_cc == 0)
612 {
613 SOCKET_UNLOCK(so);
614 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
615 return 0;
616 }
617 }
618
619 /*
620 * No need to check if there's something to write,
621 * sowrite wouldn't have been called otherwise
622 */
623
624 len = sb->sb_cc;
625
626 iov[0].iov_base = sb->sb_rptr;
627 iov[1].iov_base = 0;
628 iov[1].iov_len = 0;
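    /* so_rcv is likewise a ring buffer: the pending data may wrap past the end of sb_data, in which case two iovecs are needed. */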
629 if (sb->sb_rptr < sb->sb_wptr)
630 {
631 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
632 /* Should never succeed, but... */
633 if (iov[0].iov_len > len)
634 iov[0].iov_len = len;
635 n = 1;
636 }
637 else
638 {
639 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
640 if (iov[0].iov_len > len)
641 iov[0].iov_len = len;
642 len -= iov[0].iov_len;
643 if (len)
644 {
645 iov[1].iov_base = sb->sb_data;
646 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
647 if (iov[1].iov_len > len)
648 iov[1].iov_len = len;
649 n = 2;
650 }
651 else
652 n = 1;
653 }
654 STAM_STATS({
655 if (n == 1)
656 {
657 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
658 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
659 }
660 else
661 {
662 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
663 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
664 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
665 }
666 });
667 /* Check if there's urgent data to send, and if so, send it */
668#ifdef HAVE_READV
669 nn = writev(so->s, (const struct iovec *)iov, n);
670#else
671 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
672#endif
673 Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
674 /* This should never happen, but people tell me it does *shrug* */
675 if ( nn < 0
676 && ( errno == EAGAIN
677 || errno == EINTR
678 || errno == EWOULDBLOCK))
679 {
680 SOCKET_UNLOCK(so);
681 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
682 return 0;
683 }
684
685 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
686 {
687 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
688 __PRETTY_FUNCTION__, so->so_state, errno));
689 sofcantsendmore(so);
690 tcp_sockclosed(pData, sototcpcb(so));
691 SOCKET_UNLOCK(so);
692 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
693 return -1;
694 }
695
696#ifndef HAVE_READV
697 if (n == 2 && nn == iov[0].iov_len)
698 {
699 int ret;
700 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
701 if (ret > 0)
702 nn += ret;
703 STAM_STATS({
704 if (ret > 0 && ret != iov[1].iov_len)
705 {
706 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
707 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
708 }
709 });
710 }
711 Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
712#endif
713
714 /* Update sbuf */
715 sb->sb_cc -= nn;
716 sb->sb_rptr += nn;
717 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
718 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
719 {
720 sb->sb_rptr -= sb->sb_datalen;
721 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
722 }
723
724 /*
725 * If in DRAIN mode, and there's no more data, set
726 * it CANTSENDMORE
727 */
728 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
729 sofcantsendmore(so);
730
731 SOCKET_UNLOCK(so);
732 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
733 return nn;
734}
735#else /* VBOX_WITH_SLIRP_BSD_SBUF */
736static int
737do_sosend(struct socket *so, int fUrg)
738{
739 struct sbuf *sb = &so->so_rcv;
740
741 int n, len;
742
743 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
744
745 len = sbuf_len(sb);
746
747 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
748 if (n < 0)
749 Log(("NAT: Can't sent sbuf via socket.\n"));
750 if (fUrg)
751 so->so_urgc -= n;
752 if (n > 0 && n < len)
753 {
754 char *ptr;
755 char *buff;
756 buff = RTMemAlloc(len);
757 if (buff == NULL)
758 {
759 Log(("NAT: No space to allocate temporal buffer\n"));
760 return -1;
761 }
762 ptr = sbuf_data(sb);
763 memcpy(buff, &ptr[n], len - n);
764 sbuf_bcpy(sb, buff, len - n);
765 RTMemFree(buff);
766 return n;
767 }
768 sbuf_clear(sb);
769 return n;
770}
771int
772sosendoob(struct socket *so)
773{
774 return do_sosend(so, 1);
775}
776
777/*
778 * Write data from so_rcv to so's socket,
779 * updating all sbuf fields as necessary
780 */
781int
782sowrite(PNATState pData, struct socket *so)
783{
784 return do_sosend(so, 0);
785}
786#endif
787
788/*
789 * recvfrom() a UDP socket
790 */
791void
792sorecvfrom(PNATState pData, struct socket *so)
793{
794 ssize_t ret = 0;
795 struct sockaddr_in addr;
796 socklen_t addrlen = sizeof(struct sockaddr_in);
797
798 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
799
800 if (so->so_type == IPPROTO_ICMP)
801 {
802 /* This is a "ping" reply */
803#ifdef RT_OS_WINDOWS
804 sorecvfrom_icmp_win(pData, so);
805#else /* RT_OS_WINDOWS */
806 sorecvfrom_icmp_unix(pData, so);
807#endif /* !RT_OS_WINDOWS */
808 udp_detach(pData, so);
809 }
810 else
811 {
812 /* A "normal" UDP packet */
813 struct mbuf *m;
814 ssize_t len;
815 u_long n = 0;
816 int rc = 0;
817 static int signalled = 0;
818 char *pchBuffer = NULL;
819 bool fWithTemporalBuffer = false;
820
821 QSOCKET_LOCK(udb);
822 SOCKET_LOCK(so);
823 QSOCKET_UNLOCK(udb);
824
825 /* How much data has been received? */
826 /*
827 * 1. calculate how much we can read
828 * 2. read as much as possible
829 * 3. attach buffer to allocated header mbuf
830 */
831 rc = ioctlsocket(so->s, FIONREAD, &n);
832 if (rc == -1)
833 {
834 if ( errno == EAGAIN
835 || errno == EWOULDBLOCK
836 || errno == EINPROGRESS
837 || errno == ENOTCONN)
838 return;
839 else if (signalled == 0)
840 {
841 LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
842 signalled = 1;
843 }
844 return;
845 }
846
847 len = sizeof(struct udpiphdr);
848 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
849 if (m == NULL)
850 return;
851
852 len += n;
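        /* Leave room in front of the payload for the Ethernet and UDP/IP headers that are prepended later on the way to the guest. */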
853 m->m_data += ETH_HLEN;
854 m->m_pkthdr.header = mtod(m, void *);
855 m->m_data += sizeof(struct udpiphdr);
856
857 pchBuffer = mtod(m, char *);
858 fWithTemporalBuffer = false;
859 /*
860 * Even if the amount of data on the socket is greater than the MTU, Slirp
861 * is able to fragment it later; however, if it does not fit into the mbuf
862 * we read it into a temporary buffer here.
863 */
864 if (n > (slirp_size(pData) - sizeof(struct udpiphdr)))
865 {
866 pchBuffer = RTMemAlloc((n) * sizeof(char));
867 if (!pchBuffer)
868 {
869 m_freem(pData, m);
870 return;
871 }
872 fWithTemporalBuffer = true;
873 }
874 ret = recvfrom(so->s, pchBuffer, n, 0,
875 (struct sockaddr *)&addr, &addrlen);
876 if (fWithTemporalBuffer)
877 {
878 if (ret > 0)
879 {
880 m_copyback(pData, m, 0, ret, pchBuffer);
881 /*
882 * If the comparison below holds, our size prediction failed;
883 * that's not fatal, we merely allocated for nothing. (@todo add a counter
884 * here to see how rarely this happens)
885 */
886 if (ret < slirp_size(pData) && !m->m_next)
887 Log(("NAT:udp: expected size (%d) exceeds the real size (%d), which is less than the minimal mbuf size (%d)\n",
888 n, ret, slirp_size(pData)));
889 }
890 /* we're freeing buffer anyway */
891 RTMemFree(pchBuffer);
892 }
893 else
894 m->m_len = ret;
895
896 if (ret < 0)
897 {
898 u_char code = ICMP_UNREACH_PORT;
899
900 if (errno == EHOSTUNREACH)
901 code = ICMP_UNREACH_HOST;
902 else if (errno == ENETUNREACH)
903 code = ICMP_UNREACH_NET;
904
905 m_freem(pData, m);
906 if ( errno == EAGAIN
907 || errno == EWOULDBLOCK
908 || errno == EINPROGRESS
909 || errno == ENOTCONN)
910 {
911 return;
912 }
913
914 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
915 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
916 so->so_m = NULL;
917 }
918 else
919 {
920 Assert((m_length(m,NULL) == ret));
921 /*
922 * Hack: domain name lookup will be used the most for UDP,
923 * and since they'll only be used once there's no need
924 * for the 4 minute (or whatever) timeout... So we time them
925 * out much quicker (10 seconds for now...)
926 */
927 if (so->so_expire)
928 {
929 if (so->so_fport != RT_H2N_U16_C(53))
930 so->so_expire = curtime + SO_EXPIRE;
931 }
932 /*
933 * The last argument should be changed if Slirp ever injects IP attributes.
934 * Note: here we can't check whether dnsproxy sent the initial request.
935 */
936 if ( pData->fUseDnsProxy
937 && so->so_fport == RT_H2N_U16_C(53))
938 dnsproxy_answer(pData, so, m);
939
940#if 0
941 if (m->m_len == len)
942 {
943 m_inc(m, MINCSIZE);
944 m->m_len = 0;
945 }
946#endif
947
948 /* the packet will definitely be fragmented, which could confuse the receiving peer. */
949 if (m_length(m, NULL) > if_mtu)
950 m->m_flags |= M_SKIP_FIREWALL;
951 /*
952 * If this packet was destined for CTL_ADDR,
953 * make it look like that's where it came from, done by udp_output
954 */
955 udp_output(pData, so, m, &addr);
956 SOCKET_UNLOCK(so);
957 } /* rx error */
958 } /* if ping packet */
959}
960
961/*
962 * sendto() a socket
963 */
964int
965sosendto(PNATState pData, struct socket *so, struct mbuf *m)
966{
967 int ret;
968 struct sockaddr_in *paddr;
969 struct sockaddr addr;
970#if 0
971 struct sockaddr_in host_addr;
972#endif
973 caddr_t buf = 0;
974 int mlen;
975
976 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
977
978 memset(&addr, 0, sizeof(struct sockaddr));
979#ifdef RT_OS_DARWIN
980 addr.sa_len = sizeof(struct sockaddr_in);
981#endif
982 paddr = (struct sockaddr_in *)&addr;
983 paddr->sin_family = AF_INET;
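    /* Destinations inside the special NAT network are remapped: the network's broadcast address becomes INADDR_BROADCAST, everything else (CTL_DNS, CTL_ALIAS, ...) is redirected to the host loopback. */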
984 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
985 {
986 /* It's an alias */
987 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
988 switch(last_byte)
989 {
990#if 0
991 /* handle this case at 'default:' */
992 case CTL_BROADCAST:
993 addr.sin_addr.s_addr = INADDR_BROADCAST;
994 /* Send the packet to host to fully emulate broadcast */
995 /** @todo r=klaus: on Linux host this causes the host to receive
996 * the packet twice for some reason. And I cannot find any place
997 * in the man pages which states that sending a broadcast does not
998 * reach the host itself. */
999 host_addr.sin_family = AF_INET;
1000 host_addr.sin_port = so->so_fport;
1001 host_addr.sin_addr = our_addr;
1002 sendto(so->s, m->m_data, m->m_len, 0,
1003 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
1004 break;
1005#endif
1006 case CTL_DNS:
1007 case CTL_ALIAS:
1008 default:
1009 if (last_byte == ~pData->netmask)
1010 paddr->sin_addr.s_addr = INADDR_BROADCAST;
1011 else
1012 paddr->sin_addr = loopback_addr;
1013 break;
1014 }
1015 }
1016 else
1017 paddr->sin_addr = so->so_faddr;
1018 paddr->sin_port = so->so_fport;
1019
1020 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1021 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1022
1023 /* Don't care what port we get */
1024 /*
1025 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1026 * generates bodyless messages, which annoy the memory management system.
1027 */
1028 mlen = m_length(m, NULL);
1029 if (mlen > 0)
1030 {
1031 buf = RTMemAlloc(mlen);
1032 if (buf == NULL)
1033 {
1034 return -1;
1035 }
1036 m_copydata(m, 0, mlen, buf);
1037 }
1038 ret = sendto(so->s, buf, mlen, 0,
1039 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1040#ifdef VBOX_WITH_NAT_SEND2HOME
1041 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1042 {
1043 slirpSend2Home(pData, so, buf, mlen, 0);
1044 }
1045#endif
1046 if (buf)
1047 RTMemFree(buf);
1048 if (ret < 0)
1049 {
1050 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1051 return -1;
1052 }
1053
1054 /*
1055 * Kill the socket if there's no reply in 4 minutes,
1056 * but only if it's an expirable socket
1057 */
1058 if (so->so_expire)
1059 so->so_expire = curtime + SO_EXPIRE;
1060 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1061 return 0;
1062}
1063
1064/*
1065 * XXX This should really be tcp_listen
1066 */
1067struct socket *
1068solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
1069{
1070 struct sockaddr_in addr;
1071 struct socket *so;
1072 socklen_t addrlen = sizeof(addr);
1073 int s, opt = 1;
1074 int status;
1075
1076 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1077
1078 if ((so = socreate()) == NULL)
1079 {
1080 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1081 return NULL;
1082 }
1083
1084 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1085 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1086 {
1087 RTMemFree(so);
1088 return NULL;
1089 }
1090
1091 SOCKET_LOCK_CREATE(so);
1092 SOCKET_LOCK(so);
1093 QSOCKET_LOCK(tcb);
1094 insque(pData, so,&tcb);
1095 NSOCK_INC();
1096 QSOCKET_UNLOCK(tcb);
1097
1098 /*
1099 * SS_FACCEPTONCE sockets must time out.
1100 */
1101 if (flags & SS_FACCEPTONCE)
1102 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1103
1104 so->so_state = (SS_FACCEPTCONN|flags);
1105 so->so_lport = lport; /* Kept in network format */
1106 so->so_laddr.s_addr = laddr; /* Ditto */
1107
1108 memset(&addr, 0, sizeof(addr));
1109#ifdef RT_OS_DARWIN
1110 addr.sin_len = sizeof(addr);
1111#endif
1112 addr.sin_family = AF_INET;
1113 addr.sin_addr.s_addr = bind_addr;
1114 addr.sin_port = port;
1115
1116 /**
1117 * Changing listen(, 1 -> SOMAXCONN) shouldn't be harmful for the NAT TCP/IP stack;
1118 * the kernel will choose the optimal value for the request queue length.
1119 * @note MSDN recommends low (2-4) values for Bluetooth networking devices.
1120 */
1121 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1122 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1123 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1124 || (listen(s, pData->soMaxConn) < 0))
1125 {
1126#ifdef RT_OS_WINDOWS
1127 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1128 closesocket(s);
1129 QSOCKET_LOCK(tcb);
1130 sofree(pData, so);
1131 QSOCKET_UNLOCK(tcb);
1132 /* Restore the real errno */
1133 WSASetLastError(tmperrno);
1134#else
1135 int tmperrno = errno; /* Don't clobber the real reason we failed */
1136 close(s);
1137 QSOCKET_LOCK(tcb);
1138 sofree(pData, so);
1139 QSOCKET_UNLOCK(tcb);
1140 /* Restore the real errno */
1141 errno = tmperrno;
1142#endif
1143 return NULL;
1144 }
1145 fd_nonblock(s);
1146 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
1147
1148 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1149 so->so_fport = addr.sin_port;
1150 /* set socket buffers */
1151 opt = pData->socket_rcv;
1152 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1153 if (status < 0)
1154 {
1155 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1156 goto no_sockopt;
1157 }
1158 opt = pData->socket_snd;
1159 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1160 if (status < 0)
1161 {
1162 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1163 goto no_sockopt;
1164 }
1165no_sockopt:
1166 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1167 so->so_faddr = alias_addr;
1168 else
1169 so->so_faddr = addr.sin_addr;
1170
1171 so->s = s;
1172 SOCKET_UNLOCK(so);
1173 return so;
1174}
1175
1176/*
1177 * Data is available in so_rcv
1178 * Just write() the data to the socket
1179 * XXX not yet...
1180 * @todo do we really need this function, and what is it intended to do?
1181 */
1182void
1183sorwakeup(struct socket *so)
1184{
1185 NOREF(so);
1186#if 0
1187 sowrite(so);
1188 FD_CLR(so->s,&writefds);
1189#endif
1190}
1191
1192/*
1193 * Data has been freed in so_snd
1194 * We have room for a read() if we want to
1195 * For now, don't read, it'll be done in the main loop
1196 */
1197void
1198sowwakeup(struct socket *so)
1199{
1200 NOREF(so);
1201}
1202
1203/*
1204 * Various session state calls
1205 * XXX Should be #define's
1206 * The socket state stuff needs work, these often get called 2 or 3
1207 * times each when only 1 was needed
1208 */
1209void
1210soisfconnecting(struct socket *so)
1211{
1212 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1213 SS_FCANTSENDMORE|SS_FWDRAIN);
1214 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1215}
1216
1217void
1218soisfconnected(struct socket *so)
1219{
1220 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1221 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1222 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1223 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1224}
1225
1226void
1227sofcantrcvmore(struct socket *so)
1228{
1229 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1230 if ((so->so_state & SS_NOFDREF) == 0)
1231 {
1232 shutdown(so->s, 0);
1233 }
1234 so->so_state &= ~(SS_ISFCONNECTING);
1235 if (so->so_state & SS_FCANTSENDMORE)
1236 so->so_state = SS_NOFDREF; /* Don't select it */
1237 /* XXX close() here as well? */
1238 else
1239 so->so_state |= SS_FCANTRCVMORE;
1240 LogFlowFuncLeave();
1241}
1242
1243void
1244sofcantsendmore(struct socket *so)
1245{
1246 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1247 if ((so->so_state & SS_NOFDREF) == 0)
1248 shutdown(so->s, 1); /* send FIN to fhost */
1249
1250 so->so_state &= ~(SS_ISFCONNECTING);
1251 if (so->so_state & SS_FCANTRCVMORE)
1252 so->so_state = SS_NOFDREF; /* as above */
1253 else
1254 so->so_state |= SS_FCANTSENDMORE;
1255 LogFlowFuncLeave();
1256}
1257
1258void
1259soisfdisconnected(struct socket *so)
1260{
1261 NOREF(so);
1262#if 0
1263 so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
1264 close(so->s);
1265 so->so_state = SS_ISFDISCONNECTED;
1266 /*
1267 * XXX Do nothing ... ?
1268 */
1269#endif
1270}
1271
1272/*
1273 * Set write drain mode
1274 * Set CANTSENDMORE once all data has been write()n
1275 */
1276void
1277sofwdrain(struct socket *so)
1278{
1279 if (SBUF_LEN(&so->so_rcv))
1280 so->so_state |= SS_FWDRAIN;
1281 else
1282 sofcantsendmore(so);
1283}
1284
1285static void
1286send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1287{
1288 struct ip *ip;
1289 uint32_t dst, src;
1290 char ip_copy[256];
1291 struct icmp *icp;
1292 int old_ip_len = 0;
1293 int hlen, original_hlen = 0;
1294 struct mbuf *m;
1295 struct icmp_msg *icm;
1296 uint8_t proto;
1297 int type = 0;
1298
1299 ip = (struct ip *)buff;
1300 /* Fix ip->ip_len to contain the total packet length including the header
1301 * in _host_ byte order for all OSes. On Darwin, that value already is in
1302 * host byte order. Solaris and Darwin report only the payload. */
1303#ifndef RT_OS_DARWIN
1304 ip->ip_len = RT_N2H_U16(ip->ip_len);
1305#endif
1306 hlen = (ip->ip_hl << 2);
1307#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1308 ip->ip_len += hlen;
1309#endif
1310 if (ip->ip_len < hlen + ICMP_MINLEN)
1311 {
1312 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1313 return;
1314 }
1315 icp = (struct icmp *)((char *)ip + hlen);
1316
1317 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1318 if ( icp->icmp_type != ICMP_ECHOREPLY
1319 && icp->icmp_type != ICMP_TIMXCEED
1320 && icp->icmp_type != ICMP_UNREACH)
1321 {
1322 return;
1323 }
1324
1325 /*
1326 * The minimal header size for ICMP_ECHOREPLY, ICMP_TIMXCEED and ICMP_UNREACH
1327 * is that of ICMP_ECHOREPLY with no data:
1328 * icmp_{type(8), code(8), cksum(16), identifier(16), seqnum(16)}
1329 */
1330 if (ip->ip_len < hlen + 8)
1331 {
1332 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1333 return;
1334 }
1335
1336 type = icp->icmp_type;
1337 if ( type == ICMP_TIMXCEED
1338 || type == ICMP_UNREACH)
1339 {
1340 /*
1341 * The minimal header size for ICMP_TIMXCEED and ICMP_UNREACH is
1342 * icmp_{type(8), code(8), cksum(16), unused(32)} + IP header + 64 bits of the original datagram
1343 */
1344 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1345 {
1346 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1347 return;
1348 }
1349 ip = &icp->icmp_ip;
1350 }
1351
1352 icm = icmp_find_original_mbuf(pData, ip);
1353 if (icm == NULL)
1354 {
1355 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1356 return;
1357 }
1358
1359 m = icm->im_m;
1360 Assert(m != NULL);
1361
1362 src = addr->sin_addr.s_addr;
1363 if (type == ICMP_ECHOREPLY)
1364 {
1365 struct ip *ip0 = mtod(m, struct ip *);
1366 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1367 if (icp0->icmp_type != ICMP_ECHO)
1368 {
1369 Log(("NAT: we haven't found echo for this reply\n"));
1370 return;
1371 }
1372 /*
1373 * While combining the buffer to send (see ip_icmp.c) we control the ICMP header only;
1374 * the IP header is assembled by the OS network stack. Our local copy of the IP header
1375 * contains values in host byte order, so no byte order conversion is required; IP header
1376 * fields are converted in the ip_output0 routine only.
1377 */
1378 if ( (ip->ip_len - hlen)
1379 != (ip0->ip_len - (ip0->ip_hl << 2)))
1380 {
1381 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1382 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1383 return;
1384 }
1385 }
1386
1387 /* ip points at the original IP header */
1388 ip = mtod(m, struct ip *);
1389 proto = ip->ip_p;
1390 /* Now ip points at the header we sent from the guest */
1391 if ( icp->icmp_type == ICMP_TIMXCEED
1392 || icp->icmp_type == ICMP_UNREACH)
1393 {
1394 old_ip_len = (ip->ip_hl << 2) + 64;
1395 if (old_ip_len > sizeof(ip_copy))
1396 old_ip_len = sizeof(ip_copy);
1397 memcpy(ip_copy, ip, old_ip_len);
1398 }
1399
1400 /* source address from original IP packet*/
1401 dst = ip->ip_src.s_addr;
1402
1403 /* override the tail of the old packet */
1404 ip = mtod(m, struct ip *); /* ip is from the mbuf we've overridden */
1405 original_hlen = ip->ip_hl << 2;
1406 /* keep the original IP header and options; the received ICMP payload is copied in after them */
1407 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1408 ip->ip_len = m_length(m, NULL);
1409 ip->ip_p = IPPROTO_ICMP; /* the original packet could be anything, but we respond via ICMP */
1410
1411 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1412 type = icp->icmp_type;
1413 if ( type == ICMP_TIMXCEED
1414 || type == ICMP_UNREACH)
1415 {
1416 /* according to RFC 792, error messages require a copy of the initial IP header + 64 bits */
1417 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1418 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1419 }
1420
1421 ip->ip_src.s_addr = src;
1422 ip->ip_dst.s_addr = dst;
1423 icmp_reflect(pData, m);
1424 LIST_REMOVE(icm, im_list);
1425 pData->cIcmpCacheSize--;
1426 /* Don't call m_free here*/
1427
1428 if ( type == ICMP_TIMXCEED
1429 || type == ICMP_UNREACH)
1430 {
1431 icm->im_so->so_m = NULL;
1432 switch (proto)
1433 {
1434 case IPPROTO_UDP:
1435 /*XXX: so->so_m already freed so we shouldn't call sofree */
1436 udp_detach(pData, icm->im_so);
1437 break;
1438 case IPPROTO_TCP:
1439 /* closing the TCP socket should happen here */
1440 break;
1441 default:
1442 /* do nothing */
1443 break;
1444 }
1445 }
1446 RTMemFree(icm);
1447}
1448
1449#ifdef RT_OS_WINDOWS
1450static void
1451sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1452{
1453 int len;
1454 int i;
1455 struct ip *ip;
1456 struct mbuf *m;
1457 struct icmp *icp;
1458 struct icmp_msg *icm;
1459 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1460 uint32_t src;
1461 ICMP_ECHO_REPLY *icr;
1462 int hlen = 0;
1463 int nbytes = 0;
1464 u_char code = ~0;
1465 int out_len;
1466 int size;
1467
1468 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->szIcmpBuffer);
1469 if (len < 0)
1470 {
1471 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1472 return;
1473 }
1474 if (len == 0)
1475 return; /* no error */
1476
1477 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
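    /* Translate every reply parsed out of the ICMP buffer into a packet the guest can see. */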
1478 for (i = 0; i < len; ++i)
1479 {
1480 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1481 i, icr[i].Data, icr[i].DataSize));
1482 switch(icr[i].Status)
1483 {
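            /* The *_UNREACHABLE cases deliberately fall through: the first match picks the ICMP code and IP_DEST_PORT_UNREACHABLE finally injects the error into the guest. */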
1484 case IP_DEST_HOST_UNREACHABLE:
1485 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1486 case IP_DEST_NET_UNREACHABLE:
1487 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1488 case IP_DEST_PROT_UNREACHABLE:
1489 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1490 /* UNREACH error inject here */
1491 case IP_DEST_PORT_UNREACHABLE:
1492 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1493 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1494 so->so_m = NULL;
1495 break;
1496 case IP_SUCCESS: /* echo replied */
1497 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1498 size;
1499 size = MCLBYTES;
1500 if (out_len < MSIZE)
1501 size = MCLBYTES;
1502 else if (out_len < MCLBYTES)
1503 size = MCLBYTES;
1504 else if (out_len < MJUM9BYTES)
1505 size = MJUM9BYTES;
1506 else if (out_len < MJUM16BYTES)
1507 size = MJUM16BYTES;
1508 else
1509 AssertMsgFailed(("Unsupported size"));
1510
1511 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1512 LogFunc(("m_getjcl returns m: %p\n", m));
1513 if (m == NULL)
1514 return;
1515 m->m_len = 0;
1516 m->m_data += if_maxlinkhdr;
1517 m->m_pkthdr.header = mtod(m, void *);
1518
1519 ip = mtod(m, struct ip *);
1520 ip->ip_src.s_addr = icr[i].Address;
1521 ip->ip_p = IPPROTO_ICMP;
1522 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1523 ip->ip_hl = sizeof(struct ip) >> 2; /* required for icmp_reflect, no IP options */
1524 ip->ip_ttl = icr[i].Options.Ttl;
1525
1526 icp = (struct icmp *)&ip[1]; /* no options */
1527 icp->icmp_type = ICMP_ECHOREPLY;
1528 icp->icmp_code = 0;
1529 icp->icmp_id = so->so_icmp_id;
1530 icp->icmp_seq = so->so_icmp_seq;
1531
1532 icm = icmp_find_original_mbuf(pData, ip);
1533 if (icm)
1534 {
1535 /* on this branch we don't need the stored variant */
1536 m_freem(pData, icm->im_m);
1537 LIST_REMOVE(icm, im_list);
1538 pData->cIcmpCacheSize--;
1539 RTMemFree(icm);
1540 }
1541
1542
1543 hlen = (ip->ip_hl << 2);
1544 Assert((hlen >= sizeof(struct ip)));
1545
1546 m->m_data += hlen + ICMP_MINLEN;
1547 if (!RT_VALID_PTR(icr[i].Data))
1548 {
1549 m_freem(pData, m);
1550 break;
1551 }
1552 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1553 m->m_data -= hlen + ICMP_MINLEN;
1554 m->m_len += hlen + ICMP_MINLEN;
1555
1556
1557 ip->ip_len = m_length(m, NULL);
1558 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1559
1560 icmp_reflect(pData, m);
1561 break;
1562 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1563
1564 ip_broken = icr[i].Data;
1565 icm = icmp_find_original_mbuf(pData, ip_broken);
1566 if (icm == NULL) {
1567 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1568 return;
1569 }
1570 m = icm->im_m;
1571 ip = mtod(m, struct ip *);
1572 Assert((ip_broken->ip_hl << 2) >= sizeof(struct ip)); /* header length is in 32-bit words; '<<' assumed to be the intent */
1573 ip->ip_ttl = icr[i].Options.Ttl;
1574 src = ip->ip_src.s_addr;
1575 ip->ip_dst.s_addr = src;
1576 ip->ip_dst.s_addr = icr[i].Address;
1577
1578 hlen = (ip->ip_hl << 2);
1579 icp = (struct icmp *)((char *)ip + hlen);
1580 ip_broken->ip_src.s_addr = src; /* the packet was sent from the host, not from the guest */
1581
1582 m->m_len = (ip_broken->ip_hl << 2) + 64;
1583 m->m_pkthdr.header = mtod(m, void *);
1584 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1585 icmp_reflect(pData, m);
1586 /* This differs from the Unix case, where we can receive ICMP in response to TCP/UDP */
1587 LIST_REMOVE(icm, im_list);
1588 pData->cIcmpCacheSize--;
1589 RTMemFree(icm);
1590 break;
1591 default:
1592 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1593 break;
1594 }
1595 }
1596}
1597#else /* !RT_OS_WINDOWS */
1598static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1599{
1600 struct sockaddr_in addr;
1601 socklen_t addrlen = sizeof(struct sockaddr_in);
1602 struct ip ip;
1603 char *buff;
1604 int len = 0;
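    /* The reply is read in two steps: peek at the IP header to learn the datagram length, then read the whole datagram into a temporary buffer and hand it to send_icmp_to_guest(). */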
1605
1606 /* step 1: read the IP header */
1607 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1608 (struct sockaddr *)&addr, &addrlen);
1609 if ( len < 0
1610 && ( errno == EAGAIN
1611 || errno == EWOULDBLOCK
1612 || errno == EINPROGRESS
1613 || errno == ENOTCONN))
1614 {
1615 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1616 return;
1617 }
1618
1619 if ( len < sizeof(struct ip)
1620 || len < 0
1621 || len == 0)
1622 {
1623 u_char code;
1624 code = ICMP_UNREACH_PORT;
1625
1626 if (errno == EHOSTUNREACH)
1627 code = ICMP_UNREACH_HOST;
1628 else if (errno == ENETUNREACH)
1629 code = ICMP_UNREACH_NET;
1630
1631 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1632 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1633 so->so_m = NULL;
1634 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1635 return;
1636 }
1637 /* basic check of IP header */
1638 if ( ip.ip_v != IPVERSION
1639# ifndef RT_OS_DARWIN
1640 || ip.ip_p != IPPROTO_ICMP
1641# endif
1642 )
1643 {
1644 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1645 return;
1646 }
1647# ifndef RT_OS_DARWIN
1648 /* Darwin reports the IP length already in host byte order. */
1649 ip.ip_len = RT_N2H_U16(ip.ip_len);
1650# endif
1651# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1652 /* Solaris and Darwin report the payload only */
1653 ip.ip_len += (ip.ip_hl << 2);
1654# endif
1655 /* Note: ip->ip_len in host byte order (all OS) */
1656 len = ip.ip_len;
1657 buff = RTMemAlloc(len);
1658 if (buff == NULL)
1659 {
1660 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1661 return;
1662 }
1663 /* step 2: read the rest of the datagram into the buffer */
1664 addrlen = sizeof(struct sockaddr_in);
1665 memset(&addr, 0, addrlen);
1666 len = recvfrom(so->s, buff, len, 0,
1667 (struct sockaddr *)&addr, &addrlen);
1668 if ( len < 0
1669 && ( errno == EAGAIN
1670 || errno == EWOULDBLOCK
1671 || errno == EINPROGRESS
1672 || errno == ENOTCONN))
1673 {
1674 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1675 ip.ip_len));
1676 RTMemFree(buff);
1677 return;
1678 }
1679 if ( len < 0
1680 || len == 0)
1681 {
1682 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1683 errno, len, (ip.ip_len - sizeof(struct ip))));
1684 RTMemFree(buff);
1685 return;
1686 }
1687 /* len was modified by the 2nd read, when the rest of the datagram was read */
1688 send_icmp_to_guest(pData, buff, len, &addr);
1689 RTMemFree(buff);
1690}
1691#endif /* !RT_OS_WINDOWS */