VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 39287

Last change on this file since 39287 was 39287, checked in by vboxsync, 13 years ago

NAT: "sending to home" on wide cast and cloning udp sockets. Both disabled.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 48.7 KB
Line 
1/* $Id: socket.c 39287 2011-11-14 09:41:52Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#define WANT_SYS_IOCTL_H
28#include <slirp.h>
29#include "ip_icmp.h"
30#include "main.h"
31#ifdef __sun__
32#include <sys/filio.h>
33#endif
34#include <VBox/vmm/pdmdrv.h>
35#if defined (RT_OS_WINDOWS)
36#include <iphlpapi.h>
37#include <icmpapi.h>
38#endif
39
#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
/**
 * Clones an existing UDP socket, retargeting it at a different foreign address.
 *
 * The clone shares the original's local address/port and foreign port, but its
 * foreign address is replaced by @a u32ForeignAddr.  The new socket is attached
 * via udp_attach() (which gives it a host fd and links it into the UDP queue).
 *
 * @param   pData           the NAT instance data.
 * @param   pSo             socket to clone the address/port tuple from.
 * @param   u32ForeignAddr  new foreign IPv4 address (network order s_addr).
 * @returns the freshly created socket, or NULL on allocation/attach failure.
 */
struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, const struct socket *pSo, uint32_t u32ForeignAddr)
{
    struct socket *pNewSocket = NULL;
    LogFlowFunc(("Enter: so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", pSo, u32ForeignAddr));
    pNewSocket = socreate();
    if (!pNewSocket)
    {
        LogFunc(("Can't create socket\n"));
        LogFlowFunc(("Leave: NULL\n"));
        return NULL;
    }
    /* udp_attach gives the clone a host-side fd; on failure free the shell. */
    if (udp_attach(pData, pNewSocket, 0) <= 0)
    {
        sofree(pData, pNewSocket);
        LogFunc(("Can't attach fresh created socket\n"));
        return NULL;
    }
    /* Copy the tuple from the template, substituting the foreign address. */
    pNewSocket->so_laddr = pSo->so_laddr;
    pNewSocket->so_lport = pSo->so_lport;
    pNewSocket->so_faddr.s_addr = u32ForeignAddr;
    pNewSocket->so_fport = pSo->so_fport;
    return pNewSocket;
}
#endif
68
#ifdef VBOX_WITH_NAT_SEND2HOME
/**
 * Replays a wide-cast datagram to every configured "home" address.
 *
 * For each home address a cloned UDP socket is created (targeted at that
 * address) and the buffer is sent through it.  Individual send failures are
 * logged but not treated as fatal (we are emulating a broadcast).
 *
 * @param   pData   the NAT instance data.
 * @param   pSo     original socket the datagram arrived on.
 * @param   pvBuf   payload to send.
 * @param   cbBuf   payload size in bytes.
 * @param   iFlags  flags passed through to sendto().
 * @returns true if at least one send succeeded, false otherwise.
 */
DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
{
    int idxAddr;
    int ret = 0;
    bool fSendDone = false;
    LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
    for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
    {
        /* Pass the raw IPv4 (s_addr), not the whole struct in_addr, to match
         * the uint32_t parameter of the clone function. */
        struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr);
        AssertReturn(pNewSocket, false);  /* two-arg form; the old ((x, false)) was a comma expression */
        pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
        /** @todo more verbose on errors.
         * Note: we shouldn't care if this send fails or not (we're in broadcast). */
        LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
        ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
        if (ret < 0)
            LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
        fSendDone |= ret > 0;
    }
    LogFlowFunc(("Leave %RTbool\n", fSendDone));
    return fSendDone;
}
#endif /* !VBOX_WITH_NAT_SEND2HOME */
95static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
96#ifdef RT_OS_WINDOWS
97static void sorecvfrom_icmp_win(PNATState, struct socket *);
98#else /* RT_OS_WINDOWS */
99static void sorecvfrom_icmp_unix(PNATState, struct socket *);
100#endif /* !RT_OS_WINDOWS */
101
/**
 * One-time initialization hook for the socket layer.  Currently a no-op,
 * kept so callers have a stable entry point.
 */
void
so_init()
{
    /* Nothing to initialize yet. */
}
106
107struct socket *
108solookup(struct socket *head, struct in_addr laddr,
109 u_int lport, struct in_addr faddr, u_int fport)
110{
111 struct socket *so;
112
113 for (so = head->so_next; so != head; so = so->so_next)
114 {
115 if ( so->so_lport == lport
116 && so->so_laddr.s_addr == laddr.s_addr
117 && so->so_faddr.s_addr == faddr.s_addr
118 && so->so_fport == fport)
119 return so;
120 }
121
122 return (struct socket *)NULL;
123}
124
125/*
126 * Create a new socket, initialise the fields
127 * It is the responsibility of the caller to
128 * insque() it into the correct linked-list
129 */
130struct socket *
131socreate()
132{
133 struct socket *so;
134
135 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
136 if (so)
137 {
138 so->so_state = SS_NOFDREF;
139 so->s = -1;
140#if !defined(RT_OS_WINDOWS)
141 so->so_poll_index = -1;
142#endif
143 }
144 return so;
145}
146
/*
 * remque and free a socket, clobber cache.
 * VBOX_WITH_SLIRP_MT: the queue should be locked before sofree, because
 * inside sofree we don't know from which queue the item is being removed.
 */
void
sofree(PNATState pData, struct socket *so)
{
    /* Invalidate the last-used-socket caches if they point at us. */
    if (so == tcp_last_so)
        tcp_last_so = &tcb;
    else if (so == udp_last_so)
        udp_last_so = &udb;

    /* check if mbuf hasn't been already freed */
    if (so->so_m != NULL)
        m_freem(pData, so->so_m);
#ifndef VBOX_WITH_SLIRP_MT
    /* Only unlink if the socket is actually on a queue. */
    if (so->so_next && so->so_prev)
    {
        remque(pData, so); /* crashes if so is not in a queue */
        NSOCK_DEC();
    }

    RTMemFree(so);
#else
    /* MT build: mark for deletion; actual unlink/free happens elsewhere. */
    so->so_deleted = 1;
#endif
}
175
#ifdef VBOX_WITH_SLIRP_MT
/** Queue-callback wrapper: runs soread() and stores its status in *ret. */
void
soread_queue(PNATState pData, struct socket *so, int *ret)
{
    int rc = soread(pData, so);
    *ret = rc;
}
#endif
183
/*
 * Read from so's socket into sb_snd, updating all relevant sbuf fields
 * NOTE: This will only be called if it is select()ed for reading, so
 * a read() of 0 (or less) means it's disconnected
 */
#ifndef VBOX_WITH_SLIRP_BSD_SBUF
int
soread(PNATState pData, struct socket *so)
{
    int n, nn, lss, total;
    struct sbuf *sb = &so->so_snd;              /* ring buffer headed to the guest */
    size_t len = sb->sb_datalen - sb->sb_cc;    /* free space in the ring */
    struct iovec iov[2];                        /* at most 2 segments when the ring wraps */
    int mss = so->so_tcpcb->t_maxseg;

    STAM_PROFILE_START(&pData->StatIOread, a);
    STAM_COUNTER_RESET(&pData->StatIORead_in_1);
    STAM_COUNTER_RESET(&pData->StatIORead_in_2);

    QSOCKET_LOCK(tcb);
    SOCKET_LOCK(so);
    QSOCKET_UNLOCK(tcb);

    LogFlow(("soread: so = %R[natsock]\n", so));
    Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));

    /*
     * No need to check if there's enough room to read.
     * soread wouldn't have been called if there weren't
     */

    len = sb->sb_datalen - sb->sb_cc;

    iov[0].iov_base = sb->sb_wptr;
    iov[1].iov_base = 0;
    iov[1].iov_len = 0;
    if (sb->sb_wptr < sb->sb_rptr)
    {
        /* Free space is one contiguous span between wptr and rptr. */
        iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
        /* Should never succeed, but... */
        if (iov[0].iov_len > len)
            iov[0].iov_len = len;
        /* Round the read down to a whole number of TCP segments. */
        if (iov[0].iov_len > mss)
            iov[0].iov_len -= iov[0].iov_len%mss;
        n = 1;
    }
    else
    {
        /* Free space wraps: buffer tail first, then the head up to rptr. */
        iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
        /* Should never succeed, but... */
        if (iov[0].iov_len > len)
            iov[0].iov_len = len;
        len -= iov[0].iov_len;
        if (len)
        {
            iov[1].iov_base = sb->sb_data;
            iov[1].iov_len = sb->sb_rptr - sb->sb_data;
            if (iov[1].iov_len > len)
                iov[1].iov_len = len;
            total = iov[0].iov_len + iov[1].iov_len;
            if (total > mss)
            {
                /* Trim the combined read to whole segments, shaving the
                 * second (head) segment first where possible. */
                lss = total % mss;
                if (iov[1].iov_len > lss)
                {
                    iov[1].iov_len -= lss;
                    n = 2;
                }
                else
                {
                    lss -= iov[1].iov_len;
                    iov[0].iov_len -= lss;
                    n = 1;
                }
            }
            else
                n = 2;
        }
        else
        {
            if (iov[0].iov_len > mss)
                iov[0].iov_len -= iov[0].iov_len%mss;
            n = 1;
        }
    }

#ifdef HAVE_READV
    nn = readv(so->s, (struct iovec *)iov, n);
#else
    nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
#endif
    Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
    Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
    if (nn <= 0)
    {
        /*
         * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
         * _could_ mean that the connection is closed. But we will receive an
         * FD_CLOSE event later if the connection was _really_ closed. With
         * www.youtube.com I see this very often. Closing the socket too early
         * would be dangerous.
         */
        int status;
        unsigned long pending = 0;
        status = ioctlsocket(so->s, FIONREAD, &pending);
        if (status < 0)
            Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
        /* 0 bytes read but data still pending: not a real close, retry later. */
        if (nn == 0 && (pending != 0))
        {
            SOCKET_UNLOCK(so);
            STAM_PROFILE_STOP(&pData->StatIOread, a);
            return 0;
        }
        /* Transient errors: report "nothing read" and let the caller retry. */
        if (   nn < 0
            && (   errno == EINTR
                || errno == EAGAIN
                || errno == EWOULDBLOCK))
        {
            SOCKET_UNLOCK(so);
            STAM_PROFILE_STOP(&pData->StatIOread, a);
            return 0;
        }
        else
        {
            /* nn == 0 means peer has performed an orderly shutdown */
            Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
                  __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
            sofcantrcvmore(so);
            tcp_sockclosed(pData, sototcpcb(so));
            SOCKET_UNLOCK(so);
            STAM_PROFILE_STOP(&pData->StatIOread, a);
            return -1;
        }
    }
    STAM_STATS(
        if (n == 1)
        {
            STAM_COUNTER_INC(&pData->StatIORead_in_1);
            STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
        }
        else
        {
            STAM_COUNTER_INC(&pData->StatIORead_in_2);
            STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
        }
    );

#ifndef HAVE_READV
    /*
     * If there was no error, try and read the second time round
     * We read again if n = 2 (ie, there's another part of the buffer)
     * and we read as much as we could in the first read
     * We don't test for <= 0 this time, because there legitimately
     * might not be any more data (since the socket is non-blocking),
     * a close will be detected on next iteration.
     * A return of -1 wont (shouldn't) happen, since it didn't happen above
     */
    if (n == 2 && nn == iov[0].iov_len)
    {
        int ret;
        ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
        if (ret > 0)
            nn += ret;
        STAM_STATS(
            if (ret > 0)
            {
                STAM_COUNTER_INC(&pData->StatIORead_in_2);
                STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
            }
        );
    }

    Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
#endif

    /* Update fields: advance the write pointer, wrapping at the buffer end. */
    sb->sb_cc += nn;
    sb->sb_wptr += nn;
    Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
    if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
    {
        sb->sb_wptr -= sb->sb_datalen;
        Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
    }
    STAM_PROFILE_STOP(&pData->StatIOread, a);
    SOCKET_UNLOCK(so);
    return nn;
}
372#else /* VBOX_WITH_SLIRP_BSD_SBUF */
373int
374soread(PNATState pData, struct socket *so)
375{
376 int n;
377 char *buf;
378 struct sbuf *sb = &so->so_snd;
379 size_t len = sbspace(sb);
380 int mss = so->so_tcpcb->t_maxseg;
381
382 STAM_PROFILE_START(&pData->StatIOread, a);
383 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
384 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
385
386 QSOCKET_LOCK(tcb);
387 SOCKET_LOCK(so);
388 QSOCKET_UNLOCK(tcb);
389
390 LogFlowFunc(("soread: so = %lx\n", (long)so));
391
392 if (len > mss)
393 len -= len % mss;
394 buf = RTMemAlloc(len);
395 if (buf == NULL)
396 {
397 Log(("NAT: can't alloc enough memory\n"));
398 return -1;
399 }
400
401 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
402 if (n <= 0)
403 {
404 /*
405 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
406 * _could_ mean that the connection is closed. But we will receive an
407 * FD_CLOSE event later if the connection was _really_ closed. With
408 * www.youtube.com I see this very often. Closing the socket too early
409 * would be dangerous.
410 */
411 int status;
412 unsigned long pending = 0;
413 status = ioctlsocket(so->s, FIONREAD, &pending);
414 if (status < 0)
415 Log(("NAT:error in WSAIoctl: %d\n", errno));
416 if (n == 0 && (pending != 0))
417 {
418 SOCKET_UNLOCK(so);
419 STAM_PROFILE_STOP(&pData->StatIOread, a);
420 RTMemFree(buf);
421 return 0;
422 }
423 if ( n < 0
424 && ( errno == EINTR
425 || errno == EAGAIN
426 || errno == EWOULDBLOCK))
427 {
428 SOCKET_UNLOCK(so);
429 STAM_PROFILE_STOP(&pData->StatIOread, a);
430 RTMemFree(buf);
431 return 0;
432 }
433 else
434 {
435 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
436 n, errno, strerror(errno)));
437 sofcantrcvmore(so);
438 tcp_sockclosed(pData, sototcpcb(so));
439 SOCKET_UNLOCK(so);
440 STAM_PROFILE_STOP(&pData->StatIOread, a);
441 RTMemFree(buf);
442 return -1;
443 }
444 }
445
446 sbuf_bcat(sb, buf, n);
447 RTMemFree(buf);
448 return n;
449}
450#endif
451
452/*
453 * Get urgent data
454 *
455 * When the socket is created, we set it SO_OOBINLINE,
456 * so when OOB data arrives, we soread() it and everything
457 * in the send buffer is sent as urgent data
458 */
459void
460sorecvoob(PNATState pData, struct socket *so)
461{
462 struct tcpcb *tp = sototcpcb(so);
463 ssize_t ret;
464
465 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
466
467 /*
468 * We take a guess at how much urgent data has arrived.
469 * In most situations, when urgent data arrives, the next
470 * read() should get all the urgent data. This guess will
471 * be wrong however if more data arrives just after the
472 * urgent data, or the read() doesn't return all the
473 * urgent data.
474 */
475 ret = soread(pData, so);
476 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
477 tp->t_force = 1;
478 tcp_output(pData, tp);
479 tp->t_force = 0;
480}
481#ifndef VBOX_WITH_SLIRP_BSD_SBUF
482/*
483 * Send urgent data
484 * There's a lot duplicated code here, but...
485 */
486int
487sosendoob(struct socket *so)
488{
489 struct sbuf *sb = &so->so_rcv;
490 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
491
492 int n, len;
493
494 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
495
496 if (so->so_urgc > sizeof(buff))
497 so->so_urgc = sizeof(buff); /* XXX */
498
499 if (sb->sb_rptr < sb->sb_wptr)
500 {
501 /* We can send it directly */
502 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
503 so->so_urgc -= n;
504
505 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
506 n, so->so_urgc));
507 }
508 else
509 {
510 /*
511 * Since there's no sendv or sendtov like writev,
512 * we must copy all data to a linear buffer then
513 * send it all
514 */
515 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
516 if (len > so->so_urgc)
517 len = so->so_urgc;
518 memcpy(buff, sb->sb_rptr, len);
519 so->so_urgc -= len;
520 if (so->so_urgc)
521 {
522 n = sb->sb_wptr - sb->sb_data;
523 if (n > so->so_urgc)
524 n = so->so_urgc;
525 memcpy(buff + len, sb->sb_data, n);
526 so->so_urgc -= n;
527 len += n;
528 }
529 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
530#ifdef DEBUG
531 if (n != len)
532 Log(("Didn't send all data urgently XXXXX\n"));
533#endif
534 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
535 n, so->so_urgc));
536 }
537
538 sb->sb_cc -= n;
539 sb->sb_rptr += n;
540 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
541 sb->sb_rptr -= sb->sb_datalen;
542
543 return n;
544}
545
/*
 * Write data from so_rcv to so's socket,
 * updating all sbuf field as necessary
 */
int
sowrite(PNATState pData, struct socket *so)
{
    int n, nn;
    struct sbuf *sb = &so->so_rcv;  /* host-bound ring buffer */
    size_t len = sb->sb_cc;         /* bytes queued for sending */
    struct iovec iov[2];            /* at most 2 segments when the ring wraps */

    STAM_PROFILE_START(&pData->StatIOwrite, a);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
    STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
    STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
    STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
    LogFlowFunc(("so = %R[natsock]\n", so));
    Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
    QSOCKET_LOCK(tcb);
    SOCKET_LOCK(so);
    QSOCKET_UNLOCK(tcb);
    /* Pending urgent data takes priority; it may drain the buffer entirely. */
    if (so->so_urgc)
    {
        sosendoob(so);
        if (sb->sb_cc == 0)
        {
            SOCKET_UNLOCK(so);
            STAM_PROFILE_STOP(&pData->StatIOwrite, a);
            return 0;
        }
    }

    /*
     * No need to check if there's something to write,
     * sowrite wouldn't have been called otherwise
     */

    len = sb->sb_cc;

    iov[0].iov_base = sb->sb_rptr;
    iov[1].iov_base = 0;
    iov[1].iov_len = 0;
    if (sb->sb_rptr < sb->sb_wptr)
    {
        /* Queued data is one contiguous span. */
        iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
        /* Should never succeed, but... */
        if (iov[0].iov_len > len)
            iov[0].iov_len = len;
        n = 1;
    }
    else
    {
        /* Queued data wraps: buffer tail first, then head up to wptr. */
        iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
        if (iov[0].iov_len > len)
            iov[0].iov_len = len;
        len -= iov[0].iov_len;
        if (len)
        {
            iov[1].iov_base = sb->sb_data;
            iov[1].iov_len = sb->sb_wptr - sb->sb_data;
            if (iov[1].iov_len > len)
                iov[1].iov_len = len;
            n = 2;
        }
        else
            n = 1;
    }
    STAM_STATS({
        if (n == 1)
        {
            STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
            STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
        }
        else
        {
            STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
            STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
            STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
        }
    });
    /* Check if there's urgent data to send, and if so, send it */
#ifdef HAVE_READV
    nn = writev(so->s, (const struct iovec *)iov, n);
#else
    nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
#endif
    Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
    /* This should never happen, but people tell me it does *shrug* */
    if (   nn < 0
        && (   errno == EAGAIN
            || errno == EINTR
            || errno == EWOULDBLOCK))
    {
        SOCKET_UNLOCK(so);
        STAM_PROFILE_STOP(&pData->StatIOwrite, a);
        return 0;
    }

    /* A hard error or a 0-byte send of non-empty data means the peer is gone. */
    if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
    {
        Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
              __PRETTY_FUNCTION__, so->so_state, errno));
        sofcantsendmore(so);
        tcp_sockclosed(pData, sototcpcb(so));
        SOCKET_UNLOCK(so);
        STAM_PROFILE_STOP(&pData->StatIOwrite, a);
        return -1;
    }

#ifndef HAVE_READV
    /* No writev: send the second (wrapped) segment separately if the first
     * one went out in full. */
    if (n == 2 && nn == iov[0].iov_len)
    {
        int ret;
        ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
        if (ret > 0)
            nn += ret;
        STAM_STATS({
            if (ret > 0 && ret != iov[1].iov_len)
            {
                STAM_COUNTER_INC(&pData->StatIOWrite_rest);
                STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
            }
        });
    }
    Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
#endif

    /* Update sbuf: consume the sent bytes, wrapping the read pointer. */
    sb->sb_cc -= nn;
    sb->sb_rptr += nn;
    Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
    if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
    {
        sb->sb_rptr -= sb->sb_datalen;
        Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
    }

    /*
     * If in DRAIN mode, and there's no more data, set
     * it CANTSENDMORE
     */
    if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
        sofcantsendmore(so);

    SOCKET_UNLOCK(so);
    STAM_PROFILE_STOP(&pData->StatIOwrite, a);
    return nn;
}
699#else /* VBOX_WITH_SLIRP_BSD_SBUF */
700static int
701do_sosend(struct socket *so, int fUrg)
702{
703 struct sbuf *sb = &so->so_rcv;
704
705 int n, len;
706
707 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
708
709 len = sbuf_len(sb);
710
711 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
712 if (n < 0)
713 Log(("NAT: Can't sent sbuf via socket.\n"));
714 if (fUrg)
715 so->so_urgc -= n;
716 if (n > 0 && n < len)
717 {
718 char *ptr;
719 char *buff;
720 buff = RTMemAlloc(len);
721 if (buff == NULL)
722 {
723 Log(("NAT: No space to allocate temporal buffer\n"));
724 return -1;
725 }
726 ptr = sbuf_data(sb);
727 memcpy(buff, &ptr[n], len - n);
728 sbuf_bcpy(sb, buff, len - n);
729 RTMemFree(buff);
730 return n;
731 }
732 sbuf_clear(sb);
733 return n;
734}
/** Sends pending urgent (OOB) data on the socket via the common send path. */
int
sosendoob(struct socket *so)
{
    return do_sosend(so, /*fUrg=*/1);
}
740
741/*
742 * Write data from so_rcv to so's socket,
743 * updating all sbuf field as necessary
744 */
745int
746sowrite(PNATState pData, struct socket *so)
747{
748 return do_sosend(so, 0);
749}
750#endif
751
752/*
753 * recvfrom() a UDP socket
754 */
755void
756sorecvfrom(PNATState pData, struct socket *so)
757{
758 ssize_t ret = 0;
759 struct sockaddr_in addr;
760 socklen_t addrlen = sizeof(struct sockaddr_in);
761
762 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
763
764 if (so->so_type == IPPROTO_ICMP)
765 {
766 /* This is a "ping" reply */
767#ifdef RT_OS_WINDOWS
768 sorecvfrom_icmp_win(pData, so);
769#else /* RT_OS_WINDOWS */
770 sorecvfrom_icmp_unix(pData, so);
771#endif /* !RT_OS_WINDOWS */
772 udp_detach(pData, so);
773 }
774 else
775 {
776 /* A "normal" UDP packet */
777 struct mbuf *m;
778 ssize_t len;
779 u_long n = 0;
780 int rc = 0;
781 static int signalled = 0;
782 char *pchBuffer = NULL;
783 bool fWithTemporalBuffer = false;
784
785 QSOCKET_LOCK(udb);
786 SOCKET_LOCK(so);
787 QSOCKET_UNLOCK(udb);
788
789 /*How many data has been received ?*/
790 /*
791 * 1. calculate how much we can read
792 * 2. read as much as possible
793 * 3. attach buffer to allocated header mbuf
794 */
795 rc = ioctlsocket(so->s, FIONREAD, &n);
796 if (rc == -1)
797 {
798 if ( errno == EAGAIN
799 || errno == EWOULDBLOCK
800 || errno == EINPROGRESS
801 || errno == ENOTCONN)
802 return;
803 else if (signalled == 0)
804 {
805 LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
806 signalled = 1;
807 }
808 return;
809 }
810
811 len = sizeof(struct udpiphdr);
812 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
813 if (m == NULL)
814 return;
815
816 len += n;
817 m->m_data += ETH_HLEN;
818 m->m_pkthdr.header = mtod(m, void *);
819 m->m_data += sizeof(struct udpiphdr);
820
821 pchBuffer = mtod(m, char *);
822 fWithTemporalBuffer = false;
823 /*
824 * Even if amounts of bytes on socket is greater than MTU value
825 * Slirp will able fragment it, but we won't create temporal location
826 * here.
827 */
828 if (n > (slirp_size(pData) - sizeof(struct udpiphdr)))
829 {
830 pchBuffer = RTMemAlloc((n) * sizeof(char));
831 if (!pchBuffer)
832 {
833 m_freem(pData, m);
834 return;
835 }
836 fWithTemporalBuffer = true;
837 }
838 ret = recvfrom(so->s, pchBuffer, n, 0,
839 (struct sockaddr *)&addr, &addrlen);
840 if (fWithTemporalBuffer)
841 {
842 if (ret > 0)
843 {
844 m_copyback(pData, m, 0, ret, pchBuffer);
845 /*
846 * If we've met comporison below our size prediction was failed
847 * it's not fatal just we've allocated for nothing. (@todo add counter here
848 * to calculate how rare we here)
849 */
850 if(ret < slirp_size(pData) && !m->m_next)
851 Log(("NAT:udp: Expected size(%d) lesser than real(%d) and less minimal mbuf size(%d)\n",
852 n, ret, slirp_size(pData)));
853 }
854 /* we're freeing buffer anyway */
855 RTMemFree(pchBuffer);
856 }
857 else
858 m->m_len = ret;
859
860 if (ret < 0)
861 {
862 u_char code = ICMP_UNREACH_PORT;
863
864 if (errno == EHOSTUNREACH)
865 code = ICMP_UNREACH_HOST;
866 else if (errno == ENETUNREACH)
867 code = ICMP_UNREACH_NET;
868
869 m_freem(pData, m);
870 if ( errno == EAGAIN
871 || errno == EWOULDBLOCK
872 || errno == EINPROGRESS
873 || errno == ENOTCONN)
874 {
875 return;
876 }
877
878 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
879 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
880 so->so_m = NULL;
881 }
882 else
883 {
884 Assert((m_length(m,NULL) == ret));
885 /*
886 * Hack: domain name lookup will be used the most for UDP,
887 * and since they'll only be used once there's no need
888 * for the 4 minute (or whatever) timeout... So we time them
889 * out much quicker (10 seconds for now...)
890 */
891 if (so->so_expire)
892 {
893 if (so->so_fport != RT_H2N_U16_C(53))
894 so->so_expire = curtime + SO_EXPIRE;
895 }
896 /*
897 * last argument should be changed if Slirp will inject IP attributes
898 * Note: Here we can't check if dnsproxy's sent initial request
899 */
900 if ( pData->fUseDnsProxy
901 && so->so_fport == RT_H2N_U16_C(53))
902 dnsproxy_answer(pData, so, m);
903
904#if 0
905 if (m->m_len == len)
906 {
907 m_inc(m, MINCSIZE);
908 m->m_len = 0;
909 }
910#endif
911
912 /* packets definetly will be fragmented, could confuse receiver peer. */
913 if (m_length(m, NULL) > if_mtu)
914 m->m_flags |= M_SKIP_FIREWALL;
915 /*
916 * If this packet was destined for CTL_ADDR,
917 * make it look like that's where it came from, done by udp_output
918 */
919 udp_output(pData, so, m, &addr);
920 SOCKET_UNLOCK(so);
921 } /* rx error */
922 } /* if ping packet */
923}
924
/*
 * sendto() a socket
 */
int
sosendto(PNATState pData, struct socket *so, struct mbuf *m)
{
    int ret;
    struct sockaddr_in *paddr;
    struct sockaddr addr;
#if 0
    struct sockaddr_in host_addr;
#endif
    caddr_t buf = 0;
    int mlen;

    LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));

    memset(&addr, 0, sizeof(struct sockaddr));
#ifdef RT_OS_DARWIN
    addr.sa_len = sizeof(struct sockaddr_in);
#endif
    paddr = (struct sockaddr_in *)&addr;
    paddr->sin_family = AF_INET;
    /* Addresses inside the NAT network need translating before they hit
     * the host's stack. */
    if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
    {
        /* It's an alias */
        uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
        switch(last_byte)
        {
#if 0
            /* handle this case at 'default:' */
            case CTL_BROADCAST:
                addr.sin_addr.s_addr = INADDR_BROADCAST;
                /* Send the packet to host to fully emulate broadcast */
                /** @todo r=klaus: on Linux host this causes the host to receive
                 * the packet twice for some reason. And I cannot find any place
                 * in the man pages which states that sending a broadcast does not
                 * reach the host itself. */
                host_addr.sin_family = AF_INET;
                host_addr.sin_port = so->so_fport;
                host_addr.sin_addr = our_addr;
                sendto(so->s, m->m_data, m->m_len, 0,
                       (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
                break;
#endif
            case CTL_DNS:
            case CTL_ALIAS:
            default:
                /* NAT-network broadcast maps to the host broadcast address;
                 * everything else in the special range goes to loopback. */
                if (last_byte == ~pData->netmask)
                    paddr->sin_addr.s_addr = INADDR_BROADCAST;
                else
                    paddr->sin_addr = loopback_addr;
                break;
        }
    }
    else
        paddr->sin_addr = so->so_faddr;
    paddr->sin_port = so->so_fport;

    Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
          RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));

    /* Don't care what port we get */
    /*
     * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
     * generates bodyless messages, annoying the memory management system.
     */
    mlen = m_length(m, NULL);
    if (mlen > 0)
    {
        /* Linearize the (possibly chained) mbuf into one send buffer. */
        buf = RTMemAlloc(mlen);
        if (buf == NULL)
        {
            return -1;
        }
        m_copydata(m, 0, mlen, buf);
    }
    ret = sendto(so->s, buf, mlen, 0,
                 (struct sockaddr *)&addr, sizeof (struct sockaddr));
#ifdef VBOX_WITH_NAT_SEND2HOME
    /* Wide-cast traffic is additionally replayed to the home addresses. */
    if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
    {
        slirpSend2Home(pData, so, buf, mlen, 0);
    }
#endif
    if (buf)
        RTMemFree(buf);
    if (ret < 0)
    {
        Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
        return -1;
    }

    /*
     * Kill the socket if there's no reply in 4 minutes,
     * but only if it's an expirable socket
     */
    if (so->so_expire)
        so->so_expire = curtime + SO_EXPIRE;
    so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
    return 0;
}
1027
/*
 * XXX This should really be tcp_listen
 */
struct socket *
solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
{
    struct sockaddr_in addr;
    struct socket *so;
    socklen_t addrlen = sizeof(addr);
    int s, opt = 1;
    int status;

    LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));

    if ((so = socreate()) == NULL)
    {
        /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
        return NULL;
    }

    /* Don't tcp_attach... we don't need so_snd nor so_rcv */
    if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
    {
        RTMemFree(so);
        return NULL;
    }

    /* Register the socket in the global TCP queue. */
    SOCKET_LOCK_CREATE(so);
    SOCKET_LOCK(so);
    QSOCKET_LOCK(tcb);
    insque(pData, so,&tcb);
    NSOCK_INC();
    QSOCKET_UNLOCK(tcb);

    /*
     * SS_FACCEPTONCE sockets must time out.
     */
    if (flags & SS_FACCEPTONCE)
        so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;

    so->so_state = (SS_FACCEPTCONN|flags);
    so->so_lport = lport; /* Kept in network format */
    so->so_laddr.s_addr = laddr; /* Ditto */

    memset(&addr, 0, sizeof(addr));
#ifdef RT_OS_DARWIN
    addr.sin_len = sizeof(addr);
#endif
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = bind_addr;
    addr.sin_port = port;

    /**
     * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack,
     * kernel will choose the optimal value for requests queue length.
     * @note: MSDN recommends low (2-4) values for bluetooth networking devices.
     */
    if (   ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
        || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
        || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
        || (listen(s, pData->soMaxConn) < 0))
    {
        /* Any step failing tears the whole thing down; the original errno
         * is preserved across the cleanup calls. */
#ifdef RT_OS_WINDOWS
        int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
        closesocket(s);
        QSOCKET_LOCK(tcb);
        sofree(pData, so);
        QSOCKET_UNLOCK(tcb);
        /* Restore the real errno */
        WSASetLastError(tmperrno);
#else
        int tmperrno = errno; /* Don't clobber the real reason we failed */
        close(s);
        QSOCKET_LOCK(tcb);
        sofree(pData, so);
        QSOCKET_UNLOCK(tcb);
        /* Restore the real errno */
        errno = tmperrno;
#endif
        return NULL;
    }
    fd_nonblock(s);
    setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));

    /* Remember the port the kernel actually assigned. */
    getsockname(s,(struct sockaddr *)&addr,&addrlen);
    so->so_fport = addr.sin_port;
    /* set socket buffers */
    opt = pData->socket_rcv;
    status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
        goto no_sockopt;
    }
    opt = pData->socket_snd;
    status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
        goto no_sockopt;
    }
no_sockopt:
    /* Buffer sizing failures are non-fatal; continue with defaults. */
    if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
        so->so_faddr = alias_addr;
    else
        so->so_faddr = addr.sin_addr;

    so->s = s;
    SOCKET_UNLOCK(so);
    return so;
}
1139
/*
 * Data is available in so_rcv.
 * Historically this would write() it out right away; nowadays the main
 * loop takes care of that, so this callback is a no-op.
 * @todo do we really need this function, what is it intended to do?
 */
void
sorwakeup(struct socket *so)
{
    NOREF(so);
}
1155
/*
 * Data has been freed in so_snd, so a read() would now fit.
 * Reading is deferred to the main loop, hence nothing to do here.
 */
void
sowwakeup(struct socket *so)
{
    NOREF(so);
}
1166
1167/*
1168 * Various session state calls
1169 * XXX Should be #define's
1170 * The socket state stuff needs work, these often get call 2 or 3
1171 * times each when only 1 was needed
1172 */
1173void
1174soisfconnecting(struct socket *so)
1175{
1176 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1177 SS_FCANTSENDMORE|SS_FWDRAIN);
1178 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1179}
1180
1181void
1182soisfconnected(struct socket *so)
1183{
1184 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1185 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1186}
1187
1188void
1189sofcantrcvmore(struct socket *so)
1190{
1191 if ((so->so_state & SS_NOFDREF) == 0)
1192 {
1193 shutdown(so->s, 0);
1194 }
1195 so->so_state &= ~(SS_ISFCONNECTING);
1196 if (so->so_state & SS_FCANTSENDMORE)
1197 so->so_state = SS_NOFDREF; /* Don't select it */
1198 /* XXX close() here as well? */
1199 else
1200 so->so_state |= SS_FCANTRCVMORE;
1201}
1202
1203void
1204sofcantsendmore(struct socket *so)
1205{
1206 if ((so->so_state & SS_NOFDREF) == 0)
1207 shutdown(so->s, 1); /* send FIN to fhost */
1208
1209 so->so_state &= ~(SS_ISFCONNECTING);
1210 if (so->so_state & SS_FCANTRCVMORE)
1211 so->so_state = SS_NOFDREF; /* as above */
1212 else
1213 so->so_state |= SS_FCANTSENDMORE;
1214}
1215
/*
 * Mark the foreign side as disconnected.
 * Currently a deliberate no-op; the actual teardown happens elsewhere.
 */
void
soisfdisconnected(struct socket *so)
{
    NOREF(so);
#if 0
    so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
    close(so->s);
    so->so_state = SS_ISFDISCONNECTED;
    /*
     * XXX Do nothing ... ?
     */
#endif
}
1229
1230/*
1231 * Set write drain mode
1232 * Set CANTSENDMORE once all data has been write()n
1233 */
1234void
1235sofwdrain(struct socket *so)
1236{
1237 if (SBUF_LEN(&so->so_rcv))
1238 so->so_state |= SS_FWDRAIN;
1239 else
1240 sofcantsendmore(so);
1241}
1242
/**
 * Processes an ICMP datagram received from the host and forwards it to the guest.
 *
 * Only ICMP_ECHOREPLY, ICMP_TIMXCEED and ICMP_UNREACH are accepted.  The
 * original request is looked up in the ICMP cache (icmp_find_original_mbuf)
 * and its mbuf is rewritten in place to carry the reply, which is then sent
 * back to the guest via icmp_reflect().
 *
 * @param pData   the NAT state.
 * @param buff    raw IP datagram (IP header + ICMP message) read from the host.
 * @param len     number of valid bytes in @a buff.
 * @param addr    host-side peer address the datagram was received from.
 */
static void
send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
{
    struct ip *ip;
    uint32_t dst, src;
    char ip_copy[256];      /* scratch copy of the offending IP header + 64 bits of payload */
    struct icmp *icp;
    int old_ip_len = 0;
    int hlen, original_hlen = 0;
    struct mbuf *m;
    struct icmp_msg *icm;
    uint8_t proto;
    int type = 0;

    ip = (struct ip *)buff;
    /* Fix ip->ip_len to contain the total packet length including the header
     * in _host_ byte order for all OSes. On Darwin, that value already is in
     * host byte order. Solaris and Darwin report only the payload. */
#ifndef RT_OS_DARWIN
    ip->ip_len = RT_N2H_U16(ip->ip_len);
#endif
    hlen = (ip->ip_hl << 2);
#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
    ip->ip_len += hlen;
#endif
    if (ip->ip_len < hlen + ICMP_MINLEN)
    {
        Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
        return;
    }
    icp = (struct icmp *)((char *)ip + hlen);

    Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
    if (   icp->icmp_type != ICMP_ECHOREPLY
        && icp->icmp_type != ICMP_TIMXCEED
        && icp->icmp_type != ICMP_UNREACH)
    {
        return;
    }

    /*
     * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
     * ICMP_ECHOREPLY assuming data 0
     * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
     */
    if (ip->ip_len < hlen + 8)
    {
        Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
        return;
    }

    type = icp->icmp_type;
    if (   type == ICMP_TIMXCEED
        || type == ICMP_UNREACH)
    {
        /*
         * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
         * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
         */
        if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
        {
            Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
            return;
        }
        /* for error messages the embedded (quoted) IP header identifies the
         * original datagram -- use it for the cache lookup below */
        ip = &icp->icmp_ip;
    }

    icm = icmp_find_original_mbuf(pData, ip);
    if (icm == NULL)
    {
        Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
        return;
    }

    m = icm->im_m;
    Assert(m != NULL);

    src = addr->sin_addr.s_addr;
    if (type == ICMP_ECHOREPLY)
    {
        struct ip *ip0 = mtod(m, struct ip *);
        struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
        if (icp0->icmp_type != ICMP_ECHO)
        {
            Log(("NAT: we haven't found echo for this reply\n"));
            return;
        }
        /*
         * While combining the buffer to send (see ip_icmp.c) we control the ICMP
         * header only; the IP header is built by the host network stack and our
         * local copy of it contains values in host byte order, so no byte order
         * conversion is required here.  IP header fields are converted in the
         * ip_output0 routine only.
         */
        if (   (ip->ip_len - hlen)
            != (ip0->ip_len - (ip0->ip_hl << 2)))
        {
            Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
                (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
            return;
        }
    }

    /* ip points to the original IP header */
    ip = mtod(m, struct ip *);
    proto = ip->ip_p;
    /* Now ip is pointing on header we've sent from guest */
    if (   icp->icmp_type == ICMP_TIMXCEED
        || icp->icmp_type == ICMP_UNREACH)
    {
        /* error messages quote the original IP header + 64 bits of payload */
        old_ip_len = (ip->ip_hl << 2) + 64;
        if (old_ip_len > sizeof(ip_copy))
            old_ip_len = sizeof(ip_copy);
        memcpy(ip_copy, ip, old_ip_len);
    }

    /* destination of the reply is the source address of the original IP packet */
    dst = ip->ip_src.s_addr;

    /* override the tail of the old packet with the received ICMP message */
    ip = mtod(m, struct ip *); /* ip is from the mbuf we've overridden */
    original_hlen = ip->ip_hl << 2;
    /* keeps the original IP header and options in place */
    m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
    ip->ip_len = m_length(m, NULL);
    ip->ip_p = IPPROTO_ICMP; /* the original package could be whatever, but we're response via ICMP*/

    icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
    type = icp->icmp_type;
    if (   type == ICMP_TIMXCEED
        || type == ICMP_UNREACH)
    {
        /* according to RFC 792 error messages require a copy of the initial IP header + 64 bits */
        memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
        ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
    }

    ip->ip_src.s_addr = src;
    ip->ip_dst.s_addr = dst;
    icmp_reflect(pData, m);
    /* the request has been answered: drop it from the ICMP cache */
    LIST_REMOVE(icm, im_list);
    pData->cIcmpCacheSize--;
    /* Don't call m_free here: icmp_reflect took ownership of the mbuf */

    if (   type == ICMP_TIMXCEED
        || type == ICMP_UNREACH)
    {
        icm->im_so->so_m = NULL;
        switch (proto)
        {
            case IPPROTO_UDP:
                /*XXX: so->so_m already freed so we shouldn't call sofree */
                udp_detach(pData, icm->im_so);
                break;
            case IPPROTO_TCP:
                /*close tcp should be here */
                break;
            default:
                /* do nothing */
                break;
        }
    }
    RTMemFree(icm);
}
1406
1407#ifdef RT_OS_WINDOWS
1408static void
1409sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1410{
1411 int len;
1412 int i;
1413 struct ip *ip;
1414 struct mbuf *m;
1415 struct icmp *icp;
1416 struct icmp_msg *icm;
1417 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1418 uint32_t src;
1419 ICMP_ECHO_REPLY *icr;
1420 int hlen = 0;
1421 int nbytes = 0;
1422 u_char code = ~0;
1423 int out_len;
1424 int size;
1425
1426 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->szIcmpBuffer);
1427 if (len < 0)
1428 {
1429 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1430 return;
1431 }
1432 if (len == 0)
1433 return; /* no error */
1434
1435 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1436 for (i = 0; i < len; ++i)
1437 {
1438 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1439 i, icr[i].Data, icr[i].DataSize));
1440 switch(icr[i].Status)
1441 {
1442 case IP_DEST_HOST_UNREACHABLE:
1443 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1444 case IP_DEST_NET_UNREACHABLE:
1445 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1446 case IP_DEST_PROT_UNREACHABLE:
1447 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1448 /* UNREACH error inject here */
1449 case IP_DEST_PORT_UNREACHABLE:
1450 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1451 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1452 so->so_m = NULL;
1453 break;
1454 case IP_SUCCESS: /* echo replied */
1455 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1456 size;
1457 size = MCLBYTES;
1458 if (out_len < MSIZE)
1459 size = MCLBYTES;
1460 else if (out_len < MCLBYTES)
1461 size = MCLBYTES;
1462 else if (out_len < MJUM9BYTES)
1463 size = MJUM9BYTES;
1464 else if (out_len < MJUM16BYTES)
1465 size = MJUM16BYTES;
1466 else
1467 AssertMsgFailed(("Unsupported size"));
1468
1469 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1470 LogFunc(("m_getjcl returns m: %p\n", m));
1471 if (m == NULL)
1472 return;
1473 m->m_len = 0;
1474 m->m_data += if_maxlinkhdr;
1475 m->m_pkthdr.header = mtod(m, void *);
1476
1477 ip = mtod(m, struct ip *);
1478 ip->ip_src.s_addr = icr[i].Address;
1479 ip->ip_p = IPPROTO_ICMP;
1480 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1481 ip->ip_hl = sizeof(struct ip) >> 2; /* requiered for icmp_reflect, no IP options */
1482 ip->ip_ttl = icr[i].Options.Ttl;
1483
1484 icp = (struct icmp *)&ip[1]; /* no options */
1485 icp->icmp_type = ICMP_ECHOREPLY;
1486 icp->icmp_code = 0;
1487 icp->icmp_id = so->so_icmp_id;
1488 icp->icmp_seq = so->so_icmp_seq;
1489
1490 icm = icmp_find_original_mbuf(pData, ip);
1491 if (icm)
1492 {
1493 /* on this branch we don't need stored variant */
1494 m_freem(pData, icm->im_m);
1495 LIST_REMOVE(icm, im_list);
1496 pData->cIcmpCacheSize--;
1497 RTMemFree(icm);
1498 }
1499
1500
1501 hlen = (ip->ip_hl << 2);
1502 Assert((hlen >= sizeof(struct ip)));
1503
1504 m->m_data += hlen + ICMP_MINLEN;
1505 if (!RT_VALID_PTR(icr[i].Data))
1506 {
1507 m_freem(pData, m);
1508 break;
1509 }
1510 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1511 m->m_data -= hlen + ICMP_MINLEN;
1512 m->m_len += hlen + ICMP_MINLEN;
1513
1514
1515 ip->ip_len = m_length(m, NULL);
1516 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1517
1518 icmp_reflect(pData, m);
1519 break;
1520 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1521
1522 ip_broken = icr[i].Data;
1523 icm = icmp_find_original_mbuf(pData, ip_broken);
1524 if (icm == NULL) {
1525 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1526 return;
1527 }
1528 m = icm->im_m;
1529 ip = mtod(m, struct ip *);
1530 Assert(((ip_broken->ip_hl >> 2) >= sizeof(struct ip)));
1531 ip->ip_ttl = icr[i].Options.Ttl;
1532 src = ip->ip_src.s_addr;
1533 ip->ip_dst.s_addr = src;
1534 ip->ip_dst.s_addr = icr[i].Address;
1535
1536 hlen = (ip->ip_hl << 2);
1537 icp = (struct icmp *)((char *)ip + hlen);
1538 ip_broken->ip_src.s_addr = src; /*it packet sent from host not from guest*/
1539
1540 m->m_len = (ip_broken->ip_hl << 2) + 64;
1541 m->m_pkthdr.header = mtod(m, void *);
1542 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1543 icmp_reflect(pData, m);
1544 /* Here is different situation from Unix world, where we can receive icmp in response on TCP/UDP */
1545 LIST_REMOVE(icm, im_list);
1546 pData->cIcmpCacheSize--;
1547 RTMemFree(icm);
1548 break;
1549 default:
1550 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1551 break;
1552 }
1553 }
1554}
1555#else /* !RT_OS_WINDOWS */
/**
 * Receives an ICMP datagram from the host on a raw ICMP socket and hands it
 * over to send_icmp_to_guest().
 *
 * The datagram is read in two steps: first the IP header is peeked
 * (MSG_PEEK leaves the datagram queued) to learn the total length, then the
 * whole datagram is read into a temporary heap buffer.
 *
 * @param pData  the NAT state.
 * @param so     the ICMP socket to read from.
 */
static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
{
    struct sockaddr_in addr;
    socklen_t addrlen = sizeof(struct sockaddr_in);
    struct ip ip;
    char *buff;
    int len = 0;

    /* 1st step: peek at the IP header only; the datagram stays queued */
    len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
                   (struct sockaddr *)&addr, &addrlen);
    if (   len < 0
        && (   errno == EAGAIN
            || errno == EWOULDBLOCK
            || errno == EINPROGRESS
            || errno == ENOTCONN))
    {
        Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
        return;
    }

    /* Note: for negative len the first comparison is done as size_t and is
     * false, but the explicit 'len < 0' clause still catches that case. */
    if (   len < sizeof(struct ip)
        || len < 0
        || len == 0)
    {
        u_char code;
        code = ICMP_UNREACH_PORT;

        /* map the host errno to the closest ICMP unreachable subcode */
        if (errno == EHOSTUNREACH)
            code = ICMP_UNREACH_HOST;
        else if (errno == ENETUNREACH)
            code = ICMP_UNREACH_NET;

        LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
        icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
        so->so_m = NULL;
        Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
        return;
    }
    /* basic sanity check of the IP header */
    if (   ip.ip_v != IPVERSION
# ifndef RT_OS_DARWIN
        || ip.ip_p != IPPROTO_ICMP
# endif
       )
    {
        Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
        return;
    }
# ifndef RT_OS_DARWIN
    /* Darwin reports the IP length already in host byte order. */
    ip.ip_len = RT_N2H_U16(ip.ip_len);
# endif
# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
    /* Solaris and Darwin report the payload only */
    ip.ip_len += (ip.ip_hl << 2);
# endif
    /* Note: ip->ip_len in host byte order (all OS) */
    len = ip.ip_len;
    buff = RTMemAlloc(len);
    if (buff == NULL)
    {
        Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
        return;
    }
    /* 2nd step: read the whole datagram (header included) into the buffer */
    addrlen = sizeof(struct sockaddr_in);
    memset(&addr, 0, addrlen);
    len = recvfrom(so->s, buff, len, 0,
                   (struct sockaddr *)&addr, &addrlen);
    if (   len < 0
        && (   errno == EAGAIN
            || errno == EWOULDBLOCK
            || errno == EINPROGRESS
            || errno == ENOTCONN))
    {
        Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
            ip.ip_len));
        RTMemFree(buff);
        return;
    }
    if (   len < 0
        || len == 0)
    {
        Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
             errno, len, (ip.ip_len - sizeof(struct ip))));
        RTMemFree(buff);
        return;
    }
    /* len is modified in 2nd read, when the rest of the datagramm was read */
    send_icmp_to_guest(pData, buff, len, &addr);
    RTMemFree(buff);
}
1649#endif /* !RT_OS_WINDOWS */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette