VirtualBox

source: vbox/trunk/src/VBox/NetworkServices/NAT/pxtcp.c@ 69734

Last change on this file since 69734 was 69500, checked in by vboxsync, 7 years ago

*: scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 69.1 KB
Line 
1/* $Id: pxtcp.c 69500 2017-10-28 15:14:05Z vboxsync $ */
2/** @file
3 * NAT Network - TCP proxy.
4 */
5
6/*
7 * Copyright (C) 2013-2017 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18#define LOG_GROUP LOG_GROUP_NAT_SERVICE
19
20#include "winutils.h"
21
22#include "pxtcp.h"
23
24#include "proxy.h"
25#include "proxy_pollmgr.h"
26#include "pxremap.h"
27#include "portfwd.h" /* fwspec */
28
29#ifndef RT_OS_WINDOWS
30#include <sys/types.h>
31#include <sys/socket.h>
32#include <sys/ioctl.h>
33#ifdef RT_OS_SOLARIS
34#include <sys/filio.h> /* FIONREAD is BSD'ism */
35#endif
36#include <stdlib.h>
37#include <stdint.h>
38#include <stdio.h>
39#include <string.h>
40#include <poll.h>
41
42#include <err.h> /* BSD'ism */
43#else
44#include <stdlib.h>
45#include <stdio.h>
46#include <string.h>
47
48#include <iprt/stdint.h>
49#include "winpoll.h"
50#endif
51
52#include "lwip/opt.h"
53
54#include "lwip/sys.h"
55#include "lwip/tcpip.h"
56#include "lwip/netif.h"
57#include "lwip/tcp_impl.h" /* XXX: to access tcp_abandon() */
58#include "lwip/icmp.h"
59#include "lwip/icmp6.h"
60
/*
 * Host OSes disagree on when POLLHUP is reported for a TCP socket.
 * HAVE_TCP_POLLHUP records the host's behaviour as a mask of the
 * direction(s) that must be closed before POLLHUP shows up.
 *
 * The shutdown(2) "how" constants would read better here, but
 * SHUT_RD is 0, which leaves no spare value to mean "none".
 */
#if defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS)
# define HAVE_TCP_POLLHUP POLLIN /* reported when remote closes */
#elif defined(RT_OS_NETBSD) || defined(RT_OS_SOLARIS)
# define HAVE_TCP_POLLHUP 0 /* not reported */
#else
# define HAVE_TCP_POLLHUP (POLLIN|POLLOUT) /* reported when both directions are closed */
#endif
75
76
/**
 * Ring buffer holding inbound (host -> guest) data.
 *
 * The poll manager thread is the producer: it fills the buffer from
 * the host socket.  The lwIP thread is the consumer: it hands the data
 * to the pcb with tcp_write().
 *
 * NB: the lwIP stack itself is a third party here, so instead of the
 * usual free/data split the buffer is divided three ways:
 * free space / sent but unACKed data / data not yet sent.
 */
struct ringbuf {
    char *buf;                  /* backing storage */
    size_t bufsize;             /* size of buf */

    /*
     * Where free space begins.  The producer writes from here, up to
     * (but not into) "unacked".
     */
    volatile size_t vacant;

    /*
     * Where sent-but-unacknowledged data begins.  The stack still
     * "owns" these bytes since it may have to retransmit them, so
     * this index is the limit of the producer's free space.
     */
    volatile size_t unacked;

    /*
     * Where not-yet-sent data begins.  The consumer reads/sends from
     * here, up to "vacant".  Only the consumer thread touches this
     * index, hence no volatile qualifier.
     */
    size_t unsent;
};
109
110
111/**
112 */
113struct pxtcp {
114 /**
115 * Our poll manager handler. Must be first, strong/weak
116 * references depend on this "inheritance".
117 */
118 struct pollmgr_handler pmhdl;
119
120 /**
121 * lwIP (internal/guest) side of the proxied connection.
122 */
123 struct tcp_pcb *pcb;
124
125 /**
126 * Host (external) side of the proxied connection.
127 */
128 SOCKET sock;
129
130 /**
131 * Socket events we are currently polling for.
132 */
133 int events;
134
135 /**
136 * Socket error. Currently used to save connect(2) errors so that
137 * we can decide if we need to send ICMP error.
138 */
139 int sockerr;
140
141 /**
142 * Interface that we have got the SYN from. Needed to send ICMP
143 * with correct source address.
144 */
145 struct netif *netif;
146
147 /**
148 * For tentatively accepted connections for which we are in
149 * process of connecting to the real destination this is the
150 * initial pbuf that we might need to build ICMP error.
151 *
152 * When connection is established this is used to hold outbound
153 * pbuf chain received by pxtcp_pcb_recv() but not yet completely
154 * forwarded over the socket. We cannot "return" it to lwIP since
155 * the head of the chain is already sent and freed.
156 */
157 struct pbuf *unsent;
158
159 /**
160 * Guest has closed its side. Reported to pxtcp_pcb_recv() only
161 * once and we might not be able to forward it immediately if we
162 * have unsent pbuf.
163 */
164 int outbound_close;
165
166 /**
167 * Outbound half-close has been done on the socket.
168 */
169 int outbound_close_done;
170
171 /**
172 * External has closed its side. We might not be able to forward
173 * it immediately if we have unforwarded data.
174 */
175 int inbound_close;
176
177 /**
178 * Inbound half-close has been done on the pcb.
179 */
180 int inbound_close_done;
181
182 /**
183 * On systems that report POLLHUP as soon as the final FIN is
184 * received on a socket we cannot continue polling for the rest of
185 * input, so we have to read (pull) last data from the socket on
186 * the lwIP thread instead of polling/pushing it from the poll
187 * manager thread. See comment in pxtcp_pmgr_pump() POLLHUP case.
188 */
189 int inbound_pull;
190
191
192 /**
193 * When poll manager schedules delete we may not be able to delete
194 * a pxtcp immediately if not all inbound data has been acked by
195 * the guest: lwIP may need to resend and the data are in pxtcp's
196 * inbuf::buf. We defer delete until all data are acked to
197 * pxtcp_pcb_sent().
198 */
199 int deferred_delete;
200
201 /**
202 * Ring-buffer for inbound data.
203 */
204 struct ringbuf inbuf;
205
206 /**
207 * lwIP thread's strong reference to us.
208 */
209 struct pollmgr_refptr *rp;
210
211
212 /*
213 * We use static messages to call functions on the lwIP thread to
214 * void malloc/free overhead.
215 */
216 struct tcpip_msg msg_delete; /* delete pxtcp */
217 struct tcpip_msg msg_reset; /* reset connection and delete pxtcp */
218 struct tcpip_msg msg_accept; /* confirm accept of proxied connection */
219 struct tcpip_msg msg_outbound; /* trigger send of outbound data */
220 struct tcpip_msg msg_inbound; /* trigger send of inbound data */
221 struct tcpip_msg msg_inpull; /* trigger pull of last inbound data */
222};
223
224
225
226static struct pxtcp *pxtcp_allocate(void);
227static void pxtcp_free(struct pxtcp *);
228
229static void pxtcp_pcb_associate(struct pxtcp *, struct tcp_pcb *);
230static void pxtcp_pcb_dissociate(struct pxtcp *);
231
232/* poll manager callbacks for pxtcp related channels */
233static int pxtcp_pmgr_chan_add(struct pollmgr_handler *, SOCKET, int);
234static int pxtcp_pmgr_chan_pollout(struct pollmgr_handler *, SOCKET, int);
235static int pxtcp_pmgr_chan_pollin(struct pollmgr_handler *, SOCKET, int);
236#if !(HAVE_TCP_POLLHUP & POLLOUT)
237static int pxtcp_pmgr_chan_del(struct pollmgr_handler *, SOCKET, int);
238#endif
239static int pxtcp_pmgr_chan_reset(struct pollmgr_handler *, SOCKET, int);
240
241/* helper functions for sending/receiving pxtcp over poll manager channels */
242static ssize_t pxtcp_chan_send(enum pollmgr_slot_t, struct pxtcp *);
243static ssize_t pxtcp_chan_send_weak(enum pollmgr_slot_t, struct pxtcp *);
244static struct pxtcp *pxtcp_chan_recv(struct pollmgr_handler *, SOCKET, int);
245static struct pxtcp *pxtcp_chan_recv_strong(struct pollmgr_handler *, SOCKET, int);
246
247/* poll manager callbacks for individual sockets */
248static int pxtcp_pmgr_connect(struct pollmgr_handler *, SOCKET, int);
249static int pxtcp_pmgr_pump(struct pollmgr_handler *, SOCKET, int);
250
251/* get incoming traffic into ring buffer */
252static ssize_t pxtcp_sock_read(struct pxtcp *, int *);
253static ssize_t pxtcp_sock_recv(struct pxtcp *, IOVEC *, size_t); /* default */
254
255/* convenience functions for poll manager callbacks */
256static int pxtcp_schedule_delete(struct pxtcp *);
257static int pxtcp_schedule_reset(struct pxtcp *);
258static int pxtcp_schedule_reject(struct pxtcp *);
259
260/* lwip thread callbacks called via proxy_lwip_post() */
261static void pxtcp_pcb_delete_pxtcp(void *);
262static void pxtcp_pcb_reset_pxtcp(void *);
263static void pxtcp_pcb_accept_refuse(void *);
264static void pxtcp_pcb_accept_confirm(void *);
265static void pxtcp_pcb_write_outbound(void *);
266static void pxtcp_pcb_write_inbound(void *);
267static void pxtcp_pcb_pull_inbound(void *);
268
269/* tcp pcb callbacks */
270static err_t pxtcp_pcb_heard(void *, struct tcp_pcb *, struct pbuf *); /* global */
271static err_t pxtcp_pcb_accept(void *, struct tcp_pcb *, err_t);
272static err_t pxtcp_pcb_connected(void *, struct tcp_pcb *, err_t);
273static err_t pxtcp_pcb_recv(void *, struct tcp_pcb *, struct pbuf *, err_t);
274static err_t pxtcp_pcb_sent(void *, struct tcp_pcb *, u16_t);
275static err_t pxtcp_pcb_poll(void *, struct tcp_pcb *);
276static void pxtcp_pcb_err(void *, err_t);
277
278static err_t pxtcp_pcb_forward_outbound(struct pxtcp *, struct pbuf *);
279static void pxtcp_pcb_forward_outbound_close(struct pxtcp *);
280
281static ssize_t pxtcp_sock_send(struct pxtcp *, IOVEC *, size_t);
282
283static void pxtcp_pcb_forward_inbound(struct pxtcp *);
284static void pxtcp_pcb_forward_inbound_close(struct pxtcp *);
285DECLINLINE(int) pxtcp_pcb_forward_inbound_done(const struct pxtcp *);
286static void pxtcp_pcb_schedule_poll(struct pxtcp *);
287static void pxtcp_pcb_cancel_poll(struct pxtcp *);
288
289static void pxtcp_pcb_reject(struct tcp_pcb *, int, struct netif *, struct pbuf *);
290DECLINLINE(void) pxtcp_pcb_maybe_deferred_delete(struct pxtcp *);
291
292/* poll manager handlers for pxtcp channels */
293static struct pollmgr_handler pxtcp_pmgr_chan_add_hdl;
294static struct pollmgr_handler pxtcp_pmgr_chan_pollout_hdl;
295static struct pollmgr_handler pxtcp_pmgr_chan_pollin_hdl;
296#if !(HAVE_TCP_POLLHUP & POLLOUT)
297static struct pollmgr_handler pxtcp_pmgr_chan_del_hdl;
298#endif
299static struct pollmgr_handler pxtcp_pmgr_chan_reset_hdl;
300
301
302/**
303 * Init PXTCP - must be run when neither lwIP tcpip thread, nor poll
304 * manager threads haven't been created yet.
305 */
306void
307pxtcp_init(void)
308{
309 /*
310 * Create channels.
311 */
312#define CHANNEL(SLOT, NAME) do { \
313 NAME##_hdl.callback = NAME; \
314 NAME##_hdl.data = NULL; \
315 NAME##_hdl.slot = -1; \
316 pollmgr_add_chan(SLOT, &NAME##_hdl); \
317 } while (0)
318
319 CHANNEL(POLLMGR_CHAN_PXTCP_ADD, pxtcp_pmgr_chan_add);
320 CHANNEL(POLLMGR_CHAN_PXTCP_POLLIN, pxtcp_pmgr_chan_pollin);
321 CHANNEL(POLLMGR_CHAN_PXTCP_POLLOUT, pxtcp_pmgr_chan_pollout);
322#if !(HAVE_TCP_POLLHUP & POLLOUT)
323 CHANNEL(POLLMGR_CHAN_PXTCP_DEL, pxtcp_pmgr_chan_del);
324#endif
325 CHANNEL(POLLMGR_CHAN_PXTCP_RESET, pxtcp_pmgr_chan_reset);
326
327#undef CHANNEL
328
329 /*
330 * Listen to outgoing connection from guest(s).
331 */
332 tcp_proxy_accept(pxtcp_pcb_heard);
333}
334
335
336/**
337 * Syntactic sugar for sending pxtcp pointer over poll manager
338 * channel. Used by lwip thread functions.
339 */
340static ssize_t
341pxtcp_chan_send(enum pollmgr_slot_t slot, struct pxtcp *pxtcp)
342{
343 return pollmgr_chan_send(slot, &pxtcp, sizeof(pxtcp));
344}
345
346
347/**
348 * Syntactic sugar for sending weak reference to pxtcp over poll
349 * manager channel. Used by lwip thread functions.
350 */
351static ssize_t
352pxtcp_chan_send_weak(enum pollmgr_slot_t slot, struct pxtcp *pxtcp)
353{
354 pollmgr_refptr_weak_ref(pxtcp->rp);
355 return pollmgr_chan_send(slot, &pxtcp->rp, sizeof(pxtcp->rp));
356}
357
358
359/**
360 * Counterpart of pxtcp_chan_send().
361 */
362static struct pxtcp *
363pxtcp_chan_recv(struct pollmgr_handler *handler, SOCKET fd, int revents)
364{
365 struct pxtcp *pxtcp;
366
367 pxtcp = (struct pxtcp *)pollmgr_chan_recv_ptr(handler, fd, revents);
368 return pxtcp;
369}
370
371
372/**
373 * Counterpart of pxtcp_chan_send_weak().
374 */
375static struct pxtcp *
376pxtcp_chan_recv_strong(struct pollmgr_handler *handler, SOCKET fd, int revents)
377{
378 struct pollmgr_refptr *rp;
379 struct pollmgr_handler *base;
380 struct pxtcp *pxtcp;
381
382 rp = (struct pollmgr_refptr *)pollmgr_chan_recv_ptr(handler, fd, revents);
383 base = (struct pollmgr_handler *)pollmgr_refptr_get(rp);
384 pxtcp = (struct pxtcp *)base;
385
386 return pxtcp;
387}
388
389
390/**
391 * Register pxtcp with poll manager.
392 *
393 * Used for POLLMGR_CHAN_PXTCP_ADD and by port-forwarding. Since
394 * error handling is different in these two cases, we leave it up to
395 * the caller.
396 */
397int
398pxtcp_pmgr_add(struct pxtcp *pxtcp)
399{
400 int status;
401
402 LWIP_ASSERT1(pxtcp != NULL);
403#ifdef RT_OS_WINDOWS
404 LWIP_ASSERT1(pxtcp->sock != INVALID_SOCKET);
405#else
406 LWIP_ASSERT1(pxtcp->sock >= 0);
407#endif
408 LWIP_ASSERT1(pxtcp->pmhdl.callback != NULL);
409 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
410 LWIP_ASSERT1(pxtcp->pmhdl.slot < 0);
411
412 status = pollmgr_add(&pxtcp->pmhdl, pxtcp->sock, pxtcp->events);
413 return status;
414}
415
416
417/**
418 * Unregister pxtcp with poll manager.
419 *
420 * Used for POLLMGR_CHAN_PXTCP_RESET and by port-forwarding (on error
421 * leg).
422 */
423void
424pxtcp_pmgr_del(struct pxtcp *pxtcp)
425{
426 LWIP_ASSERT1(pxtcp != NULL);
427
428 pollmgr_del_slot(pxtcp->pmhdl.slot);
429}
430
431
432/**
433 * POLLMGR_CHAN_PXTCP_ADD handler.
434 *
435 * Get new pxtcp from lwip thread and start polling its socket.
436 */
437static int
438pxtcp_pmgr_chan_add(struct pollmgr_handler *handler, SOCKET fd, int revents)
439{
440 struct pxtcp *pxtcp;
441 int status;
442
443 pxtcp = pxtcp_chan_recv(handler, fd, revents);
444 DPRINTF0(("pxtcp_add: new pxtcp %p; pcb %p; sock %d\n",
445 (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
446
447 status = pxtcp_pmgr_add(pxtcp);
448 if (status < 0) {
449 (void) pxtcp_schedule_reset(pxtcp);
450 }
451
452 return POLLIN;
453}
454
455
456/**
457 * POLLMGR_CHAN_PXTCP_POLLOUT handler.
458 *
459 * pxtcp_pcb_forward_outbound() on the lwIP thread tried to send data
460 * and failed, it now requests us to poll the socket for POLLOUT and
461 * schedule pxtcp_pcb_forward_outbound() when sock is writable again.
462 */
463static int
464pxtcp_pmgr_chan_pollout(struct pollmgr_handler *handler, SOCKET fd, int revents)
465{
466 struct pxtcp *pxtcp;
467
468 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
469 DPRINTF0(("pxtcp_pollout: pxtcp %p\n", (void *)pxtcp));
470
471 if (pxtcp == NULL) {
472 return POLLIN;
473 }
474
475 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
476 LWIP_ASSERT1(pxtcp->pmhdl.slot > 0);
477
478 pxtcp->events |= POLLOUT;
479 pollmgr_update_events(pxtcp->pmhdl.slot, pxtcp->events);
480
481 return POLLIN;
482}
483
484
485/**
486 * POLLMGR_CHAN_PXTCP_POLLIN handler.
487 */
488static int
489pxtcp_pmgr_chan_pollin(struct pollmgr_handler *handler, SOCKET fd, int revents)
490{
491 struct pxtcp *pxtcp;
492
493 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
494 DPRINTF2(("pxtcp_pollin: pxtcp %p\n", (void *)pxtcp));
495
496 if (pxtcp == NULL) {
497 return POLLIN;
498 }
499
500 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
501 LWIP_ASSERT1(pxtcp->pmhdl.slot > 0);
502
503 if (pxtcp->inbound_close) {
504 return POLLIN;
505 }
506
507 pxtcp->events |= POLLIN;
508 pollmgr_update_events(pxtcp->pmhdl.slot, pxtcp->events);
509
510 return POLLIN;
511}
512
513
514#if !(HAVE_TCP_POLLHUP & POLLOUT)
515/**
516 * POLLMGR_CHAN_PXTCP_DEL handler.
517 *
518 * Schedule pxtcp deletion. We only need this if host system doesn't
519 * report POLLHUP for fully closed tcp sockets.
520 */
521static int
522pxtcp_pmgr_chan_del(struct pollmgr_handler *handler, SOCKET fd, int revents)
523{
524 struct pxtcp *pxtcp;
525
526 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
527 if (pxtcp == NULL) {
528 return POLLIN;
529 }
530
531 DPRINTF(("PXTCP_DEL: pxtcp %p; pcb %p; sock %d\n",
532 (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
533
534 LWIP_ASSERT1(pxtcp->pmhdl.callback != NULL);
535 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
536
537 LWIP_ASSERT1(pxtcp->inbound_close); /* EOF read */
538 LWIP_ASSERT1(pxtcp->outbound_close_done); /* EOF sent */
539
540 pxtcp_pmgr_del(pxtcp);
541 (void) pxtcp_schedule_delete(pxtcp);
542
543 return POLLIN;
544}
545#endif /* !(HAVE_TCP_POLLHUP & POLLOUT) */
546
547
548/**
549 * POLLMGR_CHAN_PXTCP_RESET handler.
550 *
551 * Close the socket with RST and delete pxtcp.
552 */
553static int
554pxtcp_pmgr_chan_reset(struct pollmgr_handler *handler, SOCKET fd, int revents)
555{
556 struct pxtcp *pxtcp;
557
558 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
559 if (pxtcp == NULL) {
560 return POLLIN;
561 }
562
563 DPRINTF0(("PXTCP_RESET: pxtcp %p; pcb %p; sock %d\n",
564 (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
565
566 LWIP_ASSERT1(pxtcp->pmhdl.callback != NULL);
567 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
568
569 pxtcp_pmgr_del(pxtcp);
570
571 proxy_reset_socket(pxtcp->sock);
572 pxtcp->sock = INVALID_SOCKET;
573
574 (void) pxtcp_schedule_reset(pxtcp);
575
576 return POLLIN;
577}
578
579
580static struct pxtcp *
581pxtcp_allocate(void)
582{
583 struct pxtcp *pxtcp;
584
585 pxtcp = (struct pxtcp *)malloc(sizeof(*pxtcp));
586 if (pxtcp == NULL) {
587 return NULL;
588 }
589
590 pxtcp->pmhdl.callback = NULL;
591 pxtcp->pmhdl.data = (void *)pxtcp;
592 pxtcp->pmhdl.slot = -1;
593
594 pxtcp->pcb = NULL;
595 pxtcp->sock = INVALID_SOCKET;
596 pxtcp->events = 0;
597 pxtcp->sockerr = 0;
598 pxtcp->netif = NULL;
599 pxtcp->unsent = NULL;
600 pxtcp->outbound_close = 0;
601 pxtcp->outbound_close_done = 0;
602 pxtcp->inbound_close = 0;
603 pxtcp->inbound_close_done = 0;
604 pxtcp->inbound_pull = 0;
605 pxtcp->deferred_delete = 0;
606
607 pxtcp->inbuf.bufsize = 64 * 1024;
608 pxtcp->inbuf.buf = (char *)malloc(pxtcp->inbuf.bufsize);
609 if (pxtcp->inbuf.buf == NULL) {
610 free(pxtcp);
611 return NULL;
612 }
613 pxtcp->inbuf.vacant = 0;
614 pxtcp->inbuf.unacked = 0;
615 pxtcp->inbuf.unsent = 0;
616
617 pxtcp->rp = pollmgr_refptr_create(&pxtcp->pmhdl);
618 if (pxtcp->rp == NULL) {
619 free(pxtcp->inbuf.buf);
620 free(pxtcp);
621 return NULL;
622 }
623
624#define CALLBACK_MSG(MSG, FUNC) \
625 do { \
626 pxtcp->MSG.type = TCPIP_MSG_CALLBACK_STATIC; \
627 pxtcp->MSG.sem = NULL; \
628 pxtcp->MSG.msg.cb.function = FUNC; \
629 pxtcp->MSG.msg.cb.ctx = (void *)pxtcp; \
630 } while (0)
631
632 CALLBACK_MSG(msg_delete, pxtcp_pcb_delete_pxtcp);
633 CALLBACK_MSG(msg_reset, pxtcp_pcb_reset_pxtcp);
634 CALLBACK_MSG(msg_accept, pxtcp_pcb_accept_confirm);
635 CALLBACK_MSG(msg_outbound, pxtcp_pcb_write_outbound);
636 CALLBACK_MSG(msg_inbound, pxtcp_pcb_write_inbound);
637 CALLBACK_MSG(msg_inpull, pxtcp_pcb_pull_inbound);
638
639#undef CALLBACK_MSG
640
641 return pxtcp;
642}
643
644
645/**
646 * Exported to fwtcp to create pxtcp for incoming port-forwarded
647 * connections. Completed with pcb in pxtcp_pcb_connect().
648 */
649struct pxtcp *
650pxtcp_create_forwarded(SOCKET sock)
651{
652 struct pxtcp *pxtcp;
653
654 pxtcp = pxtcp_allocate();
655 if (pxtcp == NULL) {
656 return NULL;
657 }
658
659 pxtcp->sock = sock;
660 pxtcp->pmhdl.callback = pxtcp_pmgr_pump;
661 pxtcp->events = 0;
662
663 return pxtcp;
664}
665
666
667static void
668pxtcp_pcb_associate(struct pxtcp *pxtcp, struct tcp_pcb *pcb)
669{
670 LWIP_ASSERT1(pxtcp != NULL);
671 LWIP_ASSERT1(pcb != NULL);
672
673 pxtcp->pcb = pcb;
674
675 tcp_arg(pcb, pxtcp);
676
677 tcp_recv(pcb, pxtcp_pcb_recv);
678 tcp_sent(pcb, pxtcp_pcb_sent);
679 tcp_poll(pcb, NULL, 255);
680 tcp_err(pcb, pxtcp_pcb_err);
681}
682
683
684static void
685pxtcp_free(struct pxtcp *pxtcp)
686{
687 if (pxtcp->unsent != NULL) {
688 pbuf_free(pxtcp->unsent);
689 }
690 if (pxtcp->inbuf.buf != NULL) {
691 free(pxtcp->inbuf.buf);
692 }
693 free(pxtcp);
694}
695
696
697/**
698 * Counterpart to pxtcp_create_forwarded() to destruct pxtcp that
699 * fwtcp failed to register with poll manager to post to lwip thread
700 * for doing connect.
701 */
702void
703pxtcp_cancel_forwarded(struct pxtcp *pxtcp)
704{
705 LWIP_ASSERT1(pxtcp->pcb == NULL);
706 pxtcp_pcb_reset_pxtcp(pxtcp);
707}
708
709
710static void
711pxtcp_pcb_dissociate(struct pxtcp *pxtcp)
712{
713 if (pxtcp == NULL || pxtcp->pcb == NULL) {
714 return;
715 }
716
717 DPRINTF(("%s: pxtcp %p <-> pcb %p\n",
718 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
719
720 /*
721 * We must have dissociated from a fully closed pcb immediately
722 * since lwip recycles them and we don't wan't to mess with what
723 * would be someone else's pcb that we happen to have a stale
724 * pointer to.
725 */
726 LWIP_ASSERT1(pxtcp->pcb->callback_arg == pxtcp);
727
728 tcp_recv(pxtcp->pcb, NULL);
729 tcp_sent(pxtcp->pcb, NULL);
730 tcp_poll(pxtcp->pcb, NULL, 255);
731 tcp_err(pxtcp->pcb, NULL);
732 tcp_arg(pxtcp->pcb, NULL);
733 pxtcp->pcb = NULL;
734}
735
736
737/**
738 * Lwip thread callback invoked via pxtcp::msg_delete
739 *
740 * Since we use static messages to communicate to the lwip thread, we
741 * cannot delete pxtcp without making sure there are no unprocessed
742 * messages in the lwip thread mailbox.
743 *
744 * The easiest way to ensure that is to send this "delete" message as
745 * the last one and when it's processed we know there are no more and
746 * it's safe to delete pxtcp.
747 *
748 * Poll manager handlers should use pxtcp_schedule_delete()
749 * convenience function.
750 */
751static void
752pxtcp_pcb_delete_pxtcp(void *ctx)
753{
754 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
755
756 DPRINTF(("%s: pxtcp %p, pcb %p, sock %d%s\n",
757 __func__, (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock,
758 (pxtcp->deferred_delete && !pxtcp->inbound_pull
759 ? " (was deferred)" : "")));
760
761 LWIP_ASSERT1(pxtcp != NULL);
762 LWIP_ASSERT1(pxtcp->pmhdl.slot < 0);
763 LWIP_ASSERT1(pxtcp->outbound_close_done);
764 LWIP_ASSERT1(pxtcp->inbound_close); /* not necessarily done */
765
766
767 /*
768 * pxtcp is no longer registered with poll manager, so it's safe
769 * to close the socket.
770 */
771 if (pxtcp->sock != INVALID_SOCKET) {
772 closesocket(pxtcp->sock);
773 pxtcp->sock = INVALID_SOCKET;
774 }
775
776 /*
777 * We might have already dissociated from a fully closed pcb, or
778 * guest might have sent us a reset while msg_delete was in
779 * transit. If there's no pcb, we are done.
780 */
781 if (pxtcp->pcb == NULL) {
782 pollmgr_refptr_unref(pxtcp->rp);
783 pxtcp_free(pxtcp);
784 return;
785 }
786
787 /*
788 * Have we completely forwarded all inbound traffic to the guest?
789 *
790 * We may still be waiting for ACKs. We may have failed to send
791 * some of the data (tcp_write() failed with ERR_MEM). We may
792 * have failed to send the FIN (tcp_shutdown() failed with
793 * ERR_MEM).
794 */
795 if (pxtcp_pcb_forward_inbound_done(pxtcp)) {
796 pxtcp_pcb_dissociate(pxtcp);
797 pollmgr_refptr_unref(pxtcp->rp);
798 pxtcp_free(pxtcp);
799 }
800 else {
801 DPRINTF2(("delete: pxtcp %p; pcb %p:"
802 " unacked %d, unsent %d, vacant %d, %s - DEFER!\n",
803 (void *)pxtcp, (void *)pxtcp->pcb,
804 (int)pxtcp->inbuf.unacked,
805 (int)pxtcp->inbuf.unsent,
806 (int)pxtcp->inbuf.vacant,
807 pxtcp->inbound_close_done ? "FIN sent" : "FIN is NOT sent"));
808
809 LWIP_ASSERT1(!pxtcp->deferred_delete);
810 pxtcp->deferred_delete = 1;
811 }
812}
813
814
815/**
816 * If we couldn't delete pxtcp right away in the msg_delete callback
817 * from the poll manager thread, we repeat the check at the end of
818 * relevant pcb callbacks.
819 */
820DECLINLINE(void)
821pxtcp_pcb_maybe_deferred_delete(struct pxtcp *pxtcp)
822{
823 if (pxtcp->deferred_delete && pxtcp_pcb_forward_inbound_done(pxtcp)) {
824 pxtcp_pcb_delete_pxtcp(pxtcp);
825 }
826}
827
828
829/**
830 * Poll manager callbacks should use this convenience wrapper to
831 * schedule pxtcp deletion on the lwip thread and to deregister from
832 * the poll manager.
833 */
834static int
835pxtcp_schedule_delete(struct pxtcp *pxtcp)
836{
837 /*
838 * If pollmgr_refptr_get() is called by any channel before
839 * scheduled deletion happens, let them know we are gone.
840 */
841 pxtcp->pmhdl.slot = -1;
842
843 /*
844 * Schedule deletion. Since poll manager thread may be pre-empted
845 * right after we send the message, the deletion may actually
846 * happen on the lwip thread before we return from this function,
847 * so it's not safe to refer to pxtcp after this call.
848 */
849 proxy_lwip_post(&pxtcp->msg_delete);
850
851 /* tell poll manager to deregister us */
852 return -1;
853}
854
855
856/**
857 * Lwip thread callback invoked via pxtcp::msg_reset
858 *
859 * Like pxtcp_pcb_delete(), but sends RST to the guest before
860 * deleting this pxtcp.
861 */
862static void
863pxtcp_pcb_reset_pxtcp(void *ctx)
864{
865 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
866 LWIP_ASSERT1(pxtcp != NULL);
867
868 DPRINTF0(("%s: pxtcp %p, pcb %p, sock %d\n",
869 __func__, (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
870
871 if (pxtcp->sock != INVALID_SOCKET) {
872 proxy_reset_socket(pxtcp->sock);
873 pxtcp->sock = INVALID_SOCKET;
874 }
875
876 if (pxtcp->pcb != NULL) {
877 struct tcp_pcb *pcb = pxtcp->pcb;
878 pxtcp_pcb_dissociate(pxtcp);
879 tcp_abort(pcb);
880 }
881
882 pollmgr_refptr_unref(pxtcp->rp);
883 pxtcp_free(pxtcp);
884}
885
886
887
888/**
889 * Poll manager callbacks should use this convenience wrapper to
890 * schedule pxtcp reset and deletion on the lwip thread and to
891 * deregister from the poll manager.
892 *
893 * See pxtcp_schedule_delete() for additional comments.
894 */
895static int
896pxtcp_schedule_reset(struct pxtcp *pxtcp)
897{
898 pxtcp->pmhdl.slot = -1;
899 proxy_lwip_post(&pxtcp->msg_reset);
900 return -1;
901}
902
903
904/**
905 * Reject proxy connection attempt. Depending on the cause (sockerr)
906 * we may just drop the pcb silently, generate an ICMP datagram or
907 * send TCP reset.
908 */
909static void
910pxtcp_pcb_reject(struct tcp_pcb *pcb, int sockerr,
911 struct netif *netif, struct pbuf *p)
912{
913 int reset = 0;
914
915 if (sockerr == ECONNREFUSED) {
916 reset = 1;
917 }
918 else if (p != NULL) {
919 struct netif *oif;
920
921 LWIP_ASSERT1(netif != NULL);
922
923 oif = ip_current_netif();
924 ip_current_netif() = netif;
925
926 if (PCB_ISIPV6(pcb)) {
927 if (sockerr == EHOSTDOWN) {
928 icmp6_dest_unreach(p, ICMP6_DUR_ADDRESS); /* XXX: ??? */
929 }
930 else if (sockerr == EHOSTUNREACH
931 || sockerr == ENETDOWN
932 || sockerr == ENETUNREACH)
933 {
934 icmp6_dest_unreach(p, ICMP6_DUR_NO_ROUTE);
935 }
936 }
937 else {
938 if (sockerr == EHOSTDOWN
939 || sockerr == EHOSTUNREACH
940 || sockerr == ENETDOWN
941 || sockerr == ENETUNREACH)
942 {
943 icmp_dest_unreach(p, ICMP_DUR_HOST);
944 }
945 }
946
947 ip_current_netif() = oif;
948 }
949
950 tcp_abandon(pcb, reset);
951}
952
953
954/**
955 * Called from poll manager thread via pxtcp::msg_accept when proxy
956 * failed to connect to the destination. Also called when we failed
957 * to register pxtcp with poll manager.
958 *
959 * This is like pxtcp_pcb_reset_pxtcp() but is more discriminate in
960 * how this unestablished connection is terminated.
961 */
962static void
963pxtcp_pcb_accept_refuse(void *ctx)
964{
965 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
966
967 DPRINTF0(("%s: pxtcp %p, pcb %p, sock %d: %R[sockerr]\n",
968 __func__, (void *)pxtcp, (void *)pxtcp->pcb,
969 pxtcp->sock, pxtcp->sockerr));
970
971 LWIP_ASSERT1(pxtcp != NULL);
972 LWIP_ASSERT1(pxtcp->sock == INVALID_SOCKET);
973
974 if (pxtcp->pcb != NULL) {
975 struct tcp_pcb *pcb = pxtcp->pcb;
976 pxtcp_pcb_dissociate(pxtcp);
977 pxtcp_pcb_reject(pcb, pxtcp->sockerr, pxtcp->netif, pxtcp->unsent);
978 }
979
980 pollmgr_refptr_unref(pxtcp->rp);
981 pxtcp_free(pxtcp);
982}
983
984
985/**
986 * Convenience wrapper for poll manager connect callback to reject
987 * connection attempt.
988 *
989 * Like pxtcp_schedule_reset(), but the callback is more discriminate
990 * in how this unestablished connection is terminated.
991 */
992static int
993pxtcp_schedule_reject(struct pxtcp *pxtcp)
994{
995 pxtcp->msg_accept.msg.cb.function = pxtcp_pcb_accept_refuse;
996 pxtcp->pmhdl.slot = -1;
997 proxy_lwip_post(&pxtcp->msg_accept);
998 return -1;
999}
1000
1001
1002/**
1003 * Global tcp_proxy_accept() callback for proxied outgoing TCP
1004 * connections from guest(s).
1005 */
1006static err_t
1007pxtcp_pcb_heard(void *arg, struct tcp_pcb *newpcb, struct pbuf *syn)
1008{
1009 LWIP_UNUSED_ARG(arg);
1010
1011 return pxtcp_pcb_accept_outbound(newpcb, syn,
1012 PCB_ISIPV6(newpcb), &newpcb->local_ip, newpcb->local_port);
1013}
1014
1015
1016err_t
1017pxtcp_pcb_accept_outbound(struct tcp_pcb *newpcb, struct pbuf *p,
1018 int is_ipv6, ipX_addr_t *dst_addr, u16_t dst_port)
1019{
1020 struct pxtcp *pxtcp;
1021 ipX_addr_t mapped_dst_addr;
1022 int sdom;
1023 SOCKET sock;
1024 ssize_t nsent;
1025 int sockerr = 0;
1026
1027 /*
1028 * TCP first calls accept callback when it receives the first SYN
1029 * and "tentatively accepts" new proxied connection attempt. When
1030 * proxy "confirms" the SYN and sends SYN|ACK and the guest
1031 * replies with ACK the accept callback is called again, this time
1032 * with the established connection.
1033 */
1034 LWIP_ASSERT1(newpcb->state == SYN_RCVD_0);
1035 tcp_accept(newpcb, pxtcp_pcb_accept);
1036 tcp_arg(newpcb, NULL);
1037
1038 tcp_setprio(newpcb, TCP_PRIO_MAX);
1039
1040 pxremap_outbound_ipX(is_ipv6, &mapped_dst_addr, dst_addr);
1041
1042 sdom = is_ipv6 ? PF_INET6 : PF_INET;
1043 sock = proxy_connected_socket(sdom, SOCK_STREAM,
1044 &mapped_dst_addr, dst_port);
1045 if (sock == INVALID_SOCKET) {
1046 sockerr = SOCKERRNO();
1047 goto abort;
1048 }
1049
1050 pxtcp = pxtcp_allocate();
1051 if (pxtcp == NULL) {
1052 proxy_reset_socket(sock);
1053 goto abort;
1054 }
1055
1056 /* save initial datagram in case we need to reply with ICMP */
1057 if (p != NULL) {
1058 pbuf_ref(p);
1059 pxtcp->unsent = p;
1060 pxtcp->netif = ip_current_netif();
1061 }
1062
1063 pxtcp_pcb_associate(pxtcp, newpcb);
1064 pxtcp->sock = sock;
1065
1066 pxtcp->pmhdl.callback = pxtcp_pmgr_connect;
1067 pxtcp->events = POLLOUT;
1068
1069 nsent = pxtcp_chan_send(POLLMGR_CHAN_PXTCP_ADD, pxtcp);
1070 if (nsent < 0) {
1071 pxtcp->sock = INVALID_SOCKET;
1072 proxy_reset_socket(sock);
1073 pxtcp_pcb_accept_refuse(pxtcp);
1074 return ERR_ABRT;
1075 }
1076
1077 return ERR_OK;
1078
1079 abort:
1080 DPRINTF0(("%s: pcb %p, sock %d: %R[sockerr]\n",
1081 __func__, (void *)newpcb, sock, sockerr));
1082 pxtcp_pcb_reject(newpcb, sockerr, ip_current_netif(), p);
1083 return ERR_ABRT;
1084}
1085
1086
1087/**
1088 * tcp_proxy_accept() callback for accepted proxied outgoing TCP
1089 * connections from guest(s). This is "real" accept with three-way
1090 * handshake completed.
1091 */
1092static err_t
1093pxtcp_pcb_accept(void *arg, struct tcp_pcb *pcb, err_t error)
1094{
1095 struct pxtcp *pxtcp = (struct pxtcp *)arg;
1096
1097 LWIP_UNUSED_ARG(pcb); /* used only in asserts */
1098 LWIP_UNUSED_ARG(error); /* always ERR_OK */
1099
1100 LWIP_ASSERT1(pxtcp != NULL);
1101 LWIP_ASSERT1(pxtcp->pcb = pcb);
1102 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
1103
1104 /* send any inbound data that are already queued */
1105 pxtcp_pcb_forward_inbound(pxtcp);
1106 return ERR_OK;
1107}
1108
1109
/**
 * Initial poll manager callback for proxied outgoing TCP connections.
 * pxtcp_pcb_accept_outbound() sets pxtcp::pmhdl::callback to this.
 *
 * Waits for connect(2) to the destination to complete.  On success
 * replaces itself with the pxtcp_pmgr_pump() callback common to all
 * established TCP connections.
 */
1118static int
1119pxtcp_pmgr_connect(struct pollmgr_handler *handler, SOCKET fd, int revents)
1120{
1121 struct pxtcp *pxtcp;
1122 RT_NOREF(fd);
1123
1124 pxtcp = (struct pxtcp *)handler->data;
1125 LWIP_ASSERT1(handler == &pxtcp->pmhdl);
1126 LWIP_ASSERT1(fd == pxtcp->sock);
1127 LWIP_ASSERT1(pxtcp->sockerr == 0);
1128
1129 if (revents & POLLNVAL) {
1130 pxtcp->sock = INVALID_SOCKET;
1131 pxtcp->sockerr = ETIMEDOUT;
1132 return pxtcp_schedule_reject(pxtcp);
1133 }
1134
1135 /*
1136 * Solaris and NetBSD don't report either POLLERR or POLLHUP when
1137 * connect(2) fails, just POLLOUT. In that case we always need to
1138 * check SO_ERROR.
1139 */
1140#if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD)
1141# define CONNECT_CHECK_ERROR POLLOUT
1142#else
1143# define CONNECT_CHECK_ERROR (POLLERR | POLLHUP)
1144#endif
1145
1146 /*
1147 * Check the cause of the failure so that pxtcp_pcb_reject() may
1148 * behave accordingly.
1149 */
1150 if (revents & CONNECT_CHECK_ERROR) {
1151 socklen_t optlen = (socklen_t)sizeof(pxtcp->sockerr);
1152 int status;
1153 SOCKET s;
1154
1155 status = getsockopt(pxtcp->sock, SOL_SOCKET, SO_ERROR,
1156 (char *)&pxtcp->sockerr, &optlen);
1157 if (RT_UNLIKELY(status == SOCKET_ERROR)) { /* should not happen */
1158 DPRINTF(("%s: sock %d: SO_ERROR failed: %R[sockerr]\n",
1159 __func__, fd, SOCKERRNO()));
1160 pxtcp->sockerr = ETIMEDOUT;
1161 }
1162 else {
1163 /* don't spam this log on successful connect(2) */
1164 if ((revents & (POLLERR | POLLHUP)) /* we were told it's failed */
1165 || pxtcp->sockerr != 0) /* we determined it's failed */
1166 {
1167 DPRINTF(("%s: sock %d: connect: %R[sockerr]\n",
1168 __func__, fd, pxtcp->sockerr));
1169 }
1170
1171 if ((revents & (POLLERR | POLLHUP))
1172 && RT_UNLIKELY(pxtcp->sockerr == 0))
1173 {
1174 /* if we're told it's failed, make sure it's marked as such */
1175 pxtcp->sockerr = ETIMEDOUT;
1176 }
1177 }
1178
1179 if (pxtcp->sockerr != 0) {
1180 s = pxtcp->sock;
1181 pxtcp->sock = INVALID_SOCKET;
1182 closesocket(s);
1183 return pxtcp_schedule_reject(pxtcp);
1184 }
1185 }
1186
1187 if (revents & POLLOUT) { /* connect is successful */
1188 /* confirm accept to the guest */
1189 proxy_lwip_post(&pxtcp->msg_accept);
1190
1191 /*
1192 * Switch to common callback used for all established proxied
1193 * connections.
1194 */
1195 pxtcp->pmhdl.callback = pxtcp_pmgr_pump;
1196
1197 /*
1198 * Initially we poll for incoming traffic only. Outgoing
1199 * traffic is fast-forwarded by pxtcp_pcb_recv(); if it fails
1200 * it will ask us to poll for POLLOUT too.
1201 */
1202 pxtcp->events = POLLIN;
1203 return pxtcp->events;
1204 }
1205
1206 /* should never get here */
1207 DPRINTF0(("%s: pxtcp %p, sock %d: unexpected revents 0x%x\n",
1208 __func__, (void *)pxtcp, fd, revents));
1209 return pxtcp_schedule_reset(pxtcp);
1210}
1211
1212
1213/**
1214 * Called from poll manager thread via pxtcp::msg_accept when proxy
1215 * connected to the destination. Finalize accept by sending SYN|ACK
1216 * to the guest.
1217 */
1218static void
1219pxtcp_pcb_accept_confirm(void *ctx)
1220{
1221 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
1222 err_t error;
1223
1224 LWIP_ASSERT1(pxtcp != NULL);
1225 if (pxtcp->pcb == NULL) {
1226 return;
1227 }
1228
1229 /* we are not going to reply with ICMP, so we can drop initial pbuf */
1230 if (pxtcp->unsent != NULL) {
1231 pbuf_free(pxtcp->unsent);
1232 pxtcp->unsent = NULL;
1233 }
1234
1235 error = tcp_proxy_accept_confirm(pxtcp->pcb);
1236
1237 /*
1238 * If lwIP failed to enqueue SYN|ACK because it's out of pbufs it
1239 * abandons the pcb. Retrying that is not very easy, since it
1240 * would require keeping "fractional state". From guest's point
1241 * of view there is no reply to its SYN so it will either resend
1242 * the SYN (effetively triggering full connection retry for us),
1243 * or it will eventually time out.
1244 */
1245 if (error == ERR_ABRT) {
1246 pxtcp->pcb = NULL; /* pcb is gone */
1247 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
1248 }
1249
1250 /*
1251 * else if (error != ERR_OK): even if tcp_output() failed with
1252 * ERR_MEM - don't give up, that SYN|ACK is enqueued and will be
1253 * retransmitted eventually.
1254 */
1255}
1256
1257
1258/**
1259 * Entry point for port-forwarding.
1260 *
1261 * fwtcp accepts new incoming connection, creates pxtcp for the socket
1262 * (with no pcb yet) and adds it to the poll manager (polling for
1263 * errors only). Then it calls this function to construct the pcb and
1264 * perform connection to the guest.
1265 */
1266void
1267pxtcp_pcb_connect(struct pxtcp *pxtcp, const struct fwspec *fwspec)
1268{
1269 struct sockaddr_storage ss;
1270 socklen_t sslen;
1271 struct tcp_pcb *pcb;
1272 ipX_addr_t src_addr, dst_addr;
1273 u16_t src_port, dst_port;
1274 int status;
1275 err_t error;
1276
1277 LWIP_ASSERT1(pxtcp != NULL);
1278 LWIP_ASSERT1(pxtcp->pcb == NULL);
1279 LWIP_ASSERT1(fwspec->stype == SOCK_STREAM);
1280
1281 pcb = tcp_new();
1282 if (pcb == NULL) {
1283 goto reset;
1284 }
1285
1286 tcp_setprio(pcb, TCP_PRIO_MAX);
1287 pxtcp_pcb_associate(pxtcp, pcb);
1288
1289 sslen = sizeof(ss);
1290 status = getpeername(pxtcp->sock, (struct sockaddr *)&ss, &sslen);
1291 if (status == SOCKET_ERROR) {
1292 goto reset;
1293 }
1294
1295 /* nit: compares PF and AF, but they are the same everywhere */
1296 LWIP_ASSERT1(ss.ss_family == fwspec->sdom);
1297
1298 status = fwany_ipX_addr_set_src(&src_addr, (const struct sockaddr *)&ss);
1299 if (status == PXREMAP_FAILED) {
1300 goto reset;
1301 }
1302
1303 if (ss.ss_family == PF_INET) {
1304 const struct sockaddr_in *peer4 = (const struct sockaddr_in *)&ss;
1305
1306 src_port = peer4->sin_port;
1307
1308 memcpy(&dst_addr.ip4, &fwspec->dst.sin.sin_addr, sizeof(ip_addr_t));
1309 dst_port = fwspec->dst.sin.sin_port;
1310 }
1311 else { /* PF_INET6 */
1312 const struct sockaddr_in6 *peer6 = (const struct sockaddr_in6 *)&ss;
1313 ip_set_v6(pcb, 1);
1314
1315 src_port = peer6->sin6_port;
1316
1317 memcpy(&dst_addr.ip6, &fwspec->dst.sin6.sin6_addr, sizeof(ip6_addr_t));
1318 dst_port = fwspec->dst.sin6.sin6_port;
1319 }
1320
1321 /* lwip port arguments are in host order */
1322 src_port = ntohs(src_port);
1323 dst_port = ntohs(dst_port);
1324
1325 error = tcp_proxy_bind(pcb, ipX_2_ip(&src_addr), src_port);
1326 if (error != ERR_OK) {
1327 goto reset;
1328 }
1329
1330 error = tcp_connect(pcb, ipX_2_ip(&dst_addr), dst_port,
1331 /* callback: */ pxtcp_pcb_connected);
1332 if (error != ERR_OK) {
1333 goto reset;
1334 }
1335
1336 return;
1337
1338 reset:
1339 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
1340}
1341
1342
1343/**
1344 * Port-forwarded connection to guest is successful, pump data.
1345 */
1346static err_t
1347pxtcp_pcb_connected(void *arg, struct tcp_pcb *pcb, err_t error)
1348{
1349 struct pxtcp *pxtcp = (struct pxtcp *)arg;
1350
1351 LWIP_ASSERT1(error == ERR_OK); /* always called with ERR_OK */
1352 LWIP_UNUSED_ARG(error);
1353
1354 LWIP_ASSERT1(pxtcp != NULL);
1355 LWIP_ASSERT1(pxtcp->pcb == pcb);
1356 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
1357 LWIP_UNUSED_ARG(pcb);
1358
1359 DPRINTF0(("%s: new pxtcp %p; pcb %p; sock %d\n",
1360 __func__, (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
1361
1362 /* ACK on connection is like ACK on data in pxtcp_pcb_sent() */
1363 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_POLLIN, pxtcp);
1364
1365 return ERR_OK;
1366}
1367
1368
1369/**
1370 * tcp_recv() callback.
1371 */
1372static err_t
1373pxtcp_pcb_recv(void *arg, struct tcp_pcb *pcb, struct pbuf *p, err_t error)
1374{
1375 struct pxtcp *pxtcp = (struct pxtcp *)arg;
1376
1377 LWIP_ASSERT1(error == ERR_OK); /* always called with ERR_OK */
1378 LWIP_UNUSED_ARG(error);
1379
1380 LWIP_ASSERT1(pxtcp != NULL);
1381 LWIP_ASSERT1(pxtcp->pcb == pcb);
1382 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
1383 LWIP_UNUSED_ARG(pcb);
1384
1385
1386 /*
1387 * Have we done sending previous batch?
1388 */
1389 if (pxtcp->unsent != NULL) {
1390 if (p != NULL) {
1391 /*
1392 * Return an error to tell TCP to hold onto that pbuf.
1393 * It will be presented to us later from tcp_fasttmr().
1394 */
1395 return ERR_WOULDBLOCK;
1396 }
1397 else {
1398 /*
1399 * Unlike data, p == NULL indicating orderly shutdown is
1400 * NOT presented to us again
1401 */
1402 pxtcp->outbound_close = 1;
1403 return ERR_OK;
1404 }
1405 }
1406
1407
1408 /*
1409 * Guest closed?
1410 */
1411 if (p == NULL) {
1412 pxtcp->outbound_close = 1;
1413 pxtcp_pcb_forward_outbound_close(pxtcp);
1414 return ERR_OK;
1415 }
1416
1417
1418 /*
1419 * Got data, send what we can without blocking.
1420 */
1421 return pxtcp_pcb_forward_outbound(pxtcp, p);
1422}
1423
1424
1425/**
1426 * Guest half-closed its TX side of the connection.
1427 *
1428 * Called either immediately from pxtcp_pcb_recv() when it gets NULL,
1429 * or from pxtcp_pcb_forward_outbound() when it finishes forwarding
1430 * previously unsent data and sees pxtcp::outbound_close flag saved by
1431 * pxtcp_pcb_recv().
1432 */
1433static void
1434pxtcp_pcb_forward_outbound_close(struct pxtcp *pxtcp)
1435{
1436 struct tcp_pcb *pcb;
1437
1438 LWIP_ASSERT1(pxtcp != NULL);
1439 LWIP_ASSERT1(pxtcp->outbound_close);
1440 LWIP_ASSERT1(!pxtcp->outbound_close_done);
1441
1442 pcb = pxtcp->pcb;
1443 LWIP_ASSERT1(pcb != NULL);
1444
1445 DPRINTF(("outbound_close: pxtcp %p; pcb %p %s\n",
1446 (void *)pxtcp, (void *)pcb, tcp_debug_state_str(pcb->state)));
1447
1448
1449 /* set the flag first, since shutdown() may trigger POLLHUP */
1450 pxtcp->outbound_close_done = 1;
1451 shutdown(pxtcp->sock, SHUT_WR); /* half-close the socket */
1452
1453#if !(HAVE_TCP_POLLHUP & POLLOUT)
1454 /*
1455 * We need to nudge poll manager manually, since OS will not
1456 * report POLLHUP.
1457 */
1458 if (pxtcp->inbound_close) {
1459 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_DEL, pxtcp);
1460 }
1461#endif
1462
1463
1464 /* no more outbound data coming to us */
1465 tcp_recv(pcb, NULL);
1466
1467 /*
1468 * If we have already done inbound close previously (active close
1469 * on the pcb), then we must not hold onto a pcb in TIME_WAIT
1470 * state since those will be recycled by lwip when it runs out of
1471 * free pcbs in the pool.
1472 *
1473 * The test is true also for a pcb in CLOSING state that waits
1474 * just for the ACK of its FIN (to transition to TIME_WAIT).
1475 */
1476 if (pxtcp_pcb_forward_inbound_done(pxtcp)) {
1477 pxtcp_pcb_dissociate(pxtcp);
1478 }
1479}
1480
1481
1482/**
1483 * Forward outbound data from pcb to socket.
1484 *
1485 * Called by pxtcp_pcb_recv() to forward new data and by callout
1486 * triggered by POLLOUT on the socket to send previously unsent data.
1487 *
1488 * (Re)scehdules one-time callout if not all data are sent.
1489 */
1490static err_t
1491pxtcp_pcb_forward_outbound(struct pxtcp *pxtcp, struct pbuf *p)
1492{
1493 struct pbuf *qs, *q;
1494 size_t qoff;
1495 size_t forwarded;
1496 int sockerr;
1497
1498 LWIP_ASSERT1(pxtcp->unsent == NULL || pxtcp->unsent == p);
1499
1500 forwarded = 0;
1501 sockerr = 0;
1502
1503 q = NULL;
1504 qoff = 0;
1505
1506 qs = p;
1507 while (qs != NULL) {
1508 IOVEC iov[8];
1509 const size_t iovsize = sizeof(iov)/sizeof(iov[0]);
1510 size_t fwd1;
1511 ssize_t nsent;
1512 size_t i;
1513
1514 fwd1 = 0;
1515 for (i = 0, q = qs; i < iovsize && q != NULL; ++i, q = q->next) {
1516 LWIP_ASSERT1(q->len > 0);
1517 IOVEC_SET_BASE(iov[i], q->payload);
1518 IOVEC_SET_LEN(iov[i], q->len);
1519 fwd1 += q->len;
1520 }
1521
1522 /*
1523 * TODO: This is where application-level proxy can hook into
1524 * to process outbound traffic.
1525 */
1526 nsent = pxtcp_sock_send(pxtcp, iov, i);
1527
1528 if (nsent == (ssize_t)fwd1) {
1529 /* successfully sent this chain fragment completely */
1530 forwarded += nsent;
1531 qs = q;
1532 }
1533 else if (nsent >= 0) {
1534 /* successfully sent only some data */
1535 forwarded += nsent;
1536
1537 /* find the first pbuf that was not completely forwarded */
1538 qoff = nsent;
1539 for (i = 0, q = qs; i < iovsize && q != NULL; ++i, q = q->next) {
1540 if (qoff < q->len) {
1541 break;
1542 }
1543 qoff -= q->len;
1544 }
1545 LWIP_ASSERT1(q != NULL);
1546 LWIP_ASSERT1(qoff < q->len);
1547 break;
1548 }
1549 else {
1550 sockerr = -nsent;
1551
1552 /*
1553 * Some errors are really not errors - if we get them,
1554 * it's not different from getting nsent == 0, so filter
1555 * them out here.
1556 */
1557 if (proxy_error_is_transient(sockerr)) {
1558 sockerr = 0;
1559 }
1560 q = qs;
1561 qoff = 0;
1562 break;
1563 }
1564 }
1565
1566 if (forwarded > 0) {
1567 DPRINTF2(("forward_outbound: pxtcp %p, pcb %p: sent %d bytes\n",
1568 (void *)pxtcp, (void *)pxtcp->pcb, (int)forwarded));
1569 tcp_recved(pxtcp->pcb, (u16_t)forwarded);
1570 }
1571
1572 if (q == NULL) { /* everything is forwarded? */
1573 LWIP_ASSERT1(sockerr == 0);
1574 LWIP_ASSERT1(forwarded == p->tot_len);
1575
1576 pxtcp->unsent = NULL;
1577 pbuf_free(p);
1578 if (pxtcp->outbound_close) {
1579 pxtcp_pcb_forward_outbound_close(pxtcp);
1580 }
1581 }
1582 else {
1583 if (q != p) {
1584 /* free forwarded pbufs at the beginning of the chain */
1585 pbuf_ref(q);
1586 pbuf_free(p);
1587 }
1588 if (qoff > 0) {
1589 /* advance payload pointer past the forwarded part */
1590 pbuf_header(q, -(s16_t)qoff);
1591 }
1592 pxtcp->unsent = q;
1593 DPRINTF2(("forward_outbound: pxtcp %p, pcb %p: kept %d bytes\n",
1594 (void *)pxtcp, (void *)pxtcp->pcb, (int)q->tot_len));
1595
1596 /*
1597 * Have sendmsg() failed?
1598 *
1599 * Connection reset will be detected by poll and
1600 * pxtcp_schedule_reset() will be called.
1601 *
1602 * Otherwise something *really* unexpected must have happened,
1603 * so we'd better abort.
1604 */
1605 if (sockerr != 0 && sockerr != ECONNRESET) {
1606 struct tcp_pcb *pcb = pxtcp->pcb;
1607 DPRINTF2(("forward_outbound: pxtcp %p, pcb %p: %R[sockerr]\n",
1608 (void *)pxtcp, (void *)pcb, sockerr));
1609
1610 pxtcp_pcb_dissociate(pxtcp);
1611
1612 tcp_abort(pcb);
1613
1614 /* call error callback manually since we've already dissociated */
1615 pxtcp_pcb_err((void *)pxtcp, ERR_ABRT);
1616 return ERR_ABRT;
1617 }
1618
1619 /* schedule one-shot POLLOUT on the socket */
1620 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_POLLOUT, pxtcp);
1621 }
1622 return ERR_OK;
1623}
1624
1625
1626#if !defined(RT_OS_WINDOWS)
1627static ssize_t
1628pxtcp_sock_send(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1629{
1630 struct msghdr mh;
1631 ssize_t nsent;
1632
1633#ifdef MSG_NOSIGNAL
1634 const int send_flags = MSG_NOSIGNAL;
1635#else
1636 const int send_flags = 0;
1637#endif
1638
1639 memset(&mh, 0, sizeof(mh));
1640
1641 mh.msg_iov = iov;
1642 mh.msg_iovlen = iovlen;
1643
1644 nsent = sendmsg(pxtcp->sock, &mh, send_flags);
1645 if (nsent < 0) {
1646 nsent = -SOCKERRNO();
1647 }
1648
1649 return nsent;
1650}
1651#else /* RT_OS_WINDOWS */
1652static ssize_t
1653pxtcp_sock_send(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1654{
1655 DWORD nsent;
1656 int status;
1657
1658 status = WSASend(pxtcp->sock, iov, (DWORD)iovlen, &nsent,
1659 0, NULL, NULL);
1660 if (status == SOCKET_ERROR) {
1661 return -SOCKERRNO();
1662 }
1663
1664 return nsent;
1665}
1666#endif /* RT_OS_WINDOWS */
1667
1668
1669/**
1670 * Callback from poll manager (on POLLOUT) to send data from
1671 * pxtcp::unsent pbuf to socket.
1672 */
1673static void
1674pxtcp_pcb_write_outbound(void *ctx)
1675{
1676 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
1677 LWIP_ASSERT1(pxtcp != NULL);
1678
1679 if (pxtcp->pcb == NULL) {
1680 return;
1681 }
1682
1683 pxtcp_pcb_forward_outbound(pxtcp, pxtcp->unsent);
1684}
1685
1686
1687/**
1688 * Common poll manager callback used by both outgoing and incoming
1689 * (port-forwarded) connections that has connected socket.
1690 */
1691static int
1692pxtcp_pmgr_pump(struct pollmgr_handler *handler, SOCKET fd, int revents)
1693{
1694 struct pxtcp *pxtcp;
1695 int status;
1696 int sockerr;
1697 RT_NOREF(fd);
1698
1699 pxtcp = (struct pxtcp *)handler->data;
1700 LWIP_ASSERT1(handler == &pxtcp->pmhdl);
1701 LWIP_ASSERT1(fd == pxtcp->sock);
1702
1703 if (revents & POLLNVAL) {
1704 pxtcp->sock = INVALID_SOCKET;
1705 return pxtcp_schedule_reset(pxtcp);
1706 }
1707
1708 if (revents & POLLERR) {
1709 socklen_t optlen = (socklen_t)sizeof(sockerr);
1710
1711 status = getsockopt(pxtcp->sock, SOL_SOCKET, SO_ERROR,
1712 (char *)&sockerr, &optlen);
1713 if (status == SOCKET_ERROR) { /* should not happen */
1714 DPRINTF(("sock %d: POLLERR: SO_ERROR failed: %R[sockerr]\n",
1715 fd, SOCKERRNO()));
1716 }
1717 else {
1718 DPRINTF0(("sock %d: POLLERR: %R[sockerr]\n", fd, sockerr));
1719 }
1720 return pxtcp_schedule_reset(pxtcp);
1721 }
1722
1723 if (revents & POLLOUT) {
1724 pxtcp->events &= ~POLLOUT;
1725 proxy_lwip_post(&pxtcp->msg_outbound);
1726 }
1727
1728 if (revents & POLLIN) {
1729 ssize_t nread;
1730 int stop_pollin;
1731
1732 nread = pxtcp_sock_read(pxtcp, &stop_pollin);
1733 if (nread < 0) {
1734 sockerr = -(int)nread;
1735 DPRINTF0(("sock %d: POLLIN: %R[sockerr]\n", fd, sockerr));
1736 return pxtcp_schedule_reset(pxtcp);
1737 }
1738
1739 if (stop_pollin) {
1740 pxtcp->events &= ~POLLIN;
1741 }
1742
1743 if (nread > 0) {
1744 proxy_lwip_post(&pxtcp->msg_inbound);
1745#if !HAVE_TCP_POLLHUP
1746 /*
1747 * If host does not report POLLHUP for closed sockets
1748 * (e.g. NetBSD) we should check for full close manually.
1749 */
1750 if (pxtcp->inbound_close && pxtcp->outbound_close_done) {
1751 LWIP_ASSERT1((revents & POLLHUP) == 0);
1752 return pxtcp_schedule_delete(pxtcp);
1753 }
1754#endif
1755 }
1756 }
1757
1758#if !HAVE_TCP_POLLHUP
1759 LWIP_ASSERT1((revents & POLLHUP) == 0);
1760#else
1761 if (revents & POLLHUP) {
1762 DPRINTF(("sock %d: HUP\n", fd));
1763
1764#if HAVE_TCP_POLLHUP == POLLIN
1765 /*
1766 * XXX: OSX reports POLLHUP once more when inbound is already
1767 * half-closed (which has already been reported as a "normal"
1768 * POLLHUP, handled below), the socket is polled for POLLOUT
1769 * (guest sends a lot of data that we can't push out fast
1770 * enough), and remote sends a reset - e.g. an http client
1771 * that half-closes after request and then aborts the transfer.
1772 *
1773 * It really should have been reported as POLLERR, but it
1774 * seems OSX never reports POLLERR for sockets.
1775 */
1776#if defined(RT_OS_DARWIN)
1777 {
1778 socklen_t optlen = (socklen_t)sizeof(sockerr);
1779
1780 status = getsockopt(pxtcp->sock, SOL_SOCKET, SO_ERROR,
1781 (char *)&sockerr, &optlen);
1782 if (status == SOCKET_ERROR) { /* should not happen */
1783 DPRINTF(("sock %d: POLLHUP: SO_ERROR failed: %R[sockerr]\n",
1784 fd, SOCKERRNO()));
1785 sockerr = ECONNRESET;
1786 }
1787 else if (sockerr != 0) {
1788 DPRINTF0(("sock %d: POLLHUP: %R[sockerr]\n", fd, sockerr));
1789 }
1790
1791 if (sockerr != 0) { /* XXX: should have been POLLERR */
1792 return pxtcp_schedule_reset(pxtcp);
1793 }
1794 }
1795#endif /* RT_OS_DARWIN */
1796
1797 /*
1798 * Remote closed inbound.
1799 */
1800 if (!pxtcp->outbound_close_done) {
1801 /*
1802 * We might still need to poll for POLLOUT, but we can not
1803 * poll for POLLIN anymore (even if not all data are read)
1804 * because we will be spammed by POLLHUP.
1805 */
1806 pxtcp->events &= ~POLLIN;
1807 if (!pxtcp->inbound_close) {
1808 /* the rest of the input has to be pulled */
1809 proxy_lwip_post(&pxtcp->msg_inpull);
1810 }
1811 }
1812 else
1813#endif
1814 /*
1815 * Both directions are closed.
1816 */
1817 {
1818 LWIP_ASSERT1(pxtcp->outbound_close_done);
1819
1820 if (pxtcp->inbound_close) {
1821 /* there's no unread data, we are done */
1822 return pxtcp_schedule_delete(pxtcp);
1823 }
1824 else {
1825 /* pull the rest of the input first (deferred_delete) */
1826 pxtcp->pmhdl.slot = -1;
1827 proxy_lwip_post(&pxtcp->msg_inpull);
1828 return -1;
1829 }
1830 /* NOTREACHED */
1831 }
1832
1833 }
1834#endif /* HAVE_TCP_POLLHUP */
1835
1836 return pxtcp->events;
1837}
1838
1839
1840/**
1841 * Read data from socket to ringbuf. This may be used both on lwip
1842 * and poll manager threads.
1843 *
1844 * Flag pointed to by pstop is set when further reading is impossible,
1845 * either temporary when buffer is full, or permanently when EOF is
1846 * received.
1847 *
1848 * Returns number of bytes read. NB: EOF is reported as 1!
1849 *
1850 * Returns zero if nothing was read, either because buffer is full, or
1851 * if no data is available (EWOULDBLOCK, EINTR &c).
1852 *
1853 * Returns -errno on real socket errors.
1854 */
1855static ssize_t
1856pxtcp_sock_read(struct pxtcp *pxtcp, int *pstop)
1857{
1858 IOVEC iov[2];
1859 size_t iovlen;
1860 ssize_t nread;
1861
1862 const size_t sz = pxtcp->inbuf.bufsize;
1863 size_t beg, lim, wrnew;
1864
1865 *pstop = 0;
1866
1867 beg = pxtcp->inbuf.vacant;
1868 IOVEC_SET_BASE(iov[0], &pxtcp->inbuf.buf[beg]);
1869
1870 /* lim is the index we can NOT write to */
1871 lim = pxtcp->inbuf.unacked;
1872 if (lim == 0) {
1873 lim = sz - 1; /* empty slot at the end */
1874 }
1875 else if (lim == 1 && beg != 0) {
1876 lim = sz; /* empty slot at the beginning */
1877 }
1878 else {
1879 --lim;
1880 }
1881
1882 if (beg == lim) {
1883 /*
1884 * Buffer is full, stop polling for POLLIN.
1885 *
1886 * pxtcp_pcb_sent() will re-enable POLLIN when guest ACKs
1887 * data, freeing space in the ring buffer.
1888 */
1889 *pstop = 1;
1890 return 0;
1891 }
1892
1893 if (beg < lim) {
1894 /* free space in one chunk */
1895 iovlen = 1;
1896 IOVEC_SET_LEN(iov[0], lim - beg);
1897 }
1898 else {
1899 /* free space in two chunks */
1900 iovlen = 2;
1901 IOVEC_SET_LEN(iov[0], sz - beg);
1902 IOVEC_SET_BASE(iov[1], &pxtcp->inbuf.buf[0]);
1903 IOVEC_SET_LEN(iov[1], lim);
1904 }
1905
1906 /*
1907 * TODO: This is where application-level proxy can hook into to
1908 * process inbound traffic.
1909 */
1910 nread = pxtcp_sock_recv(pxtcp, iov, iovlen);
1911
1912 if (nread > 0) {
1913 wrnew = beg + nread;
1914 if (wrnew >= sz) {
1915 wrnew -= sz;
1916 }
1917 pxtcp->inbuf.vacant = wrnew;
1918 DPRINTF2(("pxtcp %p: sock %d read %d bytes\n",
1919 (void *)pxtcp, pxtcp->sock, (int)nread));
1920 return nread;
1921 }
1922 else if (nread == 0) {
1923 *pstop = 1;
1924 pxtcp->inbound_close = 1;
1925 DPRINTF2(("pxtcp %p: sock %d read EOF\n",
1926 (void *)pxtcp, pxtcp->sock));
1927 return 1;
1928 }
1929 else {
1930 int sockerr = -nread;
1931
1932 if (proxy_error_is_transient(sockerr)) {
1933 /* haven't read anything, just return */
1934 DPRINTF2(("pxtcp %p: sock %d read cancelled\n",
1935 (void *)pxtcp, pxtcp->sock));
1936 return 0;
1937 }
1938 else {
1939 /* socket error! */
1940 DPRINTF0(("pxtcp %p: sock %d read: %R[sockerr]\n",
1941 (void *)pxtcp, pxtcp->sock, sockerr));
1942 return -sockerr;
1943 }
1944 }
1945}
1946
1947
1948#if !defined(RT_OS_WINDOWS)
1949static ssize_t
1950pxtcp_sock_recv(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1951{
1952 struct msghdr mh;
1953 ssize_t nread;
1954
1955 memset(&mh, 0, sizeof(mh));
1956
1957 mh.msg_iov = iov;
1958 mh.msg_iovlen = iovlen;
1959
1960 nread = recvmsg(pxtcp->sock, &mh, 0);
1961 if (nread < 0) {
1962 nread = -SOCKERRNO();
1963 }
1964
1965 return nread;
1966}
1967#else /* RT_OS_WINDOWS */
1968static ssize_t
1969pxtcp_sock_recv(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1970{
1971 DWORD flags;
1972 DWORD nread;
1973 int status;
1974
1975 flags = 0;
1976 status = WSARecv(pxtcp->sock, iov, (DWORD)iovlen, &nread,
1977 &flags, NULL, NULL);
1978 if (status == SOCKET_ERROR) {
1979 return -SOCKERRNO();
1980 }
1981
1982 return (ssize_t)nread;
1983}
1984#endif /* RT_OS_WINDOWS */
1985
1986
1987/**
1988 * Callback from poll manager (pxtcp::msg_inbound) to trigger output
1989 * from ringbuf to guest.
1990 */
1991static void
1992pxtcp_pcb_write_inbound(void *ctx)
1993{
1994 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
1995 LWIP_ASSERT1(pxtcp != NULL);
1996
1997 if (pxtcp->pcb == NULL) {
1998 return;
1999 }
2000
2001 pxtcp_pcb_forward_inbound(pxtcp);
2002}
2003
2004
2005/**
2006 * tcp_poll() callback
2007 *
2008 * We swtich it on when tcp_write() or tcp_shutdown() fail with
2009 * ERR_MEM to prevent connection from stalling. If there are ACKs or
2010 * more inbound data then pxtcp_pcb_forward_inbound() will be
2011 * triggered again, but if neither happens, tcp_poll() comes to the
2012 * rescue.
2013 */
2014static err_t
2015pxtcp_pcb_poll(void *arg, struct tcp_pcb *pcb)
2016{
2017 struct pxtcp *pxtcp = (struct pxtcp *)arg;
2018 LWIP_UNUSED_ARG(pcb);
2019
2020 DPRINTF2(("%s: pxtcp %p; pcb %p\n",
2021 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
2022
2023 pxtcp_pcb_forward_inbound(pxtcp);
2024
2025 /*
2026 * If the last thing holding up deletion of the pxtcp was failed
2027 * tcp_shutdown() and it succeeded, we may be the last callback.
2028 */
2029 pxtcp_pcb_maybe_deferred_delete(pxtcp);
2030
2031 return ERR_OK;
2032}
2033
2034
2035static void
2036pxtcp_pcb_schedule_poll(struct pxtcp *pxtcp)
2037{
2038 tcp_poll(pxtcp->pcb, pxtcp_pcb_poll, 0);
2039}
2040
2041
2042static void
2043pxtcp_pcb_cancel_poll(struct pxtcp *pxtcp)
2044{
2045 tcp_poll(pxtcp->pcb, NULL, 255);
2046}
2047
2048
2049/**
2050 * Forward inbound data from ring buffer to the guest.
2051 *
2052 * Scheduled by poll manager thread after it receives more data into
2053 * the ring buffer (we have more data to send).
2054
2055 * Also called from tcp_sent() callback when guest ACKs some data,
2056 * increasing pcb->snd_buf (we are permitted to send more data).
2057 *
2058 * Also called from tcp_poll() callback if previous attempt to forward
2059 * inbound data failed with ERR_MEM (we need to try again).
2060 */
2061static void
2062pxtcp_pcb_forward_inbound(struct pxtcp *pxtcp)
2063{
2064 struct tcp_pcb *pcb;
2065 size_t sndbuf;
2066 size_t beg, lim, sndlim;
2067 size_t toeob, tolim;
2068 size_t nsent;
2069 err_t error;
2070
2071 LWIP_ASSERT1(pxtcp != NULL);
2072 pcb = pxtcp->pcb;
2073 if (pcb == NULL) {
2074 return;
2075 }
2076
2077 if (/* __predict_false */ pcb->state < ESTABLISHED) {
2078 /*
2079 * If we have just confirmed accept of this connection, the
2080 * pcb is in SYN_RCVD state and we still haven't received the
2081 * ACK of our SYN. It's only in SYN_RCVD -> ESTABLISHED
2082 * transition that lwip decrements pcb->acked so that that ACK
2083 * is not reported to pxtcp_pcb_sent(). If we send something
2084 * now and immediately close (think "daytime", e.g.) while
2085 * still in SYN_RCVD state, we will move directly to
2086 * FIN_WAIT_1 and when our confirming SYN is ACK'ed lwip will
2087 * report it to pxtcp_pcb_sent().
2088 */
2089 DPRINTF2(("forward_inbound: pxtcp %p; pcb %p %s - later...\n",
2090 (void *)pxtcp, (void *)pcb, tcp_debug_state_str(pcb->state)));
2091 return;
2092 }
2093
2094
2095 beg = pxtcp->inbuf.unsent; /* private to lwip thread */
2096 lim = pxtcp->inbuf.vacant;
2097
2098 if (beg == lim) {
2099 if (pxtcp->inbound_close && !pxtcp->inbound_close_done) {
2100 pxtcp_pcb_forward_inbound_close(pxtcp);
2101 tcp_output(pcb);
2102 return;
2103 }
2104
2105 /*
2106 * Else, there's no data to send.
2107 *
2108 * If there is free space in the buffer, producer will
2109 * reschedule us as it receives more data and vacant (lim)
2110 * advances.
2111 *
2112 * If buffer is full when all data have been passed to
2113 * tcp_write() but not yet acknowledged, we will advance
2114 * unacked on ACK, freeing some space for producer to write to
2115 * (then see above).
2116 */
2117 return;
2118 }
2119
2120 sndbuf = tcp_sndbuf(pcb);
2121 if (sndbuf == 0) {
2122 /*
2123 * Can't send anything now. As guest ACKs some data, TCP will
2124 * call pxtcp_pcb_sent() callback and we will come here again.
2125 */
2126 return;
2127 }
2128
2129 nsent = 0;
2130
2131 /*
2132 * We have three limits to consider:
2133 * - how much data we have in the ringbuf
2134 * - how much data we are allowed to send
2135 * - ringbuf size
2136 */
2137 toeob = pxtcp->inbuf.bufsize - beg;
2138 if (lim < beg) { /* lim wrapped */
2139 if (sndbuf < toeob) { /* but we are limited by sndbuf */
2140 /* so beg is not going to wrap, treat sndbuf as lim */
2141 lim = beg + sndbuf; /* ... and proceed to the simple case */
2142 }
2143 else { /* we are limited by the end of the buffer, beg will wrap */
2144 u8_t maybemore;
2145 if (toeob == sndbuf || lim == 0) {
2146 maybemore = 0;
2147 }
2148 else {
2149 maybemore = TCP_WRITE_FLAG_MORE;
2150 }
2151
2152 Assert(toeob == (u16_t)toeob);
2153 error = tcp_write(pcb, &pxtcp->inbuf.buf[beg], (u16_t)toeob, maybemore);
2154 if (error != ERR_OK) {
2155 goto writeerr;
2156 }
2157 nsent += toeob;
2158 pxtcp->inbuf.unsent = 0; /* wrap */
2159
2160 if (maybemore) {
2161 beg = 0;
2162 sndbuf -= toeob;
2163 }
2164 else {
2165 /* we are done sending, but ... */
2166 goto check_inbound_close;
2167 }
2168 }
2169 }
2170
2171 LWIP_ASSERT1(beg < lim);
2172 sndlim = beg + sndbuf;
2173 if (lim > sndlim) {
2174 lim = sndlim;
2175 }
2176 tolim = lim - beg;
2177 if (tolim > 0) {
2178 error = tcp_write(pcb, &pxtcp->inbuf.buf[beg], (u16_t)tolim, 0);
2179 if (error != ERR_OK) {
2180 goto writeerr;
2181 }
2182 nsent += tolim;
2183 pxtcp->inbuf.unsent = lim;
2184 }
2185
2186 check_inbound_close:
2187 if (pxtcp->inbound_close && pxtcp->inbuf.unsent == pxtcp->inbuf.vacant) {
2188 pxtcp_pcb_forward_inbound_close(pxtcp);
2189 }
2190
2191 DPRINTF2(("forward_inbound: pxtcp %p, pcb %p: sent %d bytes\n",
2192 (void *)pxtcp, (void *)pcb, (int)nsent));
2193 tcp_output(pcb);
2194 pxtcp_pcb_cancel_poll(pxtcp);
2195 return;
2196
2197 writeerr:
2198 if (error == ERR_MEM) {
2199 if (nsent > 0) { /* first write succeeded, second failed */
2200 DPRINTF2(("forward_inbound: pxtcp %p, pcb %p: sent %d bytes only\n",
2201 (void *)pxtcp, (void *)pcb, (int)nsent));
2202 tcp_output(pcb);
2203 }
2204 DPRINTF(("forward_inbound: pxtcp %p, pcb %p: ERR_MEM\n",
2205 (void *)pxtcp, (void *)pcb));
2206 pxtcp_pcb_schedule_poll(pxtcp);
2207 }
2208 else {
2209 DPRINTF(("forward_inbound: pxtcp %p, pcb %p: %s\n",
2210 (void *)pxtcp, (void *)pcb, proxy_lwip_strerr(error)));
2211
2212 /* XXX: We shouldn't get ERR_ARG. Check ERR_CONN conditions early? */
2213 LWIP_ASSERT1(error == ERR_MEM);
2214 }
2215}
2216
2217
2218static void
2219pxtcp_pcb_forward_inbound_close(struct pxtcp *pxtcp)
2220{
2221 struct tcp_pcb *pcb;
2222 err_t error;
2223
2224 LWIP_ASSERT1(pxtcp != NULL);
2225 LWIP_ASSERT1(pxtcp->inbound_close);
2226 LWIP_ASSERT1(!pxtcp->inbound_close_done);
2227 LWIP_ASSERT1(pxtcp->inbuf.unsent == pxtcp->inbuf.vacant);
2228
2229 pcb = pxtcp->pcb;
2230 LWIP_ASSERT1(pcb != NULL);
2231
2232 DPRINTF(("inbound_close: pxtcp %p; pcb %p: %s\n",
2233 (void *)pxtcp, (void *)pcb, tcp_debug_state_str(pcb->state)));
2234
2235 error = tcp_shutdown(pcb, /*RX*/ 0, /*TX*/ 1);
2236 if (error != ERR_OK) {
2237 DPRINTF(("inbound_close: pxtcp %p; pcb %p:"
2238 " tcp_shutdown: error=%s\n",
2239 (void *)pxtcp, (void *)pcb, proxy_lwip_strerr(error)));
2240 pxtcp_pcb_schedule_poll(pxtcp);
2241 return;
2242 }
2243
2244 pxtcp_pcb_cancel_poll(pxtcp);
2245 pxtcp->inbound_close_done = 1;
2246
2247
2248 /*
2249 * If we have already done outbound close previously (passive
2250 * close on the pcb), then we must not hold onto a pcb in LAST_ACK
2251 * state since those will be deleted by lwip when that last ack
2252 * comes from the guest.
2253 *
2254 * NB: We do NOT check for deferred delete here, even though we
2255 * have just set one of its conditions, inbound_close_done. We
2256 * let pcb callbacks that called us do that. It's simpler and
2257 * cleaner that way.
2258 */
2259 if (pxtcp->outbound_close_done && pxtcp_pcb_forward_inbound_done(pxtcp)) {
2260 pxtcp_pcb_dissociate(pxtcp);
2261 }
2262}
2263
2264
2265/**
2266 * Check that all forwarded inbound data is sent and acked, and that
2267 * inbound close is scheduled (we aren't called back when it's acked).
2268 */
2269DECLINLINE(int)
2270pxtcp_pcb_forward_inbound_done(const struct pxtcp *pxtcp)
2271{
2272 return (pxtcp->inbound_close_done /* also implies that all data forwarded */
2273 && pxtcp->inbuf.unacked == pxtcp->inbuf.unsent);
2274}
2275
2276
2277/**
2278 * tcp_sent() callback - guest acknowledged len bytes.
2279 *
2280 * We can advance inbuf::unacked index, making more free space in the
2281 * ringbuf and wake up producer on poll manager thread.
2282 *
2283 * We can also try to send more data if we have any since pcb->snd_buf
2284 * was increased and we are now permitted to send more.
2285 */
2286static err_t
2287pxtcp_pcb_sent(void *arg, struct tcp_pcb *pcb, u16_t len)
2288{
2289 struct pxtcp *pxtcp = (struct pxtcp *)arg;
2290 size_t unacked;
2291
2292 LWIP_ASSERT1(pxtcp != NULL);
2293 LWIP_ASSERT1(pxtcp->pcb == pcb);
2294 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
2295 LWIP_UNUSED_ARG(pcb); /* only in assert */
2296
2297 DPRINTF2(("%s: pxtcp %p; pcb %p: +%d ACKed:"
2298 " unacked %d, unsent %d, vacant %d\n",
2299 __func__, (void *)pxtcp, (void *)pcb, (int)len,
2300 (int)pxtcp->inbuf.unacked,
2301 (int)pxtcp->inbuf.unsent,
2302 (int)pxtcp->inbuf.vacant));
2303
2304 if (/* __predict_false */ len == 0) {
2305 /* we are notified to start pulling */
2306 LWIP_ASSERT1(!pxtcp->inbound_close);
2307 LWIP_ASSERT1(pxtcp->inbound_pull);
2308
2309 unacked = pxtcp->inbuf.unacked;
2310 }
2311 else {
2312 /*
2313 * Advance unacked index. Guest acknowledged the data, so it
2314 * won't be needed again for potential retransmits.
2315 */
2316 unacked = pxtcp->inbuf.unacked + len;
2317 if (unacked > pxtcp->inbuf.bufsize) {
2318 unacked -= pxtcp->inbuf.bufsize;
2319 }
2320 pxtcp->inbuf.unacked = unacked;
2321 }
2322
2323 /* arrange for more inbound data */
2324 if (!pxtcp->inbound_close) {
2325 if (!pxtcp->inbound_pull) {
2326 /* wake up producer, in case it has stopped polling for POLLIN */
2327 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_POLLIN, pxtcp);
2328#ifdef RT_OS_WINDOWS
2329 /**
2330 * We have't got enought room in ring buffer to read atm,
2331 * but we don't want to lose notification from WSAW4ME when
2332 * space would be available, so we reset event with empty recv
2333 */
2334 recv(pxtcp->sock, NULL, 0, 0);
2335#endif
2336 }
2337 else {
2338 ssize_t nread;
2339 int stop_pollin; /* ignored */
2340
2341 nread = pxtcp_sock_read(pxtcp, &stop_pollin);
2342
2343 if (nread < 0) {
2344 int sockerr = -(int)nread;
2345 LWIP_UNUSED_ARG(sockerr);
2346 DPRINTF0(("%s: sock %d: %R[sockerr]\n",
2347 __func__, pxtcp->sock, sockerr));
2348
2349#if HAVE_TCP_POLLHUP == POLLIN /* see counterpart in pxtcp_pmgr_pump() */
2350 /*
2351 * It may still be registered with poll manager for POLLOUT.
2352 */
2353 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
2354 return ERR_OK;
2355#else
2356 /*
2357 * It is no longer registered with poll manager so we
2358 * can kill it directly.
2359 */
2360 pxtcp_pcb_reset_pxtcp(pxtcp);
2361 return ERR_ABRT;
2362#endif
2363 }
2364 }
2365 }
2366
2367 /* forward more data if we can */
2368 if (!pxtcp->inbound_close_done) {
2369 pxtcp_pcb_forward_inbound(pxtcp);
2370
2371 /*
2372 * NB: we might have dissociated from a pcb that transitioned
2373 * to LAST_ACK state, so don't refer to pcb below.
2374 */
2375 }
2376
2377
2378 /* have we got all the acks? */
2379 if (pxtcp->inbound_close /* no more new data */
2380 && pxtcp->inbuf.unsent == pxtcp->inbuf.vacant /* all data is sent */
2381 && unacked == pxtcp->inbuf.unsent) /* ... and is acked */
2382 {
2383 char *buf;
2384
2385 DPRINTF(("%s: pxtcp %p; pcb %p; all data ACKed\n",
2386 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
2387
2388 /* no more retransmits, so buf is not needed */
2389 buf = pxtcp->inbuf.buf;
2390 pxtcp->inbuf.buf = NULL;
2391 free(buf);
2392
2393 /* no more acks, so no more callbacks */
2394 if (pxtcp->pcb != NULL) {
2395 tcp_sent(pxtcp->pcb, NULL);
2396 }
2397
2398 /*
2399 * We may be the last callback for this pcb if we have also
2400 * successfully forwarded inbound_close.
2401 */
2402 pxtcp_pcb_maybe_deferred_delete(pxtcp);
2403 }
2404
2405 return ERR_OK;
2406}
2407
2408
2409/**
2410 * Callback from poll manager (pxtcp::msg_inpull) to switch
2411 * pxtcp_pcb_sent() to actively pull the last bits of input. See
2412 * POLLHUP comment in pxtcp_pmgr_pump().
2413 *
2414 * pxtcp::sock is deregistered from poll manager after this callback
2415 * is scheduled.
2416 */
2417static void
2418pxtcp_pcb_pull_inbound(void *ctx)
2419{
2420 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
2421 LWIP_ASSERT1(pxtcp != NULL);
2422
2423 if (pxtcp->pcb == NULL) {
2424 DPRINTF(("%s: pxtcp %p: PCB IS GONE\n", __func__, (void *)pxtcp));
2425 pxtcp_pcb_reset_pxtcp(pxtcp);
2426 return;
2427 }
2428
2429 pxtcp->inbound_pull = 1;
2430 if (pxtcp->outbound_close_done) {
2431 DPRINTF(("%s: pxtcp %p: pcb %p (deferred delete)\n",
2432 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
2433 pxtcp->deferred_delete = 1;
2434 }
2435 else {
2436 DPRINTF(("%s: pxtcp %p: pcb %p\n",
2437 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
2438 }
2439
2440 pxtcp_pcb_sent(pxtcp, pxtcp->pcb, 0);
2441}
2442
2443
2444/**
2445 * tcp_err() callback.
2446 *
2447 * pcb is not passed to this callback since it may be already
2448 * deallocated by the stack, but we can't do anything useful with it
2449 * anyway since connection is gone.
2450 */
2451static void
2452pxtcp_pcb_err(void *arg, err_t error)
2453{
2454 struct pxtcp *pxtcp = (struct pxtcp *)arg;
2455 LWIP_ASSERT1(pxtcp != NULL);
2456
2457 /*
2458 * ERR_CLSD is special - it is reported here when:
2459 *
2460 * . guest has already half-closed
2461 * . we send FIN to guest when external half-closes
2462 * . guest acks that FIN
2463 *
2464 * Since connection is closed but receive has been already closed
2465 * lwip can only report this via tcp_err. At this point the pcb
2466 * is still alive, so we can peek at it if need be.
2467 *
2468 * The interesting twist is when the ACK from guest that akcs our
2469 * FIN also acks some data. In this scenario lwip will NOT call
2470 * tcp_sent() callback with the ACK for that last bit of data but
2471 * instead will call tcp_err with ERR_CLSD right away. Since that
2472 * ACK also acknowledges all the data, we should run some of
2473 * pxtcp_pcb_sent() logic here.
2474 */
2475 if (error == ERR_CLSD) {
2476 struct tcp_pcb *pcb = pxtcp->pcb; /* still alive */
2477
2478 DPRINTF2(("ERR_CLSD: pxtcp %p; pcb %p:"
2479 " pcb->acked %d;"
2480 " unacked %d, unsent %d, vacant %d\n",
2481 (void *)pxtcp, (void *)pcb,
2482 pcb->acked,
2483 (int)pxtcp->inbuf.unacked,
2484 (int)pxtcp->inbuf.unsent,
2485 (int)pxtcp->inbuf.vacant));
2486
2487 LWIP_ASSERT1(pxtcp->pcb == pcb);
2488 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
2489
2490 if (pcb->acked > 0) {
2491 pxtcp_pcb_sent(pxtcp, pcb, pcb->acked);
2492 }
2493 return;
2494 }
2495
2496 DPRINTF0(("tcp_err: pxtcp=%p, error=%s\n",
2497 (void *)pxtcp, proxy_lwip_strerr(error)));
2498
2499 pxtcp->pcb = NULL; /* pcb is gone */
2500 if (pxtcp->deferred_delete) {
2501 pxtcp_pcb_reset_pxtcp(pxtcp);
2502 }
2503 else {
2504 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
2505 }
2506}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette