VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_input.c@ 19846

Last change on this file since 19846 was 19839, checked in by vboxsync, 16 years ago

NAT: Slirp no longer uses the guest's ethernet address; instead it
determines the destination's ethernet address with a lookup operation.
Currently this is very simple: it looks over the addresses handed out
via DHCP, or assumes the destination is on the outer network and uses
Slirp's own ethernet address.
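
The lookup the message describes amounts to a small table scan; a minimal sketch of the idea, using hypothetical names (nat_lease, nat_lookup_ether) rather than the actual slirp structures, looks like this:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical lease record; field names are illustrative only. */
struct nat_lease
{
    uint32_t ip;        /* guest IP handed out by the built-in DHCP server */
    uint8_t  ether[6];  /* MAC address associated with that lease */
};

/*
 * Resolve the destination MAC: prefer a DHCP lease matching the destination
 * IP; otherwise assume the destination is on the outer network and answer
 * with Slirp's own ethernet address.
 */
static const uint8_t *
nat_lookup_ether(const struct nat_lease *leases, size_t n_leases,
                 uint32_t dst_ip, const uint8_t *slirp_ether)
{
    size_t i;
    for (i = 0; i < n_leases; ++i)
        if (leases[i].ip == dst_ip)
            return leases[i].ether;
    return slirp_ether;
}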

  • Property svn:eol-style set to native
File size: 61.7 KB
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
34 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
35 */
36
37/*
38 * Changes and additions relating to SLiRP
39 * Copyright (c) 1995 Danny Gasparovski.
40 *
41 * Please read the file COPYRIGHT for the
42 * terms and conditions of the copyright.
43 */
44
45#include <slirp.h>
46#include "ip_icmp.h"
47
48
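/* PAWS idle limit: 24 days expressed in PR_SLOWHZ slow-timer ticks
 * (cf. the "over 24 days old" ts_recent check further down in tcp_input). */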
49#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
50
51/* for modulo comparisons of timestamps */
52#define TSTMP_LT(a, b) ((int)((a)-(b)) < 0)
53#define TSTMP_GEQ(a, b) ((int)((a)-(b)) >= 0)
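/* Worked example of the modulo comparison: with a = 0x00000001 and
 * b = 0xFFFFFFFF, (int)(a - b) == 2 > 0, so TSTMP_GEQ(a, b) holds even
 * though a < b numerically, i.e. a is treated as the later timestamp
 * across a wraparound. */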
54
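/* Delayed-ACK policy implemented by the macro below: without TCP_ACK_HACK
 * a segment carrying PUSH is ACKed immediately and anything else gets a
 * delayed ACK; with TCP_ACK_HACK defined every segment gets a delayed ACK. */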
55#ifndef TCP_ACK_HACK
56#define DELAY_ACK(tp, ti) \
57 if (ti->ti_flags & TH_PUSH) \
58 tp->t_flags |= TF_ACKNOW; \
59 else \
60 tp->t_flags |= TF_DELACK;
61#else /* TCP_ACK_HACK */
62#define DELAY_ACK(tp, ign) \
63 tp->t_flags |= TF_DELACK;
64#endif /* TCP_ACK_HACK */
65
66
67/*
68 * deps: netinet/tcp_reass.c
69 * tcp_reass_maxqlen = 48 (default)
70 * tcp_reass_maxseg = nmbclusters/16 (nmbclusters = 1024 + maxusers * 64 from kern/kern_mbuf.c let's say 256)
71 */
72int
73tcp_reass(PNATState pData, struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
74{
75 struct tseg_qent *q;
76 struct tseg_qent *p = NULL;
77 struct tseg_qent *nq;
78 struct tseg_qent *te = NULL;
79 struct socket *so = tp->t_socket;
80 int flags;
81
82 /*
83 * XXX: tcp_reass() is rather inefficient with its data structures
84 * and should be rewritten (see NetBSD for optimizations). While
85 * doing that it should move to its own file tcp_reass.c.
86 */
87
88 /*
89 * Call with th==NULL after becoming established to
90 * force pre-ESTABLISHED data up to user socket.
91 */
92 if (th == NULL)
93 goto present;
94
95 /*
96 * Limit the number of segments in the reassembly queue to prevent
97 * holding on to too many segments (and thus running out of mbufs).
98 * Make sure to let through the missing segment that caused this
99 * queue to form. Always keep one global queue entry spare to be able to
100 * process the missing segment.
101 */
102 if ( th->th_seq != tp->rcv_nxt
103 && ( tcp_reass_qsize + 1 >= tcp_reass_maxseg
104 || tp->t_segqlen >= tcp_reass_maxqlen))
105 {
106 tcp_reass_overflows++;
107 tcpstat.tcps_rcvmemdrop++;
108 m_freem(pData, m);
109 *tlenp = 0;
110 return (0);
111 }
112
113 /*
114 * Allocate a new queue entry. If we can't, or hit the zone limit
115 * just drop the pkt.
116 */
117 te = RTMemAlloc(sizeof(struct tseg_qent));
118 if (te == NULL)
119 {
120 tcpstat.tcps_rcvmemdrop++;
121 m_freem(pData, m);
122 *tlenp = 0;
123 return (0);
124 }
125 tp->t_segqlen++;
126 tcp_reass_qsize++;
127
128 /*
129 * Find a segment which begins after this one does.
130 */
131 LIST_FOREACH(q, &tp->t_segq, tqe_q)
132 {
133 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
134 break;
135 p = q;
136 }
137
138 /*
139 * If there is a preceding segment, it may provide some of
140 * our data already. If so, drop the data from the incoming
141 * segment. If it provides all of our data, drop us.
142 */
143 if (p != NULL)
144 {
145 int i;
146 /* conversion to int (in i) handles seq wraparound */
147 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
148 if (i > 0)
149 {
150 if (i >= *tlenp)
151 {
152 tcpstat.tcps_rcvduppack++;
153 tcpstat.tcps_rcvdupbyte += *tlenp;
154 m_freem(pData, m);
155 RTMemFree(te);
156 tp->t_segqlen--;
157 tcp_reass_qsize--;
158 /*
159 * Try to present any queued data
160 * at the left window edge to the user.
161 * This is needed after the 3-WHS
162 * completes.
163 */
164 goto present; /* ??? */
165 }
166 m_adj(m, i);
167 *tlenp -= i;
168 th->th_seq += i;
169 }
170 }
171 tcpstat.tcps_rcvoopack++;
172 tcpstat.tcps_rcvoobyte += *tlenp;
173
174 /*
175 * While we overlap succeeding segments trim them or,
176 * if they are completely covered, dequeue them.
177 */
178 while (q)
179 {
180 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
181 if (i <= 0)
182 break;
183 if (i < q->tqe_len)
184 {
185 q->tqe_th->th_seq += i;
186 q->tqe_len -= i;
187 m_adj(q->tqe_m, i);
188 break;
189 }
190
191 nq = LIST_NEXT(q, tqe_q);
192 LIST_REMOVE(q, tqe_q);
193 m_freem(pData, q->tqe_m);
194 RTMemFree(q);
195 tp->t_segqlen--;
196 tcp_reass_qsize--;
197 q = nq;
198 }
199
200 /* Insert the new segment queue entry into place. */
201 te->tqe_m = m;
202 te->tqe_th = th;
203 te->tqe_len = *tlenp;
204
205 if (p == NULL)
206 {
207 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
208 }
209 else
210 {
211 LIST_INSERT_AFTER(p, te, tqe_q);
212 }
213
214present:
215 /*
216 * Present data to user, advancing rcv_nxt through
217 * completed sequence space.
218 */
219 if (!TCPS_HAVEESTABLISHED(tp->t_state))
220 return (0);
221 q = LIST_FIRST(&tp->t_segq);
222 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
223 return (0);
224 do
225 {
226 tp->rcv_nxt += q->tqe_len;
227 flags = q->tqe_th->th_flags & TH_FIN;
228 nq = LIST_NEXT(q, tqe_q);
229 LIST_REMOVE(q, tqe_q);
230 /* XXX: This place should be checked for the same code in
231 * original BSD code for Slirp and current BSD used SS_FCANTRCVMORE
232 */
233 if (so->so_state & SS_FCANTSENDMORE)
234 m_freem(pData, q->tqe_m);
235 else
236 {
237 if (so->so_emu)
238 {
239 if (tcp_emu(pData, so, q->tqe_m))
240 sbappend(pData, so, q->tqe_m);
241 }
242 else
243 sbappend(pData, so, q->tqe_m);
244 }
245 RTMemFree(q);
246 tp->t_segqlen--;
247 tcp_reass_qsize--;
248 q = nq;
249 }
250 while (q && q->tqe_th->th_seq == tp->rcv_nxt);
251
252 return flags;
253}
254
255/*
256 * TCP input routine, follows pages 65-76 of the
257 * protocol specification dated September, 1981 very closely.
258 */
259void
260tcp_input(PNATState pData, register struct mbuf *m, int iphlen, struct socket *inso)
261{
262 struct ip save_ip, *ip;
263 register struct tcpiphdr *ti;
264 caddr_t optp = NULL;
265 int optlen = 0;
266 int len, tlen, off;
267 register struct tcpcb *tp = 0;
268 register int tiflags;
269 struct socket *so = 0;
270 int todrop, acked, ourfinisacked, needoutput = 0;
271/* int dropsocket = 0; */
272 int iss = 0;
273 u_long tiwin;
274/* int ts_present = 0; */
275
276 DEBUG_CALL("tcp_input");
277 DEBUG_ARGS((dfd," m = %8lx iphlen = %2d inso = %lx\n",
278 (long )m, iphlen, (long )inso ));
279
280 if (inso != NULL)
281 {
282 QSOCKET_LOCK(tcb);
283 SOCKET_LOCK(inso);
284 QSOCKET_UNLOCK(tcb);
285 }
286 /*
287 * If called with m == 0, then we're continuing the connect
288 */
289 if (m == NULL)
290 {
291 so = inso;
292 Log4(("NAT: tcp_input: %R[natsock]\n", so));
293 /* Re-set a few variables */
294 tp = sototcpcb(so);
295 m = so->so_m;
296
297 so->so_m = 0;
298 ti = so->so_ti;
299 tiwin = ti->ti_win;
300 tiflags = ti->ti_flags;
301
302 goto cont_conn;
303 }
304
305 tcpstat.tcps_rcvtotal++;
306 /*
307 * Get IP and TCP header together in first mbuf.
308 * Note: IP leaves IP header in first mbuf.
309 */
310 ti = mtod(m, struct tcpiphdr *);
311 if (iphlen > sizeof(struct ip ))
312 {
313 ip_stripoptions(m, (struct mbuf *)0);
314 iphlen = sizeof(struct ip );
315 }
316 /* XXX Check if too short */
317
318
319 /*
320 * Save a copy of the IP header in case we want to restore it
321 * for sending an ICMP error message in response.
322 */
323 ip = mtod(m, struct ip *);
324 save_ip = *ip;
325 save_ip.ip_len+= iphlen;
326
327 /*
328 * Checksum extended TCP header and data.
329 */
330 tlen = ((struct ip *)ti)->ip_len;
331 memset(ti->ti_x1, 0, 9);
332 ti->ti_len = htons((u_int16_t)tlen);
333 len = sizeof(struct ip ) + tlen;
334 /* keep checksum for ICMP reply
335 * ti->ti_sum = cksum(m, len);
336 * if (ti->ti_sum) { */
337 if (cksum(m, len))
338 {
339 tcpstat.tcps_rcvbadsum++;
340 Log2(("checksum is invalid => drop\n"));
341 goto drop;
342 }
343
344 /*
345 * Check that TCP offset makes sense,
346 * pull out TCP options and adjust length. XXX
347 */
348 off = ti->ti_off << 2;
349 if ( off < sizeof (struct tcphdr)
350 || off > tlen)
351 {
352 tcpstat.tcps_rcvbadoff++;
353 Log2(("ti_off(tlen(%d)<%d<(tcphdr(%d))) is invalid =>drop\n", tlen, off, sizeof(struct tcphdr)));
354 goto drop;
355 }
356 tlen -= off;
357 ti->ti_len = tlen;
358 if (off > sizeof (struct tcphdr))
359 {
360 optlen = off - sizeof (struct tcphdr);
361 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
362
363 /*
364 * Do quick retrieval of timestamp options ("options
365 * prediction?"). If timestamp is the only option and it's
366 * formatted as recommended in RFC 1323 appendix A, we
367 * quickly get the values now and not bother calling
368 * tcp_dooptions(), etc.
369 */
370#if 0
371 if (( optlen == TCPOLEN_TSTAMP_APPA
372 || ( optlen > TCPOLEN_TSTAMP_APPA
373 && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
374 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
375 (ti->ti_flags & TH_SYN) == 0)
376 {
377 ts_present = 1;
378 ts_val = ntohl(*(u_int32_t *)(optp + 4));
379 ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
380 optp = NULL; /* we have parsed the options */
381 }
382#endif
383 }
384 tiflags = ti->ti_flags;
385
386 /*
387 * Convert TCP protocol specific fields to host format.
388 */
389 NTOHL(ti->ti_seq);
390 NTOHL(ti->ti_ack);
391 NTOHS(ti->ti_win);
392 NTOHS(ti->ti_urp);
393
394 /*
395 * Drop TCP, IP headers and TCP options.
396 */
397 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
398 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
399
400 /*
401 * Locate pcb for segment.
402 */
403findso:
404 if (so != NULL && so != &tcb)
405 SOCKET_UNLOCK(so);
406 QSOCKET_LOCK(tcb);
407 so = tcp_last_so;
408 if ( so->so_fport != ti->ti_dport
409 || so->so_lport != ti->ti_sport
410 || so->so_laddr.s_addr != ti->ti_src.s_addr
411 || so->so_faddr.s_addr != ti->ti_dst.s_addr)
412 {
413 struct socket *sonxt;
414 QSOCKET_UNLOCK(tcb);
415 /* @todo fix SOLOOKUP macro definition to be usable here */
416#ifndef VBOX_WITH_SLIRP_MT
417 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
418 ti->ti_dst, ti->ti_dport);
419#else
420 so = NULL;
421 QSOCKET_FOREACH(so, sonxt, tcp)
422 /* { */
423 if ( so->so_lport == ti->ti_sport
424 && so->so_laddr.s_addr == ti->ti_src.s_addr
425 && so->so_faddr.s_addr == ti->ti_dst.s_addr
426 && so->so_fport == ti->ti_dport
427 && so->so_deleted != 1)
428 {
429 Log2(("lock: %s:%d We found socket %R[natsock]\n", __FUNCTION__, __LINE__, so));
430 break; /* so is locked here */
431 }
432 LOOP_LABEL(tcp, so, sonxt);
433 }
434 if (so == &tcb) {
435 Log2(("lock: %s:%d Haven't find anything \n", __FUNCTION__, __LINE__));
436 so = NULL;
437 }
438#endif
439 if (so)
440 {
441 tcp_last_so = so;
442 }
443 ++tcpstat.tcps_socachemiss;
444 }
445 else
446 {
447 SOCKET_LOCK(so);
448 QSOCKET_UNLOCK(tcb);
449 }
450
451 /*
452 * If the state is CLOSED (i.e., TCB does not exist) then
453 * all data in the incoming segment is discarded.
454 * If the TCB exists but is in CLOSED state, it is embryonic,
455 * but should either do a listen or a connect soon.
456 *
457 * state == CLOSED means we've done socreate() but haven't
458 * attached it to a protocol yet...
459 *
460 * XXX If a TCB does not exist, and the TH_SYN flag is
461 * the only flag set, then create a session, mark it
462 * as if it was LISTENING, and continue...
463 */
464 Log2(("so = %R[natsock]\n", so));
465 if (so == 0)
466 {
467 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
468 goto dropwithreset;
469
470 if ((so = socreate()) == NULL)
471 goto dropwithreset;
472 if (tcp_attach(pData, so) < 0)
473 {
474 RTMemFree(so); /* Not sofree (if it failed, it's not insqued) */
475 goto dropwithreset;
476 }
477 SOCKET_LOCK(so);
478 sbreserve(&so->so_snd, tcp_sndspace);
479 sbreserve(&so->so_rcv, tcp_rcvspace);
480
481/* tcp_last_so = so; */ /* XXX ? */
482/* tp = sototcpcb(so); */
483
484 so->so_laddr = ti->ti_src;
485 so->so_lport = ti->ti_sport;
486 so->so_faddr = ti->ti_dst;
487 so->so_fport = ti->ti_dport;
488
489 if ((so->so_iptos = tcp_tos(so)) == 0)
490 so->so_iptos = ((struct ip *)ti)->ip_tos;
491
492 tp = sototcpcb(so);
493 tp->t_state = TCPS_LISTEN;
494 }
495
496 /*
497 * If this is a still-connecting socket, this is probably
498 * a retransmit of the SYN. Whether it's a retransmit SYN
499 * or something else, we nuke it.
500 */
501 if (so->so_state & SS_ISFCONNECTING)
502 {
503 Log2(("so_state(%x) of %R[natsock] is still connecting =>drop\n", so->so_state, so));
504 goto drop;
505 }
506
507 tp = sototcpcb(so);
508
509 /* XXX Should never fail */
510 if (tp == 0)
511 goto dropwithreset;
512 if (tp->t_state == TCPS_CLOSED)
513 {
514 Log2(("t_state(%x) is closed =>drop\n", tp->t_state));
515 goto drop;
516 }
517
518 /* Unscale the window into a 32-bit value. */
519/* if ((tiflags & TH_SYN) == 0)
520 * tiwin = ti->ti_win << tp->snd_scale;
521 * else
522 */
523 tiwin = ti->ti_win;
524
525 /*
526 * Segment received on connection.
527 * Reset idle time and keep-alive timer.
528 */
529 tp->t_idle = 0;
530 if (so_options)
531 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
532 else
533 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
534
535 /*
536 * Process options if not in LISTEN state,
537 * else do it below (after getting remote address).
538 */
539 if (optp && tp->t_state != TCPS_LISTEN)
540 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
541/* , */
542/* &ts_present, &ts_val, &ts_ecr); */
543
544 /*
545 * Header prediction: check for the two common cases
546 * of a uni-directional data xfer. If the packet has
547 * no control flags, is in-sequence, the window didn't
548 * change and we're not retransmitting, it's a
549 * candidate. If the length is zero and the ack moved
550 * forward, we're the sender side of the xfer. Just
551 * free the data acked & wake any higher level process
552 * that was blocked waiting for space. If the length
553 * is non-zero and the ack didn't move, we're the
554 * receiver side. If we're getting packets in-order
555 * (the reassembly queue is empty), add the data to
556 * the socket buffer and note that we need a delayed ack.
557 *
558 * XXX Some of these tests are not needed
559 * eg: the tiwin == tp->snd_wnd prevents many more
560 * predictions.. with no *real* advantage..
561 */
562 if ( tp->t_state == TCPS_ESTABLISHED
563 && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK
564/* && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) */
565 && ti->ti_seq == tp->rcv_nxt
566 && tiwin && tiwin == tp->snd_wnd
567 && tp->snd_nxt == tp->snd_max)
568 {
569 /*
570 * If last ACK falls within this segment's sequence numbers,
571 * record the timestamp.
572 */
573#if 0
574 if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
575 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len))
576 {
577 tp->ts_recent_age = tcp_now;
578 tp->ts_recent = ts_val;
579 }
580#endif
581
582 if (ti->ti_len == 0)
583 {
584 if ( SEQ_GT(ti->ti_ack, tp->snd_una)
585 && SEQ_LEQ(ti->ti_ack, tp->snd_max)
586 && tp->snd_cwnd >= tp->snd_wnd)
587 {
588 /*
589 * this is a pure ack for outstanding data.
590 */
591 ++tcpstat.tcps_predack;
592#if 0
593 if (ts_present)
594 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
595 else
596#endif
597 if ( tp->t_rtt
598 && SEQ_GT(ti->ti_ack, tp->t_rtseq))
599 tcp_xmit_timer(pData, tp, tp->t_rtt);
600 acked = ti->ti_ack - tp->snd_una;
601 tcpstat.tcps_rcvackpack++;
602 tcpstat.tcps_rcvackbyte += acked;
603 sbdrop(&so->so_snd, acked);
604 tp->snd_una = ti->ti_ack;
605 m_freem(pData, m);
606
607 /*
608 * If all outstanding data are acked, stop
609 * retransmit timer, otherwise restart timer
610 * using current (possibly backed-off) value.
611 * If process is waiting for space,
612 * wakeup/selwakeup/signal. If data
613 * are ready to send, let tcp_output
614 * decide between more output or persist.
615 */
616 if (tp->snd_una == tp->snd_max)
617 tp->t_timer[TCPT_REXMT] = 0;
618 else if (tp->t_timer[TCPT_PERSIST] == 0)
619 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
620
621 /*
622 * There's room in so_snd, sowwakeup will read()
623 * from the socket if we can
624 */
625#if 0
626 if (so->so_snd.sb_flags & SB_NOTIFY)
627 sowwakeup(so);
628#endif
629 /*
630 * This is called because sowwakeup might have
631 * put data into so_snd. Since we don't do sowwakeup,
632 * we don't need this.. XXX???
633 */
634 if (so->so_snd.sb_cc)
635 (void) tcp_output(pData, tp);
636
637 SOCKET_UNLOCK(so);
638 return;
639 }
640 }
641 else if ( ti->ti_ack == tp->snd_una
642 && LIST_FIRST(&tp->t_segq)
643 && ti->ti_len <= sbspace(&so->so_rcv))
644 {
645 /*
646 * this is a pure, in-sequence data packet
647 * with nothing on the reassembly queue and
648 * we have enough buffer space to take it.
649 */
650 ++tcpstat.tcps_preddat;
651 tp->rcv_nxt += ti->ti_len;
652 tcpstat.tcps_rcvpack++;
653 tcpstat.tcps_rcvbyte += ti->ti_len;
654 /*
655 * Add data to socket buffer.
656 */
657 if (so->so_emu)
658 {
659 if (tcp_emu(pData, so, m))
660 sbappend(pData, so, m);
661 }
662 else
663 sbappend(pData, so, m);
664
665 /*
666 * XXX This is called when data arrives. Later, check
667 * if we can actually write() to the socket
668 * XXX Need to check? It's NON_BLOCKING
669 */
670/* sorwakeup(so); */
671
672 /*
673 * If this is a short packet, then ACK now - with Nagle
674 * congestion avoidance sender won't send more until
675 * he gets an ACK.
676 *
677 * It is better to not delay acks at all to maximize
678 * TCP throughput. See RFC 2581.
679 */
680 tp->t_flags |= TF_ACKNOW;
681 tcp_output(pData, tp);
682 SOCKET_UNLOCK(so);
683 return;
684 }
685 } /* header prediction */
686 /*
687 * Calculate amount of space in receive window,
688 * and then do TCP input processing.
689 * Receive window is amount of space in rcv queue,
690 * but not less than advertised window.
691 */
692 {
693 int win;
694 win = sbspace(&so->so_rcv);
695 if (win < 0)
696 win = 0;
697 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
698 }
699
700 switch (tp->t_state)
701 {
702 /*
703 * If the state is LISTEN then ignore segment if it contains an RST.
704 * If the segment contains an ACK then it is bad and send a RST.
705 * If it does not contain a SYN then it is not interesting; drop it.
706 * Don't bother responding if the destination was a broadcast.
707 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
708 * tp->iss, and send a segment:
709 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
710 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
711 * Fill in remote peer address fields if not previously specified.
712 * Enter SYN_RECEIVED state, and process any other fields of this
713 * segment in this state.
714 */
715 case TCPS_LISTEN:
716 {
717 if (tiflags & TH_RST) {
718 Log2(("RST(%x) is on listen =>drop\n", tiflags));
719 goto drop;
720 }
721 if (tiflags & TH_ACK)
722 goto dropwithreset;
723 if ((tiflags & TH_SYN) == 0)
724 {
725 Log2(("SYN(%x) is off on listen =>drop\n", tiflags));
726 goto drop;
727 }
728
729 /*
730 * This has way too many gotos...
731 * But a bit of spaghetti code never hurt anybody :)
732 */
733
734 if (so->so_emu & EMU_NOCONNECT)
735 {
736 so->so_emu &= ~EMU_NOCONNECT;
737 goto cont_input;
738 }
739
740 if ( (tcp_fconnect(pData, so) == -1)
741 && errno != EINPROGRESS
742 && errno != EWOULDBLOCK)
743 {
744 u_char code = ICMP_UNREACH_NET;
745 DEBUG_MISC((dfd," tcp fconnect errno = %d-%s\n",
746 errno, strerror(errno)));
747 if (errno == ECONNREFUSED)
748 {
749 /* ACK the SYN, send RST to refuse the connection */
750 tcp_respond(pData, tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
751 TH_RST|TH_ACK);
752 }
753 else
754 {
755 if (errno == EHOSTUNREACH)
756 code = ICMP_UNREACH_HOST;
757 HTONL(ti->ti_seq); /* restore tcp header */
758 HTONL(ti->ti_ack);
759 HTONS(ti->ti_win);
760 HTONS(ti->ti_urp);
761 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
762 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
763 *ip = save_ip;
764 icmp_error(pData, m, ICMP_UNREACH, code, 0, strerror(errno));
765 tp->t_socket->so_m = NULL;
766 }
767 tp = tcp_close(pData, tp);
768 m_free(pData, m);
769 }
770 else
771 {
772 /*
773 * Haven't connected yet, save the current mbuf
774 * and ti, and return
775 * XXX Some OS's don't tell us whether the connect()
776 * succeeded or not. So we must time it out.
777 */
778#ifdef VBOX_WITH_NAT_SERVICE
779 Assert(m);
780 {
781 struct ethhdr *eh0;
782 eh0 = (struct ethhdr *)m->m_dat;
783 }
784#endif
785 so->so_m = m;
786 so->so_ti = ti;
787 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
788 tp->t_state = TCPS_SYN_RECEIVED;
789 }
790 SOCKET_UNLOCK(so);
791 return;
792
793cont_conn:
794 /* m==NULL
795 * Check if the connect succeeded
796 */
797 if (so->so_state & SS_NOFDREF)
798 {
799 tp = tcp_close(pData, tp);
800 goto dropwithreset;
801 }
802cont_input:
803 tcp_template(tp);
804
805 if (optp)
806 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
807
808 if (iss)
809 tp->iss = iss;
810 else
811 tp->iss = tcp_iss;
812 tcp_iss += TCP_ISSINCR/2;
813 tp->irs = ti->ti_seq;
814 tcp_sendseqinit(tp);
815 tcp_rcvseqinit(tp);
816 tp->t_flags |= TF_ACKNOW;
817 tp->t_state = TCPS_SYN_RECEIVED;
818 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
819 tcpstat.tcps_accepts++;
820 Log2(("hit trimthenstep6\n"));
821 goto trimthenstep6;
822 } /* case TCPS_LISTEN */
823
824 /*
825 * If the state is SYN_SENT:
826 * if seg contains an ACK, but not for our SYN, drop the input.
827 * if seg contains a RST, then drop the connection.
828 * if seg does not contain SYN, then drop it.
829 * Otherwise this is an acceptable SYN segment
830 * initialize tp->rcv_nxt and tp->irs
831 * if seg contains ack then advance tp->snd_una
832 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
833 * arrange for segment to be acked (eventually)
834 * continue processing rest of data/controls, beginning with URG
835 */
836 case TCPS_SYN_SENT:
837 if ( (tiflags & TH_ACK)
838 && ( SEQ_LEQ(ti->ti_ack, tp->iss)
839 || SEQ_GT(ti->ti_ack, tp->snd_max)))
840 goto dropwithreset;
841
842 if (tiflags & TH_RST)
843 {
844 if (tiflags & TH_ACK)
845 tp = tcp_drop(pData, tp, 0); /* XXX Check t_softerror! */
846 Log2(("RST(%x) is on SYN_SENT =>drop\n", tiflags));
847 goto drop;
848 }
849
850 if ((tiflags & TH_SYN) == 0)
851 {
852 Log2(("SYN(%x) bit is off on SYN_SENT =>drop\n", tiflags));
853 goto drop;
854 }
855 if (tiflags & TH_ACK)
856 {
857 tp->snd_una = ti->ti_ack;
858 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
859 tp->snd_nxt = tp->snd_una;
860 }
861
862 tp->t_timer[TCPT_REXMT] = 0;
863 tp->irs = ti->ti_seq;
864 tcp_rcvseqinit(tp);
865 tp->t_flags |= TF_ACKNOW;
866 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss))
867 {
868 tcpstat.tcps_connects++;
869 soisfconnected(so);
870 tp->t_state = TCPS_ESTABLISHED;
871
872 /* Do window scaling on this connection? */
873#if 0
874 if (( tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
875 == (TF_RCVD_SCALE|TF_REQ_SCALE))
876 {
877 tp->snd_scale = tp->requested_s_scale;
878 tp->rcv_scale = tp->request_r_scale;
879 }
880#endif
881 (void) tcp_reass(pData, tp, (struct tcphdr *)0, NULL, (struct mbuf *)0);
882 /*
883 * if we didn't have to retransmit the SYN,
884 * use its rtt as our initial srtt & rtt var.
885 */
886 if (tp->t_rtt)
887 tcp_xmit_timer(pData, tp, tp->t_rtt);
888 }
889 else
890 tp->t_state = TCPS_SYN_RECEIVED;
891
892trimthenstep6:
893 /*
894 * Advance ti->ti_seq to correspond to first data byte.
895 * If data, trim to stay within window,
896 * dropping FIN if necessary.
897 */
898 ti->ti_seq++;
899 if (ti->ti_len > tp->rcv_wnd)
900 {
901 todrop = ti->ti_len - tp->rcv_wnd;
902 m_adj(m, -todrop);
903 ti->ti_len = tp->rcv_wnd;
904 tiflags &= ~TH_FIN;
905 tcpstat.tcps_rcvpackafterwin++;
906 tcpstat.tcps_rcvbyteafterwin += todrop;
907 }
908 tp->snd_wl1 = ti->ti_seq - 1;
909 tp->rcv_up = ti->ti_seq;
910 Log2(("hit6"));
911 goto step6;
912 } /* switch tp->t_state */
913 /*
914 * States other than LISTEN or SYN_SENT.
915 * First check timestamp, if present.
916 * Then check that at least some bytes of segment are within
917 * receive window. If segment begins before rcv_nxt,
918 * drop leading data (and SYN); if nothing left, just ack.
919 *
920 * RFC 1323 PAWS: If we have a timestamp reply on this segment
921 * and it's less than ts_recent, drop it.
922 */
923#if 0
924 if ( ts_present
925 && (tiflags & TH_RST) == 0
926 && tp->ts_recent
927 && TSTMP_LT(ts_val, tp->ts_recent))
928 {
929 /* Check to see if ts_recent is over 24 days old. */
930 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE)
931 {
932 /*
933 * Invalidate ts_recent. If this segment updates
934 * ts_recent, the age will be reset later and ts_recent
935 * will get a valid value. If it does not, setting
936 * ts_recent to zero will at least satisfy the
937 * requirement that zero be placed in the timestamp
938 * echo reply when ts_recent isn't valid. The
939 * age isn't reset until we get a valid ts_recent
940 * because we don't want out-of-order segments to be
941 * dropped when ts_recent is old.
942 */
943 tp->ts_recent = 0;
944 }
945 else
946 {
947 tcpstat.tcps_rcvduppack++;
948 tcpstat.tcps_rcvdupbyte += ti->ti_len;
949 tcpstat.tcps_pawsdrop++;
950 goto dropafterack;
951 }
952 }
953#endif
954
955 todrop = tp->rcv_nxt - ti->ti_seq;
956 if (todrop > 0)
957 {
958 if (tiflags & TH_SYN)
959 {
960 tiflags &= ~TH_SYN;
961 ti->ti_seq++;
962 if (ti->ti_urp > 1)
963 ti->ti_urp--;
964 else
965 tiflags &= ~TH_URG;
966 todrop--;
967 }
968 /*
969 * Following if statement from Stevens, vol. 2, p. 960.
970 */
971 if ( todrop > ti->ti_len
972 || ( todrop == ti->ti_len
973 && (tiflags & TH_FIN) == 0))
974 {
975 /*
976 * Any valid FIN must be to the left of the window.
977 * At this point the FIN must be a duplicate or out
978 * of sequence; drop it.
979 */
980 tiflags &= ~TH_FIN;
981
982 /*
983 * Send an ACK to resynchronize and drop any data.
984 * But keep on processing for RST or ACK.
985 */
986 tp->t_flags |= TF_ACKNOW;
987 todrop = ti->ti_len;
988 tcpstat.tcps_rcvduppack++;
989 tcpstat.tcps_rcvdupbyte += todrop;
990 }
991 else
992 {
993 tcpstat.tcps_rcvpartduppack++;
994 tcpstat.tcps_rcvpartdupbyte += todrop;
995 }
996 m_adj(m, todrop);
997 ti->ti_seq += todrop;
998 ti->ti_len -= todrop;
999 if (ti->ti_urp > todrop)
1000 ti->ti_urp -= todrop;
1001 else
1002 {
1003 tiflags &= ~TH_URG;
1004 ti->ti_urp = 0;
1005 }
1006 }
1007 /*
1008 * If new data are received on a connection after the
1009 * user processes are gone, then RST the other end.
1010 */
1011 if ( (so->so_state & SS_NOFDREF)
1012 && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len)
1013 {
1014 tp = tcp_close(pData, tp);
1015 tcpstat.tcps_rcvafterclose++;
1016 goto dropwithreset;
1017 }
1018
1019 /*
1020 * If segment ends after window, drop trailing data
1021 * (and PUSH and FIN); if nothing left, just ACK.
1022 */
1023 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1024 if (todrop > 0)
1025 {
1026 tcpstat.tcps_rcvpackafterwin++;
1027 if (todrop >= ti->ti_len)
1028 {
1029 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1030 /*
1031 * If a new connection request is received
1032 * while in TIME_WAIT, drop the old connection
1033 * and start over if the sequence numbers
1034 * are above the previous ones.
1035 */
1036 if ( tiflags & TH_SYN
1037 && tp->t_state == TCPS_TIME_WAIT
1038 && SEQ_GT(ti->ti_seq, tp->rcv_nxt))
1039 {
1040 iss = tp->rcv_nxt + TCP_ISSINCR;
1041 tp = tcp_close(pData, tp);
1042 SOCKET_UNLOCK(tp->t_socket);
1043 goto findso;
1044 }
1045 /*
1046 * If window is closed can only take segments at
1047 * window edge, and have to drop data and PUSH from
1048 * incoming segments. Continue processing, but
1049 * remember to ack. Otherwise, drop segment
1050 * and ack.
1051 */
1052 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt)
1053 {
1054 tp->t_flags |= TF_ACKNOW;
1055 tcpstat.tcps_rcvwinprobe++;
1056 }
1057 else
1058 goto dropafterack;
1059 }
1060 else
1061 tcpstat.tcps_rcvbyteafterwin += todrop;
1062 m_adj(m, -todrop);
1063 ti->ti_len -= todrop;
1064 tiflags &= ~(TH_PUSH|TH_FIN);
1065 }
1066
1067 /*
1068 * If last ACK falls within this segment's sequence numbers,
1069 * record its timestamp.
1070 */
1071#if 0
1072 if ( ts_present
1073 && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)
1074 && SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1075 {
1076 tp->ts_recent_age = tcp_now;
1077 tp->ts_recent = ts_val;
1078 }
1079#endif
1080
1081 /*
1082 * If the RST bit is set examine the state:
1083 * SYN_RECEIVED STATE:
1084 * If passive open, return to LISTEN state.
1085 * If active open, inform user that connection was refused.
1086 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1087 * Inform user that connection was reset, and close tcb.
1088 * CLOSING, LAST_ACK, TIME_WAIT STATES
1089 * Close the tcb.
1090 */
1091 if (tiflags&TH_RST)
1092 switch (tp->t_state)
1093 {
1094 case TCPS_SYN_RECEIVED:
1095/* so->so_error = ECONNREFUSED; */
1096 goto close;
1097
1098 case TCPS_ESTABLISHED:
1099 case TCPS_FIN_WAIT_1:
1100 case TCPS_FIN_WAIT_2:
1101 case TCPS_CLOSE_WAIT:
1102/* so->so_error = ECONNRESET; */
1103close:
1104 Log2(("closing...=>drop\n", tp->t_state));
1105 tp->t_state = TCPS_CLOSED;
1106 tcpstat.tcps_drops++;
1107 tp = tcp_close(pData, tp);
1108 goto drop;
1109
1110 case TCPS_CLOSING:
1111 case TCPS_LAST_ACK:
1112 case TCPS_TIME_WAIT:
1113 Log2(("t_state is (%x) sort of close =>drop\n", tp->t_state));
1114 tp = tcp_close(pData, tp);
1115 goto drop;
1116 }
1117
1118 /*
1119 * If a SYN is in the window, then this is an
1120 * error and we send an RST and drop the connection.
1121 */
1122 if (tiflags & TH_SYN)
1123 {
1124 tp = tcp_drop(pData, tp, 0);
1125 goto dropwithreset;
1126 }
1127
1128 /*
1129 * If the ACK bit is off we drop the segment and return.
1130 */
1131 if ((tiflags & TH_ACK) == 0)
1132 {
1133 Log2(("ACK(%x) bit is off =>drop\n", tiflags));
1134 goto drop;
1135 }
1136
1137 /*
1138 * Ack processing.
1139 */
1140 switch (tp->t_state)
1141 {
1142 /*
1143 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1144 * ESTABLISHED state and continue processing, otherwise
1145 * send an RST. una<=ack<=max
1146 */
1147 case TCPS_SYN_RECEIVED:
1148 if ( SEQ_GT(tp->snd_una, ti->ti_ack)
1149 || SEQ_GT(ti->ti_ack, tp->snd_max))
1150 goto dropwithreset;
1151 tcpstat.tcps_connects++;
1152 tp->t_state = TCPS_ESTABLISHED;
1153 /*
1154 * The sent SYN is ack'ed with our sequence number +1
1155 * The first data byte already in the buffer will get
1156 * lost if no correction is made. This is only needed for
1157 * SS_CTL since the buffer is empty otherwise.
1158 * tp->snd_una++; or:
1159 */
1160 tp->snd_una = ti->ti_ack;
1161 soisfconnected(so);
1162
1163 /* Do window scaling? */
1164#if 0
1165 if ( (tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
1166 == (TF_RCVD_SCALE|TF_REQ_SCALE))
1167 {
1168 tp->snd_scale = tp->requested_s_scale;
1169 tp->rcv_scale = tp->request_r_scale;
1170 }
1171#endif
1172 (void) tcp_reass(pData, tp, (struct tcphdr *)0, (int *)0, (struct mbuf *)0);
1173 tp->snd_wl1 = ti->ti_seq - 1;
1174 /* Avoid ack processing; snd_una==ti_ack => dup ack */
1175 Log2(("hit synrx_to_est\n"));
1176 goto synrx_to_est;
1177 /* fall into ... */
1178
1179 /*
1180 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1181 * ACKs. If the ack is in the range
1182 * tp->snd_una < ti->ti_ack <= tp->snd_max
1183 * then advance tp->snd_una to ti->ti_ack and drop
1184 * data from the retransmission queue. If this ACK reflects
1185 * more up to date window information we update our window information.
1186 */
1187 case TCPS_ESTABLISHED:
1188 case TCPS_FIN_WAIT_1:
1189 case TCPS_FIN_WAIT_2:
1190 case TCPS_CLOSE_WAIT:
1191 case TCPS_CLOSING:
1192 case TCPS_LAST_ACK:
1193 case TCPS_TIME_WAIT:
1194 if (SEQ_LEQ(ti->ti_ack, tp->snd_una))
1195 {
1196 if (ti->ti_len == 0 && tiwin == tp->snd_wnd)
1197 {
1198 tcpstat.tcps_rcvdupack++;
1199 DEBUG_MISC((dfd," dup ack m = %lx so = %lx \n",
1200 (long )m, (long )so));
1201 /*
1202 * If we have outstanding data (other than
1203 * a window probe), this is a completely
1204 * duplicate ack (ie, window info didn't
1205 * change), the ack is the biggest we've
1206 * seen and we've seen exactly our rexmt
1207 * threshold of them, assume a packet
1208 * has been dropped and retransmit it.
1209 * Kludge snd_nxt & the congestion
1210 * window so we send only this one
1211 * packet.
1212 *
1213 * We know we're losing at the current
1214 * window size so do congestion avoidance
1215 * (set ssthresh to half the current window
1216 * and pull our congestion window back to
1217 * the new ssthresh).
1218 *
1219 * Dup acks mean that packets have left the
1220 * network (they're now cached at the receiver)
1221 * so bump cwnd by the amount in the receiver
1222 * to keep a constant cwnd packets in the
1223 * network.
1224 */
1225 if ( tp->t_timer[TCPT_REXMT] == 0
1226 || ti->ti_ack != tp->snd_una)
1227 tp->t_dupacks = 0;
1228 else if (++tp->t_dupacks == tcprexmtthresh)
1229 {
1230 tcp_seq onxt = tp->snd_nxt;
1231 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
1232 if (win < 2)
1233 win = 2;
1234 tp->snd_ssthresh = win * tp->t_maxseg;
1235 tp->t_timer[TCPT_REXMT] = 0;
1236 tp->t_rtt = 0;
1237 tp->snd_nxt = ti->ti_ack;
1238 tp->snd_cwnd = tp->t_maxseg;
1239 (void) tcp_output(pData, tp);
1240 tp->snd_cwnd = tp->snd_ssthresh +
1241 tp->t_maxseg * tp->t_dupacks;
1242 if (SEQ_GT(onxt, tp->snd_nxt))
1243 tp->snd_nxt = onxt;
1244 Log2(("t_dupacks(%d) == tcprexmtthresh(%d)=>drop\n", tp->t_dupacks, tcprexmtthresh));
1245 goto drop;
1246 }
1247 else if (tp->t_dupacks > tcprexmtthresh)
1248 {
1249 tp->snd_cwnd += tp->t_maxseg;
1250 (void) tcp_output(pData, tp);
1251 Log2(("t_dupacks(%d) > tcprexmtthresh(%d)=>drop\n", tp->t_dupacks, tcprexmtthresh));
1252 goto drop;
1253 }
1254 }
1255 else
1256 tp->t_dupacks = 0;
1257 break;
1258 }
1259synrx_to_est:
1260 Log2(("enter synrx_to_est\n"));
1261 /*
1262 * If the congestion window was inflated to account
1263 * for the other side's cached packets, retract it.
1264 */
1265 if ( tp->t_dupacks > tcprexmtthresh
1266 && tp->snd_cwnd > tp->snd_ssthresh)
1267 tp->snd_cwnd = tp->snd_ssthresh;
1268 tp->t_dupacks = 0;
1269 if (SEQ_GT(ti->ti_ack, tp->snd_max))
1270 {
1271 tcpstat.tcps_rcvacktoomuch++;
1272 goto dropafterack;
1273 }
1274 acked = ti->ti_ack - tp->snd_una;
1275 tcpstat.tcps_rcvackpack++;
1276 tcpstat.tcps_rcvackbyte += acked;
1277
1278 /*
1279 * If we have a timestamp reply, update smoothed
1280 * round trip time. If no timestamp is present but
1281 * transmit timer is running and timed sequence
1282 * number was acked, update smoothed round trip time.
1283 * Since we now have an rtt measurement, cancel the
1284 * timer backoff (cf., Phil Karn's retransmit alg.).
1285 * Recompute the initial retransmit timer.
1286 */
1287#if 0
1288 if (ts_present)
1289 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1290 else
1291#endif
1292 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1293 tcp_xmit_timer(pData, tp, tp->t_rtt);
1294
1295 /*
1296 * If all outstanding data is acked, stop retransmit
1297 * timer and remember to restart (more output or persist).
1298 * If there is more data to be acked, restart retransmit
1299 * timer, using current (possibly backed-off) value.
1300 */
1301 if (ti->ti_ack == tp->snd_max)
1302 {
1303 tp->t_timer[TCPT_REXMT] = 0;
1304 needoutput = 1;
1305 }
1306 else if (tp->t_timer[TCPT_PERSIST] == 0)
1307 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1308 /*
1309 * When new data is acked, open the congestion window.
1310 * If the window gives us less than ssthresh packets
1311 * in flight, open exponentially (maxseg per packet).
1312 * Otherwise open linearly: maxseg per window
1313 * (maxseg^2 / cwnd per packet).
1314 */
1315 {
1316 register u_int cw = tp->snd_cwnd;
1317 register u_int incr = tp->t_maxseg;
1318
1319 if (cw > tp->snd_ssthresh)
1320 incr = incr * incr / cw;
1321 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1322 }
1323 if (acked > so->so_snd.sb_cc)
1324 {
1325 tp->snd_wnd -= so->so_snd.sb_cc;
1326 sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
1327 ourfinisacked = 1;
1328 }
1329 else
1330 {
1331 sbdrop(&so->so_snd, acked);
1332 tp->snd_wnd -= acked;
1333 ourfinisacked = 0;
1334 }
1335 /*
1336 * XXX sowwakeup is called when data is acked and there's room
1337 * for more data... it should read() the socket
1338 */
1339#if 0
1340 if (so->so_snd.sb_flags & SB_NOTIFY)
1341 sowwakeup(so);
1342#endif
1343 tp->snd_una = ti->ti_ack;
1344 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1345 tp->snd_nxt = tp->snd_una;
1346
1347 switch (tp->t_state)
1348 {
1349 /*
1350 * In FIN_WAIT_1 STATE in addition to the processing
1351 * for the ESTABLISHED state if our FIN is now acknowledged
1352 * then enter FIN_WAIT_2.
1353 */
1354 case TCPS_FIN_WAIT_1:
1355 if (ourfinisacked)
1356 {
1357 /*
1358 * If we can't receive any more
1359 * data, then closing user can proceed.
1360 * Starting the timer is contrary to the
1361 * specification, but if we don't get a FIN
1362 * we'll hang forever.
1363 */
1364 if (so->so_state & SS_FCANTRCVMORE)
1365 {
1366 soisfdisconnected(so);
1367 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1368 }
1369 tp->t_state = TCPS_FIN_WAIT_2;
1370 }
1371 break;
1372
1373 /*
1374 * In CLOSING STATE in addition to the processing for
1375 * the ESTABLISHED state if the ACK acknowledges our FIN
1376 * then enter the TIME-WAIT state, otherwise ignore
1377 * the segment.
1378 */
1379 case TCPS_CLOSING:
1380 if (ourfinisacked)
1381 {
1382 tp->t_state = TCPS_TIME_WAIT;
1383 tcp_canceltimers(tp);
1384 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1385 soisfdisconnected(so);
1386 }
1387 break;
1388
1389 /*
1390 * In LAST_ACK, we may still be waiting for data to drain
1391 * and/or to be acked, as well as for the ack of our FIN.
1392 * If our FIN is now acknowledged, delete the TCB,
1393 * enter the closed state and return.
1394 */
1395 case TCPS_LAST_ACK:
1396 if (ourfinisacked)
1397 {
1398 Log2(("ourfinisacked=>drop\n"));
1399 tp = tcp_close(pData, tp);
1400 goto drop;
1401 }
1402 break;
1403
1404 /*
1405 * In TIME_WAIT state the only thing that should arrive
1406 * is a retransmission of the remote FIN. Acknowledge
1407 * it and restart the finack timer.
1408 */
1409 case TCPS_TIME_WAIT:
1410 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1411 goto dropafterack;
1412 }
1413 } /* switch(tp->t_state) */
1414
1415step6:
1416 /*
1417 * Update window information.
1418 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1419 */
1420 if ( (tiflags & TH_ACK)
1421 && ( SEQ_LT(tp->snd_wl1, ti->ti_seq)
1422 || ( tp->snd_wl1 == ti->ti_seq
1423 && ( SEQ_LT(tp->snd_wl2, ti->ti_ack)
1424 || ( tp->snd_wl2 == ti->ti_ack
1425 && tiwin > tp->snd_wnd)))))
1426 {
1427 /* keep track of pure window updates */
1428 if ( ti->ti_len == 0
1429 && tp->snd_wl2 == ti->ti_ack
1430 && tiwin > tp->snd_wnd)
1431 tcpstat.tcps_rcvwinupd++;
1432 tp->snd_wnd = tiwin;
1433 tp->snd_wl1 = ti->ti_seq;
1434 tp->snd_wl2 = ti->ti_ack;
1435 if (tp->snd_wnd > tp->max_sndwnd)
1436 tp->max_sndwnd = tp->snd_wnd;
1437 needoutput = 1;
1438 }
1439
1440 /*
1441 * Process segments with URG.
1442 */
1443 if ((tiflags & TH_URG) && ti->ti_urp &&
1444 TCPS_HAVERCVDFIN(tp->t_state) == 0)
1445 {
1446 /*
1447 * This is a kludge, but if we receive and accept
1448 * random urgent pointers, we'll crash in
1449 * soreceive. It's hard to imagine someone
1450 * actually wanting to send this much urgent data.
1451 */
1452 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen)
1453 {
1454 ti->ti_urp = 0;
1455 tiflags &= ~TH_URG;
1456 goto dodata;
1457 }
1458 /*
1459 * If this segment advances the known urgent pointer,
1460 * then mark the data stream. This should not happen
1461 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1462 * a FIN has been received from the remote side.
1463 * In these states we ignore the URG.
1464 *
1465 * According to RFC961 (Assigned Protocols),
1466 * the urgent pointer points to the last octet
1467 * of urgent data. We continue, however,
1468 * to consider it to indicate the first octet
1469 * of data past the urgent section as the original
1470 * spec states (in one of two places).
1471 */
1472 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up))
1473 {
1474 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1475 so->so_urgc = so->so_rcv.sb_cc +
1476 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1477 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1478 }
1479 }
1480 else
1481 /*
1482 * If no out of band data is expected,
1483 * pull receive urgent pointer along
1484 * with the receive window.
1485 */
1486 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1487 tp->rcv_up = tp->rcv_nxt;
1488dodata:
1489 Log2(("do data hit!\n"));
1490
1491 /*
1492 * If this is a small packet, then ACK now - with Nagle
1493 * congestion avoidance sender won't send more until
1494 * he gets an ACK.
1495 *
1496 * See above.
1497 */
1498 if ( ti->ti_len
1499 && (unsigned)ti->ti_len <= 5
1500 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27)
1501 {
1502 tp->t_flags |= TF_ACKNOW;
1503 }
1504
1505 /*
1506 * Process the segment text, merging it into the TCP sequencing queue,
1507 * and arranging for acknowledgment of receipt if necessary.
1508 * This process logically involves adjusting tp->rcv_wnd as data
1509 * is presented to the user (this happens in tcp_usrreq.c,
1510 * case PRU_RCVD). If a FIN has already been received on this
1511 * connection then we just ignore the text.
1512 */
1513 if ( (ti->ti_len || (tiflags&TH_FIN))
1514 && TCPS_HAVERCVDFIN(tp->t_state) == 0)
1515 {
1516 if ( ti->ti_seq == tp->rcv_nxt
1517 && LIST_EMPTY(&tp->t_segq)
1518 && tp->t_state == TCPS_ESTABLISHED)
1519 {
1520 DELAY_ACK(tp, ti); /* slightly different from the BSD version; see netinet/tcp_input.c */
1521 tp->rcv_nxt += tlen;
1522 tiflags = ti->ti_t.th_flags & TH_FIN;
1523 tcpstat.tcps_rcvpack++;
1524 tcpstat.tcps_rcvbyte += tlen;
1525 if (so->so_state & SS_FCANTRCVMORE)
1526 m_freem(pData, m);
1527 else
1528 {
1529 if (so->so_emu)
1530 {
1531 if (tcp_emu(pData, so, m))
1532 sbappend(pData, so, m);
1533 }
1534 else
1535 sbappend(pData, so, m);
1536 }
1537 }
1538 else
1539 {
1540 tiflags = tcp_reass(pData, tp, &ti->ti_t, &tlen, m);
1541 tp->t_flags |= TF_ACKNOW;
1542 }
1543 /*
1544 * Note the amount of data that peer has sent into
1545 * our window, in order to estimate the sender's
1546 * buffer size.
1547 */
1548 len = so->so_rcv.sb_datalen - (tp->rcv_adv - tp->rcv_nxt);
1549 }
1550 else
1551 {
1552 m_free(pData, m);
1553 tiflags &= ~TH_FIN;
1554 }
1555
1556 /*
1557 * If FIN is received ACK the FIN and let the user know
1558 * that the connection is closing.
1559 */
1560 if (tiflags & TH_FIN)
1561 {
1562 if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
1563 {
1564 /*
1565 * If we receive a FIN we can't send more data,
1566 * so set SS_FDRAIN on the socket.
1567 * Shutdown the socket if there is no rx data in the
1568 * buffer.
1569 * soread() is called on completion of shutdown() and
1570 * will go to TCPS_LAST_ACK, and use tcp_output()
1571 * to send the FIN.
1572 */
1573/* sofcantrcvmore(so); */
1574 sofwdrain(so);
1575
1576 tp->t_flags |= TF_ACKNOW;
1577 tp->rcv_nxt++;
1578 }
1579 switch (tp->t_state)
1580 {
1581 /*
1582 * In SYN_RECEIVED and ESTABLISHED STATES
1583 * enter the CLOSE_WAIT state.
1584 */
1585 case TCPS_SYN_RECEIVED:
1586 case TCPS_ESTABLISHED:
1587 if(so->so_emu == EMU_CTL) /* no shutdown on socket */
1588 tp->t_state = TCPS_LAST_ACK;
1589 else
1590 tp->t_state = TCPS_CLOSE_WAIT;
1591 break;
1592
1593 /*
1594 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1595 * enter the CLOSING state.
1596 */
1597 case TCPS_FIN_WAIT_1:
1598 tp->t_state = TCPS_CLOSING;
1599 break;
1600
1601 /*
1602 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1603 * starting the time-wait timer, turning off the other
1604 * standard timers.
1605 */
1606 case TCPS_FIN_WAIT_2:
1607 tp->t_state = TCPS_TIME_WAIT;
1608 tcp_canceltimers(tp);
1609 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1610 soisfdisconnected(so);
1611 break;
1612
1613 /*
1614 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1615 */
1616 case TCPS_TIME_WAIT:
1617 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1618 break;
1619 }
1620 }
1621
1622 /*
1623 * Return any desired output.
1624 */
1625 if (needoutput || (tp->t_flags & TF_ACKNOW))
1626 tcp_output(pData, tp);
1627
1628 SOCKET_UNLOCK(so);
1629 return;
1630
1631dropafterack:
1632 Log2(("drop after ack\n"));
1633 /*
1634 * Generate an ACK dropping incoming segment if it occupies
1635 * sequence space, where the ACK reflects our state.
1636 */
1637 if (tiflags & TH_RST)
1638 goto drop;
1639 m_freem(pData, m);
1640 tp->t_flags |= TF_ACKNOW;
1641 (void) tcp_output(pData, tp);
1642 SOCKET_UNLOCK(so);
1643 return;
1644
1645dropwithreset:
1646 /* reuses m if m!=NULL, m_free() unnecessary */
1647 if (tiflags & TH_ACK)
1648 tcp_respond(pData, tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1649 else
1650 {
1651 if (tiflags & TH_SYN) ti->ti_len++;
1652 tcp_respond(pData, tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1653 TH_RST|TH_ACK);
1654 }
1655
1656 if (so != &tcb)
1657 SOCKET_UNLOCK(so);
1658 return;
1659
1660drop:
1661 /*
1662 * Drop space held by incoming segment and return.
1663 */
1664 m_free(pData, m);
1665
1666#ifdef VBOX_WITH_SLIRP_MT
1667 if (RTCritSectIsOwned(&so->so_mutex))
1668 {
1669 SOCKET_UNLOCK(so);
1670 }
1671#endif
1672
1673 return;
1674}
1675
1676void
1677tcp_dooptions(PNATState pData, struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1678{
1679 u_int16_t mss;
1680 int opt, optlen;
1681
1682 DEBUG_CALL("tcp_dooptions");
1683 DEBUG_ARGS((dfd," tp = %lx cnt=%i \n", (long )tp, cnt));
1684
1685 for (; cnt > 0; cnt -= optlen, cp += optlen)
1686 {
1687 opt = cp[0];
1688 if (opt == TCPOPT_EOL)
1689 break;
1690 if (opt == TCPOPT_NOP)
1691 optlen = 1;
1692 else
1693 {
1694 optlen = cp[1];
1695 if (optlen <= 0)
1696 break;
1697 }
1698 switch (opt)
1699 {
1700 default:
1701 continue;
1702
1703 case TCPOPT_MAXSEG:
1704 if (optlen != TCPOLEN_MAXSEG)
1705 continue;
1706 if (!(ti->ti_flags & TH_SYN))
1707 continue;
1708 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1709 NTOHS(mss);
1710 (void) tcp_mss(pData, tp, mss); /* sets t_maxseg */
1711 break;
1712
1713#if 0
1714 case TCPOPT_WINDOW:
1715 if (optlen != TCPOLEN_WINDOW)
1716 continue;
1717 if (!(ti->ti_flags & TH_SYN))
1718 continue;
1719 tp->t_flags |= TF_RCVD_SCALE;
1720 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1721 break;
1722
1723 case TCPOPT_TIMESTAMP:
1724 if (optlen != TCPOLEN_TIMESTAMP)
1725 continue;
1726 *ts_present = 1;
1727 memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1728 NTOHL(*ts_val);
1729 memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1730 NTOHL(*ts_ecr);
1731
1732 /*
1733 * A timestamp received in a SYN makes
1734 * it ok to send timestamp requests and replies.
1735 */
1736 if (ti->ti_flags & TH_SYN)
1737 {
1738 tp->t_flags |= TF_RCVD_TSTMP;
1739 tp->ts_recent = *ts_val;
1740 tp->ts_recent_age = tcp_now;
1741 }
1742 break;
1743#endif
1744 }
1745 }
1746}
1747
1748
1749/*
1750 * Pull out of band byte out of a segment so
1751 * it doesn't appear in the user's data queue.
1752 * It is still reflected in the segment length for
1753 * sequencing purposes.
1754 */
1755
1756#if 0
1757void
1758tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m)
1759{
1760 int cnt = ti->ti_urp - 1;
1761
1762 while (cnt >= 0)
1763 {
1764 if (m->m_len > cnt)
1765 {
1766 char *cp = mtod(m, caddr_t) + cnt;
1767 struct tcpcb *tp = sototcpcb(so);
1768
1769 tp->t_iobc = *cp;
1770 tp->t_oobflags |= TCPOOB_HAVEDATA;
1771 memcpy(sp, cp+1, (unsigned)(m->m_len - cnt - 1));
1772 m->m_len--;
1773 return;
1774 }
1775 cnt -= m->m_len;
1776 m = m->m_next; /* XXX WRONG! Fix it! */
1777 if (m == 0)
1778 break;
1779 }
1780 panic("tcp_pulloutofband");
1781}
1782#endif
1783
1784/*
1785 * Collect new round-trip time estimate
1786 * and update averages and current timeout.
1787 */
1788
1789void
1790tcp_xmit_timer(PNATState pData, register struct tcpcb *tp, int rtt)
1791{
1792 register short delta;
1793
1794 DEBUG_CALL("tcp_xmit_timer");
1795 DEBUG_ARG("tp = %lx", (long)tp);
1796 DEBUG_ARG("rtt = %d", rtt);
1797
1798 tcpstat.tcps_rttupdated++;
1799 if (tp->t_srtt != 0)
1800 {
1801 /*
1802 * srtt is stored as fixed point with 3 bits after the
1803 * binary point (i.e., scaled by 8). The following magic
1804 * is equivalent to the smoothing algorithm in rfc793 with
1805 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1806 * point). Adjust rtt to origin 0.
1807 */
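 /* Worked example with assumed numbers (TCP_RTT_SHIFT == 3 as in BSD): if
  * t_srtt == 40 (5 ticks scaled by 8) and the new sample is rtt == 7 ticks,
  * then delta = 7 - 1 - (40 >> 3) = 1 and t_srtt becomes 41, i.e. 5.125
  * ticks, which is exactly srtt*7/8 plus (rtt adjusted to origin 0)/8,
  * that is 35/8 + 6/8. */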
1808 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1809 if ((tp->t_srtt += delta) <= 0)
1810 tp->t_srtt = 1;
1811 /*
1812 * We accumulate a smoothed rtt variance (actually, a
1813 * smoothed mean difference), then set the retransmit
1814 * timer to smoothed rtt + 4 times the smoothed variance.
1815 * rttvar is stored as fixed point with 2 bits after the
1816 * binary point (scaled by 4). The following is
1817 * equivalent to rfc793 smoothing with an alpha of .75
1818 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1819 * rfc793's wired-in beta.
1820 */
1821 if (delta < 0)
1822 delta = -delta;
1823 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1824 if ((tp->t_rttvar += delta) <= 0)
1825 tp->t_rttvar = 1;
1826 }
1827 else
1828 {
1829 /*
1830 * No rtt measurement yet - use the unsmoothed rtt.
1831 * Set the variance to half the rtt (so our first
1832 * retransmit happens at 3*rtt).
1833 */
1834 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1835 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1836 }
1837 tp->t_rtt = 0;
1838 tp->t_rxtshift = 0;
1839
1840 /*
1841 * the retransmit should happen at rtt + 4 * rttvar.
1842 * Because of the way we do the smoothing, srtt and rttvar
1843 * will each average +1/2 tick of bias. When we compute
1844 * the retransmit timer, we want 1/2 tick of rounding and
1845 * 1 extra tick because of +-1/2 tick uncertainty in the
1846 * firing of the timer. The bias will give us exactly the
1847 * 1.5 tick we need. But, because the bias is
1848 * statistical, we have to test that we don't drop below
1849 * the minimum feasible timer (which is 2 ticks).
1850 */
1851 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1852 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
1853
1854 /*
1855 * We received an ack for a packet that wasn't retransmitted;
1856 * it is probably safe to discard any error indications we've
1857 * received recently. This isn't quite right, but close enough
1858 * for now (a route might have failed after we sent a segment,
1859 * and the return path might not be symmetrical).
1860 */
1861 tp->t_softerror = 0;
1862}
1863
1864/*
1865 * Determine a reasonable value for maxseg size.
1866 * If the route is known, check route for mtu.
1867 * If none, use an mss that can be handled on the outgoing
1868 * interface without forcing IP to fragment; if bigger than
1869 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
1870 * to utilize large mbufs. If no route is found, route has no mtu,
1871 * or the destination isn't local, use a default, hopefully conservative
1872 * size (usually 512 or the default IP max size, but no more than the mtu
1873 * of the interface), as we can't discover anything about intervening
1874 * gateways or networks. We also initialize the congestion/slow start
1875 * window to be a single segment if the destination isn't local.
1876 * While looking at the routing entry, we also initialize other path-dependent
1877 * parameters from pre-set or cached values in the routing entry.
1878 */
1879
1880int
1881tcp_mss(PNATState pData, register struct tcpcb *tp, u_int offer)
1882{
1883 struct socket *so = tp->t_socket;
1884 int mss;
1885
1886 DEBUG_CALL("tcp_mss");
1887 DEBUG_ARG("tp = %lx", (long)tp);
1888 DEBUG_ARG("offer = %d", offer);
1889
1890 mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
1891 if (offer)
1892 mss = min(mss, offer);
1893 mss = max(mss, 32);
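 /* Illustrative numbers (assumed, not taken from a real setup): with
  * if_mtu == if_mru == 1500 and struct tcpiphdr assumed to occupy 40 bytes,
  * the local limit is 1460; a peer offer of 1460 leaves mss == 1460, and the
  * sbreserve() calls below round the send/receive buffers up to a multiple
  * of that mss. */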
1894 if (mss < tp->t_maxseg || offer != 0)
1895 tp->t_maxseg = mss;
1896
1897 tp->snd_cwnd = mss;
1898
1899 sbreserve(&so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
1900 sbreserve(&so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
1901
1902 DEBUG_MISC((dfd, " returning mss = %d\n", mss));
1903
1904 return mss;
1905}