VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_input.c@ 95573

Last change on this file since 95573 was 95573, checked in by vboxsync, 3 years ago

Network/slirp: Advertising clause for Danny Gasparovsky was unintentional, should have always been 3-clause BSD. Replace 4-clause BSD license by 3-clause, see retroactive license change by UC Berkeley https://www.freebsd.org/copyright/license/

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 66.9 KB
1/* $Id: tcp_input.c 95573 2022-07-08 18:16:35Z vboxsync $ */
2/** @file
3 * NAT - TCP input.
4 */
5
6/*
7 * Copyright (C) 2006-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
22 * The Regents of the University of California. All rights reserved.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. Neither the name of the University nor the names of its contributors
33 * may be used to endorse or promote products derived from this software
34 * without specific prior written permission.
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
37 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
39 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
41 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
42 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
44 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
45 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 * SUCH DAMAGE.
47 *
48 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
49 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
50 */
51
52/*
53 * Changes and additions relating to SLiRP
54 * Copyright (c) 1995 Danny Gasparovski.
55 *
56 * Please read the file COPYRIGHT for the
57 * terms and conditions of the copyright.
58 */
59
60#include <slirp.h>
61#include "ip_icmp.h"
62
63
64#if 0 /* code using these macros is commented out */
65# define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
66
67/* for modulo comparisons of timestamps */
68# define TSTMP_LT(a, b) ((int)((a)-(b)) < 0)
69# define TSTMP_GEQ(a, b) ((int)((a)-(b)) >= 0)
70#endif
71
72#ifdef TCP_ACK_HACK
73#define DELAY_ACK(tp, ti) \
74 if (ti->ti_flags & TH_PUSH) \
75 tp->t_flags |= TF_ACKNOW; \
76 else \
77 tp->t_flags |= TF_DELACK;
78#else /* !TCP_ACK_HACK */
79#define DELAY_ACK(tp, ign) \
80 tp->t_flags |= TF_DELACK;
81#endif /* TCP_ACK_HACK */
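/*
 * Rough summary of the DELAY_ACK() variants above: depending on the
 * TCP_ACK_HACK configuration, a segment carrying TH_PUSH either forces an
 * immediate ACK (TF_ACKNOW) while everything else gets a delayed ACK
 * (TF_DELACK), or the macro unconditionally schedules a delayed ACK.
 * A pending TF_DELACK is typically turned into a real ACK by the fast
 * timer (tcp_fasttimo) or piggy-backed on the next outgoing segment.
 */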
82
83
84/*
85 * deps: netinet/tcp_reass.c
86 * tcp_reass_maxqlen = 48 (default)
87 * tcp_reass_maxseg = nmbclusters/16 (nmbclusters = 1024 + maxusers * 64 from kern/kern_mbuf.c; let's say 256)
88 */
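/*
 * Sketch of the reassembly algorithm below: out-of-order segments are kept
 * on tp->t_segq ordered by sequence number; an arriving segment is trimmed
 * against the preceding entry, then any following entries it overlaps are
 * trimmed or dropped, and finally every segment that has become contiguous
 * with rcv_nxt is handed to the socket buffer (assuming the connection is
 * established).
 *
 * Illustrative walk-through (made-up numbers): with rcv_nxt = 1000 and a
 * queued segment [1500,2000), an arriving segment [1000,1600) trims the
 * queued one to [1600,2000), is inserted at the head of the queue, and both
 * entries are then appended to the socket buffer, advancing rcv_nxt to 2000.
 */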
89int
90tcp_reass(PNATState pData, struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
91{
92 struct tseg_qent *q;
93 struct tseg_qent *p = NULL;
94 struct tseg_qent *nq;
95 struct tseg_qent *te = NULL;
96 struct socket *so = tp->t_socket;
97 int flags;
98 STAM_PROFILE_START(&pData->StatTCP_reassamble, tcp_reassamble);
99 LogFlowFunc(("ENTER: pData:%p, tp:%R[tcpcb793], th:%p, tlenp:%p, m:%p\n", pData, tp, th, tlenp, m));
100
101 /*
102 * XXX: tcp_reass() is rather inefficient with its data structures
103 * and should be rewritten (see NetBSD for optimizations). While
104 * doing that it should move to its own file tcp_reass.c.
105 */
106
107 /*
108 * Call with th==NULL after becoming established to
109 * force pre-ESTABLISHED data up to user socket.
110 */
111 if (th == NULL)
112 {
113 LogFlowFunc(("%d -> present\n", __LINE__));
114 goto present;
115 }
116
117 /*
118 * Limit the number of segments in the reassembly queue to prevent
119 * holding on to too many segments (and thus running out of mbufs).
120 * Make sure to let the missing segment through which caused this
121 * queue. Always keep one global queue entry spare to be able to
122 * process the missing segment.
123 */
124 if ( th->th_seq != tp->rcv_nxt
125 && ( tcp_reass_qsize + 1 >= tcp_reass_maxseg
126 || tp->t_segqlen >= tcp_reass_maxqlen))
127 {
128 tcp_reass_overflows++;
129 tcpstat.tcps_rcvmemdrop++;
130 m_freem(pData, m);
131 *tlenp = 0;
132 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
133 LogFlowFuncLeave();
134 return (0);
135 }
136
137 /*
138 * Allocate a new queue entry. If we can't, or hit the zone limit
139 * just drop the pkt.
140 */
141 te = RTMemAlloc(sizeof(struct tseg_qent));
142 if (te == NULL)
143 {
144 tcpstat.tcps_rcvmemdrop++;
145 m_freem(pData, m);
146 *tlenp = 0;
147 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
148 LogFlowFuncLeave();
149 return (0);
150 }
151 tp->t_segqlen++;
152 tcp_reass_qsize++;
153
154 /*
155 * Find a segment which begins after this one does.
156 */
157 LIST_FOREACH(q, &tp->t_segq, tqe_q)
158 {
159 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
160 break;
161 p = q;
162 }
163
164 /*
165 * If there is a preceding segment, it may provide some of
166 * our data already. If so, drop the data from the incoming
167 * segment. If it provides all of our data, drop us.
168 */
169 if (p != NULL)
170 {
171 int i;
172 /* conversion to int (in i) handles seq wraparound */
173 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
174 if (i > 0)
175 {
176 if (i >= *tlenp)
177 {
178 tcpstat.tcps_rcvduppack++;
179 tcpstat.tcps_rcvdupbyte += *tlenp;
180 m_freem(pData, m);
181 RTMemFree(te);
182 tp->t_segqlen--;
183 tcp_reass_qsize--;
184 /*
185 * Try to present any queued data
186 * at the left window edge to the user.
187 * This is needed after the 3-WHS
188 * completes.
189 */
190 LogFlowFunc(("%d -> present\n", __LINE__));
191 goto present; /* ??? */
192 }
193 m_adj(m, i);
194 *tlenp -= i;
195 th->th_seq += i;
196 }
197 }
198 tcpstat.tcps_rcvoopack++;
199 tcpstat.tcps_rcvoobyte += *tlenp;
200
201 /*
202 * While we overlap succeeding segments trim them or,
203 * if they are completely covered, dequeue them.
204 */
205 while (q)
206 {
207 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
208 if (i <= 0)
209 break;
210 if (i < q->tqe_len)
211 {
212 q->tqe_th->th_seq += i;
213 q->tqe_len -= i;
214 m_adj(q->tqe_m, i);
215 break;
216 }
217
218 nq = LIST_NEXT(q, tqe_q);
219 LIST_REMOVE(q, tqe_q);
220 m_freem(pData, q->tqe_m);
221 RTMemFree(q);
222 tp->t_segqlen--;
223 tcp_reass_qsize--;
224 q = nq;
225 }
226
227 /* Insert the new segment queue entry into place. */
228 te->tqe_m = m;
229 te->tqe_th = th;
230 te->tqe_len = *tlenp;
231
232 if (p == NULL)
233 {
234 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
235 }
236 else
237 {
238 LIST_INSERT_AFTER(p, te, tqe_q);
239 }
240
241present:
242 /*
243 * Present data to user, advancing rcv_nxt through
244 * completed sequence space.
245 */
246 if (!TCPS_HAVEESTABLISHED(tp->t_state))
247 {
248 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
249 return (0);
250 }
251 q = LIST_FIRST(&tp->t_segq);
252 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
253 {
254 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
255 return (0);
256 }
257 do
258 {
259 tp->rcv_nxt += q->tqe_len;
260 flags = q->tqe_th->th_flags & TH_FIN;
261 nq = LIST_NEXT(q, tqe_q);
262 LIST_REMOVE(q, tqe_q);
263 /* XXX: This should be checked against the corresponding code in the
264 * original BSD/Slirp sources; current BSD uses SS_FCANTRCVMORE here.
265 */
266 if (so->so_state & SS_FCANTSENDMORE)
267 m_freem(pData, q->tqe_m);
268 else
269 sbappend(pData, so, q->tqe_m);
270 RTMemFree(q);
271 tp->t_segqlen--;
272 tcp_reass_qsize--;
273 q = nq;
274 }
275 while (q && q->tqe_th->th_seq == tp->rcv_nxt);
276
277 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
278 return flags;
279}
280
281/*
282 * TCP input routine, follows pages 65-76 of the
283 * protocol specification dated September, 1981 very closely.
284 */
285void
286tcp_input(PNATState pData, register struct mbuf *m, int iphlen, struct socket *inso)
287{
288 struct ip *ip, *save_ip;
289 register struct tcpiphdr *ti;
290 caddr_t optp = NULL;
291 int optlen = 0;
292 int len, off;
293 int tlen = 0; /* Shut up MSC (didn't check whether MSC was right). */
294 register struct tcpcb *tp = 0;
295 register int tiflags;
296 struct socket *so = 0;
297 int todrop, acked, ourfinisacked, needoutput = 0;
298/* int dropsocket = 0; */
299 int iss = 0;
300 u_long tiwin;
301/* int ts_present = 0; */
302 unsigned ohdrlen;
303 uint8_t ohdr[60 + 8]; /* max IP header plus 8 bytes of payload for icmp */
304
305 STAM_PROFILE_START(&pData->StatTCP_input, counter_input);
306
307 LogFlow(("tcp_input: m = %p, iphlen = %2d, inso = %R[natsock]\n", m, iphlen, inso));
308
309 if (inso != NULL)
310 {
311 QSOCKET_LOCK(tcb);
312 SOCKET_LOCK(inso);
313 QSOCKET_UNLOCK(tcb);
314 }
315 /*
316 * If called with m == 0, then we're continuing the connect
317 */
318 if (m == NULL)
319 {
320 so = inso;
321 Log4(("NAT: tcp_input: %R[natsock]\n", so));
322
323 /* Re-set a few variables */
324 tp = sototcpcb(so);
325
326 m = so->so_m;
327 optp = so->so_optp; /* points into m if set */
328 optlen = so->so_optlen;
329 so->so_m = NULL;
330 so->so_optp = 0;
331 so->so_optlen = 0;
332
333 if (RT_LIKELY(so->so_ohdr != NULL))
334 {
335 RTMemFree(so->so_ohdr);
336 so->so_ohdr = NULL;
337 }
338
339 ti = so->so_ti;
340
341 /** @todo (vvl) clarify why it might happen */
342 if (ti == NULL)
343 {
344 LogRel(("NAT: ti is null. can't do any connection resetting actions\n"));
345 /* mbuf should be cleared in sofree called from tcp_close */
346 tcp_close(pData, tp);
347 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
348 LogFlowFuncLeave();
349 return;
350 }
351
352 tiwin = ti->ti_win;
353 tiflags = ti->ti_flags;
354
355 LogFlowFunc(("%d -> cont_conn\n", __LINE__));
356 goto cont_conn;
357 }
358
359 tcpstat.tcps_rcvtotal++;
360
361 ip = mtod(m, struct ip *);
362
363 /* ip_input() subtracts iphlen from ip::ip_len */
364 AssertStmt(ip->ip_len + iphlen == (ssize_t)m_length(m, NULL), goto drop);
365 if (RT_UNLIKELY(ip->ip_len < sizeof(struct tcphdr)))
366 {
367 /* tcps_rcvshort++; */
368 goto drop;
369 }
370
371 /*
372 * Save a copy of the IP header in case we want to restore it for
373 * sending an ICMP error message in response.
374 *
375 * XXX: This function should really be fixed to not strip IP
376 * options, to not overwrite IP header and to use "tlen" local
377 * variable (instead of ti->ti_len), then "m" could be passed to
378 * icmp_error() directly.
379 */
380 ohdrlen = iphlen + 8;
381 m_copydata(m, 0, ohdrlen, (caddr_t)ohdr);
382 save_ip = (struct ip *)ohdr;
383 save_ip->ip_len += iphlen; /* undo change by ip_input() */
384
385
386 /*
387 * Get IP and TCP header together in first mbuf.
388 * Note: IP leaves IP header in first mbuf.
389 */
390 ti = mtod(m, struct tcpiphdr *);
391 if (iphlen > sizeof(struct ip))
392 {
393 ip_stripoptions(m, (struct mbuf *)0);
394 iphlen = sizeof(struct ip);
395 }
396
397 /*
398 * Checksum extended TCP header and data.
399 */
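/*
 * What the next few lines do: the overlay header is rewritten in place into
 * a TCP pseudo-header (ti_x1 zeroed, ti_len set to the TCP segment length in
 * network byte order), so that cksum() over the IP-header-sized prefix plus
 * the TCP header and data effectively validates the TCP checksum including
 * the pseudo-header.
 */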
400 tlen = ((struct ip *)ti)->ip_len;
401 memset(ti->ti_x1, 0, 9);
402 ti->ti_len = RT_H2N_U16((u_int16_t)tlen);
403 len = sizeof(struct ip) + tlen;
404 /* keep checksum for ICMP reply
405 * ti->ti_sum = cksum(m, len);
406 * if (ti->ti_sum) { */
407 if (cksum(m, len))
408 {
409 tcpstat.tcps_rcvbadsum++;
410 LogFlowFunc(("%d -> drop\n", __LINE__));
411 goto drop;
412 }
413
414 /*
415 * Check that TCP offset makes sense,
416 * pull out TCP options and adjust length. XXX
417 */
418 off = ti->ti_off << 2;
419 if ( off < sizeof (struct tcphdr)
420 || off > tlen)
421 {
422 tcpstat.tcps_rcvbadoff++;
423 LogFlowFunc(("%d -> drop\n", __LINE__));
424 goto drop;
425 }
426 tlen -= off;
427 ti->ti_len = tlen;
428 if (off > sizeof (struct tcphdr))
429 {
430 optlen = off - sizeof (struct tcphdr);
431 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
432
433 /*
434 * Do quick retrieval of timestamp options ("options
435 * prediction?"). If timestamp is the only option and it's
436 * formatted as recommended in RFC 1323 appendix A, we
437 * quickly get the values now and not bother calling
438 * tcp_dooptions(), etc.
439 */
440#if 0
441 if (( optlen == TCPOLEN_TSTAMP_APPA
442 || ( optlen > TCPOLEN_TSTAMP_APPA
443 && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
444 *(u_int32_t *)optp == RT_H2N_U32_C(TCPOPT_TSTAMP_HDR) &&
445 (ti->ti_flags & TH_SYN) == 0)
446 {
447 ts_present = 1;
448 ts_val = RT_N2H_U32(*(u_int32_t *)(optp + 4));
449 ts_ecr = RT_N2H_U32(*(u_int32_t *)(optp + 8));
450 optp = NULL; / * we have parsed the options * /
451 }
452#endif
453 }
454 tiflags = ti->ti_flags;
455
456 /*
457 * Convert TCP protocol specific fields to host format.
458 */
459 NTOHL(ti->ti_seq);
460 NTOHL(ti->ti_ack);
461 NTOHS(ti->ti_win);
462 NTOHS(ti->ti_urp);
463
464 /*
465 * Drop TCP, IP headers and TCP options.
466 */
467 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
468 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
469
470 /*
471 * Locate pcb for segment.
472 */
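/*
 * tcp_last_so acts as a one-entry cache of the most recently used socket;
 * when the incoming 4-tuple does not match it, the code falls back to a
 * solookup() scan of the tcb list and counts a tcps_socachemiss.
 */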
473findso:
474 LogFlowFunc(("(enter) findso: %R[natsock]\n", so));
475 if (so != NULL && so != &tcb)
476 SOCKET_UNLOCK(so);
477 QSOCKET_LOCK(tcb);
478 so = tcp_last_so;
479 if ( so->so_fport != ti->ti_dport
480 || so->so_lport != ti->ti_sport
481 || so->so_laddr.s_addr != ti->ti_src.s_addr
482 || so->so_faddr.s_addr != ti->ti_dst.s_addr)
483 {
484 QSOCKET_UNLOCK(tcb);
485 /** @todo fix the SOLOOKUP macro definition to be usable here */
486 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
487 ti->ti_dst, ti->ti_dport);
488 if (so)
489 {
490 tcp_last_so = so;
491 }
492 ++tcpstat.tcps_socachemiss;
493 }
494 else
495 {
496 SOCKET_LOCK(so);
497 QSOCKET_UNLOCK(tcb);
498 }
499 LogFlowFunc(("(leave) findso: %R[natsock]\n", so));
500
501 /*
502 * Check whether the packet is targeting CTL_ALIAS and drop it if the connection wasn't
503 * initiated by localhost (so == NULL), see @bugref{9896}.
504 */
505 if ( (RT_N2H_U32(ti->ti_dst.s_addr) & ~pData->netmask) == CTL_ALIAS
506 && !pData->fLocalhostReachable
507 && !so)
508 {
509 LogFlowFunc(("Packet for CTL_ALIAS and fLocalhostReachable=false so=NULL -> drop\n"));
510 goto drop;
511 }
512
513 /*
514 * If the state is CLOSED (i.e., TCB does not exist) then
515 * all data in the incoming segment is discarded.
516 * If the TCB exists but is in CLOSED state, it is embryonic,
517 * but should either do a listen or a connect soon.
518 *
519 * state == CLOSED means we've done socreate() but haven't
520 * attached it to a protocol yet...
521 *
522 * XXX If a TCB does not exist, and the TH_SYN flag is
523 * the only flag set, then create a session, mark it
524 * as if it was LISTENING, and continue...
525 */
526 if (so == 0)
527 {
528 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
529 {
530 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
531 goto dropwithreset;
532 }
533
534 if ((so = socreate()) == NULL)
535 {
536 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
537 goto dropwithreset;
538 }
539 if (tcp_attach(pData, so) < 0)
540 {
541 RTMemFree(so); /* Not sofree (if it failed, it's not insqued) */
542 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
543 goto dropwithreset;
544 }
545 SOCKET_LOCK(so);
546 sbreserve(pData, &so->so_snd, tcp_sndspace);
547 sbreserve(pData, &so->so_rcv, tcp_rcvspace);
548
549/* tcp_last_so = so; */ /* XXX ? */
550/* tp = sototcpcb(so); */
551
552 so->so_laddr = ti->ti_src;
553 so->so_lport = ti->ti_sport;
554 so->so_faddr = ti->ti_dst;
555 so->so_fport = ti->ti_dport;
556
557 so->so_iptos = ((struct ip *)ti)->ip_tos;
558
559 tp = sototcpcb(so);
560 TCP_STATE_SWITCH_TO(tp, TCPS_LISTEN);
561 }
562
563 /*
564 * If this is a still-connecting socket, this is probably
565 * a retransmit of the SYN. Whether it's a retransmitted SYN
566 * or something else, we nuke it.
567 */
568 if (so->so_state & SS_ISFCONNECTING)
569 {
570 LogFlowFunc(("%d -> drop\n", __LINE__));
571 goto drop;
572 }
573
574 tp = sototcpcb(so);
575
576 /* XXX Should never fail */
577 if (tp == 0)
578 {
579 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
580 goto dropwithreset;
581 }
582 if (tp->t_state == TCPS_CLOSED)
583 {
584 LogFlowFunc(("%d -> drop\n", __LINE__));
585 goto drop;
586 }
587
588 /* Unscale the window into a 32-bit value. */
589/* if ((tiflags & TH_SYN) == 0)
590 * tiwin = ti->ti_win << tp->snd_scale;
591 * else
592 */
593 tiwin = ti->ti_win;
594
595 /*
596 * Segment received on connection.
597 * Reset idle time and keep-alive timer.
598 */
599 tp->t_idle = 0;
600 if (so_options)
601 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
602 else
603 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
604
605 /*
606 * Process options if not in LISTEN state,
607 * else do it below (after getting remote address).
608 */
609 if (optp && tp->t_state != TCPS_LISTEN)
610 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
611/* , */
612/* &ts_present, &ts_val, &ts_ecr); */
613
614 /*
615 * Header prediction: check for the two common cases
616 * of a uni-directional data xfer. If the packet has
617 * no control flags, is in-sequence, the window didn't
618 * change and we're not retransmitting, it's a
619 * candidate. If the length is zero and the ack moved
620 * forward, we're the sender side of the xfer. Just
621 * free the data acked & wake any higher level process
622 * that was blocked waiting for space. If the length
623 * is non-zero and the ack didn't move, we're the
624 * receiver side. If we're getting packets in-order
625 * (the reassembly queue is empty), add the data to
626 * the socket buffer and note that we need a delayed ack.
627 *
628 * XXX Some of these tests are not needed,
629 * e.g. the tiwin == tp->snd_wnd check prevents many more
630 * predictions... with no *real* advantage.
631 */
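/*
 * In short, the fast path below requires: ESTABLISHED state, only the ACK
 * flag set, the segment in sequence (ti_seq == rcv_nxt), a non-zero and
 * unchanged window, and no retransmission in progress (snd_nxt == snd_max).
 * Anything else falls through to the full processing further down.
 */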
632 if ( tp->t_state == TCPS_ESTABLISHED
633 && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK
634/* && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) */
635 && ti->ti_seq == tp->rcv_nxt
636 && tiwin && tiwin == tp->snd_wnd
637 && tp->snd_nxt == tp->snd_max)
638 {
639 /*
640 * If last ACK falls within this segment's sequence numbers,
641 * record the timestamp.
642 */
643#if 0
644 if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
645 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len))
646 {
647 tp->ts_recent_age = tcp_now;
648 tp->ts_recent = ts_val;
649 }
650#endif
651
652 if (ti->ti_len == 0)
653 {
654 if ( SEQ_GT(ti->ti_ack, tp->snd_una)
655 && SEQ_LEQ(ti->ti_ack, tp->snd_max)
656 && tp->snd_cwnd >= tp->snd_wnd)
657 {
658 /*
659 * this is a pure ack for outstanding data.
660 */
661 ++tcpstat.tcps_predack;
662#if 0
663 if (ts_present)
664 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
665 else
666#endif
667 if ( tp->t_rtt
668 && SEQ_GT(ti->ti_ack, tp->t_rtseq))
669 tcp_xmit_timer(pData, tp, tp->t_rtt);
670 acked = ti->ti_ack - tp->snd_una;
671 tcpstat.tcps_rcvackpack++;
672 tcpstat.tcps_rcvackbyte += acked;
673 sbdrop(&so->so_snd, acked);
674 tp->snd_una = ti->ti_ack;
675 m_freem(pData, m);
676
677 /*
678 * If all outstanding data are acked, stop
679 * retransmit timer, otherwise restart timer
680 * using current (possibly backed-off) value.
681 * If process is waiting for space,
682 * wakeup/selwakeup/signal. If data
683 * are ready to send, let tcp_output
684 * decide between more output or persist.
685 */
686 if (tp->snd_una == tp->snd_max)
687 tp->t_timer[TCPT_REXMT] = 0;
688 else if (tp->t_timer[TCPT_PERSIST] == 0)
689 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
690
691 /*
692 * There's room in so_snd, sowwakeup will read()
693 * from the socket if we can
694 */
695#if 0
696 if (so->so_snd.sb_flags & SB_NOTIFY)
697 sowwakeup(so);
698#endif
699 /*
700 * This is called because sowwakeup might have
701 * put data into so_snd. Since we don't do sowwakeup,
702 * we don't need this.. XXX???
703 */
704 if (SBUF_LEN(&so->so_snd))
705 (void) tcp_output(pData, tp);
706
707 SOCKET_UNLOCK(so);
708 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
709 return;
710 }
711 }
712 else if ( ti->ti_ack == tp->snd_una
713 && LIST_EMPTY(&tp->t_segq)
714 && ti->ti_len <= sbspace(&so->so_rcv))
715 {
716 /*
717 * this is a pure, in-sequence data packet
718 * with nothing on the reassembly queue and
719 * we have enough buffer space to take it.
720 */
721 ++tcpstat.tcps_preddat;
722 tp->rcv_nxt += ti->ti_len;
723 tcpstat.tcps_rcvpack++;
724 tcpstat.tcps_rcvbyte += ti->ti_len;
725 /*
726 * Add data to socket buffer.
727 */
728 sbappend(pData, so, m);
729
730 /*
731 * XXX This is called when data arrives. Later, check
732 * if we can actually write() to the socket
733 * XXX Need to check? It'll be NON_BLOCKING
734 */
735/* sorwakeup(so); */
736
737 /*
738 * If this is a short packet, then ACK now - with Nagle
739 * congestion avoidance the sender won't send more until
740 * it gets an ACK.
741 *
742 * It is better to not delay acks at all to maximize
743 * TCP throughput. See RFC 2581.
744 */
745 tp->t_flags |= TF_ACKNOW;
746 tcp_output(pData, tp);
747 SOCKET_UNLOCK(so);
748 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
749 return;
750 }
751 } /* header prediction */
752 /*
753 * Calculate amount of space in receive window,
754 * and then do TCP input processing.
755 * Receive window is amount of space in rcv queue,
756 * but not less than advertised window.
757 */
758 {
759 int win;
760 win = sbspace(&so->so_rcv);
761 if (win < 0)
762 win = 0;
763 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
764 }
765
766 switch (tp->t_state)
767 {
768 /*
769 * If the state is LISTEN then ignore segment if it contains an RST.
770 * If the segment contains an ACK then it is bad and send a RST.
771 * If it does not contain a SYN then it is not interesting; drop it.
772 * Don't bother responding if the destination was a broadcast.
773 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
774 * tp->iss, and send a segment:
775 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
776 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
777 * Fill in remote peer address fields if not previously specified.
778 * Enter SYN_RECEIVED state, and process any other fields of this
779 * segment in this state.
780 */
781 case TCPS_LISTEN:
782 {
783 if (tiflags & TH_RST)
784 {
785 LogFlowFunc(("%d -> drop\n", __LINE__));
786 goto drop;
787 }
788 if (tiflags & TH_ACK)
789 {
790 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
791 goto dropwithreset;
792 }
793 if ((tiflags & TH_SYN) == 0)
794 {
795 LogFlowFunc(("%d -> drop\n", __LINE__));
796 goto drop;
797 }
798
799 /*
800 * This has way too many gotos...
801 * But a bit of spaghetti code never hurt anybody :)
802 */
803 if ( (tcp_fconnect(pData, so) == -1)
804 && errno != EINPROGRESS
805 && errno != EWOULDBLOCK)
806 {
807 u_char code = ICMP_UNREACH_NET;
808 Log2((" tcp fconnect errno = %d (%s)\n", errno, strerror(errno)));
809 if (errno == ECONNREFUSED)
810 {
811 /* ACK the SYN, send RST to refuse the connection */
812 tcp_respond(pData, tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
813 TH_RST|TH_ACK);
814 }
815 else
816 {
817 if (errno == EHOSTUNREACH)
818 code = ICMP_UNREACH_HOST;
819 HTONL(ti->ti_seq); /* restore tcp header */
820 HTONL(ti->ti_ack);
821 HTONS(ti->ti_win);
822 HTONS(ti->ti_urp);
823 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
824 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
825 *ip = *save_ip;
826 icmp_error(pData, m, ICMP_UNREACH, code, 0, strerror(errno));
827 tp->t_socket->so_m = NULL;
828 }
829 tp = tcp_close(pData, tp);
830 }
831 else
832 {
833 /*
834 * Haven't connected yet, save the current mbuf
835 * and ti, and return
836 * XXX Some OS's don't tell us whether the connect()
837 * succeeded or not. So we must time it out.
838 */
839 so->so_m = m;
840 so->so_ti = ti;
841 so->so_ohdr = RTMemDup(ohdr, ohdrlen);
842 so->so_optp = optp;
843 so->so_optlen = optlen;
844 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
845 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
846 }
847 SOCKET_UNLOCK(so);
848 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
849 LogFlowFuncLeave();
850 return;
851
852cont_conn:
853 /* m==NULL
854 * Check if the connect succeeded
855 */
856 LogFlowFunc(("cont_conn:\n"));
857 if (so->so_state & SS_NOFDREF)
858 {
859 tp = tcp_close(pData, tp);
860 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
861 goto dropwithreset;
862 }
863
864 tcp_template(tp);
865
866 if (optp)
867 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
868
869 if (iss)
870 tp->iss = iss;
871 else
872 tp->iss = tcp_iss;
873 tcp_iss += TCP_ISSINCR/2;
874 tp->irs = ti->ti_seq;
875 tcp_sendseqinit(tp);
876 tcp_rcvseqinit(tp);
877 tp->t_flags |= TF_ACKNOW;
878 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
879 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
880 tcpstat.tcps_accepts++;
881 LogFlowFunc(("%d -> trimthenstep6\n", __LINE__));
882 goto trimthenstep6;
883 } /* case TCPS_LISTEN */
884
885 /*
886 * If the state is SYN_SENT:
887 * if seg contains an ACK, but not for our SYN, drop the input.
888 * if seg contains a RST, then drop the connection.
889 * if seg does not contain SYN, then drop it.
890 * Otherwise this is an acceptable SYN segment
891 * initialize tp->rcv_nxt and tp->irs
892 * if seg contains ack then advance tp->snd_una
893 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
894 * arrange for segment to be acked (eventually)
895 * continue processing rest of data/controls, beginning with URG
896 */
897 case TCPS_SYN_SENT:
898 if ( (tiflags & TH_ACK)
899 && ( SEQ_LEQ(ti->ti_ack, tp->iss)
900 || SEQ_GT(ti->ti_ack, tp->snd_max)))
901 {
902 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
903 goto dropwithreset;
904 }
905
906 if (tiflags & TH_RST)
907 {
908 if (tiflags & TH_ACK)
909 tp = tcp_drop(pData, tp, 0); /* XXX Check t_softerror! */
910 LogFlowFunc(("%d -> drop\n", __LINE__));
911 goto drop;
912 }
913
914 if ((tiflags & TH_SYN) == 0)
915 {
916 LogFlowFunc(("%d -> drop\n", __LINE__));
917 goto drop;
918 }
919 if (tiflags & TH_ACK)
920 {
921 tp->snd_una = ti->ti_ack;
922 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
923 tp->snd_nxt = tp->snd_una;
924 }
925
926 tp->t_timer[TCPT_REXMT] = 0;
927 tp->irs = ti->ti_seq;
928 tcp_rcvseqinit(tp);
929 tp->t_flags |= TF_ACKNOW;
930 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss))
931 {
932 tcpstat.tcps_connects++;
933 soisfconnected(so);
934 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
935
936 /* Do window scaling on this connection? */
937#if 0
938 if (( tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
939 == (TF_RCVD_SCALE|TF_REQ_SCALE))
940 {
941 tp->snd_scale = tp->requested_s_scale;
942 tp->rcv_scale = tp->request_r_scale;
943 }
944#endif
945 (void) tcp_reass(pData, tp, (struct tcphdr *)0, NULL, (struct mbuf *)0);
946 /*
947 * if we didn't have to retransmit the SYN,
948 * use its rtt as our initial srtt & rtt var.
949 */
950 if (tp->t_rtt)
951 tcp_xmit_timer(pData, tp, tp->t_rtt);
952 }
953 else
954 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
955
956trimthenstep6:
957 LogFlowFunc(("trimthenstep6:\n"));
958 /*
959 * Advance ti->ti_seq to correspond to first data byte.
960 * If data, trim to stay within window,
961 * dropping FIN if necessary.
962 */
963 ti->ti_seq++;
964 if (ti->ti_len > tp->rcv_wnd)
965 {
966 todrop = ti->ti_len - tp->rcv_wnd;
967 m_adj(m, -todrop);
968 ti->ti_len = tp->rcv_wnd;
969 tiflags &= ~TH_FIN;
970 tcpstat.tcps_rcvpackafterwin++;
971 tcpstat.tcps_rcvbyteafterwin += todrop;
972 }
973 tp->snd_wl1 = ti->ti_seq - 1;
974 tp->rcv_up = ti->ti_seq;
975 LogFlowFunc(("%d -> step6\n", __LINE__));
976 goto step6;
977 } /* switch tp->t_state */
978 /*
979 * States other than LISTEN or SYN_SENT.
980 * First check timestamp, if present.
981 * Then check that at least some bytes of segment are within
982 * receive window. If segment begins before rcv_nxt,
983 * drop leading data (and SYN); if nothing left, just ack.
984 *
985 * RFC 1323 PAWS: If we have a timestamp reply on this segment
986 * and it's less than ts_recent, drop it.
987 */
988#if 0
989 if ( ts_present
990 && (tiflags & TH_RST) == 0
991 && tp->ts_recent
992 && TSTMP_LT(ts_val, tp->ts_recent))
993 {
994 /* Check to see if ts_recent is over 24 days old. */
995 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE)
996 {
997 /*
998 * Invalidate ts_recent. If this segment updates
999 * ts_recent, the age will be reset later and ts_recent
1000 * will get a valid value. If it does not, setting
1001 * ts_recent to zero will at least satisfy the
1002 * requirement that zero be placed in the timestamp
1003 * echo reply when ts_recent isn't valid. The
1004 * age isn't reset until we get a valid ts_recent
1005 * because we don't want out-of-order segments to be
1006 * dropped when ts_recent is old.
1007 */
1008 tp->ts_recent = 0;
1009 }
1010 else
1011 {
1012 tcpstat.tcps_rcvduppack++;
1013 tcpstat.tcps_rcvdupbyte += ti->ti_len;
1014 tcpstat.tcps_pawsdrop++;
1015 goto dropafterack;
1016 }
1017 }
1018#endif
1019
1020 todrop = tp->rcv_nxt - ti->ti_seq;
1021 if (todrop > 0)
1022 {
1023 if (tiflags & TH_SYN)
1024 {
1025 tiflags &= ~TH_SYN;
1026 ti->ti_seq++;
1027 if (ti->ti_urp > 1)
1028 ti->ti_urp--;
1029 else
1030 tiflags &= ~TH_URG;
1031 todrop--;
1032 }
1033 /*
1034 * Following if statement from Stevens, vol. 2, p. 960.
1035 */
1036 if ( todrop > ti->ti_len
1037 || ( todrop == ti->ti_len
1038 && (tiflags & TH_FIN) == 0))
1039 {
1040 /*
1041 * Any valid FIN must be to the left of the window.
1042 * At this point the FIN must be a duplicate or out
1043 * of sequence; drop it.
1044 */
1045 tiflags &= ~TH_FIN;
1046
1047 /*
1048 * Send an ACK to resynchronize and drop any data.
1049 * But keep on processing for RST or ACK.
1050 */
1051 tp->t_flags |= TF_ACKNOW;
1052 todrop = ti->ti_len;
1053 tcpstat.tcps_rcvduppack++;
1054 tcpstat.tcps_rcvdupbyte += todrop;
1055 }
1056 else
1057 {
1058 tcpstat.tcps_rcvpartduppack++;
1059 tcpstat.tcps_rcvpartdupbyte += todrop;
1060 }
1061 m_adj(m, todrop);
1062 ti->ti_seq += todrop;
1063 ti->ti_len -= todrop;
1064 if (ti->ti_urp > todrop)
1065 ti->ti_urp -= todrop;
1066 else
1067 {
1068 tiflags &= ~TH_URG;
1069 ti->ti_urp = 0;
1070 }
1071 }
1072 /*
1073 * If new data are received on a connection after the
1074 * user processes are gone, then RST the other end.
1075 */
1076 if ( (so->so_state & SS_NOFDREF)
1077 && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len)
1078 {
1079 tp = tcp_close(pData, tp);
1080 tcpstat.tcps_rcvafterclose++;
1081 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1082 goto dropwithreset;
1083 }
1084
1085 /*
1086 * If segment ends after window, drop trailing data
1087 * (and PUSH and FIN); if nothing left, just ACK.
1088 */
1089 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1090 if (todrop > 0)
1091 {
1092 tcpstat.tcps_rcvpackafterwin++;
1093 if (todrop >= ti->ti_len)
1094 {
1095 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1096 /*
1097 * If a new connection request is received
1098 * while in TIME_WAIT, drop the old connection
1099 * and start over if the sequence numbers
1100 * are above the previous ones.
1101 */
1102 if ( tiflags & TH_SYN
1103 && tp->t_state == TCPS_TIME_WAIT
1104 && SEQ_GT(ti->ti_seq, tp->rcv_nxt))
1105 {
1106 iss = tp->rcv_nxt + TCP_ISSINCR;
1107 tp = tcp_close(pData, tp);
1108 SOCKET_UNLOCK(tp->t_socket);
1109 LogFlowFunc(("%d -> findso\n", __LINE__));
1110 goto findso;
1111 }
1112 /*
1113 * If window is closed can only take segments at
1114 * window edge, and have to drop data and PUSH from
1115 * incoming segments. Continue processing, but
1116 * remember to ack. Otherwise, drop segment
1117 * and ack.
1118 */
1119 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt)
1120 {
1121 tp->t_flags |= TF_ACKNOW;
1122 tcpstat.tcps_rcvwinprobe++;
1123 }
1124 else
1125 {
1126 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1127 goto dropafterack;
1128 }
1129 }
1130 else
1131 tcpstat.tcps_rcvbyteafterwin += todrop;
1132 m_adj(m, -todrop);
1133 ti->ti_len -= todrop;
1134 tiflags &= ~(TH_PUSH|TH_FIN);
1135 }
1136
1137 /*
1138 * If last ACK falls within this segment's sequence numbers,
1139 * record its timestamp.
1140 */
1141#if 0
1142 if ( ts_present
1143 && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)
1144 && SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1145 {
1146 tp->ts_recent_age = tcp_now;
1147 tp->ts_recent = ts_val;
1148 }
1149#endif
1150
1151 /*
1152 * If the RST bit is set examine the state:
1153 * SYN_RECEIVED STATE:
1154 * If passive open, return to LISTEN state.
1155 * If active open, inform user that connection was refused.
1156 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1157 * Inform user that connection was reset, and close tcb.
1158 * CLOSING, LAST_ACK, TIME_WAIT STATES
1159 * Close the tcb.
1160 */
1161 if (tiflags&TH_RST)
1162 switch (tp->t_state)
1163 {
1164 case TCPS_SYN_RECEIVED:
1165/* so->so_error = ECONNREFUSED; */
1166 LogFlowFunc(("%d -> close\n", __LINE__));
1167 goto close;
1168
1169 case TCPS_ESTABLISHED:
1170 case TCPS_FIN_WAIT_1:
1171 case TCPS_FIN_WAIT_2:
1172 case TCPS_CLOSE_WAIT:
1173/* so->so_error = ECONNRESET; */
1174close:
1175 LogFlowFunc(("close:\n"));
1176 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSED);
1177 tcpstat.tcps_drops++;
1178 tp = tcp_close(pData, tp);
1179 LogFlowFunc(("%d -> drop\n", __LINE__));
1180 goto drop;
1181
1182 case TCPS_CLOSING:
1183 case TCPS_LAST_ACK:
1184 case TCPS_TIME_WAIT:
1185 tp = tcp_close(pData, tp);
1186 LogFlowFunc(("%d -> drop\n", __LINE__));
1187 goto drop;
1188 }
1189
1190 /*
1191 * If a SYN is in the window, then this is an
1192 * error and we send an RST and drop the connection.
1193 */
1194 if (tiflags & TH_SYN)
1195 {
1196 tp = tcp_drop(pData, tp, 0);
1197 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1198 goto dropwithreset;
1199 }
1200
1201 /*
1202 * If the ACK bit is off we drop the segment and return.
1203 */
1204 if ((tiflags & TH_ACK) == 0)
1205 {
1206 LogFlowFunc(("%d -> drop\n", __LINE__));
1207 goto drop;
1208 }
1209
1210 /*
1211 * Ack processing.
1212 */
1213 switch (tp->t_state)
1214 {
1215 /*
1216 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1217 * ESTABLISHED state and continue processing, otherwise
1218 * send an RST. una<=ack<=max
1219 */
1220 case TCPS_SYN_RECEIVED:
1221 LogFlowFunc(("%d -> TCPS_SYN_RECEIVED\n", __LINE__));
1222 if ( SEQ_GT(tp->snd_una, ti->ti_ack)
1223 || SEQ_GT(ti->ti_ack, tp->snd_max))
1224 goto dropwithreset;
1225 tcpstat.tcps_connects++;
1226 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
1227 /*
1228 * The sent SYN is ack'ed with our sequence number +1
1229 * The first data byte already in the buffer will get
1230 * lost if no correction is made. This is only needed for
1231 * SS_CTL since the buffer is empty otherwise.
1232 * tp->snd_una++; or:
1233 */
1234 tp->snd_una = ti->ti_ack;
1235 soisfconnected(so);
1236
1237 /* Do window scaling? */
1238#if 0
1239 if ( (tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
1240 == (TF_RCVD_SCALE|TF_REQ_SCALE))
1241 {
1242 tp->snd_scale = tp->requested_s_scale;
1243 tp->rcv_scale = tp->request_r_scale;
1244 }
1245#endif
1246 (void) tcp_reass(pData, tp, (struct tcphdr *)0, (int *)0, (struct mbuf *)0);
1247 tp->snd_wl1 = ti->ti_seq - 1;
1248 /* Avoid ack processing; snd_una==ti_ack => dup ack */
1249 LogFlowFunc(("%d -> synrx_to_est\n", __LINE__));
1250 goto synrx_to_est;
1251 /* fall into ... */
1252
1253 /*
1254 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1255 * ACKs. If the ack is in the range
1256 * tp->snd_una < ti->ti_ack <= tp->snd_max
1257 * then advance tp->snd_una to ti->ti_ack and drop
1258 * data from the retransmission queue. If this ACK reflects
1259 * more up to date window information we update our window information.
1260 */
1261 case TCPS_ESTABLISHED:
1262 case TCPS_FIN_WAIT_1:
1263 case TCPS_FIN_WAIT_2:
1264 case TCPS_CLOSE_WAIT:
1265 case TCPS_CLOSING:
1266 case TCPS_LAST_ACK:
1267 case TCPS_TIME_WAIT:
1268 LogFlowFunc(("%d -> TCPS_ESTABLISHED|TCPS_FIN_WAIT_1|TCPS_FIN_WAIT_2|TCPS_CLOSE_WAIT|"
1269 "TCPS_CLOSING|TCPS_LAST_ACK|TCPS_TIME_WAIT\n", __LINE__));
1270 if (SEQ_LEQ(ti->ti_ack, tp->snd_una))
1271 {
1272 if (ti->ti_len == 0 && tiwin == tp->snd_wnd)
1273 {
1274 tcpstat.tcps_rcvdupack++;
1275 Log2((" dup ack m = %p, so = %p\n", m, so));
1276 /*
1277 * If we have outstanding data (other than
1278 * a window probe), this is a completely
1279 * duplicate ack (ie, window info didn't
1280 * change), the ack is the biggest we've
1281 * seen and we've seen exactly our rexmt
1282 * threshold of them, assume a packet
1283 * has been dropped and retransmit it.
1284 * Kludge snd_nxt & the congestion
1285 * window so we send only this one
1286 * packet.
1287 *
1288 * We know we're losing at the current
1289 * window size so do congestion avoidance
1290 * (set ssthresh to half the current window
1291 * and pull our congestion window back to
1292 * the new ssthresh).
1293 *
1294 * Dup acks mean that packets have left the
1295 * network (they're now cached at the receiver)
1296 * so bump cwnd by the amount in the receiver
1297 * to keep a constant cwnd packets in the
1298 * network.
1299 */
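/*
 * Illustrative numbers (assuming the usual threshold of 3 duplicate ACKs
 * and t_maxseg = 1460): with snd_wnd = snd_cwnd = 32 segments, the third
 * duplicate ACK sets ssthresh to 16 segments, retransmits the missing
 * segment with cwnd temporarily forced to a single segment, and then
 * re-inflates cwnd to ssthresh + 3 segments; each further duplicate ACK
 * grows cwnd by one more segment.
 */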
1300 if ( tp->t_timer[TCPT_REXMT] == 0
1301 || ti->ti_ack != tp->snd_una)
1302 tp->t_dupacks = 0;
1303 else if (++tp->t_dupacks == tcprexmtthresh)
1304 {
1305 tcp_seq onxt = tp->snd_nxt;
1306 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
1307 if (win < 2)
1308 win = 2;
1309 tp->snd_ssthresh = win * tp->t_maxseg;
1310 tp->t_timer[TCPT_REXMT] = 0;
1311 tp->t_rtt = 0;
1312 tp->snd_nxt = ti->ti_ack;
1313 tp->snd_cwnd = tp->t_maxseg;
1314 (void) tcp_output(pData, tp);
1315 tp->snd_cwnd = tp->snd_ssthresh +
1316 tp->t_maxseg * tp->t_dupacks;
1317 if (SEQ_GT(onxt, tp->snd_nxt))
1318 tp->snd_nxt = onxt;
1319 LogFlowFunc(("%d -> drop\n", __LINE__));
1320 goto drop;
1321 }
1322 else if (tp->t_dupacks > tcprexmtthresh)
1323 {
1324 tp->snd_cwnd += tp->t_maxseg;
1325 (void) tcp_output(pData, tp);
1326 LogFlowFunc(("%d -> drop\n", __LINE__));
1327 goto drop;
1328 }
1329 }
1330 else
1331 tp->t_dupacks = 0;
1332 break;
1333 }
1334synrx_to_est:
1335 LogFlowFunc(("synrx_to_est:\n"));
1336 /*
1337 * If the congestion window was inflated to account
1338 * for the other side's cached packets, retract it.
1339 */
1340 if ( tp->t_dupacks > tcprexmtthresh
1341 && tp->snd_cwnd > tp->snd_ssthresh)
1342 tp->snd_cwnd = tp->snd_ssthresh;
1343 tp->t_dupacks = 0;
1344 if (SEQ_GT(ti->ti_ack, tp->snd_max))
1345 {
1346 tcpstat.tcps_rcvacktoomuch++;
1347 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1348 goto dropafterack;
1349 }
1350 acked = ti->ti_ack - tp->snd_una;
1351 tcpstat.tcps_rcvackpack++;
1352 tcpstat.tcps_rcvackbyte += acked;
1353
1354 /*
1355 * If we have a timestamp reply, update smoothed
1356 * round trip time. If no timestamp is present but
1357 * transmit timer is running and timed sequence
1358 * number was acked, update smoothed round trip time.
1359 * Since we now have an rtt measurement, cancel the
1360 * timer backoff (cf., Phil Karn's retransmit alg.).
1361 * Recompute the initial retransmit timer.
1362 */
1363#if 0
1364 if (ts_present)
1365 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1366 else
1367#endif
1368 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1369 tcp_xmit_timer(pData, tp, tp->t_rtt);
1370
1371 /*
1372 * If all outstanding data is acked, stop retransmit
1373 * timer and remember to restart (more output or persist).
1374 * If there is more data to be acked, restart retransmit
1375 * timer, using current (possibly backed-off) value.
1376 */
1377 if (ti->ti_ack == tp->snd_max)
1378 {
1379 tp->t_timer[TCPT_REXMT] = 0;
1380 needoutput = 1;
1381 }
1382 else if (tp->t_timer[TCPT_PERSIST] == 0)
1383 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1384 /*
1385 * When new data is acked, open the congestion window.
1386 * If the window gives us less than ssthresh packets
1387 * in flight, open exponentially (maxseg per packet).
1388 * Otherwise open linearly: maxseg per window
1389 * (maxseg^2 / cwnd per packet).
1390 */
1391 {
1392 register u_int cw = tp->snd_cwnd;
1393 register u_int incr = tp->t_maxseg;
1394
1395 if (cw > tp->snd_ssthresh)
1396 incr = incr * incr / cw;
1397 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1398 }
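/*
 * Worked example with an assumed t_maxseg of 1460: below ssthresh the window
 * grows by a full segment per ACK (slow start, i.e. exponential per RTT);
 * above ssthresh with cwnd = 14600 the increment is 1460*1460/14600 = 146
 * bytes per ACK, which works out to roughly one segment per window, i.e.
 * linear (congestion-avoidance) growth.
 */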
1399 if (acked > SBUF_LEN(&so->so_snd))
1400 {
1401 tp->snd_wnd -= SBUF_LEN(&so->so_snd);
1402 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1403 ourfinisacked = 1;
1404 }
1405 else
1406 {
1407 sbdrop(&so->so_snd, acked);
1408 tp->snd_wnd -= acked;
1409 ourfinisacked = 0;
1410 }
1411 /*
1412 * XXX sowwakeup is called when data is acked and there's room
1413 * for more data... it should read() the socket
1414 */
1415#if 0
1416 if (so->so_snd.sb_flags & SB_NOTIFY)
1417 sowwakeup(so);
1418#endif
1419 tp->snd_una = ti->ti_ack;
1420 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1421 tp->snd_nxt = tp->snd_una;
1422
1423 switch (tp->t_state)
1424 {
1425 /*
1426 * In FIN_WAIT_1 STATE in addition to the processing
1427 * for the ESTABLISHED state if our FIN is now acknowledged
1428 * then enter FIN_WAIT_2.
1429 */
1430 case TCPS_FIN_WAIT_1:
1431 if (ourfinisacked)
1432 {
1433 /*
1434 * If we can't receive any more
1435 * data, then closing user can proceed.
1436 * Starting the timer is contrary to the
1437 * specification, but if we don't get a FIN
1438 * we'll hang forever.
1439 */
1440 if (so->so_state & SS_FCANTRCVMORE)
1441 {
1442 soisfdisconnected(so);
1443 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1444 }
1445 TCP_STATE_SWITCH_TO(tp, TCPS_FIN_WAIT_2);
1446 }
1447 break;
1448
1449 /*
1450 * In CLOSING STATE in addition to the processing for
1451 * the ESTABLISHED state if the ACK acknowledges our FIN
1452 * then enter the TIME-WAIT state, otherwise ignore
1453 * the segment.
1454 */
1455 case TCPS_CLOSING:
1456 if (ourfinisacked)
1457 {
1458 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1459 tcp_canceltimers(tp);
1460 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1461 soisfdisconnected(so);
1462 }
1463 break;
1464
1465 /*
1466 * In LAST_ACK, we may still be waiting for data to drain
1467 * and/or to be acked, as well as for the ack of our FIN.
1468 * If our FIN is now acknowledged, delete the TCB,
1469 * enter the closed state and return.
1470 */
1471 case TCPS_LAST_ACK:
1472 if (ourfinisacked)
1473 {
1474 tp = tcp_close(pData, tp);
1475 LogFlowFunc(("%d -> drop\n", __LINE__));
1476 goto drop;
1477 }
1478 break;
1479
1480 /*
1481 * In TIME_WAIT state the only thing that should arrive
1482 * is a retransmission of the remote FIN. Acknowledge
1483 * it and restart the finack timer.
1484 */
1485 case TCPS_TIME_WAIT:
1486 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1487 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1488 goto dropafterack;
1489 }
1490 } /* switch(tp->t_state) */
1491
1492step6:
1493 LogFlowFunc(("step6:\n"));
1494 /*
1495 * Update window information.
1496 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1497 */
1498 if ( (tiflags & TH_ACK)
1499 && ( SEQ_LT(tp->snd_wl1, ti->ti_seq)
1500 || ( tp->snd_wl1 == ti->ti_seq
1501 && ( SEQ_LT(tp->snd_wl2, ti->ti_ack)
1502 || ( tp->snd_wl2 == ti->ti_ack
1503 && tiwin > tp->snd_wnd)))))
1504 {
1505 /* keep track of pure window updates */
1506 if ( ti->ti_len == 0
1507 && tp->snd_wl2 == ti->ti_ack
1508 && tiwin > tp->snd_wnd)
1509 tcpstat.tcps_rcvwinupd++;
1510 tp->snd_wnd = tiwin;
1511 tp->snd_wl1 = ti->ti_seq;
1512 tp->snd_wl2 = ti->ti_ack;
1513 if (tp->snd_wnd > tp->max_sndwnd)
1514 tp->max_sndwnd = tp->snd_wnd;
1515 needoutput = 1;
1516 }
1517
1518 /*
1519 * Process segments with URG.
1520 */
1521 if ((tiflags & TH_URG) && ti->ti_urp &&
1522 TCPS_HAVERCVDFIN(tp->t_state) == 0)
1523 {
1524 /*
1525 * This is a kludge, but if we receive and accept
1526 * random urgent pointers, we'll crash in
1527 * soreceive. It's hard to imagine someone
1528 * actually wanting to send this much urgent data.
1529 */
1530 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen)
1531 {
1532 ti->ti_urp = 0;
1533 tiflags &= ~TH_URG;
1534 LogFlowFunc(("%d -> dodata\n", __LINE__));
1535 goto dodata;
1536 }
1537
1538 /*
1539 * If this segment advances the known urgent pointer,
1540 * then mark the data stream. This should not happen
1541 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1542 * a FIN has been received from the remote side.
1543 * In these states we ignore the URG.
1544 *
1545 * According to RFC961 (Assigned Protocols),
1546 * the urgent pointer points to the last octet
1547 * of urgent data. We continue, however,
1548 * to consider it to indicate the first octet
1549 * of data past the urgent section as the original
1550 * spec states (in one of two places).
1551 */
1552 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up))
1553 {
1554 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1555 so->so_urgc = SBUF_LEN(&so->so_rcv) +
1556 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1557 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1558 }
1559 }
1560 else
1561 /*
1562 * If no out of band data is expected,
1563 * pull receive urgent pointer along
1564 * with the receive window.
1565 */
1566 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1567 tp->rcv_up = tp->rcv_nxt;
1568dodata:
1569 LogFlowFunc(("dodata:\n"));
1570
1571 /*
1572 * If this is a small packet, then ACK now - with Nagle
1573 * congestion avoidance sender won't send more until
1574 * he gets an ACK.
1575 *
1576 * XXX: In case you wonder... The magic "27" below is ESC that
1577 * presumably starts a terminal escape-sequence and that we want
1578 * to ACK ASAP. [Original slirp code had three different
1579 * heuristics to choose from here and in the header prediction case
1580 * above, but the commented out alternatives were lost and the
1581 * header prediction case that had an expanded comment about this
1582 * has been modified to always send an ACK].
1583 */
1584 if ( ti->ti_len
1585 && (unsigned)ti->ti_len <= 5
1586 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27)
1587 {
1588 tp->t_flags |= TF_ACKNOW;
1589 }
1590
1591 /*
1592 * Process the segment text, merging it into the TCP sequencing queue,
1593 * and arranging for acknowledgment of receipt if necessary.
1594 * This process logically involves adjusting tp->rcv_wnd as data
1595 * is presented to the user (this happens in tcp_usrreq.c,
1596 * case PRU_RCVD). If a FIN has already been received on this
1597 * connection then we just ignore the text.
1598 */
1599 if ( (ti->ti_len || (tiflags&TH_FIN))
1600 && TCPS_HAVERCVDFIN(tp->t_state) == 0)
1601 {
1602 if ( ti->ti_seq == tp->rcv_nxt
1603 && LIST_EMPTY(&tp->t_segq)
1604 && tp->t_state == TCPS_ESTABLISHED)
1605 {
1606 DELAY_ACK(tp, ti); /* a little different from the BSD declaration, see netinet/tcp_input.c */
1607 tp->rcv_nxt += tlen;
1608 tiflags = ti->ti_t.th_flags & TH_FIN;
1609 tcpstat.tcps_rcvpack++;
1610 tcpstat.tcps_rcvbyte += tlen;
1611 if (so->so_state & SS_FCANTRCVMORE)
1612 m_freem(pData, m);
1613 else
1614 sbappend(pData, so, m);
1615 }
1616 else
1617 {
1618 tiflags = tcp_reass(pData, tp, &ti->ti_t, &tlen, m);
1619 tp->t_flags |= TF_ACKNOW;
1620 }
1621 /*
1622 * Note the amount of data that peer has sent into
1623 * our window, in order to estimate the sender's
1624 * buffer size.
1625 */
1626 len = SBUF_SIZE(&so->so_rcv) - (tp->rcv_adv - tp->rcv_nxt);
1627 }
1628 else
1629 {
1630 m_freem(pData, m);
1631 tiflags &= ~TH_FIN;
1632 }
1633
1634 /*
1635 * If FIN is received ACK the FIN and let the user know
1636 * that the connection is closing.
1637 */
1638 if (tiflags & TH_FIN)
1639 {
1640 if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
1641 {
1642 /*
1643 * If we receive a FIN we can't send more data,
1644 * so set SS_FDRAIN.
1645 * Shutdown the socket if there is no rx data in the
1646 * buffer.
1647 * soread() is called on completion of shutdown() and
1648 * will go to TCPS_LAST_ACK, and use tcp_output()
1649 * to send the FIN.
1650 */
1651/* sofcantrcvmore(so); */
1652 sofwdrain(so);
1653
1654 tp->t_flags |= TF_ACKNOW;
1655 tp->rcv_nxt++;
1656 }
1657 switch (tp->t_state)
1658 {
1659 /*
1660 * In SYN_RECEIVED and ESTABLISHED STATES
1661 * enter the CLOSE_WAIT state.
1662 */
1663 case TCPS_SYN_RECEIVED:
1664 case TCPS_ESTABLISHED:
1665 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSE_WAIT);
1666 break;
1667
1668 /*
1669 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1670 * enter the CLOSING state.
1671 */
1672 case TCPS_FIN_WAIT_1:
1673 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSING);
1674 break;
1675
1676 /*
1677 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1678 * starting the time-wait timer, turning off the other
1679 * standard timers.
1680 */
1681 case TCPS_FIN_WAIT_2:
1682 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1683 tcp_canceltimers(tp);
1684 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1685 soisfdisconnected(so);
1686 break;
1687
1688 /*
1689 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1690 */
1691 case TCPS_TIME_WAIT:
1692 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1693 break;
1694 }
1695 }
1696
1697 /*
1698 * Return any desired output.
1699 */
1700 if (needoutput || (tp->t_flags & TF_ACKNOW))
1701 tcp_output(pData, tp);
1702
1703 SOCKET_UNLOCK(so);
1704 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1705 LogFlowFuncLeave();
1706 return;
1707
1708dropafterack:
1709 LogFlowFunc(("dropafterack:\n"));
1710 /*
1711 * Generate an ACK dropping incoming segment if it occupies
1712 * sequence space, where the ACK reflects our state.
1713 */
1714 if (tiflags & TH_RST)
1715 {
1716 LogFlowFunc(("%d -> drop\n", __LINE__));
1717 goto drop;
1718 }
1719 m_freem(pData, m);
1720 tp->t_flags |= TF_ACKNOW;
1721 (void) tcp_output(pData, tp);
1722 SOCKET_UNLOCK(so);
1723 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1724 LogFlowFuncLeave();
1725 return;
1726
1727dropwithreset:
1728 LogFlowFunc(("dropwithreset:\n"));
1729 /* reuses m if m!=NULL, m_free() unnecessary */
1730 if (tiflags & TH_ACK)
1731 tcp_respond(pData, tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1732 else
1733 {
1734 if (tiflags & TH_SYN)
1735 ti->ti_len++;
1736 tcp_respond(pData, tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1737 TH_RST|TH_ACK);
1738 }
1739
1740 if (so != &tcb)
1741 SOCKET_UNLOCK(so);
1742 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1743 LogFlowFuncLeave();
1744 return;
1745
1746drop:
1747 LogFlowFunc(("drop:\n"));
1748 /*
1749 * Drop space held by incoming segment and return.
1750 */
1751 m_freem(pData, m);
1752
1753#ifdef VBOX_WITH_SLIRP_MT
1754 if (RTCritSectIsOwned(&so->so_mutex))
1755 {
1756 SOCKET_UNLOCK(so);
1757 }
1758#endif
1759
1760 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1761 LogFlowFuncLeave();
1762 return;
1763}
1764
1765
1766void
1767tcp_fconnect_failed(PNATState pData, struct socket *so, int sockerr)
1768{
1769 struct tcpcb *tp;
1770 int code;
1771
1772 Log2(("NAT: connect error %d %R[natsock]\n", sockerr, so));
1773
1774 Assert(so->so_state & SS_ISFCONNECTING);
1775 so->so_state = SS_NOFDREF;
1776
1777 if (sockerr == ECONNREFUSED || sockerr == ECONNRESET)
1778 {
1779 /* hand off to tcp_input():cont_conn to send RST */
1780 TCP_INPUT(pData, NULL, 0, so);
1781 return;
1782 }
1783
1784 tp = sototcpcb(so);
1785 if (RT_UNLIKELY(tp == NULL)) /* should never happen */
1786 {
1787 LogRel(("NAT: tp == NULL %R[natsock]\n", so));
1788 sofree(pData, so);
1789 return;
1790 }
1791
1792 if (sockerr == ENETUNREACH || sockerr == ENETDOWN)
1793 code = ICMP_UNREACH_NET;
1794 else if (sockerr == EHOSTUNREACH || sockerr == EHOSTDOWN)
1795 code = ICMP_UNREACH_HOST;
1796 else
1797 code = -1;
1798
1799 if (code >= 0)
1800 {
1801 struct ip *oip;
1802 unsigned ohdrlen;
1803 struct mbuf *m;
1804
1805 if (RT_UNLIKELY(so->so_ohdr == NULL))
1806 goto out;
1807
1808 oip = (struct ip *)so->so_ohdr;
1809 ohdrlen = oip->ip_hl * 4 + 8;
1810
1811 m = m_gethdr(pData, M_NOWAIT, MT_HEADER);
1812 if (RT_UNLIKELY(m == NULL))
1813 goto out;
1814
1815 m_copyback(pData, m, 0, ohdrlen, (caddr_t)so->so_ohdr);
1816 m->m_pkthdr.header = mtod(m, void *);
1817
1818 icmp_error(pData, m, ICMP_UNREACH, code, 0, NULL);
1819 }
1820
1821 out:
1822 tcp_close(pData, tp);
1823}
1824
1825
1826void
1827tcp_dooptions(PNATState pData, struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1828{
1829 u_int16_t mss;
1830 int opt, optlen;
1831
1832 LogFlowFunc(("tcp_dooptions: tp = %R[tcpcb793], cnt=%i\n", tp, cnt));
1833
1834 for (; cnt > 0; cnt -= optlen, cp += optlen)
1835 {
1836 opt = cp[0];
1837 if (opt == TCPOPT_EOL)
1838 break;
1839 if (opt == TCPOPT_NOP)
1840 optlen = 1;
1841 else
1842 {
1843 optlen = cp[1];
1844 if (optlen <= 0)
1845 break;
1846 }
1847 switch (opt)
1848 {
1849 default:
1850 continue;
1851
1852 case TCPOPT_MAXSEG:
1853 if (optlen != TCPOLEN_MAXSEG)
1854 continue;
1855 if (!(ti->ti_flags & TH_SYN))
1856 continue;
1857 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1858 NTOHS(mss);
1859 (void) tcp_mss(pData, tp, mss); /* sets t_maxseg */
1860 break;
1861
1862#if 0
1863 case TCPOPT_WINDOW:
1864 if (optlen != TCPOLEN_WINDOW)
1865 continue;
1866 if (!(ti->ti_flags & TH_SYN))
1867 continue;
1868 tp->t_flags |= TF_RCVD_SCALE;
1869 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1870 break;
1871
1872 case TCPOPT_TIMESTAMP:
1873 if (optlen != TCPOLEN_TIMESTAMP)
1874 continue;
1875 *ts_present = 1;
1876 memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1877 NTOHL(*ts_val);
1878 memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1879 NTOHL(*ts_ecr);
1880
1881 /*
1882 * A timestamp received in a SYN makes
1883 * it ok to send timestamp requests and replies.
1884 */
1885 if (ti->ti_flags & TH_SYN)
1886 {
1887 tp->t_flags |= TF_RCVD_TSTMP;
1888 tp->ts_recent = *ts_val;
1889 tp->ts_recent_age = tcp_now;
1890 }
1891 break;
1892#endif
1893 }
1894 }
1895}
1896
1897
1898/*
1899 * Pull out of band byte out of a segment so
1900 * it doesn't appear in the user's data queue.
1901 * It is still reflected in the segment length for
1902 * sequencing purposes.
1903 */
1904
1905#if 0
1906void
1907tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m)
1908{
1909 int cnt = ti->ti_urp - 1;
1910
1911 while (cnt >= 0)
1912 {
1913 if (m->m_len > cnt)
1914 {
1915 char *cp = mtod(m, caddr_t) + cnt;
1916 struct tcpcb *tp = sototcpcb(so);
1917
1918 tp->t_iobc = *cp;
1919 tp->t_oobflags |= TCPOOB_HAVEDATA;
1920 memcpy(sp, cp+1, (unsigned)(m->m_len - cnt - 1));
1921 m->m_len--;
1922 return;
1923 }
1924 cnt -= m->m_len;
1925 m = m->m_next; /* XXX WRONG! Fix it! */
1926 if (m == 0)
1927 break;
1928 }
1929 panic("tcp_pulloutofband");
1930}
1931#endif
1932
1933/*
1934 * Collect new round-trip time estimate
1935 * and update averages and current timeout.
1936 */
1937
1938void
1939tcp_xmit_timer(PNATState pData, register struct tcpcb *tp, int rtt)
1940{
1941 register short delta;
1942
1943 LogFlowFunc(("ENTER: tcp_xmit_timer: tp = %R[tcpcb793] rtt = %d\n", tp, rtt));
1944
1945 tcpstat.tcps_rttupdated++;
1946 if (tp->t_srtt != 0)
1947 {
1948 /*
1949 * srtt is stored as fixed point with 3 bits after the
1950 * binary point (i.e., scaled by 8). The following magic
1951 * is equivalent to the smoothing algorithm in rfc793 with
1952 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1953 * point). Adjust rtt to origin 0.
1954 */
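/*
 * Example with assumed values (conventional TCP_RTT_SHIFT of 3): t_srtt = 8
 * (one tick, scaled by 8) and a measured rtt of 3 ticks gives
 * delta = 3 - 1 - (8 >> 3) = 1, so t_srtt becomes 9, i.e. the smoothed
 * estimate moves 1/8 of the way towards the new sample, matching the
 * alpha = 7/8 smoothing described above.
 */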
1955 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1956 if ((tp->t_srtt += delta) <= 0)
1957 tp->t_srtt = 1;
1958 /*
1959 * We accumulate a smoothed rtt variance (actually, a
1960 * smoothed mean difference), then set the retransmit
1961 * timer to smoothed rtt + 4 times the smoothed variance.
1962 * rttvar is stored as fixed point with 2 bits after the
1963 * binary point (scaled by 4). The following is
1964 * equivalent to rfc793 smoothing with an alpha of .75
1965 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1966 * rfc793's wired-in beta.
1967 */
1968 if (delta < 0)
1969 delta = -delta;
1970 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1971 if ((tp->t_rttvar += delta) <= 0)
1972 tp->t_rttvar = 1;
1973 }
1974 else
1975 {
1976 /*
1977 * No rtt measurement yet - use the unsmoothed rtt.
1978 * Set the variance to half the rtt (so our first
1979 * retransmit happens at 3*rtt).
1980 */
1981 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1982 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1983 }
1984 tp->t_rtt = 0;
1985 tp->t_rxtshift = 0;
1986
1987 /*
1988 * the retransmit should happen at rtt + 4 * rttvar.
1989 * Because of the way we do the smoothing, srtt and rttvar
1990 * will each average +1/2 tick of bias. When we compute
1991 * the retransmit timer, we want 1/2 tick of rounding and
1992 * 1 extra tick because of +-1/2 tick uncertainty in the
1993 * firing of the timer. The bias will give us exactly the
1994 * 1.5 tick we need. But, because the bias is
1995 * statistical, we have to test that we don't drop below
1996 * the minimum feasible timer (which is 2 ticks).
1997 */
1998 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1999 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
2000
2001 /*
2002 * We received an ack for a packet that wasn't retransmitted;
2003 * it is probably safe to discard any error indications we've
2004 * received recently. This isn't quite right, but close enough
2005 * for now (a route might have failed after we sent a segment,
2006 * and the return path might not be symmetrical).
2007 */
2008 tp->t_softerror = 0;
2009}
2010
2011/*
2012 * Determine a reasonable value for maxseg size.
2013 * If the route is known, check route for mtu.
2014 * If none, use an mss that can be handled on the outgoing
2015 * interface without forcing IP to fragment; if bigger than
2016 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2017 * to utilize large mbufs. If no route is found, route has no mtu,
2018 * or the destination isn't local, use a default, hopefully conservative
2019 * size (usually 512 or the default IP max size, but no more than the mtu
2020 * of the interface), as we can't discover anything about intervening
2021 * gateways or networks. We also initialize the congestion/slow start
2022 * window to be a single segment if the destination isn't local.
2023 * While looking at the routing entry, we also initialize other path-dependent
2024 * parameters from pre-set or cached values in the routing entry.
2025 */
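/*
 * Example with assumed values: if_mtu = if_mru = 1500 and no smaller offer
 * gives mss = 1500 - sizeof(struct tcpiphdr), i.e. 1460 with the classic
 * 40-byte IP+TCP overlay header; tcp_sndspace and tcp_rcvspace are then
 * rounded up to the next multiple of mss when reserving the socket buffers.
 */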
2026
2027int
2028tcp_mss(PNATState pData, register struct tcpcb *tp, u_int offer)
2029{
2030 struct socket *so = tp->t_socket;
2031 int mss;
2032
2033 LogFlowFunc(("ENTER: tcp_mss: offer=%u, t_maxseg=%u; tp=%R[natsock]\n",
2034 offer, (unsigned int)tp->t_maxseg, so));
2035
2036 mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
2037 if (offer)
2038 mss = min(mss, offer);
2039 mss = max(mss, 32);
2040 if (mss < tp->t_maxseg || offer != 0)
2041 tp->t_maxseg = mss;
2042
2043 tp->snd_cwnd = mss;
2044
2045 sbreserve(pData, &so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
2046 sbreserve(pData, &so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
2047
2048 LogFlowFunc(("LEAVE: mss=%d\n", mss));
2049 return mss;
2050}