VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_input.c@ 88566

Last change on this file since 88566 was 86843, checked in by vboxsync, 4 years ago

slirp: bugref:9856 - in tcp_input() save optp and optlen in struct
socket for the re-entry on connect dance. Restore them on the second
entry because the options are only parsed at that time.

I'm not sure why - this is code from the original slirp - but nothing
in the options parsing code needs getting the "remote address" as the
comment claims. This bug is pretty corner case and has been
introduced in slirp changes to the BSD stack. Opt for the
conservative fix. The practical consequence is that we should now
respect the MSS that the guest advertises to us (ticketref:15256).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 66.7 KB
Line 
1/* $Id: tcp_input.c 86843 2020-11-10 04:26:48Z vboxsync $ */
2/** @file
3 * NAT - TCP input.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
22 * The Regents of the University of California. All rights reserved.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
53 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
54 */
55
56/*
57 * Changes and additions relating to SLiRP
58 * Copyright (c) 1995 Danny Gasparovski.
59 *
60 * Please read the file COPYRIGHT for the
61 * terms and conditions of the copyright.
62 */
63
64#include <slirp.h>
65#include "ip_icmp.h"
66
67
/*
 * Helpers for the RFC 1323 timestamp / PAWS handling.  The whole group is
 * disabled: every use of these macros further down in this file sits in a
 * comment or inside an "#if 0" section.
 */
68#if 0 /* the code using these macros is commented out */
/* Idle limit for the PAWS check; the disabled code that uses it describes the
 * test as "ts_recent is over 24 days old".  Presumably expressed in PR_SLOWHZ
 * ticks - PR_SLOWHZ is defined elsewhere, confirm there. */
69# define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
70
71/* for modulo comparisons of timestamps: the difference is cast to int and
 * only its sign is tested */
72# define TSTMP_LT(a, b) ((int)((a)-(b)) < 0)
73# define TSTMP_GEQ(a, b) ((int)((a)-(b)) >= 0)
74#endif
75
/*
 * DELAY_ACK(tp, ti) - pick the ACK flag to set in tp->t_flags for a segment.
 *
 *  - TCP_ACK_HACK not defined: sets TF_ACKNOW when ti->ti_flags has TH_PUSH
 *    set, otherwise sets TF_DELACK.
 *  - TCP_ACK_HACK defined: always sets TF_DELACK; the second argument is
 *    ignored.
 *
 * NOTE(review): both variants expand to bare statements that already end in
 * ';' (no do { } while (0) wrapper) and use their arguments unparenthesised.
 * Only use the macro as a complete statement inside a braced block and pass
 * plain identifiers; an unbraced "if (x) DELAY_ACK(tp, ti); else ..." would
 * not parse as intended.
 */
76#ifndef TCP_ACK_HACK
77#define DELAY_ACK(tp, ti) \
78 if (ti->ti_flags & TH_PUSH) \
79 tp->t_flags |= TF_ACKNOW; \
80 else \
81 tp->t_flags |= TF_DELACK;
82#else /* TCP_ACK_HACK */
83#define DELAY_ACK(tp, ign) \
84 tp->t_flags |= TF_DELACK;
85#endif /* TCP_ACK_HACK */
86
87
88/*
89 * deps: netinet/tcp_reass.c
90 * tcp_reass_maxqlen = 48 (deafault)
91 * tcp_reass_maxseg = nmbclusters/16 (nmbclusters = 1024 + maxusers * 64 from kern/kern_mbuf.c let's say 256)
92 */
93int
94tcp_reass(PNATState pData, struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
95{
96 struct tseg_qent *q;
97 struct tseg_qent *p = NULL;
98 struct tseg_qent *nq;
99 struct tseg_qent *te = NULL;
100 struct socket *so = tp->t_socket;
101 int flags;
102 STAM_PROFILE_START(&pData->StatTCP_reassamble, tcp_reassamble);
103 LogFlowFunc(("ENTER: pData:%p, tp:%R[tcpcb793], th:%p, tlenp:%p, m:%p\n", pData, tp, th, tlenp, m));
104
105 /*
106 * XXX: tcp_reass() is rather inefficient with its data structures
107 * and should be rewritten (see NetBSD for optimizations). While
108 * doing that it should move to its own file tcp_reass.c.
109 */
110
111 /*
112 * Call with th==NULL after become established to
113 * force pre-ESTABLISHED data up to user socket.
114 */
115 if (th == NULL)
116 {
117 LogFlowFunc(("%d -> present\n", __LINE__));
118 goto present;
119 }
120
121 /*
122 * Limit the number of segments in the reassembly queue to prevent
123 * holding on to too many segments (and thus running out of mbufs).
124 * Make sure to let the missing segment through which caused this
125 * queue. Always keep one global queue entry spare to be able to
126 * process the missing segment.
127 */
128 if ( th->th_seq != tp->rcv_nxt
129 && ( tcp_reass_qsize + 1 >= tcp_reass_maxseg
130 || tp->t_segqlen >= tcp_reass_maxqlen))
131 {
132 tcp_reass_overflows++;
133 tcpstat.tcps_rcvmemdrop++;
134 m_freem(pData, m);
135 *tlenp = 0;
136 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
137 LogFlowFuncLeave();
138 return (0);
139 }
140
141 /*
142 * Allocate a new queue entry. If we can't, or hit the zone limit
143 * just drop the pkt.
144 */
145 te = RTMemAlloc(sizeof(struct tseg_qent));
146 if (te == NULL)
147 {
148 tcpstat.tcps_rcvmemdrop++;
149 m_freem(pData, m);
150 *tlenp = 0;
151 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
152 LogFlowFuncLeave();
153 return (0);
154 }
155 tp->t_segqlen++;
156 tcp_reass_qsize++;
157
158 /*
159 * Find a segment which begins after this one does.
160 */
161 LIST_FOREACH(q, &tp->t_segq, tqe_q)
162 {
163 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
164 break;
165 p = q;
166 }
167
168 /*
169 * If there is a preceding segment, it may provide some of
170 * our data already. If so, drop the data from the incoming
171 * segment. If it provides all of our data, drop us.
172 */
173 if (p != NULL)
174 {
175 int i;
176 /* conversion to int (in i) handles seq wraparound */
177 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
178 if (i > 0)
179 {
180 if (i >= *tlenp)
181 {
182 tcpstat.tcps_rcvduppack++;
183 tcpstat.tcps_rcvdupbyte += *tlenp;
184 m_freem(pData, m);
185 RTMemFree(te);
186 tp->t_segqlen--;
187 tcp_reass_qsize--;
188 /*
189 * Try to present any queued data
190 * at the left window edge to the user.
191 * This is needed after the 3-WHS
192 * completes.
193 */
194 LogFlowFunc(("%d -> present\n", __LINE__));
195 goto present; /* ??? */
196 }
197 m_adj(m, i);
198 *tlenp -= i;
199 th->th_seq += i;
200 }
201 }
202 tcpstat.tcps_rcvoopack++;
203 tcpstat.tcps_rcvoobyte += *tlenp;
204
205 /*
206 * While we overlap succeeding segments trim them or,
207 * if they are completely covered, dequeue them.
208 */
209 while (q)
210 {
211 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
212 if (i <= 0)
213 break;
214 if (i < q->tqe_len)
215 {
216 q->tqe_th->th_seq += i;
217 q->tqe_len -= i;
218 m_adj(q->tqe_m, i);
219 break;
220 }
221
222 nq = LIST_NEXT(q, tqe_q);
223 LIST_REMOVE(q, tqe_q);
224 m_freem(pData, q->tqe_m);
225 RTMemFree(q);
226 tp->t_segqlen--;
227 tcp_reass_qsize--;
228 q = nq;
229 }
230
231 /* Insert the new segment queue entry into place. */
232 te->tqe_m = m;
233 te->tqe_th = th;
234 te->tqe_len = *tlenp;
235
236 if (p == NULL)
237 {
238 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
239 }
240 else
241 {
242 LIST_INSERT_AFTER(p, te, tqe_q);
243 }
244
245present:
246 /*
247 * Present data to user, advancing rcv_nxt through
248 * completed sequence space.
249 */
250 if (!TCPS_HAVEESTABLISHED(tp->t_state))
251 {
252 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
253 return (0);
254 }
255 q = LIST_FIRST(&tp->t_segq);
256 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
257 {
258 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
259 return (0);
260 }
261 do
262 {
263 tp->rcv_nxt += q->tqe_len;
264 flags = q->tqe_th->th_flags & TH_FIN;
265 nq = LIST_NEXT(q, tqe_q);
266 LIST_REMOVE(q, tqe_q);
267 /* XXX: This place should be checked for the same code in
268 * original BSD code for Slirp and current BSD used SS_FCANTRCVMORE
269 */
270 if (so->so_state & SS_FCANTSENDMORE)
271 m_freem(pData, q->tqe_m);
272 else
273 sbappend(pData, so, q->tqe_m);
274 RTMemFree(q);
275 tp->t_segqlen--;
276 tcp_reass_qsize--;
277 q = nq;
278 }
279 while (q && q->tqe_th->th_seq == tp->rcv_nxt);
280
281 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
282 return flags;
283}
284
285/*
286 * TCP input routine, follows pages 65-76 of the
287 * protocol specification dated September, 1981 very closely.
288 */
289void
290tcp_input(PNATState pData, register struct mbuf *m, int iphlen, struct socket *inso)
291{
292 struct ip *ip, *save_ip;
293 register struct tcpiphdr *ti;
294 caddr_t optp = NULL;
295 int optlen = 0;
296 int len, off;
297 int tlen = 0; /* Shut up MSC (didn't check whether MSC was right). */
298 register struct tcpcb *tp = 0;
299 register int tiflags;
300 struct socket *so = 0;
301 int todrop, acked, ourfinisacked, needoutput = 0;
302/* int dropsocket = 0; */
303 int iss = 0;
304 u_long tiwin;
305/* int ts_present = 0; */
306 unsigned ohdrlen;
307 uint8_t ohdr[60 + 8]; /* max IP header plus 8 bytes of payload for icmp */
308
309 STAM_PROFILE_START(&pData->StatTCP_input, counter_input);
310
311 LogFlow(("tcp_input: m = %p, iphlen = %2d, inso = %R[natsock]\n", m, iphlen, inso));
312
313 if (inso != NULL)
314 {
315 QSOCKET_LOCK(tcb);
316 SOCKET_LOCK(inso);
317 QSOCKET_UNLOCK(tcb);
318 }
319 /*
320 * If called with m == 0, then we're continuing the connect
321 */
322 if (m == NULL)
323 {
324 so = inso;
325 Log4(("NAT: tcp_input: %R[natsock]\n", so));
326
327 /* Re-set a few variables */
328 tp = sototcpcb(so);
329
330 m = so->so_m;
331 optp = so->so_optp; /* points into m if set */
332 optlen = so->so_optlen;
333 so->so_m = NULL;
334 so->so_optp = 0;
335 so->so_optlen = 0;
336
337 if (RT_LIKELY(so->so_ohdr != NULL))
338 {
339 RTMemFree(so->so_ohdr);
340 so->so_ohdr = NULL;
341 }
342
343 ti = so->so_ti;
344
345 /** @todo (vvl) clarify why it might happens */
346 if (ti == NULL)
347 {
348 LogRel(("NAT: ti is null. can't do any reseting connection actions\n"));
349 /* mbuf should be cleared in sofree called from tcp_close */
350 tcp_close(pData, tp);
351 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
352 LogFlowFuncLeave();
353 return;
354 }
355
356 tiwin = ti->ti_win;
357 tiflags = ti->ti_flags;
358
359 LogFlowFunc(("%d -> cont_conn\n", __LINE__));
360 goto cont_conn;
361 }
362
363 tcpstat.tcps_rcvtotal++;
364
365 ip = mtod(m, struct ip *);
366
367 /* ip_input() subtracts iphlen from ip::ip_len */
368 AssertStmt(ip->ip_len + iphlen == (ssize_t)m_length(m, NULL), goto drop);
369 if (RT_UNLIKELY(ip->ip_len < sizeof(struct tcphdr)))
370 {
371 /* tcps_rcvshort++; */
372 goto drop;
373 }
374
375 /*
376 * Save a copy of the IP header in case we want to restore it for
377 * sending an ICMP error message in response.
378 *
379 * XXX: This function should really be fixed to not strip IP
380 * options, to not overwrite IP header and to use "tlen" local
381 * variable (instead of ti->ti_len), then "m" could be passed to
382 * icmp_error() directly.
383 */
384 ohdrlen = iphlen + 8;
385 m_copydata(m, 0, ohdrlen, (caddr_t)ohdr);
386 save_ip = (struct ip *)ohdr;
387 save_ip->ip_len += iphlen; /* undo change by ip_input() */
388
389
390 /*
391 * Get IP and TCP header together in first mbuf.
392 * Note: IP leaves IP header in first mbuf.
393 */
394 ti = mtod(m, struct tcpiphdr *);
395 if (iphlen > sizeof(struct ip))
396 {
397 ip_stripoptions(m, (struct mbuf *)0);
398 iphlen = sizeof(struct ip);
399 }
400
401 /*
402 * Checksum extended TCP header and data.
403 */
404 tlen = ((struct ip *)ti)->ip_len;
405 memset(ti->ti_x1, 0, 9);
406 ti->ti_len = RT_H2N_U16((u_int16_t)tlen);
407 len = sizeof(struct ip) + tlen;
408 /* keep checksum for ICMP reply
409 * ti->ti_sum = cksum(m, len);
410 * if (ti->ti_sum) { */
411 if (cksum(m, len))
412 {
413 tcpstat.tcps_rcvbadsum++;
414 LogFlowFunc(("%d -> drop\n", __LINE__));
415 goto drop;
416 }
417
418 /*
419 * Check that TCP offset makes sense,
420 * pull out TCP options and adjust length. XXX
421 */
422 off = ti->ti_off << 2;
423 if ( off < sizeof (struct tcphdr)
424 || off > tlen)
425 {
426 tcpstat.tcps_rcvbadoff++;
427 LogFlowFunc(("%d -> drop\n", __LINE__));
428 goto drop;
429 }
430 tlen -= off;
431 ti->ti_len = tlen;
432 if (off > sizeof (struct tcphdr))
433 {
434 optlen = off - sizeof (struct tcphdr);
435 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
436
437 /*
438 * Do quick retrieval of timestamp options ("options
439 * prediction?"). If timestamp is the only option and it's
440 * formatted as recommended in RFC 1323 appendix A, we
441 * quickly get the values now and not bother calling
442 * tcp_dooptions(), etc.
443 */
444#if 0
445 if (( optlen == TCPOLEN_TSTAMP_APPA
446 || ( optlen > TCPOLEN_TSTAMP_APPA
447 && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
448 *(u_int32_t *)optp == RT_H2N_U32_C(TCPOPT_TSTAMP_HDR) &&
449 (ti->ti_flags & TH_SYN) == 0)
450 {
451 ts_present = 1;
452 ts_val = RT_N2H_U32(*(u_int32_t *)(optp + 4));
453 ts_ecr = RT_N2H_U32(*(u_int32_t *)(optp + 8));
454 optp = NULL; / * we have parsed the options * /
455 }
456#endif
457 }
458 tiflags = ti->ti_flags;
459
460 /*
461 * Convert TCP protocol specific fields to host format.
462 */
463 NTOHL(ti->ti_seq);
464 NTOHL(ti->ti_ack);
465 NTOHS(ti->ti_win);
466 NTOHS(ti->ti_urp);
467
468 /*
469 * Drop TCP, IP headers and TCP options.
470 */
471 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
472 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
473
474 /*
475 * Locate pcb for segment.
476 */
477findso:
478 LogFlowFunc(("(enter) findso: %R[natsock]\n", so));
479 if (so != NULL && so != &tcb)
480 SOCKET_UNLOCK(so);
481 QSOCKET_LOCK(tcb);
482 so = tcp_last_so;
483 if ( so->so_fport != ti->ti_dport
484 || so->so_lport != ti->ti_sport
485 || so->so_laddr.s_addr != ti->ti_src.s_addr
486 || so->so_faddr.s_addr != ti->ti_dst.s_addr)
487 {
488 QSOCKET_UNLOCK(tcb);
489 /** @todo fix SOLOOKUP macrodefinition to be usable here */
490 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
491 ti->ti_dst, ti->ti_dport);
492 if (so)
493 {
494 tcp_last_so = so;
495 }
496 ++tcpstat.tcps_socachemiss;
497 }
498 else
499 {
500 SOCKET_LOCK(so);
501 QSOCKET_UNLOCK(tcb);
502 }
503 LogFlowFunc(("(leave) findso: %R[natsock]\n", so));
504
505 /*
506 * If the state is CLOSED (i.e., TCB does not exist) then
507 * all data in the incoming segment is discarded.
508 * If the TCB exists but is in CLOSED state, it is embryonic,
509 * but should either do a listen or a connect soon.
510 *
511 * state == CLOSED means we've done socreate() but haven't
512 * attached it to a protocol yet...
513 *
514 * XXX If a TCB does not exist, and the TH_SYN flag is
515 * the only flag set, then create a session, mark it
516 * as if it was LISTENING, and continue...
517 */
518 if (so == 0)
519 {
520 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
521 {
522 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
523 goto dropwithreset;
524 }
525
526 if ((so = socreate()) == NULL)
527 {
528 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
529 goto dropwithreset;
530 }
531 if (tcp_attach(pData, so) < 0)
532 {
533 RTMemFree(so); /* Not sofree (if it failed, it's not insqued) */
534 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
535 goto dropwithreset;
536 }
537 SOCKET_LOCK(so);
538 sbreserve(pData, &so->so_snd, tcp_sndspace);
539 sbreserve(pData, &so->so_rcv, tcp_rcvspace);
540
541/* tcp_last_so = so; */ /* XXX ? */
542/* tp = sototcpcb(so); */
543
544 so->so_laddr = ti->ti_src;
545 so->so_lport = ti->ti_sport;
546 so->so_faddr = ti->ti_dst;
547 so->so_fport = ti->ti_dport;
548
549 so->so_iptos = ((struct ip *)ti)->ip_tos;
550
551 tp = sototcpcb(so);
552 TCP_STATE_SWITCH_TO(tp, TCPS_LISTEN);
553 }
554
555 /*
556 * If this is a still-connecting socket, this probably
557 * a retransmit of the SYN. Whether it's a retransmit SYN
558 * or something else, we nuke it.
559 */
560 if (so->so_state & SS_ISFCONNECTING)
561 {
562 LogFlowFunc(("%d -> drop\n", __LINE__));
563 goto drop;
564 }
565
566 tp = sototcpcb(so);
567
568 /* XXX Should never fail */
569 if (tp == 0)
570 {
571 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
572 goto dropwithreset;
573 }
574 if (tp->t_state == TCPS_CLOSED)
575 {
576 LogFlowFunc(("%d -> drop\n", __LINE__));
577 goto drop;
578 }
579
580 /* Unscale the window into a 32-bit value. */
581/* if ((tiflags & TH_SYN) == 0)
582 * tiwin = ti->ti_win << tp->snd_scale;
583 * else
584 */
585 tiwin = ti->ti_win;
586
587 /*
588 * Segment received on connection.
589 * Reset idle time and keep-alive timer.
590 */
591 tp->t_idle = 0;
592 if (so_options)
593 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
594 else
595 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
596
597 /*
598 * Process options if not in LISTEN state,
599 * else do it below (after getting remote address).
600 */
601 if (optp && tp->t_state != TCPS_LISTEN)
602 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
603/* , */
604/* &ts_present, &ts_val, &ts_ecr); */
605
606 /*
607 * Header prediction: check for the two common cases
608 * of a uni-directional data xfer. If the packet has
609 * no control flags, is in-sequence, the window didn't
610 * change and we're not retransmitting, it's a
611 * candidate. If the length is zero and the ack moved
612 * forward, we're the sender side of the xfer. Just
613 * free the data acked & wake any higher level process
614 * that was blocked waiting for space. If the length
615 * is non-zero and the ack didn't move, we're the
616 * receiver side. If we're getting packets in-order
617 * (the reassembly queue is empty), add the data to
618 * the socket buffer and note that we need a delayed ack.
619 *
620 * XXX Some of these tests are not needed
621 * eg: the tiwin == tp->snd_wnd prevents many more
622 * predictions.. with no *real* advantage..
623 */
624 if ( tp->t_state == TCPS_ESTABLISHED
625 && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK
626/* && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) */
627 && ti->ti_seq == tp->rcv_nxt
628 && tiwin && tiwin == tp->snd_wnd
629 && tp->snd_nxt == tp->snd_max)
630 {
631 /*
632 * If last ACK falls within this segment's sequence numbers,
633 * record the timestamp.
634 */
635#if 0
636 if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
637 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len))
638 {
639 tp->ts_recent_age = tcp_now;
640 tp->ts_recent = ts_val;
641 }
642#endif
643
644 if (ti->ti_len == 0)
645 {
646 if ( SEQ_GT(ti->ti_ack, tp->snd_una)
647 && SEQ_LEQ(ti->ti_ack, tp->snd_max)
648 && tp->snd_cwnd >= tp->snd_wnd)
649 {
650 /*
651 * this is a pure ack for outstanding data.
652 */
653 ++tcpstat.tcps_predack;
654#if 0
655 if (ts_present)
656 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
657 else
658#endif
659 if ( tp->t_rtt
660 && SEQ_GT(ti->ti_ack, tp->t_rtseq))
661 tcp_xmit_timer(pData, tp, tp->t_rtt);
662 acked = ti->ti_ack - tp->snd_una;
663 tcpstat.tcps_rcvackpack++;
664 tcpstat.tcps_rcvackbyte += acked;
665 sbdrop(&so->so_snd, acked);
666 tp->snd_una = ti->ti_ack;
667 m_freem(pData, m);
668
669 /*
670 * If all outstanding data are acked, stop
671 * retransmit timer, otherwise restart timer
672 * using current (possibly backed-off) value.
673 * If process is waiting for space,
674 * wakeup/selwakeup/signal. If data
675 * are ready to send, let tcp_output
676 * decide between more output or persist.
677 */
678 if (tp->snd_una == tp->snd_max)
679 tp->t_timer[TCPT_REXMT] = 0;
680 else if (tp->t_timer[TCPT_PERSIST] == 0)
681 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
682
683 /*
684 * There's room in so_snd, sowwakup will read()
685 * from the socket if we can
686 */
687#if 0
688 if (so->so_snd.sb_flags & SB_NOTIFY)
689 sowwakeup(so);
690#endif
691 /*
692 * This is called because sowwakeup might have
693 * put data into so_snd. Since we don't so sowwakeup,
694 * we don't need this.. XXX???
695 */
696 if (SBUF_LEN(&so->so_snd))
697 (void) tcp_output(pData, tp);
698
699 SOCKET_UNLOCK(so);
700 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
701 return;
702 }
703 }
704 else if ( ti->ti_ack == tp->snd_una
705 && LIST_EMPTY(&tp->t_segq)
706 && ti->ti_len <= sbspace(&so->so_rcv))
707 {
708 /*
709 * this is a pure, in-sequence data packet
710 * with nothing on the reassembly queue and
711 * we have enough buffer space to take it.
712 */
713 ++tcpstat.tcps_preddat;
714 tp->rcv_nxt += ti->ti_len;
715 tcpstat.tcps_rcvpack++;
716 tcpstat.tcps_rcvbyte += ti->ti_len;
717 /*
718 * Add data to socket buffer.
719 */
720 sbappend(pData, so, m);
721
722 /*
723 * XXX This is called when data arrives. Later, check
724 * if we can actually write() to the socket
725 * XXX Need to check? It's be NON_BLOCKING
726 */
727/* sorwakeup(so); */
728
729 /*
730 * If this is a short packet, then ACK now - with Nagle
731 * congestion avoidance sender won't send more until
732 * he gets an ACK.
733 *
734 * It is better to not delay acks at all to maximize
735 * TCP throughput. See RFC 2581.
736 */
737 tp->t_flags |= TF_ACKNOW;
738 tcp_output(pData, tp);
739 SOCKET_UNLOCK(so);
740 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
741 return;
742 }
743 } /* header prediction */
744 /*
745 * Calculate amount of space in receive window,
746 * and then do TCP input processing.
747 * Receive window is amount of space in rcv queue,
748 * but not less than advertised window.
749 */
750 {
751 int win;
752 win = sbspace(&so->so_rcv);
753 if (win < 0)
754 win = 0;
755 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
756 }
757
758 switch (tp->t_state)
759 {
760 /*
761 * If the state is LISTEN then ignore segment if it contains an RST.
762 * If the segment contains an ACK then it is bad and send a RST.
763 * If it does not contain a SYN then it is not interesting; drop it.
764 * Don't bother responding if the destination was a broadcast.
765 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
766 * tp->iss, and send a segment:
767 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
768 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
769 * Fill in remote peer address fields if not previously specified.
770 * Enter SYN_RECEIVED state, and process any other fields of this
771 * segment in this state.
772 */
773 case TCPS_LISTEN:
774 {
775 if (tiflags & TH_RST)
776 {
777 LogFlowFunc(("%d -> drop\n", __LINE__));
778 goto drop;
779 }
780 if (tiflags & TH_ACK)
781 {
782 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
783 goto dropwithreset;
784 }
785 if ((tiflags & TH_SYN) == 0)
786 {
787 LogFlowFunc(("%d -> drop\n", __LINE__));
788 goto drop;
789 }
790
791 /*
792 * This has way too many gotos...
793 * But a bit of spaghetti code never hurt anybody :)
794 */
795 if ( (tcp_fconnect(pData, so) == -1)
796 && errno != EINPROGRESS
797 && errno != EWOULDBLOCK)
798 {
799 u_char code = ICMP_UNREACH_NET;
800 Log2((" tcp fconnect errno = %d (%s)\n", errno, strerror(errno)));
801 if (errno == ECONNREFUSED)
802 {
803 /* ACK the SYN, send RST to refuse the connection */
804 tcp_respond(pData, tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
805 TH_RST|TH_ACK);
806 }
807 else
808 {
809 if (errno == EHOSTUNREACH)
810 code = ICMP_UNREACH_HOST;
811 HTONL(ti->ti_seq); /* restore tcp header */
812 HTONL(ti->ti_ack);
813 HTONS(ti->ti_win);
814 HTONS(ti->ti_urp);
815 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
816 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
817 *ip = *save_ip;
818 icmp_error(pData, m, ICMP_UNREACH, code, 0, strerror(errno));
819 tp->t_socket->so_m = NULL;
820 }
821 tp = tcp_close(pData, tp);
822 }
823 else
824 {
825 /*
826 * Haven't connected yet, save the current mbuf
827 * and ti, and return
828 * XXX Some OS's don't tell us whether the connect()
829 * succeeded or not. So we must time it out.
830 */
831 so->so_m = m;
832 so->so_ti = ti;
833 so->so_ohdr = RTMemDup(ohdr, ohdrlen);
834 so->so_optp = optp;
835 so->so_optlen = optlen;
836 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
837 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
838 }
839 SOCKET_UNLOCK(so);
840 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
841 LogFlowFuncLeave();
842 return;
843
844cont_conn:
845 /* m==NULL
846 * Check if the connect succeeded
847 */
848 LogFlowFunc(("cont_conn:\n"));
849 if (so->so_state & SS_NOFDREF)
850 {
851 tp = tcp_close(pData, tp);
852 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
853 goto dropwithreset;
854 }
855
856 tcp_template(tp);
857
858 if (optp)
859 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
860
861 if (iss)
862 tp->iss = iss;
863 else
864 tp->iss = tcp_iss;
865 tcp_iss += TCP_ISSINCR/2;
866 tp->irs = ti->ti_seq;
867 tcp_sendseqinit(tp);
868 tcp_rcvseqinit(tp);
869 tp->t_flags |= TF_ACKNOW;
870 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
871 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
872 tcpstat.tcps_accepts++;
873 LogFlowFunc(("%d -> trimthenstep6\n", __LINE__));
874 goto trimthenstep6;
875 } /* case TCPS_LISTEN */
876
877 /*
878 * If the state is SYN_SENT:
879 * if seg contains an ACK, but not for our SYN, drop the input.
880 * if seg contains a RST, then drop the connection.
881 * if seg does not contain SYN, then drop it.
882 * Otherwise this is an acceptable SYN segment
883 * initialize tp->rcv_nxt and tp->irs
884 * if seg contains ack then advance tp->snd_una
885 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
886 * arrange for segment to be acked (eventually)
887 * continue processing rest of data/controls, beginning with URG
888 */
889 case TCPS_SYN_SENT:
890 if ( (tiflags & TH_ACK)
891 && ( SEQ_LEQ(ti->ti_ack, tp->iss)
892 || SEQ_GT(ti->ti_ack, tp->snd_max)))
893 {
894 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
895 goto dropwithreset;
896 }
897
898 if (tiflags & TH_RST)
899 {
900 if (tiflags & TH_ACK)
901 tp = tcp_drop(pData, tp, 0); /* XXX Check t_softerror! */
902 LogFlowFunc(("%d -> drop\n", __LINE__));
903 goto drop;
904 }
905
906 if ((tiflags & TH_SYN) == 0)
907 {
908 LogFlowFunc(("%d -> drop\n", __LINE__));
909 goto drop;
910 }
911 if (tiflags & TH_ACK)
912 {
913 tp->snd_una = ti->ti_ack;
914 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
915 tp->snd_nxt = tp->snd_una;
916 }
917
918 tp->t_timer[TCPT_REXMT] = 0;
919 tp->irs = ti->ti_seq;
920 tcp_rcvseqinit(tp);
921 tp->t_flags |= TF_ACKNOW;
922 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss))
923 {
924 tcpstat.tcps_connects++;
925 soisfconnected(so);
926 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
927
928 /* Do window scaling on this connection? */
929#if 0
930 if (( tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
931 == (TF_RCVD_SCALE|TF_REQ_SCALE))
932 {
933 tp->snd_scale = tp->requested_s_scale;
934 tp->rcv_scale = tp->request_r_scale;
935 }
936#endif
937 (void) tcp_reass(pData, tp, (struct tcphdr *)0, NULL, (struct mbuf *)0);
938 /*
939 * if we didn't have to retransmit the SYN,
940 * use its rtt as our initial srtt & rtt var.
941 */
942 if (tp->t_rtt)
943 tcp_xmit_timer(pData, tp, tp->t_rtt);
944 }
945 else
946 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
947
948trimthenstep6:
949 LogFlowFunc(("trimthenstep6:\n"));
950 /*
951 * Advance ti->ti_seq to correspond to first data byte.
952 * If data, trim to stay within window,
953 * dropping FIN if necessary.
954 */
955 ti->ti_seq++;
956 if (ti->ti_len > tp->rcv_wnd)
957 {
958 todrop = ti->ti_len - tp->rcv_wnd;
959 m_adj(m, -todrop);
960 ti->ti_len = tp->rcv_wnd;
961 tiflags &= ~TH_FIN;
962 tcpstat.tcps_rcvpackafterwin++;
963 tcpstat.tcps_rcvbyteafterwin += todrop;
964 }
965 tp->snd_wl1 = ti->ti_seq - 1;
966 tp->rcv_up = ti->ti_seq;
967 LogFlowFunc(("%d -> step6\n", __LINE__));
968 goto step6;
969 } /* switch tp->t_state */
970 /*
971 * States other than LISTEN or SYN_SENT.
972 * First check timestamp, if present.
973 * Then check that at least some bytes of segment are within
974 * receive window. If segment begins before rcv_nxt,
975 * drop leading data (and SYN); if nothing left, just ack.
976 *
977 * RFC 1323 PAWS: If we have a timestamp reply on this segment
978 * and it's less than ts_recent, drop it.
979 */
980#if 0
981 if ( ts_present
982 && (tiflags & TH_RST) == 0
983 && tp->ts_recent
984 && TSTMP_LT(ts_val, tp->ts_recent))
985 {
986 /* Check to see if ts_recent is over 24 days old. */
987 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE)
988 {
989 /*
990 * Invalidate ts_recent. If this segment updates
991 * ts_recent, the age will be reset later and ts_recent
992 * will get a valid value. If it does not, setting
993 * ts_recent to zero will at least satisfy the
994 * requirement that zero be placed in the timestamp
995 * echo reply when ts_recent isn't valid. The
996 * age isn't reset until we get a valid ts_recent
997 * because we don't want out-of-order segments to be
998 * dropped when ts_recent is old.
999 */
1000 tp->ts_recent = 0;
1001 }
1002 else
1003 {
1004 tcpstat.tcps_rcvduppack++;
1005 tcpstat.tcps_rcvdupbyte += ti->ti_len;
1006 tcpstat.tcps_pawsdrop++;
1007 goto dropafterack;
1008 }
1009 }
1010#endif
1011
1012 todrop = tp->rcv_nxt - ti->ti_seq;
1013 if (todrop > 0)
1014 {
1015 if (tiflags & TH_SYN)
1016 {
1017 tiflags &= ~TH_SYN;
1018 ti->ti_seq++;
1019 if (ti->ti_urp > 1)
1020 ti->ti_urp--;
1021 else
1022 tiflags &= ~TH_URG;
1023 todrop--;
1024 }
1025 /*
1026 * Following if statement from Stevens, vol. 2, p. 960.
1027 */
1028 if ( todrop > ti->ti_len
1029 || ( todrop == ti->ti_len
1030 && (tiflags & TH_FIN) == 0))
1031 {
1032 /*
1033 * Any valid FIN must be to the left of the window.
1034 * At this point the FIN must be a duplicate or out
1035 * of sequence; drop it.
1036 */
1037 tiflags &= ~TH_FIN;
1038
1039 /*
1040 * Send an ACK to resynchronize and drop any data.
1041 * But keep on processing for RST or ACK.
1042 */
1043 tp->t_flags |= TF_ACKNOW;
1044 todrop = ti->ti_len;
1045 tcpstat.tcps_rcvduppack++;
1046 tcpstat.tcps_rcvdupbyte += todrop;
1047 }
1048 else
1049 {
1050 tcpstat.tcps_rcvpartduppack++;
1051 tcpstat.tcps_rcvpartdupbyte += todrop;
1052 }
1053 m_adj(m, todrop);
1054 ti->ti_seq += todrop;
1055 ti->ti_len -= todrop;
1056 if (ti->ti_urp > todrop)
1057 ti->ti_urp -= todrop;
1058 else
1059 {
1060 tiflags &= ~TH_URG;
1061 ti->ti_urp = 0;
1062 }
1063 }
1064 /*
1065 * If new data are received on a connection after the
1066 * user processes are gone, then RST the other end.
1067 */
1068 if ( (so->so_state & SS_NOFDREF)
1069 && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len)
1070 {
1071 tp = tcp_close(pData, tp);
1072 tcpstat.tcps_rcvafterclose++;
1073 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1074 goto dropwithreset;
1075 }
1076
1077 /*
1078 * If segment ends after window, drop trailing data
1079 * (and PUSH and FIN); if nothing left, just ACK.
1080 */
1081 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1082 if (todrop > 0)
1083 {
1084 tcpstat.tcps_rcvpackafterwin++;
1085 if (todrop >= ti->ti_len)
1086 {
1087 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1088 /*
1089 * If a new connection request is received
1090 * while in TIME_WAIT, drop the old connection
1091 * and start over if the sequence numbers
1092 * are above the previous ones.
1093 */
1094 if ( tiflags & TH_SYN
1095 && tp->t_state == TCPS_TIME_WAIT
1096 && SEQ_GT(ti->ti_seq, tp->rcv_nxt))
1097 {
1098 iss = tp->rcv_nxt + TCP_ISSINCR;
1099 tp = tcp_close(pData, tp);
1100 SOCKET_UNLOCK(tp->t_socket);
1101 LogFlowFunc(("%d -> findso\n", __LINE__));
1102 goto findso;
1103 }
1104 /*
1105 * If window is closed can only take segments at
1106 * window edge, and have to drop data and PUSH from
1107 * incoming segments. Continue processing, but
1108 * remember to ack. Otherwise, drop segment
1109 * and ack.
1110 */
1111 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt)
1112 {
1113 tp->t_flags |= TF_ACKNOW;
1114 tcpstat.tcps_rcvwinprobe++;
1115 }
1116 else
1117 {
1118 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1119 goto dropafterack;
1120 }
1121 }
1122 else
1123 tcpstat.tcps_rcvbyteafterwin += todrop;
1124 m_adj(m, -todrop);
1125 ti->ti_len -= todrop;
1126 tiflags &= ~(TH_PUSH|TH_FIN);
1127 }
1128
1129 /*
1130 * If last ACK falls within this segment's sequence numbers,
1131 * record its timestamp.
1132 */
1133#if 0
1134 if ( ts_present
1135 && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)
1136 && SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1137 {
1138 tp->ts_recent_age = tcp_now;
1139 tp->ts_recent = ts_val;
1140 }
1141#endif
1142
1143 /*
1144 * If the RST bit is set examine the state:
1145 * SYN_RECEIVED STATE:
1146 * If passive open, return to LISTEN state.
1147 * If active open, inform user that connection was refused.
1148 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1149 * Inform user that connection was reset, and close tcb.
1150 * CLOSING, LAST_ACK, TIME_WAIT STATES
1151 * Close the tcb.
1152 */
1153 if (tiflags&TH_RST)
1154 switch (tp->t_state)
1155 {
1156 case TCPS_SYN_RECEIVED:
1157/* so->so_error = ECONNREFUSED; */
1158 LogFlowFunc(("%d -> close\n", __LINE__));
1159 goto close;
1160
1161 case TCPS_ESTABLISHED:
1162 case TCPS_FIN_WAIT_1:
1163 case TCPS_FIN_WAIT_2:
1164 case TCPS_CLOSE_WAIT:
1165/* so->so_error = ECONNRESET; */
1166close:
1167 LogFlowFunc(("close:\n"));
1168 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSED);
1169 tcpstat.tcps_drops++;
1170 tp = tcp_close(pData, tp);
1171 LogFlowFunc(("%d -> drop\n", __LINE__));
1172 goto drop;
1173
1174 case TCPS_CLOSING:
1175 case TCPS_LAST_ACK:
1176 case TCPS_TIME_WAIT:
1177 tp = tcp_close(pData, tp);
1178 LogFlowFunc(("%d -> drop\n", __LINE__));
1179 goto drop;
1180 }
1181
1182 /*
1183 * If a SYN is in the window, then this is an
1184 * error and we send an RST and drop the connection.
1185 */
1186 if (tiflags & TH_SYN)
1187 {
1188 tp = tcp_drop(pData, tp, 0);
1189 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1190 goto dropwithreset;
1191 }
1192
1193 /*
1194 * If the ACK bit is off we drop the segment and return.
1195 */
1196 if ((tiflags & TH_ACK) == 0)
1197 {
1198 LogFlowFunc(("%d -> drop\n", __LINE__));
1199 goto drop;
1200 }
1201
1202 /*
1203 * Ack processing.
1204 */
1205 switch (tp->t_state)
1206 {
1207 /*
1208 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1209 * ESTABLISHED state and continue processing, otherwise
1210 * send an RST. una<=ack<=max
1211 */
1212 case TCPS_SYN_RECEIVED:
1213 LogFlowFunc(("%d -> TCPS_SYN_RECEIVED\n", __LINE__));
1214 if ( SEQ_GT(tp->snd_una, ti->ti_ack)
1215 || SEQ_GT(ti->ti_ack, tp->snd_max))
1216 goto dropwithreset;
1217 tcpstat.tcps_connects++;
1218 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
1219 /*
1220 * The sent SYN is ack'ed with our sequence number +1
1221 * The first data byte already in the buffer will get
1222 * lost if no correction is made. This is only needed for
1223 * SS_CTL since the buffer is empty otherwise.
1224 * tp->snd_una++; or:
1225 */
1226 tp->snd_una = ti->ti_ack;
1227 soisfconnected(so);
1228
1229 /* Do window scaling? */
1230#if 0
1231 if ( (tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
1232 == (TF_RCVD_SCALE|TF_REQ_SCALE))
1233 {
1234 tp->snd_scale = tp->requested_s_scale;
1235 tp->rcv_scale = tp->request_r_scale;
1236 }
1237#endif
1238 (void) tcp_reass(pData, tp, (struct tcphdr *)0, (int *)0, (struct mbuf *)0);
1239 tp->snd_wl1 = ti->ti_seq - 1;
1240 /* Avoid ack processing; snd_una==ti_ack => dup ack */
1241 LogFlowFunc(("%d -> synrx_to_est\n", __LINE__));
1242 goto synrx_to_est;
1243 /* fall into ... */
1244
1245 /*
1246 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1247 * ACKs. If the ack is in the range
1248 * tp->snd_una < ti->ti_ack <= tp->snd_max
1249 * then advance tp->snd_una to ti->ti_ack and drop
1250 * data from the retransmission queue. If this ACK reflects
1251 * more up to date window information we update our window information.
1252 */
1253 case TCPS_ESTABLISHED:
1254 case TCPS_FIN_WAIT_1:
1255 case TCPS_FIN_WAIT_2:
1256 case TCPS_CLOSE_WAIT:
1257 case TCPS_CLOSING:
1258 case TCPS_LAST_ACK:
1259 case TCPS_TIME_WAIT:
1260 LogFlowFunc(("%d -> TCPS_ESTABLISHED|TCPS_FIN_WAIT_1|TCPS_FIN_WAIT_2|TCPS_CLOSE_WAIT|"
1261 "TCPS_CLOSING|TCPS_LAST_ACK|TCPS_TIME_WAIT\n", __LINE__));
1262 if (SEQ_LEQ(ti->ti_ack, tp->snd_una))
1263 {
1264 if (ti->ti_len == 0 && tiwin == tp->snd_wnd)
1265 {
1266 tcpstat.tcps_rcvdupack++;
1267 Log2((" dup ack m = %p, so = %p\n", m, so));
1268 /*
1269 * If we have outstanding data (other than
1270 * a window probe), this is a completely
1271 * duplicate ack (ie, window info didn't
1272 * change), the ack is the biggest we've
1273 * seen and we've seen exactly our rexmt
1274 * threshold of them, assume a packet
1275 * has been dropped and retransmit it.
1276 * Kludge snd_nxt & the congestion
1277 * window so we send only this one
1278 * packet.
1279 *
1280 * We know we're losing at the current
1281 * window size so do congestion avoidance
1282 * (set ssthresh to half the current window
1283 * and pull our congestion window back to
1284 * the new ssthresh).
1285 *
1286 * Dup acks mean that packets have left the
1287 * network (they're now cached at the receiver)
1288 * so bump cwnd by the amount in the receiver
1289 * to keep a constant cwnd packets in the
1290 * network.
1291 */
1292 if ( tp->t_timer[TCPT_REXMT] == 0
1293 || ti->ti_ack != tp->snd_una)
1294 tp->t_dupacks = 0;
1295 else if (++tp->t_dupacks == tcprexmtthresh)
1296 {
1297 tcp_seq onxt = tp->snd_nxt;
1298 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
1299 if (win < 2)
1300 win = 2;
1301 tp->snd_ssthresh = win * tp->t_maxseg;
1302 tp->t_timer[TCPT_REXMT] = 0;
1303 tp->t_rtt = 0;
1304 tp->snd_nxt = ti->ti_ack;
1305 tp->snd_cwnd = tp->t_maxseg;
1306 (void) tcp_output(pData, tp);
1307 tp->snd_cwnd = tp->snd_ssthresh +
1308 tp->t_maxseg * tp->t_dupacks;
1309 if (SEQ_GT(onxt, tp->snd_nxt))
1310 tp->snd_nxt = onxt;
1311 LogFlowFunc(("%d -> drop\n", __LINE__));
1312 goto drop;
1313 }
1314 else if (tp->t_dupacks > tcprexmtthresh)
1315 {
1316 tp->snd_cwnd += tp->t_maxseg;
1317 (void) tcp_output(pData, tp);
1318 LogFlowFunc(("%d -> drop\n", __LINE__));
1319 goto drop;
1320 }
1321 }
1322 else
1323 tp->t_dupacks = 0;
1324 break;
1325 }
1326synrx_to_est:
1327 LogFlowFunc(("synrx_to_est:\n"));
1328 /*
1329 * If the congestion window was inflated to account
1330 * for the other side's cached packets, retract it.
1331 */
1332 if ( tp->t_dupacks > tcprexmtthresh
1333 && tp->snd_cwnd > tp->snd_ssthresh)
1334 tp->snd_cwnd = tp->snd_ssthresh;
1335 tp->t_dupacks = 0;
1336 if (SEQ_GT(ti->ti_ack, tp->snd_max))
1337 {
1338 tcpstat.tcps_rcvacktoomuch++;
1339 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1340 goto dropafterack;
1341 }
1342 acked = ti->ti_ack - tp->snd_una;
1343 tcpstat.tcps_rcvackpack++;
1344 tcpstat.tcps_rcvackbyte += acked;
1345
1346 /*
1347 * If we have a timestamp reply, update smoothed
1348 * round trip time. If no timestamp is present but
1349 * transmit timer is running and timed sequence
1350 * number was acked, update smoothed round trip time.
1351 * Since we now have an rtt measurement, cancel the
1352 * timer backoff (cf., Phil Karn's retransmit alg.).
1353 * Recompute the initial retransmit timer.
1354 */
1355#if 0
1356 if (ts_present)
1357 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1358 else
1359#endif
1360 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1361 tcp_xmit_timer(pData, tp, tp->t_rtt);
1362
1363 /*
1364 * If all outstanding data is acked, stop retransmit
1365 * timer and remember to restart (more output or persist).
1366 * If there is more data to be acked, restart retransmit
1367 * timer, using current (possibly backed-off) value.
1368 */
1369 if (ti->ti_ack == tp->snd_max)
1370 {
1371 tp->t_timer[TCPT_REXMT] = 0;
1372 needoutput = 1;
1373 }
1374 else if (tp->t_timer[TCPT_PERSIST] == 0)
1375 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1376 /*
1377 * When new data is acked, open the congestion window.
1378 * If the window gives us less than ssthresh packets
1379 * in flight, open exponentially (maxseg per packet).
1380 * Otherwise open linearly: maxseg per window
1381 * (maxseg^2 / cwnd per packet).
1382 */
1383 {
1384 register u_int cw = tp->snd_cwnd;
1385 register u_int incr = tp->t_maxseg;
1386
1387 if (cw > tp->snd_ssthresh)
1388 incr = incr * incr / cw;
1389 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1390 }
1391 if (acked > SBUF_LEN(&so->so_snd))
1392 {
1393 tp->snd_wnd -= SBUF_LEN(&so->so_snd);
1394 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1395 ourfinisacked = 1;
1396 }
1397 else
1398 {
1399 sbdrop(&so->so_snd, acked);
1400 tp->snd_wnd -= acked;
1401 ourfinisacked = 0;
1402 }
1403 /*
1404 * XXX sowwakup is called when data is acked and there's room for
1405 * for more data... it should read() the socket
1406 */
1407#if 0
1408 if (so->so_snd.sb_flags & SB_NOTIFY)
1409 sowwakeup(so);
1410#endif
1411 tp->snd_una = ti->ti_ack;
1412 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1413 tp->snd_nxt = tp->snd_una;
1414
1415 switch (tp->t_state)
1416 {
1417 /*
1418 * In FIN_WAIT_1 STATE in addition to the processing
1419 * for the ESTABLISHED state if our FIN is now acknowledged
1420 * then enter FIN_WAIT_2.
1421 */
1422 case TCPS_FIN_WAIT_1:
1423 if (ourfinisacked)
1424 {
1425 /*
1426 * If we can't receive any more
1427 * data, then closing user can proceed.
1428 * Starting the timer is contrary to the
1429 * specification, but if we don't get a FIN
1430 * we'll hang forever.
1431 */
1432 if (so->so_state & SS_FCANTRCVMORE)
1433 {
1434 soisfdisconnected(so);
1435 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1436 }
1437 TCP_STATE_SWITCH_TO(tp, TCPS_FIN_WAIT_2);
1438 }
1439 break;
1440
1441 /*
1442 * In CLOSING STATE in addition to the processing for
1443 * the ESTABLISHED state if the ACK acknowledges our FIN
1444 * then enter the TIME-WAIT state, otherwise ignore
1445 * the segment.
1446 */
1447 case TCPS_CLOSING:
1448 if (ourfinisacked)
1449 {
1450 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1451 tcp_canceltimers(tp);
1452 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1453 soisfdisconnected(so);
1454 }
1455 break;
1456
1457 /*
1458 * In LAST_ACK, we may still be waiting for data to drain
1459 * and/or to be acked, as well as for the ack of our FIN.
1460 * If our FIN is now acknowledged, delete the TCB,
1461 * enter the closed state and return.
1462 */
1463 case TCPS_LAST_ACK:
1464 if (ourfinisacked)
1465 {
1466 tp = tcp_close(pData, tp);
1467 LogFlowFunc(("%d -> drop\n", __LINE__));
1468 goto drop;
1469 }
1470 break;
1471
1472 /*
1473 * In TIME_WAIT state the only thing that should arrive
1474 * is a retransmission of the remote FIN. Acknowledge
1475 * it and restart the finack timer.
1476 */
1477 case TCPS_TIME_WAIT:
1478 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1479 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1480 goto dropafterack;
1481 }
1482 } /* switch(tp->t_state) */
1483
1484step6:
1485 LogFlowFunc(("step6:\n"));
1486 /*
1487 * Update window information.
1488 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1489 */
1490 if ( (tiflags & TH_ACK)
1491 && ( SEQ_LT(tp->snd_wl1, ti->ti_seq)
1492 || ( tp->snd_wl1 == ti->ti_seq
1493 && ( SEQ_LT(tp->snd_wl2, ti->ti_ack)
1494 || ( tp->snd_wl2 == ti->ti_ack
1495 && tiwin > tp->snd_wnd)))))
1496 {
1497 /* keep track of pure window updates */
1498 if ( ti->ti_len == 0
1499 && tp->snd_wl2 == ti->ti_ack
1500 && tiwin > tp->snd_wnd)
1501 tcpstat.tcps_rcvwinupd++;
1502 tp->snd_wnd = tiwin;
1503 tp->snd_wl1 = ti->ti_seq;
1504 tp->snd_wl2 = ti->ti_ack;
1505 if (tp->snd_wnd > tp->max_sndwnd)
1506 tp->max_sndwnd = tp->snd_wnd;
1507 needoutput = 1;
1508 }
1509
1510 /*
1511 * Process segments with URG.
1512 */
1513 if ((tiflags & TH_URG) && ti->ti_urp &&
1514 TCPS_HAVERCVDFIN(tp->t_state) == 0)
1515 {
1516 /*
1517 * This is a kludge, but if we receive and accept
1518 * random urgent pointers, we'll crash in
1519 * soreceive. It's hard to imagine someone
1520 * actually wanting to send this much urgent data.
1521 */
1522 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen)
1523 {
1524 ti->ti_urp = 0;
1525 tiflags &= ~TH_URG;
1526 LogFlowFunc(("%d -> dodata\n", __LINE__));
1527 goto dodata;
1528 }
1529
1530 /*
1531 * If this segment advances the known urgent pointer,
1532 * then mark the data stream. This should not happen
1533 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1534 * a FIN has been received from the remote side.
1535 * In these states we ignore the URG.
1536 *
1537 * According to RFC961 (Assigned Protocols),
1538 * the urgent pointer points to the last octet
1539 * of urgent data. We continue, however,
1540 * to consider it to indicate the first octet
1541 * of data past the urgent section as the original
1542 * spec states (in one of two places).
1543 */
1544 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up))
1545 {
1546 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1547 so->so_urgc = SBUF_LEN(&so->so_rcv) +
1548 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1549 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1550 }
1551 }
1552 else
1553 /*
1554 * If no out of band data is expected,
1555 * pull receive urgent pointer along
1556 * with the receive window.
1557 */
1558 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1559 tp->rcv_up = tp->rcv_nxt;
1560dodata:
1561 LogFlowFunc(("dodata:\n"));
1562
1563 /*
1564 * If this is a small packet, then ACK now - with Nagel
1565 * congestion avoidance sender won't send more until
1566 * he gets an ACK.
1567 *
1568 * XXX: In case you wonder... The magic "27" below is ESC that
1569 * presumably starts a terminal escape-sequence and that we want
1570 * to ACK ASAP. [Original slirp code had three different
1571 * heuristics to chose from here and in the header prediction case
1572 * above, but the commented out alternatives were lost and the
1573 * header prediction case that had an expanded comment about this
1574 * has been modified to always send an ACK].
1575 */
1576 if ( ti->ti_len
1577 && (unsigned)ti->ti_len <= 5
1578 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27)
1579 {
1580 tp->t_flags |= TF_ACKNOW;
1581 }
1582
1583 /*
1584 * Process the segment text, merging it into the TCP sequencing queue,
1585 * and arranging for acknowledgment of receipt if necessary.
1586 * This process logically involves adjusting tp->rcv_wnd as data
1587 * is presented to the user (this happens in tcp_usrreq.c,
1588 * case PRU_RCVD). If a FIN has already been received on this
1589 * connection then we just ignore the text.
1590 */
1591 if ( (ti->ti_len || (tiflags&TH_FIN))
1592 && TCPS_HAVERCVDFIN(tp->t_state) == 0)
1593 {
1594 if ( ti->ti_seq == tp->rcv_nxt
1595 && LIST_EMPTY(&tp->t_segq)
1596 && tp->t_state == TCPS_ESTABLISHED)
1597 {
1598 DELAY_ACK(tp, ti); /* little bit different from BSD declaration see netinet/tcp_input.c */
1599 tp->rcv_nxt += tlen;
1600 tiflags = ti->ti_t.th_flags & TH_FIN;
1601 tcpstat.tcps_rcvpack++;
1602 tcpstat.tcps_rcvbyte += tlen;
1603 if (so->so_state & SS_FCANTRCVMORE)
1604 m_freem(pData, m);
1605 else
1606 sbappend(pData, so, m);
1607 }
1608 else
1609 {
1610 tiflags = tcp_reass(pData, tp, &ti->ti_t, &tlen, m);
1611 tp->t_flags |= TF_ACKNOW;
1612 }
1613 /*
1614 * Note the amount of data that peer has sent into
1615 * our window, in order to estimate the sender's
1616 * buffer size.
1617 */
1618 len = SBUF_SIZE(&so->so_rcv) - (tp->rcv_adv - tp->rcv_nxt);
1619 }
1620 else
1621 {
1622 m_freem(pData, m);
1623 tiflags &= ~TH_FIN;
1624 }
1625
1626 /*
1627 * If FIN is received ACK the FIN and let the user know
1628 * that the connection is closing.
1629 */
1630 if (tiflags & TH_FIN)
1631 {
1632 if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
1633 {
1634 /*
1635 * If we receive a FIN we can't send more data,
1636 * set it SS_FDRAIN
1637 * Shutdown the socket if there is no rx data in the
1638 * buffer.
1639 * soread() is called on completion of shutdown() and
1640 * will got to TCPS_LAST_ACK, and use tcp_output()
1641 * to send the FIN.
1642 */
1643/* sofcantrcvmore(so); */
1644 sofwdrain(so);
1645
1646 tp->t_flags |= TF_ACKNOW;
1647 tp->rcv_nxt++;
1648 }
1649 switch (tp->t_state)
1650 {
1651 /*
1652 * In SYN_RECEIVED and ESTABLISHED STATES
1653 * enter the CLOSE_WAIT state.
1654 */
1655 case TCPS_SYN_RECEIVED:
1656 case TCPS_ESTABLISHED:
1657 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSE_WAIT);
1658 break;
1659
1660 /*
1661 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1662 * enter the CLOSING state.
1663 */
1664 case TCPS_FIN_WAIT_1:
1665 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSING);
1666 break;
1667
1668 /*
1669 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1670 * starting the time-wait timer, turning off the other
1671 * standard timers.
1672 */
1673 case TCPS_FIN_WAIT_2:
1674 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1675 tcp_canceltimers(tp);
1676 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1677 soisfdisconnected(so);
1678 break;
1679
1680 /*
1681 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1682 */
1683 case TCPS_TIME_WAIT:
1684 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1685 break;
1686 }
1687 }
1688
1689 /*
1690 * Return any desired output.
1691 */
1692 if (needoutput || (tp->t_flags & TF_ACKNOW))
1693 tcp_output(pData, tp);
1694
1695 SOCKET_UNLOCK(so);
1696 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1697 LogFlowFuncLeave();
1698 return;
1699
1700dropafterack:
1701 LogFlowFunc(("dropafterack:\n"));
1702 /*
1703 * Generate an ACK dropping incoming segment if it occupies
1704 * sequence space, where the ACK reflects our state.
1705 */
1706 if (tiflags & TH_RST)
1707 {
1708 LogFlowFunc(("%d -> drop\n", __LINE__));
1709 goto drop;
1710 }
1711 m_freem(pData, m);
1712 tp->t_flags |= TF_ACKNOW;
1713 (void) tcp_output(pData, tp);
1714 SOCKET_UNLOCK(so);
1715 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1716 LogFlowFuncLeave();
1717 return;
1718
1719dropwithreset:
1720 LogFlowFunc(("dropwithreset:\n"));
1721 /* reuses m if m!=NULL, m_free() unnecessary */
1722 if (tiflags & TH_ACK)
1723 tcp_respond(pData, tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1724 else
1725 {
1726 if (tiflags & TH_SYN)
1727 ti->ti_len++;
1728 tcp_respond(pData, tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1729 TH_RST|TH_ACK);
1730 }
1731
1732 if (so != &tcb)
1733 SOCKET_UNLOCK(so);
1734 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1735 LogFlowFuncLeave();
1736 return;
1737
1738drop:
1739 LogFlowFunc(("drop:\n"));
1740 /*
1741 * Drop space held by incoming segment and return.
1742 */
1743 m_freem(pData, m);
1744
1745#ifdef VBOX_WITH_SLIRP_MT
1746 if (RTCritSectIsOwned(&so->so_mutex))
1747 {
1748 SOCKET_UNLOCK(so);
1749 }
1750#endif
1751
1752 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1753 LogFlowFuncLeave();
1754 return;
1755}
1756
1757
1758void
1759tcp_fconnect_failed(PNATState pData, struct socket *so, int sockerr)
1760{
1761 struct tcpcb *tp;
1762 int code;
1763
1764 Log2(("NAT: connect error %d %R[natsock]\n", sockerr, so));
1765
1766 Assert(so->so_state & SS_ISFCONNECTING);
1767 so->so_state = SS_NOFDREF;
1768
1769 if (sockerr == ECONNREFUSED || sockerr == ECONNRESET)
1770 {
1771 /* hand off to tcp_input():cont_conn to send RST */
1772 TCP_INPUT(pData, NULL, 0, so);
1773 return;
1774 }
1775
1776 tp = sototcpcb(so);
1777 if (RT_UNLIKELY(tp == NULL)) /* should never happen */
1778 {
1779 LogRel(("NAT: tp == NULL %R[natsock]\n", so));
1780 sofree(pData, so);
1781 return;
1782 }
1783
1784 if (sockerr == ENETUNREACH || sockerr == ENETDOWN)
1785 code = ICMP_UNREACH_NET;
1786 else if (sockerr == EHOSTUNREACH || sockerr == EHOSTDOWN)
1787 code = ICMP_UNREACH_HOST;
1788 else
1789 code = -1;
1790
1791 if (code >= 0)
1792 {
1793 struct ip *oip;
1794 unsigned ohdrlen;
1795 struct mbuf *m;
1796
1797 if (RT_UNLIKELY(so->so_ohdr == NULL))
1798 goto out;
1799
1800 oip = (struct ip *)so->so_ohdr;
1801 ohdrlen = oip->ip_hl * 4 + 8;
1802
1803 m = m_gethdr(pData, M_NOWAIT, MT_HEADER);
1804 if (RT_UNLIKELY(m == NULL))
1805 goto out;
1806
1807 m_copyback(pData, m, 0, ohdrlen, (caddr_t)so->so_ohdr);
1808 m->m_pkthdr.header = mtod(m, void *);
1809
1810 icmp_error(pData, m, ICMP_UNREACH, code, 0, NULL);
1811 }
1812
1813 out:
1814 tcp_close(pData, tp);
1815}
1816
1817
1818void
1819tcp_dooptions(PNATState pData, struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1820{
1821 u_int16_t mss;
1822 int opt, optlen;
1823
1824 LogFlowFunc(("tcp_dooptions: tp = %R[tcpcb793], cnt=%i\n", tp, cnt));
1825
1826 for (; cnt > 0; cnt -= optlen, cp += optlen)
1827 {
1828 opt = cp[0];
1829 if (opt == TCPOPT_EOL)
1830 break;
1831 if (opt == TCPOPT_NOP)
1832 optlen = 1;
1833 else
1834 {
1835 optlen = cp[1];
1836 if (optlen <= 0)
1837 break;
1838 }
1839 switch (opt)
1840 {
1841 default:
1842 continue;
1843
1844 case TCPOPT_MAXSEG:
1845 if (optlen != TCPOLEN_MAXSEG)
1846 continue;
1847 if (!(ti->ti_flags & TH_SYN))
1848 continue;
1849 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1850 NTOHS(mss);
1851 (void) tcp_mss(pData, tp, mss); /* sets t_maxseg */
1852 break;
1853
1854#if 0
1855 case TCPOPT_WINDOW:
1856 if (optlen != TCPOLEN_WINDOW)
1857 continue;
1858 if (!(ti->ti_flags & TH_SYN))
1859 continue;
1860 tp->t_flags |= TF_RCVD_SCALE;
1861 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1862 break;
1863
1864 case TCPOPT_TIMESTAMP:
1865 if (optlen != TCPOLEN_TIMESTAMP)
1866 continue;
1867 *ts_present = 1;
1868 memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1869 NTOHL(*ts_val);
1870 memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1871 NTOHL(*ts_ecr);
1872
1873 /*
1874 * A timestamp received in a SYN makes
1875 * it ok to send timestamp requests and replies.
1876 */
1877 if (ti->ti_flags & TH_SYN)
1878 {
1879 tp->t_flags |= TF_RCVD_TSTMP;
1880 tp->ts_recent = *ts_val;
1881 tp->ts_recent_age = tcp_now;
1882 }
1883 break;
1884#endif
1885 }
1886 }
1887}
1888
1889
1890/*
1891 * Pull out of band byte out of a segment so
1892 * it doesn't appear in the user's data queue.
1893 * It is still reflected in the segment length for
1894 * sequencing purposes.
1895 */
1896
#if 0
/*
 * Disabled code, kept for reference.
 *
 * Locates the urgent byte addressed by ti->ti_urp, stores it in the
 * control block (t_iobc / TCPOOB_HAVEDATA) and removes it from the mbuf
 * by shifting the remaining bytes down by one.
 */
void
tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m)
{
    int cnt = ti->ti_urp - 1;

    while (cnt >= 0)
    {
        if (m->m_len > cnt)
        {
            char *cp = mtod(m, caddr_t) + cnt;
            struct tcpcb *tp = sototcpcb(so);

            tp->t_iobc = *cp;
            tp->t_oobflags |= TCPOOB_HAVEDATA;
            /*
             * Close the gap left by the urgent byte.  Source and
             * destination overlap, hence memmove rather than memcpy.
             */
            memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1));
            m->m_len--;
            return;
        }
        cnt -= m->m_len;
        m = m->m_next; /* XXX WRONG! Fix it! */
        if (m == 0)
            break;
    }
    panic("tcp_pulloutofband");
}
#endif
1924
1925/*
1926 * Collect new round-trip time estimate
1927 * and update averages and current timeout.
1928 */
1929
1930void
1931tcp_xmit_timer(PNATState pData, register struct tcpcb *tp, int rtt)
1932{
1933 register short delta;
1934
1935 LogFlowFunc(("ENTER: tcp_xmit_timer: tp = %R[tcpcb793] rtt = %d\n", tp, rtt));
1936
1937 tcpstat.tcps_rttupdated++;
1938 if (tp->t_srtt != 0)
1939 {
1940 /*
1941 * srtt is stored as fixed point with 3 bits after the
1942 * binary point (i.e., scaled by 8). The following magic
1943 * is equivalent to the smoothing algorithm in rfc793 with
1944 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1945 * point). Adjust rtt to origin 0.
1946 */
1947 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1948 if ((tp->t_srtt += delta) <= 0)
1949 tp->t_srtt = 1;
1950 /*
1951 * We accumulate a smoothed rtt variance (actually, a
1952 * smoothed mean difference), then set the retransmit
1953 * timer to smoothed rtt + 4 times the smoothed variance.
1954 * rttvar is stored as fixed point with 2 bits after the
1955 * binary point (scaled by 4). The following is
1956 * equivalent to rfc793 smoothing with an alpha of .75
1957 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1958 * rfc793's wired-in beta.
1959 */
1960 if (delta < 0)
1961 delta = -delta;
1962 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1963 if ((tp->t_rttvar += delta) <= 0)
1964 tp->t_rttvar = 1;
1965 }
1966 else
1967 {
1968 /*
1969 * No rtt measurement yet - use the unsmoothed rtt.
1970 * Set the variance to half the rtt (so our first
1971 * retransmit happens at 3*rtt).
1972 */
1973 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1974 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1975 }
1976 tp->t_rtt = 0;
1977 tp->t_rxtshift = 0;
1978
1979 /*
1980 * the retransmit should happen at rtt + 4 * rttvar.
1981 * Because of the way we do the smoothing, srtt and rttvar
1982 * will each average +1/2 tick of bias. When we compute
1983 * the retransmit timer, we want 1/2 tick of rounding and
1984 * 1 extra tick because of +-1/2 tick uncertainty in the
1985 * firing of the timer. The bias will give us exactly the
1986 * 1.5 tick we need. But, because the bias is
1987 * statistical, we have to test that we don't drop below
1988 * the minimum feasible timer (which is 2 ticks).
1989 */
1990 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1991 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
1992
1993 /*
1994 * We received an ack for a packet that wasn't retransmitted;
1995 * it is probably safe to discard any error indications we've
1996 * received recently. This isn't quite right, but close enough
1997 * for now (a route might have failed after we sent a segment,
1998 * and the return path might not be symmetrical).
1999 */
2000 tp->t_softerror = 0;
2001}
2002
2003/*
2004 * Determine a reasonable value for maxseg size.
2005 * If the route is known, check route for mtu.
2006 * If none, use an mss that can be handled on the outgoing
2007 * interface without forcing IP to fragment; if bigger than
2008 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2009 * to utilize large mbufs. If no route is found, route has no mtu,
2010 * or the destination isn't local, use a default, hopefully conservative
2011 * size (usually 512 or the default IP max size, but no more than the mtu
2012 * of the interface), as we can't discover anything about intervening
2013 * gateways or networks. We also initialize the congestion/slow start
2014 * window to be a single segment if the destination isn't local.
2015 * While looking at the routing entry, we also initialize other path-dependent
2016 * parameters from pre-set or cached values in the routing entry.
2017 */
2018
2019int
2020tcp_mss(PNATState pData, register struct tcpcb *tp, u_int offer)
2021{
2022 struct socket *so = tp->t_socket;
2023 int mss;
2024
2025 LogFlowFunc(("ENTER: tcp_mss: offer=%u, t_maxseg=%u; tp=%R[natsock]\n",
2026 offer, (unsigned int)tp->t_maxseg, so));
2027
2028 mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
2029 if (offer)
2030 mss = min(mss, offer);
2031 mss = max(mss, 32);
2032 if (mss < tp->t_maxseg || offer != 0)
2033 tp->t_maxseg = mss;
2034
2035 tp->snd_cwnd = mss;
2036
2037 sbreserve(pData, &so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
2038 sbreserve(pData, &so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
2039
2040 LogFlowFunc(("LEAVE: mss=%d\n", mss));
2041 return mss;
2042}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette