VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_input.c@ 93115

Last change on this file since 93115 was 93115, checked in by vboxsync, 3 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 67.1 KB
Line 
1/* $Id: tcp_input.c 93115 2022-01-01 11:31:46Z vboxsync $ */
2/** @file
3 * NAT - TCP input.
4 */
5
6/*
7 * Copyright (C) 2006-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
22 * The Regents of the University of California. All rights reserved.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
53 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
54 */
55
56/*
57 * Changes and additions relating to SLiRP
58 * Copyright (c) 1995 Danny Gasparovski.
59 *
60 * Please read the file COPYRIGHT for the
61 * terms and conditions of the copyright.
62 */
63
64#include <slirp.h>
65#include "ip_icmp.h"
66
67
68#if 0 /* code using these macros is commented out */
/* Age limit for the disabled PAWS check further down, which tests whether
 * ts_recent is over 24 days old. NOTE(review): the unit is presumably
 * slow-timer ticks (PR_SLOWHZ per second) - confirm before re-enabling. */
69# define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
70
71/* for modulo comparisons of timestamps: the difference is cast to int and its sign is tested */
72# define TSTMP_LT(a, b) ((int)((a)-(b)) < 0)
73# define TSTMP_GEQ(a, b) ((int)((a)-(b)) >= 0)
74#endif
75
/*
 * DELAY_ACK(tp, ti) - record on the control block how a received segment
 * is to be acknowledged.
 *
 * Without TCP_ACK_HACK: a segment with TH_PUSH set in ti->ti_flags sets
 * TF_ACKNOW in tp->t_flags, any other segment sets TF_DELACK.
 * With TCP_ACK_HACK: the second argument is ignored and TF_DELACK is set.
 *
 * The expansion is wrapped in do { } while (0) and every argument is
 * parenthesized, so the macro is exactly one statement: it can be used as
 * the body of an unbraced if/else and accepts arbitrary pointer expressions.
 * Invoke it with a trailing semicolon.
 */
#ifndef TCP_ACK_HACK
# define DELAY_ACK(tp, ti) \
    do { \
        if ((ti)->ti_flags & TH_PUSH) \
            (tp)->t_flags |= TF_ACKNOW; \
        else \
            (tp)->t_flags |= TF_DELACK; \
    } while (0)
#else /* TCP_ACK_HACK */
# define DELAY_ACK(tp, ign) \
    do { \
        (tp)->t_flags |= TF_DELACK; \
    } while (0)
#endif /* TCP_ACK_HACK */
86
87
88/*
89 * deps: netinet/tcp_reass.c
90 * tcp_reass_maxqlen = 48 (deafault)
91 * tcp_reass_maxseg = nmbclusters/16 (nmbclusters = 1024 + maxusers * 64 from kern/kern_mbuf.c let's say 256)
92 */
93int
94tcp_reass(PNATState pData, struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
95{
96 struct tseg_qent *q;
97 struct tseg_qent *p = NULL;
98 struct tseg_qent *nq;
99 struct tseg_qent *te = NULL;
100 struct socket *so = tp->t_socket;
101 int flags;
102 STAM_PROFILE_START(&pData->StatTCP_reassamble, tcp_reassamble);
103 LogFlowFunc(("ENTER: pData:%p, tp:%R[tcpcb793], th:%p, tlenp:%p, m:%p\n", pData, tp, th, tlenp, m));
104
105 /*
106 * XXX: tcp_reass() is rather inefficient with its data structures
107 * and should be rewritten (see NetBSD for optimizations). While
108 * doing that it should move to its own file tcp_reass.c.
109 */
110
111 /*
112 * Call with th==NULL after become established to
113 * force pre-ESTABLISHED data up to user socket.
114 */
115 if (th == NULL)
116 {
117 LogFlowFunc(("%d -> present\n", __LINE__));
118 goto present;
119 }
120
121 /*
122 * Limit the number of segments in the reassembly queue to prevent
123 * holding on to too many segments (and thus running out of mbufs).
124 * Make sure to let the missing segment through which caused this
125 * queue. Always keep one global queue entry spare to be able to
126 * process the missing segment.
127 */
128 if ( th->th_seq != tp->rcv_nxt
129 && ( tcp_reass_qsize + 1 >= tcp_reass_maxseg
130 || tp->t_segqlen >= tcp_reass_maxqlen))
131 {
132 tcp_reass_overflows++;
133 tcpstat.tcps_rcvmemdrop++;
134 m_freem(pData, m);
135 *tlenp = 0;
136 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
137 LogFlowFuncLeave();
138 return (0);
139 }
140
141 /*
142 * Allocate a new queue entry. If we can't, or hit the zone limit
143 * just drop the pkt.
144 */
145 te = RTMemAlloc(sizeof(struct tseg_qent));
146 if (te == NULL)
147 {
148 tcpstat.tcps_rcvmemdrop++;
149 m_freem(pData, m);
150 *tlenp = 0;
151 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
152 LogFlowFuncLeave();
153 return (0);
154 }
155 tp->t_segqlen++;
156 tcp_reass_qsize++;
157
158 /*
159 * Find a segment which begins after this one does.
160 */
161 LIST_FOREACH(q, &tp->t_segq, tqe_q)
162 {
163 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
164 break;
165 p = q;
166 }
167
168 /*
169 * If there is a preceding segment, it may provide some of
170 * our data already. If so, drop the data from the incoming
171 * segment. If it provides all of our data, drop us.
172 */
173 if (p != NULL)
174 {
175 int i;
176 /* conversion to int (in i) handles seq wraparound */
177 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
178 if (i > 0)
179 {
180 if (i >= *tlenp)
181 {
182 tcpstat.tcps_rcvduppack++;
183 tcpstat.tcps_rcvdupbyte += *tlenp;
184 m_freem(pData, m);
185 RTMemFree(te);
186 tp->t_segqlen--;
187 tcp_reass_qsize--;
188 /*
189 * Try to present any queued data
190 * at the left window edge to the user.
191 * This is needed after the 3-WHS
192 * completes.
193 */
194 LogFlowFunc(("%d -> present\n", __LINE__));
195 goto present; /* ??? */
196 }
197 m_adj(m, i);
198 *tlenp -= i;
199 th->th_seq += i;
200 }
201 }
202 tcpstat.tcps_rcvoopack++;
203 tcpstat.tcps_rcvoobyte += *tlenp;
204
205 /*
206 * While we overlap succeeding segments trim them or,
207 * if they are completely covered, dequeue them.
208 */
209 while (q)
210 {
211 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
212 if (i <= 0)
213 break;
214 if (i < q->tqe_len)
215 {
216 q->tqe_th->th_seq += i;
217 q->tqe_len -= i;
218 m_adj(q->tqe_m, i);
219 break;
220 }
221
222 nq = LIST_NEXT(q, tqe_q);
223 LIST_REMOVE(q, tqe_q);
224 m_freem(pData, q->tqe_m);
225 RTMemFree(q);
226 tp->t_segqlen--;
227 tcp_reass_qsize--;
228 q = nq;
229 }
230
231 /* Insert the new segment queue entry into place. */
232 te->tqe_m = m;
233 te->tqe_th = th;
234 te->tqe_len = *tlenp;
235
236 if (p == NULL)
237 {
238 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
239 }
240 else
241 {
242 LIST_INSERT_AFTER(p, te, tqe_q);
243 }
244
245present:
246 /*
247 * Present data to user, advancing rcv_nxt through
248 * completed sequence space.
249 */
250 if (!TCPS_HAVEESTABLISHED(tp->t_state))
251 {
252 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
253 return (0);
254 }
255 q = LIST_FIRST(&tp->t_segq);
256 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
257 {
258 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
259 return (0);
260 }
261 do
262 {
263 tp->rcv_nxt += q->tqe_len;
264 flags = q->tqe_th->th_flags & TH_FIN;
265 nq = LIST_NEXT(q, tqe_q);
266 LIST_REMOVE(q, tqe_q);
267 /* XXX: This place should be checked for the same code in
268 * original BSD code for Slirp and current BSD used SS_FCANTRCVMORE
269 */
270 if (so->so_state & SS_FCANTSENDMORE)
271 m_freem(pData, q->tqe_m);
272 else
273 sbappend(pData, so, q->tqe_m);
274 RTMemFree(q);
275 tp->t_segqlen--;
276 tcp_reass_qsize--;
277 q = nq;
278 }
279 while (q && q->tqe_th->th_seq == tp->rcv_nxt);
280
281 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
282 return flags;
283}
284
285/*
286 * TCP input routine, follows pages 65-76 of the
287 * protocol specification dated September, 1981 very closely.
288 */
289void
290tcp_input(PNATState pData, register struct mbuf *m, int iphlen, struct socket *inso)
291{
292 struct ip *ip, *save_ip;
293 register struct tcpiphdr *ti;
294 caddr_t optp = NULL;
295 int optlen = 0;
296 int len, off;
297 int tlen = 0; /* Shut up MSC (didn't check whether MSC was right). */
298 register struct tcpcb *tp = 0;
299 register int tiflags;
300 struct socket *so = 0;
301 int todrop, acked, ourfinisacked, needoutput = 0;
302/* int dropsocket = 0; */
303 int iss = 0;
304 u_long tiwin;
305/* int ts_present = 0; */
306 unsigned ohdrlen;
307 uint8_t ohdr[60 + 8]; /* max IP header plus 8 bytes of payload for icmp */
308
309 STAM_PROFILE_START(&pData->StatTCP_input, counter_input);
310
311 LogFlow(("tcp_input: m = %p, iphlen = %2d, inso = %R[natsock]\n", m, iphlen, inso));
312
313 if (inso != NULL)
314 {
315 QSOCKET_LOCK(tcb);
316 SOCKET_LOCK(inso);
317 QSOCKET_UNLOCK(tcb);
318 }
319 /*
320 * If called with m == 0, then we're continuing the connect
321 */
322 if (m == NULL)
323 {
324 so = inso;
325 Log4(("NAT: tcp_input: %R[natsock]\n", so));
326
327 /* Re-set a few variables */
328 tp = sototcpcb(so);
329
330 m = so->so_m;
331 optp = so->so_optp; /* points into m if set */
332 optlen = so->so_optlen;
333 so->so_m = NULL;
334 so->so_optp = 0;
335 so->so_optlen = 0;
336
337 if (RT_LIKELY(so->so_ohdr != NULL))
338 {
339 RTMemFree(so->so_ohdr);
340 so->so_ohdr = NULL;
341 }
342
343 ti = so->so_ti;
344
345 /** @todo (vvl) clarify why it might happens */
346 if (ti == NULL)
347 {
348 LogRel(("NAT: ti is null. can't do any reseting connection actions\n"));
349 /* mbuf should be cleared in sofree called from tcp_close */
350 tcp_close(pData, tp);
351 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
352 LogFlowFuncLeave();
353 return;
354 }
355
356 tiwin = ti->ti_win;
357 tiflags = ti->ti_flags;
358
359 LogFlowFunc(("%d -> cont_conn\n", __LINE__));
360 goto cont_conn;
361 }
362
363 tcpstat.tcps_rcvtotal++;
364
365 ip = mtod(m, struct ip *);
366
367 /* ip_input() subtracts iphlen from ip::ip_len */
368 AssertStmt(ip->ip_len + iphlen == (ssize_t)m_length(m, NULL), goto drop);
369 if (RT_UNLIKELY(ip->ip_len < sizeof(struct tcphdr)))
370 {
371 /* tcps_rcvshort++; */
372 goto drop;
373 }
374
375 /*
376 * Save a copy of the IP header in case we want to restore it for
377 * sending an ICMP error message in response.
378 *
379 * XXX: This function should really be fixed to not strip IP
380 * options, to not overwrite IP header and to use "tlen" local
381 * variable (instead of ti->ti_len), then "m" could be passed to
382 * icmp_error() directly.
383 */
384 ohdrlen = iphlen + 8;
385 m_copydata(m, 0, ohdrlen, (caddr_t)ohdr);
386 save_ip = (struct ip *)ohdr;
387 save_ip->ip_len += iphlen; /* undo change by ip_input() */
388
389
390 /*
391 * Get IP and TCP header together in first mbuf.
392 * Note: IP leaves IP header in first mbuf.
393 */
394 ti = mtod(m, struct tcpiphdr *);
395 if (iphlen > sizeof(struct ip))
396 {
397 ip_stripoptions(m, (struct mbuf *)0);
398 iphlen = sizeof(struct ip);
399 }
400
401 /*
402 * Checksum extended TCP header and data.
403 */
404 tlen = ((struct ip *)ti)->ip_len;
405 memset(ti->ti_x1, 0, 9);
406 ti->ti_len = RT_H2N_U16((u_int16_t)tlen);
407 len = sizeof(struct ip) + tlen;
408 /* keep checksum for ICMP reply
409 * ti->ti_sum = cksum(m, len);
410 * if (ti->ti_sum) { */
411 if (cksum(m, len))
412 {
413 tcpstat.tcps_rcvbadsum++;
414 LogFlowFunc(("%d -> drop\n", __LINE__));
415 goto drop;
416 }
417
418 /*
419 * Check that TCP offset makes sense,
420 * pull out TCP options and adjust length. XXX
421 */
422 off = ti->ti_off << 2;
423 if ( off < sizeof (struct tcphdr)
424 || off > tlen)
425 {
426 tcpstat.tcps_rcvbadoff++;
427 LogFlowFunc(("%d -> drop\n", __LINE__));
428 goto drop;
429 }
430 tlen -= off;
431 ti->ti_len = tlen;
432 if (off > sizeof (struct tcphdr))
433 {
434 optlen = off - sizeof (struct tcphdr);
435 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
436
437 /*
438 * Do quick retrieval of timestamp options ("options
439 * prediction?"). If timestamp is the only option and it's
440 * formatted as recommended in RFC 1323 appendix A, we
441 * quickly get the values now and not bother calling
442 * tcp_dooptions(), etc.
443 */
444#if 0
445 if (( optlen == TCPOLEN_TSTAMP_APPA
446 || ( optlen > TCPOLEN_TSTAMP_APPA
447 && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
448 *(u_int32_t *)optp == RT_H2N_U32_C(TCPOPT_TSTAMP_HDR) &&
449 (ti->ti_flags & TH_SYN) == 0)
450 {
451 ts_present = 1;
452 ts_val = RT_N2H_U32(*(u_int32_t *)(optp + 4));
453 ts_ecr = RT_N2H_U32(*(u_int32_t *)(optp + 8));
454 optp = NULL; / * we have parsed the options * /
455 }
456#endif
457 }
458 tiflags = ti->ti_flags;
459
460 /*
461 * Convert TCP protocol specific fields to host format.
462 */
463 NTOHL(ti->ti_seq);
464 NTOHL(ti->ti_ack);
465 NTOHS(ti->ti_win);
466 NTOHS(ti->ti_urp);
467
468 /*
469 * Drop TCP, IP headers and TCP options.
470 */
471 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
472 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
473
474 /*
475 * Locate pcb for segment.
476 */
477findso:
478 LogFlowFunc(("(enter) findso: %R[natsock]\n", so));
479 if (so != NULL && so != &tcb)
480 SOCKET_UNLOCK(so);
481 QSOCKET_LOCK(tcb);
482 so = tcp_last_so;
483 if ( so->so_fport != ti->ti_dport
484 || so->so_lport != ti->ti_sport
485 || so->so_laddr.s_addr != ti->ti_src.s_addr
486 || so->so_faddr.s_addr != ti->ti_dst.s_addr)
487 {
488 QSOCKET_UNLOCK(tcb);
489 /** @todo fix SOLOOKUP macrodefinition to be usable here */
490 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
491 ti->ti_dst, ti->ti_dport);
492 if (so)
493 {
494 tcp_last_so = so;
495 }
496 ++tcpstat.tcps_socachemiss;
497 }
498 else
499 {
500 SOCKET_LOCK(so);
501 QSOCKET_UNLOCK(tcb);
502 }
503 LogFlowFunc(("(leave) findso: %R[natsock]\n", so));
504
505 /*
506 * Check whether the packet is targeting CTL_ALIAS and drop it if the connection wasn't
507 * initiated by localhost (so == NULL), see @bugref{9896}.
508 */
509 if ( (RT_N2H_U32(ti->ti_dst.s_addr) & ~pData->netmask) == CTL_ALIAS
510 && !pData->fLocalhostReachable
511 && !so)
512 {
513 LogFlowFunc(("Packet for CTL_ALIAS and fLocalhostReachable=false so=NULL -> drop\n"));
514 goto drop;
515 }
516
517 /*
518 * If the state is CLOSED (i.e., TCB does not exist) then
519 * all data in the incoming segment is discarded.
520 * If the TCB exists but is in CLOSED state, it is embryonic,
521 * but should either do a listen or a connect soon.
522 *
523 * state == CLOSED means we've done socreate() but haven't
524 * attached it to a protocol yet...
525 *
526 * XXX If a TCB does not exist, and the TH_SYN flag is
527 * the only flag set, then create a session, mark it
528 * as if it was LISTENING, and continue...
529 */
530 if (so == 0)
531 {
532 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
533 {
534 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
535 goto dropwithreset;
536 }
537
538 if ((so = socreate()) == NULL)
539 {
540 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
541 goto dropwithreset;
542 }
543 if (tcp_attach(pData, so) < 0)
544 {
545 RTMemFree(so); /* Not sofree (if it failed, it's not insqued) */
546 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
547 goto dropwithreset;
548 }
549 SOCKET_LOCK(so);
550 sbreserve(pData, &so->so_snd, tcp_sndspace);
551 sbreserve(pData, &so->so_rcv, tcp_rcvspace);
552
553/* tcp_last_so = so; */ /* XXX ? */
554/* tp = sototcpcb(so); */
555
556 so->so_laddr = ti->ti_src;
557 so->so_lport = ti->ti_sport;
558 so->so_faddr = ti->ti_dst;
559 so->so_fport = ti->ti_dport;
560
561 so->so_iptos = ((struct ip *)ti)->ip_tos;
562
563 tp = sototcpcb(so);
564 TCP_STATE_SWITCH_TO(tp, TCPS_LISTEN);
565 }
566
567 /*
568 * If this is a still-connecting socket, this probably
569 * a retransmit of the SYN. Whether it's a retransmit SYN
570 * or something else, we nuke it.
571 */
572 if (so->so_state & SS_ISFCONNECTING)
573 {
574 LogFlowFunc(("%d -> drop\n", __LINE__));
575 goto drop;
576 }
577
578 tp = sototcpcb(so);
579
580 /* XXX Should never fail */
581 if (tp == 0)
582 {
583 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
584 goto dropwithreset;
585 }
586 if (tp->t_state == TCPS_CLOSED)
587 {
588 LogFlowFunc(("%d -> drop\n", __LINE__));
589 goto drop;
590 }
591
592 /* Unscale the window into a 32-bit value. */
593/* if ((tiflags & TH_SYN) == 0)
594 * tiwin = ti->ti_win << tp->snd_scale;
595 * else
596 */
597 tiwin = ti->ti_win;
598
599 /*
600 * Segment received on connection.
601 * Reset idle time and keep-alive timer.
602 */
603 tp->t_idle = 0;
604 if (so_options)
605 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
606 else
607 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
608
609 /*
610 * Process options if not in LISTEN state,
611 * else do it below (after getting remote address).
612 */
613 if (optp && tp->t_state != TCPS_LISTEN)
614 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
615/* , */
616/* &ts_present, &ts_val, &ts_ecr); */
617
618 /*
619 * Header prediction: check for the two common cases
620 * of a uni-directional data xfer. If the packet has
621 * no control flags, is in-sequence, the window didn't
622 * change and we're not retransmitting, it's a
623 * candidate. If the length is zero and the ack moved
624 * forward, we're the sender side of the xfer. Just
625 * free the data acked & wake any higher level process
626 * that was blocked waiting for space. If the length
627 * is non-zero and the ack didn't move, we're the
628 * receiver side. If we're getting packets in-order
629 * (the reassembly queue is empty), add the data to
630 * the socket buffer and note that we need a delayed ack.
631 *
632 * XXX Some of these tests are not needed
633 * eg: the tiwin == tp->snd_wnd prevents many more
634 * predictions.. with no *real* advantage..
635 */
636 if ( tp->t_state == TCPS_ESTABLISHED
637 && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK
638/* && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) */
639 && ti->ti_seq == tp->rcv_nxt
640 && tiwin && tiwin == tp->snd_wnd
641 && tp->snd_nxt == tp->snd_max)
642 {
643 /*
644 * If last ACK falls within this segment's sequence numbers,
645 * record the timestamp.
646 */
647#if 0
648 if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
649 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len))
650 {
651 tp->ts_recent_age = tcp_now;
652 tp->ts_recent = ts_val;
653 }
654#endif
655
656 if (ti->ti_len == 0)
657 {
658 if ( SEQ_GT(ti->ti_ack, tp->snd_una)
659 && SEQ_LEQ(ti->ti_ack, tp->snd_max)
660 && tp->snd_cwnd >= tp->snd_wnd)
661 {
662 /*
663 * this is a pure ack for outstanding data.
664 */
665 ++tcpstat.tcps_predack;
666#if 0
667 if (ts_present)
668 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
669 else
670#endif
671 if ( tp->t_rtt
672 && SEQ_GT(ti->ti_ack, tp->t_rtseq))
673 tcp_xmit_timer(pData, tp, tp->t_rtt);
674 acked = ti->ti_ack - tp->snd_una;
675 tcpstat.tcps_rcvackpack++;
676 tcpstat.tcps_rcvackbyte += acked;
677 sbdrop(&so->so_snd, acked);
678 tp->snd_una = ti->ti_ack;
679 m_freem(pData, m);
680
681 /*
682 * If all outstanding data are acked, stop
683 * retransmit timer, otherwise restart timer
684 * using current (possibly backed-off) value.
685 * If process is waiting for space,
686 * wakeup/selwakeup/signal. If data
687 * are ready to send, let tcp_output
688 * decide between more output or persist.
689 */
690 if (tp->snd_una == tp->snd_max)
691 tp->t_timer[TCPT_REXMT] = 0;
692 else if (tp->t_timer[TCPT_PERSIST] == 0)
693 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
694
695 /*
696 * There's room in so_snd, sowwakup will read()
697 * from the socket if we can
698 */
699#if 0
700 if (so->so_snd.sb_flags & SB_NOTIFY)
701 sowwakeup(so);
702#endif
703 /*
704 * This is called because sowwakeup might have
705 * put data into so_snd. Since we don't so sowwakeup,
706 * we don't need this.. XXX???
707 */
708 if (SBUF_LEN(&so->so_snd))
709 (void) tcp_output(pData, tp);
710
711 SOCKET_UNLOCK(so);
712 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
713 return;
714 }
715 }
716 else if ( ti->ti_ack == tp->snd_una
717 && LIST_EMPTY(&tp->t_segq)
718 && ti->ti_len <= sbspace(&so->so_rcv))
719 {
720 /*
721 * this is a pure, in-sequence data packet
722 * with nothing on the reassembly queue and
723 * we have enough buffer space to take it.
724 */
725 ++tcpstat.tcps_preddat;
726 tp->rcv_nxt += ti->ti_len;
727 tcpstat.tcps_rcvpack++;
728 tcpstat.tcps_rcvbyte += ti->ti_len;
729 /*
730 * Add data to socket buffer.
731 */
732 sbappend(pData, so, m);
733
734 /*
735 * XXX This is called when data arrives. Later, check
736 * if we can actually write() to the socket
737 * XXX Need to check? It's be NON_BLOCKING
738 */
739/* sorwakeup(so); */
740
741 /*
742 * If this is a short packet, then ACK now - with Nagle
743 * congestion avoidance sender won't send more until
744 * he gets an ACK.
745 *
746 * It is better to not delay acks at all to maximize
747 * TCP throughput. See RFC 2581.
748 */
749 tp->t_flags |= TF_ACKNOW;
750 tcp_output(pData, tp);
751 SOCKET_UNLOCK(so);
752 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
753 return;
754 }
755 } /* header prediction */
756 /*
757 * Calculate amount of space in receive window,
758 * and then do TCP input processing.
759 * Receive window is amount of space in rcv queue,
760 * but not less than advertised window.
761 */
762 {
763 int win;
764 win = sbspace(&so->so_rcv);
765 if (win < 0)
766 win = 0;
767 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
768 }
769
770 switch (tp->t_state)
771 {
772 /*
773 * If the state is LISTEN then ignore segment if it contains an RST.
774 * If the segment contains an ACK then it is bad and send a RST.
775 * If it does not contain a SYN then it is not interesting; drop it.
776 * Don't bother responding if the destination was a broadcast.
777 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
778 * tp->iss, and send a segment:
779 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
780 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
781 * Fill in remote peer address fields if not previously specified.
782 * Enter SYN_RECEIVED state, and process any other fields of this
783 * segment in this state.
784 */
785 case TCPS_LISTEN:
786 {
787 if (tiflags & TH_RST)
788 {
789 LogFlowFunc(("%d -> drop\n", __LINE__));
790 goto drop;
791 }
792 if (tiflags & TH_ACK)
793 {
794 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
795 goto dropwithreset;
796 }
797 if ((tiflags & TH_SYN) == 0)
798 {
799 LogFlowFunc(("%d -> drop\n", __LINE__));
800 goto drop;
801 }
802
803 /*
804 * This has way too many gotos...
805 * But a bit of spaghetti code never hurt anybody :)
806 */
807 if ( (tcp_fconnect(pData, so) == -1)
808 && errno != EINPROGRESS
809 && errno != EWOULDBLOCK)
810 {
811 u_char code = ICMP_UNREACH_NET;
812 Log2((" tcp fconnect errno = %d (%s)\n", errno, strerror(errno)));
813 if (errno == ECONNREFUSED)
814 {
815 /* ACK the SYN, send RST to refuse the connection */
816 tcp_respond(pData, tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
817 TH_RST|TH_ACK);
818 }
819 else
820 {
821 if (errno == EHOSTUNREACH)
822 code = ICMP_UNREACH_HOST;
823 HTONL(ti->ti_seq); /* restore tcp header */
824 HTONL(ti->ti_ack);
825 HTONS(ti->ti_win);
826 HTONS(ti->ti_urp);
827 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
828 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
829 *ip = *save_ip;
830 icmp_error(pData, m, ICMP_UNREACH, code, 0, strerror(errno));
831 tp->t_socket->so_m = NULL;
832 }
833 tp = tcp_close(pData, tp);
834 }
835 else
836 {
837 /*
838 * Haven't connected yet, save the current mbuf
839 * and ti, and return
840 * XXX Some OS's don't tell us whether the connect()
841 * succeeded or not. So we must time it out.
842 */
843 so->so_m = m;
844 so->so_ti = ti;
845 so->so_ohdr = RTMemDup(ohdr, ohdrlen);
846 so->so_optp = optp;
847 so->so_optlen = optlen;
848 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
849 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
850 }
851 SOCKET_UNLOCK(so);
852 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
853 LogFlowFuncLeave();
854 return;
855
856cont_conn:
857 /* m==NULL
858 * Check if the connect succeeded
859 */
860 LogFlowFunc(("cont_conn:\n"));
861 if (so->so_state & SS_NOFDREF)
862 {
863 tp = tcp_close(pData, tp);
864 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
865 goto dropwithreset;
866 }
867
868 tcp_template(tp);
869
870 if (optp)
871 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
872
873 if (iss)
874 tp->iss = iss;
875 else
876 tp->iss = tcp_iss;
877 tcp_iss += TCP_ISSINCR/2;
878 tp->irs = ti->ti_seq;
879 tcp_sendseqinit(tp);
880 tcp_rcvseqinit(tp);
881 tp->t_flags |= TF_ACKNOW;
882 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
883 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
884 tcpstat.tcps_accepts++;
885 LogFlowFunc(("%d -> trimthenstep6\n", __LINE__));
886 goto trimthenstep6;
887 } /* case TCPS_LISTEN */
888
889 /*
890 * If the state is SYN_SENT:
891 * if seg contains an ACK, but not for our SYN, drop the input.
892 * if seg contains a RST, then drop the connection.
893 * if seg does not contain SYN, then drop it.
894 * Otherwise this is an acceptable SYN segment
895 * initialize tp->rcv_nxt and tp->irs
896 * if seg contains ack then advance tp->snd_una
897 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
898 * arrange for segment to be acked (eventually)
899 * continue processing rest of data/controls, beginning with URG
900 */
901 case TCPS_SYN_SENT:
902 if ( (tiflags & TH_ACK)
903 && ( SEQ_LEQ(ti->ti_ack, tp->iss)
904 || SEQ_GT(ti->ti_ack, tp->snd_max)))
905 {
906 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
907 goto dropwithreset;
908 }
909
910 if (tiflags & TH_RST)
911 {
912 if (tiflags & TH_ACK)
913 tp = tcp_drop(pData, tp, 0); /* XXX Check t_softerror! */
914 LogFlowFunc(("%d -> drop\n", __LINE__));
915 goto drop;
916 }
917
918 if ((tiflags & TH_SYN) == 0)
919 {
920 LogFlowFunc(("%d -> drop\n", __LINE__));
921 goto drop;
922 }
923 if (tiflags & TH_ACK)
924 {
925 tp->snd_una = ti->ti_ack;
926 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
927 tp->snd_nxt = tp->snd_una;
928 }
929
930 tp->t_timer[TCPT_REXMT] = 0;
931 tp->irs = ti->ti_seq;
932 tcp_rcvseqinit(tp);
933 tp->t_flags |= TF_ACKNOW;
934 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss))
935 {
936 tcpstat.tcps_connects++;
937 soisfconnected(so);
938 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
939
940 /* Do window scaling on this connection? */
941#if 0
942 if (( tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
943 == (TF_RCVD_SCALE|TF_REQ_SCALE))
944 {
945 tp->snd_scale = tp->requested_s_scale;
946 tp->rcv_scale = tp->request_r_scale;
947 }
948#endif
949 (void) tcp_reass(pData, tp, (struct tcphdr *)0, NULL, (struct mbuf *)0);
950 /*
951 * if we didn't have to retransmit the SYN,
952 * use its rtt as our initial srtt & rtt var.
953 */
954 if (tp->t_rtt)
955 tcp_xmit_timer(pData, tp, tp->t_rtt);
956 }
957 else
958 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
959
960trimthenstep6:
961 LogFlowFunc(("trimthenstep6:\n"));
962 /*
963 * Advance ti->ti_seq to correspond to first data byte.
964 * If data, trim to stay within window,
965 * dropping FIN if necessary.
966 */
967 ti->ti_seq++;
968 if (ti->ti_len > tp->rcv_wnd)
969 {
970 todrop = ti->ti_len - tp->rcv_wnd;
971 m_adj(m, -todrop);
972 ti->ti_len = tp->rcv_wnd;
973 tiflags &= ~TH_FIN;
974 tcpstat.tcps_rcvpackafterwin++;
975 tcpstat.tcps_rcvbyteafterwin += todrop;
976 }
977 tp->snd_wl1 = ti->ti_seq - 1;
978 tp->rcv_up = ti->ti_seq;
979 LogFlowFunc(("%d -> step6\n", __LINE__));
980 goto step6;
981 } /* switch tp->t_state */
982 /*
983 * States other than LISTEN or SYN_SENT.
984 * First check timestamp, if present.
985 * Then check that at least some bytes of segment are within
986 * receive window. If segment begins before rcv_nxt,
987 * drop leading data (and SYN); if nothing left, just ack.
988 *
989 * RFC 1323 PAWS: If we have a timestamp reply on this segment
990 * and it's less than ts_recent, drop it.
991 */
992#if 0
993 if ( ts_present
994 && (tiflags & TH_RST) == 0
995 && tp->ts_recent
996 && TSTMP_LT(ts_val, tp->ts_recent))
997 {
998 /* Check to see if ts_recent is over 24 days old. */
999 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE)
1000 {
1001 /*
1002 * Invalidate ts_recent. If this segment updates
1003 * ts_recent, the age will be reset later and ts_recent
1004 * will get a valid value. If it does not, setting
1005 * ts_recent to zero will at least satisfy the
1006 * requirement that zero be placed in the timestamp
1007 * echo reply when ts_recent isn't valid. The
1008 * age isn't reset until we get a valid ts_recent
1009 * because we don't want out-of-order segments to be
1010 * dropped when ts_recent is old.
1011 */
1012 tp->ts_recent = 0;
1013 }
1014 else
1015 {
1016 tcpstat.tcps_rcvduppack++;
1017 tcpstat.tcps_rcvdupbyte += ti->ti_len;
1018 tcpstat.tcps_pawsdrop++;
1019 goto dropafterack;
1020 }
1021 }
1022#endif
1023
1024 todrop = tp->rcv_nxt - ti->ti_seq;
1025 if (todrop > 0)
1026 {
1027 if (tiflags & TH_SYN)
1028 {
1029 tiflags &= ~TH_SYN;
1030 ti->ti_seq++;
1031 if (ti->ti_urp > 1)
1032 ti->ti_urp--;
1033 else
1034 tiflags &= ~TH_URG;
1035 todrop--;
1036 }
1037 /*
1038 * Following if statement from Stevens, vol. 2, p. 960.
1039 */
1040 if ( todrop > ti->ti_len
1041 || ( todrop == ti->ti_len
1042 && (tiflags & TH_FIN) == 0))
1043 {
1044 /*
1045 * Any valid FIN must be to the left of the window.
1046 * At this point the FIN must be a duplicate or out
1047 * of sequence; drop it.
1048 */
1049 tiflags &= ~TH_FIN;
1050
1051 /*
1052 * Send an ACK to resynchronize and drop any data.
1053 * But keep on processing for RST or ACK.
1054 */
1055 tp->t_flags |= TF_ACKNOW;
1056 todrop = ti->ti_len;
1057 tcpstat.tcps_rcvduppack++;
1058 tcpstat.tcps_rcvdupbyte += todrop;
1059 }
1060 else
1061 {
1062 tcpstat.tcps_rcvpartduppack++;
1063 tcpstat.tcps_rcvpartdupbyte += todrop;
1064 }
1065 m_adj(m, todrop);
1066 ti->ti_seq += todrop;
1067 ti->ti_len -= todrop;
1068 if (ti->ti_urp > todrop)
1069 ti->ti_urp -= todrop;
1070 else
1071 {
1072 tiflags &= ~TH_URG;
1073 ti->ti_urp = 0;
1074 }
1075 }
1076 /*
1077 * If new data are received on a connection after the
1078 * user processes are gone, then RST the other end.
1079 */
1080 if ( (so->so_state & SS_NOFDREF)
1081 && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len)
1082 {
1083 tp = tcp_close(pData, tp);
1084 tcpstat.tcps_rcvafterclose++;
1085 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1086 goto dropwithreset;
1087 }
1088
1089 /*
1090 * If segment ends after window, drop trailing data
1091 * (and PUSH and FIN); if nothing left, just ACK.
1092 */
1093 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1094 if (todrop > 0)
1095 {
1096 tcpstat.tcps_rcvpackafterwin++;
1097 if (todrop >= ti->ti_len)
1098 {
1099 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1100 /*
1101 * If a new connection request is received
1102 * while in TIME_WAIT, drop the old connection
1103 * and start over if the sequence numbers
1104 * are above the previous ones.
1105 */
1106 if ( tiflags & TH_SYN
1107 && tp->t_state == TCPS_TIME_WAIT
1108 && SEQ_GT(ti->ti_seq, tp->rcv_nxt))
1109 {
1110 iss = tp->rcv_nxt + TCP_ISSINCR;
1111 tp = tcp_close(pData, tp);
1112 SOCKET_UNLOCK(tp->t_socket);
1113 LogFlowFunc(("%d -> findso\n", __LINE__));
1114 goto findso;
1115 }
1116 /*
1117 * If window is closed can only take segments at
1118 * window edge, and have to drop data and PUSH from
1119 * incoming segments. Continue processing, but
1120 * remember to ack. Otherwise, drop segment
1121 * and ack.
1122 */
1123 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt)
1124 {
1125 tp->t_flags |= TF_ACKNOW;
1126 tcpstat.tcps_rcvwinprobe++;
1127 }
1128 else
1129 {
1130 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1131 goto dropafterack;
1132 }
1133 }
1134 else
1135 tcpstat.tcps_rcvbyteafterwin += todrop;
1136 m_adj(m, -todrop);
1137 ti->ti_len -= todrop;
1138 tiflags &= ~(TH_PUSH|TH_FIN);
1139 }
1140
1141 /*
1142 * If last ACK falls within this segment's sequence numbers,
1143 * record its timestamp.
1144 */
1145#if 0
1146 if ( ts_present
1147 && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)
1148 && SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1149 {
1150 tp->ts_recent_age = tcp_now;
1151 tp->ts_recent = ts_val;
1152 }
1153#endif
1154
1155 /*
1156 * If the RST bit is set examine the state:
1157 * SYN_RECEIVED STATE:
1158 * If passive open, return to LISTEN state.
1159 * If active open, inform user that connection was refused.
1160 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1161 * Inform user that connection was reset, and close tcb.
1162 * CLOSING, LAST_ACK, TIME_WAIT STATES
1163 * Close the tcb.
1164 */
1165 if (tiflags&TH_RST)
1166 switch (tp->t_state)
1167 {
1168 case TCPS_SYN_RECEIVED:
1169/* so->so_error = ECONNREFUSED; */
1170 LogFlowFunc(("%d -> close\n", __LINE__));
1171 goto close;
1172
1173 case TCPS_ESTABLISHED:
1174 case TCPS_FIN_WAIT_1:
1175 case TCPS_FIN_WAIT_2:
1176 case TCPS_CLOSE_WAIT:
1177/* so->so_error = ECONNRESET; */
1178close:
1179 LogFlowFunc(("close:\n"));
1180 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSED);
1181 tcpstat.tcps_drops++;
1182 tp = tcp_close(pData, tp);
1183 LogFlowFunc(("%d -> drop\n", __LINE__));
1184 goto drop;
1185
1186 case TCPS_CLOSING:
1187 case TCPS_LAST_ACK:
1188 case TCPS_TIME_WAIT:
1189 tp = tcp_close(pData, tp);
1190 LogFlowFunc(("%d -> drop\n", __LINE__));
1191 goto drop;
1192 }
1193
1194 /*
1195 * If a SYN is in the window, then this is an
1196 * error and we send an RST and drop the connection.
1197 */
1198 if (tiflags & TH_SYN)
1199 {
1200 tp = tcp_drop(pData, tp, 0);
1201 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1202 goto dropwithreset;
1203 }
1204
1205 /*
1206 * If the ACK bit is off we drop the segment and return.
1207 */
1208 if ((tiflags & TH_ACK) == 0)
1209 {
1210 LogFlowFunc(("%d -> drop\n", __LINE__));
1211 goto drop;
1212 }
1213
1214 /*
1215 * Ack processing.
1216 */
1217 switch (tp->t_state)
1218 {
1219 /*
1220 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1221 * ESTABLISHED state and continue processing, otherwise
1222 * send an RST. una<=ack<=max
1223 */
1224 case TCPS_SYN_RECEIVED:
1225 LogFlowFunc(("%d -> TCPS_SYN_RECEIVED\n", __LINE__));
1226 if ( SEQ_GT(tp->snd_una, ti->ti_ack)
1227 || SEQ_GT(ti->ti_ack, tp->snd_max))
1228 goto dropwithreset;
1229 tcpstat.tcps_connects++;
1230 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
1231 /*
1232 * The sent SYN is ack'ed with our sequence number +1
1233 * The first data byte already in the buffer will get
1234 * lost if no correction is made. This is only needed for
1235 * SS_CTL since the buffer is empty otherwise.
1236 * tp->snd_una++; or:
1237 */
1238 tp->snd_una = ti->ti_ack;
1239 soisfconnected(so);
1240
1241 /* Do window scaling? */
1242#if 0
1243 if ( (tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
1244 == (TF_RCVD_SCALE|TF_REQ_SCALE))
1245 {
1246 tp->snd_scale = tp->requested_s_scale;
1247 tp->rcv_scale = tp->request_r_scale;
1248 }
1249#endif
1250 (void) tcp_reass(pData, tp, (struct tcphdr *)0, (int *)0, (struct mbuf *)0);
1251 tp->snd_wl1 = ti->ti_seq - 1;
1252 /* Avoid ack processing; snd_una==ti_ack => dup ack */
1253 LogFlowFunc(("%d -> synrx_to_est\n", __LINE__));
1254 goto synrx_to_est;
1255 /* fall into ... */
1256
1257 /*
1258 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1259 * ACKs. If the ack is in the range
1260 * tp->snd_una < ti->ti_ack <= tp->snd_max
1261 * then advance tp->snd_una to ti->ti_ack and drop
1262 * data from the retransmission queue. If this ACK reflects
1263 * more up to date window information we update our window information.
1264 */
1265 case TCPS_ESTABLISHED:
1266 case TCPS_FIN_WAIT_1:
1267 case TCPS_FIN_WAIT_2:
1268 case TCPS_CLOSE_WAIT:
1269 case TCPS_CLOSING:
1270 case TCPS_LAST_ACK:
1271 case TCPS_TIME_WAIT:
1272 LogFlowFunc(("%d -> TCPS_ESTABLISHED|TCPS_FIN_WAIT_1|TCPS_FIN_WAIT_2|TCPS_CLOSE_WAIT|"
1273 "TCPS_CLOSING|TCPS_LAST_ACK|TCPS_TIME_WAIT\n", __LINE__));
1274 if (SEQ_LEQ(ti->ti_ack, tp->snd_una))
1275 {
1276 if (ti->ti_len == 0 && tiwin == tp->snd_wnd)
1277 {
1278 tcpstat.tcps_rcvdupack++;
1279 Log2((" dup ack m = %p, so = %p\n", m, so));
1280 /*
1281 * If we have outstanding data (other than
1282 * a window probe), this is a completely
1283 * duplicate ack (ie, window info didn't
1284 * change), the ack is the biggest we've
1285 * seen and we've seen exactly our rexmt
1286 * threshold of them, assume a packet
1287 * has been dropped and retransmit it.
1288 * Kludge snd_nxt & the congestion
1289 * window so we send only this one
1290 * packet.
1291 *
1292 * We know we're losing at the current
1293 * window size so do congestion avoidance
1294 * (set ssthresh to half the current window
1295 * and pull our congestion window back to
1296 * the new ssthresh).
1297 *
1298 * Dup acks mean that packets have left the
1299 * network (they're now cached at the receiver)
1300 * so bump cwnd by the amount in the receiver
1301 * to keep a constant cwnd packets in the
1302 * network.
1303 */
1304 if ( tp->t_timer[TCPT_REXMT] == 0
1305 || ti->ti_ack != tp->snd_una)
1306 tp->t_dupacks = 0;
1307 else if (++tp->t_dupacks == tcprexmtthresh)
1308 {
1309 tcp_seq onxt = tp->snd_nxt;
1310 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
1311 if (win < 2)
1312 win = 2;
1313 tp->snd_ssthresh = win * tp->t_maxseg;
1314 tp->t_timer[TCPT_REXMT] = 0;
1315 tp->t_rtt = 0;
1316 tp->snd_nxt = ti->ti_ack;
1317 tp->snd_cwnd = tp->t_maxseg;
1318 (void) tcp_output(pData, tp);
1319 tp->snd_cwnd = tp->snd_ssthresh +
1320 tp->t_maxseg * tp->t_dupacks;
1321 if (SEQ_GT(onxt, tp->snd_nxt))
1322 tp->snd_nxt = onxt;
1323 LogFlowFunc(("%d -> drop\n", __LINE__));
1324 goto drop;
1325 }
1326 else if (tp->t_dupacks > tcprexmtthresh)
1327 {
1328 tp->snd_cwnd += tp->t_maxseg;
1329 (void) tcp_output(pData, tp);
1330 LogFlowFunc(("%d -> drop\n", __LINE__));
1331 goto drop;
1332 }
1333 }
1334 else
1335 tp->t_dupacks = 0;
1336 break;
1337 }
1338synrx_to_est:
1339 LogFlowFunc(("synrx_to_est:\n"));
1340 /*
1341 * If the congestion window was inflated to account
1342 * for the other side's cached packets, retract it.
1343 */
1344 if ( tp->t_dupacks > tcprexmtthresh
1345 && tp->snd_cwnd > tp->snd_ssthresh)
1346 tp->snd_cwnd = tp->snd_ssthresh;
1347 tp->t_dupacks = 0;
1348 if (SEQ_GT(ti->ti_ack, tp->snd_max))
1349 {
1350 tcpstat.tcps_rcvacktoomuch++;
1351 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1352 goto dropafterack;
1353 }
1354 acked = ti->ti_ack - tp->snd_una;
1355 tcpstat.tcps_rcvackpack++;
1356 tcpstat.tcps_rcvackbyte += acked;
1357
1358 /*
1359 * If we have a timestamp reply, update smoothed
1360 * round trip time. If no timestamp is present but
1361 * transmit timer is running and timed sequence
1362 * number was acked, update smoothed round trip time.
1363 * Since we now have an rtt measurement, cancel the
1364 * timer backoff (cf., Phil Karn's retransmit alg.).
1365 * Recompute the initial retransmit timer.
1366 */
1367#if 0
1368 if (ts_present)
1369 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1370 else
1371#endif
1372 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1373 tcp_xmit_timer(pData, tp, tp->t_rtt);
1374
1375 /*
1376 * If all outstanding data is acked, stop retransmit
1377 * timer and remember to restart (more output or persist).
1378 * If there is more data to be acked, restart retransmit
1379 * timer, using current (possibly backed-off) value.
1380 */
1381 if (ti->ti_ack == tp->snd_max)
1382 {
1383 tp->t_timer[TCPT_REXMT] = 0;
1384 needoutput = 1;
1385 }
1386 else if (tp->t_timer[TCPT_PERSIST] == 0)
1387 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1388 /*
1389 * When new data is acked, open the congestion window.
1390 * If the window gives us less than ssthresh packets
1391 * in flight, open exponentially (maxseg per packet).
1392 * Otherwise open linearly: maxseg per window
1393 * (maxseg^2 / cwnd per packet).
1394 */
1395 {
1396 register u_int cw = tp->snd_cwnd;
1397 register u_int incr = tp->t_maxseg;
1398
1399 if (cw > tp->snd_ssthresh)
1400 incr = incr * incr / cw;
1401 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1402 }
1403 if (acked > SBUF_LEN(&so->so_snd))
1404 {
1405 tp->snd_wnd -= SBUF_LEN(&so->so_snd);
1406 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1407 ourfinisacked = 1;
1408 }
1409 else
1410 {
1411 sbdrop(&so->so_snd, acked);
1412 tp->snd_wnd -= acked;
1413 ourfinisacked = 0;
1414 }
1415 /*
1416 * XXX sowwakup is called when data is acked and there's room for
1417 * for more data... it should read() the socket
1418 */
1419#if 0
1420 if (so->so_snd.sb_flags & SB_NOTIFY)
1421 sowwakeup(so);
1422#endif
1423 tp->snd_una = ti->ti_ack;
1424 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1425 tp->snd_nxt = tp->snd_una;
1426
1427 switch (tp->t_state)
1428 {
1429 /*
1430 * In FIN_WAIT_1 STATE in addition to the processing
1431 * for the ESTABLISHED state if our FIN is now acknowledged
1432 * then enter FIN_WAIT_2.
1433 */
1434 case TCPS_FIN_WAIT_1:
1435 if (ourfinisacked)
1436 {
1437 /*
1438 * If we can't receive any more
1439 * data, then closing user can proceed.
1440 * Starting the timer is contrary to the
1441 * specification, but if we don't get a FIN
1442 * we'll hang forever.
1443 */
1444 if (so->so_state & SS_FCANTRCVMORE)
1445 {
1446 soisfdisconnected(so);
1447 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1448 }
1449 TCP_STATE_SWITCH_TO(tp, TCPS_FIN_WAIT_2);
1450 }
1451 break;
1452
1453 /*
1454 * In CLOSING STATE in addition to the processing for
1455 * the ESTABLISHED state if the ACK acknowledges our FIN
1456 * then enter the TIME-WAIT state, otherwise ignore
1457 * the segment.
1458 */
1459 case TCPS_CLOSING:
1460 if (ourfinisacked)
1461 {
1462 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1463 tcp_canceltimers(tp);
1464 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1465 soisfdisconnected(so);
1466 }
1467 break;
1468
1469 /*
1470 * In LAST_ACK, we may still be waiting for data to drain
1471 * and/or to be acked, as well as for the ack of our FIN.
1472 * If our FIN is now acknowledged, delete the TCB,
1473 * enter the closed state and return.
1474 */
1475 case TCPS_LAST_ACK:
1476 if (ourfinisacked)
1477 {
1478 tp = tcp_close(pData, tp);
1479 LogFlowFunc(("%d -> drop\n", __LINE__));
1480 goto drop;
1481 }
1482 break;
1483
1484 /*
1485 * In TIME_WAIT state the only thing that should arrive
1486 * is a retransmission of the remote FIN. Acknowledge
1487 * it and restart the finack timer.
1488 */
1489 case TCPS_TIME_WAIT:
1490 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1491 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1492 goto dropafterack;
1493 }
1494 } /* switch(tp->t_state) */
1495
1496step6:
1497 LogFlowFunc(("step6:\n"));
1498 /*
1499 * Update window information.
1500 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1501 */
1502 if ( (tiflags & TH_ACK)
1503 && ( SEQ_LT(tp->snd_wl1, ti->ti_seq)
1504 || ( tp->snd_wl1 == ti->ti_seq
1505 && ( SEQ_LT(tp->snd_wl2, ti->ti_ack)
1506 || ( tp->snd_wl2 == ti->ti_ack
1507 && tiwin > tp->snd_wnd)))))
1508 {
1509 /* keep track of pure window updates */
1510 if ( ti->ti_len == 0
1511 && tp->snd_wl2 == ti->ti_ack
1512 && tiwin > tp->snd_wnd)
1513 tcpstat.tcps_rcvwinupd++;
1514 tp->snd_wnd = tiwin;
1515 tp->snd_wl1 = ti->ti_seq;
1516 tp->snd_wl2 = ti->ti_ack;
1517 if (tp->snd_wnd > tp->max_sndwnd)
1518 tp->max_sndwnd = tp->snd_wnd;
1519 needoutput = 1;
1520 }
1521
1522 /*
1523 * Process segments with URG.
1524 */
1525 if ((tiflags & TH_URG) && ti->ti_urp &&
1526 TCPS_HAVERCVDFIN(tp->t_state) == 0)
1527 {
1528 /*
1529 * This is a kludge, but if we receive and accept
1530 * random urgent pointers, we'll crash in
1531 * soreceive. It's hard to imagine someone
1532 * actually wanting to send this much urgent data.
1533 */
1534 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen)
1535 {
1536 ti->ti_urp = 0;
1537 tiflags &= ~TH_URG;
1538 LogFlowFunc(("%d -> dodata\n", __LINE__));
1539 goto dodata;
1540 }
1541
1542 /*
1543 * If this segment advances the known urgent pointer,
1544 * then mark the data stream. This should not happen
1545 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1546 * a FIN has been received from the remote side.
1547 * In these states we ignore the URG.
1548 *
1549 * According to RFC961 (Assigned Protocols),
1550 * the urgent pointer points to the last octet
1551 * of urgent data. We continue, however,
1552 * to consider it to indicate the first octet
1553 * of data past the urgent section as the original
1554 * spec states (in one of two places).
1555 */
1556 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up))
1557 {
1558 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1559 so->so_urgc = SBUF_LEN(&so->so_rcv) +
1560 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1561 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1562 }
1563 }
1564 else
1565 /*
1566 * If no out of band data is expected,
1567 * pull receive urgent pointer along
1568 * with the receive window.
1569 */
1570 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1571 tp->rcv_up = tp->rcv_nxt;
1572dodata:
1573 LogFlowFunc(("dodata:\n"));
1574
1575 /*
1576 * If this is a small packet, then ACK now - with Nagel
1577 * congestion avoidance sender won't send more until
1578 * he gets an ACK.
1579 *
1580 * XXX: In case you wonder... The magic "27" below is ESC that
1581 * presumably starts a terminal escape-sequence and that we want
1582 * to ACK ASAP. [Original slirp code had three different
1583 * heuristics to chose from here and in the header prediction case
1584 * above, but the commented out alternatives were lost and the
1585 * header prediction case that had an expanded comment about this
1586 * has been modified to always send an ACK].
1587 */
1588 if ( ti->ti_len
1589 && (unsigned)ti->ti_len <= 5
1590 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27)
1591 {
1592 tp->t_flags |= TF_ACKNOW;
1593 }
1594
1595 /*
1596 * Process the segment text, merging it into the TCP sequencing queue,
1597 * and arranging for acknowledgment of receipt if necessary.
1598 * This process logically involves adjusting tp->rcv_wnd as data
1599 * is presented to the user (this happens in tcp_usrreq.c,
1600 * case PRU_RCVD). If a FIN has already been received on this
1601 * connection then we just ignore the text.
1602 */
1603 if ( (ti->ti_len || (tiflags&TH_FIN))
1604 && TCPS_HAVERCVDFIN(tp->t_state) == 0)
1605 {
1606 if ( ti->ti_seq == tp->rcv_nxt
1607 && LIST_EMPTY(&tp->t_segq)
1608 && tp->t_state == TCPS_ESTABLISHED)
1609 {
1610 DELAY_ACK(tp, ti); /* little bit different from BSD declaration see netinet/tcp_input.c */
1611 tp->rcv_nxt += tlen;
1612 tiflags = ti->ti_t.th_flags & TH_FIN;
1613 tcpstat.tcps_rcvpack++;
1614 tcpstat.tcps_rcvbyte += tlen;
1615 if (so->so_state & SS_FCANTRCVMORE)
1616 m_freem(pData, m);
1617 else
1618 sbappend(pData, so, m);
1619 }
1620 else
1621 {
1622 tiflags = tcp_reass(pData, tp, &ti->ti_t, &tlen, m);
1623 tp->t_flags |= TF_ACKNOW;
1624 }
1625 /*
1626 * Note the amount of data that peer has sent into
1627 * our window, in order to estimate the sender's
1628 * buffer size.
1629 */
1630 len = SBUF_SIZE(&so->so_rcv) - (tp->rcv_adv - tp->rcv_nxt);
1631 }
1632 else
1633 {
1634 m_freem(pData, m);
1635 tiflags &= ~TH_FIN;
1636 }
1637
1638 /*
1639 * If FIN is received ACK the FIN and let the user know
1640 * that the connection is closing.
1641 */
1642 if (tiflags & TH_FIN)
1643 {
1644 if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
1645 {
1646 /*
1647 * If we receive a FIN we can't send more data,
1648 * set it SS_FDRAIN
1649 * Shutdown the socket if there is no rx data in the
1650 * buffer.
1651 * soread() is called on completion of shutdown() and
1652 * will got to TCPS_LAST_ACK, and use tcp_output()
1653 * to send the FIN.
1654 */
1655/* sofcantrcvmore(so); */
1656 sofwdrain(so);
1657
1658 tp->t_flags |= TF_ACKNOW;
1659 tp->rcv_nxt++;
1660 }
1661 switch (tp->t_state)
1662 {
1663 /*
1664 * In SYN_RECEIVED and ESTABLISHED STATES
1665 * enter the CLOSE_WAIT state.
1666 */
1667 case TCPS_SYN_RECEIVED:
1668 case TCPS_ESTABLISHED:
1669 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSE_WAIT);
1670 break;
1671
1672 /*
1673 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1674 * enter the CLOSING state.
1675 */
1676 case TCPS_FIN_WAIT_1:
1677 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSING);
1678 break;
1679
1680 /*
1681 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1682 * starting the time-wait timer, turning off the other
1683 * standard timers.
1684 */
1685 case TCPS_FIN_WAIT_2:
1686 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1687 tcp_canceltimers(tp);
1688 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1689 soisfdisconnected(so);
1690 break;
1691
1692 /*
1693 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1694 */
1695 case TCPS_TIME_WAIT:
1696 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1697 break;
1698 }
1699 }
1700
1701 /*
1702 * Return any desired output.
1703 */
1704 if (needoutput || (tp->t_flags & TF_ACKNOW))
1705 tcp_output(pData, tp);
1706
1707 SOCKET_UNLOCK(so);
1708 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1709 LogFlowFuncLeave();
1710 return;
1711
1712dropafterack:
1713 LogFlowFunc(("dropafterack:\n"));
1714 /*
1715 * Generate an ACK dropping incoming segment if it occupies
1716 * sequence space, where the ACK reflects our state.
1717 */
1718 if (tiflags & TH_RST)
1719 {
1720 LogFlowFunc(("%d -> drop\n", __LINE__));
1721 goto drop;
1722 }
1723 m_freem(pData, m);
1724 tp->t_flags |= TF_ACKNOW;
1725 (void) tcp_output(pData, tp);
1726 SOCKET_UNLOCK(so);
1727 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1728 LogFlowFuncLeave();
1729 return;
1730
1731dropwithreset:
1732 LogFlowFunc(("dropwithreset:\n"));
1733 /* reuses m if m!=NULL, m_free() unnecessary */
1734 if (tiflags & TH_ACK)
1735 tcp_respond(pData, tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1736 else
1737 {
1738 if (tiflags & TH_SYN)
1739 ti->ti_len++;
1740 tcp_respond(pData, tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1741 TH_RST|TH_ACK);
1742 }
1743
1744 if (so != &tcb)
1745 SOCKET_UNLOCK(so);
1746 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1747 LogFlowFuncLeave();
1748 return;
1749
1750drop:
1751 LogFlowFunc(("drop:\n"));
1752 /*
1753 * Drop space held by incoming segment and return.
1754 */
1755 m_freem(pData, m);
1756
1757#ifdef VBOX_WITH_SLIRP_MT
1758 if (RTCritSectIsOwned(&so->so_mutex))
1759 {
1760 SOCKET_UNLOCK(so);
1761 }
1762#endif
1763
1764 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1765 LogFlowFuncLeave();
1766 return;
1767}
1768
1769
1770void
1771tcp_fconnect_failed(PNATState pData, struct socket *so, int sockerr)
1772{
1773 struct tcpcb *tp;
1774 int code;
1775
1776 Log2(("NAT: connect error %d %R[natsock]\n", sockerr, so));
1777
1778 Assert(so->so_state & SS_ISFCONNECTING);
1779 so->so_state = SS_NOFDREF;
1780
1781 if (sockerr == ECONNREFUSED || sockerr == ECONNRESET)
1782 {
1783 /* hand off to tcp_input():cont_conn to send RST */
1784 TCP_INPUT(pData, NULL, 0, so);
1785 return;
1786 }
1787
1788 tp = sototcpcb(so);
1789 if (RT_UNLIKELY(tp == NULL)) /* should never happen */
1790 {
1791 LogRel(("NAT: tp == NULL %R[natsock]\n", so));
1792 sofree(pData, so);
1793 return;
1794 }
1795
1796 if (sockerr == ENETUNREACH || sockerr == ENETDOWN)
1797 code = ICMP_UNREACH_NET;
1798 else if (sockerr == EHOSTUNREACH || sockerr == EHOSTDOWN)
1799 code = ICMP_UNREACH_HOST;
1800 else
1801 code = -1;
1802
1803 if (code >= 0)
1804 {
1805 struct ip *oip;
1806 unsigned ohdrlen;
1807 struct mbuf *m;
1808
1809 if (RT_UNLIKELY(so->so_ohdr == NULL))
1810 goto out;
1811
1812 oip = (struct ip *)so->so_ohdr;
1813 ohdrlen = oip->ip_hl * 4 + 8;
1814
1815 m = m_gethdr(pData, M_NOWAIT, MT_HEADER);
1816 if (RT_UNLIKELY(m == NULL))
1817 goto out;
1818
1819 m_copyback(pData, m, 0, ohdrlen, (caddr_t)so->so_ohdr);
1820 m->m_pkthdr.header = mtod(m, void *);
1821
1822 icmp_error(pData, m, ICMP_UNREACH, code, 0, NULL);
1823 }
1824
1825 out:
1826 tcp_close(pData, tp);
1827}
1828
1829
1830void
1831tcp_dooptions(PNATState pData, struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1832{
1833 u_int16_t mss;
1834 int opt, optlen;
1835
1836 LogFlowFunc(("tcp_dooptions: tp = %R[tcpcb793], cnt=%i\n", tp, cnt));
1837
1838 for (; cnt > 0; cnt -= optlen, cp += optlen)
1839 {
1840 opt = cp[0];
1841 if (opt == TCPOPT_EOL)
1842 break;
1843 if (opt == TCPOPT_NOP)
1844 optlen = 1;
1845 else
1846 {
1847 optlen = cp[1];
1848 if (optlen <= 0)
1849 break;
1850 }
1851 switch (opt)
1852 {
1853 default:
1854 continue;
1855
1856 case TCPOPT_MAXSEG:
1857 if (optlen != TCPOLEN_MAXSEG)
1858 continue;
1859 if (!(ti->ti_flags & TH_SYN))
1860 continue;
1861 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1862 NTOHS(mss);
1863 (void) tcp_mss(pData, tp, mss); /* sets t_maxseg */
1864 break;
1865
1866#if 0
1867 case TCPOPT_WINDOW:
1868 if (optlen != TCPOLEN_WINDOW)
1869 continue;
1870 if (!(ti->ti_flags & TH_SYN))
1871 continue;
1872 tp->t_flags |= TF_RCVD_SCALE;
1873 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1874 break;
1875
1876 case TCPOPT_TIMESTAMP:
1877 if (optlen != TCPOLEN_TIMESTAMP)
1878 continue;
1879 *ts_present = 1;
1880 memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1881 NTOHL(*ts_val);
1882 memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1883 NTOHL(*ts_ecr);
1884
1885 /*
1886 * A timestamp received in a SYN makes
1887 * it ok to send timestamp requests and replies.
1888 */
1889 if (ti->ti_flags & TH_SYN)
1890 {
1891 tp->t_flags |= TF_RCVD_TSTMP;
1892 tp->ts_recent = *ts_val;
1893 tp->ts_recent_age = tcp_now;
1894 }
1895 break;
1896#endif
1897 }
1898 }
1899}
1900
1901
/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * NOTE(review): this routine is compiled out (#if 0) and is kept for
 * reference only.  As written it would not build if re-enabled: the
 * copy below names `sp', which is not declared in this function.
 * Presumably the intent is to close the one-byte gap by moving the
 * bytes after the urgent byte down onto it (an overlapping move) -
 * confirm against the BSD original before reviving it.
 */

#if 0
void
tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m)
{
    /* Offset of the urgent byte, counted from the start of the data. */
    int cnt = ti->ti_urp - 1;

    while (cnt >= 0)
    {
        if (m->m_len > cnt)
        {
            /* The urgent byte lives in this mbuf. */
            char *cp = mtod(m, caddr_t) + cnt;
            struct tcpcb *tp = sototcpcb(so);

            /* Stash the byte in the tcpcb and flag it as available. */
            tp->t_iobc = *cp;
            tp->t_oobflags |= TCPOOB_HAVEDATA;
            /* NOTE(review): `sp' is undeclared here, see header comment. */
            memcpy(sp, cp+1, (unsigned)(m->m_len - cnt - 1));
            m->m_len--;
            return;
        }
        /* Not in this mbuf: skip it and look at the next one. */
        cnt -= m->m_len;
        m = m->m_next; /* XXX WRONG! Fix it! */
        if (m == 0)
            break;
    }
    /* The urgent offset lies beyond the data held in the chain. */
    panic("tcp_pulloutofband");
}
#endif
1936
1937/*
1938 * Collect new round-trip time estimate
1939 * and update averages and current timeout.
1940 */
1941
1942void
1943tcp_xmit_timer(PNATState pData, register struct tcpcb *tp, int rtt)
1944{
1945 register short delta;
1946
1947 LogFlowFunc(("ENTER: tcp_xmit_timer: tp = %R[tcpcb793] rtt = %d\n", tp, rtt));
1948
1949 tcpstat.tcps_rttupdated++;
1950 if (tp->t_srtt != 0)
1951 {
1952 /*
1953 * srtt is stored as fixed point with 3 bits after the
1954 * binary point (i.e., scaled by 8). The following magic
1955 * is equivalent to the smoothing algorithm in rfc793 with
1956 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1957 * point). Adjust rtt to origin 0.
1958 */
1959 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1960 if ((tp->t_srtt += delta) <= 0)
1961 tp->t_srtt = 1;
1962 /*
1963 * We accumulate a smoothed rtt variance (actually, a
1964 * smoothed mean difference), then set the retransmit
1965 * timer to smoothed rtt + 4 times the smoothed variance.
1966 * rttvar is stored as fixed point with 2 bits after the
1967 * binary point (scaled by 4). The following is
1968 * equivalent to rfc793 smoothing with an alpha of .75
1969 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1970 * rfc793's wired-in beta.
1971 */
1972 if (delta < 0)
1973 delta = -delta;
1974 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1975 if ((tp->t_rttvar += delta) <= 0)
1976 tp->t_rttvar = 1;
1977 }
1978 else
1979 {
1980 /*
1981 * No rtt measurement yet - use the unsmoothed rtt.
1982 * Set the variance to half the rtt (so our first
1983 * retransmit happens at 3*rtt).
1984 */
1985 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1986 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1987 }
1988 tp->t_rtt = 0;
1989 tp->t_rxtshift = 0;
1990
1991 /*
1992 * the retransmit should happen at rtt + 4 * rttvar.
1993 * Because of the way we do the smoothing, srtt and rttvar
1994 * will each average +1/2 tick of bias. When we compute
1995 * the retransmit timer, we want 1/2 tick of rounding and
1996 * 1 extra tick because of +-1/2 tick uncertainty in the
1997 * firing of the timer. The bias will give us exactly the
1998 * 1.5 tick we need. But, because the bias is
1999 * statistical, we have to test that we don't drop below
2000 * the minimum feasible timer (which is 2 ticks).
2001 */
2002 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2003 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
2004
2005 /*
2006 * We received an ack for a packet that wasn't retransmitted;
2007 * it is probably safe to discard any error indications we've
2008 * received recently. This isn't quite right, but close enough
2009 * for now (a route might have failed after we sent a segment,
2010 * and the return path might not be symmetrical).
2011 */
2012 tp->t_softerror = 0;
2013}
2014
2015/*
2016 * Determine a reasonable value for maxseg size.
2017 * If the route is known, check route for mtu.
2018 * If none, use an mss that can be handled on the outgoing
2019 * interface without forcing IP to fragment; if bigger than
2020 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2021 * to utilize large mbufs. If no route is found, route has no mtu,
2022 * or the destination isn't local, use a default, hopefully conservative
2023 * size (usually 512 or the default IP max size, but no more than the mtu
2024 * of the interface), as we can't discover anything about intervening
2025 * gateways or networks. We also initialize the congestion/slow start
2026 * window to be a single segment if the destination isn't local.
2027 * While looking at the routing entry, we also initialize other path-dependent
2028 * parameters from pre-set or cached values in the routing entry.
2029 */
2030
2031int
2032tcp_mss(PNATState pData, register struct tcpcb *tp, u_int offer)
2033{
2034 struct socket *so = tp->t_socket;
2035 int mss;
2036
2037 LogFlowFunc(("ENTER: tcp_mss: offer=%u, t_maxseg=%u; tp=%R[natsock]\n",
2038 offer, (unsigned int)tp->t_maxseg, so));
2039
2040 mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
2041 if (offer)
2042 mss = min(mss, offer);
2043 mss = max(mss, 32);
2044 if (mss < tp->t_maxseg || offer != 0)
2045 tp->t_maxseg = mss;
2046
2047 tp->snd_cwnd = mss;
2048
2049 sbreserve(pData, &so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
2050 sbreserve(pData, &so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
2051
2052 LogFlowFunc(("LEAVE: mss=%d\n", mss));
2053 return mss;
2054}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette