VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_output.c@ 16581

Last change on this file since 16581 was 15890, checked in by vboxsync, 16 years ago

NAT: 1. wo sync enhancement branch is still functional (was corrupted with using ICMP file handler in select(1))

  1. After sending, the send queue no longer needs to synchronize with the NAT thread to free the mbuf; instead, the NAT queue is used to call the slirp freeing routine.
  2. No more copying when slirp sends to the guest.


  • Property svn:eol-style set to native
File size: 20.4 KB
Line 
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_output.c 8.3 (Berkeley) 12/30/93
34 * tcp_output.c,v 1.3 1994/09/15 10:36:55 davidg Exp
35 */
36
37/*
38 * Changes and additions relating to SLiRP
39 * Copyright (c) 1995 Danny Gasparovski.
40 *
41 * Please read the file COPYRIGHT for the
42 * terms and conditions of the copyright.
43 */
44
45#include <slirp.h>
46
/*
 * Printable names for the TCP connection states.
 *
 * The table is only consulted for the "stats socket" output, so the
 * strings are chosen to be meaningful to the reader rather than to match
 * the real state names: slot 0 reads "REDIRECT" where the real name
 * would be "CLOSED".  All other slots carry the usual names.
 */
const char * const tcpstates[] =
{
    "REDIRECT",     /*  0: real name would be "CLOSED" */
    "LISTEN",       /*  1 */
    "SYN_SENT",     /*  2 */
    "SYN_RCVD",     /*  3 */
    "ESTABLISHED",  /*  4 */
    "CLOSE_WAIT",   /*  5 */
    "FIN_WAIT_1",   /*  6 */
    "CLOSING",      /*  7 */
    "LAST_ACK",     /*  8 */
    "FIN_WAIT_2",   /*  9 */
    "TIME_WAIT",    /* 10 */
};
58
59static const u_char tcp_outflags[TCP_NSTATES] =
60{
61 TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
62 TH_ACK, TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK,
63 TH_FIN|TH_ACK, TH_ACK, TH_ACK,
64};
65
66
/*
 * Size in bytes of the on-stack option buffer (opt[]) that tcp_output()
 * fills before copying the options behind the TCP header.  With the
 * window-scale and timestamp code compiled out (#if 0), only the 4-byte
 * maximum-segment-size option is ever written into it.
 */
67#define MAX_TCPOPTLEN 32 /* max # bytes that go in options */
68
69/*
70 * Tcp output routine: figure out what should be sent and send it.
 *
 * pData - NAT instance state; handed on to m_get(), tcp_mss() and
 *         ip_output().
 * tp    - TCP control block of the connection; its socket (tp->t_socket)
 *         supplies the send and receive buffers.
 *
 * The routine decides whether a segment (data, ACK, window update, SYN,
 * FIN or RST) has to go out.  If not, it may arm the persist timer and
 * returns 0.  Otherwise it builds one segment in a fresh mbuf, passes it
 * to ip_output(), and repeats from the "again" label for as long as more
 * data than fits into one segment was pending (sendalot).
 *
 * Returns 0 when nothing had to be sent or every segment was handed to
 * ip_output() successfully; returns 1 when m_get() fails, or whatever
 * non-zero value ip_output() returned.
71 */
72int
73tcp_output(PNATState pData, register struct tcpcb *tp)
74{
75 register struct socket *so = tp->t_socket;
76 register long len, win;
77 int off, flags, error;
78 register struct mbuf *m;
79 register struct tcpiphdr *ti;
80 u_char opt[MAX_TCPOPTLEN];
81 unsigned optlen, hdrlen;
82 int idle, sendalot;
83
84 DEBUG_CALL("tcp_output");
85 DEBUG_ARG("tp = %lx", (long )tp);
86
87 /*
88 * Determine length of data that should be transmitted,
89 * and flags that will be used.
90 * If there is some data or critical controls (SYN, RST)
91 * to send, then transmit; otherwise, investigate further.
92 */
93 idle = (tp->snd_max == tp->snd_una);
94 if (idle && tp->t_idle >= tp->t_rxtcur)
95 /*
96 * We have been idle for "a while" and no acks are
97 * expected to clock out any data we send --
98 * slow start to get ack "clock" running again.
99 */
100 tp->snd_cwnd = tp->t_maxseg;
101
102again:
103 sendalot = 0;
104 off = tp->snd_nxt - tp->snd_una;
 /*
  * Here "win" is the usable send window (the smaller of the peer's
  * window and the congestion window).  The same variable is reused
  * further down for the free space in our own receive buffer.
  */
105 win = min(tp->snd_wnd, tp->snd_cwnd);
106
107 flags = tcp_outflags[tp->t_state];
108
109 DEBUG_MISC((dfd, " --- tcp_output flags = 0x%x\n",flags));
110
111 /*
112 * If in persist timeout with window of 0, send 1 byte.
113 * Otherwise, if window is small but nonzero
114 * and timer expired, we will send what we can
115 * and go to transmit state.
116 */
117 if (tp->t_force)
118 {
119 if (win == 0)
120 {
121 /*
122 * If we still have some data to send, then
123 * clear the FIN bit. Usually this would
124 * happen below when it realizes that we
125 * aren't sending all the data. However,
126 * if we have exactly 1 byte of unsent data,
127 * then it won't clear the FIN bit below,
128 * and if we are in persist state, we wind
129 * up sending the packet without recording
130 * that we sent the FIN bit.
131 *
132 * We can't just blindly clear the FIN bit,
133 * because if we don't have any more data
134 * to send then the probe will be the FIN
135 * itself.
136 */
137 if (off < so->so_snd.sb_cc)
138 flags &= ~TH_FIN;
139 win = 1;
140 }
141 else
142 {
143 tp->t_timer[TCPT_PERSIST] = 0;
144 tp->t_rxtshift = 0;
145 }
146 }
147
148 len = min(so->so_snd.sb_cc, win) - off;
149 if (len < 0)
150 {
151 /*
152 * If FIN has been sent but not acked,
153 * but we haven't been called to retransmit,
154 * len will be -1. Otherwise, window shrank
155 * after we sent into it. If window shrank to 0,
156 * cancel pending retransmit and pull snd_nxt
157 * back to (closed) window. We will enter persist
158 * state below. If the window didn't close completely,
159 * just wait for an ACK.
160 */
161 len = 0;
162 if (win == 0)
163 {
164 tp->t_timer[TCPT_REXMT] = 0;
165 tp->snd_nxt = tp->snd_una;
166 }
167 }
 /*
  * Never put more than one maximum segment into a packet; remember
  * that more is pending so we loop back to "again" after sending.
  */
168 if (len > tp->t_maxseg)
169 {
170 len = tp->t_maxseg;
171 sendalot = 1;
172 }
 /* Data remains in the send buffer after this segment: no FIN yet. */
173 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
174 flags &= ~TH_FIN;
175
 /*
  * From here on "win" is the free space in our receive buffer, i.e.
  * the basis for the window we advertise -- no longer the send window.
  */
176 win = sbspace(&so->so_rcv);
177
178 /*
179 * Sender silly window avoidance. If connection is idle
180 * and can send all data, a maximum segment,
181 * at least a maximum default-size segment do it,
182 * or are forced, do it; otherwise don't bother.
183 * If peer's buffer is tiny, then send
184 * when window is at least half open.
185 * If retransmitting (possibly after persist timer forced us
186 * to send into a small window), then must resend.
187 */
188 if (len)
189 {
190 if (len == tp->t_maxseg)
191 goto send;
 /*
  * The leading "1 ||" makes the idle / TF_NODELAY test always
  * true, so a segment that drains the send buffer is always sent.
  */
192 if ((1 || idle || tp->t_flags & TF_NODELAY) &&
193 len + off >= so->so_snd.sb_cc)
194 goto send;
195 if (tp->t_force)
196 goto send;
197 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
198 goto send;
199 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
200 goto send;
201 }
202
203 /*
204 * Compare available window to amount of window
205 * known to peer (as advertised window less
206 * next expected input). If the difference is at least two
207 * max size segments, or at least 50% of the maximum possible
208 * window, then want to send a window update to peer.
209 */
210 if (win > 0)
211 {
212 /*
213 * "adv" is the amount we can increase the window,
214 * taking into account that we are limited by
215 * TCP_MAXWIN << tp->rcv_scale.
216 */
217 long adv = min(win,
218 (long)TCP_MAXWIN << tp->rcv_scale) -
219 (tp->rcv_adv - tp->rcv_nxt);
220
221 if (adv >= (long) (2 * tp->t_maxseg))
222 goto send;
223 if (2 * adv >= (long) so->so_rcv.sb_datalen)
224 goto send;
225 }
226
227 /*
228 * Send if we owe peer an ACK.
229 */
230 if (tp->t_flags & TF_ACKNOW)
231 goto send;
232 if (flags & (TH_SYN|TH_RST))
233 goto send;
234 if (SEQ_GT(tp->snd_up, tp->snd_una))
235 goto send;
236 /*
237 * If our state indicates that FIN should be sent
238 * and we have not yet done so, or we're retransmitting the FIN,
239 * then we need to send.
240 */
241 if ( flags & TH_FIN
242 && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
243 goto send;
244
245 /*
246 * TCP window updates are not reliable, rather a polling protocol
247 * using ``persist'' packets is used to insure receipt of window
248 * updates. The three ``states'' for the output side are:
249 * idle not doing retransmits or persists
250 * persisting to move a small or zero window
251 * (re)transmitting and thereby not persisting
252 *
253 * tp->t_timer[TCPT_PERSIST]
254 * is set when we are in persist state.
255 * tp->t_force
256 * is set when we are called to send a persist packet.
257 * tp->t_timer[TCPT_REXMT]
258 * is set when we are retransmitting
259 * The output side is idle when both timers are zero.
260 *
261 * If send window is too small, there is data to transmit, and no
262 * retransmit or persist is pending, then go to persist state.
263 * If nothing happens soon, send when timer expires:
264 * if window is nonzero, transmit what we can,
265 * otherwise force out a byte.
266 */
267 if ( so->so_snd.sb_cc
268 && tp->t_timer[TCPT_REXMT] == 0
269 && tp->t_timer[TCPT_PERSIST] == 0)
270 {
271 tp->t_rxtshift = 0;
272 tcp_setpersist(tp);
273 }
274
275 /*
276 * No reason to send a segment, just return.
277 */
278 tcpstat.tcps_didnuttin++;
279
280 return (0);
281
282send:
283 /*
284 * Before ESTABLISHED, force sending of initial options
285 * unless TCP set not to do any options.
286 * NOTE: we assume that the IP/TCP header plus TCP options
287 * always fit in a single mbuf, leaving room for a maximum
288 * link header, i.e.
289 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
290 */
291 optlen = 0;
292 hdrlen = sizeof (struct tcpiphdr);
293 if (flags & TH_SYN)
294 {
295 tp->snd_nxt = tp->iss;
296 if ((tp->t_flags & TF_NOOPT) == 0)
297 {
298 u_int16_t mss;
299
 /* Maximum-segment-size option: kind, length 4, 16-bit value. */
300 opt[0] = TCPOPT_MAXSEG;
301 opt[1] = 4;
302 mss = htons((u_int16_t) tcp_mss(pData, tp, 0));
303 memcpy((caddr_t)(opt + 2), (caddr_t)&mss, sizeof(mss));
304 optlen = 4;
305
 /* Window-scale option support is compiled out. */
306#if 0
307 if ( (tp->t_flags & TF_REQ_SCALE)
308 && ( (flags & TH_ACK) == 0
309 || (tp->t_flags & TF_RCVD_SCALE)))
310 {
311 *((u_int32_t *) (opt + optlen)) = htonl( TCPOPT_NOP << 24
312 | TCPOPT_WINDOW << 16
313 | TCPOLEN_WINDOW << 8
314 | tp->request_r_scale);
315 optlen += 4;
316 }
317#endif
318 }
319 }
320
321 /*
322 * Send a timestamp and echo-reply if this is a SYN and our side
323 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
324 * and our peer have sent timestamps in our SYN's.
325 */
 /* Timestamp option support is compiled out as well. */
326#if 0
327 if ( (tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP
328 && (flags & TH_RST) == 0
329 && ( (flags & (TH_SYN|TH_ACK)) == TH_SYN
330 || (tp->t_flags & TF_RCVD_TSTMP)))
331 {
332 u_int32_t *lp = (u_int32_t *)(opt + optlen);
333
334 /* Form timestamp option as shown in appendix A of RFC 1323. */
335 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
336 *lp++ = htonl(tcp_now);
337 *lp = htonl(tp->ts_recent);
338 optlen += TCPOLEN_TSTAMP_APPA;
339 }
340#endif
341 hdrlen += optlen;
342
343 /*
344 * Adjust data length if insertion of options will
345 * bump the packet length beyond the t_maxseg length.
346 */
347 if (len > tp->t_maxseg - optlen)
348 {
349 len = tp->t_maxseg - optlen;
350 sendalot = 1;
351 }
352
353 /*
354 * Grab a header mbuf, attaching a copy of data to
355 * be transmitted, and initialize the header from
356 * the template for sends on this connection.
357 */
358 if (len)
359 {
360 if (tp->t_force && len == 1)
361 tcpstat.tcps_sndprobe++;
362 else if (SEQ_LT(tp->snd_nxt, tp->snd_max))
363 {
364 tcpstat.tcps_sndrexmitpack++;
365 tcpstat.tcps_sndrexmitbyte += len;
366 }
367 else
368 {
369 tcpstat.tcps_sndpack++;
370 tcpstat.tcps_sndbyte += len;
371 }
372
 /*
  * Note: the counters above have already been bumped; they are not
  * rolled back if the allocation below fails.
  */
373 m = m_get(pData);
374 if (m == NULL)
375 {
376/* error = ENOBUFS; */
377 error = 1;
378 goto out;
379 }
 /* Leave room in front of the packet for the link-level header. */
380 m->m_data += if_maxlinkhdr;
381 m->m_len = hdrlen;
382
383 /*
384 * This will always succeed, since we make sure our mbufs
385 * are big enough to hold one MSS packet + header + ... etc.
386 */
387#if 0
388 if (len <= MHLEN - hdrlen - max_linkhdr)
389 {
390#endif
 /*
  * sbcopy() is given offset "off" and length "len" within the send
  * buffer and a destination right behind the header area.
  */
391 sbcopy(&so->so_snd, off, (int) len, mtod(m, caddr_t) + hdrlen);
392 m->m_len += len;
393#if 0
394 }
395 else
396 {
397 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
398 if (m->m_next == 0)
399 len = 0;
400 }
401#endif
402 /*
403 * If we're sending everything we've got, set PUSH.
404 * (This will keep happy those implementations which only
405 * give data to the user when a buffer fills or
406 * a PUSH comes in.)
407 */
408 if (off + len == so->so_snd.sb_cc)
409 flags |= TH_PUSH;
410 }
411 else
412 {
413 if (tp->t_flags & TF_ACKNOW)
414 tcpstat.tcps_sndacks++;
415 else if (flags & (TH_SYN|TH_FIN|TH_RST))
416 tcpstat.tcps_sndctrl++;
417 else if (SEQ_GT(tp->snd_up, tp->snd_una))
418 tcpstat.tcps_sndurg++;
419 else
420 tcpstat.tcps_sndwinup++;
421
422 m = m_get(pData);
423 if (m == NULL)
424 {
425/* error = ENOBUFS; */
426 error = 1;
427 goto out;
428 }
429 m->m_data += if_maxlinkhdr;
 /*
  * NOTE(review): with VBOX_WITH_SIMPLIFIED_SLIRP_SYNC the data pointer
  * of a data-less segment is advanced by a further IP + TCP header
  * size, while the data-carrying branch above does not do this.  The
  * reason is not visible in this file -- confirm it is intended.
  */
430#ifdef VBOX_WITH_SIMPLIFIED_SLIRP_SYNC
431 m->m_data += sizeof(struct ip)
432 + sizeof(struct tcphdr);
433#endif
434 m->m_len = hdrlen;
435 }
436
437 ti = mtod(m, struct tcpiphdr *);
438
 /* Start from the connection's prebuilt IP/TCP header template. */
439 memcpy((caddr_t)ti, &tp->t_template, sizeof (struct tcpiphdr));
440
441 /*
442 * Fill in fields, remembering maximum advertised
443 * window for use in delaying messages about window sizes.
444 * If resending a FIN, be sure not to use a new sequence number.
445 */
446 if ( flags & TH_FIN
447 && tp->t_flags & TF_SENTFIN
448 && tp->snd_nxt == tp->snd_max)
449 tp->snd_nxt--;
450 /*
451 * If we are doing retransmissions, then snd_nxt will
452 * not reflect the first unsent octet. For ACK only
453 * packets, we do not want the sequence number of the
454 * retransmitted packet, we want the sequence number
455 * of the next unsent octet. So, if there is no data
456 * (and no SYN or FIN), use snd_max instead of snd_nxt
457 * when filling in ti_seq. But if we are in persist
458 * state, snd_max might reflect one byte beyond the
459 * right edge of the window, so use snd_nxt in that
460 * case, since we know we aren't doing a retransmission.
461 * (retransmit and persist are mutually exclusive...)
462 */
463 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
464 ti->ti_seq = htonl(tp->snd_nxt);
465 else
466 ti->ti_seq = htonl(tp->snd_max);
467 ti->ti_ack = htonl(tp->rcv_nxt);
468 if (optlen)
469 {
 /* Options go right behind the fixed header; ti_off counts 32-bit words. */
470 memcpy((caddr_t)(ti + 1), (caddr_t)opt, optlen);
471 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
472 }
473 ti->ti_flags = flags;
474 /*
475 * Calculate receive window. Don't shrink window,
476 * but avoid silly window syndrome.
477 */
478 if (win < (long)(so->so_rcv.sb_datalen / 4) && win < (long)tp->t_maxseg)
479 win = 0;
480 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
481 win = (long)TCP_MAXWIN << tp->rcv_scale;
482 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
483 win = (long)(tp->rcv_adv - tp->rcv_nxt);
484 ti->ti_win = htons((u_int16_t) (win>>tp->rcv_scale));
485
 /*
  * The active (#else) variant tests snd_up against snd_una and makes
  * the urgent pointer relative to the sequence number just stored in
  * ti_seq; the disabled variant used snd_nxt for both.
  */
486#if 0
487 if (SEQ_GT(tp->snd_up, tp->snd_nxt))
488 {
489 ti->ti_urp = htons((u_int16_t)(tp->snd_up - tp->snd_nxt));
490#else
491 if (SEQ_GT(tp->snd_up, tp->snd_una))
492 {
493 ti->ti_urp = htons((u_int16_t)(tp->snd_up - ntohl(ti->ti_seq)));
494#endif
495 ti->ti_flags |= TH_URG;
496 }
497 else
498 /*
499 * If no urgent pointer to send, then we pull
500 * the urgent pointer to the left edge of the send window
501 * so that it doesn't drift into the send window on sequence
502 * number wraparound.
503 */
504 tp->snd_up = tp->snd_una; /* drag it along */
505
506 /*
507 * Put TCP length in extended header, and then
508 * checksum extended header and data.
509 */
510 if (len + optlen)
511 ti->ti_len = htons((u_int16_t)(sizeof (struct tcphdr)
512 + optlen + len));
513 ti->ti_sum = cksum(m, (int)(hdrlen + len));
514
515 /*
516 * In transmit state, time the transmission and arrange for
517 * the retransmit. In persist state, just set snd_max.
518 */
519 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0)
520 {
521 tcp_seq startseq = tp->snd_nxt;
522
523 /*
524 * Advance snd_nxt over sequence space of this segment.
525 */
526 if (flags & (TH_SYN|TH_FIN))
527 {
528 if (flags & TH_SYN)
529 tp->snd_nxt++;
530 if (flags & TH_FIN)
531 {
532 tp->snd_nxt++;
533 tp->t_flags |= TF_SENTFIN;
534 }
535 }
536 tp->snd_nxt += len;
537 if (SEQ_GT(tp->snd_nxt, tp->snd_max))
538 {
539 tp->snd_max = tp->snd_nxt;
540 /*
541 * Time this transmission if not a retransmission and
542 * not currently timing anything.
543 */
544 if (tp->t_rtt == 0)
545 {
546 tp->t_rtt = 1;
547 tp->t_rtseq = startseq;
548 tcpstat.tcps_segstimed++;
549 }
550 }
551
552 /*
553 * Set retransmit timer if not currently set,
554 * and not doing an ack or a keep-alive probe.
555 * Initial value for retransmit timer is smoothed
556 * round-trip time + 2 * round-trip time variance.
557 * Initialize shift counter which is used for backoff
558 * of retransmit time.
559 */
560 if ( tp->t_timer[TCPT_REXMT] == 0
561 && tp->snd_nxt != tp->snd_una)
562 {
563 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
564 if (tp->t_timer[TCPT_PERSIST])
565 {
566 tp->t_timer[TCPT_PERSIST] = 0;
567 tp->t_rxtshift = 0;
568 }
569 }
570 }
571 else
572 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
573 tp->snd_max = tp->snd_nxt + len;
574
575 /*
576 * Fill in IP length and desired time to live and
577 * send to IP level. There should be a better way
578 * to handle ttl and tos; we could keep them in
579 * the template, but need a way to checksum without them.
580 */
581 m->m_len = hdrlen + len; /* XXX Needed? m_len should be correct */
582
583 {
 /*
  * NOTE(review): ip_len is stored without htons(), i.e. in host
  * byte order; presumably ip_output() converts it -- confirm.
  */
584 ((struct ip *)ti)->ip_len = m->m_len;
585 ((struct ip *)ti)->ip_ttl = ip_defttl;
586 ((struct ip *)ti)->ip_tos = so->so_iptos;
587
588 /* #if BSD >= 43 */
589 /* Don't do IP options... */
590#if 0
591 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
592 so->so_options & SO_DONTROUTE, 0);
593#endif
594 error = ip_output(pData, so, m);
595
596#if 0
597/* #else */
598 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
599 so->so_options & SO_DONTROUTE);
600/* #endif */
601#endif
602 }
603 if (error)
604 {
 /*
  * Reached either by falling in after ip_output() reported an error
  * or by "goto out" after a failed m_get(); the BSD error-specific
  * handling below is compiled out, so the error is simply returned.
  */
605out:
606#if 0
607 if (error == ENOBUFS)
608 {
609 tcp_quench(tp->t_inpcb, 0);
610 return (0);
611 }
612
613 if ( ( error == EHOSTUNREACH
614 || error == ENETDOWN)
615 && TCPS_HAVERCVDSYN(tp->t_state))
616 {
617 tp->t_softerror = error;
618 return (0);
619 }
620#endif
621 return (error);
622 }
623 tcpstat.tcps_sndtotal++;
624
625 /*
626 * Data sent (as far as we can tell).
627 * If this advertises a larger window than any other segment,
628 * then remember the size of the advertised window.
629 * Any pending ACK has now been sent.
630 */
631 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
632 tp->rcv_adv = tp->rcv_nxt + win;
633 tp->last_ack_sent = tp->rcv_nxt;
634 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
 /* More data was pending than fit into this segment: go send the next. */
635 if (sendalot)
636 goto again;
637
638 return (0);
639}
640
641void
642tcp_setpersist(struct tcpcb *tp)
643{
644 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
645
646#if 0
647 if (tp->t_timer[TCPT_REXMT])
648 panic("tcp_output REXMT");
649#endif
650 /*
651 * Start/restart persistence timer.
652 */
653 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
654 t * tcp_backoff[tp->t_rxtshift],
655 TCPTV_PERSMIN, TCPTV_PERSMAX);
656 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
657 tp->t_rxtshift++;
658}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette