VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_output.c@ 19839

Last change on this file since 19839 was 19839, checked in by vboxsync, 16 years ago

NAT: Slirp doesn't use the guest's Ethernet address anymore;
instead it determines the Ethernet address of the destination
with a lookup operation. Currently the lookup is very simple: it searches
the addresses handed out via DHCP, or else assumes the destination is in the
outer network and uses Slirp's own Ethernet address.

  • Property svn:eol-style set to native
File size: 20.6 KB
Line 
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_output.c 8.3 (Berkeley) 12/30/93
34 * tcp_output.c,v 1.3 1994/09/15 10:36:55 davidg Exp
35 */
36
37/*
38 * Changes and additions relating to SLiRP
39 * Copyright (c) 1995 Danny Gasparovski.
40 *
41 * Please read the file COPYRIGHT for the
42 * terms and conditions of the copyright.
43 */
44
45#include <slirp.h>
46
/*
 * Human-readable names for the TCP connection states.
 *
 * Since this is only used in "stats socket", we give meaningful
 * names instead of the REAL names: slot 0 reads "REDIRECT" where the
 * classic BSD table had "CLOSED" (see the commented-out row below).
 */
const char * const tcpstates[] =
{
/*  "CLOSED",       "LISTEN",       "SYN_SENT",     "SYN_RCVD", */
    "REDIRECT",     /*  0 */
    "LISTEN",       /*  1 */
    "SYN_SENT",     /*  2 */
    "SYN_RCVD",     /*  3 */
    "ESTABLISHED",  /*  4 */
    "CLOSE_WAIT",   /*  5 */
    "FIN_WAIT_1",   /*  6 */
    "CLOSING",      /*  7 */
    "LAST_ACK",     /*  8 */
    "FIN_WAIT_2",   /*  9 */
    "TIME_WAIT",    /* 10 */
};
58
59static const u_char tcp_outflags[TCP_NSTATES] =
60{
61 TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
62 TH_ACK, TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK,
63 TH_FIN|TH_ACK, TH_ACK, TH_ACK,
64};
65
66
/* Size of the on-stack buffer tcp_output() assembles TCP options in. */
#define MAX_TCPOPTLEN   32      /* max # bytes that go in options */
68
69/*
70 * Tcp output routine: figure out what should be sent and send it.
71 */
72int
73tcp_output(PNATState pData, register struct tcpcb *tp)
74{
75 register struct socket *so = tp->t_socket;
76 register long len, win;
77 int off, flags, error;
78 register struct mbuf *m;
79 register struct tcpiphdr *ti;
80 u_char opt[MAX_TCPOPTLEN];
81 unsigned optlen, hdrlen;
82 int idle, sendalot;
83
84 DEBUG_CALL("tcp_output");
85 DEBUG_ARG("tp = %lx", (long )tp);
86
87 /*
88 * Determine length of data that should be transmitted,
89 * and flags that will be used.
90 * If there is some data or critical controls (SYN, RST)
91 * to send, then transmit; otherwise, investigate further.
92 */
93 idle = (tp->snd_max == tp->snd_una);
94 if (idle && tp->t_idle >= tp->t_rxtcur)
95 /*
96 * We have been idle for "a while" and no acks are
97 * expected to clock out any data we send --
98 * slow start to get ack "clock" running again.
99 */
100 tp->snd_cwnd = tp->t_maxseg;
101
102again:
103 sendalot = 0;
104 off = tp->snd_nxt - tp->snd_una;
105 win = min(tp->snd_wnd, tp->snd_cwnd);
106
107 flags = tcp_outflags[tp->t_state];
108
109 DEBUG_MISC((dfd, " --- tcp_output flags = 0x%x\n",flags));
110
111 /*
112 * If in persist timeout with window of 0, send 1 byte.
113 * Otherwise, if window is small but nonzero
114 * and timer expired, we will send what we can
115 * and go to transmit state.
116 */
117 if (tp->t_force)
118 {
119 if (win == 0)
120 {
121 /*
122 * If we still have some data to send, then
123 * clear the FIN bit. Usually this would
124 * happen below when it realizes that we
125 * aren't sending all the data. However,
126 * if we have exactly 1 byte of unset data,
127 * then it won't clear the FIN bit below,
128 * and if we are in persist state, we wind
129 * up sending the packet without recording
130 * that we sent the FIN bit.
131 *
132 * We can't just blindly clear the FIN bit,
133 * because if we don't have any more data
134 * to send then the probe will be the FIN
135 * itself.
136 */
137 if (off < so->so_snd.sb_cc)
138 flags &= ~TH_FIN;
139 win = 1;
140 }
141 else
142 {
143 tp->t_timer[TCPT_PERSIST] = 0;
144 tp->t_rxtshift = 0;
145 }
146 }
147
148 len = min(so->so_snd.sb_cc, win) - off;
149 if (len < 0)
150 {
151 /*
152 * If FIN has been sent but not acked,
153 * but we haven't been called to retransmit,
154 * len will be -1. Otherwise, window shrank
155 * after we sent into it. If window shrank to 0,
156 * cancel pending retransmit and pull snd_nxt
157 * back to (closed) window. We will enter persist
158 * state below. If the window didn't close completely,
159 * just wait for an ACK.
160 */
161 len = 0;
162 if (win == 0)
163 {
164 tp->t_timer[TCPT_REXMT] = 0;
165 tp->snd_nxt = tp->snd_una;
166 }
167 }
168 if (len > tp->t_maxseg)
169 {
170 len = tp->t_maxseg;
171 sendalot = 1;
172 }
173 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
174 flags &= ~TH_FIN;
175
176 win = sbspace(&so->so_rcv);
177
178 /*
179 * Sender silly window avoidance. If connection is idle
180 * and can send all data, a maximum segment,
181 * at least a maximum default-size segment do it,
182 * or are forced, do it; otherwise don't bother.
183 * If peer's buffer is tiny, then send
184 * when window is at least half open.
185 * If retransmitting (possibly after persist timer forced us
186 * to send into a small window), then must resend.
187 */
188 if (len)
189 {
190 if (len == tp->t_maxseg)
191 goto send;
192 if ((1 || idle || tp->t_flags & TF_NODELAY) &&
193 len + off >= so->so_snd.sb_cc)
194 goto send;
195 if (tp->t_force)
196 goto send;
197 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
198 goto send;
199 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
200 goto send;
201 }
202
203 /*
204 * Compare available window to amount of window
205 * known to peer (as advertised window less
206 * next expected input). If the difference is at least two
207 * max size segments, or at least 50% of the maximum possible
208 * window, then want to send a window update to peer.
209 */
210 if (win > 0)
211 {
212 /*
213 * "adv" is the amount we can increase the window,
214 * taking into account that we are limited by
215 * TCP_MAXWIN << tp->rcv_scale.
216 */
217 long adv = min(win,
218 (long)TCP_MAXWIN << tp->rcv_scale) -
219 (tp->rcv_adv - tp->rcv_nxt);
220
221 if (adv >= (long) (2 * tp->t_maxseg))
222 goto send;
223 if (2 * adv >= (long) so->so_rcv.sb_datalen)
224 goto send;
225 }
226
227 /*
228 * Send if we owe peer an ACK.
229 */
230 if (tp->t_flags & TF_ACKNOW)
231 goto send;
232 if (flags & (TH_SYN|TH_RST))
233 goto send;
234 if (SEQ_GT(tp->snd_up, tp->snd_una))
235 goto send;
236 /*
237 * If our state indicates that FIN should be sent
238 * and we have not yet done so, or we're retransmitting the FIN,
239 * then we need to send.
240 */
241 if ( flags & TH_FIN
242 && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
243 goto send;
244
245 /*
246 * TCP window updates are not reliable, rather a polling protocol
247 * using ``persist'' packets is used to insure receipt of window
248 * updates. The three ``states'' for the output side are:
249 * idle not doing retransmits or persists
250 * persisting to move a small or zero window
251 * (re)transmitting and thereby not persisting
252 *
253 * tp->t_timer[TCPT_PERSIST]
254 * is set when we are in persist state.
255 * tp->t_force
256 * is set when we are called to send a persist packet.
257 * tp->t_timer[TCPT_REXMT]
258 * is set when we are retransmitting
259 * The output side is idle when both timers are zero.
260 *
261 * If send window is too small, there is data to transmit, and no
262 * retransmit or persist is pending, then go to persist state.
263 * If nothing happens soon, send when timer expires:
264 * if window is nonzero, transmit what we can,
265 * otherwise force out a byte.
266 */
267 if ( so->so_snd.sb_cc
268 && tp->t_timer[TCPT_REXMT] == 0
269 && tp->t_timer[TCPT_PERSIST] == 0)
270 {
271 tp->t_rxtshift = 0;
272 tcp_setpersist(tp);
273 }
274
275 /*
276 * No reason to send a segment, just return.
277 */
278 tcpstat.tcps_didnuttin++;
279
280 return (0);
281
282send:
283 /*
284 * Before ESTABLISHED, force sending of initial options
285 * unless TCP set not to do any options.
286 * NOTE: we assume that the IP/TCP header plus TCP options
287 * always fit in a single mbuf, leaving room for a maximum
288 * link header, i.e.
289 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
290 */
291 optlen = 0;
292 hdrlen = sizeof (struct tcpiphdr);
293 if (flags & TH_SYN)
294 {
295 tp->snd_nxt = tp->iss;
296 if ((tp->t_flags & TF_NOOPT) == 0)
297 {
298 u_int16_t mss;
299
300 opt[0] = TCPOPT_MAXSEG;
301 opt[1] = 4;
302 mss = htons((u_int16_t) tcp_mss(pData, tp, 0));
303 memcpy((caddr_t)(opt + 2), (caddr_t)&mss, sizeof(mss));
304 optlen = 4;
305
306#if 0
307 if ( (tp->t_flags & TF_REQ_SCALE)
308 && ( (flags & TH_ACK) == 0
309 || (tp->t_flags & TF_RCVD_SCALE)))
310 {
311 *((u_int32_t *) (opt + optlen)) = htonl( TCPOPT_NOP << 24
312 | TCPOPT_WINDOW << 16
313 | TCPOLEN_WINDOW << 8
314 | tp->request_r_scale);
315 optlen += 4;
316 }
317#endif
318 }
319 }
320
321 /*
322 * Send a timestamp and echo-reply if this is a SYN and our side
323 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
324 * and our peer have sent timestamps in our SYN's.
325 */
326#if 0
327 if ( (tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP
328 && (flags & TH_RST) == 0
329 && ( (flags & (TH_SYN|TH_ACK)) == TH_SYN
330 || (tp->t_flags & TF_RCVD_TSTMP)))
331 {
332 u_int32_t *lp = (u_int32_t *)(opt + optlen);
333
334 /* Form timestamp option as shown in appendix A of RFC 1323. */
335 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
336 *lp++ = htonl(tcp_now);
337 *lp = htonl(tp->ts_recent);
338 optlen += TCPOLEN_TSTAMP_APPA;
339 }
340#endif
341 hdrlen += optlen;
342
343 /*
344 * Adjust data length if insertion of options will
345 * bump the packet length beyond the t_maxseg length.
346 */
347 if (len > tp->t_maxseg - optlen)
348 {
349 len = tp->t_maxseg - optlen;
350 sendalot = 1;
351 }
352
353 /*
354 * Grab a header mbuf, attaching a copy of data to
355 * be transmitted, and initialize the header from
356 * the template for sends on this connection.
357 */
358 if (len)
359 {
360 if (tp->t_force && len == 1)
361 tcpstat.tcps_sndprobe++;
362 else if (SEQ_LT(tp->snd_nxt, tp->snd_max))
363 {
364 tcpstat.tcps_sndrexmitpack++;
365 tcpstat.tcps_sndrexmitbyte += len;
366 }
367 else
368 {
369 tcpstat.tcps_sndpack++;
370 tcpstat.tcps_sndbyte += len;
371 }
372
373 m = m_get(pData);
374 if (m == NULL)
375 {
376/* error = ENOBUFS; */
377 error = 1;
378 goto out;
379 }
380 m->m_data += if_maxlinkhdr;
381 m->m_len = hdrlen;
382
383 /*
384 * This will always succeed, since we make sure our mbufs
385 * are big enough to hold one MSS packet + header + ... etc.
386 */
387#if 0
388 if (len <= MHLEN - hdrlen - max_linkhdr)
389 {
390#endif
391 sbcopy(&so->so_snd, off, (int) len, mtod(m, caddr_t) + hdrlen);
392 m->m_len += len;
393#if 0
394 }
395 else
396 {
397 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
398 if (m->m_next == 0)
399 len = 0;
400 }
401#endif
402 /*
403 * If we're sending everything we've got, set PUSH.
404 * (This will keep happy those implementations which only
405 * give data to the user when a buffer fills or
406 * a PUSH comes in.)
407 */
408 if (off + len == so->so_snd.sb_cc)
409 flags |= TH_PUSH;
410 }
411 else
412 {
413 if (tp->t_flags & TF_ACKNOW)
414 tcpstat.tcps_sndacks++;
415 else if (flags & (TH_SYN|TH_FIN|TH_RST))
416 tcpstat.tcps_sndctrl++;
417 else if (SEQ_GT(tp->snd_up, tp->snd_una))
418 tcpstat.tcps_sndurg++;
419 else
420 tcpstat.tcps_sndwinup++;
421
422 m = m_get(pData);
423 if (m == NULL)
424 {
425/* error = ENOBUFS; */
426 error = 1;
427 goto out;
428 }
429 m->m_data += if_maxlinkhdr;
430 m->m_len = hdrlen;
431 }
432
433 ti = mtod(m, struct tcpiphdr *);
434
435 memcpy((caddr_t)ti, &tp->t_template, sizeof (struct tcpiphdr));
436
437 /*
438 * Fill in fields, remembering maximum advertised
439 * window for use in delaying messages about window sizes.
440 * If resending a FIN, be sure not to use a new sequence number.
441 */
442 if ( flags & TH_FIN
443 && tp->t_flags & TF_SENTFIN
444 && tp->snd_nxt == tp->snd_max)
445 tp->snd_nxt--;
446 /*
447 * If we are doing retransmissions, then snd_nxt will
448 * not reflect the first unsent octet. For ACK only
449 * packets, we do not want the sequence number of the
450 * retransmitted packet, we want the sequence number
451 * of the next unsent octet. So, if there is no data
452 * (and no SYN or FIN), use snd_max instead of snd_nxt
453 * when filling in ti_seq. But if we are in persist
454 * state, snd_max might reflect one byte beyond the
455 * right edge of the window, so use snd_nxt in that
456 * case, since we know we aren't doing a retransmission.
457 * (retransmit and persist are mutually exclusive...)
458 */
459 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
460 ti->ti_seq = htonl(tp->snd_nxt);
461 else
462 ti->ti_seq = htonl(tp->snd_max);
463 ti->ti_ack = htonl(tp->rcv_nxt);
464 if (optlen)
465 {
466 memcpy((caddr_t)(ti + 1), (caddr_t)opt, optlen);
467 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
468 }
469 ti->ti_flags = flags;
470 /*
471 * Calculate receive window. Don't shrink window,
472 * but avoid silly window syndrome.
473 */
474 if (win < (long)(so->so_rcv.sb_datalen / 4) && win < (long)tp->t_maxseg)
475 win = 0;
476 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
477 win = (long)TCP_MAXWIN << tp->rcv_scale;
478 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
479 win = (long)(tp->rcv_adv - tp->rcv_nxt);
480 ti->ti_win = htons((u_int16_t) (win>>tp->rcv_scale));
481
482#if 0
483 if (SEQ_GT(tp->snd_up, tp->snd_nxt))
484 {
485 ti->ti_urp = htons((u_int16_t)(tp->snd_up - tp->snd_nxt));
486#else
487 if (SEQ_GT(tp->snd_up, tp->snd_una))
488 {
489 ti->ti_urp = htons((u_int16_t)(tp->snd_up - ntohl(ti->ti_seq)));
490#endif
491 ti->ti_flags |= TH_URG;
492 }
493 else
494 /*
495 * If no urgent pointer to send, then we pull
496 * the urgent pointer to the left edge of the send window
497 * so that it doesn't drift into the send window on sequence
498 * number wraparound.
499 */
500 tp->snd_up = tp->snd_una; /* drag it along */
501
502 /*
503 * Put TCP length in extended header, and then
504 * checksum extended header and data.
505 */
506 if (len + optlen)
507 ti->ti_len = htons((u_int16_t)(sizeof (struct tcphdr)
508 + optlen + len));
509 ti->ti_sum = cksum(m, (int)(hdrlen + len));
510
511 /*
512 * In transmit state, time the transmission and arrange for
513 * the retransmit. In persist state, just set snd_max.
514 */
515 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0)
516 {
517 tcp_seq startseq = tp->snd_nxt;
518
519 /*
520 * Advance snd_nxt over sequence space of this segment.
521 */
522 if (flags & (TH_SYN|TH_FIN))
523 {
524 if (flags & TH_SYN)
525 tp->snd_nxt++;
526 if (flags & TH_FIN)
527 {
528 tp->snd_nxt++;
529 tp->t_flags |= TF_SENTFIN;
530 }
531 }
532 tp->snd_nxt += len;
533 if (SEQ_GT(tp->snd_nxt, tp->snd_max))
534 {
535 tp->snd_max = tp->snd_nxt;
536 /*
537 * Time this transmission if not a retransmission and
538 * not currently timing anything.
539 */
540 if (tp->t_rtt == 0)
541 {
542 tp->t_rtt = 1;
543 tp->t_rtseq = startseq;
544 tcpstat.tcps_segstimed++;
545 }
546 }
547
548 /*
549 * Set retransmit timer if not currently set,
550 * and not doing an ack or a keep-alive probe.
551 * Initial value for retransmit timer is smoothed
552 * round-trip time + 2 * round-trip time variance.
553 * Initialize shift counter which is used for backoff
554 * of retransmit time.
555 */
556 if ( tp->t_timer[TCPT_REXMT] == 0
557 && tp->snd_nxt != tp->snd_una)
558 {
559 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
560 if (tp->t_timer[TCPT_PERSIST])
561 {
562 tp->t_timer[TCPT_PERSIST] = 0;
563 tp->t_rxtshift = 0;
564 }
565 }
566 }
567 else
568 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
569 tp->snd_max = tp->snd_nxt + len;
570
571 /*
572 * Fill in IP length and desired time to live and
573 * send to IP level. There should be a better way
574 * to handle ttl and tos; we could keep them in
575 * the template, but need a way to checksum without them.
576 */
577 Assert(m->m_len == (hdrlen + len));
578 m->m_len = hdrlen + len; /* XXX Needed? m_len should be correct */
579
580 {
581 ((struct ip *)ti)->ip_len = m->m_len;
582 ((struct ip *)ti)->ip_ttl = ip_defttl;
583 ((struct ip *)ti)->ip_tos = so->so_iptos;
584
585 /* #if BSD >= 43 */
586 /* Don't do IP options... */
587#if 0
588 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
589 so->so_options & SO_DONTROUTE, 0);
590#endif
591#ifdef VBOX_WITH_NAT_SERVICE
592 {
593 struct ethhdr *eh0, *eh;
594 eh = (struct ethhdr *)m->m_dat;
595
596 if (so->so_m != NULL)
597 {
598 eh0 = (struct ethhdr *)so->so_m->m_dat;
599 memcpy(eh->h_source, eh0->h_source, ETH_ALEN);
600 }
601 }
602#endif
603 error = ip_output(pData, so, m);
604
605#if 0
606/* #else */
607 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
608 so->so_options & SO_DONTROUTE);
609/* #endif */
610#endif
611 }
612 if (error)
613 {
614out:
615#if 0
616 if (error == ENOBUFS)
617 {
618 tcp_quench(tp->t_inpcb, 0);
619 return (0);
620 }
621
622 if ( ( error == EHOSTUNREACH
623 || error == ENETDOWN)
624 && TCPS_HAVERCVDSYN(tp->t_state))
625 {
626 tp->t_softerror = error;
627 return (0);
628 }
629#endif
630 return (error);
631 }
632 tcpstat.tcps_sndtotal++;
633
634 /*
635 * Data sent (as far as we can tell).
636 * If this advertises a larger window than any other segment,
637 * then remember the size of the advertised window.
638 * Any pending ACK has now been sent.
639 */
640 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
641 tp->rcv_adv = tp->rcv_nxt + win;
642 tp->last_ack_sent = tp->rcv_nxt;
643 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
644 if (sendalot)
645 goto again;
646
647 return (0);
648}
649
650void
651tcp_setpersist(struct tcpcb *tp)
652{
653 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
654
655#if 0
656 if (tp->t_timer[TCPT_REXMT])
657 panic("tcp_output REXMT");
658#endif
659 /*
660 * Start/restart persistence timer.
661 */
662 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
663 t * tcp_backoff[tp->t_rxtshift],
664 TCPTV_PERSMIN, TCPTV_PERSMAX);
665 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
666 tp->t_rxtshift++;
667}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette