git: 9front

ref: a6e5d4bae6075c741a39fcba62a365d9dffaed93
dir: /sys/src/9/ip/tcp.c/

View raw version
#include	"u.h"
#include	"../port/lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"../port/error.h"

#include	"ip.h"

/*
 *  Protocol constants: header geometry, timer parameters, TCP flag
 *  bits, option codes, defaults, connection states and limbo/window
 *  sizing.
 */
enum
{
	IP_TCPPROTO	= 6,

	/*
	 *  IPv4 layout of Tcp4hdr: TCP4_IPLEN bytes of IP header precede
	 *  the TCP4_PHDRSIZE byte pseudo header; TCP4_TCBPHDRSZ bytes of
	 *  the prototype header are copied into each outgoing packet.
	 */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* IPv6 layout of Tcp6hdr: the whole IP header doubles as pseudo header */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* TCP option kinds and their total lengths in bytes */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1220,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	MAX_KAT		= 3,		/* Maximum number of keep-alive timeouts */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* Tcpctl.flags bits */
	FORCE		= 1,
	SYNACK		= 2,

	/* shifts used for the fixed-point srtt and mdev estimates */
	LOGAGAIN	= 3,		/* alpha 1/8 */
	LOGDGAIN	= 2,		/* beta 1/4 */

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb · 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * 64kb · 2⁵ = 2mb, or 2x overkill for 100mbps · 70ms.
	 * 64kb · 2⁷ = 8mb, or around 1000mbps · 70ms.
	 */
	QSCALE		= 7,
	QMAX		= 64*1024-1,
};

/* error string for a negative return from ipoput, which means no route */
static char Enoroute[] = "no route";

/*
 *  Printable names of the connection states, indexed by state
 *  (see tcpstate()).  Must correspond to the enumeration above.
 */
static char *tcpstates[] =
{
	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
	"Closing", 	"Last_ack", 	"Time_wait"
};

/*
 *  A countdown timer.  Running timers are chained on Tcppriv.timers
 *  and counted down once per tick by tcpackproc().
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* links in the Tcppriv.timers list */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* link in tcpackproc's list of expired timers */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* reload value in ticks; tcpgo() ignores a timer with start 0 */
	int	count;			/* ticks left; expires when it reaches 0 */
	void	(*func)(void*);		/* called with arg on expiry */
	void	*arg;
};

/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 *
 *  Tcp4hdr is an IPv4 header followed by the TCP header.  The
 *  checksum in htontcp4() starts TCP4_IPLEN bytes in, so the fields
 *  from ttl through tcpdst serve as the pseudo header.
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	ttl;
	uchar	proto;
	uchar	tcplen[2];	/* TCP header plus data length, set by htontcp4() */
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};

/*
 *  IPv6 header followed by the TCP header.  htontcp6() temporarily
 *  overwrites vcf, ploadlen, proto and ttl with pseudo header values
 *  while checksumming and then restores the real IP fields.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];
	uchar	ploadlen[2];
	uchar	proto;
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};

/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 *  All fields are in host byte order.
 */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;	/* source port */
	ushort	dest;	/* destination port */
	ulong	seq;	/* sequence number */
	ulong	ack;	/* acknowledgement number */
	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
	uchar	update;	/* cleared by ntohtcp{4,6}() */
	uchar	ws;	/* window scale option */
	ulong	wnd;	/* prescaled window*/
	ushort	mss;	/* max segment size option (if not zero) */
	ushort	len;	/* size of data */
};

/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;		/* next entry of the resequencing queue */
	Tcp	seg;		/* control info of the queued segment */
	Block	*bp;		/* the segment's data */
	ushort	length;
};

/*
 *  Per-conversation TCP control block, reached through Conv.ptcl.
 *
 *  the qlock in the Conv locks this structure
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	Conv	*bypass;		/* The other when bypassing */
	uchar	state;			/* Connection state */
	uchar	flags;			/* State flags */
	uchar	flgcnt;			/* Number of flags in the send sequence (FIN,SYN) */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	wl2;		/* Seg.ack of last window update */
		ulong	wl1;		/* Seg.seq of last window update */
		uchar	scale;		/* how much to left shift window in received packets */
		uchar	rto;		/* retransmit timeout counter */ 
		/* to implement tahoe and reno TCP */
		uchar	recovery;	/* loss recovery flag */
		ulong	rxt;		/* right window marker for recovery "recover" rfc3782 */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		ulong	partialack;	/* partial acks received during recovery */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	ackptr;		/* Last acked sequence */
		ulong	wptr;		/* Right side of receive window */
		ulong	wsnt;		/* Last wptr sent */
		ulong	wnd;		/* Receive window incoming */
		uchar	scale;		/* how much to right shift window in transmitted packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	ulong	cwind;			/* Congestion window */
	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
	ulong	ssthresh;		/* Slow start threshold */
	ushort	mss;			/* Maximum segment size */
	uchar	scale;			/* desired rcv.scale */
	ulong	window;			/* Our receive window (queue) */
	ulong	overlap;		/* Overlap of data re-received */
	Reseq	*reseq;			/* Resequencing queue */
	int	nreseq;
	int	reseqlen;
	int	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	kato;			/* keep alive timeouts */
	ulong	time;			/* time Finwait2 or Syn_received or timer was set */
	ulong	timeuna;		/* snd.una when time was set */
	ulong	rttime;			/* Sent time for rtt measurement */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip */
	int	mdev;			/* Mean deviation of round trip */

	/* filled in by inittcpctl() and copied into every packet by htontcp{4,6}() */
	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};

/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK up to 4 times. They disappear after 2.5 seconds.
 *
 *  Entries hang off the Tcppriv.lht hash buckets (see limbohash()).
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;		/* next entry in the same hash bucket */

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	uchar	ws;		/* ws from the other end */
	uchar	rexmits;	/* number of retransmissions */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
};

/* in ms; seeds srtt, mdev and the retransmit timer in inittcpctl() */
static int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */

/*
 *  Indices into Tcppriv.stats; statnames[] gives the printable
 *  name of each counter.  Nstats is the number of counters.
 */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	InLimbo,

	Nstats
};

/*
 *  Printable name of each statistics counter.  Entries are placed
 *  by explicit index, so their order here need not follow the enum.
 */
static char *statnames[Nstats] =
{
[MaxConn]	"MaxConn",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransSegsSent]	"RetransSegsSent",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
[Resequenced]	"Resequenced",
[ReseqBytelim]	"ReseqBytelim",
[ReseqPktlim]	"ReseqPktlim",
[Delayack]	"Delayack",
[Wopenack]	"Wopenack",

[Recovery]	"Recovery",
[RecoveryDone]	"RecoveryDone",
[RecoveryRTO]	"RecoveryRTO",

[RecoveryNoSeq]	"RecoveryNoSeq",
[RecoveryCwind]	"RecoveryCwind",
[RecoveryPA]	"RecoveryPA",

[InLimbo]	"InLimbo",
};

/*
 *  Per-protocol-instance private state, reached through Proto.priv.
 */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* held while the timers list is walked or changed */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;		/* number of entries in lht */
	Limbo	*lht[NLHT];

	uvlong	stats[Nstats];

	int ackprocstarted;	/* set once tcpstart() has started the tcpackproc kproc */
};

/*
 *  Forward declarations of functions that are used before
 *  they are defined.
 */
static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block**, ushort);
static	int	dumpreseq(Tcpctl*);
static	int	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
static	void	limbo(Conv*, Tcp*, uchar*, uchar*, int);
static	void	limborexmit(Proto*);
static	void	localclose(Conv*, char*);
static	void	procsyn(Conv*, Tcp*);
static	void	tcpacktimer(void*);
static	void	tcpiput(Proto*, Ipifc*, Block*);
static	void	tcpkeepalive(void*);
static	void	tcpoutput(Conv*);
static	void	tcprcvwin(Conv*);
static	ulong	tcprxmit(Conv*);
static	void	tcpsetmss(Conv*, ushort);
static	void	tcpsetscale(Conv*, uchar);
static	void	tcpsettimer(Conv*);
static	void	tcpsndsyn(Conv*);
static	void	tcpstart(Conv*, int);
static	void	tcptimeout(void*);
static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);

/*
 *  Move conversation s into newstate, doing the queue work the
 *  transition implies and keeping the CurrEstab counter current.
 *  Setting the state it is already in does nothing.
 */
static void
tcpsetstate(Conv *s, uchar newstate)
{
	Tcpctl *tcb;
	Tcppriv *tpriv;
	uchar prev;

	tcb = (Tcpctl*)s->ptcl;
	prev = tcb->state;
	if(prev == newstate)
		return;

	if(newstate == Closed){
		/* close every queue and undo a splice, if any */
		qclose(s->rq);
		qclose(s->wq);
		qclose(s->eq);
		if(tcb->bypass != nil){
			tcb->bypass = nil;
			qsetbypass(s->wq, nil);
		}
	} else if(newstate == Close_wait){
		/* Remote closes */
		qhangup(s->rq, nil);
	}

	tcb->state = newstate;

	/* CurrEstab counts the conversations in Established */
	tpriv = (Tcppriv*)s->p->priv;
	if(prev == Established)
		tpriv->stats[CurrEstab]--;
	if(newstate == Established)
		tpriv->stats[CurrEstab]++;

	/* leaving Syn_sent for anything but Closed: report no error */
	if(prev == Syn_sent && newstate != Closed)
		Fsconnected(s, nil);
}

/*
 *  Start an outgoing call on c.  Returns nil on success,
 *  Econinuse if c is not Closed, or the error string from
 *  Fsstdconnect().
 */
static char*
tcpconnect(Conv *c, char **argv, int argc)
{
	char *err;

	if(((Tcpctl*)c->ptcl)->state != Closed)
		return Econinuse;

	err = Fsstdconnect(c, argv, argc);
	if(err == nil)
		tcpstart(c, TCP_CONNECT);
	return err;
}

/*
 *  Format a one line status report of c into state (at most n
 *  bytes) and return what snprint() returns.  A missing queue
 *  is reported with length 0.
 */
static int
tcpstate(Conv *c, char *state, int n)
{
	Tcpctl *tcb;
	int qin, qout;

	tcb = (Tcpctl*)c->ptcl;

	qin = 0;
	if(c->rq != nil)
		qin = qlen(c->rq);
	qout = 0;
	if(c->wq != nil)
		qout = qlen(c->wq);

	/* timer values are kept in ticks and printed in ms */
	return snprint(state, n, "%s qin %d qout %d"
		" rq %d.%d ovl %lud"
		" srtt %d>>%d mdev %d"
		" timer %d/%d"
		" sst %lud cwin %lud"
		" swin %lud>>%d rwin %lud>>%d"
		" ka %d/%d %lud\n",
		tcpstates[tcb->state],
		qin,
		qout,
		tcb->nreseq, tcb->reseqlen, tcb->overlap,
		tcb->srtt, LOGAGAIN, tcb->mdev,
		tcb->timer.count*MSPTICK, tcb->timer.start*MSPTICK,
		tcb->ssthresh, tcb->cwind,
		tcb->snd.wnd, tcb->snd.scale, tcb->rcv.wnd, tcb->rcv.scale,
		tcb->katimer.count*MSPTICK, tcb->katimer.start*MSPTICK, tcb->kato);
}

/*
 *  Returns non-zero when c is in any state other than Closed.
 */
static int
tcpinuse(Conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl*)c->ptcl;
	if(tcb->state == Closed)
		return 0;
	return 1;
}

/*
 *  Put c into listening mode.  Returns nil on success,
 *  Econinuse if c is not Closed, or the error string from
 *  Fsstdannounce().
 */
static char*
tcpannounce(Conv *c, char **argv, int argc)
{
	char *err;

	if(((Tcpctl*)c->ptcl)->state != Closed)
		return Econinuse;

	err = Fsstdannounce(c, argv, argc);
	if(err == nil){
		tcpstart(c, TCP_LISTEN);
		Fsconnected(c, nil);
	}
	return err;
}

/*
 *  Local close of a conversation: hang up all of its queues,
 *  discard unread input, then either tear the connection down
 *  at once or queue a FIN, depending on the current state.
 *
 *  tcpclose is always called with the c locked
 */
static void
tcpclose(Conv *c)
{
	Tcpctl *tcb = (Tcpctl*)c->ptcl;

	qhangup(c->rq, nil);
	qhangup(c->wq, nil);
	qhangup(c->eq, nil);
	qflush(c->rq);

	switch(tcb->state) {
	case Listen:
		/*
		 *  reset any incoming calls to this listener
		 */
		Fsconnected(c, "Hangup");

		localclose(c, nil);
		break;
	case Closed:
	case Syn_sent:
		localclose(c, nil);
		break;
	case Established:
		/* spliced connection: hang up the peer's read queue and close at once */
		if(tcb->bypass != nil){
			qhangup(tcb->bypass->rq, nil);
			localclose(c, nil);
			break;
		}
		/* wet floor (deliberate fall through) */
	case Syn_received:
		/* the FIN takes one sequence number */
		tcb->flgcnt++;
		tcb->snd.nxt++;
		tcpsetstate(c, Finwait1);
		tcpoutput(c);
		break;
	case Close_wait:
		/* same accounting as above, but the next state is Last_ack */
		tcb->flgcnt++;
		tcb->snd.nxt++;
		tcpsetstate(c, Last_ack);
		tcpoutput(c);
		break;
	}
}

/*
 *  Kick routine of the write queue (see tcpcreate()).  In a state
 *  that can send, push output, except that a spliced Established
 *  connection does nothing here.  In any other state the
 *  conversation is closed with "Hangup".
 */
static void
tcpkick(void *arg)
{
	Conv *s;
	Tcpctl *tcb;
	int st;

	s = (Conv*)arg;
	tcb = (Tcpctl*)s->ptcl;

	qlock(s);
	st = tcb->state;
	if(st == Established){
		if(tcb->bypass == nil)
			tcpoutput(s);
	} else if(st == Syn_sent || st == Syn_received || st == Close_wait){
		/*
		 * Push data
		 */
		tcpoutput(s);
	} else
		localclose(s, "Hangup");
	qunlock(s);
}

/*
 *  Returns non-zero when sequence number x lies in the half open
 *  range [low, high), with the arithmetic done relative to low so
 *  that it survives wrap around.
 */
static int
seq_in(ulong x, ulong low, ulong high)
{
	ulong off, span;

	off = x - low;
	span = high - low;
	if((int)off < 0)
		return 0;
	return (int)off < span;
}

/*
 *  Sequence number comparison x < y, done on the signed
 *  difference so that it survives wrap around.
 */
static int
seq_lt(ulong x, ulong y)
{
	int delta;

	delta = (int)(x - y);
	return delta < 0;
}

/*
 *  Sequence number comparison x <= y, done on the signed
 *  difference so that it survives wrap around.
 */
static int
seq_le(ulong x, ulong y)
{
	int delta;

	delta = (int)(x - y);
	return delta <= 0;
}

/*
 *  Sequence number comparison x > y, done on the signed
 *  difference so that it survives wrap around.
 */
static int
seq_gt(ulong x, ulong y)
{
	int delta;

	delta = (int)(x - y);
	return delta > 0;
}

/*
 *  Sequence number comparison x >= y, done on the signed
 *  difference so that it survives wrap around.
 */
static int
seq_ge(ulong x, ulong y)
{
	int delta;

	delta = (int)(x - y);
	return delta >= 0;
}

/*
 *  Recompute the receive window from the space left in the read
 *  queue and store it as rcv.wptr (right edge) and rcv.wnd (size
 *  relative to rcv.nxt).
 */
static void
tcprcvwin(Conv *s)				/* Call with tcb locked */
{
	Tcpctl *tcb = (Tcpctl*)s->ptcl;
	ulong w;

	/* room left in the read queue */
	w = tcb->window - qlen(s->rq);

	/* RFC 1122 § 4.2.3.3 silly window syndrome avoidance */
	/*
	 * NOTE(review): rcv.wnd is ulong, so this comparison is presumably
	 * carried out unsigned despite the (int) cast; a w that went
	 * "negative" is then only caught by the right edge check below.
	 * Confirm that this is intended.
	 */
	if((int)w < tcb->rcv.wnd + tcb->mss)
		w = 0;

	/* from here on w is the sequence number of the right edge */
	w += tcb->rcv.nxt;

	/* RFC 1122 § 4.2.2.16 do not move right edge of window left */
	if(seq_lt(w, tcb->rcv.wptr))
		w = tcb->rcv.wptr;

	tcb->rcv.wptr = w;
	tcb->rcv.wnd = w - tcb->rcv.nxt;
}

/*
 *  Acknowledge timer callback, also installed as the kick routine
 *  of the read queue (see tcpcreate()).  Forces a segment out
 *  unless the conversation is Closed or spliced.
 */
static void
tcpacktimer(void *arg)
{
	Conv *s;
	Tcpctl *tcb;

	s = (Conv*)arg;
	tcb = (Tcpctl*)s->ptcl;

	qlock(s);
	if(tcb->state == Closed || tcb->bypass != nil){
		qunlock(s);
		return;
	}
	tcb->flags |= FORCE;
	tcpoutput(s);
	qunlock(s);
}

/*
 *  Set the slow start threshold to half of the data in flight
 *  (capped at the congestion window), but never below two
 *  segments.
 */
static void
tcpcongestion(Tcpctl *tcb)
{
	ulong outstanding, minthresh;

	outstanding = tcb->snd.nxt - tcb->snd.una;
	if(tcb->cwind < outstanding)
		outstanding = tcb->cwind;

	minthresh = 2*tcb->mss;
	tcb->ssthresh = outstanding / 2;
	if(tcb->ssthresh < minthresh)
		tcb->ssthresh = minthresh;
}

/*
 *  Limit, in units of mss, on how much cwind may grow per ack
 *  during slow start (used by tcpabcincr()).
 */
enum {
	L		= 2,		/* aggressive slow start; rfc 3465 allows values in [1, 2] */
};

/*
 *  Grow the congestion window by appropriate byte counting
 *  (rfc 3465) after acked new bytes have been acknowledged.
 */
static void
tcpabcincr(Tcpctl *tcb, uint acked)
{
	uint limit;

	tcb->abcbytes += acked;

	if(tcb->cwind >= tcb->ssthresh){
		/* avoidance: one mss for every cwind bytes acknowledged */
		tcb->snd.rto = 0;
		if(tcb->abcbytes >= tcb->cwind){
			tcb->abcbytes -= tcb->cwind;
			tcb->cwind += tcb->mss;
		}
		return;
	}

	/* slow start: at most L segments per ack, one while snd.rto is set */
	limit = tcb->mss;
	if(!tcb->snd.rto)
		limit = L*tcb->mss;
	tcb->cwind += MIN(tcb->abcbytes, limit);
	tcb->abcbytes = 0;
}

/*
 *  Allocate the queues of a new conversation.  Both are limited
 *  to QMAX bytes.  The read queue gets tcpacktimer and the write
 *  queue gets tcpkick as their callback, each with c as argument.
 */
static void
tcpcreate(Conv *c)
{
	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
	c->wq = qopen(QMAX, Qkick, tcpkick, c);
}

/*
 *  Change the state of timer t, linking it onto priv->timers when
 *  it becomes TcptimerON and unlinking it when it stops being
 *  TcptimerON.  Every caller in this file holds priv->tl.
 *  Panics if the list links are inconsistent.
 */
static void
timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
{
	if(newstate != TcptimerON){
		if(t->state == TcptimerON){
			/* unchain */
			if(priv->timers == t){
				priv->timers = t->next;
				/* the head of the list must not have a predecessor */
				if(t->prev != nil)
					panic("timerstate1");
			}
			if(t->next)
				t->next->prev = t->prev;
			if(t->prev)
				t->prev->next = t->next;
			t->next = t->prev = nil;
		}
	} else {
		if(t->state != TcptimerON){
			/* chain */
			/* a timer that is not running must not be linked */
			if(t->prev != nil || t->next != nil)
				panic("timerstate2");
			/* insert at the head of the list */
			t->prev = nil;
			t->next = priv->timers;
			if(t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}

/*
 *  Timer process, started once per protocol instance by tcpstart().
 *  Every MSPTICK ms it counts down all running timers, runs the
 *  callbacks of those that expired and then lets limborexmit()
 *  service the calls in limbo.  Never returns.
 */
static void
tcpackproc(void *arg)
{
	Proto *tcp = (Proto*)arg;
	Tcppriv *priv = (Tcppriv*)tcp->priv;
	Tcptimer *t, *tp, *timeo;

	/* an error raised anywhere below restarts the loop */
	while(waserror())
		;

	for(;;) {
		tsleep(&up->sleep, return0, 0, MSPTICK);

		/* collect the expired timers on the timeo list while holding tl */
		qlock(&priv->tl);
		timeo = nil;
		for(t = priv->timers; t != nil; t = tp) {
			/* remember the successor; timerstate() clears t->next */
			tp = t->next;
 			if(t->state == TcptimerON) {
				if(--(t->count) == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/*
		 *  run the callbacks without holding tl.  A timer that
		 *  is no longer TcptimerDONE by now is skipped.
		 */
		while((t = timeo) != nil){
			timeo = t->readynext;
			if(t->state == TcptimerDONE && !waserror()){
				(*t->func)(t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}

/*
 *  (Re)start timer t from its start value.  Does nothing for a
 *  nil timer or one whose start value is 0.
 */
static void
tcpgo(Tcppriv *priv, Tcptimer *t)
{
	if(t != nil && t->start != 0){
		qlock(&priv->tl);
		t->count = t->start;
		timerstate(priv, t, TcptimerON);
		qunlock(&priv->tl);
	}
}

/*
 *  Stop timer t.  Does nothing for a nil timer.
 */
static void
tcphalt(Tcppriv *priv, Tcptimer *t)
{
	if(t != nil){
		qlock(&priv->tl);
		timerstate(priv, t, TcptimerOFF);
		qunlock(&priv->tl);
	}
}

/*
 *  Tear down conversation s locally: remove it from the hash
 *  table, stop its timers, drop queued out of order data, hang up
 *  its queues with reason and enter Closed.
 */
static void
localclose(Conv *s, char *reason)	/* called with c locked */
{
	Tcpctl *tcb = (Tcpctl*)s->ptcl;
	Tcppriv *tpriv = (Tcppriv*)s->p->priv;

	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	dumpreseq(tcb);

	/* a call still in Syn_sent gets reason passed to Fsconnected */
	if(tcb->state == Syn_sent)
		Fsconnected(s, reason);
	if(s->state == Announced)
		wakeup(&s->listenr);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	/* must come last: the Syn_sent test above needs the old state */
	tcpsetstate(s, Closed);
}

/*
 *  mtu (- TCP + IP hdr len) of 1st hop
 *
 *  Returns the segment size to use for route r (which may be nil)
 *  and IP version, and always sets *scale to QSCALE.
 */
static int
tcpmtu(Route *r, uchar *scale, int version)
{
	Ipifc *ifc;
	int mtu;

	*scale = QSCALE;

	/*
	 * currently we do not implement path MTU discovery
	 * so use interface MTU *only* if directly reachable
	 * or when we use V4 which allows routers to fragment.
	 * otherwise, we use the default MSS which assumes a
	 * safe minimum MTU of 1280 bytes for V6.
	 */  
	if(r != nil && (ifc = r->ifc) != nil){
		/* interface mtu less the medium's header size */
		mtu = ifc->maxtu - ifc->m->hsize;
		if(version == V4)
			return mtu - (TCP4_PKT + TCP4_HDRSIZE);
		mtu -= TCP6_PKT + TCP6_HDRSIZE;
		/* V6: only for Rifc/Runi routes, or when it is no larger than the default */
		if((r->type & (Rifc|Runi)) != 0 || mtu <= DEF_MSS6)
			return mtu;
	}
	/* no usable route or interface: fall back to the defaults */
	if(version == V6)
		return DEF_MSS6;
	else
		return DEF_MSS;
}

/*
 *  Reset the control block of s to its initial state: zero it,
 *  seed the round trip estimates and set up the three timers.
 *  For an outgoing call (any mode but TCP_LISTEN) also pick a
 *  local address if none is set, build the prototype header and
 *  set the default mss.  Panics on an unknown IP version.
 */
static void
inittcpctl(Conv *s, int mode)
{
	Tcpctl *tcb = (Tcpctl*)s->ptcl;
	Tcp4hdr* h4;
	Tcp6hdr* h6;

	memset(tcb, 0, sizeof(Tcpctl));

	/* srtt and mdev are kept scaled, see LOGAGAIN and LOGDGAIN */
	tcb->srtt = tcp_irtt<<LOGAGAIN;
	tcb->mdev = tcp_irtt<<(LOGDGAIN-1);

	/* setup timers */
	/* start values are in ticks of MSPTICK ms */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;

	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;

	tcb->katimer.start = 0;	/* not enabled by default */
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	/* a listener needs nothing more */
	if(mode == TCP_LISTEN)
		return;

	if(ipcmp(s->laddr, IPnoaddr) == 0)
		findlocalip(s->p->f, s->laddr, s->raddr);

	/* create a prototype(pseudo) header */
	switch(s->ipversion){
	case V4:
		h4 = &tcb->protohdr.tcp4hdr;
		memset(h4, 0, sizeof(*h4));
		h4->vihl = IP_VER4;
		h4->proto = IP_TCPPROTO;
		hnputs(h4->tcpsport, s->lport);
		hnputs(h4->tcpdport, s->rport);
		v6tov4(h4->tcpsrc, s->laddr);
		v6tov4(h4->tcpdst, s->raddr);

		tcb->mss = DEF_MSS;
		break;
	case V6:
		h6 = &tcb->protohdr.tcp6hdr;
		memset(h6, 0, sizeof(*h6));
		h6->proto = IP_TCPPROTO;
		hnputs(h6->tcpsport, s->lport);
		hnputs(h6->tcpdport, s->rport);
		ipmove(h6->tcpsrc, s->laddr);
		ipmove(h6->tcpdst, s->raddr);

		tcb->mss = DEF_MSS6;
		break;
	default:
		panic("inittcpctl: version %d", s->ipversion);
	}
	tcpsetscale(s, 0);
}

/*
 *  Bring conversation s to life as a listener (TCP_LISTEN) or as
 *  an outgoing call (TCP_CONNECT).  The first call for a protocol
 *  instance also starts its timer kproc.
 *
 *  called with s locked
 */
static void
tcpstart(Conv *s, int mode)
{
	Tcppriv *tpriv = (Tcppriv*)s->p->priv;

	/* start tcpackproc once; the flag is tested again under tl */
	if(tpriv->ackprocstarted == 0){
		qlock(&tpriv->tl);
		if(tpriv->ackprocstarted == 0){
			char kpname[KNAMELEN];

			snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
			kproc(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->tl);
	}

	inittcpctl(s, mode);

	iphtadd(&tpriv->ht, s);
	switch(mode) {
	case TCP_LISTEN:
		tpriv->stats[PassiveOpens]++;
		tcpsetstate(s, Listen);
		break;

	case TCP_CONNECT:
		tpriv->stats[ActiveOpens]++;
		/* set up the SYN, enter Syn_sent, then transmit */
		tcpsndsyn(s);
		tcpsetstate(s, Syn_sent);
		tcpoutput(s);
		break;
	}
}

/*
 *  Format the header length and flag word flag into buf (which
 *  ends at e): the header length, then the name of every flag bit
 *  that is set.  Returns buf.
 */
static char*
tcpflag(char *buf, char *e, ushort flag)
{
	static struct {
		ushort	bit;
		char	*name;
	} bits[] = {
		{ URG,	" URG" },
		{ ACK,	" ACK" },
		{ PSH,	" PSH" },
		{ RST,	" RST" },
		{ SYN,	" SYN" },
		{ FIN,	" FIN" },
	};
	char *p;
	int i;

	p = seprint(buf, e, "%d", flag>>10);	/* Head len */
	for(i = 0; i < sizeof(bits)/sizeof(bits[0]); i++){
		if(flag & bits[i].bit)
			p = seprint(p, e, "%s", bits[i].name);
	}
	USED(p);
	return buf;
}

/*
 *  Build an IPv6 TCP packet from the control info in tcph, the
 *  prototype header ph and the payload data (nil for none).
 *  The headers are put in front of data, or a fresh block is
 *  allocated when there is no data.  Options (mss, window scale)
 *  are only added to SYN segments, each only when non-zero.
 *  Returns the block holding the packet.
 */
static Block*
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph)
{
	int dlen;
	Tcp6hdr *h;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* size of the TCP header, with options padded to a multiple of 4 */
	hdrlen = TCP6_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(tcph->ws)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP6_PKT);
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
		data->wp += hdrlen + TCP6_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *)(data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	/* length goes into vcf and the protocol into ttl, the rest is zero */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is a multiple of 4, so hdrlen<<10 puts hdrlen/4 into the top 4 bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd);
	hnputs(h->tcpurg, 0);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if(tcph->ws != 0){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		/* pad with no-ops up to the length computed above */
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	hnputs(h->tcpcksum, ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE));

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen+dlen);
	h->proto = ph->proto;

	return data;
}

/*
 *  Build an IPv4 TCP packet from the control info in tcph, the
 *  prototype header ph and the payload data (nil for none).
 *  The headers are put in front of data, or a fresh block is
 *  allocated when there is no data.  Options are only added to
 *  SYN segments: mss when non-zero, window scale always (unlike
 *  htontcp6(), which leaves out a zero window scale).
 *  Returns the block holding the packet.
 */
static Block*
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph)
{
	int dlen;
	Tcp4hdr *h;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* size of the TCP header, with options padded to a multiple of 4 */
	hdrlen = TCP4_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(1)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP4_PKT);
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
		data->wp += hdrlen + TCP4_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *)(data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is a multiple of 4, so hdrlen<<10 puts hdrlen/4 into the top 4 bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd);
	hnputs(h->tcpurg, 0);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		/* always offer.  rfc1323 §2.2 */
		if(1){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		/* pad with no-ops up to the length computed above */
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	/* the checksum covers the pseudo header, which starts TCP4_IPLEN bytes in */
	hnputs(h->tcpcksum, ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE));

	return data;
}

/*
 *  Parse the TCP header of the IPv6 packet *bpp into *tcph,
 *  including the mss and window scale options when present.
 *  *bpp is updated because pullupblock() may return a different
 *  block than it was given.
 *
 *  Returns the length of the TCP header in bytes (options
 *  included), or -1 with *bpp set to nil when the packet is too
 *  short or its header length field is too small.
 */
static int
ntohtcp6(Tcp *tcph, Block **bpp)
{
	Tcp6hdr *h;
	uchar *optr;
	ushort hdrlen;
	ushort optlen;
	int n;

	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
	if(*bpp == nil)
		return -1;

	h = (Tcp6hdr *)((*bpp)->rp);
	tcph->source = nhgets(h->tcpsport);
	tcph->dest = nhgets(h->tcpdport);
	tcph->seq = nhgetl(h->tcpseq);
	tcph->ack = nhgetl(h->tcpack);
	/* top 4 bits hold the header length in 32 bit words; convert to bytes */
	hdrlen = (h->tcpflag[0]>>2) & ~3;
	if(hdrlen < TCP6_HDRSIZE) {
		freeblist(*bpp);
		*bpp = nil;
		return -1;
	}

	tcph->flags = h->tcpflag[1];
	tcph->wnd = nhgets(h->tcpwin);
	tcph->mss = 0;
	tcph->ws = 0;
	tcph->update = 0;
	tcph->len = nhgets(h->ploadlen) - hdrlen;

	/* make the options contiguous as well */
	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
	if(*bpp == nil)
		return -1;

	/*
	 *  pullupblock() may have moved the data into a new block,
	 *  so the header pointer has to be derived again before
	 *  the options are read through it.
	 */
	h = (Tcp6hdr *)((*bpp)->rp);

	optr = h->tcpopt;
	n = hdrlen - TCP6_HDRSIZE;
	while(n > 0 && *optr != EOLOPT) {
		if(*optr == NOOPOPT) {
			n--;
			optr++;
			continue;
		}
		/* the length byte must lie inside the option area */
		if(n < 2)
			break;
		optlen = optr[1];
		if(optlen < 2 || optlen > n)
			break;
		switch(*optr) {
		case MSSOPT:
			if(optlen == MSS_LENGTH)
				tcph->mss = nhgets(optr+2);
			break;
		case WSOPT:
			if(optlen == WS_LENGTH)
				tcph->ws = optr[2];
			break;
		}
		n -= optlen;
		optr += optlen;
	}
	return hdrlen;
}

/*
 *  Parse the TCP header of the IPv4 packet *bpp into *tcph,
 *  including the mss and window scale options when present.
 *  *bpp is updated because pullupblock() may return a different
 *  block than it was given.
 *
 *  Returns the length of the TCP header in bytes (options
 *  included), or -1 with *bpp set to nil when the packet is too
 *  short or its header length field is too small.
 */
static int
ntohtcp4(Tcp *tcph, Block **bpp)
{
	Tcp4hdr *h;
	uchar *optr;
	ushort hdrlen;
	ushort optlen;
	int n;

	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
	if(*bpp == nil)
		return -1;

	h = (Tcp4hdr *)((*bpp)->rp);
	tcph->source = nhgets(h->tcpsport);
	tcph->dest = nhgets(h->tcpdport);
	tcph->seq = nhgetl(h->tcpseq);
	tcph->ack = nhgetl(h->tcpack);

	/* top 4 bits hold the header length in 32 bit words; convert to bytes */
	hdrlen = (h->tcpflag[0]>>2) & ~3;
	if(hdrlen < TCP4_HDRSIZE) {
		freeblist(*bpp);
		*bpp = nil;
		return -1;
	}

	tcph->flags = h->tcpflag[1];
	tcph->wnd = nhgets(h->tcpwin);
	tcph->mss = 0;
	tcph->ws = 0;
	tcph->update = 0;
	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);

	/* make the options contiguous as well */
	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
	if(*bpp == nil)
		return -1;

	/*
	 *  pullupblock() may have moved the data into a new block,
	 *  so the header pointer has to be derived again before
	 *  the options are read through it.
	 */
	h = (Tcp4hdr *)((*bpp)->rp);

	optr = h->tcpopt;
	n = hdrlen - TCP4_HDRSIZE;
	while(n > 0 && *optr != EOLOPT) {
		if(*optr == NOOPOPT) {
			n--;
			optr++;
			continue;
		}
		/* the length byte must lie inside the option area */
		if(n < 2)
			break;
		optlen = optr[1];
		if(optlen < 2 || optlen > n)
			break;
		switch(*optr) {
		case MSSOPT:
			if(optlen == MSS_LENGTH)
				tcph->mss = nhgets(optr+2);
			break;
		case WSOPT:
			if(optlen == WS_LENGTH)
				tcph->ws = optr[2];
			break;
		}
		n -= optlen;
		optr += optlen;
	}
	return hdrlen;
}

/*
 *  Prepare an outgoing call: choose a random initial sequence
 *  number, start every send pointer there, account for the SYN
 *  and pick the mss and window scale we would like to use.
 */
static void
tcpsndsyn(Conv *s)
{
	Tcpctl *tcb;

	tcb = (Tcpctl*)s->ptcl;

	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
	tcb->snd.nxt = tcb->snd.ptr = tcb->snd.rxt = tcb->snd.una = tcb->snd.wl2 = tcb->iss;

	/* this is a plain SYN, and it has to go out */
	tcb->flags &= ~SYNACK;
	tcb->flags |= FORCE;
	tcb->flgcnt++;

	tcb->rttime = 0;	/* set in tcpoutput() */

	/* set desired mss and scale */
	tcb->mss = tcpmtu(v6lookup(s->p->f, s->raddr, s->laddr, s), &tcb->scale, s->ipversion);
}

/*
 *  Send a reset in answer to segment seg, which arrived from
 *  source for dest and carried length bytes of data.  *seg is
 *  overwritten with the control info of the reset.
 *
 *  Returns -1 without sending when seg is itself a reset,
 *  otherwise the return value of ipoput4() or ipoput6().
 *  Panics on an unknown IP version.
 */
static int
sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, int version, char *reason, Routehint *rh)
{
	Tcppriv *tpriv = (Tcppriv*)tcp->priv;
	Block *bp;
	union {
		Tcp4hdr ph4;
		Tcp6hdr ph6;
	} u;
	uchar rflags;

	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);

	/* never answer a reset with a reset */
	if(seg->flags & RST)
		return -1;

	/* make pseudo header */
	/* addresses and ports are those of seg, swapped */
	switch(version) {
	case V4:
		memset(&u.ph4, 0, sizeof(u.ph4));
		u.ph4.vihl = IP_VER4;
		u.ph4.proto = IP_TCPPROTO;
		hnputs(u.ph4.tcplen, TCP4_HDRSIZE);
		hnputs(u.ph4.tcpsport, seg->dest);
		hnputs(u.ph4.tcpdport, seg->source);
		v6tov4(u.ph4.tcpsrc, dest);
		v6tov4(u.ph4.tcpdst, source);
		break;
	case V6:
		memset(&u.ph6, 0, sizeof(u.ph6));
		u.ph6.proto = IP_TCPPROTO;
		hnputs(u.ph6.ploadlen, TCP6_HDRSIZE);
		hnputs(u.ph6.tcpsport, seg->dest);
		hnputs(u.ph6.tcpdport, seg->source);
		ipmove(u.ph6.tcpsrc, dest);
		ipmove(u.ph6.tcpdst, source);
		break;
	default:
		panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if(seg->flags & ACK) {
		/* use the sequence number the other end expects next */
		seg->seq = seg->ack;
		seg->ack = 0;
	}
	else {
		/* acknowledge everything the segment occupied, SYN and FIN included */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if(seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if(seg->flags & FIN)
			seg->ack++;
	}
	seg->flags = rflags;
	seg->wnd = 0;
	seg->mss = 0;
	seg->ws = 0;
	switch(version) {
	case V4:
		bp = htontcp4(seg, nil, &u.ph4);
		return ipoput4(tcp->f, bp, nil, MAXTTL, DFLTTOS, rh);
	case V6:
		bp = htontcp6(seg, nil, &u.ph6);
		return ipoput6(tcp->f, bp, nil, MAXTTL, DFLTTOS, rh);
	}
	return -1;
}

/*
 *  send a reset to the remote side and close the conversation
 *  called with s locked
 *
 *  The reset is only sent when there is a remote address, the
 *  conversation is not Closed and it is not spliced.  The lock on
 *  s is released while the packet is handed to ipoput and taken
 *  again afterwards.  Always returns nil.
 */
static char*
tcphangup(Conv *s)
{
	Tcpctl *tcb = (Tcpctl*)s->ptcl;

	if(ipcmp(s->raddr, IPnoaddr) != 0 && tcb->state != Closed && tcb->bypass == nil) {
		Block *bp;
		Tcp seg;

		memset(&seg, 0, sizeof seg);
		seg.flags = RST | ACK;
		seg.seq = tcb->snd.ptr;
		/* acknowledge everything received so far */
		seg.ack = tcb->rcv.ackptr = tcb->rcv.nxt;
		seg.wnd = 0;
		seg.mss = 0;
		seg.ws = 0;
		switch(s->ipversion) {
		case V4:
			/* build the packet under the lock, send it without */
			bp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr);
			qunlock(s);
			ipoput4(s->p->f, bp, nil, s->ttl, s->tos, s);
			break;
		case V6:
			bp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr);
			qunlock(s);
			ipoput6(s->p->f, bp, nil, s->ttl, s->tos, s);
			break;
		default:
			panic("tcphangup: version %d", s->ipversion);
		}
		qlock(s);
	}
	localclose(s, nil);
	return nil;
}

/*
 *  (re)send a SYN ACK
 *
 *  for the call in limbo lp.  Counts the transmission in
 *  lp->rexmits and records its time in lp->lastsend.
 *
 *  Returns -1 when there is no route to the caller, otherwise
 *  the return value of ipoput4() or ipoput6().
 *  Panics on an unknown IP version.
 */
static int
sndsynack(Proto *tcp, Limbo *lp)
{
	union {
		Tcp4hdr ph4;
		Tcp6hdr ph6;
	} u;
	Tcp seg;
	Block *bp;
	Routehint rh;
	Route *rt;

	rh.r = nil;
	rh.a = nil;
	if((rt = v6lookup(tcp->f, lp->raddr, lp->laddr, &rh)) == nil)
		return -1;

	/* make pseudo header */
	switch(lp->version) {
	case V4:
		memset(&u.ph4, 0, sizeof(u.ph4));
		u.ph4.vihl = IP_VER4;
		u.ph4.proto = IP_TCPPROTO;
		hnputs(u.ph4.tcplen, TCP4_HDRSIZE);
		hnputs(u.ph4.tcpsport, lp->lport);
		hnputs(u.ph4.tcpdport, lp->rport);
		v6tov4(u.ph4.tcpsrc, lp->laddr);
		v6tov4(u.ph4.tcpdst, lp->raddr);
		break;
	case V6:
		memset(&u.ph6, 0, sizeof(u.ph6));
		u.ph6.proto = IP_TCPPROTO;
		hnputs(u.ph6.ploadlen, TCP6_HDRSIZE);
		hnputs(u.ph6.tcpsport, lp->lport);
		hnputs(u.ph6.tcpdport, lp->rport);
		ipmove(u.ph6.tcpsrc, lp->laddr);
		ipmove(u.ph6.tcpdst, lp->raddr);
		break;
	default:
		panic("sndsynack: version %d", lp->version);
	}

	memset(&seg, 0, sizeof seg);
	seg.seq = lp->iss;
	/* the caller's SYN took one sequence number */
	seg.ack = lp->irs+1;
	seg.flags = SYN|ACK;
	/* tcpmtu() also sets seg.ws to the scale we would like */
	seg.mss = tcpmtu(rt, &seg.ws, lp->version);
	seg.wnd = QMAX;

	/* if the other did not set window scale, both should be zero */
	if(lp->ws == 0)
		seg.ws = 0;

	lp->rexmits++;
	lp->lastsend = NOW;

	switch(lp->version) {
	case V4:
		bp = htontcp4(&seg, nil, &u.ph4);
		return ipoput4(tcp->f, bp, nil, MAXTTL, DFLTTOS, &rh);
	case V6:
		bp = htontcp6(&seg, nil, &u.ph6);
		return ipoput6(tcp->f, bp, nil, MAXTTL, DFLTTOS, &rh);
	}
	return -1;
}

/*
 *  Hash bucket index for a call in limbo: the sum of the last two
 *  bytes of address a and port p, reduced to the table size.
 *  Both arguments are parenthesized so that any expression can be
 *  passed for them.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )

/*
 *  Returns the address of the head pointer of the limbo hash
 *  bucket for remote address raddr and remote port rport.
 */
static Limbo**
limbohash(Tcppriv *tpriv, uchar *raddr, ushort rport)
{
	return tpriv->lht + hashipa(raddr, rport);
}

/*
 *  Look up the call in limbo that matches both addresses, both
 *  ports and the IP version.  Returns the address of the pointer
 *  that refers to the entry, or, when there is none, the address
 *  of the nil pointer that ends the bucket's list (where a new
 *  entry can be linked in).
 */
static Limbo**
limboent(Tcppriv *tpriv, uchar *raddr, ushort rport, uchar *laddr, ushort lport, int version)
{
	Limbo *lp, **l;

	l = limbohash(tpriv, raddr, rport);
	while((lp = *l) != nil){
		if(lp->lport == lport && lp->rport == rport && lp->version == version
		&& ipcmp(lp->raddr, raddr) == 0
		&& ipcmp(lp->laddr, laddr) == 0)
			break;
		l = &lp->next;
	}
	return l;
}

/*
 *  put a call into limbo and respond with a SYN ACK
 *
 *  called with proto locked
 *
 *  seg is the SYN that arrived from source for dest.  An existing
 *  entry for the same call is reused.  Otherwise a new one is
 *  allocated, or, once Maxlimbo entries exist, the head of the
 *  hash bucket is recycled.  The entry is dropped again when the
 *  SYN ACK cannot be sent.
 */
static void
limbo(Conv *s, Tcp *seg, uchar *source, uchar *dest, int version)
{
	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
	Limbo *lp, **l;

	/* l is where the entry is linked, or where a new one goes */
	l = limboent(tpriv, source, seg->source, dest, seg->dest, version);
	if((lp = *l) != nil){
		/* each new SYN restarts the retransmits */
		/* NOTE(review): only irs is refreshed here; rexmits is not reset */
		lp->irs = seg->seq;
	} else {
		Limbo **h;

		if(tpriv->nlimbo >= Maxlimbo
		/* reuse the oldest entry (head) of this hash bucket */
		&& (lp = *(h = limbohash(tpriv, source, seg->source))) != nil){
			/*
			 *  unlink the head.  If it was the only entry, l
			 *  pointed at its next field and has to be moved
			 *  to the bucket head.
			 */
			if((*h = lp->next) == nil)
				l = h;
		} else {
			/* out of memory: the SYN is ignored */
			if((lp = malloc(sizeof(*lp))) == nil)
				return;
			tpriv->nlimbo++;
		}
		lp->next = nil;
		ipmove(lp->laddr, dest);
		ipmove(lp->raddr, source);
		lp->lport = seg->dest;
		lp->rport = seg->source;
		lp->mss = seg->mss;
		lp->ws = seg->ws;
		lp->irs = seg->seq;
		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
		lp->rexmits = 0;
		lp->version = version;
		/* append to the end of the bucket's list */
		*l = lp;
	}
	if(sndsynack(s->p, lp) < 0){
		/* could not send: forget the call */
		tpriv->nlimbo--;
		*l = lp->next;
		free(lp);
	}
}

/*
 *  resend SYN ACK's.
 *
 *  Called once per tick from tcpackproc().  Walks all calls in
 *  limbo; an entry whose wait of rexmits*250 ms is over either
 *  gets its SYN ACK sent again or, after more than 4
 *  transmissions or a failed send, is dropped.  Does nothing
 *  during a tick in which the proto lock cannot be had at once.
 */
static void
limborexmit(Proto *tcp)
{
	Tcppriv *tpriv;
	Limbo **l, *lp;
	ulong now;
	int h;

	if(!canqlock(tcp))
		return;

	now = NOW;
	tpriv = tcp->priv;
	for(h = 0; h < NLHT; h++){
		for(l = &tpriv->lht[h]; (lp = *l) != nil; ){
			if(now - lp->lastsend >= lp->rexmits*250){
				if(lp->rexmits > 4 || sndsynack(tcp, lp) < 0){
					/* unlink and free; l already refers to the next entry */
					tpriv->nlimbo--;
					*l = lp->next;
					free(lp);
					continue;
				}
			}
			l = &lp->next;
		}
	}
	qunlock(tcp);
}

/*
 *  A reset arrived for a call that may be in limbo.  If the call
 *  is found and the reset carries the sequence number right
 *  after the SYN, drop the call.
 *
 *  called with proto locked
 */
static void
limborst(Conv *s, Tcp *seg, uchar *source, uchar *dest, int version)
{
	Tcppriv *tpriv;
	Limbo *lp, **l;

	tpriv = (Tcppriv*)s->p->priv;
	l = limboent(tpriv, source, seg->source, dest, seg->dest, version);
	lp = *l;

	/* RST can only follow the SYN */
	if(lp == nil || seg->seq != lp->irs+1)
		return;

	*l = lp->next;
	tpriv->nlimbo--;
	free(lp);
}
/*
 *  seed the round trip estimator: the time elapsed since
 *  tcb->rttime (set when the SYN went out) becomes the first
 *  sample.  srtt and mdev are kept scaled by LOGAGAIN and
 *  LOGDGAIN; the measurement in progress is cleared.
 */
static void
tcpsynackrtt(Tcpctl *tcb)
{
	int elapsed;

	elapsed = (int)(NOW - tcb->rttime);
	tcb->rttime = 0;	/* no measurement outstanding */
	tcb->srtt = elapsed<<LOGAGAIN;
	tcb->mdev = elapsed<<(LOGDGAIN-1);
}

/*
 *  queue bypass routine installed on the write queue of a spliced
 *  conversation (arg): hand the block straight to the read queue
 *  of the peer conversation.  The block is discarded when the two
 *  conversations no longer point at each other.
 */
static void
tcpbypass(void *arg, Block *b)
{
	Conv *c, *peer;

	c = (Conv*)arg;
	peer = ((Tcpctl*)c->ptcl)->bypass;
	if(peer != nil && ((Tcpctl*)peer->ptcl)->bypass == c){
		qbwrite(peer->rq, b);
		return;
	}
	freeblist(b);
}

/*
 *  splice two local tcp conversations together,
 *  having the wq directly bypass into the other
 *  ends rq.  nothing happens unless old is Established.
 *
 *  both new, old and proto are locked.
 */
static void
tcpsplice(Conv *new, Conv *old)
{
	Tcpctl *ntcb = (Tcpctl*)new->ptcl;
	Tcpctl *otcb = (Tcpctl*)old->ptcl;
	Tcppriv *tpriv;

	if(otcb->state == Established){
		/* cross-link the two ends */
		ntcb->bypass = old;
		otcb->bypass = new;

		qsetlimit(new->rq, conf.pipeqsize);
		qsetlimit(old->rq, conf.pipeqsize);
		qsetbypass(new->wq, tcpbypass);
		qsetbypass(old->wq, tcpbypass);

		/* stop timers and dump resequencing queue */
		tpriv = (Tcppriv*)new->p->priv;
		tcphalt(tpriv, &ntcb->timer);
		tcphalt(tpriv, &otcb->timer);
		tcphalt(tpriv, &ntcb->acktimer);
		tcphalt(tpriv, &otcb->acktimer);
		tcphalt(tpriv, &ntcb->katimer);
		tcphalt(tpriv, &otcb->katimer);
		dumpreseq(ntcb);
		dumpreseq(otcb);
	}
}

/*
 *  come here when we finally get an ACK to our SYN-ACK.
 *  lookup call in limbo.  if found, create a new conversation
 *
 *  s is the listening conversation; seg, src, dst and version
 *  describe the incoming segment.  Returns the new conversation,
 *  already in state Established and entered in the hash table,
 *  or nil when the segment is not a plain ACK, no limbo entry
 *  matches, the sequence/ack numbers disagree with the entry,
 *  or Fsnewcall() fails.
 *
 *  called with proto locked
 */
static Conv*
tcpincoming(Conv *s, Tcp *seg, uchar *src, uchar *dst, int version)
{
	Tcppriv *tpriv;
	Route *rp;
	Limbo *lp, **l;
	Conv *new;
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if((seg->flags & SYN) || (seg->flags & ACK) == 0)
		return nil;

	tpriv = (Tcppriv*)s->p->priv;
	l = limboent(tpriv, src, seg->source, dst, seg->dest, version);
	if((lp = *l) == nil)
		return nil;
	/* the ACK must acknowledge exactly our SYN ACK and follow the peer's SYN */
	if(seg->seq != lp->irs+1 || seg->ack != lp->iss+1){
		netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
			seg->seq, lp->irs+1, seg->ack, lp->iss+1);
		return nil;
	}
	/* unlink the entry; lp stays valid until the free() below */
	tpriv->nlimbo--;
	*l = lp->next;

	new = Fsnewcall(s, src, seg->source, dst, seg->dest, version);
	if(new == nil){
		free(lp);
		return nil;
	}

	/* start from a copy of the listener's control block, then re-point the timers */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl*)new->ptcl;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;

	/* receive side starts at the sequence number of this ACK, window closed */
	tcb->rcv.nxt = tcb->rcv.ackptr = seg->seq;
	tcb->rcv.wptr = tcb->rcv.wsnt = tcb->rcv.nxt;
	tcb->rcv.wnd = 0;

	tcb->iss = lp->iss;

	/* nothing sent beyond the SYN yet: all send pointers sit at the ack */
	tcb->snd.una = seg->ack;
	tcb->snd.ptr = seg->ack;
	tcb->snd.nxt = seg->ack;
	tcb->snd.rxt = seg->ack;

	/* our SYN is already acknowledged */
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* initial rtt: time since the last SYN ACK was sent */
	tcb->rttime = lp->lastsend;
	tcpsynackrtt(tcb);

	/* the same as what we sent in SYN,ACK */
	rp = v6lookup(s->p->f, src, dst, new);
	tcb->mss = tcpmtu(rp, &tcb->scale, version);

	/* apply the peer's options remembered from its SYN */
	tcpsetmss(new, lp->mss);
	tcpsetscale(new, lp->ws);

	free(lp);

	tcb->snd.wnd = seg->wnd << tcb->snd.scale;
	tcb->snd.wl2 = seg->ack;
	tcb->snd.wl1 = seg->seq;

	/* set up proto header */
	switch(version){
	case V4:
		h4 = &tcb->protohdr.tcp4hdr;
		memset(h4, 0, sizeof(*h4));
		h4->vihl = IP_VER4;
		h4->proto = IP_TCPPROTO;
		hnputs(h4->tcpsport, new->lport);
		hnputs(h4->tcpdport, new->rport);
		/* outgoing header: our address (dst of the segment) is the source */
		v6tov4(h4->tcpsrc, dst);
		v6tov4(h4->tcpdst, src);
		break;
	case V6:
		h6 = &tcb->protohdr.tcp6hdr;
		memset(h6, 0, sizeof(*h6));
		h6->proto = IP_TCPPROTO;
		hnputs(h6->tcpsport, new->lport);
		hnputs(h6->tcpdport, new->rport);
		ipmove(h6->tcpsrc, dst);
		ipmove(h6->tcpdst, src);
		break;
	default:
		panic("tcpincoming: version %d", new->ipversion);
	}

	qlock(new);
	tcpsetstate(new, Established);
	iphtadd(&tpriv->ht, new);
	/* NOTE(review): Runi presumably marks a route to one of our own addresses - confirm */
	if(rp != nil && (rp->type & Runi) != 0){
		/* see if we can find the local conversation and splice them together */
		/* the lookup swaps local and remote to find the other end of the call */
		Iphash *iph = iphtlook(&tpriv->ht, new->laddr, new->lport, new->raddr, new->rport);
		if(iph != nil && iph->trans == 0){
			Conv *old = iphconv(iph);
			if(old != new){
				qlock(old);
				tcpsplice(new, old);
				qunlock(old);
			}
		}
	}
	qunlock(new);

	return new;
}

/*
 *  process the acknowledgement and window fields of an incoming
 *  segment for conversation s:
 *	- update the send window (snd.wnd, snd.wl1, snd.wl2)
 *	- count duplicate acks and run fast retransmit/recovery
 *	- take a round trip sample
 *	- discard acknowledged data from the write queue
 *	- adjust the congestion window
 *	- restart or stop the retransmit timer
 *
 *  seg->update makes sure a segment is processed only once, so
 *  callers may invoke update() on the same segment repeatedly.
 *  Segments without ACK, and acks beyond snd.nxt, are ignored.
 */
static void
update(Conv *s, Tcp *seg)
{
	Tcpctl *tcb;
	Tcppriv *tpriv;
	int rtt, delta, acked;

	if(seg->update)
		return;
	seg->update = 1;
	if((seg->flags & ACK) == 0)
		return;

	tcb = (Tcpctl*)s->ptcl;
	tpriv = (Tcppriv*)s->p->priv;

	/* ghost acks should be ignored */
	if(seq_gt(seg->ack, tcb->snd.nxt))
		return;

	/*
	 *  update window
	 *
	 *  the condition groups as
	 *	seq > wl1 ||
	 *	(seq == wl1 && (ack > wl2 || (ack == wl2 && wnd > snd.wnd)))
	 */
	if(seq_gt(seg->seq, tcb->snd.wl1)
	|| seg->seq == tcb->snd.wl1
		&& (seq_gt(seg->ack, tcb->snd.wl2)
		|| seg->ack == tcb->snd.wl2 && seg->wnd > tcb->snd.wnd)){
		/* clear dupack if we advance wl2 */
		if(tcb->snd.wl2 != seg->ack)
			tcb->snd.dupacks = 0;
		tcb->snd.wl2 = seg->ack;
		tcb->snd.wl1 = seg->seq;
		/* a closed window just opened: jump into the recovery code below */
		if(tcb->snd.wnd == 0 && seg->wnd > 0){
			tcb->snd.wnd = seg->wnd;
			goto recovery;
		}
		tcb->snd.wnd = seg->wnd;
	}

	/*
	 *  newreno fast retransmit
	 *
	 *  a duplicate ack: data is outstanding, the ack does not advance
	 *  snd.una and the segment carries neither data nor SYN/FIN.
	 *  the body runs on the third one (or via the goto above).
	 */
	if(tcb->snd.una != tcb->snd.nxt)
	if(seg->ack == tcb->snd.una)
	if(seg->len == 0 && (seg->flags & (SYN|FIN)) == 0)
	if(++tcb->snd.dupacks == 3){
recovery:
		if(tcb->snd.recovery){
			/* already recovering: inflate the window by one segment */
			tpriv->stats[RecoveryCwind]++;
			tcb->cwind += tcb->mss;
		}else if(tcb->snd.wnd == 0){
			netlog(s->p->f, Logtcpwin, "!recov %lud %lud window shut\n",
				tcb->snd.rxt, seg->ack);
			/* force window probe */
			tcprxmit(s);
		}else if(seq_ge(seg->ack, tcb->snd.rxt)){
			/* enter recovery; snd.rxt remembers where it ends */
			tpriv->stats[Recovery]++;
			tcb->snd.recovery = 1;
			tcb->snd.partialack = 0;
			tcb->snd.rxt = tcb->snd.nxt;
			tcpcongestion(tcb);
			tcb->abcbytes = 0;
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
			/* initial fast-retransmit, preserve send pointer */
			tcb->snd.ptr = tcprxmit(s);
		}else{
			tpriv->stats[RecoveryNoSeq]++;
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
			/* do not enter fast retransmit */
			/* do not change ssthresh */
		}
	}else if(tcb->snd.recovery){
		/* binds to the innermost if: further dupacks while recovering */
		tpriv->stats[RecoveryCwind]++;
		tcb->cwind += tcb->mss;
	}

	/* Compute the new send window size */
	acked = (int)(seg->ack - tcb->snd.una);
	if(acked <= 0){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if(tcb->snd.wnd == 0)
			tcb->backedoff = MAXBACKMS/4;
		return;
	}

	/* RTT measurement */
	if(tcb->rttime && seq_ge(seg->ack, tcb->rttseq)) {
		rtt = (int)(NOW - tcb->rttime);
		tcb->rttime = 0;

		/* srtt and mdev are kept scaled by LOGAGAIN and LOGDGAIN bits */
		delta = rtt - (tcb->srtt>>LOGAGAIN);
		tcb->srtt += delta;
		if(delta < 0) delta = -delta;
		tcb->mdev += delta - (tcb->mdev>>LOGDGAIN);
	}

	/*
	 *  update queue
	 */
	if((tcb->flags & SYNACK) == 0) {
		/* first ack of our SYN: it used one sequence number, not queue data */
		tcb->flags |= SYNACK;
		tcb->flgcnt--;
		acked--;
	}
	/* more acked than was queued: the remainder acknowledges a flag */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;
	tcb->snd.una = seg->ack;
	if(seq_gt(seg->ack, tcb->snd.ptr))
		tcb->snd.ptr = seg->ack;

	/*
	 *  congestion control
	 */
	if(tcb->snd.recovery){
		if(seq_ge(seg->ack, tcb->snd.rxt) || tcb->snd.wnd == 0){
			/* recovery finished; deflate window */
			tpriv->stats[RecoveryDone]++;
			tcb->snd.dupacks = 0;
			tcb->snd.recovery = 0;

			/* RFC 6582 2.3 (3) min(ssthresh, max(FlightSize, SMSS) + SMSS) */
			tcb->cwind = tcb->snd.nxt - tcb->snd.una;
			if(tcb->cwind < tcb->mss)
				tcb->cwind = tcb->mss;
			tcb->cwind += tcb->mss;
			if(tcb->cwind > tcb->ssthresh)
				tcb->cwind = tcb->ssthresh;
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
				tcb->cwind, tcb->ssthresh);
		} else {
			/* partial ack; we lost more than one segment */
			tpriv->stats[RecoveryPA]++;
			if(tcb->cwind > acked)
				tcb->cwind -= acked;
			else{
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
				tcb->cwind = tcb->mss;
			}
			if(acked >= tcb->mss)
				tcb->cwind += tcb->mss;
			tcb->snd.partialack++;
			netlog(s->p->f, Logtcpwin, "partial ack %d left %ld cwind %ld\n",
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
			/* retransmit, preserve send pointer */
			tcb->snd.ptr = tcprxmit(s);
		}
	} else {
		tcpabcincr(tcb, acked);
	}

	/* progress was made: forget accumulated backoff */
	tcb->backoff = 0;
	tcb->backedoff = 0;

	if(tcb->snd.una != tcb->snd.nxt){
		/* “impatient” variant */
		/* during recovery the timer is restarted only on the first partial ack */
		if(!tcb->snd.recovery || tcb->snd.partialack == 1)
			tcpsettimer(s);
	}
	else
		tcphalt(tpriv, &tcb->timer);	/* everything acked */
}

/*
 *  input routine: process one received tcp segment.
 *
 *  bp holds the packet starting at the ip header (v4 or v6, told
 *  apart by the version nibble); ifc is the receiving interface.
 *  The checksum is verified, the tcp header decoded into seg and
 *  the owning conversation looked up under the proto lock.
 *  Segments of translated flows are rewritten and forwarded,
 *  segments for listeners go through limbo, everything else runs
 *  through the state machine with the conversation locked.
 *
 *  Exit paths:
 *	reset	no usable conversation; proto locked on entry to the label
 *	reset2	conversation locked; answer with RST using reason
 *	output	conversation locked; run tcpoutput() and arm the ack timer
 *	raise	conversation locked; just drop what is left of bp
 */
static void
tcpiput(Proto *tcp, Ipifc *ifc, Block *bp)
{
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Tcpctl *tcb;
	int hdrlen;
	ushort length;
	uchar source[IPaddrlen], dest[IPaddrlen];
	Iphash *iph;
	Conv *s;
	Fs *f;
	Tcppriv *tpriv;
	char *reason;
	int version;

	f = tcp->f;
	tpriv = (Tcppriv*)tcp->priv;
	tpriv->stats[InSegs]++;

	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0)==IP_VER4) {
		int ttl = h4->ttl;

		version = V4;
		length = nhgets(h4->length);
		if(length < TCP4_PKT){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp len\n");
			freeblist(bp);
			return;
		}
		length -= TCP4_PKT;
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* ttl is cleared and tcplen filled in while checksumming; ttl is restored below */
		h4->ttl = 0;
		hnputs(h4->tcplen, length);
		/* skipped when the block is flagged Btcpck or the checksum field is zero */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1])
		&& ptclcsum(bp, TCP4_IPLEN, length + TCP4_PKT - TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h4->ttl = ttl;

		hdrlen = ntohtcp4(&seg, &bp);
		if(hdrlen < 0){
			/* NOTE(review): bp is not freed here; presumably ntohtcp4 disposes of it on error - confirm */
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}
		/* length becomes the payload size, hdrlen the total to strip */
		length -= hdrlen;
		hdrlen += TCP4_PKT;
	}
	else {
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/* rearrange header fields for checksumming; originals are restored below */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if((h6->tcpcksum[0] || h6->tcpcksum[1])
		&& ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE) != 0) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp,
			    "bad tcpv6 proto cksum: got %#ux\n",
				h6->tcpcksum[0]<<8 | h6->tcpcksum[1]);
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if(hdrlen < 0){
			/* NOTE(review): bp is not freed here; presumably ntohtcp6 disposes of it on error - confirm */
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
			return;
		}
		length -= hdrlen;
		hdrlen += TCP6_PKT;
	}

	/* lock protocol while searching for a conversation */
	qlock(tcp);

	/* Look for a matching conversation */
	iph = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
	if(iph == nil){
		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
			source, seg.source, dest, seg.dest);
reset:
		/* entered with the proto lock held and no conversation locked */
		qunlock(tcp);
		freeblist(bp);
		sndrst(tcp, source, dest, length, &seg, version, "no conversation", nil);
		return;
	}
	if(iph->trans){
		/*
		 *  the hash entry is an address translation, not a conversation:
		 *  rewrite destination address and port and send the packet on
		 *  with the hop count decremented.
		 */
		Translation *q;
		int hop = h4->ttl;

		if(hop <= 1 || (q = transbackward(tcp, iph)) == nil)
			goto reset;
		hnputs_csum(h4->tcpdst+0, nhgets(q->forward.raddr+IPv4off+0), h4->tcpcksum);
		hnputs_csum(h4->tcpdst+2, nhgets(q->forward.raddr+IPv4off+2), h4->tcpcksum);
		hnputs_csum(h4->tcpdport, q->forward.rport, h4->tcpcksum);
		qunlock(tcp);
		ipoput4(f, bp, ifc, hop - 1, h4->tos, q);
		return;
	}
	s = iphconv(iph);

	/* trim off ip and tcp headers */
	bp = trimblock(bp, hdrlen, length);
	if(bp == nil){
		tpriv->stats[LenErrs]++;
		tpriv->stats[InErrs]++;
		netlog(f, Logtcp, "tcp bad length after header trim off\n");
		qunlock(tcp);
		return;
	}

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state == Listen){
		if(seg.flags & RST){
			limborst(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
			limbo(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/*
		 *  if there's a matching call in limbo, tcpincoming will
		 *  return it in state Established
		 */
		s = tcpincoming(s, &seg, source, dest, version);
		if(s == nil)
			goto reset;
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl*)s->ptcl;
	/* take the conversation lock before letting go of the proto lock */
	qlock(s);
	qunlock(tcp);

	/* fix up window */
	seg.wnd <<= tcb->snd.scale;

	/* every input packet in puts off the keep alive time out */
	tcb->kato = 0;

	switch(tcb->state) {
	case Closed:
	closed:
		reason = "sending to Closed";
	reset2:
		/* entered with the conversation locked and reason set */
		qunlock(s);
		freeblist(bp);
		sndrst(tcp, source, dest, length, &seg, version, reason, s);
		return;
	case Syn_sent:
		if(seg.flags & ACK) {
			/* the ack must cover our SYN */
			if(!seq_in(seg.ack, tcb->snd.una, tcb->snd.nxt+1)) {
				if(seg.flags & RST)
					goto raise;
				reason = "bad seq in Syn_sent";
				goto reset2;
			}
		}
		if(seg.flags & RST) {
			/* only a RST that also acks our SYN refuses the connection */
			if(seg.flags & ACK)
				localclose(s, Econrefused);
			goto raise;
		}
		if(seg.flags & SYN) {
			procsyn(s, &seg);
			if(seg.flags & ACK){
				tcb->snd.wnd = seg.wnd;	/* window in SYN,ACK must not be scaled */
				tcb->snd.wl2 = seg.ack;
				tcb->snd.wl1 = seg.seq;
				tcpsynackrtt(tcb);
				update(s, &seg);
				tcpsetstate(s, Established);
			}
			else {
				/* SYN without ACK: simultaneous open */
				tcb->time = NOW;
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
			}
			/* data or FIN came along: keep processing below */
			if(length != 0 || (seg.flags & FIN))
				break;
			freeblist(bp);
			goto output;
		}
		goto raise;
	case Syn_received:
		if(seg.flags & ACK)
			tcpsynackrtt(tcb);
		break;
	}

	/* Cut the data to fit the receive window */
	tcprcvwin(s);
	if(tcptrim(tcb, &seg, &bp, &length) < 0) {
		/* nothing of the segment is acceptable: still process the ack and answer */
		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud (%d) win %lud-%lud (%lud) from %I!%d -> %I!%d\n", 
			seg.seq, seg.seq+length, length,
			tcb->rcv.nxt, tcb->rcv.wptr, tcb->rcv.wnd,
			s->raddr, s->rport, s->laddr, s->lport);
		tcb->flags |= FORCE;
		update(s, &seg);
		if(tcb->state == Closing)
		if(qlen(s->wq)+tcb->flgcnt == 0) {
			/* nothing left unacknowledged: our FIN was acked */
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2*(1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		if(seg.flags & RST)
			goto raise;
		goto output;
	}

	/* Cannot accept so answer with a rst */
	if(length && tcb->state == Closed)
		goto closed;

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if(seg.seq != tcb->rcv.nxt)
	if(length != 0 || (seg.flags & (SYN|FIN))) {
		/*
		 *  force duplicate ack; RFC 5681 §3.2
		 *
		 *  FORCE must be set before update()
		 *  which can send fast-retransmits
		 *  (clearing FORCE flag) as we must
		 *  only produce one output segment
		 *  per input segment.
		 */
		tcb->flags |= FORCE;
		update(s, &seg);
		/* addreseq takes over bp; a negative return means the queue was dumped */
		if(addreseq(f, tcb, tpriv, &seg, &bp, length) < 0)
			print("tcp: reseq: %I!%d -> %I!%d\n", s->raddr, s->rport, s->laddr, s->lport);
		goto output;
	}

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for(;;) {
		if(seg.flags & RST) {
			if(tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if(tcb->rcv.nxt != seg.seq)
					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n",
						s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
			}
			localclose(s, Econrefused);
			goto raise;
		}

		/* segments without ACK carry nothing usable from here on */
		if((seg.flags&ACK) == 0)
			goto raise;

		switch(tcb->state) {
		case Syn_received:
			if(!seq_in(seg.ack, tcb->snd.una, tcb->snd.nxt+1)){
				reason = "bad seq in Syn_received";
				goto reset2;
			}
			tcb->snd.wnd = seg.wnd;	/* already scaled */
			tcb->snd.wl2 = seg.ack;
			tcb->snd.wl1 = seg.seq;
			update(s, &seg);
			tcpsetstate(s, Established);
			break;
		case Established:
			/* a spliced conversation takes no input from the network */
			if(tcb->bypass != nil)
				goto raise;
			/* wet floor */
		case Close_wait:
		case Finwait2:
			update(s, &seg);
			break;
		case Finwait1:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0){
				/* our FIN is acked; katimer is reused with the MSL2 interval */
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcb->time = NOW;
				tcpsetstate(s, Finwait2);
				tcb->katimer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->katimer);
			}
			break;
		case Closing:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->timer);
			}
			break;
		case Last_ack:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				localclose(s, nil);
				goto raise;
			}
			/* wet floor */
		case Time_wait:
			/* answer a retransmitted FIN and keep the timer running */
			if(seg.flags & FIN)
				tcb->flags |= FORCE;
			if(tcb->timer.state != TcptimerON)
				tcpsettimer(s);
		}

		if(length == 0) {
			freeblist(bp);
			bp = nil;
		}
		else {
			switch(tcb->state){
			default:
				/* Ignore segment text */
				freeblist(bp);
				bp = nil;
				break;

			case Syn_received:
			case Established:
			case Finwait1:
				/* If we still have some data place on
				 * receive queue
				 */
				if(bp != nil) {
					qpassnolim(s->rq, packblock(bp));
					bp = nil;
				}
				tcb->rcv.nxt += length;
				break;
			case Finwait2:
				reason = "send to Finwait2";
				goto reset2;
			}
		}

		if(seg.flags & FIN) {
			tcb->flags |= FORCE;

			/* the FIN takes one sequence number in the states that accept it */
			switch(tcb->state) {
			case Syn_received:
			case Established:
				tcb->rcv.nxt++;
				tcpsetstate(s, Close_wait);
				break;
			case Finwait1:
				tcb->rcv.nxt++;
				if(qlen(s->wq)+tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2*(1000/MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				else
					tcpsetstate(s, Closing);
				break;
			case Finwait2:
				tcb->rcv.nxt++;
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000/MSPTICK);
				tcpgo(tpriv, &tcb->timer);
				break;
			case Close_wait:
			case Closing:
			case Last_ack:
				break;
			case Time_wait:
				tcpsettimer(s);
				break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for(;;) {
			if(getreseq(tcb, &seg, &bp, &length) < 0)
				goto output;
			tcprcvwin(s);
			if(tcptrim(tcb, &seg, &bp, &length) == 0){
				/* produce an ack when hole filled */
				tcb->flags |= FORCE;
				break;
			}
		}
	}
output:
	tcpoutput(s);

	/*
	 *  turn on the acktimer if there's something
	 *  to ack and we delayed ack in tcpoutput().
	 */
	if(tcb->rcv.ackptr != tcb->rcv.nxt)
	if(tcb->acktimer.state != TcptimerON)
		tcpgo(tpriv, &tcb->acktimer);

	qunlock(s);
	return;
raise:
	qunlock(s);
	freeblist(bp);
}

/*
 *  send as many segments as the send window, the congestion
 *  window and the queued data allow, or a single bare ack when
 *  FORCE is set.  Each pass of the loop builds and sends one
 *  segment; the loop ends when there is nothing (more) to send.
 *
 *  always enters and exits with the s locked.  We drop
 *  the lock to ipoput the packet so some care has to be
 *  taken by callers.
 *
 *  if the ip output routine reports failure the conversation
 *  is closed with Enoroute.
 */
static void
tcpoutput(Conv *s)
{
	Fs *f = s->p->f;
	Tcpctl *tcb = (Tcpctl*)s->ptcl;
	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
	int ssize, sndcnt, sent;
	Block *bp;
	Tcp seg;

	/* force an ack once more than one mss is unacknowledged */
	if((tcb->flags & FORCE) == 0)
	if(tcb->rcv.nxt - tcb->rcv.ackptr > tcb->mss){
		tpriv->stats[Delayack]++;
		tcb->flags |= FORCE;
	}

	for(;;) {
		/* states in which nothing is ever sent from here */
		switch(tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* Don't send anything else until our SYN has been acked */
		if((tcb->flags & SYNACK) == 0 && tcb->snd.ptr != tcb->iss)
			return;

		tcprcvwin(s);

		/* force an ack when a window has opened up */
		if((tcb->flags & FORCE) == 0)
		if(seq_in(tcb->rcv.nxt, tcb->rcv.wsnt, tcb->rcv.wptr)) {
			tpriv->stats[Wopenack]++;
			tcb->flags |= FORCE;
		}

		/* figure out how much to send */
		/* sndcnt: queued data plus pending flags (SYN/FIN sequence space) */
		ssize = sndcnt = qlen(s->wq)+tcb->flgcnt;
		if(tcb->snd.wnd == 0){
			/* zero window probe */
			/* at most one byte beyond snd.una, and only if none is outstanding */
			seg.seq = tcb->snd.una;
			sent = tcb->snd.ptr - seg.seq;
			if(sent > 0)
				ssize = 0;
			else if(ssize > 0)
				ssize = 1;
		} else {
			/* calculate usable segment size */
			if(ssize > tcb->snd.wnd)
				ssize = tcb->snd.wnd;
			if(ssize > tcb->cwind)
				ssize = tcb->cwind;
			seg.seq = tcb->snd.ptr;
			/* sent: offset of the send pointer into the unacknowledged data */
			sent = seg.seq - tcb->snd.una;
			ssize -= sent;
			if(ssize < 0)
				ssize = 0;
			else if(ssize > tcb->mss)
				ssize = tcb->mss;
		}

		if((tcb->flags & FORCE) == 0){
			if(ssize == 0)
				return;
			/*
			 *  hold back a short segment of new data while more than
			 *  TCPREXMTTHRESH segments worth of data is unacknowledged
			 */
			if(ssize < tcb->mss)
			if(seg.seq == tcb->snd.nxt)
			if(sent > TCPREXMTTHRESH*tcb->mss)
				return;
		}
		tcb->flags &= ~FORCE;

		/* By default we will generate an ack */
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		seg.update = 0;

		/* everything received so far is acknowledged by this segment */
		seg.ack = tcb->rcv.ackptr = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd >> tcb->rcv.scale;
		tcb->rcv.wsnt = tcb->rcv.wptr;

		bp = nil;
		if(ssize > 0){
			switch(tcb->state){
			case Syn_sent:
				/* initial SYN carries no ACK */
				seg.flags = 0;
				/* wet floor */
			case Syn_received:
				/*
				 *  don't send any data with a SYN packet
				 *  because Linux rejects the packet in its
				 *  attempt to solve the SYN attack problem
				 */
				seg.flags |= SYN;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
				ssize = 1;
				break;
			default:
				/* Pull out data to send */
				bp = qcopy(s->wq, ssize, sent);
				/* fewer bytes than sequence space to cover: the rest is the FIN */
				if(BLEN(bp) != ssize)
					seg.flags |= FIN;
				else if(sent+ssize == sndcnt)
					seg.flags |= PSH;
				break;
			}
	
			/* Pull up the send pointer so we can accept acks
			 * for this window
			 */
			tcb->snd.ptr += ssize;
			if(seq_gt(tcb->snd.ptr, tcb->snd.nxt)){
				tcb->snd.nxt = tcb->snd.ptr;

				/* start RTT measurement */
				if(tcb->rttime == 0) {
					tcb->rttime = NOW;
					tcb->rttseq = tcb->snd.ptr;
				}
			}

			/* Start the transmission timers if there is new data and we
			 * expect acknowledges
			 */
			if(tcb->timer.state != TcptimerON)
				tcpsettimer(s);
		}

		/* this segment carries the ack, a delayed one is no longer needed */
		if(tcb->acktimer.state == TcptimerON)
			tcphalt(tpriv, &tcb->acktimer);

		tpriv->stats[OutSegs]++;
		/* send pointer still behind snd.nxt: this was a retransmission */
		if(tcb->snd.ptr != tcb->snd.nxt)
			tpriv->stats[RetransSegsSent]++;

		/* the lock is released while the packet is handed to ip */
		switch(s->ipversion){
		case V4:
			bp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr);
			qunlock(s);
			sent = ipoput4(f, bp, nil, s->ttl, s->tos, s);
			break;
		case V6:
			bp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr);
			qunlock(s);
			sent = ipoput6(f, bp, nil, s->ttl, s->tos, s);
			break;
		default:
			panic("tcpoutput: version %d", s->ipversion);
		}
		/* sent now holds the ip output result, not a byte count */
		if(sent < 0){
			qlock(s);
			localclose(s, Enoroute);
			return;
		}
		qlock(s);
	}
}

/*
 *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
 *
 *  Builds an ACK|PSH segment with sequence number snd.una-1, i.e.
 *  one the peer has already acknowledged.  In Finwait2 the
 *  segment carries FIN and no data; otherwise it carries a single
 *  payload byte.  That byte is explicitly set to zero so the
 *  probe never depends on what allocb() left in the buffer.
 *
 *  Called with s locked; the lock is dropped around the ip output
 *  call and held again on return.  Returns the result of
 *  ipoput4()/ipoput6().
 */
static int
tcpsendka(Conv *s)
{
	Tcpctl *tcb = (Tcpctl*)s->ptcl;
	Block *bp;
	Tcp seg;
	int ret;

	memset(&seg, 0, sizeof seg);
	seg.source = s->lport;
	seg.dest = s->rport;
	seg.flags = ACK|PSH;
	seg.mss = 0;
	seg.ws = 0;

	tcprcvwin(s);
	seg.seq = tcb->snd.una-1;
	/* like any outgoing segment, this acknowledges everything received */
	seg.ack = tcb->rcv.ackptr = tcb->rcv.nxt;
	seg.wnd = tcb->rcv.wnd >> tcb->rcv.scale;
	tcb->rcv.wsnt = tcb->rcv.wptr;

	if(tcb->state == Finwait2){
		seg.flags |= FIN;
		bp = nil;
	} else {
		/* one byte of payload with a defined value */
		bp = allocb(1);
		*bp->wp++ = 0;
	}
	switch(s->ipversion){
	case V4:
		bp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr);
		qunlock(s);
		ret = ipoput4(s->p->f, bp, nil, s->ttl, s->tos, s);
		break;
	case V6:
		bp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr);
		qunlock(s);
		ret = ipoput6(s->p->f, bp, nil, s->ttl, s->tos, s);
		break;
	default:
		panic("tcpsendka: version %d", s->ipversion);
	}
	qlock(s);
	return ret;
}

/*
 *  keep alive timer routine; arg is the conversation.
 *
 *  closed and spliced (bypass) conversations are left alone.
 *  otherwise the timeout counter is bumped: past MAX_KAT the
 *  connection is closed with Etimedout, else a keep alive is
 *  sent and the timer restarted (or the connection closed with
 *  Enoroute if sending failed).
 */
static void
tcpkeepalive(void *arg)
{
	Conv *s;
	Tcpctl *tcb;

	s = (Conv*)arg;
	tcb = (Tcpctl*)s->ptcl;

	qlock(s);
	if(tcb->state == Closed || tcb->bypass != nil){
		qunlock(s);
		return;
	}

	tcb->kato++;
	if(tcb->kato > MAX_KAT)
		localclose(s, Etimedout);
	else if(tcpsendka(s) < 0)
		localclose(s, Enoroute);
	else
		tcpgo(s->p->priv, &tcb->katimer);
	qunlock(s);
}

/*
 *  start keepalive timer.
 *
 *  f[1], when present (n > 1), is the interval in ms parsed with
 *  atoi(); negative values are treated as 0.  Without it DEF_KAT
 *  is used.  The interval is rounded up to whole timer ticks.
 *
 *  returns an error string unless the connection is Established,
 *  nil otherwise.  A spliced (bypass) conversation is left
 *  untouched.
 */
static char*
tcpstartka(Conv *s, char **f, int n)
{
	Tcpctl *tcb;
	int ms;

	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state != Established)
		return "connection must be in Establised state";

	if(tcb->bypass != nil)
		return nil;

	if(n > 1){
		ms = atoi(f[1]);
		if(ms < 0)
			ms = 0;
	} else
		ms = DEF_KAT;

	/* restart from scratch with the new interval */
	tcphalt(s->p->priv, &tcb->katimer);
	tcb->kato = 0;
	tcb->katimer.start = (ms + MSPTICK-1) / MSPTICK;
	tcpgo(s->p->priv, &tcb->katimer);

	return nil;
}

/*
 *  retransmit (at most) one segment at snd.una.
 *
 *  the send pointer is moved back to snd.una and the congestion
 *  window temporarily set to one mss around the call to
 *  tcpoutput(), with FORCE set.  cwind is put back afterwards.
 *
 *  returns the send pointer as it was on entry, moved up to
 *  snd.una should that have advanced past it meanwhile; the
 *  caller decides whether to store it back into snd.ptr.
 */
static ulong
tcprxmit(Conv *s)
{
	Tcpctl *tcb;
	Tcppriv *tpriv;
	ulong oldptr, oldcwind;

	tcb = (Tcpctl*)s->ptcl;
	oldptr = tcb->snd.ptr;
	oldcwind = tcb->cwind;

	tcb->flags |= FORCE;
	tcb->snd.ptr = tcb->snd.una;
	tcb->cwind = tcb->mss;
	tcpoutput(s);
	tcb->cwind = oldcwind;

	tpriv = (Tcppriv*)s->p->priv;
	tpriv->stats[RetransSegs]++;

	/* in case tcpoutput() released the lock */
	if(seq_lt(oldptr, tcb->snd.una))
		return tcb->snd.una;
	return oldptr;
}

/*
 *  retransmit timer routine; arg is the conversation.
 *
 *  Time_wait:	the wait is over, close the conversation.
 *  Closed:	nothing to do.
 *  otherwise:	account the time backed off so far and hang up with
 *		Etimedout once it reaches MAXBACKMS (half of that in
 *		Syn_sent); else collapse the congestion window to one
 *		mss, abandon any fast recovery in progress and
 *		retransmit from snd.una.
 *
 *  todo: RFC 4138 F-RTO
 */
static void
tcptimeout(void *arg)
{
	Conv *s = (Conv*)arg;
	Tcpctl *tcb = (Tcpctl*)s->ptcl;
	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
	int maxback;

	qlock(s);
	switch(tcb->state){
	default:
		if(tcb->backoff < 8)
			tcb->backoff++;
		if(tcb->state == Syn_sent)
			maxback = MAXBACKMS/2;
		else
			maxback = MAXBACKMS;
		/* timer.start is in ticks; backedoff is kept in ms */
		tcb->backedoff += tcb->timer.start * MSPTICK;
		if(tcb->backedoff >= maxback) {
			localclose(s, Etimedout);
			break;
		}
		/* tcpstates[] is indexed by the tcp state, not the Conv state */
		netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
			tcb->srtt, tcb->mdev, NOW-tcb->time,
			tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
			tcpstates[tcb->state]);
		tpriv->stats[RetransTimeouts]++;
		/* only the first timeout in a row recomputes the congestion state */
		if(tcb->snd.rto++ == 0)
			tcpcongestion(tcb);
		tcb->abcbytes = 0;
		tcb->cwind = tcb->mss;
		if(tcb->snd.recovery){
			tcb->snd.recovery = 0;
			tcb->snd.dupacks = 0;			/* reno rto */
			tcb->snd.rxt = tcb->snd.nxt;
			netlog(s->p->f, Logtcpwin,
				"rto recovery rxt @%lud\n", tcb->snd.nxt);
			tpriv->stats[RecoveryRTO]++;
		}
		/* resets the send pointer, retransmits and restarts timer */
		tcprxmit(s);
		break;
	case Time_wait:
		localclose(s, nil);
		break;
	case Closed:
		break;
	}
	qunlock(s);
}

/*
 *  set up state for a received SYN (or SYN ACK) packet:
 *  the receive pointers start right after the peer's SYN with
 *  the window closed, the peer's mss and window scale options
 *  are applied, and FORCE asks for a reply segment.
 */
static void
procsyn(Conv *s, Tcp *seg)
{
	Tcpctl *tcb;

	tcb = (Tcpctl*)s->ptcl;
	tcb->flags |= FORCE;

	/* the SYN itself takes one sequence number */
	tcb->rcv.ackptr = seg->seq + 1;
	tcb->rcv.nxt = tcb->rcv.ackptr;
	tcb->rcv.wsnt = tcb->rcv.nxt;
	tcb->rcv.wptr = tcb->rcv.wsnt;
	tcb->rcv.wnd = 0;

	tcpsetmss(s, seg->mss);
	tcpsetscale(s, seg->ws);
}

/*
 *  throw away the whole resequence queue of tcb, freeing the
 *  queued blocks, and reset its counters.  always returns -1
 *  so callers can return the result directly as an error.
 */
static int
dumpreseq(Tcpctl *tcb)
{
	Reseq *r;

	while((r = tcb->reseq) != nil){
		tcb->reseq = r->next;
		freeblist(r->bp);
		free(r);
	}
	tcb->nreseq = 0;
	tcb->reseqlen = 0;
	return -1;
}

/*
 *  log the shape of a resequence queue: r is the first queued
 *  segment, n the sequence number expected next.  An entry that
 *  does not start at the expected number is reported as a
 *  "hole", the last entry as "end" (or "hole/end").
 *
 *  the expected number advances by the queued length of each
 *  entry (r->length, as stored by addreseq() after trimming),
 *  not by seg.len, which tcptrim() leaves untouched.
 */
static void
logreseq(Fs *f, Reseq *r, ulong n)
{
	char *s;

	for(; r != nil; r = r->next){
		s = nil;
		if(r->next == nil && r->seg.seq != n)
			s = "hole/end";
		else if(r->next == nil)
			s = "end";
		else if(r->seg.seq != n)
			s = "hole";
		if(s != nil)
			netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
				n, r->seg.seq, r->seg.seq-n, r->seg.flags);
		n = r->seg.seq + r->length;
	}
}

/*
 *  queue an out of order segment on the resequence queue of tcb,
 *  keeping the queue sorted by starting sequence number.
 *
 *  *bpp is always consumed (and set to nil), even when no queue
 *  entry can be allocated, in which case the data is dropped
 *  and 0 returned.
 *
 *  if the queue afterwards holds more bytes than tcb->window or
 *  more segments than window/mss (rounded up), the whole queue
 *  is dumped and -1 returned.  returns 0 otherwise.
 */
static int
addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block **bpp, ushort length)
{
	Reseq *ent, **link;
	int qmax;

	ent = malloc(sizeof(Reseq));
	if(ent == nil){
		freeblist(*bpp);	/* *bpp always consumed by addreseq */
		*bpp = nil;
		return 0;
	}

	ent->seg = *seg;
	ent->bp = *bpp;
	ent->length = length;
	*bpp = nil;

	tcb->nreseq++;
	tcb->reseqlen += length;

	/* find the first entry starting after the new segment and insert before it */
	link = &tcb->reseq;
	while(*link != nil && !seq_lt(seg->seq, (*link)->seg.seq))
		link = &(*link)->next;
	ent->next = *link;
	*link = ent;
	tpriv->stats[Resequenced]++;
	if(ent->next != nil)
		tpriv->stats[OutOfOrder]++;

	/* byte limit */
	qmax = tcb->window;
	if(tcb->reseqlen > qmax){
		netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq);
		logreseq(f, tcb->reseq, tcb->rcv.nxt);
		tpriv->stats[ReseqBytelim]++;
		return dumpreseq(tcb);
	}

	/* segment count limit */
	qmax = (tcb->window + tcb->mss-1) / tcb->mss;
	if(tcb->nreseq > qmax){
		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen);
		logreseq(f, tcb->reseq, tcb->rcv.nxt);
		tpriv->stats[ReseqPktlim]++;
		return dumpreseq(tcb);
	}

	return 0;
}

/*
 *  take the first segment off the resequence queue if it starts
 *  at or before rcv.nxt.  the segment header, data block and
 *  length are handed back through seg, bp and length and the
 *  queue entry is freed.  returns 0 on success, -1 when the
 *  queue is empty or its first segment still lies ahead.
 */
static int
getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
	Reseq *head;

	head = tcb->reseq;
	if(head == nil)
		return -1;
	if(seq_gt(head->seg.seq, tcb->rcv.nxt))
		return -1;

	/* unlink and account */
	tcb->reseq = head->next;
	tcb->nreseq--;
	tcb->reseqlen -= head->length;

	*seg = head->seg;
	*bp = head->bp;
	*length = head->length;
	free(head);

	return 0;
}

/*
 *  cut a segment down to the part that lies inside the receive
 *  window [rcv.nxt, rcv.wptr).
 *
 *  data in front of rcv.nxt (a SYN counts as one sequence number)
 *  is pulled off the block and seg->seq advanced; data beyond
 *  rcv.wptr is trimmed off the end, which also removes FIN.
 *  *length and tcb->overlap are updated accordingly.
 *
 *  returns 0 when something acceptable remains.  returns -1 when
 *  the segment lies completely outside the window (the block is
 *  freed and *bp set to nil) or when *bp is nil.
 */
static int
tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
	int front, tail;

	/* sequence space already received at the front of the segment */
	front = (int)(tcb->rcv.nxt - seg->seq);
	if(front > 0){
		if((seg->flags & SYN) != 0){
			seg->flags &= ~SYN;
			seg->seq++;
			front--;
		}
		if(front >= *length)
			goto drop;
		tcb->overlap += front;
		pullblock(bp, front);
		seg->seq += front;
		*length -= front;
	}

	/* sequence space past the right edge of the window */
	tail = (int)(seg->seq + *length - tcb->rcv.wptr);
	if(tail > 0){
		if(tail >= *length)
			goto drop;
		seg->flags &= ~FIN;
		tcb->overlap += tail;
		*length -= tail;
		*bp = trimblock(*bp, 0, *length);
	}

	if(*bp == nil)
		return -1;
	return 0;

drop:
	/* nothing usable: count it all as overlap and discard */
	tcb->overlap += *length;
	freeblist(*bp);
	*bp = nil;
	return -1;
}

/*
 *  handle advice about a packet we sent earlier; bp holds that
 *  packet starting at the ip header, msg the error text.
 *
 *  the conversation is looked up with source and destination
 *  swapped.  for a translated flow the packet's source address
 *  and port are rewritten and the advice passed on through
 *  icmpproxyadvice().  for a local conversation still in
 *  Syn_sent the connection is closed with msg.  in every other
 *  case the block is simply freed.
 */
static void
tcpadvise(Proto *tcp, Block *bp, Ipifc *ifc, char *msg)
{
	uchar source[IPaddrlen], dest[IPaddrlen];
	ushort psource, pdest;
	Tcppriv *tpriv;
	Translation *q;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Iphash *iph;
	Tcpctl *tcb;
	Conv *s;

	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0) != IP_VER4){
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);
		psource = nhgets(h6->tcpsport);
		pdest = nhgets(h6->tcpdport);
	} else {
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);
		psource = nhgets(h4->tcpsport);
		pdest = nhgets(h4->tcpdport);
	}

	/* Look for a connection (source/dest reversed; this is the original packet we sent) */
	qlock(tcp);
	tpriv = (Tcppriv*)tcp->priv;
	iph = iphtlook(&tpriv->ht, dest, pdest, source, psource);
	if(iph == nil || iph->match != IPmatchexact)
		goto drop;

	if(iph->trans){
		q = transbackward(tcp, iph);
		if(q == nil)
			goto drop;

		/* h4->tcplen is the ip header checksum */
		hnputs_csum(h4->tcpsrc+0, nhgets(q->forward.raddr+IPv4off+0), h4->tcplen);
		hnputs_csum(h4->tcpsrc+2, nhgets(q->forward.raddr+IPv4off+2), h4->tcplen);

		/* dont bother fixing tcp checksum, packet is most likely truncated */
		hnputs(h4->tcpsport, q->forward.rport);
		qunlock(tcp);

		icmpproxyadvice(tcp->f, ifc, bp, h4->tcpsrc);
		return;
	}

	s = iphconv(iph);
	if(s->ignoreadvice || s->state == Announced || s->state == Closed)
		goto drop;

	/* hand over from the proto lock to the conversation lock */
	tcb = (Tcpctl*)s->ptcl;
	qlock(s);
	qunlock(tcp);
	if(tcb->state == Syn_sent)
		localclose(s, msg);
	qunlock(s);
	freeblist(bp);
	return;

drop:
	qunlock(tcp);
	freeblist(bp);
}

/*
 *  Apply a source translation to a forwarded IPv4 TCP packet.
 *  A route is passed to transforward() only for a plain SYN (SYN set;
 *  ACK, RST and FIN clear); every other packet gets r = nil, so no
 *  new translation is made for it.
 *  Returns bp with source address and port rewritten (TCP checksum
 *  adjusted), or frees bp and returns nil when transforward() yields
 *  no translation.
 */
static Block*
tcpforward(Proto *tcp, Block *bp, Route *r)
{
	Tcp4hdr *hdr;
	Translation *xl;
	uchar src[IPaddrlen], dst[IPaddrlen];
	ushort sport, dport;
	int flags;

	hdr = (Tcp4hdr*)(bp->rp);
	flags = hdr->tcpflag[1] & (ACK|RST|SYN|FIN);

	v4tov6(src, hdr->tcpsrc);
	sport = nhgets(hdr->tcpsport);
	v4tov6(dst, hdr->tcpdst);
	dport = nhgets(hdr->tcpdport);

	/* don't make new translation when not syn packet */
	if(flags != SYN)
		r = nil;

	qlock(tcp);
	xl = transforward(tcp, &((Tcppriv*)tcp->priv)->ht, src, sport, dst, dport, r);
	if(xl != nil){
		/* rewrite the source address in two 16-bit halves, then the source port */
		hnputs_csum(hdr->tcpsrc+0, nhgets(xl->backward.laddr+IPv4off+0), hdr->tcpcksum);
		hnputs_csum(hdr->tcpsrc+2, nhgets(xl->backward.laddr+IPv4off+2), hdr->tcpcksum);
		hnputs_csum(hdr->tcpsport, xl->backward.lport, hdr->tcpcksum);
		qunlock(tcp);
		return bp;
	}

	/* no translation: drop the packet */
	qunlock(tcp);
	freeblist(bp);
	return nil;
}

/*
 *  Clamp the MSS option of a TCP SYN so that a full segment fits in mtu.
 *
 *  p points at the start of the IP header and n is the number of valid
 *  bytes at p.  Only unfragmented-offset IPv4 packets and IPv6 packets
 *  whose protocol is TCP and that carry SYN are considered.  If an MSS
 *  option of the standard length is present and larger than
 *  mtu - (ip header + tcp header), it is overwritten and the TCP
 *  checksum is adjusted by hnputs_csum().  Anything malformed or too
 *  short is left untouched.
 */
void
tcpmssclamp(uchar *p, int n, int mtu)
{
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	uchar *pcksum;
	int hdrlen, optlen, newmss, oldmss;

	if(n < TCP4_PKT)
		return;
	h4 = (Tcp4hdr*)p;
	h6 = (Tcp6hdr*)p;
	if((h4->vihl&0xF0)==IP_VER4) {
		/*
		 * the fixed tcp header must be inside the buffer before
		 * tcpflag is examined; shorter packets have no options.
		 */
		if(n < TCP4_PKT + TCP4_HDRSIZE)
			return;
		if(h4->proto != IP_TCPPROTO)
			return;
		/* only the first fragment (offset 0) carries the tcp header */
		if((h4->frag[0] & (IP_FO>>8)) | (h4->frag[1] & IP_FO))
			return;
		if(!(h4->tcpflag[1] & SYN))
			return;
		/* data offset is the high nibble, in 32-bit words */
		hdrlen = (h4->tcpflag[0] >> 2) & ~3;
		if(hdrlen < TCP4_HDRSIZE || hdrlen > (n - TCP4_PKT))
			return;
		n = hdrlen - TCP4_HDRSIZE;
		p = h4->tcpopt;
		pcksum = h4->tcpcksum;
		newmss = mtu - (TCP4_PKT + TCP4_HDRSIZE);
	} else {
		/* same reasoning as for v4: need the whole fixed tcp header */
		if(n < TCP6_PKT + TCP6_HDRSIZE)
			return;
		if(h6->proto != IP_TCPPROTO)
			return;
		if(!(h6->tcpflag[1] & SYN))
			return;
		hdrlen = (h6->tcpflag[0] >> 2) & ~3;
		if(hdrlen < TCP6_HDRSIZE || hdrlen > (n - TCP6_PKT))
			return;
		n = hdrlen - TCP6_HDRSIZE;
		p = h6->tcpopt;
		pcksum = h6->tcpcksum;
		newmss = mtu - (TCP6_PKT + TCP6_HDRSIZE);
	}

	/* from here on, p/n describe the option bytes only */
	while(n > 0 && *p != EOLOPT) {
		if(*p == NOOPOPT) {
			n--;
			p++;
			continue;
		}
		/* a length byte must exist before it can be read */
		if(n < 2)
			break;
		optlen = p[1];
		if(optlen < 2 || optlen > n)
			break;
		if(*p == MSSOPT){
			if(optlen != MSS_LENGTH)
				break;
			oldmss = nhgets(p+2);
			/* only ever lower the advertised mss */
			if(newmss >= oldmss)
				break;
			hnputs_csum(p+2, newmss, pcksum);
			break;
		}
		n -= optlen;
		p += optlen;
	}
}

/*
 *  Protocol specific control requests; called with c qlocked.
 *	close		(no arguments)	tcpclose()
 *	hangup		(no arguments)	tcphangup()
 *	keepalive	[args...]	tcpstartka()
 *  Returns nil or an error string.
 */
static char*
tcpctl(Conv* c, char** f, int n)
{
	char *cmd;

	if(n >= 1){
		cmd = f[0];
		if(n == 1){
			if(strcmp(cmd, "close") == 0){
				tcpclose(c);
				return nil;
			}
			if(strcmp(cmd, "hangup") == 0)
				return tcphangup(c);
		}
		if(strcmp(cmd, "keepalive") == 0)
			return tcpstartka(c, f, n);
	}
	return "unknown control request";
}

/*
 *  Format every counter as a "name: value" line into buf (at most
 *  len bytes, as bounded by seprint) and return the number of bytes
 *  produced.  The InLimbo counter is refreshed from nlimbo first.
 */
static int
tcpstats(Proto *tcp, char *buf, int len)
{
	Tcppriv *priv;
	char *cur, *end;
	int k;

	priv = (Tcppriv*)tcp->priv;
	priv->stats[InLimbo] = priv->nlimbo;

	cur = buf;
	end = buf + len;
	k = 0;
	while(k < Nstats){
		cur = seprint(cur, end, "%s: %llud\n", statnames[k], priv->stats[k]);
		k++;
	}
	return cur - buf;
}

/*
 *  Reclaim conversations that look stale:
 *	- in Syn_received for more than 5 seconds (could be the SYN attack)
 *	- in Finwait2 for more than 5 minutes
 *
 *  This runs whenever we run out of channels.  Both tests are of
 *  questionable validity, so they are only applied when we're up
 *  against the wall.  Conversations whose lock cannot be taken
 *  right away are skipped.  Returns the number closed.
 */
static int
tcpgc(Proto *tcp)
{
	Conv **slot, **end, *cv;
	Tcpctl *tcb;
	ulong now;
	int nclosed, stale;

	nclosed = 0;
	now = NOW;
	end = &tcp->conv[tcp->nc];
	slot = tcp->conv;

	/* the table is walked only up to its first empty slot */
	while(slot < end && (cv = *slot++) != nil){
		if(!canqlock(cv))
			continue;
		tcb = (Tcpctl*)cv->ptcl;
		if(tcb->state == Syn_received)
			stale = now - tcb->time > 5000;
		else if(tcb->state == Finwait2)
			stale = now - tcb->time > 5*60*1000;
		else
			stale = 0;
		if(stale){
			localclose(cv, Etimedout);
			nclosed++;
		}
		qunlock(cv);
	}
	return nclosed;
}

/*
 *  (Re)arm the conversation's timer.  Records the current time and
 *  snd.una, derives a timeout in ms from mdev, srtt and the backoff
 *  count, clamps it to 0.3 .. 64 seconds and converts it to ticks of
 *  MSPTICK ms, rounding up.
 */
static void
tcpsettimer(Conv *s)
{
	Tcpctl *tcb;
	int ms;

	tcb = (Tcpctl*)s->ptcl;
	tcb->time = NOW;
	tcb->timeuna = tcb->snd.una;

	/* round trip dependency, doubled for every backoff step */
	ms = tcb->mdev;
	ms += tcb->srtt >> LOGAGAIN;
	ms <<= tcb->backoff;

	/* keep it between 0.3 and 64 seconds */
	if(ms > 64000)
		ms = 64000;
	else if(ms < 300)
		ms = 300;

	tcb->timer.start = (ms + MSPTICK-1) / MSPTICK;

	/* a timer that is not running has to be started through tcpgo */
	if(tcb->timer.state != TcptimerON){
		tcpgo((Tcppriv*)s->p->priv, &tcb->timer);
		return;
	}
	/* already running: just restart the countdown */
	tcb->timer.count = tcb->timer.start;
}

/*
 *  Allocate the tcp Proto and its private state, fill in the
 *  protocol's entry points and register it with fs via Fsproto().
 */
void
tcpinit(Fs *fs)
{
	Tcppriv *tpriv;
	Proto *tcp;

	tcp = smalloc(sizeof(Proto));
	tpriv = smalloc(sizeof(Tcppriv));
	tcp->priv = tpriv;

	/* identity and sizing */
	tcp->name = "tcp";
	tcp->ipproto = IP_TCPPROTO;
	tcp->ptclsize = sizeof(Tcpctl);
	tcp->nc = scalednconv();
	tpriv->stats[MaxConn] = tcp->nc;

	/* conversation operations */
	tcp->create = tcpcreate;
	tcp->connect = tcpconnect;
	tcp->announce = tcpannounce;
	tcp->close = tcpclose;
	tcp->ctl = tcpctl;
	tcp->state = tcpstate;
	tcp->inuse = tcpinuse;

	/* packet input, advice and forwarding */
	tcp->rcv = tcpiput;
	tcp->advise = tcpadvise;
	tcp->forward = tcpforward;

	/* housekeeping */
	tcp->stats = tcpstats;
	tcp->gc = tcpgc;

	Fsproto(fs, tcp);
}

/*
 *  Lower our sending mss to the given mss (0 means none was given,
 *  so ours is kept) and set the initial congestion window from the
 *  resulting mss.
 */
static void
tcpsetmss(Conv *s, ushort mss)
{
	Tcpctl *tcb;

	tcb = (Tcpctl*)s->ptcl;

	/* never send segments bigger than what he asked for */
	if(mss != 0){
		if(mss < tcb->mss)
			tcb->mss = mss;
	}

	/* RFC 3390 initial window */
	if(tcb->mss >= 2190)
		tcb->cwind = 2*tcb->mss;
	else if(tcb->mss >= 1095)
		tcb->cwind = 4380;
	else
		tcb->cwind = 4*tcb->mss;
}

/*
 *  Set the window scaling for both directions.  scale is the send
 *  scale; a value of 0 also zeroes our own tcb->scale (presumably
 *  because the peer did not offer window scaling -- confirm against
 *  the callers).  The receive window and the limit of the read queue
 *  follow tcb->scale; the limit of the write queue and ssthresh
 *  follow the send scale, but never exceed the receive window.
 */
static void
tcpsetscale(Conv *s, uchar scale)
{
	Tcpctl *tcb;
	ulong sndwin;

	tcb = (Tcpctl*)s->ptcl;

	if(scale > 14)
		scale = 14;	/* RFC 7323 2.3 */
	else if(scale == 0)
		tcb->scale = 0;
	tcb->snd.scale = scale;

	/* receive side */
	tcb->rcv.scale = tcb->scale;
	tcb->window = QMAX<<tcb->rcv.scale;
	qsetlimit(s->rq, tcb->window);

	/* send side, capped by the receive window */
	sndwin = QMAX<<tcb->snd.scale;
	if(sndwin > tcb->window)
		sndwin = tcb->window;
	tcb->ssthresh = sndwin;
	qsetlimit(s->wq, sndwin);
}