git: 9front

Download patch

ref: 0bfae3882383682b214ac45c41199c53b54d8c63
parent: f57d9800979d2be3edb3464f5ff5def523b14b01
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sat Oct 18 08:02:48 EDT 2025

tcp: cumulative bugfixes and improvements

SYNACK limbo:

Limbo.rexmits was not initialized on reuse. Comments
were wrong, saying we send every 250ms, but we actually
have exponential backoff here and maximum timeout is 2.5s.

Put the limbo entry lookup into its own function,
avoiding repetition. Remove wrong comment, Limbo
entries are indeed hashed.

Window management:

Fix wrong scale option negotiation (swapped snd/rcv).

Use window scale of (QMAX=64k)<<(QSCALE=7) ~= 70ms * 1000mbit.

Apply the other side's window scaling option properly,
but limit our Conv.wq to the maximum size of Conv.rq
to keep memory usage sane.

Implement silly window syndrome avoidance in tcprcvwin().

Handle shut send window properly. Must not enter recovery
when send window is shut. Instead, do zero-window probe.
Also end recovery when window is shut.

Zero-window probes must only be sent when retransmitting,
not as a forced acknowledgement.

Must not send forced ack when received segment has no data
(must set FORCE bit *before* update()).

Ignore seg.ack > snd.nxt (ghost acks).

Reno fast retransmit:

Duplicate ACKs are defined to carry no data. We must not
consider seg.len != 0 segments as dupacks for fast-retransmit
trigger.

Unexplained new-reno formula used for deflating congestion
window after fast recovery. Use documented formula from RFC.

Timers:

Improve RTT measurement. Use NOW time instead of a Tcptimer
for better accuracy.

Avoid enabling the acktimer just to disable it again in
tcpoutput(). Instead, use the rcv.ackptr to see if acktimer
should be enabled *after* tcpoutput().

Simplify keep-alive timer. Make it an up-counter tcb->kato,
counting timeouts and terminating when over MAX_KAT (3).

Cruft:

Remove "checksum" option.

Remove urgent data handling.

General:

Better /net/tcp/*/status information. Show all times
in milliseconds. Show scale factor for srtt.

Use open intervals for seq_in(). Simplify tcptrim().

Remove useless waserror(). ipoput*() does NOT error().

Always unlock Conv during ipoput*().

Get rid of redundant state variables and flags.

--- a/sys/man/3/ip
+++ b/sys/man/3/ip
@@ -789,12 +789,6 @@
 .IR N ,
 if given, is the milliseconds between keepalives
 (default 120000).
-.TP
-.BI checksum \ n
-emit TCP checksums of zero if
-.I n
-is zero; otherwise, and by default,
-TCP checksums are computed and sent normally.
 .SS UDP
 UDP connections carry unreliable and unordered datagrams.  A read from
 .B data
--- a/sys/src/9/ip/tcp.c
+++ b/sys/src/9/ip/tcp.c
@@ -9,7 +9,6 @@
 
 enum
 {
-	QMAX		= 64*1024-1,
 	IP_TCPPROTO	= 6,
 
 	TCP4_IPLEN	= 8,
@@ -27,7 +26,7 @@
 	TcptimerOFF	= 0,
 	TcptimerON	= 1,
 	TcptimerDONE	= 2,
-	MAX_TIME 	= (1<<20),	/* Forever */
+	MSPTICK		= 50,		/* Milliseconds per timer tick */
 	TCP_ACK		= 50,		/* Timed ack sequence in ms */
 	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */
 
@@ -45,25 +44,21 @@
 	WSOPT		= 3,
 	WS_LENGTH	= 3,		/* Bits to scale window size by */
 	MSL2		= 10,
-	MSPTICK		= 50,		/* Milliseconds per timer tick */
 	DEF_MSS		= 1460,		/* Default maximum segment */
 	DEF_MSS6	= 1220,		/* Default maximum segment (min) for v6 */
 	DEF_RTT		= 500,		/* Default round trip */
 	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
+	MAX_KAT		= 3,		/* Maximum number of keep-alive timeouts */
 	TCP_LISTEN	= 0,		/* Listen connection */
 	TCP_CONNECT	= 1,		/* Outgoing connection */
-	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */
 
 	TCPREXMTTHRESH	= 3,		/* dupack threshhold for rxt */
 
 	FORCE		= 1,
-	CLONE		= 2,
-	RETRAN		= 4,
-	ACTIVE		= 8,
-	SYNACK		= 16,
+	SYNACK		= 2,
 
-	LOGAGAIN	= 3,
-	LOGDGAIN	= 2,
+	LOGAGAIN	= 3,		/* alpha 1/8 */
+	LOGDGAIN	= 2,		/* beta 1/4 */
 
 	Closed		= 0,		/* Connection states */
 	Listen,
@@ -85,9 +80,10 @@
 	 * window is 64kb · 2ⁿ
 	 * these factors determine the ultimate bandwidth-delay product.
 	 * 64kb · 2⁵ = 2mb, or 2x overkill for 100mbps · 70ms.
+	 * 64kb · 2⁷ = 8mb, or around 1000mbps · 70ms.
 	 */
-	Maxqscale	= 4,		/* maximum queuing scale */
-	Defadvscale	= 4,		/* default advertisement */
+	QSCALE		= 7,
+	QMAX		= 64*1024-1,
 };
 
 /* negative return from ipoput means no route */
@@ -179,9 +175,8 @@
 	ulong	ack;
 	uchar	flags;
 	uchar	update;
-	ushort	ws;	/* window scale option */
+	uchar	ws;	/* window scale option */
 	ulong	wnd;	/* prescaled window*/
-	ushort	urg;
 	ushort	mss;	/* max segment size option (if not zero) */
 	ushort	len;	/* size of data */
 };
@@ -213,58 +208,49 @@
 		ulong	nxt;		/* Next sequence expected */
 		ulong	ptr;		/* Data pointer */
 		ulong	wnd;		/* Tcp send window */
-		ulong	urg;		/* Urgent data pointer */
-		ulong	wl2;
-		uint	scale;		/* how much to right shift window in xmitted packets */
+		ulong	wl2;		/* Seg.ack of last window update */
+		ulong	wl1;		/* Seg.seq of last window update */
+		uchar	scale;		/* how much to left shift window in received packets */
+		uchar	rto;		/* retransmit timeout counter */ 
 		/* to implement tahoe and reno TCP */
-		ulong	dupacks;	/* number of duplicate acks rcvd */
-		ulong	partialack;
-		int	recovery;	/* loss recovery flag */
-		int	retransmit;	/* retransmit 1 packet @ una flag */
-		int	rto;
+		uchar	recovery;	/* loss recovery flag */
 		ulong	rxt;		/* right window marker for recovery "recover" rfc3782 */
+		ulong	dupacks;	/* number of duplicate acks rcvd */
+		ulong	partialack;	/* partial acks received during recovery */
 	} snd;
 	struct {
 		ulong	nxt;		/* Receive pointer to next uchar slot */
+		ulong	ackptr;		/* Last acked sequence */
+		ulong	wptr;		/* Right side of receive window */
+		ulong	wsnt;		/* Last wptr sent */
 		ulong	wnd;		/* Receive window incoming */
-		ulong	wsnt;		/* Last wptr sent.  important to track for large bdp */
-		ulong	wptr;
-		ulong	urg;		/* Urgent pointer */
-		ulong	ackptr;		/* last acked sequence */
-		int	blocked;
-		uint	scale;		/* how much to left shift window in rcv'd packets */
+		uchar	scale;		/* how much to right shift window in transmitted packets */
 	} rcv;
 	ulong	iss;			/* Initial sequence number */
 	ulong	cwind;			/* Congestion window */
 	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
-	uint	scale;			/* desired snd.scale */
 	ulong	ssthresh;		/* Slow start threshold */
-	int	resent;			/* Bytes just resent */
-	int	irs;			/* Initial received squence */
 	ushort	mss;			/* Maximum segment size */
-	int	rerecv;			/* Overlap of data rerecevived */
+	uchar	scale;			/* desired rcv.scale */
 	ulong	window;			/* Our receive window (queue) */
-	uint	qscale;			/* Log2 of our receive window (queue) */
-	uchar	backoff;		/* Exponential backoff counter */
-	int	backedoff;		/* ms we've backed off for rexmits */
-	uchar	flags;			/* State flags */
+	ulong	overlap;		/* Overlap of data re-recevived */
 	Reseq	*reseq;			/* Resequencing queue */
 	int	nreseq;
 	int	reseqlen;
+	uchar	flags;			/* State flags */
+	uchar	flgcnt;			/* Number of flags in the send sequence (FIN,SYN) */
+	uchar	backoff;		/* Exponential backoff counter */
+	int	backedoff;		/* ms we've backed off for rexmits */
 	Tcptimer	timer;			/* Activity timer */
 	Tcptimer	acktimer;		/* Acknowledge timer */
-	Tcptimer	rtt_timer;		/* Round trip timer */
 	Tcptimer	katimer;		/* keep alive timer */
+	ulong	kato;			/* keep alive timeouts */
+	ulong	time;			/* time Finwait2 or Syn_received or timer was set */
+	ulong	timeuna;		/* snd.una when time was set */
+	ulong	rttime;			/* Sent time for rtt measurement */
 	ulong	rttseq;			/* Round trip sequence */
 	int	srtt;			/* Smoothed round trip */
 	int	mdev;			/* Mean deviation of round trip */
-	int	kacounter;		/* count down for keep alive */
-	uint	sndsyntime;		/* time syn sent */
-	ulong	time;			/* time Finwait2 or Syn_received was sent */
-	ulong	timeuna;			/* snd.una when time was set */
-	int	nochecksum;		/* non-zero means don't send checksums */
-	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */
-
 	union {
 		Tcp4hdr	tcp4hdr;
 		Tcp6hdr	tcp6hdr;
@@ -275,14 +261,7 @@
  *  New calls are put in limbo rather than having a conversation structure
  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
- *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
- *
- *  In particular they aren't on a listener's queue so that they don't figure
- *  in the input queue limit.
- *
- *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
- *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
- *  there is no hashing of this list.
+ *  SYN ACK up to 4 times. They disappear after 2.5 seconds.
  */
 typedef struct Limbo Limbo;
 struct Limbo
@@ -296,14 +275,13 @@
 	ulong	irs;		/* initial received sequence */
 	ulong	iss;		/* initial sent sequence */
 	ushort	mss;		/* mss from the other end */
-	ushort	rcvscale;	/* how much to scale rcvd windows */
-	ushort	sndscale;	/* how much to scale sent windows */
+	uchar	ws;		/* ws from the other end */
+	uchar	rexmits;	/* number of retransmissions */
 	ulong	lastsend;	/* last time we sent a synack */
 	uchar	version;	/* v4 or v6 */
-	uchar	rexmits;	/* number of retransmissions */
 };
 
-int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
+static int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
 
 enum {
 	/* MIB stats */
@@ -392,17 +370,15 @@
 	int	nlimbo;
 	Limbo	*lht[NLHT];
 
-	/* for keeping track of tcpackproc */
-	QLock	apl;
-	int	ackprocstarted;
-
 	uvlong	stats[Nstats];
+
+	int ackprocstarted;
 };
 
 static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block**, ushort);
 static	int	dumpreseq(Tcpctl*);
-static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
-static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
+static	int	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
+static	void	limbo(Conv*, Tcp*, uchar*, uchar*, int);
 static	void	limborexmit(Proto*);
 static	void	localclose(Conv*, char*);
 static	void	procsyn(Conv*, Tcp*);
@@ -411,13 +387,12 @@
 static	void	tcpkeepalive(void*);
 static	void	tcpoutput(Conv*);
 static	void	tcprcvwin(Conv*);
-static	void	tcprxmit(Conv*);
-static	void	tcpsetkacounter(Tcpctl*);
-static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
-static	void	tcpsettimer(Tcpctl*);
-static	void	tcpsndsyn(Conv*, Tcpctl*);
+static	ulong	tcprxmit(Conv*);
+static	void	tcpsetmss(Conv*, ushort);
+static	void	tcpsetscale(Conv*, uchar);
+static	void	tcpsettimer(Conv*);
+static	void	tcpsndsyn(Conv*);
 static	void	tcpstart(Conv*, int);
-static	void	tcpsynackrtt(Conv*);
 static	void	tcptimeout(void*);
 static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
 
@@ -424,23 +399,14 @@
 static void
 tcpsetstate(Conv *s, uchar newstate)
 {
-	Tcpctl *tcb;
-	uchar oldstate;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 	Tcppriv *tpriv;
+	uchar oldstate;
 
-	tpriv = s->p->priv;
-
-	tcb = (Tcpctl*)s->ptcl;
-
 	oldstate = tcb->state;
 	if(oldstate == newstate)
 		return;
 
-	if(oldstate == Established)
-		tpriv->stats[CurrEstab]--;
-	if(newstate == Established)
-		tpriv->stats[CurrEstab]++;
-
 	switch(newstate) {
 	case Closed:
 		qclose(s->rq);
@@ -455,6 +421,12 @@
 
 	tcb->state = newstate;
 
+	tpriv = (Tcppriv*)s->p->priv;
+	if(oldstate == Established)
+		tpriv->stats[CurrEstab]--;
+	if(newstate == Established)
+		tpriv->stats[CurrEstab]++;
+
 	if(oldstate == Syn_sent && newstate != Closed)
 		Fsconnected(s, nil);
 }
@@ -462,10 +434,9 @@
 static char*
 tcpconnect(Conv *c, char **argv, int argc)
 {
+	Tcpctl *tcb = (Tcpctl*)c->ptcl;
 	char *e;
-	Tcpctl *tcb;
 
-	tcb = (Tcpctl*)(c->ptcl);
 	if(tcb->state != Closed)
 		return Econinuse;
 
@@ -480,39 +451,40 @@
 static int
 tcpstate(Conv *c, char *state, int n)
 {
-	Tcpctl *s;
+	Tcpctl *tcb = (Tcpctl*)c->ptcl;
 
-	s = (Tcpctl*)(c->ptcl);
-
-	return snprint(state, n,
-		"%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
-		tcpstates[s->state],
-		c->rq ? qlen(c->rq) : 0,
-		c->wq ? qlen(c->wq) : 0,
-		s->nreseq, s->reseqlen,
-		s->srtt, s->mdev, s->ssthresh,
-		s->cwind, s->snd.wnd, s->snd.scale, s->rcv.wnd, s->rcv.scale,
-		s->qscale,
-		s->timer.start, s->timer.count, s->rerecv,
-		s->katimer.start, s->katimer.count);
+	return snprint(state, n, "%s qin %d qout %d"
+		" rq %d.%d ovl %lud"
+		" srtt %d>>%d mdev %d"
+		" timer %d/%d"
+		" sst %lud cwin %lud"
+		" swin %lud>>%d rwin %lud>>%d"
+		" ka %d/%d %lud\n",
+		tcpstates[tcb->state],
+		c->rq != nil ? qlen(c->rq) : 0,
+		c->wq != nil ? qlen(c->wq) : 0,
+		tcb->nreseq, tcb->reseqlen, tcb->overlap,
+		tcb->srtt, LOGAGAIN, tcb->mdev,
+		tcb->timer.count*MSPTICK, tcb->timer.start*MSPTICK,
+		tcb->ssthresh, tcb->cwind,
+		tcb->snd.wnd, tcb->snd.scale, tcb->rcv.wnd, tcb->rcv.scale,
+		tcb->katimer.count*MSPTICK, tcb->katimer.start*MSPTICK, tcb->kato);
 }
 
 static int
 tcpinuse(Conv *c)
 {
-	Tcpctl *s;
+	Tcpctl *tcb = (Tcpctl*)c->ptcl;
 
-	s = (Tcpctl*)(c->ptcl);
-	return s->state != Closed;
+	return tcb->state != Closed;
 }
 
 static char*
 tcpannounce(Conv *c, char **argv, int argc)
 {
+	Tcpctl *tcb = (Tcpctl*)c->ptcl;
 	char *e;
-	Tcpctl *tcb;
 
-	tcb = (Tcpctl*)(c->ptcl);
 	if(tcb->state != Closed)
 		return Econinuse;
 
@@ -526,15 +498,13 @@
 }
 
 /*
- *  tcpclose is always called with the q locked
+ *  tcpclose is always called with the c locked
  */
 static void
 tcpclose(Conv *c)
 {
-	Tcpctl *tcb;
+	Tcpctl *tcb = (Tcpctl*)c->ptcl;
 
-	tcb = (Tcpctl*)c->ptcl;
-
 	qhangup(c->rq, nil);
 	qhangup(c->wq, nil);
 	qhangup(c->eq, nil);
@@ -570,19 +540,12 @@
 }
 
 static void
-tcpkick(void *x)
+tcpkick(void *arg)
 {
-	Conv *s = x;
-	Tcpctl *tcb;
+	Conv *s = (Conv*)arg;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 
-	tcb = (Tcpctl*)s->ptcl;
-
-	if(waserror()){
-		qunlock(s);
-		nexterror();
-	}
 	qlock(s);
-
 	switch(tcb->state) {
 	case Syn_sent:
 	case Syn_received:
@@ -597,49 +560,68 @@
 		localclose(s, "Hangup");
 		break;
 	}
-
 	qunlock(s);
-	poperror();
 }
 
-static int seq_lt(ulong, ulong);
+static int
+seq_in(ulong x, ulong low, ulong high)
+{
+	x -= low, high -= low;
+	return (int)x >= 0 && (int)x < high;
+}
 
+static int
+seq_lt(ulong x, ulong y)
+{
+	return (int)(x-y) < 0;
+}
+
+static int
+seq_le(ulong x, ulong y)
+{
+	return (int)(x-y) <= 0;
+}
+
+static int
+seq_gt(ulong x, ulong y)
+{
+	return (int)(x-y) > 0;
+}
+
+static int
+seq_ge(ulong x, ulong y)
+{
+	return (int)(x-y) >= 0;
+}
+
 static void
 tcprcvwin(Conv *s)				/* Call with tcb locked */
 {
-	int w;
-	Tcpctl *tcb;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+	ulong w;
 
-	tcb = (Tcpctl*)s->ptcl;
 	w = tcb->window - qlen(s->rq);
-	if(w < 0)
+
+	/* RFC 1122 § 4.2.3.3 silly window syndrome avoidance */
+	if((int)w < tcb->rcv.wnd + tcb->mss)
 		w = 0;
-	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
-	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
-		w = tcb->rcv.wptr - tcb->rcv.nxt;
-	if(w != tcb->rcv.wnd)
-	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
-		tcb->rcv.blocked = 1;
-		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
-			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
-	}
-	tcb->rcv.wnd = w;
-	tcb->rcv.wptr = tcb->rcv.nxt + w;
+
+	w += tcb->rcv.nxt;
+
+	/* RFC 1122 § 4.2.2.16 do not move right edge of window left */
+	if(seq_lt(w, tcb->rcv.wptr))
+		w = tcb->rcv.wptr;
+
+	tcb->rcv.wptr = w;
+	tcb->rcv.wnd = w - tcb->rcv.nxt;
 }
 
 static void
-tcpacktimer(void *v)
+tcpacktimer(void *arg)
 {
-	Tcpctl *tcb;
-	Conv *s;
+	Conv *s = (Conv*)arg;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 
-	s = v;
-	tcb = (Tcpctl*)s->ptcl;
-
-	if(waserror()){
-		qunlock(s);
-		nexterror();
-	}
 	qlock(s);
 	if(tcb->state != Closed){
 		tcb->flags |= FORCE;
@@ -646,7 +628,6 @@
 		tcpoutput(s);
 	}
 	qunlock(s);
-	poperror();
 }
 
 static void
@@ -731,16 +712,12 @@
 }
 
 static void
-tcpackproc(void *a)
+tcpackproc(void *arg)
 {
+	Proto *tcp = (Proto*)arg;
+	Tcppriv *priv = (Tcppriv*)tcp->priv;
 	Tcptimer *t, *tp, *timeo;
-	Proto *tcp;
-	Tcppriv *priv;
-	int loop;
 
-	tcp = a;
-	priv = tcp->priv;
-
 	while(waserror())
 		;
 
@@ -749,14 +726,10 @@
 
 		qlock(&priv->tl);
 		timeo = nil;
-		loop = 0;
 		for(t = priv->timers; t != nil; t = tp) {
-			if(loop++ > 10000)
-				panic("tcpackproc1");
 			tp = t->next;
  			if(t->state == TcptimerON) {
-				t->count--;
-				if(t->count == 0) {
+				if(--(t->count) == 0) {
 					timerstate(priv, t, TcptimerDONE);
 					t->readynext = timeo;
 					timeo = t;
@@ -765,11 +738,9 @@
 		}
 		qunlock(&priv->tl);
 
-		loop = 0;
-		for(t = timeo; t != nil; t = t->readynext) {
-			if(loop++ > 10000)
-				panic("tcpackproc2");
-			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
+		while((t = timeo) != nil){
+			timeo = t->readynext;
+			if(t->state == TcptimerDONE && !waserror()){
 				(*t->func)(t->arg);
 				poperror();
 			}
@@ -802,25 +773,15 @@
 	qunlock(&priv->tl);
 }
 
-static int
-backoff(int n)
-{
-	return 1 << n;
-}
-
 static void
 localclose(Conv *s, char *reason)	/* called with c locked */
 {
-	Tcpctl *tcb;
-	Tcppriv *tpriv;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
 
-	tpriv = s->p->priv;
-	tcb = (Tcpctl*)s->ptcl;
-
 	iphtrem(&tpriv->ht, s);
 
 	tcphalt(tpriv, &tcb->timer);
-	tcphalt(tpriv, &tcb->rtt_timer);
 	tcphalt(tpriv, &tcb->acktimer);
 	tcphalt(tpriv, &tcb->katimer);
 
@@ -840,16 +801,12 @@
 
 /* mtu (- TCP + IP hdr len) of 1st hop */
 static int
-tcpmtu(Route *r, int version, uint *scale)
+tcpmtu(Route *r, uchar *scale, int version)
 {
 	Ipifc *ifc;
 	int mtu;
 
-	/*
-	 * set the ws.  it doesn't commit us to anything.
-	 * ws is the ultimate limit to the bandwidth-delay product.
-	 */
-	*scale = Defadvscale;
+	*scale = QSCALE;
 
 	/*
 	 * currently we do not implement path MTU discovery
@@ -875,93 +832,85 @@
 static void
 inittcpctl(Conv *s, int mode)
 {
-	Tcpctl *tcb;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 	Tcp4hdr* h4;
 	Tcp6hdr* h6;
-	int mss;
 
-	tcb = (Tcpctl*)s->ptcl;
-
 	memset(tcb, 0, sizeof(Tcpctl));
 
-	tcb->ssthresh = QMAX;			/* reset by tcpsetscale() */
 	tcb->srtt = tcp_irtt<<LOGAGAIN;
-	tcb->mdev = 0;
+	tcb->mdev = tcp_irtt<<(LOGDGAIN-1);
 
 	/* setup timers */
 	tcb->timer.start = tcp_irtt / MSPTICK;
 	tcb->timer.func = tcptimeout;
 	tcb->timer.arg = s;
-	tcb->rtt_timer.start = MAX_TIME;
+
 	tcb->acktimer.start = TCP_ACK / MSPTICK;
 	tcb->acktimer.func = tcpacktimer;
 	tcb->acktimer.arg = s;
-	tcb->katimer.start = DEF_KAT / MSPTICK;
+
+	tcb->katimer.start = 0;	/* not enabled by default */
 	tcb->katimer.func = tcpkeepalive;
 	tcb->katimer.arg = s;
 
-	mss = DEF_MSS;
+	if(mode == TCP_LISTEN)
+		return;
 
+	if(ipcmp(s->laddr, IPnoaddr) == 0)
+		findlocalip(s->p->f, s->laddr, s->raddr);
+
 	/* create a prototype(pseudo) header */
-	if(mode != TCP_LISTEN){
-		if(ipcmp(s->laddr, IPnoaddr) == 0)
-			findlocalip(s->p->f, s->laddr, s->raddr);
+	switch(s->ipversion){
+	case V4:
+		h4 = &tcb->protohdr.tcp4hdr;
+		memset(h4, 0, sizeof(*h4));
+		h4->vihl = IP_VER4;
+		h4->proto = IP_TCPPROTO;
+		hnputs(h4->tcpsport, s->lport);
+		hnputs(h4->tcpdport, s->rport);
+		v6tov4(h4->tcpsrc, s->laddr);
+		v6tov4(h4->tcpdst, s->raddr);
 
-		switch(s->ipversion){
-		case V4:
-			h4 = &tcb->protohdr.tcp4hdr;
-			memset(h4, 0, sizeof(*h4));
-			h4->proto = IP_TCPPROTO;
-			hnputs(h4->tcpsport, s->lport);
-			hnputs(h4->tcpdport, s->rport);
-			v6tov4(h4->tcpsrc, s->laddr);
-			v6tov4(h4->tcpdst, s->raddr);
-			break;
-		case V6:
-			h6 = &tcb->protohdr.tcp6hdr;
-			memset(h6, 0, sizeof(*h6));
-			h6->proto = IP_TCPPROTO;
-			hnputs(h6->tcpsport, s->lport);
-			hnputs(h6->tcpdport, s->rport);
-			ipmove(h6->tcpsrc, s->laddr);
-			ipmove(h6->tcpdst, s->raddr);
-			mss = DEF_MSS6;
-			break;
-		default:
-			panic("inittcpctl: version %d", s->ipversion);
-		}
-	}
+		tcb->mss = DEF_MSS;
+		break;
+	case V6:
+		h6 = &tcb->protohdr.tcp6hdr;
+		memset(h6, 0, sizeof(*h6));
+		h6->proto = IP_TCPPROTO;
+		hnputs(h6->tcpsport, s->lport);
+		hnputs(h6->tcpdport, s->rport);
+		ipmove(h6->tcpsrc, s->laddr);
+		ipmove(h6->tcpdst, s->raddr);
 
-	tcb->mss = tcb->cwind = mss;
-	tcb->abcbytes = 0;
-	/* default is no window scaling */
-	tcpsetscale(s, tcb, 0, 0);
+		tcb->mss = DEF_MSS6;
+		break;
+	default:
+		panic("inittcpctl: version %d", s->ipversion);
+	}
+	tcpsetscale(s, 0);
 }
 
 /*
- *  called with s qlocked
+ *  called with s locked
  */
 static void
 tcpstart(Conv *s, int mode)
 {
-	Tcpctl *tcb;
-	Tcppriv *tpriv;
-	char kpname[KNAMELEN];
+	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
 
-	tpriv = s->p->priv;
-
 	if(tpriv->ackprocstarted == 0){
-		qlock(&tpriv->apl);
+		qlock(&tpriv->tl);
 		if(tpriv->ackprocstarted == 0){
+			char kpname[KNAMELEN];
+
 			snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
 			kproc(kpname, tcpackproc, s->p);
 			tpriv->ackprocstarted = 1;
 		}
-		qunlock(&tpriv->apl);
+		qunlock(&tpriv->tl);
 	}
 
-	tcb = (Tcpctl*)s->ptcl;
-
 	inittcpctl(s, mode);
 
 	iphtadd(&tpriv->ht, s);
@@ -968,14 +917,12 @@
 	switch(mode) {
 	case TCP_LISTEN:
 		tpriv->stats[PassiveOpens]++;
-		tcb->flags |= CLONE;
 		tcpsetstate(s, Listen);
 		break;
 
 	case TCP_CONNECT:
 		tpriv->stats[ActiveOpens]++;
-		tcb->flags |= ACTIVE;
-		tcpsndsyn(s, tcb);
+		tcpsndsyn(s);
 		tcpsetstate(s, Syn_sent);
 		tcpoutput(s);
 		break;
@@ -1005,11 +952,10 @@
 }
 
 static Block*
-htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
+htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph)
 {
 	int dlen;
 	Tcp6hdr *h;
-	ushort csum;
 	ushort hdrlen, optpad = 0;
 	uchar *opt;
 
@@ -1048,8 +994,8 @@
 	hnputl(h->tcpseq, tcph->seq);
 	hnputl(h->tcpack, tcph->ack);
 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
-	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
-	hnputs(h->tcpurg, tcph->urg);
+	hnputs(h->tcpwin, tcph->wnd);
+	hnputs(h->tcpurg, 0);
 
 	if(tcph->flags & SYN){
 		opt = h->tcpopt;
@@ -1068,12 +1014,7 @@
 			*opt++ = NOOPOPT;
 	}
 
-	if(tcb != nil && tcb->nochecksum){
-		h->tcpcksum[0] = h->tcpcksum[1] = 0;
-	} else {
-		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
-		hnputs(h->tcpcksum, csum);
-	}
+	hnputs(h->tcpcksum, ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE));
 
 	/* move from pseudo header back to normal ip header */
 	memset(h->vcf, 0, 4);
@@ -1085,11 +1026,10 @@
 }
 
 static Block*
-htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
+htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph)
 {
 	int dlen;
 	Tcp4hdr *h;
-	ushort csum;
 	ushort hdrlen, optpad = 0;
 	uchar *opt;
 
@@ -1124,8 +1064,8 @@
 	hnputl(h->tcpseq, tcph->seq);
 	hnputl(h->tcpack, tcph->ack);
 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
-	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
-	hnputs(h->tcpurg, tcph->urg);
+	hnputs(h->tcpwin, tcph->wnd);
+	hnputs(h->tcpurg, 0);
 
 	if(tcph->flags & SYN){
 		opt = h->tcpopt;
@@ -1145,12 +1085,7 @@
 			*opt++ = NOOPOPT;
 	}
 
-	if(tcb != nil && tcb->nochecksum){
-		h->tcpcksum[0] = h->tcpcksum[1] = 0;
-	} else {
-		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
-		hnputs(h->tcpcksum, csum);
-	}
+	hnputs(h->tcpcksum, ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE));
 
 	return data;
 }
@@ -1182,7 +1117,6 @@
 
 	tcph->flags = h->tcpflag[1];
 	tcph->wnd = nhgets(h->tcpwin);
-	tcph->urg = nhgets(h->tcpurg);
 	tcph->mss = 0;
 	tcph->ws = 0;
 	tcph->update = 0;
@@ -1209,8 +1143,8 @@
 				tcph->mss = nhgets(optr+2);
 			break;
 		case WSOPT:
-			if(optlen == WS_LENGTH && *(optr+2) <= 14)
-				tcph->ws = *(optr+2);
+			if(optlen == WS_LENGTH)
+				tcph->ws = optr[2];
 			break;
 		}
 		n -= optlen;
@@ -1247,7 +1181,6 @@
 
 	tcph->flags = h->tcpflag[1];
 	tcph->wnd = nhgets(h->tcpwin);
-	tcph->urg = nhgets(h->tcpurg);
 	tcph->mss = 0;
 	tcph->ws = 0;
 	tcph->update = 0;
@@ -1274,8 +1207,8 @@
 				tcph->mss = nhgets(optr+2);
 			break;
 		case WSOPT:
-			if(optlen == WS_LENGTH && *(optr+2) <= 14)
-				tcph->ws = *(optr+2);
+			if(optlen == WS_LENGTH)
+				tcph->ws = optr[2];
 			break;
 		}
 		n -= optlen;
@@ -1289,36 +1222,39 @@
  *  number and put a SYN on the send queue
  */
 static void
-tcpsndsyn(Conv *s, Tcpctl *tcb)
+tcpsndsyn(Conv *s)
 {
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+
 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
-	tcb->rttseq = tcb->iss;
 	tcb->snd.wl2 = tcb->iss;
 	tcb->snd.una = tcb->iss;
 	tcb->snd.rxt = tcb->iss;
-	tcb->snd.ptr = tcb->rttseq;
-	tcb->snd.nxt = tcb->rttseq;
+	tcb->snd.ptr = tcb->iss;
+	tcb->snd.nxt = tcb->iss;
+
+	tcb->flags = (tcb->flags & ~SYNACK) | FORCE;
 	tcb->flgcnt++;
-	tcb->flags |= FORCE;
-	tcb->sndsyntime = NOW;
 
+	tcb->rttime = 0;	/* set in tcpoutput() */
+
 	/* set desired mss and scale */
-	tcb->mss = tcpmtu(v6lookup(s->p->f, s->raddr, s->laddr, s), s->ipversion, &tcb->scale);
+	tcb->mss = tcpmtu(v6lookup(s->p->f, s->raddr, s->laddr, s), &tcb->scale, s->ipversion);
 }
 
 static int
-sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason, Routehint *rh)
+sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, int version, char *reason, Routehint *rh)
 {
-	Block *hbp;
+	Tcppriv *tpriv = (Tcppriv*)tcp->priv;
+	Block *bp;
+	union {
+		Tcp4hdr ph4;
+		Tcp6hdr ph6;
+	} u;
 	uchar rflags;
-	Tcppriv *tpriv;
-	Tcp4hdr ph4;
-	Tcp6hdr ph6;
 
 	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
 
-	tpriv = tcp->priv;
-
 	if(seg->flags & RST)
 		return -1;
 
@@ -1325,24 +1261,23 @@
 	/* make pseudo header */
 	switch(version) {
 	case V4:
-		memset(&ph4, 0, sizeof(ph4));
-		ph4.vihl = IP_VER4;
-		v6tov4(ph4.tcpsrc, dest);
-		v6tov4(ph4.tcpdst, source);
-		ph4.proto = IP_TCPPROTO;
-		hnputs(ph4.tcplen, TCP4_HDRSIZE);
-		hnputs(ph4.tcpsport, seg->dest);
-		hnputs(ph4.tcpdport, seg->source);
+		memset(&u.ph4, 0, sizeof(u.ph4));
+		u.ph4.vihl = IP_VER4;
+		u.ph4.proto = IP_TCPPROTO;
+		hnputs(u.ph4.tcplen, TCP4_HDRSIZE);
+		hnputs(u.ph4.tcpsport, seg->dest);
+		hnputs(u.ph4.tcpdport, seg->source);
+		v6tov4(u.ph4.tcpsrc, dest);
+		v6tov4(u.ph4.tcpdst, source);
 		break;
 	case V6:
-		memset(&ph6, 0, sizeof(ph6));
-		ph6.vcf[0] = IP_VER6;
-		ipmove(ph6.tcpsrc, dest);
-		ipmove(ph6.tcpdst, source);
-		ph6.proto = IP_TCPPROTO;
-		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
-		hnputs(ph6.tcpsport, seg->dest);
-		hnputs(ph6.tcpdport, seg->source);
+		memset(&u.ph6, 0, sizeof(u.ph6));
+		u.ph6.proto = IP_TCPPROTO;
+		hnputs(u.ph6.ploadlen, TCP6_HDRSIZE);
+		hnputs(u.ph6.tcpsport, seg->dest);
+		hnputs(u.ph6.tcpdport, seg->source);
+		ipmove(u.ph6.tcpsrc, dest);
+		ipmove(u.ph6.tcpdst, source);
 		break;
 	default:
 		panic("sndrst: version %d", version);
@@ -1368,16 +1303,15 @@
 	}
 	seg->flags = rflags;
 	seg->wnd = 0;
-	seg->urg = 0;
 	seg->mss = 0;
 	seg->ws = 0;
 	switch(version) {
 	case V4:
-		hbp = htontcp4(seg, nil, &ph4, nil);
-		return ipoput4(tcp->f, hbp, nil, MAXTTL, DFLTTOS, rh);
+		bp = htontcp4(seg, nil, &u.ph4);
+		return ipoput4(tcp->f, bp, nil, MAXTTL, DFLTTOS, rh);
 	case V6:
-		hbp = htontcp6(seg, nil, &ph6, nil);
-		return ipoput6(tcp->f, hbp, nil, MAXTTL, DFLTTOS, rh);
+		bp = htontcp6(seg, nil, &u.ph6);
+		return ipoput6(tcp->f, bp, nil, MAXTTL, DFLTTOS, rh);
 	}
 	return -1;
 }
@@ -1384,48 +1318,41 @@
 
 /*
  *  send a reset to the remote side and close the conversation
- *  called with s qlocked
+ *  called with s locked
  */
 static char*
 tcphangup(Conv *s)
 {
-	Tcp seg;
-	Tcpctl *tcb;
-	Block *hbp;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 
-	tcb = (Tcpctl*)s->ptcl;
-	if(waserror())
-		return commonerror();
-	if(ipcmp(s->raddr, IPnoaddr) != 0) {
-		if(!waserror()){
-			memset(&seg, 0, sizeof seg);
-			seg.flags = RST | ACK;
-			seg.ack = tcb->rcv.nxt;
-			tcb->rcv.ackptr = seg.ack;
-			seg.seq = tcb->snd.ptr;
-			seg.wnd = 0;
-			seg.urg = 0;
-			seg.mss = 0;
-			seg.ws = 0;
-			switch(s->ipversion) {
-			case V4:
-				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
-				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
-				ipoput4(s->p->f, hbp, nil, s->ttl, s->tos, s);
-				break;
-			case V6:
-				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
-				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
-				ipoput6(s->p->f, hbp, nil, s->ttl, s->tos, s);
-				break;
-			default:
-				panic("tcphangup: version %d", s->ipversion);
-			}
-			poperror();
+	if(ipcmp(s->raddr, IPnoaddr) != 0 && tcb->state != Closed) {
+		Block *bp;
+		Tcp seg;
+
+		memset(&seg, 0, sizeof seg);
+		seg.flags = RST | ACK;
+		seg.seq = tcb->snd.ptr;
+		seg.ack = tcb->rcv.ackptr = tcb->rcv.nxt;
+		seg.wnd = 0;
+		seg.mss = 0;
+		seg.ws = 0;
+		switch(s->ipversion) {
+		case V4:
+			bp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr);
+			qunlock(s);
+			ipoput4(s->p->f, bp, nil, s->ttl, s->tos, s);
+			break;
+		case V6:
+			bp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr);
+			qunlock(s);
+			ipoput6(s->p->f, bp, nil, s->ttl, s->tos, s);
+			break;
+		default:
+			panic("tcphangup: version %d", s->ipversion);
 		}
+		qlock(s);
 	}
 	localclose(s, nil);
-	poperror();
 	return nil;
 }
 
@@ -1435,13 +1362,14 @@
 static int
 sndsynack(Proto *tcp, Limbo *lp)
 {
+	union {
+		Tcp4hdr ph4;
+		Tcp6hdr ph6;
+	} u;
+	Tcp seg;
+	Block *bp;
 	Routehint rh;
 	Route *rt;
-	Block *hbp;
-	Tcp4hdr ph4;
-	Tcp6hdr ph6;
-	Tcp seg;
-	uint scale;
 
 	rh.r = nil;
 	rh.a = nil;
@@ -1451,24 +1379,23 @@
 	/* make pseudo header */
 	switch(lp->version) {
 	case V4:
-		memset(&ph4, 0, sizeof(ph4));
-		ph4.vihl = IP_VER4;
-		v6tov4(ph4.tcpsrc, lp->laddr);
-		v6tov4(ph4.tcpdst, lp->raddr);
-		ph4.proto = IP_TCPPROTO;
-		hnputs(ph4.tcplen, TCP4_HDRSIZE);
-		hnputs(ph4.tcpsport, lp->lport);
-		hnputs(ph4.tcpdport, lp->rport);
+		memset(&u.ph4, 0, sizeof(u.ph4));
+		u.ph4.vihl = IP_VER4;
+		u.ph4.proto = IP_TCPPROTO;
+		hnputs(u.ph4.tcplen, TCP4_HDRSIZE);
+		hnputs(u.ph4.tcpsport, lp->lport);
+		hnputs(u.ph4.tcpdport, lp->rport);
+		v6tov4(u.ph4.tcpsrc, lp->laddr);
+		v6tov4(u.ph4.tcpdst, lp->raddr);
 		break;
 	case V6:
-		memset(&ph6, 0, sizeof(ph6));
-		ph6.vcf[0] = IP_VER6;
-		ipmove(ph6.tcpsrc, lp->laddr);
-		ipmove(ph6.tcpdst, lp->raddr);
-		ph6.proto = IP_TCPPROTO;
-		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
-		hnputs(ph6.tcpsport, lp->lport);
-		hnputs(ph6.tcpdport, lp->rport);
+		memset(&u.ph6, 0, sizeof(u.ph6));
+		u.ph6.proto = IP_TCPPROTO;
+		hnputs(u.ph6.ploadlen, TCP6_HDRSIZE);
+		hnputs(u.ph6.tcpsport, lp->lport);
+		hnputs(u.ph6.tcpdport, lp->rport);
+		ipmove(u.ph6.tcpsrc, lp->laddr);
+		ipmove(u.ph6.tcpdst, lp->raddr);
 		break;
 	default:
 		panic("sndsynack: version %d", lp->version);
@@ -1478,27 +1405,23 @@
 	seg.seq = lp->iss;
 	seg.ack = lp->irs+1;
 	seg.flags = SYN|ACK;
-	seg.urg = 0;
-	seg.mss = tcpmtu(rt, lp->version, &scale);
+	seg.mss = tcpmtu(rt, &seg.ws, lp->version);
 	seg.wnd = QMAX;
 
-	/* if the other side set scale, we should too */
-	if(lp->rcvscale){
-		seg.ws = scale;
-		lp->sndscale = scale;
-	} else {
+	/* if the other did not set window scale, both should be zero */
+	if(lp->ws == 0)
 		seg.ws = 0;
-		lp->sndscale = 0;
-	}
+
+	lp->rexmits++;
 	lp->lastsend = NOW;
 
 	switch(lp->version) {
 	case V4:
-		hbp = htontcp4(&seg, nil, &ph4, nil);
-		return ipoput4(tcp->f, hbp, nil, MAXTTL, DFLTTOS, &rh);
+		bp = htontcp4(&seg, nil, &u.ph4);
+		return ipoput4(tcp->f, bp, nil, MAXTTL, DFLTTOS, &rh);
 	case V6:
-		hbp = htontcp6(&seg, nil, &ph6, nil);
-		return ipoput6(tcp->f, hbp, nil, MAXTTL, DFLTTOS, &rh);
+		bp = htontcp6(&seg, nil, &u.ph6);
+		return ipoput6(tcp->f, bp, nil, MAXTTL, DFLTTOS, &rh);
 	}
 	return -1;
 }
@@ -1505,6 +1428,29 @@
 
 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
 
+static Limbo**
+limbohash(Tcppriv *tpriv, uchar *raddr, ushort rport)
+{
+	return &tpriv->lht[hashipa(raddr, rport)];
+}
+
+static Limbo**
+limboent(Tcppriv *tpriv, uchar *raddr, ushort rport, uchar *laddr, ushort lport, int version)
+{
+	Limbo *lp, **l;
+
+	for(l = limbohash(tpriv, raddr, rport); (lp = *l) != nil; l = &lp->next){
+		if(lp->lport != lport || lp->rport != rport || lp->version != version)
+			continue;
+		if(ipcmp(lp->raddr, raddr) != 0)
+			continue;
+		if(ipcmp(lp->laddr, laddr) != 0)
+			continue;
+		break;
+	}
+	return l;
+}
+
 /*
  *  put a call into limbo and respond with a SYN ACK
  *
@@ -1511,31 +1457,23 @@
  *  called with proto locked
  */
 static void
-limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
+limbo(Conv *s, Tcp *seg, uchar *source, uchar *dest, int version)
 {
+	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
 	Limbo *lp, **l;
-	Tcppriv *tpriv;
-	int h;
 
-	tpriv = s->p->priv;
-	h = hashipa(source, seg->source);
-
-	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
-		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
-			continue;
-		if(ipcmp(lp->raddr, source) != 0)
-			continue;
-		if(ipcmp(lp->laddr, dest) != 0)
-			continue;
-
+	l = limboent(tpriv, source, seg->source, dest, seg->dest, version);
+	if((lp = *l) != nil){
 		/* each new SYN restarts the retransmits */
 		lp->irs = seg->seq;
-		break;
-	}
-	if((lp = *l) == nil){
-		if(tpriv->nlimbo >= Maxlimbo && (lp = tpriv->lht[h]) != nil){
-			if((tpriv->lht[h] = lp->next) == nil)
-				l = &tpriv->lht[h];
+	} else {
+		Limbo **h;
+
+		if(tpriv->nlimbo >= Maxlimbo
+		/* reuse the oldest entry (head) of this hash bucket */
+		&& (lp = *(h = limbohash(tpriv, source, seg->source))) != nil){
+			if((*h = lp->next) == nil)
+				l = h;
 		} else {
 			if((lp = malloc(sizeof(*lp))) == nil)
 				return;
@@ -1542,15 +1480,16 @@
 			tpriv->nlimbo++;
 		}
 		lp->next = nil;
-		lp->version = version;
 		ipmove(lp->laddr, dest);
 		ipmove(lp->raddr, source);
 		lp->lport = seg->dest;
 		lp->rport = seg->source;
 		lp->mss = seg->mss;
-		lp->rcvscale = seg->ws;
+		lp->ws = seg->ws;
 		lp->irs = seg->seq;
 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
+		lp->rexmits = 0;
+		lp->version = version;
 		*l = lp;
 	}
 	if(sndsynack(s->p, lp) < 0){
@@ -1561,7 +1500,7 @@
 }
 
 /*
- *  resend SYN ACK's once every SYNACK_RXTIMER ms.
+ *  resend SYN ACK's.
  */
 static void
 limborexmit(Proto *tcp)
@@ -1571,19 +1510,15 @@
 	ulong now;
 	int h;
 
-	tpriv = tcp->priv;
-
 	if(!canqlock(tcp))
 		return;
-	if(waserror()){
-		qunlock(tcp);
-		return;
-	}
+
 	now = NOW;
+	tpriv = tcp->priv;
 	for(h = 0; h < NLHT; h++){
 		for(l = &tpriv->lht[h]; (lp = *l) != nil; ){
-			if(now - lp->lastsend >= (lp->rexmits+1)*SYNACK_RXTIMER){
-				if(++(lp->rexmits) > 5 || sndsynack(tcp, lp) < 0){
+			if(now - lp->lastsend >= lp->rexmits*250){
+				if(lp->rexmits > 4 || sndsynack(tcp, lp) < 0){
 					tpriv->nlimbo--;
 					*l = lp->next;
 					free(lp);
@@ -1594,7 +1529,6 @@
 		}
 	}
 	qunlock(tcp);
-	poperror();
 }
 
 /*
@@ -1603,45 +1537,32 @@
  *  called with proto locked
  */
 static void
-limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
+limborst(Conv *s, Tcp *seg, uchar *source, uchar *dest, int version)
 {
+	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
 	Limbo *lp, **l;
-	Tcppriv *tpriv;
-	int h;
 
-	tpriv = s->p->priv;
-
-	/* find a call in limbo */
-	h = hashipa(src, segp->source);
-	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
-		lp = *l;
-		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
-			continue;
-		if(ipcmp(lp->laddr, dst) != 0)
-			continue;
-		if(ipcmp(lp->raddr, src) != 0)
-			continue;
-
-		/* RST can only follow the SYN */
-		if(segp->seq == lp->irs+1){
-			tpriv->nlimbo--;
-			*l = lp->next;
-			free(lp);
-		}
-		break;
+	l = limboent(tpriv, source, seg->source, dest, seg->dest, version);
+	if((lp = *l) == nil)
+		return;
+	/* RST can only follow the SYN */
+	if(seg->seq == lp->irs+1){
+		tpriv->nlimbo--;
+		*l = lp->next;
+		free(lp);
 	}
 }
-
+/*
+ *  use the time between the first SYN and its ack as the
+ *  initial round trip time
+ */
 static void
-initialwindow(Tcpctl *tcb)
+tcpsynackrtt(Tcpctl *tcb)
 {
-	/* RFC 3390 initial window */
-	if(tcb->mss < 1095)
-		tcb->cwind = 4*tcb->mss;
-	else if(tcb->mss < 2190)
-		tcb->cwind = 4380;
-	else
-		tcb->cwind = 2*tcb->mss;
+	int rtt = (int)(NOW - tcb->rttime);
+	tcb->rttime = 0;
+	tcb->srtt = rtt<<LOGAGAIN;
+	tcb->mdev = rtt<<(LOGDGAIN-1);
 }
 
 /*
@@ -1651,53 +1572,32 @@
  *  called with proto locked
  */
 static Conv*
-tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
+tcpincoming(Conv *s, Tcp *seg, uchar *src, uchar *dst, int version)
 {
+	Tcppriv *tpriv;
+	Limbo *lp, **l;
 	Conv *new;
 	Tcpctl *tcb;
-	Tcppriv *tpriv;
 	Tcp4hdr *h4;
 	Tcp6hdr *h6;
-	Limbo *lp, **l;
-	int h;
 
 	/* unless it's just an ack, it can't be someone coming out of limbo */
-	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
+	if((seg->flags & SYN) || (seg->flags & ACK) == 0)
 		return nil;
 
-	tpriv = s->p->priv;
-
-	/* find a call in limbo */
-	h = hashipa(src, segp->source);
-	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
-		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
-			src, segp->source, lp->raddr, lp->rport,
-			dst, segp->dest, lp->laddr, lp->lport,
-			version, lp->version
- 		);
-
-		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
-			continue;
-		if(ipcmp(lp->laddr, dst) != 0)
-			continue;
-		if(ipcmp(lp->raddr, src) != 0)
-			continue;
-
-		/* we're assuming no data with the initial SYN */
-		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
-			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
-				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
-			lp = nil;
-		} else {
-			tpriv->nlimbo--;
-			*l = lp->next;
-		}
-		break;
-	}
-	if(lp == nil)
+	tpriv = (Tcppriv*)s->p->priv;
+	l = limboent(tpriv, src, seg->source, dst, seg->dest, version);
+	if((lp = *l) == nil)
 		return nil;
+	if(seg->seq != lp->irs+1 || seg->ack != lp->iss+1){
+		netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
+			seg->seq, lp->irs+1, seg->ack, lp->iss+1);
+		return nil;
+	}
+	tpriv->nlimbo--;
+	*l = lp->next;
 
-	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
+	new = Fsnewcall(s, src, seg->source, dst, seg->dest, version);
 	if(new == nil){
 		free(lp);
 		return nil;
@@ -1705,7 +1605,6 @@
 
 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
 	tcb = (Tcpctl*)new->ptcl;
-	tcb->flags &= ~CLONE;
 	tcb->timer.arg = new;
 	tcb->timer.state = TcptimerOFF;
 	tcb->acktimer.arg = new;
@@ -1712,50 +1611,42 @@
 	tcb->acktimer.state = TcptimerOFF;
 	tcb->katimer.arg = new;
 	tcb->katimer.state = TcptimerOFF;
-	tcb->rtt_timer.arg = new;
-	tcb->rtt_timer.state = TcptimerOFF;
 
-	tcb->irs = lp->irs;
-	tcb->rcv.nxt = tcb->irs+1;
-	tcb->rcv.wptr = tcb->rcv.nxt;
-	tcb->rcv.wsnt = 0;
-	tcb->rcv.urg = tcb->rcv.nxt;
+	tcb->rcv.nxt = tcb->rcv.ackptr = seg->seq;
+	tcb->rcv.wptr = tcb->rcv.wsnt = tcb->rcv.nxt;
+	tcb->rcv.wnd = 0;
 
 	tcb->iss = lp->iss;
-	tcb->rttseq = tcb->iss;
-	tcb->snd.wl2 = tcb->iss;
-	tcb->snd.una = tcb->iss+1;
-	tcb->snd.ptr = tcb->iss+1;
-	tcb->snd.nxt = tcb->iss+1;
-	tcb->snd.rxt = tcb->iss+1;
+
+	tcb->snd.una = seg->ack;
+	tcb->snd.ptr = seg->ack;
+	tcb->snd.nxt = seg->ack;
+	tcb->snd.rxt = seg->ack;
+
 	tcb->flgcnt = 0;
 	tcb->flags |= SYNACK;
 
-	/* set desired mss and scale */
-	tcb->mss = tcpmtu(v6lookup(s->p->f, src, dst, s), version, &tcb->scale);
+	tcb->rttime = lp->lastsend;
+	tcpsynackrtt(tcb);
 
-	/* our sending max segment size cannot be bigger than what he asked for */
-	if(lp->mss != 0 && lp->mss < tcb->mss)
-		tcb->mss = lp->mss;
+	/* the same as what we sent in SYN,ACK */
+	tcb->mss = tcpmtu(v6lookup(s->p->f, src, dst, s), &tcb->scale, version);
 
-	/* window scaling */
-	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
+	tcpsetmss(new, lp->mss);
+	tcpsetscale(new, lp->ws);
 
-	/* congestion window */
-	tcb->snd.wnd = segp->wnd;
-	initialwindow(tcb);
-
-	/* set initial round trip time */
-	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
-	tcpsynackrtt(new);
-
 	free(lp);
 
+	tcb->snd.wnd = seg->wnd << tcb->snd.scale;
+	tcb->snd.wl2 = seg->ack;
+	tcb->snd.wl1 = seg->seq;
+
 	/* set up proto header */
 	switch(version){
 	case V4:
 		h4 = &tcb->protohdr.tcp4hdr;
 		memset(h4, 0, sizeof(*h4));
+		h4->vihl = IP_VER4;
 		h4->proto = IP_TCPPROTO;
 		hnputs(h4->tcpsport, new->lport);
 		hnputs(h4->tcpdport, new->rport);
@@ -1780,109 +1671,71 @@
 	return new;
 }
 
-static int
-seq_within(ulong x, ulong low, ulong high)
-{
-	if(low <= high){
-		if(low <= x && x <= high)
-			return 1;
-	}
-	else {
-		if(x >= low || x <= high)
-			return 1;
-	}
-	return 0;
-}
-
-static int
-seq_lt(ulong x, ulong y)
-{
-	return (int)(x-y) < 0;
-}
-
-static int
-seq_le(ulong x, ulong y)
-{
-	return (int)(x-y) <= 0;
-}
-
-static int
-seq_gt(ulong x, ulong y)
-{
-	return (int)(x-y) > 0;
-}
-
-static int
-seq_ge(ulong x, ulong y)
-{
-	return (int)(x-y) >= 0;
-}
-
-/*
- *  use the time between the first SYN and it's ack as the
- *  initial round trip time
- */
 static void
-tcpsynackrtt(Conv *s)
-{
-	Tcpctl *tcb;
-	int delta;
-	Tcppriv *tpriv;
-
-	tcb = (Tcpctl*)s->ptcl;
-	tpriv = s->p->priv;
-
-	delta = NOW - tcb->sndsyntime;
-	tcb->srtt = delta<<LOGAGAIN;
-	tcb->mdev = delta<<LOGDGAIN;
-
-	/* halt round trip timer */
-	tcphalt(tpriv, &tcb->rtt_timer);
-}
-
-static void
 update(Conv *s, Tcp *seg)
 {
-	int rtt, delta;
 	Tcpctl *tcb;
-	ulong acked;
 	Tcppriv *tpriv;
+	int rtt, delta, acked;
 
 	if(seg->update)
 		return;
 	seg->update = 1;
+	if((seg->flags & ACK) == 0)
+		return;
 
-	tpriv = s->p->priv;
 	tcb = (Tcpctl*)s->ptcl;
+	tpriv = (Tcppriv*)s->p->priv;
 
-	/* catch zero-window updates, update window & recover */
-	if(tcb->snd.wnd == 0 && seg->wnd > 0)
-	if(seq_lt(seg->ack,  tcb->snd.ptr)){
-		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
-			seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
+	/* ghost acks should be ignored */
+	if(seq_gt(seg->ack, tcb->snd.nxt))
+		return;
+
+	/*
+	 *  update window
+	 */
+	if(seq_gt(seg->seq, tcb->snd.wl1)
+	|| seg->seq == tcb->snd.wl1
+		&& (seq_gt(seg->ack, tcb->snd.wl2)
+		|| seg->ack == tcb->snd.wl2 && seg->wnd > tcb->snd.wnd)){
+		/* clear dupack if we advance wl2 */
+		if(tcb->snd.wl2 != seg->ack)
+			tcb->snd.dupacks = 0;
+		tcb->snd.wl2 = seg->ack;
+		tcb->snd.wl1 = seg->seq;
+		if(tcb->snd.wnd == 0 && seg->wnd > 0){
+			tcb->snd.wnd = seg->wnd;
+			goto recovery;
+		}
 		tcb->snd.wnd = seg->wnd;
-		goto recovery;
 	}
 
 	/* newreno fast retransmit */
-	if(seg->ack == tcb->snd.una)
 	if(tcb->snd.una != tcb->snd.nxt)
+	if(seg->ack == tcb->snd.una)
+	if(seg->len == 0 && (seg->flags & (SYN|FIN)) == 0)
 	if(++tcb->snd.dupacks == 3){
 recovery:
 		if(tcb->snd.recovery){
 			tpriv->stats[RecoveryCwind]++;
 			tcb->cwind += tcb->mss;
-		}else if(seq_le(tcb->snd.rxt, seg->ack)){
+		}else if(tcb->snd.wnd == 0){
+			netlog(s->p->f, Logtcpwin, "!recov %lud %lud window shut\n",
+				tcb->snd.rxt, seg->ack);
+			/* force window probe */
+			tcprxmit(s);
+		}else if(seq_ge(seg->ack, tcb->snd.rxt)){
 			tpriv->stats[Recovery]++;
-			tcb->abcbytes = 0;
 			tcb->snd.recovery = 1;
 			tcb->snd.partialack = 0;
 			tcb->snd.rxt = tcb->snd.nxt;
 			tcpcongestion(tcb);
+			tcb->abcbytes = 0;
 			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
 			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
 				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
-			tcprxmit(s);
+			/* initial fast-retransmit, preserve send pointer */
+			tcb->snd.ptr = tcprxmit(s);
 		}else{
 			tpriv->stats[RecoveryNoSeq]++;
 			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
@@ -1895,50 +1748,59 @@
 		tcb->cwind += tcb->mss;
 	}
 
-	/*
-	 *  update window
-	 */
-	if(seq_gt(seg->ack, tcb->snd.wl2)
-	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
-		/* clear dupack if we advance wl2 */
-		if(tcb->snd.wl2 != seg->ack)
-			tcb->snd.dupacks = 0;
-		tcb->snd.wnd = seg->wnd;
-		tcb->snd.wl2 = seg->ack;
-	}
-
-	if(!seq_gt(seg->ack, tcb->snd.una)){
+	/* Compute the new send window size */
+	acked = (int)(seg->ack - tcb->snd.una);
+	if(acked <= 0){
 		/*
 		 *  don't let us hangup if sending into a closed window and
 		 *  we're still getting acks
 		 */
-		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
+		if(tcb->snd.wnd == 0)
 			tcb->backedoff = MAXBACKMS/4;
 		return;
 	}
 
-	/* Compute the new send window size */
-	acked = seg->ack - tcb->snd.una;
+	/* RTT measurement */
+	if(tcb->rttime && seq_ge(seg->ack, tcb->rttseq)) {
+		rtt = (int)(NOW - tcb->rttime);
+		tcb->rttime = 0;
 
-	/* avoid slow start and timers for SYN acks */
+		delta = rtt - (tcb->srtt>>LOGAGAIN);
+		tcb->srtt += delta;
+		if(delta < 0) delta = -delta;
+		tcb->mdev += delta - (tcb->mdev>>LOGDGAIN);
+	}
+
+	/*
+	 *  update queue
+	 */
 	if((tcb->flags & SYNACK) == 0) {
 		tcb->flags |= SYNACK;
-		acked--;
 		tcb->flgcnt--;
-		goto done;
+		acked--;
 	}
+	if(qdiscard(s->wq, acked) < acked)
+		tcb->flgcnt--;
+	tcb->snd.una = seg->ack;
+	if(seq_gt(seg->ack, tcb->snd.ptr))
+		tcb->snd.ptr = seg->ack;
 
 	/*
 	 *  congestion control
 	 */
 	if(tcb->snd.recovery){
-		if(seq_ge(seg->ack, tcb->snd.rxt)){
+		if(seq_ge(seg->ack, tcb->snd.rxt) || tcb->snd.wnd == 0){
 			/* recovery finished; deflate window */
 			tpriv->stats[RecoveryDone]++;
 			tcb->snd.dupacks = 0;
 			tcb->snd.recovery = 0;
-			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
-			if(tcb->ssthresh < tcb->cwind)
+
+			/* RFC 6582 2.3 (3) min(ssthresh, max(FlightSize, SMSS) + SMSS) */
+			tcb->cwind = tcb->snd.nxt - tcb->snd.una;
+			if(tcb->cwind < tcb->mss)
+				tcb->cwind = tcb->mss;
+			tcb->cwind += tcb->mss;
+			if(tcb->cwind > tcb->ssthresh)
 				tcb->cwind = tcb->ssthresh;
 			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
 				tcb->cwind, tcb->ssthresh);
@@ -1951,75 +1813,28 @@
 				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
 				tcb->cwind = tcb->mss;
 			}
-			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
-				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
-
 			if(acked >= tcb->mss)
 				tcb->cwind += tcb->mss;
 			tcb->snd.partialack++;
+			netlog(s->p->f, Logtcpwin, "partial ack %d left %ld cwind %ld\n",
+				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
+			/* retransmit, preserve send pointer */
+			tcb->snd.ptr = tcprxmit(s);
 		}
-	} else
+	} else {
 		tcpabcincr(tcb, acked);
-
-	/* Adjust the timers according to the round trip time */
-	/* todo: fix sloppy treatment of overflow cases here. */
-	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
-		tcphalt(tpriv, &tcb->rtt_timer);
-		if((tcb->flags&RETRAN) == 0) {
-			tcb->backoff = 0;
-			tcb->backedoff = 0;
-			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
-			if(rtt == 0)
-				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
-			rtt *= MSPTICK;
-			if(tcb->srtt == 0) {
-				tcb->srtt = rtt << LOGAGAIN;
-				tcb->mdev = rtt << LOGDGAIN;
-			} else {
-				delta = rtt - (tcb->srtt>>LOGAGAIN);
-				tcb->srtt += delta;
-				if(tcb->srtt <= 0)
-					tcb->srtt = 1;
-
-				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
-				tcb->mdev += delta;
-				if(tcb->mdev <= 0)
-					tcb->mdev = 1;
-			}
-			tcpsettimer(tcb);
-		}
 	}
 
-done:
-	if(qdiscard(s->wq, acked) < acked)
-		tcb->flgcnt--;
-	tcb->snd.una = seg->ack;
+	tcb->backoff = 0;
+	tcb->backedoff = 0;
 
-	/* newreno fast recovery */
-	if(tcb->snd.recovery)
-		tcprxmit(s);
-
-	if(seq_gt(seg->ack, tcb->snd.urg))
-		tcb->snd.urg = seg->ack;
-
 	if(tcb->snd.una != tcb->snd.nxt){
 		/* “impatient” variant */
-		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
-			tcb->time = NOW;
-			tcb->timeuna = tcb->snd.una;
-			tcpgo(tpriv, &tcb->timer);
-		}
+		if(!tcb->snd.recovery || tcb->snd.partialack == 1)
+			tcpsettimer(s);
 	}
 	else
 		tcphalt(tpriv, &tcb->timer);
-
-	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
-		tcb->snd.ptr = tcb->snd.una;
-
-	if(!tcb->snd.recovery)
-		tcb->flags &= ~RETRAN;
-	tcb->backoff = 0;
-	tcb->backedoff = 0;
 }
 
 static void
@@ -2037,11 +1852,10 @@
 	Fs *f;
 	Tcppriv *tpriv;
 	char *reason;
-	uchar version;
+	int version;
 
 	f = tcp->f;
-	tpriv = tcp->priv;
-
+	tpriv = (Tcppriv*)tcp->priv;
 	tpriv->stats[InSegs]++;
 
 	h4 = (Tcp4hdr*)(bp->rp);
@@ -2173,9 +1987,9 @@
 
 		/* if this is a new SYN, put the call into limbo */
 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
-			freeblist(bp);
-			limbo(s, source, dest, &seg, version);
+			limbo(s, &seg, source, dest, version);
 			qunlock(tcp);
+			freeblist(bp);
 			return;
 		}
 
@@ -2193,18 +2007,14 @@
 	 * Out-of-band data is ignored - it was always a bad idea.
 	 */
 	tcb = (Tcpctl*)s->ptcl;
-	if(waserror()){
-		qunlock(s);
-		nexterror();
-	}
 	qlock(s);
 	qunlock(tcp);
 
 	/* fix up window */
-	seg.wnd <<= tcb->rcv.scale;
+	seg.wnd <<= tcb->snd.scale;
 
 	/* every input packet in puts off the keep alive time out */
-	tcpsetkacounter(tcb);
+	tcb->kato = 0;
 
 	switch(tcb->state) {
 	case Closed:
@@ -2211,13 +2021,15 @@
 	closed:
 		reason = "sending to Closed";
 	reset2:
+		qunlock(s);
 		freeblist(bp);
-		bp = nil;
 		sndrst(tcp, source, dest, length, &seg, version, reason, s);
-		goto raise;
+		return;
 	case Syn_sent:
 		if(seg.flags & ACK) {
-			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
+			if(!seq_in(seg.ack, tcb->snd.una, tcb->snd.nxt+1)) {
+				if(seg.flags & RST)
+					goto raise;
 				reason = "bad seq in Syn_sent";
 				goto reset2;
 			}
@@ -2227,46 +2039,43 @@
 				localclose(s, Econrefused);
 			goto raise;
 		}
-
 		if(seg.flags & SYN) {
 			procsyn(s, &seg);
 			if(seg.flags & ACK){
+				tcb->snd.wnd = seg.wnd;	/* window in SYN,ACK must not be scaled */
+				tcb->snd.wl2 = seg.ack;
+				tcb->snd.wl1 = seg.seq;
+				tcpsynackrtt(tcb);
 				update(s, &seg);
-				tcpsynackrtt(s);
 				tcpsetstate(s, Established);
-				tcpsetscale(s, tcb, seg.ws, tcb->scale);
 			}
 			else {
 				tcb->time = NOW;
 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
 			}
-
 			if(length != 0 || (seg.flags & FIN))
 				break;
 			freeblist(bp);
 			goto output;
 		}
-		qunlock(s);
-		poperror();
-		freeblist(bp);
-		return;
+		goto raise;
 	case Syn_received:
-		/* doesn't matter if it's the correct ack, we're just trying to set timing */
 		if(seg.flags & ACK)
-			tcpsynackrtt(s);
+			tcpsynackrtt(tcb);
 		break;
 	}
 
 	/* Cut the data to fit the receive window */
 	tcprcvwin(s);
-	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
-		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
-		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win %lud-%lud l %d from %I\n", 
-			seg.seq, seg.seq + length - 1, 
-			tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
+	if(tcptrim(tcb, &seg, &bp, &length) < 0) {
+		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud (%d) win %lud-%lud (%lud) from %I!%d -> %I!%d\n", 
+			seg.seq, seg.seq+length, length,
+			tcb->rcv.nxt, tcb->rcv.wptr, tcb->rcv.wnd,
+			s->raddr, s->rport, s->laddr, s->lport);
+		tcb->flags |= FORCE;
 		update(s, &seg);
-		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
-			tcphalt(tpriv, &tcb->rtt_timer);
+		if(tcb->state == Closing)
+		if(qlen(s->wq)+tcb->flgcnt == 0) {
 			tcphalt(tpriv, &tcb->acktimer);
 			tcphalt(tpriv, &tcb->katimer);
 			tcpsetstate(s, Time_wait);
@@ -2273,13 +2082,9 @@
 			tcb->timer.start = MSL2*(1000 / MSPTICK);
 			tcpgo(tpriv, &tcb->timer);
 		}
-		if(!(seg.flags & RST)) {
-			tcb->flags |= FORCE;
-			goto output;
-		}
-		qunlock(s);
-		poperror();
-		return;
+		if(seg.flags & RST)
+			goto raise;
+		goto output;
 	}
 
 	/* Cannot accept so answer with a rst */
@@ -2291,16 +2096,22 @@
 	 */
 	if(seg.seq != tcb->rcv.nxt)
 	if(length != 0 || (seg.flags & (SYN|FIN))) {
+		/*
+		 *  force duplicate ack; RFC 5681 §3.2
+		 *
+		 *  FORCE must be set before update()
+		 *  which can send fast-retransmits
+		 *  (clearing FORCE flag) as we must
+		 *  only produce one output segment
+		 *  per input segment.
+		 */
+		tcb->flags |= FORCE;
 		update(s, &seg);
 		if(addreseq(f, tcb, tpriv, &seg, &bp, length) < 0)
-			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
-		tcb->flags |= FORCE;		/* force duplicate ack; RFC 5681 §3.2 */
+			print("tcp: reseq: %I!%d -> %I!%d\n", s->raddr, s->rport, s->laddr, s->lport);
 		goto output;
 	}
 
-	if(tcb->nreseq > 0)
-		tcb->flags |= FORCE;		/* filled hole in sequence space; RFC 5681 §3.2 */
-
 	/*
 	 *  keep looping till we've processed this packet plus any
 	 *  adjacent packets in the resequence queue
@@ -2322,35 +2133,35 @@
 
 		switch(tcb->state) {
 		case Syn_received:
-			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
+			if(!seq_in(seg.ack, tcb->snd.una, tcb->snd.nxt+1)){
 				reason = "bad seq in Syn_received";
 				goto reset2;
 			}
+			tcb->snd.wnd = seg.wnd;	/* already scaled */
+			tcb->snd.wl2 = seg.ack;
+			tcb->snd.wl1 = seg.seq;
 			update(s, &seg);
 			tcpsetstate(s, Established);
+			break;
 		case Established:
 		case Close_wait:
+		case Finwait2:
 			update(s, &seg);
 			break;
 		case Finwait1:
 			update(s, &seg);
 			if(qlen(s->wq)+tcb->flgcnt == 0){
-				tcphalt(tpriv, &tcb->rtt_timer);
 				tcphalt(tpriv, &tcb->acktimer);
-				tcpsetkacounter(tcb);
+				tcphalt(tpriv, &tcb->katimer);
 				tcb->time = NOW;
 				tcpsetstate(s, Finwait2);
-				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
+				tcb->katimer.start = MSL2*(1000 / MSPTICK);
 				tcpgo(tpriv, &tcb->katimer);
 			}
 			break;
-		case Finwait2:
-			update(s, &seg);
-			break;
 		case Closing:
 			update(s, &seg);
 			if(qlen(s->wq)+tcb->flgcnt == 0) {
-				tcphalt(tpriv, &tcb->rtt_timer);
 				tcphalt(tpriv, &tcb->acktimer);
 				tcphalt(tpriv, &tcb->katimer);
 				tcpsetstate(s, Time_wait);
@@ -2364,23 +2175,14 @@
 				localclose(s, nil);
 				goto raise;
 			}
+			/* wet floor */
 		case Time_wait:
 			if(seg.flags & FIN)
 				tcb->flags |= FORCE;
 			if(tcb->timer.state != TcptimerON)
-				tcpgo(tpriv, &tcb->timer);
+				tcpsettimer(s);
 		}
 
-		if((seg.flags&URG) && seg.urg) {
-			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
-				tcb->rcv.urg = seg.urg + seg.seq;
-				pullblock(&bp, seg.urg);
-			}
-		}
-		else
-		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
-			tcb->rcv.urg = tcb->rcv.nxt;
-
 		if(length == 0) {
 			freeblist(bp);
 			bp = nil;
@@ -2404,14 +2206,6 @@
 					bp = nil;
 				}
 				tcb->rcv.nxt += length;
-
-				/*
-				 *  turn on the acktimer if there's something
-				 *  to ack
-				 */
-				if(tcb->acktimer.state != TcptimerON)
-					tcpgo(tpriv, &tcb->acktimer);
-
 				break;
 			case Finwait2:
 				reason = "send to Finwait2";
@@ -2431,7 +2225,6 @@
 			case Finwait1:
 				tcb->rcv.nxt++;
 				if(qlen(s->wq)+tcb->flgcnt == 0) {
-					tcphalt(tpriv, &tcb->rtt_timer);
 					tcphalt(tpriv, &tcb->acktimer);
 					tcphalt(tpriv, &tcb->katimer);
 					tcpsetstate(s, Time_wait);
@@ -2443,11 +2236,10 @@
 				break;
 			case Finwait2:
 				tcb->rcv.nxt++;
-				tcphalt(tpriv, &tcb->rtt_timer);
 				tcphalt(tpriv, &tcb->acktimer);
 				tcphalt(tpriv, &tcb->katimer);
 				tcpsetstate(s, Time_wait);
-				tcb->timer.start = MSL2 * (1000/MSPTICK);
+				tcb->timer.start = MSL2*(1000/MSPTICK);
 				tcpgo(tpriv, &tcb->timer);
 				break;
 			case Close_wait:
@@ -2455,7 +2247,7 @@
 			case Last_ack:
 				break;
 			case Time_wait:
-				tcpgo(tpriv, &tcb->timer);
+				tcpsettimer(s);
 				break;
 			}
 		}
@@ -2465,16 +2257,11 @@
 		 *  dump/trim any overlapping segments
 		 */
 		for(;;) {
-			if(tcb->reseq == nil)
+			if(getreseq(tcb, &seg, &bp, &length) < 0)
 				goto output;
-
-			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
-				goto output;
-
-			getreseq(tcb, &seg, &bp, &length);
-
 			tcprcvwin(s);
 			if(tcptrim(tcb, &seg, &bp, &length) == 0){
+				/* produce an ack when hole filled */
 				tcb->flags |= FORCE;
 				break;
 			}
@@ -2482,14 +2269,20 @@
 	}
 output:
 	tcpoutput(s);
+
+	/*
+	 *  turn on the acktimer if there's something
+	 *  to ack and we delayed ack in tcpoutput().
+	 */
+	if(tcb->rcv.ackptr != tcb->rcv.nxt)
+	if(tcb->acktimer.state != TcptimerON)
+		tcpgo(tpriv, &tcb->acktimer);
+
 	qunlock(s);
-	poperror();
 	return;
 raise:
 	qunlock(s);
-	poperror();
 	freeblist(bp);
-	tcpkick(s);
 }
 
 /*
@@ -2500,40 +2293,21 @@
 static void
 tcpoutput(Conv *s)
 {
+	Fs *f = s->p->f;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
+	int ssize, sndcnt, sent;
+	Block *bp;
 	Tcp seg;
-	uint msgs;
-	Tcpctl *tcb;
-	Block *hbp, *bp;
-	int sndcnt;
-	ulong ssize, dsize, sent;
-	Fs *f;
-	Tcppriv *tpriv;
-	uchar version;
 
-	f = s->p->f;
-	tpriv = s->p->priv;
-	version = s->ipversion;
-
-	tcb = (Tcpctl*)s->ptcl;
-
-	/* force ack every 2*mss */
+	/* force ack every second mss */
 	if((tcb->flags & FORCE) == 0)
-	if(tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
+	if(tcb->rcv.nxt - tcb->rcv.ackptr > tcb->mss){
 		tpriv->stats[Delayack]++;
 		tcb->flags |= FORCE;
 	}
 
-	/* force ack if window opening */
-	if(0)
-	if((tcb->flags & FORCE) == 0){
-		tcprcvwin(s);
-		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
-			tpriv->stats[Wopenack]++;
-			tcb->flags |= FORCE;
-		}
-	}
-
-	for(msgs = 0; msgs < 100; msgs++) {
+	for(;;) {
 		switch(tcb->state) {
 		case Listen:
 		case Closed:
@@ -2542,63 +2316,54 @@
 		}
 
 		/* Don't send anything else until our SYN has been acked */
-		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
-			break;
+		if((tcb->flags & SYNACK) == 0 && tcb->snd.ptr != tcb->iss)
+			return;
 
-		/* force an ack when a window has opened up */
 		tcprcvwin(s);
-		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
-			tcb->rcv.blocked = 0;
+
+		/* force an ack when a window has opened up */
+		if((tcb->flags & FORCE) == 0)
+		if(seq_in(tcb->rcv.nxt, tcb->rcv.wsnt, tcb->rcv.wptr)) {
+			tpriv->stats[Wopenack]++;
 			tcb->flags |= FORCE;
 		}
 
-		sndcnt = qlen(s->wq)+tcb->flgcnt;
-		sent = tcb->snd.ptr - tcb->snd.una;
-		ssize = sndcnt;
+		/* figure out how much to send */
+		ssize = sndcnt = qlen(s->wq)+tcb->flgcnt;
 		if(tcb->snd.wnd == 0){
 			/* zero window probe */
+			seg.seq = tcb->snd.una;
+			sent = tcb->snd.ptr - seg.seq;
 			if(sent > 0)
-			if(!(tcb->flags & FORCE))
-				break;	/* already probing, rto re-probes */
-			if(ssize < sent)
 				ssize = 0;
-			else{
-				ssize -= sent;
-				if(ssize > 0)
-					ssize = 1;
-			}
+			else if(ssize > 0)
+				ssize = 1;
 		} else {
 			/* calculate usable segment size */
-			if(ssize > tcb->cwind)
-				ssize = tcb->cwind;
 			if(ssize > tcb->snd.wnd)
 				ssize = tcb->snd.wnd;
-
-			if(ssize < sent)
+			if(ssize > tcb->cwind)
+				ssize = tcb->cwind;
+			seg.seq = tcb->snd.ptr;
+			sent = seg.seq - tcb->snd.una;
+			ssize -= sent;
+			if(ssize < 0)
 				ssize = 0;
-			else {
-				ssize -= sent;
-				if(ssize > tcb->mss)
-					ssize = tcb->mss;
-			}
+			else if(ssize > tcb->mss)
+				ssize = tcb->mss;
 		}
 
-		dsize = ssize;
-		seg.urg = 0;
-
-		if(!(tcb->flags & FORCE)){
+		if((tcb->flags & FORCE) == 0){
 			if(ssize == 0)
-				break;
+				return;
 			if(ssize < tcb->mss)
-			if(tcb->snd.nxt == tcb->snd.ptr)
+			if(seg.seq == tcb->snd.nxt)
 			if(sent > TCPREXMTTHRESH*tcb->mss)
-				break;
+				return;
 		}
-
 		tcb->flags &= ~FORCE;
 
 		/* By default we will generate an ack */
-		tcphalt(tpriv, &tcb->acktimer);
 		seg.source = s->lport;
 		seg.dest = s->rport;
 		seg.flags = ACK;
@@ -2605,115 +2370,86 @@
 		seg.mss = 0;
 		seg.ws = 0;
 		seg.update = 0;
-		switch(tcb->state){
-		case Syn_sent:
-			seg.flags = 0;
-			if(tcb->snd.ptr == tcb->iss){
+
+		seg.ack = tcb->rcv.ackptr = tcb->rcv.nxt;
+		seg.wnd = tcb->rcv.wnd >> tcb->rcv.scale;
+		tcb->rcv.wsnt = tcb->rcv.wptr;
+
+		bp = nil;
+		if(ssize > 0){
+			switch(tcb->state){
+			case Syn_sent:
+				seg.flags = 0;
+				/* wet floor */
+			case Syn_received:
+				/*
+				 *  don't send any data with a SYN packet
+				 *  because Linux rejects the packet in its
+				 *  attempt to solve the SYN attack problem
+				 */
 				seg.flags |= SYN;
-				dsize--;
 				seg.mss = tcb->mss;
 				seg.ws = tcb->scale;
-			}
-			break;
-		case Syn_received:
-			/*
-			 *  don't send any data with a SYN/ACK packet
-			 *  because Linux rejects the packet in its
-			 *  attempt to solve the SYN attack problem
-			 */
-			if(tcb->snd.ptr == tcb->iss){
-				seg.flags |= SYN;
-				dsize = 0;
 				ssize = 1;
-				seg.mss = tcb->mss;
-				seg.ws = tcb->scale;
+				break;
+			default:
+				/* Pull out data to send */
+				bp = qcopy(s->wq, ssize, sent);
+				if(BLEN(bp) != ssize)
+					seg.flags |= FIN;
+				else if(sent+ssize == sndcnt)
+					seg.flags |= PSH;
+				break;
 			}
-			break;
-		}
-		seg.seq = tcb->snd.ptr;
-		seg.ack = tcb->rcv.nxt;
-		seg.wnd = tcb->rcv.wnd;
+	
+			/* Pull up the send pointer so we can accept acks
+			 * for this window
+			 */
+			tcb->snd.ptr += ssize;
+			if(seq_gt(tcb->snd.ptr, tcb->snd.nxt)){
+				tcb->snd.nxt = tcb->snd.ptr;
 
-		/* Pull out data to send */
-		bp = nil;
-		if(dsize != 0) {
-			bp = qcopy(s->wq, dsize, sent);
-			if(BLEN(bp) != dsize) {
-				seg.flags |= FIN;
-				dsize--;
+				/* start RTT measurement */
+				if(tcb->rttime == 0) {
+					tcb->rttime = NOW;
+					tcb->rttseq = tcb->snd.ptr;
+				}
 			}
-		}
 
-		if(sent+dsize == sndcnt && dsize)
-			seg.flags |= PSH;
-
-		tcb->snd.ptr += ssize;
-
-		/* Pull up the send pointer so we can accept acks
-		 * for this window
-		 */
-		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
-			tcb->snd.nxt = tcb->snd.ptr;
-
-		/* Build header, link data and compute cksum */
-		switch(version){
-		case V4:
-			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
-			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
-			break;
-		case V6:
-			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
-			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
-			break;
-		default:
-			panic("tcpoutput: version %d", version);
-		}
-
-		/* Start the transmission timers if there is new data and we
-		 * expect acknowledges
-		 */
-		if(ssize != 0){
-			if(tcb->timer.state != TcptimerON){
-				tcb->time = NOW;
-				tcb->timeuna = tcb->snd.una;
-				tcpgo(tpriv, &tcb->timer);
-			}
-
-			/*  If round trip timer isn't running, start it.
-			 *  measure the longest packet only in case the
-			 *  transmission time dominates RTT
+			/* Start the transmission timers if there is new data and we
+			 * expect acknowledges
 			 */
-			if(tcb->snd.retransmit == 0)
-			if(tcb->rtt_timer.state != TcptimerON)
-			if(ssize == tcb->mss) {
-				tcpgo(tpriv, &tcb->rtt_timer);
-				tcb->rttseq = tcb->snd.ptr;
-			}
+			if(tcb->timer.state != TcptimerON)
+				tcpsettimer(s);
 		}
 
+		if(tcb->acktimer.state == TcptimerON)
+			tcphalt(tpriv, &tcb->acktimer);
+
 		tpriv->stats[OutSegs]++;
-		if(tcb->snd.retransmit)
+		if(tcb->snd.ptr != tcb->snd.nxt)
 			tpriv->stats[RetransSegsSent]++;
-		tcb->rcv.ackptr = seg.ack;
-		tcb->rcv.wsnt = tcb->rcv.wptr;
 
-		/* put off the next keep alive */
-		tcpgo(tpriv, &tcb->katimer);
-
-		switch(version){
+		switch(s->ipversion){
 		case V4:
-			if(ipoput4(f, hbp, nil, s->ttl, s->tos, s) < 0)
-				localclose(s, Enoroute);
+			bp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr);
+			qunlock(s);
+			sent = ipoput4(f, bp, nil, s->ttl, s->tos, s);
 			break;
 		case V6:
-			if(ipoput6(f, hbp, nil, s->ttl, s->tos, s) < 0)
-				localclose(s, Enoroute);
+			bp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr);
+			qunlock(s);
+			sent = ipoput6(f, bp, nil, s->ttl, s->tos, s);
 			break;
+		default:
+			panic("tcpoutput: version %d", s->ipversion);
 		}
-		if((msgs%4) == 3){
-			qunlock(s);
+		if(sent < 0){
 			qlock(s);
+			localclose(s, Enoroute);
+			return;
 		}
+		qlock(s);
 	}
 }
 
@@ -2723,76 +2459,62 @@
 static int
 tcpsendka(Conv *s)
 {
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+	Block *bp;
 	Tcp seg;
-	Tcpctl *tcb;
-	Block *hbp,*dbp;
+	int ret;
 
-	tcb = (Tcpctl*)s->ptcl;
-
-	dbp = nil;
 	memset(&seg, 0, sizeof seg);
-	seg.urg = 0;
 	seg.source = s->lport;
 	seg.dest = s->rport;
 	seg.flags = ACK|PSH;
 	seg.mss = 0;
 	seg.ws = 0;
-	seg.seq = tcb->snd.una-1;
-	seg.ack = tcb->rcv.nxt;
-	tcb->rcv.ackptr = seg.ack;
+
 	tcprcvwin(s);
-	seg.wnd = tcb->rcv.wnd;
+	seg.seq = tcb->snd.una-1;
+	seg.ack = tcb->rcv.ackptr = tcb->rcv.nxt;
+	seg.wnd = tcb->rcv.wnd >> tcb->rcv.scale;
+	tcb->rcv.wsnt = tcb->rcv.wptr;
+
 	if(tcb->state == Finwait2){
 		seg.flags |= FIN;
+		bp = nil;
 	} else {
-		dbp = allocb(1);
-		dbp->wp++;
+		bp = allocb(1);
+		bp->wp++;
 	}
-
-	if(isv4(s->raddr)) {
-		/* Build header, link data and compute cksum */
-		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
-		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
-		return ipoput4(s->p->f, hbp, nil, s->ttl, s->tos, s);
+	switch(s->ipversion){
+	case V4:
+		bp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr);
+		qunlock(s);
+		ret = ipoput4(s->p->f, bp, nil, s->ttl, s->tos, s);
+		break;
+	case V6:
+		bp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr);
+		qunlock(s);
+		ret = ipoput6(s->p->f, bp, nil, s->ttl, s->tos, s);
+		break;
+	default:
+		panic("tcpsendka: version %d", s->ipversion);
 	}
-	else {
-		/* Build header, link data and compute cksum */
-		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
-		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
-		return ipoput6(s->p->f, hbp, nil, s->ttl, s->tos, s);
-	}
+	qlock(s);
+	return ret;
 }
 
 /*
- *  set connection to time out after 12 minutes
- */
-static void
-tcpsetkacounter(Tcpctl *tcb)
-{
-	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
-	if(tcb->kacounter < 3)
-		tcb->kacounter = 3;
-}
-
-/*
  *  if we've timed out, close the connection
  *  otherwise, send a keepalive and restart the timer
  */
 static void
-tcpkeepalive(void *v)
+tcpkeepalive(void *arg)
 {
-	Tcpctl *tcb;
-	Conv *s;
+	Conv *s = (Conv*)arg;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 
-	s = v;
-	tcb = (Tcpctl*)s->ptcl;
-	if(waserror()){
-		qunlock(s);
-		nexterror();
-	}
 	qlock(s);
 	if(tcb->state != Closed){
-		if(--(tcb->kacounter) <= 0) {
+		if(++(tcb->kato) > MAX_KAT) {
 			localclose(s, Etimedout);
 		} else if(tcpsendka(s) < 0) {
 			localclose(s, Enoroute);
@@ -2801,7 +2523,6 @@
 		}
 	}
 	qunlock(s);
-	poperror();
 }
 
 /*
@@ -2810,18 +2531,21 @@
 static char*
 tcpstartka(Conv *s, char **f, int n)
 {
-	Tcpctl *tcb;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 	int x;
 
-	tcb = (Tcpctl*)s->ptcl;
 	if(tcb->state != Established)
 		return "connection must be in Establised state";
+
+	x = DEF_KAT;
 	if(n > 1){
 		x = atoi(f[1]);
-		if(x >= MSPTICK)
-			tcb->katimer.start = x/MSPTICK;
+		if(x <= 0)
+			x = 0;
 	}
-	tcpsetkacounter(tcb);
+	tcphalt(s->p->priv, &tcb->katimer);
+	tcb->kato = 0;
+	tcb->katimer.start = (x + MSPTICK-1) / MSPTICK;
 	tcpgo(s->p->priv, &tcb->katimer);
 
 	return nil;
@@ -2828,45 +2552,36 @@
 }
 
 /*
- *  turn checksums on/off
- */
-static char*
-tcpsetchecksum(Conv *s, char **f, int)
-{
-	Tcpctl *tcb;
-
-	tcb = (Tcpctl*)s->ptcl;
-	tcb->nochecksum = !atoi(f[1]);
-
-	return nil;
-}
-
-/*
  *  retransmit (at most) one segment at snd.una.
- *  preserve cwind & snd.ptr
+ *  preserve cwind; return the original snd.ptr, or snd.una if that has advanced past it
  */
-static void
+static ulong
 tcprxmit(Conv *s)
 {
-	Tcpctl *tcb;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 	Tcppriv *tpriv;
 	ulong tcwind, tptr;
 
-	tcb = (Tcpctl*)s->ptcl;
-	tcb->flags |= RETRAN|FORCE;
+	tcb->flags |= FORCE;
 
 	tptr = tcb->snd.ptr;
-	tcwind = tcb->cwind;
 	tcb->snd.ptr = tcb->snd.una;
+
+	tcwind = tcb->cwind;
 	tcb->cwind = tcb->mss;
-	tcb->snd.retransmit = 1;
+
 	tcpoutput(s);
-	tcb->snd.retransmit = 0;
+
 	tcb->cwind = tcwind;
-	tcb->snd.ptr = tptr;
 
-	tpriv = s->p->priv;
+	tpriv = (Tcppriv*)s->p->priv;
 	tpriv->stats[RetransSegs]++;
+
+	/* in case tcpoutput() released the lock */
+	if(seq_lt(tptr, tcb->snd.una))
+		tptr = tcb->snd.una;
+
+	return tptr;
 }
 
 /*
@@ -2875,19 +2590,11 @@
 static void
 tcptimeout(void *arg)
 {
-	Conv *s;
-	Tcpctl *tcb;
+	Conv *s = (Conv*)arg;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+	Tcppriv *tpriv = (Tcppriv*)s->p->priv;
 	int maxback;
-	Tcppriv *tpriv;
 
-	s = (Conv*)arg;
-	tpriv = s->p->priv;
-	tcb = (Tcpctl*)s->ptcl;
-
-	if(waserror()){
-		qunlock(s);
-		nexterror();
-	}
 	qlock(s);
 	switch(tcb->state){
 	default:
@@ -2905,25 +2612,21 @@
 			tcb->srtt, tcb->mdev, NOW-tcb->time,
 			tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
 			tcpstates[s->state]);
-		tcpsettimer(tcb);
-		if(tcb->snd.rto == 0)
+		tpriv->stats[RetransTimeouts]++;
+		if(tcb->snd.rto++ == 0)
 			tcpcongestion(tcb);
-		tcprxmit(s);
-		tcb->snd.ptr = tcb->snd.una;
+		tcb->abcbytes = 0;
 		tcb->cwind = tcb->mss;
-		tcb->snd.rto = 1;
-		tpriv->stats[RetransTimeouts]++;
-
 		if(tcb->snd.recovery){
-			tcb->snd.dupacks = 0;			/* reno rto */
 			tcb->snd.recovery = 0;
-			tpriv->stats[RecoveryRTO]++;
+			tcb->snd.dupacks = 0;			/* reno rto */
 			tcb->snd.rxt = tcb->snd.nxt;
 			netlog(s->p->f, Logtcpwin,
 				"rto recovery rxt @%lud\n", tcb->snd.nxt);
+			tpriv->stats[RecoveryRTO]++;
 		}
-
-		tcb->abcbytes = 0;
+		/* resets the send pointer, retransmits and restarts timer */
+		tcprxmit(s);
 		break;
 	case Time_wait:
 		localclose(s, nil);
@@ -2932,15 +2635,8 @@
 		break;
 	}
 	qunlock(s);
-	poperror();
 }
 
-static int
-inwindow(Tcpctl *tcb, int seq)
-{
-	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
-}
-
 /*
  *  set up state for a received SYN (or SYN ACK) packet
  */
@@ -2947,29 +2643,16 @@
 static void
 procsyn(Conv *s, Tcp *seg)
 {
-	Tcpctl *tcb;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 
-	tcb = (Tcpctl*)s->ptcl;
 	tcb->flags |= FORCE;
 
-	tcb->rcv.nxt = seg->seq + 1;
-	tcb->rcv.wptr = tcb->rcv.nxt;
-	tcb->rcv.wsnt = 0;
-	tcb->rcv.urg = tcb->rcv.nxt;
-	tcb->irs = seg->seq;
+	tcb->rcv.nxt = tcb->rcv.ackptr = seg->seq + 1;
+	tcb->rcv.wptr = tcb->rcv.wsnt = tcb->rcv.nxt;
+	tcb->rcv.wnd = 0;
 
-	/* our sending max segment size cannot be bigger than what he asked for */
-	if(seg->mss != 0 && seg->mss < tcb->mss)
-		tcb->mss = seg->mss;
-
-	/* if the server does not support ws option, disable window scaling */
-	if(seg->ws == 0){
-		tcb->scale = 0;
-		tcb->snd.scale = 0;
-	}
-
-	tcb->snd.wnd = seg->wnd;
-	initialwindow(tcb);
+	tcpsetmss(s, seg->mss);
+	tcpsetscale(s, seg->ws);
 }
 
 static int
@@ -3047,7 +2730,7 @@
 		tpriv->stats[ReseqBytelim]++;
 		return dumpreseq(tcb);
 	}
-	qmax = tcb->window / tcb->mss;		/* ~190 for qscale==2, 390 for qscale=3 */
+	qmax = (tcb->window + tcb->mss-1) / tcb->mss;
 	if(tcb->nreseq > qmax){
 		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen);
 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
@@ -3058,96 +2741,66 @@
 	return 0;
 }
 
-static void
+static int
 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
 {
 	Reseq *rp;
 
 	rp = tcb->reseq;
-	if(rp == nil)
-		return;
+	if(rp == nil || seq_gt(rp->seg.seq, tcb->rcv.nxt))
+		return -1;
 
-	tcb->reseq = rp->next;
-
 	*seg = rp->seg;
 	*bp = rp->bp;
 	*length = rp->length;
 
-	tcb->nreseq--;
+	tcb->reseq = rp->next;
 	tcb->reseqlen -= rp->length;
+	tcb->nreseq--;
 
 	free(rp);
+
+	return 0;
 }
 
 static int
 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
 {
-	ushort len;
-	uchar accept;
 	int dupcnt, excess;
 
-	accept = 0;
-	len = *length;
-	if(seg->flags & SYN)
-		len++;
-	if(seg->flags & FIN)
-		len++;
-
-	if(tcb->rcv.wnd == 0) {
-		if(len == 0 && seg->seq == tcb->rcv.nxt)
-			return 0;
-	}
-	else {
-		/* Some part of the segment should be in the window */
-		if(inwindow(tcb,seg->seq))
-			accept++;
-		else
-		if(len != 0) {
-			if(inwindow(tcb, seg->seq+len-1) ||
-			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
-				accept++;
-		}
-	}
-	if(!accept) {
-		freeblist(*bp);
-		*bp = nil;
-		return -1;
-	}
-	dupcnt = tcb->rcv.nxt - seg->seq;
+	dupcnt = (int)(tcb->rcv.nxt - seg->seq);
 	if(dupcnt > 0){
-		tcb->rerecv += dupcnt;
 		if(seg->flags & SYN){
 			seg->flags &= ~SYN;
 			seg->seq++;
-
-			if(seg->urg > 1)
-				seg->urg--;
-			else
-				seg->flags &= ~URG;
 			dupcnt--;
 		}
-		if(dupcnt > 0){
-			pullblock(bp, (ushort)dupcnt);
-			seg->seq += dupcnt;
-			*length -= dupcnt;
-
-			if(seg->urg > dupcnt)
-				seg->urg -= dupcnt;
-			else {
-				seg->flags &= ~URG;
-				seg->urg = 0;
-			}
+		if(dupcnt >= *length){
+			tcb->overlap += *length;
+			freeblist(*bp);
+			*bp = nil;
+			return -1;
 		}
+		tcb->overlap += dupcnt;
+		pullblock(bp, dupcnt);
+		seg->seq += dupcnt;
+		*length -= dupcnt;
 	}
-	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
+	excess = (int)(seg->seq + *length - tcb->rcv.wptr);
 	if(excess > 0) {
-		tcb->rerecv += excess;
+		if(excess >= *length){
+			tcb->overlap += *length;
+			freeblist(*bp);
+			*bp = nil;
+			return -1;
+		}
+		seg->flags &= ~FIN;
+		tcb->overlap += excess;
 		*length -= excess;
 		*bp = trimblock(*bp, 0, *length);
-		if(*bp == nil)
-			panic("presotto is a boofhead");
-		seg->flags &= ~FIN;
 	}
+	if(*bp == nil)
+		return -1;
 	return 0;
 }
 
@@ -3154,11 +2807,11 @@
 static void
 tcpadvise(Proto *tcp, Block *bp, Ipifc *ifc, char *msg)
 {
-	Tcp4hdr *h4;
-	Tcp6hdr *h6;
 	uchar source[IPaddrlen];
 	uchar dest[IPaddrlen];
 	ushort psource, pdest;
+	Tcp4hdr *h4;
+	Tcp6hdr *h6;
 	Iphash *iph;
 	Tcpctl *tcb;
 	Conv *s;
@@ -3203,9 +2856,9 @@
 	s = iphconv(iph);
 	if(s->ignoreadvice || s->state == Announced || s->state == Closed)
 		goto raise;
+	tcb = (Tcpctl*)s->ptcl;
 	qlock(s);
 	qunlock(tcp);
-	tcb = (Tcpctl*)s->ptcl;
 	if(tcb->state == Syn_sent)
 		localclose(s, msg);
 	qunlock(s);
@@ -3323,8 +2976,6 @@
 		return tcphangup(c);
 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
 		return tcpstartka(c, f, n);
-	if(n >= 1 && strcmp(f[0], "checksum") == 0)
-		return tcpsetchecksum(c, f, n);
 	return "unknown control request";
 }
 
@@ -3331,16 +2982,16 @@
 static int
 tcpstats(Proto *tcp, char *buf, int len)
 {
-	Tcppriv *priv;
+	Tcppriv *tpriv = (Tcppriv*)tcp->priv;
 	char *p, *e;
 	int i;
 
-	priv = tcp->priv;
+	tpriv->stats[InLimbo] = tpriv->nlimbo;
+
 	p = buf;
 	e = p+len;
-	priv->stats[InLimbo] = priv->nlimbo;
 	for(i = 0; i < Nstats; i++)
-		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
+		p = seprint(p, e, "%s: %llud\n", statnames[i], tpriv->stats[i]);
 	return p - buf;
 }
 
@@ -3356,12 +3007,13 @@
 static int
 tcpgc(Proto *tcp)
 {
-	Conv *c, **pp, **ep;
 	int n;
+	ulong now;
+	Conv *c, **pp, **ep;
 	Tcpctl *tcb;
 
-
 	n = 0;
+	now = NOW;
 	ep = &tcp->conv[tcp->nc];
 	for(pp = tcp->conv; pp < ep; pp++) {
 		c = *pp;
@@ -3372,13 +3024,13 @@
 		tcb = (Tcpctl*)c->ptcl;
 		switch(tcb->state){
 		case Syn_received:
-			if(NOW - tcb->time > 5000){
+			if(now - tcb->time > 5000){
 				localclose(c, Etimedout);
 				n++;
 			}
 			break;
 		case Finwait2:
-			if(NOW - tcb->time > 5*60*1000){
+			if(now - tcb->time > 5*60*1000){
 				localclose(c, Etimedout);
 				n++;
 			}
@@ -3390,20 +3042,33 @@
 }
 
 static void
-tcpsettimer(Tcpctl *tcb)
+tcpsettimer(Conv *s)
 {
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 	int x;
 
+	tcb->time = NOW;
+	tcb->timeuna = tcb->snd.una;
+
 	/* round trip dependency */
-	x = backoff(tcb->backoff) *
-		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
+	x = tcb->mdev;
+	x += tcb->srtt >> LOGAGAIN;
+	x <<= tcb->backoff;
 
 	/* bounded twixt 0.3 and 64 seconds */
-	if(x < 300/MSPTICK)
-		x = 300/MSPTICK;
-	else if(x > (64000/MSPTICK))
-		x = 64000/MSPTICK;
-	tcb->timer.start = x;
+	if(x < 300)
+		x = 300;
+	else if(x > 64000)
+		x = 64000;
+
+	/* reset the timer */
+	tcb->timer.start = (x + MSPTICK-1) / MSPTICK;
+	if(tcb->timer.state == TcptimerON){
+		tcb->timer.count = tcb->timer.start;
+	} else {
+		Tcppriv *tpriv = (Tcppriv*)s->p->priv;
+		tcpgo(tpriv, &tcb->timer);
+	}
 }
 
 void
@@ -3413,7 +3078,7 @@
 	Tcppriv *tpriv;
 
 	tcp = smalloc(sizeof(Proto));
-	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
+	tcp->priv = tpriv = smalloc(sizeof(Tcppriv));
 	tcp->name = "tcp";
 	tcp->connect = tcpconnect;
 	tcp->announce = tcpannounce;
@@ -3430,6 +3095,7 @@
 	tcp->ipproto = IP_TCPPROTO;
 	tcp->nc = scalednconv();
 	tcp->ptclsize = sizeof(Tcpctl);
+
 	tpriv->stats[MaxConn] = tcp->nc;
 
 	Fsproto(fs, tcp);
@@ -3436,36 +3102,42 @@
 }
 
 static void
-tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
+tcpsetmss(Conv *s, ushort mss)
 {
-	/*
-	 * guess at reasonable queue sizes.  there's no current way 
-	 * to know how many nic receive buffers we can safely tie up in the
-	 * tcp stack, and we don't adjust our queues to maximize throughput
-	 * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
-	 * respected, but we still control our own buffer commitment by
-	 * keeping a seperate qscale.
-	 */
-	tcb->rcv.scale = rcvscale & 0xff;
-	tcb->snd.scale = sndscale & 0xff;
-	tcb->qscale = rcvscale & 0xff;
-	if(rcvscale > Maxqscale)
-		tcb->qscale = Maxqscale;
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
 
-	if(rcvscale != tcb->rcv.scale)
-		netlog(s->p->f, Logtcp, "tcpsetscale: window %lud qlen %d >> window %ud lport %d\n",
-			tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
-	tcb->window = QMAX<<tcb->qscale;
-	tcb->ssthresh = tcb->window;
+	/* our sending max segment size cannot be bigger than what he asked for */
+	if(mss != 0 && mss < tcb->mss)
+		tcb->mss = mss;
+	
+	/* RFC 3390 initial window */
+	if(tcb->mss < 1095)
+		tcb->cwind = 4*tcb->mss;
+	else if(tcb->mss < 2190)
+		tcb->cwind = 4380;
+	else
+		tcb->cwind = 2*tcb->mss;
+}
 
-	/*
-	 * it's important to set wq large enough to cover the full
-	 * bandwidth-delay product.  it's possible to be in loss
-	 * recovery with a big window, and we need to keep sending
-	 * into the inflated window.  the difference can be huge
-	 * for even modest (70ms) ping times.
-	 */
-	qsetlimit(s->rq, QMAX<<tcb->qscale);
-	qsetlimit(s->wq, QMAX<<tcb->qscale);
-	tcprcvwin(s);
+static void
+tcpsetscale(Conv *s, uchar scale)
+{
+	Tcpctl *tcb = (Tcpctl*)s->ptcl;
+	ulong window;
+
+	if(scale == 0)
+		tcb->scale = 0;
+	else if(scale > 14)
+		scale = 14;	/* RFC 7323 2.3 */
+
+	tcb->rcv.scale = tcb->scale;
+	tcb->window = QMAX<<tcb->rcv.scale;
+	qsetlimit(s->rq, tcb->window);
+
+	tcb->snd.scale = scale;
+	window = QMAX<<tcb->snd.scale;
+	if(window > tcb->window)
+		window = tcb->window;
+	tcb->ssthresh = window;
+	qsetlimit(s->wq, window);
 }
--