ref: f8b6848a79ecd2c5871ec1629cf763a82bb8371f
dir: /sys/src/cmd/aux/vga/mach64xx.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "pci.h"
#include "vga.h"
/*
 * ATI Mach64 family.
 */
enum {
	HTotalDisp,
	HSyncStrtWid,
	VTotalDisp,
	VSyncStrtWid,
	VlineCrntVline,
	OffPitch,
	IntCntl,
	CrtcGenCntl,
	OvrClr,
	OvrWidLR,
	OvrWidTB,
	CurClr0,
	CurClr1,
	CurOffset,
	CurHVposn,
	CurHVoff,
	ScratchReg0,
	ScratchReg1,	/* Scratch Register (BIOS info) */
	ClockCntl,
	BusCntl,
	MemCntl,
	ExtMemCntl,
	MemVgaWpSel,
	MemVgaRpSel,
	DacRegs,
	DacCntl,
	GenTestCntl,
	ConfigCntl,	/* Configuration control */
	ConfigChipId,
	ConfigStat0,	/* Configuration status 0 */
	ConfigStat1,	/* Configuration status 1 */
	ConfigStat2,
	DspConfig,	/* Rage */
	DspOnOff,	/* Rage */
	DpBkgdClr,
	DpChainMsk,
	DpFrgdClr,
	DpMix,
	DpPixWidth,
	DpSrc,
	DpWriteMsk,
	LcdIndex,
	LcdData,
	Nreg,
	TvIndex = 0x1D,
	TvData = 0x27,
	LCD_ConfigPanel = 0,
	LCD_GenCtrl,
	LCD_DstnCntl,
	LCD_HfbPitchAddr,
	LCD_HorzStretch,
	LCD_VertStretch,
	LCD_ExtVertStretch,
	LCD_LtGio,
	LCD_PowerMngmnt,
	LCD_ZvgPio,
	Nlcd,
};
static char* iorname[Nreg] = {
	"HTotalDisp",
	"HSyncStrtWid",
	"VTotalDisp",
	"VSyncStrtWid",
	"VlineCrntVline",
	"OffPitch",
	"IntCntl",
	"CrtcGenCntl",
	"OvrClr",
	"OvrWidLR",
	"OvrWidTB",
	"CurClr0",
	"CurClr1",
	"CurOffset",
	"CurHVposn",
	"CurHVoff",
	"ScratchReg0",
	"ScratchReg1",
	"ClockCntl",
	"BusCntl",
	"MemCntl",
	"ExtMemCntl",
	"MemVgaWpSel",
	"MemVgaRpSel",
	"DacRegs",
	"DacCntl",
	"GenTestCntl",
	"ConfigCntl",
	"ConfigChipId",
	"ConfigStat0",
	"ConfigStat1",
	"ConfigStat2",
	"DspConfig",
	"DspOnOff",
	"DpBkgdClr",
	"DpChainMsk",
	"DpFrgdClr",
	"DpMix",
	"DpPixWidth",
	"DpSrc",
	"DpWriteMsk",
	"LcdIndex",
	"LcdData",	
};
static char* lcdname[Nlcd] = {
	"LCD ConfigPanel",
	"LCD GenCntl",
	"LCD DstnCntl",
	"LCD HfbPitchAddr",
	"LCD HorzStretch",
	"LCD VertStretch",
	"LCD ExtVertStretch",
	"LCD LtGio",
	"LCD PowerMngmnt",
	"LCD ZvgPio"
};
/*
 * Crummy hack: all io register offsets
 * here get IOREG or'ed in, so that we can
 * tell the difference between an uninitialized
 * array entry and HTotalDisp.
 */
enum {
	IOREG = 0x10000,
};
static ushort ioregs[Nreg] = {
 [HTotalDisp]		IOREG|0x0000,
 [HSyncStrtWid]	IOREG|0x0100,
 [VTotalDisp]		IOREG|0x0200,
 [VSyncStrtWid]		IOREG|0x0300,
 [VlineCrntVline]	IOREG|0x0400,
 [OffPitch]			IOREG|0x0500,
 [IntCntl]			IOREG|0x0600,
 [CrtcGenCntl]		IOREG|0x0700,
 [OvrClr]			IOREG|0x0800,
 [OvrWidLR]		IOREG|0x0900,
 [OvrWidTB]		IOREG|0x0A00,
 [CurClr0]			IOREG|0x0B00,
 [CurClr1]			IOREG|0x0C00,
 [CurOffset]		IOREG|0x0D00,
 [CurHVposn]		IOREG|0x0E00,
 [CurHVoff]		IOREG|0x0F00,
 [ScratchReg0]		IOREG|0x1000,
 [ScratchReg1]		IOREG|0x1100,
 [ClockCntl]		IOREG|0x1200,
 [BusCntl]			IOREG|0x1300,
 [MemCntl]		IOREG|0x1400,
 [MemVgaWpSel]	IOREG|0x1500,
 [MemVgaRpSel]	IOREG|0x1600,
 [DacRegs]		IOREG|0x1700,
 [DacCntl]			IOREG|0x1800,
 [GenTestCntl]		IOREG|0x1900,
 [ConfigCntl]		IOREG|0x1A00,
 [ConfigChipId]		IOREG|0x1B00,
 [ConfigStat0]		IOREG|0x1C00,
 [ConfigStat1]		IOREG|0x1D00,
/* [GpIo]				IOREG|0x1E00, */
/* [HTotalDisp]			IOREG|0x1F00, 	duplicate, says XFree86 */
};
static ushort pciregs[Nreg] = {
  [HTotalDisp]		0x00,
  [HSyncStrtWid]	0x01,
  [VTotalDisp]		0x02,
  [VSyncStrtWid]        0x03,
  [VlineCrntVline]      0x04,
  [OffPitch]		0x05,
  [IntCntl]		0x06,
  [CrtcGenCntl]		0x07,
  [DspConfig]		0x08,
  [DspOnOff]		0x09,
  [OvrClr]		0x10,
  [OvrWidLR]		0x11,
  [OvrWidTB]		0x12,
  [CurClr0]		0x18,
  [CurClr1]		0x19,
  [CurOffset]		0x1A,
  [CurHVposn]		0x1B,
  [CurHVoff]		0x1C,
  [ScratchReg0]		0x20,
  [ScratchReg1]		0x21,
  [ClockCntl]		0x24,
  [BusCntl]		0x28,
  [LcdIndex]		0x29,
  [LcdData]		0x2A,
  [ExtMemCntl]		0x2B,
  [MemCntl]		0x2C,
  [MemVgaWpSel]		0x2D,
  [MemVgaRpSel]		0x2E,
  [DacRegs]		0x30,
  [DacCntl]		0x31,
  [GenTestCntl]		0x34,
  [ConfigCntl]		0x37,
  [ConfigChipId]	0x38,
  [ConfigStat0]		0x39,
  [ConfigStat1]		0x25,	/* rsc: was 0x3A, but that's not what the LT manual says */
  [ConfigStat2]		0x26,
  [DpBkgdClr]		0xB0,
  [DpChainMsk]		0xB3,
  [DpFrgdClr]		0xB1,
  [DpMix]		0xB5,
  [DpPixWidth]		0xB4,
  [DpSrc]		0xB6,
  [DpWriteMsk]		0xB2,
};
enum {
	PLLm		= 0x02,
	PLLp		= 0x06,
	PLLn0		= 0x07,
	PLLn1		= 0x08,
	PLLn2		= 0x09,
	PLLn3		= 0x0A,
	PLLx            = 0x0B,         /* external divisor (Rage) */
	Npll		= 32,
	Ntv		= 1,		/* actually 256, but not used */
};
typedef struct Mach64xx	Mach64xx;
struct Mach64xx {
	ulong	io;
	Pcidev*	pci;
	int	bigmem;
	int	lcdon;
	int	lcdpanelid;
	ulong	reg[Nreg];
	ulong	lcd[Nlcd];
	ulong	tv[Ntv];
	uchar	pll[Npll];
	ulong	(*ior32)(Mach64xx*, int);
	void	(*iow32)(Mach64xx*, int, ulong);
};
static ulong
portior32(Mach64xx* mp, int r)
{
	if((ioregs[r] & IOREG) == 0)
		return ~0;
	return inportl(((ioregs[r] & ~IOREG)<<2)+mp->io);
}
static void
portiow32(Mach64xx* mp, int r, ulong l)
{
	if((ioregs[r] & IOREG) == 0)
		return;
	outportl(((ioregs[r] & ~IOREG)<<2)+mp->io, l);
}
static ulong
pciior32(Mach64xx* mp, int r)
{
	return inportl((pciregs[r]<<2)+mp->io);
}
static void
pciiow32(Mach64xx* mp, int r, ulong l)
{
	outportl((pciregs[r]<<2)+mp->io, l);
}
static uchar
pllr(Mach64xx* mp, int r)
{
	int io;
	if(mp->ior32 == portior32)
		io = ((ioregs[ClockCntl]&~IOREG)<<2)+mp->io;
	else
		io = (pciregs[ClockCntl]<<2)+mp->io;
	outportb(io+1, r<<2);
	return inportb(io+2);
}
static void
pllw(Mach64xx* mp, int r, uchar b)
{
	int io;
	if(mp->ior32 == portior32)
		io = ((ioregs[ClockCntl]&~IOREG)<<2)+mp->io;
	else
		io = (pciregs[ClockCntl]<<2)+mp->io;
	outportb(io+1, (r<<2)|0x02);
	outportb(io+2, b);
}
static ulong
lcdr32(Mach64xx *mp, ulong r)
{
	ulong or;
	or = mp->ior32(mp, LcdIndex);
	mp->iow32(mp, LcdIndex, (or&~0x0F) | (r&0x0F));
	return mp->ior32(mp, LcdData);
}
static void
lcdw32(Mach64xx *mp, ulong r, ulong v)
{
	ulong or;
	or = mp->ior32(mp, LcdIndex);
	mp->iow32(mp, LcdIndex, (or&~0x0F) | (r&0x0F));
	mp->iow32(mp, LcdData, v);
}
static ulong
tvr32(Mach64xx *mp, ulong r)
{
	outportb(mp->io+(TvIndex<<2), r&0x0F);
	return inportl(mp->io+(TvData<<2));
}
static void
tvw32(Mach64xx *mp, ulong r, ulong v)
{
	outportb(mp->io+(TvIndex<<2), r&0x0F);
	outportl(mp->io+(TvData<<2), v);
}
static int smallmem[] = {
	   512*1024,	  1024*1024,	 2*1024*1024,	 4*1024*1024,
	6*1024*1024,	8*1024*1024,	12*1024*1024,	16*1024*1024,
};
static int bigmem[] = {
	    512*1024,	  2*512*1024,	  3*512*1024,	  4*512*1024,
	  5*512*1024,	  6*512*1024,	  7*512*1024,	  8*512*1024,
	 5*1024*1024,	 6*1024*1024,	 7*1024*1024,	 8*1024*1024,
	10*1024*1024,	12*1024*1024,	14*1024*1024,	16*1024*1024,
};
static void
snarf(Vga* vga, Ctlr* ctlr)
{
	Mach64xx *mp;
	int i;
	ulong v;
	if(vga->private == nil){
		vga->private = alloc(sizeof(Mach64xx));
		mp = vga->private;
		mp->io = 0x2EC;
		mp->ior32 = portior32;
		mp->iow32 = portiow32;
		mp->pci = pcimatch(0, 0x1002, 0);
		if (mp->pci) {
			v = mp->pci->mem[1].bar & ~0x3;
			if(v != 0) {
				mp->io = v;
				mp->ior32 = pciior32;
				mp->iow32 = pciiow32;
			}
		}
	}
	mp = vga->private;
	for(i = 0; i < Nreg; i++)
		mp->reg[i] = mp->ior32(mp, i);
	for(i = 0; i < Npll; i++)
		mp->pll[i] = pllr(mp, i);
	switch(mp->reg[ConfigChipId] & 0xFFFF){
	default:
		mp->lcdpanelid = 0;
		break;
	case ('L'<<8)|'B':		/* 4C42: Rage LTPro AGP */
	case ('L'<<8)|'I':		/* 4C49: Rage 3D LTPro */
	case ('L'<<8)|'M':		/* 4C4D: Rage Mobility */
	case ('L'<<8)|'P':		/* 4C50: Rage 3D LTPro */
		for(i = 0; i < Nlcd; i++)
			mp->lcd[i] = lcdr32(mp, i);
		if(mp->lcd[LCD_GenCtrl] & 0x02)
			mp->lcdon = 1;
		mp->lcdpanelid = ((mp->reg[ConfigStat2]>>14) & 0x1F);
		break;
	}
	/*
	 * Check which memory size map we are using.
	 */
	mp->bigmem = 0;
	switch(mp->reg[ConfigChipId] & 0xFFFF){
		case ('G'<<8)|'B':	/* 4742: 264GT PRO */
		case ('G'<<8)|'D':	/* 4744: 264GT PRO */
		case ('G'<<8)|'I':	/* 4749: 264GT PRO */
		case ('G'<<8)|'M':	/* 474D: Rage XL */
		case ('G'<<8)|'P':	/* 4750: 264GT PRO */
		case ('G'<<8)|'Q':	/* 4751: 264GT PRO */
		case ('G'<<8)|'R':	/* 4752: */
		case ('G'<<8)|'U':	/* 4755: 264GT DVD */
		case ('G'<<8)|'V':	/* 4756: Rage2C */
		case ('G'<<8)|'Z':	/* 475A: Rage2C */
		case ('V'<<8)|'U':	/* 5655: 264VT3 */
		case ('V'<<8)|'V':	/* 5656: 264VT4 */
		case ('L'<<8)|'B':	/* 4C42: Rage LTPro AGP */
		case ('L'<<8)|'I':	/* 4C49: Rage 3D LTPro */
		case ('L'<<8)|'M':	/* 4C4D: Rage Mobility */
		case ('L'<<8)|'P':	/* 4C50: Rage 3D LTPro */
			mp->bigmem = 1;
			break;
		case ('G'<<8)|'T':	/* 4754: 264GT[B] */
		case ('V'<<8)|'T':	/* 5654: 264VT/GT/VTB */
			/*
			 * Only the VTB and GTB use the new memory encoding,
			 * and they are identified by a nonzero ChipVersion,
			 * apparently.
			 */
			if((mp->reg[ConfigChipId] >> 24) & 0x7)
				mp->bigmem = 1;
			break;
	}
	/*
	 * Memory size and aperture. It's recommended
	 * to use an 8Mb aperture on a 16Mb boundary.
	 */
	if(mp->bigmem)
		vga->vmz = bigmem[mp->reg[MemCntl] & 0x0F];
	else
		vga->vmz = smallmem[mp->reg[MemCntl] & 0x07];
	vga->vma = 16*1024*1024;
	switch(mp->reg[ConfigCntl]&0x3){
	case 0:
		vga->apz = 16*1024*1024;	/* empirical -rsc */
		break;
	case 1:
		vga->apz = 4*1024*1024;
		break;
	case 2:
		vga->apz = 8*1024*1024;
		break;
	case 3:
		vga->apz = 2*1024*1024;	/* empirical: mach64GX -rsc */
		break;
	}
	ctlr->flag |= Fsnarf;
}
static void
options(Vga*, Ctlr* ctlr)
{
	ctlr->flag |= Hlinear|Foptions;
}
static void
clock(Vga* vga, Ctlr* ctlr)
{
	int clk, m, n, p;
	double f, q;
	Mach64xx *mp;
	mp = vga->private;
	/*
	 * Don't compute clock timings for LCD panels.
	 * Just use what's already there.  We can't just use
	 * the frequency in the vgadb for this because 
	 * the frequency being programmed into the PLLs
	 * is not the frequency being used to compute the DSP
	 * settings.  The DSP-relevant frequency is the one
	 * we keep in /lib/vgadb.
	 */
	if(mp->lcdon){
		clk = mp->reg[ClockCntl] & 0x03;
		n = mp->pll[7+clk];
		p = (mp->pll[6]>>(clk*2)) & 0x03;
		p |= (mp->pll[11]>>(2+clk)) & 0x04;
		switch(p){
		case 0:
		case 1:
		case 2:
		case 3:
			p = 1<<p;
			break;
		case 4+0:
			p = 3;
			break;
		case 4+2:
			p = 6;
			break;
		case 4+3:
			p = 12;
			break;
		default:
		case 4+1:
			p = -1;
			break;
		}
		m = mp->pll[PLLm];
		f = (2.0*RefFreq*n)/(m*p) + 0.5;
		vga->m[0] = m;
		vga->p[0] = p;
		vga->n[0] = n;
		vga->f[0] = f;
		return;
	}
	if(vga->f[0] == 0)
		vga->f[0] = vga->mode->frequency;
	f = vga->f[0];
	/*
	 * To generate a specific output frequency, the reference (m),
	 * feedback (n), and post dividers (p) must be loaded with the
	 * appropriate divide-down ratios. In the following r is the
	 * XTALIN frequency (usually RefFreq) and t is the target frequency
	 * (vga->f).
	 *
	 * Use the maximum reference divider left by the BIOS for now,
	 * otherwise MCLK might be a concern. It can be calculated as
	 * follows:
	 * 			    Upper Limit of PLL Lock Range
	 *	Minimum PLLREFCLK = -----------------------------
	 *				      (2*255)
	 *
	 *				       XTALIN
	 *			m = Floor[-----------------]
	 *				  Minimum PLLREFCLK
	 *
	 * For an upper limit of 135MHz and XTALIN of 14.318MHz m
	 * would be 54.
	 */
	m = mp->pll[PLLm];
	vga->m[0] = m;
	/*
	 * The post divider may be 1, 2, 4 or 8 and is determined by
	 * calculating
	 *		 t*m
	 *	    q = -----
	 *		(2*r)
	 * and using the result to look-up p.
	 */
	q = (f*m)/(2*RefFreq);
	if(ctlr->flag&Uenhanced){
	  if(q > 255 || q < 10.6666666667)
		error("%s: vclk %lud out of range\n", ctlr->name, vga->f[0]);
	  if(q > 127.5)
		p = 1;
	  else if(q > 85)
		p = 2;
	  else if(q > 63.75)
		p = 3;
	  else if(q > 42.5)
		p = 4;
	  else if(q > 31.875)
		p = 6;
	  else if(q > 21.25)
		p = 8;
	  else
		p = 12;
	}else{
	  if(q > 255 || q < 16)
		error("%s: vclk %lud out of range\n", ctlr->name, vga->f[0]);
	  if(q >= 127.5)
		p = 1;
	  else if(q >= 63.5)
		p = 2;
	  else if(q >= 31.5)
		p = 4;
	  else
		p = 8;
	}
	vga->p[0] = p;
	/*
	 * The feedback divider should be kept in the range 0x80 to 0xFF
	 * and is found from
	 *	n = q*p
	 * rounded to the nearest whole number.
	 */
	vga->n[0] = (q*p)+0.5;
}
typedef struct Meminfo	Meminfo;
struct Meminfo {
	int latency;
	int latch;
	int trp;		/* filled in from card */
	int trcd;		/* filled in from card */
	int tcrd;		/* filled in from card */
	int tras;		/* filled in from card */
};
enum {
	Mdram,
	Medo,	
	Msdram,
	Mwram,
};
/*
 * The manuals and documentation are silent on which settings
 * to use for Mwdram, or how to tell which to use.
 */
static Meminfo meminfo[] = {
[Mdram]		{ 1, 0 },
[Medo]		{ 1, 2 },
[Msdram]	{ 3, 1 },
[Mwram]		{ 1, 3 },	/* non TYPE_A */
};
static ushort looplatencytab[2][2] = {
	{ 8, 6 },		/* DRAM: ≤1M, > 1M */
	{ 9, 8 },		/* SDRAM: ≤1M, > 1M */
};
static ushort cyclesperqwordtab[2][2] = {
	{ 3, 2 },		/* DRAM: ≤1M, > 1M */
	{ 2, 1 },		/* SDRAM: ≤1M, > 1M */
};
static int memtype[] = {
	-1,			/* disable memory access */
	Mdram,			/* basic DRAM */
	Medo,			/* EDO */
	Medo,			/* hyper page DRAM or EDO */
	Msdram,			/* SDRAM */
	Msdram,			/* SGRAM */
	Mwram,
	Mwram
};
/*
 * Calculate various memory parameters so that the card
 * fetches the right bytes at the right time.  I don't claim to
 * understand the actual calculations very well.
 *
 * This is remarkably useful on laptops, since knowledge of
 * x lets us find the frequency that the screen is really running
 * at, which is not necessarily in the VCLKs.
 */
static void
setdsp(Vga* vga, Ctlr*)
{
	Mach64xx *mp;
	Meminfo *mem;
	ushort table, memclk, memtyp;
	int i, prec, xprec, fprec;
	ulong t;
	double pw, x, fifosz, fifoon, fifooff;
	ushort dspon, dspoff;
	int afifosz, lat, ncycle, pfc, rcc;
	mp = vga->private;
	/*
	 * Get video ram configuration from BIOS and chip
	 */
	table = *(ushort*)readbios(sizeof table, 0xc0048);
	trace("rom table offset %uX\n", table);
	table = *(ushort*)readbios(sizeof table, 0xc0000+table+16);
	trace("freq table offset %uX\n", table);
	memclk = *(ushort*)readbios(sizeof memclk, 0xc0000+table+18);
	trace("memclk %ud\n", memclk);
	memtyp = memtype[mp->reg[ConfigStat0]&07];
	mem = &meminfo[memtyp];
	/*
	 * First we need to calculate x, the number of 
	 * XCLKs that one QWORD occupies in the display FIFO.
	 *
	 * For some reason, x gets stretched out if LCD stretching
	 * is turned on. 
	 */
	x = ((double)memclk*640000.0) /
		((double)vga->mode->frequency * (double)vga->mode->z);
	if(mp->lcd[LCD_HorzStretch] & (1<<31))
		x *= 4096.0 / (double)(mp->lcd[LCD_HorzStretch] & 0xFFFF);
	trace("memclk %d... x %f...", memclk, x);
	/*
	 * We have 14 bits to specify x in.  Decide where to
	 * put the decimal (err, binary) point by counting how
	 * many significant bits are in the integer portion of x.
	 */
	t = x;
	for(i=31; i>=0; i--)
		if(t & (1<<i))
			break;
	xprec = i+1;
	trace("t %lud... xprec %d...", t, xprec);
	/*
	 * The maximum FIFO size is the number of XCLKs per QWORD
	 * multiplied by 32, for some reason.  We have 11 bits to
	 * specify fifosz.
	 */
	fifosz = x * 32.0;
	trace("fifosz %f...", fifosz);
	t = fifosz;
	for(i=31; i>=0; i--)
		if(t & (1<<i))
			break;
	fprec = i+1;
	trace("fprec %d...", fprec);
	/*
	 * Precision is specified as 3 less than the number of bits
	 * in the integer part of x, and 5 less than the number of bits
	 * in the integer part of fifosz.
	 *
	 * It is bounded by zero and seven.
	 */
	prec = (xprec-3 > fprec-5) ? xprec-3 : fprec-5;
	if(prec < 0)
		prec = 0;
	if(prec > 7)
		prec = 7;
	xprec = prec+3;
	fprec = prec+5;
	trace("prec %d...", prec);
	/*
	 * Actual fifo size
	 */
	afifosz = (1<<fprec) / x;
	if(afifosz > 32)
		afifosz = 32;
	fifooff = ceil(x*(afifosz-1));
	/*
	 * I am suspicious of this table, lifted from ATI docs,
	 * because it doesn't agree with the Windows drivers.
	 * We always get 0x0A for lat+2 while Windows uses 0x08.
	 */
	lat = looplatencytab[memtyp > 1][vga->vmz > 1*1024*1024];
	trace("afifosz %d...fifooff %f...", afifosz, fifooff);
	/*
	 * Page fault clock
	 */
	t = mp->reg[MemCntl];
	mem->trp = (t>>8)&3;	/* RAS precharge time */
	mem->trcd = (t>>10)&3;	/* RAS to CAS delay */
	mem->tcrd = (t>>12)&1;	/* CAS to RAS delay */
	mem->tras = (t>>16)&7;	/* RAS low minimum pulse width */
	pfc = mem->trp + 1 + mem->trcd + 1 + mem->tcrd;
	trace("pfc %d...", pfc);
	/*
	 * Maximum random access cycle clock.
	 */
	ncycle = cyclesperqwordtab[memtyp > 1][vga->vmz > 1*1024*1024];
	rcc = mem->trp + 1 + mem->tras + 1;
	if(rcc < pfc+ncycle)
		rcc = pfc+ncycle;
	trace("rcc %d...", rcc);
	fifoon = (rcc > floor(x)) ? rcc : floor(x);
	fifoon += (3.0 * rcc) - 1 + pfc + ncycle;
	trace("fifoon %f...\n", fifoon);
	/*
	 * Now finally put the bits together.
	 * x is stored in a 14 bit field with xprec bits of integer.
	 */
	pw = x * (1<<(14-xprec));
	mp->reg[DspConfig] = (ulong)pw | (((lat+2)&0xF)<<16) | ((prec&7)<<20);
	/*
	 * These are stored in an 11 bit field with fprec bits of integer.
	 */
	dspon  = (ushort)fifoon << (11-fprec);
	dspoff = (ushort)fifooff << (11-fprec);
	mp->reg[DspOnOff] = ((dspon&0x7ff) << 16) | (dspoff&0x7ff);
}
static void
init(Vga* vga, Ctlr* ctlr)
{
	Mode *mode;
	Mach64xx *mp;
	int p, x, y;
	mode = vga->mode;
	if((mode->x > 640 || mode->y > 480) && mode->z == 1)
		error("%s: no support for 1-bit mode other than 640x480x1\n",
			ctlr->name);
	mp = vga->private;
	if(mode->z > 8 && mp->pci == nil)
		error("%s: no support for >8-bit color without PCI\n",
			ctlr->name);
	/*
	 * Check for Rage chip
	 */
	switch (mp->reg[ConfigChipId]&0xffff) {
		case ('G'<<8)|'B':	/* 4742: 264GT PRO */
		case ('G'<<8)|'D':	/* 4744: 264GT PRO */
		case ('G'<<8)|'I':	/* 4749: 264GT PRO */
		case ('G'<<8)|'M':	/* 474D: Rage XL */
		case ('G'<<8)|'P':	/* 4750: 264GT PRO */
		case ('G'<<8)|'Q':	/* 4751: 264GT PRO */
		case ('G'<<8)|'R':	/* 4752: */
		case ('G'<<8)|'U':	/* 4755: 264GT DVD */
		case ('G'<<8)|'V':	/* 4756: Rage2C */
		case ('G'<<8)|'Z':	/* 475A: Rage2C */
		case ('V'<<8)|'U':	/* 5655: 264VT3 */
		case ('V'<<8)|'V':	/* 5656: 264VT4 */
		case ('G'<<8)|'T':	/* 4754: 264GT[B] */
		case ('V'<<8)|'T':	/* 5654: 264VT/GT/VTB */
		case ('L'<<8)|'B':	/* 4C42: Rage LTPro AGP */
		case ('L'<<8)|'I':	/* 4C49: 264LT PRO */
		case ('L'<<8)|'M':	/* 4C4D: Rage Mobility */
		case ('L'<<8)|'P':	/* 4C50: 264LT PRO */
			ctlr->flag |= Uenhanced;
			break;
	}
	/*
	 * Always use VCLK2.
	 */
	clock(vga, ctlr);
	mp->pll[PLLn2] = vga->n[0];
	mp->pll[PLLp] &= ~(0x03<<(2*2));
	switch(vga->p[0]){
	case 1:
	case 3:
		p = 0;
		break;
	case 2:
		p = 1;
		break;
	case 4:
	case 6:
		p = 2;
		break;
	case 8:
	case 12:
		p = 3;
		break;
	default:
		p = 3;
		break;
	}
	mp->pll[PLLp] |= p<<(2*2);
	if ((1<<p) != vga->p[0])
		mp->pll[PLLx] |= 1<<(4+2);
	else
		mp->pll[PLLx] &= ~(1<<(4+2));
	mp->reg[ClockCntl] = 2;
	mp->reg[ConfigCntl] = 0;
	mp->reg[CrtcGenCntl] = 0x02000000|(mp->reg[CrtcGenCntl] & ~0x01400700);
	switch(mode->z){
	default:
	case 1:
		mp->reg[CrtcGenCntl] |= 0x00000100;
		mp->reg[DpPixWidth] = 0x00000000;
		break;
	case 8:
		mp->reg[CrtcGenCntl] |= 0x01000200;
		mp->reg[DpPixWidth] = 0x00020202;
		break;
	case 15:
		mp->reg[CrtcGenCntl] |= 0x01000300;
		mp->reg[DpPixWidth] = 0x00030303;
		break;
	case 16:
		mp->reg[CrtcGenCntl] |= 0x01000400;
		mp->reg[DpPixWidth] = 0x00040404;
		break;
	case 24:
		mp->reg[CrtcGenCntl] |= 0x01000500;
		mp->reg[DpPixWidth] = 0x00050505;
		break;
	case 32:
		mp->reg[CrtcGenCntl] |= 0x01000600;
		mp->reg[DpPixWidth] = 0x00060606;
		break;
	}
	mp->reg[HTotalDisp] = (((mode->x>>3)-1)<<16)|((mode->ht>>3)-1);
	mp->reg[HSyncStrtWid] = (((mode->ehs - mode->shs)>>3)<<16)
				|((mode->shs>>3)-1);
	if(mode->hsync == '-')
		mp->reg[HSyncStrtWid] |= 0x00200000;
	mp->reg[VTotalDisp] = ((mode->y-1)<<16)|(mode->vt-1);
	mp->reg[VSyncStrtWid] = ((mode->vre - mode->vrs)<<16)|(mode->vrs-1);
	if(mode->vsync == '-')
		mp->reg[VSyncStrtWid] |= 0x00200000;
	mp->reg[IntCntl] = 0;
	/*
	 * This used to set it to (mode->x/(8*2))<<22 for depths < 8,
	 * but from the manual that seems wrong to me.  -rsc
	 */
	mp->reg[OffPitch] = (vga->virtx/8)<<22;
	mp->reg[OvrClr] = Pblack;
	if(vga->linear && mode->z != 1)
		ctlr->flag |= Ulinear;
	/*
	 * Heuristic fiddling on LT PRO.
	 * Do this before setdsp so the stretching is right.
	 */
	if(mp->lcdon){
		/* use non-shadowed registers */
		mp->lcd[LCD_GenCtrl] &= ~0x00000404;
		mp->lcd[LCD_ConfigPanel] |= 0x00004000;
		mp->lcd[LCD_VertStretch] = 0;
		y = ((mp->lcd[LCD_ExtVertStretch]>>11) & 0x7FF)+1;
		if(mode->y < y){
			x = (mode->y*1024)/y;
			mp->lcd[LCD_VertStretch] = 0xC0000000|x;
		}
		mp->lcd[LCD_ExtVertStretch] &= ~0x00400400;
		/*
		 * The x value doesn't seem to be available on all
		 * chips so intuit it from the y value which seems to
		 * be reliable.
		 */
		mp->lcd[LCD_HorzStretch] &= ~0xC00000FF;
		x = (mp->lcd[LCD_HorzStretch]>>20) & 0xFF;
		if(x == 0){
			switch(y){
			default:
				break;
			case 480:
				x = 640;
				break;
			case 600:
				x = 800;
				break;
			case 768:
				x = 1024;
				break;
			case 1024:
				x = 1280;
				break;
			}
		}
		else
			x = (x+1)*8;
		if(mode->x < x){
			x = (mode->x*4096)/x;
			mp->lcd[LCD_HorzStretch] |= 0xC0000000|x;
		}
	}
	if(ctlr->flag&Uenhanced)
		setdsp(vga, ctlr);
	ctlr->flag |= Finit;
}
static void
load(Vga* vga, Ctlr* ctlr)
{
	Mach64xx *mp;
	int i;
	mp = vga->private;
	/*
	 * Unlock the CRTC and LCD registers.
	 */
	mp->iow32(mp, CrtcGenCntl, mp->ior32(mp, CrtcGenCntl)&~0x00400000);
	if(mp->lcdon)
		lcdw32(mp, LCD_GenCtrl, mp->lcd[LCD_GenCtrl]|0x80000000);
	/*
	 * Always use an aperture on a 16Mb boundary.
	 */
	if(ctlr->flag & Ulinear)
		mp->reg[ConfigCntl] = ((vga->vmb/(4*1024*1024))<<4)|0x02;
	mp->iow32(mp, ConfigCntl, mp->reg[ConfigCntl]);
	mp->iow32(mp, GenTestCntl, 0);
	mp->iow32(mp, GenTestCntl, 0x100);
	if((ctlr->flag&Uenhanced) == 0)
	  mp->iow32(mp, MemCntl, mp->reg[MemCntl] & ~0x70000);
	mp->iow32(mp, BusCntl, mp->reg[BusCntl]);
	mp->iow32(mp, HTotalDisp, mp->reg[HTotalDisp]);
	mp->iow32(mp, HSyncStrtWid, mp->reg[HSyncStrtWid]);
	mp->iow32(mp, VTotalDisp, mp->reg[VTotalDisp]);
	mp->iow32(mp, VSyncStrtWid, mp->reg[VSyncStrtWid]);
	mp->iow32(mp, IntCntl, mp->reg[IntCntl]);
	mp->iow32(mp, OffPitch, mp->reg[OffPitch]);
	if(mp->lcdon){
		for(i=0; i<Nlcd; i++)
			lcdw32(mp, i, mp->lcd[i]);
	}
	mp->iow32(mp, GenTestCntl, mp->reg[GenTestCntl]);
	mp->iow32(mp, ConfigCntl, mp->reg[ConfigCntl]);
	mp->iow32(mp, CrtcGenCntl, mp->reg[CrtcGenCntl]);
	mp->iow32(mp, OvrClr, mp->reg[OvrClr]);
	mp->iow32(mp, OvrWidLR, mp->reg[OvrWidLR]);
	mp->iow32(mp, OvrWidTB, mp->reg[OvrWidTB]);
	if(ctlr->flag&Uenhanced){
	  mp->iow32(mp, DacRegs, mp->reg[DacRegs]);
	  mp->iow32(mp, DacCntl, mp->reg[DacCntl]);
	  mp->iow32(mp, CrtcGenCntl, mp->reg[CrtcGenCntl]&~0x02000000);
	  mp->iow32(mp, DspOnOff, mp->reg[DspOnOff]);
	  mp->iow32(mp, DspConfig, mp->reg[DspConfig]);
	  mp->iow32(mp, CrtcGenCntl, mp->reg[CrtcGenCntl]);
	  pllw(mp, PLLx, mp->pll[PLLx]);
	}
	pllw(mp, PLLn2, mp->pll[PLLn2]);
	pllw(mp, PLLp, mp->pll[PLLp]);
	pllw(mp, PLLn3, mp->pll[PLLn3]);
	mp->iow32(mp, ClockCntl, mp->reg[ClockCntl]);
	mp->iow32(mp, ClockCntl, 0x40|mp->reg[ClockCntl]);
	mp->iow32(mp, DpPixWidth, mp->reg[DpPixWidth]);
	if(vga->mode->z > 8){
		int sh, i;
		/*
		 * We need to initialize the palette, since the DACs use it
		 * in true color modes.  First see if the card supports an
		 * 8-bit DAC.
		 */
		mp->iow32(mp, DacCntl, mp->reg[DacCntl] | 0x100);
		if(mp->ior32(mp, DacCntl)&0x100){
			/* card appears to support it */
			vgactlw("palettedepth", "8");
			mp->reg[DacCntl] |= 0x100;
		}
		if(mp->reg[DacCntl] & 0x100)
			sh = 0;	/* 8-bit DAC */
		else
			sh = 2;	/* 6-bit DAC */
		for(i=0; i<256; i++)
			setpalette(i, i>>sh, i>>sh, i>>sh);
	}
	ctlr->flag |= Fload;
}
static void
pixelclock(Vga* vga, Ctlr* ctlr)
{
	Mach64xx *mp;
	ushort table, s;
	int memclk, ref_freq, ref_divider, min_freq, max_freq;
	int feedback, nmult, pd, post, value;
	int clock;
	/*
	 * Find the pixel clock from the BIOS and current
	 * settings. Lifted from the ATI-supplied example code.
	 * The clocks stored in the BIOS table are in kHz/10.
	 *
	 * This is the clock LCDs use in vgadb to set the DSP
	 * values.
	 */
	mp = vga->private;
	/*
	 * GetPLLInfo()
	 */
	table = *(ushort*)readbios(sizeof table, 0xc0048);
	trace("rom table offset %uX\n", table);
	table = *(ushort*)readbios(sizeof table, 0xc0000+table+16);
	trace("freq table offset %uX\n", table);
	s = *(ushort*)readbios(sizeof s, 0xc0000+table+18);
	memclk = s*10000;
	trace("memclk %ud\n", memclk);
	s = *(ushort*)readbios(sizeof s, 0xc0000+table+8);
	ref_freq = s*10000;
	trace("ref_freq %ud\n", ref_freq);
	s = *(ushort*)readbios(sizeof s, 0xc0000+table+10);
	ref_divider = s;
	trace("ref_divider %ud\n", ref_divider);
	s = *(ushort*)readbios(sizeof s, 0xc0000+table+2);
	min_freq = s*10000;
	trace("min_freq %ud\n", min_freq);
	s = *(ushort*)readbios(sizeof s, 0xc0000+table+4);
	max_freq = s*10000;
	trace("max_freq %ud\n", max_freq);
	/*
	 * GetDivider()
	 */
	pd = mp->pll[PLLp] & 0x03;
	value = (mp->pll[PLLx] & 0x10)>>2;
	trace("pd %uX value %uX (|%d)\n", pd, value, value|pd);
	value |= pd;
	post = 0;
	switch(value){
	case 0:
		post = 1;
		break;
	case 1:
		post = 2;
		break;
	case 2:
		post = 4;
		break;
	case 3:
		post = 8;
		break;
	case 4:
		post = 3;
		break;
	case 5:
		post = 0;
		break;
	case 6:
		post = 6;
		break;
	case 7:
		post = 12;
		break;
	}
	trace("post = %d\n", post);
	feedback = mp->pll[PLLn0];
	if(mp->pll[PLLx] & 0x08)
		nmult = 4;
	else
		nmult = 2;
	clock = (ref_freq/10000)*nmult*feedback;
	clock /= ref_divider*post;
	clock *= 10000;
	Bprint(&stdout, "%s pixel clock = %ud\n", ctlr->name, clock);
}
static void dumpmach64bios(Mach64xx*);
static void
dump(Vga* vga, Ctlr* ctlr)
{
	Mach64xx *mp;
	int i, m, n, p;
	double f;
	static int first = 1;
	if((mp = vga->private) == 0)
		return;
	Bprint(&stdout, "%s pci %p io %lux %s\n", ctlr->name,
		mp->pci, mp->io, mp->ior32 == pciior32 ? "pciregs" : "ioregs");
	if(mp->pci)
		Bprint(&stdout, "%s ccru %ux\n", ctlr->name, mp->pci->ccru);
	for(i = 0; i < Nreg; i++)
		Bprint(&stdout, "%s %-*s%.8luX\n",
			ctlr->name, 20, iorname[i], mp->reg[i]);
	printitem(ctlr->name, "PLL");
	for(i = 0; i < Npll; i++)
		printreg(mp->pll[i]);
	Bprint(&stdout, "\n");
	switch(mp->reg[ConfigChipId] & 0xFFFF){
	default:
		break;
	case ('L'<<8)|'B':		/* 4C42: Rage LTPro AGP */
	case ('L'<<8)|'I':		/* 4C49: Rage 3D LTPro */
	case ('L'<<8)|'M':		/* 4C4D: Rage Mobility */
	case ('L'<<8)|'P':		/* 4C50: Rage 3D LTPro */
		for(i = 0; i < Nlcd; i++)
			Bprint(&stdout, "%s %-*s%.8luX\n",
				ctlr->name, 20, lcdname[i], mp->lcd[i]);
		break;
	}
	/*
	 *     (2*r*n)
	 * f = -------
	 *	(m*p)
	 */
	m = mp->pll[2];
	for(i = 0; i < 4; i++){
		n = mp->pll[7+i];
		p = (mp->pll[6]>>(i*2)) & 0x03;
		p |= (mp->pll[11]>>(2+i)) & 0x04;
		switch(p){
		case 0:
		case 1:
		case 2:
		case 3:
			p = 1<<p;
			break;
		case 4+0:
			p = 3;
			break;
		case 4+2:
			p = 6;
			break;
		case 4+3:
			p = 12;
			break;
		default:
		case 4+1:
			p = -1;
			break;
		}
		if(m*p == 0)
			Bprint(&stdout, "unknown VCLK%d\n", i);
		else {
			f = (2.0*RefFreq*n)/(m*p) + 0.5;
			Bprint(&stdout, "%s VCLK%d\t%ud\n", ctlr->name, i, (int)f);
		}
	}
	pixelclock(vga, ctlr);
	if(first) {
		first = 0;
		dumpmach64bios(mp);
	}
}
enum {
	ClockFixed=0,
	ClockIcs2595,
	ClockStg1703,
	ClockCh8398,
	ClockInternal,
	ClockAtt20c408,
	ClockIbmrgb514
};
/*
 * mostly derived from the xfree86 probe routines.
 */
static void 
dumpmach64bios(Mach64xx *mp)
{
	int i, romtable, clocktable, freqtable, lcdtable, lcdpanel;
	uchar bios[0x10000];
	memmove(bios, readbios(sizeof bios, 0xC0000), sizeof bios);
	/* find magic string */
	for(i=0; i<1024; i++)
		if(strncmp((char*)bios+i, " 761295520", 10) == 0)
			break;
	if(i==1024) {
		Bprint(&stdout, "no ATI bios found\n");
		return;
	}
	/* this is horribly endian dependent.  sorry. */
	romtable = *(ushort*)(bios+0x48);
	if(romtable+0x12 > sizeof(bios)) {
		Bprint(&stdout, "couldn't find ATI rom table\n");
		return;
	}
	clocktable = *(ushort*)(bios+romtable+0x10);
	if(clocktable+0x0C > sizeof(bios)) {
		Bprint(&stdout, "couldn't find ATI clock table\n");
		return;
	}
	freqtable = *(ushort*)(bios+clocktable-2);
	if(freqtable+0x20 > sizeof(bios)) {
		Bprint(&stdout, "couldn't find ATI frequency table\n");
		return;
	}
	Bprint(&stdout, "ATI BIOS rom 0x%x freq 0x%x clock 0x%x\n", romtable, freqtable, clocktable);
	Bprint(&stdout, "clocks:");
	for(i=0; i<16; i++)
		Bprint(&stdout, " %d", *(ushort*)(bios+freqtable+2*i));
	Bprint(&stdout, "\n");
	Bprint(&stdout, "programmable clock: %d\n", bios[clocktable]);
	Bprint(&stdout, "clock to program: %d\n", bios[clocktable+6]);
	if(*(ushort*)(bios+clocktable+8) != 1430) {
		Bprint(&stdout, "reference numerator: %d\n", *(ushort*)(bios+clocktable+8)*10);
		Bprint(&stdout, "reference denominator: 1\n");
	} else {
		Bprint(&stdout, "default reference numerator: 157500\n");
		Bprint(&stdout, "default reference denominator: 11\n");
	}
	switch(bios[clocktable]) {
	case ClockIcs2595:
		Bprint(&stdout, "ics2595\n");
		Bprint(&stdout, "reference divider: %d\n", *(ushort*)(bios+clocktable+0x0A));
		break;
	case ClockStg1703:
		Bprint(&stdout, "stg1703\n");
		break;
	case ClockCh8398:
		Bprint(&stdout, "ch8398\n");
		break;
	case ClockInternal:
		Bprint(&stdout, "internal clock\n");
		Bprint(&stdout, "reference divider in plls\n");
		break;
	case ClockAtt20c408:
		Bprint(&stdout, "att 20c408\n");
		break;
	case ClockIbmrgb514:
		Bprint(&stdout, "ibm rgb514\n");
		Bprint(&stdout, "clock to program = 7\n");
		break;
	default:
		Bprint(&stdout, "unknown clock\n");
		break;
	}
	USED(mp);
	if(1 || mp->lcdpanelid) {
		lcdtable = *(ushort*)(bios+0x78);
		if(lcdtable+5 > sizeof bios || lcdtable+bios[lcdtable+5] > sizeof bios) {
			Bprint(&stdout, "can't find lcd bios table\n");
			goto NoLcd;
		}
		lcdpanel = *(ushort*)(bios+lcdtable+0x0A);
		if(lcdpanel+0x1D > sizeof bios /*|| bios[lcdpanel] != mp->lcdpanelid*/) {
			Bprint(&stdout, "can't find lcd bios table0\n");
			goto NoLcd;
		}
		Bprint(&stdout, "panelid %d x %d y %d\n", bios[lcdpanel], *(ushort*)(bios+lcdpanel+0x19), *(ushort*)(bios+lcdpanel+0x1B));
	}
NoLcd:;
}
Ctlr mach64xx = {
	"mach64xx",			/* name */
	snarf,				/* snarf */
	0,				/* options */
	init,				/* init */
	load,				/* load */
	dump,				/* dump */
};
Ctlr mach64xxhwgc = {
	"mach64xxhwgc",			/* name */
	0,				/* snarf */
	0,				/* options */
	0,				/* init */
	0,				/* load */
	0,				/* dump */
};