code: plan9front

ref: 5622b0bbd878dbc34045cc6fd37cffa64461eabe
dir: /sys/src/cmd/aux/antiword/wordlib.c/

View raw version
/*
 * wordlib.c
 * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
 *
 * Description:
 * Deal with the internals of a MS Word file
 */

#include "antiword.h"

/*
 * Remembers the outcome of the most recent call to iGetVersionNumber:
 * TRUE when that call recognised Word 4, 5 or 6 for Macintosh, else FALSE.
 * The rest of the program reads this flag through bIsOldMacFile.
 */
static BOOL	bOldMacFile = FALSE;


/*
 * bCheckBytes - common part of the file checking functions
 *
 * pFile	the stream to inspect, it is rewound before reading
 * aucBytes	the signature that is expected at the start of the file
 * tBytes	the number of bytes in aucBytes
 *
 * Returns TRUE when the first tBytes bytes of the file are equal to
 * aucBytes. Returns FALSE on the first mismatch, when the file ends before
 * tBytes bytes were read, or when pFile or aucBytes is NULL.
 * The stream is left positioned after the last byte that was read.
 */
static BOOL
bCheckBytes(FILE *pFile, const UCHAR *aucBytes, size_t tBytes)
{
	size_t	tIndex;
	int	iChar;

	fail(pFile == NULL || aucBytes == NULL || tBytes == 0);

	/*
	 * Several callers in this file pass their arguments straight through
	 * without looking at them. What fail() does is decided elsewhere, so
	 * do not rely on it alone to keep a NULL pointer away from rewind()
	 * and getc().
	 */
	if (pFile == NULL || aucBytes == NULL) {
		return FALSE;
	}

	rewind(pFile);

	/* A size_t index avoids squeezing tBytes into an int */
	for (tIndex = 0; tIndex < tBytes; tIndex++) {
		iChar = getc(pFile);
		if (iChar == EOF || iChar != (int)aucBytes[tIndex]) {
			NO_DBG_HEX(iChar);
			NO_DBG_HEX(aucBytes[tIndex]);
			return FALSE;
		}
	}
	return TRUE;
} /* end of bCheckBytes */

/*
 * bIsWordForDosFile - test for a "Word for DOS" document
 *
 * Returns TRUE when a file of at least 128 bytes starts with the six byte
 * Word for DOS signature, otherwise FALSE
 */
BOOL
bIsWordForDosFile(FILE *pFile, long lFilesize)
{
	/* The bytes a Word for DOS file starts with */
	static const UCHAR	aucDosMagic[] =
		{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab };

	DBG_MSG("bIsWordForDosFile");

	if (pFile != NULL && lFilesize >= 0) {
		if (lFilesize >= 128) {
			return bCheckBytes(pFile,
					aucDosMagic,
					elementsof(aucDosMagic));
		}
		DBG_MSG("File too small to be a Word document");
		return FALSE;
	}
	DBG_MSG("No proper file given");
	return FALSE;
} /* end of bIsWordForDosFile */

/*
 * bIsWordFileWithOLE - test for a file with an OLE envelope
 * (That is a document made by Word 6 or later)
 *
 * The file must hold at least three big blocks. The part of the file size
 * that does not fill a whole big block (the tail) decides what happens next:
 *	0	the size is as it should be
 *	1 or 2	accepted, unless the file size modulo 3 equals the tail
 *	other	rejected
 * Only then the first eight bytes are compared with the OLE signature.
 *
 * Returns TRUE when the file passes all these tests, otherwise FALSE
 */
static BOOL
bIsWordFileWithOLE(FILE *pFile, long lFilesize)
{
	static const UCHAR	aucOleMagic[] =
		{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
	int	iTailLen;

	if (pFile == NULL || lFilesize < 0) {
		DBG_MSG("No proper file given");
		return FALSE;
	}
	if (lFilesize < (long)BIG_BLOCK_SIZE * 3) {
		DBG_MSG("This file is too small to be a Word document");
		return FALSE;
	}

	iTailLen = (int)(lFilesize % BIG_BLOCK_SIZE);
	if (iTailLen == 1 || iTailLen == 2) {
		/* Filesize mismatch or a buggy email program */
		if ((int)(lFilesize % 3) == iTailLen) {
			DBG_DEC(lFilesize);
			return FALSE;
		}
		/*
		 * Ignore extra bytes caused by buggy email programs.
		 * They have bugs in their base64 encoding or decoding.
		 * 3 bytes -> 4 ascii chars -> 3 bytes
		 */
		DBG_MSG("Document with extra bytes");
	} else if (iTailLen != 0) {
		/* Wrong filesize for a Word document */
		DBG_DEC(lFilesize);
		DBG_DEC(iTailLen);
		return FALSE;
	}
	/* Getting here with a tail of zero means: no tail, as it should be */
	return bCheckBytes(pFile, aucOleMagic, elementsof(aucOleMagic));
} /* end of bIsWordFileWithOLE */

/*
 * bIsRtfFile - test for a RTF document
 *
 * Returns TRUE when the file starts with the six characters {\rtf1,
 * otherwise FALSE
 */
BOOL
bIsRtfFile(FILE *pFile)
{
	static const UCHAR	aucRtfMagic[] =
		{ '{', '\\', 'r', 't', 'f', '1' };
	BOOL	bIsRtf;

	DBG_MSG("bIsRtfFile");

	bIsRtf = bCheckBytes(pFile, aucRtfMagic, elementsof(aucRtfMagic));
	return bIsRtf;
} /* end of bIsRtfFile */

/*
 * bIsWordPerfectFile - test for a WP document
 *
 * Returns TRUE when the file starts with the byte 0xff followed by the
 * characters WPC, otherwise FALSE
 */
BOOL
bIsWordPerfectFile(FILE *pFile)
{
	static const UCHAR	aucWpMagic[] =
		{ 0xff, 'W', 'P', 'C' };
	BOOL	bIsWp;

	DBG_MSG("bIsWordPerfectFile");

	bIsWp = bCheckBytes(pFile, aucWpMagic, elementsof(aucWpMagic));
	return bIsWp;
} /* end of bIsWordPerfectFile */

/*
 * bIsWinWord12File - test for a "Win Word 1 or 2" document
 *
 * Returns TRUE when a file of at least 384 bytes starts with one of the two
 * known four byte signatures, otherwise FALSE
 */
BOOL
bIsWinWord12File(FILE *pFile, long lFilesize)
{
	/* One row for each signature that is accepted */
	static const UCHAR	aucWinMagic[2][4] = {
		{ 0x9b, 0xa5, 0x21, 0x00 },	/* Win Word 1.x */
		{ 0xdb, 0xa5, 0x2d, 0x00 },	/* Win Word 2.0 */
	};
	int	iRow;

	DBG_MSG("bIsWinWord12File");

	if (pFile == NULL || lFilesize < 0) {
		DBG_MSG("No proper file given");
		return FALSE;
	}
	if (lFilesize < 384) {
		DBG_MSG("This file is too small to be a Word document");
		return FALSE;
	}

	/* The first signature that matches settles the question */
	iRow = 0;
	while (iRow < (int)elementsof(aucWinMagic)) {
		if (bCheckBytes(pFile,
				aucWinMagic[iRow],
				elementsof(aucWinMagic[iRow]))) {
			return TRUE;
		}
		iRow++;
	}
	return FALSE;
} /* end of bIsWinWord12File */

/*
 * bIsMacWord45File - test for a "Mac Word 4 or 5" document
 *
 * Returns TRUE when the file starts with one of the two known six byte
 * signatures, otherwise FALSE
 */
BOOL
bIsMacWord45File(FILE *pFile)
{
	/* One row for each signature that is accepted */
	static const UCHAR	aucMacMagic[2][6] = {
		{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 },	/* Mac Word 4 */
		{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 },	/* Mac Word 5 */
	};
	BOOL	bFound;
	int	iRow;

	DBG_MSG("bIsMacWord45File");

	/* Stop at the first signature that matches */
	bFound = FALSE;
	for (iRow = 0; iRow < (int)elementsof(aucMacMagic); iRow++) {
		if (bCheckBytes(pFile,
				aucMacMagic[iRow],
				elementsof(aucMacMagic[iRow]))) {
			bFound = TRUE;
			break;
		}
	}
	return bFound;
} /* end of bIsMacWord45File */

/*
 * iGuessVersionNumber - guess the Word version number from first few bytes
 *
 * The file checking functions are tried in a fixed order and the first one
 * that recognises the file decides the result:
 *	0	Word for DOS
 *	2	Win Word 1 or 2
 *	5	Mac Word 4 or 5
 *	6	a file with an OLE envelope
 *
 * Returns the guessed version number or -1 when no guess is possible
 */
int
iGuessVersionNumber(FILE *pFile, long lFilesize)
{
	int	iVersion;

	if (bIsWordForDosFile(pFile, lFilesize)) {
		iVersion = 0;
	} else if (bIsWinWord12File(pFile, lFilesize)) {
		iVersion = 2;
	} else if (bIsMacWord45File(pFile)) {
		iVersion = 5;
	} else if (bIsWordFileWithOLE(pFile, lFilesize)) {
		iVersion = 6;
	} else {
		iVersion = -1;
	}
	return iVersion;
} /* end of iGuessVersionNumber */

/*
 * iGetVersionNumber - get the Word version number from the header
 *
 * The word at offset 0x02 of the header selects the version. For the values
 * 103 and 104 the word at offset 0x14, and if that is inconclusive the byte
 * at offset 0x05, separate Word 7 for Win95 from Word 6 for Macintosh.
 *
 * As a side effect the flag behind bIsOldMacFile is updated: it is set for
 * Word 4, 5 and 6 for Macintosh and cleared for everything else.
 *
 * Returns the version number or -1 when unknown
 */
int
iGetVersionNumber(const UCHAR *aucHeader)
{
	USHORT	usFib, usChse;
	int	iVersion;

	usFib = usGetWord(0x02, aucHeader);
	if (usFib >= 0x1000) {
		/* Too big: must be MacWord using Big Endian */
		DBG_HEX(usFib);
		usFib = usGetWordBE(0x02, aucHeader);
	}
	DBG_DEC(usFib);

	/* Not an old Mac file, until one of the cases below says it is */
	bOldMacFile = FALSE;

	switch (usFib) {
	case   0:
		DBG_MSG("Word for DOS");
		iVersion = 0;
		break;
	case  28:
		DBG_MSG("Word 4 for Macintosh");
		bOldMacFile = TRUE;
		iVersion = 4;
		break;
	case  33:
		DBG_MSG("Word 1.x for Windows");
		iVersion = 1;
		break;
	case  35:
		DBG_MSG("Word 5 for Macintosh");
		bOldMacFile = TRUE;
		iVersion = 5;
		break;
	case  45:
		DBG_MSG("Word 2 for Windows");
		iVersion = 2;
		break;
	case 101:
	case 102:
		DBG_MSG("Word 6 for Windows");
		iVersion = 6;
		break;
	case 103:
	case 104:
		usChse = usGetWord(0x14, aucHeader);
		DBG_DEC(usChse);
		if (usChse == 0) {
			DBG_MSG("Word 7 for Win95");
			iVersion = 7;
			break;
		}
		if (usChse != 256) {
			/* Neither of the expected values: use the byte at 0x05 */
			DBG_FIXME();
			if ((int)ucGetByte(0x05, aucHeader) == 0xe0) {
				DBG_MSG("Word 7 for Win95");
				iVersion = 7;
				break;
			}
		}
		DBG_MSG("Word 6 for Macintosh");
		bOldMacFile = TRUE;
		iVersion = 6;
		break;
	default:
		usChse = usGetWord(0x14, aucHeader);
		DBG_DEC(usChse);
		if (usFib < 192) {
			/* Unknown or unsupported version of Word */
			DBG_DEC(usFib);
			iVersion = -1;
		} else {
			DBG_MSG_C(usChse != 256, "Word97 for Win95/98/NT");
			DBG_MSG_C(usChse == 256, "Word98 for Macintosh");
			iVersion = 8;
		}
		break;
	}
	return iVersion;
} /* end of iGetVersionNumber */

/*
 * bIsOldMacFile - tell whether the current file is an old Macintosh file
 *
 * TRUE if the current file was made by Word version 6 or older on an
 * Apple Macintosh, otherwise FALSE.
 * This function hides the method of how to find out from the rest of the
 * program: it hands out the flag that iGetVersionNumber stored last.
 */
BOOL
bIsOldMacFile(void)
{
	return bOldMacFile;
} /* end of bIsOldMacFile */

/*
 * iInitDocument - initialize a document
 *
 * The guess made by iGuessVersionNumber selects the initialization function
 * for that kind of file (DOS, WIN, MAC or OLE). The result of that function
 * is passed on unchanged.
 *
 * Returns the version of Word that made the document or -1
 */
int
iInitDocument(FILE *pFile, long lFilesize)
{
	int	iGuess;

	iGuess = iGuessVersionNumber(pFile, lFilesize);
	if (iGuess == 0) {
		return iInitDocumentDOS(pFile, lFilesize);
	}
	if (iGuess == 2) {
		return iInitDocumentWIN(pFile, lFilesize);
	}
	if (iGuess == 5) {
		return iInitDocumentMAC(pFile, lFilesize);
	}
	if (iGuess == 6) {
		return iInitDocumentOLE(pFile, lFilesize);
	}
	/* No initialization function for this guess */
	DBG_DEC(iGuess);
	return -1;
} /* end of iInitDocument */

/*
 * vFreeDocument - free a document by free-ing its parts
 *
 * Calls the destroy function of every list and table named below, one after
 * the other. Takes no arguments and returns nothing.
 * NOTE: whether these functions depend on each other can not be seen from
 * here, so keep the order of the calls as it is.
 */
void
vFreeDocument(void)
{
	DBG_MSG("vFreeDocument");

	/* Free the memory, one part of the document at a time */
	vDestroyTextBlockList();
	vDestroyDataBlockList();
	vDestroyListInfoList();
	vDestroyRowInfoList();
	vDestroyStyleInfoList();
	vDestroyFontInfoList();
	vDestroyStylesheetList();
	vDestroyPictInfoList();
	vDestroyDocumentInfoList();
	vDestroySectionInfoList();
	vDestroyHdrFtrInfoList();
	vDestroyPropModList();
	vDestroyNotesInfoLists();
	vDestroyFontTable();
	vDestroySummaryInfo();
} /* end of vFreeDocument */