/*
 * jidct_bin_l1.c
 *
 * Inverse binDCT derived from Loeffler's factorization:
 * Var. 3: 16 shifts, 34 adds.
 * All right shifts are intended to act as floor operations.
 */

/*
 ************************************************
 *
 * $Log: jidct_bin_l1.c,v $
 * Revision 1.1  2000/07/23 15:38:13  jliang
 * Initial revision
 *
 * Revision 1.1  2000/06/26 01:06:44  jliang
 * Initial revision
 *
 *
 *
 ************************************************
 */

/*
**********************************************************************
*
* Modification History:
* Date       Programmer   Description
* --------   ----------   --------------------------------------------
*
***********************************************************************
*/

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

#ifdef DCT_BIN_L1_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 * The transform code in this file is fully unrolled and addresses exactly
 * eight coefficients per row and per column, so any other block size is
 * rejected at compile time by the intentionally invalid line below.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif


/* Jie 07/07/00: lossless binDCT flag, defined in djpeg.
 * jpeg_idct_bin_l1() reads this flag on every call: when it is zero the
 * dequantizing path with deferred butterfly scaling is used; when it is
 * nonzero the lossless path is used (coefficients are taken as-is and every
 * butterfly halves one operand immediately).
 */
extern boolean lossless_codec;


/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 *
 * NOTE: nothing in this file currently references MULTIPLY; the binDCT
 * arithmetic in jpeg_idct_bin_l1() does not go through this macro.
 */

#if BITS_IN_JSAMPLE == 8
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const)  ((var) * (const))
#endif


/* Dequantize a coefficient by multiplying it by the multiplier-table
 * entry; produce an int result.  In this module, both inputs and result
 * are 16 bits or less, so either int or short multiply will work.
 *
 * Only the non-lossless column pass of jpeg_idct_bin_l1() uses this macro;
 * the lossless path reads the coefficients without dequantizing them.
 */

#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))


/*
 * Perform (optional) dequantization and the inverse binDCT on one 8x8 block
 * of coefficients.
 *
 * Parameters:
 *   cinfo       decompressor state; used only to obtain the range-limit
 *               table via IDCT_range_limit().
 *   compptr     component info; its dct_table supplies the dequantization
 *               multipliers (read only when lossless_codec is zero).
 *   coef_block  the 64 input coefficients; column c, row r is read as
 *               coef_block[DCTSIZE*r + c].
 *   output_buf  array of output sample rows.
 *   output_col  offset of the first output sample within each row; eight
 *               samples are written to each of output_buf[0..7].
 *
 * The transform is done in two passes (columns into a workspace, then rows
 * into the output) and has two variants selected by the global
 * lossless_codec flag:
 *
 *   lossless_codec == 0: coefficients are dequantized, plain (unscaled)
 *     butterflies are used, and the accumulated scaling is removed at the
 *     very end with DESCALE(x, 4).
 *   lossless_codec != 0: coefficients are used as-is and every butterfly
 *     halves one operand immediately, so no final descale is applied.
 *
 * Implementation notes:
 *   - The lifting coefficients (3/8, 7/16, 3/32, 3/16, 1/8, 5/16, 9/16, 1/4)
 *     are applied as "multiply by a small integer, then shift right".  The
 *     small multiplies replace the former "(x << k) +/- x" expressions,
 *     because left-shifting a negative signed value is undefined behavior
 *     in C; for in-range values the results are the same.
 *   - Plain ">>" is applied to possibly negative values.  The file header
 *     requires floor behavior for right shifts, which is
 *     implementation-defined in C, so this code relies on the compiler
 *     using an arithmetic right shift.
 *   - A diagnostic line is written to stderr whenever a pre-descale output
 *     value falls outside [-4096, 4096].
 */

GLOBAL(void)
jpeg_idct_bin_l1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13;
  JCOEFPTR inptr;
  ISLOW_MULT_TYPE * quantptr;
  int * wsptr;
  JSAMPROW outptr;
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  int ctr;
  int workspace[DCTSIZE2];      /* buffers data between passes */
  SHIFT_TEMPS

  if (!lossless_codec) {
    /*
     * Case 1: lossless binDCT not required.  All scaling caused by the
     * butterflies is removed at the last stage of pass 2.
     */

    /* Pass 1: process columns from input, store into work array. */

    inptr = coef_block;
    quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    wsptr = workspace;
    for (ctr = DCTSIZE; ctr > 0; ctr--) {
      /* Due to quantization, many input coefficients are usually zero,
       * especially the AC terms.  When every AC term of a column is zero
       * the column transform is skipped and half of the dequantized DC
       * value is stored in all eight workspace entries of that column.
       */
      if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
          inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
          inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
          inptr[DCTSIZE*7] == 0) {
        int dc_half = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) >> 1;

        wsptr[DCTSIZE*0] = dc_half;
        wsptr[DCTSIZE*1] = dc_half;
        wsptr[DCTSIZE*2] = dc_half;
        wsptr[DCTSIZE*3] = dc_half;
        wsptr[DCTSIZE*4] = dc_half;
        wsptr[DCTSIZE*5] = dc_half;
        wsptr[DCTSIZE*6] = dc_half;
        wsptr[DCTSIZE*7] = dc_half;

        inptr++;                /* advance pointers to next column */
        quantptr++;
        wsptr++;
        continue;
      }

      /* Load and dequantize.  Note the input ordering: tmp5/tmp6 take
       * rows 3/5, which differs from Chen's factorization.
       */
      tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
      tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
      tmp2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
      tmp3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
      tmp4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
      tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
      tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
      tmp7 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);

      /* Even part. */

      /* X[0] and X[4] */
      tmp11 = (tmp0 >> 1) - tmp1;
      tmp10 = tmp0 - tmp11;

      /* X[6] and X[2]: lifting steps 3/8, 7/16 */
      tmp13 = tmp3 + ((tmp2 * 3) >> 3);
      tmp12 = ((tmp13 * 7) >> 4) - tmp2;

      tmp0 = tmp10 + tmp13;
      tmp3 = tmp10 - tmp13;
      tmp1 = tmp11 + tmp12;
      tmp2 = tmp11 - tmp12;

      /* Odd part. */

      /* last lift between X[7] and X[1] */
      tmp4 = (tmp7 >> 1) - tmp4;
      tmp7 = tmp7 - tmp4;

      /* two intermediate butterflies */
      tmp10 = tmp4 + tmp6;
      tmp11 = tmp7 - tmp5;
      tmp12 = tmp4 - tmp6;
      tmp13 = tmp5 + tmp7;

      /* pi/16 rotation as lifting steps: 3/32, -3/16, 1/8 */
      tmp12 = ((tmp11 * 3) >> 5) + tmp12;
      tmp5  = tmp11 - ((tmp12 * 3) >> 4);
      tmp6  = (tmp5 >> 3) + tmp12;

      /* 3pi/16 rotation as lifting steps: 5/16, -9/16, 1/4 */
      tmp13 = ((tmp10 * 5) >> 4) + tmp13;
      tmp4  = tmp10 - ((tmp13 * 9) >> 4);
      tmp7  = (tmp4 >> 2) + tmp13;

      /* last stage: butterflies combining even and odd parts */
      wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
      wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
      wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
      wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
      wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
      wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
      wsptr[DCTSIZE*3] = (int) (tmp3 + tmp4);
      wsptr[DCTSIZE*4] = (int) (tmp3 - tmp4);

      inptr++;                  /* advance pointers to next column */
      quantptr++;
      wsptr++;
    }

    /* Pass 2: process rows from work array, store into output array.
     * The scaling accumulated by the butterflies is removed here with
     * DESCALE(x, 4) just before range-limiting.
     */

    wsptr = workspace;
    for (ctr = 0; ctr < DCTSIZE; ctr++) {
      outptr = output_buf[ctr] + output_col;

#ifndef NO_ZERO_ROW_TEST
      /* Rows whose AC terms are all zero are short-circuited the same way
       * as columns.  Define NO_ZERO_ROW_TEST to compile this test out.
       */
      if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
          wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
        /* With all AC terms zero every output equals DC/2; together with
         * the butterfly descale of 16 the total descale is 32 == 2**5.
         */
        JSAMPLE dc_sample = range_limit[(int) DESCALE((INT32) wsptr[0], 5)
                                        & RANGE_MASK];

        outptr[0] = dc_sample;
        outptr[1] = dc_sample;
        outptr[2] = dc_sample;
        outptr[3] = dc_sample;
        outptr[4] = dc_sample;
        outptr[5] = dc_sample;
        outptr[6] = dc_sample;
        outptr[7] = dc_sample;

        wsptr += DCTSIZE;       /* advance pointer to next row */
        continue;
      }
#endif

      /* Even part. */

      /* X[0] and X[4] */
      tmp11 = (wsptr[0] >> 1) - wsptr[4];
      tmp10 = wsptr[0] - tmp11;

      /* X[6] and X[2]: lifting steps 3/8, 7/16 */
      tmp13 = wsptr[2] + ((wsptr[6] * 3) >> 3);
      tmp12 = ((tmp13 * 7) >> 4) - wsptr[6];

      tmp0 = tmp10 + tmp13;
      tmp3 = tmp10 - tmp13;
      tmp1 = tmp11 + tmp12;
      tmp2 = tmp11 - tmp12;

      /* Odd part. */

      /* last lift between X[7] and X[1] */
      tmp4 = (wsptr[1] >> 1) - wsptr[7];
      tmp7 = wsptr[1] - tmp4;

      /* input ordering differs from Chen's factorization */
      tmp5 = wsptr[3];
      tmp6 = wsptr[5];

      /* two intermediate butterflies */
      tmp10 = tmp4 + tmp6;
      tmp11 = tmp7 - tmp5;
      tmp12 = tmp4 - tmp6;
      tmp13 = tmp5 + tmp7;

      /* pi/16 rotation as lifting steps: 3/32, -3/16, 1/8 */
      tmp12 = ((tmp11 * 3) >> 5) + tmp12;
      tmp5  = tmp11 - ((tmp12 * 3) >> 4);
      tmp6  = (tmp5 >> 3) + tmp12;

      /* 3pi/16 rotation as lifting steps: 5/16, -9/16, 1/4 */
      tmp13 = ((tmp10 * 5) >> 4) + tmp13;
      tmp4  = tmp10 - ((tmp13 * 9) >> 4);
      tmp7  = (tmp4 >> 2) + tmp13;

      /* Final output stage: butterfly, descale by 2**4 and range-limit.
       * Each output pair is followed by a diagnostic on the pre-descale
       * values.
       */
      tmp10 = tmp0 + tmp7;
      tmp11 = tmp0 - tmp7;
      outptr[0] = range_limit[(int) DESCALE(tmp10, 4) & RANGE_MASK];
      outptr[7] = range_limit[(int) DESCALE(tmp11, 4) & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      tmp10 = tmp1 + tmp6;
      tmp11 = tmp1 - tmp6;
      outptr[1] = range_limit[(int) DESCALE(tmp10, 4) & RANGE_MASK];
      outptr[6] = range_limit[(int) DESCALE(tmp11, 4) & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      tmp10 = tmp2 + tmp5;
      tmp11 = tmp2 - tmp5;
      outptr[2] = range_limit[(int) DESCALE(tmp10, 4) & RANGE_MASK];
      outptr[5] = range_limit[(int) DESCALE(tmp11, 4) & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      tmp10 = tmp3 + tmp4;
      tmp11 = tmp3 - tmp4;
      outptr[3] = range_limit[(int) DESCALE(tmp10, 4) & RANGE_MASK];
      outptr[4] = range_limit[(int) DESCALE(tmp11, 4) & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      wsptr += DCTSIZE;         /* advance pointer to next row */
    }

  } else {
    /*
     * Case 2: lossless binDCT.  Coefficients are not dequantized and each
     * butterfly halves one operand immediately ("a + (b >> 1)" followed by
     * "result - b"), so no final descale is needed.
     *
     * The all-AC-zero shortcuts of case 1 are deliberately absent from both
     * passes.  Per the original author's note (Jie, 07/09/00): with these
     * butterflies the shortcut is invalid; for example the input
     * [0, 0, 1, 1, 1, 1, 0, 0] gives output [2, 0, 0, 0, 0, 0, 0, 0] if a
     * round operation is used for 1/2.
     */

    /* Pass 1: process columns from input, store into work array. */

    inptr = coef_block;
    wsptr = workspace;
    for (ctr = DCTSIZE; ctr > 0; ctr--) {
      /* Load raw coefficients.  tmp5/tmp6 take rows 3/5, which differs
       * from Chen's factorization.
       */
      tmp0 = inptr[DCTSIZE*0];
      tmp1 = inptr[DCTSIZE*4];
      tmp2 = inptr[DCTSIZE*6];
      tmp3 = inptr[DCTSIZE*2];
      tmp4 = inptr[DCTSIZE*7];
      tmp5 = inptr[DCTSIZE*3];
      tmp6 = inptr[DCTSIZE*5];
      tmp7 = inptr[DCTSIZE*1];

      /* Even part. */

      /* X[0] and X[4] */
      tmp10 = tmp0 + (tmp1 >> 1);
      tmp11 = tmp10 - tmp1;

      /* X[6] and X[2]: lifting steps 3/8, 7/16 */
      tmp13 = tmp3 + ((tmp2 * 3) >> 3);
      tmp12 = ((tmp13 * 7) >> 4) - tmp2;

      /* halving butterflies */
      tmp0 = tmp10 + (tmp13 >> 1);
      tmp3 = tmp0 - tmp13;

      tmp1 = tmp11 + (tmp12 >> 1);
      tmp2 = tmp1 - tmp12;

      /* Odd part. */

      /* X[7] and X[1]: butterfly */
      tmp7 = tmp7 + (tmp4 >> 1);
      tmp4 = tmp7 - tmp4;

      /* intermediate butterflies */
      tmp10 = tmp4 + (tmp6 >> 1);
      tmp12 = tmp10 - tmp6;

      tmp13 = (tmp5 >> 1) + tmp7;
      tmp11 = tmp13 - tmp5;

      /* pi/16 rotation as lifting steps: 3/32, -3/16, 1/8 */
      tmp12 = ((tmp11 * 3) >> 5) + tmp12;
      tmp5  = tmp11 - ((tmp12 * 3) >> 4);
      tmp6  = (tmp5 >> 3) + tmp12;

      /* 3pi/16 rotation as lifting steps: 5/16, -9/16, 1/4 */
      tmp13 = ((tmp10 * 5) >> 4) + tmp13;
      tmp4  = tmp10 - ((tmp13 * 9) >> 4);
      tmp7  = (tmp4 >> 2) + tmp13;

      /* last stage: halving butterflies; the second output of each pair
       * is derived from the first one just stored.
       */
      wsptr[DCTSIZE*0] = (int) (tmp0 + (tmp7 >> 1));
      wsptr[DCTSIZE*7] = (int) (wsptr[DCTSIZE*0] - tmp7);

      wsptr[DCTSIZE*1] = (int) (tmp1 + (tmp6 >> 1));
      wsptr[DCTSIZE*6] = (int) (wsptr[DCTSIZE*1] - tmp6);

      wsptr[DCTSIZE*2] = (int) (tmp2 + (tmp5 >> 1));
      wsptr[DCTSIZE*5] = (int) (wsptr[DCTSIZE*2] - tmp5);

      wsptr[DCTSIZE*3] = (int) (tmp3 + (tmp4 >> 1));
      wsptr[DCTSIZE*4] = (int) (wsptr[DCTSIZE*3] - tmp4);

      inptr++;                  /* advance pointers to next column */
      wsptr++;
    }

    /* Pass 2: process rows from work array, store into output array.
     * Results are range-limited directly, without any descale.
     */

    wsptr = workspace;
    for (ctr = 0; ctr < DCTSIZE; ctr++) {
      outptr = output_buf[ctr] + output_col;

      /* Even part. */

      /* X[0] and X[4] */
      tmp10 = wsptr[0] + (wsptr[4] >> 1);
      tmp11 = tmp10 - wsptr[4];

      /* X[6] and X[2]: lifting steps 3/8, 7/16 */
      tmp13 = wsptr[2] + ((wsptr[6] * 3) >> 3);
      tmp12 = ((tmp13 * 7) >> 4) - wsptr[6];

      /* halving butterflies */
      tmp0 = tmp10 + (tmp13 >> 1);
      tmp3 = tmp0 - tmp13;

      tmp1 = tmp11 + (tmp12 >> 1);
      tmp2 = tmp1 - tmp12;

      /* Odd part. */

      /* X[7] and X[1]: butterfly */
      tmp7 = wsptr[1] + (wsptr[7] >> 1);
      tmp4 = tmp7 - wsptr[7];

      tmp5 = wsptr[3];
      tmp6 = wsptr[5];

      /* intermediate butterflies */
      tmp10 = tmp4 + (tmp6 >> 1);
      tmp12 = tmp10 - tmp6;

      tmp13 = (tmp5 >> 1) + tmp7;
      tmp11 = tmp13 - tmp5;

      /* pi/16 rotation as lifting steps: 3/32, -3/16, 1/8 */
      tmp12 = ((tmp11 * 3) >> 5) + tmp12;
      tmp5  = tmp11 - ((tmp12 * 3) >> 4);
      tmp6  = (tmp5 >> 3) + tmp12;

      /* 3pi/16 rotation as lifting steps: 5/16, -9/16, 1/4 */
      tmp13 = ((tmp10 * 5) >> 4) + tmp13;
      tmp4  = tmp10 - ((tmp13 * 9) >> 4);
      tmp7  = (tmp4 >> 2) + tmp13;

      /* Final output stage: halving butterfly and range-limit.  Each
       * output pair is followed by a diagnostic on the computed values.
       */
      tmp10 = tmp0 + (tmp7 >> 1);
      tmp11 = tmp10 - tmp7;
      outptr[0] = range_limit[tmp10 & RANGE_MASK];
      outptr[7] = range_limit[tmp11 & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      tmp10 = tmp1 + (tmp6 >> 1);
      tmp11 = tmp10 - tmp6;
      outptr[1] = range_limit[tmp10 & RANGE_MASK];
      outptr[6] = range_limit[tmp11 & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      tmp10 = tmp2 + (tmp5 >> 1);
      tmp11 = tmp10 - tmp5;
      outptr[2] = range_limit[tmp10 & RANGE_MASK];
      outptr[5] = range_limit[tmp11 & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      tmp10 = tmp3 + (tmp4 >> 1);
      tmp11 = tmp10 - tmp4;
      outptr[3] = range_limit[tmp10 & RANGE_MASK];
      outptr[4] = range_limit[tmp11 & RANGE_MASK];

      if (tmp10 > 4096 || tmp10 < -4096 || tmp11 > 4096 || tmp11 < -4096) {
        fprintf(stderr,"Possible IDCT overflow!\n");
      }

      wsptr += DCTSIZE;         /* advance pointer to next row */
    }
  }
}



#endif /* DCT_BIN_L1_SUPPORTED */
