/*
 * jidct_bin_c4.c
 *
 * Variation 4 of the binDCT from Chen's factorization (see the SPIE paper):
 * Cost:        19 shifts, 37 Adds. 
 * Coding Gain: 8.8220 dB, 
 * MSE:         8.49E-5.
 *
 * Floor operations are used for all right-shifts (>>).
 *
 */

/*
 ************************************************
 *
 * $Log: jidct_bin_b1.c,v $
 * Revision 1.1  2000/07/23 15:38:08  jliang
 * Initial revision
 *
 * Revision 1.1  2000/06/26 01:06:44  jliang
 * Initial revision
 *
 *
 *
 ************************************************
 */

/*
**********************************************************************
*
* Modification History:
* Date       Programmer   Description
* --------   ----------   --------------------------------------------
*
***********************************************************************
*/

/*
**********************************************************************
*
* Modification History FOR H.263
* Date       Programmer   Description
* --------   ----------   --------------------------------------------
* 10/04/2000 
* . Delete range limit for H.263, since no shift by 128. 
* . add "return 0";
* . 2nd pass: outptr = output_buf; and outptr += DCTSIZE;
* . final result: change outptr[x] = (char)DESCALE(tmp10, 4) to "DESCALE(tmp10, 4)".
* . In lossless, remove rangelimit.
*
*   Modification for H.263 decoder:
*   10/14/2000
*   1: return type change to void
*   2: input parameter only 1, "short *" now.
*   3: set outptr = coef_block; (both lossless and lossy cases)
*   4: delete "return 0".
*   5: change inptr and outptr to "short *".
*
*
***********************************************************************
*/

#include "config.h" 
#include "tmndec.h" 
#include "global.h" 

extern int lossless_codec;

/*
 * Lossy inverse binDCT-C4 of one 8x8 block, performed in place.
 *
 * The scaling introduced by the butterflies is not undone stage by stage;
 * it is removed once, by the DESCALE() calls at the end of the second pass.
 *
 * coef_block: 64 coefficients, read column-wise in pass 1 and overwritten
 *             row by row with the result in pass 2.
 *
 * Notes:
 *  - The code indexes rows/columns 0..7 explicitly, so it assumes
 *    DCTSIZE == 8 and DCTSIZE2 == 64 (both come from the project headers).
 *  - The dyadic lifting multipliers (3/8, 7/16, 3/16, 5/8, 5/16) are written
 *    as "(k * x) >> n".  The earlier "(x << m) +/- x" form left-shifted
 *    negative values, which is undefined behaviour in C; the products are
 *    identical wherever that form was defined.
 *  - Right shifts of negative values are expected to behave as floor
 *    operations (arithmetic shift); this is implementation-defined in C.
 *  - DESCALE() is supplied by the project headers; presumably a rounding
 *    right shift -- confirm against its definition.
 */
static void d_idct_bin_c4_lossy(short *coef_block)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13;
  short *inptr;
  int *wsptr;
  short *outptr;
  int ctr;
  int workspace[DCTSIZE2];	/* buffers data between passes */
  int dcval;

  /* Pass 1: process columns from input, store into work array. */

  inptr = coef_block;
  wsptr = workspace;
  for (ctr = DCTSIZE; ctr > 0; ctr--) {

    /*
     * Shortcut for a column whose AC terms are all zero.
     * The shortcut in the 2nd pass descales by 32, so here the DC value is
     * only descaled by 2; the overall descale is still 64.
     */
    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
        inptr[DCTSIZE*7] == 0) {
      dcval = inptr[DCTSIZE*0] >> 1;

      wsptr[DCTSIZE*0] = dcval;
      wsptr[DCTSIZE*1] = dcval;
      wsptr[DCTSIZE*2] = dcval;
      wsptr[DCTSIZE*3] = dcval;
      wsptr[DCTSIZE*4] = dcval;
      wsptr[DCTSIZE*5] = dcval;
      wsptr[DCTSIZE*6] = dcval;
      wsptr[DCTSIZE*7] = dcval;

      inptr++;			/* advance pointers to next column */
      wsptr++;
      continue;
    }

    tmp0 = inptr[DCTSIZE*0];
    tmp1 = inptr[DCTSIZE*4];
    tmp2 = inptr[DCTSIZE*6];
    tmp3 = inptr[DCTSIZE*2];
    tmp4 = inptr[DCTSIZE*7];
    tmp5 = inptr[DCTSIZE*5];
    tmp6 = inptr[DCTSIZE*3];
    tmp7 = inptr[DCTSIZE*1];

    /* X[0] and X[4] */
    tmp11 = (tmp0 >> 1) - tmp1;
    tmp10 = tmp0 - tmp11;

    /* X[6] and X[2]: 3/8, 7/16 */
    tmp13 = tmp3 + ((3 * tmp2) >> 3);
    tmp12 = ((7 * tmp13) >> 4) - tmp2;

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    /* X[7] and X[1]: 7pi/16 = 3/16d 3/16u */
    tmp13 = tmp7 + ((3 * tmp4) >> 4);
    tmp10 = ((3 * tmp13) >> 4) - tmp4;

    /* X[5] and X[3]: new 7/16 and -5/8 */
    tmp12 = tmp6 + ((7 * tmp5) >> 4);
    tmp11 = tmp5 - ((5 * tmp12) >> 3);

    /* Butterfly */
    tmp4 = tmp10 + tmp11;
    tmp5 = tmp10 - tmp11;
    tmp6 = tmp13 - tmp12;
    tmp7 = tmp13 + tmp12;

    /* pi/4 = -3/8u -11/16d 7/16u (each step uses the value just updated) */
    tmp5 = ((3 * tmp6) >> 3) - tmp5;
    tmp6 = tmp6 - tmp5 + ((5 * tmp5) >> 4);
    tmp5 = tmp5 + ((7 * tmp6) >> 4);

    /* last stage: butterfly */
    wsptr[DCTSIZE*0] = (tmp0 + tmp7);
    wsptr[DCTSIZE*7] = (tmp0 - tmp7);
    wsptr[DCTSIZE*1] = (tmp1 + tmp6);
    wsptr[DCTSIZE*6] = (tmp1 - tmp6);
    wsptr[DCTSIZE*2] = (tmp2 + tmp5);
    wsptr[DCTSIZE*5] = (tmp2 - tmp5);
    wsptr[DCTSIZE*3] = (tmp3 + tmp4);
    wsptr[DCTSIZE*4] = (tmp3 - tmp4);

    inptr++;			/* advance pointers to next column */
    wsptr++;
  }

  /*
   * Pass 2: process rows from work array, write the result back into
   * coef_block.  The accumulated butterfly scaling is removed here.
   */

  wsptr = workspace;
  outptr = coef_block;

  for (ctr = 0; ctr < DCTSIZE; ctr++) {

    /* Shortcut for a row whose AC terms are all zero. */
    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {

      /*
       * If all AC are 0, every output equals 1/2 DC, so downscale by 2;
       * then apply the downscale of 16 caused by the butterflies,
       * so the total downscale is 32.
       */
      dcval = DESCALE(wsptr[0], 5);
      outptr[0] = dcval;
      outptr[1] = dcval;
      outptr[2] = dcval;
      outptr[3] = dcval;
      outptr[4] = dcval;
      outptr[5] = dcval;
      outptr[6] = dcval;
      outptr[7] = dcval;

      wsptr += DCTSIZE;		/* advance pointers to next row */
      outptr += DCTSIZE;
      continue;
    }

    /* X[0] and X[4] */
    tmp11 = (wsptr[0] >> 1) - wsptr[4];
    tmp10 = wsptr[0] - tmp11;

    /* X[6] and X[2]: 3/8, 7/16 */
    tmp13 = wsptr[2] + ((3 * wsptr[6]) >> 3);
    tmp12 = ((7 * tmp13) >> 4) - wsptr[6];

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    /* X[7] and X[1]: 7pi/16 = 3/16d 3/16u */
    tmp13 = wsptr[1] + ((3 * wsptr[7]) >> 4);
    tmp10 = ((3 * tmp13) >> 4) - wsptr[7];

    /* X[5] and X[3]: new 7/16 and -5/8 */
    tmp12 = wsptr[3] + ((7 * wsptr[5]) >> 4);
    tmp11 = wsptr[5] - ((5 * tmp12) >> 3);

    /* Butterfly */
    tmp4 = tmp10 + tmp11;
    tmp5 = tmp10 - tmp11;
    tmp6 = tmp13 - tmp12;
    tmp7 = tmp13 + tmp12;

    /* pi/4 = -3/8u -11/16d 7/16u */
    tmp5 = ((3 * tmp6) >> 3) - tmp5;
    tmp6 = tmp6 - tmp5 + ((5 * tmp5) >> 4);
    tmp5 = tmp5 + ((7 * tmp6) >> 4);

    /*
     * Final output stage: last butterfly, then descale by 4 bits.
     * No range limiting is applied (removed for H.263, see file header).
     */
    tmp10 = (tmp0 + tmp7);
    tmp11 = (tmp0 - tmp7);
    outptr[0] = DESCALE(tmp10, 4);
    outptr[7] = DESCALE(tmp11, 4);

    tmp10 = (tmp1 + tmp6);
    tmp11 = (tmp1 - tmp6);
    outptr[1] = DESCALE(tmp10, 4);
    outptr[6] = DESCALE(tmp11, 4);

    tmp10 = (tmp2 + tmp5);
    tmp11 = (tmp2 - tmp5);
    outptr[2] = DESCALE(tmp10, 4);
    outptr[5] = DESCALE(tmp11, 4);

    tmp10 = (tmp3 + tmp4);
    tmp11 = (tmp3 - tmp4);
    outptr[3] = DESCALE(tmp10, 4);
    outptr[4] = DESCALE(tmp11, 4);

    wsptr += DCTSIZE;		/* advance pointers to next row */
    outptr += DCTSIZE;
  }
}

/*
 * Lossless inverse binDCT-C4 of one 8x8 block, performed in place.
 *
 * Every butterfly is replaced by a lifting pair of the form
 *     a = x + (y >> 1);  b = a - y;
 * so the descale by 2 happens immediately after each inverse butterfly and
 * no final DESCALE() is needed.  There are no all-zero-AC shortcuts here.
 *
 * coef_block: 64 coefficients, read column-wise in pass 1 and overwritten
 *             row by row with the result in pass 2.
 *
 * The same assumptions as in the lossy path apply: DCTSIZE == 8, and right
 * shifts of negative values act as floor operations.  The dyadic multipliers
 * are written as "(k * x) >> n" to avoid left-shifting negative values.
 */
static void d_idct_bin_c4_lossless(short *coef_block)
{
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13;
  short *inptr;
  int *wsptr;
  short *outptr;
  int ctr;
  int workspace[DCTSIZE2];	/* buffers data between passes */

  /* Pass 1: process columns from input, store into work array. */

  inptr = coef_block;
  wsptr = workspace;
  for (ctr = DCTSIZE; ctr > 0; ctr--) {

    tmp0 = inptr[DCTSIZE*0];
    tmp1 = inptr[DCTSIZE*4];
    tmp2 = inptr[DCTSIZE*6];
    tmp3 = inptr[DCTSIZE*2];
    tmp4 = inptr[DCTSIZE*7];
    tmp5 = inptr[DCTSIZE*5];
    tmp6 = inptr[DCTSIZE*3];
    tmp7 = inptr[DCTSIZE*1];

    /* X[0] and X[4] */
    tmp10 = tmp0 + (tmp1 >> 1);
    tmp11 = tmp10 - tmp1;

    /* X[6] and X[2]: 3/8, 7/16 */
    tmp13 = tmp3 + ((3 * tmp2) >> 3);
    tmp12 = ((7 * tmp13) >> 4) - tmp2;

    /* lossless binDCT: lifting butterflies */
    tmp0 = tmp10 + (tmp13 >> 1);
    tmp3 = tmp0 - tmp13;

    tmp1 = tmp11 + (tmp12 >> 1);
    tmp2 = tmp1 - tmp12;

    /* X[7] and X[1]: 7pi/16 = 3/16d 3/16u */
    tmp13 = tmp7 + ((3 * tmp4) >> 4);
    tmp10 = ((3 * tmp13) >> 4) - tmp4;

    /* X[5] and X[3]: new 7/16 and -5/8 */
    tmp12 = tmp6 + ((7 * tmp5) >> 4);
    tmp11 = tmp5 - ((5 * tmp12) >> 3);

    /* lossless binDCT: lifting butterflies */
    tmp4 = tmp10 + (tmp11 >> 1);
    tmp5 = tmp4 - tmp11;

    tmp7 = tmp13 + (tmp12 >> 1);
    tmp6 = tmp7 - tmp12;

    /* pi/4 = -3/8u -11/16d 7/16u (each step uses the value just updated) */
    tmp5 = ((3 * tmp6) >> 3) - tmp5;
    tmp6 = tmp6 - tmp5 + ((5 * tmp5) >> 4);
    tmp5 = tmp5 + ((7 * tmp6) >> 4);

    /* last stage: lifting butterfly; the second output reuses the first */
    wsptr[DCTSIZE*0] = tmp0 + (tmp7 >> 1);
    wsptr[DCTSIZE*7] = wsptr[DCTSIZE*0] - tmp7;

    wsptr[DCTSIZE*1] = tmp1 + (tmp6 >> 1);
    wsptr[DCTSIZE*6] = wsptr[DCTSIZE*1] - tmp6;

    wsptr[DCTSIZE*2] = tmp2 + (tmp5 >> 1);
    wsptr[DCTSIZE*5] = wsptr[DCTSIZE*2] - tmp5;

    wsptr[DCTSIZE*3] = tmp3 + (tmp4 >> 1);
    wsptr[DCTSIZE*4] = wsptr[DCTSIZE*3] - tmp4;

    inptr++;			/* advance pointers to next column */
    wsptr++;
  }

  /* Pass 2: process rows from work array, write back into coef_block. */

  wsptr = workspace;
  outptr = coef_block;
  for (ctr = 0; ctr < DCTSIZE; ctr++) {

    /* X[0] and X[4] */
    tmp10 = wsptr[0] + (wsptr[4] >> 1);
    tmp11 = tmp10 - wsptr[4];

    /* X[6] and X[2]: 3/8, 7/16 */
    tmp13 = wsptr[2] + ((3 * wsptr[6]) >> 3);
    tmp12 = ((7 * tmp13) >> 4) - wsptr[6];

    /* lossless binDCT: lifting butterflies */
    tmp0 = tmp10 + (tmp13 >> 1);
    tmp3 = tmp0 - tmp13;

    tmp1 = tmp11 + (tmp12 >> 1);
    tmp2 = tmp1 - tmp12;

    /* X[7] and X[1]: 7pi/16 = 3/16d 3/16u */
    tmp13 = wsptr[1] + ((3 * wsptr[7]) >> 4);
    tmp10 = ((3 * tmp13) >> 4) - wsptr[7];

    /* X[5] and X[3]: new 7/16 and -5/8 */
    tmp12 = wsptr[3] + ((7 * wsptr[5]) >> 4);
    tmp11 = wsptr[5] - ((5 * tmp12) >> 3);

    /* lossless binDCT: lifting butterflies */
    tmp4 = tmp10 + (tmp11 >> 1);
    tmp5 = tmp4 - tmp11;

    tmp7 = tmp13 + (tmp12 >> 1);
    tmp6 = tmp7 - tmp12;

    /* pi/4 = -3/8u -11/16d 7/16u */
    tmp5 = ((3 * tmp6) >> 3) - tmp5;
    tmp6 = tmp6 - tmp5 + ((5 * tmp5) >> 4);
    tmp5 = tmp5 + ((7 * tmp6) >> 4);

    /*
     * Final output stage: last lifting butterfly, stored without any
     * further descaling or range limiting.
     */
    tmp10 = tmp0 + (tmp7 >> 1);
    tmp11 = tmp10 - tmp7;
    outptr[0] = tmp10;
    outptr[7] = tmp11;

    tmp10 = tmp1 + (tmp6 >> 1);
    tmp11 = (tmp10 - tmp6);
    outptr[1] = tmp10;
    outptr[6] = tmp11;

    tmp10 = tmp2 + (tmp5 >> 1);
    tmp11 = (tmp10 - tmp5);
    outptr[2] = tmp10;
    outptr[5] = tmp11;

    tmp10 = tmp3 + (tmp4 >> 1);
    tmp11 = (tmp10 - tmp4);
    outptr[3] = tmp10;
    outptr[4] = tmp11;

    wsptr += DCTSIZE;		/* advance pointers to next row */
    outptr += DCTSIZE;
  }
}

/*
 * Inverse binDCT (variation C4) of one 8x8 coefficient block, in place.
 *
 * coef_block: 64 coefficients on entry; overwritten with the inverse
 *             transform result on return.
 *
 * Dispatches on the global lossless_codec flag:
 *   zero     -> lossy path, butterfly scaling removed at the last stage;
 *   non-zero -> lossless path, descale by 2 after every inverse butterfly.
 */
void d_idct_bin_c4(short *coef_block)
{
  if (!lossless_codec) {
    d_idct_bin_c4_lossy(coef_block);
  } else {
    d_idct_bin_c4_lossless(coef_block);
  }
}

