Blame - third_party/libjpeg/jfdctint.c - RealtimeRoboticsGroup/test

blob: 1dde58c499d79e67fc0b3a8124d189931ea00255 [file] [log] [blame]

Parker Schuh	ebf887e	2016-01-10 18:04:04 -0800	[diff] [blame]	1	/*
				2	* jfdctint.c
				3	*
				4	* Copyright (C) 1991-1996, Thomas G. Lane.
				5	* Modification developed 2003-2009 by Guido Vollbeding.
				6	* This file is part of the Independent JPEG Group's software.
				7	* For conditions of distribution and use, see the accompanying README file.
				8	*
				9	* This file contains a slow-but-accurate integer implementation of the
				10	* forward DCT (Discrete Cosine Transform).
				11	*
				12	* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
				13	* on each column. Direct algorithms are also available, but they are
				14	* much more complex and seem not to be any faster when reduced to code.
				15	*
				16	* This implementation is based on an algorithm described in
				17	* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
				18	* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
				19	* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
				20	* The primary algorithm described there uses 11 multiplies and 29 adds.
				21	* We use their alternate method with 12 multiplies and 32 adds.
				22	* The advantage of this method is that no data path contains more than one
				23	* multiplication; this allows a very simple and accurate implementation in
				24	* scaled fixed-point arithmetic, with a minimal number of shifts.
				25	*
				26	* We also provide FDCT routines with various input sample block sizes for
				27	* direct resolution reduction or enlargement and for direct resolving the
				28	* common 2x1 and 1x2 subsampling cases without additional resampling: NxN
				29	* (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
				30	*
				31	* For N<8 we fill the remaining block coefficients with zero.
				32	* For N>8 we apply a partial N-point FDCT on the input samples, computing
				33	* just the lower 8 frequency coefficients and discarding the rest.
				34	*
				35	* We must scale the output coefficients of the N-point FDCT appropriately
				36	* to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
				37	* is folded into the constant multipliers (pass 2) and/or final/initial
				38	* shifting.
				39	*
				40	* CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
				41	* since there would be too many additional constants to pre-calculate.
				42	*/
				43
				44	#define JPEG_INTERNALS
				45	#include "jinclude.h"
				46	#include "jpeglib.h"
				47	#include "jdct.h" /* Private declarations for DCT subsystem */
				48
				49	#ifdef DCT_ISLOW_SUPPORTED
				50
				51
				52	/*
				53	* This module is specialized to the case DCTSIZE = 8.
				54	*/
				55
				56	#if DCTSIZE != 8
				57	Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
				58	#endif
				59
				60
				/*
				 * The poop on this scaling stuff is as follows:
				 *
				 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
				 * larger than the true DCT outputs.  The final outputs are therefore
				 * a factor of N larger than desired; since N=8 this can be cured by
				 * a simple right shift at the end of the algorithm.  The advantage of
				 * this arrangement is that we save two multiplications per 1-D DCT,
				 * because the y0 and y4 outputs need not be divided by sqrt(N).
				 * In the IJG code, this factor of 8 is removed by the quantization step
				 * (in jcdctmgr.c), NOT in this module.
				 *
				 * We have to do addition and subtraction of the integer inputs, which
				 * is no problem, and multiplication by fractional constants, which is
				 * a problem to do in integer arithmetic.  We multiply all the constants
				 * by CONST_SCALE and convert them to integer constants (thus retaining
				 * CONST_BITS bits of precision in the constants).  After doing a
				 * multiplication we have to divide the product by CONST_SCALE, with proper
				 * rounding, to produce the correct output.  This division can be done
				 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
				 * as long as possible so that partial sums can be added together with
				 * full fractional precision.
				 *
				 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
				 * they are represented to better-than-integral precision.  These outputs
				 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
				 * with the recommended scaling.  (For 12-bit sample data, the intermediate
				 * array is INT32 anyway.)
				 *
				 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
				 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
				 * shows that the values given below are the most effective.
				 */

				/* CONST_BITS: fractional bits kept in the fixed-point multiplier constants.
				 * PASS1_BITS: extra fractional bits carried by the pass-1 (row) outputs.
				 */
				#if BITS_IN_JSAMPLE == 8
				#define CONST_BITS  13
				#define PASS1_BITS  2
				#else
				#define CONST_BITS  13
				#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
				#endif
				102
				/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
				 * causing a lot of useless floating-point operations at run time.
				 * To get around this we use the following pre-calculated constants.
				 * If you change CONST_BITS you may want to add appropriate values.
				 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
				 *
				 * Each name spells the real value it stands for; the integers in the
				 * CONST_BITS == 13 branch are those values multiplied by 2**13 and rounded.
				 */

				#if CONST_BITS == 13
				#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
				#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
				#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
				#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
				#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
				#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
				#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
				#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
				#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
				#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
				#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
				#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
				#else
				#define FIX_0_298631336  FIX(0.298631336)
				#define FIX_0_390180644  FIX(0.390180644)
				#define FIX_0_541196100  FIX(0.541196100)
				#define FIX_0_765366865  FIX(0.765366865)
				#define FIX_0_899976223  FIX(0.899976223)
				#define FIX_1_175875602  FIX(1.175875602)
				#define FIX_1_501321110  FIX(1.501321110)
				#define FIX_1_847759065  FIX(1.847759065)
				#define FIX_1_961570560  FIX(1.961570560)
				#define FIX_2_053119869  FIX(2.053119869)
				#define FIX_2_562915447  FIX(2.562915447)
				#define FIX_3_072711026  FIX(3.072711026)
				#endif
				137
				138
				/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
				 * For 8-bit samples with the recommended scaling, all the variable
				 * and constant values involved are no more than 16 bits wide, so a
				 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
				 * For 12-bit samples, a full 32-bit multiplication will be needed.
				 *
				 * The second parameter is spelled "cnst" rather than "const" so that the
				 * macro does not reuse a C keyword as a parameter name.
				 */

				#if BITS_IN_JSAMPLE == 8
				#define MULTIPLY(var,cnst)  MULTIPLY16C16(var,cnst)
				#else
				#define MULTIPLY(var,cnst)  ((var) * (cnst))
				#endif
				151
				152
				153	/*
				154	* Perform the forward DCT on one block of samples.
				155	*/
				156
				157	GLOBAL(void)
				158	jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				159	{
				160	INT32 tmp0, tmp1, tmp2, tmp3;
				161	INT32 tmp10, tmp11, tmp12, tmp13;
				162	INT32 z1;
				163	DCTELEM *dataptr;
				164	JSAMPROW elemptr;
				165	int ctr;
				166	SHIFT_TEMPS
				167
				168	/* Pass 1: process rows. */
				169	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				170	/* furthermore, we scale the results by 2*PASS1_BITS. /
				171
				172	dataptr = data;
				173	for (ctr = 0; ctr < DCTSIZE; ctr++) {
				174	elemptr = sample_data[ctr] + start_col;
				175
				176	/* Even part per LL&M figure 1 --- note that published figure is faulty;
				177	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
				178	*/
				179
				180	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
				181	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
				182	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
				183	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
				184
				185	tmp10 = tmp0 + tmp3;
				186	tmp12 = tmp0 - tmp3;
				187	tmp11 = tmp1 + tmp2;
				188	tmp13 = tmp1 - tmp2;
				189
				190	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
				191	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
				192	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
				193	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
				194
				195	/* Apply unsigned->signed conversion */
				196	dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
				197	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
				198
				199	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
				200	/* Add fudge factor here for final descale. */
				201	z1 += ONE << (CONST_BITS-PASS1_BITS-1);
				202	dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				203	CONST_BITS-PASS1_BITS);
				204	dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				205	CONST_BITS-PASS1_BITS);
				206
				207	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
				208	* cK represents sqrt(2) * cos(K*pi/16).
				209	* i0..i3 in the paper are tmp0..tmp3 here.
				210	*/
				211
				212	tmp10 = tmp0 + tmp3;
				213	tmp11 = tmp1 + tmp2;
				214	tmp12 = tmp0 + tmp2;
				215	tmp13 = tmp1 + tmp3;
				216	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
				217	/* Add fudge factor here for final descale. */
				218	z1 += ONE << (CONST_BITS-PASS1_BITS-1);
				219
				220	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
				221	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
				222	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
				223	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
				224	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
				225	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
				226	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
				227	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
				228
				229	tmp12 += z1;
				230	tmp13 += z1;
				231
				232	dataptr[1] = (DCTELEM)
				233	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
				234	dataptr[3] = (DCTELEM)
				235	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
				236	dataptr[5] = (DCTELEM)
				237	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
				238	dataptr[7] = (DCTELEM)
				239	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
				240
				241	dataptr += DCTSIZE; /* advance pointer to next row */
				242	}
				243
				244	/* Pass 2: process columns.
				245	* We remove the PASS1_BITS scaling, but leave the results scaled up
				246	* by an overall factor of 8.
				247	*/
				248
				249	dataptr = data;
				250	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				251	/* Even part per LL&M figure 1 --- note that published figure is faulty;
				252	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
				253	*/
				254
				255	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
				256	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
				257	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
				258	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
				259
				260	/* Add fudge factor here for final descale. */
				261	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
				262	tmp12 = tmp0 - tmp3;
				263	tmp11 = tmp1 + tmp2;
				264	tmp13 = tmp1 - tmp2;
				265
				266	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
				267	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
				268	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
				269	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
				270
				271	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
				272	dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
				273
				274	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
				275	/* Add fudge factor here for final descale. */
				276	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
				277	dataptr[DCTSIZE*2] = (DCTELEM)
				278	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
				279	dataptr[DCTSIZE*6] = (DCTELEM)
				280	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
				281
				282	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
				283	* cK represents sqrt(2) * cos(K*pi/16).
				284	* i0..i3 in the paper are tmp0..tmp3 here.
				285	*/
				286
				287	tmp10 = tmp0 + tmp3;
				288	tmp11 = tmp1 + tmp2;
				289	tmp12 = tmp0 + tmp2;
				290	tmp13 = tmp1 + tmp3;
				291	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
				292	/* Add fudge factor here for final descale. */
				293	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
				294
				295	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
				296	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
				297	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
				298	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
				299	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
				300	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
				301	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
				302	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
				303
				304	tmp12 += z1;
				305	tmp13 += z1;
				306
				307	dataptr[DCTSIZE*1] = (DCTELEM)
				308	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
				309	dataptr[DCTSIZE*3] = (DCTELEM)
				310	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
				311	dataptr[DCTSIZE*5] = (DCTELEM)
				312	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
				313	dataptr[DCTSIZE*7] = (DCTELEM)
				314	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
				315
				316	dataptr++; /* advance pointer to next column */
				317	}
				318	}
				319
				320	#ifdef DCT_SCALING_SUPPORTED
				321
				322
				323	/*
				324	* Perform the forward DCT on a 7x7 sample block.
				325	*/
				326
				327	GLOBAL(void)
				328	jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				329	{
				330	INT32 tmp0, tmp1, tmp2, tmp3;
				331	INT32 tmp10, tmp11, tmp12;
				332	INT32 z1, z2, z3;
				333	DCTELEM *dataptr;
				334	JSAMPROW elemptr;
				335	int ctr;
				336	SHIFT_TEMPS
				337
				338	/* Pre-zero output coefficient block. */
				339	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				340
				341	/* Pass 1: process rows. */
				342	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				343	/* furthermore, we scale the results by 2*PASS1_BITS. /
				344	/* cK represents sqrt(2) * cos(Kpi/14). /
				345
				346	dataptr = data;
				347	for (ctr = 0; ctr < 7; ctr++) {
				348	elemptr = sample_data[ctr] + start_col;
				349
				350	/* Even part */
				351
				352	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
				353	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
				354	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
				355	tmp3 = GETJSAMPLE(elemptr[3]);
				356
				357	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
				358	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
				359	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
				360
				361	z1 = tmp0 + tmp2;
				362	/* Apply unsigned->signed conversion */
				363	dataptr[0] = (DCTELEM)
				364	((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
				365	tmp3 += tmp3;
				366	z1 -= tmp3;
				367	z1 -= tmp3;
				368	z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
				369	z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
				370	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
				371	dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
				372	z1 -= z2;
				373	z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
				374	dataptr[4] = (DCTELEM)
				375	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
				376	CONST_BITS-PASS1_BITS);
				377	dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
				378
				379	/* Odd part */
				380
				381	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
				382	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
				383	tmp0 = tmp1 - tmp2;
				384	tmp1 += tmp2;
				385	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
				386	tmp1 += tmp2;
				387	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
				388	tmp0 += tmp3;
				389	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
				390
				391	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
				392	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
				393	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
				394
				395	dataptr += DCTSIZE; /* advance pointer to next row */
				396	}
				397
				398	/* Pass 2: process columns.
				399	* We remove the PASS1_BITS scaling, but leave the results scaled up
				400	* by an overall factor of 8.
				401	* We must also scale the output by (8/7)**2 = 64/49, which we fold
				402	* into the constant multipliers:
				403	* cK now represents sqrt(2) * cos(Kpi/14) 64/49.
				404	*/
				405
				406	dataptr = data;
				407	for (ctr = 0; ctr < 7; ctr++) {
				408	/* Even part */
				409
				410	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE6];
				411	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE5];
				412	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE4];
				413	tmp3 = dataptr[DCTSIZE*3];
				414
				415	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE6];
				416	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE5];
				417	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE4];
				418
				419	z1 = tmp0 + tmp2;
				420	dataptr[DCTSIZE*0] = (DCTELEM)
				421	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
				422	CONST_BITS+PASS1_BITS);
				423	tmp3 += tmp3;
				424	z1 -= tmp3;
				425	z1 -= tmp3;
				426	z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
				427	z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
				428	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
				429	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
				430	z1 -= z2;
				431	z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
				432	dataptr[DCTSIZE*4] = (DCTELEM)
				433	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
				434	CONST_BITS+PASS1_BITS);
				435	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
				436
				437	/* Odd part */
				438
				439	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
				440	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
				441	tmp0 = tmp1 - tmp2;
				442	tmp1 += tmp2;
				443	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
				444	tmp1 += tmp2;
				445	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
				446	tmp0 += tmp3;
				447	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
				448
				449	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
				450	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
				451	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
				452
				453	dataptr++; /* advance pointer to next column */
				454	}
				455	}
				456
				457
				458	/*
				459	* Perform the forward DCT on a 6x6 sample block.
				460	*/
				461
				462	GLOBAL(void)
				463	jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				464	{
				465	INT32 tmp0, tmp1, tmp2;
				466	INT32 tmp10, tmp11, tmp12;
				467	DCTELEM *dataptr;
				468	JSAMPROW elemptr;
				469	int ctr;
				470	SHIFT_TEMPS
				471
				472	/* Pre-zero output coefficient block. */
				473	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				474
				475	/* Pass 1: process rows. */
				476	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				477	/* furthermore, we scale the results by 2*PASS1_BITS. /
				478	/* cK represents sqrt(2) * cos(Kpi/12). /
				479
				480	dataptr = data;
				481	for (ctr = 0; ctr < 6; ctr++) {
				482	elemptr = sample_data[ctr] + start_col;
				483
				484	/* Even part */
				485
				486	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
				487	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
				488	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
				489
				490	tmp10 = tmp0 + tmp2;
				491	tmp12 = tmp0 - tmp2;
				492
				493	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
				494	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
				495	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
				496
				497	/* Apply unsigned->signed conversion */
				498	dataptr[0] = (DCTELEM)
				499	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
				500	dataptr[2] = (DCTELEM)
				501	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
				502	CONST_BITS-PASS1_BITS);
				503	dataptr[4] = (DCTELEM)
				504	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
				505	CONST_BITS-PASS1_BITS);
				506
				507	/* Odd part */
				508
				509	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
				510	CONST_BITS-PASS1_BITS);
				511
				512	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
				513	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
				514	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
				515
				516	dataptr += DCTSIZE; /* advance pointer to next row */
				517	}
				518
				519	/* Pass 2: process columns.
				520	* We remove the PASS1_BITS scaling, but leave the results scaled up
				521	* by an overall factor of 8.
				522	* We must also scale the output by (8/6)**2 = 16/9, which we fold
				523	* into the constant multipliers:
				524	* cK now represents sqrt(2) * cos(Kpi/12) 16/9.
				525	*/
				526
				527	dataptr = data;
				528	for (ctr = 0; ctr < 6; ctr++) {
				529	/* Even part */
				530
				531	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
				532	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
				533	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
				534
				535	tmp10 = tmp0 + tmp2;
				536	tmp12 = tmp0 - tmp2;
				537
				538	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
				539	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
				540	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
				541
				542	dataptr[DCTSIZE*0] = (DCTELEM)
				543	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
				544	CONST_BITS+PASS1_BITS);
				545	dataptr[DCTSIZE*2] = (DCTELEM)
				546	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
				547	CONST_BITS+PASS1_BITS);
				548	dataptr[DCTSIZE*4] = (DCTELEM)
				549	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
				550	CONST_BITS+PASS1_BITS);
				551
				552	/* Odd part */
				553
				554	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
				555
				556	dataptr[DCTSIZE*1] = (DCTELEM)
				557	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
				558	CONST_BITS+PASS1_BITS);
				559	dataptr[DCTSIZE*3] = (DCTELEM)
				560	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
				561	CONST_BITS+PASS1_BITS);
				562	dataptr[DCTSIZE*5] = (DCTELEM)
				563	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
				564	CONST_BITS+PASS1_BITS);
				565
				566	dataptr++; /* advance pointer to next column */
				567	}
				568	}
				569
				570
				571	/*
				572	* Perform the forward DCT on a 5x5 sample block.
				573	*/
				574
				575	GLOBAL(void)
				576	jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				577	{
				578	INT32 tmp0, tmp1, tmp2;
				579	INT32 tmp10, tmp11;
				580	DCTELEM *dataptr;
				581	JSAMPROW elemptr;
				582	int ctr;
				583	SHIFT_TEMPS
				584
				585	/* Pre-zero output coefficient block. */
				586	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				587
				588	/* Pass 1: process rows. */
				589	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				590	/* furthermore, we scale the results by 2*PASS1_BITS. /
				591	/* We scale the results further by 2 as part of output adaption */
				592	/* scaling for different DCT size. */
				593	/* cK represents sqrt(2) * cos(Kpi/10). /
				594
				595	dataptr = data;
				596	for (ctr = 0; ctr < 5; ctr++) {
				597	elemptr = sample_data[ctr] + start_col;
				598
				599	/* Even part */
				600
				601	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
				602	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
				603	tmp2 = GETJSAMPLE(elemptr[2]);
				604
				605	tmp10 = tmp0 + tmp1;
				606	tmp11 = tmp0 - tmp1;
				607
				608	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
				609	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
				610
				611	/* Apply unsigned->signed conversion */
				612	dataptr[0] = (DCTELEM)
				613	((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
				614	tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
				615	tmp10 -= tmp2 << 2;
				616	tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
				617	dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
				618	dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
				619
				620	/* Odd part */
				621
				622	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
				623
				624	dataptr[1] = (DCTELEM)
				625	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
				626	CONST_BITS-PASS1_BITS-1);
				627	dataptr[3] = (DCTELEM)
				628	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
				629	CONST_BITS-PASS1_BITS-1);
				630
				631	dataptr += DCTSIZE; /* advance pointer to next row */
				632	}
				633
				634	/* Pass 2: process columns.
				635	* We remove the PASS1_BITS scaling, but leave the results scaled up
				636	* by an overall factor of 8.
				637	* We must also scale the output by (8/5)**2 = 64/25, which we partially
				638	* fold into the constant multipliers (other part was done in pass 1):
				639	* cK now represents sqrt(2) * cos(Kpi/10) 32/25.
				640	*/
				641
				642	dataptr = data;
				643	for (ctr = 0; ctr < 5; ctr++) {
				644	/* Even part */
				645
				646	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE4];
				647	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE3];
				648	tmp2 = dataptr[DCTSIZE*2];
				649
				650	tmp10 = tmp0 + tmp1;
				651	tmp11 = tmp0 - tmp1;
				652
				653	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE4];
				654	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE3];
				655
				656	dataptr[DCTSIZE*0] = (DCTELEM)
				657	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
				658	CONST_BITS+PASS1_BITS);
				659	tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
				660	tmp10 -= tmp2 << 2;
				661	tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
				662	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
				663	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
				664
				665	/* Odd part */
				666
				667	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
				668
				669	dataptr[DCTSIZE*1] = (DCTELEM)
				670	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
				671	CONST_BITS+PASS1_BITS);
				672	dataptr[DCTSIZE*3] = (DCTELEM)
				673	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
				674	CONST_BITS+PASS1_BITS);
				675
				676	dataptr++; /* advance pointer to next column */
				677	}
				678	}
				679
				680
				681	/*
				682	* Perform the forward DCT on a 4x4 sample block.
				683	*/
				684
				685	GLOBAL(void)
				686	jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				687	{
				688	INT32 tmp0, tmp1;
				689	INT32 tmp10, tmp11;
				690	DCTELEM *dataptr;
				691	JSAMPROW elemptr;
				692	int ctr;
				693	SHIFT_TEMPS
				694
				695	/* Pre-zero output coefficient block. */
				696	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				697
				698	/* Pass 1: process rows. */
				699	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				700	/* furthermore, we scale the results by 2*PASS1_BITS. /
				701	/* We must also scale the output by (8/4)2 = 22, which we add here. */
				702	/* cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /
				703
				704	dataptr = data;
				705	for (ctr = 0; ctr < 4; ctr++) {
				706	elemptr = sample_data[ctr] + start_col;
				707
				708	/* Even part */
				709
				710	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
				711	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
				712
				713	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
				714	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
				715
				716	/* Apply unsigned->signed conversion */
				717	dataptr[0] = (DCTELEM)
				718	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
				719	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
				720
				721	/* Odd part */
				722
				723	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
				724	/* Add fudge factor here for final descale. */
				725	tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
				726
				727	dataptr[1] = (DCTELEM)
				728	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
				729	CONST_BITS-PASS1_BITS-2);
				730	dataptr[3] = (DCTELEM)
				731	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
				732	CONST_BITS-PASS1_BITS-2);
				733
				734	dataptr += DCTSIZE; /* advance pointer to next row */
				735	}
				736
				737	/* Pass 2: process columns.
				738	* We remove the PASS1_BITS scaling, but leave the results scaled up
				739	* by an overall factor of 8.
				740	*/
				741
				742	dataptr = data;
				743	for (ctr = 0; ctr < 4; ctr++) {
				744	/* Even part */
				745
				746	/* Add fudge factor here for final descale. */
				747	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3] + (ONE << (PASS1_BITS-1));
				748	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
				749
				750	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
				751	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
				752
				753	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
				754	dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
				755
				756	/* Odd part */
				757
				758	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
				759	/* Add fudge factor here for final descale. */
				760	tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
				761
				762	dataptr[DCTSIZE*1] = (DCTELEM)
				763	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
				764	CONST_BITS+PASS1_BITS);
				765	dataptr[DCTSIZE*3] = (DCTELEM)
				766	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
				767	CONST_BITS+PASS1_BITS);
				768
				769	dataptr++; /* advance pointer to next column */
				770	}
				771	}
				772
				773
				774	/*
				775	* Perform the forward DCT on a 3x3 sample block.
				776	*/
				777
				778	GLOBAL(void)
				779	jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				780	{
				781	INT32 tmp0, tmp1, tmp2;
				782	DCTELEM *dataptr;
				783	JSAMPROW elemptr;
				784	int ctr;
				785	SHIFT_TEMPS
				786
				787	/* Pre-zero output coefficient block. */
				788	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				789
				790	/* Pass 1: process rows. */
				791	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				792	/* furthermore, we scale the results by 2*PASS1_BITS. /
				793	/* We scale the results further by 2*2 as part of output adaption /
				794	/* scaling for different DCT size. */
				795	/* cK represents sqrt(2) * cos(Kpi/6). /
				796
				797	dataptr = data;
				798	for (ctr = 0; ctr < 3; ctr++) {
				799	elemptr = sample_data[ctr] + start_col;
				800
				801	/* Even part */
				802
				803	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
				804	tmp1 = GETJSAMPLE(elemptr[1]);
				805
				806	tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
				807
				808	/* Apply unsigned->signed conversion */
				809	dataptr[0] = (DCTELEM)
				810	((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
				811	dataptr[2] = (DCTELEM)
				812	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
				813	CONST_BITS-PASS1_BITS-2);
				814
				815	/* Odd part */
				816
				817	dataptr[1] = (DCTELEM)
				818	DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
				819	CONST_BITS-PASS1_BITS-2);
				820
				821	dataptr += DCTSIZE; /* advance pointer to next row */
				822	}
				823
				824	/* Pass 2: process columns.
				825	* We remove the PASS1_BITS scaling, but leave the results scaled up
				826	* by an overall factor of 8.
				827	* We must also scale the output by (8/3)**2 = 64/9, which we partially
				828	* fold into the constant multipliers (other part was done in pass 1):
				829	* cK now represents sqrt(2) * cos(Kpi/6) 16/9.
				830	*/
				831
				832	dataptr = data;
				833	for (ctr = 0; ctr < 3; ctr++) {
				834	/* Even part */
				835
				836	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE2];
				837	tmp1 = dataptr[DCTSIZE*1];
				838
				839	tmp2 = dataptr[DCTSIZE0] - dataptr[DCTSIZE2];
				840
				841	dataptr[DCTSIZE*0] = (DCTELEM)
				842	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
				843	CONST_BITS+PASS1_BITS);
				844	dataptr[DCTSIZE*2] = (DCTELEM)
				845	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
				846	CONST_BITS+PASS1_BITS);
				847
				848	/* Odd part */
				849
				850	dataptr[DCTSIZE*1] = (DCTELEM)
				851	DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
				852	CONST_BITS+PASS1_BITS);
				853
				854	dataptr++; /* advance pointer to next column */
				855	}
				856	}
				857
				858
				859	/*
				860	* Perform the forward DCT on a 2x2 sample block.
				861	*/
				862
				863	GLOBAL(void)
				864	jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				865	{
				866	INT32 tmp0, tmp1, tmp2, tmp3;
				867	JSAMPROW elemptr;
				868
				869	/* Pre-zero output coefficient block. */
				870	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				871
				872	/* Pass 1: process rows. */
				873	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
				874
				875	/* Row 0 */
				876	elemptr = sample_data[0] + start_col;
				877
				878	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
				879	tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
				880
				881	/* Row 1 */
				882	elemptr = sample_data[1] + start_col;
				883
				884	tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
				885	tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
				886
				887	/* Pass 2: process columns.
				888	* We leave the results scaled up by an overall factor of 8.
				889	* We must also scale the output by (8/2)2 = 24.
				890	*/
				891
				892	/* Column 0 */
				893	/* Apply unsigned->signed conversion */
				894	data[DCTSIZE0] = (DCTELEM) ((tmp0 + tmp2 - 4 CENTERJSAMPLE) << 4);
				895	data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
				896
				897	/* Column 1 */
				898	data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
				899	data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
				900	}
				901
				902
				903	/*
				904	* Perform the forward DCT on a 1x1 sample block.
				905	*/
				906
				907	GLOBAL(void)
				908	jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				909	{
				910	/* Pre-zero output coefficient block. */
				911	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				912
				913	/* We leave the result scaled up by an overall factor of 8. */
				914	/* We must also scale the output by (8/1)2 = 26. */
				915	/* Apply unsigned->signed conversion */
				916	data[0] = (DCTELEM)
				917	((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
				918	}
				919
				920
				921	/*
				922	* Perform the forward DCT on a 9x9 sample block.
				923	*/
				924
				925	GLOBAL(void)
				926	jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				927	{
				928	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
				929	INT32 tmp10, tmp11, tmp12, tmp13;
				930	INT32 z1, z2;
				931	DCTELEM workspace[8];
				932	DCTELEM *dataptr;
				933	DCTELEM *wsptr;
				934	JSAMPROW elemptr;
				935	int ctr;
				936	SHIFT_TEMPS
				937
				938	/* Pass 1: process rows. */
				939	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				940	/* we scale the results further by 2 as part of output adaption */
				941	/* scaling for different DCT size. */
				942	/* cK represents sqrt(2) * cos(Kpi/18). /
				943
				944	dataptr = data;
				945	ctr = 0;
				946	for (;;) {
				947	elemptr = sample_data[ctr] + start_col;
				948
				949	/* Even part */
				950
				951	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
				952	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
				953	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
				954	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
				955	tmp4 = GETJSAMPLE(elemptr[4]);
				956
				957	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
				958	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
				959	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
				960	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
				961
				962	z1 = tmp0 + tmp2 + tmp3;
				963	z2 = tmp1 + tmp4;
				964	/* Apply unsigned->signed conversion */
				965	dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
				966	dataptr[6] = (DCTELEM)
				967	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
				968	CONST_BITS-1);
				969	z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
				970	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
				971	dataptr[2] = (DCTELEM)
				972	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
				973	+ z1 + z2, CONST_BITS-1);
				974	dataptr[4] = (DCTELEM)
				975	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
				976	+ z1 - z2, CONST_BITS-1);
				977
				978	/* Odd part */
				979
				980	dataptr[3] = (DCTELEM)
				981	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
				982	CONST_BITS-1);
				983
				984	tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
				985	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
				986	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
				987
				988	dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
				989
				990	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
				991
				992	dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
				993	dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
				994
				995	ctr++;
				996
				997	if (ctr != DCTSIZE) {
				998	if (ctr == 9)
				999	break; /* Done. */
				1000	dataptr += DCTSIZE; /* advance pointer to next row */
				1001	} else
				1002	dataptr = workspace; /* switch pointer to extended workspace */
				1003	}
				1004
				1005	/* Pass 2: process columns.
				1006	* We leave the results scaled up by an overall factor of 8.
				1007	* We must also scale the output by (8/9)**2 = 64/81, which we partially
				1008	* fold into the constant multipliers and final/initial shifting:
				1009	* cK now represents sqrt(2) * cos(Kpi/18) 128/81.
				1010	*/
				1011
				1012	dataptr = data;
				1013	wsptr = workspace;
				1014	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				1015	/* Even part */
				1016
				1017	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE0];
				1018	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE7];
				1019	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE6];
				1020	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE5];
				1021	tmp4 = dataptr[DCTSIZE*4];
				1022
				1023	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE0];
				1024	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE7];
				1025	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE6];
				1026	tmp13 = dataptr[DCTSIZE3] - dataptr[DCTSIZE5];
				1027
				1028	z1 = tmp0 + tmp2 + tmp3;
				1029	z2 = tmp1 + tmp4;
				1030	dataptr[DCTSIZE*0] = (DCTELEM)
				1031	DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
				1032	CONST_BITS+2);
				1033	dataptr[DCTSIZE*6] = (DCTELEM)
				1034	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
				1035	CONST_BITS+2);
				1036	z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
				1037	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
				1038	dataptr[DCTSIZE*2] = (DCTELEM)
				1039	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
				1040	+ z1 + z2, CONST_BITS+2);
				1041	dataptr[DCTSIZE*4] = (DCTELEM)
				1042	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
				1043	+ z1 - z2, CONST_BITS+2);
				1044
				1045	/* Odd part */
				1046
				1047	dataptr[DCTSIZE*3] = (DCTELEM)
				1048	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
				1049	CONST_BITS+2);
				1050
				1051	tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
				1052	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
				1053	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
				1054
				1055	dataptr[DCTSIZE*1] = (DCTELEM)
				1056	DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
				1057
				1058	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
				1059
				1060	dataptr[DCTSIZE*5] = (DCTELEM)
				1061	DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
				1062	dataptr[DCTSIZE*7] = (DCTELEM)
				1063	DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
				1064
				1065	dataptr++; /* advance pointer to next column */
				1066	wsptr++; /* advance pointer to next column */
				1067	}
				1068	}
				1069
				1070
				1071	/*
				1072	* Perform the forward DCT on a 10x10 sample block.
				1073	*/
				1074
				1075	GLOBAL(void)
				1076	jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				1077	{
				1078	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
				1079	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
				1080	DCTELEM workspace[8*2];
				1081	DCTELEM *dataptr;
				1082	DCTELEM *wsptr;
				1083	JSAMPROW elemptr;
				1084	int ctr;
				1085	SHIFT_TEMPS
				1086
				1087	/* Pass 1: process rows. */
				1088	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				1089	/* we scale the results further by 2 as part of output adaption */
				1090	/* scaling for different DCT size. */
				1091	/* cK represents sqrt(2) * cos(Kpi/20). /
				1092
				1093	dataptr = data;
				1094	ctr = 0;
				1095	for (;;) {
				1096	elemptr = sample_data[ctr] + start_col;
				1097
				1098	/* Even part */
				1099
				1100	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
				1101	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
				1102	tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
				1103	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
				1104	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
				1105
				1106	tmp10 = tmp0 + tmp4;
				1107	tmp13 = tmp0 - tmp4;
				1108	tmp11 = tmp1 + tmp3;
				1109	tmp14 = tmp1 - tmp3;
				1110
				1111	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
				1112	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
				1113	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
				1114	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
				1115	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
				1116
				1117	/* Apply unsigned->signed conversion */
				1118	dataptr[0] = (DCTELEM)
				1119	((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
				1120	tmp12 += tmp12;
				1121	dataptr[4] = (DCTELEM)
				1122	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
				1123	MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
				1124	CONST_BITS-1);
				1125	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
				1126	dataptr[2] = (DCTELEM)
				1127	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
				1128	CONST_BITS-1);
				1129	dataptr[6] = (DCTELEM)
				1130	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
				1131	CONST_BITS-1);
				1132
				1133	/* Odd part */
				1134
				1135	tmp10 = tmp0 + tmp4;
				1136	tmp11 = tmp1 - tmp3;
				1137	dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
				1138	tmp2 <<= CONST_BITS;
				1139	dataptr[1] = (DCTELEM)
				1140	DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
				1141	MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
				1142	MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
				1143	MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
				1144	CONST_BITS-1);
				1145	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
				1146	MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
				1147	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
				1148	(tmp11 << (CONST_BITS - 1)) - tmp2;
				1149	dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
				1150	dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
				1151
				1152	ctr++;
				1153
				1154	if (ctr != DCTSIZE) {
				1155	if (ctr == 10)
				1156	break; /* Done. */
				1157	dataptr += DCTSIZE; /* advance pointer to next row */
				1158	} else
				1159	dataptr = workspace; /* switch pointer to extended workspace */
				1160	}
				1161
				1162	/* Pass 2: process columns.
				1163	* We leave the results scaled up by an overall factor of 8.
				1164	* We must also scale the output by (8/10)**2 = 16/25, which we partially
				1165	* fold into the constant multipliers and final/initial shifting:
				1166	* cK now represents sqrt(2) * cos(Kpi/20) 32/25.
				1167	*/
				1168
				1169	dataptr = data;
				1170	wsptr = workspace;
				1171	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				1172	/* Even part */
				1173
				1174	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE1];
				1175	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE0];
				1176	tmp12 = dataptr[DCTSIZE2] + dataptr[DCTSIZE7];
				1177	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE6];
				1178	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE5];
				1179
				1180	tmp10 = tmp0 + tmp4;
				1181	tmp13 = tmp0 - tmp4;
				1182	tmp11 = tmp1 + tmp3;
				1183	tmp14 = tmp1 - tmp3;
				1184
				1185	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE1];
				1186	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE0];
				1187	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE7];
				1188	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE6];
				1189	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE5];
				1190
				1191	dataptr[DCTSIZE*0] = (DCTELEM)
				1192	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
				1193	CONST_BITS+2);
				1194	tmp12 += tmp12;
				1195	dataptr[DCTSIZE*4] = (DCTELEM)
				1196	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
				1197	MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
				1198	CONST_BITS+2);
				1199	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
				1200	dataptr[DCTSIZE*2] = (DCTELEM)
				1201	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
				1202	CONST_BITS+2);
				1203	dataptr[DCTSIZE*6] = (DCTELEM)
				1204	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
				1205	CONST_BITS+2);
				1206
				1207	/* Odd part */
				1208
				1209	tmp10 = tmp0 + tmp4;
				1210	tmp11 = tmp1 - tmp3;
				1211	dataptr[DCTSIZE*5] = (DCTELEM)
				1212	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
				1213	CONST_BITS+2);
				1214	tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
				1215	dataptr[DCTSIZE*1] = (DCTELEM)
				1216	DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
				1217	MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
				1218	MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
				1219	MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
				1220	CONST_BITS+2);
				1221	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
				1222	MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
				1223	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
				1224	MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
				1225	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
				1226	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
				1227
				1228	dataptr++; /* advance pointer to next column */
				1229	wsptr++; /* advance pointer to next column */
				1230	}
				1231	}
				1232
				1233
				1234	/*
				1235	* Perform the forward DCT on an 11x11 sample block.
				1236	*/
				1237
				1238	GLOBAL(void)
				1239	jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				1240	{
				1241	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
				1242	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
				1243	INT32 z1, z2, z3;
				1244	DCTELEM workspace[8*3];
				1245	DCTELEM *dataptr;
				1246	DCTELEM *wsptr;
				1247	JSAMPROW elemptr;
				1248	int ctr;
				1249	SHIFT_TEMPS
				1250
				1251	/* Pass 1: process rows. */
				1252	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				1253	/* we scale the results further by 2 as part of output adaption */
				1254	/* scaling for different DCT size. */
				1255	/* cK represents sqrt(2) * cos(Kpi/22). /
				1256
				1257	dataptr = data;
				1258	ctr = 0;
				1259	for (;;) {
				1260	elemptr = sample_data[ctr] + start_col;
				1261
				1262	/* Even part */
				1263
				1264	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
				1265	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
				1266	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
				1267	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
				1268	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
				1269	tmp5 = GETJSAMPLE(elemptr[5]);
				1270
				1271	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
				1272	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
				1273	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
				1274	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
				1275	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
				1276
				1277	/* Apply unsigned->signed conversion */
				1278	dataptr[0] = (DCTELEM)
				1279	((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
				1280	tmp5 += tmp5;
				1281	tmp0 -= tmp5;
				1282	tmp1 -= tmp5;
				1283	tmp2 -= tmp5;
				1284	tmp3 -= tmp5;
				1285	tmp4 -= tmp5;
				1286	z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
				1287	MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
				1288	z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
				1289	z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
				1290	dataptr[2] = (DCTELEM)
				1291	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
				1292	- MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
				1293	CONST_BITS-1);
				1294	dataptr[4] = (DCTELEM)
				1295	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
				1296	- MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
				1297	+ MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
				1298	CONST_BITS-1);
				1299	dataptr[6] = (DCTELEM)
				1300	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
				1301	- MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
				1302	CONST_BITS-1);
				1303
				1304	/* Odd part */
				1305
				1306	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
				1307	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
				1308	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
				1309	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
				1310	+ MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
				1311	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
				1312	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
				1313	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
				1314	- MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
				1315	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
				1316	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
				1317	+ MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
				1318	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
				1319	- MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
				1320
				1321	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
				1322	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
				1323	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
				1324	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
				1325
				1326	ctr++;
				1327
				1328	if (ctr != DCTSIZE) {
				1329	if (ctr == 11)
				1330	break; /* Done. */
				1331	dataptr += DCTSIZE; /* advance pointer to next row */
				1332	} else
				1333	dataptr = workspace; /* switch pointer to extended workspace */
				1334	}
				1335
				1336	/* Pass 2: process columns.
				1337	* We leave the results scaled up by an overall factor of 8.
				1338	* We must also scale the output by (8/11)**2 = 64/121, which we partially
				1339	* fold into the constant multipliers and final/initial shifting:
				1340	* cK now represents sqrt(2) * cos(Kpi/22) 128/121.
				1341	*/
				1342
				1343	dataptr = data;
				1344	wsptr = workspace;
				1345	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				1346	/* Even part */
				1347
				1348	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE2];
				1349	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE1];
				1350	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE0];
				1351	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE7];
				1352	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE6];
				1353	tmp5 = dataptr[DCTSIZE*5];
				1354
				1355	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE2];
				1356	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE1];
				1357	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE0];
				1358	tmp13 = dataptr[DCTSIZE3] - dataptr[DCTSIZE7];
				1359	tmp14 = dataptr[DCTSIZE4] - dataptr[DCTSIZE6];
				1360
				1361	dataptr[DCTSIZE*0] = (DCTELEM)
				1362	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
				1363	FIX(1.057851240)), /* 128/121 */
				1364	CONST_BITS+2);
				1365	tmp5 += tmp5;
				1366	tmp0 -= tmp5;
				1367	tmp1 -= tmp5;
				1368	tmp2 -= tmp5;
				1369	tmp3 -= tmp5;
				1370	tmp4 -= tmp5;
				1371	z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
				1372	MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
				1373	z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
				1374	z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
				1375	dataptr[DCTSIZE*2] = (DCTELEM)
				1376	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
				1377	- MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
				1378	CONST_BITS+2);
				1379	dataptr[DCTSIZE*4] = (DCTELEM)
				1380	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
				1381	- MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
				1382	+ MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
				1383	CONST_BITS+2);
				1384	dataptr[DCTSIZE*6] = (DCTELEM)
				1385	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
				1386	- MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
				1387	CONST_BITS+2);
				1388
				1389	/* Odd part */
				1390
				1391	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
				1392	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
				1393	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
				1394	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
				1395	+ MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
				1396	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
				1397	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
				1398	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
				1399	- MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
				1400	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
				1401	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
				1402	+ MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
				1403	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
				1404	- MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
				1405
				1406	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
				1407	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
				1408	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
				1409	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
				1410
				1411	dataptr++; /* advance pointer to next column */
				1412	wsptr++; /* advance pointer to next column */
				1413	}
				1414	}
				1415
				1416
				1417	/*
				1418	* Perform the forward DCT on a 12x12 sample block.
				1419	*/
				1420
				1421	GLOBAL(void)
				1422	jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				1423	{
				1424	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
				1425	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
				1426	DCTELEM workspace[8*4];
				1427	DCTELEM *dataptr;
				1428	DCTELEM *wsptr;
				1429	JSAMPROW elemptr;
				1430	int ctr;
				1431	SHIFT_TEMPS
				1432
				1433	/* Pass 1: process rows. */
				1434	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
				1435	/* cK represents sqrt(2) * cos(Kpi/24). /
				1436
				1437	dataptr = data;
				1438	ctr = 0;
				1439	for (;;) {
				1440	elemptr = sample_data[ctr] + start_col;
				1441
				1442	/* Even part */
				1443
				1444	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
				1445	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
				1446	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
				1447	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
				1448	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
				1449	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
				1450
				1451	tmp10 = tmp0 + tmp5;
				1452	tmp13 = tmp0 - tmp5;
				1453	tmp11 = tmp1 + tmp4;
				1454	tmp14 = tmp1 - tmp4;
				1455	tmp12 = tmp2 + tmp3;
				1456	tmp15 = tmp2 - tmp3;
				1457
				1458	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
				1459	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
				1460	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
				1461	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
				1462	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
				1463	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
				1464
				1465	/* Apply unsigned->signed conversion */
				1466	dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
				1467	dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
				1468	dataptr[4] = (DCTELEM)
				1469	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
				1470	CONST_BITS);
				1471	dataptr[2] = (DCTELEM)
				1472	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
				1473	CONST_BITS);
				1474
				1475	/* Odd part */
				1476
				1477	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
				1478	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
				1479	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
				1480	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
				1481	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
				1482	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
				1483	+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
				1484	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
				1485	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
				1486	+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
				1487	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
				1488	- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
				1489	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
				1490	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
				1491
				1492	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
				1493	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
				1494	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
				1495	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
				1496
				1497	ctr++;
				1498
				1499	if (ctr != DCTSIZE) {
				1500	if (ctr == 12)
				1501	break; /* Done. */
				1502	dataptr += DCTSIZE; /* advance pointer to next row */
				1503	} else
				1504	dataptr = workspace; /* switch pointer to extended workspace */
				1505	}
				1506
				1507	/* Pass 2: process columns.
				1508	* We leave the results scaled up by an overall factor of 8.
				1509	* We must also scale the output by (8/12)**2 = 4/9, which we partially
				1510	* fold into the constant multipliers and final shifting:
				1511	* cK now represents sqrt(2) * cos(Kpi/24) 8/9.
				1512	*/
				1513
				1514	dataptr = data;
				1515	wsptr = workspace;
				1516	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				1517	/* Even part */
				1518
				1519	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE3];
				1520	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE2];
				1521	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE1];
				1522	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE0];
				1523	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE7];
				1524	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE6];
				1525
				1526	tmp10 = tmp0 + tmp5;
				1527	tmp13 = tmp0 - tmp5;
				1528	tmp11 = tmp1 + tmp4;
				1529	tmp14 = tmp1 - tmp4;
				1530	tmp12 = tmp2 + tmp3;
				1531	tmp15 = tmp2 - tmp3;
				1532
				1533	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE3];
				1534	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE2];
				1535	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE1];
				1536	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE0];
				1537	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE7];
				1538	tmp5 = dataptr[DCTSIZE5] - dataptr[DCTSIZE6];
				1539
				1540	dataptr[DCTSIZE*0] = (DCTELEM)
				1541	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
				1542	CONST_BITS+1);
				1543	dataptr[DCTSIZE*6] = (DCTELEM)
				1544	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
				1545	CONST_BITS+1);
				1546	dataptr[DCTSIZE*4] = (DCTELEM)
				1547	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
				1548	CONST_BITS+1);
				1549	dataptr[DCTSIZE*2] = (DCTELEM)
				1550	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
				1551	MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
				1552	CONST_BITS+1);
				1553
				1554	/* Odd part */
				1555
				1556	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
				1557	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
				1558	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
				1559	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
				1560	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
				1561	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
				1562	+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
				1563	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
				1564	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
				1565	+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
				1566	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
				1567	- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
				1568	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
				1569	- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
				1570
				1571	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
				1572	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
				1573	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
				1574	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
				1575
				1576	dataptr++; /* advance pointer to next column */
				1577	wsptr++; /* advance pointer to next column */
				1578	}
				1579	}
				1580
				1581
				1582	/*
				1583	* Perform the forward DCT on a 13x13 sample block.
				1584	*/
				1585
				1586	GLOBAL(void)
				1587	jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				1588	{
				1589	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
				1590	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
				1591	INT32 z1, z2;
				1592	DCTELEM workspace[8*5];
				1593	DCTELEM *dataptr;
				1594	DCTELEM *wsptr;
				1595	JSAMPROW elemptr;
				1596	int ctr;
				1597	SHIFT_TEMPS
				1598
				1599	/* Pass 1: process rows. */
				1600	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
				1601	/* cK represents sqrt(2) * cos(Kpi/26). /
				1602
				1603	dataptr = data;
				1604	ctr = 0;
				1605	for (;;) {
				1606	elemptr = sample_data[ctr] + start_col;
				1607
				1608	/* Even part */
				1609
				1610	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
				1611	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
				1612	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
				1613	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
				1614	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
				1615	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
				1616	tmp6 = GETJSAMPLE(elemptr[6]);
				1617
				1618	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
				1619	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
				1620	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
				1621	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
				1622	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
				1623	tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
				1624
				1625	/* Apply unsigned->signed conversion */
				1626	dataptr[0] = (DCTELEM)
				1627	(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
				1628	tmp6 += tmp6;
				1629	tmp0 -= tmp6;
				1630	tmp1 -= tmp6;
				1631	tmp2 -= tmp6;
				1632	tmp3 -= tmp6;
				1633	tmp4 -= tmp6;
				1634	tmp5 -= tmp6;
				1635	dataptr[2] = (DCTELEM)
				1636	DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
				1637	MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
				1638	MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
				1639	MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
				1640	MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
				1641	MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
				1642	CONST_BITS);
				1643	z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
				1644	MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
				1645	MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
				1646	z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
				1647	MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
				1648	MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
				1649
				1650	dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
				1651	dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
				1652
				1653	/* Odd part */
				1654
				1655	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
				1656	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
				1657	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
				1658	MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
				1659	tmp0 = tmp1 + tmp2 + tmp3 -
				1660	MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
				1661	MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
				1662	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
				1663	MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
				1664	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
				1665	tmp1 += tmp4 + tmp5 +
				1666	MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
				1667	MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
				1668	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
				1669	tmp2 += tmp4 + tmp6 -
				1670	MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
				1671	MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
				1672	tmp3 += tmp5 + tmp6 +
				1673	MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
				1674	MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
				1675
				1676	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
				1677	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
				1678	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
				1679	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
				1680
				1681	ctr++;
				1682
				1683	if (ctr != DCTSIZE) {
				1684	if (ctr == 13)
				1685	break; /* Done. */
				1686	dataptr += DCTSIZE; /* advance pointer to next row */
				1687	} else
				1688	dataptr = workspace; /* switch pointer to extended workspace */
				1689	}
				1690
				1691	/* Pass 2: process columns.
				1692	* We leave the results scaled up by an overall factor of 8.
				1693	* We must also scale the output by (8/13)**2 = 64/169, which we partially
				1694	* fold into the constant multipliers and final shifting:
				1695	* cK now represents sqrt(2) * cos(Kpi/26) 128/169.
				1696	*/
				1697
				1698	dataptr = data;
				1699	wsptr = workspace;
				1700	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				1701	/* Even part */
				1702
				1703	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE4];
				1704	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE3];
				1705	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE2];
				1706	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE1];
				1707	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE0];
				1708	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE7];
				1709	tmp6 = dataptr[DCTSIZE*6];
				1710
				1711	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE4];
				1712	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE3];
				1713	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE2];
				1714	tmp13 = dataptr[DCTSIZE3] - wsptr[DCTSIZE1];
				1715	tmp14 = dataptr[DCTSIZE4] - wsptr[DCTSIZE0];
				1716	tmp15 = dataptr[DCTSIZE5] - dataptr[DCTSIZE7];
				1717
				1718	dataptr[DCTSIZE*0] = (DCTELEM)
				1719	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
				1720	FIX(0.757396450)), /* 128/169 */
				1721	CONST_BITS+1);
				1722	tmp6 += tmp6;
				1723	tmp0 -= tmp6;
				1724	tmp1 -= tmp6;
				1725	tmp2 -= tmp6;
				1726	tmp3 -= tmp6;
				1727	tmp4 -= tmp6;
				1728	tmp5 -= tmp6;
				1729	dataptr[DCTSIZE*2] = (DCTELEM)
				1730	DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
				1731	MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
				1732	MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
				1733	MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
				1734	MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
				1735	MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
				1736	CONST_BITS+1);
				1737	z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
				1738	MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
				1739	MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
				1740	z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
				1741	MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
				1742	MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
				1743
				1744	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
				1745	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
				1746
				1747	/* Odd part */
				1748
				1749	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
				1750	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
				1751	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
				1752	MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
				1753	tmp0 = tmp1 + tmp2 + tmp3 -
				1754	MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
				1755	MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
				1756	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
				1757	MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
				1758	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
				1759	tmp1 += tmp4 + tmp5 +
				1760	MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
				1761	MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
				1762	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
				1763	tmp2 += tmp4 + tmp6 -
				1764	MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
				1765	MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
				1766	tmp3 += tmp5 + tmp6 +
				1767	MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
				1768	MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
				1769
				1770	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
				1771	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
				1772	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
				1773	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
				1774
				1775	dataptr++; /* advance pointer to next column */
				1776	wsptr++; /* advance pointer to next column */
				1777	}
				1778	}
				1779
				1780
				1781	/*
				1782	* Perform the forward DCT on a 14x14 sample block.
				1783	*/
				1784
				1785	GLOBAL(void)
				1786	jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				1787	{
				1788	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
				1789	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
				1790	DCTELEM workspace[8*6];
				1791	DCTELEM *dataptr;
				1792	DCTELEM *wsptr;
				1793	JSAMPROW elemptr;
				1794	int ctr;
				1795	SHIFT_TEMPS
				1796
				1797	/* Pass 1: process rows. */
				1798	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
				1799	/* cK represents sqrt(2) * cos(Kpi/28). /
				1800
				1801	dataptr = data;
				1802	ctr = 0;
				1803	for (;;) {
				1804	elemptr = sample_data[ctr] + start_col;
				1805
				1806	/* Even part */
				1807
				1808	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
				1809	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
				1810	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
				1811	tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
				1812	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
				1813	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
				1814	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
				1815
				1816	tmp10 = tmp0 + tmp6;
				1817	tmp14 = tmp0 - tmp6;
				1818	tmp11 = tmp1 + tmp5;
				1819	tmp15 = tmp1 - tmp5;
				1820	tmp12 = tmp2 + tmp4;
				1821	tmp16 = tmp2 - tmp4;
				1822
				1823	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
				1824	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
				1825	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
				1826	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
				1827	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
				1828	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
				1829	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
				1830
				1831	/* Apply unsigned->signed conversion */
				1832	dataptr[0] = (DCTELEM)
				1833	(tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
				1834	tmp13 += tmp13;
				1835	dataptr[4] = (DCTELEM)
				1836	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
				1837	MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
				1838	MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
				1839	CONST_BITS);
				1840
				1841	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
				1842
				1843	dataptr[2] = (DCTELEM)
				1844	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
				1845	+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
				1846	CONST_BITS);
				1847	dataptr[6] = (DCTELEM)
				1848	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
				1849	- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
				1850	CONST_BITS);
				1851
				1852	/* Odd part */
				1853
				1854	tmp10 = tmp1 + tmp2;
				1855	tmp11 = tmp5 - tmp4;
				1856	dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
				1857	tmp3 <<= CONST_BITS;
				1858	tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
				1859	tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
				1860	tmp10 += tmp11 - tmp3;
				1861	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
				1862	MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
				1863	dataptr[5] = (DCTELEM)
				1864	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
				1865	+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
				1866	CONST_BITS);
				1867	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
				1868	MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
				1869	dataptr[3] = (DCTELEM)
				1870	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
				1871	- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
				1872	CONST_BITS);
				1873	dataptr[1] = (DCTELEM)
				1874	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
				1875	MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
				1876	CONST_BITS);
				1877
				1878	ctr++;
				1879
				1880	if (ctr != DCTSIZE) {
				1881	if (ctr == 14)
				1882	break; /* Done. */
				1883	dataptr += DCTSIZE; /* advance pointer to next row */
				1884	} else
				1885	dataptr = workspace; /* switch pointer to extended workspace */
				1886	}
				1887
				1888	/* Pass 2: process columns.
				1889	* We leave the results scaled up by an overall factor of 8.
				1890	* We must also scale the output by (8/14)**2 = 16/49, which we partially
				1891	* fold into the constant multipliers and final shifting:
				1892	* cK now represents sqrt(2) * cos(Kpi/28) 32/49.
				1893	*/
				1894
				1895	dataptr = data;
				1896	wsptr = workspace;
				1897	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				1898	/* Even part */
				1899
				1900	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE5];
				1901	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE4];
				1902	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE3];
				1903	tmp13 = dataptr[DCTSIZE3] + wsptr[DCTSIZE2];
				1904	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE1];
				1905	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE0];
				1906	tmp6 = dataptr[DCTSIZE6] + dataptr[DCTSIZE7];
				1907
				1908	tmp10 = tmp0 + tmp6;
				1909	tmp14 = tmp0 - tmp6;
				1910	tmp11 = tmp1 + tmp5;
				1911	tmp15 = tmp1 - tmp5;
				1912	tmp12 = tmp2 + tmp4;
				1913	tmp16 = tmp2 - tmp4;
				1914
				1915	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE5];
				1916	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE4];
				1917	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE3];
				1918	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE2];
				1919	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE1];
				1920	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE0];
				1921	tmp6 = dataptr[DCTSIZE6] - dataptr[DCTSIZE7];
				1922
				1923	dataptr[DCTSIZE*0] = (DCTELEM)
				1924	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
				1925	FIX(0.653061224)), /* 32/49 */
				1926	CONST_BITS+1);
				1927	tmp13 += tmp13;
				1928	dataptr[DCTSIZE*4] = (DCTELEM)
				1929	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
				1930	MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
				1931	MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
				1932	CONST_BITS+1);
				1933
				1934	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
				1935
				1936	dataptr[DCTSIZE*2] = (DCTELEM)
				1937	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
				1938	+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
				1939	CONST_BITS+1);
				1940	dataptr[DCTSIZE*6] = (DCTELEM)
				1941	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
				1942	- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
				1943	CONST_BITS+1);
				1944
				1945	/* Odd part */
				1946
				1947	tmp10 = tmp1 + tmp2;
				1948	tmp11 = tmp5 - tmp4;
				1949	dataptr[DCTSIZE*7] = (DCTELEM)
				1950	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
				1951	FIX(0.653061224)), /* 32/49 */
				1952	CONST_BITS+1);
				1953	tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
				1954	tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
				1955	tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
				1956	tmp10 += tmp11 - tmp3;
				1957	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
				1958	MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
				1959	dataptr[DCTSIZE*5] = (DCTELEM)
				1960	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
				1961	+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
				1962	CONST_BITS+1);
				1963	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
				1964	MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
				1965	dataptr[DCTSIZE*3] = (DCTELEM)
				1966	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
				1967	- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
				1968	CONST_BITS+1);
				1969	dataptr[DCTSIZE*1] = (DCTELEM)
				1970	DESCALE(tmp11 + tmp12 + tmp3
				1971	- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
				1972	- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
				1973	CONST_BITS+1);
				1974
				1975	dataptr++; /* advance pointer to next column */
				1976	wsptr++; /* advance pointer to next column */
				1977	}
				1978	}
				1979
				1980
				1981	/*
				1982	* Perform the forward DCT on a 15x15 sample block.
				1983	*/
				1984
				1985	GLOBAL(void)
				1986	jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				1987	{
				1988	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
				1989	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
				1990	INT32 z1, z2, z3;
				1991	DCTELEM workspace[8*7];
				1992	DCTELEM *dataptr;
				1993	DCTELEM *wsptr;
				1994	JSAMPROW elemptr;
				1995	int ctr;
				1996	SHIFT_TEMPS
				1997
				1998	/* Pass 1: process rows. */
				1999	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
				2000	/* cK represents sqrt(2) * cos(Kpi/30). /
				2001
				2002	dataptr = data;
				2003	ctr = 0;
				2004	for (;;) {
				2005	elemptr = sample_data[ctr] + start_col;
				2006
				2007	/* Even part */
				2008
				2009	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
				2010	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
				2011	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
				2012	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
				2013	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
				2014	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
				2015	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
				2016	tmp7 = GETJSAMPLE(elemptr[7]);
				2017
				2018	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
				2019	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
				2020	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
				2021	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
				2022	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
				2023	tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
				2024	tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
				2025
				2026	z1 = tmp0 + tmp4 + tmp5;
				2027	z2 = tmp1 + tmp3 + tmp6;
				2028	z3 = tmp2 + tmp7;
				2029	/* Apply unsigned->signed conversion */
				2030	dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
				2031	z3 += z3;
				2032	dataptr[6] = (DCTELEM)
				2033	DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
				2034	MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
				2035	CONST_BITS);
				2036	tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
				2037	z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
				2038	MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
				2039	z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
				2040	MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
				2041	z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
				2042	MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
				2043	MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
				2044
				2045	dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
				2046	dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
				2047
				2048	/* Odd part */
				2049
				2050	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
				2051	FIX(1.224744871)); /* c5 */
				2052	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
				2053	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
				2054	tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
				2055	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
				2056	MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
				2057	MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
				2058	tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
				2059	MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
				2060	MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
				2061	tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
				2062	MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
				2063	MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
				2064
				2065	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
				2066	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
				2067	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
				2068	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
				2069
				2070	ctr++;
				2071
				2072	if (ctr != DCTSIZE) {
				2073	if (ctr == 15)
				2074	break; /* Done. */
				2075	dataptr += DCTSIZE; /* advance pointer to next row */
				2076	} else
				2077	dataptr = workspace; /* switch pointer to extended workspace */
				2078	}
				2079
				2080	/* Pass 2: process columns.
				2081	* We leave the results scaled up by an overall factor of 8.
				2082	* We must also scale the output by (8/15)**2 = 64/225, which we partially
				2083	* fold into the constant multipliers and final shifting:
				2084	* cK now represents sqrt(2) * cos(Kpi/30) 256/225.
				2085	*/
				2086
				2087	dataptr = data;
				2088	wsptr = workspace;
				2089	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				2090	/* Even part */
				2091
				2092	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE6];
				2093	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE5];
				2094	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE4];
				2095	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE3];
				2096	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE2];
				2097	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE1];
				2098	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE0];
				2099	tmp7 = dataptr[DCTSIZE*7];
				2100
				2101	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE6];
				2102	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE5];
				2103	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE4];
				2104	tmp13 = dataptr[DCTSIZE3] - wsptr[DCTSIZE3];
				2105	tmp14 = dataptr[DCTSIZE4] - wsptr[DCTSIZE2];
				2106	tmp15 = dataptr[DCTSIZE5] - wsptr[DCTSIZE1];
				2107	tmp16 = dataptr[DCTSIZE6] - wsptr[DCTSIZE0];
				2108
				2109	z1 = tmp0 + tmp4 + tmp5;
				2110	z2 = tmp1 + tmp3 + tmp6;
				2111	z3 = tmp2 + tmp7;
				2112	dataptr[DCTSIZE*0] = (DCTELEM)
				2113	DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
				2114	CONST_BITS+2);
				2115	z3 += z3;
				2116	dataptr[DCTSIZE*6] = (DCTELEM)
				2117	DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
				2118	MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
				2119	CONST_BITS+2);
				2120	tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
				2121	z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
				2122	MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
				2123	z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
				2124	MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
				2125	z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
				2126	MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
				2127	MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
				2128
				2129	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
				2130	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
				2131
				2132	/* Odd part */
				2133
				2134	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
				2135	FIX(1.393487498)); /* c5 */
				2136	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
				2137	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
				2138	tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
				2139	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
				2140	MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
				2141	MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
				2142	tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
				2143	MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
				2144	MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
				2145	tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
				2146	MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
				2147	MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
				2148
				2149	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
				2150	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
				2151	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
				2152	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
				2153
				2154	dataptr++; /* advance pointer to next column */
				2155	wsptr++; /* advance pointer to next column */
				2156	}
				2157	}
				2158
				2159
				2160	/*
				2161	* Perform the forward DCT on a 16x16 sample block.
				2162	*/
				2163
				2164	GLOBAL(void)
				2165	jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				2166	{
				2167	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
				2168	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
				2169	DCTELEM workspace[DCTSIZE2];
				2170	DCTELEM *dataptr;
				2171	DCTELEM *wsptr;
				2172	JSAMPROW elemptr;
				2173	int ctr;
				2174	SHIFT_TEMPS
				2175
				2176	/* Pass 1: process rows. */
				2177	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				2178	/* furthermore, we scale the results by 2**PASS1_BITS. */
				2179	/* cK represents sqrt(2) * cos(K*pi/32). */
				2180
				2181	dataptr = data;
				2182	ctr = 0;
				2183	for (;;) {
				2184	elemptr = sample_data[ctr] + start_col;
				2185
				2186	/* Even part */
				2187
				2188	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
				2189	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
				2190	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
				2191	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
				2192	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
				2193	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
				2194	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
				2195	tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
				2196
				2197	tmp10 = tmp0 + tmp7;
				2198	tmp14 = tmp0 - tmp7;
				2199	tmp11 = tmp1 + tmp6;
				2200	tmp15 = tmp1 - tmp6;
				2201	tmp12 = tmp2 + tmp5;
				2202	tmp16 = tmp2 - tmp5;
				2203	tmp13 = tmp3 + tmp4;
				2204	tmp17 = tmp3 - tmp4;
				2205
				2206	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
				2207	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
				2208	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
				2209	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
				2210	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
				2211	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
				2212	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
				2213	tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
				2214
				2215	/* Apply unsigned->signed conversion */
				2216	dataptr[0] = (DCTELEM)
				2217	((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
				2218	dataptr[4] = (DCTELEM)
				2219	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
				2220	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
				2221	CONST_BITS-PASS1_BITS);
				2222
				2223	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
				2224	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
				2225
				2226	dataptr[2] = (DCTELEM)
				2227	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
				2228	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
				2229	CONST_BITS-PASS1_BITS);
				2230	dataptr[6] = (DCTELEM)
				2231	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
				2232	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
				2233	CONST_BITS-PASS1_BITS);
				2234
				2235	/* Odd part */
				2236
				2237	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
				2238	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
				2239	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
				2240	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
				2241	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
				2242	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
				2243	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
				2244	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
				2245	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
				2246	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
				2247	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
				2248	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
				2249	tmp10 = tmp11 + tmp12 + tmp13 -
				2250	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
				2251	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
				2252	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
				2253	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
				2254	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
				2255	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
				2256	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
				2257	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
				2258
				2259	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
				2260	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
				2261	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
				2262	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
				2263
				2264	ctr++;
				2265
				2266	if (ctr != DCTSIZE) {
				2267	if (ctr == DCTSIZE * 2)
				2268	break; /* Done. */
				2269	dataptr += DCTSIZE; /* advance pointer to next row */
				2270	} else
				2271	dataptr = workspace; /* switch pointer to extended workspace */
				2272	}
				2273
				2274	/* Pass 2: process columns.
				2275	* We remove the PASS1_BITS scaling, but leave the results scaled up
				2276	* by an overall factor of 8.
				2277	* We must also scale the output by (8/16)**2 = 1/2**2.
				2278	*/
				2279
				2280	dataptr = data;
				2281	wsptr = workspace;
				2282	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				2283	/* Even part */
				2284
				2285	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE7];
				2286	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE6];
				2287	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE5];
				2288	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE4];
				2289	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE3];
				2290	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE2];
				2291	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE1];
				2292	tmp7 = dataptr[DCTSIZE7] + wsptr[DCTSIZE0];
				2293
				2294	tmp10 = tmp0 + tmp7;
				2295	tmp14 = tmp0 - tmp7;
				2296	tmp11 = tmp1 + tmp6;
				2297	tmp15 = tmp1 - tmp6;
				2298	tmp12 = tmp2 + tmp5;
				2299	tmp16 = tmp2 - tmp5;
				2300	tmp13 = tmp3 + tmp4;
				2301	tmp17 = tmp3 - tmp4;
				2302
				2303	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE7];
				2304	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE6];
				2305	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE5];
				2306	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE4];
				2307	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE3];
				2308	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE2];
				2309	tmp6 = dataptr[DCTSIZE6] - wsptr[DCTSIZE1];
				2310	tmp7 = dataptr[DCTSIZE7] - wsptr[DCTSIZE0];
				2311
				2312	dataptr[DCTSIZE*0] = (DCTELEM)
				2313	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
				2314	dataptr[DCTSIZE*4] = (DCTELEM)
				2315	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
				2316	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
				2317	CONST_BITS+PASS1_BITS+2);
				2318
				2319	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
				2320	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
				2321
				2322	dataptr[DCTSIZE*2] = (DCTELEM)
				2323	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
				2324	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
				2325	CONST_BITS+PASS1_BITS+2);
				2326	dataptr[DCTSIZE*6] = (DCTELEM)
				2327	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
				2328	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
				2329	CONST_BITS+PASS1_BITS+2);
				2330
				2331	/* Odd part */
				2332
				2333	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
				2334	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
				2335	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
				2336	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
				2337	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
				2338	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
				2339	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
				2340	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
				2341	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
				2342	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
				2343	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
				2344	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
				2345	tmp10 = tmp11 + tmp12 + tmp13 -
				2346	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
				2347	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
				2348	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
				2349	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
				2350	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
				2351	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
				2352	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
				2353	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
				2354
				2355	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
				2356	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
				2357	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
				2358	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
				2359
				2360	dataptr++; /* advance pointer to next column */
				2361	wsptr++; /* advance pointer to next column */
				2362	}
				2363	}
				2364
				2365
				2366	/*
				2367	* Perform the forward DCT on a 16x8 sample block.
				2368	*
				2369	* 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
				2370	*/
				2371
				2372	GLOBAL(void)
				2373	jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				2374	{
				2375	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
				2376	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
				2377	INT32 z1;
				2378	DCTELEM *dataptr;
				2379	JSAMPROW elemptr;
				2380	int ctr;
				2381	SHIFT_TEMPS
				2382
				2383	/* Pass 1: process rows. */
				2384	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				2385	/* furthermore, we scale the results by 2*PASS1_BITS. /
				2386	/* 16-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/32). /
				2387
				2388	dataptr = data;
				2389	ctr = 0;
				2390	for (ctr = 0; ctr < DCTSIZE; ctr++) {
				2391	elemptr = sample_data[ctr] + start_col;
				2392
				2393	/* Even part */
				2394
				2395	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
				2396	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
				2397	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
				2398	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
				2399	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
				2400	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
				2401	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
				2402	tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
				2403
				2404	tmp10 = tmp0 + tmp7;
				2405	tmp14 = tmp0 - tmp7;
				2406	tmp11 = tmp1 + tmp6;
				2407	tmp15 = tmp1 - tmp6;
				2408	tmp12 = tmp2 + tmp5;
				2409	tmp16 = tmp2 - tmp5;
				2410	tmp13 = tmp3 + tmp4;
				2411	tmp17 = tmp3 - tmp4;
				2412
				2413	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
				2414	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
				2415	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
				2416	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
				2417	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
				2418	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
				2419	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
				2420	tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
				2421
				2422	/* Apply unsigned->signed conversion */
				2423	dataptr[0] = (DCTELEM)
				2424	((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
				2425	dataptr[4] = (DCTELEM)
				2426	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
				2427	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
				2428	CONST_BITS-PASS1_BITS);
				2429
				2430	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
				2431	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
				2432
				2433	dataptr[2] = (DCTELEM)
				2434	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
				2435	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
				2436	CONST_BITS-PASS1_BITS);
				2437	dataptr[6] = (DCTELEM)
				2438	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
				2439	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
				2440	CONST_BITS-PASS1_BITS);
				2441
				2442	/* Odd part */
				2443
				2444	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
				2445	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
				2446	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
				2447	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
				2448	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
				2449	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
				2450	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
				2451	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
				2452	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
				2453	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
				2454	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
				2455	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
				2456	tmp10 = tmp11 + tmp12 + tmp13 -
				2457	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
				2458	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
				2459	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
				2460	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
				2461	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
				2462	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
				2463	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
				2464	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
				2465
				2466	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
				2467	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
				2468	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
				2469	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
				2470
				2471	dataptr += DCTSIZE; /* advance pointer to next row */
				2472	}
				2473
				2474	/* Pass 2: process columns.
				2475	* We remove the PASS1_BITS scaling, but leave the results scaled up
				2476	* by an overall factor of 8.
				2477	* We must also scale the output by 8/16 = 1/2.
				2478	*/
				2479
				2480	dataptr = data;
				2481	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				2482	/* Even part per LL&M figure 1 --- note that published figure is faulty;
				2483	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
				2484	*/
				2485
				2486	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
				2487	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
				2488	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
				2489	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
				2490
				2491	tmp10 = tmp0 + tmp3;
				2492	tmp12 = tmp0 - tmp3;
				2493	tmp11 = tmp1 + tmp2;
				2494	tmp13 = tmp1 - tmp2;
				2495
				2496	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
				2497	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
				2498	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
				2499	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
				2500
				2501	dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
				2502	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
				2503
				2504	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
				2505	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				2506	CONST_BITS+PASS1_BITS+1);
				2507	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				2508	CONST_BITS+PASS1_BITS+1);
				2509
				2510	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
				2511	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
				2512	* i0..i3 in the paper are tmp0..tmp3 here.
				2513	*/
				2514
				2515	tmp10 = tmp0 + tmp3;
				2516	tmp11 = tmp1 + tmp2;
				2517	tmp12 = tmp0 + tmp2;
				2518	tmp13 = tmp1 + tmp3;
				2519	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
				2520
				2521	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
				2522	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
				2523	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
				2524	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
				2525	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
				2526	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
				2527	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
				2528	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
				2529
				2530	tmp12 += z1;
				2531	tmp13 += z1;
				2532
				2533	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
				2534	CONST_BITS+PASS1_BITS+1);
				2535	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
				2536	CONST_BITS+PASS1_BITS+1);
				2537	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
				2538	CONST_BITS+PASS1_BITS+1);
				2539	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
				2540	CONST_BITS+PASS1_BITS+1);
				2541
				2542	dataptr++; /* advance pointer to next column */
				2543	}
				2544	}
				2545
				2546
				2547	/*
				2548	* Perform the forward DCT on a 14x7 sample block.
				2549	*
				2550	* 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
				2551	*/
				2552
				2553	GLOBAL(void)
				2554	jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				2555	{
				2556	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
				2557	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
				2558	INT32 z1, z2, z3;
				2559	DCTELEM *dataptr;
				2560	JSAMPROW elemptr;
				2561	int ctr;
				2562	SHIFT_TEMPS
				2563
				2564	/* Zero bottom row of output coefficient block. */
				2565	MEMZERO(&data[DCTSIZE7], SIZEOF(DCTELEM) DCTSIZE);
				2566
				2567	/* Pass 1: process rows. */
				2568	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				2569	/* furthermore, we scale the results by 2*PASS1_BITS. /
				2570	/* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28). /
				2571
				2572	dataptr = data;
				2573	for (ctr = 0; ctr < 7; ctr++) {
				2574	elemptr = sample_data[ctr] + start_col;
				2575
				2576	/* Even part */
				2577
				2578	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
				2579	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
				2580	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
				2581	tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
				2582	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
				2583	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
				2584	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
				2585
				2586	tmp10 = tmp0 + tmp6;
				2587	tmp14 = tmp0 - tmp6;
				2588	tmp11 = tmp1 + tmp5;
				2589	tmp15 = tmp1 - tmp5;
				2590	tmp12 = tmp2 + tmp4;
				2591	tmp16 = tmp2 - tmp4;
				2592
				2593	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
				2594	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
				2595	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
				2596	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
				2597	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
				2598	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
				2599	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
				2600
				2601	/* Apply unsigned->signed conversion */
				2602	dataptr[0] = (DCTELEM)
				2603	((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
				2604	tmp13 += tmp13;
				2605	dataptr[4] = (DCTELEM)
				2606	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
				2607	MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
				2608	MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
				2609	CONST_BITS-PASS1_BITS);
				2610
				2611	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
				2612
				2613	dataptr[2] = (DCTELEM)
				2614	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
				2615	+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
				2616	CONST_BITS-PASS1_BITS);
				2617	dataptr[6] = (DCTELEM)
				2618	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
				2619	- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
				2620	CONST_BITS-PASS1_BITS);
				2621
				2622	/* Odd part */
				2623
				2624	tmp10 = tmp1 + tmp2;
				2625	tmp11 = tmp5 - tmp4;
				2626	dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
				2627	tmp3 <<= CONST_BITS;
				2628	tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
				2629	tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
				2630	tmp10 += tmp11 - tmp3;
				2631	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
				2632	MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
				2633	dataptr[5] = (DCTELEM)
				2634	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
				2635	+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
				2636	CONST_BITS-PASS1_BITS);
				2637	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
				2638	MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
				2639	dataptr[3] = (DCTELEM)
				2640	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
				2641	- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
				2642	CONST_BITS-PASS1_BITS);
				2643	dataptr[1] = (DCTELEM)
				2644	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
				2645	MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
				2646	CONST_BITS-PASS1_BITS);
				2647
				2648	dataptr += DCTSIZE; /* advance pointer to next row */
				2649	}
				2650
				2651	/* Pass 2: process columns.
				2652	* We remove the PASS1_BITS scaling, but leave the results scaled up
				2653	* by an overall factor of 8.
				2654	* We must also scale the output by (8/14)*(8/7) = 32/49, which we
				2655	* partially fold into the constant multipliers and final shifting:
				2656	* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14) 64/49.
				2657	*/
				2658
				2659	dataptr = data;
				2660	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				2661	/* Even part */
				2662
				2663	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE6];
				2664	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE5];
				2665	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE4];
				2666	tmp3 = dataptr[DCTSIZE*3];
				2667
				2668	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE6];
				2669	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE5];
				2670	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE4];
				2671
				2672	z1 = tmp0 + tmp2;
				2673	dataptr[DCTSIZE*0] = (DCTELEM)
				2674	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
				2675	CONST_BITS+PASS1_BITS+1);
				2676	tmp3 += tmp3;
				2677	z1 -= tmp3;
				2678	z1 -= tmp3;
				2679	z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
				2680	z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
				2681	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
				2682	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
				2683	z1 -= z2;
				2684	z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
				2685	dataptr[DCTSIZE*4] = (DCTELEM)
				2686	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
				2687	CONST_BITS+PASS1_BITS+1);
				2688	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
				2689
				2690	/* Odd part */
				2691
				2692	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
				2693	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
				2694	tmp0 = tmp1 - tmp2;
				2695	tmp1 += tmp2;
				2696	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
				2697	tmp1 += tmp2;
				2698	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
				2699	tmp0 += tmp3;
				2700	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
				2701
				2702	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
				2703	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
				2704	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
				2705
				2706	dataptr++; /* advance pointer to next column */
				2707	}
				2708	}
				2709
				2710
				2711	/*
				2712	* Perform the forward DCT on a 12x6 sample block.
				2713	*
				2714	* 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
				2715	*/
				2716
				2717	GLOBAL(void)
				2718	jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				2719	{
				2720	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
				2721	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
				2722	DCTELEM *dataptr;
				2723	JSAMPROW elemptr;
				2724	int ctr;
				2725	SHIFT_TEMPS
				2726
				2727	/* Zero 2 bottom rows of output coefficient block. */
				2728	MEMZERO(&data[DCTSIZE6], SIZEOF(DCTELEM) DCTSIZE * 2);
				2729
				2730	/* Pass 1: process rows. */
				2731	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				2732	/* furthermore, we scale the results by 2*PASS1_BITS. /
				2733	/* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24). /
				2734
				2735	dataptr = data;
				2736	for (ctr = 0; ctr < 6; ctr++) {
				2737	elemptr = sample_data[ctr] + start_col;
				2738
				2739	/* Even part */
				2740
				2741	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
				2742	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
				2743	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
				2744	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
				2745	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
				2746	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
				2747
				2748	tmp10 = tmp0 + tmp5;
				2749	tmp13 = tmp0 - tmp5;
				2750	tmp11 = tmp1 + tmp4;
				2751	tmp14 = tmp1 - tmp4;
				2752	tmp12 = tmp2 + tmp3;
				2753	tmp15 = tmp2 - tmp3;
				2754
				2755	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
				2756	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
				2757	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
				2758	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
				2759	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
				2760	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
				2761
				2762	/* Apply unsigned->signed conversion */
				2763	dataptr[0] = (DCTELEM)
				2764	((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
				2765	dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
				2766	dataptr[4] = (DCTELEM)
				2767	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
				2768	CONST_BITS-PASS1_BITS);
				2769	dataptr[2] = (DCTELEM)
				2770	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
				2771	CONST_BITS-PASS1_BITS);
				2772
				2773	/* Odd part */
				2774
				2775	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
				2776	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
				2777	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
				2778	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
				2779	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
				2780	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
				2781	+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
				2782	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
				2783	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
				2784	+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
				2785	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
				2786	- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
				2787	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
				2788	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
				2789
				2790	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
				2791	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
				2792	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
				2793	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
				2794
				2795	dataptr += DCTSIZE; /* advance pointer to next row */
				2796	}
				2797
				2798	/* Pass 2: process columns.
				2799	* We remove the PASS1_BITS scaling, but leave the results scaled up
				2800	* by an overall factor of 8.
				2801	* We must also scale the output by (8/12)*(8/6) = 8/9, which we
				2802	* partially fold into the constant multipliers and final shifting:
				2803	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
				2804	*/
				2805
				2806	dataptr = data;
				2807	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				2808	/* Even part */
				2809
				2810	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
				2811	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
				2812	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
				2813
				2814	tmp10 = tmp0 + tmp2;
				2815	tmp12 = tmp0 - tmp2;
				2816
				2817	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
				2818	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
				2819	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
				2820
				2821	dataptr[DCTSIZE*0] = (DCTELEM)
				2822	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
				2823	CONST_BITS+PASS1_BITS+1);
				2824	dataptr[DCTSIZE*2] = (DCTELEM)
				2825	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
				2826	CONST_BITS+PASS1_BITS+1);
				2827	dataptr[DCTSIZE*4] = (DCTELEM)
				2828	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
				2829	CONST_BITS+PASS1_BITS+1);
				2830
				2831	/* Odd part */
				2832
				2833	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
				2834
				2835	dataptr[DCTSIZE*1] = (DCTELEM)
				2836	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
				2837	CONST_BITS+PASS1_BITS+1);
				2838	dataptr[DCTSIZE*3] = (DCTELEM)
				2839	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
				2840	CONST_BITS+PASS1_BITS+1);
				2841	dataptr[DCTSIZE*5] = (DCTELEM)
				2842	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
				2843	CONST_BITS+PASS1_BITS+1);
				2844
				2845	dataptr++; /* advance pointer to next column */
				2846	}
				2847	}
				2848
				2849
				2850	/*
				2851	* Perform the forward DCT on a 10x5 sample block.
				2852	*
				2853	* 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
				2854	*/
				2855
				2856	GLOBAL(void)
				2857	jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				2858	{
				2859	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
				2860	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
				2861	DCTELEM *dataptr;
				2862	JSAMPROW elemptr;
				2863	int ctr;
				2864	SHIFT_TEMPS
				2865
				2866	/* Zero 3 bottom rows of output coefficient block. */
				2867	MEMZERO(&data[DCTSIZE5], SIZEOF(DCTELEM) DCTSIZE * 3);
				2868
				2869	/* Pass 1: process rows. */
				2870	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				2871	/* furthermore, we scale the results by 2*PASS1_BITS. /
				2872	/* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20). /
				2873
				2874	dataptr = data;
				2875	for (ctr = 0; ctr < 5; ctr++) {
				2876	elemptr = sample_data[ctr] + start_col;
				2877
				2878	/* Even part */
				2879
				2880	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
				2881	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
				2882	tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
				2883	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
				2884	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
				2885
				2886	tmp10 = tmp0 + tmp4;
				2887	tmp13 = tmp0 - tmp4;
				2888	tmp11 = tmp1 + tmp3;
				2889	tmp14 = tmp1 - tmp3;
				2890
				2891	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
				2892	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
				2893	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
				2894	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
				2895	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
				2896
				2897	/* Apply unsigned->signed conversion */
				2898	dataptr[0] = (DCTELEM)
				2899	((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
				2900	tmp12 += tmp12;
				2901	dataptr[4] = (DCTELEM)
				2902	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
				2903	MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
				2904	CONST_BITS-PASS1_BITS);
				2905	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
				2906	dataptr[2] = (DCTELEM)
				2907	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
				2908	CONST_BITS-PASS1_BITS);
				2909	dataptr[6] = (DCTELEM)
				2910	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
				2911	CONST_BITS-PASS1_BITS);
				2912
				2913	/* Odd part */
				2914
				2915	tmp10 = tmp0 + tmp4;
				2916	tmp11 = tmp1 - tmp3;
				2917	dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
				2918	tmp2 <<= CONST_BITS;
				2919	dataptr[1] = (DCTELEM)
				2920	DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
				2921	MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
				2922	MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
				2923	MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
				2924	CONST_BITS-PASS1_BITS);
				2925	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
				2926	MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
				2927	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
				2928	(tmp11 << (CONST_BITS - 1)) - tmp2;
				2929	dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
				2930	dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
				2931
				2932	dataptr += DCTSIZE; /* advance pointer to next row */
				2933	}
				2934
				2935	/* Pass 2: process columns.
				2936	* We remove the PASS1_BITS scaling, but leave the results scaled up
				2937	* by an overall factor of 8.
				2938	* We must also scale the output by (8/10)*(8/5) = 32/25, which we
				2939	* fold into the constant multipliers:
				2940	* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10) 32/25.
				2941	*/
				2942
				2943	dataptr = data;
				2944	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				2945	/* Even part */
				2946
				2947	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE4];
				2948	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE3];
				2949	tmp2 = dataptr[DCTSIZE*2];
				2950
				2951	tmp10 = tmp0 + tmp1;
				2952	tmp11 = tmp0 - tmp1;
				2953
				2954	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE4];
				2955	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE3];
				2956
				2957	dataptr[DCTSIZE*0] = (DCTELEM)
				2958	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
				2959	CONST_BITS+PASS1_BITS);
				2960	tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
				2961	tmp10 -= tmp2 << 2;
				2962	tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
				2963	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
				2964	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
				2965
				2966	/* Odd part */
				2967
				2968	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
				2969
				2970	dataptr[DCTSIZE*1] = (DCTELEM)
				2971	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
				2972	CONST_BITS+PASS1_BITS);
				2973	dataptr[DCTSIZE*3] = (DCTELEM)
				2974	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
				2975	CONST_BITS+PASS1_BITS);
				2976
				2977	dataptr++; /* advance pointer to next column */
				2978	}
				2979	}
				2980
				2981
				2982	/*
				2983	* Perform the forward DCT on an 8x4 sample block.
				2984	*
				2985	* 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
				2986	*/
				2987
				2988	GLOBAL(void)
				2989	jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				2990	{
				2991	INT32 tmp0, tmp1, tmp2, tmp3;
				2992	INT32 tmp10, tmp11, tmp12, tmp13;
				2993	INT32 z1;
				2994	DCTELEM *dataptr;
				2995	JSAMPROW elemptr;
				2996	int ctr;
				2997	SHIFT_TEMPS
				2998
				2999	/* Zero 4 bottom rows of output coefficient block. */
				3000	MEMZERO(&data[DCTSIZE4], SIZEOF(DCTELEM) DCTSIZE * 4);
				3001
				3002	/* Pass 1: process rows. */
				3003	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3004	/* furthermore, we scale the results by 2**PASS1_BITS. */
				3005	/* We must also scale the output by 8/4 = 2, which we add here. */
				3006
				3007	dataptr = data;
				3008	for (ctr = 0; ctr < 4; ctr++) {
				3009	elemptr = sample_data[ctr] + start_col;
				3010
				3011	/* Even part per LL&M figure 1 --- note that published figure is faulty;
				3012	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
				3013	*/
				3014
				3015	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
				3016	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
				3017	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
				3018	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
				3019
				3020	tmp10 = tmp0 + tmp3;
				3021	tmp12 = tmp0 - tmp3;
				3022	tmp11 = tmp1 + tmp2;
				3023	tmp13 = tmp1 - tmp2;
				3024
				3025	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
				3026	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
				3027	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
				3028	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
				3029
				3030	/* Apply unsigned->signed conversion */
				3031	dataptr[0] = (DCTELEM)
				3032	((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
				3033	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
				3034
				3035	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
				3036	/* Add fudge factor here for final descale. */
				3037	z1 += ONE << (CONST_BITS-PASS1_BITS-2);
				3038	dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				3039	CONST_BITS-PASS1_BITS-1);
				3040	dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				3041	CONST_BITS-PASS1_BITS-1);
				3042
				3043	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
				3044	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
				3045	* i0..i3 in the paper are tmp0..tmp3 here.
				3046	*/
				3047
				3048	tmp10 = tmp0 + tmp3;
				3049	tmp11 = tmp1 + tmp2;
				3050	tmp12 = tmp0 + tmp2;
				3051	tmp13 = tmp1 + tmp3;
				3052	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
				3053	/* Add fudge factor here for final descale. */
				3054	z1 += ONE << (CONST_BITS-PASS1_BITS-2);
				3055
				3056	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
				3057	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
				3058	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
				3059	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
				3060	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
				3061	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
				3062	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
				3063	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
				3064
				3065	tmp12 += z1;
				3066	tmp13 += z1;
				3067
				3068	dataptr[1] = (DCTELEM)
				3069	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
				3070	dataptr[3] = (DCTELEM)
				3071	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
				3072	dataptr[5] = (DCTELEM)
				3073	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
				3074	dataptr[7] = (DCTELEM)
				3075	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
				3076
				3077	dataptr += DCTSIZE; /* advance pointer to next row */
				3078	}
				3079
				3080	/* Pass 2: process columns.
				3081	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3082	* by an overall factor of 8.
				3083	* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
				3084	*/
				3085
				3086	dataptr = data;
				3087	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				3088	/* Even part */
				3089
				3090	/* Add fudge factor here for final descale. */
				3091	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3] + (ONE << (PASS1_BITS-1));
				3092	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
				3093
				3094	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
				3095	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
				3096
				3097	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
				3098	dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
				3099
				3100	/* Odd part */
				3101
				3102	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
				3103	/* Add fudge factor here for final descale. */
				3104	tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
				3105
				3106	dataptr[DCTSIZE*1] = (DCTELEM)
				3107	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
				3108	CONST_BITS+PASS1_BITS);
				3109	dataptr[DCTSIZE*3] = (DCTELEM)
				3110	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
				3111	CONST_BITS+PASS1_BITS);
				3112
				3113	dataptr++; /* advance pointer to next column */
				3114	}
				3115	}
				3116
				3117
				3118	/*
				3119	* Perform the forward DCT on a 6x3 sample block.
				3120	*
				3121	* 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
				3122	*/
				3123
				3124	GLOBAL(void)
				3125	jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3126	{
				3127	INT32 tmp0, tmp1, tmp2;
				3128	INT32 tmp10, tmp11, tmp12;
				3129	DCTELEM *dataptr;
				3130	JSAMPROW elemptr;
				3131	int ctr;
				3132	SHIFT_TEMPS
				3133
				3134	/* Pre-zero output coefficient block. */
				3135	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				3136
				3137	/* Pass 1: process rows. */
				3138	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3139	/* furthermore, we scale the results by 2*PASS1_BITS. /
				3140	/* We scale the results further by 2 as part of output adaption */
				3141	/* scaling for different DCT size. */
				3142	/* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /
				3143
				3144	dataptr = data;
				3145	for (ctr = 0; ctr < 3; ctr++) {
				3146	elemptr = sample_data[ctr] + start_col;
				3147
				3148	/* Even part */
				3149
				3150	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
				3151	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
				3152	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
				3153
				3154	tmp10 = tmp0 + tmp2;
				3155	tmp12 = tmp0 - tmp2;
				3156
				3157	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
				3158	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
				3159	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
				3160
				3161	/* Apply unsigned->signed conversion */
				3162	dataptr[0] = (DCTELEM)
				3163	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
				3164	dataptr[2] = (DCTELEM)
				3165	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
				3166	CONST_BITS-PASS1_BITS-1);
				3167	dataptr[4] = (DCTELEM)
				3168	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
				3169	CONST_BITS-PASS1_BITS-1);
				3170
				3171	/* Odd part */
				3172
				3173	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
				3174	CONST_BITS-PASS1_BITS-1);
				3175
				3176	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
				3177	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
				3178	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
				3179
				3180	dataptr += DCTSIZE; /* advance pointer to next row */
				3181	}
				3182
				3183	/* Pass 2: process columns.
				3184	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3185	* by an overall factor of 8.
				3186	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
				3187	* fold into the constant multipliers (other part was done in pass 1):
				3188	* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6) 16/9.
				3189	*/
				3190
				3191	dataptr = data;
				3192	for (ctr = 0; ctr < 6; ctr++) {
				3193	/* Even part */
				3194
				3195	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE2];
				3196	tmp1 = dataptr[DCTSIZE*1];
				3197
				3198	tmp2 = dataptr[DCTSIZE0] - dataptr[DCTSIZE2];
				3199
				3200	dataptr[DCTSIZE*0] = (DCTELEM)
				3201	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
				3202	CONST_BITS+PASS1_BITS);
				3203	dataptr[DCTSIZE*2] = (DCTELEM)
				3204	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
				3205	CONST_BITS+PASS1_BITS);
				3206
				3207	/* Odd part */
				3208
				3209	dataptr[DCTSIZE*1] = (DCTELEM)
				3210	DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
				3211	CONST_BITS+PASS1_BITS);
				3212
				3213	dataptr++; /* advance pointer to next column */
				3214	}
				3215	}
				3216
				3217
				3218	/*
				3219	* Perform the forward DCT on a 4x2 sample block.
				3220	*
				3221	* 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
				3222	*/
				3223
				3224	GLOBAL(void)
				3225	jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3226	{
				3227	INT32 tmp0, tmp1;
				3228	INT32 tmp10, tmp11;
				3229	DCTELEM *dataptr;
				3230	JSAMPROW elemptr;
				3231	int ctr;
				3232	SHIFT_TEMPS
				3233
				3234	/* Pre-zero output coefficient block. */
				3235	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				3236
				3237	/* Pass 1: process rows. */
				3238	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3239	/* furthermore, we scale the results by 2*PASS1_BITS. /
				3240	/* We must also scale the output by (8/4)(8/2) = 23, which we add here. /
				3241	/* 4-point FDCT kernel, */
				3242	/* cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /
				3243
				3244	dataptr = data;
				3245	for (ctr = 0; ctr < 2; ctr++) {
				3246	elemptr = sample_data[ctr] + start_col;
				3247
				3248	/* Even part */
				3249
				3250	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
				3251	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
				3252
				3253	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
				3254	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
				3255
				3256	/* Apply unsigned->signed conversion */
				3257	dataptr[0] = (DCTELEM)
				3258	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
				3259	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
				3260
				3261	/* Odd part */
				3262
				3263	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
				3264	/* Add fudge factor here for final descale. */
				3265	tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
				3266
				3267	dataptr[1] = (DCTELEM)
				3268	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
				3269	CONST_BITS-PASS1_BITS-3);
				3270	dataptr[3] = (DCTELEM)
				3271	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
				3272	CONST_BITS-PASS1_BITS-3);
				3273
				3274	dataptr += DCTSIZE; /* advance pointer to next row */
				3275	}
				3276
				3277	/* Pass 2: process columns.
				3278	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3279	* by an overall factor of 8.
				3280	*/
				3281
				3282	dataptr = data;
				3283	for (ctr = 0; ctr < 4; ctr++) {
				3284	/* Even part */
				3285
				3286	/* Add fudge factor here for final descale. */
				3287	tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
				3288	tmp1 = dataptr[DCTSIZE*1];
				3289
				3290	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
				3291
				3292	/* Odd part */
				3293
				3294	dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
				3295
				3296	dataptr++; /* advance pointer to next column */
				3297	}
				3298	}
				3299
				3300
				3301	/*
				3302	* Perform the forward DCT on a 2x1 sample block.
				3303	*
				3304	* 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
				3305	*/
				3306
				3307	GLOBAL(void)
				3308	jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3309	{
				3310	INT32 tmp0, tmp1;
				3311	JSAMPROW elemptr;
				3312
				3313	/* Pre-zero output coefficient block. */
				3314	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				3315
				3316	elemptr = sample_data[0] + start_col;
				3317
				3318	tmp0 = GETJSAMPLE(elemptr[0]);
				3319	tmp1 = GETJSAMPLE(elemptr[1]);
				3320
				3321	/* We leave the results scaled up by an overall factor of 8.
				3322	* We must also scale the output by (8/2)(8/1) = 2*5.
				3323	*/
				3324
				3325	/* Even part */
				3326	/* Apply unsigned->signed conversion */
				3327	data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
				3328
				3329	/* Odd part */
				3330	data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
				3331	}
				3332
				3333
				3334	/*
				3335	* Perform the forward DCT on an 8x16 sample block.
				3336	*
				3337	* 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
				3338	*/
				3339
				3340	GLOBAL(void)
				3341	jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3342	{
				3343	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
				3344	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
				3345	INT32 z1;
				3346	DCTELEM workspace[DCTSIZE2];
				3347	DCTELEM *dataptr;
				3348	DCTELEM *wsptr;
				3349	JSAMPROW elemptr;
				3350	int ctr;
				3351	SHIFT_TEMPS
				3352
				3353	/* Pass 1: process rows. */
				3354	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3355	/* furthermore, we scale the results by 2*PASS1_BITS. /
				3356
				3357	dataptr = data;
				3358	ctr = 0;
				3359	for (;;) {
				3360	elemptr = sample_data[ctr] + start_col;
				3361
				3362	/* Even part per LL&M figure 1 --- note that published figure is faulty;
				3363	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
				3364	*/
				3365
				3366	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
				3367	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
				3368	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
				3369	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
				3370
				3371	tmp10 = tmp0 + tmp3;
				3372	tmp12 = tmp0 - tmp3;
				3373	tmp11 = tmp1 + tmp2;
				3374	tmp13 = tmp1 - tmp2;
				3375
				3376	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
				3377	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
				3378	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
				3379	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
				3380
				3381	/* Apply unsigned->signed conversion */
				3382	dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
				3383	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
				3384
				3385	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
				3386	dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
				3387	CONST_BITS-PASS1_BITS);
				3388	dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
				3389	CONST_BITS-PASS1_BITS);
				3390
				3391	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
				3392	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
				3393	* i0..i3 in the paper are tmp0..tmp3 here.
				3394	*/
				3395
				3396	tmp10 = tmp0 + tmp3;
				3397	tmp11 = tmp1 + tmp2;
				3398	tmp12 = tmp0 + tmp2;
				3399	tmp13 = tmp1 + tmp3;
				3400	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
				3401
				3402	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
				3403	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
				3404	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
				3405	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
				3406	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
				3407	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
				3408	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
				3409	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
				3410
				3411	tmp12 += z1;
				3412	tmp13 += z1;
				3413
				3414	dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
				3415	dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
				3416	dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
				3417	dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
				3418
				3419	ctr++;
				3420
				3421	if (ctr != DCTSIZE) {
				3422	if (ctr == DCTSIZE * 2)
				3423	break; /* Done. */
				3424	dataptr += DCTSIZE; /* advance pointer to next row */
				3425	} else
				3426	dataptr = workspace; /* switch pointer to extended workspace */
				3427	}
				3428
				3429	/* Pass 2: process columns.
				3430	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3431	* by an overall factor of 8.
				3432	* We must also scale the output by 8/16 = 1/2.
				3433	* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
				3434	*/
				3435
				3436	dataptr = data;
				3437	wsptr = workspace;
				3438	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
				3439	/* Even part */
				3440
				3441	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE7];
				3442	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE6];
				3443	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE5];
				3444	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE4];
				3445	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE3];
				3446	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE2];
				3447	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE1];
				3448	tmp7 = dataptr[DCTSIZE7] + wsptr[DCTSIZE0];
				3449
				3450	tmp10 = tmp0 + tmp7;
				3451	tmp14 = tmp0 - tmp7;
				3452	tmp11 = tmp1 + tmp6;
				3453	tmp15 = tmp1 - tmp6;
				3454	tmp12 = tmp2 + tmp5;
				3455	tmp16 = tmp2 - tmp5;
				3456	tmp13 = tmp3 + tmp4;
				3457	tmp17 = tmp3 - tmp4;
				3458
				3459	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE7];
				3460	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE6];
				3461	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE5];
				3462	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE4];
				3463	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE3];
				3464	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE2];
				3465	tmp6 = dataptr[DCTSIZE6] - wsptr[DCTSIZE1];
				3466	tmp7 = dataptr[DCTSIZE7] - wsptr[DCTSIZE0];
				3467
				3468	dataptr[DCTSIZE*0] = (DCTELEM)
				3469	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
				3470	dataptr[DCTSIZE*4] = (DCTELEM)
				3471	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
				3472	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
				3473	CONST_BITS+PASS1_BITS+1);
				3474
				3475	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
				3476	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
				3477
				3478	dataptr[DCTSIZE*2] = (DCTELEM)
				3479	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
				3480	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
				3481	CONST_BITS+PASS1_BITS+1);
				3482	dataptr[DCTSIZE*6] = (DCTELEM)
				3483	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
				3484	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
				3485	CONST_BITS+PASS1_BITS+1);
				3486
				3487	/* Odd part */
				3488
				3489	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
				3490	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
				3491	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
				3492	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
				3493	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
				3494	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
				3495	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
				3496	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
				3497	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
				3498	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
				3499	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
				3500	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
				3501	tmp10 = tmp11 + tmp12 + tmp13 -
				3502	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
				3503	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
				3504	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
				3505	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
				3506	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
				3507	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
				3508	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
				3509	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
				3510
				3511	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
				3512	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
				3513	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
				3514	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
				3515
				3516	dataptr++; /* advance pointer to next column */
				3517	wsptr++; /* advance pointer to next column */
				3518	}
				3519	}
				3520
				3521
				3522	/*
				3523	* Perform the forward DCT on a 7x14 sample block.
				3524	*
				3525	* 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
				3526	*/
				3527
				3528	GLOBAL(void)
				3529	jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3530	{
				3531	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
				3532	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
				3533	INT32 z1, z2, z3;
				3534	DCTELEM workspace[8*6];
				3535	DCTELEM *dataptr;
				3536	DCTELEM *wsptr;
				3537	JSAMPROW elemptr;
				3538	int ctr;
				3539	SHIFT_TEMPS
				3540
				3541	/* Pre-zero output coefficient block. */
				3542	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				3543
				3544	/* Pass 1: process rows. */
				3545	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3546	/* furthermore, we scale the results by 2*PASS1_BITS. /
				3547	/* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14). /
				3548
				3549	dataptr = data;
				3550	ctr = 0;
				3551	for (;;) {
				3552	elemptr = sample_data[ctr] + start_col;
				3553
				3554	/* Even part */
				3555
				3556	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
				3557	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
				3558	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
				3559	tmp3 = GETJSAMPLE(elemptr[3]);
				3560
				3561	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
				3562	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
				3563	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
				3564
				3565	z1 = tmp0 + tmp2;
				3566	/* Apply unsigned->signed conversion */
				3567	dataptr[0] = (DCTELEM)
				3568	((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
				3569	tmp3 += tmp3;
				3570	z1 -= tmp3;
				3571	z1 -= tmp3;
				3572	z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
				3573	z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
				3574	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
				3575	dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
				3576	z1 -= z2;
				3577	z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
				3578	dataptr[4] = (DCTELEM)
				3579	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
				3580	CONST_BITS-PASS1_BITS);
				3581	dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
				3582
				3583	/* Odd part */
				3584
				3585	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
				3586	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
				3587	tmp0 = tmp1 - tmp2;
				3588	tmp1 += tmp2;
				3589	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
				3590	tmp1 += tmp2;
				3591	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
				3592	tmp0 += tmp3;
				3593	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
				3594
				3595	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
				3596	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
				3597	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
				3598
				3599	ctr++;
				3600
				3601	if (ctr != DCTSIZE) {
				3602	if (ctr == 14)
				3603	break; /* Done. */
				3604	dataptr += DCTSIZE; /* advance pointer to next row */
				3605	} else
				3606	dataptr = workspace; /* switch pointer to extended workspace */
				3607	}
				3608
				3609	/* Pass 2: process columns.
				3610	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3611	* by an overall factor of 8.
				3612	* We must also scale the output by (8/7)*(8/14) = 32/49, which we
				3613	* fold into the constant multipliers:
				3614	* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28) 32/49.
				3615	*/
				3616
				3617	dataptr = data;
				3618	wsptr = workspace;
				3619	for (ctr = 0; ctr < 7; ctr++) {
				3620	/* Even part */
				3621
				3622	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE5];
				3623	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE4];
				3624	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE3];
				3625	tmp13 = dataptr[DCTSIZE3] + wsptr[DCTSIZE2];
				3626	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE1];
				3627	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE0];
				3628	tmp6 = dataptr[DCTSIZE6] + dataptr[DCTSIZE7];
				3629
				3630	tmp10 = tmp0 + tmp6;
				3631	tmp14 = tmp0 - tmp6;
				3632	tmp11 = tmp1 + tmp5;
				3633	tmp15 = tmp1 - tmp5;
				3634	tmp12 = tmp2 + tmp4;
				3635	tmp16 = tmp2 - tmp4;
				3636
				3637	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE5];
				3638	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE4];
				3639	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE3];
				3640	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE2];
				3641	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE1];
				3642	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE0];
				3643	tmp6 = dataptr[DCTSIZE6] - dataptr[DCTSIZE7];
				3644
				3645	dataptr[DCTSIZE*0] = (DCTELEM)
				3646	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
				3647	FIX(0.653061224)), /* 32/49 */
				3648	CONST_BITS+PASS1_BITS);
				3649	tmp13 += tmp13;
				3650	dataptr[DCTSIZE*4] = (DCTELEM)
				3651	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
				3652	MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
				3653	MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
				3654	CONST_BITS+PASS1_BITS);
				3655
				3656	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
				3657
				3658	dataptr[DCTSIZE*2] = (DCTELEM)
				3659	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
				3660	+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
				3661	CONST_BITS+PASS1_BITS);
				3662	dataptr[DCTSIZE*6] = (DCTELEM)
				3663	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
				3664	- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
				3665	CONST_BITS+PASS1_BITS);
				3666
				3667	/* Odd part */
				3668
				3669	tmp10 = tmp1 + tmp2;
				3670	tmp11 = tmp5 - tmp4;
				3671	dataptr[DCTSIZE*7] = (DCTELEM)
				3672	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
				3673	FIX(0.653061224)), /* 32/49 */
				3674	CONST_BITS+PASS1_BITS);
				3675	tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
				3676	tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
				3677	tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
				3678	tmp10 += tmp11 - tmp3;
				3679	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
				3680	MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
				3681	dataptr[DCTSIZE*5] = (DCTELEM)
				3682	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
				3683	+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
				3684	CONST_BITS+PASS1_BITS);
				3685	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
				3686	MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
				3687	dataptr[DCTSIZE*3] = (DCTELEM)
				3688	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
				3689	- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
				3690	CONST_BITS+PASS1_BITS);
				3691	dataptr[DCTSIZE*1] = (DCTELEM)
				3692	DESCALE(tmp11 + tmp12 + tmp3
				3693	- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
				3694	- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
				3695	CONST_BITS+PASS1_BITS);
				3696
				3697	dataptr++; /* advance pointer to next column */
				3698	wsptr++; /* advance pointer to next column */
				3699	}
				3700	}
				3701
				3702
				3703	/*
				3704	* Perform the forward DCT on a 6x12 sample block.
				3705	*
				3706	* 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
				3707	*/
				3708
				3709	GLOBAL(void)
				3710	jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3711	{
				3712	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
				3713	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
				3714	DCTELEM workspace[8*4];
				3715	DCTELEM *dataptr;
				3716	DCTELEM *wsptr;
				3717	JSAMPROW elemptr;
				3718	int ctr;
				3719	SHIFT_TEMPS
				3720
				3721	/* Pre-zero output coefficient block. */
				3722	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				3723
				3724	/* Pass 1: process rows. */
				3725	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3726	/* furthermore, we scale the results by 2*PASS1_BITS. /
				3727	/* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /
				3728
				3729	dataptr = data;
				3730	ctr = 0;
				3731	for (;;) {
				3732	elemptr = sample_data[ctr] + start_col;
				3733
				3734	/* Even part */
				3735
				3736	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
				3737	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
				3738	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
				3739
				3740	tmp10 = tmp0 + tmp2;
				3741	tmp12 = tmp0 - tmp2;
				3742
				3743	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
				3744	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
				3745	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
				3746
				3747	/* Apply unsigned->signed conversion */
				3748	dataptr[0] = (DCTELEM)
				3749	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
				3750	dataptr[2] = (DCTELEM)
				3751	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
				3752	CONST_BITS-PASS1_BITS);
				3753	dataptr[4] = (DCTELEM)
				3754	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
				3755	CONST_BITS-PASS1_BITS);
				3756
				3757	/* Odd part */
				3758
				3759	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
				3760	CONST_BITS-PASS1_BITS);
				3761
				3762	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
				3763	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
				3764	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
				3765
				3766	ctr++;
				3767
				3768	if (ctr != DCTSIZE) {
				3769	if (ctr == 12)
				3770	break; /* Done. */
				3771	dataptr += DCTSIZE; /* advance pointer to next row */
				3772	} else
				3773	dataptr = workspace; /* switch pointer to extended workspace */
				3774	}
				3775
				3776	/* Pass 2: process columns.
				3777	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3778	* by an overall factor of 8.
				3779	* We must also scale the output by (8/6)*(8/12) = 8/9, which we
				3780	* fold into the constant multipliers:
				3781	* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24) 8/9.
				3782	*/
				3783
				3784	dataptr = data;
				3785	wsptr = workspace;
				3786	for (ctr = 0; ctr < 6; ctr++) {
				3787	/* Even part */
				3788
				3789	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE3];
				3790	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE2];
				3791	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE1];
				3792	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE0];
				3793	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE7];
				3794	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE6];
				3795
				3796	tmp10 = tmp0 + tmp5;
				3797	tmp13 = tmp0 - tmp5;
				3798	tmp11 = tmp1 + tmp4;
				3799	tmp14 = tmp1 - tmp4;
				3800	tmp12 = tmp2 + tmp3;
				3801	tmp15 = tmp2 - tmp3;
				3802
				3803	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE3];
				3804	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE2];
				3805	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE1];
				3806	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE0];
				3807	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE7];
				3808	tmp5 = dataptr[DCTSIZE5] - dataptr[DCTSIZE6];
				3809
				3810	dataptr[DCTSIZE*0] = (DCTELEM)
				3811	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
				3812	CONST_BITS+PASS1_BITS);
				3813	dataptr[DCTSIZE*6] = (DCTELEM)
				3814	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
				3815	CONST_BITS+PASS1_BITS);
				3816	dataptr[DCTSIZE*4] = (DCTELEM)
				3817	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
				3818	CONST_BITS+PASS1_BITS);
				3819	dataptr[DCTSIZE*2] = (DCTELEM)
				3820	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
				3821	MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
				3822	CONST_BITS+PASS1_BITS);
				3823
				3824	/* Odd part */
				3825
				3826	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
				3827	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
				3828	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
				3829	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
				3830	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
				3831	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
				3832	+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
				3833	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
				3834	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
				3835	+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
				3836	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
				3837	- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
				3838	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
				3839	- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
				3840
				3841	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
				3842	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
				3843	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
				3844	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
				3845
				3846	dataptr++; /* advance pointer to next column */
				3847	wsptr++; /* advance pointer to next column */
				3848	}
				3849	}
				3850
				3851
				3852	/*
				3853	* Perform the forward DCT on a 5x10 sample block.
				3854	*
				3855	* 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
				3856	*/
				3857
				3858	GLOBAL(void)
				3859	jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				3860	{
				3861	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
				3862	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
				3863	DCTELEM workspace[8*2];
				3864	DCTELEM *dataptr;
				3865	DCTELEM *wsptr;
				3866	JSAMPROW elemptr;
				3867	int ctr;
				3868	SHIFT_TEMPS
				3869
				3870	/* Pre-zero output coefficient block. */
				3871	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				3872
				3873	/* Pass 1: process rows. */
				3874	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				3875	/* furthermore, we scale the results by 2*PASS1_BITS. /
				3876	/* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10). /
				3877
				3878	dataptr = data;
				3879	ctr = 0;
				3880	for (;;) {
				3881	elemptr = sample_data[ctr] + start_col;
				3882
				3883	/* Even part */
				3884
				3885	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
				3886	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
				3887	tmp2 = GETJSAMPLE(elemptr[2]);
				3888
				3889	tmp10 = tmp0 + tmp1;
				3890	tmp11 = tmp0 - tmp1;
				3891
				3892	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
				3893	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
				3894
				3895	/* Apply unsigned->signed conversion */
				3896	dataptr[0] = (DCTELEM)
				3897	((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
				3898	tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
				3899	tmp10 -= tmp2 << 2;
				3900	tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
				3901	dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
				3902	dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
				3903
				3904	/* Odd part */
				3905
				3906	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
				3907
				3908	dataptr[1] = (DCTELEM)
				3909	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
				3910	CONST_BITS-PASS1_BITS);
				3911	dataptr[3] = (DCTELEM)
				3912	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
				3913	CONST_BITS-PASS1_BITS);
				3914
				3915	ctr++;
				3916
				3917	if (ctr != DCTSIZE) {
				3918	if (ctr == 10)
				3919	break; /* Done. */
				3920	dataptr += DCTSIZE; /* advance pointer to next row */
				3921	} else
				3922	dataptr = workspace; /* switch pointer to extended workspace */
				3923	}
				3924
				3925	/* Pass 2: process columns.
				3926	* We remove the PASS1_BITS scaling, but leave the results scaled up
				3927	* by an overall factor of 8.
				3928	* We must also scale the output by (8/5)*(8/10) = 32/25, which we
				3929	* fold into the constant multipliers:
				3930	* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20) 32/25.
				3931	*/
				3932
				3933	dataptr = data;
				3934	wsptr = workspace;
				3935	for (ctr = 0; ctr < 5; ctr++) {
				3936	/* Even part */
				3937
				3938	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE1];
				3939	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE0];
				3940	tmp12 = dataptr[DCTSIZE2] + dataptr[DCTSIZE7];
				3941	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE6];
				3942	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE5];
				3943
				3944	tmp10 = tmp0 + tmp4;
				3945	tmp13 = tmp0 - tmp4;
				3946	tmp11 = tmp1 + tmp3;
				3947	tmp14 = tmp1 - tmp3;
				3948
				3949	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE1];
				3950	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE0];
				3951	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE7];
				3952	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE6];
				3953	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE5];
				3954
				3955	dataptr[DCTSIZE*0] = (DCTELEM)
				3956	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
				3957	CONST_BITS+PASS1_BITS);
				3958	tmp12 += tmp12;
				3959	dataptr[DCTSIZE*4] = (DCTELEM)
				3960	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
				3961	MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
				3962	CONST_BITS+PASS1_BITS);
				3963	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
				3964	dataptr[DCTSIZE*2] = (DCTELEM)
				3965	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
				3966	CONST_BITS+PASS1_BITS);
				3967	dataptr[DCTSIZE*6] = (DCTELEM)
				3968	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
				3969	CONST_BITS+PASS1_BITS);
				3970
				3971	/* Odd part */
				3972
				3973	tmp10 = tmp0 + tmp4;
				3974	tmp11 = tmp1 - tmp3;
				3975	dataptr[DCTSIZE*5] = (DCTELEM)
				3976	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
				3977	CONST_BITS+PASS1_BITS);
				3978	tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
				3979	dataptr[DCTSIZE*1] = (DCTELEM)
				3980	DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
				3981	MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
				3982	MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
				3983	MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
				3984	CONST_BITS+PASS1_BITS);
				3985	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
				3986	MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
				3987	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
				3988	MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
				3989	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
				3990	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
				3991
				3992	dataptr++; /* advance pointer to next column */
				3993	wsptr++; /* advance pointer to next column */
				3994	}
				3995	}
				3996
				3997
				3998	/*
				3999	* Perform the forward DCT on a 4x8 sample block.
				4000	*
				4001	* 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
				4002	*/
				4003
				4004	GLOBAL(void)
				4005	jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				4006	{
				4007	INT32 tmp0, tmp1, tmp2, tmp3;
				4008	INT32 tmp10, tmp11, tmp12, tmp13;
				4009	INT32 z1;
				4010	DCTELEM *dataptr;
				4011	JSAMPROW elemptr;
				4012	int ctr;
				4013	SHIFT_TEMPS
				4014
				4015	/* Pre-zero output coefficient block. */
				4016	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				4017
				4018	/* Pass 1: process rows. */
				4019	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				4020	/* furthermore, we scale the results by 2*PASS1_BITS. /
				4021	/* We must also scale the output by 8/4 = 2, which we add here. */
				4022	/* 4-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/16). /
				4023
				4024	dataptr = data;
				4025	for (ctr = 0; ctr < DCTSIZE; ctr++) {
				4026	elemptr = sample_data[ctr] + start_col;
				4027
				4028	/* Even part */
				4029
				4030	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
				4031	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
				4032
				4033	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
				4034	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
				4035
				4036	/* Apply unsigned->signed conversion */
				4037	dataptr[0] = (DCTELEM)
				4038	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
				4039	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
				4040
				4041	/* Odd part */
				4042
				4043	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
				4044	/* Add fudge factor here for final descale. */
				4045	tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
				4046
				4047	dataptr[1] = (DCTELEM)
				4048	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
				4049	CONST_BITS-PASS1_BITS-1);
				4050	dataptr[3] = (DCTELEM)
				4051	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
				4052	CONST_BITS-PASS1_BITS-1);
				4053
				4054	dataptr += DCTSIZE; /* advance pointer to next row */
				4055	}
				4056
				4057	/* Pass 2: process columns.
				4058	* We remove the PASS1_BITS scaling, but leave the results scaled up
				4059	* by an overall factor of 8.
				4060	*/
				4061
				4062	dataptr = data;
				4063	for (ctr = 0; ctr < 4; ctr++) {
				4064	/* Even part per LL&M figure 1 --- note that published figure is faulty;
				4065	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
				4066	*/
				4067
				4068	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
				4069	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
				4070	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
				4071	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
				4072
				4073	/* Add fudge factor here for final descale. */
				4074	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
				4075	tmp12 = tmp0 - tmp3;
				4076	tmp11 = tmp1 + tmp2;
				4077	tmp13 = tmp1 - tmp2;
				4078
				4079	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
				4080	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
				4081	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
				4082	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
				4083
				4084	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
				4085	dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
				4086
				4087	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
				4088	/* Add fudge factor here for final descale. */
				4089	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
				4090	dataptr[DCTSIZE*2] = (DCTELEM)
				4091	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
				4092	dataptr[DCTSIZE*6] = (DCTELEM)
				4093	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
				4094
				4095	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
				4096	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
				4097	* i0..i3 in the paper are tmp0..tmp3 here.
				4098	*/
				4099
				4100	tmp10 = tmp0 + tmp3;
				4101	tmp11 = tmp1 + tmp2;
				4102	tmp12 = tmp0 + tmp2;
				4103	tmp13 = tmp1 + tmp3;
				4104	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
				4105	/* Add fudge factor here for final descale. */
				4106	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
				4107
				4108	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
				4109	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
				4110	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
				4111	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
				4112	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
				4113	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
				4114	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
				4115	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
				4116
				4117	tmp12 += z1;
				4118	tmp13 += z1;
				4119
				4120	dataptr[DCTSIZE*1] = (DCTELEM)
				4121	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
				4122	dataptr[DCTSIZE*3] = (DCTELEM)
				4123	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
				4124	dataptr[DCTSIZE*5] = (DCTELEM)
				4125	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
				4126	dataptr[DCTSIZE*7] = (DCTELEM)
				4127	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
				4128
				4129	dataptr++; /* advance pointer to next column */
				4130	}
				4131	}
				4132
				4133
				4134	/*
				4135	* Perform the forward DCT on a 3x6 sample block.
				4136	*
				4137	* 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
				4138	*/
				4139
				4140	GLOBAL(void)
				4141	jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				4142	{
				4143	INT32 tmp0, tmp1, tmp2;
				4144	INT32 tmp10, tmp11, tmp12;
				4145	DCTELEM *dataptr;
				4146	JSAMPROW elemptr;
				4147	int ctr;
				4148	SHIFT_TEMPS
				4149
				4150	/* Pre-zero output coefficient block. */
				4151	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				4152
				4153	/* Pass 1: process rows. */
				4154	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
				4155	/* furthermore, we scale the results by 2*PASS1_BITS. /
				4156	/* We scale the results further by 2 as part of output adaption */
				4157	/* scaling for different DCT size. */
				4158	/* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6). /
				4159
				4160	dataptr = data;
				4161	for (ctr = 0; ctr < 6; ctr++) {
				4162	elemptr = sample_data[ctr] + start_col;
				4163
				4164	/* Even part */
				4165
				4166	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
				4167	tmp1 = GETJSAMPLE(elemptr[1]);
				4168
				4169	tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
				4170
				4171	/* Apply unsigned->signed conversion */
				4172	dataptr[0] = (DCTELEM)
				4173	((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
				4174	dataptr[2] = (DCTELEM)
				4175	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
				4176	CONST_BITS-PASS1_BITS-1);
				4177
				4178	/* Odd part */
				4179
				4180	dataptr[1] = (DCTELEM)
				4181	DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
				4182	CONST_BITS-PASS1_BITS-1);
				4183
				4184	dataptr += DCTSIZE; /* advance pointer to next row */
				4185	}
				4186
				4187	/* Pass 2: process columns.
				4188	* We remove the PASS1_BITS scaling, but leave the results scaled up
				4189	* by an overall factor of 8.
				4190	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
				4191	* fold into the constant multipliers (other part was done in pass 1):
				4192	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
				4193	*/
				4194
				4195	dataptr = data;
				4196	for (ctr = 0; ctr < 3; ctr++) {
				4197	/* Even part */
				4198
				4199	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
				4200	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
				4201	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
				4202
				4203	tmp10 = tmp0 + tmp2;
				4204	tmp12 = tmp0 - tmp2;
				4205
				4206	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
				4207	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
				4208	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
				4209
				4210	dataptr[DCTSIZE*0] = (DCTELEM)
				4211	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
				4212	CONST_BITS+PASS1_BITS);
				4213	dataptr[DCTSIZE*2] = (DCTELEM)
				4214	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
				4215	CONST_BITS+PASS1_BITS);
				4216	dataptr[DCTSIZE*4] = (DCTELEM)
				4217	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
				4218	CONST_BITS+PASS1_BITS);
				4219
				4220	/* Odd part */
				4221
				4222	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
				4223
				4224	dataptr[DCTSIZE*1] = (DCTELEM)
				4225	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
				4226	CONST_BITS+PASS1_BITS);
				4227	dataptr[DCTSIZE*3] = (DCTELEM)
				4228	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
				4229	CONST_BITS+PASS1_BITS);
				4230	dataptr[DCTSIZE*5] = (DCTELEM)
				4231	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
				4232	CONST_BITS+PASS1_BITS);
				4233
				4234	dataptr++; /* advance pointer to next column */
				4235	}
				4236	}
				4237
				4238
				4239	/*
				4240	* Perform the forward DCT on a 2x4 sample block.
				4241	*
				4242	* 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
				4243	*/
				4244
				4245	GLOBAL(void)
				4246	jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				4247	{
				4248	INT32 tmp0, tmp1;
				4249	INT32 tmp10, tmp11;
				4250	DCTELEM *dataptr;
				4251	JSAMPROW elemptr;
				4252	int ctr;
				4253	SHIFT_TEMPS
				4254
				4255	/* Pre-zero output coefficient block. */
				4256	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				4257
				4258	/* Pass 1: process rows. */
				4259	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
				4260	/* We must also scale the output by (8/2)(8/4) = 23, which we add here. /
				4261
				4262	dataptr = data;
				4263	for (ctr = 0; ctr < 4; ctr++) {
				4264	elemptr = sample_data[ctr] + start_col;
				4265
				4266	/* Even part */
				4267
				4268	tmp0 = GETJSAMPLE(elemptr[0]);
				4269	tmp1 = GETJSAMPLE(elemptr[1]);
				4270
				4271	/* Apply unsigned->signed conversion */
				4272	dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
				4273
				4274	/* Odd part */
				4275
				4276	dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
				4277
				4278	dataptr += DCTSIZE; /* advance pointer to next row */
				4279	}
				4280
				4281	/* Pass 2: process columns.
				4282	* We leave the results scaled up by an overall factor of 8.
				4283	* 4-point FDCT kernel,
				4284	* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
				4285	*/
				4286
				4287	dataptr = data;
				4288	for (ctr = 0; ctr < 2; ctr++) {
				4289	/* Even part */
				4290
				4291	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3];
				4292	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
				4293
				4294	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
				4295	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
				4296
				4297	dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
				4298	dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
				4299
				4300	/* Odd part */
				4301
				4302	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
				4303	/* Add fudge factor here for final descale. */
				4304	tmp0 += ONE << (CONST_BITS-1);
				4305
				4306	dataptr[DCTSIZE*1] = (DCTELEM)
				4307	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
				4308	CONST_BITS);
				4309	dataptr[DCTSIZE*3] = (DCTELEM)
				4310	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
				4311	CONST_BITS);
				4312
				4313	dataptr++; /* advance pointer to next column */
				4314	}
				4315	}
				4316
				4317
				4318	/*
				4319	* Perform the forward DCT on a 1x2 sample block.
				4320	*
				4321	* 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
				4322	*/
				4323
				4324	GLOBAL(void)
				4325	jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
				4326	{
				4327	INT32 tmp0, tmp1;
				4328
				4329	/* Pre-zero output coefficient block. */
				4330	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
				4331
				4332	tmp0 = GETJSAMPLE(sample_data[0][start_col]);
				4333	tmp1 = GETJSAMPLE(sample_data[1][start_col]);
				4334
				4335	/* We leave the results scaled up by an overall factor of 8.
				4336	* We must also scale the output by (8/1)(8/2) = 2*5.
				4337	*/
				4338
				4339	/* Even part */
				4340	/* Apply unsigned->signed conversion */
				4341	data[DCTSIZE0] = (DCTELEM) ((tmp0 + tmp1 - 2 CENTERJSAMPLE) << 5);
				4342
				4343	/* Odd part */
				4344	data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
				4345	}
				4346
				4347	#endif /* DCT_SCALING_SUPPORTED */
				4348	#endif /* DCT_ISLOW_SUPPORTED */