VirtualBox

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/i386/simple_idct_mmx.c@ 9384

Last change on this file since 9384 was 5776, checked in by vboxsync, 17 years ago

ffmpeg: exported to OSE

File size: 74.9 KB
Line 
/*
 * Simple IDCT MMX
 *
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20#include "../dsputil.h"
21#include "../simple_idct.h"
22
/*
Exact values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
/*
 * 14-bit fixed-point cosine factors:
 *     Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14)
 * Every factor is rounded to nearest ("+ 0.5") except C4, which is rounded
 * down (see below).
 */
#define C0 23170 /* i = 0: 23170.475006 */
#define C1 22725 /* i = 1: 22725.260826 */
#define C2 21407 /* i = 2: 21406.727617 */
#define C3 19266 /* i = 3: 19265.545870 */
#if 0
#define C4 16384 /* i = 4: 16384.000000, the exact value (disabled) */
#else
/*
 * One below the exact value (the "- 0.5" variant).
 * NOTE(review): the reason for rounding down is not stated in this file.
 */
#define C4 16383
#endif
#define C5 12873 /* i = 5: 12872.826198 */
#define C6 8867  /* i = 6:  8866.956905 */
#define C7 4520  /* i = 7:  4520.335430 */

/* Right shift applied to the 32-bit sums at the end of the row pass. */
#define ROW_SHIFT 11
/*
 * Right shift applied at the end of the column pass.  The original note here
 * was just "6" -- presumably the shift left over once the 14 fractional bits
 * of the factors are removed (20 - 14); TODO confirm.
 */
#define COL_SHIFT 20
48
49static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
50static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
51
52static const int16_t __attribute__((aligned(8))) coeffs[]= {
53 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
54// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
55// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
56 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
57 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
58// 0, 0, 0, 0,
59// 0, 0, 0, 0,
60
61 C4, C4, C4, C4,
62 C4, -C4, C4, -C4,
63
64 C2, C6, C2, C6,
65 C6, -C2, C6, -C2,
66
67 C1, C3, C1, C3,
68 C5, C7, C5, C7,
69
70 C3, -C7, C3, -C7,
71-C1, -C5, -C1, -C5,
72
73 C5, -C1, C5, -C1,
74 C7, C3, C7, C3,
75
76 C7, -C5, C7, -C5,
77 C3, -C1, C3, -C1
78};
79
80#if 0
81static void unused_var_killer(){
82 int a= wm1010 + d40000;
83 temp[0]=a;
84}
85
86static void inline idctCol (int16_t * col, int16_t *input)
87{
88#undef C0
89#undef C1
90#undef C2
91#undef C3
92#undef C4
93#undef C5
94#undef C6
95#undef C7
96 int a0, a1, a2, a3, b0, b1, b2, b3;
97 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105/*
106 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
107 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
108 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
109 return;
110 }*/
111
112col[8*0] = input[8*0 + 0];
113col[8*1] = input[8*2 + 0];
114col[8*2] = input[8*0 + 1];
115col[8*3] = input[8*2 + 1];
116col[8*4] = input[8*4 + 0];
117col[8*5] = input[8*6 + 0];
118col[8*6] = input[8*4 + 1];
119col[8*7] = input[8*6 + 1];
120
121 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
122 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
123 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
124 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
125
126 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
127 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
128 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
129 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
130
131 col[8*0] = (a0 + b0) >> COL_SHIFT;
132 col[8*1] = (a1 + b1) >> COL_SHIFT;
133 col[8*2] = (a2 + b2) >> COL_SHIFT;
134 col[8*3] = (a3 + b3) >> COL_SHIFT;
135 col[8*4] = (a3 - b3) >> COL_SHIFT;
136 col[8*5] = (a2 - b2) >> COL_SHIFT;
137 col[8*6] = (a1 - b1) >> COL_SHIFT;
138 col[8*7] = (a0 - b0) >> COL_SHIFT;
139}
140
141static void inline idctRow (int16_t * output, int16_t * input)
142{
143 int16_t row[8];
144
145 int a0, a1, a2, a3, b0, b1, b2, b3;
146 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
155row[0] = input[0];
156row[2] = input[1];
157row[4] = input[4];
158row[6] = input[5];
159row[1] = input[8];
160row[3] = input[9];
161row[5] = input[12];
162row[7] = input[13];
163
164 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
165 row[0] = row[1] = row[2] = row[3] = row[4] =
166 row[5] = row[6] = row[7] = row[0]<<3;
167 output[0] = row[0];
168 output[2] = row[1];
169 output[4] = row[2];
170 output[6] = row[3];
171 output[8] = row[4];
172 output[10] = row[5];
173 output[12] = row[6];
174 output[14] = row[7];
175 return;
176 }
177
178 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
179 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
180 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
181 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182
183 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
184 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
185 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
186 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187
188 row[0] = (a0 + b0) >> ROW_SHIFT;
189 row[1] = (a1 + b1) >> ROW_SHIFT;
190 row[2] = (a2 + b2) >> ROW_SHIFT;
191 row[3] = (a3 + b3) >> ROW_SHIFT;
192 row[4] = (a3 - b3) >> ROW_SHIFT;
193 row[5] = (a2 - b2) >> ROW_SHIFT;
194 row[6] = (a1 - b1) >> ROW_SHIFT;
195 row[7] = (a0 - b0) >> ROW_SHIFT;
196
197 output[0] = row[0];
198 output[2] = row[1];
199 output[4] = row[2];
200 output[6] = row[3];
201 output[8] = row[4];
202 output[10] = row[5];
203 output[12] = row[6];
204 output[14] = row[7];
205}
206#endif
207
208static inline void idct(int16_t *block)
209{
210 int64_t __attribute__((aligned(8))) align_tmp[16];
211 int16_t * const temp= (int16_t*)align_tmp;
212
213 asm volatile(
214#if 0 //Alternative, simpler variant
215
216#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
217 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
218 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
219 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
220 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
221 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
222 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
223 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
224 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
225 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
226 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
227 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
228 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
229 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
230 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
231 #rounder ", %%mm4 \n\t"\
232 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
233 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
234 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
235 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
236 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
237 #rounder ", %%mm0 \n\t"\
238 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
239 "paddd %%mm0, %%mm0 \n\t" \
240 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
241 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
242 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
243 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
244 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
245 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
246 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
247 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
248 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
249 "psrad $" #shift ", %%mm7 \n\t"\
250 "psrad $" #shift ", %%mm4 \n\t"\
251 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
252 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
253 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
254 "psrad $" #shift ", %%mm1 \n\t"\
255 "psrad $" #shift ", %%mm2 \n\t"\
256 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
257 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
258 "movq %%mm7, " #dst " \n\t"\
259 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
260 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
261 "movq %%mm2, 24+" #dst " \n\t"\
262 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
263 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
264 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
265 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
266 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
267 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
268 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
269 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
270 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
271 "psrad $" #shift ", %%mm2 \n\t"\
272 "psrad $" #shift ", %%mm0 \n\t"\
273 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
274 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
275 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
276 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
277 "psrad $" #shift ", %%mm6 \n\t"\
278 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
279 "movq %%mm2, 8+" #dst " \n\t"\
280 "psrad $" #shift ", %%mm4 \n\t"\
281 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
282 "movq %%mm4, 16+" #dst " \n\t"\
283
284#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
285 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
286 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
287 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
288 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
289 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
290 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
291 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
292 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
293 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
294 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
295 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
296 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
297 #rounder ", %%mm4 \n\t"\
298 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
299 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
300 #rounder ", %%mm0 \n\t"\
301 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
302 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
303 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
304 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
305 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
306 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
307 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
308 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
309 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
310 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
311 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
312 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
313 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
314 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
315 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
316 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
317 "psrad $" #shift ", %%mm7 \n\t"\
318 "psrad $" #shift ", %%mm4 \n\t"\
319 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
320 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
321 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
322 "psrad $" #shift ", %%mm0 \n\t"\
323 "psrad $" #shift ", %%mm2 \n\t"\
324 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
325 "movd %%mm7, " #dst " \n\t"\
326 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
327 "movd %%mm0, 16+" #dst " \n\t"\
328 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
329 "movd %%mm2, 96+" #dst " \n\t"\
330 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
331 "movd %%mm4, 112+" #dst " \n\t"\
332 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
333 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
334 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
335 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
336 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
337 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
338 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
339 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
340 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
341 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
342 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
343 "psrad $" #shift ", %%mm2 \n\t"\
344 "psrad $" #shift ", %%mm5 \n\t"\
345 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
346 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
347 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
348 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
349 "psrad $" #shift ", %%mm6 \n\t"\
350 "psrad $" #shift ", %%mm4 \n\t"\
351 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
352 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
353 "movd %%mm2, 32+" #dst " \n\t"\
354 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
355 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
356 "movd %%mm6, 48+" #dst " \n\t"\
357 "movd %%mm4, 64+" #dst " \n\t"\
358 "movd %%mm5, 80+" #dst " \n\t"\
359
360
361#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
363 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
364 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
365 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
366 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
367 "pand %%mm0, %%mm4 \n\t"\
368 "por %%mm1, %%mm4 \n\t"\
369 "por %%mm2, %%mm4 \n\t"\
370 "por %%mm3, %%mm4 \n\t"\
371 "packssdw %%mm4,%%mm4 \n\t"\
372 "movd %%mm4, %%eax \n\t"\
373 "orl %%eax, %%eax \n\t"\
374 "jz 1f \n\t"\
375 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
376 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
377 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
378 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
379 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
380 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
381 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
382 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
383 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
384 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
385 #rounder ", %%mm4 \n\t"\
386 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
387 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
388 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
389 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
390 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
391 #rounder ", %%mm0 \n\t"\
392 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
393 "paddd %%mm0, %%mm0 \n\t" \
394 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
395 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
396 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
397 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
398 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
399 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
400 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
401 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
402 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
403 "psrad $" #shift ", %%mm7 \n\t"\
404 "psrad $" #shift ", %%mm4 \n\t"\
405 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
406 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
407 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
408 "psrad $" #shift ", %%mm1 \n\t"\
409 "psrad $" #shift ", %%mm2 \n\t"\
410 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
411 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
412 "movq %%mm7, " #dst " \n\t"\
413 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
414 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
415 "movq %%mm2, 24+" #dst " \n\t"\
416 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
417 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
418 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
419 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
420 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
421 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
422 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
423 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
424 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
425 "psrad $" #shift ", %%mm2 \n\t"\
426 "psrad $" #shift ", %%mm0 \n\t"\
427 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
428 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
429 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
430 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
431 "psrad $" #shift ", %%mm6 \n\t"\
432 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
433 "movq %%mm2, 8+" #dst " \n\t"\
434 "psrad $" #shift ", %%mm4 \n\t"\
435 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
436 "movq %%mm4, 16+" #dst " \n\t"\
437 "jmp 2f \n\t"\
438 "1: \n\t"\
439 "pslld $16, %%mm0 \n\t"\
440 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441 "psrad $13, %%mm0 \n\t"\
442 "packssdw %%mm0, %%mm0 \n\t"\
443 "movq %%mm0, " #dst " \n\t"\
444 "movq %%mm0, 8+" #dst " \n\t"\
445 "movq %%mm0, 16+" #dst " \n\t"\
446 "movq %%mm0, 24+" #dst " \n\t"\
447 "2: \n\t"
448
449
450//IDCT( src0, src4, src1, src5, dst, rounder, shift)
451ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
452/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455
456DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459
460
461//IDCT( src0, src4, src1, src5, dst, rounder, shift)
462#if !defined(VBOX) || !defined(__DARWIN__)
463COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
464COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
465COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
466COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
467#else
468COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
469COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
470COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
471COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
472#endif
473
474#else
475
476#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
477 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
478 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
479 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
480 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
481 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
482 "pand %%mm0, %%mm4 \n\t"\
483 "por %%mm1, %%mm4 \n\t"\
484 "por %%mm2, %%mm4 \n\t"\
485 "por %%mm3, %%mm4 \n\t"\
486 "packssdw %%mm4,%%mm4 \n\t"\
487 "movd %%mm4, %%eax \n\t"\
488 "orl %%eax, %%eax \n\t"\
489 "jz 1f \n\t"\
490 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
491 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
492 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
493 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
494 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
495 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
496 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
497 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
498 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
499 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
500 #rounder ", %%mm4 \n\t"\
501 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
502 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
503 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
504 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
505 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
506 #rounder ", %%mm0 \n\t"\
507 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
508 "paddd %%mm0, %%mm0 \n\t" \
509 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
510 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
511 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
512 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
513 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
514 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
515 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
516 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
517 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
518 "psrad $" #shift ", %%mm7 \n\t"\
519 "psrad $" #shift ", %%mm4 \n\t"\
520 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
521 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
522 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
523 "psrad $" #shift ", %%mm1 \n\t"\
524 "psrad $" #shift ", %%mm2 \n\t"\
525 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
526 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
527 "movq %%mm7, " #dst " \n\t"\
528 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
529 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
530 "movq %%mm2, 24+" #dst " \n\t"\
531 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
532 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
533 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
534 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
535 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
536 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
537 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
538 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
539 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
540 "psrad $" #shift ", %%mm2 \n\t"\
541 "psrad $" #shift ", %%mm0 \n\t"\
542 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
543 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
544 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
545 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
546 "psrad $" #shift ", %%mm6 \n\t"\
547 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
548 "movq %%mm2, 8+" #dst " \n\t"\
549 "psrad $" #shift ", %%mm4 \n\t"\
550 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
551 "movq %%mm4, 16+" #dst " \n\t"\
552 "jmp 2f \n\t"\
553 "1: \n\t"\
554 "pslld $16, %%mm0 \n\t"\
555 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
556 "psrad $13, %%mm0 \n\t"\
557 "packssdw %%mm0, %%mm0 \n\t"\
558 "movq %%mm0, " #dst " \n\t"\
559 "movq %%mm0, 8+" #dst " \n\t"\
560 "movq %%mm0, 16+" #dst " \n\t"\
561 "movq %%mm0, 24+" #dst " \n\t"\
562 "2: \n\t"
563
564#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
565 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
566 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
567 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
568 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
569 "movq %%mm0, %%mm4 \n\t"\
570 "por %%mm1, %%mm4 \n\t"\
571 "por %%mm2, %%mm4 \n\t"\
572 "por %%mm3, %%mm4 \n\t"\
573 "packssdw %%mm4,%%mm4 \n\t"\
574 "movd %%mm4, %%eax \n\t"\
575 "orl %%eax, %%eax \n\t"\
576 "jz " #bt " \n\t"\
577 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
578 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
579 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
580 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
581 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
582 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
583 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
584 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
585 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
586 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
587 #rounder ", %%mm4 \n\t"\
588 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
589 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
590 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
591 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
592 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
593 #rounder ", %%mm0 \n\t"\
594 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
595 "paddd %%mm0, %%mm0 \n\t" \
596 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
597 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
598 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
599 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
600 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
601 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
602 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
603 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
604 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
605 "psrad $" #shift ", %%mm7 \n\t"\
606 "psrad $" #shift ", %%mm4 \n\t"\
607 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
608 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
609 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
610 "psrad $" #shift ", %%mm1 \n\t"\
611 "psrad $" #shift ", %%mm2 \n\t"\
612 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
613 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
614 "movq %%mm7, " #dst " \n\t"\
615 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
616 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
617 "movq %%mm2, 24+" #dst " \n\t"\
618 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
619 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
620 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
621 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
622 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
623 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
624 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
625 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
626 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
627 "psrad $" #shift ", %%mm2 \n\t"\
628 "psrad $" #shift ", %%mm0 \n\t"\
629 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
630 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
631 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
632 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
633 "psrad $" #shift ", %%mm6 \n\t"\
634 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
635 "movq %%mm2, 8+" #dst " \n\t"\
636 "psrad $" #shift ", %%mm4 \n\t"\
637 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
638 "movq %%mm4, 16+" #dst " \n\t"\
639
640#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
641 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
642 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
643 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
644 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
645 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
646 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
647 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
648 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
649 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
650 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
651 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
652 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
653 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
654 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
655 #rounder ", %%mm4 \n\t"\
656 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
657 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
658 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
659 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
660 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
661 #rounder ", %%mm0 \n\t"\
662 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
663 "paddd %%mm0, %%mm0 \n\t" \
664 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
665 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
666 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
667 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
668 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
669 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
670 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
671 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
672 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
673 "psrad $" #shift ", %%mm7 \n\t"\
674 "psrad $" #shift ", %%mm4 \n\t"\
675 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
676 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
677 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
678 "psrad $" #shift ", %%mm1 \n\t"\
679 "psrad $" #shift ", %%mm2 \n\t"\
680 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
681 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
682 "movq %%mm7, " #dst " \n\t"\
683 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
684 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
685 "movq %%mm2, 24+" #dst " \n\t"\
686 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
687 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
688 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
689 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
690 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
691 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
692 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
693 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
694 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
695 "psrad $" #shift ", %%mm2 \n\t"\
696 "psrad $" #shift ", %%mm0 \n\t"\
697 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
698 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
699 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
700 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
701 "psrad $" #shift ", %%mm6 \n\t"\
702 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
703 "movq %%mm2, 8+" #dst " \n\t"\
704 "psrad $" #shift ", %%mm4 \n\t"\
705 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
706 "movq %%mm4, 16+" #dst " \n\t"\
707
708//IDCT( src0, src4, src1, src5, dst, rounder, shift)
709DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
710Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
711Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
712Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
713
714#undef IDCT
/*
 * Column-pass IDCT, full variant: all four source operands are read.
 *
 *   src0, src4  inputs of the even part (A0..A3 in the inline comments)
 *   src1, src5  inputs of the odd part (B0..B3)
 *   dst         base of the output; results are stored at
 *               dst + 0/16/32/48/64/80/96/112 as A0+B0, A1+B1, A2+B2,
 *               A3+B3, A3-B3, A2-B2, A1-B1, A0-B0
 *   rounder     stringified and emitted in front of ", %%mm4" and ", %%mm0"
 *   shift       immediate count of the psrad applied to every result
 *
 * Every sum/difference is shifted right arithmetically, packed to 16 bit with
 * signed saturation (packssdw) and its low 32 bits are stored with movd, so
 * one expansion writes two adjacent 16-bit values per output row.
 * Coefficients are read from the fixed offsets 16..104 of operand %2.
 * Clobbers mm0-mm7.
 * NOTE(review): the invocations in this file pass "/nop" (or "#nop" on
 * Darwin) as rounder; presumably the resulting "/nop, %%mm4" line is meant to
 * assemble as a comment so no rounding is added here -- confirm against the
 * assembler's comment syntax.
 */
715#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
716 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
717 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
718 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
719 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
720 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
721 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
722 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
723 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
724 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
725 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
726 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
727 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
728 #rounder ", %%mm4 \n\t"\
729 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
730 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
731 #rounder ", %%mm0 \n\t"\
732 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
733 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
734 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
735 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
736 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
737 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
738 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
739 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
740 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
741 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
742 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
743 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
744 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
745 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
746 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
747 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
748 "psrad $" #shift ", %%mm7 \n\t"\
749 "psrad $" #shift ", %%mm4 \n\t"\
750 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
751 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
752 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
753 "psrad $" #shift ", %%mm0 \n\t"\
754 "psrad $" #shift ", %%mm2 \n\t"\
755 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
756 "movd %%mm7, " #dst " \n\t"\
757 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
758 "movd %%mm0, 16+" #dst " \n\t"\
759 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
760 "movd %%mm2, 96+" #dst " \n\t"\
761 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
762 "movd %%mm4, 112+" #dst " \n\t"\
763 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
764 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
765 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
766 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
767 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
768 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
769 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
770 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
771 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
772 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
773 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
774 "psrad $" #shift ", %%mm2 \n\t"\
775 "psrad $" #shift ", %%mm5 \n\t"\
776 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
777 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
778 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
779 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
780 "psrad $" #shift ", %%mm6 \n\t"\
781 "psrad $" #shift ", %%mm4 \n\t"\
782 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
783 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
784 "movd %%mm2, 32+" #dst " \n\t"\
785 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
786 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
787 "movd %%mm6, 48+" #dst " \n\t"\
788 "movd %%mm4, 64+" #dst " \n\t"\
789 "movd %%mm5, 80+" #dst " \n\t"
790
791
792//IDCT( src0, src4, src1, src5, dst, rounder, shift)
793#if !defined(VBOX) || !defined(__DARWIN__)
794IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
795IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
796IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
797IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
798#else
799IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
800IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
801IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
802IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
803#endif
804 "jmp 9f \n\t"
805
806 "#.balign 16 \n\t"\
807 "4: \n\t"
808Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
809Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
810
811#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 4: src1 is accepted but
 * never referenced.
 *
 * The even part A0..A3 is built from src0 and src4; the odd part B0..B3 comes
 * from src5 alone (pmaddwd with the table entries at 56, 72, 88 and 104(%2)).
 * Results are shifted right by `shift`, packed with packssdw and stored with
 * movd at dst + 0/16/32/48/64/80/96/112 as A0+B0, A1+B1, A2+B2, A3+B3, A3-B3,
 * A2-B2, A1-B1, A0-B0.
 * `rounder` is stringified in front of ", %%mm4" and ", %%mm0".
 * Clobbers mm0-mm7.
 * NOTE(review): label 4 is a Z_COND_IDCT branch target; that macro is defined
 * outside this section -- presumably the branch is taken when the input rows
 * that would supply src1 are all zero. Confirm there.
 */
812#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
813 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
814 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
815 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
816 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
817 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
818 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
819 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
820 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
821 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
822 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
823 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
824 #rounder ", %%mm4 \n\t"\
825 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
826 #rounder ", %%mm0 \n\t"\
827 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
828 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
829 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
830 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
831 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
832 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
833 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
834 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
835 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
836 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
837 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
838 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
839 "psrad $" #shift ", %%mm1 \n\t"\
840 "psrad $" #shift ", %%mm4 \n\t"\
841 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
842 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
843 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
844 "psrad $" #shift ", %%mm0 \n\t"\
845 "psrad $" #shift ", %%mm2 \n\t"\
846 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
847 "movd %%mm1, " #dst " \n\t"\
848 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
849 "movd %%mm0, 16+" #dst " \n\t"\
850 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
851 "movd %%mm2, 96+" #dst " \n\t"\
852 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
853 "movd %%mm4, 112+" #dst " \n\t"\
854 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
855 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
856 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
857 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
858 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
859 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
860 "psrad $" #shift ", %%mm2 \n\t"\
861 "psrad $" #shift ", %%mm5 \n\t"\
862 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
863 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
864 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
865 "psrad $" #shift ", %%mm6 \n\t"\
866 "psrad $" #shift ", %%mm1 \n\t"\
867 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
868 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
869 "movd %%mm2, 32+" #dst " \n\t"\
870 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
871 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
872 "movd %%mm6, 48+" #dst " \n\t"\
873 "movd %%mm1, 64+" #dst " \n\t"\
874 "movd %%mm5, 80+" #dst " \n\t"
875
876//IDCT( src0, src4, src1, src5, dst, rounder, shift)
877#if !defined(VBOX) || !defined(__DARWIN__)
878IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
879IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
880IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
881IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
882#else
883IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
884IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
885IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
886IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
887#endif
888 "jmp 9f \n\t"
889
890 "#.balign 16 \n\t"\
891 "6: \n\t"
892Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
893
894#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 6: only src0 and src5 are
 * read; src4 and src1 are accepted but never referenced.
 *
 * With no src4 contribution the even part collapses: mm4 (product with
 * 16(%2)) is copied to mm6 and serves as both A0 and A3, mm0 (product with
 * 24(%2)) is copied to mm5 and serves as both A1 and A2.
 * The odd part B0..B3 comes from src5 alone (table entries at 56, 72, 88 and
 * 104(%2)).
 * Results are shifted right by `shift`, packed with packssdw and stored with
 * movd at dst + 0/16/32/48/64/80/96/112 in the order A0+B0, A1+B1, A2+B2,
 * A3+B3, A3-B3, A2-B2, A1-B1, A0-B0.
 * `rounder` is stringified in front of ", %%mm4" and ", %%mm0".
 * Clobbers mm0-mm7.
 * NOTE(review): label 6 is a Z_COND_IDCT branch target; Z_COND_IDCT is
 * defined outside this section -- presumably this path means the rows feeding
 * src1 and src4 were all zero. Confirm there.
 */
895#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
896 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
897 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
898 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
899 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
900 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
901 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
902 #rounder ", %%mm4 \n\t"\
903 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
904 #rounder ", %%mm0 \n\t"\
905 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
906 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
907 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
908 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
909 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
910 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
911 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
912 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
913 "psrad $" #shift ", %%mm1 \n\t"\
914 "psrad $" #shift ", %%mm4 \n\t"\
915 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
916 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
917 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
918 "psrad $" #shift ", %%mm0 \n\t"\
919 "psrad $" #shift ", %%mm2 \n\t"\
920 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
921 "movd %%mm1, " #dst " \n\t"\
922 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
923 "movd %%mm0, 16+" #dst " \n\t"\
924 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
925 "movd %%mm2, 96+" #dst " \n\t"\
926 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
927 "movd %%mm4, 112+" #dst " \n\t"\
928 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
929 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
930 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
931 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
932 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
933 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
934 "psrad $" #shift ", %%mm2 \n\t"\
935 "psrad $" #shift ", %%mm5 \n\t"\
936 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
937 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
938 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
939 "psrad $" #shift ", %%mm6 \n\t"\
940 "psrad $" #shift ", %%mm1 \n\t"\
941 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
942 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
943 "movd %%mm2, 32+" #dst " \n\t"\
944 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
945 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
946 "movd %%mm6, 48+" #dst " \n\t"\
947 "movd %%mm1, 64+" #dst " \n\t"\
948 "movd %%mm5, 80+" #dst " \n\t"
949
950
951//IDCT( src0, src4, src1, src5, dst, rounder, shift)
952#if !defined(VBOX) || !defined(__DARWIN__)
953IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
954IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
955IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
956IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
957#else
958IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
959IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
960IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
961IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
962#endif
963 "jmp 9f \n\t"
964
965 "#.balign 16 \n\t"\
966 "2: \n\t"
967Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
968
969#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 2: src0, src1 and src5 are
 * read; src4 is accepted but never referenced.
 *
 * With no src4 contribution the even part collapses: mm4 (product with
 * 16(%2)) is copied to mm6 and serves as both A0 and A3, mm0 (product with
 * 24(%2)) is copied to mm5 and serves as both A1 and A2.
 * The odd part B0..B3 combines src1 (table entries at 48, 64, 80, 96(%2))
 * with src5 (entries at 56, 72, 88, 104(%2)); src1 is reloaded into mm0
 * half-way through because mm2 has been reused by then.
 * Results are shifted right by `shift`, packed with packssdw and stored with
 * movd at dst + 0/16/32/48/64/80/96/112 in the order A0+B0, A1+B1, A2+B2,
 * A3+B3, A3-B3, A2-B2, A1-B1, A0-B0.
 * `rounder` is stringified in front of ", %%mm4" and ", %%mm0".
 * Clobbers mm0-mm7.
 * NOTE(review): label 2 is a Z_COND_IDCT branch target; Z_COND_IDCT is
 * defined outside this section -- presumably this path means the rows feeding
 * src4 were all zero. Confirm there.
 */
970#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
971 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
972 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
973 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
974 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
975 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
976 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
977 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
978 #rounder ", %%mm4 \n\t"\
979 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
980 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
981 #rounder ", %%mm0 \n\t"\
982 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
983 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
984 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
985 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
986 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
987 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
988 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
989 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
990 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
991 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
992 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
993 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
994 "psrad $" #shift ", %%mm7 \n\t"\
995 "psrad $" #shift ", %%mm4 \n\t"\
996 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
997 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
998 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
999 "psrad $" #shift ", %%mm0 \n\t"\
1000 "psrad $" #shift ", %%mm2 \n\t"\
1001 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1002 "movd %%mm7, " #dst " \n\t"\
1003 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1004 "movd %%mm0, 16+" #dst " \n\t"\
1005 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
1006 "movd %%mm2, 96+" #dst " \n\t"\
1007 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1008 "movd %%mm4, 112+" #dst " \n\t"\
1009 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
1010 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1011 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1012 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
1013 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1014 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
1015 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
1016 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
1017 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
1018 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
1019 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1020 "psrad $" #shift ", %%mm2 \n\t"\
1021 "psrad $" #shift ", %%mm5 \n\t"\
1022 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1023 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
1024 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1025 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1026 "psrad $" #shift ", %%mm6 \n\t"\
1027 "psrad $" #shift ", %%mm4 \n\t"\
1028 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
1029 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1030 "movd %%mm2, 32+" #dst " \n\t"\
1031 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1032 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1033 "movd %%mm6, 48+" #dst " \n\t"\
1034 "movd %%mm4, 64+" #dst " \n\t"\
1035 "movd %%mm5, 80+" #dst " \n\t"
1036
1037//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1038#if !defined(VBOX) || !defined(__DARWIN__)
1039IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1040IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1041IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1042IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1043#else
1044IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
1045IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
1046IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
1047IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
1048#endif
1049 "jmp 9f \n\t"
1050
1051 "#.balign 16 \n\t"\
1052 "3: \n\t"
1053#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 3: only src0 and src1 are
 * read; src4 and src5 are accepted but never referenced.
 *
 * With no src4 contribution the even part collapses: mm4 (product with
 * 16(%2)) is copied to mm6 and serves as both A0 and A3, mm0 (product with
 * 24(%2)) is copied to mm5 and serves as both A1 and A2.
 * The odd part B0..B3 comes from src1 alone (pmaddwd with the table entries
 * at 48, 64, 80 and 96(%2)).
 * Results are shifted right by `shift`, packed with packssdw and stored with
 * movd at dst + 0/16/32/48/64/80/96/112 in the order A0+B0, A1+B1, A2+B2,
 * A3+B3, A3-B3, A2-B2, A1-B1, A0-B0.
 * `rounder` is stringified in front of ", %%mm4" and ", %%mm0".
 * Clobbers mm0-mm7.
 * NOTE(review): label 3 is a Z_COND_IDCT branch target; Z_COND_IDCT is
 * defined outside this section -- presumably this path means the rows feeding
 * src4 and src5 were all zero. Confirm there.
 */
1054#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1055 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1056 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1057 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1058 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1059 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1060 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1061 #rounder ", %%mm4 \n\t"\
1062 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1063 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1064 #rounder ", %%mm0 \n\t"\
1065 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1066 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1067 "movq 64(%2), %%mm3 \n\t"\
1068 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1069 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1070 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1071 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1072 "psrad $" #shift ", %%mm7 \n\t"\
1073 "psrad $" #shift ", %%mm4 \n\t"\
1074 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1075 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1076 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1077 "psrad $" #shift ", %%mm0 \n\t"\
1078 "psrad $" #shift ", %%mm1 \n\t"\
1079 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1080 "movd %%mm7, " #dst " \n\t"\
1081 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1082 "movd %%mm0, 16+" #dst " \n\t"\
1083 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1084 "movd %%mm1, 96+" #dst " \n\t"\
1085 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1086 "movd %%mm4, 112+" #dst " \n\t"\
1087 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1088 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1089 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1090 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1091 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1092 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1093 "psrad $" #shift ", %%mm1 \n\t"\
1094 "psrad $" #shift ", %%mm5 \n\t"\
1095 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1096 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1097 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1098 "psrad $" #shift ", %%mm6 \n\t"\
1099 "psrad $" #shift ", %%mm4 \n\t"\
1100 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1101 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1102 "movd %%mm1, 32+" #dst " \n\t"\
1103 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1104 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1105 "movd %%mm6, 48+" #dst " \n\t"\
1106 "movd %%mm4, 64+" #dst " \n\t"\
1107 "movd %%mm5, 80+" #dst " \n\t"
1108
1109
1110//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1111#if !defined(VBOX) || !defined(__DARWIN__)
1112IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1113IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1114IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1115IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1116#else
1117IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
1118IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
1119IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
1120IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
1121#endif
1122 "jmp 9f \n\t"
1123
1124 "#.balign 16 \n\t"\
1125 "5: \n\t"
1126#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 5: even part only.
 * src1 and src5 are accepted but never referenced.
 *
 * Unlike the movd-based variants, one expansion handles two 8-byte groups:
 * src0/src4 and 8+src0/8+src4. The second group's results are packed into
 * the upper half of the first group's registers (packssdw) and each output
 * row is written with one 8-byte movq.
 * Because no odd part is added, the rows are mirrored:
 *   A0 -> dst and 112+dst      A1 -> 16+dst and 96+dst
 *   A2 -> 32+dst and 80+dst    A3 -> 48+dst and 64+dst
 * (the "A2-B2" / "A3+B3" inline comments on the last packssdw lines are
 * leftovers; no B term exists in this variant).
 * This is why the macro is invoked only for dst 0(%0) and 8(%0), with the
 * 4(%0) and 12(%0) invocations commented out.
 * `rounder` is stringified in front of ", %%mm4", ", %%mm0", ", %%mm1" and
 * ", %%mm2"; `shift` is the psrad count.
 * Clobbers mm0-mm7.
 * NOTE(review): label 5 is a Z_COND_IDCT branch target; Z_COND_IDCT is
 * defined outside this section -- presumably this path means the rows feeding
 * src1 and src5 were all zero. Confirm there.
 */
1127#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1128 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1129 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1130 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1131 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1132 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1133 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1134 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1135 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1136 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1137 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1138 #rounder ", %%mm4 \n\t"\
1139 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1140 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1141 #rounder ", %%mm0 \n\t"\
1142 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1143 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1144 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1145 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1146 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1147 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
1148 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1149 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1150 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1151 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1152 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1153 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1154 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1155 #rounder ", %%mm1 \n\t"\
1156 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1157 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1158 #rounder ", %%mm2 \n\t"\
1159 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1160 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1161 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1162 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1163 "psrad $" #shift ", %%mm4 \n\t"\
1164 "psrad $" #shift ", %%mm7 \n\t"\
1165 "psrad $" #shift ", %%mm3 \n\t"\
1166 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1167 "movq %%mm4, " #dst " \n\t"\
1168 "psrad $" #shift ", %%mm0 \n\t"\
1169 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1170 "movq %%mm0, 16+" #dst " \n\t"\
1171 "movq %%mm0, 96+" #dst " \n\t"\
1172 "movq %%mm4, 112+" #dst " \n\t"\
1173 "psrad $" #shift ", %%mm5 \n\t"\
1174 "psrad $" #shift ", %%mm6 \n\t"\
1175 "psrad $" #shift ", %%mm2 \n\t"\
1176 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1177 "movq %%mm5, 32+" #dst " \n\t"\
1178 "psrad $" #shift ", %%mm1 \n\t"\
1179 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1180 "movq %%mm6, 48+" #dst " \n\t"\
1181 "movq %%mm6, 64+" #dst " \n\t"\
1182 "movq %%mm5, 80+" #dst " \n\t"
1183
1184
1185//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1186#if !defined(VBOX) || !defined(__DARWIN__)
1187IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1188//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1189IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1190//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1191#else
1192IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
1193//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
1194IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
1195//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
1196#endif
1197 "jmp 9f \n\t"
1198
1199
1200 "#.balign 16 \n\t"\
1201 "1: \n\t"
1202#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 1: src0, src4 and src1 are
 * read; src5 is accepted but never referenced.
 *
 * The even part A0..A3 is built from src0 and src4; the odd part B0..B3 comes
 * from src1 alone (pmaddwd with the table entries at 48, 64, 80 and 96(%2)).
 * Results are shifted right by `shift`, packed with packssdw and stored with
 * movd at dst + 0/16/32/48/64/80/96/112 as A0+B0, A1+B1, A2+B2, A3+B3, A3-B3,
 * A2-B2, A1-B1, A0-B0.
 * `rounder` is stringified in front of ", %%mm4" and ", %%mm0".
 * Clobbers mm0-mm7.
 * NOTE(review): label 1 is a Z_COND_IDCT branch target; Z_COND_IDCT is
 * defined outside this section -- presumably this path means the rows feeding
 * src5 were all zero. Confirm there.
 */
1203#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1204 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1205 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1206 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1207 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1208 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1209 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1210 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1211 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1212 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1213 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1214 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1215 #rounder ", %%mm4 \n\t"\
1216 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1217 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1218 #rounder ", %%mm0 \n\t"\
1219 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1220 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1221 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1222 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1223 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1224 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1225 "movq 64(%2), %%mm1 \n\t"\
1226 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1227 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1228 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1229 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1230 "psrad $" #shift ", %%mm7 \n\t"\
1231 "psrad $" #shift ", %%mm4 \n\t"\
1232 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1233 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1234 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1235 "psrad $" #shift ", %%mm0 \n\t"\
1236 "psrad $" #shift ", %%mm3 \n\t"\
1237 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1238 "movd %%mm7, " #dst " \n\t"\
1239 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1240 "movd %%mm0, 16+" #dst " \n\t"\
1241 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1242 "movd %%mm3, 96+" #dst " \n\t"\
1243 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1244 "movd %%mm4, 112+" #dst " \n\t"\
1245 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1246 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1247 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1248 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1249 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1250 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1251 "psrad $" #shift ", %%mm3 \n\t"\
1252 "psrad $" #shift ", %%mm5 \n\t"\
1253 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1254 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1255 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1256 "psrad $" #shift ", %%mm6 \n\t"\
1257 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1258 "movd %%mm3, 32+" #dst " \n\t"\
1259 "psrad $" #shift ", %%mm4 \n\t"\
1260 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1261 "movd %%mm6, 48+" #dst " \n\t"\
1262 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1263 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1264 "movd %%mm4, 64+" #dst " \n\t"\
1265 "movd %%mm5, 80+" #dst " \n\t"
1266
1267
1268//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1269#if !defined(VBOX) || !defined(__DARWIN__)
1270IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1271IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1272IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1273IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1274#else
1275IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
1276IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
1277IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
1278IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
1279#endif
1280 "jmp 9f \n\t"
1281
1282
1283 "#.balign 16 \n\t"
1284 "7: \n\t"
1285#undef IDCT
/*
 * Column-pass IDCT variant expanded after label 7: only src0 (and 8+src0) is
 * read; src4, src1 and src5 are accepted but never referenced.
 *
 * Two 8-byte groups are handled per expansion. For each group the products
 * with 16(%2) and 24(%2) are shifted right by `shift`; the second group's
 * results are packed into the upper half of the first group's registers and
 * every output row is written with one 8-byte movq:
 *   product with 16(%2) -> dst, 48+dst, 64+dst, 112+dst
 *   product with 24(%2) -> 16+dst, 32+dst, 80+dst, 96+dst
 * This is why the macro is invoked only for dst 0(%0) and 8(%0), with the
 * 4(%0) and 12(%0) invocations commented out.
 * `rounder` is stringified in front of ", %%mm4", ", %%mm0", ", %%mm1" and
 * ", %%mm2".
 * Uses mm0, mm1, mm2, mm4, mm5 and mm7.
 * NOTE(review): the load of 32(%2) into mm7 is not consumed by any later
 * instruction of this macro; it looks like a leftover.
 * NOTE(review): label 7 is a Z_COND_IDCT branch target; Z_COND_IDCT is
 * defined outside this section -- presumably this path means the rows feeding
 * src1, src4 and src5 were all zero. Confirm there.
 */
1286#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1287 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1288 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1289 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1290 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1291 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1292 #rounder ", %%mm4 \n\t"\
1293 #rounder ", %%mm0 \n\t"\
1294 "psrad $" #shift ", %%mm4 \n\t"\
1295 "psrad $" #shift ", %%mm0 \n\t"\
1296 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1297 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1298 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1299 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1300 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1301 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1302 #rounder ", %%mm1 \n\t"\
1303 #rounder ", %%mm2 \n\t"\
1304 "psrad $" #shift ", %%mm1 \n\t"\
1305 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1306 "movq %%mm4, " #dst " \n\t"\
1307 "psrad $" #shift ", %%mm2 \n\t"\
1308 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1309 "movq %%mm0, 16+" #dst " \n\t"\
1310 "movq %%mm0, 96+" #dst " \n\t"\
1311 "movq %%mm4, 112+" #dst " \n\t"\
1312 "movq %%mm0, 32+" #dst " \n\t"\
1313 "movq %%mm4, 48+" #dst " \n\t"\
1314 "movq %%mm4, 64+" #dst " \n\t"\
1315 "movq %%mm0, 80+" #dst " \n\t"
1316
1317//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1318#if !defined(VBOX) || !defined(__DARWIN__)
1319IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1320//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1321IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1322//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1323#else
1324IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
1325//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
1326IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
1327//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
1328#endif
1329
1330
1331#endif
1332
1333/*
1334Input
1335 00 40 04 44 20 60 24 64
1336 10 30 14 34 50 70 54 74
1337 01 41 03 43 21 61 23 63
1338 11 31 13 33 51 71 53 73
1339 02 42 06 46 22 62 26 66
1340 12 32 16 36 52 72 56 76
1341 05 45 07 47 25 65 27 67
1342 15 35 17 37 55 75 57 77
1343
1344Temp
1345 00 04 10 14 20 24 30 34
1346 40 44 50 54 60 64 70 74
1347 01 03 11 13 21 23 31 33
1348 41 43 51 53 61 63 71 73
1349 02 06 12 16 22 26 32 36
1350 42 46 52 56 62 66 72 76
1351 05 07 15 17 25 27 35 37
1352 45 47 55 57 65 67 75 77
1353*/
1354
1355"9: \n\t"
1356 :: "r" (block), "r" (temp), "r" (coeffs)
1357 : "%eax"
1358 );
1359}
1360
/**
 * Public entry point: runs the MMX idct() routine of this file on one
 * coefficient block.
 * @param block coefficients to transform; idct() stores its results back
 *              through this same pointer, so the block is overwritten
 */
1361void ff_simple_idct_mmx(int16_t *block)
1362{
1363 idct(block);
1364}
1365
1366//FIXME merge add/put into the idct
1367
/**
 * Runs idct() on block, then passes the transformed block to
 * put_pixels_clamped_mmx().
 * @param dest      destination pointer handed to put_pixels_clamped_mmx()
 * @param line_size handed through unchanged; presumably the byte stride
 *                  between destination rows -- confirm against the helper
 * @param block     coefficient block; overwritten by idct()
 * NOTE(review): put_pixels_clamped_mmx() is defined outside this file; by its
 * name it stores the clamped results into dest -- verify there.
 */
1368void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1369{
1370 idct(block);
1371 put_pixels_clamped_mmx(block, dest, line_size);
1372}
/**
 * Runs idct() on block, then passes the transformed block to
 * add_pixels_clamped_mmx().
 * @param dest      destination pointer handed to add_pixels_clamped_mmx()
 * @param line_size handed through unchanged; presumably the byte stride
 *                  between destination rows -- confirm against the helper
 * @param block     coefficient block; overwritten by idct()
 * NOTE(review): add_pixels_clamped_mmx() is defined outside this file; by its
 * name it adds the results to the existing contents of dest with clamping --
 * verify there.
 */
1373void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1374{
1375 idct(block);
1376 add_pixels_clamped_mmx(block, dest, line_size);
1377}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette