1 | /*
|
---|
2 | * MMX optimized forward DCT
|
---|
3 | * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
|
---|
4 | * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <[email protected]>
|
---|
5 | * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
|
---|
6 | *
|
---|
7 | * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
|
---|
8 | *
|
---|
9 | * Intel Application Note AP-922 - fast, precise implementation of DCT
|
---|
10 | * http://developer.intel.com/vtune/cbts/appnotes.htm
|
---|
11 | *
|
---|
12 | * Also of inspiration:
|
---|
13 | * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
|
---|
14 | * Skal's fdct at http://skal.planet-d.net/coding/dct.html
|
---|
15 | */
|
---|
16 | #include "common.h"
|
---|
17 | #include "../dsputil.h"
|
---|
18 | #include "mmx.h"
|
---|
19 |
|
---|
/* Request `align`-byte alignment for the declared object (GCC __attribute__ syntax).
 * Used below with 8 for data loaded by MMX code and 16 for data loaded by SSE2 code. */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
21 |
|
---|
22 | //////////////////////////////////////////////////////////////////////
|
---|
23 | //
|
---|
24 | // constants for the forward DCT
|
---|
25 | // -----------------------------
|
---|
26 | //
|
---|
27 | // Be sure to check that your compiler is aligning all constants to QWORD
|
---|
28 | // (8-byte) memory boundaries! Otherwise the unaligned memory access will
|
---|
29 | // severely stall MMX execution.
|
---|
30 | //
|
---|
31 | //////////////////////////////////////////////////////////////////////
|
---|
32 |
|
---|
/* Number of extra fractional bits carried between the column and row passes. */
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
/* Left shift applied to the inputs of the column pass (fdct_col). */
#define SHIFT_FRW_COL BITS_FRW_ACC
/* Arithmetic right shift applied to the 32-bit row-pass sums (evaluates to 17). */
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
/* Half of the row-pass divisor; added before the shift so the result is rounded. */
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
/* Disabled: nothing in this file adds a rounding constant in the column pass. */
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
38 |
|
---|
// Concatenated tangent table for the column pass (fdct_col), three groups of
// four identical words so one movq loads the same multiplier into every lane.
// Values are in Q16 (scaled by 1 << 16) for use with pmulhw:
//   +0: tan(1*pi/16) * 65536 + 0.5 = 13036
//   +4: tan(2*pi/16) * 65536 + 0.5 = 27146
//   +8: tan(3*pi/16) * 65536 + 0.5 - 65536 = -21746
// The last entry is stored minus 1.0 so it fits in int16_t; fdct_col adds the
// multiplicand back after each pmulhw that uses it.
static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
    13036, 13036, 13036, 13036,        // tan(pi/16)    in Q16
    27146, 27146, 27146, 27146,        // tan(2*pi/16)  in Q16
    -21746, -21746, -21746, -21746,    // tan(3*pi/16) - 1 in Q16
};
45 |
|
---|
// cos(pi/4) replicated into four words for pmulhw in fdct_col.
// 23170 = cos(pi/4) * 32768 (Q15); fdct_col pre-shifts the multiplicands by
// one extra bit (SHIFT_FRW_COL + 1) before multiplying by this constant.
static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
    23170, 23170, 23170, 23170,   // cos(pi/4) in Q15
};
49 |
|
---|
// Four 16-bit words each equal to 1. fdct_col ORs this into several results
// (por), which forces the least significant bit of every lane to 1.
// NOTE(review): presumably a cheap bias/rounding correction from AP-922 — confirm.
static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
51 |
|
---|
// Rounding term for the MMX/MMX2 row passes: two 32-bit lanes of RND_FRW_ROW,
// added to each pmaddwd sum before the psrad by SHIFT_FRW_ROW.
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
53 |
|
---|
// Rounding term for the SSE2 row pass: four 32-bit lanes of RND_FRW_ROW,
// loaded with movdqa (hence the 16-byte alignment) by fdct_row_sse2.
// The array is wrapped in an anonymous struct and the object is not static.
// NOTE(review): presumably a workaround so the 16-byte alignment request is
// honoured by the compiler — confirm before simplifying to the plain array
// form shown in the disabled line below.
struct
{
    const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
} fdct_r_row_sse2 ATTR_ALIGN(16)=
{{
 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
// Disabled earlier form, kept for reference:
//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
62 |
|
---|
63 | static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
|
---|
64 | 16384, 16384, 22725, 19266,
|
---|
65 | 16384, 16384, 12873, 4520,
|
---|
66 | 21407, 8867, 19266, -4520,
|
---|
67 | -8867, -21407, -22725, -12873,
|
---|
68 | 16384, -16384, 12873, -22725,
|
---|
69 | -16384, 16384, 4520, 19266,
|
---|
70 | 8867, -21407, 4520, -12873,
|
---|
71 | 21407, -8867, 19266, -22725,
|
---|
72 |
|
---|
73 | 22725, 22725, 31521, 26722,
|
---|
74 | 22725, 22725, 17855, 6270,
|
---|
75 | 29692, 12299, 26722, -6270,
|
---|
76 | -12299, -29692, -31521, -17855,
|
---|
77 | 22725, -22725, 17855, -31521,
|
---|
78 | -22725, 22725, 6270, 26722,
|
---|
79 | 12299, -29692, 6270, -17855,
|
---|
80 | 29692, -12299, 26722, -31521,
|
---|
81 |
|
---|
82 | 21407, 21407, 29692, 25172,
|
---|
83 | 21407, 21407, 16819, 5906,
|
---|
84 | 27969, 11585, 25172, -5906,
|
---|
85 | -11585, -27969, -29692, -16819,
|
---|
86 | 21407, -21407, 16819, -29692,
|
---|
87 | -21407, 21407, 5906, 25172,
|
---|
88 | 11585, -27969, 5906, -16819,
|
---|
89 | 27969, -11585, 25172, -29692,
|
---|
90 |
|
---|
91 | 19266, 19266, 26722, 22654,
|
---|
92 | 19266, 19266, 15137, 5315,
|
---|
93 | 25172, 10426, 22654, -5315,
|
---|
94 | -10426, -25172, -26722, -15137,
|
---|
95 | 19266, -19266, 15137, -26722,
|
---|
96 | -19266, 19266, 5315, 22654,
|
---|
97 | 10426, -25172, 5315, -15137,
|
---|
98 | 25172, -10426, 22654, -26722,
|
---|
99 |
|
---|
100 | 16384, 16384, 22725, 19266,
|
---|
101 | 16384, 16384, 12873, 4520,
|
---|
102 | 21407, 8867, 19266, -4520,
|
---|
103 | -8867, -21407, -22725, -12873,
|
---|
104 | 16384, -16384, 12873, -22725,
|
---|
105 | -16384, 16384, 4520, 19266,
|
---|
106 | 8867, -21407, 4520, -12873,
|
---|
107 | 21407, -8867, 19266, -22725,
|
---|
108 |
|
---|
109 | 19266, 19266, 26722, 22654,
|
---|
110 | 19266, 19266, 15137, 5315,
|
---|
111 | 25172, 10426, 22654, -5315,
|
---|
112 | -10426, -25172, -26722, -15137,
|
---|
113 | 19266, -19266, 15137, -26722,
|
---|
114 | -19266, 19266, 5315, 22654,
|
---|
115 | 10426, -25172, 5315, -15137,
|
---|
116 | 25172, -10426, 22654, -26722,
|
---|
117 |
|
---|
118 | 21407, 21407, 29692, 25172,
|
---|
119 | 21407, 21407, 16819, 5906,
|
---|
120 | 27969, 11585, 25172, -5906,
|
---|
121 | -11585, -27969, -29692, -16819,
|
---|
122 | 21407, -21407, 16819, -29692,
|
---|
123 | -21407, 21407, 5906, 25172,
|
---|
124 | 11585, -27969, 5906, -16819,
|
---|
125 | 27969, -11585, 25172, -29692,
|
---|
126 |
|
---|
127 | 22725, 22725, 31521, 26722,
|
---|
128 | 22725, 22725, 17855, 6270,
|
---|
129 | 29692, 12299, 26722, -6270,
|
---|
130 | -12299, -29692, -31521, -17855,
|
---|
131 | 22725, -22725, 17855, -31521,
|
---|
132 | -22725, 22725, 6270, 26722,
|
---|
133 | 12299, -29692, 6270, -17855,
|
---|
134 | 29692, -12299, 26722, -31521,
|
---|
135 | };
|
---|
136 |
|
---|
137 | struct
|
---|
138 | {
|
---|
139 | const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
|
---|
140 | } tab_frw_01234567_sse2 ATTR_ALIGN(16) =
|
---|
141 | {{
|
---|
142 | //static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
|
---|
143 | #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
|
---|
144 | C4, C4, C5, C7, C2, C6, C3, -C7, \
|
---|
145 | -C4, C4, C7, C3, C6, -C2, C7, -C5, \
|
---|
146 | C4, -C4, C5, -C1, C2, -C6, C3, -C1,
|
---|
147 | // c1..c7 * cos(pi/4) * 2^15
|
---|
148 | #define C1 22725
|
---|
149 | #define C2 21407
|
---|
150 | #define C3 19266
|
---|
151 | #define C4 16384
|
---|
152 | #define C5 12873
|
---|
153 | #define C6 8867
|
---|
154 | #define C7 4520
|
---|
155 | TABLE_SSE2
|
---|
156 |
|
---|
157 | #undef C1
|
---|
158 | #undef C2
|
---|
159 | #undef C3
|
---|
160 | #undef C4
|
---|
161 | #undef C5
|
---|
162 | #undef C6
|
---|
163 | #undef C7
|
---|
164 | #define C1 31521
|
---|
165 | #define C2 29692
|
---|
166 | #define C3 26722
|
---|
167 | #define C4 22725
|
---|
168 | #define C5 17855
|
---|
169 | #define C6 12299
|
---|
170 | #define C7 6270
|
---|
171 | TABLE_SSE2
|
---|
172 |
|
---|
173 | #undef C1
|
---|
174 | #undef C2
|
---|
175 | #undef C3
|
---|
176 | #undef C4
|
---|
177 | #undef C5
|
---|
178 | #undef C6
|
---|
179 | #undef C7
|
---|
180 | #define C1 29692
|
---|
181 | #define C2 27969
|
---|
182 | #define C3 25172
|
---|
183 | #define C4 21407
|
---|
184 | #define C5 16819
|
---|
185 | #define C6 11585
|
---|
186 | #define C7 5906
|
---|
187 | TABLE_SSE2
|
---|
188 |
|
---|
189 | #undef C1
|
---|
190 | #undef C2
|
---|
191 | #undef C3
|
---|
192 | #undef C4
|
---|
193 | #undef C5
|
---|
194 | #undef C6
|
---|
195 | #undef C7
|
---|
196 | #define C1 26722
|
---|
197 | #define C2 25172
|
---|
198 | #define C3 22654
|
---|
199 | #define C4 19266
|
---|
200 | #define C5 15137
|
---|
201 | #define C6 10426
|
---|
202 | #define C7 5315
|
---|
203 | TABLE_SSE2
|
---|
204 |
|
---|
205 | #undef C1
|
---|
206 | #undef C2
|
---|
207 | #undef C3
|
---|
208 | #undef C4
|
---|
209 | #undef C5
|
---|
210 | #undef C6
|
---|
211 | #undef C7
|
---|
212 | #define C1 22725
|
---|
213 | #define C2 21407
|
---|
214 | #define C3 19266
|
---|
215 | #define C4 16384
|
---|
216 | #define C5 12873
|
---|
217 | #define C6 8867
|
---|
218 | #define C7 4520
|
---|
219 | TABLE_SSE2
|
---|
220 |
|
---|
221 | #undef C1
|
---|
222 | #undef C2
|
---|
223 | #undef C3
|
---|
224 | #undef C4
|
---|
225 | #undef C5
|
---|
226 | #undef C6
|
---|
227 | #undef C7
|
---|
228 | #define C1 26722
|
---|
229 | #define C2 25172
|
---|
230 | #define C3 22654
|
---|
231 | #define C4 19266
|
---|
232 | #define C5 15137
|
---|
233 | #define C6 10426
|
---|
234 | #define C7 5315
|
---|
235 | TABLE_SSE2
|
---|
236 |
|
---|
237 | #undef C1
|
---|
238 | #undef C2
|
---|
239 | #undef C3
|
---|
240 | #undef C4
|
---|
241 | #undef C5
|
---|
242 | #undef C6
|
---|
243 | #undef C7
|
---|
244 | #define C1 29692
|
---|
245 | #define C2 27969
|
---|
246 | #define C3 25172
|
---|
247 | #define C4 21407
|
---|
248 | #define C5 16819
|
---|
249 | #define C6 11585
|
---|
250 | #define C7 5906
|
---|
251 | TABLE_SSE2
|
---|
252 |
|
---|
253 | #undef C1
|
---|
254 | #undef C2
|
---|
255 | #undef C3
|
---|
256 | #undef C4
|
---|
257 | #undef C5
|
---|
258 | #undef C6
|
---|
259 | #undef C7
|
---|
260 | #define C1 31521
|
---|
261 | #define C2 29692
|
---|
262 | #define C3 26722
|
---|
263 | #define C4 22725
|
---|
264 | #define C5 17855
|
---|
265 | #define C6 12299
|
---|
266 | #define C7 6270
|
---|
267 | TABLE_SSE2
|
---|
268 | }};
|
---|
269 |
|
---|
270 |
|
---|
/**
 * Column pass of the forward DCT for four adjacent columns.
 *
 * Each movq moves four int16_t values, so one call transforms columns
 * offset..offset+3; rows are addressed with a stride of 8 elements
 * (in + offset + r * 8). The callers in this file invoke it with offset 0
 * and offset 4 to cover all eight columns.
 *
 * The statements are hand-interleaved; do not reorder them. In the comments
 * below xN denotes the four words of input row N, and operand order follows
 * the usual mmx.h convention op(src, dst) (mmx.h is not visible here —
 * confirm against that header).
 *
 * @param in     source coefficients, read only
 * @param out    destination; rows 0..7 of the four columns are written
 * @param offset first column to process, in int16_t elements
 */
static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
{
    movq_m2r(*(in + offset + 1 * 8), mm0);        /* mm0 = x1 */
    movq_m2r(*(in + offset + 6 * 8), mm1);        /* mm1 = x6 */
    movq_r2r(mm0, mm2);                           /* mm2 = x1 */
    movq_m2r(*(in + offset + 2 * 8), mm3);        /* mm3 = x2 */
    paddsw_r2r(mm1, mm0);                         /* mm0 = x1 + x6 */
    movq_m2r(*(in + offset + 5 * 8), mm4);        /* mm4 = x5 */
    psllw_i2r(SHIFT_FRW_COL, mm0);                /* scale up by SHIFT_FRW_COL bits */
    movq_m2r(*(in + offset + 0 * 8), mm5);        /* mm5 = x0 */
    paddsw_r2r(mm3, mm4);                         /* mm4 = x2 + x5 */
    paddsw_m2r(*(in + offset + 7 * 8), mm5);      /* mm5 = x0 + x7 */
    psllw_i2r(SHIFT_FRW_COL, mm4);
    movq_r2r(mm0, mm6);                           /* mm6 = scaled (x1 + x6) */
    psubsw_r2r(mm1, mm2);                         /* mm2 = x1 - x6 */
    movq_m2r(*(fdct_tg_all_16 + 4), mm1);         /* second tangent group */
    psubsw_r2r(mm4, mm0);                         /* mm0 = (x1 + x6) - (x2 + x5), scaled */
    movq_m2r(*(in + offset + 3 * 8), mm7);        /* mm7 = x3 */
    pmulhw_r2r(mm0, mm1);
    paddsw_m2r(*(in + offset + 4 * 8), mm7);      /* mm7 = x3 + x4 */
    psllw_i2r(SHIFT_FRW_COL, mm5);
    paddsw_r2r(mm4, mm6);                         /* mm6 = (x1 + x6) + (x2 + x5), scaled */
    psllw_i2r(SHIFT_FRW_COL, mm7);
    movq_r2r(mm5, mm4);                           /* mm4 = scaled (x0 + x7) */
    psubsw_r2r(mm7, mm5);                         /* mm5 = (x0 + x7) - (x3 + x4), scaled */
    paddsw_r2r(mm5, mm1);                         /* value stored to output row 2 below */
    paddsw_r2r(mm7, mm4);                         /* mm4 = (x0 + x7) + (x3 + x4), scaled */
    por_m2r(fdct_one_corr, mm1);                  /* force LSB of each word to 1 */
    psllw_i2r(SHIFT_FRW_COL + 1, mm2);            /* one extra bit: multiplied by ocos_4_16 later */
    pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
    movq_r2r(mm4, mm7);
    psubsw_m2r(*(in + offset + 5 * 8), mm3);      /* mm3 = x2 - x5 */
    psubsw_r2r(mm6, mm4);                         /* value stored to output row 4 below */
    movq_r2m(mm1, *(out + offset + 2 * 8));       /* output row 2 */
    paddsw_r2r(mm6, mm7);                         /* value stored to output row 0 below */
    movq_m2r(*(in + offset + 3 * 8), mm1);        /* mm1 = x3 */
    psllw_i2r(SHIFT_FRW_COL + 1, mm3);
    psubsw_m2r(*(in + offset + 4 * 8), mm1);      /* mm1 = x3 - x4 */
    movq_r2r(mm2, mm6);
    movq_r2m(mm4, *(out + offset + 4 * 8));       /* output row 4 */
    paddsw_r2r(mm3, mm2);
    pmulhw_m2r(*ocos_4_16, mm2);                  /* scale sum by cos(pi/4) */
    psubsw_r2r(mm3, mm6);
    pmulhw_m2r(*ocos_4_16, mm6);                  /* scale difference by cos(pi/4) */
    psubsw_r2r(mm0, mm5);                         /* value stored to output row 6 below */
    por_m2r(fdct_one_corr, mm5);
    psllw_i2r(SHIFT_FRW_COL, mm1);
    por_m2r(fdct_one_corr, mm2);
    movq_r2r(mm1, mm4);
    movq_m2r(*(in + offset + 0 * 8), mm3);        /* mm3 = x0 */
    paddsw_r2r(mm6, mm1);
    psubsw_m2r(*(in + offset + 7 * 8), mm3);      /* mm3 = x0 - x7 */
    psubsw_r2r(mm6, mm4);
    movq_m2r(*(fdct_tg_all_16 + 0), mm0);         /* first tangent group */
    psllw_i2r(SHIFT_FRW_COL, mm3);
    movq_m2r(*(fdct_tg_all_16 + 8), mm6);         /* third tangent group (stored minus 1.0) */
    pmulhw_r2r(mm1, mm0);
    movq_r2m(mm7, *(out + offset + 0 * 8));       /* output row 0 */
    pmulhw_r2r(mm4, mm6);
    movq_r2m(mm5, *(out + offset + 6 * 8));       /* output row 6 */
    movq_r2r(mm3, mm7);
    movq_m2r(*(fdct_tg_all_16 + 8), mm5);         /* third tangent group again */
    psubsw_r2r(mm2, mm7);
    paddsw_r2r(mm2, mm3);
    pmulhw_r2r(mm7, mm5);
    paddsw_r2r(mm3, mm0);                         /* value stored to output row 1 below */
    paddsw_r2r(mm4, mm6);                         /* add multiplicand back: table entry is tan - 1 */
    pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
    por_m2r(fdct_one_corr, mm0);
    paddsw_r2r(mm7, mm5);                         /* add multiplicand back: table entry is tan - 1 */
    psubsw_r2r(mm6, mm7);                         /* value stored to output row 3 below */
    movq_r2m(mm0, *(out + offset + 1 * 8));       /* output row 1 */
    paddsw_r2r(mm4, mm5);                         /* value stored to output row 5 below */
    movq_r2m(mm7, *(out + offset + 3 * 8));       /* output row 3 */
    psubsw_r2r(mm1, mm3);                         /* value stored to output row 7 below */
    movq_r2m(mm5, *(out + offset + 5 * 8));       /* output row 5 */
    movq_r2m(mm3, *(out + offset + 7 * 8));       /* output row 7 */
}
349 |
|
---|
350 |
|
---|
351 | #if !defined(VBOX) || !defined(__DARWIN__) /* requires gnu as. */
|
---|
352 | static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
|
---|
353 | {
|
---|
354 | asm volatile(
|
---|
355 | ".macro FDCT_ROW_SSE2_H1 i t \n\t"
|
---|
356 | "movq \\i(%0), %%xmm2 \n\t"
|
---|
357 | "movq \\i+8(%0), %%xmm0 \n\t"
|
---|
358 | "movdqa \\t+32(%1), %%xmm3 \n\t"
|
---|
359 | "movdqa \\t+48(%1), %%xmm7 \n\t"
|
---|
360 | "movdqa \\t(%1), %%xmm4 \n\t"
|
---|
361 | "movdqa \\t+16(%1), %%xmm5 \n\t"
|
---|
362 | ".endm \n\t"
|
---|
363 | ".macro FDCT_ROW_SSE2_H2 i t \n\t"
|
---|
364 | "movq \\i(%0), %%xmm2 \n\t"
|
---|
365 | "movq \\i+8(%0), %%xmm0 \n\t"
|
---|
366 | "movdqa \\t+32(%1), %%xmm3 \n\t"
|
---|
367 | "movdqa \\t+48(%1), %%xmm7 \n\t"
|
---|
368 | ".endm \n\t"
|
---|
369 | ".macro FDCT_ROW_SSE2 i \n\t"
|
---|
370 | "movq %%xmm2, %%xmm1 \n\t"
|
---|
371 | "pshuflw $27, %%xmm0, %%xmm0 \n\t"
|
---|
372 | "paddsw %%xmm0, %%xmm1 \n\t"
|
---|
373 | "psubsw %%xmm0, %%xmm2 \n\t"
|
---|
374 | "punpckldq %%xmm2, %%xmm1 \n\t"
|
---|
375 | "pshufd $78, %%xmm1, %%xmm2 \n\t"
|
---|
376 | "pmaddwd %%xmm2, %%xmm3 \n\t"
|
---|
377 | "pmaddwd %%xmm1, %%xmm7 \n\t"
|
---|
378 | "pmaddwd %%xmm5, %%xmm2 \n\t"
|
---|
379 | "pmaddwd %%xmm4, %%xmm1 \n\t"
|
---|
380 | "paddd %%xmm7, %%xmm3 \n\t"
|
---|
381 | "paddd %%xmm2, %%xmm1 \n\t"
|
---|
382 | "paddd %%xmm6, %%xmm3 \n\t"
|
---|
383 | "paddd %%xmm6, %%xmm1 \n\t"
|
---|
384 | "psrad %3, %%xmm3 \n\t"
|
---|
385 | "psrad %3, %%xmm1 \n\t"
|
---|
386 | "packssdw %%xmm3, %%xmm1 \n\t"
|
---|
387 | "movdqa %%xmm1, \\i(%4) \n\t"
|
---|
388 | ".endm \n\t"
|
---|
389 | "movdqa (%2), %%xmm6 \n\t"
|
---|
390 | "FDCT_ROW_SSE2_H1 0 0 \n\t"
|
---|
391 | "FDCT_ROW_SSE2 0 \n\t"
|
---|
392 | "FDCT_ROW_SSE2_H2 64 0 \n\t"
|
---|
393 | "FDCT_ROW_SSE2 64 \n\t"
|
---|
394 |
|
---|
395 | "FDCT_ROW_SSE2_H1 16 64 \n\t"
|
---|
396 | "FDCT_ROW_SSE2 16 \n\t"
|
---|
397 | "FDCT_ROW_SSE2_H2 112 64 \n\t"
|
---|
398 | "FDCT_ROW_SSE2 112 \n\t"
|
---|
399 |
|
---|
400 | "FDCT_ROW_SSE2_H1 32 128 \n\t"
|
---|
401 | "FDCT_ROW_SSE2 32 \n\t"
|
---|
402 | "FDCT_ROW_SSE2_H2 96 128 \n\t"
|
---|
403 | "FDCT_ROW_SSE2 96 \n\t"
|
---|
404 |
|
---|
405 | "FDCT_ROW_SSE2_H1 48 192 \n\t"
|
---|
406 | "FDCT_ROW_SSE2 48 \n\t"
|
---|
407 | "FDCT_ROW_SSE2_H2 80 192 \n\t"
|
---|
408 | "FDCT_ROW_SSE2 80 \n\t"
|
---|
409 | :
|
---|
410 | : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
|
---|
411 | );
|
---|
412 | }
|
---|
413 | #endif /* !VBOX || !__DARWIN__ */
|
---|
414 |
|
---|
/**
 * Row pass of the forward DCT for one row of eight coefficients (MMX2).
 *
 * Differs from fdct_row_mmx() only in how the upper half of the row is
 * reversed: a single pshufw is used here. Operand order in the comments
 * follows the usual mmx.h convention op(src, dst) (mmx.h is not visible
 * here — confirm against that header).
 *
 * @param in    eight input coefficients of the row
 * @param out   eight output coefficients of the row
 * @param table 32 coefficients for this row (read at offsets 0..31)
 */
static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
{
    pshufw_m2r(*(in + 4), mm5, 0x1B);     /* mm5 = in[4..7] with the four words reversed */
    movq_m2r(*(in + 0), mm0);             /* mm0 = in[0..3] */
    movq_r2r(mm0, mm1);
    paddsw_r2r(mm5, mm0);                 /* mm0 = sums  in[k] + in[7-k], k = 0..3 */
    psubsw_r2r(mm5, mm1);                 /* mm1 = diffs in[k] - in[7-k], k = 0..3 */
    movq_r2r(mm0, mm2);
    punpckldq_r2r(mm1, mm0);              /* mm0 = sums 0,1 | diffs 0,1 */
    punpckhdq_r2r(mm1, mm2);              /* mm2 = sums 2,3 | diffs 2,3 */
    movq_m2r(*(table + 0), mm1);          /* first six coefficient vectors go to registers */
    movq_m2r(*(table + 4), mm3);
    movq_m2r(*(table + 8), mm4);
    movq_m2r(*(table + 12), mm5);
    movq_m2r(*(table + 16), mm6);
    movq_m2r(*(table + 20), mm7);
    pmaddwd_r2r(mm0, mm1);                /* pairwise multiply-add into 32-bit lanes */
    pmaddwd_r2r(mm2, mm3);
    pmaddwd_r2r(mm0, mm4);
    pmaddwd_r2r(mm2, mm5);
    pmaddwd_r2r(mm0, mm6);
    pmaddwd_r2r(mm2, mm7);
    pmaddwd_m2r(*(table + 24), mm0);      /* last two vectors are used straight from memory */
    pmaddwd_m2r(*(table + 28), mm2);
    paddd_r2r(mm1, mm3);                  /* combine the two halves of each dot product */
    paddd_r2r(mm4, mm5);
    paddd_r2r(mm6, mm7);
    paddd_r2r(mm0, mm2);
    movq_m2r(*fdct_r_row, mm0);           /* rounding term */
    paddd_r2r(mm0, mm3);
    paddd_r2r(mm0, mm5);
    paddd_r2r(mm0, mm7);
    paddd_r2r(mm0, mm2);
    psrad_i2r(SHIFT_FRW_ROW, mm3);        /* scale back down */
    psrad_i2r(SHIFT_FRW_ROW, mm5);
    psrad_i2r(SHIFT_FRW_ROW, mm7);
    psrad_i2r(SHIFT_FRW_ROW, mm2);
    packssdw_r2r(mm5, mm3);               /* saturate to int16_t: out[0..3] */
    packssdw_r2r(mm2, mm7);               /* saturate to int16_t: out[4..7] */
    movq_r2m(mm3, *(out + 0));
    movq_r2m(mm7, *(out + 4));
}
457 |
|
---|
/**
 * Row pass of the forward DCT for one row of eight coefficients (plain MMX).
 *
 * Same computation as fdct_row_mmx2(), but the reversed upper half of the
 * row is assembled with movd/punpcklwd/psrlq because pshufw is not used
 * here. Operand order in the comments follows the usual mmx.h convention
 * op(src, dst) (mmx.h is not visible here — confirm against that header).
 *
 * @param in    eight input coefficients of the row
 * @param out   eight output coefficients of the row
 * @param table 32 coefficients for this row (read at offsets 0..31)
 */
static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
    //FIXME reorder (no plain-MMX-only CPU was available to benchmark a reordering)
    movd_m2r(*(in + 6), mm1);             /* mm1 = in[6], in[7] */
    punpcklwd_m2r(*(in + 4), mm1);        /* mm1 = in[6], in[4], in[7], in[5] */
    movq_r2r(mm1, mm2);
    psrlq_i2r(0x20, mm1);                 /* mm1 = in[7], in[5], 0, 0 */
    movq_m2r(*(in + 0), mm0);             /* mm0 = in[0..3] */
    punpcklwd_r2r(mm2, mm1);              /* mm1 = in[7], in[6], in[5], in[4] */
    movq_r2r(mm0, mm5);
    paddsw_r2r(mm1, mm0);                 /* mm0 = sums  in[k] + in[7-k], k = 0..3 */
    psubsw_r2r(mm1, mm5);                 /* mm5 = diffs in[k] - in[7-k], k = 0..3 */
    movq_r2r(mm0, mm2);
    punpckldq_r2r(mm5, mm0);              /* mm0 = sums 0,1 | diffs 0,1 */
    punpckhdq_r2r(mm5, mm2);              /* mm2 = sums 2,3 | diffs 2,3 */
    movq_m2r(*(table + 0), mm1);          /* first six coefficient vectors go to registers */
    movq_m2r(*(table + 4), mm3);
    movq_m2r(*(table + 8), mm4);
    movq_m2r(*(table + 12), mm5);
    movq_m2r(*(table + 16), mm6);
    movq_m2r(*(table + 20), mm7);
    pmaddwd_r2r(mm0, mm1);                /* pairwise multiply-add into 32-bit lanes */
    pmaddwd_r2r(mm2, mm3);
    pmaddwd_r2r(mm0, mm4);
    pmaddwd_r2r(mm2, mm5);
    pmaddwd_r2r(mm0, mm6);
    pmaddwd_r2r(mm2, mm7);
    pmaddwd_m2r(*(table + 24), mm0);      /* last two vectors are used straight from memory */
    pmaddwd_m2r(*(table + 28), mm2);
    paddd_r2r(mm1, mm3);                  /* combine the two halves of each dot product */
    paddd_r2r(mm4, mm5);
    paddd_r2r(mm6, mm7);
    paddd_r2r(mm0, mm2);
    movq_m2r(*fdct_r_row, mm0);           /* rounding term */
    paddd_r2r(mm0, mm3);
    paddd_r2r(mm0, mm5);
    paddd_r2r(mm0, mm7);
    paddd_r2r(mm0, mm2);
    psrad_i2r(SHIFT_FRW_ROW, mm3);        /* scale back down */
    psrad_i2r(SHIFT_FRW_ROW, mm5);
    psrad_i2r(SHIFT_FRW_ROW, mm7);
    psrad_i2r(SHIFT_FRW_ROW, mm2);
    packssdw_r2r(mm5, mm3);               /* saturate to int16_t: out[0..3] */
    packssdw_r2r(mm2, mm7);               /* saturate to int16_t: out[4..7] */
    movq_r2m(mm3, *(out + 0));
    movq_r2m(mm7, *(out + 4));
}
505 |
|
---|
506 | void ff_fdct_mmx(int16_t *block)
|
---|
507 | {
|
---|
508 | int64_t align_tmp[16] ATTR_ALIGN(8);
|
---|
509 | int16_t * const block_tmp= (int16_t*)align_tmp;
|
---|
510 | int16_t *block1, *out;
|
---|
511 | const int16_t *table;
|
---|
512 | int i;
|
---|
513 |
|
---|
514 | block1 = block_tmp;
|
---|
515 | fdct_col(block, block1, 0);
|
---|
516 | fdct_col(block, block1, 4);
|
---|
517 |
|
---|
518 | block1 = block_tmp;
|
---|
519 | table = tab_frw_01234567;
|
---|
520 | out = block;
|
---|
521 | for(i=8;i>0;i--) {
|
---|
522 | fdct_row_mmx(block1, out, table);
|
---|
523 | block1 += 8;
|
---|
524 | table += 32;
|
---|
525 | out += 8;
|
---|
526 | }
|
---|
527 | }
|
---|
528 |
|
---|
529 | void ff_fdct_mmx2(int16_t *block)
|
---|
530 | {
|
---|
531 | int64_t align_tmp[16] ATTR_ALIGN(8);
|
---|
532 | int16_t * const block_tmp= (int16_t*)align_tmp;
|
---|
533 | int16_t *block1, *out;
|
---|
534 | const int16_t *table;
|
---|
535 | int i;
|
---|
536 |
|
---|
537 | block1 = block_tmp;
|
---|
538 | fdct_col(block, block1, 0);
|
---|
539 | fdct_col(block, block1, 4);
|
---|
540 |
|
---|
541 | block1 = block_tmp;
|
---|
542 | table = tab_frw_01234567;
|
---|
543 | out = block;
|
---|
544 | for(i=8;i>0;i--) {
|
---|
545 | fdct_row_mmx2(block1, out, table);
|
---|
546 | block1 += 8;
|
---|
547 | table += 32;
|
---|
548 | out += 8;
|
---|
549 | }
|
---|
550 | }
|
---|
551 |
|
---|
552 | void ff_fdct_sse2(int16_t *block)
|
---|
553 | {
|
---|
554 | #if !defined(VBOX) || !defined(__DARWIN__) /* requires gnu as. */
|
---|
555 | int64_t align_tmp[16] ATTR_ALIGN(8);
|
---|
556 | int16_t * const block_tmp= (int16_t*)align_tmp;
|
---|
557 | int16_t *block1;
|
---|
558 |
|
---|
559 | block1 = block_tmp;
|
---|
560 | fdct_col(block, block1, 0);
|
---|
561 | fdct_col(block, block1, 4);
|
---|
562 |
|
---|
563 | fdct_row_sse2(block1, block);
|
---|
564 | #else /* VBOX && __DARWIN__ */
|
---|
565 | ff_fdct_mmx2(block);
|
---|
566 | #endif /* VBOX && __DARWIN__ */
|
---|
567 | }
|
---|
568 |
|
---|