/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <[email protected]>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <[email protected]>
 */

#include "../dsputil.h"
#include "../simple_idct.h"
#include "../mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"

//#undef NDEBUG
//#include <assert.h>

extern const uint8_t ff_h263_loop_filter_strength[32];
extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
static const uint64_t ff_pw_8  attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;

#if !defined(VBOX) || !defined(__DARWIN__)
#define JUMPALIGN() __asm __volatile (".balign 8"::)
#else
#define JUMPALIGN() __asm __volatile (".align 3"::)
#endif
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "  \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "  \n\t"\
    "paddb %%" #regd ", %%" #regd "  \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd "  \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd "  \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it is better to access constants this way
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "  \n\t" \
    "psrlw $15, %%" #regd "  \n\t" \
    "packuswb %%" #regd ", %%" #regd "  \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "  \n\t" \
    "psrlw $15, %%" #regd "  \n\t" \
    "psllw $1, %%" #regd "  \n\t"::)

#endif

// using regr as temporary and for the output result
// the first argument is unmodified and the second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb "  \n\t"\
    "psrlq $1, " #regb "  \n\t"\
    "paddb " #regb ", " #regr "  \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb "  \n\t"\
    "psrlq $1, " #regb "  \n\t"\
    "psubb " #regb ", " #regr "  \n\t"
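// The two macros above rely on the byte-averaging identities
//   (a + b) >> 1     == (a & b) + (((a ^ b) & 0xfe) >> 1)   (round down)
//   (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)   (round up)
// The 0xfe mask in regfe clears the bits that psrlq would otherwise
// shift across byte boundaries, since MMX has no per-byte shift.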

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "  \n\t"\
    "pand %%mm6, " #regd "  \n\t"\
    "psrlq $1, " #regb "  \n\t"\
    "psrlq $1, " #regd "  \n\t"\
    "paddb " #regb ", " #regr "  \n\t"\
    "paddb " #regd ", " #regp "  \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por " #regb ", " #regr "  \n\t"\
    "por " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "  \n\t"\
    "pand %%mm6, " #regd "  \n\t"\
    "psrlq $1, " #regd "  \n\t"\
    "psrlq $1, " #regb "  \n\t"\
    "psubb " #regb ", " #regr "  \n\t"\
    "psubb " #regd ", " #regp "  \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons, PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* introduced only in the MMX2 instruction set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "  \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "  \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "  \n\t" /* cgdh */\

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"  \n\t"
        "pxor %%mm7, %%mm7  \n\t"
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16  \n\t"
#else
        ".align 4  \n\t"
#endif
        "1:  \n\t"
        "movq (%0), %%mm0  \n\t"
        "movq (%0, %2), %%mm2  \n\t"
        "movq %%mm0, %%mm1  \n\t"
        "movq %%mm2, %%mm3  \n\t"
        "punpcklbw %%mm7, %%mm0  \n\t"
        "punpckhbw %%mm7, %%mm1  \n\t"
        "punpcklbw %%mm7, %%mm2  \n\t"
        "punpckhbw %%mm7, %%mm3  \n\t"
        "movq %%mm0, (%1, %%"REG_a")  \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")  \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0  \n\t"
        "add $32, %%"REG_a"  \n\t"
        "js 1b  \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
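/* Scalar reference for get_pixels_mmx above (it widens an 8x8 block of
 * bytes into 16-bit DCT coefficients, two rows per asm iteration):
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[i*8 + j] = pixels[i*line_size + j];
 */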

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7  \n\t"
        "mov $-128, %%"REG_a"  \n\t"
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16  \n\t"
#else
        ".align 4  \n\t"
#endif
        "1:  \n\t"
        "movq (%0), %%mm0  \n\t"
        "movq (%1), %%mm2  \n\t"
        "movq %%mm0, %%mm1  \n\t"
        "movq %%mm2, %%mm3  \n\t"
        "punpcklbw %%mm7, %%mm0  \n\t"
        "punpckhbw %%mm7, %%mm1  \n\t"
        "punpcklbw %%mm7, %%mm2  \n\t"
        "punpckhbw %%mm7, %%mm3  \n\t"
        "psubw %%mm2, %%mm0  \n\t"
        "psubw %%mm3, %%mm1  \n\t"
        "movq %%mm0, (%2, %%"REG_a")  \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")  \n\t"
        "add %3, %0  \n\t"
        "add %3, %1  \n\t"
        "add $16, %%"REG_a"  \n\t"
        "jnz 1b  \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
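/* Scalar reference for diff_pixels_mmx above:
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[i*8 + j] = s1[i*stride + j] - s2[i*stride + j];
 */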
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq %3, %%mm0  \n\t"
        "movq 8%3, %%mm1  \n\t"
        "movq 16%3, %%mm2  \n\t"
        "movq 24%3, %%mm3  \n\t"
        "movq 32%3, %%mm4  \n\t"
        "movq 40%3, %%mm5  \n\t"
        "movq 48%3, %%mm6  \n\t"
        "movq 56%3, %%mm7  \n\t"
        "packuswb %%mm1, %%mm0  \n\t"
        "packuswb %%mm3, %%mm2  \n\t"
        "packuswb %%mm5, %%mm4  \n\t"
        "packuswb %%mm7, %%mm6  \n\t"
        "movq %%mm0, (%0)  \n\t"
        "movq %%mm2, (%0, %1)  \n\t"
        "movq %%mm4, (%0, %1, 2)  \n\t"
        "movq %%mm6, (%0, %2)  \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus an "r" constraint is used
    // for the block pointer here
    __asm __volatile(
        "movq (%3), %%mm0  \n\t"
        "movq 8(%3), %%mm1  \n\t"
        "movq 16(%3), %%mm2  \n\t"
        "movq 24(%3), %%mm3  \n\t"
        "movq 32(%3), %%mm4  \n\t"
        "movq 40(%3), %%mm5  \n\t"
        "movq 48(%3), %%mm6  \n\t"
        "movq 56(%3), %%mm7  \n\t"
        "packuswb %%mm1, %%mm0  \n\t"
        "packuswb %%mm3, %%mm2  \n\t"
        "packuswb %%mm5, %%mm4  \n\t"
        "packuswb %%mm7, %%mm6  \n\t"
        "movq %%mm0, (%0)  \n\t"
        "movq %%mm2, (%0, %1)  \n\t"
        "movq %%mm4, (%0, %1, 2)  \n\t"
        "movq %%mm6, (%0, %2)  \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}
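/* Note: the clamping of the 16-bit coefficients to the 0..255 pixel range
 * comes for free from packuswb, which saturates signed words to unsigned
 * bytes. */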

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
    { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq (%2), %%mm0  \n\t"
            "movq 8(%2), %%mm1  \n\t"
            "movq 16(%2), %%mm2  \n\t"
            "movq 24(%2), %%mm3  \n\t"
            "movq %0, %%mm4  \n\t"
            "movq %1, %%mm6  \n\t"
            "movq %%mm4, %%mm5  \n\t"
            "punpcklbw %%mm7, %%mm4  \n\t"
            "punpckhbw %%mm7, %%mm5  \n\t"
            "paddsw %%mm4, %%mm0  \n\t"
            "paddsw %%mm5, %%mm1  \n\t"
            "movq %%mm6, %%mm5  \n\t"
            "punpcklbw %%mm7, %%mm6  \n\t"
            "punpckhbw %%mm7, %%mm5  \n\t"
            "paddsw %%mm6, %%mm2  \n\t"
            "paddsw %%mm5, %%mm3  \n\t"
            "packuswb %%mm1, %%mm0  \n\t"
            "packuswb %%mm3, %%mm2  \n\t"
            "movq %%mm0, %0  \n\t"
            "movq %%mm2, %1  \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"  \n\t"
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 8  \n\t"
#else
        ".align 3  \n\t"
#endif
        "1:  \n\t"
        "movd (%1), %%mm0  \n\t"
        "movd (%1, %3), %%mm1  \n\t"
        "movd %%mm0, (%2)  \n\t"
        "movd %%mm1, (%2, %3)  \n\t"
        "add %%"REG_a", %1  \n\t"
        "add %%"REG_a", %2  \n\t"
        "movd (%1), %%mm0  \n\t"
        "movd (%1, %3), %%mm1  \n\t"
        "movd %%mm0, (%2)  \n\t"
        "movd %%mm1, (%2, %3)  \n\t"
        "add %%"REG_a", %1  \n\t"
        "add %%"REG_a", %2  \n\t"
        "subl $4, %0  \n\t"
        "jnz 1b  \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"  \n\t"
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 8  \n\t"
#else
        ".align 3  \n\t"
#endif
        "1:  \n\t"
        "movq (%1), %%mm0  \n\t"
        "movq (%1, %3), %%mm1  \n\t"
        "movq %%mm0, (%2)  \n\t"
        "movq %%mm1, (%2, %3)  \n\t"
        "add %%"REG_a", %1  \n\t"
        "add %%"REG_a", %2  \n\t"
        "movq (%1), %%mm0  \n\t"
        "movq (%1, %3), %%mm1  \n\t"
        "movq %%mm0, (%2)  \n\t"
        "movq %%mm1, (%2, %3)  \n\t"
        "add %%"REG_a", %1  \n\t"
        "add %%"REG_a", %2  \n\t"
        "subl $4, %0  \n\t"
        "jnz 1b  \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"  \n\t"
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 8  \n\t"
#else
        ".align 3  \n\t"
#endif
        "1:  \n\t"
        "movq (%1), %%mm0  \n\t"
        "movq 8(%1), %%mm4  \n\t"
        "movq (%1, %3), %%mm1  \n\t"
        "movq 8(%1, %3), %%mm5  \n\t"
        "movq %%mm0, (%2)  \n\t"
        "movq %%mm4, 8(%2)  \n\t"
        "movq %%mm1, (%2, %3)  \n\t"
        "movq %%mm5, 8(%2, %3)  \n\t"
        "add %%"REG_a", %1  \n\t"
        "add %%"REG_a", %2  \n\t"
        "movq (%1), %%mm0  \n\t"
        "movq 8(%1), %%mm4  \n\t"
        "movq (%1, %3), %%mm1  \n\t"
        "movq 8(%1, %3), %%mm5  \n\t"
        "movq %%mm0, (%2)  \n\t"
        "movq %%mm4, 8(%2)  \n\t"
        "movq %%mm1, (%2, %3)  \n\t"
        "movq %%mm5, 8(%2, %3)  \n\t"
        "add %%"REG_a", %1  \n\t"
        "add %%"REG_a", %2  \n\t"
        "subl $4, %0  \n\t"
        "jnz 1b  \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7  \n\t"
        "mov $-128*6, %%"REG_a"  \n\t"
        "1:  \n\t"
        "movq %%mm7, (%0, %%"REG_a")  \n\t"
        "movq %%mm7, 8(%0, %%"REG_a")  \n\t"
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
        "add $32, %%"REG_a"  \n\t"
        " js 1b  \n\t"
        : : "r" (((uint8_t *)blocks)+128*6)
        : "%"REG_a
    );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7  \n\t"
        "pxor %%mm6, %%mm6  \n\t"
        "1:  \n\t"
        "movq (%2, %1), %%mm0  \n\t"
        "movq (%2, %1), %%mm1  \n\t"
        "movq 8(%2, %1), %%mm2  \n\t"
        "movq 8(%2, %1), %%mm3  \n\t"
        "punpcklbw %%mm7, %%mm0  \n\t"
        "punpckhbw %%mm7, %%mm1  \n\t"
        "punpcklbw %%mm7, %%mm2  \n\t"
        "punpckhbw %%mm7, %%mm3  \n\t"
        "paddw %%mm0, %%mm1  \n\t"
        "paddw %%mm2, %%mm3  \n\t"
        "paddw %%mm1, %%mm3  \n\t"
        "paddw %%mm3, %%mm6  \n\t"
        "add %3, %1  \n\t"
        " js 1b  \n\t"
        "movq %%mm6, %%mm5  \n\t"
        "psrlq $32, %%mm6  \n\t"
        "paddw %%mm5, %%mm6  \n\t"
        "movq %%mm6, %%mm5  \n\t"
        "psrlq $16, %%mm6  \n\t"
        "paddw %%mm5, %%mm6  \n\t"
        "movd %%mm6, %0  \n\t"
        "andl $0xFFFF, %0  \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
    );

    return sum;
}
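/* The tail of pix_sum16_mmx above folds the four 16-bit partial sums in
 * mm6 into one value: it adds the high dword onto the low one, then the
 * high word onto the low one, and masks the result to 16 bits. */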
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:  \n\t"
        "movq (%1, %0), %%mm0  \n\t"
        "movq (%2, %0), %%mm1  \n\t"
        "paddb %%mm0, %%mm1  \n\t"
        "movq %%mm1, (%2, %0)  \n\t"
        "movq 8(%1, %0), %%mm0  \n\t"
        "movq 8(%2, %0), %%mm1  \n\t"
        "paddb %%mm0, %%mm1  \n\t"
        "movq %%mm1, 8(%2, %0)  \n\t"
        "add $16, %0  \n\t"
        "cmp %3, %0  \n\t"
        " jb 1b  \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
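/* The asm loop above handles 16 bytes per iteration while at least 16
 * bytes remain (hence the "w-15" bound); the scalar loop adds the
 * remaining tail bytes one at a time. */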

#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7  \n\t"\
    "movq %0, %%mm0  \n\t"\
    "movq %0, %%mm1  \n\t"\
    "movq %3, %%mm2  \n\t"\
    "movq %3, %%mm3  \n\t"\
    "punpcklbw %%mm7, %%mm0  \n\t"\
    "punpckhbw %%mm7, %%mm1  \n\t"\
    "punpcklbw %%mm7, %%mm2  \n\t"\
    "punpckhbw %%mm7, %%mm3  \n\t"\
    "psubw %%mm2, %%mm0  \n\t"\
    "psubw %%mm3, %%mm1  \n\t"\
    "movq %1, %%mm2  \n\t"\
    "movq %1, %%mm3  \n\t"\
    "movq %2, %%mm4  \n\t"\
    "movq %2, %%mm5  \n\t"\
    "punpcklbw %%mm7, %%mm2  \n\t"\
    "punpckhbw %%mm7, %%mm3  \n\t"\
    "punpcklbw %%mm7, %%mm4  \n\t"\
    "punpckhbw %%mm7, %%mm5  \n\t"\
    "psubw %%mm2, %%mm4  \n\t"\
    "psubw %%mm3, %%mm5  \n\t"\
    "psllw $2, %%mm4  \n\t"\
    "psllw $2, %%mm5  \n\t"\
    "paddw %%mm0, %%mm4  \n\t"\
    "paddw %%mm1, %%mm5  \n\t"\
    "pxor %%mm6, %%mm6  \n\t"\
    "pcmpgtw %%mm4, %%mm6  \n\t"\
    "pcmpgtw %%mm5, %%mm7  \n\t"\
    "pxor %%mm6, %%mm4  \n\t"\
    "pxor %%mm7, %%mm5  \n\t"\
    "psubw %%mm6, %%mm4  \n\t"\
    "psubw %%mm7, %%mm5  \n\t"\
    "psrlw $3, %%mm4  \n\t"\
    "psrlw $3, %%mm5  \n\t"\
    "packuswb %%mm5, %%mm4  \n\t"\
    "packsswb %%mm7, %%mm6  \n\t"\
    "pxor %%mm7, %%mm7  \n\t"\
    "movd %4, %%mm2  \n\t"\
    "punpcklbw %%mm2, %%mm2  \n\t"\
    "punpcklbw %%mm2, %%mm2  \n\t"\
    "punpcklbw %%mm2, %%mm2  \n\t"\
    "psubusb %%mm4, %%mm2  \n\t"\
    "movq %%mm2, %%mm3  \n\t"\
    "psubusb %%mm4, %%mm3  \n\t"\
    "psubb %%mm3, %%mm2  \n\t"\
    "movq %1, %%mm3  \n\t"\
    "movq %2, %%mm4  \n\t"\
    "pxor %%mm6, %%mm3  \n\t"\
    "pxor %%mm6, %%mm4  \n\t"\
    "paddusb %%mm2, %%mm3  \n\t"\
    "psubusb %%mm2, %%mm4  \n\t"\
    "pxor %%mm6, %%mm3  \n\t"\
    "pxor %%mm6, %%mm4  \n\t"\
    "paddusb %%mm2, %%mm2  \n\t"\
    "packsswb %%mm1, %%mm0  \n\t"\
    "pcmpgtb %%mm0, %%mm7  \n\t"\
    "pxor %%mm7, %%mm0  \n\t"\
    "psubb %%mm7, %%mm0  \n\t"\
    "movq %%mm0, %%mm1  \n\t"\
    "psubusb %%mm2, %%mm0  \n\t"\
    "psubb %%mm0, %%mm1  \n\t"\
    "pand %5, %%mm1  \n\t"\
    "psrlw $2, %%mm1  \n\t"\
    "pxor %%mm7, %%mm1  \n\t"\
    "psubb %%mm7, %%mm1  \n\t"\
    "movq %0, %%mm5  \n\t"\
    "movq %3, %%mm6  \n\t"\
    "psubb %%mm1, %%mm5  \n\t"\
    "paddb %%mm1, %%mm6  \n\t"
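/* The macro above implements the H.263 deblocking filter on four rows
 * (or, after transposition, columns) p0..p3 around a block edge, passed
 * in as operands %0..%3. Roughly: it computes
 *     d = (p0 - p3 + 4*(p2 - p1)) / 8,
 * takes the absolute value, derives the correction from the strength
 * limit in %4 (clipped via the unsigned-saturation psubusb trick),
 * applies it with opposite signs to p1 (%1) and p2 (%2), and applies a
 * smaller correction, masked with ff_pb_FC (%5), to p0/p3. */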

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1  \n\t"
        "movq %%mm4, %2  \n\t"
        "movq %%mm5, %0  \n\t"
        "movq %%mm6, %3  \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd %4, %%mm0  \n\t"
        "movd %5, %%mm1  \n\t"
        "movd %6, %%mm2  \n\t"
        "movd %7, %%mm3  \n\t"
        "punpcklbw %%mm1, %%mm0  \n\t"
        "punpcklbw %%mm3, %%mm2  \n\t"
        "movq %%mm0, %%mm1  \n\t"
        "punpcklwd %%mm2, %%mm0  \n\t"
        "punpckhwd %%mm2, %%mm1  \n\t"
        "movd %%mm0, %0  \n\t"
        "punpckhdq %%mm0, %%mm0  \n\t"
        "movd %%mm0, %1  \n\t"
        "movd %%mm1, %2  \n\t"
        "punpckhdq %%mm1, %%mm1  \n\t"
        "movd %%mm1, %3  \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1  \n\t"
        "movq %%mm4, %%mm0  \n\t"
        "punpcklbw %%mm3, %%mm5  \n\t"
        "punpcklbw %%mm6, %%mm4  \n\t"
        "punpckhbw %%mm3, %%mm1  \n\t"
        "punpckhbw %%mm6, %%mm0  \n\t"
        "movq %%mm5, %%mm3  \n\t"
        "movq %%mm1, %%mm6  \n\t"
        "punpcklwd %%mm4, %%mm5  \n\t"
        "punpcklwd %%mm0, %%mm1  \n\t"
        "punpckhwd %%mm4, %%mm3  \n\t"
        "punpckhwd %%mm0, %%mm6  \n\t"
        "movd %%mm5, (%0)  \n\t"
        "punpckhdq %%mm5, %%mm5  \n\t"
        "movd %%mm5, (%0,%2)  \n\t"
        "movd %%mm3, (%0,%2,2)  \n\t"
        "punpckhdq %%mm3, %%mm3  \n\t"
        "movd %%mm3, (%0,%3)  \n\t"
        "movd %%mm1, (%1)  \n\t"
        "punpckhdq %%mm1, %%mm1  \n\t"
        "movd %%mm1, (%1,%2)  \n\t"
        "movd %%mm6, (%1,%2,2)  \n\t"
        "punpckhdq %%mm6, %%mm6  \n\t"
        "movd %%mm6, (%1,%3)  \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)stride),
           "r" ((long)(3*stride))
    );
}

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"  /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                        pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n" /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"    /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"    /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n" /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"  /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"  /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n" /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
        "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
        "1:\n"
        "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((long)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}
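/* hf_noise8_mmx measures high-frequency "noise" as a sum of absolute
 * second-order differences: d(x,y) is the difference of horizontally
 * adjacent pixels (extracted with the psllq/psrlq byte-shift trick),
 * and the accumulated value is sum of |d(x,y) - d(x,y+1)| over the
 * block; hf_noise16_mmx below does the same using an unaligned load
 * for the horizontal neighbour. */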

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}
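/* The two NSSE comparison functions above combine plain SSE with the
 * difference in high-frequency noise between the two blocks:
 *     score = sse + nsse_weight * |noise(pix1) - noise(pix2)|
 * falling back to a weight of 8 when no encoder context is available. */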

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n"\
    "movq 8(%0), %%mm3\n"\
    "add %2,%0\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n"\
    "movq 8(%0), " #out1 "\n"\
    "add %2,%0\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

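/* The two vsad16 variants below compare signed byte differences, so they
 * first build the constant 0x8080808080808080 in mm7 (pcmpeqw/psllw $15
 * yields 0x8000 per word, which packsswb saturates to 0x80 per byte) and
 * xor it into the wrapped psubb results, biasing them into unsigned range
 * so the unsigned-saturation absolute-difference trick applies. */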
1320 | static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
|
---|
1321 | int tmp;
|
---|
1322 |
|
---|
1323 | assert( (((int)pix1) & 7) == 0);
|
---|
1324 | assert( (((int)pix2) & 7) == 0);
|
---|
1325 | assert((line_size &7) ==0);
|
---|
1326 |
|
---|
1327 | #define SUM(in0, in1, out0, out1) \
|
---|
1328 | "movq (%0),%%mm2\n"\
|
---|
1329 | "movq (%1)," #out0 "\n"\
|
---|
1330 | "movq 8(%0),%%mm3\n"\
|
---|
1331 | "movq 8(%1)," #out1 "\n"\
|
---|
1332 | "add %3,%0\n"\
|
---|
1333 | "add %3,%1\n"\
|
---|
1334 | "psubb " #out0 ", %%mm2\n"\
|
---|
1335 | "psubb " #out1 ", %%mm3\n"\
|
---|
1336 | "pxor %%mm7, %%mm2\n"\
|
---|
1337 | "pxor %%mm7, %%mm3\n"\
|
---|
1338 | "movq %%mm2, " #out0 "\n"\
|
---|
1339 | "movq %%mm3, " #out1 "\n"\
|
---|
1340 | "psubusb " #in0 ", %%mm2\n"\
|
---|
1341 | "psubusb " #in1 ", %%mm3\n"\
|
---|
1342 | "psubusb " #out0 ", " #in0 "\n"\
|
---|
1343 | "psubusb " #out1 ", " #in1 "\n"\
|
---|
1344 | "por %%mm2, " #in0 "\n"\
|
---|
1345 | "por %%mm3, " #in1 "\n"\
|
---|
1346 | "movq " #in0 ", %%mm2\n"\
|
---|
1347 | "movq " #in1 ", %%mm3\n"\
|
---|
1348 | "punpcklbw %%mm7, " #in0 "\n"\
|
---|
1349 | "punpcklbw %%mm7, " #in1 "\n"\
|
---|
1350 | "punpckhbw %%mm7, %%mm2\n"\
|
---|
1351 | "punpckhbw %%mm7, %%mm3\n"\
|
---|
1352 | "paddw " #in1 ", " #in0 "\n"\
|
---|
1353 | "paddw %%mm3, %%mm2\n"\
|
---|
1354 | "paddw %%mm2, " #in0 "\n"\
|
---|
1355 | "paddw " #in0 ", %%mm6\n"
|
---|
1356 |
|
---|
1357 |
|
---|
1358 | asm volatile (
|
---|
1359 | "movl %4,%%ecx\n"
|
---|
1360 | "pxor %%mm6,%%mm6\n"
|
---|
1361 | "pcmpeqw %%mm7,%%mm7\n"
|
---|
1362 | "psllw $15, %%mm7\n"
|
---|
1363 | "packsswb %%mm7, %%mm7\n"
|
---|
1364 | "movq (%0),%%mm0\n"
|
---|
1365 | "movq (%1),%%mm2\n"
|
---|
1366 | "movq 8(%0),%%mm1\n"
|
---|
1367 | "movq 8(%1),%%mm3\n"
|
---|
1368 | "add %3,%0\n"
|
---|
1369 | "add %3,%1\n"
|
---|
1370 | "subl $2, %%ecx\n"
|
---|
1371 | "psubb %%mm2, %%mm0\n"
|
---|
1372 | "psubb %%mm3, %%mm1\n"
|
---|
1373 | "pxor %%mm7, %%mm0\n"
|
---|
1374 | "pxor %%mm7, %%mm1\n"
|
---|
1375 | SUM(%%mm0, %%mm1, %%mm4, %%mm5)
|
---|
1376 | "1:\n"
|
---|
1377 |
|
---|
1378 | SUM(%%mm4, %%mm5, %%mm0, %%mm1)
|
---|
1379 |
|
---|
1380 | SUM(%%mm0, %%mm1, %%mm4, %%mm5)
|
---|
1381 |
|
---|
1382 | "subl $2, %%ecx\n"
|
---|
1383 | "jnz 1b\n"
|
---|
1384 |
|
---|
1385 | "movq %%mm6,%%mm0\n"
|
---|
1386 | "psrlq $32, %%mm6\n"
|
---|
1387 | "paddw %%mm6,%%mm0\n"
|
---|
1388 | "movq %%mm0,%%mm6\n"
|
---|
1389 | "psrlq $16, %%mm0\n"
|
---|
1390 | "paddw %%mm6,%%mm0\n"
|
---|
1391 | "movd %%mm0,%2\n"
|
---|
1392 | : "+r" (pix1), "+r" (pix2), "=r"(tmp)
|
---|
1393 | : "r" ((long)line_size) , "m" (h)
|
---|
1394 | : "%ecx");
|
---|
1395 | return tmp & 0x7FFF;
|
---|
1396 | }
|
---|
1397 | #undef SUM
|
---|
1398 |
|
---|
1399 | static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
|
---|
1400 | int tmp;
|
---|
1401 |
|
---|
1402 | assert( (((int)pix1) & 7) == 0);
|
---|
1403 | assert( (((int)pix2) & 7) == 0);
|
---|
1404 | assert((line_size &7) ==0);
|
---|
1405 |
|
---|
1406 | #define SUM(in0, in1, out0, out1) \
|
---|
1407 | "movq (%0)," #out0 "\n"\
|
---|
1408 | "movq (%1),%%mm2\n"\
|
---|
1409 | "movq 8(%0)," #out1 "\n"\
|
---|
1410 | "movq 8(%1),%%mm3\n"\
|
---|
1411 | "add %3,%0\n"\
|
---|
1412 | "add %3,%1\n"\
|
---|
1413 | "psubb %%mm2, " #out0 "\n"\
|
---|
1414 | "psubb %%mm3, " #out1 "\n"\
|
---|
1415 | "pxor %%mm7, " #out0 "\n"\
|
---|
1416 | "pxor %%mm7, " #out1 "\n"\
|
---|
1417 | "psadbw " #out0 ", " #in0 "\n"\
|
---|
1418 | "psadbw " #out1 ", " #in1 "\n"\
|
---|
1419 | "paddw " #in1 ", " #in0 "\n"\
|
---|
1420 | "paddw " #in0 ", %%mm6\n"
|
---|
1421 |
|
---|
1422 | asm volatile (
|
---|
1423 | "movl %4,%%ecx\n"
|
---|
1424 | "pxor %%mm6,%%mm6\n"
|
---|
1425 | "pcmpeqw %%mm7,%%mm7\n"
|
---|
1426 | "psllw $15, %%mm7\n"
|
---|
1427 | "packsswb %%mm7, %%mm7\n"
|
---|
1428 | "movq (%0),%%mm0\n"
|
---|
1429 | "movq (%1),%%mm2\n"
|
---|
1430 | "movq 8(%0),%%mm1\n"
|
---|
1431 | "movq 8(%1),%%mm3\n"
|
---|
1432 | "add %3,%0\n"
|
---|
1433 | "add %3,%1\n"
|
---|
1434 | "subl $2, %%ecx\n"
|
---|
1435 | "psubb %%mm2, %%mm0\n"
|
---|
1436 | "psubb %%mm3, %%mm1\n"
|
---|
1437 | "pxor %%mm7, %%mm0\n"
|
---|
1438 | "pxor %%mm7, %%mm1\n"
|
---|
1439 | SUM(%%mm0, %%mm1, %%mm4, %%mm5)
|
---|
1440 | "1:\n"
|
---|
1441 |
|
---|
1442 | SUM(%%mm4, %%mm5, %%mm0, %%mm1)
|
---|
1443 |
|
---|
1444 | SUM(%%mm0, %%mm1, %%mm4, %%mm5)
|
---|
1445 |
|
---|
1446 | "subl $2, %%ecx\n"
|
---|
1447 | "jnz 1b\n"
|
---|
1448 |
|
---|
1449 | "movd %%mm6,%2\n"
|
---|
1450 | : "+r" (pix1), "+r" (pix2), "=r"(tmp)
|
---|
1451 | : "r" ((long)line_size) , "m" (h)
|
---|
1452 | : "%ecx");
|
---|
1453 | return tmp;
|
---|
1454 | }
|
---|
1455 | #undef SUM
|
---|
1456 |
|
---|
1457 | static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
---|
1458 | long i=0;
|
---|
1459 | asm volatile(
|
---|
1460 | "1: \n\t"
|
---|
1461 | "movq (%2, %0), %%mm0 \n\t"
|
---|
1462 | "movq (%1, %0), %%mm1 \n\t"
|
---|
1463 | "psubb %%mm0, %%mm1 \n\t"
|
---|
1464 | "movq %%mm1, (%3, %0) \n\t"
|
---|
1465 | "movq 8(%2, %0), %%mm0 \n\t"
|
---|
1466 | "movq 8(%1, %0), %%mm1 \n\t"
|
---|
1467 | "psubb %%mm0, %%mm1 \n\t"
|
---|
1468 | "movq %%mm1, 8(%3, %0) \n\t"
|
---|
1469 | "add $16, %0 \n\t"
|
---|
1470 | "cmp %4, %0 \n\t"
|
---|
1471 | " jb 1b \n\t"
|
---|
1472 | : "+r" (i)
|
---|
1473 | : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
|
---|
1474 | );
|
---|
1475 | for(; i<w; i++)
|
---|
1476 | dst[i+0] = src1[i+0]-src2[i+0];
|
---|
1477 | }

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq -1(%1, %0), %%mm0         \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq -1(%2, %0), %%mm2         \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
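
/* For illustration, a scalar sketch of the median prediction computed by
 * the pminub/pmaxub sequence above (hypothetical helper, not compiled):
 * per byte, pred = max(min(L,T), min(L+T-LT, max(L,T))), which is the
 * median of L, T and L+T-LT, i.e. mid_pred(). */
#if 0
static void sub_hfyu_median_prediction_scalar_sketch(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                                     int w, int *left, int *left_top){
    int i;
    uint8_t l= *left, lt= *left_top;
    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];              // previous top pixel becomes left-top
        l = src2[i];              // current pixel becomes left
        dst[i]= l - pred;         // store the prediction residual
    }
    *left= l;
    *left_top= lt;
}
#endif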

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)

#define MMABS(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"\
    "paddusw " #a ", " #sum "         \n\t"
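
/* Note on the two abs variants above: plain MMX computes |x| with the
 * two's-complement identity |x| = (x ^ m) - m, where m is the sign mask
 * produced by pcmpgtw (-1 for negative words, 0 otherwise); the MMX2
 * variant gets the same result with one instruction less via
 * pmaxsw(x, -x). Scalar equivalent of the identity:
 *
 *     int16_t m = x >> 15;          // 0 or -1
 *     int16_t abs_x = (x ^ m) - m;
 */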

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "            \n\t"\
    "movq "#o"+16(%1), " #b "         \n\t"\
    "movq "#o"+32(%1), " #c "         \n\t"\
    "movq "#o"+48(%1), " #d "         \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+16(%1)           \n\t"\
    "movq "#c", "#o"+32(%1)           \n\t"\
    "movq "#d", "#o"+48(%1)           \n\t"

static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    DECLARE_ALIGNED_8(uint64_t, temp[16]);
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5              \n\t" //FIXME remove
        "movq %%mm6, %%mm7              \n\t"
        "movq %%mm0, %%mm6              \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)             \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)             \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)               \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1               \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1              \n\t"
        "psrlq $32, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "psrlq $16, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movd %%mm0, %0                 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}
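
/* The function above computes a SATD: the sum of absolute values of the
 * 8x8 Hadamard transform of the pixel differences. The horizontal
 * reduction uses saturating adds (paddusw) and the result is masked with
 * 0xFFFF, so extremely large block differences saturate instead of
 * wrapping around. */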

static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    DECLARE_ALIGNED_8(uint64_t, temp[16]);
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5              \n\t" //FIXME remove
        "movq %%mm6, %%mm7              \n\t"
        "movq %%mm0, %%mm6              \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)             \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)             \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)               \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1               \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        "pshufw $0x0E, %%mm0, %%mm1     \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "pshufw $0x01, %%mm0, %%mm1     \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movd %%mm0, %0                 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}


WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "             \n\t" /* d */\
    "movq "#in0", %%mm5               \n\t" /* D */\
    "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
    "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5               \n\t" /* C */\
    "movq "#in2", %%mm6               \n\t" /* B */\
    "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
    "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rounder */\
    "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                  \n\t"\
    "packuswb %%mm5, %%mm5            \n\t"\
    OP(%%mm5, out, %%mm7, d)
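
/* The macro above evaluates one row of the MPEG-4 quarter-pel
 * interpolation filter. With x1..x4 denoting the four symmetric
 * neighbour sums, each output pixel is
 *
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5
 *
 * i.e. the 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1)/32 that the
 * scalar fallback loops below spell out explicitly. */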

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1), %%mm4, q)\
        /* mm0=GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0             \n\t"\
            "movq 24(%0), %%mm1             \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}
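
/* Note on the two code paths generated above: the _mmx2 variants keep the
 * whole horizontal filter in registers with the help of pshufw (an
 * MMX2/SSE integer instruction), while the _3dnow variants compute the
 * filter taps in scalar C and use MMX only for the final rounding,
 * shifting and packing; hence the "quick HACK" comments. */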

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, 2*17*8(%1)         \n\t"\
        "movq %%mm3, 3*17*8(%1)         \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=4;\
\
    /*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7            \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0                   \n\t"\
        "add %6, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 9*8(%1)            \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=2;\
\
    /*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7            \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "add $72, %0                    \n\t"\
        "add %6, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
    );\
}\
\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
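
/* The OP argument abstracts the final store of all the qpel functions:
 * PUT_OP simply writes the filtered result, while the AVG variants average
 * it with the bytes already at the destination (both pavgb and pavgusb
 * compute (a+b+1)>>1, i.e. they round up), which is what the avg_*
 * motion-compensation functions require. */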

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

#if 0
static void just_return() { return; }
#endif

#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    asm volatile(
        "movd %0, %%mm6                 \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq %0, %%mm4         \n\t"
                "movq %1, %%mm5         \n\t"
                "paddw %2, %%mm4        \n\t"
                "paddw %3, %%mm5        \n\t"
                "movq %%mm4, %0         \n\t"
                "movq %%mm5, %1         \n\t"
                "psrlw $12, %%mm4       \n\t"
                "psrlw $12, %%mm5       \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq %%mm6, %%mm2      \n\t"
                "movq %%mm6, %%mm1      \n\t"
                "psubw %%mm4, %%mm2     \n\t"
                "psubw %%mm5, %%mm1     \n\t"
                "movq %%mm2, %%mm0      \n\t"
                "movq %%mm4, %%mm3      \n\t"
                "pmullw %%mm1, %%mm0    \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3    \n\t" // dx*dy
                "pmullw %%mm5, %%mm2    \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1    \n\t" // dx*(s-dy)

                "movd %4, %%mm5         \n\t"
                "movd %3, %%mm4         \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3    \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2    \n\t" // src[0,1] * (s-dx)*dy

                "movd %2, %%mm5         \n\t"
                "movd %1, %%mm4         \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1    \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0    \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw %5, %%mm1        \n\t"
                "paddw %%mm3, %%mm2     \n\t"
                "paddw %%mm1, %%mm0     \n\t"
                "paddw %%mm2, %%mm0     \n\t"

                "psrlw %6, %%mm0        \n\t"
                "packuswb %%mm0, %%mm0  \n\t"
                "movd %%mm0, %0         \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}
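
/* The inner loop above is plain bilinear interpolation in fixed point:
 * with s = 1<<shift and (dx,dy) the subpel offsets, each output pixel is
 *
 *   ( src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy)
 *   + src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy     + r ) >> (2*shift)
 *
 * computed four pixels at a time on 16-bit words. */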

static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    long i=0;

    assert(ABS(scale) < 256);
    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;

    asm volatile(
        "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
        "psrlw $15, %%mm6               \n\t" //  1w
        "pxor %%mm7, %%mm7              \n\t"
        "movd %4, %%mm5                 \n\t"
        "punpcklwd %%mm5, %%mm5         \n\t"
        "punpcklwd %%mm5, %%mm5         \n\t"
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "pmulhw %%mm5, %%mm0            \n\t"
        "pmulhw %%mm5, %%mm1            \n\t"
        "paddw %%mm6, %%mm0             \n\t"
        "paddw %%mm6, %%mm1             \n\t"
        "psraw $1, %%mm0                \n\t"
        "psraw $1, %%mm1                \n\t"
        "paddw  (%2, %0), %%mm0         \n\t"
        "paddw 8(%2, %0), %%mm1         \n\t"
        "psraw $6, %%mm0                \n\t"
        "psraw $6, %%mm1                \n\t"
        "pmullw  (%3, %0), %%mm0        \n\t"
        "pmullw 8(%3, %0), %%mm1        \n\t"
        "pmaddwd %%mm0, %%mm0           \n\t"
        "pmaddwd %%mm1, %%mm1           \n\t"
        "paddd %%mm1, %%mm0             \n\t"
        "psrld $4, %%mm0                \n\t"
        "paddd %%mm0, %%mm7             \n\t"
        "add $16, %0                    \n\t"
        "cmp $128, %0                   \n\t" //FIXME optimize & bench
        " jb 1b                         \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "psrlq $32, %%mm7               \n\t"
        "paddd %%mm6, %%mm7             \n\t"
        "psrld $2, %%mm7                \n\t"
        "movd %%mm7, %0                 \n\t"

        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
    );
    return i;
}
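
/* Roughly, the loop above reconstructs rem + round(basis*scale) in the
 * encoder's fixed-point format (the pmulhw / +1 / >>1 sequence is a
 * round-to-nearest multiply), scales it down, multiplies by the
 * per-coefficient weight and accumulates the squares; the returned value
 * is therefore a weighted squared error for the candidate basis change. */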

static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
    long i=0;

    if(ABS(scale) < 256){
        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
        asm volatile(
            "pcmpeqw %%mm6, %%mm6       \n\t" // -1w
            "psrlw $15, %%mm6           \n\t" //  1w
            "movd %3, %%mm5             \n\t"
            "punpcklwd %%mm5, %%mm5     \n\t"
            "punpcklwd %%mm5, %%mm5     \n\t"
            "1:                         \n\t"
            "movq  (%1, %0), %%mm0      \n\t"
            "movq 8(%1, %0), %%mm1      \n\t"
            "pmulhw %%mm5, %%mm0        \n\t"
            "pmulhw %%mm5, %%mm1        \n\t"
            "paddw %%mm6, %%mm0         \n\t"
            "paddw %%mm6, %%mm1         \n\t"
            "psraw $1, %%mm0            \n\t"
            "psraw $1, %%mm1            \n\t"
            "paddw  (%2, %0), %%mm0     \n\t"
            "paddw 8(%2, %0), %%mm1     \n\t"
            "movq %%mm0,  (%2, %0)      \n\t"
            "movq %%mm1, 8(%2, %0)      \n\t"
            "add $16, %0                \n\t"
            "cmp $128, %0               \n\t" //FIXME optimize & bench
            " jb 1b                     \n\t"

            : "+r" (i)
            : "r"(basis), "r"(rem), "g"(scale)
        );
    }else{
        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}

#define PREFETCH(name, op) \
void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
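
/* Usage sketch (illustration only; the actual wiring happens in
 * dsputil_init_mmx below, conditional on the detected CPU flags):
 *
 *     if(mm_flags & MM_MMXEXT)     c->prefetch = prefetch_mmx2;
 *     else if(mm_flags & MM_3DNOW) c->prefetch = prefetch_3dnow;
 *
 * Each expansion issues one prefetch per row so that the DSP call which
 * follows finds the block already in cache. */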

#include "h264dsp_mmx.c"

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_mmx(int16_t *data);
void ff_vp3_dsp_init_mmx(void);

/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
2659 | static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2660 | {
|
---|
2661 | ff_mmx_idct (block);
|
---|
2662 | put_pixels_clamped_mmx(block, dest, line_size);
|
---|
2663 | }
|
---|
2664 | static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2665 | {
|
---|
2666 | ff_mmx_idct (block);
|
---|
2667 | add_pixels_clamped_mmx(block, dest, line_size);
|
---|
2668 | }
|
---|
2669 | static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2670 | {
|
---|
2671 | ff_mmxext_idct (block);
|
---|
2672 | put_pixels_clamped_mmx(block, dest, line_size);
|
---|
2673 | }
|
---|
2674 | static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2675 | {
|
---|
2676 | ff_mmxext_idct (block);
|
---|
2677 | add_pixels_clamped_mmx(block, dest, line_size);
|
---|
2678 | }
|
---|
2679 | static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2680 | {
|
---|
2681 | ff_vp3_idct_sse2(block);
|
---|
2682 | put_signed_pixels_clamped_mmx(block, dest, line_size);
|
---|
2683 | }
|
---|
2684 | static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2685 | {
|
---|
2686 | ff_vp3_idct_sse2(block);
|
---|
2687 | add_pixels_clamped_mmx(block, dest, line_size);
|
---|
2688 | }
|
---|
2689 | static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2690 | {
|
---|
2691 | ff_vp3_idct_mmx(block);
|
---|
2692 | put_signed_pixels_clamped_mmx(block, dest, line_size);
|
---|
2693 | }
|
---|
2694 | static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
2695 | {
|
---|
2696 | ff_vp3_idct_mmx(block);
|
---|
2697 | add_pixels_clamped_mmx(block, dest, line_size);
|
---|
2698 | }
|
---|
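/* Each wrapper runs the IDCT in place on the coefficient block, then either
 * stores (put) or accumulates (add) the clamped result into dest. The VP3
 * put variants use the signed clamp because the VP3 IDCT emits pixels
 * centered around zero, which need the +128 bias that
 * put_signed_pixels_clamped_mmx applies. */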

#ifdef CONFIG_SNOW_ENCODER
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                         int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
#endif

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }
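    /* The low 16 bits of dsp_mask name CPU-feature flags: with FF_MM_FORCE
     * set they are forced on regardless of detection, otherwise they are
     * masked off. This lets a caller pin down a code path, e.g.
     * (hypothetical use) avctx->dsp_mask = MM_SSE2; keeps the SSE2
     * routines out while testing the MMX fallbacks. */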

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & MM_MMXEXT)
        av_log(avctx, AV_LOG_INFO, " mmxext");
    if (mm_flags & MM_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & MM_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & MM_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int idct_algo= avctx->idct_algo;

#ifdef CONFIG_ENCODERS
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }
#endif //CONFIG_ENCODERS
        if(avctx->lowres==0){
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_VP3){
                if(mm_flags & MM_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    ff_vp3_dsp_init_mmx();
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
            }
        }
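        /* idct_permutation_type tells the generic code how the chosen IDCT
         * wants its coefficients ordered; the scantables are permuted once
         * up front so each implementation can read its native layout
         * without a per-block shuffle. */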

#ifdef CONFIG_ENCODERS
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
#endif //CONFIG_ENCODERS
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
#ifdef CONFIG_ENCODERS
        c->pix_sum = pix_sum16_mmx;
#endif //CONFIG_ENCODERS

        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
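        /* Table layout: the first index selects the block width (0 = 16
         * pixels, 1 = 8), the second the half-pel position (0 = aligned,
         * 1 = x+1/2, 2 = y+1/2, 3 = x+1/2,y+1/2). The "no_rnd" variants
         * round down instead of to nearest, as some codecs require. */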

        c->gmc= gmc_mmx;

        c->add_bytes= add_bytes_mmx;
#ifdef CONFIG_ENCODERS
        c->diff_bytes= diff_bytes_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

#endif //CONFIG_ENCODERS
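        /* vsad16_mmx and try_8x8basis_mmx above do not produce results
         * identical to the C reference, so they are only installed when
         * the caller has not requested bit-exact output. */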

        c->h263_v_loop_filter= h263_v_loop_filter_mmx;
        c->h263_h_loop_filter= h263_h_loop_filter_mmx;
        c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
        c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;

        c->h264_idct_dc_add=
        c->h264_idct_add= ff_h264_idct_add_mmx;
        c->h264_idct8_dc_add=
        c->h264_idct8_add= ff_h264_idct8_add_mmx;

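        /* The chained assignments above make the DC-only adds alias the
         * full IDCT-add on plain MMX; the MMX2 branch below installs
         * dedicated DC versions. */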
        if (mm_flags & MM_MMXEXT) {
            c->prefetch = prefetch_mmx2;

            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;

#ifdef CONFIG_ENCODERS
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;
#endif //CONFIG_ENCODERS

            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
#ifdef CONFIG_ENCODERS
                c->vsad[0] = vsad16_mmx2;
#endif //CONFIG_ENCODERS
            }

#if 1
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif

//FIXME 3dnow too
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2

            dspfunc(put_h264_qpel, 0, 16);
            dspfunc(put_h264_qpel, 1, 8);
            dspfunc(put_h264_qpel, 2, 4);
            dspfunc(avg_h264_qpel, 0, 16);
            dspfunc(avg_h264_qpel, 1, 8);
            dspfunc(avg_h264_qpel, 2, 4);
#undef dspfunc
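            /* The mcXY suffix encodes the quarter-pel offset of each of the
             * 16 motion-compensation positions: X is the horizontal and Y
             * the vertical quarter-sample shift, so mc00 is the full-pel
             * copy and mc22 the half-pel center. */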

            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
            c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
            c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
            c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;

            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

#ifdef CONFIG_ENCODERS
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
#endif //CONFIG_ENCODERS
        } else if (mm_flags & MM_3DNOW) {
            c->prefetch = prefetch_3dnow;

            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }

            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow

            dspfunc(put_h264_qpel, 0, 16);
            dspfunc(put_h264_qpel, 1, 8);
            dspfunc(put_h264_qpel, 2, 4);
            dspfunc(avg_h264_qpel, 0, 16);
            dspfunc(avg_h264_qpel, 1, 8);
            dspfunc(avg_h264_qpel, 2, 4);

            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
        }
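
        /* Snow's inverse 9/7 wavelet lifting (horizontal/vertical compose)
         * and its OBMC block add get SIMD versions below when the Snow
         * encoder is built; SSE2 is preferred, MMX is the fallback. */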
#ifdef CONFIG_SNOW_ENCODER
        if(mm_flags & MM_SSE2){
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
        }
        else{
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
#endif
    }

#ifdef CONFIG_ENCODERS
    dsputil_init_pix_mmx(c, avctx);
#endif //CONFIG_ENCODERS
#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}