VirtualBox

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/i386/motion_est_mmx.c@ 5776

Last change on this file since 5776 was 5776, checked in by vboxsync, 17 years ago

ffmpeg: exported to OSE

File size: 14.8 KB
Line 
1/*
2 * MMX optimized motion estimation
3 * Copyright (c) 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * mostly by Michael Niedermayer <[email protected]>
21 */
22#include "../dsputil.h"
23#include "x86_cpu.h"
24
/* Word-packed rounding constants: round_tab[n] adds n to every 16-bit lane
 * before the >>1 (x2/y2) or >>2 (xy2) shift in the word-arithmetic averaging
 * SAD routines below. Loaded into %mm5 by the PIX_SAD wrapper functions. */
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};

/* One in every byte; subtracted (saturating) in sad8_4_mmx2 — presumably to
 * counter the accumulated round-up bias of the chained pavgb averages. That
 * path is only installed when CODEC_FLAG_BITEXACT is off (see
 * dsputil_init_pix_mmx), consistent with it being approximate. */
static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
32
/*
 * SAD of an 8-pixel-wide block at integer-pel position, plain MMX.
 * blk1/blk2: the two blocks to compare; stride: line size; h: rows.
 * Processes two rows per loop iteration. |a-b| is built as
 * (a-b saturated) OR (b-a saturated) since MMX has no byte SAD
 * instruction; byte differences are then widened and accumulated as
 * four 16-bit partial sums in %mm6.
 * Preconditions (set up by the PIX_SAD wrappers): %mm6 == 0
 * (accumulator), %mm7 == 0 (zero source for punpck). The caller
 * extracts the total with sum_mmx().
 */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    /* Index runs from -stride*h up to 0 so the loop condition is a free
       sign test (js) on the add result. */
    long len= -(stride*h);
    asm volatile(
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16                     \n\t"
#else
        /* Darwin as: .align takes a power of two, so 4 == 16 bytes. */
        ".align 4                       \n\t"
#endif
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        /* |blk1 - blk2| per byte via two saturating subs + or */
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        /* second row */
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        /* widen bytes to words (against zeroed %mm7) and accumulate */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
72
/*
 * SAD of an 8-pixel-wide block at integer-pel position, MMX2 version.
 * Same contract as sad8_1_mmx, but uses psadbw, which sums the eight
 * byte absolute differences directly; two rows are handled per loop
 * iteration and accumulated in %mm6.
 * Precondition: %mm6 == 0 on entry (set by the PIX_SAD wrappers); the
 * caller reads the result with sum_mmx2().
 */
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    /* Negative index so the loop terminates on the js sign test. */
    long len= -(stride*h);
    asm volatile(
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16                     \n\t"
#else
        /* Darwin as: .align takes a power of two, so 4 == 16 bytes. */
        ".align 4                       \n\t"
#endif
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t"
        "add %3, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "psadbw %%mm1, %%mm3            \n\t"
        "paddw %%mm3, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
98
/*
 * Half-pel SAD, MMX2: averages blk1a and blk1b with pavgb (rounding
 * average) and takes the psadbw against blk2. The PIX_SAD wrappers pass
 * blk1/blk1+1 for x2 (horizontal half-pel) or blk1/blk1+stride for y2
 * (vertical half-pel). Two rows per iteration, accumulated in %mm6.
 * Precondition: %mm6 == 0 on entry; result read via sum_mmx2().
 */
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16                     \n\t"
#else
        ".align 4                       \n\t"
#endif
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "pavgb %%mm2, %%mm0             \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t"
        "add %4, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "pavgb %%mm1, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm1    \n\t"
        "psadbw %%mm1, %%mm3            \n\t"
        "paddw %%mm3, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
    );
}
128
/*
 * Quarter-area (xy2, half-pel in both directions) SAD, MMX2: the
 * four-neighbor average of blk1 is approximated with two chained pavgb
 * stages; `bone` (1 per byte) is subtracted between the stages,
 * presumably to counter pavgb's round-up bias. Because the result is
 * approximate, this version is only installed when CODEC_FLAG_BITEXACT
 * is off (see dsputil_init_pix_mmx). Two rows per iteration, total
 * accumulated in %mm6; precondition %mm6 == 0, result via sum_mmx2().
 */
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
    long len= -(stride*h);
    asm volatile(
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16                     \n\t"
#else
        ".align 4                       \n\t"
#endif
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "1:                             \n\t"
        /* rows y and y+1 (%1 is blk1, %2 is blk1+stride), columns x and x+1 */
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm1   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm3   \n\t"
        "pavgb %%mm2, %%mm0             \n\t"
        "pavgb %%mm1, %%mm3             \n\t"
        "psubusb %%mm5, %%mm3           \n\t"
        "pavgb %%mm3, %%mm0             \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t"
        "add %4, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "pavgb %%mm3, %%mm1             \n\t"
        "pavgb %%mm4, %%mm2             \n\t"
        "psubusb %%mm5, %%mm2           \n\t"
        "pavgb %%mm1, %%mm2             \n\t"
        "movq (%3, %%"REG_a"), %%mm1    \n\t"
        "psadbw %%mm1, %%mm2            \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}
169
/*
 * Half-pel SAD, plain MMX: blk1a and blk1b are averaged exactly in the
 * 16-bit domain as (a + b + round) >> 1, with the round constant
 * (round_tab[1]) expected in %mm5. The packed average is then compared
 * against blk2 with the two-way psubusb/por absolute-difference trick
 * and accumulated as word sums in %mm6. One row per iteration.
 * Preconditions (set by PIX_SAD wrappers): %mm5 = round_tab[1],
 * %mm6 == 0, %mm7 == 0. Result read via sum_mmx().
 */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16                     \n\t"
#else
        ".align 4                       \n\t"
#endif
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        /* widen both inputs to words and average with rounding */
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        /* |avg - blk2| per byte, then widen and accumulate */
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
    );
}
211
/*
 * Quarter-area (xy2) SAD, plain MMX: the four neighbors
 * blk1[x,y], blk1[x+1,y], blk1[x,y+1], blk1[x+1,y+1] are summed exactly
 * in the 16-bit domain and averaged as (sum + round) >> 2, with the
 * round constant (round_tab[2]) expected in %mm5. Unlike the mmx2
 * pavgb variant this is an exact average. The packed result is compared
 * against blk2 with the two-way psubusb/por trick and accumulated as
 * word sums in %mm6. One row per iteration (%1 is blk1, %2 blk1+stride).
 * Preconditions: %mm5 = round_tab[2], %mm6 == 0, %mm7 == 0.
 * Result read via sum_mmx().
 */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
#if !defined(VBOX) || !defined(__DARWIN__)
        ".balign 16                     \n\t"
#else
        ".align 4                       \n\t"
#endif
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "movq %%mm1, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm2, %%mm4             \n\t"
        /* add the x+1 columns of both rows */
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm3   \n\t"
        "movq %%mm2, %%mm1              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm0, %%mm2             \n\t"
        "paddw %%mm4, %%mm1             \n\t"
        "movq %%mm3, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm4, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm3    \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        /* (sum + 2) >> 2, pack, then |avg - blk2| and accumulate */
        "paddw %%mm5, %%mm2             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "psrlw $2, %%mm2                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm2          \n\t"
        "psubusb %%mm2, %%mm3           \n\t"
        "psubusb %%mm4, %%mm2           \n\t"
        "por %%mm3, %%mm2               \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}
265
/*
 * Extract the SAD total from the plain-MMX accumulator: horizontally
 * adds the four 16-bit partial sums in %mm6 by two shift-and-add steps,
 * moves the low dword out, and masks to 16 bits (the upper lanes hold
 * garbage carried along by the word adds).
 */
static inline int sum_mmx(void)
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}
281
/*
 * Extract the SAD total from the MMX2 accumulator: psadbw already left
 * the complete sum in the low dword of %mm6, so a single movd suffices.
 */
static inline int sum_mmx2(void)
{
    int ret;
    asm volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}
291
292
/*
 * Generates the public SAD entry points for suffix `suf` (mmx or mmx2):
 *   sad8_suf,  sad16_suf       - integer-pel SAD (8- and 16-wide blocks)
 *   sad*_x2_suf / _y2_suf      - horizontal / vertical half-pel SAD
 *   sad*_xy2_suf               - half-pel in both directions
 * Each wrapper clears %mm6 (the accumulator) and %mm7 (zero register for
 * the byte->word unpacks), loads the appropriate round_tab entry into
 * %mm5 where the inner routine needs it, runs the sad8_* helper(s)
 * (twice, 8 columns apart, for the 16-wide variants), and extracts the
 * total with sum_suf(). The first parameter is an opaque context and is
 * unused here. The sad8_* entry points require h == 8 (asserted).
 * NOTE(review): no comments inside the macro body - a comment line
 * without a trailing backslash would terminate the definition.
 */
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride, blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

/* Instantiate the full SAD function set for plain MMX and for MMX2. */
PIX_SAD(mmx)
PIX_SAD(mmx2)
398
399void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
400{
401 if (mm_flags & MM_MMX) {
402 c->pix_abs[0][0] = sad16_mmx;
403 c->pix_abs[0][1] = sad16_x2_mmx;
404 c->pix_abs[0][2] = sad16_y2_mmx;
405 c->pix_abs[0][3] = sad16_xy2_mmx;
406 c->pix_abs[1][0] = sad8_mmx;
407 c->pix_abs[1][1] = sad8_x2_mmx;
408 c->pix_abs[1][2] = sad8_y2_mmx;
409 c->pix_abs[1][3] = sad8_xy2_mmx;
410
411 c->sad[0]= sad16_mmx;
412 c->sad[1]= sad8_mmx;
413 }
414 if (mm_flags & MM_MMXEXT) {
415 c->pix_abs[0][0] = sad16_mmx2;
416 c->pix_abs[1][0] = sad8_mmx2;
417
418 c->sad[0]= sad16_mmx2;
419 c->sad[1]= sad8_mmx2;
420
421 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
422 c->pix_abs[0][1] = sad16_x2_mmx2;
423 c->pix_abs[0][2] = sad16_y2_mmx2;
424 c->pix_abs[0][3] = sad16_xy2_mmx2;
425 c->pix_abs[1][1] = sad8_x2_mmx2;
426 c->pix_abs[1][2] = sad8_y2_mmx2;
427 c->pix_abs[1][3] = sad8_xy2_mmx2;
428 }
429 }
430}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette