VirtualBox

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/dsputil.c@ 9441

Last change on this file since 9441 was 5776, checked in by vboxsync, 17 years ago

ffmpeg: exported to OSE

File size: 143.6 KB
Line 
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <[email protected]>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <[email protected]>
21 */
22
23/**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28#include "avcodec.h"
29#include "dsputil.h"
30#include "mpegvideo.h"
31#include "simple_idct.h"
32#include "faandct.h"
33#include "snow.h"
34
35/* snow.c */
36void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37
38uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table used by the sse*_c and pix_norm1_c routines in this file, which
   index it through (squareTbl + 256) with values in -255..255.  It is
   zero-initialised here; presumably it is filled at start-up by code outside
   this excerpt. */
uint32_t squareTbl[512] = { 0 };
40
/* Zigzag scan of an 8x8 block: entry k is the raster index (row * 8 + column)
   of the k-th coefficient in scan order.  The table is a permutation of
   0..63. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
51
/* Zigzag scan used with the 2-4-8 IDCT.  NOTE: unlike the specification,
   the two fields are interleaved in this table.  The table is a permutation
   of 0..63. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
64
65/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
66DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
67
/* Alternate ("horizontal") coefficient scan order for an 8x8 block, expressed
   as raster indices.  The table is a permutation of 0..63. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
78
/* Alternate ("vertical") coefficient scan order for an 8x8 block, expressed
   as raster indices.  The table is a permutation of 0..63. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
89
/* Fixed-point reciprocals: for every 0 <= a <= 65536 and 2 <= b <= 255,
   ((uint64_t)a * inverse[b]) >> 32 equals a / b.
   Entries 0 and 1 are outside that guarantee. */
const uint32_t inverse[256] = {
    0, 4294967295U, 2147483648U, 1431655766, 1073741824, 858993460, 715827883, 613566757,
    536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
    268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
    178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
    134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
    107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
    89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
    76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
    67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
    59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
    53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
    48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
    44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
    41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
    38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
    35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
    33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
    31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
    29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
    28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
    26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
    25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
    24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
    23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
    22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
    21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
    20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
    19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
    19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
    18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
    17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
    17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
};
125
/* Coefficient input permutation expected by simple_idct_mmx.
   The table is a permutation of 0x00..0x3F. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
137
/**
 * Sum of all samples of a 16x16 block.
 *
 * @param pix       first (top-left) sample of the block
 * @param line_size distance, in bytes, between the starts of consecutive rows
 * @return the sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
159
160static int pix_norm1_c(uint8_t * pix, int line_size)
161{
162 int s, i, j;
163 uint32_t *sq = squareTbl + 256;
164
165 s = 0;
166 for (i = 0; i < 16; i++) {
167 for (j = 0; j < 16; j += 8) {
168#if 0
169 s += sq[pix[0]];
170 s += sq[pix[1]];
171 s += sq[pix[2]];
172 s += sq[pix[3]];
173 s += sq[pix[4]];
174 s += sq[pix[5]];
175 s += sq[pix[6]];
176 s += sq[pix[7]];
177#else
178#if LONG_MAX > 2147483647
179 register uint64_t x=*(uint64_t*)pix;
180 s += sq[x&0xff];
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
188#else
189 register uint32_t x=*(uint32_t*)pix;
190 s += sq[x&0xff];
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 x=*(uint32_t*)(pix+4);
195 s += sq[x&0xff];
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199#endif
200#endif
201 pix += 8;
202 }
203 pix += line_size - 16;
204 }
205 return s;
206}
207
/**
 * Apply bswap_32() to each of the first w 32-bit words of src, storing the
 * results in dst.  Words are processed in increasing index order.
 *
 * @param dst destination array, at least w words
 * @param src source array, at least w words
 * @param w   number of 32-bit words to convert; nothing is done if w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
225
226static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227{
228 int s, i;
229 uint32_t *sq = squareTbl + 256;
230
231 s = 0;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241}
242
243static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244{
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
258 pix1 += line_size;
259 pix2 += line_size;
260 }
261 return s;
262}
263
264static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
265{
266 int s, i;
267 uint32_t *sq = squareTbl + 256;
268
269 s = 0;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
287
288 pix1 += line_size;
289 pix2 += line_size;
290 }
291 return s;
292}
293
294
295#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain comparison of two square blocks.
 *
 * The per-sample difference pix1 - pix2, scaled by 16, is written into a
 * 32-column scratch buffer, transformed in place by ff_spatial_dwt(), and the
 * absolute values of the resulting coefficients are summed, each weighted by
 * a per-(level, orientation) factor from the scale table.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride, in bytes, of both blocks
 * @param w         block width; the wrappers in this file pass 8, 16 or 32
 * @param h         block height; must equal w
 * @param type      scale-table selector passed on to ff_spatial_dwt():
 *                  0 uses the 9/7 weights, 1 the 5/3 weights
 * @return the weighted sum of absolute coefficients, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    /* 3 decomposition levels for 8x8 blocks, 4 for anything larger */
    const int dec_count = (w == 8) ? 3 : 4;
    int tmp[32*32];
    int s = 0;
    int i, j, level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* Check the geometry before tmp is written: the scratch buffer holds at
       most 32 rows of 32 columns, and the sub-band walk below assumes a
       square block. */
    assert(w==h);
    assert(w<=32);

    for (i = 0; i < h; i++) {
        /* multiply instead of "<<4": the difference may be negative, and
           left-shifting a negative value is undefined */
        for (j = 0; j < w; j++)
            tmp[32*i+j] = (pix1[j] - pix2[j]) * 16;
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    for(level=0; level<dec_count; level++){
        const int size   = w  >> (dec_count-level);
        const int stride = 32 << (dec_count-level);

        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            const int sx     = (ori&1) ? size : 0;
            const int sy     = (ori&2) ? stride>>1 : 0;
            const int weight = scale[type][dec_count-3][level][ori];

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    const int coef = tmp[sx + sy + i*stride + j] * weight;
                    s += ABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
364
/** w_c() for an 8-wide block with type 1 (the 5/3 scale table). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8;
    const int type  = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
368
/** w_c() for an 8-wide block with type 0 (the 9/7 scale table). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8;
    const int type  = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
372
/** w_c() for a 16-wide block with type 1 (the 5/3 scale table). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16;
    const int type  = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
376
/** w_c() for a 16-wide block with type 0 (the 9/7 scale table). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16;
    const int type  = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
380
/** w_c() for a 32-wide block with type 1 (the 5/3 scale table).
 *  Not static: visible outside this file. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 32;
    const int type  = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
384
/** w_c() for a 32-wide block with type 0 (the 9/7 scale table).
 *  Not static: visible outside this file. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 32;
    const int type  = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
388#endif
389
390static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
391{
392 int i;
393
394 /* read the pixels */
395 for(i=0;i<8;i++) {
396 block[0] = pixels[0];
397 block[1] = pixels[1];
398 block[2] = pixels[2];
399 block[3] = pixels[3];
400 block[4] = pixels[4];
401 block[5] = pixels[5];
402 block[6] = pixels[6];
403 block[7] = pixels[7];
404 pixels += line_size;
405 block += 8;
406 }
407}
408
409static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
410 const uint8_t *s2, int stride){
411 int i;
412
413 /* read the pixels */
414 for(i=0;i<8;i++) {
415 block[0] = s1[0] - s2[0];
416 block[1] = s1[1] - s2[1];
417 block[2] = s1[2] - s2[2];
418 block[3] = s1[3] - s2[3];
419 block[4] = s1[4] - s2[4];
420 block[5] = s1[5] - s2[5];
421 block[6] = s1[6] - s2[6];
422 block[7] = s1[7] - s2[7];
423 s1 += stride;
424 s2 += stride;
425 block += 8;
426 }
427}
428
429
430static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431 int line_size)
432{
433 int i;
434 uint8_t *cm = cropTbl + MAX_NEG_CROP;
435
436 /* read the pixels */
437 for(i=0;i<8;i++) {
438 pixels[0] = cm[block[0]];
439 pixels[1] = cm[block[1]];
440 pixels[2] = cm[block[2]];
441 pixels[3] = cm[block[3]];
442 pixels[4] = cm[block[4]];
443 pixels[5] = cm[block[5]];
444 pixels[6] = cm[block[6]];
445 pixels[7] = cm[block[7]];
446
447 pixels += line_size;
448 block += 8;
449 }
450}
451
452static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
453 int line_size)
454{
455 int i;
456 uint8_t *cm = cropTbl + MAX_NEG_CROP;
457
458 /* read the pixels */
459 for(i=0;i<4;i++) {
460 pixels[0] = cm[block[0]];
461 pixels[1] = cm[block[1]];
462 pixels[2] = cm[block[2]];
463 pixels[3] = cm[block[3]];
464
465 pixels += line_size;
466 block += 8;
467 }
468}
469
470static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
471 int line_size)
472{
473 int i;
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
475
476 /* read the pixels */
477 for(i=0;i<2;i++) {
478 pixels[0] = cm[block[0]];
479 pixels[1] = cm[block[1]];
480
481 pixels += line_size;
482 block += 8;
483 }
484}
485
486static void put_signed_pixels_clamped_c(const DCTELEM *block,
487 uint8_t *restrict pixels,
488 int line_size)
489{
490 int i, j;
491
492 for (i = 0; i < 8; i++) {
493 for (j = 0; j < 8; j++) {
494 if (*block < -128)
495 *pixels = 0;
496 else if (*block > 127)
497 *pixels = 255;
498 else
499 *pixels = (uint8_t)(*block + 128);
500 block++;
501 pixels++;
502 }
503 pixels += (line_size - 8);
504 }
505}
506
507static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
508 int line_size)
509{
510 int i;
511 uint8_t *cm = cropTbl + MAX_NEG_CROP;
512
513 /* read the pixels */
514 for(i=0;i<8;i++) {
515 pixels[0] = cm[pixels[0] + block[0]];
516 pixels[1] = cm[pixels[1] + block[1]];
517 pixels[2] = cm[pixels[2] + block[2]];
518 pixels[3] = cm[pixels[3] + block[3]];
519 pixels[4] = cm[pixels[4] + block[4]];
520 pixels[5] = cm[pixels[5] + block[5]];
521 pixels[6] = cm[pixels[6] + block[6]];
522 pixels[7] = cm[pixels[7] + block[7]];
523 pixels += line_size;
524 block += 8;
525 }
526}
527
528static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
529 int line_size)
530{
531 int i;
532 uint8_t *cm = cropTbl + MAX_NEG_CROP;
533
534 /* read the pixels */
535 for(i=0;i<4;i++) {
536 pixels[0] = cm[pixels[0] + block[0]];
537 pixels[1] = cm[pixels[1] + block[1]];
538 pixels[2] = cm[pixels[2] + block[2]];
539 pixels[3] = cm[pixels[3] + block[3]];
540 pixels += line_size;
541 block += 8;
542 }
543}
544
545static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
546 int line_size)
547{
548 int i;
549 uint8_t *cm = cropTbl + MAX_NEG_CROP;
550
551 /* read the pixels */
552 for(i=0;i<2;i++) {
553 pixels[0] = cm[pixels[0] + block[0]];
554 pixels[1] = cm[pixels[1] + block[1]];
555 pixels += line_size;
556 block += 8;
557 }
558}
559
560static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
561{
562 int i;
563 for(i=0;i<8;i++) {
564 pixels[0] += block[0];
565 pixels[1] += block[1];
566 pixels[2] += block[2];
567 pixels[3] += block[3];
568 pixels[4] += block[4];
569 pixels[5] += block[5];
570 pixels[6] += block[6];
571 pixels[7] += block[7];
572 pixels += line_size;
573 block += 8;
574 }
575}
576
577static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
578{
579 int i;
580 for(i=0;i<4;i++) {
581 pixels[0] += block[0];
582 pixels[1] += block[1];
583 pixels[2] += block[2];
584 pixels[3] += block[3];
585 pixels += line_size;
586 block += 4;
587 }
588}
589
590#if 0
591
/*
 * PIXOP2(OPNAME, OP) -- variant built on 64-bit words.
 *
 * This definition sits in the "#if 0" arm of the surrounding conditional and
 * is therefore never compiled.
 *
 * For a given OPNAME/OP pair it generates functions taking
 * (block, pixels, line_size, h) that handle 8 bytes per row: the source is
 * fetched with LD64() and the destination is updated with
 * OP(*(uint64_t*)block, value).
 *   - _pixels                 : plain copy/OP of each row
 *   - _(no_rnd_)pixels_x2_c   : per-byte average of pixels and pixels+1
 *   - _(no_rnd_)pixels_y2_c   : per-byte average of pixels and pixels+line_size
 *   - _(no_rnd_)pixels_xy2_c  : per-byte average of the four neighbours; the
 *                               loop handles two rows per iteration and reads
 *                               h+1 source rows.  The rounding variant adds 2
 *                               per byte before the shift, the no_rnd variant
 *                               adds 1.
 * (a&b) + (((a^b)&0xFE..FE)>>1) is the per-byte average rounded down,
 * (a|b) - (((a^b)&0xFE..FE)>>1) the per-byte average rounded up.
 *
 * The trailing CALL_2X_PIXELS lines presumably derive the 16-pixel-wide
 * versions from the 8-wide ones (that macro is defined outside this excerpt).
 *
 * NOTE(review): the first generated function is named OPNAME##_pixels, while
 * CALL_2X_PIXELS below refers to OPNAME##_pixels_c; unless that name is
 * provided elsewhere, this variant would not build if it were enabled.
 */
592#define PIXOP2(OPNAME, OP) \
593static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
594{\
595 int i;\
596 for(i=0; i<h; i++){\
597 OP(*((uint64_t*)block), LD64(pixels));\
598 pixels+=line_size;\
599 block +=line_size;\
600 }\
601}\
602\
603static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
604{\
605 int i;\
606 for(i=0; i<h; i++){\
607 const uint64_t a= LD64(pixels );\
608 const uint64_t b= LD64(pixels+1);\
609 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
610 pixels+=line_size;\
611 block +=line_size;\
612 }\
613}\
614\
615static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
616{\
617 int i;\
618 for(i=0; i<h; i++){\
619 const uint64_t a= LD64(pixels );\
620 const uint64_t b= LD64(pixels+1);\
621 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
622 pixels+=line_size;\
623 block +=line_size;\
624 }\
625}\
626\
627static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
628{\
629 int i;\
630 for(i=0; i<h; i++){\
631 const uint64_t a= LD64(pixels );\
632 const uint64_t b= LD64(pixels+line_size);\
633 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
634 pixels+=line_size;\
635 block +=line_size;\
636 }\
637}\
638\
639static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
640{\
641 int i;\
642 for(i=0; i<h; i++){\
643 const uint64_t a= LD64(pixels );\
644 const uint64_t b= LD64(pixels+line_size);\
645 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
646 pixels+=line_size;\
647 block +=line_size;\
648 }\
649}\
650\
651static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
652{\
653 int i;\
654 const uint64_t a= LD64(pixels );\
655 const uint64_t b= LD64(pixels+1);\
656 uint64_t l0= (a&0x0303030303030303ULL)\
657 + (b&0x0303030303030303ULL)\
658 + 0x0202020202020202ULL;\
659 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
660 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
661 uint64_t l1,h1;\
662\
663 pixels+=line_size;\
664 for(i=0; i<h; i+=2){\
665 uint64_t a= LD64(pixels );\
666 uint64_t b= LD64(pixels+1);\
667 l1= (a&0x0303030303030303ULL)\
668 + (b&0x0303030303030303ULL);\
669 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
670 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
671 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
672 pixels+=line_size;\
673 block +=line_size;\
674 a= LD64(pixels );\
675 b= LD64(pixels+1);\
676 l0= (a&0x0303030303030303ULL)\
677 + (b&0x0303030303030303ULL)\
678 + 0x0202020202020202ULL;\
679 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
680 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
681 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
682 pixels+=line_size;\
683 block +=line_size;\
684 }\
685}\
686\
687static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
688{\
689 int i;\
690 const uint64_t a= LD64(pixels );\
691 const uint64_t b= LD64(pixels+1);\
692 uint64_t l0= (a&0x0303030303030303ULL)\
693 + (b&0x0303030303030303ULL)\
694 + 0x0101010101010101ULL;\
695 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
696 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
697 uint64_t l1,h1;\
698\
699 pixels+=line_size;\
700 for(i=0; i<h; i+=2){\
701 uint64_t a= LD64(pixels );\
702 uint64_t b= LD64(pixels+1);\
703 l1= (a&0x0303030303030303ULL)\
704 + (b&0x0303030303030303ULL);\
705 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
706 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
707 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
708 pixels+=line_size;\
709 block +=line_size;\
710 a= LD64(pixels );\
711 b= LD64(pixels+1);\
712 l0= (a&0x0303030303030303ULL)\
713 + (b&0x0303030303030303ULL)\
714 + 0x0101010101010101ULL;\
715 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
716 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
717 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
718 pixels+=line_size;\
719 block +=line_size;\
720 }\
721}\
722\
723CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
724CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
725CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
726CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
727CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
728CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
729CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
730
/* op_avg(a, b): replace a with the per-byte, rounded-up average of a and b,
   computed on all eight bytes of a 64-bit word at once. */
731#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
732#else // 64 bit variant
733
734#define PIXOP2(OPNAME, OP) \
735static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
736 int i;\
737 for(i=0; i<h; i++){\
738 OP(*((uint16_t*)(block )), LD16(pixels ));\
739 pixels+=line_size;\
740 block +=line_size;\
741 }\
742}\
743static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
744 int i;\
745 for(i=0; i<h; i++){\
746 OP(*((uint32_t*)(block )), LD32(pixels ));\
747 pixels+=line_size;\
748 block +=line_size;\
749 }\
750}\
751static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
752 int i;\
753 for(i=0; i<h; i++){\
754 OP(*((uint32_t*)(block )), LD32(pixels ));\
755 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
756 pixels+=line_size;\
757 block +=line_size;\
758 }\
759}\
760static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
761 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
762}\
763\
764static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765 int src_stride1, int src_stride2, int h){\
766 int i;\
767 for(i=0; i<h; i++){\
768 uint32_t a,b;\
769 a= LD32(&src1[i*src_stride1 ]);\
770 b= LD32(&src2[i*src_stride2 ]);\
771 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
772 a= LD32(&src1[i*src_stride1+4]);\
773 b= LD32(&src2[i*src_stride2+4]);\
774 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
775 }\
776}\
777\
778static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
779 int src_stride1, int src_stride2, int h){\
780 int i;\
781 for(i=0; i<h; i++){\
782 uint32_t a,b;\
783 a= LD32(&src1[i*src_stride1 ]);\
784 b= LD32(&src2[i*src_stride2 ]);\
785 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
786 a= LD32(&src1[i*src_stride1+4]);\
787 b= LD32(&src2[i*src_stride2+4]);\
788 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
789 }\
790}\
791\
792static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793 int src_stride1, int src_stride2, int h){\
794 int i;\
795 for(i=0; i<h; i++){\
796 uint32_t a,b;\
797 a= LD32(&src1[i*src_stride1 ]);\
798 b= LD32(&src2[i*src_stride2 ]);\
799 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
800 }\
801}\
802\
803static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
804 int src_stride1, int src_stride2, int h){\
805 int i;\
806 for(i=0; i<h; i++){\
807 uint32_t a,b;\
808 a= LD16(&src1[i*src_stride1 ]);\
809 b= LD16(&src2[i*src_stride2 ]);\
810 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
811 }\
812}\
813\
814static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
815 int src_stride1, int src_stride2, int h){\
816 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
817 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
818}\
819\
820static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821 int src_stride1, int src_stride2, int h){\
822 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
823 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
824}\
825\
826static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
827 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
828}\
829\
830static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
831 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832}\
833\
834static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
835 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
836}\
837\
838static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
839 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840}\
841\
842static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
843 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
844 int i;\
845 for(i=0; i<h; i++){\
846 uint32_t a, b, c, d, l0, l1, h0, h1;\
847 a= LD32(&src1[i*src_stride1]);\
848 b= LD32(&src2[i*src_stride2]);\
849 c= LD32(&src3[i*src_stride3]);\
850 d= LD32(&src4[i*src_stride4]);\
851 l0= (a&0x03030303UL)\
852 + (b&0x03030303UL)\
853 + 0x02020202UL;\
854 h0= ((a&0xFCFCFCFCUL)>>2)\
855 + ((b&0xFCFCFCFCUL)>>2);\
856 l1= (c&0x03030303UL)\
857 + (d&0x03030303UL);\
858 h1= ((c&0xFCFCFCFCUL)>>2)\
859 + ((d&0xFCFCFCFCUL)>>2);\
860 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
861 a= LD32(&src1[i*src_stride1+4]);\
862 b= LD32(&src2[i*src_stride2+4]);\
863 c= LD32(&src3[i*src_stride3+4]);\
864 d= LD32(&src4[i*src_stride4+4]);\
865 l0= (a&0x03030303UL)\
866 + (b&0x03030303UL)\
867 + 0x02020202UL;\
868 h0= ((a&0xFCFCFCFCUL)>>2)\
869 + ((b&0xFCFCFCFCUL)>>2);\
870 l1= (c&0x03030303UL)\
871 + (d&0x03030303UL);\
872 h1= ((c&0xFCFCFCFCUL)>>2)\
873 + ((d&0xFCFCFCFCUL)>>2);\
874 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
875 }\
876}\
877\
878static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
879 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
880}\
881\
882static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
884}\
885\
886static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
888}\
889\
890static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
892}\
893\
894static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
895 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
896 int i;\
897 for(i=0; i<h; i++){\
898 uint32_t a, b, c, d, l0, l1, h0, h1;\
899 a= LD32(&src1[i*src_stride1]);\
900 b= LD32(&src2[i*src_stride2]);\
901 c= LD32(&src3[i*src_stride3]);\
902 d= LD32(&src4[i*src_stride4]);\
903 l0= (a&0x03030303UL)\
904 + (b&0x03030303UL)\
905 + 0x01010101UL;\
906 h0= ((a&0xFCFCFCFCUL)>>2)\
907 + ((b&0xFCFCFCFCUL)>>2);\
908 l1= (c&0x03030303UL)\
909 + (d&0x03030303UL);\
910 h1= ((c&0xFCFCFCFCUL)>>2)\
911 + ((d&0xFCFCFCFCUL)>>2);\
912 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
913 a= LD32(&src1[i*src_stride1+4]);\
914 b= LD32(&src2[i*src_stride2+4]);\
915 c= LD32(&src3[i*src_stride3+4]);\
916 d= LD32(&src4[i*src_stride4+4]);\
917 l0= (a&0x03030303UL)\
918 + (b&0x03030303UL)\
919 + 0x01010101UL;\
920 h0= ((a&0xFCFCFCFCUL)>>2)\
921 + ((b&0xFCFCFCFCUL)>>2);\
922 l1= (c&0x03030303UL)\
923 + (d&0x03030303UL);\
924 h1= ((c&0xFCFCFCFCUL)>>2)\
925 + ((d&0xFCFCFCFCUL)>>2);\
926 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
927 }\
928}\
929static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
930 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
931 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
932 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
933}\
934static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938}\
939\
940static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
941{\
942 int i, a0, b0, a1, b1;\
943 a0= pixels[0];\
944 b0= pixels[1] + 2;\
945 a0 += b0;\
946 b0 += pixels[2];\
947\
948 pixels+=line_size;\
949 for(i=0; i<h; i+=2){\
950 a1= pixels[0];\
951 b1= pixels[1];\
952 a1 += b1;\
953 b1 += pixels[2];\
954\
955 block[0]= (a1+a0)>>2; /* FIXME non put */\
956 block[1]= (b1+b0)>>2;\
957\
958 pixels+=line_size;\
959 block +=line_size;\
960\
961 a0= pixels[0];\
962 b0= pixels[1] + 2;\
963 a0 += b0;\
964 b0 += pixels[2];\
965\
966 block[0]= (a1+a0)>>2;\
967 block[1]= (b1+b0)>>2;\
968 pixels+=line_size;\
969 block +=line_size;\
970 }\
971}\
972\
973static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
974{\
975 int i;\
976 const uint32_t a= LD32(pixels );\
977 const uint32_t b= LD32(pixels+1);\
978 uint32_t l0= (a&0x03030303UL)\
979 + (b&0x03030303UL)\
980 + 0x02020202UL;\
981 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
982 + ((b&0xFCFCFCFCUL)>>2);\
983 uint32_t l1,h1;\
984\
985 pixels+=line_size;\
986 for(i=0; i<h; i+=2){\
987 uint32_t a= LD32(pixels );\
988 uint32_t b= LD32(pixels+1);\
989 l1= (a&0x03030303UL)\
990 + (b&0x03030303UL);\
991 h1= ((a&0xFCFCFCFCUL)>>2)\
992 + ((b&0xFCFCFCFCUL)>>2);\
993 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
994 pixels+=line_size;\
995 block +=line_size;\
996 a= LD32(pixels );\
997 b= LD32(pixels+1);\
998 l0= (a&0x03030303UL)\
999 + (b&0x03030303UL)\
1000 + 0x02020202UL;\
1001 h0= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1003 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004 pixels+=line_size;\
1005 block +=line_size;\
1006 }\
1007}\
1008\
1009static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010{\
1011 int j;\
1012 for(j=0; j<2; j++){\
1013 int i;\
1014 const uint32_t a= LD32(pixels );\
1015 const uint32_t b= LD32(pixels+1);\
1016 uint32_t l0= (a&0x03030303UL)\
1017 + (b&0x03030303UL)\
1018 + 0x02020202UL;\
1019 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020 + ((b&0xFCFCFCFCUL)>>2);\
1021 uint32_t l1,h1;\
1022\
1023 pixels+=line_size;\
1024 for(i=0; i<h; i+=2){\
1025 uint32_t a= LD32(pixels );\
1026 uint32_t b= LD32(pixels+1);\
1027 l1= (a&0x03030303UL)\
1028 + (b&0x03030303UL);\
1029 h1= ((a&0xFCFCFCFCUL)>>2)\
1030 + ((b&0xFCFCFCFCUL)>>2);\
1031 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032 pixels+=line_size;\
1033 block +=line_size;\
1034 a= LD32(pixels );\
1035 b= LD32(pixels+1);\
1036 l0= (a&0x03030303UL)\
1037 + (b&0x03030303UL)\
1038 + 0x02020202UL;\
1039 h0= ((a&0xFCFCFCFCUL)>>2)\
1040 + ((b&0xFCFCFCFCUL)>>2);\
1041 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042 pixels+=line_size;\
1043 block +=line_size;\
1044 }\
1045 pixels+=4-line_size*(h+1);\
1046 block +=4-line_size*h;\
1047 }\
1048}\
1049\
1050static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1051{\
1052 int j;\
1053 for(j=0; j<2; j++){\
1054 int i;\
1055 const uint32_t a= LD32(pixels );\
1056 const uint32_t b= LD32(pixels+1);\
1057 uint32_t l0= (a&0x03030303UL)\
1058 + (b&0x03030303UL)\
1059 + 0x01010101UL;\
1060 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061 + ((b&0xFCFCFCFCUL)>>2);\
1062 uint32_t l1,h1;\
1063\
1064 pixels+=line_size;\
1065 for(i=0; i<h; i+=2){\
1066 uint32_t a= LD32(pixels );\
1067 uint32_t b= LD32(pixels+1);\
1068 l1= (a&0x03030303UL)\
1069 + (b&0x03030303UL);\
1070 h1= ((a&0xFCFCFCFCUL)>>2)\
1071 + ((b&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073 pixels+=line_size;\
1074 block +=line_size;\
1075 a= LD32(pixels );\
1076 b= LD32(pixels+1);\
1077 l0= (a&0x03030303UL)\
1078 + (b&0x03030303UL)\
1079 + 0x01010101UL;\
1080 h0= ((a&0xFCFCFCFCUL)>>2)\
1081 + ((b&0xFCFCFCFCUL)>>2);\
1082 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083 pixels+=line_size;\
1084 block +=line_size;\
1085 }\
1086 pixels+=4-line_size*(h+1);\
1087 block +=4-line_size*h;\
1088 }\
1089}\
1090\
1091CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1092CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1096CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1099
1100#define op_avg(a, b) a = rnd_avg32(a, b)
1101#endif
/* OP used for the "put" variants: a plain store of b into a. */
1102 #define op_put(a, b) a = b
1103
/* Expand the PIXOP2 template once per operation; OPNAME becomes the function
 * name prefix, so this yields e.g. avg_pixels8_xy2_c and put_pixels8_xy2_c. */
1104 PIXOP2(avg, op_avg)
1105 PIXOP2(put, op_put)
/* The OP helpers are dropped again so later code can redefine them. */
1106 #undef op_avg
1107 #undef op_put
1108
/**
 * Rounded averages of two and four integer values.
 * Every argument is parenthesized so that expressions containing operators
 * of lower precedence than '+' (e.g. '|', '&', '?:') are averaged as a whole.
 * Arguments are evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1111
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 * The same stride value is forwarded for all three stride arguments of the
 * underlying routine; dst, a, b and h are passed through unchanged.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1115
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 * The same stride value is forwarded for all three stride arguments of the
 * underlying routine; dst, a, b and h are passed through unchanged.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1119
/**
 * Bilinear interpolation of an 8-pixel-wide block.
 *
 * Each output pixel blends the source pixel, its right neighbour and the two
 * pixels one stride below, using weights derived from x16 and y16.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source, 9 bytes read from each of h+1 rows
 * @param stride  row pitch shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand samples (out of 16)
 * @param y16     vertical weight of the lower samples (out of 16)
 * @param rounder value added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* The four weights always add up to 16*16 = 256, hence the >>8 below. */
    const int w_top_left     = (16 - x16) * (16 - y16);
    const int w_top_right    = x16        * (16 - y16);
    const int w_bottom_left  = (16 - x16) * y16;
    const int w_bottom_right = x16        * y16;
    int row, col;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left     * src[col]
                      + w_top_right    * src[col + 1]
                      + w_bottom_left  * src[stride + col]
                      + w_bottom_right * src[stride + col + 1]
                      + rounder) >> 8;
        }
    }
}
1142
/**
 * Warped motion compensation of an 8-pixel-wide block.
 *
 * For every output pixel a source position is tracked in (vx, vy).  The top
 * 16 bits are dropped, the low `shift` bits of the remainder form the
 * sub-pixel fraction and the rest is the integer sample position.  Positions
 * whose 2x2 neighbourhood lies inside the picture are interpolated
 * bilinearly; otherwise the coordinate that is out of range is clamped with
 * clip() and interpolation is done only along the in-range axis (or not at
 * all when both are out of range).
 *
 * @param dst            destination, written at dst[y*stride + x], x in 0..7
 * @param src            source picture
 * @param stride         row pitch shared by dst and src
 * @param h              number of output rows
 * @param ox, oy         source position of the first pixel of the first row
 * @param dxx, dyx       position increment per output column
 * @param dxy, dyy       position increment per output row
 * @param shift          number of sub-pixel fraction bits
 * @param r              rounding constant added before the final shift
 * @param width, height  source dimensions used for the bounds checks
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    /* A 2x2 neighbourhood needs one more sample to the right and below, so
     * the last usable top-left position is one short of the dimension. */
    const int last_col  = width  - 1;
    const int last_row  = height - 1;
    int y;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + y * stride;
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* The unsigned compare rejects negative positions as well. */
            const int x_inside = (unsigned)src_x < (unsigned)last_col;
            const int y_inside = (unsigned)src_y < (unsigned)last_row;
            int index;

            if (x_inside && y_inside) {
                index  = src_x + src_y * stride;
                out[x] = ( ( src[index           ] * (s - frac_x)
                           + src[index          +1] *      frac_x ) * (s - frac_y)
                         + ( src[index + stride   ] * (s - frac_x)
                           + src[index + stride +1] *      frac_x ) *      frac_y
                         + r) >> (shift * 2);
            } else if (x_inside) {
                /* Row out of range: clamp it, interpolate horizontally only. */
                index  = src_x + clip(src_y, 0, last_row) * stride;
                out[x] = ( ( src[index    ] * (s - frac_x)
                           + src[index + 1] *      frac_x ) * s
                         + r) >> (shift * 2);
            } else if (y_inside) {
                /* Column out of range: clamp it, interpolate vertically only. */
                index  = clip(src_x, 0, last_col) + src_y * stride;
                out[x] = ( ( src[index         ] * (s - frac_y)
                           + src[index + stride] *      frac_y ) * s
                         + r) >> (shift * 2);
            } else {
                /* Both out of range: copy the nearest clamped sample. */
                index  = clip(src_x, 0, last_col) + clip(src_y, 0, last_row) * stride;
                out[x] = src[index];
            }
        }
    }
}
1200
/**
 * Full-pel "put" case of the third-pel functions: dispatches to the plain
 * copy routine matching the block width.  Widths other than 2, 4, 8 and 16
 * are ignored and nothing is written.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1209
/**
 * Third-pel interpolation between horizontal neighbours with weights 2:1
 * (current sample : right neighbour).  The division by 3 is approximated by
 * multiplying with 683 and shifting right by 11 (683/2048 ~= 1/3).
 * Reads width+1 samples from each of the height rows.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683 * weighted) >> 11;
        }
    }
}
1220
/**
 * Third-pel interpolation between horizontal neighbours with weights 1:2
 * (current sample : right neighbour).  The division by 3 is approximated by
 * multiplying with 683 and shifting right by 11 (683/2048 ~= 1/3).
 * Reads width+1 samples from each of the height rows.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683 * weighted) >> 11;
        }
    }
}
1231
/**
 * Third-pel interpolation between vertical neighbours with weights 2:1
 * (current sample : sample one stride below).  The division by 3 is
 * approximated by multiplying with 683 and shifting right by 11.
 * Reads width samples from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683 * weighted) >> 11;
        }
    }
}
1242
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 * 4 (current), 3 (right), 3 (below), 2 (below-right).  The weights sum to 12;
 * the division by 12 is approximated by multiplying with 2731 and shifting
 * right by 15 (2731/32768 ~= 1/12), after adding 6 for rounding.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731 * weighted) >> 15;
        }
    }
}
1253
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 * 3 (current), 2 (right), 4 (below), 3 (below-right).  The weights sum to 12;
 * the division by 12 is approximated by multiplying with 2731 and shifting
 * right by 15, after adding 6 for rounding.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731 * weighted) >> 15;
        }
    }
}
1264
/**
 * Third-pel interpolation between vertical neighbours with weights 1:2
 * (current sample : sample one stride below).  The division by 3 is
 * approximated by multiplying with 683 and shifting right by 11.
 * Reads width samples from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683 * weighted) >> 11;
        }
    }
}
1275
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 * 3 (current), 4 (right), 2 (below), 3 (below-right).  The weights sum to 12;
 * the division by 12 is approximated by multiplying with 2731 and shifting
 * right by 15, after adding 6 for rounding.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731 * weighted) >> 15;
        }
    }
}
1286
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 * 2 (current), 3 (right), 3 (below), 4 (below-right).  The weights sum to 12;
 * the division by 12 is approximated by multiplying with 2731 and shifting
 * right by 15, after adding 6 for rounding.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731 * weighted) >> 15;
        }
    }
}
1297
/**
 * Full-pel "avg" case of the third-pel functions: dispatches to the
 * averaging routine matching the block width.  Widths other than 2, 4, 8
 * and 16 are ignored and dst is left untouched.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1306
/**
 * Averaging variant of the 2:1 horizontal third-pel filter: the interpolated
 * value (683*(2*cur + right + 1)) >> 11 is averaged, rounding up, with the
 * value already present in dst.
 * Reads width+1 samples from each of the height rows.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1317
/**
 * Averaging variant of the 1:2 horizontal third-pel filter: the interpolated
 * value (683*(cur + 2*right + 1)) >> 11 is averaged, rounding up, with the
 * value already present in dst.
 * Reads width+1 samples from each of the height rows.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1328
/**
 * Averaging variant of the 2:1 vertical third-pel filter: the interpolated
 * value (683*(2*cur + below + 1)) >> 11 is averaged, rounding up, with the
 * value already present in dst.
 * Reads width samples from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1339
/**
 * Averaging variant of the 2x2 third-pel filter with weights
 * 4 (current), 3 (right), 3 (below), 2 (below-right): the interpolated value
 * (2731*(weighted sum + 6)) >> 15 is averaged, rounding up, with the value
 * already present in dst.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1350
/**
 * Averaging variant of the 2x2 third-pel filter with weights
 * 3 (current), 2 (right), 4 (below), 3 (below-right): the interpolated value
 * (2731*(weighted sum + 6)) >> 15 is averaged, rounding up, with the value
 * already present in dst.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1361
/**
 * Averaging variant of the 1:2 vertical third-pel filter: the interpolated
 * value (683*(cur + 2*below + 1)) >> 11 is averaged, rounding up, with the
 * value already present in dst.
 * Reads width samples from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1372
/**
 * Averaging variant of the 2x2 third-pel filter with weights
 * 3 (current), 4 (right), 2 (below), 3 (below-right): the interpolated value
 * (2731*(weighted sum + 6)) >> 15 is averaged, rounding up, with the value
 * already present in dst.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1383
/**
 * Averaging variant of the 2x2 third-pel filter with weights
 * 2 (current), 3 (right), 3 (below), 4 (below-right): the interpolated value
 * (2731*(weighted sum + 6)) >> 15 is averaged, rounding up, with the value
 * already present in dst.
 * Reads width+1 samples from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731 * (2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
/*
 * Disabled template: TPEL_WIDTH(width) would define fixed-width "put"
 * wrappers (mc00 .. mc22) that forward to the generic put_tpel_pixels_mcXY_c
 * functions with the width baked in.
 * NOTE(review): every wrapper body starts with "void", which turns the
 * intended call into an invalid declaration; that keyword would have to be
 * removed before this block could be enabled.
 */
1394 #if 0
1395 #define TPEL_WIDTH(width)\
1396 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1397 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1398 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1399 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1400 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1401 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1402 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1403 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1404 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1405 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1406 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1407 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1408 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1409 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1410 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1411 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1412 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1413 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1414 #endif
1415
/**
 * Template for the bilinear chroma motion-compensation functions
 * OPNAME h264_chroma_mc{2,4,8}_c, which process blocks 2, 4 and 8 pixels
 * wide respectively.
 *
 * x and y (each asserted to be in 0..7) select the blend between a sample,
 * its right neighbour and the two samples one stride below.  The four
 * weights always sum to 64; OP receives the un-normalized weighted sum and
 * is responsible for rounding, scaling and storing it.
 * Each function reads width+1 samples from each of h+1 source rows.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00 = (8-x)*(8-y);\
    const int w10 = (  x)*(8-y);\
    const int w01 = (8-x)*(  y);\
    const int w11 = (  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w00*src[col] + w10*src[col+1] + w01*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00 = (8-x)*(8-y);\
    const int w10 = (  x)*(8-y);\
    const int w01 = (8-x)*(  y);\
    const int w11 = (  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w00*src[col] + w10*src[col+1] + w01*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00 = (8-x)*(8-y);\
    const int w10 = (  x)*(8-y);\
    const int w01 = (8-x)*(  y);\
    const int w11 = (  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w00*src[col] + w10*src[col+1] + w01*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}

/* Both OPs normalize the weighted sum with (b + 32) >> 6, i.e. a rounded
 * division by 64; op_avg additionally averages, rounding up, with the value
 * already stored in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1486
/**
 * Copies a block 2 bytes wide and h rows high, one 16-bit load/store per
 * row.  dst and src advance by their own strides after each row.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST16(dst, LD16(src));
    }
}
1497
/**
 * Copies a block 4 bytes wide and h rows high, one 32-bit load/store per
 * row.  dst and src advance by their own strides after each row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1508
/**
 * Copies a block 8 bytes wide and h rows high as two 32-bit words per row.
 * dst and src advance by their own strides after each row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1520
/**
 * Copies a block 16 bytes wide and h rows high as four 32-bit words per row.
 * dst and src advance by their own strides after each row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1534
/**
 * Copies a block 17 bytes wide and h rows high: four 32-bit words followed
 * by a single trailing byte per row.  dst and src advance by their own
 * strides after each row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
    }
}
1549
/**
 * Copies a block 9 bytes wide and h rows high: two 32-bit words followed by
 * a single trailing byte per row.  dst and src advance by their own strides
 * after each row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
    }
}
1562
1563
1564#define QPEL_MC(r, OPNAME, RND, OP) \
1565static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1566 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1567 int i;\
1568 for(i=0; i<h; i++)\
1569 {\
1570 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1578 dst+=dstStride;\
1579 src+=srcStride;\
1580 }\
1581}\
1582\
1583static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1584 const int w=8;\
1585 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1586 int i;\
1587 for(i=0; i<w; i++)\
1588 {\
1589 const int src0= src[0*srcStride];\
1590 const int src1= src[1*srcStride];\
1591 const int src2= src[2*srcStride];\
1592 const int src3= src[3*srcStride];\
1593 const int src4= src[4*srcStride];\
1594 const int src5= src[5*srcStride];\
1595 const int src6= src[6*srcStride];\
1596 const int src7= src[7*srcStride];\
1597 const int src8= src[8*srcStride];\
1598 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1599 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1600 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1601 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1602 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1603 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1604 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1605 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1606 dst++;\
1607 src++;\
1608 }\
1609}\
1610\
1611static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1612 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1613 int i;\
1614 \
1615 for(i=0; i<h; i++)\
1616 {\
1617 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1633 dst+=dstStride;\
1634 src+=srcStride;\
1635 }\
1636}\
1637\
1638static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1639 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1640 int i;\
1641 const int w=16;\
1642 for(i=0; i<w; i++)\
1643 {\
1644 const int src0= src[0*srcStride];\
1645 const int src1= src[1*srcStride];\
1646 const int src2= src[2*srcStride];\
1647 const int src3= src[3*srcStride];\
1648 const int src4= src[4*srcStride];\
1649 const int src5= src[5*srcStride];\
1650 const int src6= src[6*srcStride];\
1651 const int src7= src[7*srcStride];\
1652 const int src8= src[8*srcStride];\
1653 const int src9= src[9*srcStride];\
1654 const int src10= src[10*srcStride];\
1655 const int src11= src[11*srcStride];\
1656 const int src12= src[12*srcStride];\
1657 const int src13= src[13*srcStride];\
1658 const int src14= src[14*srcStride];\
1659 const int src15= src[15*srcStride];\
1660 const int src16= src[16*srcStride];\
1661 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1662 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1663 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1664 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1665 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1666 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1667 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1668 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1669 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1670 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1671 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1672 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1673 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1674 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1675 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1676 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1677 dst++;\
1678 src++;\
1679 }\
1680}\
1681\
1682static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1683 OPNAME ## pixels8_c(dst, src, stride, 8);\
1684}\
1685\
1686static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1687 uint8_t half[64];\
1688 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1690}\
1691\
1692static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1693 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1694}\
1695\
1696static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1697 uint8_t half[64];\
1698 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1700}\
1701\
1702static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
1704 uint8_t half[64];\
1705 copy_block9(full, src, 16, stride, 9);\
1706 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1707 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1708}\
1709\
1710static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 copy_block9(full, src, 16, stride, 9);\
1713 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1714}\
1715\
1716static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1718 uint8_t half[64];\
1719 copy_block9(full, src, 16, stride, 9);\
1720 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1721 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1722}\
1723void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724 uint8_t full[16*9];\
1725 uint8_t halfH[72];\
1726 uint8_t halfV[64];\
1727 uint8_t halfHV[64];\
1728 copy_block9(full, src, 16, stride, 9);\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733}\
1734static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1735 uint8_t full[16*9];\
1736 uint8_t halfH[72];\
1737 uint8_t halfHV[64];\
1738 copy_block9(full, src, 16, stride, 9);\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1743}\
1744void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t full[16*9];\
1746 uint8_t halfH[72];\
1747 uint8_t halfV[64];\
1748 uint8_t halfHV[64];\
1749 copy_block9(full, src, 16, stride, 9);\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754}\
1755static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1757 uint8_t halfH[72];\
1758 uint8_t halfHV[64];\
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1764}\
1765void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1766 uint8_t full[16*9];\
1767 uint8_t halfH[72];\
1768 uint8_t halfV[64];\
1769 uint8_t halfHV[64];\
1770 copy_block9(full, src, 16, stride, 9);\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1775}\
1776static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[16*9];\
1778 uint8_t halfH[72];\
1779 uint8_t halfHV[64];\
1780 copy_block9(full, src, 16, stride, 9);\
1781 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1785}\
1786void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1787 uint8_t full[16*9];\
1788 uint8_t halfH[72];\
1789 uint8_t halfV[64];\
1790 uint8_t halfHV[64];\
1791 copy_block9(full, src, 16, stride, 9);\
1792 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1793 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1795 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1796}\
1797static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[16*9];\
1799 uint8_t halfH[72];\
1800 uint8_t halfHV[64];\
1801 copy_block9(full, src, 16, stride, 9);\
1802 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1806}\
1807static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1808 uint8_t halfH[72];\
1809 uint8_t halfHV[64];\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1811 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1813}\
1814static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1815 uint8_t halfH[72];\
1816 uint8_t halfHV[64];\
1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1818 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1820}\
1821void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822 uint8_t full[16*9];\
1823 uint8_t halfH[72];\
1824 uint8_t halfV[64];\
1825 uint8_t halfHV[64];\
1826 copy_block9(full, src, 16, stride, 9);\
1827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1831}\
1832static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1834 uint8_t halfH[72];\
1835 copy_block9(full, src, 16, stride, 9);\
1836 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1839}\
1840void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1841 uint8_t full[16*9];\
1842 uint8_t halfH[72];\
1843 uint8_t halfV[64];\
1844 uint8_t halfHV[64];\
1845 copy_block9(full, src, 16, stride, 9);\
1846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1849 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1850}\
1851static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 uint8_t halfH[72];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1858}\
1859static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t halfH[72];\
1861 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1862 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863}\
1864static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1865 OPNAME ## pixels16_c(dst, src, stride, 16);\
1866}\
1867\
1868static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t half[256];\
1870 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1872}\
1873\
1874static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1875 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1876}\
1877\
1878static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t half[256];\
1880 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1882}\
1883\
1884static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
1886 uint8_t half[256];\
1887 copy_block17(full, src, 24, stride, 17);\
1888 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1889 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1890}\
1891\
1892static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 copy_block17(full, src, 24, stride, 17);\
1895 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1896}\
1897\
1898static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t half[256];\
1901 copy_block17(full, src, 24, stride, 17);\
1902 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1903 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1904}\
1905void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906 uint8_t full[24*17];\
1907 uint8_t halfH[272];\
1908 uint8_t halfV[256];\
1909 uint8_t halfHV[256];\
1910 copy_block17(full, src, 24, stride, 17);\
1911 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915}\
1916static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1918 uint8_t halfH[272];\
1919 uint8_t halfHV[256];\
1920 copy_block17(full, src, 24, stride, 17);\
1921 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1925}\
1926void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927 uint8_t full[24*17];\
1928 uint8_t halfH[272];\
1929 uint8_t halfV[256];\
1930 uint8_t halfHV[256];\
1931 copy_block17(full, src, 24, stride, 17);\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936}\
1937static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfHV[256];\
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1946}\
1947void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t full[24*17];\
1949 uint8_t halfH[272];\
1950 uint8_t halfV[256];\
1951 uint8_t halfHV[256];\
1952 copy_block17(full, src, 24, stride, 17);\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1957}\
1958static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfHV[256];\
1962 copy_block17(full, src, 24, stride, 17);\
1963 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1967}\
1968void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1969 uint8_t full[24*17];\
1970 uint8_t halfH[272];\
1971 uint8_t halfV[256];\
1972 uint8_t halfHV[256];\
1973 copy_block17(full, src, 24, stride, 17);\
1974 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1975 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1978}\
1979static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1980 uint8_t full[24*17];\
1981 uint8_t halfH[272];\
1982 uint8_t halfHV[256];\
1983 copy_block17(full, src, 24, stride, 17);\
1984 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1988}\
1989static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1990 uint8_t halfH[272];\
1991 uint8_t halfHV[256];\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1995}\
1996static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t halfH[272];\
1998 uint8_t halfHV[256];\
1999 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2000 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2002}\
2003void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2004 uint8_t full[24*17];\
2005 uint8_t halfH[272];\
2006 uint8_t halfV[256];\
2007 uint8_t halfHV[256];\
2008 copy_block17(full, src, 24, stride, 17);\
2009 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2010 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2013}\
2014static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2015 uint8_t full[24*17];\
2016 uint8_t halfH[272];\
2017 copy_block17(full, src, 24, stride, 17);\
2018 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2021}\
2022void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t full[24*17];\
2024 uint8_t halfH[272];\
2025 uint8_t halfV[256];\
2026 uint8_t halfHV[256];\
2027 copy_block17(full, src, 24, stride, 17);\
2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2032}\
2033static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 uint8_t halfH[272];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2040}\
2041static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t halfH[272];\
2043 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2044 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045}
2046
2047#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2048#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2049#define op_put(a, b) a = cm[((b) + 16)>>5]
2050#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2051
2052QPEL_MC(0, put_ , _ , op_put)
2053QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2054QPEL_MC(0, avg_ , _ , op_avg)
2055//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2056#undef op_avg
2057#undef op_avg_no_rnd
2058#undef op_put
2059#undef op_put_no_rnd
2060
2061#if 1
/*
 * Helper for H264_LOWPASS: emits the horizontal, vertical and combined (hv)
 * 6-tap low-pass filters for one SIZE x SIZE block.  The taps are
 * (1, -5, 20, 20, -5, 1), applied to samples at offsets -2 .. +3 around the
 * output position.
 *
 *   OPNAME  prefix of the generated function names
 *   OP      store operator used after a single filter pass
 *   OP2     store operator used after both passes of the hv filter
 *   SIZE    block width and height (instantiated with 2, 4 and 8)
 *
 * Both operators rely on a local named `cm`, set to cropTbl + MAX_NEG_CROP.
 */
#define H264_LOWPASS_SIZE(OPNAME, OP, OP2, SIZE) \
/* Filters SIZE rows; every output reads src[x-2] .. src[x+3]. */\
static void OPNAME ## h264_qpel ## SIZE ## _h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<SIZE; y++){\
        for(x=0; x<SIZE; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* Filters SIZE columns; each column reads rows -2 .. SIZE+2 of src. */\
static void OPNAME ## h264_qpel ## SIZE ## _v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col[SIZE+5];\
    int x, y, k;\
    for(x=0; x<SIZE; x++){\
        /* fetch the whole column before storing anything; col[k] is row k-2 */\
        for(k=0; k<SIZE+5; k++){\
            col[k]= src[(k-2)*srcStride];\
        }\
        for(y=0; y<SIZE; y++){\
            OP(dst[y*dstStride], (col[y+2]+col[y+3])*20 - (col[y+1]+col[y+4])*5 + (col[y]+col[y+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
/* Horizontal pass into the int16_t scratch `tmp` (SIZE+5 rows of tmpStride,\
   starting two rows above src), then vertical pass over tmp into dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col[SIZE+5];\
    int x, y, k;\
    src -= 2*srcStride;\
    for(y=0; y<SIZE+5; y++){\
        for(x=0; x<SIZE; x++){\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind so that tmp addresses the row matching output row 0 */\
    tmp -= tmpStride*(SIZE+5-2);\
    for(x=0; x<SIZE; x++){\
        for(k=0; k<SIZE+5; k++){\
            col[k]= tmp[(k-2)*tmpStride];\
        }\
        for(y=0; y<SIZE; y++){\
            OP2(dst[y*dstStride], (col[y+2]+col[y+3])*20 - (col[y+1]+col[y+4])*5 + (col[y]+col[y+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\

/*
 * Emits the complete set of H.264 low-pass filters for one store flavour:
 * 2x2, 4x4 and 8x8 through H264_LOWPASS_SIZE, plus 16x16 versions assembled
 * from four 8x8 calls (upper-left, upper-right, lower-left, lower-right).
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 2)\
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 4)\
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 8)\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t * const dst_lower= dst + 8*dstStride;\
    uint8_t * const src_lower= src + 8*srcStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst        , src        , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst      +8, src      +8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_lower  , src_lower  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_lower+8, src_lower+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t * const dst_lower= dst + 8*dstStride;\
    uint8_t * const src_lower= src + 8*srcStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst        , src        , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst      +8, src      +8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_lower  , src_lower  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_lower+8, src_lower+8, dstStride, srcStride);\
}\
\
/* The same tmp area is reused for the upper and the lower half. */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t * const dst_lower= dst + 8*dstStride;\
    uint8_t * const src_lower= src + 8*srcStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst        , tmp  , src        , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst      +8, tmp+8, src      +8, dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_lower  , tmp  , src_lower  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_lower+8, tmp+8, src_lower+8, dstStride, tmpStride, srcStride);\
}

/*
 * Emits the sixteen OPNAME h264_qpel SIZE _mcXY_c entry points for one block
 * size.  X relates to the horizontal and Y to the vertical filtering: 0 uses
 * the pixels as they are, 2 uses the low-pass output directly, and 1/3
 * average two neighbouring candidates with the pixelsSIZE_l2 helper.
 *
 * Buffers used below:
 *   padded   SIZE+5 source rows copied with a pitch of SIZE, starting two
 *            rows above src; `center` points at the row matching src
 *   scratch  int16_t work area required by the hv filter
 * All intermediate filter outputs are SIZE x SIZE with a pitch of SIZE and
 * are always produced with the put_ flavour; OPNAME only decides how the
 * final result reaches dst.
 */
#define H264_MC(OPNAME, SIZE) \
/* no filtering at all */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* source pixel averaged with the horizontal filter output */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t horiz[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, horiz, stride, stride, SIZE, SIZE);\
}\
\
/* horizontal filter output written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* like mc10, but averaging with the pixel one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t horiz[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, horiz, stride, stride, SIZE, SIZE);\
}\
\
/* source pixel averaged with the vertical filter output */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center, vert, stride, SIZE, SIZE, SIZE);\
}\
\
/* vertical filter output written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, center, stride, SIZE);\
}\
\
/* like mc01, but averaging with the pixel one row below */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center+SIZE, vert, stride, SIZE, SIZE, SIZE);\
}\
\
/* horizontal output of this row averaged with vertical output of this column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
/* as mc11, with the vertical filter taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
/* as mc11, with the horizontal filter taken one row below */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
/* horizontal filter one row below, vertical filter one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
/* combined hv filter output written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* hv output averaged with the horizontal output of the same row */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t center_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(center_hv, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, center_hv, stride, SIZE, SIZE, SIZE);\
}\
\
/* hv output averaged with the horizontal output one row below */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t center_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(center_hv, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, center_hv, stride, SIZE, SIZE, SIZE);\
}\
\
/* hv output averaged with the vertical output of the same column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vert[SIZE*SIZE];\
    uint8_t center_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(center_hv, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vert, center_hv, stride, SIZE, SIZE, SIZE);\
}\
\
/* hv output averaged with the vertical output one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + 2*SIZE;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vert[SIZE*SIZE];\
    uint8_t center_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(center_hv, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vert, center_hv, stride, SIZE, SIZE, SIZE);\
}

2463#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2464//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2465#define op_put(a, b) a = cm[((b) + 16)>>5]
2466#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2467#define op2_put(a, b) a = cm[((b) + 512)>>10]
2468
2469H264_LOWPASS(put_ , op_put, op2_put)
2470H264_LOWPASS(avg_ , op_avg, op2_avg)
2471H264_MC(put_, 2)
2472H264_MC(put_, 4)
2473H264_MC(put_, 8)
2474H264_MC(put_, 16)
2475H264_MC(avg_, 4)
2476H264_MC(avg_, 8)
2477H264_MC(avg_, 16)
2478
2479#undef op_avg
2480#undef op_put
2481#undef op2_avg
2482#undef op2_put
2483#endif
2484
/*
 * Per-pixel kernels used by H264_WEIGHT.  They read the locals and parameters
 * of the function they are expanded in:
 *   op_scale1  scales block[x] in place by `weight`
 *   op_scale2  blends src[x] (times `weights`) into dst[x] (times `weightd`)
 * `offset` must already contain the pre-shifted offset and rounding term.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/*
 * Emits weight_h264_pixelsWxH_c and biweight_h264_pixelsWxH_c for a W x H
 * block.
 *
 *   block / dst, src  pixel rows, `stride` bytes apart
 *   log2_denom        right shift applied to the weighted sum
 *   weight(s/d)       multiplicative weights
 *   offset            additive offset, may be negative
 *
 * The offset is scaled by 1 << log2_denom in unsigned arithmetic: offset can
 * be negative and left-shifting a negative int is undefined behaviour in C.
 * Converting the unsigned result back to int yields the expected
 * two's-complement value on the compilers this code targets.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int col, y; \
    offset = (int)((unsigned)offset << log2_denom); \
    /* add half of the divisor so the final shift rounds to nearest */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(col=0; col<W; col++){ \
            op_scale1(col); \
        } \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int col, y; \
    /* (offset + 1) | 1 is kept exactly as before, only the shift is made well defined */ \
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(col=0; col<W; col++){ \
            op_scale2(col); \
        } \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2554
2555static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2556 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2557 int i;
2558
2559 for(i=0; i<h; i++){
2560 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2568 dst+=dstStride;
2569 src+=srcStride;
2570 }
2571}
2572
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/** CAVS qpel position (0,0), 8x8 "put": forwards to put_pixels8_c() for 8 rows. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}

/** CAVS qpel position (0,0), 8x8 "avg": forwards to avg_pixels8_c() for 8 rows. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    const int rows = 8;

    avg_pixels8_c(dst, src, stride, rows);
}

/** CAVS qpel position (0,0), 16x16 "put": forwards to put_pixels16_c() for 16 rows. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    const int rows = 16;

    put_pixels16_c(dst, src, stride, rows);
}

/** CAVS qpel position (0,0), 16x16 "avg": forwards to avg_pixels16_c() for 16 rows. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    const int rows = 16;

    avg_pixels16_c(dst, src, stride, rows);
}
#endif /* CONFIG_CAVS_DECODER */
2590
2591static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2592 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2593 int i;
2594
2595 for(i=0; i<w; i++){
2596 const int src_1= src[ -srcStride];
2597 const int src0 = src[0 ];
2598 const int src1 = src[ srcStride];
2599 const int src2 = src[2*srcStride];
2600 const int src3 = src[3*srcStride];
2601 const int src4 = src[4*srcStride];
2602 const int src5 = src[5*srcStride];
2603 const int src6 = src[6*srcStride];
2604 const int src7 = src[7*srcStride];
2605 const int src8 = src[8*srcStride];
2606 const int src9 = src[9*srcStride];
2607 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2608 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2609 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2610 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2611 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2612 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2613 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2614 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2615 src++;
2616 dst++;
2617 }
2618}
2619
/** WMV2 mspel position (0,0): hands the 8x8 block to put_pixels8_c() unchanged. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2623
/**
 * WMV2 mspel position (1,0): the horizontally lowpassed block and the source
 * block at src are combined by put_pixels8_l2() (presumably a rounding
 * average of its two inputs; confirm against its definition).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];   /* 8x8 scratch block with a row pitch of 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2629
/** WMV2 mspel position (2,0): horizontal lowpass written straight to dst, 8 rows. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2633
/**
 * WMV2 mspel position (3,0): like position (1,0), but the lowpassed block is
 * combined with the source block one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];   /* 8x8 scratch block with a row pitch of 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2639
/** WMV2 mspel position (0,2): vertical lowpass written straight to dst, 8 columns. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2643
/**
 * WMV2 mspel position (1,2): combines the vertically lowpassed source with
 * the block lowpassed in both directions, via put_pixels8_l2().
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8*11];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vert[8*8];      /* vertical lowpass of the source */
    uint8_t both[8*8];      /* vertical lowpass of rows_h */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);   /* +8 skips the extra top row */
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * WMV2 mspel position (3,2): like position (1,2), but the vertical lowpass
 * is taken from the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8*11];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vert[8*8];      /* vertical lowpass of src+1 */
    uint8_t both[8*8];      /* vertical lowpass of rows_h */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);   /* +8 skips the extra top row */
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * WMV2 mspel position (2,2): horizontal lowpass into a scratch buffer
 * followed by a vertical lowpass of that buffer into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8*11];   /* 11 horizontally filtered rows, starting one row above src */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);   /* +8 skips the extra top row */
}
2667
2668static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2669 int x;
2670 const int strength= ff_h263_loop_filter_strength[qscale];
2671
2672 for(x=0; x<8; x++){
2673 int d1, d2, ad1;
2674 int p0= src[x-2*stride];
2675 int p1= src[x-1*stride];
2676 int p2= src[x+0*stride];
2677 int p3= src[x+1*stride];
2678 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2679
2680 if (d<-2*strength) d1= 0;
2681 else if(d<- strength) d1=-2*strength - d;
2682 else if(d< strength) d1= d;
2683 else if(d< 2*strength) d1= 2*strength - d;
2684 else d1= 0;
2685
2686 p1 += d1;
2687 p2 -= d1;
2688 if(p1&256) p1= ~(p1>>31);
2689 if(p2&256) p2= ~(p2>>31);
2690
2691 src[x-1*stride] = p1;
2692 src[x+0*stride] = p2;
2693
2694 ad1= ABS(d1)>>1;
2695
2696 d2= clip((p0-p3)/4, -ad1, ad1);
2697
2698 src[x-2*stride] = p0 - d2;
2699 src[x+ stride] = p3 + d2;
2700 }
2701}
2702
2703static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2704 int y;
2705 const int strength= ff_h263_loop_filter_strength[qscale];
2706
2707 for(y=0; y<8; y++){
2708 int d1, d2, ad1;
2709 int p0= src[y*stride-2];
2710 int p1= src[y*stride-1];
2711 int p2= src[y*stride+0];
2712 int p3= src[y*stride+1];
2713 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2714
2715 if (d<-2*strength) d1= 0;
2716 else if(d<- strength) d1=-2*strength - d;
2717 else if(d< strength) d1= d;
2718 else if(d< 2*strength) d1= 2*strength - d;
2719 else d1= 0;
2720
2721 p1 += d1;
2722 p2 -= d1;
2723 if(p1&256) p1= ~(p1>>31);
2724 if(p2&256) p2= ~(p2>>31);
2725
2726 src[y*stride-1] = p1;
2727 src[y*stride+0] = p2;
2728
2729 ad1= ABS(d1)>>1;
2730
2731 d2= clip((p0-p3)/4, -ad1, ad1);
2732
2733 src[y*stride-2] = p0 - d2;
2734 src[y*stride+1] = p3 + d2;
2735 }
2736}
2737
/**
 * H.261 loop filter on an 8x8 block, done as two separable (1, 2, 1) passes.
 * The vertical pass leaves the top and bottom rows unfiltered (scaled by 4);
 * the horizontal pass leaves the left and right columns unfiltered
 * horizontally. The block is overwritten in place.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];   /* result of the vertical pass, scaled by 4 */
    int row, col;

    /* vertical pass: reads only, so every tap sees unmodified pixels */
    for(row = 0; row < 8; row++){
        const uint8_t *line = src + row*stride;

        for(col = 0; col < 8; col++){
            if(row == 0 || row == 7)
                vert[row][col] = 4*line[col];
            else
                vert[row][col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass: writes the block back */
    for(row = 0; row < 8; row++){
        uint8_t   *line = src + row*stride;
        const int *v    = vert[row];

        line[0] = (v[0] + 2)>>2;
        line[7] = (v[7] + 2)>>2;
        for(col = 1; col < 7; col++)
            line[col] = (v[col - 1] + 2*v[col] + v[col + 1] + 8)>>4;
    }
}
2764
/**
 * H.264 luma deblocking across one edge, non-intra case.
 * The edge is handled as 4 segments of 4 lines; segment i uses tc0[i] and is
 * skipped entirely when tc0[i] is negative. xstride steps across the edge,
 * ystride steps along it.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int tc_base = tc0[seg];

        if( tc_base < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( line = 0; line < 4; line++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            int tc = tc_base;
            int delta;

            /* leave the line untouched unless all three gradients are below threshold */
            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            /* each side that is smooth enough gets its second pixel filtered and widens tc */
            if( ABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( ABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/** Luma deblocking with xstride = stride (taps in adjacent rows) and ystride = 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
/** Luma deblocking with xstride = 1 (taps in adjacent columns) and ystride = stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2813
/**
 * H.264 chroma deblocking across one edge, non-intra case.
 * The edge is handled as 4 segments of 2 lines; segment i uses tc0[i] and is
 * skipped when tc0[i] <= 0. Only p0 and q0 are modified.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int tc = tc0[seg];

        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( line = 0; line < 2; line++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            /* leave the line untouched unless all three gradients are below threshold */
            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/** Chroma deblocking with xstride = stride (taps in adjacent rows) and ystride = 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
/** Chroma deblocking with xstride = 1 (taps in adjacent columns) and ystride = stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
2850
/**
 * H.264 chroma deblocking across one edge, intra case: 8 lines, no tc
 * clipping. When the gradient tests pass, p0 and q0 are replaced by fixed
 * 3-tap averages of their neighbours.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave the line untouched unless all three gradients are below threshold */
        if( ABS( p0 - q0 ) >= alpha ||
            ABS( p1 - p0 ) >= beta ||
            ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Intra chroma deblocking with xstride = stride (taps in adjacent rows) and ystride = 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
/** Intra chroma deblocking with xstride = 1 (taps in adjacent columns) and ystride = stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
2878
/**
 * Sum of absolute differences between two blocks 16 pixels wide and h rows
 * high. Both blocks use line_size as row pitch; v is not used.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size) {
        for(col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
    }
    return sad;
}
2906
/**
 * SAD of a 16 pixel wide block against the reference interpolated between
 * each pixel and its right neighbour with avg2(). Reads pix2[0..16] per row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size) {
        for(col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
    }
    return sad;
}
2934
/**
 * SAD of a 16 pixel wide block against the reference interpolated between
 * each pixel and the one in the row below with avg2(). Reads h+1 rows of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2964
/**
 * SAD of a 16 pixel wide block against the reference interpolated over each
 * 2x2 neighbourhood with avg4(). Reads 17 pixels per row and h+1 rows of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2994
/**
 * Sum of absolute differences between two blocks 8 pixels wide and h rows
 * high. Both blocks use line_size as row pitch; v is not used.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size) {
        for(col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
    }
    return sad;
}
3014
/**
 * SAD of an 8 pixel wide block against the reference interpolated between
 * each pixel and its right neighbour with avg2(). Reads pix2[0..8] per row.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size) {
        for(col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
    }
    return sad;
}
3034
/**
 * SAD of an 8 pixel wide block against the reference interpolated between
 * each pixel and the one in the row below with avg2(). Reads h+1 rows of pix2.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3056
/**
 * SAD of an 8 pixel wide block against the reference interpolated over each
 * 2x2 neighbourhood with avg4(). Reads 9 pixels per row and h+1 rows of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3078
3079static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3080 MpegEncContext *c = v;
3081 int score1=0;
3082 int score2=0;
3083 int x,y;
3084
3085 for(y=0; y<h; y++){
3086 for(x=0; x<16; x++){
3087 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3088 }
3089 if(y+1<h){
3090 for(x=0; x<15; x++){
3091 score2+= ABS( s1[x ] - s1[x +stride]
3092 - s1[x+1] + s1[x+1+stride])
3093 -ABS( s2[x ] - s2[x +stride]
3094 - s2[x+1] + s2[x+1+stride]);
3095 }
3096 }
3097 s1+= stride;
3098 s2+= stride;
3099 }
3100
3101 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3102 else return score1 + ABS(score2)*8;
3103}
3104
3105static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3106 MpegEncContext *c = v;
3107 int score1=0;
3108 int score2=0;
3109 int x,y;
3110
3111 for(y=0; y<h; y++){
3112 for(x=0; x<8; x++){
3113 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3114 }
3115 if(y+1<h){
3116 for(x=0; x<7; x++){
3117 score2+= ABS( s1[x ] - s1[x +stride]
3118 - s1[x+1] + s1[x+1+stride])
3119 -ABS( s2[x ] - s2[x +stride]
3120 - s2[x+1] + s2[x+1+stride]);
3121 }
3122 }
3123 s1+= stride;
3124 s2+= stride;
3125 }
3126
3127 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3128 else return score1 + ABS(score2)*8;
3129}
3130
3131static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3132 int i;
3133 unsigned int sum=0;
3134
3135 for(i=0; i<8*8; i++){
3136 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3137 int w= weight[i];
3138 b>>= RECON_SHIFT;
3139 assert(-512<b && b<512);
3140
3141 sum += (w*b)*(w*b)>>4;
3142 }
3143 return sum>>2;
3144}
3145
3146static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3147 int i;
3148
3149 for(i=0; i<8*8; i++){
3150 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3151 }
3152}
3153
3154/**
3155 * permutes an 8x8 block.
3156 * @param block the block which will be permuted according to the given permutation vector
3157 * @param permutation the permutation vector
3158 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3159 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3160 * (inverse) permutated to scantable order!
3161 */
3162void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3163{
3164 int i;
3165 DCTELEM temp[64];
3166
3167 if(last<=0) return;
3168 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3169
3170 for(i=0; i<=last; i++){
3171 const int j= scantable[i];
3172 temp[j]= block[j];
3173 block[j]=0;
3174 }
3175
3176 for(i=0; i<=last; i++){
3177 const int j= scantable[i];
3178 const int perm_j= permutation[j];
3179 block[perm_j]= temp[j];
3180 }
3181}
3182
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    const int score = 0;

    return score;
}
3186
3187void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3188 int i;
3189
3190 memset(cmp, 0, sizeof(void*)*5);
3191
3192 for(i=0; i<5; i++){
3193 switch(type&0xFF){
3194 case FF_CMP_SAD:
3195 cmp[i]= c->sad[i];
3196 break;
3197 case FF_CMP_SATD:
3198 cmp[i]= c->hadamard8_diff[i];
3199 break;
3200 case FF_CMP_SSE:
3201 cmp[i]= c->sse[i];
3202 break;
3203 case FF_CMP_DCT:
3204 cmp[i]= c->dct_sad[i];
3205 break;
3206 case FF_CMP_DCT264:
3207 cmp[i]= c->dct264_sad[i];
3208 break;
3209 case FF_CMP_DCTMAX:
3210 cmp[i]= c->dct_max[i];
3211 break;
3212 case FF_CMP_PSNR:
3213 cmp[i]= c->quant_psnr[i];
3214 break;
3215 case FF_CMP_BIT:
3216 cmp[i]= c->bit[i];
3217 break;
3218 case FF_CMP_RD:
3219 cmp[i]= c->rd[i];
3220 break;
3221 case FF_CMP_VSAD:
3222 cmp[i]= c->vsad[i];
3223 break;
3224 case FF_CMP_VSSE:
3225 cmp[i]= c->vsse[i];
3226 break;
3227 case FF_CMP_ZERO:
3228 cmp[i]= zero_cmp;
3229 break;
3230 case FF_CMP_NSSE:
3231 cmp[i]= c->nsse[i];
3232 break;
3233#ifdef CONFIG_SNOW_ENCODER
3234 case FF_CMP_W53:
3235 cmp[i]= c->w53[i];
3236 break;
3237 case FF_CMP_W97:
3238 cmp[i]= c->w97[i];
3239 break;
3240#endif
3241 default:
3242 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3243 }
3244 }
3245}
3246
3247/**
3248 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3249 */
3250static void clear_blocks_c(DCTELEM *blocks)
3251{
3252 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3253}
3254
/**
 * Adds the first w bytes of src onto dst element by element; each sum wraps
 * modulo 256. Nothing is done for w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos;

    for(pos = 0; pos < w; pos++)
        dst[pos] += src[pos];
}
3270
/**
 * Stores src1[i] - src2[i] (wrapping modulo 256) into dst[i] for the first
 * w bytes. Nothing is done for w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos;

    for(pos = 0; pos < w; pos++)
        dst[pos] = src1[pos] - src2[pos];
}
3286
/**
 * Median-prediction residual: for every position the predictor is
 * mid_pred(left, top, (left + top - topleft) & 0xFF), where top comes from
 * src1 and left from the previously processed src2 pixel; the residual
 * src2[i] - pred is stored in dst[i].
 * *left and *left_top supply the starting state and receive the final state.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left    = *left;
    uint8_t cur_topleft = *left_top;
    int i;

    for(i = 0; i < w; i++){
        const int top  = src1[i];
        const int pred = mid_pred(cur_left, top, (cur_left + top - cur_topleft)&0xFF);

        cur_topleft = top;
        cur_left    = src2[i];
        dst[i]      = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_topleft;
}
3304
/* Two-output butterfly: o1 = i1 + i2 and o2 = i1 - i2.
 * Expands to two statements; i1 and i2 are each expanded twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y and y becomes x - y, both computed
 * from the values held before the update. */
#define BUTTERFLY1(x,y) \
{\
    int bfly_a, bfly_b;\
    bfly_a= x;\
    bfly_b= y;\
    x= bfly_a+bfly_b;\
    y= bfly_a-bfly_b;\
}

/* Butterfly folded into a sum of magnitudes: |x + y| + |x - y|. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3319
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 * Rows are transformed first, then columns; the last column stage is folded
 * into the absolute sum by BUTTERFLYA. h must be 8; s is not used.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal transform of the difference, one row at a time */
    for(i = 0; i < 8; i++){
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical transform, one column at a time */
    for(i = 0; i < 8; i++){
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3371
/**
 * Sum of absolute values of the 8x8 Hadamard transform of src itself, with
 * the term for temp[0] + temp[32] (marked "-mean" in the original) taken out
 * again. h must be 8; s and dummy are not used.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* horizontal transform, one row at a time */
    for(i = 0; i < 8; i++){
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical transform, one column at a time */
    for(i = 0; i < 8; i++){
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(temp[8*0] + temp[8*4]); // -mean

    return total;
}
3419
3420static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3421 MpegEncContext * const s= (MpegEncContext *)c;
3422 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3423 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3424 int sum=0, i;
3425
3426 assert(h==8);
3427
3428 s->dsp.diff_pixels(temp, src1, src2, stride);
3429 s->dsp.fdct(temp);
3430
3431 for(i=0; i<64; i++)
3432 sum+= ABS(temp[i]);
3433
3434 return sum;
3435}
3436
3437static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3438 MpegEncContext * const s= (MpegEncContext *)c;
3439 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3440 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3441 int sum=0, i;
3442
3443 assert(h==8);
3444
3445 s->dsp.diff_pixels(temp, src1, src2, stride);
3446 s->dsp.fdct(temp);
3447
3448 for(i=0; i<64; i++)
3449 sum= FFMAX(sum, ABS(temp[i]));
3450
3451 return sum;
3452}
3453
3454void simple_idct(DCTELEM *block); //FIXME
3455
3456static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3457 MpegEncContext * const s= (MpegEncContext *)c;
3458 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3459 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3460 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3461 int sum=0, i;
3462
3463 assert(h==8);
3464 s->mb_intra=0;
3465
3466 s->dsp.diff_pixels(temp, src1, src2, stride);
3467
3468 memcpy(bak, temp, 64*sizeof(DCTELEM));
3469
3470 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3471 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3472 simple_idct(temp); //FIXME
3473
3474 for(i=0; i<64; i++)
3475 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3476
3477 return sum;
3478}
3479
3480static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3481 MpegEncContext * const s= (MpegEncContext *)c;
3482 const uint8_t *scantable= s->intra_scantable.permutated;
3483 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3484 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3485 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3486 uint8_t * const bak= (uint8_t*)aligned_bak;
3487 int i, last, run, bits, level, distoration, start_i;
3488 const int esc_length= s->ac_esc_length;
3489 uint8_t * length;
3490 uint8_t * last_length;
3491
3492 assert(h==8);
3493
3494 for(i=0; i<8; i++){
3495 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3496 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3497 }
3498
3499 s->dsp.diff_pixels(temp, src1, src2, stride);
3500
3501 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3502
3503 bits=0;
3504
3505 if (s->mb_intra) {
3506 start_i = 1;
3507 length = s->intra_ac_vlc_length;
3508 last_length= s->intra_ac_vlc_last_length;
3509 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3510 } else {
3511 start_i = 0;
3512 length = s->inter_ac_vlc_length;
3513 last_length= s->inter_ac_vlc_last_length;
3514 }
3515
3516 if(last>=start_i){
3517 run=0;
3518 for(i=start_i; i<last; i++){
3519 int j= scantable[i];
3520 level= temp[j];
3521
3522 if(level){
3523 level+=64;
3524 if((level&(~127)) == 0){
3525 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3526 }else
3527 bits+= esc_length;
3528 run=0;
3529 }else
3530 run++;
3531 }
3532 i= scantable[last];
3533
3534 level= temp[i] + 64;
3535
3536 assert(level - 64);
3537
3538 if((level&(~127)) == 0){
3539 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3540 }else
3541 bits+= esc_length;
3542
3543 }
3544
3545 if(last>=0){
3546 if(s->mb_intra)
3547 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3548 else
3549 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3550 }
3551
3552 s->dsp.idct_add(bak, stride, temp);
3553
3554 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3555
3556 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3557}
3558
3559static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3560 MpegEncContext * const s= (MpegEncContext *)c;
3561 const uint8_t *scantable= s->intra_scantable.permutated;
3562 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3563 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3564 int i, last, run, bits, level, start_i;
3565 const int esc_length= s->ac_esc_length;
3566 uint8_t * length;
3567 uint8_t * last_length;
3568
3569 assert(h==8);
3570
3571 s->dsp.diff_pixels(temp, src1, src2, stride);
3572
3573 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3574
3575 bits=0;
3576
3577 if (s->mb_intra) {
3578 start_i = 1;
3579 length = s->intra_ac_vlc_length;
3580 last_length= s->intra_ac_vlc_last_length;
3581 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3582 } else {
3583 start_i = 0;
3584 length = s->inter_ac_vlc_length;
3585 last_length= s->inter_ac_vlc_last_length;
3586 }
3587
3588 if(last>=start_i){
3589 run=0;
3590 for(i=start_i; i<last; i++){
3591 int j= scantable[i];
3592 level= temp[j];
3593
3594 if(level){
3595 level+=64;
3596 if((level&(~127)) == 0){
3597 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3598 }else
3599 bits+= esc_length;
3600 run=0;
3601 }else
3602 run++;
3603 }
3604 i= scantable[last];
3605
3606 level= temp[i] + 64;
3607
3608 assert(level - 64);
3609
3610 if((level&(~127)) == 0){
3611 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3612 }else
3613 bits+= esc_length;
3614 }
3615
3616 return bits;
3617}
3618
/**
 * Vertical SAD inside one 16-pixel-wide block: sums, over the 16 columns and
 * over every pair of vertically adjacent lines among the first h lines, the
 * absolute difference between a pixel and the pixel stride bytes after it.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between two lines
 * @param h      number of lines; h <= 1 yields 0
 * @return the accumulated sum
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper = s;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        const uint8_t *lower = upper + stride;

        for (col = 0; col < 16; col++) {
            const int delta = upper[col] - lower[col];

            total += delta >= 0 ? delta : -delta;
        }
        upper = lower;
    }

    return total;
}
3633
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: for every
 * column and every pair of vertically adjacent lines, takes the absolute
 * value of (s1 - s2) on the upper line minus (s1 - s2) on the lower line and
 * adds it to the result.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two lines (same for both blocks)
 * @param h      number of lines; h <= 1 yields 0
 * @return the accumulated sum
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            const int delta = here - below;

            total += delta >= 0 ? delta : -delta;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3648
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE inside one 16-pixel-wide block: sums, over the 16 columns and
 * over every pair of vertically adjacent lines among the first h lines, the
 * squared difference between a pixel and the pixel stride bytes after it.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between two lines
 * @param h      number of lines; h <= 1 yields 0
 * @return the accumulated sum
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper = s;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        const uint8_t *lower = upper + stride;

        for (col = 0; col < 16; col++)
            total += SQ(upper[col] - lower[col]);

        upper = lower;
    }

    return total;
}
3664
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: for every
 * column and every pair of vertically adjacent lines, squares
 * (s1 - s2) on the upper line minus (s1 - s2) on the lower line and adds it
 * to the result.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two lines (same for both blocks)
 * @param h      number of lines; h <= 1 yields 0
 * @return the accumulated sum
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            const int delta = here - below;

            total += delta * delta;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3679
/*
 * Each invocation pairs an 8x8 comparison function with a second identifier
 * ending in "16_c". Those second identifiers are the ones dsputil_init()
 * stores into the comparison tables (via SET_CMP_FUNC and the explicit
 * hadamard8_diff[4] assignment).
 * NOTE(review): the WARPER8_16_SQ definition is outside this view; presumably
 * it defines the second function in terms of the first -- confirm there.
 */
3680 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3681 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3682 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3683 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3684 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3685 WARPER8_16_SQ(rd8x8_c, rd16_c)
3686 WARPER8_16_SQ(bit8x8_c, bit16_c)
3687
3688/* XXX: those functions should be suppressed ASAP when all IDCTs are
3689 converted */
3690static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3691{
3692 j_rev_dct (block);
3693 put_pixels_clamped_c(block, dest, line_size);
3694}
3695static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3696{
3697 j_rev_dct (block);
3698 add_pixels_clamped_c(block, dest, line_size);
3699}
3700
3701static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3702{
3703 j_rev_dct4 (block);
3704 put_pixels_clamped4_c(block, dest, line_size);
3705}
3706static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3707{
3708 j_rev_dct4 (block);
3709 add_pixels_clamped4_c(block, dest, line_size);
3710}
3711
3712static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3713{
3714 j_rev_dct2 (block);
3715 put_pixels_clamped2_c(block, dest, line_size);
3716}
3717static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3718{
3719 j_rev_dct2 (block);
3720 add_pixels_clamped2_c(block, dest, line_size);
3721}
3722
3723static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3724{
3725 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3726
3727 dest[0] = cm[(block[0] + 4)>>3];
3728}
3729static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3730{
3731 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3732
3733 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3734}
3735
/*
 * Does nothing; dsputil_init() installs it as the default c->prefetch.
 * NOTE(review): the unprototyped "()" declarator is kept on purpose. Turning
 * it into "(void)" could make the assignment to c->prefetch incompatible if
 * that hook takes arguments -- confirm against the DSPContext declaration
 * before changing it.
 */
static void just_return()
{
}
3737
3738/* init static data */
3739void dsputil_static_init(void)
3740{
3741 int i;
3742
3743 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3744 for(i=0;i<MAX_NEG_CROP;i++) {
3745 cropTbl[i] = 0;
3746 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3747 }
3748
3749 for(i=0;i<512;i++) {
3750 squareTbl[i] = (i - 256) * (i - 256);
3751 }
3752
3753 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3754}
3755
3756
3757void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3758{
3759 int i;
3760
3761#ifdef CONFIG_ENCODERS
3762 if(avctx->dct_algo==FF_DCT_FASTINT) {
3763 c->fdct = fdct_ifast;
3764 c->fdct248 = fdct_ifast248;
3765 }
3766 else if(avctx->dct_algo==FF_DCT_FAAN) {
3767 c->fdct = ff_faandct;
3768 c->fdct248 = ff_faandct248;
3769 }
3770 else {
3771 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3772 c->fdct248 = ff_fdct248_islow;
3773 }
3774#endif //CONFIG_ENCODERS
3775
3776 if(avctx->lowres==1){
3777 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3778 c->idct_put= ff_jref_idct4_put;
3779 c->idct_add= ff_jref_idct4_add;
3780 }else{
3781 c->idct_put= ff_h264_lowres_idct_put_c;
3782 c->idct_add= ff_h264_lowres_idct_add_c;
3783 }
3784 c->idct = j_rev_dct4;
3785 c->idct_permutation_type= FF_NO_IDCT_PERM;
3786 }else if(avctx->lowres==2){
3787 c->idct_put= ff_jref_idct2_put;
3788 c->idct_add= ff_jref_idct2_add;
3789 c->idct = j_rev_dct2;
3790 c->idct_permutation_type= FF_NO_IDCT_PERM;
3791 }else if(avctx->lowres==3){
3792 c->idct_put= ff_jref_idct1_put;
3793 c->idct_add= ff_jref_idct1_add;
3794 c->idct = j_rev_dct1;
3795 c->idct_permutation_type= FF_NO_IDCT_PERM;
3796 }else{
3797 if(avctx->idct_algo==FF_IDCT_INT){
3798 c->idct_put= ff_jref_idct_put;
3799 c->idct_add= ff_jref_idct_add;
3800 c->idct = j_rev_dct;
3801 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3802 }else if(avctx->idct_algo==FF_IDCT_VP3){
3803 c->idct_put= ff_vp3_idct_put_c;
3804 c->idct_add= ff_vp3_idct_add_c;
3805 c->idct = ff_vp3_idct_c;
3806 c->idct_permutation_type= FF_NO_IDCT_PERM;
3807 }else{ //accurate/default
3808 c->idct_put= simple_idct_put;
3809 c->idct_add= simple_idct_add;
3810 c->idct = simple_idct;
3811 c->idct_permutation_type= FF_NO_IDCT_PERM;
3812 }
3813 }
3814
3815 c->h264_idct_add= ff_h264_idct_add_c;
3816 c->h264_idct8_add= ff_h264_idct8_add_c;
3817 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3818 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3819
3820 c->get_pixels = get_pixels_c;
3821 c->diff_pixels = diff_pixels_c;
3822 c->put_pixels_clamped = put_pixels_clamped_c;
3823 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3824 c->add_pixels_clamped = add_pixels_clamped_c;
3825 c->add_pixels8 = add_pixels8_c;
3826 c->add_pixels4 = add_pixels4_c;
3827 c->gmc1 = gmc1_c;
3828 c->gmc = ff_gmc_c;
3829 c->clear_blocks = clear_blocks_c;
3830 c->pix_sum = pix_sum_c;
3831 c->pix_norm1 = pix_norm1_c;
3832
3833 /* TODO [0] 16 [1] 8 */
3834 c->pix_abs[0][0] = pix_abs16_c;
3835 c->pix_abs[0][1] = pix_abs16_x2_c;
3836 c->pix_abs[0][2] = pix_abs16_y2_c;
3837 c->pix_abs[0][3] = pix_abs16_xy2_c;
3838 c->pix_abs[1][0] = pix_abs8_c;
3839 c->pix_abs[1][1] = pix_abs8_x2_c;
3840 c->pix_abs[1][2] = pix_abs8_y2_c;
3841 c->pix_abs[1][3] = pix_abs8_xy2_c;
3842
3843#define dspfunc(PFX, IDX, NUM) \
3844 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3845 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3846 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3847 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3848
3849 dspfunc(put, 0, 16);
3850 dspfunc(put_no_rnd, 0, 16);
3851 dspfunc(put, 1, 8);
3852 dspfunc(put_no_rnd, 1, 8);
3853 dspfunc(put, 2, 4);
3854 dspfunc(put, 3, 2);
3855
3856 dspfunc(avg, 0, 16);
3857 dspfunc(avg_no_rnd, 0, 16);
3858 dspfunc(avg, 1, 8);
3859 dspfunc(avg_no_rnd, 1, 8);
3860 dspfunc(avg, 2, 4);
3861 dspfunc(avg, 3, 2);
3862#undef dspfunc
3863
3864 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3865 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3866
3867 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3868 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3869 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3870 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3871 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3872 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3873 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3874 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3875 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3876
3877 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3878 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3879 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3880 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3881 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3882 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3883 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3884 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3885 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3886
3887#define dspfunc(PFX, IDX, NUM) \
3888 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3889 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3890 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3891 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3892 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3893 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3894 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3895 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3896 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3897 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3898 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3899 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3900 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3901 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3902 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3903 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3904
3905 dspfunc(put_qpel, 0, 16);
3906 dspfunc(put_no_rnd_qpel, 0, 16);
3907
3908 dspfunc(avg_qpel, 0, 16);
3909 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3910
3911 dspfunc(put_qpel, 1, 8);
3912 dspfunc(put_no_rnd_qpel, 1, 8);
3913
3914 dspfunc(avg_qpel, 1, 8);
3915 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3916
3917 dspfunc(put_h264_qpel, 0, 16);
3918 dspfunc(put_h264_qpel, 1, 8);
3919 dspfunc(put_h264_qpel, 2, 4);
3920 dspfunc(put_h264_qpel, 3, 2);
3921 dspfunc(avg_h264_qpel, 0, 16);
3922 dspfunc(avg_h264_qpel, 1, 8);
3923 dspfunc(avg_h264_qpel, 2, 4);
3924
3925#undef dspfunc
3926 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3927 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3928 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3929 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3930 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3931 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3932
3933 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3934 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3935 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3936 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3937 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3938 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3939 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3940 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3941 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3942 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3943 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3944 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3945 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3946 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3947 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3948 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3949 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3950 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3951 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3952 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3953
3954#ifdef CONFIG_CAVS_DECODER
3955 ff_cavsdsp_init(c,avctx);
3956#endif
3957
3958 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3959 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3960 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3961 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3962 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3963 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3964 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3965 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3966
3967#define SET_CMP_FUNC(name) \
3968 c->name[0]= name ## 16_c;\
3969 c->name[1]= name ## 8x8_c;
3970
3971 SET_CMP_FUNC(hadamard8_diff)
3972 c->hadamard8_diff[4]= hadamard8_intra16_c;
3973 SET_CMP_FUNC(dct_sad)
3974 SET_CMP_FUNC(dct_max)
3975 c->sad[0]= pix_abs16_c;
3976 c->sad[1]= pix_abs8_c;
3977 c->sse[0]= sse16_c;
3978 c->sse[1]= sse8_c;
3979 c->sse[2]= sse4_c;
3980 SET_CMP_FUNC(quant_psnr)
3981 SET_CMP_FUNC(rd)
3982 SET_CMP_FUNC(bit)
3983 c->vsad[0]= vsad16_c;
3984 c->vsad[4]= vsad_intra16_c;
3985 c->vsse[0]= vsse16_c;
3986 c->vsse[4]= vsse_intra16_c;
3987 c->nsse[0]= nsse16_c;
3988 c->nsse[1]= nsse8_c;
3989#ifdef CONFIG_SNOW_ENCODER
3990 c->w53[0]= w53_16_c;
3991 c->w53[1]= w53_8_c;
3992 c->w97[0]= w97_16_c;
3993 c->w97[1]= w97_8_c;
3994#endif
3995
3996 c->add_bytes= add_bytes_c;
3997 c->diff_bytes= diff_bytes_c;
3998 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3999 c->bswap_buf= bswap_buf;
4000
4001 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4002 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4003 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4004 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4005 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4006 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4007
4008 c->h263_h_loop_filter= h263_h_loop_filter_c;
4009 c->h263_v_loop_filter= h263_v_loop_filter_c;
4010
4011 c->h261_loop_filter= h261_loop_filter_c;
4012
4013 c->try_8x8basis= try_8x8basis_c;
4014 c->add_8x8basis= add_8x8basis_c;
4015
4016#ifdef CONFIG_SNOW_ENCODER
4017 c->vertical_compose97i = ff_snow_vertical_compose97i;
4018 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4019 c->inner_add_yblock = ff_snow_inner_add_yblock;
4020#endif
4021
4022 c->shrink[0]= ff_img_copy_plane;
4023 c->shrink[1]= ff_shrink22;
4024 c->shrink[2]= ff_shrink44;
4025 c->shrink[3]= ff_shrink88;
4026
4027 c->prefetch= just_return;
4028
4029#ifdef HAVE_MMX
4030 dsputil_init_mmx(c, avctx);
4031#endif
4032#ifdef ARCH_ARMV4L
4033 dsputil_init_armv4l(c, avctx);
4034#endif
4035#ifdef HAVE_MLIB
4036 dsputil_init_mlib(c, avctx);
4037#endif
4038#ifdef ARCH_SPARC
4039 dsputil_init_vis(c,avctx);
4040#endif
4041#ifdef ARCH_ALPHA
4042 dsputil_init_alpha(c, avctx);
4043#endif
4044#ifdef ARCH_POWERPC
4045 dsputil_init_ppc(c, avctx);
4046#endif
4047#ifdef HAVE_MMI
4048 dsputil_init_mmi(c, avctx);
4049#endif
4050#ifdef ARCH_SH4
4051 dsputil_init_sh4(c,avctx);
4052#endif
4053
4054 switch(c->idct_permutation_type){
4055 case FF_NO_IDCT_PERM:
4056 for(i=0; i<64; i++)
4057 c->idct_permutation[i]= i;
4058 break;
4059 case FF_LIBMPEG2_IDCT_PERM:
4060 for(i=0; i<64; i++)
4061 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4062 break;
4063 case FF_SIMPLE_IDCT_PERM:
4064 for(i=0; i<64; i++)
4065 c->idct_permutation[i]= simple_mmx_permutation[i];
4066 break;
4067 case FF_TRANSPOSE_IDCT_PERM:
4068 for(i=0; i<64; i++)
4069 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4070 break;
4071 case FF_PARTTRANS_IDCT_PERM:
4072 for(i=0; i<64; i++)
4073 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4074 break;
4075 default:
4076 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4077 }
4078}
4079
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette