dsputil.c@ 9441

Last change on this file since 9441 was 5776, checked in by vboxsync, 17 years ago
ffmpeg: exported to OSE
File size: 143.6 KB

Line
1	/*
2	* DSP utils
3	* Copyright (c) 2000, 2001 Fabrice Bellard.
4	* Copyright (c) 2002-2004 Michael Niedermayer <[email protected]>
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License as published by the Free Software Foundation; either
9	* version 2 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with this library; if not, write to the Free Software
18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19	*
20	* gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <[email protected]>
21	*/
22
23	/**
24	* @file dsputil.c
25	* DSP utils
26	*/
27
28	#include "avcodec.h"
29	#include "dsputil.h"
30	#include "mpegvideo.h"
31	#include "simple_idct.h"
32	#include "faandct.h"
33	#include "snow.h"
34
35	/* snow.c */
36	void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37
38	uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
39	uint32_t squareTbl[512] = {0, };
40
/* Zigzag scan order for an 8x8 block: entry i is the raster index (row*8+col)
   of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
51
/* Specific zigzag scan for the 248 idct. NOTE that, unlike the
   specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
64
65	/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
66	DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
67
/* Alternate (horizontal) scan order for an 8x8 block; entries are raster
   indices, each of 0..63 appearing exactly once. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
78
/* Alternate (vertical) scan order for an 8x8 block; entries are raster
   indices, each of 0..63 appearing exactly once. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
89
/* Fixed-point reciprocals: (a*inverse[b])>>32 == a/b for all
   0<=a<=65536 && 2<=b<=255 (the product must be formed in 64 bits). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
125
/* Input permutation for the simple_idct_mmx: a reordering of the 64
   coefficient indices (each of 0x00..0x3F appears exactly once). */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
137
/**
 * Sum the pixel values of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size distance, in bytes, between the starts of two rows
 * @return sum of all 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size; /* next row; only the first 16 bytes of a row are read */
    }
    return s;
}
159
160	static int pix_norm1_c(uint8_t * pix, int line_size)
161	{
162	int s, i, j;
163	uint32_t *sq = squareTbl + 256;
164
165	s = 0;
166	for (i = 0; i < 16; i++) {
167	for (j = 0; j < 16; j += 8) {
168	#if 0
169	s += sq[pix[0]];
170	s += sq[pix[1]];
171	s += sq[pix[2]];
172	s += sq[pix[3]];
173	s += sq[pix[4]];
174	s += sq[pix[5]];
175	s += sq[pix[6]];
176	s += sq[pix[7]];
177	#else
178	#if LONG_MAX > 2147483647
179	register uint64_t x=(uint64_t)pix;
180	s += sq[x&0xff];
181	s += sq[(x>>8)&0xff];
182	s += sq[(x>>16)&0xff];
183	s += sq[(x>>24)&0xff];
184	s += sq[(x>>32)&0xff];
185	s += sq[(x>>40)&0xff];
186	s += sq[(x>>48)&0xff];
187	s += sq[(x>>56)&0xff];
188	#else
189	register uint32_t x=(uint32_t)pix;
190	s += sq[x&0xff];
191	s += sq[(x>>8)&0xff];
192	s += sq[(x>>16)&0xff];
193	s += sq[(x>>24)&0xff];
194	x=(uint32_t)(pix+4);
195	s += sq[x&0xff];
196	s += sq[(x>>8)&0xff];
197	s += sq[(x>>16)&0xff];
198	s += sq[(x>>24)&0xff];
199	#endif
200	#endif
201	pix += 8;
202	}
203	pix += line_size - 16;
204	}
205	return s;
206	}
207
/**
 * Apply bswap_32() to each of the first w 32-bit words of src and store the
 * results in dst.
 *
 * @param dst destination buffer, at least w words
 * @param src source buffer, at least w words
 * @param w   number of 32-bit words to process; nothing is done for w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
225
226	static int sse4_c(void v, uint8_t pix1, uint8_t * pix2, int line_size, int h)
227	{
228	int s, i;
229	uint32_t *sq = squareTbl + 256;
230
231	s = 0;
232	for (i = 0; i < h; i++) {
233	s += sq[pix1[0] - pix2[0]];
234	s += sq[pix1[1] - pix2[1]];
235	s += sq[pix1[2] - pix2[2]];
236	s += sq[pix1[3] - pix2[3]];
237	pix1 += line_size;
238	pix2 += line_size;
239	}
240	return s;
241	}
242
243	static int sse8_c(void v, uint8_t pix1, uint8_t * pix2, int line_size, int h)
244	{
245	int s, i;
246	uint32_t *sq = squareTbl + 256;
247
248	s = 0;
249	for (i = 0; i < h; i++) {
250	s += sq[pix1[0] - pix2[0]];
251	s += sq[pix1[1] - pix2[1]];
252	s += sq[pix1[2] - pix2[2]];
253	s += sq[pix1[3] - pix2[3]];
254	s += sq[pix1[4] - pix2[4]];
255	s += sq[pix1[5] - pix2[5]];
256	s += sq[pix1[6] - pix2[6]];
257	s += sq[pix1[7] - pix2[7]];
258	pix1 += line_size;
259	pix2 += line_size;
260	}
261	return s;
262	}
263
264	static int sse16_c(void v, uint8_t pix1, uint8_t *pix2, int line_size, int h)
265	{
266	int s, i;
267	uint32_t *sq = squareTbl + 256;
268
269	s = 0;
270	for (i = 0; i < h; i++) {
271	s += sq[pix1[ 0] - pix2[ 0]];
272	s += sq[pix1[ 1] - pix2[ 1]];
273	s += sq[pix1[ 2] - pix2[ 2]];
274	s += sq[pix1[ 3] - pix2[ 3]];
275	s += sq[pix1[ 4] - pix2[ 4]];
276	s += sq[pix1[ 5] - pix2[ 5]];
277	s += sq[pix1[ 6] - pix2[ 6]];
278	s += sq[pix1[ 7] - pix2[ 7]];
279	s += sq[pix1[ 8] - pix2[ 8]];
280	s += sq[pix1[ 9] - pix2[ 9]];
281	s += sq[pix1[10] - pix2[10]];
282	s += sq[pix1[11] - pix2[11]];
283	s += sq[pix1[12] - pix2[12]];
284	s += sq[pix1[13] - pix2[13]];
285	s += sq[pix1[14] - pix2[14]];
286	s += sq[pix1[15] - pix2[15]];
287
288	pix1 += line_size;
289	pix2 += line_size;
290	}
291	return s;
292	}
293
294
295	#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain difference metric between two blocks.
 *
 * The pixel difference (scaled by 16) is written into a 32-wide scratch
 * buffer, transformed in place by ff_spatial_dwt(), and then the absolute
 * coefficient values of every subband are summed, each weighted by the
 * scale[] entry for its transform type, decomposition depth, level and
 * orientation.
 *
 * @param v         unused context pointer
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; the wrappers in this file pass 8, 16 or 32
 * @param h         block height; must equal w (asserted) and not exceed 32,
 *                  the height of the scratch buffer
 * @param type      first index into scale[] and forwarded to
 *                  ff_spatial_dwt(); per the table comments 0 selects the
 *                  9/7 weights and 1 the 5/3 weights
 * @return weighted absolute coefficient sum, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* Scaled difference image. Multiplication is used instead of <<4
       because the difference may be negative. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
364
/** w_c() for a block of width 8 with type 1 (the 5/3 weights). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
368
/** w_c() for a block of width 8 with type 0 (the 9/7 weights). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
372
/** w_c() for a block of width 16 with type 1 (the 5/3 weights). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
376
/** w_c() for a block of width 16 with type 0 (the 9/7 weights). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
380
/** w_c() for a block of width 32 with type 1 (the 5/3 weights); not static. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
384
/** w_c() for a block of width 32 with type 0 (the 9/7 weights); not static. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
388	#endif
389
390	static void get_pixels_c(DCTELEM restrict block, const uint8_t pixels, int line_size)
391	{
392	int i;
393
394	/* read the pixels */
395	for(i=0;i<8;i++) {
396	block[0] = pixels[0];
397	block[1] = pixels[1];
398	block[2] = pixels[2];
399	block[3] = pixels[3];
400	block[4] = pixels[4];
401	block[5] = pixels[5];
402	block[6] = pixels[6];
403	block[7] = pixels[7];
404	pixels += line_size;
405	block += 8;
406	}
407	}
408
409	static void diff_pixels_c(DCTELEM restrict block, const uint8_t s1,
410	const uint8_t *s2, int stride){
411	int i;
412
413	/* read the pixels */
414	for(i=0;i<8;i++) {
415	block[0] = s1[0] - s2[0];
416	block[1] = s1[1] - s2[1];
417	block[2] = s1[2] - s2[2];
418	block[3] = s1[3] - s2[3];
419	block[4] = s1[4] - s2[4];
420	block[5] = s1[5] - s2[5];
421	block[6] = s1[6] - s2[6];
422	block[7] = s1[7] - s2[7];
423	s1 += stride;
424	s2 += stride;
425	block += 8;
426	}
427	}
428
429
430	static void put_pixels_clamped_c(const DCTELEM block, uint8_t restrict pixels,
431	int line_size)
432	{
433	int i;
434	uint8_t *cm = cropTbl + MAX_NEG_CROP;
435
436	/* read the pixels */
437	for(i=0;i<8;i++) {
438	pixels[0] = cm[block[0]];
439	pixels[1] = cm[block[1]];
440	pixels[2] = cm[block[2]];
441	pixels[3] = cm[block[3]];
442	pixels[4] = cm[block[4]];
443	pixels[5] = cm[block[5]];
444	pixels[6] = cm[block[6]];
445	pixels[7] = cm[block[7]];
446
447	pixels += line_size;
448	block += 8;
449	}
450	}
451
452	static void put_pixels_clamped4_c(const DCTELEM block, uint8_t restrict pixels,
453	int line_size)
454	{
455	int i;
456	uint8_t *cm = cropTbl + MAX_NEG_CROP;
457
458	/* read the pixels */
459	for(i=0;i<4;i++) {
460	pixels[0] = cm[block[0]];
461	pixels[1] = cm[block[1]];
462	pixels[2] = cm[block[2]];
463	pixels[3] = cm[block[3]];
464
465	pixels += line_size;
466	block += 8;
467	}
468	}
469
470	static void put_pixels_clamped2_c(const DCTELEM block, uint8_t restrict pixels,
471	int line_size)
472	{
473	int i;
474	uint8_t *cm = cropTbl + MAX_NEG_CROP;
475
476	/* read the pixels */
477	for(i=0;i<2;i++) {
478	pixels[0] = cm[block[0]];
479	pixels[1] = cm[block[1]];
480
481	pixels += line_size;
482	block += 8;
483	}
484	}
485
486	static void put_signed_pixels_clamped_c(const DCTELEM *block,
487	uint8_t *restrict pixels,
488	int line_size)
489	{
490	int i, j;
491
492	for (i = 0; i < 8; i++) {
493	for (j = 0; j < 8; j++) {
494	if (*block < -128)
495	*pixels = 0;
496	else if (*block > 127)
497	*pixels = 255;
498	else
499	pixels = (uint8_t)(block + 128);
500	block++;
501	pixels++;
502	}
503	pixels += (line_size - 8);
504	}
505	}
506
507	static void add_pixels_clamped_c(const DCTELEM block, uint8_t restrict pixels,
508	int line_size)
509	{
510	int i;
511	uint8_t *cm = cropTbl + MAX_NEG_CROP;
512
513	/* read the pixels */
514	for(i=0;i<8;i++) {
515	pixels[0] = cm[pixels[0] + block[0]];
516	pixels[1] = cm[pixels[1] + block[1]];
517	pixels[2] = cm[pixels[2] + block[2]];
518	pixels[3] = cm[pixels[3] + block[3]];
519	pixels[4] = cm[pixels[4] + block[4]];
520	pixels[5] = cm[pixels[5] + block[5]];
521	pixels[6] = cm[pixels[6] + block[6]];
522	pixels[7] = cm[pixels[7] + block[7]];
523	pixels += line_size;
524	block += 8;
525	}
526	}
527
528	static void add_pixels_clamped4_c(const DCTELEM block, uint8_t restrict pixels,
529	int line_size)
530	{
531	int i;
532	uint8_t *cm = cropTbl + MAX_NEG_CROP;
533
534	/* read the pixels */
535	for(i=0;i<4;i++) {
536	pixels[0] = cm[pixels[0] + block[0]];
537	pixels[1] = cm[pixels[1] + block[1]];
538	pixels[2] = cm[pixels[2] + block[2]];
539	pixels[3] = cm[pixels[3] + block[3]];
540	pixels += line_size;
541	block += 8;
542	}
543	}
544
545	static void add_pixels_clamped2_c(const DCTELEM block, uint8_t restrict pixels,
546	int line_size)
547	{
548	int i;
549	uint8_t *cm = cropTbl + MAX_NEG_CROP;
550
551	/* read the pixels */
552	for(i=0;i<2;i++) {
553	pixels[0] = cm[pixels[0] + block[0]];
554	pixels[1] = cm[pixels[1] + block[1]];
555	pixels += line_size;
556	block += 8;
557	}
558	}
559
560	static void add_pixels8_c(uint8_t restrict pixels, DCTELEM block, int line_size)
561	{
562	int i;
563	for(i=0;i<8;i++) {
564	pixels[0] += block[0];
565	pixels[1] += block[1];
566	pixels[2] += block[2];
567	pixels[3] += block[3];
568	pixels[4] += block[4];
569	pixels[5] += block[5];
570	pixels[6] += block[6];
571	pixels[7] += block[7];
572	pixels += line_size;
573	block += 8;
574	}
575	}
576
577	static void add_pixels4_c(uint8_t restrict pixels, DCTELEM block, int line_size)
578	{
579	int i;
580	for(i=0;i<4;i++) {
581	pixels[0] += block[0];
582	pixels[1] += block[1];
583	pixels[2] += block[2];
584	pixels[3] += block[3];
585	pixels += line_size;
586	block += 4;
587	}
588	}
589
590	#if 0
591
/*
 * 64-bit variant of the pixel-op generator (this copy sits in the disabled
 * "#if 0" branch). For a name prefix OPNAME and a store operation OP(dst, val)
 * it emits 8-pixel-wide copy / half-pel interpolation helpers that work on
 * one uint64_t per row, plus 16-pixel wrappers via CALL_2X_PIXELS.
 *
 *  - x2 / y2:  average of two rows of 8 pixels; the "no_rnd" forms use
 *              (a&b) + (((a^b)&0xFE..)>>1), the rounding forms use
 *              (a|b) - (((a^b)&0xFE..)>>1).
 *  - xy2:      four-sample average computed with the low 2 bits and the high
 *              6 bits of every byte handled separately; the bias added to the
 *              low part is 0x02 per byte (rounding) or 0x01 (no_rnd). The
 *              loop handles two output rows per iteration.
 *
 * NOTE(review): the plain copy helper is emitted as OPNAME##_pixels while
 * CALL_2X_PIXELS below refers to OPNAME##_pixels_c -- left as found; confirm
 * before enabling this branch.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
730
/* Byte-wise average of two 64-bit words, rounding up, stored back into a.
   The 0xFE mask drops each byte's low bit before the shift so nothing leaks
   into the neighbouring byte. Note that a is evaluated more than once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
732	#else // 64 bit variant
733
734	#define PIXOP2(OPNAME, OP) \
735	static void OPNAME ## _pixels2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
736	int i;\
737	for(i=0; i<h; i++){\
738	OP(((uint16_t)(block )), LD16(pixels ));\
739	pixels+=line_size;\
740	block +=line_size;\
741	}\
742	}\
743	static void OPNAME ## _pixels4_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
744	int i;\
745	for(i=0; i<h; i++){\
746	OP(((uint32_t)(block )), LD32(pixels ));\
747	pixels+=line_size;\
748	block +=line_size;\
749	}\
750	}\
751	static void OPNAME ## _pixels8_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
752	int i;\
753	for(i=0; i<h; i++){\
754	OP(((uint32_t)(block )), LD32(pixels ));\
755	OP(((uint32_t)(block+4)), LD32(pixels+4));\
756	pixels+=line_size;\
757	block +=line_size;\
758	}\
759	}\
760	static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
761	OPNAME ## _pixels8_c(block, pixels, line_size, h);\
762	}\
763	\
764	static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t dst, const uint8_t src1, const uint8_t *src2, int dst_stride, \
765	int src_stride1, int src_stride2, int h){\
766	int i;\
767	for(i=0; i<h; i++){\
768	uint32_t a,b;\
769	a= LD32(&src1[i*src_stride1 ]);\
770	b= LD32(&src2[i*src_stride2 ]);\
771	OP(((uint32_t)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
772	a= LD32(&src1[i*src_stride1+4]);\
773	b= LD32(&src2[i*src_stride2+4]);\
774	OP(((uint32_t)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
775	}\
776	}\
777	\
778	static inline void OPNAME ## _pixels8_l2(uint8_t dst, const uint8_t src1, const uint8_t *src2, int dst_stride, \
779	int src_stride1, int src_stride2, int h){\
780	int i;\
781	for(i=0; i<h; i++){\
782	uint32_t a,b;\
783	a= LD32(&src1[i*src_stride1 ]);\
784	b= LD32(&src2[i*src_stride2 ]);\
785	OP(((uint32_t)&dst[i*dst_stride ]), rnd_avg32(a, b));\
786	a= LD32(&src1[i*src_stride1+4]);\
787	b= LD32(&src2[i*src_stride2+4]);\
788	OP(((uint32_t)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
789	}\
790	}\
791	\
792	static inline void OPNAME ## _pixels4_l2(uint8_t dst, const uint8_t src1, const uint8_t *src2, int dst_stride, \
793	int src_stride1, int src_stride2, int h){\
794	int i;\
795	for(i=0; i<h; i++){\
796	uint32_t a,b;\
797	a= LD32(&src1[i*src_stride1 ]);\
798	b= LD32(&src2[i*src_stride2 ]);\
799	OP(((uint32_t)&dst[i*dst_stride ]), rnd_avg32(a, b));\
800	}\
801	}\
802	\
803	static inline void OPNAME ## _pixels2_l2(uint8_t dst, const uint8_t src1, const uint8_t *src2, int dst_stride, \
804	int src_stride1, int src_stride2, int h){\
805	int i;\
806	for(i=0; i<h; i++){\
807	uint32_t a,b;\
808	a= LD16(&src1[i*src_stride1 ]);\
809	b= LD16(&src2[i*src_stride2 ]);\
810	OP(((uint16_t)&dst[i*dst_stride ]), rnd_avg32(a, b));\
811	}\
812	}\
813	\
814	static inline void OPNAME ## _pixels16_l2(uint8_t dst, const uint8_t src1, const uint8_t *src2, int dst_stride, \
815	int src_stride1, int src_stride2, int h){\
816	OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
817	OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
818	}\
819	\
820	static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t dst, const uint8_t src1, const uint8_t *src2, int dst_stride, \
821	int src_stride1, int src_stride2, int h){\
822	OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
823	OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
824	}\
825	\
826	static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
827	OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
828	}\
829	\
830	static inline void OPNAME ## _pixels8_x2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
831	OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832	}\
833	\
834	static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
835	OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
836	}\
837	\
838	static inline void OPNAME ## _pixels8_y2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
839	OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840	}\
841	\
842	static inline void OPNAME ## _pixels8_l4(uint8_t dst, const uint8_t src1, uint8_t src2, uint8_t src3, uint8_t *src4,\
843	int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
844	int i;\
845	for(i=0; i<h; i++){\
846	uint32_t a, b, c, d, l0, l1, h0, h1;\
847	a= LD32(&src1[i*src_stride1]);\
848	b= LD32(&src2[i*src_stride2]);\
849	c= LD32(&src3[i*src_stride3]);\
850	d= LD32(&src4[i*src_stride4]);\
851	l0= (a&0x03030303UL)\
852	+ (b&0x03030303UL)\
853	+ 0x02020202UL;\
854	h0= ((a&0xFCFCFCFCUL)>>2)\
855	+ ((b&0xFCFCFCFCUL)>>2);\
856	l1= (c&0x03030303UL)\
857	+ (d&0x03030303UL);\
858	h1= ((c&0xFCFCFCFCUL)>>2)\
859	+ ((d&0xFCFCFCFCUL)>>2);\
860	OP(((uint32_t)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
861	a= LD32(&src1[i*src_stride1+4]);\
862	b= LD32(&src2[i*src_stride2+4]);\
863	c= LD32(&src3[i*src_stride3+4]);\
864	d= LD32(&src4[i*src_stride4+4]);\
865	l0= (a&0x03030303UL)\
866	+ (b&0x03030303UL)\
867	+ 0x02020202UL;\
868	h0= ((a&0xFCFCFCFCUL)>>2)\
869	+ ((b&0xFCFCFCFCUL)>>2);\
870	l1= (c&0x03030303UL)\
871	+ (d&0x03030303UL);\
872	h1= ((c&0xFCFCFCFCUL)>>2)\
873	+ ((d&0xFCFCFCFCUL)>>2);\
874	OP(((uint32_t)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
875	}\
876	}\
877	\
878	static inline void OPNAME ## _pixels4_x2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
879	OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
880	}\
881	\
882	static inline void OPNAME ## _pixels4_y2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
883	OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
884	}\
885	\
886	static inline void OPNAME ## _pixels2_x2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
887	OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
888	}\
889	\
890	static inline void OPNAME ## _pixels2_y2_c(uint8_t block, const uint8_t pixels, int line_size, int h){\
891	OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
892	}\
893	\
894	static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t dst, const uint8_t src1, uint8_t src2, uint8_t src3, uint8_t *src4,\
895	int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
896	int i;\
897	for(i=0; i<h; i++){\
898	uint32_t a, b, c, d, l0, l1, h0, h1;\
899	a= LD32(&src1[i*src_stride1]);\
900	b= LD32(&src2[i*src_stride2]);\
901	c= LD32(&src3[i*src_stride3]);\
902	d= LD32(&src4[i*src_stride4]);\
903	l0= (a&0x03030303UL)\
904	+ (b&0x03030303UL)\
905	+ 0x01010101UL;\
906	h0= ((a&0xFCFCFCFCUL)>>2)\
907	+ ((b&0xFCFCFCFCUL)>>2);\
908	l1= (c&0x03030303UL)\
909	+ (d&0x03030303UL);\
910	h1= ((c&0xFCFCFCFCUL)>>2)\
911	+ ((d&0xFCFCFCFCUL)>>2);\
912	OP(((uint32_t)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
913	a= LD32(&src1[i*src_stride1+4]);\
914	b= LD32(&src2[i*src_stride2+4]);\
915	c= LD32(&src3[i*src_stride3+4]);\
916	d= LD32(&src4[i*src_stride4+4]);\
917	l0= (a&0x03030303UL)\
918	+ (b&0x03030303UL)\
919	+ 0x01010101UL;\
920	h0= ((a&0xFCFCFCFCUL)>>2)\
921	+ ((b&0xFCFCFCFCUL)>>2);\
922	l1= (c&0x03030303UL)\
923	+ (d&0x03030303UL);\
924	h1= ((c&0xFCFCFCFCUL)>>2)\
925	+ ((d&0xFCFCFCFCUL)>>2);\
926	OP(((uint32_t)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
927	}\
928	}\
929	static inline void OPNAME ## _pixels16_l4(uint8_t dst, const uint8_t src1, uint8_t src2, uint8_t src3, uint8_t *src4,\
930	int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
931	OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
932	OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
933	}\
934	static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t dst, const uint8_t src1, uint8_t src2, uint8_t src3, uint8_t *src4,\
935	int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936	OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937	OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938	}\
939	\
940	static inline void OPNAME ## _pixels2_xy2_c(uint8_t block, const uint8_t pixels, int line_size, int h)\
941	{\
942	int i, a0, b0, a1, b1;\
943	a0= pixels[0];\
944	b0= pixels[1] + 2;\
945	a0 += b0;\
946	b0 += pixels[2];\
947	\
948	pixels+=line_size;\
949	for(i=0; i<h; i+=2){\
950	a1= pixels[0];\
951	b1= pixels[1];\
952	a1 += b1;\
953	b1 += pixels[2];\
954	\
955	block[0]= (a1+a0)>>2; /* FIXME non put */\
956	block[1]= (b1+b0)>>2;\
957	\
958	pixels+=line_size;\
959	block +=line_size;\
960	\
961	a0= pixels[0];\
962	b0= pixels[1] + 2;\
963	a0 += b0;\
964	b0 += pixels[2];\
965	\
966	block[0]= (a1+a0)>>2;\
967	block[1]= (b1+b0)>>2;\
968	pixels+=line_size;\
969	block +=line_size;\
970	}\
971	}\
972	\
973	static inline void OPNAME ## _pixels4_xy2_c(uint8_t block, const uint8_t pixels, int line_size, int h)\
974	{\
975	int i;\
976	const uint32_t a= LD32(pixels );\
977	const uint32_t b= LD32(pixels+1);\
978	uint32_t l0= (a&0x03030303UL)\
979	+ (b&0x03030303UL)\
980	+ 0x02020202UL;\
981	uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
982	+ ((b&0xFCFCFCFCUL)>>2);\
983	uint32_t l1,h1;\
984	\
985	pixels+=line_size;\
986	for(i=0; i<h; i+=2){\
987	uint32_t a= LD32(pixels );\
988	uint32_t b= LD32(pixels+1);\
989	l1= (a&0x03030303UL)\
990	+ (b&0x03030303UL);\
991	h1= ((a&0xFCFCFCFCUL)>>2)\
992	+ ((b&0xFCFCFCFCUL)>>2);\
993	OP(((uint32_t)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
994	pixels+=line_size;\
995	block +=line_size;\
996	a= LD32(pixels );\
997	b= LD32(pixels+1);\
998	l0= (a&0x03030303UL)\
999	+ (b&0x03030303UL)\
1000	+ 0x02020202UL;\
1001	h0= ((a&0xFCFCFCFCUL)>>2)\
1002	+ ((b&0xFCFCFCFCUL)>>2);\
1003	OP(((uint32_t)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004	pixels+=line_size;\
1005	block +=line_size;\
1006	}\
1007	}\
1008	\
1009	static inline void OPNAME ## _pixels8_xy2_c(uint8_t block, const uint8_t pixels, int line_size, int h)\
1010	{\
1011	int j;\
1012	for(j=0; j<2; j++){\
1013	int i;\
1014	const uint32_t a= LD32(pixels );\
1015	const uint32_t b= LD32(pixels+1);\
1016	uint32_t l0= (a&0x03030303UL)\
1017	+ (b&0x03030303UL)\
1018	+ 0x02020202UL;\
1019	uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020	+ ((b&0xFCFCFCFCUL)>>2);\
1021	uint32_t l1,h1;\
1022	\
1023	pixels+=line_size;\
1024	for(i=0; i<h; i+=2){\
1025	uint32_t a= LD32(pixels );\
1026	uint32_t b= LD32(pixels+1);\
1027	l1= (a&0x03030303UL)\
1028	+ (b&0x03030303UL);\
1029	h1= ((a&0xFCFCFCFCUL)>>2)\
1030	+ ((b&0xFCFCFCFCUL)>>2);\
1031	OP(((uint32_t)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032	pixels+=line_size;\
1033	block +=line_size;\
1034	a= LD32(pixels );\
1035	b= LD32(pixels+1);\
1036	l0= (a&0x03030303UL)\
1037	+ (b&0x03030303UL)\
1038	+ 0x02020202UL;\
1039	h0= ((a&0xFCFCFCFCUL)>>2)\
1040	+ ((b&0xFCFCFCFCUL)>>2);\
1041	OP(((uint32_t)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042	pixels+=line_size;\
1043	block +=line_size;\
1044	}\
1045	pixels+=4-line_size*(h+1);\
1046	block +=4-line_size*h;\
1047	}\
1048	}\
1049	\
1050	static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t block, const uint8_t pixels, int line_size, int h)\
1051	{\
1052	int j;\
1053	for(j=0; j<2; j++){\
1054	int i;\
1055	const uint32_t a= LD32(pixels );\
1056	const uint32_t b= LD32(pixels+1);\
1057	uint32_t l0= (a&0x03030303UL)\
1058	+ (b&0x03030303UL)\
1059	+ 0x01010101UL;\
1060	uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061	+ ((b&0xFCFCFCFCUL)>>2);\
1062	uint32_t l1,h1;\
1063	\
1064	pixels+=line_size;\
1065	for(i=0; i<h; i+=2){\
1066	uint32_t a= LD32(pixels );\
1067	uint32_t b= LD32(pixels+1);\
1068	l1= (a&0x03030303UL)\
1069	+ (b&0x03030303UL);\
1070	h1= ((a&0xFCFCFCFCUL)>>2)\
1071	+ ((b&0xFCFCFCFCUL)>>2);\
1072	OP(((uint32_t)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073	pixels+=line_size;\
1074	block +=line_size;\
1075	a= LD32(pixels );\
1076	b= LD32(pixels+1);\
1077	l0= (a&0x03030303UL)\
1078	+ (b&0x03030303UL)\
1079	+ 0x01010101UL;\
1080	h0= ((a&0xFCFCFCFCUL)>>2)\
1081	+ ((b&0xFCFCFCFCUL)>>2);\
1082	OP(((uint32_t)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083	pixels+=line_size;\
1084	block +=line_size;\
1085	}\
1086	pixels+=4-line_size*(h+1);\
1087	block +=4-line_size*h;\
1088	}\
1089	}\
1090	\
1091	CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1092	CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093	CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094	CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095	CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1096	CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097	CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098	CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1099
1100	#define op_avg(a, b) a = rnd_avg32(a, b)
1101	#endif
1102	#define op_put(a, b) a = b
1103
1104	PIXOP2(avg, op_avg)
1105	PIXOP2(put, op_put)
1106	#undef op_avg
1107	#undef op_put
1108
/* Rounded average of two values: (a + b + 1) / 2.
 * Arguments are parenthesized so that operands containing lower-precedence
 * operators (&, |, >>, ?:) are evaluated as the caller wrote them. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded average of four values: (a + b + c + d + 2) / 4. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1111
/**
 * Single-stride wrapper around put_no_rnd_pixels16_l2().
 *
 * Forwards to the three-stride helper using the same @p stride for the
 * destination and for both sources.
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride shared by dst, a and b
 * @param h      number of rows to process
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1115
/**
 * Single-stride wrapper around put_no_rnd_pixels8_l2().
 *
 * Forwards to the three-stride helper using the same @p stride for the
 * destination and for both sources.
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride shared by dst, a and b
 * @param h      number of rows to process
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1119
/**
 * Bilinear interpolation of an 8-pixel-wide block with 1/16-pel weights.
 *
 * Each output pixel is the weighted sum of the 2x2 source neighbourhood
 * (src[x], src[x+1], src[x+stride], src[x+stride+1]); the four weights sum
 * to 256, hence the final shift by 8.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; 9 bytes are read per row and h+1 rows are touched
 * @param stride  line stride of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction in 1/16 units
 * @param y16     vertical fraction in 1/16 units
 * @param rounder value added before the shift by 8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1142
/**
 * Warp an 8-pixel-wide block using a per-pixel affine source position and
 * bilinear interpolation.
 *
 * For every output pixel the source position (vx, vy) is tracked in fixed
 * point: the top bits (after >>16) hold the position in units of 1/(1<<shift)
 * pixel, which is then split into an integer sample position and a fraction.
 * Positions outside [0, width-1) x [0, height-1) are clamped with clip() and
 * interpolated only along the axis that is still inside the picture.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source picture
 * @param stride  line stride of both dst and src
 * @param h       number of rows to produce
 * @param ox,oy   source position of the first pixel of the first row
 * @param dxx,dyx position increment per output column
 * @param dxy,dyy position increment per output row
 * @param shift   log2 of the sub-pixel resolution
 * @param r       value added before the final shift by 2*shift
 * @param width   source width; samples up to index width-1 are read
 * @param height  source height; rows up to index height-1 are read
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* From here on width/height are the largest valid x/y index. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* The unsigned compare rejects negative positions as well. */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* Fully inside: 2x2 bilinear interpolation. */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* Row clamped: interpolate horizontally only. */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* Column clamped: interpolate vertically only. */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* Both clamped: copy the nearest edge sample. */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1200
/**
 * Third-pel position (0,0): plain block copy.
 *
 * Dispatches to the fixed-width put_pixelsN_c() copy routine matching
 * @p width. Widths other than 2, 4, 8 and 16 do nothing.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width (2, 4, 8 or 16)
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: leave dst untouched */
    }
}
1209
/**
 * Third-pel interpolation, horizontal weights 2:1 (left:right).
 *
 * dst = (2*src[j] + src[j+1]) / 3, computed as a multiply by 683 and a
 * shift by 11 (683/2048 is approximately 1/3). Reads width+1 bytes per row.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1220
/**
 * Third-pel interpolation, horizontal weights 1:2 (left:right).
 *
 * dst = (src[j] + 2*src[j+1]) / 3, computed as a multiply by 683 and a
 * shift by 11 (683/2048 is approximately 1/3). Reads width+1 bytes per row.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1231
/**
 * Third-pel interpolation, vertical weights 2:1 (top:bottom).
 *
 * dst = (2*src[j] + src[j+stride]) / 3, computed as a multiply by 683 and
 * a shift by 11 (683/2048 is approximately 1/3). Reads height+1 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1242
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 4/3/3/2
 * (top-left, top-right, bottom-left, bottom-right).
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15 (2731/32768 is approximately 1/12). Reads width+1 bytes per row and
 * height+1 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1253
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 3/2/4/3
 * (top-left, top-right, bottom-left, bottom-right).
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15 (2731/32768 is approximately 1/12). Reads width+1 bytes per row and
 * height+1 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1264
/**
 * Third-pel interpolation, vertical weights 1:2 (top:bottom).
 *
 * dst = (src[j] + 2*src[j+stride]) / 3, computed as a multiply by 683 and
 * a shift by 11 (683/2048 is approximately 1/3). Reads height+1 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1275
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 3/4/2/3
 * (top-left, top-right, bottom-left, bottom-right).
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15 (2731/32768 is approximately 1/12). Reads width+1 bytes per row and
 * height+1 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1286
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 2/3/3/4
 * (top-left, top-right, bottom-left, bottom-right).
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15 (2731/32768 is approximately 1/12). Reads width+1 bytes per row and
 * height+1 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1297
/**
 * Third-pel position (0,0), averaging flavour.
 *
 * Dispatches to the fixed-width avg_pixelsN_c() routine matching @p width.
 * Widths other than 2, 4, 8 and 16 do nothing.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width (2, 4, 8 or 16)
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: leave dst untouched */
    }
}
1306
/**
 * Third-pel interpolation, horizontal weights 2:1 (left:right), averaged
 * with the existing destination.
 *
 * The prediction (2*src[j] + src[j+1]) / 3 is formed with a multiply by
 * 683 and a shift by 11, then combined with dst[j] as a rounded mean.
 * Reads width+1 bytes per row.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1317
/**
 * Third-pel interpolation, horizontal weights 1:2 (left:right), averaged
 * with the existing destination.
 *
 * The prediction (src[j] + 2*src[j+1]) / 3 is formed with a multiply by
 * 683 and a shift by 11, then combined with dst[j] as a rounded mean.
 * Reads width+1 bytes per row.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1328
/**
 * Third-pel interpolation, vertical weights 2:1 (top:bottom), averaged
 * with the existing destination.
 *
 * The prediction (2*src[j] + src[j+stride]) / 3 is formed with a multiply
 * by 683 and a shift by 11, then combined with dst[j] as a rounded mean.
 * Reads height+1 rows.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1339
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 4/3/3/2
 * (top-left, top-right, bottom-left, bottom-right), averaged with the
 * existing destination.
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15, then combined with dst[j] as a rounded mean. Reads width+1 bytes per
 * row and height+1 rows.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1350
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 3/2/4/3
 * (top-left, top-right, bottom-left, bottom-right), averaged with the
 * existing destination.
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15, then combined with dst[j] as a rounded mean. Reads width+1 bytes per
 * row and height+1 rows.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1361
/**
 * Third-pel interpolation, vertical weights 1:2 (top:bottom), averaged
 * with the existing destination.
 *
 * The prediction (src[j] + 2*src[j+stride]) / 3 is formed with a multiply
 * by 683 and a shift by 11, then combined with dst[j] as a rounded mean.
 * Reads height+1 rows.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1372
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 3/4/2/3
 * (top-left, top-right, bottom-left, bottom-right), averaged with the
 * existing destination.
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15, then combined with dst[j] as a rounded mean. Reads width+1 bytes per
 * row and height+1 rows.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1383
/**
 * Third-pel interpolation over a 2x2 neighbourhood, weights 2/3/3/4
 * (top-left, top-right, bottom-left, bottom-right), averaged with the
 * existing destination.
 *
 * The weighted sum is divided by 12 via a multiply by 2731 and a shift by
 * 15, then combined with dst[j] as a rounded mean. Reads width+1 bytes per
 * row and height+1 rows.
 *
 * @param dst    destination block (also an input to the average)
 * @param src    source block
 * @param stride line stride of both dst and src
 * @param width  block width in pixels
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
/* Disabled generator for fixed-width put_tpel wrappers: TPEL_WIDTH(w)
 * defines put_tpel_pixels<w>_mcXY_c() for each third-pel position, each
 * forwarding to the generic put_tpel_pixels_mcXY_c() with width = w.
 * Kept under #if 0 as in the original; nothing here is compiled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1415
/**
 * Generator for the H.264 chroma motion-compensation functions.
 *
 * H264_CHROMA_MC(OPNAME, OP) defines OPNAME##h264_chroma_mc{2,4,8}_c(), each
 * producing a 2-, 4- or 8-pixel-wide block by bilinear interpolation of the
 * 2x2 source neighbourhood with 1/8-pel weights:
 *   A=(8-x)(8-y)  B=x(8-y)  C=(8-x)y  D=xy      (A+B+C+D == 64)
 * The unscaled weighted sum is handed to OP(dst_pixel, sum); rounding and
 * the division by 64 are the job of OP.
 *
 * Every generated function reads width+1 bytes per row and h+1 rows of src,
 * and asserts 0 <= x,y < 8.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* Store operations for the weighted sum b (scaled by 64): both round with
 * +32 and shift by 6; op_avg additionally takes the rounded mean with the
 * value already in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1486
/**
 * Copy a block 2 bytes wide and h rows high.
 *
 * Each row is moved with one LD16/ST16 pair.
 *
 * @param dst       destination
 * @param src       source
 * @param dstStride line stride of dst
 * @param srcStride line stride of src
 * @param h         number of rows
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST16(dst   , LD16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1497
/**
 * Copy a block 4 bytes wide and h rows high.
 *
 * Each row is moved with one LD32/ST32 pair.
 *
 * @param dst       destination
 * @param src       source
 * @param dstStride line stride of dst
 * @param srcStride line stride of src
 * @param h         number of rows
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1508
/**
 * Copy a block 8 bytes wide and h rows high.
 *
 * Each row is moved as two 32-bit words via LD32/ST32.
 *
 * @param dst       destination
 * @param src       source
 * @param dstStride line stride of dst
 * @param srcStride line stride of src
 * @param h         number of rows
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1520
/**
 * Copy a block 16 bytes wide and h rows high.
 *
 * Each row is moved as four 32-bit words via LD32/ST32.
 *
 * @param dst       destination
 * @param src       source
 * @param dstStride line stride of dst
 * @param srcStride line stride of src
 * @param h         number of rows
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
1534
/**
 * Copy a block 17 bytes wide and h rows high.
 *
 * Each row is moved as four 32-bit words via LD32/ST32 plus one trailing
 * byte (column 16).
 *
 * @param dst       destination
 * @param src       source
 * @param dstStride line stride of dst
 * @param srcStride line stride of src
 * @param h         number of rows
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
1549
/**
 * Copy a block 9 bytes wide and h rows high.
 *
 * Each row is moved as two 32-bit words via LD32/ST32 plus one trailing
 * byte (column 8).
 *
 * @param dst       destination
 * @param src       source
 * @param dstStride line stride of dst
 * @param srcStride line stride of src
 * @param h         number of rows
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
1562
1563
1564	#define QPEL_MC(r, OPNAME, RND, OP) \
1565	static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t dst, uint8_t src, int dstStride, int srcStride, int h){\
1566	uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1567	int i;\
1568	for(i=0; i<h; i++)\
1569	{\
1570	OP(dst[0], (src[0]+src[1])20 - (src[0]+src[2])6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571	OP(dst[1], (src[1]+src[2])20 - (src[0]+src[3])6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572	OP(dst[2], (src[2]+src[3])20 - (src[1]+src[4])6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573	OP(dst[3], (src[3]+src[4])20 - (src[2]+src[5])6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574	OP(dst[4], (src[4]+src[5])20 - (src[3]+src[6])6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575	OP(dst[5], (src[5]+src[6])20 - (src[4]+src[7])6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576	OP(dst[6], (src[6]+src[7])20 - (src[5]+src[8])6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577	OP(dst[7], (src[7]+src[8])20 - (src[6]+src[8])6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1578	dst+=dstStride;\
1579	src+=srcStride;\
1580	}\
1581	}\
1582	\
1583	static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t dst, uint8_t src, int dstStride, int srcStride){\
1584	const int w=8;\
1585	uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1586	int i;\
1587	for(i=0; i<w; i++)\
1588	{\
1589	const int src0= src[0*srcStride];\
1590	const int src1= src[1*srcStride];\
1591	const int src2= src[2*srcStride];\
1592	const int src3= src[3*srcStride];\
1593	const int src4= src[4*srcStride];\
1594	const int src5= src[5*srcStride];\
1595	const int src6= src[6*srcStride];\
1596	const int src7= src[7*srcStride];\
1597	const int src8= src[8*srcStride];\
1598	OP(dst[0dstStride], (src0+src1)20 - (src0+src2)6 + (src1+src3)3 - (src2+src4));\
1599	OP(dst[1dstStride], (src1+src2)20 - (src0+src3)6 + (src0+src4)3 - (src1+src5));\
1600	OP(dst[2dstStride], (src2+src3)20 - (src1+src4)6 + (src0+src5)3 - (src0+src6));\
1601	OP(dst[3dstStride], (src3+src4)20 - (src2+src5)6 + (src1+src6)3 - (src0+src7));\
1602	OP(dst[4dstStride], (src4+src5)20 - (src3+src6)6 + (src2+src7)3 - (src1+src8));\
1603	OP(dst[5dstStride], (src5+src6)20 - (src4+src7)6 + (src3+src8)3 - (src2+src8));\
1604	OP(dst[6dstStride], (src6+src7)20 - (src5+src8)6 + (src4+src8)3 - (src3+src7));\
1605	OP(dst[7dstStride], (src7+src8)20 - (src6+src8)6 + (src5+src7)3 - (src4+src6));\
1606	dst++;\
1607	src++;\
1608	}\
1609	}\
1610	\
1611	static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t dst, uint8_t src, int dstStride, int srcStride, int h){\
1612	uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1613	int i;\
1614	\
1615	for(i=0; i<h; i++)\
1616	{\
1617	OP(dst[ 0], (src[ 0]+src[ 1])20 - (src[ 0]+src[ 2])6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618	OP(dst[ 1], (src[ 1]+src[ 2])20 - (src[ 0]+src[ 3])6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619	OP(dst[ 2], (src[ 2]+src[ 3])20 - (src[ 1]+src[ 4])6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620	OP(dst[ 3], (src[ 3]+src[ 4])20 - (src[ 2]+src[ 5])6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621	OP(dst[ 4], (src[ 4]+src[ 5])20 - (src[ 3]+src[ 6])6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622	OP(dst[ 5], (src[ 5]+src[ 6])20 - (src[ 4]+src[ 7])6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623	OP(dst[ 6], (src[ 6]+src[ 7])20 - (src[ 5]+src[ 8])6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624	OP(dst[ 7], (src[ 7]+src[ 8])20 - (src[ 6]+src[ 9])6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625	OP(dst[ 8], (src[ 8]+src[ 9])20 - (src[ 7]+src[10])6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626	OP(dst[ 9], (src[ 9]+src[10])20 - (src[ 8]+src[11])6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627	OP(dst[10], (src[10]+src[11])20 - (src[ 9]+src[12])6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628	OP(dst[11], (src[11]+src[12])20 - (src[10]+src[13])6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629	OP(dst[12], (src[12]+src[13])20 - (src[11]+src[14])6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630	OP(dst[13], (src[13]+src[14])20 - (src[12]+src[15])6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631	OP(dst[14], (src[14]+src[15])20 - (src[13]+src[16])6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632	OP(dst[15], (src[15]+src[16])20 - (src[14]+src[16])6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1633	dst+=dstStride;\
1634	src+=srcStride;\
1635	}\
1636	}\
1637	\
1638	static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t dst, uint8_t src, int dstStride, int srcStride){\
1639	uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1640	int i;\
1641	const int w=16;\
1642	for(i=0; i<w; i++)\
1643	{\
1644	const int src0= src[0*srcStride];\
1645	const int src1= src[1*srcStride];\
1646	const int src2= src[2*srcStride];\
1647	const int src3= src[3*srcStride];\
1648	const int src4= src[4*srcStride];\
1649	const int src5= src[5*srcStride];\
1650	const int src6= src[6*srcStride];\
1651	const int src7= src[7*srcStride];\
1652	const int src8= src[8*srcStride];\
1653	const int src9= src[9*srcStride];\
1654	const int src10= src[10*srcStride];\
1655	const int src11= src[11*srcStride];\
1656	const int src12= src[12*srcStride];\
1657	const int src13= src[13*srcStride];\
1658	const int src14= src[14*srcStride];\
1659	const int src15= src[15*srcStride];\
1660	const int src16= src[16*srcStride];\
1661	OP(dst[ 0dstStride], (src0 +src1 )20 - (src0 +src2 )6 + (src1 +src3 )3 - (src2 +src4 ));\
1662	OP(dst[ 1dstStride], (src1 +src2 )20 - (src0 +src3 )6 + (src0 +src4 )3 - (src1 +src5 ));\
1663	OP(dst[ 2dstStride], (src2 +src3 )20 - (src1 +src4 )6 + (src0 +src5 )3 - (src0 +src6 ));\
1664	OP(dst[ 3dstStride], (src3 +src4 )20 - (src2 +src5 )6 + (src1 +src6 )3 - (src0 +src7 ));\
1665	OP(dst[ 4dstStride], (src4 +src5 )20 - (src3 +src6 )6 + (src2 +src7 )3 - (src1 +src8 ));\
1666	OP(dst[ 5dstStride], (src5 +src6 )20 - (src4 +src7 )6 + (src3 +src8 )3 - (src2 +src9 ));\
1667	OP(dst[ 6dstStride], (src6 +src7 )20 - (src5 +src8 )6 + (src4 +src9 )3 - (src3 +src10));\
1668	OP(dst[ 7dstStride], (src7 +src8 )20 - (src6 +src9 )6 + (src5 +src10)3 - (src4 +src11));\
1669	OP(dst[ 8dstStride], (src8 +src9 )20 - (src7 +src10)6 + (src6 +src11)3 - (src5 +src12));\
1670	OP(dst[ 9dstStride], (src9 +src10)20 - (src8 +src11)6 + (src7 +src12)3 - (src6 +src13));\
1671	OP(dst[10dstStride], (src10+src11)20 - (src9 +src12)6 + (src8 +src13)3 - (src7 +src14));\
1672	OP(dst[11dstStride], (src11+src12)20 - (src10+src13)6 + (src9 +src14)3 - (src8 +src15));\
1673	OP(dst[12dstStride], (src12+src13)20 - (src11+src14)6 + (src10+src15)3 - (src9 +src16));\
1674	OP(dst[13dstStride], (src13+src14)20 - (src12+src15)6 + (src11+src16)3 - (src10+src16));\
1675	OP(dst[14dstStride], (src14+src15)20 - (src13+src16)6 + (src12+src16)3 - (src11+src15));\
1676	OP(dst[15dstStride], (src15+src16)20 - (src14+src16)6 + (src13+src15)3 - (src12+src14));\
1677	dst++;\
1678	src++;\
1679	}\
1680	}\
1681	\
1682	static void OPNAME ## qpel8_mc00_c (uint8_t dst, uint8_t src, int stride){\
1683	OPNAME ## pixels8_c(dst, src, stride, 8);\
1684	}\
1685	\
1686	static void OPNAME ## qpel8_mc10_c(uint8_t dst, uint8_t src, int stride){\
1687	uint8_t half[64];\
1688	put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689	OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1690	}\
1691	\
1692	static void OPNAME ## qpel8_mc20_c(uint8_t dst, uint8_t src, int stride){\
1693	OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1694	}\
1695	\
1696	static void OPNAME ## qpel8_mc30_c(uint8_t dst, uint8_t src, int stride){\
1697	uint8_t half[64];\
1698	put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699	OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1700	}\
1701	\
1702	static void OPNAME ## qpel8_mc01_c(uint8_t dst, uint8_t src, int stride){\
1703	uint8_t full[16*9];\
1704	uint8_t half[64];\
1705	copy_block9(full, src, 16, stride, 9);\
1706	put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1707	OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1708	}\
1709	\
1710	static void OPNAME ## qpel8_mc02_c(uint8_t dst, uint8_t src, int stride){\
1711	uint8_t full[16*9];\
1712	copy_block9(full, src, 16, stride, 9);\
1713	OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1714	}\
1715	\
1716	static void OPNAME ## qpel8_mc03_c(uint8_t dst, uint8_t src, int stride){\
1717	uint8_t full[16*9];\
1718	uint8_t half[64];\
1719	copy_block9(full, src, 16, stride, 9);\
1720	put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1721	OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1722	}\
1723	void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t dst, uint8_t src, int stride){\
1724	uint8_t full[16*9];\
1725	uint8_t halfH[72];\
1726	uint8_t halfV[64];\
1727	uint8_t halfHV[64];\
1728	copy_block9(full, src, 16, stride, 9);\
1729	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730	put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732	OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733	}\
1734	static void OPNAME ## qpel8_mc11_c(uint8_t dst, uint8_t src, int stride){\
1735	uint8_t full[16*9];\
1736	uint8_t halfH[72];\
1737	uint8_t halfHV[64];\
1738	copy_block9(full, src, 16, stride, 9);\
1739	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740	put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742	OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1743	}\
1744	void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t dst, uint8_t src, int stride){\
1745	uint8_t full[16*9];\
1746	uint8_t halfH[72];\
1747	uint8_t halfV[64];\
1748	uint8_t halfHV[64];\
1749	copy_block9(full, src, 16, stride, 9);\
1750	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1751	put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753	OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754	}\
1755	static void OPNAME ## qpel8_mc31_c(uint8_t dst, uint8_t src, int stride){\
1756	uint8_t full[16*9];\
1757	uint8_t halfH[72];\
1758	uint8_t halfHV[64];\
1759	copy_block9(full, src, 16, stride, 9);\
1760	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761	put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763	OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1764	}\
1765	void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t dst, uint8_t src, int stride){\
1766	uint8_t full[16*9];\
1767	uint8_t halfH[72];\
1768	uint8_t halfV[64];\
1769	uint8_t halfHV[64];\
1770	copy_block9(full, src, 16, stride, 9);\
1771	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1772	put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774	OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1775	}\
1776	static void OPNAME ## qpel8_mc13_c(uint8_t dst, uint8_t src, int stride){\
1777	uint8_t full[16*9];\
1778	uint8_t halfH[72];\
1779	uint8_t halfHV[64];\
1780	copy_block9(full, src, 16, stride, 9);\
1781	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782	put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784	OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1785	}\
1786	void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t dst, uint8_t src, int stride){\
1787	uint8_t full[16*9];\
1788	uint8_t halfH[72];\
1789	uint8_t halfV[64];\
1790	uint8_t halfHV[64];\
1791	copy_block9(full, src, 16, stride, 9);\
1792	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1793	put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1795	OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1796	}\
1797	static void OPNAME ## qpel8_mc33_c(uint8_t dst, uint8_t src, int stride){\
1798	uint8_t full[16*9];\
1799	uint8_t halfH[72];\
1800	uint8_t halfHV[64];\
1801	copy_block9(full, src, 16, stride, 9);\
1802	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803	put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805	OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1806	}\
1807	static void OPNAME ## qpel8_mc21_c(uint8_t dst, uint8_t src, int stride){\
1808	uint8_t halfH[72];\
1809	uint8_t halfHV[64];\
1810	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1811	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812	OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1813	}\
1814	static void OPNAME ## qpel8_mc23_c(uint8_t dst, uint8_t src, int stride){\
1815	uint8_t halfH[72];\
1816	uint8_t halfHV[64];\
1817	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1818	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819	OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1820	}\
1821	void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t dst, uint8_t src, int stride){\
1822	uint8_t full[16*9];\
1823	uint8_t halfH[72];\
1824	uint8_t halfV[64];\
1825	uint8_t halfHV[64];\
1826	copy_block9(full, src, 16, stride, 9);\
1827	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828	put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830	OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1831	}\
1832	static void OPNAME ## qpel8_mc12_c(uint8_t dst, uint8_t src, int stride){\
1833	uint8_t full[16*9];\
1834	uint8_t halfH[72];\
1835	copy_block9(full, src, 16, stride, 9);\
1836	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837	put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838	OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1839	}\
1840	void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t dst, uint8_t src, int stride){\
1841	uint8_t full[16*9];\
1842	uint8_t halfH[72];\
1843	uint8_t halfV[64];\
1844	uint8_t halfHV[64];\
1845	copy_block9(full, src, 16, stride, 9);\
1846	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1847	put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848	put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1849	OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1850	}\
1851	static void OPNAME ## qpel8_mc32_c(uint8_t dst, uint8_t src, int stride){\
1852	uint8_t full[16*9];\
1853	uint8_t halfH[72];\
1854	copy_block9(full, src, 16, stride, 9);\
1855	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856	put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857	OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1858	}\
1859	static void OPNAME ## qpel8_mc22_c(uint8_t dst, uint8_t src, int stride){\
1860	uint8_t halfH[72];\
1861	put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1862	OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863	}\
1864	static void OPNAME ## qpel16_mc00_c (uint8_t dst, uint8_t src, int stride){\
1865	OPNAME ## pixels16_c(dst, src, stride, 16);\
1866	}\
1867	\
1868	static void OPNAME ## qpel16_mc10_c(uint8_t dst, uint8_t src, int stride){\
1869	uint8_t half[256];\
1870	put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871	OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1872	}\
1873	\
1874	static void OPNAME ## qpel16_mc20_c(uint8_t dst, uint8_t src, int stride){\
1875	OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1876	}\
1877	\
1878	static void OPNAME ## qpel16_mc30_c(uint8_t dst, uint8_t src, int stride){\
1879	uint8_t half[256];\
1880	put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881	OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1882	}\
1883	\
1884	static void OPNAME ## qpel16_mc01_c(uint8_t dst, uint8_t src, int stride){\
1885	uint8_t full[24*17];\
1886	uint8_t half[256];\
1887	copy_block17(full, src, 24, stride, 17);\
1888	put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1889	OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1890	}\
1891	\
1892	static void OPNAME ## qpel16_mc02_c(uint8_t dst, uint8_t src, int stride){\
1893	uint8_t full[24*17];\
1894	copy_block17(full, src, 24, stride, 17);\
1895	OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1896	}\
1897	\
1898	static void OPNAME ## qpel16_mc03_c(uint8_t dst, uint8_t src, int stride){\
1899	uint8_t full[24*17];\
1900	uint8_t half[256];\
1901	copy_block17(full, src, 24, stride, 17);\
1902	put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1903	OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1904	}\
1905	void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t dst, uint8_t src, int stride){\
1906	uint8_t full[24*17];\
1907	uint8_t halfH[272];\
1908	uint8_t halfV[256];\
1909	uint8_t halfHV[256];\
1910	copy_block17(full, src, 24, stride, 17);\
1911	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912	put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914	OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915	}\
1916	static void OPNAME ## qpel16_mc11_c(uint8_t dst, uint8_t src, int stride){\
1917	uint8_t full[24*17];\
1918	uint8_t halfH[272];\
1919	uint8_t halfHV[256];\
1920	copy_block17(full, src, 24, stride, 17);\
1921	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922	put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924	OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1925	}\
1926	void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t dst, uint8_t src, int stride){\
1927	uint8_t full[24*17];\
1928	uint8_t halfH[272];\
1929	uint8_t halfV[256];\
1930	uint8_t halfHV[256];\
1931	copy_block17(full, src, 24, stride, 17);\
1932	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1933	put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935	OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936	}\
1937	static void OPNAME ## qpel16_mc31_c(uint8_t dst, uint8_t src, int stride){\
1938	uint8_t full[24*17];\
1939	uint8_t halfH[272];\
1940	uint8_t halfHV[256];\
1941	copy_block17(full, src, 24, stride, 17);\
1942	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943	put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945	OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1946	}\
1947	void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t dst, uint8_t src, int stride){\
1948	uint8_t full[24*17];\
1949	uint8_t halfH[272];\
1950	uint8_t halfV[256];\
1951	uint8_t halfHV[256];\
1952	copy_block17(full, src, 24, stride, 17);\
1953	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954	put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956	OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1957	}\
1958	static void OPNAME ## qpel16_mc13_c(uint8_t dst, uint8_t src, int stride){\
1959	uint8_t full[24*17];\
1960	uint8_t halfH[272];\
1961	uint8_t halfHV[256];\
1962	copy_block17(full, src, 24, stride, 17);\
1963	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964	put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966	OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1967	}\
1968	void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t dst, uint8_t src, int stride){\
1969	uint8_t full[24*17];\
1970	uint8_t halfH[272];\
1971	uint8_t halfV[256];\
1972	uint8_t halfHV[256];\
1973	copy_block17(full, src, 24, stride, 17);\
1974	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1975	put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977	OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1978	}\
1979	static void OPNAME ## qpel16_mc33_c(uint8_t dst, uint8_t src, int stride){\
1980	uint8_t full[24*17];\
1981	uint8_t halfH[272];\
1982	uint8_t halfHV[256];\
1983	copy_block17(full, src, 24, stride, 17);\
1984	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985	put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987	OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1988	}\
1989	static void OPNAME ## qpel16_mc21_c(uint8_t dst, uint8_t src, int stride){\
1990	uint8_t halfH[272];\
1991	uint8_t halfHV[256];\
1992	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1993	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994	OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1995	}\
1996	static void OPNAME ## qpel16_mc23_c(uint8_t dst, uint8_t src, int stride){\
1997	uint8_t halfH[272];\
1998	uint8_t halfHV[256];\
1999	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2000	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001	OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2002	}\
2003	void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t dst, uint8_t src, int stride){\
2004	uint8_t full[24*17];\
2005	uint8_t halfH[272];\
2006	uint8_t halfV[256];\
2007	uint8_t halfHV[256];\
2008	copy_block17(full, src, 24, stride, 17);\
2009	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2010	put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012	OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2013	}\
2014	static void OPNAME ## qpel16_mc12_c(uint8_t dst, uint8_t src, int stride){\
2015	uint8_t full[24*17];\
2016	uint8_t halfH[272];\
2017	copy_block17(full, src, 24, stride, 17);\
2018	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019	put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020	OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2021	}\
2022	void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t dst, uint8_t src, int stride){\
2023	uint8_t full[24*17];\
2024	uint8_t halfH[272];\
2025	uint8_t halfV[256];\
2026	uint8_t halfHV[256];\
2027	copy_block17(full, src, 24, stride, 17);\
2028	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029	put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030	put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031	OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2032	}\
2033	static void OPNAME ## qpel16_mc32_c(uint8_t dst, uint8_t src, int stride){\
2034	uint8_t full[24*17];\
2035	uint8_t halfH[272];\
2036	copy_block17(full, src, 24, stride, 17);\
2037	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038	put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039	OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2040	}\
2041	static void OPNAME ## qpel16_mc22_c(uint8_t dst, uint8_t src, int stride){\
2042	uint8_t halfH[272];\
2043	put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2044	OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045	}
2046
2047	#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2048	#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2049	#define op_put(a, b) a = cm[((b) + 16)>>5]
2050	#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2051
2052	QPEL_MC(0, put_ , _ , op_put)
2053	QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2054	QPEL_MC(0, avg_ , _ , op_avg)
2055	//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2056	#undef op_avg
2057	#undef op_avg_no_rnd
2058	#undef op_put
2059	#undef op_put_no_rnd
2060
2061	#if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2): emits the static interpolation kernels
 * OPNAME##h264_qpel{2,4,8,16}_{h,v,hv}_lowpass.
 *
 * Every output sample is the weighted sum (1, -5, 20, 20, -5, 1) of six
 * neighbouring samples taken along a row (h), a column (v) or both (hv).
 *  - The h and v kernels pass the raw sum to OP(dst_sample, sum).
 *  - The hv kernels first write un-normalised horizontal sums for h+5 rows
 *    into the int16_t scratch buffer tmp (row pitch tmpStride), then filter
 *    that buffer vertically and pass the result to OP2(dst_sample, sum).
 * OP and OP2 are responsible for normalising, clamping and storing; they are
 * expanded where the local `cm` (cropTbl + MAX_NEG_CROP) is in scope.
 *
 * Along every filtered axis the kernels read 2 samples before and 3 samples
 * after the block, so the caller must provide that margin around src.
 * The 16x16 kernels are assembled from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* one column per iteration; srcB/srcA are the two rows above the block */\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* pass 1: horizontal sums for the block rows plus 2 above and 3 below */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the intermediate sums */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* pass 1: horizontal sums for the block rows plus 2 above and 3 below */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the intermediate sums */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* pass 1: horizontal sums for the block rows plus 2 above and 3 below */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the intermediate sums */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16x16 kernels: left and right 8x8 halves, first for the top 8 rows, then for the bottom 8 */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
/* tmp is pure scratch here: each 8x8 call overwrites what the previous one left behind */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
2325
/**
 * H264_MC(OPNAME, SIZE): emits the sixteen static functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c(dst, src, stride) for a SIZE x SIZE block.
 *
 * As the bodies show, X selects the horizontal and Y the vertical sub-sample
 * position (0..3):
 *   mc00             plain OPNAME##pixels copy/average of src;
 *   mc20, mc02, mc22 horizontal, vertical and two-dimensional low-pass
 *                    output written straight to dst;
 *   all others       two intermediate SIZE x SIZE planes (the source itself,
 *                    or put_ low-pass results) are combined into dst with
 *                    OPNAME##pixels##SIZE##_l2.
 *
 * Intermediate planes are always produced with the put_ kernels; OPNAME only
 * affects the final store. Vertical filtering works on `full`, a packed copy
 * of SIZE+5 source rows starting 2 rows above the block; full_mid points at
 * the row that corresponds to src. dst and src share the pitch `stride`.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    /* same as mc10, but combined with the sample one column to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    /* same as mc01, but combined with the sample one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    /* vertical plane is taken one column to the right of src */\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* horizontal plane is taken one row below src */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* horizontal plane one row below, vertical plane one column to the right */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    /* same as mc21, but the horizontal plane is taken one row below src */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    /* same as mc12, but the vertical plane is taken one column to the right */\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2462
2463	#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2464	//#define op_avg2(a, b) a = (((a)w1+cm[((b) + 16)>>5]w2 + o + 64)>>7)
2465	#define op_put(a, b) a = cm[((b) + 16)>>5]
2466	#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2467	#define op2_put(a, b) a = cm[((b) + 512)>>10]
2468
2469	H264_LOWPASS(put_ , op_put, op2_put)
2470	H264_LOWPASS(avg_ , op_avg, op2_avg)
2471	H264_MC(put_, 2)
2472	H264_MC(put_, 4)
2473	H264_MC(put_, 8)
2474	H264_MC(put_, 16)
2475	H264_MC(avg_, 4)
2476	H264_MC(avg_, 8)
2477	H264_MC(avg_, 16)
2478
2479	#undef op_avg
2480	#undef op_put
2481	#undef op2_avg
2482	#undef op2_put
2483	#endif
2484
/*
 * Per-sample helpers for H264_WEIGHT; they rely on the locals of the
 * generated functions (block/dst/src, weight(s/d), offset, log2_denom).
 */
#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * H264_WEIGHT(W,H): emits two static functions for a W x H block
 * (W is one of 2, 4, 8, 16):
 *
 *  weight_h264_pixelsWxH_c(block, stride, log2_denom, weight, offset)
 *      rewrites block in place as
 *      clip_uint8((block[x]*weight + offset') >> log2_denom), where
 *      offset' is offset scaled by 2^log2_denom plus, when log2_denom is
 *      non-zero, the rounding term 2^(log2_denom-1).
 *
 *  biweight_h264_pixelsWxH_c(dst, src, stride, log2_denom, weightd, weights, offset)
 *      rewrites dst as
 *      clip_uint8((src[x]*weights + dst[x]*weightd + offset') >> (log2_denom+1)),
 *      where offset' is ((offset + 1) | 1) scaled by 2^log2_denom.
 *      dst and src share the same stride.
 *
 * Rows are fully unrolled; the `if(W==n) continue;` tests compare macro
 * constants and simply end the row after the first W samples.
 * The offset is scaled with a multiplication rather than `<<` because it may
 * be negative, and left-shifting a negative int is undefined behaviour in C.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2554
2555	static void wmv2_mspel8_h_lowpass(uint8_t dst, uint8_t src, int dstStride, int srcStride, int h){
2556	uint8_t *cm = cropTbl + MAX_NEG_CROP;
2557	int i;
2558
2559	for(i=0; i<h; i++){
2560	dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561	dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562	dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563	dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564	dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565	dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566	dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567	dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2568	dst+=dstStride;
2569	src+=srcStride;
2570	}
2571	}
2572
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* mc00 variants for CAVS: each one simply forwards to the generic
   put/avg pixel routine for an 8x8 or 16x16 block. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2590
2591	static void wmv2_mspel8_v_lowpass(uint8_t dst, uint8_t src, int dstStride, int srcStride, int w){
2592	uint8_t *cm = cropTbl + MAX_NEG_CROP;
2593	int i;
2594
2595	for(i=0; i<w; i++){
2596	const int src_1= src[ -srcStride];
2597	const int src0 = src[0 ];
2598	const int src1 = src[ srcStride];
2599	const int src2 = src[2*srcStride];
2600	const int src3 = src[3*srcStride];
2601	const int src4 = src[4*srcStride];
2602	const int src5 = src[5*srcStride];
2603	const int src6 = src[6*srcStride];
2604	const int src7 = src[7*srcStride];
2605	const int src8 = src[8*srcStride];
2606	const int src9 = src[9*srcStride];
2607	dst[0dstStride]= cm[(9(src0 + src1) - (src_1 + src2) + 8)>>4];
2608	dst[1dstStride]= cm[(9(src1 + src2) - (src0 + src3) + 8)>>4];
2609	dst[2dstStride]= cm[(9(src2 + src3) - (src1 + src4) + 8)>>4];
2610	dst[3dstStride]= cm[(9(src3 + src4) - (src2 + src5) + 8)>>4];
2611	dst[4dstStride]= cm[(9(src4 + src5) - (src3 + src6) + 8)>>4];
2612	dst[5dstStride]= cm[(9(src5 + src6) - (src4 + src7) + 8)>>4];
2613	dst[6dstStride]= cm[(9(src6 + src7) - (src5 + src8) + 8)>>4];
2614	dst[7dstStride]= cm[(9(src7 + src8) - (src6 + src9) + 8)>>4];
2615	src++;
2616	dst++;
2617	}
2618	}
2619
/* mspel position (0,0): plain 8x8 copy through put_pixels8_c. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2623
/* mspel position (1,0): combines the source block with its horizontally
   filtered version via put_pixels8_l2 (presumably an average -- confirm). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 horizontally filtered block, stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2629
/* mspel position (2,0): horizontally filtered 8x8 block written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2633
/* mspel position (3,0): like mc10, but the unfiltered operand is the source
   block shifted one pixel to the right (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 horizontally filtered block, stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2639
/* mspel position (0,2): vertically filtered 8x8 block written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2643
/* mspel position (1,2): combines the vertically filtered block with the
   horizontally-then-vertically filtered block via put_pixels8_l2. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows of 8: one row above and two below for the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    /* halfH+8 skips the extra top row so the vertical filter can read row -1 */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel position (3,2): like mc12, but the vertically filtered operand is
   taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows of 8: one row above and two below for the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    /* halfH+8 skips the extra top row so the vertical filter can read row -1 */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel position (2,2): horizontal pass into a temporary, then vertical pass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows of 8: one row above and two below for the vertical pass */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2667
2668	static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2669	int x;
2670	const int strength= ff_h263_loop_filter_strength[qscale];
2671
2672	for(x=0; x<8; x++){
2673	int d1, d2, ad1;
2674	int p0= src[x-2*stride];
2675	int p1= src[x-1*stride];
2676	int p2= src[x+0*stride];
2677	int p3= src[x+1*stride];
2678	int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2679
2680	if (d<-2*strength) d1= 0;
2681	else if(d<- strength) d1=-2*strength - d;
2682	else if(d< strength) d1= d;
2683	else if(d< 2strength) d1= 2strength - d;
2684	else d1= 0;
2685
2686	p1 += d1;
2687	p2 -= d1;
2688	if(p1&256) p1= ~(p1>>31);
2689	if(p2&256) p2= ~(p2>>31);
2690
2691	src[x-1*stride] = p1;
2692	src[x+0*stride] = p2;
2693
2694	ad1= ABS(d1)>>1;
2695
2696	d2= clip((p0-p3)/4, -ad1, ad1);
2697
2698	src[x-2*stride] = p0 - d2;
2699	src[x+ stride] = p3 + d2;
2700	}
2701	}
2702
2703	static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2704	int y;
2705	const int strength= ff_h263_loop_filter_strength[qscale];
2706
2707	for(y=0; y<8; y++){
2708	int d1, d2, ad1;
2709	int p0= src[y*stride-2];
2710	int p1= src[y*stride-1];
2711	int p2= src[y*stride+0];
2712	int p3= src[y*stride+1];
2713	int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2714
2715	if (d<-2*strength) d1= 0;
2716	else if(d<- strength) d1=-2*strength - d;
2717	else if(d< strength) d1= d;
2718	else if(d< 2strength) d1= 2strength - d;
2719	else d1= 0;
2720
2721	p1 += d1;
2722	p2 -= d1;
2723	if(p1&256) p1= ~(p1>>31);
2724	if(p2&256) p2= ~(p2>>31);
2725
2726	src[y*stride-1] = p1;
2727	src[y*stride+0] = p2;
2728
2729	ad1= ABS(d1)>>1;
2730
2731	d2= clip((p0-p3)/4, -ad1, ad1);
2732
2733	src[y*stride-2] = p0 - d2;
2734	src[y*stride+1] = p3 + d2;
2735	}
2736	}
2737
/**
 * H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block, in place.
 * The first/last row is not filtered vertically and the first/last column is
 * not filtered horizontally.
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64]; /* vertical pass result, scaled by 4 */

    /* border rows are passed through, scaled to match the filtered rows */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* border columns: only undo the vertical scale of 4 */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            /* horizontal (1,2,1) on top of the vertical pass: total scale 16 */
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2764
/**
 * H.264 luma deblocking of one 16-sample edge, processed as 4 groups of 4 lines.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (from p to q samples)
 * @param ystride step along the edge (to the next line)
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     4 clipping values, one per group; a negative value skips the group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i]; /* grows by one for each side whose p1/q1 is also filtered */
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Luma deblocking with the edge running along a row: step across = stride, step along = 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Luma deblocking with the edge running along a column: step across = 1, step along = stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2813
/**
 * H.264 chroma deblocking of one 8-sample edge, processed as 4 groups of 2 lines.
 * Only p0 and q0 are modified.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping values, one per group; a value <= 0 skips the group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Chroma deblocking with the edge running along a row: step across = stride, step along = 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma deblocking with the edge running along a column: step across = 1, step along = stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2850
/**
 * H.264 chroma deblocking of one 8-sample edge without a clipping table:
 * where the thresholds pass, p0 and q0 are replaced by fixed weighted averages.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Intra chroma deblocking, edge along a row: the generic filter steps
   across the edge by stride and along it by one byte. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
/* Intra chroma deblocking, edge along a column: the generic filter steps
   across the edge by one byte and along it by stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
2878
/**
 * Sum of absolute differences between two blocks 16 pixels wide and h rows high.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 * @return the accumulated sum
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2906
/**
 * Sum of absolute differences between pix1 and pix2 combined with its right
 * neighbour through avg2, 16 pixels wide and h rows high.
 * Reads pix2[0] .. pix2[16] on every row.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2934
/**
 * Sum of absolute differences between pix1 and pix2 combined with the row
 * below it through avg2, 16 pixels wide and h rows high.
 * Reads h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2964
/**
 * Sum of absolute differences between pix1 and the avg4 of each 2x2
 * neighbourhood of pix2, 16 pixels wide and h rows high.
 * Reads h+1 rows of 17 pixels from pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2994
/**
 * Sum of absolute differences between two blocks 8 pixels wide and h rows high.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 * @return the accumulated sum
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<8;x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3014
/**
 * Sum of absolute differences between pix1 and pix2 combined with its right
 * neighbour through avg2, 8 pixels wide and h rows high.
 * Reads pix2[0] .. pix2[8] on every row.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<8;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3034
/**
 * Sum of absolute differences between pix1 and pix2 combined with the row
 * below it through avg2, 8 pixels wide and h rows high.
 * Reads h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<8;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3056
/**
 * Sum of absolute differences between pix1 and the avg4 of each 2x2
 * neighbourhood of pix2, 8 pixels wide and h rows high.
 * Reads h+1 rows of 9 pixels from pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<8;x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3078
3079	static int nsse16_c(void v, uint8_t s1, uint8_t *s2, int stride, int h){
3080	MpegEncContext *c = v;
3081	int score1=0;
3082	int score2=0;
3083	int x,y;
3084
3085	for(y=0; y<h; y++){
3086	for(x=0; x<16; x++){
3087	score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3088	}
3089	if(y+1<h){
3090	for(x=0; x<15; x++){
3091	score2+= ABS( s1[x ] - s1[x +stride]
3092	- s1[x+1] + s1[x+1+stride])
3093	-ABS( s2[x ] - s2[x +stride]
3094	- s2[x+1] + s2[x+1+stride]);
3095	}
3096	}
3097	s1+= stride;
3098	s2+= stride;
3099	}
3100
3101	if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3102	else return score1 + ABS(score2)*8;
3103	}
3104
3105	static int nsse8_c(void v, uint8_t s1, uint8_t *s2, int stride, int h){
3106	MpegEncContext *c = v;
3107	int score1=0;
3108	int score2=0;
3109	int x,y;
3110
3111	for(y=0; y<h; y++){
3112	for(x=0; x<8; x++){
3113	score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3114	}
3115	if(y+1<h){
3116	for(x=0; x<7; x++){
3117	score2+= ABS( s1[x ] - s1[x +stride]
3118	- s1[x+1] + s1[x+1+stride])
3119	-ABS( s2[x ] - s2[x +stride]
3120	- s2[x+1] + s2[x+1+stride]);
3121	}
3122	}
3123	s1+= stride;
3124	s2+= stride;
3125	}
3126
3127	if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3128	else return score1 + ABS(score2)*8;
3129	}
3130
3131	static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3132	int i;
3133	unsigned int sum=0;
3134
3135	for(i=0; i<8*8; i++){
3136	int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3137	int w= weight[i];
3138	b>>= RECON_SHIFT;
3139	assert(-512<b && b<512);
3140
3141	sum += (wb)(w*b)>>4;
3142	}
3143	return sum>>2;
3144	}
3145
3146	static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3147	int i;
3148
3149	for(i=0; i<8*8; i++){
3150	rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3151	}
3152	}
3153
3154	/**
3155	* permutes an 8x8 block.
3156	* @param block the block which will be permuted according to the given permutation vector
3157	* @param permutation the permutation vector
3158	* @param last the last non zero coefficient in scantable order, used to speed the permutation up
3159	* @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3160	* (inverse) permutated to scantable order!
3161	*/
3162	void ff_block_permute(DCTELEM block, uint8_t permutation, const uint8_t *scantable, int last)
3163	{
3164	int i;
3165	DCTELEM temp[64];
3166
3167	if(last<=0) return;
3168	//if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3169
3170	for(i=0; i<=last; i++){
3171	const int j= scantable[i];
3172	temp[j]= block[j];
3173	block[j]=0;
3174	}
3175
3176	for(i=0; i<=last; i++){
3177	const int j= scantable[i];
3178	const int perm_j= permutation[j];
3179	block[perm_j]= temp[j];
3180	}
3181	}
3182
/* Comparison function that ignores its arguments and always returns 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3186
3187	void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3188	int i;
3189
3190	memset(cmp, 0, sizeof(void)5);
3191
3192	for(i=0; i<5; i++){
3193	switch(type&0xFF){
3194	case FF_CMP_SAD:
3195	cmp[i]= c->sad[i];
3196	break;
3197	case FF_CMP_SATD:
3198	cmp[i]= c->hadamard8_diff[i];
3199	break;
3200	case FF_CMP_SSE:
3201	cmp[i]= c->sse[i];
3202	break;
3203	case FF_CMP_DCT:
3204	cmp[i]= c->dct_sad[i];
3205	break;
3206	case FF_CMP_DCT264:
3207	cmp[i]= c->dct264_sad[i];
3208	break;
3209	case FF_CMP_DCTMAX:
3210	cmp[i]= c->dct_max[i];
3211	break;
3212	case FF_CMP_PSNR:
3213	cmp[i]= c->quant_psnr[i];
3214	break;
3215	case FF_CMP_BIT:
3216	cmp[i]= c->bit[i];
3217	break;
3218	case FF_CMP_RD:
3219	cmp[i]= c->rd[i];
3220	break;
3221	case FF_CMP_VSAD:
3222	cmp[i]= c->vsad[i];
3223	break;
3224	case FF_CMP_VSSE:
3225	cmp[i]= c->vsse[i];
3226	break;
3227	case FF_CMP_ZERO:
3228	cmp[i]= zero_cmp;
3229	break;
3230	case FF_CMP_NSSE:
3231	cmp[i]= c->nsse[i];
3232	break;
3233	#ifdef CONFIG_SNOW_ENCODER
3234	case FF_CMP_W53:
3235	cmp[i]= c->w53[i];
3236	break;
3237	case FF_CMP_W97:
3238	cmp[i]= c->w97[i];
3239	break;
3240	#endif
3241	default:
3242	av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3243	}
3244	}
3245	}
3246
3247	/**
3248	* memset(blocks, 0, sizeof(DCTELEM)664)
3249	*/
3250	static void clear_blocks_c(DCTELEM *blocks)
3251	{
3252	memset(blocks, 0, sizeof(DCTELEM)664);
3253	}
3254
/**
 * Adds src to dst byte by byte; each sum wraps modulo 256.
 * @param dst buffer updated in place
 * @param src bytes to add
 * @param w   number of bytes; nothing is done for w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3270
/**
 * Stores src1 - src2 byte by byte into dst; each difference wraps modulo 256.
 * @param dst  output buffer
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes; nothing is done for w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i<w; i++)
        dst[i] = src1[i]-src2[i];
}
3286
/**
 * Stores in dst the difference between each src2 sample and its median
 * prediction, built from the left sample, the src1 sample at the same index
 * and the byte-wrapped gradient left + src1[i] - left_top.
 * @param dst      output residuals
 * @param src1     row supplying the second predictor (presumably the previous line -- confirm)
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: sample left of src2[0]; out: last src2 sample processed
 * @param left_top in: sample left of src1[0]; out: last src1 sample processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3304
/* BUTTERFLY2: o1 = i1+i2, o2 = i1-i2. Expands to two statements, and i1/i2
   are each evaluated twice. */
3305	#define BUTTERFLY2(o1,o2,i1,i2) \
3306	o1= (i1)+(i2);\
3307	o2= (i1)-(i2);
3308
/* BUTTERFLY1: in-place butterfly, x becomes x+y and y becomes x-y. The locals
   a and b would shadow arguments of the same name. */
3309	#define BUTTERFLY1(x,y) \
3310	{\
3311	int a,b;\
3312	a= x;\
3313	b= y;\
3314	x= a+b;\
3315	y= a-b;\
3316	}
3317
/* BUTTERFLYA: |x+y| + |x-y|, i.e. a final butterfly stage folded into the
   absolute sum without storing its outputs. */
3318	#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3319
/**
 * Sum of the absolute butterfly-transformed (8x8 Hadamard) differences src - dst.
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows, for both blocks
 * @param h      must be 8
 * @return the accumulated sum
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* three butterfly stages along each row of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* two stages along each column; the third is folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3371
/**
 * Sum of the absolute butterfly-transformed (8x8 Hadamard) samples of src,
 * with the mean term (coefficient 0) removed.
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8
 * @return the accumulated sum
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* three butterfly stages along each row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* two stages along each column; the third is folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3419
3420	static int dct_sad8x8_c(/MpegEncContext/ void c, uint8_t src1, uint8_t *src2, int stride, int h){
3421	MpegEncContext * const s= (MpegEncContext *)c;
3422	DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3423	DCTELEM * const temp= (DCTELEM*)aligned_temp;
3424	int sum=0, i;
3425
3426	assert(h==8);
3427
3428	s->dsp.diff_pixels(temp, src1, src2, stride);
3429	s->dsp.fdct(temp);
3430
3431	for(i=0; i<64; i++)
3432	sum+= ABS(temp[i]);
3433
3434	return sum;
3435	}
3436
3437	static int dct_max8x8_c(/MpegEncContext/ void c, uint8_t src1, uint8_t *src2, int stride, int h){
3438	MpegEncContext * const s= (MpegEncContext *)c;
3439	DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3440	DCTELEM * const temp= (DCTELEM*)aligned_temp;
3441	int sum=0, i;
3442
3443	assert(h==8);
3444
3445	s->dsp.diff_pixels(temp, src1, src2, stride);
3446	s->dsp.fdct(temp);
3447
3448	for(i=0; i<64; i++)
3449	sum= FFMAX(sum, ABS(temp[i]));
3450
3451	return sum;
3452	}
3453
3454	void simple_idct(DCTELEM *block); //FIXME
3455
3456	static int quant_psnr8x8_c(/MpegEncContext/ void c, uint8_t src1, uint8_t *src2, int stride, int h){
3457	MpegEncContext * const s= (MpegEncContext *)c;
3458	DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)642/8]);
3459	DCTELEM * const temp= (DCTELEM*)aligned_temp;
3460	DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3461	int sum=0, i;
3462
3463	assert(h==8);
3464	s->mb_intra=0;
3465
3466	s->dsp.diff_pixels(temp, src1, src2, stride);
3467
3468	memcpy(bak, temp, 64*sizeof(DCTELEM));
3469
3470	s->block_last_index[0/FIXME/]= s->fast_dct_quantize(s, temp, 0/FIXME/, s->qscale, &i);
3471	s->dct_unquantize_inter(s, temp, 0, s->qscale);
3472	simple_idct(temp); //FIXME
3473
3474	for(i=0; i<64; i++)
3475	sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3476
3477	return sum;
3478	}
3479
3480	static int rd8x8_c(/MpegEncContext/ void c, uint8_t src1, uint8_t *src2, int stride, int h){
3481	MpegEncContext * const s= (MpegEncContext *)c;
3482	const uint8_t *scantable= s->intra_scantable.permutated;
3483	DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3484	DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3485	DCTELEM * const temp= (DCTELEM*)aligned_temp;
3486	uint8_t * const bak= (uint8_t*)aligned_bak;
3487	int i, last, run, bits, level, distoration, start_i;
3488	const int esc_length= s->ac_esc_length;
3489	uint8_t * length;
3490	uint8_t * last_length;
3491
3492	assert(h==8);
3493
3494	for(i=0; i<8; i++){
3495	((uint32_t)(bak + istride))[0]= ((uint32_t)(src2 + istride))[0];
3496	((uint32_t)(bak + istride))[1]= ((uint32_t)(src2 + istride))[1];
3497	}
3498
3499	s->dsp.diff_pixels(temp, src1, src2, stride);
3500
3501	s->block_last_index[0/FIXME/]= last= s->fast_dct_quantize(s, temp, 0/FIXME/, s->qscale, &i);
3502
3503	bits=0;
3504
3505	if (s->mb_intra) {
3506	start_i = 1;
3507	length = s->intra_ac_vlc_length;
3508	last_length= s->intra_ac_vlc_last_length;
3509	bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3510	} else {
3511	start_i = 0;
3512	length = s->inter_ac_vlc_length;
3513	last_length= s->inter_ac_vlc_last_length;
3514	}
3515
3516	if(last>=start_i){
3517	run=0;
3518	for(i=start_i; i<last; i++){
3519	int j= scantable[i];
3520	level= temp[j];
3521
3522	if(level){
3523	level+=64;
3524	if((level&(~127)) == 0){
3525	bits+= length[UNI_AC_ENC_INDEX(run, level)];
3526	}else
3527	bits+= esc_length;
3528	run=0;
3529	}else
3530	run++;
3531	}
3532	i= scantable[last];
3533
3534	level= temp[i] + 64;
3535
3536	assert(level - 64);
3537
3538	if((level&(~127)) == 0){
3539	bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3540	}else
3541	bits+= esc_length;
3542
3543	}
3544
3545	if(last>=0){
3546	if(s->mb_intra)
3547	s->dct_unquantize_intra(s, temp, 0, s->qscale);
3548	else
3549	s->dct_unquantize_inter(s, temp, 0, s->qscale);
3550	}
3551
3552	s->dsp.idct_add(bak, stride, temp);
3553
3554	distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3555
3556	return distoration + ((bitss->qscales->qscale*109 + 64)>>7);
3557	}
3558
3559	static int bit8x8_c(/MpegEncContext/ void c, uint8_t src1, uint8_t *src2, int stride, int h){
3560	MpegEncContext * const s= (MpegEncContext *)c;
3561	const uint8_t *scantable= s->intra_scantable.permutated;
3562	DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3563	DCTELEM * const temp= (DCTELEM*)aligned_temp;
3564	int i, last, run, bits, level, start_i;
3565	const int esc_length= s->ac_esc_length;
3566	uint8_t * length;
3567	uint8_t * last_length;
3568
3569	assert(h==8);
3570
3571	s->dsp.diff_pixels(temp, src1, src2, stride);
3572
3573	s->block_last_index[0/FIXME/]= last= s->fast_dct_quantize(s, temp, 0/FIXME/, s->qscale, &i);
3574
3575	bits=0;
3576
3577	if (s->mb_intra) {
3578	start_i = 1;
3579	length = s->intra_ac_vlc_length;
3580	last_length= s->intra_ac_vlc_last_length;
3581	bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3582	} else {
3583	start_i = 0;
3584	length = s->inter_ac_vlc_length;
3585	last_length= s->inter_ac_vlc_last_length;
3586	}
3587
3588	if(last>=start_i){
3589	run=0;
3590	for(i=start_i; i<last; i++){
3591	int j= scantable[i];
3592	level= temp[j];
3593
3594	if(level){
3595	level+=64;
3596	if((level&(~127)) == 0){
3597	bits+= length[UNI_AC_ENC_INDEX(run, level)];
3598	}else
3599	bits+= esc_length;
3600	run=0;
3601	}else
3602	run++;
3603	}
3604	i= scantable[last];
3605
3606	level= temp[i] + 64;
3607
3608	assert(level - 64);
3609
3610	if((level&(~127)) == 0){
3611	bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3612	}else
3613	bits+= esc_length;
3614	}
3615
3616	return bits;
3617	}
3618
/**
 * Sum of absolute differences between vertically adjacent rows of a block
 * 16 pixels wide and h rows high (h-1 row pairs).
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= ABS(s[x  ] - s[x  +stride]) + ABS(s[x+1] - s[x+1+stride])
                   +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
3633
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks: for each pair
 * of adjacent rows, sums the absolute value of
 * (s1 - s2) on the upper row minus (s1 - s2) on the lower row.
 *
 * @param c      unused context pointer
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride line stride shared by s1 and s2
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= ABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3648
/* Square of a; note that the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE of a single 16-pixel-wide block: sum over rows 1..h-1 of the
 * squared differences between each pixel and the pixel one row below it.
 *
 * @param c      unused context pointer
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride line stride of s
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        /* manually unrolled by 4 across the 16 columns */
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
3664
/**
 * Vertical SSE of the difference of two 16-pixel-wide blocks: for each pair
 * of adjacent rows, sums the square of
 * (s1 - s2) on the upper row minus (s1 - s2) on the lower row.
 *
 * @param c      unused context pointer
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride line stride shared by s1 and s2
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            /* compute the difference once, then square it */
            const int d= s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            score+= d*d;
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3679
3680	WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3681	WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3682	WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3683	WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3684	WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3685	WARPER8_16_SQ(rd8x8_c, rd16_c)
3686	WARPER8_16_SQ(bit8x8_c, bit16_c)
3687
3688	/* XXX: those functions should be suppressed ASAP when all IDCTs are
3689	converted */
3690	static void ff_jref_idct_put(uint8_t dest, int line_size, DCTELEM block)
3691	{
3692	j_rev_dct (block);
3693	put_pixels_clamped_c(block, dest, line_size);
3694	}
3695	static void ff_jref_idct_add(uint8_t dest, int line_size, DCTELEM block)
3696	{
3697	j_rev_dct (block);
3698	add_pixels_clamped_c(block, dest, line_size);
3699	}
3700
3701	static void ff_jref_idct4_put(uint8_t dest, int line_size, DCTELEM block)
3702	{
3703	j_rev_dct4 (block);
3704	put_pixels_clamped4_c(block, dest, line_size);
3705	}
3706	static void ff_jref_idct4_add(uint8_t dest, int line_size, DCTELEM block)
3707	{
3708	j_rev_dct4 (block);
3709	add_pixels_clamped4_c(block, dest, line_size);
3710	}
3711
3712	static void ff_jref_idct2_put(uint8_t dest, int line_size, DCTELEM block)
3713	{
3714	j_rev_dct2 (block);
3715	put_pixels_clamped2_c(block, dest, line_size);
3716	}
3717	static void ff_jref_idct2_add(uint8_t dest, int line_size, DCTELEM block)
3718	{
3719	j_rev_dct2 (block);
3720	add_pixels_clamped2_c(block, dest, line_size);
3721	}
3722
3723	static void ff_jref_idct1_put(uint8_t dest, int line_size, DCTELEM block)
3724	{
3725	uint8_t *cm = cropTbl + MAX_NEG_CROP;
3726
3727	dest[0] = cm[(block[0] + 4)>>3];
3728	}
3729	static void ff_jref_idct1_add(uint8_t dest, int line_size, DCTELEM block)
3730	{
3731	uint8_t *cm = cropTbl + MAX_NEG_CROP;
3732
3733	dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3734	}
3735
/* No-op; installed as the default c->prefetch handler in dsputil_init().
 * The unprototyped () parameter list is kept so the assignment to the
 * function pointer stays as in the original. */
static void just_return() { return; }
3737
3738	/* init static data */
3739	void dsputil_static_init(void)
3740	{
3741	int i;
3742
3743	for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3744	for(i=0;i<MAX_NEG_CROP;i++) {
3745	cropTbl[i] = 0;
3746	cropTbl[i + MAX_NEG_CROP + 256] = 255;
3747	}
3748
3749	for(i=0;i<512;i++) {
3750	squareTbl[i] = (i - 256) * (i - 256);
3751	}
3752
3753	for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3754	}
3755
3756
3757	void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3758	{
3759	int i;
3760
3761	#ifdef CONFIG_ENCODERS
3762	if(avctx->dct_algo==FF_DCT_FASTINT) {
3763	c->fdct = fdct_ifast;
3764	c->fdct248 = fdct_ifast248;
3765	}
3766	else if(avctx->dct_algo==FF_DCT_FAAN) {
3767	c->fdct = ff_faandct;
3768	c->fdct248 = ff_faandct248;
3769	}
3770	else {
3771	c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3772	c->fdct248 = ff_fdct248_islow;
3773	}
3774	#endif //CONFIG_ENCODERS
3775
3776	if(avctx->lowres==1){
3777	if(avctx->idct_algo==FF_IDCT_INT \|\| avctx->idct_algo==FF_IDCT_AUTO){
3778	c->idct_put= ff_jref_idct4_put;
3779	c->idct_add= ff_jref_idct4_add;
3780	}else{
3781	c->idct_put= ff_h264_lowres_idct_put_c;
3782	c->idct_add= ff_h264_lowres_idct_add_c;
3783	}
3784	c->idct = j_rev_dct4;
3785	c->idct_permutation_type= FF_NO_IDCT_PERM;
3786	}else if(avctx->lowres==2){
3787	c->idct_put= ff_jref_idct2_put;
3788	c->idct_add= ff_jref_idct2_add;
3789	c->idct = j_rev_dct2;
3790	c->idct_permutation_type= FF_NO_IDCT_PERM;
3791	}else if(avctx->lowres==3){
3792	c->idct_put= ff_jref_idct1_put;
3793	c->idct_add= ff_jref_idct1_add;
3794	c->idct = j_rev_dct1;
3795	c->idct_permutation_type= FF_NO_IDCT_PERM;
3796	}else{
3797	if(avctx->idct_algo==FF_IDCT_INT){
3798	c->idct_put= ff_jref_idct_put;
3799	c->idct_add= ff_jref_idct_add;
3800	c->idct = j_rev_dct;
3801	c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3802	}else if(avctx->idct_algo==FF_IDCT_VP3){
3803	c->idct_put= ff_vp3_idct_put_c;
3804	c->idct_add= ff_vp3_idct_add_c;
3805	c->idct = ff_vp3_idct_c;
3806	c->idct_permutation_type= FF_NO_IDCT_PERM;
3807	}else{ //accurate/default
3808	c->idct_put= simple_idct_put;
3809	c->idct_add= simple_idct_add;
3810	c->idct = simple_idct;
3811	c->idct_permutation_type= FF_NO_IDCT_PERM;
3812	}
3813	}
3814
3815	c->h264_idct_add= ff_h264_idct_add_c;
3816	c->h264_idct8_add= ff_h264_idct8_add_c;
3817	c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3818	c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3819
3820	c->get_pixels = get_pixels_c;
3821	c->diff_pixels = diff_pixels_c;
3822	c->put_pixels_clamped = put_pixels_clamped_c;
3823	c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3824	c->add_pixels_clamped = add_pixels_clamped_c;
3825	c->add_pixels8 = add_pixels8_c;
3826	c->add_pixels4 = add_pixels4_c;
3827	c->gmc1 = gmc1_c;
3828	c->gmc = ff_gmc_c;
3829	c->clear_blocks = clear_blocks_c;
3830	c->pix_sum = pix_sum_c;
3831	c->pix_norm1 = pix_norm1_c;
3832
3833	/* TODO [0] 16 [1] 8 */
3834	c->pix_abs[0][0] = pix_abs16_c;
3835	c->pix_abs[0][1] = pix_abs16_x2_c;
3836	c->pix_abs[0][2] = pix_abs16_y2_c;
3837	c->pix_abs[0][3] = pix_abs16_xy2_c;
3838	c->pix_abs[1][0] = pix_abs8_c;
3839	c->pix_abs[1][1] = pix_abs8_x2_c;
3840	c->pix_abs[1][2] = pix_abs8_y2_c;
3841	c->pix_abs[1][3] = pix_abs8_xy2_c;
3842
3843	#define dspfunc(PFX, IDX, NUM) \
3844	c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3845	c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3846	c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3847	c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3848
3849	dspfunc(put, 0, 16);
3850	dspfunc(put_no_rnd, 0, 16);
3851	dspfunc(put, 1, 8);
3852	dspfunc(put_no_rnd, 1, 8);
3853	dspfunc(put, 2, 4);
3854	dspfunc(put, 3, 2);
3855
3856	dspfunc(avg, 0, 16);
3857	dspfunc(avg_no_rnd, 0, 16);
3858	dspfunc(avg, 1, 8);
3859	dspfunc(avg_no_rnd, 1, 8);
3860	dspfunc(avg, 2, 4);
3861	dspfunc(avg, 3, 2);
3862	#undef dspfunc
3863
3864	c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3865	c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3866
3867	c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3868	c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3869	c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3870	c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3871	c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3872	c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3873	c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3874	c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3875	c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3876
3877	c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3878	c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3879	c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3880	c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3881	c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3882	c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3883	c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3884	c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3885	c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3886
3887	#define dspfunc(PFX, IDX, NUM) \
3888	c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3889	c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3890	c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3891	c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3892	c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3893	c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3894	c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3895	c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3896	c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3897	c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3898	c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3899	c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3900	c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3901	c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3902	c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3903	c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3904
3905	dspfunc(put_qpel, 0, 16);
3906	dspfunc(put_no_rnd_qpel, 0, 16);
3907
3908	dspfunc(avg_qpel, 0, 16);
3909	/* dspfunc(avg_no_rnd_qpel, 0, 16); */
3910
3911	dspfunc(put_qpel, 1, 8);
3912	dspfunc(put_no_rnd_qpel, 1, 8);
3913
3914	dspfunc(avg_qpel, 1, 8);
3915	/* dspfunc(avg_no_rnd_qpel, 1, 8); */
3916
3917	dspfunc(put_h264_qpel, 0, 16);
3918	dspfunc(put_h264_qpel, 1, 8);
3919	dspfunc(put_h264_qpel, 2, 4);
3920	dspfunc(put_h264_qpel, 3, 2);
3921	dspfunc(avg_h264_qpel, 0, 16);
3922	dspfunc(avg_h264_qpel, 1, 8);
3923	dspfunc(avg_h264_qpel, 2, 4);
3924
3925	#undef dspfunc
3926	c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3927	c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3928	c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3929	c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3930	c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3931	c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3932
3933	c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3934	c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3935	c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3936	c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3937	c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3938	c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3939	c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3940	c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3941	c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3942	c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3943	c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3944	c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3945	c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3946	c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3947	c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3948	c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3949	c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3950	c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3951	c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3952	c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3953
3954	#ifdef CONFIG_CAVS_DECODER
3955	ff_cavsdsp_init(c,avctx);
3956	#endif
3957
3958	c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3959	c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3960	c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3961	c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3962	c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3963	c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3964	c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3965	c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3966
3967	#define SET_CMP_FUNC(name) \
3968	c->name[0]= name ## 16_c;\
3969	c->name[1]= name ## 8x8_c;
3970
3971	SET_CMP_FUNC(hadamard8_diff)
3972	c->hadamard8_diff[4]= hadamard8_intra16_c;
3973	SET_CMP_FUNC(dct_sad)
3974	SET_CMP_FUNC(dct_max)
3975	c->sad[0]= pix_abs16_c;
3976	c->sad[1]= pix_abs8_c;
3977	c->sse[0]= sse16_c;
3978	c->sse[1]= sse8_c;
3979	c->sse[2]= sse4_c;
3980	SET_CMP_FUNC(quant_psnr)
3981	SET_CMP_FUNC(rd)
3982	SET_CMP_FUNC(bit)
3983	c->vsad[0]= vsad16_c;
3984	c->vsad[4]= vsad_intra16_c;
3985	c->vsse[0]= vsse16_c;
3986	c->vsse[4]= vsse_intra16_c;
3987	c->nsse[0]= nsse16_c;
3988	c->nsse[1]= nsse8_c;
3989	#ifdef CONFIG_SNOW_ENCODER
3990	c->w53[0]= w53_16_c;
3991	c->w53[1]= w53_8_c;
3992	c->w97[0]= w97_16_c;
3993	c->w97[1]= w97_8_c;
3994	#endif
3995
3996	c->add_bytes= add_bytes_c;
3997	c->diff_bytes= diff_bytes_c;
3998	c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3999	c->bswap_buf= bswap_buf;
4000
4001	c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4002	c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4003	c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4004	c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4005	c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4006	c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4007
4008	c->h263_h_loop_filter= h263_h_loop_filter_c;
4009	c->h263_v_loop_filter= h263_v_loop_filter_c;
4010
4011	c->h261_loop_filter= h261_loop_filter_c;
4012
4013	c->try_8x8basis= try_8x8basis_c;
4014	c->add_8x8basis= add_8x8basis_c;
4015
4016	#ifdef CONFIG_SNOW_ENCODER
4017	c->vertical_compose97i = ff_snow_vertical_compose97i;
4018	c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4019	c->inner_add_yblock = ff_snow_inner_add_yblock;
4020	#endif
4021
4022	c->shrink[0]= ff_img_copy_plane;
4023	c->shrink[1]= ff_shrink22;
4024	c->shrink[2]= ff_shrink44;
4025	c->shrink[3]= ff_shrink88;
4026
4027	c->prefetch= just_return;
4028
4029	#ifdef HAVE_MMX
4030	dsputil_init_mmx(c, avctx);
4031	#endif
4032	#ifdef ARCH_ARMV4L
4033	dsputil_init_armv4l(c, avctx);
4034	#endif
4035	#ifdef HAVE_MLIB
4036	dsputil_init_mlib(c, avctx);
4037	#endif
4038	#ifdef ARCH_SPARC
4039	dsputil_init_vis(c,avctx);
4040	#endif
4041	#ifdef ARCH_ALPHA
4042	dsputil_init_alpha(c, avctx);
4043	#endif
4044	#ifdef ARCH_POWERPC
4045	dsputil_init_ppc(c, avctx);
4046	#endif
4047	#ifdef HAVE_MMI
4048	dsputil_init_mmi(c, avctx);
4049	#endif
4050	#ifdef ARCH_SH4
4051	dsputil_init_sh4(c,avctx);
4052	#endif
4053
4054	switch(c->idct_permutation_type){
4055	case FF_NO_IDCT_PERM:
4056	for(i=0; i<64; i++)
4057	c->idct_permutation[i]= i;
4058	break;
4059	case FF_LIBMPEG2_IDCT_PERM:
4060	for(i=0; i<64; i++)
4061	c->idct_permutation[i]= (i & 0x38) \| ((i & 6) >> 1) \| ((i & 1) << 2);
4062	break;
4063	case FF_SIMPLE_IDCT_PERM:
4064	for(i=0; i<64; i++)
4065	c->idct_permutation[i]= simple_mmx_permutation[i];
4066	break;
4067	case FF_TRANSPOSE_IDCT_PERM:
4068	for(i=0; i<64; i++)
4069	c->idct_permutation[i]= ((i&7)<<3) \| (i>>3);
4070	break;
4071	case FF_PARTTRANS_IDCT_PERM:
4072	for(i=0; i<64; i++)
4073	c->idct_permutation[i]= (i&0x24) \| ((i&3)<<3) \| ((i>>3)&3);
4074	break;
4075	default:
4076	av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4077	}
4078	}
4079

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/dsputil.c@ 9441

Download in other formats: