/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <[email protected]>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#ifdef __AMIGAOS4__
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#else /* __AMIGAOS4__ */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* __AMIGAOS4__ */
#endif /* CONFIG_DARWIN */

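/**
 * Sum of absolute differences between a 16xh block of pix1 and the
 * byte-wise rounded average of pix2[x] and pix2[x+1] (the horizontally
 * half-pel shifted reference). Rows may be unaligned; they are loaded
 * with the vec_lvsl/vec_perm idiom used throughout this file.
 */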
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read, as well as some splitting and
       vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts.
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts
           and do the averaging by hand.
        */
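        /*
           The exact per-pixel value computed below is
               avg = (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2,
           evaluated in 16-bit lanes so no intermediate sum can overflow.
        */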

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

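/**
 * Sum of squares of the pixels of a 16x16 block (sum over pix[x]^2).
 * AltiVec-enhanced.
 */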
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

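/**
 * Sum of the pixels of a 16x16 block.
 * AltiVec-enhanced.
 */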
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, handle the remaining bytes one at a time */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = LD32(pixels);
        *((uint32_t*)(block+4)) = LD32(pixels+4);
        *((uint32_t*)(block+8)) = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
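/*
   op_avg computes the byte-wise rounded-up average of two packed 32-bit
   words: since a+b = (a^b) + 2*(a&b), we have (a|b) - ((a^b)>>1) =
   ceil((a+b)/2) in each byte lane; the 0xFEFEFEFE mask clears each byte's
   low bit before the shift so that no bit crosses into the byte below.
*/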
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels +
                                                  4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8-byte aligned, so we're either in the
           left block (16-byte aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

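/*
   put_pixels8_xy2 writes the rounded average of a 2x2 neighbourhood for
   each output pixel:
       dst[x] = (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2,
   where p is the current source row and q is the row below it. The
   AltiVec version keeps the per-row sums (pixelssum1/pixelssum2) in
   16-bit lanes and carries them over to the next iteration, so each
   source row is loaded and widened only once.
*/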
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
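/*
   Same as put_pixels8_xy2_altivec above, except that the bias added
   before the >> 2 is 1 (0x01010101 / vcone) instead of 2 (0x02020202 /
   vctwo), which gives the "no rounding" behaviour.
*/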
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 = (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01,
            0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09,
            0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 = (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07,
            0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 = (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07);

1337 | #define ONEITERBUTTERFLY(i, res) \
|
---|
1338 | { \
|
---|
1339 | register vector unsigned char src1, src2, srcO; \
|
---|
1340 | register vector unsigned char dst1, dst2, dstO; \
|
---|
1341 | register vector signed short srcV, dstV; \
|
---|
1342 | register vector signed short but0, but1, but2, op1, op2, op3; \
|
---|
1343 | src1 = vec_ld(stride * i, src); \
|
---|
1344 | if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
|
---|
1345 | src2 = vec_ld((stride * i) + 16, src); \
|
---|
1346 | srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
|
---|
1347 | dst1 = vec_ld(stride * i, dst); \
|
---|
1348 | if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
|
---|
1349 | dst2 = vec_ld((stride * i) + 16, dst); \
|
---|
1350 | dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
|
---|
1351 | /* promote the unsigned chars to signed shorts */ \
|
---|
1352 | /* we're in the 8x8 function, we only care for the first 8 */ \
|
---|
1353 | srcV = \
|
---|
1354 | (vector signed short)vec_mergeh((vector signed char)vzero, \
|
---|
1355 | (vector signed char)srcO); \
|
---|
1356 | dstV = \
|
---|
1357 | (vector signed short)vec_mergeh((vector signed char)vzero, \
|
---|
1358 | (vector signed char)dstO); \
|
---|
1359 | /* substractions inside the first butterfly */ \
|
---|
1360 | but0 = vec_sub(srcV, dstV); \
|
---|
1361 | op1 = vec_perm(but0, but0, perm1); \
|
---|
1362 | but1 = vec_mladd(but0, vprod1, op1); \
|
---|
1363 | op2 = vec_perm(but1, but1, perm2); \
|
---|
1364 | but2 = vec_mladd(but1, vprod2, op2); \
|
---|
1365 | op3 = vec_perm(but2, but2, perm3); \
|
---|
1366 | res = vec_mladd(but2, vprod3, op3); \
|
---|
1367 | }
|
---|
1368 | ONEITERBUTTERFLY(0, temp0);
|
---|
1369 | ONEITERBUTTERFLY(1, temp1);
|
---|
1370 | ONEITERBUTTERFLY(2, temp2);
|
---|
1371 | ONEITERBUTTERFLY(3, temp3);
|
---|
1372 | ONEITERBUTTERFLY(4, temp4);
|
---|
1373 | ONEITERBUTTERFLY(5, temp5);
|
---|
1374 | ONEITERBUTTERFLY(6, temp6);
|
---|
1375 | ONEITERBUTTERFLY(7, temp7);
|
---|
1376 | }
|
---|
1377 | #undef ONEITERBUTTERFLY
|
---|
1378 | {
|
---|
1379 | register vector signed int vsum;
|
---|
1380 | register vector signed short line0 = vec_add(temp0, temp1);
|
---|
1381 | register vector signed short line1 = vec_sub(temp0, temp1);
|
---|
1382 | register vector signed short line2 = vec_add(temp2, temp3);
|
---|
1383 | register vector signed short line3 = vec_sub(temp2, temp3);
|
---|
1384 | register vector signed short line4 = vec_add(temp4, temp5);
|
---|
1385 | register vector signed short line5 = vec_sub(temp4, temp5);
|
---|
1386 | register vector signed short line6 = vec_add(temp6, temp7);
|
---|
1387 | register vector signed short line7 = vec_sub(temp6, temp7);
|
---|
1388 |
|
---|
1389 | register vector signed short line0B = vec_add(line0, line2);
|
---|
1390 | register vector signed short line2B = vec_sub(line0, line2);
|
---|
1391 | register vector signed short line1B = vec_add(line1, line3);
|
---|
1392 | register vector signed short line3B = vec_sub(line1, line3);
|
---|
1393 | register vector signed short line4B = vec_add(line4, line6);
|
---|
1394 | register vector signed short line6B = vec_sub(line4, line6);
|
---|
1395 | register vector signed short line5B = vec_add(line5, line7);
|
---|
1396 | register vector signed short line7B = vec_sub(line5, line7);
|
---|
1397 |
|
---|
1398 | register vector signed short line0C = vec_add(line0B, line4B);
|
---|
1399 | register vector signed short line4C = vec_sub(line0B, line4B);
|
---|
1400 | register vector signed short line1C = vec_add(line1B, line5B);
|
---|
1401 | register vector signed short line5C = vec_sub(line1B, line5B);
|
---|
1402 | register vector signed short line2C = vec_add(line2B, line6B);
|
---|
1403 | register vector signed short line6C = vec_sub(line2B, line6B);
|
---|
1404 | register vector signed short line3C = vec_add(line3B, line7B);
|
---|
1405 | register vector signed short line7C = vec_sub(line3B, line7B);
|
---|
1406 |
|
---|
1407 | vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
|
---|
1408 | vsum = vec_sum4s(vec_abs(line1C), vsum);
|
---|
1409 | vsum = vec_sum4s(vec_abs(line2C), vsum);
|
---|
1410 | vsum = vec_sum4s(vec_abs(line3C), vsum);
|
---|
1411 | vsum = vec_sum4s(vec_abs(line4C), vsum);
|
---|
1412 | vsum = vec_sum4s(vec_abs(line5C), vsum);
|
---|
1413 | vsum = vec_sum4s(vec_abs(line6C), vsum);
|
---|
1414 | vsum = vec_sum4s(vec_abs(line7C), vsum);
|
---|
1415 | vsum = vec_sums(vsum, (vector signed int)vzero);
|
---|
1416 | vsum = vec_splat(vsum, 3);
|
---|
1417 | vec_ste(vsum, 0, &sum);
|
---|
1418 | }
|
---|
1419 | POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
|
---|
1420 | return sum;
|
---|
1421 | }
|
---|
1422 |
|
---|
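/* For illustration only (added, not in the original source): the function
   above is a SATD -- the src-dst difference block goes through an 8-point
   Hadamard transform horizontally (the three perm/vprod butterfly stages)
   and vertically (the line* / line*B / line*C adds and subs), and the
   absolute transformed coefficients are summed. A minimal scalar sketch of
   the same computation (hypothetical helper, kept out of the build; the
   exact coefficient ordering is immaterial since only absolute values are
   summed): */
#if 0
static int hadamard8_diff8x8_ref(uint8_t *dst, uint8_t *src, int stride)
{
    int16_t d[8][8];
    int i, j, step, k, sum = 0;

    /* difference block */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            d[i][j] = src[i * stride + j] - dst[i * stride + j];

    /* 8-point Hadamard butterflies on each row... */
    for (i = 0; i < 8; i++)
        for (step = 1; step < 8; step <<= 1)
            for (j = 0; j < 8; j += 2 * step)
                for (k = j; k < j + step; k++) {
                    int a = d[i][k], b = d[i][k + step];
                    d[i][k]        = a + b;
                    d[i][k + step] = a - b;
                }
    /* ...then on each column */
    for (j = 0; j < 8; j++)
        for (step = 1; step < 8; step <<= 1)
            for (i = 0; i < 8; i += 2 * step)
                for (k = i; k < i + step; k++) {
                    int a = d[k][j], b = d[k + step][j];
                    d[k][j]        = a + b;
                    d[k + step][j] = a - b;
                }
    /* sum of absolute transformed differences */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            sum += d[i][j] < 0 ? -d[i][j] : d[i][j];
    return sum;
}
#endif
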
/*
  16x8 works with 16 elements; it avoids replicating loads and gives the
  compiler more room for scheduling. It's only used from inside
  hadamard8_diff16_altivec.

  Unfortunately, gcc-3.3 seems to be a bit dumb: the compiled code contains
  a LOT of spill code, as gcc (unlike xlc) cannot keep everything in
  registers by itself. The following code therefore includes hand-made
  register allocation. It's not clean, but on a 7450 the resulting code is
  much faster (best case falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to schedule for the
  7450, and its code isn't much faster than gcc-3.3's on the 7450 (though
  it uses 25% fewer instructions...).

  On the 970, the hand-made register allocation is still a win (around 690
  vs. around 780), but xlc gets down to around 660 on the regular C code...
*/

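/* For illustration only (added, not in the original source): REG_v() is
   assumed to wrap GCC's explicit register variable extension, roughly: */
#if 0
#define REG_v(a) asm(#a)
/* so that a declaration such as
       register vector signed short temp0 REG_v(v0);
   inside the function below expands to
       register vector signed short temp0 asm("v0");
   which is what pins each temporary to a hand-picked AltiVec register. */
#endif
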
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const_vector unsigned char vzero REG_v(v31) = (const_vector unsigned char)vec_splat_u8(0);
    {
    register const_vector signed short vprod1 REG_v(v16) = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 REG_v(v17) = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 REG_v(v18) = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 REG_v(v19) = (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01,
            0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09,
            0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 REG_v(v20) = (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07,
            0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 REG_v(v21) = (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 REG_v(v22), \
                                  src2 REG_v(v23), \
                                  dst1 REG_v(v24), \
                                  dst2 REG_v(v25), \
                                  srcO REG_v(v22), \
                                  dstO REG_v(v23); \
    \
    register vector signed short srcV REG_v(v24), \
                                 dstV REG_v(v25), \
                                 srcW REG_v(v26), \
                                 dstW REG_v(v27), \
                                 but0 REG_v(v28), \
                                 but0S REG_v(v29), \
                                 op1 REG_v(v30), \
                                 but1 REG_v(v22), \
                                 op1S REG_v(v23), \
                                 but1S REG_v(v24), \
                                 op2 REG_v(v25), \
                                 but2 REG_v(v26), \
                                 op2S REG_v(v27), \
                                 but2S REG_v(v28), \
                                 op3 REG_v(v29), \
                                 op3S REG_v(v30); \
    \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
                                        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
                                        (vector signed char)dstO); \
    srcW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
                                        (vector signed char)srcO); \
    dstW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
                                        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    but0S = vec_sub(srcW, dstW); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op1S = vec_perm(but0S, but0S, perm1); \
    but1S = vec_mladd(but0S, vprod1, op1S); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op2S = vec_perm(but1S, but1S, perm2); \
    but2S = vec_mladd(but1S, vprod2, op2S); \
    op3 = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    op3S = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS, line2BS,
                                 line1BS, line3BS, line4BS, line6BS, line5BS,
                                 line7BS, line0CS, line4CS, line1CS, line5CS,
                                 line2CS, line6CS, line3CS, line7CS;

    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);

    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);

    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}

int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}

int has_altivec(void)
{
#ifdef __AMIGAOS4__
    ULONG result = 0;
    extern struct ExecIFace *IExec;

    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
    if (result == VECTORTYPE_ALTIVEC) return 1;
    return 0;
#else /* __AMIGAOS4__ */

#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* not Darwin, so detect AltiVec the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
#endif /* __AMIGAOS4__ */
}

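/* For illustration only (added, not in the original source): callers are
   expected to probe has_altivec() once and only then install the AltiVec
   routines. A sketch with a hypothetical function-pointer holder (the names
   below are not FFmpeg API): */
#if 0
typedef int (*hadamard8_diff_fn)(void *s, uint8_t *dst, uint8_t *src,
                                 int stride, int h);

static void example_select_cmp_functions(void)
{
    hadamard8_diff_fn diff16 = NULL;        /* scalar fallback would go here */
    if (has_altivec())
        diff16 = hadamard8_diff16_altivec;  /* AltiVec-capable CPU           */
    /* ... store diff16 wherever the caller keeps its comparison hooks ... */
}
#endif
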
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

    int j;
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2, blocktemp;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
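
/* For illustration only (added, not in the original source): the reference
   C path of avg_pixels8_xy2_altivec averages four neighbouring pixels per
   byte with a SWAR trick on 32-bit words. Writing each byte x as
   4*(x >> 2) + (x & 3), the rounded four-pixel average is

       (a + b + c + d + 2) >> 2
         = (a>>2) + (b>>2) + (c>>2) + (d>>2)
           + (((a&3) + (b&3) + (c&3) + (d&3) + 2) >> 2)

   which is exactly h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F) above: the h
   words hold the high six bits of each byte already shifted down, the l
   words hold the low two bits (l0 also carries the +2 rounding constant),
   the per-byte sums stay below 256 so no carry crosses byte boundaries, and
   the 0x0F mask strips the bits that the >> 2 drags in from the
   neighbouring byte. The result is then merged with the existing block
   through rnd_avg32() because this is the avg_ (not put_) variant. */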