1 | /*
|
---|
2 | Copyright (C) 2001-2002 Michael Niedermayer ([email protected])
|
---|
3 |
|
---|
4 | This program is free software; you can redistribute it and/or modify
|
---|
5 | it under the terms of the GNU General Public License as published by
|
---|
6 | the Free Software Foundation; either version 2 of the License, or
|
---|
7 | (at your option) any later version.
|
---|
8 |
|
---|
9 | This program is distributed in the hope that it will be useful,
|
---|
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
12 | GNU General Public License for more details.
|
---|
13 |
|
---|
14 | You should have received a copy of the GNU General Public License
|
---|
15 | along with this program; if not, write to the Free Software
|
---|
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
---|
17 | */
|
---|
18 |
|
---|
19 | /**
|
---|
20 | * @file postprocess_template.c
|
---|
21 | * mmx/mmx2/3dnow postprocess code.
|
---|
22 | */
|
---|
23 |
|
---|
24 |
|
---|
/*
 * Register-name helpers for the inline asm below.
 *
 * REGa/REGc/REGd are bare tokens: they are meant to be passed through the
 * stringifying PAVGB() macro, e.g. PAVGB((%%REGa), %%mm2).
 * REG_a/REG_c/REG_d/REG_SP are string literals for direct concatenation into
 * an asm template, e.g. "lea (%0, %1), %%"REG_a" \n\t".
 */
#ifdef ARCH_X86_64
#  define REGa  rax
#  define REGc  rcx
#  define REGd  rdx
#  define REG_a  "rax"
#  define REG_c  "rcx"
#  define REG_d  "rdx"
#  define REG_SP "rsp"
#  define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8"
#else
#  define REGa  eax
#  define REGc  ecx
#  define REGd  edx
#  define REG_a  "eax"
#  define REG_c  "ecx"
#  define REG_d  "edx"
#  define REG_SP "esp"
#  define ALIGN_MASK "$0xFFFFFFF8"
#endif


/*
 * The instruction macros are (re)defined depending on HAVE_MMX2 / HAVE_3DNOW /
 * HAVE_MMX, so any earlier definition is dropped first. REAL_PAVGB has a
 * different body per variant and therefore must be undefined as well.
 * NOTE(review): this presumes the file is included more than once with
 * different HAVE_* settings (the RENAME() usage suggests so) - confirm.
 */
#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/* Byte-wise rounded average of a and b, result in b. */
#ifdef HAVE_MMX2
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
/* The extra level lets arguments such as REGa be macro-expanded before
 * REAL_PAVGB stringifies them. */
#define PAVGB(a,b) REAL_PAVGB(a,b)

/* Byte-wise unsigned minimum of a and b, result in b.
 * The plain MMX variant uses t as scratch: b -= max(b - a, 0). */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(a,b,t) \
    "movq " #b ", " #t " \n\t"\
    "psubusb " #a ", " #t " \n\t"\
    "psubb " #t ", " #b " \n\t"
#endif

/* Byte-wise unsigned maximum of a and b, result in b.
 * The plain MMX variant computes max(b - a, 0) + a. */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
    "psubusb " #a ", " #b " \n\t"\
    "paddb " #a ", " #b " \n\t"
#endif
73 |
|
---|
//FIXME? |255-0| = 1 (shouldn't be a problem ...)
// (the line differences below are computed with wrapping byte arithmetic)
#ifdef HAVE_MMX
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat.
 *
 * The 8 lines starting at src + 4*stride are examined, 8 bytes per line.
 * For each of the 7 pairs of vertically adjacent lines the wrapping byte
 * difference plus c->mmxDcOffset[c->nonBQP] is compared (signed, per byte)
 * against c->mmxDcThreshold[c->nonBQP]; numEq counts the bytes for which the
 * comparison holds. In parallel the per-column minimum and maximum over the
 * 8 lines are tracked.
 *
 * @param src    first byte of the 8x16 block
 * @param stride offset between vertically adjacent lines
 * @param c      context; mmxDcOffset, mmxDcThreshold, nonBQP, pQPb and
 *               ppMode.flatnessThreshold are read
 * @return 2 if numEq does not exceed c->ppMode.flatnessThreshold;
 *         otherwise 1 if (max - min) <= 2*QP holds in every column and
 *         0 if at least one column exceeds it
 */
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
    int numEq= 0, dcOk; // dcOk ends up NONZERO when some column range exceeds 2*QP
    src+= stride*4; // src points to begin of the 8x8 Block
    // Preload the constants. NOTE(review): mm6/mm7 are expected to survive
    // until the next asm statement although the compiler is not told so.
    asm volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
        );

    asm volatile(
        "lea (%2, %3), %%"REG_a" \n\t"
        // line:   0     1     2        3         4
        // addr:   %2    eax   eax+%3   eax+2%3   %2+4%3
        // (eax is advanced by 4 lines further down to reach lines 5..7)

        "movq (%2), %%mm0 \n\t"
        "movq (%%"REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm3 \n\t" // running minimum
        "movq %%mm0, %%mm4 \n\t" // running maximum
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
        "paddb %%mm7, %%mm0 \n\t"
        "pcmpgtb %%mm6, %%mm0 \n\t" // -1 per byte where the test holds

        "movq (%%"REG_a",%3), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
        // line:   5     6        7
        // addr:   eax   eax+%3   eax+2%3

        "movq (%2, %3, 4), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%"REG_a"), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "movq (%%"REG_a", %3), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"
        "psubusb %%mm3, %%mm4 \n\t" // max - min per column

        " \n\t"
        // horizontal sum of the 8 byte counters in mm0 (only the low byte is used)
#ifdef HAVE_MMX2
        "pxor %%mm7, %%mm7 \n\t"
        "psadbw %%mm7, %%mm0 \n\t"
#else
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $16, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $32, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
#endif
        "movq %4, %%mm7 \n\t" // QP,..., QP
        "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
        "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
        "packssdw %%mm4, %%mm4 \n\t" // squeeze all 8 bytes into the low dword
        "movd %%mm0, %0 \n\t"
        "movd %%mm4, %1 \n\t"

        : "=r" (numEq), "=r" (dcOk)
        : "r" (src), "r" ((long)stride), "m" (c->pQPb)
        // "memory": the block is read through src, which is not a memory operand
        : "%"REG_a, "memory"
        );

    // each hit added -1 (0xFF) to a byte counter; negate to get the count
    numEq= (-numEq) &0xFF;
    if(numEq > c->ppMode.flatnessThreshold){
        if(dcOk) return 0;
        else return 1;
    }else{
        return 2;
    }
}
#endif //HAVE_MMX
190 |
|
---|
191 | /**
|
---|
192 | * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
|
---|
193 | * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
|
---|
194 | */
|
---|
195 | #ifndef HAVE_ALTIVEC
|
---|
196 | static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
|
---|
197 | {
|
---|
198 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
199 | src+= stride*3;
|
---|
200 | asm volatile( //"movv %0 %1 %2\n\t"
|
---|
201 | "movq %2, %%mm0 \n\t" // QP,..., QP
|
---|
202 | "pxor %%mm4, %%mm4 \n\t"
|
---|
203 |
|
---|
204 | "movq (%0), %%mm6 \n\t"
|
---|
205 | "movq (%0, %1), %%mm5 \n\t"
|
---|
206 | "movq %%mm5, %%mm1 \n\t"
|
---|
207 | "movq %%mm6, %%mm2 \n\t"
|
---|
208 | "psubusb %%mm6, %%mm5 \n\t"
|
---|
209 | "psubusb %%mm1, %%mm2 \n\t"
|
---|
210 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
|
---|
211 | "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
|
---|
212 | "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
|
---|
213 |
|
---|
214 | "pand %%mm2, %%mm6 \n\t"
|
---|
215 | "pandn %%mm1, %%mm2 \n\t"
|
---|
216 | "por %%mm2, %%mm6 \n\t"// First Line to Filter
|
---|
217 |
|
---|
218 | "movq (%0, %1, 8), %%mm5 \n\t"
|
---|
219 | "lea (%0, %1, 4), %%"REG_a" \n\t"
|
---|
220 | "lea (%0, %1, 8), %%"REG_c" \n\t"
|
---|
221 | "sub %1, %%"REG_c" \n\t"
|
---|
222 | "add %1, %0 \n\t" // %0 points to line 1 not 0
|
---|
223 | "movq (%0, %1, 8), %%mm7 \n\t"
|
---|
224 | "movq %%mm5, %%mm1 \n\t"
|
---|
225 | "movq %%mm7, %%mm2 \n\t"
|
---|
226 | "psubusb %%mm7, %%mm5 \n\t"
|
---|
227 | "psubusb %%mm1, %%mm2 \n\t"
|
---|
228 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
|
---|
229 | "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
|
---|
230 | "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
|
---|
231 |
|
---|
232 | "pand %%mm2, %%mm7 \n\t"
|
---|
233 | "pandn %%mm1, %%mm2 \n\t"
|
---|
234 | "por %%mm2, %%mm7 \n\t" // First Line to Filter
|
---|
235 |
|
---|
236 |
|
---|
237 | // 1 2 3 4 5 6 7 8
|
---|
238 | // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
|
---|
239 | // 6 4 2 2 1 1
|
---|
240 | // 6 4 4 2
|
---|
241 | // 6 8 2
|
---|
242 |
|
---|
243 | "movq (%0, %1), %%mm0 \n\t" // 1
|
---|
244 | "movq %%mm0, %%mm1 \n\t" // 1
|
---|
245 | PAVGB(%%mm6, %%mm0) //1 1 /2
|
---|
246 | PAVGB(%%mm6, %%mm0) //3 1 /4
|
---|
247 |
|
---|
248 | "movq (%0, %1, 4), %%mm2 \n\t" // 1
|
---|
249 | "movq %%mm2, %%mm5 \n\t" // 1
|
---|
250 | PAVGB((%%REGa), %%mm2) // 11 /2
|
---|
251 | PAVGB((%0, %1, 2), %%mm2) // 211 /4
|
---|
252 | "movq %%mm2, %%mm3 \n\t" // 211 /4
|
---|
253 | "movq (%0), %%mm4 \n\t" // 1
|
---|
254 | PAVGB(%%mm4, %%mm3) // 4 211 /8
|
---|
255 | PAVGB(%%mm0, %%mm3) //642211 /16
|
---|
256 | "movq %%mm3, (%0) \n\t" // X
|
---|
257 | // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
|
---|
258 | "movq %%mm1, %%mm0 \n\t" // 1
|
---|
259 | PAVGB(%%mm6, %%mm0) //1 1 /2
|
---|
260 | "movq %%mm4, %%mm3 \n\t" // 1
|
---|
261 | PAVGB((%0,%1,2), %%mm3) // 1 1 /2
|
---|
262 | PAVGB((%%REGa,%1,2), %%mm5) // 11 /2
|
---|
263 | PAVGB((%%REGa), %%mm5) // 211 /4
|
---|
264 | PAVGB(%%mm5, %%mm3) // 2 2211 /8
|
---|
265 | PAVGB(%%mm0, %%mm3) //4242211 /16
|
---|
266 | "movq %%mm3, (%0,%1) \n\t" // X
|
---|
267 | // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
|
---|
268 | PAVGB(%%mm4, %%mm6) //11 /2
|
---|
269 | "movq (%%"REG_c"), %%mm0 \n\t" // 1
|
---|
270 | PAVGB((%%REGa, %1, 2), %%mm0) // 11/2
|
---|
271 | "movq %%mm0, %%mm3 \n\t" // 11/2
|
---|
272 | PAVGB(%%mm1, %%mm0) // 2 11/4
|
---|
273 | PAVGB(%%mm6, %%mm0) //222 11/8
|
---|
274 | PAVGB(%%mm2, %%mm0) //22242211/16
|
---|
275 | "movq (%0, %1, 2), %%mm2 \n\t" // 1
|
---|
276 | "movq %%mm0, (%0, %1, 2) \n\t" // X
|
---|
277 | // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
|
---|
278 | "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
|
---|
279 | PAVGB((%%REGc), %%mm0) // 11 /2
|
---|
280 | PAVGB(%%mm0, %%mm6) //11 11 /4
|
---|
281 | PAVGB(%%mm1, %%mm4) // 11 /2
|
---|
282 | PAVGB(%%mm2, %%mm1) // 11 /2
|
---|
283 | PAVGB(%%mm1, %%mm6) //1122 11 /8
|
---|
284 | PAVGB(%%mm5, %%mm6) //112242211 /16
|
---|
285 | "movq (%%"REG_a"), %%mm5 \n\t" // 1
|
---|
286 | "movq %%mm6, (%%"REG_a") \n\t" // X
|
---|
287 | // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
|
---|
288 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1
|
---|
289 | PAVGB(%%mm7, %%mm6) // 11 /2
|
---|
290 | PAVGB(%%mm4, %%mm6) // 11 11 /4
|
---|
291 | PAVGB(%%mm3, %%mm6) // 11 2211 /8
|
---|
292 | PAVGB(%%mm5, %%mm2) // 11 /2
|
---|
293 | "movq (%0, %1, 4), %%mm4 \n\t" // 1
|
---|
294 | PAVGB(%%mm4, %%mm2) // 112 /4
|
---|
295 | PAVGB(%%mm2, %%mm6) // 112242211 /16
|
---|
296 | "movq %%mm6, (%0, %1, 4) \n\t" // X
|
---|
297 | // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
|
---|
298 | PAVGB(%%mm7, %%mm1) // 11 2 /4
|
---|
299 | PAVGB(%%mm4, %%mm5) // 11 /2
|
---|
300 | PAVGB(%%mm5, %%mm0) // 11 11 /4
|
---|
301 | "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1
|
---|
302 | PAVGB(%%mm6, %%mm1) // 11 4 2 /8
|
---|
303 | PAVGB(%%mm0, %%mm1) // 11224222 /16
|
---|
304 | "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X
|
---|
305 | // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
|
---|
306 | PAVGB((%%REGc), %%mm2) // 112 4 /8
|
---|
307 | "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
|
---|
308 | PAVGB(%%mm0, %%mm6) // 1 1 /2
|
---|
309 | PAVGB(%%mm7, %%mm6) // 1 12 /4
|
---|
310 | PAVGB(%%mm2, %%mm6) // 1122424 /4
|
---|
311 | "movq %%mm6, (%%"REG_c") \n\t" // X
|
---|
312 | // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
|
---|
313 | PAVGB(%%mm7, %%mm5) // 11 2 /4
|
---|
314 | PAVGB(%%mm7, %%mm5) // 11 6 /8
|
---|
315 |
|
---|
316 | PAVGB(%%mm3, %%mm0) // 112 /4
|
---|
317 | PAVGB(%%mm0, %%mm5) // 112246 /16
|
---|
318 | "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X
|
---|
319 | "sub %1, %0 \n\t"
|
---|
320 |
|
---|
321 | :
|
---|
322 | : "r" (src), "r" ((long)stride), "m" (c->pQPb)
|
---|
323 | : "%"REG_a, "%"REG_c
|
---|
324 | );
|
---|
325 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
326 | const int l1= stride;
|
---|
327 | const int l2= stride + l1;
|
---|
328 | const int l3= stride + l2;
|
---|
329 | const int l4= stride + l3;
|
---|
330 | const int l5= stride + l4;
|
---|
331 | const int l6= stride + l5;
|
---|
332 | const int l7= stride + l6;
|
---|
333 | const int l8= stride + l7;
|
---|
334 | const int l9= stride + l8;
|
---|
335 | int x;
|
---|
336 | src+= stride*3;
|
---|
337 | for(x=0; x<BLOCK_SIZE; x++)
|
---|
338 | {
|
---|
339 | const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
|
---|
340 | const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
|
---|
341 |
|
---|
342 | int sums[10];
|
---|
343 | sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
|
---|
344 | sums[1] = sums[0] - first + src[l4];
|
---|
345 | sums[2] = sums[1] - first + src[l5];
|
---|
346 | sums[3] = sums[2] - first + src[l6];
|
---|
347 | sums[4] = sums[3] - first + src[l7];
|
---|
348 | sums[5] = sums[4] - src[l1] + src[l8];
|
---|
349 | sums[6] = sums[5] - src[l2] + last;
|
---|
350 | sums[7] = sums[6] - src[l3] + last;
|
---|
351 | sums[8] = sums[7] - src[l4] + last;
|
---|
352 | sums[9] = sums[8] - src[l5] + last;
|
---|
353 |
|
---|
354 | src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
|
---|
355 | src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
|
---|
356 | src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
|
---|
357 | src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
|
---|
358 | src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
|
---|
359 | src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
|
---|
360 | src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
|
---|
361 | src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
|
---|
362 |
|
---|
363 | src++;
|
---|
364 | }
|
---|
365 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
366 | }
|
---|
367 | #endif //HAVE_ALTIVEC
|
---|
368 |
|
---|
#if 0
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
        0 8 16 24
 x = 8
 x/2 = 4
 x/8 = 1
        1 12 12 23
 *
 * With src advanced by 3 lines, lines 3..6 are modified when |l5 - l4| is
 * below roughly 1.25*QP.
 *
 * NOTE(review): this block is compiled out. The asm path still refers to a
 * global pQPb through MANGLE() instead of using the QP parameter; that would
 * have to be resolved before enabling it.
 */
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    src+= stride*3;
// FIXME rounding
    asm volatile(
        "pxor %%mm7, %%mm7 \n\t" // 0
        "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
        // plain "lea" so that the operand size follows REG_a / REG_c
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
        // line:  0    1     2        3         4        5     6        7         8        9
        // addr:  %0   eax   eax+%1   eax+2%1   %0+4%1   ecx   ecx+%1   ecx+2%1   %0+8%1   ecx+4%1
        "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
        "movq %%mm0, %%mm1 \n\t" // QP,..., QP
        "paddusb "MANGLE(b02)", %%mm0 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
        "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
        "movq (%0, %1, 4), %%mm2 \n\t" // line 4
        "movq (%%"REG_c"), %%mm3 \n\t" // line 5
        "movq %%mm2, %%mm4 \n\t" // line 4
        "pcmpeqb %%mm5, %%mm5 \n\t" // -1
        "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
        PAVGB(%%mm3, %%mm5)
        "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
        "psubusb %%mm3, %%mm4 \n\t"
        "psubusb %%mm2, %%mm3 \n\t"
        "por %%mm3, %%mm4 \n\t" // |l4 - l5|
        "psubusb %%mm0, %%mm4 \n\t"
        "pcmpeqb %%mm7, %%mm4 \n\t"
        "pand %%mm4, %%mm5 \n\t" // d/2

//      "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
        "paddb %%mm5, %%mm2 \n\t"
//      "psubb %%mm6, %%mm2 \n\t"
        "movq %%mm2, (%0,%1, 4) \n\t"

        "movq (%%"REG_c"), %%mm2 \n\t"
//      "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
        "psubb %%mm5, %%mm2 \n\t"
//      "psubb %%mm6, %%mm2 \n\t"
        "movq %%mm2, (%%"REG_c") \n\t"

        "paddb %%mm6, %%mm5 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "pand "MANGLE(b3F)", %%mm5 \n\t"
        "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8

        "movq (%%"REG_a", %1, 2), %%mm2 \n\t"
        "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
        "paddsb %%mm5, %%mm2 \n\t"
        "psubb %%mm6, %%mm2 \n\t"
        "movq %%mm2, (%%"REG_a", %1, 2) \n\t"

        "movq (%%"REG_c", %1), %%mm2 \n\t"
        "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
        "psubsb %%mm5, %%mm2 \n\t"
        "psubb %%mm6, %%mm2 \n\t"
        "movq %%mm2, (%%"REG_c", %1) \n\t"

        :
        : "r" (src), "r" ((long)stride)
        // "memory": the asm stores to the block through src
        : "%"REG_a, "%"REG_c, "memory"
        );
#else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
//  const int l7= stride + l6;
//  const int l8= stride + l7;
//  const int l9= stride + l8;
    int x;
    const int QP15= QP + (QP>>2); // 1.25*QP despite the name
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++)
    {
        const int v = (src[x+l5] - src[x+l4]);
        if(ABS(v) < QP15)
        {
            src[x+l3] +=v>>3;
            src[x+l4] +=v>>1;
            src[x+l5] -=v>>1;
            src[x+l6] -=v>>3;

        }
    }

#endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
}
#endif //0
474 |
|
---|
475 | /**
|
---|
476 | * Experimental Filter 1
|
---|
477 | * will not damage linear gradients
|
---|
478 | * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
|
---|
479 | * can only smooth blocks at the expected locations (it cant smooth them if they did move)
|
---|
480 | * MMX2 version does correct clipping C version doesnt
|
---|
481 | */
|
---|
482 | static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
|
---|
483 | {
|
---|
484 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
485 | src+= stride*3;
|
---|
486 |
|
---|
487 | asm volatile(
|
---|
488 | "pxor %%mm7, %%mm7 \n\t" // 0
|
---|
489 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
490 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
|
---|
491 | // 0 1 2 3 4 5 6 7 8 9
|
---|
492 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
|
---|
493 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
|
---|
494 | "movq (%0, %1, 4), %%mm1 \n\t" // line 4
|
---|
495 | "movq %%mm1, %%mm2 \n\t" // line 4
|
---|
496 | "psubusb %%mm0, %%mm1 \n\t"
|
---|
497 | "psubusb %%mm2, %%mm0 \n\t"
|
---|
498 | "por %%mm1, %%mm0 \n\t" // |l2 - l3|
|
---|
499 | "movq (%%"REG_c"), %%mm3 \n\t" // line 5
|
---|
500 | "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6
|
---|
501 | "movq %%mm3, %%mm5 \n\t" // line 5
|
---|
502 | "psubusb %%mm4, %%mm3 \n\t"
|
---|
503 | "psubusb %%mm5, %%mm4 \n\t"
|
---|
504 | "por %%mm4, %%mm3 \n\t" // |l5 - l6|
|
---|
505 | PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
|
---|
506 | "movq %%mm2, %%mm1 \n\t" // line 4
|
---|
507 | "psubusb %%mm5, %%mm2 \n\t"
|
---|
508 | "movq %%mm2, %%mm4 \n\t"
|
---|
509 | "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
|
---|
510 | "psubusb %%mm1, %%mm5 \n\t"
|
---|
511 | "por %%mm5, %%mm4 \n\t" // |l4 - l5|
|
---|
512 | "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
|
---|
513 | "movq %%mm4, %%mm3 \n\t" // d
|
---|
514 | "movq %2, %%mm0 \n\t"
|
---|
515 | "paddusb %%mm0, %%mm0 \n\t"
|
---|
516 | "psubusb %%mm0, %%mm4 \n\t"
|
---|
517 | "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
|
---|
518 | "psubusb "MANGLE(b01)", %%mm3 \n\t"
|
---|
519 | "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
|
---|
520 |
|
---|
521 | PAVGB(%%mm7, %%mm3) // d/2
|
---|
522 | "movq %%mm3, %%mm1 \n\t" // d/2
|
---|
523 | PAVGB(%%mm7, %%mm3) // d/4
|
---|
524 | PAVGB(%%mm1, %%mm3) // 3*d/8
|
---|
525 |
|
---|
526 | "movq (%0, %1, 4), %%mm0 \n\t" // line 4
|
---|
527 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
|
---|
528 | "psubusb %%mm3, %%mm0 \n\t"
|
---|
529 | "pxor %%mm2, %%mm0 \n\t"
|
---|
530 | "movq %%mm0, (%0, %1, 4) \n\t" // line 4
|
---|
531 |
|
---|
532 | "movq (%%"REG_c"), %%mm0 \n\t" // line 5
|
---|
533 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
|
---|
534 | "paddusb %%mm3, %%mm0 \n\t"
|
---|
535 | "pxor %%mm2, %%mm0 \n\t"
|
---|
536 | "movq %%mm0, (%%"REG_c") \n\t" // line 5
|
---|
537 |
|
---|
538 | PAVGB(%%mm7, %%mm1) // d/4
|
---|
539 |
|
---|
540 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
|
---|
541 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
|
---|
542 | "psubusb %%mm1, %%mm0 \n\t"
|
---|
543 | "pxor %%mm2, %%mm0 \n\t"
|
---|
544 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3
|
---|
545 |
|
---|
546 | "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6
|
---|
547 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
|
---|
548 | "paddusb %%mm1, %%mm0 \n\t"
|
---|
549 | "pxor %%mm2, %%mm0 \n\t"
|
---|
550 | "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6
|
---|
551 |
|
---|
552 | PAVGB(%%mm7, %%mm1) // d/8
|
---|
553 |
|
---|
554 | "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2
|
---|
555 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
|
---|
556 | "psubusb %%mm1, %%mm0 \n\t"
|
---|
557 | "pxor %%mm2, %%mm0 \n\t"
|
---|
558 | "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2
|
---|
559 |
|
---|
560 | "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7
|
---|
561 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
|
---|
562 | "paddusb %%mm1, %%mm0 \n\t"
|
---|
563 | "pxor %%mm2, %%mm0 \n\t"
|
---|
564 | "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7
|
---|
565 |
|
---|
566 | :
|
---|
567 | : "r" (src), "r" ((long)stride), "m" (co->pQPb)
|
---|
568 | : "%"REG_a, "%"REG_c
|
---|
569 | );
|
---|
570 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
571 |
|
---|
572 | const int l1= stride;
|
---|
573 | const int l2= stride + l1;
|
---|
574 | const int l3= stride + l2;
|
---|
575 | const int l4= stride + l3;
|
---|
576 | const int l5= stride + l4;
|
---|
577 | const int l6= stride + l5;
|
---|
578 | const int l7= stride + l6;
|
---|
579 | // const int l8= stride + l7;
|
---|
580 | // const int l9= stride + l8;
|
---|
581 | int x;
|
---|
582 |
|
---|
583 | src+= stride*3;
|
---|
584 | for(x=0; x<BLOCK_SIZE; x++)
|
---|
585 | {
|
---|
586 | int a= src[l3] - src[l4];
|
---|
587 | int b= src[l4] - src[l5];
|
---|
588 | int c= src[l5] - src[l6];
|
---|
589 |
|
---|
590 | int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
|
---|
591 | d= MAX(d, 0);
|
---|
592 |
|
---|
593 | if(d < co->QP*2)
|
---|
594 | {
|
---|
595 | int v = d * SIGN(-b);
|
---|
596 |
|
---|
597 | src[l2] +=v>>3;
|
---|
598 | src[l3] +=v>>2;
|
---|
599 | src[l4] +=(3*v)>>3;
|
---|
600 | src[l5] -=(3*v)>>3;
|
---|
601 | src[l6] -=v>>2;
|
---|
602 | src[l7] -=v>>3;
|
---|
603 |
|
---|
604 | }
|
---|
605 | src++;
|
---|
606 | }
|
---|
607 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
608 | }
|
---|
609 |
|
---|
610 | #ifndef HAVE_ALTIVEC
|
---|
611 | static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
|
---|
612 | {
|
---|
613 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
614 | /*
|
---|
615 | uint8_t tmp[16];
|
---|
616 | const int l1= stride;
|
---|
617 | const int l2= stride + l1;
|
---|
618 | const int l3= stride + l2;
|
---|
619 | const int l4= (int)tmp - (int)src - stride*3;
|
---|
620 | const int l5= (int)tmp - (int)src - stride*3 + 8;
|
---|
621 | const int l6= stride*3 + l3;
|
---|
622 | const int l7= stride + l6;
|
---|
623 | const int l8= stride + l7;
|
---|
624 |
|
---|
625 | memcpy(tmp, src+stride*7, 8);
|
---|
626 | memcpy(tmp+8, src+stride*8, 8);
|
---|
627 | */
|
---|
628 | src+= stride*4;
|
---|
629 | asm volatile(
|
---|
630 |
|
---|
631 | #if 0 //sligtly more accurate and slightly slower
|
---|
632 | "pxor %%mm7, %%mm7 \n\t" // 0
|
---|
633 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
634 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
|
---|
635 | // 0 1 2 3 4 5 6 7
|
---|
636 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
|
---|
637 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
|
---|
638 |
|
---|
639 |
|
---|
640 | "movq (%0, %1, 2), %%mm0 \n\t" // l2
|
---|
641 | "movq (%0), %%mm1 \n\t" // l0
|
---|
642 | "movq %%mm0, %%mm2 \n\t" // l2
|
---|
643 | PAVGB(%%mm7, %%mm0) // ~l2/2
|
---|
644 | PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
|
---|
645 | PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
|
---|
646 |
|
---|
647 | "movq (%%"REG_a"), %%mm1 \n\t" // l1
|
---|
648 | "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3
|
---|
649 | "movq %%mm1, %%mm4 \n\t" // l1
|
---|
650 | PAVGB(%%mm7, %%mm1) // ~l1/2
|
---|
651 | PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
|
---|
652 | PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
|
---|
653 |
|
---|
654 | "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
|
---|
655 | "psubusb %%mm1, %%mm0 \n\t"
|
---|
656 | "psubusb %%mm4, %%mm1 \n\t"
|
---|
657 | "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
|
---|
658 | // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
|
---|
659 |
|
---|
660 | "movq (%0, %1, 4), %%mm0 \n\t" // l4
|
---|
661 | "movq %%mm0, %%mm4 \n\t" // l4
|
---|
662 | PAVGB(%%mm7, %%mm0) // ~l4/2
|
---|
663 | PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
|
---|
664 | PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
|
---|
665 |
|
---|
666 | "movq (%%"REG_c"), %%mm2 \n\t" // l5
|
---|
667 | "movq %%mm3, %%mm5 \n\t" // l3
|
---|
668 | PAVGB(%%mm7, %%mm3) // ~l3/2
|
---|
669 | PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
|
---|
670 | PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
|
---|
671 |
|
---|
672 | "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
|
---|
673 | "psubusb %%mm3, %%mm0 \n\t"
|
---|
674 | "psubusb %%mm6, %%mm3 \n\t"
|
---|
675 | "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
|
---|
676 | "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
|
---|
677 | // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
|
---|
678 |
|
---|
679 | "movq (%%"REG_c", %1), %%mm6 \n\t" // l6
|
---|
680 | "movq %%mm6, %%mm5 \n\t" // l6
|
---|
681 | PAVGB(%%mm7, %%mm6) // ~l6/2
|
---|
682 | PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
|
---|
683 | PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
|
---|
684 |
|
---|
685 | "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7
|
---|
686 | "movq %%mm2, %%mm4 \n\t" // l5
|
---|
687 | PAVGB(%%mm7, %%mm2) // ~l5/2
|
---|
688 | PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
|
---|
689 | PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
|
---|
690 |
|
---|
691 | "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
|
---|
692 | "psubusb %%mm2, %%mm6 \n\t"
|
---|
693 | "psubusb %%mm4, %%mm2 \n\t"
|
---|
694 | "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
|
---|
695 | // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
|
---|
696 |
|
---|
697 |
|
---|
698 | PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
|
---|
699 | "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
|
---|
700 | "paddusb "MANGLE(b01)", %%mm4 \n\t"
|
---|
701 | "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
|
---|
702 | "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
|
---|
703 | "pand %%mm4, %%mm3 \n\t"
|
---|
704 |
|
---|
705 | "movq %%mm3, %%mm1 \n\t"
|
---|
706 | // "psubusb "MANGLE(b01)", %%mm3 \n\t"
|
---|
707 | PAVGB(%%mm7, %%mm3)
|
---|
708 | PAVGB(%%mm7, %%mm3)
|
---|
709 | "paddusb %%mm1, %%mm3 \n\t"
|
---|
710 | // "paddusb "MANGLE(b01)", %%mm3 \n\t"
|
---|
711 |
|
---|
712 | "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3
|
---|
713 | "movq (%0, %1, 4), %%mm5 \n\t" //l4
|
---|
714 | "movq (%0, %1, 4), %%mm4 \n\t" //l4
|
---|
715 | "psubusb %%mm6, %%mm5 \n\t"
|
---|
716 | "psubusb %%mm4, %%mm6 \n\t"
|
---|
717 | "por %%mm6, %%mm5 \n\t" // |l3-l4|
|
---|
718 | "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
|
---|
719 | "pxor %%mm6, %%mm0 \n\t"
|
---|
720 | "pand %%mm0, %%mm3 \n\t"
|
---|
721 | PMINUB(%%mm5, %%mm3, %%mm0)
|
---|
722 |
|
---|
723 | "psubusb "MANGLE(b01)", %%mm3 \n\t"
|
---|
724 | PAVGB(%%mm7, %%mm3)
|
---|
725 |
|
---|
726 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
|
---|
727 | "movq (%0, %1, 4), %%mm2 \n\t"
|
---|
728 | "pxor %%mm6, %%mm0 \n\t"
|
---|
729 | "pxor %%mm6, %%mm2 \n\t"
|
---|
730 | "psubb %%mm3, %%mm0 \n\t"
|
---|
731 | "paddb %%mm3, %%mm2 \n\t"
|
---|
732 | "pxor %%mm6, %%mm0 \n\t"
|
---|
733 | "pxor %%mm6, %%mm2 \n\t"
|
---|
734 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
|
---|
735 | "movq %%mm2, (%0, %1, 4) \n\t"
|
---|
736 | #endif //0
|
---|
737 |
|
---|
738 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
739 | "pcmpeqb %%mm6, %%mm6 \n\t" // -1
|
---|
740 | // 0 1 2 3 4 5 6 7
|
---|
741 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
|
---|
742 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
|
---|
743 |
|
---|
744 |
|
---|
745 | "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3
|
---|
746 | "movq (%0, %1, 4), %%mm0 \n\t" // l4
|
---|
747 | "pxor %%mm6, %%mm1 \n\t" // -l3-1
|
---|
748 | PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
|
---|
749 | // mm1=-l3-1, mm0=128-q
|
---|
750 |
|
---|
751 | "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5
|
---|
752 | "movq (%%"REG_a", %1), %%mm3 \n\t" // l2
|
---|
753 | "pxor %%mm6, %%mm2 \n\t" // -l5-1
|
---|
754 | "movq %%mm2, %%mm5 \n\t" // -l5-1
|
---|
755 | "movq "MANGLE(b80)", %%mm4 \n\t" // 128
|
---|
756 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
|
---|
757 | PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
|
---|
758 | PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
|
---|
759 | PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
|
---|
760 | PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
|
---|
761 | // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
|
---|
762 |
|
---|
763 | "movq (%%"REG_a"), %%mm2 \n\t" // l1
|
---|
764 | "pxor %%mm6, %%mm2 \n\t" // -l1-1
|
---|
765 | PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
|
---|
766 | PAVGB((%0), %%mm1) // (l0-l3+256)/2
|
---|
767 | "movq "MANGLE(b80)", %%mm3 \n\t" // 128
|
---|
768 | PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
|
---|
769 | PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
|
---|
770 | PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
|
---|
771 | // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
|
---|
772 |
|
---|
773 | PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2
|
---|
774 | "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7
|
---|
775 | "pxor %%mm6, %%mm1 \n\t" // -l7-1
|
---|
776 | PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
|
---|
777 | "movq "MANGLE(b80)", %%mm2 \n\t" // 128
|
---|
778 | PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
|
---|
779 | PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
|
---|
780 | PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
|
---|
781 | // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
|
---|
782 |
|
---|
783 | "movq "MANGLE(b00)", %%mm1 \n\t" // 0
|
---|
784 | "movq "MANGLE(b00)", %%mm5 \n\t" // 0
|
---|
785 | "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
|
---|
786 | "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
|
---|
787 | PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
|
---|
788 | PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
|
---|
789 | PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
|
---|
790 |
|
---|
791 | // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
|
---|
792 |
|
---|
793 | "movq "MANGLE(b00)", %%mm7 \n\t" // 0
|
---|
794 | "movq %2, %%mm2 \n\t" // QP
|
---|
795 | PAVGB(%%mm6, %%mm2) // 128 + QP/2
|
---|
796 | "psubb %%mm6, %%mm2 \n\t"
|
---|
797 |
|
---|
798 | "movq %%mm4, %%mm1 \n\t"
|
---|
799 | "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
|
---|
800 | "pxor %%mm1, %%mm4 \n\t"
|
---|
801 | "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
|
---|
802 | "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
|
---|
803 | "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
|
---|
804 | // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
|
---|
805 |
|
---|
806 | "movq %%mm4, %%mm3 \n\t" // d
|
---|
807 | "psubusb "MANGLE(b01)", %%mm4 \n\t"
|
---|
808 | PAVGB(%%mm7, %%mm4) // d/32
|
---|
809 | PAVGB(%%mm7, %%mm4) // (d + 32)/64
|
---|
810 | "paddb %%mm3, %%mm4 \n\t" // 5d/64
|
---|
811 | "pand %%mm2, %%mm4 \n\t"
|
---|
812 |
|
---|
813 | "movq "MANGLE(b80)", %%mm5 \n\t" // 128
|
---|
814 | "psubb %%mm0, %%mm5 \n\t" // q
|
---|
815 | "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
|
---|
816 | "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
|
---|
817 | "pxor %%mm7, %%mm5 \n\t"
|
---|
818 |
|
---|
819 | PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
|
---|
820 | "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
|
---|
821 |
|
---|
822 | "pand %%mm7, %%mm4 \n\t"
|
---|
823 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
|
---|
824 | "movq (%0, %1, 4), %%mm2 \n\t"
|
---|
825 | "pxor %%mm1, %%mm0 \n\t"
|
---|
826 | "pxor %%mm1, %%mm2 \n\t"
|
---|
827 | "paddb %%mm4, %%mm0 \n\t"
|
---|
828 | "psubb %%mm4, %%mm2 \n\t"
|
---|
829 | "pxor %%mm1, %%mm0 \n\t"
|
---|
830 | "pxor %%mm1, %%mm2 \n\t"
|
---|
831 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
|
---|
832 | "movq %%mm2, (%0, %1, 4) \n\t"
|
---|
833 |
|
---|
834 | :
|
---|
835 | : "r" (src), "r" ((long)stride), "m" (c->pQPb)
|
---|
836 | : "%"REG_a, "%"REG_c
|
---|
837 | );
|
---|
838 |
|
---|
839 | /*
|
---|
840 | {
|
---|
841 | int x;
|
---|
842 | src-= stride;
|
---|
843 | for(x=0; x<BLOCK_SIZE; x++)
|
---|
844 | {
|
---|
845 | const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
|
---|
846 | if(ABS(middleEnergy)< 8*QP)
|
---|
847 | {
|
---|
848 | const int q=(src[l4] - src[l5])/2;
|
---|
849 | const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
|
---|
850 | const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
|
---|
851 |
|
---|
852 | int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
|
---|
853 | d= MAX(d, 0);
|
---|
854 |
|
---|
855 | d= (5*d + 32) >> 6;
|
---|
856 | d*= SIGN(-middleEnergy);
|
---|
857 |
|
---|
858 | if(q>0)
|
---|
859 | {
|
---|
860 | d= d<0 ? 0 : d;
|
---|
861 | d= d>q ? q : d;
|
---|
862 | }
|
---|
863 | else
|
---|
864 | {
|
---|
865 | d= d>0 ? 0 : d;
|
---|
866 | d= d<q ? q : d;
|
---|
867 | }
|
---|
868 |
|
---|
869 | src[l4]-= d;
|
---|
870 | src[l5]+= d;
|
---|
871 | }
|
---|
872 | src++;
|
---|
873 | }
|
---|
874 | src-=8;
|
---|
875 | for(x=0; x<8; x++)
|
---|
876 | {
|
---|
877 | int y;
|
---|
878 | for(y=4; y<6; y++)
|
---|
879 | {
|
---|
880 | int d= src[x+y*stride] - tmp[x+(y-4)*8];
|
---|
881 | int ad= ABS(d);
|
---|
882 | static int max=0;
|
---|
883 | static int sum=0;
|
---|
884 | static int num=0;
|
---|
885 | static int bias=0;
|
---|
886 |
|
---|
887 | if(max<ad) max=ad;
|
---|
888 | sum+= ad>3 ? 1 : 0;
|
---|
889 | if(ad>3)
|
---|
890 | {
|
---|
891 | src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
|
---|
892 | }
|
---|
893 | if(y==4) bias+=d;
|
---|
894 | num++;
|
---|
895 | if(num%1000000 == 0)
|
---|
896 | {
|
---|
897 | printf(" %d %d %d %d\n", num, sum, max, bias);
|
---|
898 | }
|
---|
899 | }
|
---|
900 | }
|
---|
901 | }
|
---|
902 | */
|
---|
903 | #elif defined (HAVE_MMX)
|
---|
904 | src+= stride*4;
|
---|
905 | asm volatile(
|
---|
906 | "pxor %%mm7, %%mm7 \n\t"
|
---|
907 | "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
|
---|
908 | "and "ALIGN_MASK", %%"REG_c" \n\t" // align
|
---|
909 | // 0 1 2 3 4 5 6 7
|
---|
910 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
|
---|
911 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
|
---|
912 |
|
---|
913 | "movq (%0), %%mm0 \n\t"
|
---|
914 | "movq %%mm0, %%mm1 \n\t"
|
---|
915 | "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
|
---|
916 | "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
|
---|
917 |
|
---|
918 | "movq (%0, %1), %%mm2 \n\t"
|
---|
919 | "lea (%0, %1, 2), %%"REG_a" \n\t"
|
---|
920 | "movq %%mm2, %%mm3 \n\t"
|
---|
921 | "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
|
---|
922 | "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
|
---|
923 |
|
---|
924 | "movq (%%"REG_a"), %%mm4 \n\t"
|
---|
925 | "movq %%mm4, %%mm5 \n\t"
|
---|
926 | "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
|
---|
927 | "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
|
---|
928 |
|
---|
929 | "paddw %%mm0, %%mm0 \n\t" // 2L0
|
---|
930 | "paddw %%mm1, %%mm1 \n\t" // 2H0
|
---|
931 | "psubw %%mm4, %%mm2 \n\t" // L1 - L2
|
---|
932 | "psubw %%mm5, %%mm3 \n\t" // H1 - H2
|
---|
933 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
|
---|
934 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
|
---|
935 |
|
---|
936 | "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
|
---|
937 | "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
|
---|
938 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
|
---|
939 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
|
---|
940 |
|
---|
941 | "movq (%%"REG_a", %1), %%mm2 \n\t"
|
---|
942 | "movq %%mm2, %%mm3 \n\t"
|
---|
943 | "punpcklbw %%mm7, %%mm2 \n\t" // L3
|
---|
944 | "punpckhbw %%mm7, %%mm3 \n\t" // H3
|
---|
945 |
|
---|
946 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
|
---|
947 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
|
---|
948 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
|
---|
949 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
|
---|
950 | "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
|
---|
951 | "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
|
---|
952 |
|
---|
953 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
|
---|
954 | "movq %%mm0, %%mm1 \n\t"
|
---|
955 | "punpcklbw %%mm7, %%mm0 \n\t" // L4
|
---|
956 | "punpckhbw %%mm7, %%mm1 \n\t" // H4
|
---|
957 |
|
---|
958 | "psubw %%mm0, %%mm2 \n\t" // L3 - L4
|
---|
959 | "psubw %%mm1, %%mm3 \n\t" // H3 - H4
|
---|
960 | "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
|
---|
961 | "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
|
---|
962 | "paddw %%mm4, %%mm4 \n\t" // 2L2
|
---|
963 | "paddw %%mm5, %%mm5 \n\t" // 2H2
|
---|
964 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
|
---|
965 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
|
---|
966 |
|
---|
967 | "lea (%%"REG_a", %1), %0 \n\t"
|
---|
968 | "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
|
---|
969 | "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
|
---|
970 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
|
---|
971 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
|
---|
972 | //50 opcodes so far
|
---|
973 | "movq (%0, %1, 2), %%mm2 \n\t"
|
---|
974 | "movq %%mm2, %%mm3 \n\t"
|
---|
975 | "punpcklbw %%mm7, %%mm2 \n\t" // L5
|
---|
976 | "punpckhbw %%mm7, %%mm3 \n\t" // H5
|
---|
977 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
|
---|
978 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
|
---|
979 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
|
---|
980 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
|
---|
981 |
|
---|
982 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
|
---|
983 | "punpcklbw %%mm7, %%mm6 \n\t" // L6
|
---|
984 | "psubw %%mm6, %%mm2 \n\t" // L5 - L6
|
---|
985 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
|
---|
986 | "punpckhbw %%mm7, %%mm6 \n\t" // H6
|
---|
987 | "psubw %%mm6, %%mm3 \n\t" // H5 - H6
|
---|
988 |
|
---|
989 | "paddw %%mm0, %%mm0 \n\t" // 2L4
|
---|
990 | "paddw %%mm1, %%mm1 \n\t" // 2H4
|
---|
991 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
|
---|
992 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
|
---|
993 |
|
---|
994 | "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
|
---|
995 | "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
|
---|
996 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
|
---|
997 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
|
---|
998 |
|
---|
999 | "movq (%0, %1, 4), %%mm2 \n\t"
|
---|
1000 | "movq %%mm2, %%mm3 \n\t"
|
---|
1001 | "punpcklbw %%mm7, %%mm2 \n\t" // L7
|
---|
1002 | "punpckhbw %%mm7, %%mm3 \n\t" // H7
|
---|
1003 |
|
---|
1004 | "paddw %%mm2, %%mm2 \n\t" // 2L7
|
---|
1005 | "paddw %%mm3, %%mm3 \n\t" // 2H7
|
---|
1006 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
|
---|
1007 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
|
---|
1008 |
|
---|
1009 | "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
|
---|
1010 | "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
|
---|
1011 |
|
---|
1012 | #ifdef HAVE_MMX2
|
---|
1013 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1014 | "psubw %%mm0, %%mm6 \n\t"
|
---|
1015 | "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
|
---|
1016 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1017 | "psubw %%mm1, %%mm6 \n\t"
|
---|
1018 | "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
|
---|
1019 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1020 | "psubw %%mm2, %%mm6 \n\t"
|
---|
1021 | "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
|
---|
1022 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1023 | "psubw %%mm3, %%mm6 \n\t"
|
---|
1024 | "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
|
---|
1025 | #else
|
---|
1026 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1027 | "pcmpgtw %%mm0, %%mm6 \n\t"
|
---|
1028 | "pxor %%mm6, %%mm0 \n\t"
|
---|
1029 | "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
|
---|
1030 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1031 | "pcmpgtw %%mm1, %%mm6 \n\t"
|
---|
1032 | "pxor %%mm6, %%mm1 \n\t"
|
---|
1033 | "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
|
---|
1034 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1035 | "pcmpgtw %%mm2, %%mm6 \n\t"
|
---|
1036 | "pxor %%mm6, %%mm2 \n\t"
|
---|
1037 | "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
|
---|
1038 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1039 | "pcmpgtw %%mm3, %%mm6 \n\t"
|
---|
1040 | "pxor %%mm6, %%mm3 \n\t"
|
---|
1041 | "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
|
---|
1042 | #endif
|
---|
1043 |
|
---|
1044 | #ifdef HAVE_MMX2
|
---|
1045 | "pminsw %%mm2, %%mm0 \n\t"
|
---|
1046 | "pminsw %%mm3, %%mm1 \n\t"
|
---|
1047 | #else
|
---|
1048 | "movq %%mm0, %%mm6 \n\t"
|
---|
1049 | "psubusw %%mm2, %%mm6 \n\t"
|
---|
1050 | "psubw %%mm6, %%mm0 \n\t"
|
---|
1051 | "movq %%mm1, %%mm6 \n\t"
|
---|
1052 | "psubusw %%mm3, %%mm6 \n\t"
|
---|
1053 | "psubw %%mm6, %%mm1 \n\t"
|
---|
1054 | #endif
|
---|
1055 |
|
---|
1056 | "movd %2, %%mm2 \n\t" // QP
|
---|
1057 | "punpcklbw %%mm7, %%mm2 \n\t"
|
---|
1058 |
|
---|
1059 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
1060 | "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
|
---|
1061 | "pxor %%mm6, %%mm4 \n\t"
|
---|
1062 | "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
|
---|
1063 | "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
|
---|
1064 | "pxor %%mm7, %%mm5 \n\t"
|
---|
1065 | "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
|
---|
1066 | // 100 opcodes
|
---|
1067 | "psllw $3, %%mm2 \n\t" // 8QP
|
---|
1068 | "movq %%mm2, %%mm3 \n\t" // 8QP
|
---|
1069 | "pcmpgtw %%mm4, %%mm2 \n\t"
|
---|
1070 | "pcmpgtw %%mm5, %%mm3 \n\t"
|
---|
1071 | "pand %%mm2, %%mm4 \n\t"
|
---|
1072 | "pand %%mm3, %%mm5 \n\t"
|
---|
1073 |
|
---|
1074 |
|
---|
1075 | "psubusw %%mm0, %%mm4 \n\t" // hd
|
---|
1076 | "psubusw %%mm1, %%mm5 \n\t" // ld
|
---|
1077 |
|
---|
1078 |
|
---|
1079 | "movq "MANGLE(w05)", %%mm2 \n\t" // 5
|
---|
1080 | "pmullw %%mm2, %%mm4 \n\t"
|
---|
1081 | "pmullw %%mm2, %%mm5 \n\t"
|
---|
1082 | "movq "MANGLE(w20)", %%mm2 \n\t" // 32
|
---|
1083 | "paddw %%mm2, %%mm4 \n\t"
|
---|
1084 | "paddw %%mm2, %%mm5 \n\t"
|
---|
1085 | "psrlw $6, %%mm4 \n\t"
|
---|
1086 | "psrlw $6, %%mm5 \n\t"
|
---|
1087 |
|
---|
1088 | "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
|
---|
1089 | "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
|
---|
1090 |
|
---|
1091 | "pxor %%mm2, %%mm2 \n\t"
|
---|
1092 | "pxor %%mm3, %%mm3 \n\t"
|
---|
1093 |
|
---|
1094 | "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
|
---|
1095 | "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
|
---|
1096 | "pxor %%mm2, %%mm0 \n\t"
|
---|
1097 | "pxor %%mm3, %%mm1 \n\t"
|
---|
1098 | "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
|
---|
1099 | "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
|
---|
1100 | "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
|
---|
1101 | "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
|
---|
1102 |
|
---|
1103 | "pxor %%mm6, %%mm2 \n\t"
|
---|
1104 | "pxor %%mm7, %%mm3 \n\t"
|
---|
1105 | "pand %%mm2, %%mm4 \n\t"
|
---|
1106 | "pand %%mm3, %%mm5 \n\t"
|
---|
1107 |
|
---|
1108 | #ifdef HAVE_MMX2
|
---|
1109 | "pminsw %%mm0, %%mm4 \n\t"
|
---|
1110 | "pminsw %%mm1, %%mm5 \n\t"
|
---|
1111 | #else
|
---|
1112 | "movq %%mm4, %%mm2 \n\t"
|
---|
1113 | "psubusw %%mm0, %%mm2 \n\t"
|
---|
1114 | "psubw %%mm2, %%mm4 \n\t"
|
---|
1115 | "movq %%mm5, %%mm2 \n\t"
|
---|
1116 | "psubusw %%mm1, %%mm2 \n\t"
|
---|
1117 | "psubw %%mm2, %%mm5 \n\t"
|
---|
1118 | #endif
|
---|
1119 | "pxor %%mm6, %%mm4 \n\t"
|
---|
1120 | "pxor %%mm7, %%mm5 \n\t"
|
---|
1121 | "psubw %%mm6, %%mm4 \n\t"
|
---|
1122 | "psubw %%mm7, %%mm5 \n\t"
|
---|
1123 | "packsswb %%mm5, %%mm4 \n\t"
|
---|
1124 | "movq (%0), %%mm0 \n\t"
|
---|
1125 | "paddb %%mm4, %%mm0 \n\t"
|
---|
1126 | "movq %%mm0, (%0) \n\t"
|
---|
1127 | "movq (%0, %1), %%mm0 \n\t"
|
---|
1128 | "psubb %%mm4, %%mm0 \n\t"
|
---|
1129 | "movq %%mm0, (%0, %1) \n\t"
|
---|
1130 |
|
---|
1131 | : "+r" (src)
|
---|
1132 | : "r" ((long)stride), "m" (c->pQPb)
|
---|
1133 | : "%"REG_a, "%"REG_c
|
---|
1134 | );
|
---|
1135 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1136 | const int l1= stride;
|
---|
1137 | const int l2= stride + l1;
|
---|
1138 | const int l3= stride + l2;
|
---|
1139 | const int l4= stride + l3;
|
---|
1140 | const int l5= stride + l4;
|
---|
1141 | const int l6= stride + l5;
|
---|
1142 | const int l7= stride + l6;
|
---|
1143 | const int l8= stride + l7;
|
---|
1144 | // const int l9= stride + l8;
|
---|
1145 | int x;
|
---|
1146 | src+= stride*3;
|
---|
1147 | for(x=0; x<BLOCK_SIZE; x++)
|
---|
1148 | {
|
---|
1149 | const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
|
---|
1150 | if(ABS(middleEnergy) < 8*c->QP)
|
---|
1151 | {
|
---|
1152 | const int q=(src[l4] - src[l5])/2;
|
---|
1153 | const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
|
---|
1154 | const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
|
---|
1155 |
|
---|
1156 | int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
|
---|
1157 | d= MAX(d, 0);
|
---|
1158 |
|
---|
1159 | d= (5*d + 32) >> 6;
|
---|
1160 | d*= SIGN(-middleEnergy);
|
---|
1161 |
|
---|
1162 | if(q>0)
|
---|
1163 | {
|
---|
1164 | d= d<0 ? 0 : d;
|
---|
1165 | d= d>q ? q : d;
|
---|
1166 | }
|
---|
1167 | else
|
---|
1168 | {
|
---|
1169 | d= d>0 ? 0 : d;
|
---|
1170 | d= d<q ? q : d;
|
---|
1171 | }
|
---|
1172 |
|
---|
1173 | src[l4]-= d;
|
---|
1174 | src[l5]+= d;
|
---|
1175 | }
|
---|
1176 | src++;
|
---|
1177 | }
|
---|
1178 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1179 | }
|
---|
1180 | #endif //HAVE_ALTIVEC
|
---|
1181 |
|
---|
1182 | #ifndef HAVE_ALTIVEC
|
---|
1183 | static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
|
---|
1184 | {
|
---|
1185 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1186 | asm volatile(
|
---|
1187 | "pxor %%mm6, %%mm6 \n\t"
|
---|
1188 | "pcmpeqb %%mm7, %%mm7 \n\t"
|
---|
1189 | "movq %2, %%mm0 \n\t"
|
---|
1190 | "punpcklbw %%mm6, %%mm0 \n\t"
|
---|
1191 | "psrlw $1, %%mm0 \n\t"
|
---|
1192 | "psubw %%mm7, %%mm0 \n\t"
|
---|
1193 | "packuswb %%mm0, %%mm0 \n\t"
|
---|
1194 | "movq %%mm0, %3 \n\t"
|
---|
1195 |
|
---|
1196 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
1197 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
1198 |
|
---|
1199 | // 0 1 2 3 4 5 6 7 8 9
|
---|
1200 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
|
---|
1201 |
|
---|
1202 | #undef FIND_MIN_MAX
|
---|
1203 | #ifdef HAVE_MMX2
|
---|
1204 | #define REAL_FIND_MIN_MAX(addr)\
|
---|
1205 | "movq " #addr ", %%mm0 \n\t"\
|
---|
1206 | "pminub %%mm0, %%mm7 \n\t"\
|
---|
1207 | "pmaxub %%mm0, %%mm6 \n\t"
|
---|
1208 | #else
|
---|
1209 | #define REAL_FIND_MIN_MAX(addr)\
|
---|
1210 | "movq " #addr ", %%mm0 \n\t"\
|
---|
1211 | "movq %%mm7, %%mm1 \n\t"\
|
---|
1212 | "psubusb %%mm0, %%mm6 \n\t"\
|
---|
1213 | "paddb %%mm0, %%mm6 \n\t"\
|
---|
1214 | "psubusb %%mm0, %%mm1 \n\t"\
|
---|
1215 | "psubb %%mm1, %%mm7 \n\t"
|
---|
1216 | #endif
|
---|
1217 | #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
|
---|
1218 |
|
---|
1219 | FIND_MIN_MAX((%%REGa))
|
---|
1220 | FIND_MIN_MAX((%%REGa, %1))
|
---|
1221 | FIND_MIN_MAX((%%REGa, %1, 2))
|
---|
1222 | FIND_MIN_MAX((%0, %1, 4))
|
---|
1223 | FIND_MIN_MAX((%%REGd))
|
---|
1224 | FIND_MIN_MAX((%%REGd, %1))
|
---|
1225 | FIND_MIN_MAX((%%REGd, %1, 2))
|
---|
1226 | FIND_MIN_MAX((%0, %1, 8))
|
---|
1227 |
|
---|
1228 | "movq %%mm7, %%mm4 \n\t"
|
---|
1229 | "psrlq $8, %%mm7 \n\t"
|
---|
1230 | #ifdef HAVE_MMX2
|
---|
1231 | "pminub %%mm4, %%mm7 \n\t" // min of pixels
|
---|
1232 | "pshufw $0xF9, %%mm7, %%mm4 \n\t"
|
---|
1233 | "pminub %%mm4, %%mm7 \n\t" // min of pixels
|
---|
1234 | "pshufw $0xFE, %%mm7, %%mm4 \n\t"
|
---|
1235 | "pminub %%mm4, %%mm7 \n\t"
|
---|
1236 | #else
|
---|
1237 | "movq %%mm7, %%mm1 \n\t"
|
---|
1238 | "psubusb %%mm4, %%mm1 \n\t"
|
---|
1239 | "psubb %%mm1, %%mm7 \n\t"
|
---|
1240 | "movq %%mm7, %%mm4 \n\t"
|
---|
1241 | "psrlq $16, %%mm7 \n\t"
|
---|
1242 | "movq %%mm7, %%mm1 \n\t"
|
---|
1243 | "psubusb %%mm4, %%mm1 \n\t"
|
---|
1244 | "psubb %%mm1, %%mm7 \n\t"
|
---|
1245 | "movq %%mm7, %%mm4 \n\t"
|
---|
1246 | "psrlq $32, %%mm7 \n\t"
|
---|
1247 | "movq %%mm7, %%mm1 \n\t"
|
---|
1248 | "psubusb %%mm4, %%mm1 \n\t"
|
---|
1249 | "psubb %%mm1, %%mm7 \n\t"
|
---|
1250 | #endif
|
---|
1251 |
|
---|
1252 |
|
---|
1253 | "movq %%mm6, %%mm4 \n\t"
|
---|
1254 | "psrlq $8, %%mm6 \n\t"
|
---|
1255 | #ifdef HAVE_MMX2
|
---|
1256 | "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
|
---|
1257 | "pshufw $0xF9, %%mm6, %%mm4 \n\t"
|
---|
1258 | "pmaxub %%mm4, %%mm6 \n\t"
|
---|
1259 | "pshufw $0xFE, %%mm6, %%mm4 \n\t"
|
---|
1260 | "pmaxub %%mm4, %%mm6 \n\t"
|
---|
1261 | #else
|
---|
1262 | "psubusb %%mm4, %%mm6 \n\t"
|
---|
1263 | "paddb %%mm4, %%mm6 \n\t"
|
---|
1264 | "movq %%mm6, %%mm4 \n\t"
|
---|
1265 | "psrlq $16, %%mm6 \n\t"
|
---|
1266 | "psubusb %%mm4, %%mm6 \n\t"
|
---|
1267 | "paddb %%mm4, %%mm6 \n\t"
|
---|
1268 | "movq %%mm6, %%mm4 \n\t"
|
---|
1269 | "psrlq $32, %%mm6 \n\t"
|
---|
1270 | "psubusb %%mm4, %%mm6 \n\t"
|
---|
1271 | "paddb %%mm4, %%mm6 \n\t"
|
---|
1272 | #endif
|
---|
1273 | "movq %%mm6, %%mm0 \n\t" // max
|
---|
1274 | "psubb %%mm7, %%mm6 \n\t" // max - min
|
---|
1275 | "movd %%mm6, %%ecx \n\t"
|
---|
1276 | "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
|
---|
1277 | " jb 1f \n\t"
|
---|
1278 | "lea -24(%%"REG_SP"), %%"REG_c" \n\t"
|
---|
1279 | "and "ALIGN_MASK", %%"REG_c" \n\t"
|
---|
1280 | PAVGB(%%mm0, %%mm7) // a=(max + min)/2
|
---|
1281 | "punpcklbw %%mm7, %%mm7 \n\t"
|
---|
1282 | "punpcklbw %%mm7, %%mm7 \n\t"
|
---|
1283 | "punpcklbw %%mm7, %%mm7 \n\t"
|
---|
1284 | "movq %%mm7, (%%"REG_c") \n\t"
|
---|
1285 |
|
---|
1286 | "movq (%0), %%mm0 \n\t" // L10
|
---|
1287 | "movq %%mm0, %%mm1 \n\t" // L10
|
---|
1288 | "movq %%mm0, %%mm2 \n\t" // L10
|
---|
1289 | "psllq $8, %%mm1 \n\t"
|
---|
1290 | "psrlq $8, %%mm2 \n\t"
|
---|
1291 | "movd -4(%0), %%mm3 \n\t"
|
---|
1292 | "movd 8(%0), %%mm4 \n\t"
|
---|
1293 | "psrlq $24, %%mm3 \n\t"
|
---|
1294 | "psllq $56, %%mm4 \n\t"
|
---|
1295 | "por %%mm3, %%mm1 \n\t" // L00
|
---|
1296 | "por %%mm4, %%mm2 \n\t" // L20
|
---|
1297 | "movq %%mm1, %%mm3 \n\t" // L00
|
---|
1298 | PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
|
---|
1299 | PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
|
---|
1300 | "psubusb %%mm7, %%mm0 \n\t"
|
---|
1301 | "psubusb %%mm7, %%mm2 \n\t"
|
---|
1302 | "psubusb %%mm7, %%mm3 \n\t"
|
---|
1303 | "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
|
---|
1304 | "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
|
---|
1305 | "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
|
---|
1306 | "paddb %%mm2, %%mm0 \n\t"
|
---|
1307 | "paddb %%mm3, %%mm0 \n\t"
|
---|
1308 |
|
---|
1309 | "movq (%%"REG_a"), %%mm2 \n\t" // L11
|
---|
1310 | "movq %%mm2, %%mm3 \n\t" // L11
|
---|
1311 | "movq %%mm2, %%mm4 \n\t" // L11
|
---|
1312 | "psllq $8, %%mm3 \n\t"
|
---|
1313 | "psrlq $8, %%mm4 \n\t"
|
---|
1314 | "movd -4(%%"REG_a"), %%mm5 \n\t"
|
---|
1315 | "movd 8(%%"REG_a"), %%mm6 \n\t"
|
---|
1316 | "psrlq $24, %%mm5 \n\t"
|
---|
1317 | "psllq $56, %%mm6 \n\t"
|
---|
1318 | "por %%mm5, %%mm3 \n\t" // L01
|
---|
1319 | "por %%mm6, %%mm4 \n\t" // L21
|
---|
1320 | "movq %%mm3, %%mm5 \n\t" // L01
|
---|
1321 | PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
|
---|
1322 | PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
|
---|
1323 | "psubusb %%mm7, %%mm2 \n\t"
|
---|
1324 | "psubusb %%mm7, %%mm4 \n\t"
|
---|
1325 | "psubusb %%mm7, %%mm5 \n\t"
|
---|
1326 | "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
|
---|
1327 | "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
|
---|
1328 | "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
|
---|
1329 | "paddb %%mm4, %%mm2 \n\t"
|
---|
1330 | "paddb %%mm5, %%mm2 \n\t"
|
---|
1331 | // 0, 2, 3, 1
|
---|
1332 | #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
|
---|
1333 | "movq " #src ", " #sx " \n\t" /* src[0] */\
|
---|
1334 | "movq " #sx ", " #lx " \n\t" /* src[0] */\
|
---|
1335 | "movq " #sx ", " #t0 " \n\t" /* src[0] */\
|
---|
1336 | "psllq $8, " #lx " \n\t"\
|
---|
1337 | "psrlq $8, " #t0 " \n\t"\
|
---|
1338 | "movd -4" #src ", " #t1 " \n\t"\
|
---|
1339 | "psrlq $24, " #t1 " \n\t"\
|
---|
1340 | "por " #t1 ", " #lx " \n\t" /* src[-1] */\
|
---|
1341 | "movd 8" #src ", " #t1 " \n\t"\
|
---|
1342 | "psllq $56, " #t1 " \n\t"\
|
---|
1343 | "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
|
---|
1344 | "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
|
---|
1345 | PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
|
---|
1346 | PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
|
---|
1347 | PAVGB(lx, pplx) \
|
---|
1348 | "movq " #lx ", 8(%%"REG_c") \n\t"\
|
---|
1349 | "movq (%%"REG_c"), " #lx " \n\t"\
|
---|
1350 | "psubusb " #lx ", " #t1 " \n\t"\
|
---|
1351 | "psubusb " #lx ", " #t0 " \n\t"\
|
---|
1352 | "psubusb " #lx ", " #sx " \n\t"\
|
---|
1353 | "movq "MANGLE(b00)", " #lx " \n\t"\
|
---|
1354 | "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
|
---|
1355 | "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
|
---|
1356 | "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
|
---|
1357 | "paddb " #t1 ", " #t0 " \n\t"\
|
---|
1358 | "paddb " #t0 ", " #sx " \n\t"\
|
---|
1359 | \
|
---|
1360 | PAVGB(plx, pplx) /* filtered */\
|
---|
1361 | "movq " #dst ", " #t0 " \n\t" /* dst */\
|
---|
1362 | "movq " #t0 ", " #t1 " \n\t" /* dst */\
|
---|
1363 | "psubusb %3, " #t0 " \n\t"\
|
---|
1364 | "paddusb %3, " #t1 " \n\t"\
|
---|
1365 | PMAXUB(t0, pplx)\
|
---|
1366 | PMINUB(t1, pplx, t0)\
|
---|
1367 | "paddb " #sx ", " #ppsx " \n\t"\
|
---|
1368 | "paddb " #psx ", " #ppsx " \n\t"\
|
---|
1369 | "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
|
---|
1370 | "pand "MANGLE(b08)", " #ppsx " \n\t"\
|
---|
1371 | "pcmpeqb " #lx ", " #ppsx " \n\t"\
|
---|
1372 | "pand " #ppsx ", " #pplx " \n\t"\
|
---|
1373 | "pandn " #dst ", " #ppsx " \n\t"\
|
---|
1374 | "por " #pplx ", " #ppsx " \n\t"\
|
---|
1375 | "movq " #ppsx ", " #dst " \n\t"\
|
---|
1376 | "movq 8(%%"REG_c"), " #lx " \n\t"
|
---|
1377 |
|
---|
1378 | #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
|
---|
1379 | REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
|
---|
1380 | /*
|
---|
1381 | 0000000
|
---|
1382 | 1111111
|
---|
1383 |
|
---|
1384 | 1111110
|
---|
1385 | 1111101
|
---|
1386 | 1111100
|
---|
1387 | 1111011
|
---|
1388 | 1111010
|
---|
1389 | 1111001
|
---|
1390 |
|
---|
1391 | 1111000
|
---|
1392 | 1110111
|
---|
1393 |
|
---|
1394 | */
|
---|
1395 | //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
|
---|
1396 | DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
|
---|
1397 | DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
|
---|
1398 | DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
|
---|
1399 | DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
|
---|
1400 | DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
|
---|
1401 | DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
|
---|
1402 | DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
|
---|
1403 | DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
|
---|
1404 |
|
---|
1405 | "1: \n\t"
|
---|
1406 | : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
|
---|
1407 | : "%"REG_a, "%"REG_d, "%"REG_c
|
---|
1408 | );
|
---|
1409 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1410 | int y;
|
---|
1411 | int min=255;
|
---|
1412 | int max=0;
|
---|
1413 | int avg;
|
---|
1414 | uint8_t *p;
|
---|
1415 | int s[10];
|
---|
1416 | const int QP2= c->QP/2 + 1;
|
---|
1417 |
|
---|
1418 | for(y=1; y<9; y++)
|
---|
1419 | {
|
---|
1420 | int x;
|
---|
1421 | p= src + stride*y;
|
---|
1422 | for(x=1; x<9; x++)
|
---|
1423 | {
|
---|
1424 | p++;
|
---|
1425 | if(*p > max) max= *p;
|
---|
1426 | if(*p < min) min= *p;
|
---|
1427 | }
|
---|
1428 | }
|
---|
1429 | avg= (min + max + 1)>>1;
|
---|
1430 |
|
---|
1431 | if(max - min <deringThreshold) return;
|
---|
1432 |
|
---|
1433 | for(y=0; y<10; y++)
|
---|
1434 | {
|
---|
1435 | int t = 0;
|
---|
1436 |
|
---|
1437 | if(src[stride*y + 0] > avg) t+= 1;
|
---|
1438 | if(src[stride*y + 1] > avg) t+= 2;
|
---|
1439 | if(src[stride*y + 2] > avg) t+= 4;
|
---|
1440 | if(src[stride*y + 3] > avg) t+= 8;
|
---|
1441 | if(src[stride*y + 4] > avg) t+= 16;
|
---|
1442 | if(src[stride*y + 5] > avg) t+= 32;
|
---|
1443 | if(src[stride*y + 6] > avg) t+= 64;
|
---|
1444 | if(src[stride*y + 7] > avg) t+= 128;
|
---|
1445 | if(src[stride*y + 8] > avg) t+= 256;
|
---|
1446 | if(src[stride*y + 9] > avg) t+= 512;
|
---|
1447 |
|
---|
1448 | t |= (~t)<<16;
|
---|
1449 | t &= (t<<1) & (t>>1);
|
---|
1450 | s[y] = t;
|
---|
1451 | }
|
---|
1452 |
|
---|
1453 | for(y=1; y<9; y++)
|
---|
1454 | {
|
---|
1455 | int t = s[y-1] & s[y] & s[y+1];
|
---|
1456 | t|= t>>16;
|
---|
1457 | s[y-1]= t;
|
---|
1458 | }
|
---|
1459 |
|
---|
1460 | for(y=1; y<9; y++)
|
---|
1461 | {
|
---|
1462 | int x;
|
---|
1463 | int t = s[y-1];
|
---|
1464 |
|
---|
1465 | p= src + stride*y;
|
---|
1466 | for(x=1; x<9; x++)
|
---|
1467 | {
|
---|
1468 | p++;
|
---|
1469 | if(t & (1<<x))
|
---|
1470 | {
|
---|
1471 | int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
|
---|
1472 | +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
|
---|
1473 | +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
|
---|
1474 | f= (f + 8)>>4;
|
---|
1475 |
|
---|
1476 | #ifdef DEBUG_DERING_THRESHOLD
|
---|
1477 | asm volatile("emms\n\t":);
|
---|
1478 | {
|
---|
1479 | static long long numPixels=0;
|
---|
1480 | if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
|
---|
1481 | // if((max-min)<20 || (max-min)*QP<200)
|
---|
1482 | // if((max-min)*QP < 500)
|
---|
1483 | // if(max-min<QP/2)
|
---|
1484 | if(max-min < 20)
|
---|
1485 | {
|
---|
1486 | static int numSkiped=0;
|
---|
1487 | static int errorSum=0;
|
---|
1488 | static int worstQP=0;
|
---|
1489 | static int worstRange=0;
|
---|
1490 | static int worstDiff=0;
|
---|
1491 | int diff= (f - *p);
|
---|
1492 | int absDiff= ABS(diff);
|
---|
1493 | int error= diff*diff;
|
---|
1494 |
|
---|
1495 | if(x==1 || x==8 || y==1 || y==8) continue;
|
---|
1496 |
|
---|
1497 | numSkiped++;
|
---|
1498 | if(absDiff > worstDiff)
|
---|
1499 | {
|
---|
1500 | worstDiff= absDiff;
|
---|
1501 | worstQP= QP;
|
---|
1502 | worstRange= max-min;
|
---|
1503 | }
|
---|
1504 | errorSum+= error;
|
---|
1505 |
|
---|
1506 | if(1024LL*1024LL*1024LL % numSkiped == 0)
|
---|
1507 | {
|
---|
1508 | printf( "sum:%1.3f, skip:%d, wQP:%d, "
|
---|
1509 | "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
|
---|
1510 | (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
|
---|
1511 | worstDiff, (float)numSkiped/numPixels);
|
---|
1512 | }
|
---|
1513 | }
|
---|
1514 | }
|
---|
1515 | #endif
|
---|
1516 | if (*p + QP2 < f) *p= *p + QP2;
|
---|
1517 | else if(*p - QP2 > f) *p= *p - QP2;
|
---|
1518 | else *p=f;
|
---|
1519 | }
|
---|
1520 | }
|
---|
1521 | }
|
---|
1522 | #ifdef DEBUG_DERING_THRESHOLD
|
---|
1523 | if(max-min < 20)
|
---|
1524 | {
|
---|
1525 | for(y=1; y<9; y++)
|
---|
1526 | {
|
---|
1527 | int x;
|
---|
1528 | int t = 0;
|
---|
1529 | p= src + stride*y;
|
---|
1530 | for(x=1; x<9; x++)
|
---|
1531 | {
|
---|
1532 | p++;
|
---|
1533 | *p = MIN(*p + 20, 255);
|
---|
1534 | }
|
---|
1535 | }
|
---|
1536 | // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
|
---|
1537 | }
|
---|
1538 | #endif
|
---|
1539 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1540 | }
|
---|
1541 | #endif //HAVE_ALTIVEC
|
---|
1542 |
|
---|
1543 | /**
|
---|
1544 | * Deinterlaces the given block by linearly interpolating every second line.
|
---|
1545 | * will be called for every 8x8 block and can read & write from line 4-15
|
---|
1546 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
|
---|
1547 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced
|
---|
1548 | */
|
---|
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
    /*
     * Works on the 8 pixel wide column starting 4 lines below src.
     * Relative to that position lines 0, 2, 4, 6 and 8 are read and lines
     * 1, 3, 5 and 7 are overwritten with the rounded-up average of the line
     * above and the line below.
     */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    src+= 4*stride;
    asm volatile(
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
//      0        1        2        3        4        5        6        7        8        9
//      %0       eax      eax+%1   eax+2%1  %0+4%1   ecx      ecx+%1   ecx+2%1  %0+8%1   ecx+4%1

        "movq (%0), %%mm0 \n\t"                 // line 0
        "movq (%%"REG_a", %1), %%mm1 \n\t"      // line 2
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"REG_a") \n\t"          // line 1
        "movq (%0, %1, 4), %%mm0 \n\t"          // line 4
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"REG_a", %1, 2) \n\t"   // line 3
        "movq (%%"REG_c", %1), %%mm1 \n\t"      // line 6
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"REG_c") \n\t"          // line 5
        "movq (%0, %1, 8), %%mm0 \n\t"          // line 8
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"REG_c", %1, 2) \n\t"   // line 7

        : : "r" (src), "r" ((long)stride)
        /* "memory": the asm stores into the pixel block, which is not an
           output operand. */
        : "%"REG_a, "%"REG_c, "memory"
    );
#else
    int x;
    src+= 4*stride;

    /* One byte at a time: no uint32_t type punning, so no alignment or
       aliasing assumptions about src. (a + b + 1)>>1 is what the previous
       packed form (a|b) - (((a^b)&0xFE)>>1) computed for every byte. */
    for(x=0; x<8; x++){
        const int l0= src[stride*0 + x];
        const int l2= src[stride*2 + x];
        const int l4= src[stride*4 + x];
        const int l6= src[stride*6 + x];
        const int l8= src[stride*8 + x];

        src[stride*1 + x]= (uint8_t)((l0 + l2 + 1)>>1);
        src[stride*3 + x]= (uint8_t)((l2 + l4 + 1)>>1);
        src[stride*5 + x]= (uint8_t)((l4 + l6 + 1)>>1);
        src[stride*7 + x]= (uint8_t)((l6 + l8 + 1)>>1);
    }
#endif
}
1594 |
|
---|
1595 | /**
|
---|
1596 | * Deinterlaces the given block by cubic interpolating every second line.
|
---|
1597 | * will be called for every 8x8 block and can read & write from line 4-15
|
---|
1598 | * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
|
---|
1599 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced
|
---|
1600 | * this filter will read lines 3-15 and write 7-13
|
---|
1601 | */
|
---|
1602 | static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
|
---|
1603 | {
|
---|
1604 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1605 | src+= stride*3;
|
---|
1606 | asm volatile(
|
---|
1607 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
1608 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
1609 | "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
|
---|
1610 | "add %1, %%"REG_c" \n\t"
|
---|
1611 | "pxor %%mm7, %%mm7 \n\t"
|
---|
1612 | // 0 1 2 3 4 5 6 7 8 9 10
|
---|
1613 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
|
---|
1614 |
|
---|
1615 | #define REAL_DEINT_CUBIC(a,b,c,d,e)\
|
---|
1616 | "movq " #a ", %%mm0 \n\t"\
|
---|
1617 | "movq " #b ", %%mm1 \n\t"\
|
---|
1618 | "movq " #d ", %%mm2 \n\t"\
|
---|
1619 | "movq " #e ", %%mm3 \n\t"\
|
---|
1620 | PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
|
---|
1621 | PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
|
---|
1622 | "movq %%mm0, %%mm2 \n\t"\
|
---|
1623 | "punpcklbw %%mm7, %%mm0 \n\t"\
|
---|
1624 | "punpckhbw %%mm7, %%mm2 \n\t"\
|
---|
1625 | "movq %%mm1, %%mm3 \n\t"\
|
---|
1626 | "punpcklbw %%mm7, %%mm1 \n\t"\
|
---|
1627 | "punpckhbw %%mm7, %%mm3 \n\t"\
|
---|
1628 | "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
|
---|
1629 | "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
|
---|
1630 | "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
|
---|
1631 | "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
|
---|
1632 | "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
|
---|
1633 | "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
|
---|
1634 | "packuswb %%mm3, %%mm1 \n\t"\
|
---|
1635 | "movq %%mm1, " #c " \n\t"
|
---|
1636 | #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
|
---|
1637 |
|
---|
1638 | DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
|
---|
1639 | DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
|
---|
1640 | DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
|
---|
1641 | DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
|
---|
1642 |
|
---|
1643 | : : "r" (src), "r" ((long)stride)
|
---|
1644 | : "%"REG_a, "%"REG_d, "%"REG_c
|
---|
1645 | );
|
---|
1646 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1647 | int x;
|
---|
1648 | src+= stride*3;
|
---|
1649 | for(x=0; x<8; x++)
|
---|
1650 | {
|
---|
1651 | src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
|
---|
1652 | src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
|
---|
1653 | src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
|
---|
1654 | src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
|
---|
1655 | src++;
|
---|
1656 | }
|
---|
1657 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1658 | }
|
---|
1659 |
|
---|
1660 | /**
|
---|
1661 | * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
|
---|
1662 | * will be called for every 8x8 block and can read & write from line 4-15
|
---|
1663 | * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
|
---|
1664 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced
|
---|
1665 | * this filter will read lines 4-13 and write 5-11
|
---|
1666 | */
|
---|
1667 | static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
|
---|
1668 | {
|
---|
1669 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1670 | src+= stride*4;
|
---|
1671 | asm volatile(
|
---|
1672 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
1673 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
1674 | "pxor %%mm7, %%mm7 \n\t"
|
---|
1675 | "movq (%2), %%mm0 \n\t"
|
---|
1676 | // 0 1 2 3 4 5 6 7 8 9 10
|
---|
1677 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
|
---|
1678 |
|
---|
1679 | #define REAL_DEINT_FF(a,b,c,d)\
|
---|
1680 | "movq " #a ", %%mm1 \n\t"\
|
---|
1681 | "movq " #b ", %%mm2 \n\t"\
|
---|
1682 | "movq " #c ", %%mm3 \n\t"\
|
---|
1683 | "movq " #d ", %%mm4 \n\t"\
|
---|
1684 | PAVGB(%%mm3, %%mm1) \
|
---|
1685 | PAVGB(%%mm4, %%mm0) \
|
---|
1686 | "movq %%mm0, %%mm3 \n\t"\
|
---|
1687 | "punpcklbw %%mm7, %%mm0 \n\t"\
|
---|
1688 | "punpckhbw %%mm7, %%mm3 \n\t"\
|
---|
1689 | "movq %%mm1, %%mm4 \n\t"\
|
---|
1690 | "punpcklbw %%mm7, %%mm1 \n\t"\
|
---|
1691 | "punpckhbw %%mm7, %%mm4 \n\t"\
|
---|
1692 | "psllw $2, %%mm1 \n\t"\
|
---|
1693 | "psllw $2, %%mm4 \n\t"\
|
---|
1694 | "psubw %%mm0, %%mm1 \n\t"\
|
---|
1695 | "psubw %%mm3, %%mm4 \n\t"\
|
---|
1696 | "movq %%mm2, %%mm5 \n\t"\
|
---|
1697 | "movq %%mm2, %%mm0 \n\t"\
|
---|
1698 | "punpcklbw %%mm7, %%mm2 \n\t"\
|
---|
1699 | "punpckhbw %%mm7, %%mm5 \n\t"\
|
---|
1700 | "paddw %%mm2, %%mm1 \n\t"\
|
---|
1701 | "paddw %%mm5, %%mm4 \n\t"\
|
---|
1702 | "psraw $2, %%mm1 \n\t"\
|
---|
1703 | "psraw $2, %%mm4 \n\t"\
|
---|
1704 | "packuswb %%mm4, %%mm1 \n\t"\
|
---|
1705 | "movq %%mm1, " #b " \n\t"\
|
---|
1706 |
|
---|
1707 | #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
|
---|
1708 |
|
---|
1709 | DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
|
---|
1710 | DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
|
---|
1711 | DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
|
---|
1712 | DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
|
---|
1713 |
|
---|
1714 | "movq %%mm0, (%2) \n\t"
|
---|
1715 | : : "r" (src), "r" ((long)stride), "r"(tmp)
|
---|
1716 | : "%"REG_a, "%"REG_d
|
---|
1717 | );
|
---|
1718 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1719 | int x;
|
---|
1720 | src+= stride*4;
|
---|
1721 | for(x=0; x<8; x++)
|
---|
1722 | {
|
---|
1723 | int t1= tmp[x];
|
---|
1724 | int t2= src[stride*1];
|
---|
1725 |
|
---|
1726 | src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
|
---|
1727 | t1= src[stride*4];
|
---|
1728 | src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
|
---|
1729 | t2= src[stride*6];
|
---|
1730 | src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
|
---|
1731 | t1= src[stride*8];
|
---|
1732 | src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
|
---|
1733 | tmp[x]= t1;
|
---|
1734 |
|
---|
1735 | src++;
|
---|
1736 | }
|
---|
1737 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1738 | }
|
---|
1739 |
|
---|
1740 | /**
|
---|
1741 | * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
|
---|
1742 | * will be called for every 8x8 block and can read & write from line 4-15
|
---|
1743 | * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
|
---|
1744 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced
|
---|
1745 | * this filter will read lines 4-13 and write 4-11
|
---|
1746 | */
|
---|
1747 | static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
|
---|
1748 | {
|
---|
1749 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1750 | src+= stride*4;
|
---|
1751 | asm volatile(
|
---|
1752 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
1753 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
1754 | "pxor %%mm7, %%mm7 \n\t"
|
---|
1755 | "movq (%2), %%mm0 \n\t"
|
---|
1756 | "movq (%3), %%mm1 \n\t"
|
---|
1757 | // 0 1 2 3 4 5 6 7 8 9 10
|
---|
1758 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
|
---|
1759 |
|
---|
1760 | #define REAL_DEINT_L5(t1,t2,a,b,c)\
|
---|
1761 | "movq " #a ", %%mm2 \n\t"\
|
---|
1762 | "movq " #b ", %%mm3 \n\t"\
|
---|
1763 | "movq " #c ", %%mm4 \n\t"\
|
---|
1764 | PAVGB(t2, %%mm3) \
|
---|
1765 | PAVGB(t1, %%mm4) \
|
---|
1766 | "movq %%mm2, %%mm5 \n\t"\
|
---|
1767 | "movq %%mm2, " #t1 " \n\t"\
|
---|
1768 | "punpcklbw %%mm7, %%mm2 \n\t"\
|
---|
1769 | "punpckhbw %%mm7, %%mm5 \n\t"\
|
---|
1770 | "movq %%mm2, %%mm6 \n\t"\
|
---|
1771 | "paddw %%mm2, %%mm2 \n\t"\
|
---|
1772 | "paddw %%mm6, %%mm2 \n\t"\
|
---|
1773 | "movq %%mm5, %%mm6 \n\t"\
|
---|
1774 | "paddw %%mm5, %%mm5 \n\t"\
|
---|
1775 | "paddw %%mm6, %%mm5 \n\t"\
|
---|
1776 | "movq %%mm3, %%mm6 \n\t"\
|
---|
1777 | "punpcklbw %%mm7, %%mm3 \n\t"\
|
---|
1778 | "punpckhbw %%mm7, %%mm6 \n\t"\
|
---|
1779 | "paddw %%mm3, %%mm3 \n\t"\
|
---|
1780 | "paddw %%mm6, %%mm6 \n\t"\
|
---|
1781 | "paddw %%mm3, %%mm2 \n\t"\
|
---|
1782 | "paddw %%mm6, %%mm5 \n\t"\
|
---|
1783 | "movq %%mm4, %%mm6 \n\t"\
|
---|
1784 | "punpcklbw %%mm7, %%mm4 \n\t"\
|
---|
1785 | "punpckhbw %%mm7, %%mm6 \n\t"\
|
---|
1786 | "psubw %%mm4, %%mm2 \n\t"\
|
---|
1787 | "psubw %%mm6, %%mm5 \n\t"\
|
---|
1788 | "psraw $2, %%mm2 \n\t"\
|
---|
1789 | "psraw $2, %%mm5 \n\t"\
|
---|
1790 | "packuswb %%mm5, %%mm2 \n\t"\
|
---|
1791 | "movq %%mm2, " #a " \n\t"\
|
---|
1792 |
|
---|
1793 | #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
|
---|
1794 |
|
---|
1795 | DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
|
---|
1796 | DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
|
---|
1797 | DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) )
|
---|
1798 | DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
|
---|
1799 | DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
|
---|
1800 | DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
|
---|
1801 | DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
|
---|
1802 | DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
|
---|
1803 |
|
---|
1804 | "movq %%mm0, (%2) \n\t"
|
---|
1805 | "movq %%mm1, (%3) \n\t"
|
---|
1806 | : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
|
---|
1807 | : "%"REG_a, "%"REG_d
|
---|
1808 | );
|
---|
1809 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1810 | int x;
|
---|
1811 | src+= stride*4;
|
---|
1812 | for(x=0; x<8; x++)
|
---|
1813 | {
|
---|
1814 | int t1= tmp[x];
|
---|
1815 | int t2= tmp2[x];
|
---|
1816 | int t3= src[0];
|
---|
1817 |
|
---|
1818 | src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
|
---|
1819 | t1= src[stride*1];
|
---|
1820 | src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
|
---|
1821 | t2= src[stride*2];
|
---|
1822 | src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
|
---|
1823 | t3= src[stride*3];
|
---|
1824 | src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
|
---|
1825 | t1= src[stride*4];
|
---|
1826 | src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
|
---|
1827 | t2= src[stride*5];
|
---|
1828 | src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
|
---|
1829 | t3= src[stride*6];
|
---|
1830 | src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
|
---|
1831 | t1= src[stride*7];
|
---|
1832 | src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
|
---|
1833 |
|
---|
1834 | tmp[x]= t3;
|
---|
1835 | tmp2[x]= t1;
|
---|
1836 |
|
---|
1837 | src++;
|
---|
1838 | }
|
---|
1839 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1840 | }
|
---|
1841 |
|
---|
1842 | /**
|
---|
1843 | * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
|
---|
1844 | * will be called for every 8x8 block and can read & write from line 4-15
|
---|
1845 | * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
|
---|
1846 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced
|
---|
1847 | * this filter will read lines 4-13 and write 4-11
|
---|
1848 | */
|
---|
1849 | static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
|
---|
1850 | {
|
---|
1851 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1852 | src+= 4*stride;
|
---|
1853 | asm volatile(
|
---|
1854 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
1855 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
1856 | // 0 1 2 3 4 5 6 7 8 9
|
---|
1857 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
|
---|
1858 |
|
---|
1859 | "movq (%2), %%mm0 \n\t" // L0
|
---|
1860 | "movq (%%"REG_a"), %%mm1 \n\t" // L2
|
---|
1861 | PAVGB(%%mm1, %%mm0) // L0+L2
|
---|
1862 | "movq (%0), %%mm2 \n\t" // L1
|
---|
1863 | PAVGB(%%mm2, %%mm0)
|
---|
1864 | "movq %%mm0, (%0) \n\t"
|
---|
1865 | "movq (%%"REG_a", %1), %%mm0 \n\t" // L3
|
---|
1866 | PAVGB(%%mm0, %%mm2) // L1+L3
|
---|
1867 | PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
|
---|
1868 | "movq %%mm2, (%%"REG_a") \n\t"
|
---|
1869 | "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4
|
---|
1870 | PAVGB(%%mm2, %%mm1) // L2+L4
|
---|
1871 | PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
|
---|
1872 | "movq %%mm1, (%%"REG_a", %1) \n\t"
|
---|
1873 | "movq (%0, %1, 4), %%mm1 \n\t" // L5
|
---|
1874 | PAVGB(%%mm1, %%mm0) // L3+L5
|
---|
1875 | PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
|
---|
1876 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
|
---|
1877 | "movq (%%"REG_d"), %%mm0 \n\t" // L6
|
---|
1878 | PAVGB(%%mm0, %%mm2) // L4+L6
|
---|
1879 | PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
|
---|
1880 | "movq %%mm2, (%0, %1, 4) \n\t"
|
---|
1881 | "movq (%%"REG_d", %1), %%mm2 \n\t" // L7
|
---|
1882 | PAVGB(%%mm2, %%mm1) // L5+L7
|
---|
1883 | PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
|
---|
1884 | "movq %%mm1, (%%"REG_d") \n\t"
|
---|
1885 | "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8
|
---|
1886 | PAVGB(%%mm1, %%mm0) // L6+L8
|
---|
1887 | PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
|
---|
1888 | "movq %%mm0, (%%"REG_d", %1) \n\t"
|
---|
1889 | "movq (%0, %1, 8), %%mm0 \n\t" // L9
|
---|
1890 | PAVGB(%%mm0, %%mm2) // L7+L9
|
---|
1891 | PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
|
---|
1892 | "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
|
---|
1893 | "movq %%mm1, (%2) \n\t"
|
---|
1894 |
|
---|
1895 | : : "r" (src), "r" ((long)stride), "r" (tmp)
|
---|
1896 | : "%"REG_a, "%"REG_d
|
---|
1897 | );
|
---|
1898 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1899 | int a, b, c, x;
|
---|
1900 | src+= 4*stride;
|
---|
1901 |
|
---|
1902 | for(x=0; x<2; x++){
|
---|
1903 | a= *(uint32_t*)&tmp[stride*0];
|
---|
1904 | b= *(uint32_t*)&src[stride*0];
|
---|
1905 | c= *(uint32_t*)&src[stride*1];
|
---|
1906 | a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
|
---|
1907 | *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
|
---|
1908 |
|
---|
1909 | a= *(uint32_t*)&src[stride*2];
|
---|
1910 | b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
|
---|
1911 | *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
|
---|
1912 |
|
---|
1913 | b= *(uint32_t*)&src[stride*3];
|
---|
1914 | c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
|
---|
1915 | *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
|
---|
1916 |
|
---|
1917 | c= *(uint32_t*)&src[stride*4];
|
---|
1918 | a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
|
---|
1919 | *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
|
---|
1920 |
|
---|
1921 | a= *(uint32_t*)&src[stride*5];
|
---|
1922 | b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
|
---|
1923 | *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
|
---|
1924 |
|
---|
1925 | b= *(uint32_t*)&src[stride*6];
|
---|
1926 | c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
|
---|
1927 | *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
|
---|
1928 |
|
---|
1929 | c= *(uint32_t*)&src[stride*7];
|
---|
1930 | a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
|
---|
1931 | *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
|
---|
1932 |
|
---|
1933 | a= *(uint32_t*)&src[stride*8];
|
---|
1934 | b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
|
---|
1935 | *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
|
---|
1936 |
|
---|
1937 | *(uint32_t*)&tmp[stride*0]= c;
|
---|
1938 | src += 4;
|
---|
1939 | tmp += 4;
|
---|
1940 | }
|
---|
1941 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
1942 | }
|
---|
1943 |
|
---|
1944 | /**
|
---|
1945 | * Deinterlaces the given block by applying a median filter to every second line.
|
---|
1946 | * will be called for every 8x8 block and can read & write from line 4-15,
|
---|
1947 | * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
|
---|
1948 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced
|
---|
1949 | */
|
---|
1950 | static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
|
---|
1951 | {
|
---|
1952 | #ifdef HAVE_MMX
|
---|
1953 | src+= 4*stride;
|
---|
1954 | #ifdef HAVE_MMX2
|
---|
1955 | asm volatile(
|
---|
1956 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
1957 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
1958 | // 0 1 2 3 4 5 6 7 8 9
|
---|
1959 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
|
---|
1960 |
|
---|
1961 | "movq (%0), %%mm0 \n\t" //
|
---|
1962 | "movq (%%"REG_a", %1), %%mm2 \n\t" //
|
---|
1963 | "movq (%%"REG_a"), %%mm1 \n\t" //
|
---|
1964 | "movq %%mm0, %%mm3 \n\t"
|
---|
1965 | "pmaxub %%mm1, %%mm0 \n\t" //
|
---|
1966 | "pminub %%mm3, %%mm1 \n\t" //
|
---|
1967 | "pmaxub %%mm2, %%mm1 \n\t" //
|
---|
1968 | "pminub %%mm1, %%mm0 \n\t"
|
---|
1969 | "movq %%mm0, (%%"REG_a") \n\t"
|
---|
1970 |
|
---|
1971 | "movq (%0, %1, 4), %%mm0 \n\t" //
|
---|
1972 | "movq (%%"REG_a", %1, 2), %%mm1 \n\t" //
|
---|
1973 | "movq %%mm2, %%mm3 \n\t"
|
---|
1974 | "pmaxub %%mm1, %%mm2 \n\t" //
|
---|
1975 | "pminub %%mm3, %%mm1 \n\t" //
|
---|
1976 | "pmaxub %%mm0, %%mm1 \n\t" //
|
---|
1977 | "pminub %%mm1, %%mm2 \n\t"
|
---|
1978 | "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
|
---|
1979 |
|
---|
1980 | "movq (%%"REG_d"), %%mm2 \n\t" //
|
---|
1981 | "movq (%%"REG_d", %1), %%mm1 \n\t" //
|
---|
1982 | "movq %%mm2, %%mm3 \n\t"
|
---|
1983 | "pmaxub %%mm0, %%mm2 \n\t" //
|
---|
1984 | "pminub %%mm3, %%mm0 \n\t" //
|
---|
1985 | "pmaxub %%mm1, %%mm0 \n\t" //
|
---|
1986 | "pminub %%mm0, %%mm2 \n\t"
|
---|
1987 | "movq %%mm2, (%%"REG_d") \n\t"
|
---|
1988 |
|
---|
1989 | "movq (%%"REG_d", %1, 2), %%mm2 \n\t" //
|
---|
1990 | "movq (%0, %1, 8), %%mm0 \n\t" //
|
---|
1991 | "movq %%mm2, %%mm3 \n\t"
|
---|
1992 | "pmaxub %%mm0, %%mm2 \n\t" //
|
---|
1993 | "pminub %%mm3, %%mm0 \n\t" //
|
---|
1994 | "pmaxub %%mm1, %%mm0 \n\t" //
|
---|
1995 | "pminub %%mm0, %%mm2 \n\t"
|
---|
1996 | "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
|
---|
1997 |
|
---|
1998 |
|
---|
1999 | : : "r" (src), "r" ((long)stride)
|
---|
2000 | : "%"REG_a, "%"REG_d
|
---|
2001 | );
|
---|
2002 |
|
---|
2003 | #else // MMX without MMX2
|
---|
2004 | asm volatile(
|
---|
2005 | "lea (%0, %1), %%"REG_a" \n\t"
|
---|
2006 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
|
---|
2007 | // 0 1 2 3 4 5 6 7 8 9
|
---|
2008 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
|
---|
2009 | "pxor %%mm7, %%mm7 \n\t"
|
---|
2010 |
|
---|
2011 | #define REAL_MEDIAN(a,b,c)\
|
---|
2012 | "movq " #a ", %%mm0 \n\t"\
|
---|
2013 | "movq " #b ", %%mm2 \n\t"\
|
---|
2014 | "movq " #c ", %%mm1 \n\t"\
|
---|
2015 | "movq %%mm0, %%mm3 \n\t"\
|
---|
2016 | "movq %%mm1, %%mm4 \n\t"\
|
---|
2017 | "movq %%mm2, %%mm5 \n\t"\
|
---|
2018 | "psubusb %%mm1, %%mm3 \n\t"\
|
---|
2019 | "psubusb %%mm2, %%mm4 \n\t"\
|
---|
2020 | "psubusb %%mm0, %%mm5 \n\t"\
|
---|
2021 | "pcmpeqb %%mm7, %%mm3 \n\t"\
|
---|
2022 | "pcmpeqb %%mm7, %%mm4 \n\t"\
|
---|
2023 | "pcmpeqb %%mm7, %%mm5 \n\t"\
|
---|
2024 | "movq %%mm3, %%mm6 \n\t"\
|
---|
2025 | "pxor %%mm4, %%mm3 \n\t"\
|
---|
2026 | "pxor %%mm5, %%mm4 \n\t"\
|
---|
2027 | "pxor %%mm6, %%mm5 \n\t"\
|
---|
2028 | "por %%mm3, %%mm1 \n\t"\
|
---|
2029 | "por %%mm4, %%mm2 \n\t"\
|
---|
2030 | "por %%mm5, %%mm0 \n\t"\
|
---|
2031 | "pand %%mm2, %%mm0 \n\t"\
|
---|
2032 | "pand %%mm1, %%mm0 \n\t"\
|
---|
2033 | "movq %%mm0, " #b " \n\t"
|
---|
2034 | #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
|
---|
2035 |
|
---|
2036 | MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
|
---|
2037 | MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
|
---|
2038 | MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
|
---|
2039 | MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
|
---|
2040 |
|
---|
2041 | : : "r" (src), "r" ((long)stride)
|
---|
2042 | : "%"REG_a, "%"REG_d
|
---|
2043 | );
|
---|
2044 | #endif //HAVE_MMX2
|
---|
2045 | #else //HAVE_MMX
|
---|
2046 | int x, y;
|
---|
2047 | src+= 4*stride;
|
---|
2048 | // FIXME - there should be a way to do a few columns in parallel like w/mmx
|
---|
2049 | for(x=0; x<8; x++)
|
---|
2050 | {
|
---|
2051 | uint8_t *colsrc = src;
|
---|
2052 | for (y=0; y<4; y++)
|
---|
2053 | {
|
---|
2054 | int a, b, c, d, e, f;
|
---|
2055 | a = colsrc[0 ];
|
---|
2056 | b = colsrc[stride ];
|
---|
2057 | c = colsrc[stride*2];
|
---|
2058 | d = (a-b)>>31;
|
---|
2059 | e = (b-c)>>31;
|
---|
2060 | f = (c-a)>>31;
|
---|
2061 | colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
|
---|
2062 | colsrc += stride*2;
|
---|
2063 | }
|
---|
2064 | src++;
|
---|
2065 | }
|
---|
2066 | #endif //HAVE_MMX
|
---|
2067 | }
|
---|
2068 |
|
---|
2069 | #ifdef HAVE_MMX
|
---|
/**
 * Transposes and shifts the given 8x8 block into dst1 and dst2.
 * The 8 source rows are read at src + n*srcStride. Each source column becomes an
 * 8 byte row in the destination, rows being stored 16 bytes apart:
 * columns 0-4 are written to dst1 at byte offsets 128, 144, 160, 176 and 192,
 * columns 3-7 are written to dst2 at byte offsets 48, 64, 80, 96 and 112.
 *
 * NOTE(review): the asm statement stores through dst1/dst2 but has no output
 * operands and no "memory" clobber - confirm that callers do not depend on the
 * compiler knowing about these stores.
 *
 * @param dst1      first destination buffer (asm operand %2)
 * @param dst2      second destination buffer (asm operand %3)
 * @param src       top left pixel of the 8x8 source block (asm operand %0)
 * @param srcStride offset in bytes between two source rows (asm operand %1)
 */
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
    asm(
        "lea (%0, %1), %%"REG_a" \n\t"
//      row 0   row 1   row 2   row 3
//      %0      eax     eax+%1  eax+2%1
        // source rows 0-3 -> bytes 0-3 of every destination row
        "movq (%0), %%mm0 \n\t" // 12345678
        "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        // same interleave for rows 2 and 3
        "movq (%%"REG_a", %1), %%mm1 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        // combine the row pairs: mm0 = columns 0,1  mm3 = columns 2,3
        //                        mm2 = columns 4,5  mm1 = columns 6,7
        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, 128(%2) \n\t" // column 0
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 144(%2) \n\t" // column 1
        "movd %%mm3, 160(%2) \n\t" // column 2
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 176(%2) \n\t" // column 3
        "movd %%mm3, 48(%3) \n\t"
        "movd %%mm2, 192(%2) \n\t" // column 4
        "movd %%mm2, 64(%3) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 80(%3) \n\t" // column 5
        "movd %%mm1, 96(%3) \n\t" // column 6
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 112(%3) \n\t" // column 7

        "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" // eax = row 5

        // source rows 4-7 -> bytes 4-7 of every destination row
        "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
        "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        "movq (%%"REG_a", %1), %%mm1 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, 132(%2) \n\t" // column 0
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 148(%2) \n\t" // column 1
        "movd %%mm3, 164(%2) \n\t" // column 2
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 180(%2) \n\t" // column 3
        "movd %%mm3, 52(%3) \n\t"
        "movd %%mm2, 196(%2) \n\t" // column 4
        "movd %%mm2, 68(%3) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 84(%3) \n\t" // column 5
        "movd %%mm1, 100(%3) \n\t" // column 6
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 116(%3) \n\t" // column 7


        :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
        : "%"REG_a
    );
}
2154 |
|
---|
/**
 * Transposes the given 8x8 block.
 * The 8 source rows are read 16 bytes apart (src, src+16, ... src+112); source
 * column n is written as the 8 byte row at dst + n*dstStride.
 *
 * NOTE(review): the asm statement stores through dst but has no output operands
 * and no "memory" clobber - confirm that callers do not depend on the compiler
 * knowing about these stores.
 *
 * @param dst       top left pixel of the destination block (asm operand %0)
 * @param dstStride offset in bytes between two destination rows (asm operand %1)
 * @param src       source buffer with 16 bytes per row (asm operand %2)
 */
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
{
    asm(
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
//      destination rows:
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
        // source rows 0-3 -> bytes 0-3 of every destination row
        "movq (%2), %%mm0 \n\t" // 12345678
        "movq 16(%2), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        // same interleave for source rows 2 and 3
        "movq 32(%2), %%mm1 \n\t"
        "movq 48(%2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        // combine the row pairs: mm0 = columns 0,1  mm3 = columns 2,3
        //                        mm2 = columns 4,5  mm1 = columns 6,7
        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, (%0) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, (%%"REG_a") \n\t"
        "movd %%mm3, (%%"REG_a", %1) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
        "movd %%mm2, (%0, %1, 4) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, (%%"REG_d") \n\t"
        "movd %%mm1, (%%"REG_d", %1) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, (%%"REG_d", %1, 2) \n\t"


        // source rows 4-7 -> bytes 4-7 of every destination row
        "movq 64(%2), %%mm0 \n\t" // 12345678
        "movq 80(%2), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        "movq 96(%2), %%mm1 \n\t"
        "movq 112(%2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, 4(%0) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 4(%%"REG_a") \n\t"
        "movd %%mm3, 4(%%"REG_a", %1) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
        "movd %%mm2, 4(%0, %1, 4) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 4(%%"REG_d") \n\t"
        "movd %%mm1, 4(%%"REG_d", %1) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"

        :: "r" (dst), "r" ((long)dstStride), "r" (src)
        : "%"REG_a, "%"REG_d
    );
}
2234 | #endif //HAVE_MMX
|
---|
2235 | //static long test=0;
|
---|
2236 |
|
---|
2237 | #ifndef HAVE_ALTIVEC
|
---|
2238 | static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
|
---|
2239 | uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
|
---|
2240 | {
|
---|
2241 | // to save a register (FIXME do this outside of the loops)
|
---|
2242 | tempBluredPast[127]= maxNoise[0];
|
---|
2243 | tempBluredPast[128]= maxNoise[1];
|
---|
2244 | tempBluredPast[129]= maxNoise[2];
|
---|
2245 |
|
---|
2246 | #define FAST_L2_DIFF
|
---|
2247 | //#define L1_DIFF //u should change the thresholds too if u try that one
|
---|
2248 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
2249 | asm volatile(
|
---|
2250 | "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
|
---|
2251 | "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
|
---|
2252 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
|
---|
2253 | // 0 1 2 3 4 5 6 7 8 9
|
---|
2254 | // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
|
---|
2255 | //FIXME reorder?
|
---|
2256 | #ifdef L1_DIFF //needs mmx2
|
---|
2257 | "movq (%0), %%mm0 \n\t" // L0
|
---|
2258 | "psadbw (%1), %%mm0 \n\t" // |L0-R0|
|
---|
2259 | "movq (%0, %2), %%mm1 \n\t" // L1
|
---|
2260 | "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
|
---|
2261 | "movq (%0, %2, 2), %%mm2 \n\t" // L2
|
---|
2262 | "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
|
---|
2263 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
|
---|
2264 | "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3|
|
---|
2265 |
|
---|
2266 | "movq (%0, %2, 4), %%mm4 \n\t" // L4
|
---|
2267 | "paddw %%mm1, %%mm0 \n\t"
|
---|
2268 | "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
|
---|
2269 | "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
|
---|
2270 | "paddw %%mm2, %%mm0 \n\t"
|
---|
2271 | "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5|
|
---|
2272 | "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
|
---|
2273 | "paddw %%mm3, %%mm0 \n\t"
|
---|
2274 | "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6|
|
---|
2275 | "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
|
---|
2276 | "paddw %%mm4, %%mm0 \n\t"
|
---|
2277 | "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7|
|
---|
2278 | "paddw %%mm5, %%mm6 \n\t"
|
---|
2279 | "paddw %%mm7, %%mm6 \n\t"
|
---|
2280 | "paddw %%mm6, %%mm0 \n\t"
|
---|
2281 | #else //L1_DIFF
|
---|
2282 | #if defined (FAST_L2_DIFF)
|
---|
2283 | "pcmpeqb %%mm7, %%mm7 \n\t"
|
---|
2284 | "movq "MANGLE(b80)", %%mm6 \n\t"
|
---|
2285 | "pxor %%mm0, %%mm0 \n\t"
|
---|
2286 | #define REAL_L2_DIFF_CORE(a, b)\
|
---|
2287 | "movq " #a ", %%mm5 \n\t"\
|
---|
2288 | "movq " #b ", %%mm2 \n\t"\
|
---|
2289 | "pxor %%mm7, %%mm2 \n\t"\
|
---|
2290 | PAVGB(%%mm2, %%mm5)\
|
---|
2291 | "paddb %%mm6, %%mm5 \n\t"\
|
---|
2292 | "movq %%mm5, %%mm2 \n\t"\
|
---|
2293 | "psllw $8, %%mm5 \n\t"\
|
---|
2294 | "pmaddwd %%mm5, %%mm5 \n\t"\
|
---|
2295 | "pmaddwd %%mm2, %%mm2 \n\t"\
|
---|
2296 | "paddd %%mm2, %%mm5 \n\t"\
|
---|
2297 | "psrld $14, %%mm5 \n\t"\
|
---|
2298 | "paddd %%mm5, %%mm0 \n\t"
|
---|
2299 |
|
---|
2300 | #else //defined (FAST_L2_DIFF)
|
---|
2301 | "pxor %%mm7, %%mm7 \n\t"
|
---|
2302 | "pxor %%mm0, %%mm0 \n\t"
|
---|
2303 | #define REAL_L2_DIFF_CORE(a, b)\
|
---|
2304 | "movq " #a ", %%mm5 \n\t"\
|
---|
2305 | "movq " #b ", %%mm2 \n\t"\
|
---|
2306 | "movq %%mm5, %%mm1 \n\t"\
|
---|
2307 | "movq %%mm2, %%mm3 \n\t"\
|
---|
2308 | "punpcklbw %%mm7, %%mm5 \n\t"\
|
---|
2309 | "punpckhbw %%mm7, %%mm1 \n\t"\
|
---|
2310 | "punpcklbw %%mm7, %%mm2 \n\t"\
|
---|
2311 | "punpckhbw %%mm7, %%mm3 \n\t"\
|
---|
2312 | "psubw %%mm2, %%mm5 \n\t"\
|
---|
2313 | "psubw %%mm3, %%mm1 \n\t"\
|
---|
2314 | "pmaddwd %%mm5, %%mm5 \n\t"\
|
---|
2315 | "pmaddwd %%mm1, %%mm1 \n\t"\
|
---|
2316 | "paddd %%mm1, %%mm5 \n\t"\
|
---|
2317 | "paddd %%mm5, %%mm0 \n\t"
|
---|
2318 |
|
---|
2319 | #endif //defined (FAST_L2_DIFF)
|
---|
2320 |
|
---|
2321 | #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
|
---|
2322 |
|
---|
2323 | L2_DIFF_CORE((%0) , (%1))
|
---|
2324 | L2_DIFF_CORE((%0, %2) , (%1, %2))
|
---|
2325 | L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
|
---|
2326 | L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
|
---|
2327 | L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
|
---|
2328 | L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
|
---|
2329 | L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
|
---|
2330 | L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
|
---|
2331 |
|
---|
2332 | #endif //L1_DIFF
|
---|
2333 |
|
---|
2334 | "movq %%mm0, %%mm4 \n\t"
|
---|
2335 | "psrlq $32, %%mm0 \n\t"
|
---|
2336 | "paddd %%mm0, %%mm4 \n\t"
|
---|
2337 | "movd %%mm4, %%ecx \n\t"
|
---|
2338 | "shll $2, %%ecx \n\t"
|
---|
2339 | "mov %3, %%"REG_d" \n\t"
|
---|
2340 | "addl -4(%%"REG_d"), %%ecx \n\t"
|
---|
2341 | "addl 4(%%"REG_d"), %%ecx \n\t"
|
---|
2342 | "addl -1024(%%"REG_d"), %%ecx \n\t"
|
---|
2343 | "addl $4, %%ecx \n\t"
|
---|
2344 | "addl 1024(%%"REG_d"), %%ecx \n\t"
|
---|
2345 | "shrl $3, %%ecx \n\t"
|
---|
2346 | "movl %%ecx, (%%"REG_d") \n\t"
|
---|
2347 |
|
---|
2348 | // "mov %3, %%"REG_c" \n\t"
|
---|
2349 | // "mov %%"REG_c", test \n\t"
|
---|
2350 | // "jmp 4f \n\t"
|
---|
2351 | "cmpl 512(%%"REG_d"), %%ecx \n\t"
|
---|
2352 | " jb 2f \n\t"
|
---|
2353 | "cmpl 516(%%"REG_d"), %%ecx \n\t"
|
---|
2354 | " jb 1f \n\t"
|
---|
2355 |
|
---|
2356 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
|
---|
2357 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
|
---|
2358 | "movq (%0), %%mm0 \n\t" // L0
|
---|
2359 | "movq (%0, %2), %%mm1 \n\t" // L1
|
---|
2360 | "movq (%0, %2, 2), %%mm2 \n\t" // L2
|
---|
2361 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
|
---|
2362 | "movq (%0, %2, 4), %%mm4 \n\t" // L4
|
---|
2363 | "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
|
---|
2364 | "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
|
---|
2365 | "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
|
---|
2366 | "movq %%mm0, (%1) \n\t" // L0
|
---|
2367 | "movq %%mm1, (%1, %2) \n\t" // L1
|
---|
2368 | "movq %%mm2, (%1, %2, 2) \n\t" // L2
|
---|
2369 | "movq %%mm3, (%1, %%"REG_a") \n\t" // L3
|
---|
2370 | "movq %%mm4, (%1, %2, 4) \n\t" // L4
|
---|
2371 | "movq %%mm5, (%1, %%"REG_d") \n\t" // L5
|
---|
2372 | "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
|
---|
2373 | "movq %%mm7, (%1, %%"REG_c") \n\t" // L7
|
---|
2374 | "jmp 4f \n\t"
|
---|
2375 |
|
---|
2376 | "1: \n\t"
|
---|
2377 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
|
---|
2378 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
|
---|
2379 | "movq (%0), %%mm0 \n\t" // L0
|
---|
2380 | PAVGB((%1), %%mm0) // L0
|
---|
2381 | "movq (%0, %2), %%mm1 \n\t" // L1
|
---|
2382 | PAVGB((%1, %2), %%mm1) // L1
|
---|
2383 | "movq (%0, %2, 2), %%mm2 \n\t" // L2
|
---|
2384 | PAVGB((%1, %2, 2), %%mm2) // L2
|
---|
2385 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
|
---|
2386 | PAVGB((%1, %%REGa), %%mm3) // L3
|
---|
2387 | "movq (%0, %2, 4), %%mm4 \n\t" // L4
|
---|
2388 | PAVGB((%1, %2, 4), %%mm4) // L4
|
---|
2389 | "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
|
---|
2390 | PAVGB((%1, %%REGd), %%mm5) // L5
|
---|
2391 | "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
|
---|
2392 | PAVGB((%1, %%REGa, 2), %%mm6) // L6
|
---|
2393 | "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
|
---|
2394 | PAVGB((%1, %%REGc), %%mm7) // L7
|
---|
2395 | "movq %%mm0, (%1) \n\t" // R0
|
---|
2396 | "movq %%mm1, (%1, %2) \n\t" // R1
|
---|
2397 | "movq %%mm2, (%1, %2, 2) \n\t" // R2
|
---|
2398 | "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
|
---|
2399 | "movq %%mm4, (%1, %2, 4) \n\t" // R4
|
---|
2400 | "movq %%mm5, (%1, %%"REG_d") \n\t" // R5
|
---|
2401 | "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
|
---|
2402 | "movq %%mm7, (%1, %%"REG_c") \n\t" // R7
|
---|
2403 | "movq %%mm0, (%0) \n\t" // L0
|
---|
2404 | "movq %%mm1, (%0, %2) \n\t" // L1
|
---|
2405 | "movq %%mm2, (%0, %2, 2) \n\t" // L2
|
---|
2406 | "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
|
---|
2407 | "movq %%mm4, (%0, %2, 4) \n\t" // L4
|
---|
2408 | "movq %%mm5, (%0, %%"REG_d") \n\t" // L5
|
---|
2409 | "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
|
---|
2410 | "movq %%mm7, (%0, %%"REG_c") \n\t" // L7
|
---|
2411 | "jmp 4f \n\t"
|
---|
2412 |
|
---|
2413 | "2: \n\t"
|
---|
2414 | "cmpl 508(%%"REG_d"), %%ecx \n\t"
|
---|
2415 | " jb 3f \n\t"
|
---|
2416 |
|
---|
2417 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
|
---|
2418 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
|
---|
2419 | "movq (%0), %%mm0 \n\t" // L0
|
---|
2420 | "movq (%0, %2), %%mm1 \n\t" // L1
|
---|
2421 | "movq (%0, %2, 2), %%mm2 \n\t" // L2
|
---|
2422 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
|
---|
2423 | "movq (%1), %%mm4 \n\t" // R0
|
---|
2424 | "movq (%1, %2), %%mm5 \n\t" // R1
|
---|
2425 | "movq (%1, %2, 2), %%mm6 \n\t" // R2
|
---|
2426 | "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
|
---|
2427 | PAVGB(%%mm4, %%mm0)
|
---|
2428 | PAVGB(%%mm5, %%mm1)
|
---|
2429 | PAVGB(%%mm6, %%mm2)
|
---|
2430 | PAVGB(%%mm7, %%mm3)
|
---|
2431 | PAVGB(%%mm4, %%mm0)
|
---|
2432 | PAVGB(%%mm5, %%mm1)
|
---|
2433 | PAVGB(%%mm6, %%mm2)
|
---|
2434 | PAVGB(%%mm7, %%mm3)
|
---|
2435 | "movq %%mm0, (%1) \n\t" // R0
|
---|
2436 | "movq %%mm1, (%1, %2) \n\t" // R1
|
---|
2437 | "movq %%mm2, (%1, %2, 2) \n\t" // R2
|
---|
2438 | "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
|
---|
2439 | "movq %%mm0, (%0) \n\t" // L0
|
---|
2440 | "movq %%mm1, (%0, %2) \n\t" // L1
|
---|
2441 | "movq %%mm2, (%0, %2, 2) \n\t" // L2
|
---|
2442 | "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
|
---|
2443 |
|
---|
2444 | "movq (%0, %2, 4), %%mm0 \n\t" // L4
|
---|
2445 | "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
|
---|
2446 | "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
|
---|
2447 | "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
|
---|
2448 | "movq (%1, %2, 4), %%mm4 \n\t" // R4
|
---|
2449 | "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
|
---|
2450 | "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
|
---|
2451 | "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
|
---|
2452 | PAVGB(%%mm4, %%mm0)
|
---|
2453 | PAVGB(%%mm5, %%mm1)
|
---|
2454 | PAVGB(%%mm6, %%mm2)
|
---|
2455 | PAVGB(%%mm7, %%mm3)
|
---|
2456 | PAVGB(%%mm4, %%mm0)
|
---|
2457 | PAVGB(%%mm5, %%mm1)
|
---|
2458 | PAVGB(%%mm6, %%mm2)
|
---|
2459 | PAVGB(%%mm7, %%mm3)
|
---|
2460 | "movq %%mm0, (%1, %2, 4) \n\t" // R4
|
---|
2461 | "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
|
---|
2462 | "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
|
---|
2463 | "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
|
---|
2464 | "movq %%mm0, (%0, %2, 4) \n\t" // L4
|
---|
2465 | "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
|
---|
2466 | "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
|
---|
2467 | "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
|
---|
2468 | "jmp 4f \n\t"
|
---|
2469 |
|
---|
2470 | "3: \n\t"
|
---|
2471 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
|
---|
2472 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
|
---|
2473 | "movq (%0), %%mm0 \n\t" // L0
|
---|
2474 | "movq (%0, %2), %%mm1 \n\t" // L1
|
---|
2475 | "movq (%0, %2, 2), %%mm2 \n\t" // L2
|
---|
2476 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
|
---|
2477 | "movq (%1), %%mm4 \n\t" // R0
|
---|
2478 | "movq (%1, %2), %%mm5 \n\t" // R1
|
---|
2479 | "movq (%1, %2, 2), %%mm6 \n\t" // R2
|
---|
2480 | "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
|
---|
2481 | PAVGB(%%mm4, %%mm0)
|
---|
2482 | PAVGB(%%mm5, %%mm1)
|
---|
2483 | PAVGB(%%mm6, %%mm2)
|
---|
2484 | PAVGB(%%mm7, %%mm3)
|
---|
2485 | PAVGB(%%mm4, %%mm0)
|
---|
2486 | PAVGB(%%mm5, %%mm1)
|
---|
2487 | PAVGB(%%mm6, %%mm2)
|
---|
2488 | PAVGB(%%mm7, %%mm3)
|
---|
2489 | PAVGB(%%mm4, %%mm0)
|
---|
2490 | PAVGB(%%mm5, %%mm1)
|
---|
2491 | PAVGB(%%mm6, %%mm2)
|
---|
2492 | PAVGB(%%mm7, %%mm3)
|
---|
2493 | "movq %%mm0, (%1) \n\t" // R0
|
---|
2494 | "movq %%mm1, (%1, %2) \n\t" // R1
|
---|
2495 | "movq %%mm2, (%1, %2, 2) \n\t" // R2
|
---|
2496 | "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
|
---|
2497 | "movq %%mm0, (%0) \n\t" // L0
|
---|
2498 | "movq %%mm1, (%0, %2) \n\t" // L1
|
---|
2499 | "movq %%mm2, (%0, %2, 2) \n\t" // L2
|
---|
2500 | "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
|
---|
2501 |
|
---|
2502 | "movq (%0, %2, 4), %%mm0 \n\t" // L4
|
---|
2503 | "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
|
---|
2504 | "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
|
---|
2505 | "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
|
---|
2506 | "movq (%1, %2, 4), %%mm4 \n\t" // R4
|
---|
2507 | "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
|
---|
2508 | "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
|
---|
2509 | "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
|
---|
2510 | PAVGB(%%mm4, %%mm0)
|
---|
2511 | PAVGB(%%mm5, %%mm1)
|
---|
2512 | PAVGB(%%mm6, %%mm2)
|
---|
2513 | PAVGB(%%mm7, %%mm3)
|
---|
2514 | PAVGB(%%mm4, %%mm0)
|
---|
2515 | PAVGB(%%mm5, %%mm1)
|
---|
2516 | PAVGB(%%mm6, %%mm2)
|
---|
2517 | PAVGB(%%mm7, %%mm3)
|
---|
2518 | PAVGB(%%mm4, %%mm0)
|
---|
2519 | PAVGB(%%mm5, %%mm1)
|
---|
2520 | PAVGB(%%mm6, %%mm2)
|
---|
2521 | PAVGB(%%mm7, %%mm3)
|
---|
2522 | "movq %%mm0, (%1, %2, 4) \n\t" // R4
|
---|
2523 | "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
|
---|
2524 | "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
|
---|
2525 | "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
|
---|
2526 | "movq %%mm0, (%0, %2, 4) \n\t" // L4
|
---|
2527 | "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
|
---|
2528 | "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
|
---|
2529 | "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
|
---|
2530 |
|
---|
2531 | "4: \n\t"
|
---|
2532 |
|
---|
2533 | :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
|
---|
2534 | : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
|
---|
2535 | );
|
---|
2536 | //printf("%d\n", test);
|
---|
2537 | #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
2538 | {
|
---|
2539 | int y;
|
---|
2540 | int d=0;
|
---|
2541 | // int sysd=0;
|
---|
2542 | int i;
|
---|
2543 |
|
---|
2544 | for(y=0; y<8; y++)
|
---|
2545 | {
|
---|
2546 | int x;
|
---|
2547 | for(x=0; x<8; x++)
|
---|
2548 | {
|
---|
2549 | int ref= tempBlured[ x + y*stride ];
|
---|
2550 | int cur= src[ x + y*stride ];
|
---|
2551 | int d1=ref - cur;
|
---|
2552 | // if(x==0 || x==7) d1+= d1>>1;
|
---|
2553 | // if(y==0 || y==7) d1+= d1>>1;
|
---|
2554 | // d+= ABS(d1);
|
---|
2555 | d+= d1*d1;
|
---|
2556 | // sysd+= d1;
|
---|
2557 | }
|
---|
2558 | }
|
---|
2559 | i=d;
|
---|
2560 | d= (
|
---|
2561 | 4*d
|
---|
2562 | +(*(tempBluredPast-256))
|
---|
2563 | +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
|
---|
2564 | +(*(tempBluredPast+256))
|
---|
2565 | +4)>>3;
|
---|
2566 | *tempBluredPast=i;
|
---|
2567 | // ((*tempBluredPast)*3 + d + 2)>>2;
|
---|
2568 |
|
---|
2569 | //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
|
---|
2570 | /*
|
---|
2571 | Switch between
|
---|
2572 | 1 0 0 0 0 0 0 (0)
|
---|
2573 | 64 32 16 8 4 2 1 (1)
|
---|
2574 | 64 48 36 27 20 15 11 (33) (approx)
|
---|
2575 | 64 56 49 43 37 33 29 (200) (approx)
|
---|
2576 | */
|
---|
2577 | if(d > maxNoise[1])
|
---|
2578 | {
|
---|
2579 | if(d < maxNoise[2])
|
---|
2580 | {
|
---|
2581 | for(y=0; y<8; y++)
|
---|
2582 | {
|
---|
2583 | int x;
|
---|
2584 | for(x=0; x<8; x++)
|
---|
2585 | {
|
---|
2586 | int ref= tempBlured[ x + y*stride ];
|
---|
2587 | int cur= src[ x + y*stride ];
|
---|
2588 | tempBlured[ x + y*stride ]=
|
---|
2589 | src[ x + y*stride ]=
|
---|
2590 | (ref + cur + 1)>>1;
|
---|
2591 | }
|
---|
2592 | }
|
---|
2593 | }
|
---|
2594 | else
|
---|
2595 | {
|
---|
2596 | for(y=0; y<8; y++)
|
---|
2597 | {
|
---|
2598 | int x;
|
---|
2599 | for(x=0; x<8; x++)
|
---|
2600 | {
|
---|
2601 | tempBlured[ x + y*stride ]= src[ x + y*stride ];
|
---|
2602 | }
|
---|
2603 | }
|
---|
2604 | }
|
---|
2605 | }
|
---|
2606 | else
|
---|
2607 | {
|
---|
2608 | if(d < maxNoise[0])
|
---|
2609 | {
|
---|
2610 | for(y=0; y<8; y++)
|
---|
2611 | {
|
---|
2612 | int x;
|
---|
2613 | for(x=0; x<8; x++)
|
---|
2614 | {
|
---|
2615 | int ref= tempBlured[ x + y*stride ];
|
---|
2616 | int cur= src[ x + y*stride ];
|
---|
2617 | tempBlured[ x + y*stride ]=
|
---|
2618 | src[ x + y*stride ]=
|
---|
2619 | (ref*7 + cur + 4)>>3;
|
---|
2620 | }
|
---|
2621 | }
|
---|
2622 | }
|
---|
2623 | else
|
---|
2624 | {
|
---|
2625 | for(y=0; y<8; y++)
|
---|
2626 | {
|
---|
2627 | int x;
|
---|
2628 | for(x=0; x<8; x++)
|
---|
2629 | {
|
---|
2630 | int ref= tempBlured[ x + y*stride ];
|
---|
2631 | int cur= src[ x + y*stride ];
|
---|
2632 | tempBlured[ x + y*stride ]=
|
---|
2633 | src[ x + y*stride ]=
|
---|
2634 | (ref*3 + cur + 2)>>2;
|
---|
2635 | }
|
---|
2636 | }
|
---|
2637 | }
|
---|
2638 | }
|
---|
2639 | }
|
---|
2640 | #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
---|
2641 | }
|
---|
2642 | #endif //HAVE_ALTIVEC
|
---|
2643 |
|
---|
2644 | #ifdef HAVE_MMX
|
---|
2645 | /**
|
---|
2646 | * accurate deblock filter
|
---|
2647 | */
|
---|
2648 | static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
|
---|
2649 | int64_t dc_mask, eq_mask, both_masks;
|
---|
2650 | int64_t sums[10*8*2];
|
---|
2651 | src+= step*3; // src points to begin of the 8x8 Block
|
---|
2652 | //START_TIMER
|
---|
2653 | asm volatile(
|
---|
2654 | "movq %0, %%mm7 \n\t"
|
---|
2655 | "movq %1, %%mm6 \n\t"
|
---|
2656 | : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
|
---|
2657 | );
|
---|
2658 |
|
---|
2659 | asm volatile(
|
---|
2660 | "lea (%2, %3), %%"REG_a" \n\t"
|
---|
2661 | // 0 1 2 3 4 5 6 7 8 9
|
---|
2662 | // %2 eax eax+%3 eax+2%3 %2+4%3 eax eax+%3 eax+2%3 %2+8%3 eax+4%3 (eax = %2+%3 for lines 1-3, then %2+5*%3 for lines 5-9)
|
---|
2663 |
|
---|
2664 | "movq (%2), %%mm0 \n\t"
|
---|
2665 | "movq (%%"REG_a"), %%mm1 \n\t"
|
---|
2666 | "movq %%mm1, %%mm3 \n\t"
|
---|
2667 | "movq %%mm1, %%mm4 \n\t"
|
---|
2668 | "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
|
---|
2669 | "paddb %%mm7, %%mm0 \n\t"
|
---|
2670 | "pcmpgtb %%mm6, %%mm0 \n\t"
|
---|
2671 |
|
---|
2672 | "movq (%%"REG_a",%3), %%mm2 \n\t"
|
---|
2673 | PMAXUB(%%mm2, %%mm4)
|
---|
2674 | PMINUB(%%mm2, %%mm3, %%mm5)
|
---|
2675 | "psubb %%mm2, %%mm1 \n\t"
|
---|
2676 | "paddb %%mm7, %%mm1 \n\t"
|
---|
2677 | "pcmpgtb %%mm6, %%mm1 \n\t"
|
---|
2678 | "paddb %%mm1, %%mm0 \n\t"
|
---|
2679 |
|
---|
2680 | "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
|
---|
2681 | PMAXUB(%%mm1, %%mm4)
|
---|
2682 | PMINUB(%%mm1, %%mm3, %%mm5)
|
---|
2683 | "psubb %%mm1, %%mm2 \n\t"
|
---|
2684 | "paddb %%mm7, %%mm2 \n\t"
|
---|
2685 | "pcmpgtb %%mm6, %%mm2 \n\t"
|
---|
2686 | "paddb %%mm2, %%mm0 \n\t"
|
---|
2687 |
|
---|
2688 | "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
|
---|
2689 |
|
---|
2690 | "movq (%2, %3, 4), %%mm2 \n\t"
|
---|
2691 | PMAXUB(%%mm2, %%mm4)
|
---|
2692 | PMINUB(%%mm2, %%mm3, %%mm5)
|
---|
2693 | "psubb %%mm2, %%mm1 \n\t"
|
---|
2694 | "paddb %%mm7, %%mm1 \n\t"
|
---|
2695 | "pcmpgtb %%mm6, %%mm1 \n\t"
|
---|
2696 | "paddb %%mm1, %%mm0 \n\t"
|
---|
2697 |
|
---|
2698 | "movq (%%"REG_a"), %%mm1 \n\t"
|
---|
2699 | PMAXUB(%%mm1, %%mm4)
|
---|
2700 | PMINUB(%%mm1, %%mm3, %%mm5)
|
---|
2701 | "psubb %%mm1, %%mm2 \n\t"
|
---|
2702 | "paddb %%mm7, %%mm2 \n\t"
|
---|
2703 | "pcmpgtb %%mm6, %%mm2 \n\t"
|
---|
2704 | "paddb %%mm2, %%mm0 \n\t"
|
---|
2705 |
|
---|
2706 | "movq (%%"REG_a", %3), %%mm2 \n\t"
|
---|
2707 | PMAXUB(%%mm2, %%mm4)
|
---|
2708 | PMINUB(%%mm2, %%mm3, %%mm5)
|
---|
2709 | "psubb %%mm2, %%mm1 \n\t"
|
---|
2710 | "paddb %%mm7, %%mm1 \n\t"
|
---|
2711 | "pcmpgtb %%mm6, %%mm1 \n\t"
|
---|
2712 | "paddb %%mm1, %%mm0 \n\t"
|
---|
2713 |
|
---|
2714 | "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
|
---|
2715 | PMAXUB(%%mm1, %%mm4)
|
---|
2716 | PMINUB(%%mm1, %%mm3, %%mm5)
|
---|
2717 | "psubb %%mm1, %%mm2 \n\t"
|
---|
2718 | "paddb %%mm7, %%mm2 \n\t"
|
---|
2719 | "pcmpgtb %%mm6, %%mm2 \n\t"
|
---|
2720 | "paddb %%mm2, %%mm0 \n\t"
|
---|
2721 |
|
---|
2722 | "movq (%2, %3, 8), %%mm2 \n\t"
|
---|
2723 | PMAXUB(%%mm2, %%mm4)
|
---|
2724 | PMINUB(%%mm2, %%mm3, %%mm5)
|
---|
2725 | "psubb %%mm2, %%mm1 \n\t"
|
---|
2726 | "paddb %%mm7, %%mm1 \n\t"
|
---|
2727 | "pcmpgtb %%mm6, %%mm1 \n\t"
|
---|
2728 | "paddb %%mm1, %%mm0 \n\t"
|
---|
2729 |
|
---|
2730 | "movq (%%"REG_a", %3, 4), %%mm1 \n\t"
|
---|
2731 | "psubb %%mm1, %%mm2 \n\t"
|
---|
2732 | "paddb %%mm7, %%mm2 \n\t"
|
---|
2733 | "pcmpgtb %%mm6, %%mm2 \n\t"
|
---|
2734 | "paddb %%mm2, %%mm0 \n\t"
|
---|
2735 | "psubusb %%mm3, %%mm4 \n\t"
|
---|
2736 |
|
---|
2737 | "pxor %%mm6, %%mm6 \n\t"
|
---|
2738 | "movq %4, %%mm7 \n\t" // QP,..., QP
|
---|
2739 | "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
|
---|
2740 | "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
|
---|
2741 | "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
|
---|
2742 | "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF
|
---|
2743 | "movq %%mm7, %1 \n\t"
|
---|
2744 |
|
---|
2745 | "movq %5, %%mm7 \n\t"
|
---|
2746 | "punpcklbw %%mm7, %%mm7 \n\t"
|
---|
2747 | "punpcklbw %%mm7, %%mm7 \n\t"
|
---|
2748 | "punpcklbw %%mm7, %%mm7 \n\t"
|
---|
2749 | "psubb %%mm0, %%mm6 \n\t"
|
---|
2750 | "pcmpgtb %%mm7, %%mm6 \n\t"
|
---|
2751 | "movq %%mm6, %0 \n\t"
|
---|
2752 |
|
---|
2753 | : "=m" (eq_mask), "=m" (dc_mask)
|
---|
2754 | : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
|
---|
2755 | : "%"REG_a
|
---|
2756 | );
|
---|
2757 |
|
---|
2758 | both_masks = dc_mask & eq_mask;
|
---|
2759 |
|
---|
2760 | if(both_masks){
|
---|
2761 | long offset= -8*step;
|
---|
2762 | int64_t *temp_sums= sums;
|
---|
2763 |
|
---|
2764 | asm volatile(
|
---|
2765 | "movq %2, %%mm0 \n\t" // QP,..., QP
|
---|
2766 | "pxor %%mm4, %%mm4 \n\t"
|
---|
2767 |
|
---|
2768 | "movq (%0), %%mm6 \n\t"
|
---|
2769 | "movq (%0, %1), %%mm5 \n\t"
|
---|
2770 | "movq %%mm5, %%mm1 \n\t"
|
---|
2771 | "movq %%mm6, %%mm2 \n\t"
|
---|
2772 | "psubusb %%mm6, %%mm5 \n\t"
|
---|
2773 | "psubusb %%mm1, %%mm2 \n\t"
|
---|
2774 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
|
---|
2775 | "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
|
---|
2776 | "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
|
---|
2777 |
|
---|
2778 | "pxor %%mm6, %%mm1 \n\t"
|
---|
2779 | "pand %%mm0, %%mm1 \n\t"
|
---|
2780 | "pxor %%mm1, %%mm6 \n\t"
|
---|
2781 | // 0:QP 6:First
|
---|
2782 |
|
---|
2783 | "movq (%0, %1, 8), %%mm5 \n\t"
|
---|
2784 | "add %1, %0 \n\t" // %0 points to line 1 not 0
|
---|
2785 | "movq (%0, %1, 8), %%mm7 \n\t"
|
---|
2786 | "movq %%mm5, %%mm1 \n\t"
|
---|
2787 | "movq %%mm7, %%mm2 \n\t"
|
---|
2788 | "psubusb %%mm7, %%mm5 \n\t"
|
---|
2789 | "psubusb %%mm1, %%mm2 \n\t"
|
---|
2790 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
|
---|
2791 | "movq %2, %%mm0 \n\t" // QP,..., QP
|
---|
2792 | "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
|
---|
2793 | "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
|
---|
2794 |
|
---|
2795 | "pxor %%mm7, %%mm1 \n\t"
|
---|
2796 | "pand %%mm0, %%mm1 \n\t"
|
---|
2797 | "pxor %%mm1, %%mm7 \n\t"
|
---|
2798 |
|
---|
2799 | "movq %%mm6, %%mm5 \n\t"
|
---|
2800 | "punpckhbw %%mm4, %%mm6 \n\t"
|
---|
2801 | "punpcklbw %%mm4, %%mm5 \n\t"
|
---|
2802 | // 4:0 5/6:First 7:Last
|
---|
2803 |
|
---|
2804 | "movq %%mm5, %%mm0 \n\t"
|
---|
2805 | "movq %%mm6, %%mm1 \n\t"
|
---|
2806 | "psllw $2, %%mm0 \n\t"
|
---|
2807 | "psllw $2, %%mm1 \n\t"
|
---|
2808 | "paddw "MANGLE(w04)", %%mm0 \n\t"
|
---|
2809 | "paddw "MANGLE(w04)", %%mm1 \n\t"
|
---|
2810 |
|
---|
2811 | #define NEXT\
|
---|
2812 | "movq (%0), %%mm2 \n\t"\
|
---|
2813 | "movq (%0), %%mm3 \n\t"\
|
---|
2814 | "add %1, %0 \n\t"\
|
---|
2815 | "punpcklbw %%mm4, %%mm2 \n\t"\
|
---|
2816 | "punpckhbw %%mm4, %%mm3 \n\t"\
|
---|
2817 | "paddw %%mm2, %%mm0 \n\t"\
|
---|
2818 | "paddw %%mm3, %%mm1 \n\t"
|
---|
2819 |
|
---|
2820 | #define PREV\
|
---|
2821 | "movq (%0), %%mm2 \n\t"\
|
---|
2822 | "movq (%0), %%mm3 \n\t"\
|
---|
2823 | "add %1, %0 \n\t"\
|
---|
2824 | "punpcklbw %%mm4, %%mm2 \n\t"\
|
---|
2825 | "punpckhbw %%mm4, %%mm3 \n\t"\
|
---|
2826 | "psubw %%mm2, %%mm0 \n\t"\
|
---|
2827 | "psubw %%mm3, %%mm1 \n\t"
|
---|
2828 |
|
---|
2829 |
|
---|
2830 | NEXT //0
|
---|
2831 | NEXT //1
|
---|
2832 | NEXT //2
|
---|
2833 | "movq %%mm0, (%3) \n\t"
|
---|
2834 | "movq %%mm1, 8(%3) \n\t"
|
---|
2835 |
|
---|
2836 | NEXT //3
|
---|
2837 | "psubw %%mm5, %%mm0 \n\t"
|
---|
2838 | "psubw %%mm6, %%mm1 \n\t"
|
---|
2839 | "movq %%mm0, 16(%3) \n\t"
|
---|
2840 | "movq %%mm1, 24(%3) \n\t"
|
---|
2841 |
|
---|
2842 | NEXT //4
|
---|
2843 | "psubw %%mm5, %%mm0 \n\t"
|
---|
2844 | "psubw %%mm6, %%mm1 \n\t"
|
---|
2845 | "movq %%mm0, 32(%3) \n\t"
|
---|
2846 | "movq %%mm1, 40(%3) \n\t"
|
---|
2847 |
|
---|
2848 | NEXT //5
|
---|
2849 | "psubw %%mm5, %%mm0 \n\t"
|
---|
2850 | "psubw %%mm6, %%mm1 \n\t"
|
---|
2851 | "movq %%mm0, 48(%3) \n\t"
|
---|
2852 | "movq %%mm1, 56(%3) \n\t"
|
---|
2853 |
|
---|
2854 | NEXT //6
|
---|
2855 | "psubw %%mm5, %%mm0 \n\t"
|
---|
2856 | "psubw %%mm6, %%mm1 \n\t"
|
---|
2857 | "movq %%mm0, 64(%3) \n\t"
|
---|
2858 | "movq %%mm1, 72(%3) \n\t"
|
---|
2859 |
|
---|
2860 | "movq %%mm7, %%mm6 \n\t"
|
---|
2861 | "punpckhbw %%mm4, %%mm7 \n\t"
|
---|
2862 | "punpcklbw %%mm4, %%mm6 \n\t"
|
---|
2863 |
|
---|
2864 | NEXT //7
|
---|
2865 | "mov %4, %0 \n\t"
|
---|
2866 | "add %1, %0 \n\t"
|
---|
2867 | PREV //0
|
---|
2868 | "movq %%mm0, 80(%3) \n\t"
|
---|
2869 | "movq %%mm1, 88(%3) \n\t"
|
---|
2870 |
|
---|
2871 | PREV //1
|
---|
2872 | "paddw %%mm6, %%mm0 \n\t"
|
---|
2873 | "paddw %%mm7, %%mm1 \n\t"
|
---|
2874 | "movq %%mm0, 96(%3) \n\t"
|
---|
2875 | "movq %%mm1, 104(%3) \n\t"
|
---|
2876 |
|
---|
2877 | PREV //2
|
---|
2878 | "paddw %%mm6, %%mm0 \n\t"
|
---|
2879 | "paddw %%mm7, %%mm1 \n\t"
|
---|
2880 | "movq %%mm0, 112(%3) \n\t"
|
---|
2881 | "movq %%mm1, 120(%3) \n\t"
|
---|
2882 |
|
---|
2883 | PREV //3
|
---|
2884 | "paddw %%mm6, %%mm0 \n\t"
|
---|
2885 | "paddw %%mm7, %%mm1 \n\t"
|
---|
2886 | "movq %%mm0, 128(%3) \n\t"
|
---|
2887 | "movq %%mm1, 136(%3) \n\t"
|
---|
2888 |
|
---|
2889 | PREV //4
|
---|
2890 | "paddw %%mm6, %%mm0 \n\t"
|
---|
2891 | "paddw %%mm7, %%mm1 \n\t"
|
---|
2892 | "movq %%mm0, 144(%3) \n\t"
|
---|
2893 | "movq %%mm1, 152(%3) \n\t"
|
---|
2894 |
|
---|
2895 | "mov %4, %0 \n\t" //FIXME
|
---|
2896 |
|
---|
2897 | : "+&r"(src)
|
---|
2898 | : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
|
---|
2899 | );
|
---|
2900 |
|
---|
2901 | src+= step; // src points to begin of the 8x8 Block
|
---|
2902 |
|
---|
2903 | asm volatile(
|
---|
2904 | "movq %4, %%mm6 \n\t"
|
---|
2905 | "pcmpeqb %%mm5, %%mm5 \n\t"
|
---|
2906 | "pxor %%mm6, %%mm5 \n\t"
|
---|
2907 | "pxor %%mm7, %%mm7 \n\t"
|
---|
2908 |
|
---|
2909 | "1: \n\t"
|
---|
2910 | "movq (%1), %%mm0 \n\t"
|
---|
2911 | "movq 8(%1), %%mm1 \n\t"
|
---|
2912 | "paddw 32(%1), %%mm0 \n\t"
|
---|
2913 | "paddw 40(%1), %%mm1 \n\t"
|
---|
2914 | "movq (%0, %3), %%mm2 \n\t"
|
---|
2915 | "movq %%mm2, %%mm3 \n\t"
|
---|
2916 | "movq %%mm2, %%mm4 \n\t"
|
---|
2917 | "punpcklbw %%mm7, %%mm2 \n\t"
|
---|
2918 | "punpckhbw %%mm7, %%mm3 \n\t"
|
---|
2919 | "paddw %%mm2, %%mm0 \n\t"
|
---|
2920 | "paddw %%mm3, %%mm1 \n\t"
|
---|
2921 | "paddw %%mm2, %%mm0 \n\t"
|
---|
2922 | "paddw %%mm3, %%mm1 \n\t"
|
---|
2923 | "psrlw $4, %%mm0 \n\t"
|
---|
2924 | "psrlw $4, %%mm1 \n\t"
|
---|
2925 | "packuswb %%mm1, %%mm0 \n\t"
|
---|
2926 | "pand %%mm6, %%mm0 \n\t"
|
---|
2927 | "pand %%mm5, %%mm4 \n\t"
|
---|
2928 | "por %%mm4, %%mm0 \n\t"
|
---|
2929 | "movq %%mm0, (%0, %3) \n\t"
|
---|
2930 | "add $16, %1 \n\t"
|
---|
2931 | "add %2, %0 \n\t"
|
---|
2932 | " js 1b \n\t"
|
---|
2933 |
|
---|
2934 | : "+r"(offset), "+r"(temp_sums)
|
---|
2935 | : "r" ((long)step), "r"(src - offset), "m"(both_masks)
|
---|
2936 | );
|
---|
2937 | }else
|
---|
2938 | src+= step; // src points to begin of the 8x8 Block
|
---|
2939 |
|
---|
2940 | if(eq_mask != -1LL){
|
---|
2941 | uint8_t *temp_src= src;
|
---|
2942 | asm volatile(
|
---|
2943 | "pxor %%mm7, %%mm7 \n\t"
|
---|
2944 | "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
|
---|
2945 | "and "ALIGN_MASK", %%"REG_c" \n\t" // align
|
---|
2946 | // 0 1 2 3 4 5 6 7 8 9
|
---|
2947 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
|
---|
2948 |
|
---|
2949 | "movq (%0), %%mm0 \n\t"
|
---|
2950 | "movq %%mm0, %%mm1 \n\t"
|
---|
2951 | "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
|
---|
2952 | "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
|
---|
2953 |
|
---|
2954 | "movq (%0, %1), %%mm2 \n\t"
|
---|
2955 | "lea (%0, %1, 2), %%"REG_a" \n\t"
|
---|
2956 | "movq %%mm2, %%mm3 \n\t"
|
---|
2957 | "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
|
---|
2958 | "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
|
---|
2959 |
|
---|
2960 | "movq (%%"REG_a"), %%mm4 \n\t"
|
---|
2961 | "movq %%mm4, %%mm5 \n\t"
|
---|
2962 | "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
|
---|
2963 | "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
|
---|
2964 |
|
---|
2965 | "paddw %%mm0, %%mm0 \n\t" // 2L0
|
---|
2966 | "paddw %%mm1, %%mm1 \n\t" // 2H0
|
---|
2967 | "psubw %%mm4, %%mm2 \n\t" // L1 - L2
|
---|
2968 | "psubw %%mm5, %%mm3 \n\t" // H1 - H2
|
---|
2969 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
|
---|
2970 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
|
---|
2971 |
|
---|
2972 | "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
|
---|
2973 | "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
|
---|
2974 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
|
---|
2975 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
|
---|
2976 |
|
---|
2977 | "movq (%%"REG_a", %1), %%mm2 \n\t"
|
---|
2978 | "movq %%mm2, %%mm3 \n\t"
|
---|
2979 | "punpcklbw %%mm7, %%mm2 \n\t" // L3
|
---|
2980 | "punpckhbw %%mm7, %%mm3 \n\t" // H3
|
---|
2981 |
|
---|
2982 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
|
---|
2983 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
|
---|
2984 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
|
---|
2985 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
|
---|
2986 | "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
|
---|
2987 | "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
|
---|
2988 |
|
---|
2989 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
|
---|
2990 | "movq %%mm0, %%mm1 \n\t"
|
---|
2991 | "punpcklbw %%mm7, %%mm0 \n\t" // L4
|
---|
2992 | "punpckhbw %%mm7, %%mm1 \n\t" // H4
|
---|
2993 |
|
---|
2994 | "psubw %%mm0, %%mm2 \n\t" // L3 - L4
|
---|
2995 | "psubw %%mm1, %%mm3 \n\t" // H3 - H4
|
---|
2996 | "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
|
---|
2997 | "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
|
---|
2998 | "paddw %%mm4, %%mm4 \n\t" // 2L2
|
---|
2999 | "paddw %%mm5, %%mm5 \n\t" // 2H2
|
---|
3000 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
|
---|
3001 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
|
---|
3002 |
|
---|
3003 | "lea (%%"REG_a", %1), %0 \n\t"
|
---|
3004 | "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
|
---|
3005 | "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
|
---|
3006 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
|
---|
3007 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
|
---|
3008 | //50 opcodes so far
|
---|
3009 | "movq (%0, %1, 2), %%mm2 \n\t"
|
---|
3010 | "movq %%mm2, %%mm3 \n\t"
|
---|
3011 | "punpcklbw %%mm7, %%mm2 \n\t" // L5
|
---|
3012 | "punpckhbw %%mm7, %%mm3 \n\t" // H5
|
---|
3013 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
|
---|
3014 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
|
---|
3015 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
|
---|
3016 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
|
---|
3017 |
|
---|
3018 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
|
---|
3019 | "punpcklbw %%mm7, %%mm6 \n\t" // L6
|
---|
3020 | "psubw %%mm6, %%mm2 \n\t" // L5 - L6
|
---|
3021 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
|
---|
3022 | "punpckhbw %%mm7, %%mm6 \n\t" // H6
|
---|
3023 | "psubw %%mm6, %%mm3 \n\t" // H5 - H6
|
---|
3024 |
|
---|
3025 | "paddw %%mm0, %%mm0 \n\t" // 2L4
|
---|
3026 | "paddw %%mm1, %%mm1 \n\t" // 2H4
|
---|
3027 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
|
---|
3028 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
|
---|
3029 |
|
---|
3030 | "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
|
---|
3031 | "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
|
---|
3032 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
|
---|
3033 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
|
---|
3034 |
|
---|
3035 | "movq (%0, %1, 4), %%mm2 \n\t"
|
---|
3036 | "movq %%mm2, %%mm3 \n\t"
|
---|
3037 | "punpcklbw %%mm7, %%mm2 \n\t" // L7
|
---|
3038 | "punpckhbw %%mm7, %%mm3 \n\t" // H7
|
---|
3039 |
|
---|
3040 | "paddw %%mm2, %%mm2 \n\t" // 2L7
|
---|
3041 | "paddw %%mm3, %%mm3 \n\t" // 2H7
|
---|
3042 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
|
---|
3043 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
|
---|
3044 |
|
---|
3045 | "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
|
---|
3046 | "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
|
---|
3047 |
|
---|
3048 | #ifdef HAVE_MMX2
|
---|
3049 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3050 | "psubw %%mm0, %%mm6 \n\t"
|
---|
3051 | "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
|
---|
3052 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3053 | "psubw %%mm1, %%mm6 \n\t"
|
---|
3054 | "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
|
---|
3055 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3056 | "psubw %%mm2, %%mm6 \n\t"
|
---|
3057 | "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
|
---|
3058 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3059 | "psubw %%mm3, %%mm6 \n\t"
|
---|
3060 | "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
|
---|
3061 | #else
|
---|
3062 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3063 | "pcmpgtw %%mm0, %%mm6 \n\t"
|
---|
3064 | "pxor %%mm6, %%mm0 \n\t"
|
---|
3065 | "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
|
---|
3066 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3067 | "pcmpgtw %%mm1, %%mm6 \n\t"
|
---|
3068 | "pxor %%mm6, %%mm1 \n\t"
|
---|
3069 | "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
|
---|
3070 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3071 | "pcmpgtw %%mm2, %%mm6 \n\t"
|
---|
3072 | "pxor %%mm6, %%mm2 \n\t"
|
---|
3073 | "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
|
---|
3074 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3075 | "pcmpgtw %%mm3, %%mm6 \n\t"
|
---|
3076 | "pxor %%mm6, %%mm3 \n\t"
|
---|
3077 | "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
|
---|
3078 | #endif
|
---|
3079 |
|
---|
3080 | #ifdef HAVE_MMX2
|
---|
3081 | "pminsw %%mm2, %%mm0 \n\t"
|
---|
3082 | "pminsw %%mm3, %%mm1 \n\t"
|
---|
3083 | #else
|
---|
3084 | "movq %%mm0, %%mm6 \n\t"
|
---|
3085 | "psubusw %%mm2, %%mm6 \n\t"
|
---|
3086 | "psubw %%mm6, %%mm0 \n\t"
|
---|
3087 | "movq %%mm1, %%mm6 \n\t"
|
---|
3088 | "psubusw %%mm3, %%mm6 \n\t"
|
---|
3089 | "psubw %%mm6, %%mm1 \n\t"
|
---|
3090 | #endif
|
---|
3091 |
|
---|
3092 | "movd %2, %%mm2 \n\t" // QP
|
---|
3093 | "punpcklbw %%mm7, %%mm2 \n\t"
|
---|
3094 |
|
---|
3095 | "movq %%mm7, %%mm6 \n\t" // 0
|
---|
3096 | "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
|
---|
3097 | "pxor %%mm6, %%mm4 \n\t"
|
---|
3098 | "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
|
---|
3099 | "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
|
---|
3100 | "pxor %%mm7, %%mm5 \n\t"
|
---|
3101 | "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
|
---|
3102 | // 100 opcodes
|
---|
3103 | "psllw $3, %%mm2 \n\t" // 8QP
|
---|
3104 | "movq %%mm2, %%mm3 \n\t" // 8QP
|
---|
3105 | "pcmpgtw %%mm4, %%mm2 \n\t"
|
---|
3106 | "pcmpgtw %%mm5, %%mm3 \n\t"
|
---|
3107 | "pand %%mm2, %%mm4 \n\t"
|
---|
3108 | "pand %%mm3, %%mm5 \n\t"
|
---|
3109 |
|
---|
3110 |
|
---|
3111 | "psubusw %%mm0, %%mm4 \n\t" // hd
|
---|
3112 | "psubusw %%mm1, %%mm5 \n\t" // ld
|
---|
3113 |
|
---|
3114 |
|
---|
3115 | "movq "MANGLE(w05)", %%mm2 \n\t" // 5
|
---|
3116 | "pmullw %%mm2, %%mm4 \n\t"
|
---|
3117 | "pmullw %%mm2, %%mm5 \n\t"
|
---|
3118 | "movq "MANGLE(w20)", %%mm2 \n\t" // 32
|
---|
3119 | "paddw %%mm2, %%mm4 \n\t"
|
---|
3120 | "paddw %%mm2, %%mm5 \n\t"
|
---|
3121 | "psrlw $6, %%mm4 \n\t"
|
---|
3122 | "psrlw $6, %%mm5 \n\t"
|
---|
3123 |
|
---|
3124 | "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
|
---|
3125 | "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
|
---|
3126 |
|
---|
3127 | "pxor %%mm2, %%mm2 \n\t"
|
---|
3128 | "pxor %%mm3, %%mm3 \n\t"
|
---|
3129 |
|
---|
3130 | "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
|
---|
3131 | "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
|
---|
3132 | "pxor %%mm2, %%mm0 \n\t"
|
---|
3133 | "pxor %%mm3, %%mm1 \n\t"
|
---|
3134 | "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
|
---|
3135 | "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
|
---|
3136 | "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
|
---|
3137 | "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
|
---|
3138 |
|
---|
3139 | "pxor %%mm6, %%mm2 \n\t"
|
---|
3140 | "pxor %%mm7, %%mm3 \n\t"
|
---|
3141 | "pand %%mm2, %%mm4 \n\t"
|
---|
3142 | "pand %%mm3, %%mm5 \n\t"
|
---|
3143 |
|
---|
3144 | #ifdef HAVE_MMX2
|
---|
3145 | "pminsw %%mm0, %%mm4 \n\t"
|
---|
3146 | "pminsw %%mm1, %%mm5 \n\t"
|
---|
3147 | #else
|
---|
3148 | "movq %%mm4, %%mm2 \n\t"
|
---|
3149 | "psubusw %%mm0, %%mm2 \n\t"
|
---|
3150 | "psubw %%mm2, %%mm4 \n\t"
|
---|
3151 | "movq %%mm5, %%mm2 \n\t"
|
---|
3152 | "psubusw %%mm1, %%mm2 \n\t"
|
---|
3153 | "psubw %%mm2, %%mm5 \n\t"
|
---|
3154 | #endif
|
---|
3155 | "pxor %%mm6, %%mm4 \n\t"
|
---|
3156 | "pxor %%mm7, %%mm5 \n\t"
|
---|
3157 | "psubw %%mm6, %%mm4 \n\t"
|
---|
3158 | "psubw %%mm7, %%mm5 \n\t"
|
---|
3159 | "packsswb %%mm5, %%mm4 \n\t"
|
---|
3160 | "movq %3, %%mm1 \n\t"
|
---|
3161 | "pandn %%mm4, %%mm1 \n\t"
|
---|
3162 | "movq (%0), %%mm0 \n\t"
|
---|
3163 | "paddb %%mm1, %%mm0 \n\t"
|
---|
3164 | "movq %%mm0, (%0) \n\t"
|
---|
3165 | "movq (%0, %1), %%mm0 \n\t"
|
---|
3166 | "psubb %%mm1, %%mm0 \n\t"
|
---|
3167 | "movq %%mm0, (%0, %1) \n\t"
|
---|
3168 |
|
---|
3169 | : "+r" (temp_src)
|
---|
3170 | : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
|
---|
3171 | : "%"REG_a, "%"REG_c
|
---|
3172 | );
|
---|
3173 | }
|
---|
3174 | /*if(step==16){
|
---|
3175 | STOP_TIMER("step16")
|
---|
3176 | }else{
|
---|
3177 | STOP_TIMER("stepX")
|
---|
3178 | }*/
|
---|
3179 | }
|
---|
3180 | #endif //HAVE_MMX
|
---|
3181 |
|
---|
3182 | static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
---|
3183 | QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
|
---|
3184 |
|
---|
3185 | /**
|
---|
3186 | * Copies a block from src to dst and fixes the blacklevel
|
---|
3187 | * levelFix == 0 -> dont touch the brighness & contrast
|
---|
3188 | */
|
---|
3189 | #undef SCALED_CPY
|
---|
3190 |
|
---|
3191 | static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
|
---|
3192 | int levelFix, int64_t *packedOffsetAndScale)
|
---|
3193 | {
|
---|
3194 | #ifndef HAVE_MMX
|
---|
3195 | int i;
|
---|
3196 | #endif
|
---|
3197 | if(levelFix)
|
---|
3198 | {
|
---|
3199 | #ifdef HAVE_MMX
|
---|
3200 | asm volatile(
|
---|
3201 | "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
|
---|
3202 | "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
|
---|
3203 | "lea (%2,%4), %%"REG_a" \n\t"
|
---|
3204 | "lea (%3,%5), %%"REG_d" \n\t"
|
---|
3205 | "pxor %%mm4, %%mm4 \n\t"
|
---|
3206 | #ifdef HAVE_MMX2
|
---|
3207 | #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
|
---|
3208 | "movq " #src1 ", %%mm0 \n\t"\
|
---|
3209 | "movq " #src1 ", %%mm5 \n\t"\
|
---|
3210 | "movq " #src2 ", %%mm1 \n\t"\
|
---|
3211 | "movq " #src2 ", %%mm6 \n\t"\
|
---|
3212 | "punpcklbw %%mm0, %%mm0 \n\t"\
|
---|
3213 | "punpckhbw %%mm5, %%mm5 \n\t"\
|
---|
3214 | "punpcklbw %%mm1, %%mm1 \n\t"\
|
---|
3215 | "punpckhbw %%mm6, %%mm6 \n\t"\
|
---|
3216 | "pmulhuw %%mm3, %%mm0 \n\t"\
|
---|
3217 | "pmulhuw %%mm3, %%mm5 \n\t"\
|
---|
3218 | "pmulhuw %%mm3, %%mm1 \n\t"\
|
---|
3219 | "pmulhuw %%mm3, %%mm6 \n\t"\
|
---|
3220 | "psubw %%mm2, %%mm0 \n\t"\
|
---|
3221 | "psubw %%mm2, %%mm5 \n\t"\
|
---|
3222 | "psubw %%mm2, %%mm1 \n\t"\
|
---|
3223 | "psubw %%mm2, %%mm6 \n\t"\
|
---|
3224 | "packuswb %%mm5, %%mm0 \n\t"\
|
---|
3225 | "packuswb %%mm6, %%mm1 \n\t"\
|
---|
3226 | "movq %%mm0, " #dst1 " \n\t"\
|
---|
3227 | "movq %%mm1, " #dst2 " \n\t"\
|
---|
3228 |
|
---|
3229 | #else //HAVE_MMX2
|
---|
3230 | #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
|
---|
3231 | "movq " #src1 ", %%mm0 \n\t"\
|
---|
3232 | "movq " #src1 ", %%mm5 \n\t"\
|
---|
3233 | "punpcklbw %%mm4, %%mm0 \n\t"\
|
---|
3234 | "punpckhbw %%mm4, %%mm5 \n\t"\
|
---|
3235 | "psubw %%mm2, %%mm0 \n\t"\
|
---|
3236 | "psubw %%mm2, %%mm5 \n\t"\
|
---|
3237 | "movq " #src2 ", %%mm1 \n\t"\
|
---|
3238 | "psllw $6, %%mm0 \n\t"\
|
---|
3239 | "psllw $6, %%mm5 \n\t"\
|
---|
3240 | "pmulhw %%mm3, %%mm0 \n\t"\
|
---|
3241 | "movq " #src2 ", %%mm6 \n\t"\
|
---|
3242 | "pmulhw %%mm3, %%mm5 \n\t"\
|
---|
3243 | "punpcklbw %%mm4, %%mm1 \n\t"\
|
---|
3244 | "punpckhbw %%mm4, %%mm6 \n\t"\
|
---|
3245 | "psubw %%mm2, %%mm1 \n\t"\
|
---|
3246 | "psubw %%mm2, %%mm6 \n\t"\
|
---|
3247 | "psllw $6, %%mm1 \n\t"\
|
---|
3248 | "psllw $6, %%mm6 \n\t"\
|
---|
3249 | "pmulhw %%mm3, %%mm1 \n\t"\
|
---|
3250 | "pmulhw %%mm3, %%mm6 \n\t"\
|
---|
3251 | "packuswb %%mm5, %%mm0 \n\t"\
|
---|
3252 | "packuswb %%mm6, %%mm1 \n\t"\
|
---|
3253 | "movq %%mm0, " #dst1 " \n\t"\
|
---|
3254 | "movq %%mm1, " #dst2 " \n\t"\
|
---|
3255 |
|
---|
3256 | #endif //HAVE_MMX2
|
---|
3257 | #define SCALED_CPY(src1, src2, dst1, dst2)\
|
---|
3258 | REAL_SCALED_CPY(src1, src2, dst1, dst2)
|
---|
3259 |
|
---|
3260 | SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
|
---|
3261 | SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
|
---|
3262 | SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
|
---|
3263 | "lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
|
---|
3264 | "lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
|
---|
3265 | SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
|
---|
3266 |
|
---|
3267 |
|
---|
3268 | : "=&a" (packedOffsetAndScale)
|
---|
3269 | : "0" (packedOffsetAndScale),
|
---|
3270 | "r"(src),
|
---|
3271 | "r"(dst),
|
---|
3272 | "r" ((long)srcStride),
|
---|
3273 | "r" ((long)dstStride)
|
---|
3274 | : "%"REG_d
|
---|
3275 | );
|
---|
3276 | #else //HAVE_MMX
|
---|
3277 | for(i=0; i<8; i++)
|
---|
3278 | memcpy( &(dst[dstStride*i]),
|
---|
3279 | &(src[srcStride*i]), BLOCK_SIZE);
|
---|
3280 | #endif //HAVE_MMX
|
---|
3281 | }
|
---|
3282 | else
|
---|
3283 | {
|
---|
3284 | #ifdef HAVE_MMX
|
---|
3285 | asm volatile(
|
---|
3286 | "lea (%0,%2), %%"REG_a" \n\t"
|
---|
3287 | "lea (%1,%3), %%"REG_d" \n\t"
|
---|
3288 |
|
---|
3289 | #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
|
---|
3290 | "movq " #src1 ", %%mm0 \n\t"\
|
---|
3291 | "movq " #src2 ", %%mm1 \n\t"\
|
---|
3292 | "movq %%mm0, " #dst1 " \n\t"\
|
---|
3293 | "movq %%mm1, " #dst2 " \n\t"\
|
---|
3294 |
|
---|
3295 | #define SIMPLE_CPY(src1, src2, dst1, dst2)\
|
---|
3296 | REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
|
---|
3297 |
|
---|
3298 | SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
|
---|
3299 | SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
|
---|
3300 | SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
|
---|
3301 | "lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
|
---|
3302 | "lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
|
---|
3303 | SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
|
---|
3304 |
|
---|
3305 | : : "r" (src),
|
---|
3306 | "r" (dst),
|
---|
3307 | "r" ((long)srcStride),
|
---|
3308 | "r" ((long)dstStride)
|
---|
3309 | : "%"REG_a, "%"REG_d
|
---|
3310 | );
|
---|
3311 | #else //HAVE_MMX
|
---|
3312 | for(i=0; i<8; i++)
|
---|
3313 | memcpy( &(dst[dstStride*i]),
|
---|
3314 | &(src[srcStride*i]), BLOCK_SIZE);
|
---|
3315 | #endif //HAVE_MMX
|
---|
3316 | }
|
---|
3317 | }
|
---|
3318 |
|
---|
3319 | /**
|
---|
3320 | * Duplicates the given 8 src pixels ? times upward
|
---|
3321 | */
|
---|
3322 | static inline void RENAME(duplicate)(uint8_t src[], int stride)
|
---|
3323 | {
|
---|
3324 | #ifdef HAVE_MMX
|
---|
3325 | asm volatile(
|
---|
3326 | "movq (%0), %%mm0 \n\t"
|
---|
3327 | "add %1, %0 \n\t"
|
---|
3328 | "movq %%mm0, (%0) \n\t"
|
---|
3329 | "movq %%mm0, (%0, %1) \n\t"
|
---|
3330 | "movq %%mm0, (%0, %1, 2) \n\t"
|
---|
3331 | : "+r" (src)
|
---|
3332 | : "r" ((long)-stride)
|
---|
3333 | );
|
---|
3334 | #else
|
---|
3335 | int i;
|
---|
3336 | uint8_t *p=src;
|
---|
3337 | for(i=0; i<3; i++)
|
---|
3338 | {
|
---|
3339 | p-= stride;
|
---|
3340 | memcpy(p, src, 8);
|
---|
3341 | }
|
---|
3342 | #endif
|
---|
3343 | }
|
---|
3344 |
|
---|
3345 | /**
|
---|
3346 | * Filters array of bytes (Y or U or V values)
|
---|
3347 | */
|
---|
3348 | static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
---|
3349 | QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
|
---|
3350 | {
|
---|
3351 | PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
|
---|
3352 | int x,y;
|
---|
3353 | #ifdef COMPILE_TIME_MODE
|
---|
3354 | const int mode= COMPILE_TIME_MODE;
|
---|
3355 | #else
|
---|
3356 | const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
|
---|
3357 | #endif
|
---|
3358 | int black=0, white=255; // blackest black and whitest white in the picture
|
---|
3359 | int QPCorrecture= 256*256;
|
---|
3360 |
|
---|
3361 | int copyAhead;
|
---|
3362 | #ifdef HAVE_MMX
|
---|
3363 | int i;
|
---|
3364 | #endif
|
---|
3365 |
|
---|
3366 | const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
|
---|
3367 | const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
|
---|
3368 |
|
---|
3369 | //FIXME remove
|
---|
3370 | uint64_t * const yHistogram= c.yHistogram;
|
---|
3371 | uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
|
---|
3372 | uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
|
---|
3373 | //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
|
---|
3374 |
|
---|
3375 | #ifdef HAVE_MMX
|
---|
3376 | for(i=0; i<57; i++){
|
---|
3377 | int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
|
---|
3378 | int threshold= offset*2 + 1;
|
---|
3379 | c.mmxDcOffset[i]= 0x7F - offset;
|
---|
3380 | c.mmxDcThreshold[i]= 0x7F - threshold;
|
---|
3381 | c.mmxDcOffset[i]*= 0x0101010101010101LL;
|
---|
3382 | c.mmxDcThreshold[i]*= 0x0101010101010101LL;
|
---|
3383 | }
|
---|
3384 | #endif
|
---|
3385 |
|
---|
3386 | if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
|
---|
3387 | else if( (mode & LINEAR_BLEND_DEINT_FILTER)
|
---|
3388 | || (mode & FFMPEG_DEINT_FILTER)
|
---|
3389 | || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
|
---|
3390 | else if( (mode & V_DEBLOCK)
|
---|
3391 | || (mode & LINEAR_IPOL_DEINT_FILTER)
|
---|
3392 | || (mode & MEDIAN_DEINT_FILTER)
|
---|
3393 | || (mode & V_A_DEBLOCK)) copyAhead=13;
|
---|
3394 | else if(mode & V_X1_FILTER) copyAhead=11;
|
---|
3395 | // else if(mode & V_RK1_FILTER) copyAhead=10;
|
---|
3396 | else if(mode & DERING) copyAhead=9;
|
---|
3397 | else copyAhead=8;
|
---|
3398 |
|
---|
3399 | copyAhead-= 8;
|
---|
3400 |
|
---|
3401 | if(!isColor)
|
---|
3402 | {
|
---|
3403 | uint64_t sum= 0;
|
---|
3404 | int i;
|
---|
3405 | uint64_t maxClipped;
|
---|
3406 | uint64_t clipped;
|
---|
3407 | double scale;
|
---|
3408 |
|
---|
3409 | c.frameNum++;
|
---|
3410 | // first frame is fscked so we ignore it
|
---|
3411 | if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
|
---|
3412 |
|
---|
3413 | for(i=0; i<256; i++)
|
---|
3414 | {
|
---|
3415 | sum+= yHistogram[i];
|
---|
3416 | // printf("%d ", yHistogram[i]);
|
---|
3417 | }
|
---|
3418 | // printf("\n\n");
|
---|
3419 |
|
---|
3420 | /* we allways get a completly black picture first */
|
---|
3421 | maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
|
---|
3422 |
|
---|
3423 | clipped= sum;
|
---|
3424 | for(black=255; black>0; black--)
|
---|
3425 | {
|
---|
3426 | if(clipped < maxClipped) break;
|
---|
3427 | clipped-= yHistogram[black];
|
---|
3428 | }
|
---|
3429 |
|
---|
3430 | clipped= sum;
|
---|
3431 | for(white=0; white<256; white++)
|
---|
3432 | {
|
---|
3433 | if(clipped < maxClipped) break;
|
---|
3434 | clipped-= yHistogram[white];
|
---|
3435 | }
|
---|
3436 |
|
---|
3437 | scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
|
---|
3438 |
|
---|
3439 | #ifdef HAVE_MMX2
|
---|
3440 | c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
|
---|
3441 | c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
|
---|
3442 | #else
|
---|
3443 | c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
|
---|
3444 | c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
|
---|
3445 | #endif
|
---|
3446 |
|
---|
3447 | c.packedYOffset|= c.packedYOffset<<32;
|
---|
3448 | c.packedYOffset|= c.packedYOffset<<16;
|
---|
3449 |
|
---|
3450 | c.packedYScale|= c.packedYScale<<32;
|
---|
3451 | c.packedYScale|= c.packedYScale<<16;
|
---|
3452 |
|
---|
3453 | if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
|
---|
3454 | else QPCorrecture= 256*256;
|
---|
3455 | }
|
---|
3456 | else
|
---|
3457 | {
|
---|
3458 | c.packedYScale= 0x0100010001000100LL;
|
---|
3459 | c.packedYOffset= 0;
|
---|
3460 | QPCorrecture= 256*256;
|
---|
3461 | }
|
---|
3462 |
|
---|
3463 | /* copy & deinterlace first row of blocks */
|
---|
3464 | y=-BLOCK_SIZE;
|
---|
3465 | {
|
---|
3466 | uint8_t *srcBlock= &(src[y*srcStride]);
|
---|
3467 | uint8_t *dstBlock= tempDst + dstStride;
|
---|
3468 |
|
---|
3469 | // From this point on it is guranteed that we can read and write 16 lines downward
|
---|
3470 | // finish 1 block before the next otherwise we might have a problem
|
---|
3471 | // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
|
---|
3472 | for(x=0; x<width; x+=BLOCK_SIZE)
|
---|
3473 | {
|
---|
3474 |
|
---|
3475 | #ifdef HAVE_MMX2
|
---|
3476 | /*
|
---|
3477 | prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
|
---|
3478 | prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
|
---|
3479 | prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
|
---|
3480 | prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
|
---|
3481 | */
|
---|
3482 |
|
---|
3483 | asm(
|
---|
3484 | "mov %4, %%"REG_a" \n\t"
|
---|
3485 | "shr $2, %%"REG_a" \n\t"
|
---|
3486 | "and $6, %%"REG_a" \n\t"
|
---|
3487 | "add %5, %%"REG_a" \n\t"
|
---|
3488 | "mov %%"REG_a", %%"REG_d" \n\t"
|
---|
3489 | "imul %1, %%"REG_a" \n\t"
|
---|
3490 | "imul %3, %%"REG_d" \n\t"
|
---|
3491 | "prefetchnta 32(%%"REG_a", %0) \n\t"
|
---|
3492 | "prefetcht0 32(%%"REG_d", %2) \n\t"
|
---|
3493 | "add %1, %%"REG_a" \n\t"
|
---|
3494 | "add %3, %%"REG_d" \n\t"
|
---|
3495 | "prefetchnta 32(%%"REG_a", %0) \n\t"
|
---|
3496 | "prefetcht0 32(%%"REG_d", %2) \n\t"
|
---|
3497 | :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
|
---|
3498 | "g" ((long)x), "g" ((long)copyAhead)
|
---|
3499 | : "%"REG_a, "%"REG_d
|
---|
3500 | );
|
---|
3501 |
|
---|
3502 | #elif defined(HAVE_3DNOW)
|
---|
3503 | //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
|
---|
3504 | /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
|
---|
3505 | prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
|
---|
3506 | prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
|
---|
3507 | prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
|
---|
3508 | */
|
---|
3509 | #endif
|
---|
3510 |
|
---|
3511 | RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
|
---|
3512 | srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
|
---|
3513 |
|
---|
3514 | RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
|
---|
3515 |
|
---|
3516 | if(mode & LINEAR_IPOL_DEINT_FILTER)
|
---|
3517 | RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
|
---|
3518 | else if(mode & LINEAR_BLEND_DEINT_FILTER)
|
---|
3519 | RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
|
---|
3520 | else if(mode & MEDIAN_DEINT_FILTER)
|
---|
3521 | RENAME(deInterlaceMedian)(dstBlock, dstStride);
|
---|
3522 | else if(mode & CUBIC_IPOL_DEINT_FILTER)
|
---|
3523 | RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
|
---|
3524 | else if(mode & FFMPEG_DEINT_FILTER)
|
---|
3525 | RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
|
---|
3526 | else if(mode & LOWPASS5_DEINT_FILTER)
|
---|
3527 | RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
|
---|
3528 | /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
|
---|
3529 | RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
|
---|
3530 | */
|
---|
3531 | dstBlock+=8;
|
---|
3532 | srcBlock+=8;
|
---|
3533 | }
|
---|
3534 | if(width==ABS(dstStride))
|
---|
3535 | linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
|
---|
3536 | else
|
---|
3537 | {
|
---|
3538 | int i;
|
---|
3539 | for(i=0; i<copyAhead; i++)
|
---|
3540 | {
|
---|
3541 | memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
|
---|
3542 | }
|
---|
3543 | }
|
---|
3544 | }
|
---|
3545 |
|
---|
3546 | //printf("\n");
|
---|
3547 | for(y=0; y<height; y+=BLOCK_SIZE)
|
---|
3548 | {
|
---|
3549 | //1% speedup if these are here instead of the inner loop
|
---|
3550 | uint8_t *srcBlock= &(src[y*srcStride]);
|
---|
3551 | uint8_t *dstBlock= &(dst[y*dstStride]);
|
---|
3552 | #ifdef HAVE_MMX
|
---|
3553 | uint8_t *tempBlock1= c.tempBlocks;
|
---|
3554 | uint8_t *tempBlock2= c.tempBlocks + 8;
|
---|
3555 | #endif
|
---|
3556 | int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
|
---|
3557 | int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)];
|
---|
3558 | int QP=0;
|
---|
3559 | /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
|
---|
3560 | if not than use a temporary buffer */
|
---|
3561 | if(y+15 >= height)
|
---|
3562 | {
|
---|
3563 | int i;
|
---|
3564 | /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
|
---|
3565 | blockcopy to dst later */
|
---|
3566 | linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
|
---|
3567 | MAX(height-y-copyAhead, 0), srcStride);
|
---|
3568 |
|
---|
3569 | /* duplicate last line of src to fill the void upto line (copyAhead+7) */
|
---|
3570 | for(i=MAX(height-y, 8); i<copyAhead+8; i++)
|
---|
3571 | memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride));
|
---|
3572 |
|
---|
3573 | /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
|
---|
3574 | linecpy(tempDst, dstBlock - dstStride, MIN(height-y+1, copyAhead+1), dstStride);
|
---|
3575 |
|
---|
3576 | /* duplicate last line of dst to fill the void upto line (copyAhead) */
|
---|
3577 | for(i=height-y+1; i<=copyAhead; i++)
|
---|
3578 | memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride));
|
---|
3579 |
|
---|
3580 | dstBlock= tempDst + dstStride;
|
---|
3581 | srcBlock= tempSrc;
|
---|
3582 | }
|
---|
3583 | //printf("\n");
|
---|
3584 |
|
---|
3585 | // From this point on it is guranteed that we can read and write 16 lines downward
|
---|
3586 | // finish 1 block before the next otherwise we might have a problem
|
---|
3587 | // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
|
---|
3588 | for(x=0; x<width; x+=BLOCK_SIZE)
|
---|
3589 | {
|
---|
3590 | const int stride= dstStride;
|
---|
3591 | #ifdef HAVE_MMX
|
---|
3592 | uint8_t *tmpXchg;
|
---|
3593 | #endif
|
---|
3594 | if(isColor)
|
---|
3595 | {
|
---|
3596 | QP= QPptr[x>>qpHShift];
|
---|
3597 | c.nonBQP= nonBQPptr[x>>qpHShift];
|
---|
3598 | }
|
---|
3599 | else
|
---|
3600 | {
|
---|
3601 | QP= QPptr[x>>4];
|
---|
3602 | QP= (QP* QPCorrecture + 256*128)>>16;
|
---|
3603 | c.nonBQP= nonBQPptr[x>>4];
|
---|
3604 | c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
|
---|
3605 | yHistogram[ srcBlock[srcStride*12 + 4] ]++;
|
---|
3606 | }
|
---|
3607 | c.QP= QP;
|
---|
3608 | #ifdef HAVE_MMX
|
---|
3609 | asm volatile(
|
---|
3610 | "movd %1, %%mm7 \n\t"
|
---|
3611 | "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
|
---|
3612 | "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
|
---|
3613 | "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
|
---|
3614 | "movq %%mm7, %0 \n\t"
|
---|
3615 | : "=m" (c.pQPb)
|
---|
3616 | : "r" (QP)
|
---|
3617 | );
|
---|
3618 | #endif
|
---|
3619 |
|
---|
3620 |
|
---|
3621 | #ifdef HAVE_MMX2
|
---|
3622 | /*
|
---|
3623 | prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
|
---|
3624 | prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
|
---|
3625 | prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
|
---|
3626 | prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
|
---|
3627 | */
|
---|
3628 |
|
---|
3629 | asm(
|
---|
3630 | "mov %4, %%"REG_a" \n\t"
|
---|
3631 | "shr $2, %%"REG_a" \n\t"
|
---|
3632 | "and $6, %%"REG_a" \n\t"
|
---|
3633 | "add %5, %%"REG_a" \n\t"
|
---|
3634 | "mov %%"REG_a", %%"REG_d" \n\t"
|
---|
3635 | "imul %1, %%"REG_a" \n\t"
|
---|
3636 | "imul %3, %%"REG_d" \n\t"
|
---|
3637 | "prefetchnta 32(%%"REG_a", %0) \n\t"
|
---|
3638 | "prefetcht0 32(%%"REG_d", %2) \n\t"
|
---|
3639 | "add %1, %%"REG_a" \n\t"
|
---|
3640 | "add %3, %%"REG_d" \n\t"
|
---|
3641 | "prefetchnta 32(%%"REG_a", %0) \n\t"
|
---|
3642 | "prefetcht0 32(%%"REG_d", %2) \n\t"
|
---|
3643 | :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
|
---|
3644 | "g" ((long)x), "g" ((long)copyAhead)
|
---|
3645 | : "%"REG_a, "%"REG_d
|
---|
3646 | );
|
---|
3647 |
|
---|
3648 | #elif defined(HAVE_3DNOW)
|
---|
3649 | //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
|
---|
3650 | /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
|
---|
3651 | prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
|
---|
3652 | prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
|
---|
3653 | prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
|
---|
3654 | */
|
---|
3655 | #endif
|
---|
3656 |
|
---|
3657 | RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
|
---|
3658 | srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
|
---|
3659 |
|
---|
3660 | if(mode & LINEAR_IPOL_DEINT_FILTER)
|
---|
3661 | RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
|
---|
3662 | else if(mode & LINEAR_BLEND_DEINT_FILTER)
|
---|
3663 | RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
|
---|
3664 | else if(mode & MEDIAN_DEINT_FILTER)
|
---|
3665 | RENAME(deInterlaceMedian)(dstBlock, dstStride);
|
---|
3666 | else if(mode & CUBIC_IPOL_DEINT_FILTER)
|
---|
3667 | RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
|
---|
3668 | else if(mode & FFMPEG_DEINT_FILTER)
|
---|
3669 | RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
|
---|
3670 | else if(mode & LOWPASS5_DEINT_FILTER)
|
---|
3671 | RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
|
---|
3672 | /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
|
---|
3673 | RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
|
---|
3674 | */
|
---|
3675 |
|
---|
3676 | /* only deblock if we have 2 blocks */
|
---|
3677 | if(y + 8 < height)
|
---|
3678 | {
|
---|
3679 | if(mode & V_X1_FILTER)
|
---|
3680 | RENAME(vertX1Filter)(dstBlock, stride, &c);
|
---|
3681 | else if(mode & V_DEBLOCK)
|
---|
3682 | {
|
---|
3683 | const int t= RENAME(vertClassify)(dstBlock, stride, &c);
|
---|
3684 |
|
---|
3685 | if(t==1)
|
---|
3686 | RENAME(doVertLowPass)(dstBlock, stride, &c);
|
---|
3687 | else if(t==2)
|
---|
3688 | RENAME(doVertDefFilter)(dstBlock, stride, &c);
|
---|
3689 | }else if(mode & V_A_DEBLOCK){
|
---|
3690 | RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
|
---|
3691 | }
|
---|
3692 | }
|
---|
3693 |
|
---|
3694 | #ifdef HAVE_MMX
|
---|
3695 | RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
|
---|
3696 | #endif
|
---|
3697 | /* check if we have a previous block to deblock it with dstBlock */
|
---|
3698 | if(x - 8 >= 0)
|
---|
3699 | {
|
---|
3700 | #ifdef HAVE_MMX
|
---|
3701 | if(mode & H_X1_FILTER)
|
---|
3702 | RENAME(vertX1Filter)(tempBlock1, 16, &c);
|
---|
3703 | else if(mode & H_DEBLOCK)
|
---|
3704 | {
|
---|
3705 | //START_TIMER
|
---|
3706 | const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
|
---|
3707 | //STOP_TIMER("dc & minmax")
|
---|
3708 | if(t==1)
|
---|
3709 | RENAME(doVertLowPass)(tempBlock1, 16, &c);
|
---|
3710 | else if(t==2)
|
---|
3711 | RENAME(doVertDefFilter)(tempBlock1, 16, &c);
|
---|
3712 | }else if(mode & H_A_DEBLOCK){
|
---|
3713 | RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
|
---|
3714 | }
|
---|
3715 |
|
---|
3716 | RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
|
---|
3717 |
|
---|
3718 | #else
|
---|
3719 | if(mode & H_X1_FILTER)
|
---|
3720 | horizX1Filter(dstBlock-4, stride, QP);
|
---|
3721 | else if(mode & H_DEBLOCK)
|
---|
3722 | {
|
---|
3723 | #ifdef HAVE_ALTIVEC
|
---|
3724 | unsigned char __attribute__ ((aligned(16))) tempBlock[272];
|
---|
3725 | transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
|
---|
3726 |
|
---|
3727 | const int t=vertClassify_altivec(tempBlock-48, 16, &c);
|
---|
3728 | if(t==1) {
|
---|
3729 | doVertLowPass_altivec(tempBlock-48, 16, &c);
|
---|
3730 | transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
|
---|
3731 | }
|
---|
3732 | else if(t==2) {
|
---|
3733 | doVertDefFilter_altivec(tempBlock-48, 16, &c);
|
---|
3734 | transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
|
---|
3735 | }
|
---|
3736 | #else
|
---|
3737 | const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
|
---|
3738 |
|
---|
3739 | if(t==1)
|
---|
3740 | RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
|
---|
3741 | else if(t==2)
|
---|
3742 | RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
|
---|
3743 | #endif
|
---|
3744 | }else if(mode & H_A_DEBLOCK){
|
---|
3745 | RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
|
---|
3746 | }
|
---|
3747 | #endif //HAVE_MMX
|
---|
3748 | if(mode & DERING)
|
---|
3749 | {
|
---|
3750 | //FIXME filter first line
|
---|
3751 | if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
|
---|
3752 | }
|
---|
3753 |
|
---|
3754 | if(mode & TEMP_NOISE_FILTER)
|
---|
3755 | {
|
---|
3756 | RENAME(tempNoiseReducer)(dstBlock-8, stride,
|
---|
3757 | c.tempBlured[isColor] + y*dstStride + x,
|
---|
3758 | c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
|
---|
3759 | c.ppMode.maxTmpNoise);
|
---|
3760 | }
|
---|
3761 | }
|
---|
3762 |
|
---|
3763 | dstBlock+=8;
|
---|
3764 | srcBlock+=8;
|
---|
3765 |
|
---|
3766 | #ifdef HAVE_MMX
|
---|
3767 | tmpXchg= tempBlock1;
|
---|
3768 | tempBlock1= tempBlock2;
|
---|
3769 | tempBlock2 = tmpXchg;
|
---|
3770 | #endif
|
---|
3771 | }
|
---|
3772 |
|
---|
3773 | if(mode & DERING)
|
---|
3774 | {
|
---|
3775 | if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
|
---|
3776 | }
|
---|
3777 |
|
---|
3778 | if((mode & TEMP_NOISE_FILTER))
|
---|
3779 | {
|
---|
3780 | RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
|
---|
3781 | c.tempBlured[isColor] + y*dstStride + x,
|
---|
3782 | c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
|
---|
3783 | c.ppMode.maxTmpNoise);
|
---|
3784 | }
|
---|
3785 |
|
---|
3786 | /* did we use a tmp buffer for the last lines*/
|
---|
3787 | if(y+15 >= height)
|
---|
3788 | {
|
---|
3789 | uint8_t *dstBlock= &(dst[y*dstStride]);
|
---|
3790 | if(width==ABS(dstStride))
|
---|
3791 | linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
|
---|
3792 | else
|
---|
3793 | {
|
---|
3794 | int i;
|
---|
3795 | for(i=0; i<height-y; i++)
|
---|
3796 | {
|
---|
3797 | memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
|
---|
3798 | }
|
---|
3799 | }
|
---|
3800 | }
|
---|
3801 | /*
|
---|
3802 | for(x=0; x<width; x+=32)
|
---|
3803 | {
|
---|
3804 | volatile int i;
|
---|
3805 | i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
|
---|
3806 | + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
|
---|
3807 | + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
|
---|
3808 | // + dstBlock[x +13*dstStride]
|
---|
3809 | // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
|
---|
3810 | }*/
|
---|
3811 | }
|
---|
3812 | #ifdef HAVE_3DNOW
|
---|
3813 | asm volatile("femms");
|
---|
3814 | #elif defined (HAVE_MMX)
|
---|
3815 | asm volatile("emms");
|
---|
3816 | #endif
|
---|
3817 |
|
---|
3818 | #ifdef DEBUG_BRIGHTNESS
|
---|
3819 | if(!isColor)
|
---|
3820 | {
|
---|
3821 | int max=1;
|
---|
3822 | int i;
|
---|
3823 | for(i=0; i<256; i++)
|
---|
3824 | if(yHistogram[i] > max) max=yHistogram[i];
|
---|
3825 |
|
---|
3826 | for(i=1; i<256; i++)
|
---|
3827 | {
|
---|
3828 | int x;
|
---|
3829 | int start=yHistogram[i-1]/(max/256+1);
|
---|
3830 | int end=yHistogram[i]/(max/256+1);
|
---|
3831 | int inc= end > start ? 1 : -1;
|
---|
3832 | for(x=start; x!=end+inc; x+=inc)
|
---|
3833 | dst[ i*dstStride + x]+=128;
|
---|
3834 | }
|
---|
3835 |
|
---|
3836 | for(i=0; i<100; i+=2)
|
---|
3837 | {
|
---|
3838 | dst[ (white)*dstStride + i]+=128;
|
---|
3839 | dst[ (black)*dstStride + i]+=128;
|
---|
3840 | }
|
---|
3841 |
|
---|
3842 | }
|
---|
3843 | #endif
|
---|
3844 |
|
---|
3845 | *c2= c; //copy local context back
|
---|
3846 |
|
---|
3847 | }
|
---|