1 | /*
|
---|
2 | * Simple IDCT MMX
|
---|
3 | *
|
---|
4 | * Copyright (c) 2001, 2002 Michael Niedermayer <[email protected]>
|
---|
5 | *
|
---|
6 | * This library is free software; you can redistribute it and/or
|
---|
7 | * modify it under the terms of the GNU Lesser General Public
|
---|
8 | * License as published by the Free Software Foundation; either
|
---|
9 | * version 2 of the License, or (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This library is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
14 | * Lesser General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU Lesser General Public
|
---|
17 | * License along with this library; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
---|
19 | */
|
---|
20 | #include "../dsputil.h"
|
---|
21 | #include "../simple_idct.h"
|
---|
22 |
|
---|
23 | /*
|
---|
24 | 23170.475006
|
---|
25 | 22725.260826
|
---|
26 | 21406.727617
|
---|
27 | 19265.545870
|
---|
28 | 16384.000000
|
---|
29 | 12872.826198
|
---|
30 | 8866.956905
|
---|
31 | 4520.335430
|
---|
32 | */
|
---|
/*
 * Fixed-point IDCT coefficients:
 *     Ci = cos(i*M_PI/16) * sqrt(2) * (1 << 14), rounded to an integer.
 * The unrounded value is given next to each definition.
 */
#define C0 23170 /* 23170.475006 */
#define C1 22725 /* 22725.260826 */
#define C2 21407 /* 21406.727617 */
#define C3 19266 /* 19265.545870 */
/*
 * C4 is 16383, one less than the exact value 16384.000000 (the "- 0.5"
 * variant of the rounding); the plain 16384 alternative is not used.
 */
#define C4 16383
#define C5 12873 /* 12872.826198 */
#define C6 8867  /*  8866.956905 */
#define C7 4520  /*  4520.335430 */

/* Right-shift amounts applied to the row results and the column results. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 /* original note: "6" */
|
---|
48 |
|
---|
49 | static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
|
---|
50 | static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
|
---|
51 |
|
---|
52 | static const int16_t __attribute__((aligned(8))) coeffs[]= {
|
---|
53 | 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
|
---|
54 | // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
|
---|
55 | // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
|
---|
56 | 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
|
---|
57 | // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
|
---|
58 | // 0, 0, 0, 0,
|
---|
59 | // 0, 0, 0, 0,
|
---|
60 |
|
---|
61 | C4, C4, C4, C4,
|
---|
62 | C4, -C4, C4, -C4,
|
---|
63 |
|
---|
64 | C2, C6, C2, C6,
|
---|
65 | C6, -C2, C6, -C2,
|
---|
66 |
|
---|
67 | C1, C3, C1, C3,
|
---|
68 | C5, C7, C5, C7,
|
---|
69 |
|
---|
70 | C3, -C7, C3, -C7,
|
---|
71 | -C1, -C5, -C1, -C5,
|
---|
72 |
|
---|
73 | C5, -C1, C5, -C1,
|
---|
74 | C7, C3, C7, C3,
|
---|
75 |
|
---|
76 | C7, -C5, C7, -C5,
|
---|
77 | C3, -C1, C3, -C1
|
---|
78 | };
|
---|
79 |
|
---|
80 | #if 0
|
---|
81 | static void unused_var_killer(){
|
---|
82 | int a= wm1010 + d40000;
|
---|
83 | temp[0]=a;
|
---|
84 | }
|
---|
85 |
|
---|
86 | static void inline idctCol (int16_t * col, int16_t *input)
|
---|
87 | {
|
---|
88 | #undef C0
|
---|
89 | #undef C1
|
---|
90 | #undef C2
|
---|
91 | #undef C3
|
---|
92 | #undef C4
|
---|
93 | #undef C5
|
---|
94 | #undef C6
|
---|
95 | #undef C7
|
---|
96 | int a0, a1, a2, a3, b0, b1, b2, b3;
|
---|
97 | const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
98 | const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
99 | const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
100 | const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
101 | const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
102 | const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
103 | const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
104 | const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
105 | /*
|
---|
106 | if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
|
---|
107 | col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
|
---|
108 | col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
|
---|
109 | return;
|
---|
110 | }*/
|
---|
111 |
|
---|
112 | col[8*0] = input[8*0 + 0];
|
---|
113 | col[8*1] = input[8*2 + 0];
|
---|
114 | col[8*2] = input[8*0 + 1];
|
---|
115 | col[8*3] = input[8*2 + 1];
|
---|
116 | col[8*4] = input[8*4 + 0];
|
---|
117 | col[8*5] = input[8*6 + 0];
|
---|
118 | col[8*6] = input[8*4 + 1];
|
---|
119 | col[8*7] = input[8*6 + 1];
|
---|
120 |
|
---|
121 | a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
|
---|
122 | a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
|
---|
123 | a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
|
---|
124 | a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
|
---|
125 |
|
---|
126 | b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
|
---|
127 | b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
|
---|
128 | b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
|
---|
129 | b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
|
---|
130 |
|
---|
131 | col[8*0] = (a0 + b0) >> COL_SHIFT;
|
---|
132 | col[8*1] = (a1 + b1) >> COL_SHIFT;
|
---|
133 | col[8*2] = (a2 + b2) >> COL_SHIFT;
|
---|
134 | col[8*3] = (a3 + b3) >> COL_SHIFT;
|
---|
135 | col[8*4] = (a3 - b3) >> COL_SHIFT;
|
---|
136 | col[8*5] = (a2 - b2) >> COL_SHIFT;
|
---|
137 | col[8*6] = (a1 - b1) >> COL_SHIFT;
|
---|
138 | col[8*7] = (a0 - b0) >> COL_SHIFT;
|
---|
139 | }
|
---|
140 |
|
---|
141 | static void inline idctRow (int16_t * output, int16_t * input)
|
---|
142 | {
|
---|
143 | int16_t row[8];
|
---|
144 |
|
---|
145 | int a0, a1, a2, a3, b0, b1, b2, b3;
|
---|
146 | const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
147 | const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
148 | const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
149 | const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
150 | const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
151 | const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
152 | const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
153 | const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
---|
154 |
|
---|
155 | row[0] = input[0];
|
---|
156 | row[2] = input[1];
|
---|
157 | row[4] = input[4];
|
---|
158 | row[6] = input[5];
|
---|
159 | row[1] = input[8];
|
---|
160 | row[3] = input[9];
|
---|
161 | row[5] = input[12];
|
---|
162 | row[7] = input[13];
|
---|
163 |
|
---|
164 | if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
|
---|
165 | row[0] = row[1] = row[2] = row[3] = row[4] =
|
---|
166 | row[5] = row[6] = row[7] = row[0]<<3;
|
---|
167 | output[0] = row[0];
|
---|
168 | output[2] = row[1];
|
---|
169 | output[4] = row[2];
|
---|
170 | output[6] = row[3];
|
---|
171 | output[8] = row[4];
|
---|
172 | output[10] = row[5];
|
---|
173 | output[12] = row[6];
|
---|
174 | output[14] = row[7];
|
---|
175 | return;
|
---|
176 | }
|
---|
177 |
|
---|
178 | a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
|
---|
179 | a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
|
---|
180 | a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
|
---|
181 | a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
|
---|
182 |
|
---|
183 | b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
|
---|
184 | b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
|
---|
185 | b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
|
---|
186 | b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
|
---|
187 |
|
---|
188 | row[0] = (a0 + b0) >> ROW_SHIFT;
|
---|
189 | row[1] = (a1 + b1) >> ROW_SHIFT;
|
---|
190 | row[2] = (a2 + b2) >> ROW_SHIFT;
|
---|
191 | row[3] = (a3 + b3) >> ROW_SHIFT;
|
---|
192 | row[4] = (a3 - b3) >> ROW_SHIFT;
|
---|
193 | row[5] = (a2 - b2) >> ROW_SHIFT;
|
---|
194 | row[6] = (a1 - b1) >> ROW_SHIFT;
|
---|
195 | row[7] = (a0 - b0) >> ROW_SHIFT;
|
---|
196 |
|
---|
197 | output[0] = row[0];
|
---|
198 | output[2] = row[1];
|
---|
199 | output[4] = row[2];
|
---|
200 | output[6] = row[3];
|
---|
201 | output[8] = row[4];
|
---|
202 | output[10] = row[5];
|
---|
203 | output[12] = row[6];
|
---|
204 | output[14] = row[7];
|
---|
205 | }
|
---|
206 | #endif
|
---|
207 |
|
---|
208 | static inline void idct(int16_t *block)
|
---|
209 | {
|
---|
210 | int64_t __attribute__((aligned(8))) align_tmp[16];
|
---|
211 | int16_t * const temp= (int16_t*)align_tmp;
|
---|
212 |
|
---|
213 | asm volatile(
|
---|
214 | #if 0 //Alternative, simpler variant
|
---|
215 |
|
---|
216 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
217 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
218 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
219 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
220 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
221 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
222 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
223 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
224 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
225 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
226 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
227 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
228 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
229 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
230 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
231 | #rounder ", %%mm4 \n\t"\
|
---|
232 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
233 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
234 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
235 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
---|
236 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
237 | #rounder ", %%mm0 \n\t"\
|
---|
238 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
---|
239 | "paddd %%mm0, %%mm0 \n\t" \
|
---|
240 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
---|
241 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
242 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
---|
243 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
244 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
245 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
246 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
247 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
248 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
---|
249 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
250 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
251 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
---|
252 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
---|
253 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
254 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
255 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
256 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
---|
257 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
---|
258 | "movq %%mm7, " #dst " \n\t"\
|
---|
259 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
---|
260 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
261 | "movq %%mm2, 24+" #dst " \n\t"\
|
---|
262 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
263 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
264 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
265 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
266 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
---|
267 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
268 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
269 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
270 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
---|
271 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
272 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
273 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
274 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
---|
275 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
276 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
---|
277 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
278 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
---|
279 | "movq %%mm2, 8+" #dst " \n\t"\
|
---|
280 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
281 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
---|
282 | "movq %%mm4, 16+" #dst " \n\t"\
|
---|
283 |
|
---|
284 | #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
285 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
286 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
287 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
288 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
289 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
290 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
291 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
292 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
293 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
294 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
295 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
296 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
297 | #rounder ", %%mm4 \n\t"\
|
---|
298 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
299 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
300 | #rounder ", %%mm0 \n\t"\
|
---|
301 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
302 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
303 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
304 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
305 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
---|
306 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
---|
307 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
---|
308 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
309 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
310 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
|
---|
311 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
312 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
313 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
314 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
315 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
316 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
|
---|
317 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
318 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
319 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
---|
320 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
321 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
322 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
323 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
324 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
325 | "movd %%mm7, " #dst " \n\t"\
|
---|
326 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
327 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
328 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
329 | "movd %%mm2, 96+" #dst " \n\t"\
|
---|
330 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
331 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
332 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
|
---|
333 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
334 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
335 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
336 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
337 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
338 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
---|
339 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
340 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
341 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
342 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
|
---|
343 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
344 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
345 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
346 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
|
---|
347 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
348 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
---|
349 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
350 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
351 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
352 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
353 | "movd %%mm2, 32+" #dst " \n\t"\
|
---|
354 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
355 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
356 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
357 | "movd %%mm4, 64+" #dst " \n\t"\
|
---|
358 | "movd %%mm5, 80+" #dst " \n\t"\
|
---|
359 |
|
---|
360 |
|
---|
361 | #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
362 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
363 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
364 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
365 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
366 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\
|
---|
367 | "pand %%mm0, %%mm4 \n\t"\
|
---|
368 | "por %%mm1, %%mm4 \n\t"\
|
---|
369 | "por %%mm2, %%mm4 \n\t"\
|
---|
370 | "por %%mm3, %%mm4 \n\t"\
|
---|
371 | "packssdw %%mm4,%%mm4 \n\t"\
|
---|
372 | "movd %%mm4, %%eax \n\t"\
|
---|
373 | "orl %%eax, %%eax \n\t"\
|
---|
374 | "jz 1f \n\t"\
|
---|
375 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
376 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
377 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
378 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
379 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
380 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
381 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
382 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
383 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
384 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
385 | #rounder ", %%mm4 \n\t"\
|
---|
386 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
387 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
388 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
389 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
---|
390 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
391 | #rounder ", %%mm0 \n\t"\
|
---|
392 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
---|
393 | "paddd %%mm0, %%mm0 \n\t" \
|
---|
394 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
---|
395 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
396 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
---|
397 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
398 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
399 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
400 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
401 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
402 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
---|
403 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
404 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
405 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
---|
406 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
---|
407 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
408 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
409 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
410 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
---|
411 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
---|
412 | "movq %%mm7, " #dst " \n\t"\
|
---|
413 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
---|
414 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
415 | "movq %%mm2, 24+" #dst " \n\t"\
|
---|
416 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
417 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
418 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
419 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
420 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
---|
421 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
422 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
423 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
424 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
---|
425 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
426 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
427 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
428 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
---|
429 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
430 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
---|
431 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
432 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
---|
433 | "movq %%mm2, 8+" #dst " \n\t"\
|
---|
434 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
435 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
---|
436 | "movq %%mm4, 16+" #dst " \n\t"\
|
---|
437 | "jmp 2f \n\t"\
|
---|
438 | "1: \n\t"\
|
---|
439 | "pslld $16, %%mm0 \n\t"\
|
---|
440 | "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
|
---|
441 | "psrad $13, %%mm0 \n\t"\
|
---|
442 | "packssdw %%mm0, %%mm0 \n\t"\
|
---|
443 | "movq %%mm0, " #dst " \n\t"\
|
---|
444 | "movq %%mm0, 8+" #dst " \n\t"\
|
---|
445 | "movq %%mm0, 16+" #dst " \n\t"\
|
---|
446 | "movq %%mm0, 24+" #dst " \n\t"\
|
---|
447 | "2: \n\t"
|
---|
448 |
|
---|
449 |
|
---|
450 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
451 | ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
|
---|
452 | /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
|
---|
453 | ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
|
---|
454 | ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
|
---|
455 |
|
---|
456 | DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
|
---|
457 | DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
|
---|
458 | DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
|
---|
459 |
|
---|
460 |
|
---|
461 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
462 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
463 | COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
464 | COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
465 | COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
466 | COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
467 | #else
|
---|
468 | COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
469 | COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
470 | COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
471 | COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
472 | #endif
|
---|
473 |
|
---|
474 | #else
|
---|
475 |
|
---|
476 | #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
477 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
478 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
479 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
480 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
481 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\
|
---|
482 | "pand %%mm0, %%mm4 \n\t"\
|
---|
483 | "por %%mm1, %%mm4 \n\t"\
|
---|
484 | "por %%mm2, %%mm4 \n\t"\
|
---|
485 | "por %%mm3, %%mm4 \n\t"\
|
---|
486 | "packssdw %%mm4,%%mm4 \n\t"\
|
---|
487 | "movd %%mm4, %%eax \n\t"\
|
---|
488 | "orl %%eax, %%eax \n\t"\
|
---|
489 | "jz 1f \n\t"\
|
---|
490 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
491 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
492 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
493 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
494 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
495 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
496 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
497 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
498 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
499 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
500 | #rounder ", %%mm4 \n\t"\
|
---|
501 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
502 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
503 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
504 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
---|
505 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
506 | #rounder ", %%mm0 \n\t"\
|
---|
507 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
---|
508 | "paddd %%mm0, %%mm0 \n\t" \
|
---|
509 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
---|
510 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
511 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
---|
512 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
513 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
514 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
515 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
516 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
517 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
---|
518 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
519 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
520 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
---|
521 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
---|
522 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
523 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
524 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
525 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
---|
526 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
---|
527 | "movq %%mm7, " #dst " \n\t"\
|
---|
528 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
---|
529 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
530 | "movq %%mm2, 24+" #dst " \n\t"\
|
---|
531 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
532 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
533 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
534 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
535 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
---|
536 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
537 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
538 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
539 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
---|
540 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
541 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
542 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
543 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
---|
544 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
545 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
---|
546 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
547 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
---|
548 | "movq %%mm2, 8+" #dst " \n\t"\
|
---|
549 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
550 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
---|
551 | "movq %%mm4, 16+" #dst " \n\t"\
|
---|
552 | "jmp 2f \n\t"\
|
---|
553 | "1: \n\t"\
|
---|
554 | "pslld $16, %%mm0 \n\t"\
|
---|
555 | "paddd "MANGLE(d40000)", %%mm0 \n\t"\
|
---|
556 | "psrad $13, %%mm0 \n\t"\
|
---|
557 | "packssdw %%mm0, %%mm0 \n\t"\
|
---|
558 | "movq %%mm0, " #dst " \n\t"\
|
---|
559 | "movq %%mm0, 8+" #dst " \n\t"\
|
---|
560 | "movq %%mm0, 16+" #dst " \n\t"\
|
---|
561 | "movq %%mm0, 24+" #dst " \n\t"\
|
---|
562 | "2: \n\t"
|
---|
563 |
|
---|
564 | #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
|
---|
565 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
566 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
567 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
568 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
569 | "movq %%mm0, %%mm4 \n\t"\
|
---|
570 | "por %%mm1, %%mm4 \n\t"\
|
---|
571 | "por %%mm2, %%mm4 \n\t"\
|
---|
572 | "por %%mm3, %%mm4 \n\t"\
|
---|
573 | "packssdw %%mm4,%%mm4 \n\t"\
|
---|
574 | "movd %%mm4, %%eax \n\t"\
|
---|
575 | "orl %%eax, %%eax \n\t"\
|
---|
576 | "jz " #bt " \n\t"\
|
---|
577 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
578 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
579 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
580 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
581 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
582 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
583 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
584 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
585 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
586 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
587 | #rounder ", %%mm4 \n\t"\
|
---|
588 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
589 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
590 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
591 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
---|
592 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
593 | #rounder ", %%mm0 \n\t"\
|
---|
594 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
---|
595 | "paddd %%mm0, %%mm0 \n\t" \
|
---|
596 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
---|
597 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
598 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
---|
599 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
600 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
601 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
602 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
603 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
604 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
---|
605 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
606 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
607 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
---|
608 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
---|
609 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
610 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
611 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
612 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
---|
613 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
---|
614 | "movq %%mm7, " #dst " \n\t"\
|
---|
615 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
|
---|
616 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
617 | "movq %%mm2, 24+" #dst " \n\t"\
|
---|
618 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
619 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
620 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
621 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
622 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
---|
623 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
624 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
625 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
626 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
|
---|
627 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
628 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
629 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
630 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
---|
631 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
632 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
|
---|
633 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
634 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
---|
635 | "movq %%mm2, 8+" #dst " \n\t"\
|
---|
636 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
637 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
---|
638 | "movq %%mm4, 16+" #dst " \n\t"\
|
---|
639 |
|
---|
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 * Expands to the asm text of one unconditional IDCT pass over two sets of
 * eight inputs (written Rn and rn in the comments).
 *   src0, src4, src1, src5: memory operands holding the packed inputs
 *       (R4 R0 r4 r0), (R6 R2 r6 r2), (R3 R1 r3 r1), (R7 R5 r7 r5).
 *   dst:     memory operand; quadwords are stored at dst, 8+dst, 16+dst, 24+dst.
 *   rounder: instruction text (mnemonic and source operand) applied to mm4 and
 *            mm0, i.e. to the two C4 products, before the sums are formed.
 *   shift:   immediate count of the psrad applied to every sum before packing.
 * A0..A3 are built from src0/src4, B0..B3 from src1/src5; the stored values
 * are An+Bn and An-Bn. Coefficients are read relative to %2 and mm0-mm7 are
 * all overwritten.
 * NOTE(review): no use of this macro is visible near its definition - confirm
 * whether it is still needed.
 */
640 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
641 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
642 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
643 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
644 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
645 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
646 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
647 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
648 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
649 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
650 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
651 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
652 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
653 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
654 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
655 | #rounder ", %%mm4 \n\t"\
|
---|
656 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
657 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
658 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
659 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
|
---|
660 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
661 | #rounder ", %%mm0 \n\t"\
|
---|
662 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
---|
663 | "paddd %%mm0, %%mm0 \n\t" /* doubled so that the next psubd leaves A2 = 2*mm0 - A1 */\
|
---|
664 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
|
---|
665 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
666 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
|
---|
667 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
668 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
669 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
670 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
671 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
672 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
|
---|
673 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
674 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
675 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
|
---|
676 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
|
---|
677 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
678 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
679 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
680 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
|
---|
681 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
|
---|
682 | "movq %%mm7, " #dst " \n\t"\
|
---|
683 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 (reloaded: mm2 was reused above) */\
|
---|
684 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
685 | "movq %%mm2, 24+" #dst " \n\t"\
|
---|
686 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
687 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
688 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
689 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
690 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
|
---|
691 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
692 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
693 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
694 | "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\
|
---|
695 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
696 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
697 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
698 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
|
---|
699 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
700 | "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
701 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
702 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
|
---|
703 | "movq %%mm2, 8+" #dst " \n\t"\
|
---|
704 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
705 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
|
---|
706 | "movq %%mm4, 16+" #dst " \n\t"\
|
---|
707 |
|
---|
/*
 * First pass: four expansions, each taking four 8-byte operands based on %0
 * (offsets 0..24, 32..56, 64..88, 96..120) and a destination based on %1
 * (offsets 0, 32, 64, 96), all with shift 11.
 * The first group uses DC_COND_IDCT with rounder paddd 8(%2); the other three
 * use Z_COND_IDCT with rounder paddd (%2) and one extra argument naming a
 * forward local label (4f, 2f, 1f).
 * NOTE(review): both macros are defined outside this excerpt; presumably the
 * label is the branch target used when the group needs no transform - confirm.
 */
708 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
709 | DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
|
---|
710 | Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
|
---|
711 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
|
---|
712 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
|
---|
713 |
|
---|
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) - variant reading all four
 * source operands.
 * A0..A3 are built from src0/src4 and B0..B3 from src1/src5. rounder is
 * instruction text applied to mm4 and mm0 (the two C4 products); shift is the
 * psrad count applied before packing.
 * Each result pair is packed and written with a 32-bit movd:
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 * Coefficients are read relative to %2 and mm0-mm7 are all overwritten.
 */
714 | #undef IDCT
|
---|
715 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
716 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
717 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
718 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
719 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
720 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
721 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
722 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
723 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
724 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
725 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
726 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
727 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
728 | #rounder ", %%mm4 \n\t"\
|
---|
729 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
730 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
731 | #rounder ", %%mm0 \n\t"\
|
---|
732 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
733 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
734 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
735 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
736 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
---|
737 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
---|
738 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
---|
739 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
740 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
741 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
|
---|
742 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
743 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
744 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
745 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
746 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
747 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
|
---|
748 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
749 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
750 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
---|
751 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
752 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
753 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
754 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
755 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
756 | "movd %%mm7, " #dst " \n\t"\
|
---|
757 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
758 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
759 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
760 | "movd %%mm2, 96+" #dst " \n\t"\
|
---|
761 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
762 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
763 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded: mm2 was reused above) */\
|
---|
764 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
765 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
766 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
767 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
768 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
769 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
---|
770 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
771 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
772 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
773 | "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
774 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
775 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
776 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
777 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
|
---|
778 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
779 | "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
780 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
781 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
782 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
783 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
784 | "movd %%mm2, 32+" #dst " \n\t"\
|
---|
785 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
786 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
787 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
788 | "movd %%mm4, 64+" #dst " \n\t"\
|
---|
789 | "movd %%mm5, 80+" #dst " \n\t"
|
---|
790 |
|
---|
791 |
|
---|
/*
 * Second pass on the fall-through path after the three Z_COND_IDCT
 * invocations: expands the IDCT variant defined just above four times.
 * Source operands are based on %1 (offsets advance by 8 per expansion), the
 * destination on %0 (offset advances by 4 per expansion), shift is 20.
 * The rounder argument is /nop, or #nop when both VBOX and __DARWIN__ are
 * defined.
 * NOTE(review): presumably this turns the expanded rounder line into an
 * assembler comment so that nothing is added in this pass - confirm for the
 * assemblers in use.
 * Ends with a jump to local label 9, which lies outside this excerpt.
 */
792 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
793 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
794 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
795 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
796 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
797 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
798 | #else
|
---|
799 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
800 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
801 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
802 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
803 | #endif
|
---|
804 | "jmp 9f \n\t"
|
---|
805 |
|
---|
/*
 * Local label 4: the label passed as 4f to the Z_COND_IDCT for the 32(%0)
 * group. From here the 64(%0) and 96(%0) groups are handled by Z_COND_IDCT
 * again, now naming labels 6 and 5.
 * The .balign text starts with #, so it is presumably treated as a comment by
 * the assembler (NOTE(review): confirm). The trailing backslash only splices
 * that string with the following line.
 */
806 | "#.balign 16 \n\t"\
|
---|
807 | "4: \n\t"
|
---|
808 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
|
---|
809 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
|
---|
810 |
|
---|
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) - variant that never reads
 * src1 (the parameter is accepted but unused).
 * A0..A3 are built from src0/src4 as in the full variant; B0..B3 consist only
 * of the src5 (R7 R5) products. rounder is applied to mm4 and mm0, shift is the
 * psrad count.
 * Stores are 32-bit movd writes with the same layout as the full variant:
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 * NOTE(review): presumably selected when the src1 inputs are known to be
 * zero - the selecting macro is outside this excerpt; confirm.
 */
811 | #undef IDCT
|
---|
812 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
813 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
814 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
815 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
816 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
817 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
818 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
819 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
820 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
821 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
822 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
823 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
824 | #rounder ", %%mm4 \n\t"\
|
---|
825 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
826 | #rounder ", %%mm0 \n\t"\
|
---|
827 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
828 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
829 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
830 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
---|
831 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
---|
832 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
---|
833 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 = B0 b0 */\
|
---|
834 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
835 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 = B1 b1 */\
|
---|
836 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
|
---|
837 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
838 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
839 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
840 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
841 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
---|
842 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
843 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
844 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
845 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
846 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
|
---|
847 | "movd %%mm1, " #dst " \n\t"\
|
---|
848 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
849 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
850 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
851 | "movd %%mm2, 96+" #dst " \n\t"\
|
---|
852 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
853 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
854 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
|
---|
855 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 = B2 b2 */\
|
---|
856 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
---|
857 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 = B3 b3 */\
|
---|
858 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
859 | "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
860 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
861 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
862 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
|
---|
863 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
864 | "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
|
---|
865 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
866 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
867 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
868 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
869 | "movd %%mm2, 32+" #dst " \n\t"\
|
---|
870 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
|
---|
871 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
872 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
873 | "movd %%mm1, 64+" #dst " \n\t"\
|
---|
874 | "movd %%mm5, 80+" #dst " \n\t"
|
---|
875 |
|
---|
/*
 * Second pass for the path entered at local label 4 (fall-through after its
 * two Z_COND_IDCT invocations): expands the IDCT variant defined just above
 * four times. Source operands are based on %1 (offsets advance by 8 per
 * expansion), the destination on %0 (offset advances by 4), shift is 20.
 * The rounder argument is /nop, or #nop when both VBOX and __DARWIN__ are
 * defined.
 * NOTE(review): presumably this turns the expanded rounder line into an
 * assembler comment - confirm for the assemblers in use.
 * Ends with a jump to local label 9, which lies outside this excerpt.
 */
876 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
877 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
878 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
879 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
880 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
881 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
882 | #else
|
---|
883 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
884 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
885 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
886 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
887 | #endif
|
---|
888 | "jmp 9f \n\t"
|
---|
889 |
|
---|
/*
 * Local label 6: the label passed as 6f to the Z_COND_IDCT for the 64(%0)
 * group on the label-4 path. The remaining 96(%0) group is handled by
 * Z_COND_IDCT naming label 7, which lies outside this excerpt.
 * The .balign text starts with #, so it is presumably treated as a comment by
 * the assembler (NOTE(review): confirm).
 */
890 | "#.balign 16 \n\t"\
|
---|
891 | "6: \n\t"
|
---|
892 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
|
---|
893 |
|
---|
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) - variant that reads only
 * src0 and src5 (src4 and src1 are accepted but unused).
 * The even part consists of the two C4 products alone, so A0 = A3 and
 * A1 = A2; B0..B3 consist only of the src5 (R7 R5) products. rounder is
 * applied to mm4 and mm0, shift is the psrad count.
 * Stores are 32-bit movd writes:
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 * NOTE(review): presumably selected when the src4 and src1 inputs are known
 * to be zero - the selecting macro is outside this excerpt; confirm.
 */
894 | #undef IDCT
|
---|
895 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
896 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
897 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
898 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
899 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
900 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
901 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
902 | #rounder ", %%mm4 \n\t"\
|
---|
903 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy used as A3 a3) */\
|
---|
904 | #rounder ", %%mm0 \n\t"\
|
---|
905 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy used as A2 a2) */\
|
---|
906 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
---|
907 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 = B0 b0 */\
|
---|
908 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
909 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 = B1 b1 */\
|
---|
910 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
|
---|
911 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
912 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
913 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
914 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
915 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
---|
916 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
917 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
918 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
919 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
920 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
|
---|
921 | "movd %%mm1, " #dst " \n\t"\
|
---|
922 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
923 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
924 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
925 | "movd %%mm2, 96+" #dst " \n\t"\
|
---|
926 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
927 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
928 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
|
---|
929 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 = B2 b2 */\
|
---|
930 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
---|
931 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 = B3 b3 */\
|
---|
932 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
933 | "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
934 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
935 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
936 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
|
---|
937 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
938 | "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
|
---|
939 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
940 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
941 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
942 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
943 | "movd %%mm2, 32+" #dst " \n\t"\
|
---|
944 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
|
---|
945 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
946 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
947 | "movd %%mm1, 64+" #dst " \n\t"\
|
---|
948 | "movd %%mm5, 80+" #dst " \n\t"
|
---|
949 |
|
---|
950 |
|
---|
/*
 * Second pass for the path entered at local label 6 (fall-through after its
 * Z_COND_IDCT invocation): expands the IDCT variant defined just above four
 * times. Source operands are based on %1 (offsets advance by 8 per
 * expansion), the destination on %0 (offset advances by 4), shift is 20.
 * The rounder argument is /nop, or #nop when both VBOX and __DARWIN__ are
 * defined.
 * NOTE(review): presumably this turns the expanded rounder line into an
 * assembler comment - confirm for the assemblers in use.
 * Ends with a jump to local label 9, which lies outside this excerpt.
 */
951 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
952 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
953 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
954 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
955 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
956 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
957 | #else
|
---|
958 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
959 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
960 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
961 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
962 | #endif
|
---|
963 | "jmp 9f \n\t"
|
---|
964 |
|
---|
/*
 * Local label 2: the label passed as 2f to the first-pass Z_COND_IDCT for the
 * 64(%0) group. The remaining 96(%0) group is handled by Z_COND_IDCT naming
 * label 3.
 * The .balign text starts with #, so it is presumably treated as a comment by
 * the assembler (NOTE(review): confirm).
 */
965 | "#.balign 16 \n\t"\
|
---|
966 | "2: \n\t"
|
---|
967 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
|
---|
968 |
|
---|
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) - variant that never reads
 * src4 (the parameter is accepted but unused).
 * The even part consists of the two C4 products alone, so A0 = A3 and
 * A1 = A2; B0..B3 are built from src1 and src5 as in the full variant.
 * rounder is applied to mm4 and mm0, shift is the psrad count.
 * Stores are 32-bit movd writes:
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 * NOTE(review): presumably selected when the src4 inputs are known to be
 * zero - the selecting macro is outside this excerpt; confirm.
 */
969 | #undef IDCT
|
---|
970 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
971 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
972 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
973 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
|
---|
974 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
975 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
976 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
977 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
978 | #rounder ", %%mm4 \n\t"\
|
---|
979 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy used as A3 a3) */\
|
---|
980 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
981 | #rounder ", %%mm0 \n\t"\
|
---|
982 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
983 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy used as A2 a2) */\
|
---|
984 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
|
---|
985 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
|
---|
986 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
987 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
|
---|
988 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
|
---|
989 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
|
---|
990 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
991 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
992 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
993 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
|
---|
994 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
995 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
996 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
|
---|
997 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
998 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
999 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
1000 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
1001 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
1002 | "movd %%mm7, " #dst " \n\t"\
|
---|
1003 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
1004 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
1005 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
|
---|
1006 | "movd %%mm2, 96+" #dst " \n\t"\
|
---|
1007 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
1008 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
1009 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded: mm2 was reused above) */\
|
---|
1010 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
1011 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
1012 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
|
---|
1013 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
1014 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
|
---|
1015 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
|
---|
1016 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
|
---|
1017 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
|
---|
1018 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
1019 | "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
1020 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
1021 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
1022 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
1023 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
|
---|
1024 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
1025 | "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
1026 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
1027 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1028 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
|
---|
1029 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
1030 | "movd %%mm2, 32+" #dst " \n\t"\
|
---|
1031 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
1032 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
1033 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
1034 | "movd %%mm4, 64+" #dst " \n\t"\
|
---|
1035 | "movd %%mm5, 80+" #dst " \n\t"
|
---|
1036 |
|
---|
/*
 * Second pass for the path entered at local label 2 (fall-through after its
 * Z_COND_IDCT invocation): expands the IDCT variant defined just above four
 * times. Source operands are based on %1 (offsets advance by 8 per
 * expansion), the destination on %0 (offset advances by 4), shift is 20.
 * The rounder argument is /nop, or #nop when both VBOX and __DARWIN__ are
 * defined.
 * NOTE(review): presumably this turns the expanded rounder line into an
 * assembler comment - confirm for the assemblers in use.
 * Ends with a jump to local label 9, which lies outside this excerpt.
 */
1037 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
1038 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
1039 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
1040 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
1041 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
1042 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
1043 | #else
|
---|
1044 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
1045 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
1046 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
1047 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
1048 | #endif
|
---|
1049 | "jmp 9f \n\t"
|
---|
1050 |
|
---|
/*
 * Local label 3: the label passed as 3f to the Z_COND_IDCT that follows
 * label 2. No further Z_COND_IDCT is issued on this path.
 * The .balign text starts with #, so it is presumably treated as a comment by
 * the assembler (NOTE(review): confirm).
 */
1051 | "#.balign 16 \n\t"\
|
---|
1052 | "3: \n\t"
|
---|
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) - variant that reads only
 * src0 and src1 (src4 and src5 are accepted but unused).
 * The even part consists of the two C4 products alone, so A0 = A3 and
 * A1 = A2; B0..B3 consist only of the src1 (R3 R1) products. rounder is
 * applied to mm4 and mm0, shift is the psrad count.
 * Stores are 32-bit movd writes:
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 * NOTE(review): presumably selected when the src4 and src5 inputs are known
 * to be zero - the selecting macro is outside this excerpt; confirm.
 */
1053 | #undef IDCT
|
---|
1054 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
1055 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
1056 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
1057 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
1058 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1059 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
1060 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1061 | #rounder ", %%mm4 \n\t"\
|
---|
1062 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy used as A3 a3) */\
|
---|
1063 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
1064 | #rounder ", %%mm0 \n\t"\
|
---|
1065 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 = B0 b0 */\
|
---|
1066 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy used as A2 a2) */\
|
---|
1067 | "movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 */\
|
---|
1068 | "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 = B1 b1 */\
|
---|
1069 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
1070 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
1071 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
1072 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
1073 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1074 | "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
|
---|
1075 | "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
1076 | "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
|
---|
1077 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
1078 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
1079 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
1080 | "movd %%mm7, " #dst " \n\t"\
|
---|
1081 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
1082 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
1083 | "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
|
---|
1084 | "movd %%mm1, 96+" #dst " \n\t"\
|
---|
1085 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
1086 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
1087 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
1088 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 = B2 b2 */\
|
---|
1089 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 = B3 b3 */\
|
---|
1090 | "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
|
---|
1091 | "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
|
---|
1092 | "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
1093 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
1094 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
1095 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
1096 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
1097 | "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
1098 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
1099 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1100 | "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
|
---|
1101 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
1102 | "movd %%mm1, 32+" #dst " \n\t"\
|
---|
1103 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
1104 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
1105 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
1106 | "movd %%mm4, 64+" #dst " \n\t"\
|
---|
1107 | "movd %%mm5, 80+" #dst " \n\t"
|
---|
1108 |
|
---|
1109 |
|
---|
/*
 * Second pass for the path entered at local label 3: expands the IDCT variant
 * defined just above four times. Source operands are based on %1 (offsets
 * advance by 8 per expansion), the destination on %0 (offset advances by 4),
 * shift is 20.
 * The rounder argument is /nop, or #nop when both VBOX and __DARWIN__ are
 * defined.
 * NOTE(review): presumably this turns the expanded rounder line into an
 * assembler comment - confirm for the assemblers in use.
 * Ends with a jump to local label 9, which lies outside this excerpt.
 */
1110 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
1111 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
1112 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
1113 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
1114 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
1115 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
1116 | #else
|
---|
1117 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
1118 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
1119 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
1120 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
1121 | #endif
|
---|
1122 | "jmp 9f \n\t"
|
---|
1123 |
|
---|
/*
 * Local label 5: the label passed as 5f to the second Z_COND_IDCT that
 * follows label 4. No further Z_COND_IDCT is issued on this path.
 * The .balign text starts with #, so it is presumably treated as a comment by
 * the assembler (NOTE(review): confirm).
 */
1124 | "#.balign 16 \n\t"\
|
---|
1125 | "5: \n\t"
|
---|
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) - variant that reads only
 * src0 and src4 (src1 and src5 are accepted but unused) and handles two
 * 8-byte groups per expansion: the operands themselves and the ones 8 bytes
 * further on (8+src0, 8+src4).
 * There is no odd part, so the outputs are A0..A3 only and every packed
 * quadword is stored twice with movq:
 *   A0 -> dst and 112+dst      A1 -> 16+dst and 96+dst
 *   A2 -> 32+dst and 80+dst    A3 -> 48+dst and 64+dst
 * rounder is applied to the C4 products of both groups (mm4, mm0, mm1, mm2);
 * shift is the psrad count. Coefficients are read relative to %2 and mm0-mm7
 * are all overwritten.
 * NOTE(review): presumably selected when the src1 and src5 inputs are known
 * to be zero - the selecting macro is outside this excerpt; confirm.
 */
1126 | #undef IDCT
|
---|
1127 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
1128 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
1129 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
1130 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
1131 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1132 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
1133 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1134 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
1135 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
1136 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
1137 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
1138 | #rounder ", %%mm4 \n\t"\
|
---|
1139 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1140 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
1141 | #rounder ", %%mm0 \n\t"\
|
---|
1142 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
1143 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1144 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
---|
1145 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
---|
1146 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 of the second group */\
|
---|
1147 | "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 of the second group */\
|
---|
1148 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
|
---|
1149 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1150 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
1151 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1152 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
|
---|
1153 | "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
1154 | "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
1155 | #rounder ", %%mm1 \n\t"\
|
---|
1156 | "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
|
---|
1157 | "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
|
---|
1158 | #rounder ", %%mm2 \n\t"\
|
---|
1159 | "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
|
---|
1160 | "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
|
---|
1161 | "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
|
---|
1162 | "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
|
---|
1163 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1164 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
1165 | "psrad $" #shift ", %%mm3 \n\t"\
|
---|
1166 | "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
|
---|
1167 | "movq %%mm4, " #dst " \n\t"\
|
---|
1168 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
1169 | "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
|
---|
1170 | "movq %%mm0, 16+" #dst " \n\t"\
|
---|
1171 | "movq %%mm0, 96+" #dst " \n\t"\
|
---|
1172 | "movq %%mm4, 112+" #dst " \n\t"\
|
---|
1173 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
1174 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
1175 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
1176 | "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
|
---|
1177 | "movq %%mm5, 32+" #dst " \n\t"\
|
---|
1178 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
1179 | "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
|
---|
1180 | "movq %%mm6, 48+" #dst " \n\t"\
|
---|
1181 | "movq %%mm6, 64+" #dst " \n\t"\
|
---|
1182 | "movq %%mm5, 80+" #dst " \n\t"
|
---|
1183 |
|
---|
1184 |
|
---|
/*
 * Second pass for the path entered at local label 5: expands the IDCT variant
 * defined just above only twice (destinations 0(%0) and 8(%0)), shift 20.
 * That variant also processes the operands 8 bytes further on and writes full
 * quadwords, which is why the expansions for 8(%1)/4(%0) and 24(%1)/12(%0)
 * are left commented out.
 * The rounder argument is /nop, or #nop when both VBOX and __DARWIN__ are
 * defined.
 * NOTE(review): presumably this turns the expanded rounder line into an
 * assembler comment - confirm for the assemblers in use.
 * Ends with a jump to local label 9, which lies outside this excerpt.
 */
1185 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
1186 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
1187 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
1188 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
1189 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
1190 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
1191 | #else
|
---|
1192 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
1193 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
1194 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
1195 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
1196 | #endif
|
---|
1197 | "jmp 9f \n\t"
|
---|
1198 |
|
---|
1199 |
|
---|
/*
 * Local label 1: the label passed as 1f to the first-pass Z_COND_IDCT for the
 * 96(%0) group.
 * The .balign text starts with #, so it is presumably treated as a comment by
 * the assembler (NOTE(review): confirm).
 */
1200 | "#.balign 16 \n\t"\
|
---|
1201 | "1: \n\t"
|
---|
1202 | #undef IDCT
|
---|
1203 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
1204 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
1205 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
|
---|
1206 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
|
---|
1207 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
1208 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1209 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
1210 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1211 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
|
---|
1212 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
|
---|
1213 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
|
---|
1214 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
|
---|
1215 | #rounder ", %%mm4 \n\t"\
|
---|
1216 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1217 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
|
---|
1218 | #rounder ", %%mm0 \n\t"\
|
---|
1219 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
|
---|
1220 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
|
---|
1221 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
|
---|
1222 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1223 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
|
---|
1224 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
|
---|
1225 | "movq 64(%2), %%mm1 \n\t"\
|
---|
1226 | "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
|
---|
1227 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
1228 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
|
---|
1229 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
1230 | "psrad $" #shift ", %%mm7 \n\t"\
|
---|
1231 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1232 | "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
|
---|
1233 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
1234 | "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
|
---|
1235 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
1236 | "psrad $" #shift ", %%mm3 \n\t"\
|
---|
1237 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
|
---|
1238 | "movd %%mm7, " #dst " \n\t"\
|
---|
1239 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
|
---|
1240 | "movd %%mm0, 16+" #dst " \n\t"\
|
---|
1241 | "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
|
---|
1242 | "movd %%mm3, 96+" #dst " \n\t"\
|
---|
1243 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
|
---|
1244 | "movd %%mm4, 112+" #dst " \n\t"\
|
---|
1245 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
|
---|
1246 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
|
---|
1247 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
|
---|
1248 | "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
|
---|
1249 | "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
|
---|
1250 | "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
1251 | "psrad $" #shift ", %%mm3 \n\t"\
|
---|
1252 | "psrad $" #shift ", %%mm5 \n\t"\
|
---|
1253 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
|
---|
1254 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
1255 | "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
1256 | "psrad $" #shift ", %%mm6 \n\t"\
|
---|
1257 | "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
|
---|
1258 | "movd %%mm3, 32+" #dst " \n\t"\
|
---|
1259 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1260 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
|
---|
1261 | "movd %%mm6, 48+" #dst " \n\t"\
|
---|
1262 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
|
---|
1263 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
|
---|
1264 | "movd %%mm4, 64+" #dst " \n\t"\
|
---|
1265 | "movd %%mm5, 80+" #dst " \n\t"
|
---|
1266 |
|
---|
1267 |
|
---|
1268 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
1269 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
1270 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
1271 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
1272 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
1273 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
1274 | #else
|
---|
1275 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
1276 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
1277 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
1278 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
1279 | #endif
|
---|
1280 | "jmp 9f \n\t"
|
---|
1281 |
|
---|
1282 |
|
---|
1283 | "#.balign 16 \n\t"
|
---|
1284 | "7: \n\t"
|
---|
1285 | #undef IDCT
|
---|
1286 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
|
---|
1287 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
|
---|
1288 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
|
---|
1289 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1290 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
1291 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1292 | #rounder ", %%mm4 \n\t"\
|
---|
1293 | #rounder ", %%mm0 \n\t"\
|
---|
1294 | "psrad $" #shift ", %%mm4 \n\t"\
|
---|
1295 | "psrad $" #shift ", %%mm0 \n\t"\
|
---|
1296 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
|
---|
1297 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
|
---|
1298 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
|
---|
1299 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
|
---|
1300 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
|
---|
1301 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
|
---|
1302 | #rounder ", %%mm1 \n\t"\
|
---|
1303 | #rounder ", %%mm2 \n\t"\
|
---|
1304 | "psrad $" #shift ", %%mm1 \n\t"\
|
---|
1305 | "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
|
---|
1306 | "movq %%mm4, " #dst " \n\t"\
|
---|
1307 | "psrad $" #shift ", %%mm2 \n\t"\
|
---|
1308 | "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
|
---|
1309 | "movq %%mm0, 16+" #dst " \n\t"\
|
---|
1310 | "movq %%mm0, 96+" #dst " \n\t"\
|
---|
1311 | "movq %%mm4, 112+" #dst " \n\t"\
|
---|
1312 | "movq %%mm0, 32+" #dst " \n\t"\
|
---|
1313 | "movq %%mm4, 48+" #dst " \n\t"\
|
---|
1314 | "movq %%mm4, 64+" #dst " \n\t"\
|
---|
1315 | "movq %%mm0, 80+" #dst " \n\t"
|
---|
1316 |
|
---|
1317 | //IDCT( src0, src4, src1, src5, dst, rounder, shift)
|
---|
1318 | #if !defined(VBOX) || !defined(__DARWIN__)
|
---|
1319 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
|
---|
1320 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
|
---|
1321 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
|
---|
1322 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
|
---|
1323 | #else
|
---|
1324 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),#nop, 20)
|
---|
1325 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),#nop, 20)
|
---|
1326 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),#nop, 20)
|
---|
1327 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),#nop, 20)
|
---|
1328 | #endif
|
---|
1329 |
|
---|
1330 |
|
---|
1331 | #endif
|
---|
1332 |
|
---|
1333 | /*
|
---|
1334 | Input
|
---|
1335 | 00 40 04 44 20 60 24 64
|
---|
1336 | 10 30 14 34 50 70 54 74
|
---|
1337 | 01 41 03 43 21 61 23 63
|
---|
1338 | 11 31 13 33 51 71 53 73
|
---|
1339 | 02 42 06 46 22 62 26 66
|
---|
1340 | 12 32 16 36 52 72 56 76
|
---|
1341 | 05 45 07 47 25 65 27 67
|
---|
1342 | 15 35 17 37 55 75 57 77
|
---|
1343 |
|
---|
1344 | Temp
|
---|
1345 | 00 04 10 14 20 24 30 34
|
---|
1346 | 40 44 50 54 60 64 70 74
|
---|
1347 | 01 03 11 13 21 23 31 33
|
---|
1348 | 41 43 51 53 61 63 71 73
|
---|
1349 | 02 06 12 16 22 26 32 36
|
---|
1350 | 42 46 52 56 62 66 72 76
|
---|
1351 | 05 07 15 17 25 27 35 37
|
---|
1352 | 45 47 55 57 65 67 75 77
|
---|
1353 | */
|
---|
1354 |
|
---|
1355 | "9: \n\t"
|
---|
1356 | :: "r" (block), "r" (temp), "r" (coeffs)
|
---|
1357 | : "%eax"
|
---|
1358 | );
|
---|
1359 | }
|
---|
1360 |
|
---|
/*
 * Exported entry point for the MMX simple IDCT: forwards block to idct()
 * and does nothing else.
 *
 * The inline-asm routine that ends just above this function stores its
 * final-pass output through the "block" operand, so the transform appears
 * to be done in place.
 * NOTE(review): presumably that routine is idct() and block holds one
 * 8x8 block (64 coefficients) -- confirm against the callers.
 */
1361 | void ff_simple_idct_mmx(int16_t *block)
|
---|
1362 | {
|
---|
1363 | idct(block);
|
---|
1364 | }
|
---|
1365 |
|
---|
1366 | //FIXME merge add/put into the idct
|
---|
1367 |
|
---|
/*
 * IDCT followed by a "put" to the destination: runs idct() on block, then
 * hands the transformed block to put_pixels_clamped_mmx() together with
 * dest and line_size.
 *
 * NOTE(review): put_pixels_clamped_mmx() is defined elsewhere; presumably
 * it clamps each value to the 8-bit pixel range and overwrites dest, with
 * line_size as the byte stride between rows -- verify against that helper.
 * block is modified by the idct() call.
 */
1368 | void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
1369 | {
|
---|
1370 | idct(block);
|
---|
1371 | put_pixels_clamped_mmx(block, dest, line_size);
|
---|
1372 | }
|
---|
/*
 * IDCT followed by an "add" to the destination: runs idct() on block, then
 * hands the transformed block to add_pixels_clamped_mmx() together with
 * dest and line_size.
 *
 * NOTE(review): add_pixels_clamped_mmx() is defined elsewhere; presumably
 * it adds the values to the pixels already in dest and clamps the sums to
 * the 8-bit range, with line_size as the byte stride between rows --
 * verify against that helper. block is modified by the idct() call.
 */
1373 | void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
---|
1374 | {
|
---|
1375 | idct(block);
|
---|
1376 | add_pixels_clamped_mmx(block, dest, line_size);
|
---|
1377 | }
|
---|