VirtualBox

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/i386/vp3dsp_mmx.c@ 5776

Last change on this file since 5776 was 5776, checked in by vboxsync, 17 years ago

ffmpeg: exported to OSE

File size: 11.4 KB
Line 
1/*
2 * Copyright (C) 2004 the ffmpeg project
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19/**
20 * @file vp3dsp_mmx.c
21 * MMX-optimized functions cribbed from the original VP3 source code.
22 */
23
24#include "../dsputil.h"
25#include "mmx.h"
26
27#define IdctAdjustBeforeShift 8
28
29/* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30 * idct_constants[0..15] = Mask table (M(I))
31 * idct_constants[16..43] = Cosine table (C(I))
32 * idct_constants[44..47] = 8
33 */
34static uint16_t idct_constants[(4 + 7 + 1) * 4];
35static const uint16_t idct_cosine_table[7] = {
36 64277, 60547, 54491, 46341, 36410, 25080, 12785
37};
38
39#define r0 mm0
40#define r1 mm1
41#define r2 mm2
42#define r3 mm3
43#define r4 mm4
44#define r5 mm5
45#define r6 mm6
46#define r7 mm7
47
48/* from original comments: The Macro does IDct on 4 1-D Dcts */
49#define BeginIDCT() { \
50 movq_m2r(*I(3), r2); \
51 movq_m2r(*C(3), r6); \
52 movq_r2r(r2, r4); \
53 movq_m2r(*J(5), r7); \
54 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
55 movq_m2r(*C(5), r1); \
56 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
57 movq_r2r(r1, r5); \
58 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
59 movq_m2r(*I(1), r3); \
60 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
61 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
62 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
63 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
64 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
65 movq_m2r(*J(7), r1); \
66 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
67 movq_r2r(r0, r5); /* r5 = c1 */ \
68 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
69 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
70 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
71 movq_m2r(*C(7), r7); \
72 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
73 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
74 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
75 movq_m2r(*I(2), r2); \
76 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
77 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
78 movq_r2r(r2, r1); /* r1 = i2 */ \
79 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
80 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
81 movq_m2r(*J(6), r5); \
82 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
83 movq_r2r(r5, r7); /* r7 = i6 */ \
84 psubsw_r2r(r4, r0); /* r0 = A - C */ \
85 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
86 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
87 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
88 paddsw_r2r(r4, r4); /* r4 = C + C */ \
89 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
90 psubsw_r2r(r6, r3); /* r3 = B - D */ \
91 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
92 paddsw_r2r(r6, r6); /* r6 = D + D */ \
93 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
94 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
95 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
96 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
97 movq_m2r(*C(4), r4); \
98 movq_r2r(r3, r5); /* r5 = B - D */ \
99 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
100 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
101 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
102 movq_r2r(r0, r2); /* r2 = A - C */ \
103 movq_m2r(*I(0), r6); \
104 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
105 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
106 movq_m2r(*J(4), r3); \
107 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
108 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
109 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
110 movq_r2r(r6, r0); \
111 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
112 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
113 paddsw_r2r(r1, r1); /* r1 = H + H */ \
114 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
115 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
116 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
117 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
118 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
119 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
120 movq_m2r(*I(1), r0); /* r0 = C. */ \
121 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
122 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
123 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
124}
125
126/* RowIDCT gets ready to transpose */
127#define RowIDCT() { \
128 \
129 BeginIDCT(); \
130 \
131 movq_m2r(*I(2), r3); /* r3 = D. */ \
132 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
133 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
134 paddsw_r2r(r7, r7); /* r7 = G + G */ \
135 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
136 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
137 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
138 paddsw_r2r(r3, r3); \
139 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
140 paddsw_r2r(r5, r5); \
141 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
142 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
143 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
144 paddsw_r2r(r0, r0); \
145 movq_r2m(r1, *I(1)); /* save R1 */ \
146 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
147}
148
149/* Column IDCT normalizes and stores final results */
150#define ColumnIDCT() { \
151 \
152 BeginIDCT(); \
153 \
154 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
155 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
156 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
157 psraw_i2r(4, r2); /* r2 = NR2 */ \
158 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
159 psraw_i2r(4, r1); /* r1 = NR1 */ \
160 movq_m2r(*I(2), r3); /* r3 = D. */ \
161 paddsw_r2r(r7, r7); /* r7 = G + G */ \
162 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
163 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
164 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
165 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
166 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
167 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
168 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
169 psraw_i2r(4, r4); /* r4 = NR4 */ \
170 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
171 psraw_i2r(4, r3); /* r3 = NR3 */ \
172 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
173 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
174 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
175 psraw_i2r(4, r6); /* r6 = NR6 */ \
176 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
177 psraw_i2r(4, r5); /* r5 = NR5 */ \
178 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
179 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
180 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
181 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
182 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
183 psraw_i2r(4, r7); /* r7 = NR7 */ \
184 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
185 psraw_i2r(4, r0); /* r0 = NR0 */ \
186 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
187 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
188 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
189}
190
191/* Following macro does two 4x4 transposes in place.
192
193 At entry (we assume):
194
195 r0 = a3 a2 a1 a0
196 I(1) = b3 b2 b1 b0
197 r2 = c3 c2 c1 c0
198 r3 = d3 d2 d1 d0
199
200 r4 = e3 e2 e1 e0
201 r5 = f3 f2 f1 f0
202 r6 = g3 g2 g1 g0
203 r7 = h3 h2 h1 h0
204
205 At exit, we have:
206
207 I(0) = d0 c0 b0 a0
208 I(1) = d1 c1 b1 a1
209 I(2) = d2 c2 b2 a2
210 I(3) = d3 c3 b3 a3
211
212 J(4) = h0 g0 f0 e0
213 J(5) = h1 g1 f1 e1
214 J(6) = h2 g2 f2 e2
215 J(7) = h3 g3 f3 e3
216
217 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
218 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
219
220 Since r1 is free at entry, we calculate the Js first. */
221
222#define Transpose() { \
223 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
224 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
225 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
226 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
227 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
228 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
229 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
230 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
231 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
232 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
233 movq_r2m(r4, *J(4)); \
234 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
235 movq_r2m(r5, *J(5)); \
236 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
237 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
238 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
239 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
240 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
241 movq_r2m(r6, *J(7)); \
242 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
243 movq_r2m(r1, *J(6)); \
244 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
245 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
246 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
247 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
248 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
249 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
250 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
251 movq_r2m(r0, *I(0)); \
252 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
253 movq_r2m(r1, *I(1)); \
254 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
255 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
256 movq_r2m(r4, *I(3)); \
257 movq_r2m(r2, *I(2)); \
258}
259
260void ff_vp3_dsp_init_mmx(void)
261{
262 int j = 16;
263 uint16_t *p;
264
265 j = 1;
266 do {
267 p = idct_constants + ((j + 3) << 2);
268 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
269 } while (++j <= 7);
270
271 idct_constants[44] = idct_constants[45] =
272 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
273}
274
275void ff_vp3_idct_mmx(int16_t *output_data)
276{
277 /* eax = quantized input
278 * ebx = dequantizer matrix
279 * ecx = IDCT constants
280 * M(I) = ecx + MaskOffset(0) + I * 8
281 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
282 * edx = output
283 * r0..r7 = mm0..mm7
284 */
285
286#define C(x) (idct_constants + 16 + (x - 1) * 4)
287#define Eight (idct_constants + 44)
288
289 /* at this point, function has completed dequantization + dezigzag +
290 * partial transposition; now do the idct itself */
291#define I(K) (output_data + K * 8)
292#define J(K) (output_data + ((K - 4) * 8) + 4)
293
294 RowIDCT();
295 Transpose();
296
297#undef I
298#undef J
299#define I(K) (output_data + (K * 8) + 32)
300#define J(K) (output_data + ((K - 4) * 8) + 36)
301
302 RowIDCT();
303 Transpose();
304
305#undef I
306#undef J
307#define I(K) (output_data + K * 8)
308#define J(K) (output_data + K * 8)
309
310 ColumnIDCT();
311
312#undef I
313#undef J
314#define I(K) (output_data + (K * 8) + 4)
315#define J(K) (output_data + (K * 8) + 4)
316
317 ColumnIDCT();
318
319#undef I
320#undef J
321
322}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette