vp3dsp_mmx.c@ 5776

Last change on this file since 5776 was 5776, checked in by vboxsync, 17 years ago
ffmpeg: exported to OSE
File size: 11.4 KB

Line
1	/*
2	* Copyright (C) 2004 the ffmpeg project
3	*
4	* This library is free software; you can redistribute it and/or
5	* modify it under the terms of the GNU Lesser General Public
6	* License as published by the Free Software Foundation; either
7	* version 2 of the License, or (at your option) any later version.
8	*
9	* This library is distributed in the hope that it will be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12	* Lesser General Public License for more details.
13	*
14	* You should have received a copy of the GNU Lesser General Public
15	* License along with this library; if not, write to the Free Software
16	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17	*/
18
19	/**
20	* @file vp3dsp_mmx.c
21	* MMX-optimized functions cribbed from the original VP3 source code.
22	*/
23
24	#include "../dsputil.h"
25	#include "mmx.h"
26
27	#define IdctAdjustBeforeShift 8
28
29	/* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30	* idct_constants[0..15] = Mask table (M(I))
31	* idct_constants[16..43] = Cosine table (C(I))
32	* idct_constants[44..47] = 8
33	*/
34	static uint16_t idct_constants[(4 + 7 + 1) * 4];
35	static const uint16_t idct_cosine_table[7] = {
36	64277, 60547, 54491, 46341, 36410, 25080, 12785
37	};
38
39	#define r0 mm0
40	#define r1 mm1
41	#define r2 mm2
42	#define r3 mm3
43	#define r4 mm4
44	#define r5 mm5
45	#define r6 mm6
46	#define r7 mm7
47
48	/* from original comments: The Macro does IDct on 4 1-D Dcts */
49	#define BeginIDCT() { \
50	movq_m2r(*I(3), r2); \
51	movq_m2r(*C(3), r6); \
52	movq_r2r(r2, r4); \
53	movq_m2r(*J(5), r7); \
54	pmulhw_r2r(r6, r4); /* r4 = c3i3 - i3 / \
55	movq_m2r(*C(5), r1); \
56	pmulhw_r2r(r7, r6); /* r6 = c3i5 - i5 / \
57	movq_r2r(r1, r5); \
58	pmulhw_r2r(r2, r1); /* r1 = c5i3 - i3 / \
59	movq_m2r(*I(1), r3); \
60	pmulhw_r2r(r7, r5); /* r5 = c5i5 - i5 / \
61	movq_m2r(C(1), r0); / (all registers are in use) */ \
62	paddw_r2r(r2, r4); /* r4 = c3i3 / \
63	paddw_r2r(r7, r6); /* r6 = c3i5 / \
64	paddw_r2r(r1, r2); /* r2 = c5i3 / \
65	movq_m2r(*J(7), r1); \
66	paddw_r2r(r5, r7); /* r7 = c5i5 / \
67	movq_r2r(r0, r5); /* r5 = c1 */ \
68	pmulhw_r2r(r3, r0); /* r0 = c1i1 - i1 / \
69	paddsw_r2r(r7, r4); /* r4 = C = c3i3 + c5i5 */ \
70	pmulhw_r2r(r1, r5); /* r5 = c1i7 - i7 / \
71	movq_m2r(*C(7), r7); \
72	psubsw_r2r(r2, r6); /* r6 = D = c3i5 - c5i3 */ \
73	paddw_r2r(r3, r0); /* r0 = c1i1 / \
74	pmulhw_r2r(r7, r3); /* r3 = c7i1 / \
75	movq_m2r(*I(2), r2); \
76	pmulhw_r2r(r1, r7); /* r7 = c7i7 / \
77	paddw_r2r(r1, r5); /* r5 = c1i7 / \
78	movq_r2r(r2, r1); /* r1 = i2 */ \
79	pmulhw_m2r(C(2), r2); / r2 = c2i2 - i2 / \
80	psubsw_r2r(r5, r3); /* r3 = B = c7i1 - c1i7 */ \
81	movq_m2r(*J(6), r5); \
82	paddsw_r2r(r7, r0); /* r0 = A = c1i1 + c7i7 */ \
83	movq_r2r(r5, r7); /* r7 = i6 */ \
84	psubsw_r2r(r4, r0); /* r0 = A - C */ \
85	pmulhw_m2r(C(2), r5); / r5 = c2i6 - i6 / \
86	paddw_r2r(r1, r2); /* r2 = c2i2 / \
87	pmulhw_m2r(C(6), r1); / r1 = c6i2 / \
88	paddsw_r2r(r4, r4); /* r4 = C + C */ \
89	paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
90	psubsw_r2r(r6, r3); /* r3 = B - D */ \
91	paddw_r2r(r7, r5); /* r5 = c2i6 / \
92	paddsw_r2r(r6, r6); /* r6 = D + D */ \
93	pmulhw_m2r(C(6), r7); / r7 = c6i6 / \
94	paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
95	movq_r2m(r4, I(1)); / save C. at I(1) */ \
96	psubsw_r2r(r5, r1); /* r1 = H = c6i2 - c2i6 */ \
97	movq_m2r(*C(4), r4); \
98	movq_r2r(r3, r5); /* r5 = B - D */ \
99	pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
100	paddsw_r2r(r2, r7); /* r7 = G = c6i6 + c2i2 */ \
101	movq_r2m(r6, I(2)); / save D. at I(2) */ \
102	movq_r2r(r0, r2); /* r2 = A - C */ \
103	movq_m2r(*I(0), r6); \
104	pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
105	paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
106	movq_m2r(*J(4), r3); \
107	psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
108	paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
109	psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
110	movq_r2r(r6, r0); \
111	pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
112	paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
113	paddsw_r2r(r1, r1); /* r1 = H + H */ \
114	paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
115	paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
116	pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
117	paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
118	psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
119	paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
120	movq_m2r(I(1), r0); / r0 = C. */ \
121	paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
122	paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
123	psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
124	}
125
126	/* RowIDCT gets ready to transpose */
127	#define RowIDCT() { \
128	\
129	BeginIDCT(); \
130	\
131	movq_m2r(I(2), r3); / r3 = D. */ \
132	psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
133	paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
134	paddsw_r2r(r7, r7); /* r7 = G + G */ \
135	paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
136	paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
137	psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
138	paddsw_r2r(r3, r3); \
139	psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
140	paddsw_r2r(r5, r5); \
141	paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
142	paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
143	psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
144	paddsw_r2r(r0, r0); \
145	movq_r2m(r1, I(1)); / save R1 */ \
146	paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
147	}
148
149	/* Column IDCT normalizes and stores final results */
150	#define ColumnIDCT() { \
151	\
152	BeginIDCT(); \
153	\
154	paddsw_m2r(Eight, r2); / adjust R2 (and R1) for shift */ \
155	paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
156	paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
157	psraw_i2r(4, r2); /* r2 = NR2 */ \
158	psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
159	psraw_i2r(4, r1); /* r1 = NR1 */ \
160	movq_m2r(I(2), r3); / r3 = D. */ \
161	paddsw_r2r(r7, r7); /* r7 = G + G */ \
162	movq_r2m(r2, I(2)); / store NR2 at I2 */ \
163	paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
164	movq_r2m(r1, I(1)); / store NR1 at I1 */ \
165	psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
166	paddsw_m2r(Eight, r4); / adjust R4 (and R3) for shift */ \
167	paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
168	paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
169	psraw_i2r(4, r4); /* r4 = NR4 */ \
170	psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
171	psraw_i2r(4, r3); /* r3 = NR3 */ \
172	paddsw_m2r(Eight, r6); / adjust R6 (and R5) for shift */ \
173	paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
174	paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
175	psraw_i2r(4, r6); /* r6 = NR6 */ \
176	movq_r2m(r4, J(4)); / store NR4 at J4 */ \
177	psraw_i2r(4, r5); /* r5 = NR5 */ \
178	movq_r2m(r3, I(3)); / store NR3 at I3 */ \
179	psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
180	paddsw_m2r(Eight, r7); / adjust R7 (and R0) for shift */ \
181	paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
182	paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
183	psraw_i2r(4, r7); /* r7 = NR7 */ \
184	movq_r2m(r6, J(6)); / store NR6 at J6 */ \
185	psraw_i2r(4, r0); /* r0 = NR0 */ \
186	movq_r2m(r5, J(5)); / store NR5 at J5 */ \
187	movq_r2m(r7, J(7)); / store NR7 at J7 */ \
188	movq_r2m(r0, I(0)); / store NR0 at I0 */ \
189	}
190
191	/* Following macro does two 4x4 transposes in place.
192
193	At entry (we assume):
194
195	r0 = a3 a2 a1 a0
196	I(1) = b3 b2 b1 b0
197	r2 = c3 c2 c1 c0
198	r3 = d3 d2 d1 d0
199
200	r4 = e3 e2 e1 e0
201	r5 = f3 f2 f1 f0
202	r6 = g3 g2 g1 g0
203	r7 = h3 h2 h1 h0
204
205	At exit, we have:
206
207	I(0) = d0 c0 b0 a0
208	I(1) = d1 c1 b1 a1
209	I(2) = d2 c2 b2 a2
210	I(3) = d3 c3 b3 a3
211
212	J(4) = h0 g0 f0 e0
213	J(5) = h1 g1 f1 e1
214	J(6) = h2 g2 f2 e2
215	J(7) = h3 g3 f3 e3
216
217	I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
218	J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
219
220	Since r1 is free at entry, we calculate the Js first. */
221
222	#define Transpose() { \
223	movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
224	punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
225	movq_r2m(r0, I(0)); / save a3 a2 a1 a0 */ \
226	punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
227	movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
228	punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
229	movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
230	punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
231	punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
232	movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
233	movq_r2m(r4, *J(4)); \
234	punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
235	movq_r2m(r5, *J(5)); \
236	punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
237	movq_m2r(I(0), r4); / r4 = a3 a2 a1 a0 */ \
238	punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
239	movq_m2r(I(1), r5); / r5 = b3 b2 b1 b0 */ \
240	movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
241	movq_r2m(r6, *J(7)); \
242	punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
243	movq_r2m(r1, *J(6)); \
244	punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
245	movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
246	punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
247	movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
248	punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
249	punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
250	movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
251	movq_r2m(r0, *I(0)); \
252	punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
253	movq_r2m(r1, *I(1)); \
254	punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
255	punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
256	movq_r2m(r4, *I(3)); \
257	movq_r2m(r2, *I(2)); \
258	}
259
260	void ff_vp3_dsp_init_mmx(void)
261	{
262	int j = 16;
263	uint16_t *p;
264
265	j = 1;
266	do {
267	p = idct_constants + ((j + 3) << 2);
268	p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
269	} while (++j <= 7);
270
271	idct_constants[44] = idct_constants[45] =
272	idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
273	}
274
275	void ff_vp3_idct_mmx(int16_t *output_data)
276	{
277	/* eax = quantized input
278	* ebx = dequantizer matrix
279	* ecx = IDCT constants
280	* M(I) = ecx + MaskOffset(0) + I * 8
281	* C(I) = ecx + CosineOffset(32) + (I-1) * 8
282	* edx = output
283	* r0..r7 = mm0..mm7
284	*/
285
286	#define C(x) (idct_constants + 16 + (x - 1) * 4)
287	#define Eight (idct_constants + 44)
288
289	/* at this point, function has completed dequantization + dezigzag +
290	* partial transposition; now do the idct itself */
291	#define I(K) (output_data + K * 8)
292	#define J(K) (output_data + ((K - 4) * 8) + 4)
293
294	RowIDCT();
295	Transpose();
296
297	#undef I
298	#undef J
299	#define I(K) (output_data + (K * 8) + 32)
300	#define J(K) (output_data + ((K - 4) * 8) + 36)
301
302	RowIDCT();
303	Transpose();
304
305	#undef I
306	#undef J
307	#define I(K) (output_data + K * 8)
308	#define J(K) (output_data + K * 8)
309
310	ColumnIDCT();
311
312	#undef I
313	#undef J
314	#define I(K) (output_data + (K * 8) + 4)
315	#define J(K) (output_data + (K * 8) + 4)
316
317	ColumnIDCT();
318
319	#undef I
320	#undef J
321
322	}

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/i386/vp3dsp_mmx.c@ 5776

Download in other formats: