simple_idct_arm.S@ 5776

Last change on this file since 5776 was 5776, checked in by vboxsync, 17 years ago
ffmpeg: exported to OSE
File size: 21.5 KB

Line
1	/*
2	* simple_idct_arm.S
3	* Copyright (C) 2002 Frederic 'dilb' Boulay.
4	* All Rights Reserved.
5	*
6	* Author: Frederic Boulay <[email protected]>
7	*
8	* You can redistribute this file and/or modify
9	* it under the terms of the GNU General Public License (version 2)
10	* as published by the Free Software Foundation.
11	*
12	* This file is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with this library; if not, write to the Free Software
19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20	*
21	*
22	* The function defined in this file, is derived from the simple_idct function
23	* from the libavcodec library part of the ffmpeg project.
24	*/
25
26	/* Useful constants for the algorithm; they are saved in __constant_ptr__ at */
27	/* the end of the source code and loaded through R12 with the off* offsets below. */
28	#define W1 22725
29	#define W2 21407
30	#define W3 19266
31	#define W4 16383
32	#define W5 12873
33	#define W6 8867
34	#define W7 4520
35	#define MASK_MSHW 0xFFFF0000 /* selects the most significant half-word of a 32-bit word */
36
37	/* byte offsets of the constants in the __constant_ptr__ vector (one 32-bit .word per constant) */
38	#define offW1 0
39	#define offW2 4
40	#define offW3 8
41	#define offW4 12
42	#define offW5 16
43	#define offW6 20
44	#define offW7 24
45	#define offMASK_MSHW 28
46
47	#define ROW_SHIFT 11
48	#define ROW_SHIFT2MSHW (16-11) /* 16-ROW_SHIFT: left shift that moves (x >> ROW_SHIFT) into the most significant half-word */
49	#define COL_SHIFT 20
50	#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1), rounding term added to a0 in the row pass */
51	#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1), rounding term added to a0 in the column pass */
52
53
54	.text
55	.align
56	.global simple_idct_ARM
57
58	simple_idct_ARM:
59	@@ void simple_idct_ARM(int16_t *block)
60	@@ Transforms the 8x8 block of 16-bit values at R0 in place: first a pass over the 8 rows (__row_loop), then a pass over the 8 columns (__col_loop).
61	@@ R0-R3 are scratch regs, so there is no need to save them, but R0 contains the pointer to block
62	@@ so it must not be overwritten unless it has been saved first (it is stored in sp[0] below).
63	@@ R12 is another scratch register, so it does not need to be saved either.
64	@@ save R4-R11 and LR; the epilogue reloads the saved LR value directly into PC.
65	stmfd sp!, {r4-r11, r14} @ R14 is also called LR
66	@@ at this point, R0=block, other registers are free.
67	add r14, r0, #112 @ R14=&block[8*7] (112 bytes = 56 half-words), better start from the last row, and decrease the value until row=0, i.e. R14=block.
68	add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants (PC reads as the address of this instruction + 8, hence the -8), probably not necessary to reserve a register for it
69	@@ reserve 2 temporary variables on the stack; only sp[0] (block) is written here, sp[4] stays free
70	sub sp, sp, #8 @ allow 2 local variables
71	str r0, [sp, #0] @ save block in sp[0]
72	@@ stack status
73	@@ sp+4 free
74	@@ sp+0 R0 (block)
75
76
77	@@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
78
79
80	__row_loop: @ one iteration per row of 8 half-words; R14 walks from &block[56] down to block in 16-byte steps
81	@@ read the row and check if it is null, almost null, or not; according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
82	ldr r1, [r14, #0] @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
83	ldr r2, [r14, #4] @ R2=(int32)(R14)[1]=ROWr32[1]
84	ldr r3, [r14, #8] @ R3=ROWr32[2]
85	ldr r4, [r14, #12] @ R4=ROWr32[3]
86	@@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
87	@@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
88	@@ else follow the complete algorithm.
89	@@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
90	@@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
91	orr r5, r4, r3 @ R5=R4 | R3
92	orr r5, r5, r2 @ R5=R4 | R3 | R2
93	orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null); R6 is only a scratch destination here
94	beq __end_row_loop
95	mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
96	ldrsh r6, [r14, #0] @ R6=ROWr16[0]
97	orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7, null when ROWr16[1..7] are all null
98	beq __almost_empty_row
99
100	__b_evaluation:
101	@@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
102	@@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
103	@@ R12=__const_ptr_, R14=&block[n]
104	@@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
105
106	@@ MUL16(b0, W1, row[1]);
107	@@ MUL16(b1, W3, row[1]);
108	@@ MUL16(b2, W5, row[1]);
109	@@ MUL16(b3, W7, row[1]);
110	@@ MAC16(b0, W3, row[3]);
111	@@ MAC16(b1, -W7, row[3]);
112	@@ MAC16(b2, -W1, row[3]);
113	@@ MAC16(b3, -W5, row[3]);
114	ldr r8, [r12, #offW1] @ R8=W1
115	mov r2, r2, asr #16 @ R2=ROWr16[3]
116	mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle); this overwrites block, which is reloaded from sp[0] at __end_row_loop
117	ldr r9, [r12, #offW3] @ R9=W3
118	ldr r10, [r12, #offW5] @ R10=W5
119	mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
120	ldr r11, [r12, #offW7] @ R11=W7
121	mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
122	mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
123	teq r2, #0 @ if null avoid muls (the NE-conditional instructions below all depend on this test)
124	mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
125	rsbne r2, r2, #0 @ R2=-ROWr16[3]
126	mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
127	mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
128	mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
129
130	@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
131	@@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
132	@@ R12=__const_ptr_, R14=&block[n]
133	@@ temp = ((uint32_t)row)[2] | ((uint32_t)row)[3];
134	@@ if (temp != 0) {}
135	orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]; R2 is kept and tested again in __a_evaluation
136	beq __end_b_evaluation
137
138	@@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3=ROWr32[2], R4=ROWr32[3],
139	@@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
140	@@ R12=__const_ptr_, R14=&block[n]
141	@@ MAC16(b0, W5, row[5]);
142	@@ MAC16(b2, W7, row[5]);
143	@@ MAC16(b3, W3, row[5]);
144	@@ MAC16(b1, -W1, row[5]);
145	@@ MAC16(b0, W7, row[7]);
146	@@ MAC16(b2, W3, row[7]);
147	@@ MAC16(b3, -W1, row[7]);
148	@@ MAC16(b1, -W5, row[7]);
149	mov r3, r3, asr #16 @ R3=ROWr16[5]
150	teq r3, #0 @ if null avoid muls
151	mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
152	mov r4, r4, asr #16 @ R4=ROWr16[7] (mov without S, the flags from teq are preserved)
153	mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
154	mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
155	rsbne r3, r3, #0 @ R3=-ROWr16[5]
156	mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
157	@@ R3 is free now
158	teq r4, #0 @ if null avoid muls
159	mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
160	mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
161	rsbne r4, r4, #0 @ R4=-ROWr16[7]
162	mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
163	mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
164	@@ R4 is free now
165	__end_b_evaluation:
166	@@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
167	@@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
168	@@ R12=__const_ptr_, R14=&block[n]
169
170	__a_evaluation:
171	@@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
172	@@ a1 = a0 + W6 * row[2];
173	@@ a2 = a0 - W6 * row[2];
174	@@ a3 = a0 - W2 * row[2];
175	@@ a0 = a0 + W2 * row[2];
176	ldr r9, [r12, #offW4] @ R9=W4
177	mul r6, r9, r6 @ R6=W4*ROWr16[0]
178	ldr r10, [r12, #offW6] @ R10=W6
179	ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
180	add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
181
182	mul r11, r10, r4 @ R11=W6*ROWr16[2]
183	ldr r8, [r12, #offW2] @ R8=W2
184	sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
185	@@ temp = ((uint32_t)row)[2] | ((uint32_t)row)[3];
186	@@ if (temp != 0) {}
187	teq r2, #0 @ R2 still holds ROWr32[2] | ROWr32[3]; null means ROWr16[4..7] are all null
188	beq __end_bef_a_evaluation @ then finish a0-a3 out of line, skipping the row[4] and row[6] terms
189
190	add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
191	mul r11, r8, r4 @ R11=W2*ROWr16[2]
192	sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
193	add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
194
195
196	@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
197	@@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
198	@@ R12=__const_ptr_, R14=&block[n]
199
200
201	@@ a0 += W4*row[4]
202	@@ a1 -= W4*row[4]
203	@@ a2 -= W4*row[4]
204	@@ a3 += W4*row[4]
205	ldrsh r11, [r14, #8] @ R11=ROWr16[4]
206	teq r11, #0 @ if null avoid muls
207	mulne r11, r9, r11 @ R11=W4*ROWr16[4]
208	@@ R9 is free now
209	ldrsh r9, [r14, #12] @ R9=ROWr16[6]
210	addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
211	subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
212	subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
213	addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
214	@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
215	teq r9, #0 @ if null avoid muls
216	mulne r11, r10, r9 @ R11=W6*ROWr16[6]
217	addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
218	mulne r10, r8, r9 @ R10=W2*ROWr16[6]
219	@@ a0 += W6*row[6];
220	@@ a3 -= W6*row[6];
221	@@ a1 -= W2*row[6];
222	@@ a2 += W2*row[6];
223	subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
224	subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
225	addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
226
227	__end_a_evaluation:
228	@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
229	@@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
230	@@ R12=__const_ptr_, R14=&block[n]
231	@@ row[0] = (a0 + b0) >> ROW_SHIFT;
232	@@ row[1] = (a1 + b1) >> ROW_SHIFT;
233	@@ row[2] = (a2 + b2) >> ROW_SHIFT;
234	@@ row[3] = (a3 + b3) >> ROW_SHIFT;
235	@@ row[4] = (a3 - b3) >> ROW_SHIFT;
236	@@ row[5] = (a2 - b2) >> ROW_SHIFT;
237	@@ row[6] = (a1 - b1) >> ROW_SHIFT;
238	@@ row[7] = (a0 - b0) >> ROW_SHIFT;
239	add r8, r6, r0 @ R8=a0+b0
240	add r9, r2, r1 @ R9=a1+b1
241	@@ put 2 16 bits half-words in a 32bits word
242	@@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
243	ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
244	and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. (a1+b1)>>11 placed in the upper half-word
245	mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
246	and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
247	orr r8, r8, r9
248	str r8, [r14, #0] @ store row[0] and row[1] with one word access
249
250	add r8, r3, r5 @ R8=a2+b2
251	add r9, r4, r7 @ R9=a3+b3
252	and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
253	and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
254	orr r8, r8, r9
255	str r8, [r14, #4] @ store row[2] and row[3]
256
257	sub r8, r4, r7 @ R8=a3-b3
258	sub r9, r3, r5 @ R9=a2-b2
259	and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
260	and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
261	orr r8, r8, r9
262	str r8, [r14, #8] @ store row[4] and row[5]
263
264	sub r8, r2, r1 @ R8=a1-b1
265	sub r9, r6, r0 @ R9=a0-b0
266	and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
267	and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
268	orr r8, r8, r9
269	str r8, [r14, #12] @ store row[6] and row[7]
270
271	bal __end_row_loop
272
273	__almost_empty_row:
274	@@ the row was empty, except ROWr16[0]; handle this special case by writing the same value to all 8 entries of the row
275	@@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
276	@@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
277	@@ R8 (free, set to 0xFFFF below), R9-R11 free
278	mov r8, #0x10000 @ R8=0x10000, first of the 2 steps needed to build 0xFFFF; it saves a ldr call (because of delay run).
279	sub r8, r8, #1 @ R8=0xFFFF, R8 is now ready.
280	and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF (NOTE(review): presumably stands in for (W4*row[0])>>ROW_SHIFT, W4 being close to 1<<14; confirm against the C reference)
281	orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16), same value in both half-words
282	str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
283	str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
284	str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
285	str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
286
287	__end_row_loop:
288	@@ at this point, R0-R11 (free)
289	@@ R12=__const_ptr_, R14=&block[n]
290	ldr r0, [sp, #0] @ R0=block
291	teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
292	sub r14, r14, #16 @ step to the previous row (sub without S, so the flags set by teq still drive the bne)
293	bne __row_loop
294
295
296
297	@@ at this point, R0=block, R1-R11 (free)
298	@@ R12=__const_ptr_, R14=&block[n]
299	add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
300	__col_loop: @ one iteration per column; consecutive entries of a column are 16 bytes (one row) apart
301
302	__b_evaluation2:
303	@@ at this point, R0=block (temp), R1-R11 (free)
304	@@ R12=__const_ptr_, R14=&block[n]
305	@@ proceed with b0-b3 first, followed by a0-a3
306	@@ MUL16(b0, W1, col[8x1]);
307	@@ MUL16(b1, W3, col[8x1]);
308	@@ MUL16(b2, W5, col[8x1]);
309	@@ MUL16(b3, W7, col[8x1]);
310	@@ MAC16(b0, W3, col[8x3]);
311	@@ MAC16(b1, -W7, col[8x3]);
312	@@ MAC16(b2, -W1, col[8x3]);
313	@@ MAC16(b3, -W5, col[8x3]);
314	ldr r8, [r12, #offW1] @ R8=W1
315	ldrsh r7, [r14, #16] @ R7=COLr16[8x1]
316	mul r0, r8, r7 @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle); this overwrites block, which is reloaded from sp[0] at __end_col_loop
317	ldr r9, [r12, #offW3] @ R9=W3
318	ldr r10, [r12, #offW5] @ R10=W5
319	mul r1, r9, r7 @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
320	ldr r11, [r12, #offW7] @ R11=W7
321	mul r5, r10, r7 @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
322	ldrsh r2, [r14, #48] @ R2=COLr16[8x3]
323	mul r7, r11, r7 @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
324	teq r2, #0 @ if 0, then avoid muls
325	mlane r0, r9, r2, r0 @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
326	rsbne r2, r2, #0 @ R2=-COLr16[8x3]
327	mlane r1, r11, r2, r1 @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
328	mlane r5, r8, r2, r5 @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
329	mlane r7, r10, r2, r7 @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
330
331	@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
332	@@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
333	@@ R12=__const_ptr_, R14=&block[n]
334	@@ MAC16(b0, W5, col[5x8]);
335	@@ MAC16(b2, W7, col[5x8]);
336	@@ MAC16(b3, W3, col[5x8]);
337	@@ MAC16(b1, -W1, col[5x8]);
338	@@ MAC16(b0, W7, col[7x8]);
339	@@ MAC16(b2, W3, col[7x8]);
340	@@ MAC16(b3, -W1, col[7x8]);
341	@@ MAC16(b1, -W5, col[7x8]);
342	ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
343	teq r3, #0 @ if 0 then avoid muls
344	mlane r0, r10, r3, r0 @ R0+=W5*COLr16[5x8]=b0
345	mlane r5, r11, r3, r5 @ R5+=W7*COLr16[5x8]=b2
346	mlane r7, r9, r3, r7 @ R7+=W3*COLr16[5x8]=b3
347	rsbne r3, r3, #0 @ R3=-COLr16[5x8]
348	ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
349	mlane r1, r8, r3, r1 @ R1-=W1*COLr16[5x8]=b1
350	@@ R3 is free now
351	teq r4, #0 @ if 0 then avoid muls
352	mlane r0, r11, r4, r0 @ R0+=W7*COLr16[7x8]=b0
353	mlane r5, r9, r4, r5 @ R5+=W3*COLr16[7x8]=b2
354	rsbne r4, r4, #0 @ R4=-COLr16[7x8]
355	mlane r7, r8, r4, r7 @ R7-=W1*COLr16[7x8]=b3
356	mlane r1, r10, r4, r1 @ R1-=W5*COLr16[7x8]=b1
357	@@ R4 is free now
358	__end_b_evaluation2:
359	@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
360	@@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
361	@@ R12=__const_ptr_, R14=&block[n]
362
363	__a_evaluation2:
364	@@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
365	@@ a1 = a0 + W6 * col[8x2];
366	@@ a2 = a0 - W6 * col[8x2];
367	@@ a3 = a0 - W2 * col[8x2];
368	@@ a0 = a0 + W2 * col[8x2];
369	ldrsh r6, [r14, #0] @ R6=COLr16[8x0]
370	ldr r9, [r12, #offW4] @ R9=W4
371	mul r6, r9, r6 @ R6=W4*COLr16[8x0]
372	ldr r10, [r12, #offW6] @ R10=W6
373	ldrsh r4, [r14, #32] @ R4=COLr16[8x2] (a3 not defined yet)
374	add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
375	mul r11, r10, r4 @ R11=W6*COLr16[8x2]
376	ldr r8, [r12, #offW2] @ R8=W2
377	add r2, r6, r11 @ R2=a0+W6*COLr16[8x2] (a1)
378	sub r3, r6, r11 @ R3=a0-W6*COLr16[8x2] (a2)
379	mul r11, r8, r4 @ R11=W2*COLr16[8x2]
380	sub r4, r6, r11 @ R4=a0-W2*COLr16[8x2] (a3)
381	add r6, r6, r11 @ R6=a0+W2*COLr16[8x2] (a0)
382
383	@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
384	@@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
385	@@ R12=__const_ptr_, R14=&block[n]
386	@@ a0 += W4*col[8x4]
387	@@ a1 -= W4*col[8x4]
388	@@ a2 -= W4*col[8x4]
389	@@ a3 += W4*col[8x4]
390	ldrsh r11, [r14, #64] @ R11=COLr16[8x4]
391	teq r11, #0 @ if null avoid muls
392	mulne r11, r9, r11 @ R11=W4*COLr16[8x4]
393	@@ R9 is free now
394	addne r6, r6, r11 @ R6+=W4*COLr16[8x4] (a0)
395	subne r2, r2, r11 @ R2-=W4*COLr16[8x4] (a1)
396	subne r3, r3, r11 @ R3-=W4*COLr16[8x4] (a2)
397	ldrsh r9, [r14, #96] @ R9=COLr16[8x6] (a load does not change the flags, the addne below still uses the teq on R11)
398	addne r4, r4, r11 @ R4+=W4*COLr16[8x4] (a3)
399	@@ W6 alone is no more useful, save W2*COLr16[8x6] in it instead
400	teq r9, #0 @ if null avoid muls
401	mulne r11, r10, r9 @ R11=W6*COLr16[8x6]
402	addne r6, r6, r11 @ R6+=W6*COLr16[8x6] (a0)
403	mulne r10, r8, r9 @ R10=W2*COLr16[8x6]
404	@@ a0 += W6*col[8x6];
405	@@ a3 -= W6*col[8x6];
406	@@ a1 -= W2*col[8x6];
407	@@ a2 += W2*col[8x6];
408	subne r4, r4, r11 @ R4-=W6*COLr16[8x6] (a3)
409	subne r2, r2, r10 @ R2-=W2*COLr16[8x6] (a1)
410	addne r3, r3, r10 @ R3+=W2*COLr16[8x6] (a2)
411	__end_a_evaluation2:
412	@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
413	@@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
414	@@ R12=__const_ptr_, R14=&block[n]
415	@@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
416	@@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
417	@@ col[16] = ((a2 + b2) >> COL_SHIFT);
418	@@ col[24] = ((a3 + b3) >> COL_SHIFT);
419	@@ col[32] = ((a3 - b3) >> COL_SHIFT);
420	@@ col[40] = ((a2 - b2) >> COL_SHIFT);
421	@@ col[48] = ((a1 - b1) >> COL_SHIFT);
422	@@ col[56] = ((a0 - b0) >> COL_SHIFT);
423	@@@@@ no optimisation here @@@@@
424	add r8, r6, r0 @ R8=a0+b0
425	add r9, r2, r1 @ R9=a1+b1
426	mov r8, r8, asr #COL_SHIFT
427	mov r9, r9, asr #COL_SHIFT
428	strh r8, [r14, #0] @ col[0]
429	strh r9, [r14, #16] @ col[8]
430	add r8, r3, r5 @ R8=a2+b2
431	add r9, r4, r7 @ R9=a3+b3
432	mov r8, r8, asr #COL_SHIFT
433	mov r9, r9, asr #COL_SHIFT
434	strh r8, [r14, #32] @ col[16]
435	strh r9, [r14, #48] @ col[24]
436	sub r8, r4, r7 @ R8=a3-b3
437	sub r9, r3, r5 @ R9=a2-b2
438	mov r8, r8, asr #COL_SHIFT
439	mov r9, r9, asr #COL_SHIFT
440	strh r8, [r14, #64] @ col[32]
441	strh r9, [r14, #80] @ col[40]
442	sub r8, r2, r1 @ R8=a1-b1
443	sub r9, r6, r0 @ R9=a0-b0
444	mov r8, r8, asr #COL_SHIFT
445	mov r9, r9, asr #COL_SHIFT
446	strh r8, [r14, #96] @ col[48]
447	strh r9, [r14, #112] @ col[56]
448
449	__end_col_loop:
450	@@ at this point, R0-R11 (free)
451	@@ R12=__const_ptr_, R14=&block[n]
452	ldr r0, [sp, #0] @ R0=block
453	teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
454	sub r14, r14, #2 @ step to the previous column (sub without S, so the flags set by teq still drive the bne)
455	bne __col_loop
456
457
458
459
460	__end_simple_idct_ARM:
461	@@ function epilogue: release the stack locals, then restore the registers saved by the stmfd at entry
462	add sp, sp, #8 @@ drop the 2 local variables reserved at entry
463	ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and load the saved LR value into PC, which returns to the caller
464
465
466
467	@@ kind of sub-function, kept out of line so as not to overload the common case: reached from __a_evaluation when ROWr32[2] | ROWr32[3] is null
468	__end_bef_a_evaluation:
469	add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
470	mul r11, r8, r4 @ R11=W2*ROWr16[2]
471	sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
472	add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
473	bal __end_a_evaluation @ rejoin the row pass, the row[4] and row[6] terms are skipped
474
475
476	.align @ align BEFORE the label, so that __constant_ptr__ names the first .word even if padding is ever emitted here
477	__constant_ptr__: @@ constant vector read through R12 with the off* byte offsets; see #defines at the beginning of the source code for values.
478	.word W1 @ at offW1
479	.word W2 @ at offW2
480	.word W3 @ at offW3
481	.word W4 @ at offW4
482	.word W5 @ at offW5
483	.word W6 @ at offW6
484	.word W7 @ at offW7
485	.word MASK_MSHW @ at offMASK_MSHW

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format