VirtualBox

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/armv4l/simple_idct_arm.S@ 5776

Last change on this file since 5776 was 5776, checked in by vboxsync, 17 years ago

ffmpeg: exported to OSE

File size: 21.5 KB
Line 
1/*
2 * simple_idct_arm.S
3 * Copyright (C) 2002 Frederic 'dilb' Boulay.
4 * All Rights Reserved.
5 *
6 * Author: Frederic Boulay <[email protected]>
7 *
8 * You can redistribute this file and/or modify
9 * it under the terms of the GNU General Public License (version 2)
10 * as published by the Free Software Foundation.
11 *
12 * This file is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 *
22 * The function defined in this file, is derived from the simple_idct function
23 * from the libavcodec library part of the ffmpeg project.
24 */
25
/* Useful constants for the algorithm; they are saved in the vector at */
/* __constant_ptr__ at the end of the source code. */
/* Wn = round(cos(n*pi/16) * sqrt(2) * (1 << 14)) -- fixed-point IDCT weights
 * (assumed from the standard simple_idct derivation -- TODO confirm). */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520
#define MASK_MSHW 0xFFFF0000   /* selects the most significant half-word */

/* byte offsets of the constants in the __constant_ptr__ vector */
#define offW1 0
#define offW2 4
#define offW3 8
#define offW4 12
#define offW5 16
#define offW6 20
#define offW7 24
#define offMASK_MSHW 28

#define ROW_SHIFT 11
/* left-shift that moves a value, already scaled by ROW_SHIFT, into the most
 * significant half-word of a 32-bit register (derived, not hard-coded). */
#define ROW_SHIFT2MSHW (16-ROW_SHIFT)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024     /* 1<< (ROW_SHIFT-1), rounding term for the row pass */
#define COL_SHIFTED_1 524288   /* 1<< (COL_SHIFT-1), rounding term for the column pass */
53
@@ ----------------------------------------------------------------------
@@ void simple_idct_ARM(int16_t *block)
@@ In-place inverse DCT on an 8x8 block of 16-bit coefficients:
@@ a 1-D IDCT is applied to each row, then to each column.
@@ In:  r0 = block pointer.  All callee-saved registers are preserved.
@@ ----------------------------------------------------------------------
        .text
        .align
        .global simple_idct_ARM

simple_idct_ARM:
        @@ void simple_idct_ARM(int16_t *block)
        @@ save stack for reg needed (take all of them),
        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
        @@ so it must not be overwritten, if it is not saved!!
        @@ R12 is another scratch register, so it should not be saved too
        @@ save all registers
        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
        @@ at this point, R0=block, other registers are free.
        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
        add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
        @@ add 2 temporary variables in the stack: R0 and R14
        sub sp, sp, #8           @ allow 2 local variables
        str r0, [sp, #0]         @ save block in sp[0]
        @@ stack status
        @@ sp+4   free
        @@ sp+0   R0  (block)


        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free


__row_loop:
        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32 bits in two 16-bit words), at least it gives more usable registers :)
        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr r3, [r14, #8]        @ R3=ROWr32[2]
        ldr r4, [r14, #12]       @ R4=ROWr32[3]
        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
        @@ else follow the complete algorithm.
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr r5, r4, r3           @ R5=R4 | R3
        orr r5, r5, r2           @ R5=R4 | R3 | R2
        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
        beq __end_row_loop
        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7 (zero iff only ROWr16[0] is non-null)
        beq __almost_empty_row
99
__b_evaluation:
        @@ Compute the odd-coefficient accumulators b0..b3 for the current row.
        @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        mov r2, r2, asr #16      @ R2=ROWr16[3]
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        teq r2, #0               @ if null avoid muls
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        rsbne r2, r2, #0         @ R2=-ROWr16[3], so the remaining MACs can add a negated operand
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
        beq __end_b_evaluation

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16      @ R3=ROWr16[5]
        teq r3, #0               @ if null avoid muls
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
        mov r4, r4, asr #16      @ R4=ROWr16[7]
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq r4, #0               @ if null avoid muls
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
169
__a_evaluation:
        @@ Compute the even-coefficient accumulators a0..a3 for the current row.
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul r11, r10, r4         @ R11=W6*ROWr16[2]
        ldr r8, [r12, #offW2]    @ R8=W2
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        @@ R2 still holds ROWr32[2]|ROWr32[3] from __b_evaluation; if zero, take
        @@ the short path that only finishes a0..a3 (rows 4..7 are all zero).
        teq r2, #0
        beq __end_bef_a_evaluation

        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
        teq r11, #0              @ if null avoid muls
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
        teq r9, #0               @ if null avoid muls
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation:
        @@ Combine a0..a3 with b0..b3 and write back the 8 transformed row samples.
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        @@ put 2 16 bits half-words in a 32bits word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9           @ pack the two half-words
        str r8, [r14, #0]        @ store row[0] and row[1] in one 32-bit write

        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9           @ pack the two half-words
        str r8, [r14, #4]        @ store row[2] and row[3]

        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9           @ pack the two half-words
        str r8, [r14, #8]        @ store row[4] and row[5]

        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9           @ pack the two half-words
        str r8, [r14, #12]       @ store row[6] and row[7]

        bal __end_row_loop
272
__almost_empty_row:
        @@ the row was empty, except ROWr16[0], now, management of this special case:
        @@ every output sample of the row IDCT degenerates to ROWr16[0]<<3.
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@ R8=0xFFFF (temp), R9-R11 free
        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
        sub r8, r8, #1           @ R8 is now ready.
        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16): replicate the sample in both half-words
        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
        sub r14, r14, #16        @ step back one row (no S suffix: flags from teq are preserved)
        bne __row_loop
294
295
296
        @@ Row pass done; now run the 1-D IDCT down each of the 8 columns
        @@ (stride between column elements is 16 bytes = one row of int16).
        @@ at this point, R0=block, R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        add r14, r0, #14         @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
__col_loop:

__b_evaluation2:
        @@ Compute the odd-coefficient accumulators b0..b3 for the current column.
        @@ at this point, R0=block (temp), R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        ldrsh r7, [r14, #16]     @ R7=COLr16[1x8] (element 1 of the column)
        mul r0, r8, r7           @ R0=W1*COLr16[1x8]=b0 (must be the second arg, to have the possibility to save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*COLr16[1x8]=b1 (must be the second arg, to have the possibility to save 1 cycle)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*COLr16[1x8]=b2 (must be the second arg, to have the possibility to save 1 cycle)
        ldrsh r2, [r14, #48]     @ R2=COLr16[3x8]
        mul r7, r11, r7          @ R7=W7*COLr16[1x8]=b3 (must be the second arg, to have the possibility to save 1 cycle)
        teq r2, #0               @ if 0, then avoid muls
        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[3x8]=b0 (must be the second arg, to have the possibility to save 1 cycle)
        rsbne r2, r2, #0         @ R2=-COLr16[3x8]
        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[3x8]=b1 (must be the second arg, to have the possibility to save 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[3x8]=b2 (must be the second arg, to have the possibility to save 1 cycle)
        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[3x8]=b3 (must be the second arg, to have the possibility to save 1 cycle)

        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, col[5x8]);
        @@ MAC16(b2, W7, col[5x8]);
        @@ MAC16(b3, W3, col[5x8]);
        @@ MAC16(b1, -W1, col[5x8]);
        @@ MAC16(b0, W7, col[7x8]);
        @@ MAC16(b2, W3, col[7x8]);
        @@ MAC16(b3, -W1, col[7x8]);
        @@ MAC16(b1, -W5, col[7x8]);
        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
        teq r3, #0               @ if 0 then avoid muls
        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
        @@ R3 is free now
        teq r4, #0               @ if 0 then avoid muls
        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
        @@ R4 is free now
__end_b_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
362
__a_evaluation2:
        @@ Compute the even-coefficient accumulators a0..a3 for the current column.
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh r6, [r14, #0]      @ R6=COLr16[0x8]
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*COLr16[0x8]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #32]     @ R4=COLr16[2x8] (a3 not defined yet)
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[0x8] + 1<<(COL_SHIFT-1) (a0)
        mul r11, r10, r4         @ R11=W6*COLr16[2x8]
        ldr r8, [r12, #offW2]    @ R8=W2
        add r2, r6, r11          @ R2=a0+W6*COLr16[2x8] (a1)
        sub r3, r6, r11          @ R3=a0-W6*COLr16[2x8] (a2)
        mul r11, r8, r4          @ R11=W2*COLr16[2x8]
        sub r4, r6, r11          @ R4=a0-W2*COLr16[2x8] (a3)
        add r6, r6, r11          @ R6=a0+W2*COLr16[2x8] (a0)

        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ a0 += W4*col[4x8]
        @@ a1 -= W4*col[4x8]
        @@ a2 -= W4*col[4x8]
        @@ a3 += W4*col[4x8]
        ldrsh r11, [r14, #64]    @ R11=COLr16[4x8]
        teq r11, #0              @ if null avoid muls
        mulne r11, r9, r11       @ R11=W4*COLr16[4x8]
        @@ R9 is free now
        addne r6, r6, r11        @ R6+=W4*COLr16[4x8] (a0)
        subne r2, r2, r11        @ R2-=W4*COLr16[4x8] (a1)
        subne r3, r3, r11        @ R3-=W4*COLr16[4x8] (a2)
        ldrsh r9, [r14, #96]     @ R9=COLr16[6x8]
        addne r4, r4, r11        @ R4+=W4*COLr16[4x8] (a3)
        @@ W6 alone is no more useful, save W2*COLr16[6x8] in it instead
        teq r9, #0               @ if null avoid muls
        mulne r11, r10, r9       @ R11=W6*COLr16[6x8]
        addne r6, r6, r11        @ R6+=W6*COLr16[6x8] (a0)
        mulne r10, r8, r9        @ R10=W2*COLr16[6x8]
        @@ a0 += W6*col[6x8];
        @@ a3 -= W6*col[6x8];
        @@ a1 -= W2*col[6x8];
        @@ a2 += W2*col[6x8];
        subne r4, r4, r11        @ R4-=W6*COLr16[6x8] (a3)
        subne r2, r2, r10        @ R2-=W2*COLr16[6x8] (a1)
        addne r3, r3, r10        @ R3+=W2*COLr16[6x8] (a2)
__end_a_evaluation2:
        @@ Combine a0..a3 with b0..b3 and store the 8 column samples (stride 16 bytes).
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimisation here @@@@@
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]       @ col[0]
        strh r9, [r14, #16]      @ col[8]
        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]      @ col[16]
        strh r9, [r14, #48]      @ col[24]
        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]      @ col[32]
        strh r9, [r14, #80]      @ col[40]
        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]      @ col[48]
        strh r9, [r14, #112]     @ col[56]

__end_col_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
        sub r14, r14, #2         @ step back one column (no S suffix: flags from teq are preserved)
        bne __col_loop




__end_simple_idct_ARM:
        @@ restore registers to previous status!
        add sp, sp, #8           @@ pop the 2 local variables
        ldmfd sp!, {r4-r11, r15} @@ restore callee-saved regs and return (loads saved LR into PC).
464
465
466
@@ kind of sub-function, here not to overload the common case.
@@ Taken from __a_evaluation when ROWr32[2]|ROWr32[3]==0: finishes a1/a3/a0
@@ (a2 was already computed) and rejoins the shared code at __end_a_evaluation.
__end_bef_a_evaluation:
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
        bal __end_a_evaluation
474
475
__constant_ptr__: @@ see #defines at the beginning of the source code for values.
        @@ Constant vector addressed via R12; entries match the offW* byte offsets.
        .align
        .word W1
        .word W2
        .word W3
        .word W4
        .word W5
        .word W6
        .word W7
        .word MASK_MSHW
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette