1 | /*
|
---|
2 | * iWMMXt optimized DSP utils
|
---|
3 | * Copyright (c) 2004 AGAWA Koji
|
---|
4 | *
|
---|
5 | * This library is free software; you can redistribute it and/or
|
---|
6 | * modify it under the terms of the GNU Lesser General Public
|
---|
7 | * License as published by the Free Software Foundation; either
|
---|
8 | * version 2 of the License, or (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This library is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
13 | * Lesser General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU Lesser General Public
|
---|
16 | * License along with this library; if not, write to the Free Software
|
---|
17 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
---|
18 | */
|
---|
19 |
|
---|
20 | #include "../dsputil.h"
|
---|
21 |
|
---|
22 | #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
|
---|
23 | #define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
|
---|
24 | #define WAVG2B "wavg2b"
|
---|
25 | #include "dsputil_iwmmxt_rnd.h"
|
---|
26 | #undef DEF
|
---|
27 | #undef SET_RND
|
---|
28 | #undef WAVG2B
|
---|
29 |
|
---|
30 | #define DEF(x, y) x ## _ ## y ##_iwmmxt
|
---|
31 | #define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
|
---|
32 | #define WAVG2B "wavg2br"
|
---|
33 | #include "dsputil_iwmmxt_rnd.h"
|
---|
34 | #undef DEF
|
---|
35 | #undef SET_RND
|
---|
36 | #undef WAVG2BR
|
---|
37 |
|
---|
38 | // need scheduling
|
---|
/*
 * OP(AVG): shared body of the two 8-byte-wide "y2" put routines below.
 *
 * AVG is the mnemonic string of the averaging instruction to use.  The macro
 * expects the enclosing function to provide `block`, `pixels`, `line_size`
 * and `h` with the meanings documented on the functions below.
 *
 * Outline of the generated code:
 *   - the low three bits of `pixels` are copied to wcgr1 and cleared from the
 *     pointer, so every load is an 8-byte-aligned pair that walignr1 then
 *     realigns using wcgr1;
 *   - one source row is loaded before the loop; each loop pass loads two more
 *     rows and stores two output rows, every output row being AVG of two
 *     consecutive source rows;
 *   - `h` is decremented by 2 per pass and the loop exits only when it reaches
 *     exactly zero, so callers must pass a positive, even `h`.
 *
 * r12 is used as scratch.  The condition flags are written by `subs`, hence
 * the "cc" clobber (matching add_pixels_clamped_iwmmxt in this file).
 * NOTE(review): wr0-wr6 and wcgr1 are modified but not listed as clobbers --
 * confirm the compiler never allocates them around this statement.
 */
#define OP(AVG)                                                     \
    __asm__ __volatile__ (                                          \
        /* alignment */                                             \
        "and r12, %[pixels], #7 \n\t"                               \
        "bic %[pixels], %[pixels], #7 \n\t"                         \
        "tmcr wcgr1, r12 \n\t"                                      \
                                                                    \
        /* prime wr4 with the first source row */                   \
        "wldrd wr0, [%[pixels]] \n\t"                               \
        "wldrd wr1, [%[pixels], #8] \n\t"                           \
        "add %[pixels], %[pixels], %[line_size] \n\t"               \
        "walignr1 wr4, wr0, wr1 \n\t"                               \
                                                                    \
        "1: \n\t"                                                   \
                                                                    \
        /* next row into wr5, output = AVG(wr4, wr5) */             \
        "wldrd wr2, [%[pixels]] \n\t"                               \
        "wldrd wr3, [%[pixels], #8] \n\t"                           \
        "add %[pixels], %[pixels], %[line_size] \n\t"               \
        "pld [%[pixels]] \n\t"                                      \
        "walignr1 wr5, wr2, wr3 \n\t"                               \
        AVG " wr6, wr4, wr5 \n\t"                                   \
        "wstrd wr6, [%[block]] \n\t"                                \
        "add %[block], %[block], %[line_size] \n\t"                 \
                                                                    \
        /* following row into wr4, output = AVG(wr4, wr5) */        \
        "wldrd wr0, [%[pixels]] \n\t"                               \
        "wldrd wr1, [%[pixels], #8] \n\t"                           \
        "add %[pixels], %[pixels], %[line_size] \n\t"               \
        "walignr1 wr4, wr0, wr1 \n\t"                               \
        "pld [%[pixels]] \n\t"                                      \
        AVG " wr6, wr4, wr5 \n\t"                                   \
        "wstrd wr6, [%[block]] \n\t"                                \
        "add %[block], %[block], %[line_size] \n\t"                 \
                                                                    \
        "subs %[h], %[h], #2 \n\t"                                  \
        "bne 1b \n\t"                                               \
        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)      \
        : [line_size]"r"(line_size)                                 \
        : "cc", "memory", "r12")

/*
 * Write h rows of 8 bytes to `block`; each output row is the "wavg2br"
 * average of two consecutive source rows starting at `pixels`.
 *
 * block      destination; advanced by line_size after every 8-byte store
 * pixels     source; may be unaligned, h + 1 rows are read
 * line_size  byte stride applied to both block and pixels
 * h          number of output rows; must be positive and even
 */
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2br");
}

/*
 * Same as put_pixels8_y2_iwmmxt() but averages with "wavg2b".
 *
 * block      destination; advanced by line_size after every 8-byte store
 * pixels     source; may be unaligned, h + 1 rows are read
 * line_size  byte stride applied to both block and pixels
 * h          number of output rows; must be positive and even
 */
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2b");
}
#undef OP
|
---|
85 |
|
---|
86 | void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
|
---|
87 | {
|
---|
88 | uint8_t *pixels2 = pixels + line_size;
|
---|
89 |
|
---|
90 | __asm__ __volatile__ (
|
---|
91 | "mov r12, #4 \n\t"
|
---|
92 | "1: \n\t"
|
---|
93 | "pld [%[pixels], %[line_size2]] \n\t"
|
---|
94 | "pld [%[pixels2], %[line_size2]] \n\t"
|
---|
95 | "wldrd wr4, [%[pixels]] \n\t"
|
---|
96 | "wldrd wr5, [%[pixels2]] \n\t"
|
---|
97 | "pld [%[block], #32] \n\t"
|
---|
98 | "wunpckelub wr6, wr4 \n\t"
|
---|
99 | "wldrd wr0, [%[block]] \n\t"
|
---|
100 | "wunpckehub wr7, wr4 \n\t"
|
---|
101 | "wldrd wr1, [%[block], #8] \n\t"
|
---|
102 | "wunpckelub wr8, wr5 \n\t"
|
---|
103 | "wldrd wr2, [%[block], #16] \n\t"
|
---|
104 | "wunpckehub wr9, wr5 \n\t"
|
---|
105 | "wldrd wr3, [%[block], #24] \n\t"
|
---|
106 | "add %[block], %[block], #32 \n\t"
|
---|
107 | "waddhss wr10, wr0, wr6 \n\t"
|
---|
108 | "waddhss wr11, wr1, wr7 \n\t"
|
---|
109 | "waddhss wr12, wr2, wr8 \n\t"
|
---|
110 | "waddhss wr13, wr3, wr9 \n\t"
|
---|
111 | "wpackhus wr14, wr10, wr11 \n\t"
|
---|
112 | "wpackhus wr15, wr12, wr13 \n\t"
|
---|
113 | "wstrd wr14, [%[pixels]] \n\t"
|
---|
114 | "add %[pixels], %[pixels], %[line_size2] \n\t"
|
---|
115 | "subs r12, r12, #1 \n\t"
|
---|
116 | "wstrd wr15, [%[pixels2]] \n\t"
|
---|
117 | "add %[pixels2], %[pixels2], %[line_size2] \n\t"
|
---|
118 | "bne 1b \n\t"
|
---|
119 | : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
|
---|
120 | : [line_size2]"r"(line_size << 1)
|
---|
121 | : "cc", "memory", "r12");
|
---|
122 | }
|
---|
123 |
|
---|
/*
 * Do-nothing routine taking (block, pixels, line_size, h).
 * None of the arguments is read or written.
 */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
|
---|
128 |
|
---|
/* Multimedia extension flags; assigned by dsputil_init_iwmmxt(). */
int mm_flags;

/*
 * Return the set of multimedia extension flags detected at run time.
 * Detection is not implemented, so the result is always 0.
 */
int mm_support(void)
{
    /* TODO, implement proper detection */
    const int detected_flags = 0;

    return detected_flags;
}
|
---|
135 |
|
---|
136 | void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
|
---|
137 | {
|
---|
138 | mm_flags = mm_support();
|
---|
139 |
|
---|
140 | if (avctx->dsp_mask) {
|
---|
141 | if (avctx->dsp_mask & FF_MM_FORCE)
|
---|
142 | mm_flags |= (avctx->dsp_mask & 0xffff);
|
---|
143 | else
|
---|
144 | mm_flags &= ~(avctx->dsp_mask & 0xffff);
|
---|
145 | }
|
---|
146 |
|
---|
147 | if (!(mm_flags & MM_IWMMXT)) return;
|
---|
148 |
|
---|
149 | c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
|
---|
150 |
|
---|
151 | c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
|
---|
152 | c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
|
---|
153 | c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
|
---|
154 | c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
|
---|
155 | c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
|
---|
156 | c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
|
---|
157 | c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
|
---|
158 | c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
|
---|
159 |
|
---|
160 | c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
|
---|
161 | c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
|
---|
162 | c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
|
---|
163 | c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
|
---|
164 | c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
|
---|
165 | c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
|
---|
166 | c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
|
---|
167 | c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
|
---|
168 |
|
---|
169 | c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
|
---|
170 | c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
|
---|
171 | c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
|
---|
172 | c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
|
---|
173 | c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
|
---|
174 | c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
|
---|
175 | c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
|
---|
176 | c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
|
---|
177 |
|
---|
178 | c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
|
---|
179 | c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
|
---|
180 | c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
|
---|
181 | c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
|
---|
182 | c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
|
---|
183 | c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
|
---|
184 | c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
|
---|
185 | c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
|
---|
186 | }
|
---|