1 diff -pruN libfame-0.9.1.orig/src/dct_mmx.h libfame-0.9.1/src/dct_mmx.h
2 --- libfame-0.9.1.orig/src/dct_mmx.h 2008-08-02 23:18:19.164140803 +1000
3 +++ libfame-0.9.1/src/dct_mmx.h 2008-08-02 23:19:00.554142886 +1000
4 @@ -22,6 +22,9 @@
5
6 #define precision
7
8 +extern FAME_ALIGNED short const _mmx_1[]; /* rounding table; handed to the dct_aan_pass asm as operand %2 and read via (%2) */
9 +extern FAME_ALIGNED short const _mmx_cos[]; /* multiplier table; handed to the dct_aan_pass asm as operand %3 and read at byte offsets 0, 8, 16 and 24 */
10 +
11 static void inline dct_aan_pass(dct_t *cache)
12 {
13 // register unsigned short const *mmx_cos = _mmx_cos;
14 @@ -66,42 +69,42 @@ static void inline dct_aan_pass(dct_t *c
15 #ifdef precision
16 "psllw $0x01, %%mm5\n" /* precision(va0) += 1 bit */
17 #endif
18 - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
19 + "paddw (%2), %%mm4\n" /* + 1 */
20 // "pmulhw 16(%1), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
21 - "pmulhw " ASMSYM "_mmx_cos+16, %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
22 + "pmulhw 16(%3), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
23 "" /* STEP 4 */
24 #ifdef precision
25 "psllw $0x02, %%mm6\n" /* precision(v22) += 1 bit */
26 #else
27 "psllw $0x01, %%mm6\n" /* */
28 #endif
29 - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
30 + "paddw (%2), %%mm4\n" /* + 1 */
31 // "pmulhw 8(%1), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
32 - "pmulhw " ASMSYM "_mmx_cos+8, %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
33 + "pmulhw 8(%3), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
34 #ifdef precision
35 "psllw $0x02, %%mm2\n" /* precision(v15) += 1 bit */
36 #else
37 "psllw $0x01, %%mm2\n" /* */
38 #endif
39 - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
40 + "paddw (%2), %%mm4\n" /* + 1 */
41 // "pmulhw 8(%1), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
42 - "pmulhw " ASMSYM "_mmx_cos+8, %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
43 + "pmulhw 8(%3), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
44 #ifdef precision
45 "psllw $0x02, %%mm4\n" /* precision(v14) += 1 bit */
46 #else
47 "psllw $0x01, %%mm4\n" /* */
48 #endif
49 - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
50 + "paddw (%2), %%mm4\n" /* + 1 */
51 // "pmulhw 0(%1), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */
52 - "pmulhw " ASMSYM "_mmx_cos, %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */
53 + "pmulhw (%3), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */
54 "psubsw %%mm5, %%mm4\n" /* v14*-COS2 - va0 -> mm4 (v34) */
55 #ifdef precision
56 "psllw $0x01, %%mm1\n" /* precision(v16) += 1 bit */
57 #endif
58 "psubsw %%mm1, %%mm5\n" /* va0 - v16 -> mm5 */
59 - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
60 + "paddw (%2), %%mm4\n" /* + 1 */
61 // "pmulhw 24(%1), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
62 - "pmulhw " ASMSYM "_mmx_cos+24, %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
63 + "pmulhw 24(%3), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
64 "psubsw %%mm5, %%mm1\n" /* v16 * COS8 - va0 -> mm1 (v36)*/
65 "" /* STEP 5 */
66 "movq 0x70(%0), %%mm0\n" /* retrieve v07 -> mm0 */
67 @@ -138,8 +141,8 @@ static void inline dct_aan_pass(dct_t *c
68 "movq %%mm0, 0x30(%0)\n" /* store line 3 */
69 "movq %%mm4, 0x50(%0)\n" /* store line 5 */
70 "movq %%mm2, 0x70(%0)\n" /* store line 7 */
71 - : "=r"(cache)/*, "=r"(mmx_cos)*/
72 - : "0"(cache)/*, "1"(mmx_cos)*/
73 + : "=r"(cache)
74 + : "0"(cache), "r"(_mmx_1), "r"(_mmx_cos)
75 : "memory");
76 }
77
78 diff -pruN libfame-0.9.1.orig/src/dequantize_mmx.h libfame-0.9.1/src/dequantize_mmx.h
79 --- libfame-0.9.1.orig/src/dequantize_mmx.h 2008-08-02 23:18:19.164140803 +1000
80 +++ libfame-0.9.1/src/dequantize_mmx.h 2008-08-02 23:19:00.555141217 +1000
81 @@ -27,8 +27,8 @@
82 "pmullw 0x" #x "8(%3), %%mm5\n" /* premultiply for iDCT */ \
83 "psrlw $0x0b, %%mm4\n" /* keep 5 bits */ \
84 "psrlw $0x0b, %%mm5\n" /* keep 5 bits */ \
85 - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ \
86 - "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 */ \
87 + "paddw (%8), %%mm4\n" /* + 1 */ \
88 + "paddw (%8), %%mm5\n" /* + 1 */ \
89 "psrlw $0x01, %%mm4\n" /* keep 4 bits rounded */ \
90 "psrlw $0x01, %%mm5\n" /* keep 4 bits rounded */ \
91 "psllw $0x04, %%mm0\n" /* multiply by 16 for iDCT */ \
92 @@ -107,7 +107,7 @@ static void inline dequantize_intra_glob
93 DEQUANTIZE_GLOBAL_MISMATCH_CONTROL()
94 DEQUANTIZE_PRESCALE_STEP(7)
95 : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
96 - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
97 + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
98 : "memory");
99
100 asm volatile("movd %%mm6, %0\n" /* export mismatch */
101 @@ -160,8 +160,8 @@ static void inline dequantize_intra_loca
102 "pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \
103 "paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \
104 "paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \
105 - "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \
106 - "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \
107 + "por (%8), %%mm0\n" /* or 1 */ \
108 + "por (%8), %%mm1\n" /* or 1 */ \
109 "pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \
110 "pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */
111
112 @@ -184,7 +184,7 @@ static void inline dequantize_intra_loca
113 DEQUANTIZE_INTRA_LOCAL_STEP(7)
114 DEQUANTIZE_PRESCALE_STEP(7)
115 : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
116 - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
117 + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
118 : "memory");
119 }
120
121 @@ -256,7 +256,7 @@ static void inline dequantize_inter_glob
122 /* resetting the accumulator when the block is coded intra */
123 DEQUANTIZE_PRESCALE_STEP(7)
124 : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
125 - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
126 + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
127 : "memory");
128
129 asm volatile("movd %%mm6, %0\n" /* export mismatch */
130 @@ -324,8 +324,8 @@ static void inline dequantize_inter_loca
131 "pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \
132 "paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \
133 "paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \
134 - "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \
135 - "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \
136 + "por (%8), %%mm0\n" /* or 1 */ \
137 + "por (%8), %%mm1\n" /* or 1 */ \
138 "pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \
139 "pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */
140
141 @@ -348,6 +348,6 @@ static void inline dequantize_inter_loca
142 DEQUANTIZE_INTER_LOCAL_STEP(7)
143 DEQUANTIZE_PRESCALE_STEP(7)
144 : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
145 - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
146 + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
147 : "memory");
148 }
149 diff -pruN libfame-0.9.1.orig/src/fame_syntax_mpeg1.c libfame-0.9.1/src/fame_syntax_mpeg1.c
150 --- libfame-0.9.1.orig/src/fame_syntax_mpeg1.c 2008-08-02 23:18:19.167141250 +1000
151 +++ libfame-0.9.1/src/fame_syntax_mpeg1.c 2008-08-02 23:19:00.556140239 +1000
152 @@ -469,89 +469,6 @@ static void mpeg1_block_intra(fame_synta
153 fast_bitbuffer_write(data, shift, table[v+255].code, table[v+255].length);
154
155 /* encode AC coefficients */
156 -#if defined(HAS_BSWAP)
157 - {
158 - unsigned long dummy1, dummy2;
159 -
160 - /* Note:
161 - movsx mpeg1_table_clip+4096(, %%eax ,2), %%eax
162 - has been replaced by
163 - movw mpeg1_table_clip+4096(, %%eax ,2), %%ax
164 - movsx %%ax, %%eax
165 - because the first instruction failed on a PIII!! (wrong sign extension)
166 - whereas it worked well on my P75 :)
167 - */
168 - /* Ok, a bit of explanations for a couple of tricks:
169 - The DC value of block is already coded and stored in v so we can use it to store something.
170 - We add one index to the zigzag table so that after coding block[63] we go to index 0. There
171 - we need to escape the zero counting loop (1), what we ensure by putting a non-zero value in
172 - the DC coefficient. Then we can test for index == 0 to exit.
173 - Now this non-zero value is a bit special :)
174 - In order to have one more 'half' register, we store sp value (16 less significant bit of the
175 - 32 bit register esp) *plus one* in the DC coefficient. Since the stack is aligned at an
176 - address multiple of 4 bytes (at least), we are sure that sp != 0xffff and thus sp+1 will
177 - never be zero. We then retrieve sp at the end for it is needed by 'pop' instructions.
178 - */
179 - /* TODO : echange the role of edx and esp */
180 - __asm__ __volatile__ ("pushl %%ebx\n" /* save ebx */
181 - "pushl %%ebp\n" /* save stack pointer */
182 - "inc %%sp\n" /* make sure sp != 0 */
183 - "movw %%sp, (%%edx)\n" /* store sp+1 in DC ;) */
184 - "movl %%esi, %%ebp\n" /* ebp = vlc_table */
185 - "xorl %%eax, %%eax\n" /* eax = 0 */
186 - "movl $" ASMSYM "mpeg1_zigzag_table+1, %%esi\n" /*esi = zigzag*/
187 - "lea 1(%%esi), %%ebx\n" /* ebx = zigzag_table+1*/
188 - "neg %%ebx\n" /* ebx = -(esi+1) */
189 - ".p2align 4,,7\n" /* align for jump */
190 - "0: xorw %%sp, %%sp\n" /* sp = 0 */
191 - "1: movb (%%esi), %%al\n" /* eax = index in block*/
192 - "incl %%esi\n" /* (faster than lodsb) */
193 - "addw (%%edx, %%eax, 2), %%sp\n" /* sp = unzig */
194 - "jz 1b\n" /* coeff == 0 then loop*/
195 - "orl %%eax, %%eax\n" /* index == 0 then quit*/
196 - "jz 2f\n" /* (faster than jcxz) */
197 - "movsx %%sp, %%eax\n" /* extend sign */
198 - "movw " ASMSYM "mpeg1_table_clip_data+4096(, %%eax ,2), %%ax\n" /*clip*/
199 - "movsx %%ax, %%eax\n" /* extend sign */
200 - "addl %%esi, %%ebx\n" /* ebx = run */
201 - "shll $7, %%eax\n" /* eax *= 128(indexing)*/
202 - "lea (%%eax, %%ebx, 2), %%eax\n" /*eax = 2 * offset*/
203 - "lea (%%ebp, %%eax, 4), %%ebx\n" /* ebx = &vlc */
204 - "movl (%%ebx), %%eax\n" /* eax = code */
205 - "addl 4(%%ebx), %%ecx\n" /* ecx = shift+=length */
206 - "xorl %%ebx, %%ebx\n" /* ebx = 0 */
207 - "shrd %%cl, %%eax, %%ebx\n" /* adjust code to fit */
208 - "shr %%cl, %%eax\n" /* adjust code to fit */
209 - "bswap %%eax\n" /* reverse byte order of code */
210 - "bswap %%ebx\n" /* reverse byte order of code */
211 - "or %%eax, (%%edi)\n" /* put first 32 bits */
212 - "movl %%ecx, %%eax\n" /* eax = shift + length*/
213 - "shrl $5, %%eax\n" /* get dword increment */
214 - "andl $31, %%ecx\n" /* mask shift */
215 - "lea (%%edi, %%eax, 4), %%edi\n"/* data+=(ecx>32)*/
216 - "orl %%ebx, (%%edi)\n" /* put last 32 bits */
217 - "xorl %%eax, %%eax\n" /* eax = 0 */
218 - "lea 1(%%esi), %%ebx\n" /* ebx = esi + 1 (last)*/
219 - "neg %%ebx\n" /* ebx = -(esi + 1) */
220 - "jmp 0b\n" /* loop */
221 - "2:\n"
222 - "movw (%%edx), %%sp\n" /* retrieve sp+1 */
223 - "dec %%sp\n" /* restore esp */
224 - "popl %%ebp\n" /* reload stack pointer*/
225 - "popl %%ebx\n" /* reload ebx */
226 - : "=c"(shift),
227 - "=a"(dummy1),
228 - "=d"(block),
229 - "=D"(data),
230 - "=S"(dummy2)
231 - : "d"(block),
232 - "c"(shift),
233 - "D"(data),
234 - "S"(syntax_mpeg1->vlc_table)
235 - : "memory");
236 - block[0] = v; /* restore DC value */
237 - }
238 -#else
239 {
240 short i;
241 unsigned long last;
242 @@ -573,7 +490,6 @@ static void mpeg1_block_intra(fame_synta
243 }
244 }
245 }
246 -#endif /* HAS_BSWAP */
247
248 /* mark end of block */
249 fast_bitbuffer_write(data, shift, 2, 2);
250 diff -pruN libfame-0.9.1.orig/src/half_mmx.h libfame-0.9.1/src/half_mmx.h
251 --- libfame-0.9.1.orig/src/half_mmx.h 2008-08-02 23:18:19.161140026 +1000
252 +++ libfame-0.9.1/src/half_mmx.h 2008-08-02 23:19:00.556140239 +1000
253 @@ -68,8 +68,8 @@ static void inline mmx_interpolate(unsig
254 "paddw %%mm5, %%mm6\n" /* mm6 = ref00+ref10+ref11+1-r 4-7*/
255 "psrlw $1, %%mm4\n" /* divide by 2 */
256 "psrlw $1, %%mm5\n" /* divide by 2 */
257 - "paddw " ASMSYM "_mmx_one, %%mm3\n" /* add 1 */
258 - "paddw " ASMSYM "_mmx_one, %%mm6\n" /* add 1 */
259 + "paddw (%8), %%mm3\n" /* add 1 */
260 + "paddw (%8), %%mm6\n" /* add 1 */
261 "packuswb %%mm5, %%mm4\n" /* pack to byte and saturate */
262 "movq 1(%3), %%mm1\n" /* mm1 = [ref+1] */
263 "movq %%mm1, %%mm2\n" /* mm2 = mm1 */
264 @@ -87,7 +87,7 @@ static void inline mmx_interpolate(unsig
265 "movl 12(%0), %3\n" /* %3 = ref[3] */
266 "movq %%mm3, (%3)\n" /* store in frame */
267 : "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
268 - : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
269 + : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one)
270 : "memory");
271 }
272
273 diff -pruN libfame-0.9.1.orig/src/half_sse.h libfame-0.9.1/src/half_sse.h
274 --- libfame-0.9.1.orig/src/half_sse.h 2008-08-02 23:18:19.160143502 +1000
275 +++ libfame-0.9.1/src/half_sse.h 2008-08-02 23:19:00.557139019 +1000
276 @@ -71,8 +71,8 @@ static void inline mmx_interpolate_signe
277 "paddw %%mm5, %%mm6\n" /* mm6 = ref00+ref10+ref11+1-r 4-7*/
278 "psrlw $1, %%mm4\n" /* divide by 2 */
279 "psrlw $1, %%mm5\n" /* divide by 2 */
280 - "paddw " ASMSYM "_mmx_one, %%mm3\n" /* add 1 */
281 - "paddw " ASMSYM "_mmx_one, %%mm6\n" /* add 1 */
282 + "paddw (%8), %%mm3\n" /* add 1 */
283 + "paddw (%8), %%mm6\n" /* add 1 */
284 "packuswb %%mm5, %%mm4\n" /* pack to byte and saturate */
285 "movq 1(%3), %%mm1\n" /* mm1 = [ref+1] */
286 "movq %%mm1, %%mm2\n" /* mm2 = mm1 */
287 @@ -90,7 +90,7 @@ static void inline mmx_interpolate_signe
288 "movl 12(%0), %3\n" /* %3 = ref[3] */
289 "movq %%mm3, (%3)\n" /* store in frame */
290 : "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
291 - : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
292 + : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one)
293 : "memory");
294 }
295
296 diff -pruN libfame-0.9.1.orig/src/idct_mmx.h libfame-0.9.1/src/idct_mmx.h
297 --- libfame-0.9.1.orig/src/idct_mmx.h 2008-08-02 23:18:19.160143502 +1000
298 +++ libfame-0.9.1/src/idct_mmx.h 2008-08-02 23:19:00.557139019 +1000
299 @@ -18,6 +18,10 @@
300 */
301 /*************************** MMX accelerated iDCT ****************************/
302
303 +extern FAME_ALIGNED short const _mmx_1[]; /* rounding table; handed to the idct_aan_pass asm as operand %2 and read via (%2) */
304 +extern FAME_ALIGNED short const _mmx_cos[]; /* NOTE(review): not referenced by any idct_mmx.h hunk visible in this patch - confirm it is needed here */
305 +extern FAME_ALIGNED short const _mmx_icos[]; /* multiplier table; handed to the idct_aan_pass asm as operand %3 and read at byte offsets 0, 8, 16 and 24 */
306 +
307 static void inline idct_aan_pass(dct_t * block)
308 {
309 // register unsigned short const *mmx_icos = _mmx_icos;
310 @@ -65,9 +69,9 @@ static void inline idct_aan_pass(dct_t *
311 block[row*8+6] = v45; - v71, v11, v44, v65, v24 -
312 */
313 "psllw $0x02, %%mm5\n" /* adjust v22 for multiply */
314 - "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 for rounding */
315 + "paddw (%2), %%mm5\n" /* + 1 for rounding */
316 // "pmulhw 8(%1), %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23) */
317 - "pmulhw " ASMSYM "_mmx_icos+8, %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/
318 + "pmulhw 8(%3), %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/
319 "psubsw %%mm4, %%mm5\n" /* v23 - v62 -> mm5 (v24) */
320 "movq %%mm3, %%mm6\n" /* v44 -> mm6 */
321 "paddsw %%mm5, %%mm6\n" /* v44 + v24 -> mm6 (v45) */
322 @@ -125,25 +129,25 @@ static void inline idct_aan_pass(dct_t *
323 block[row*8+4] += v55; - -
324 */
325 "psllw $0x02, %%mm0\n" /* adjust v12 for multiply */
326 - "paddw " ASMSYM "_mmx_1, %%mm0\n" /* + 1 for rounding */
327 + "paddw (%2), %%mm0\n" /* + 1 for rounding */
328 // "pmulhw 8(%1), %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
329 - "pmulhw " ASMSYM "_mmx_icos+8, %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
330 + "pmulhw 8(%3), %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
331 "movq %%mm2, %%mm6\n" /* v51 -> mm6 */
332 "psubsw %%mm1, %%mm6\n" /* v51 - v71 -> mm6 (va2) */
333 "psllw $0x03, %%mm2\n" /* adjust v51 for multiply */
334 - "paddw " ASMSYM "_mmx_1, %%mm2\n" /* + 1 for rounding */
335 + "paddw (%2), %%mm2\n" /* + 1 for rounding */
336 /* should add another one here but it seems to look better without */
337 // "pmulhw 16(%1), %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
338 - "pmulhw " ASMSYM "_mmx_icos+16, %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
339 + "pmulhw 16(%3), %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
340 "psllw $0x02, %%mm1\n" /* adjust v71 for multiply */
341 - "paddw " ASMSYM "_mmx_1, %%mm1\n" /* + 1 for rounding */
342 + "paddw (%2), %%mm1\n" /* + 1 for rounding */
343 /* should add another one here but it seems to look better without */
344 // "pmulhw 0(%1), %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */
345 - "pmulhw " ASMSYM "_mmx_icos, %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */
346 + "pmulhw (%3), %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */
347 "psllw $0x01, %%mm6\n" /* adjust va2 for multiply */
348 - "paddw " ASMSYM "_mmx_1, %%mm6\n" /* + 1 for rounding */
349 + "paddw (%2), %%mm6\n" /* + 1 for rounding */
350 // "pmulhw 24(%1), %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
351 - "pmulhw " ASMSYM "_mmx_icos+24, %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
352 + "pmulhw 24(%3), %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
353 "psubsw %%mm6, %%mm2\n" /* v53 - va3 -> mm2 (v54) */
354 "psubsw %%mm6, %%mm1\n" /* v73 - va3 -> mm1 (v74) */
355 "psubsw %%mm3, %%mm1\n" /* v74 - v32 -> mm3 (v75) */
356 @@ -167,8 +171,8 @@ static void inline idct_aan_pass(dct_t *
357 "paddsw %%mm0, %%mm7\n" /* v65 + v55 -> mm7 */
358 "movq %%mm6, 0x30(%0)\n" /* mm6 -> line 3 */
359 "movq %%mm7, 0x40(%0)\n" /* mm7 -> line 4 */
360 - : "=r"(block)/*, "=r"(mmx_icos)*/
361 - : "0"(block)/*, "1"(mmx_icos)*/
362 + : "=r"(block)
363 + : "0"(block), "r"(_mmx_1), "r"(_mmx_icos)
364 : "memory");
365 }
366
|