diff -pruN libfame-0.9.1.orig/src/dct_mmx.h libfame-0.9.1/src/dct_mmx.h
--- libfame-0.9.1.orig/src/dct_mmx.h	2008-08-02 23:18:19.164140803 +1000
+++ libfame-0.9.1/src/dct_mmx.h	2008-08-02 23:19:00.554142886 +1000
@@ -22,6 +22,9 @@
 
 #define precision
 
+extern FAME_ALIGNED short const _mmx_1[];
+extern FAME_ALIGNED short const _mmx_cos[];
+
 static void inline dct_aan_pass(dct_t *cache)
 {
   //  register unsigned short const *mmx_cos = _mmx_cos;
@@ -66,42 +69,42 @@ static void inline dct_aan_pass(dct_t *c
 #ifdef precision
       "psllw $0x01, %%mm5\n"                 /* precision(va0) += 1 bit */
 #endif
-    "paddw " ASMSYM "_mmx_1, %%mm4\n"   /* + 1 */
+    "paddw (%2), %%mm4\n"                    /* + 1 */
 //      "pmulhw 16(%1), %%mm5\n"               /* (v14+v16)*COS6 -> mm5 (va0) */
-      "pmulhw " ASMSYM "_mmx_cos+16, %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
+      "pmulhw 16(%3), %%mm5\n"               /* (v14+v16)*COS6 -> mm5 (va0) */
       ""                                     /* STEP 4 */
 #ifdef precision
       "psllw $0x02, %%mm6\n"                 /* precision(v22) += 1 bit */
 #else
       "psllw $0x01, %%mm6\n"                 /* */
 #endif
-    "paddw " ASMSYM "_mmx_1, %%mm4\n"   /* + 1 */
+    "paddw (%2), %%mm4\n"                    /* + 1 */
       //      "pmulhw  8(%1), %%mm6\n"               /* 2*v22*COS4/2 -> mm6 (v32)*/
-      "pmulhw " ASMSYM "_mmx_cos+8, %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
+      "pmulhw 8(%3), %%mm6\n"                /* 2*v22*COS4/2 -> mm6 (v32)*/
 #ifdef precision
       "psllw $0x02, %%mm2\n"                 /* precision(v15) += 1 bit */
 #else
       "psllw $0x01, %%mm2\n"                 /* */
 #endif
-    "paddw " ASMSYM "_mmx_1, %%mm4\n"   /* + 1 */
+    "paddw (%2), %%mm4\n"                    /* + 1 */
       //      "pmulhw  8(%1), %%mm2\n"               /* 2*v15*COS4/2 -> mm2 (v35) */
-      "pmulhw " ASMSYM "_mmx_cos+8, %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
+      "pmulhw 8(%3), %%mm2\n"                /* 2*v15*COS4/2 -> mm2 (v35) */
 #ifdef precision
       "psllw $0x02, %%mm4\n"                 /* precision(v14) += 1 bit */
 #else
       "psllw $0x01, %%mm4\n"                 /* */
 #endif
-    "paddw " ASMSYM "_mmx_1, %%mm4\n"   /* + 1 */
+    "paddw (%2), %%mm4\n"                    /* + 1 */
      //      "pmulhw  0(%1), %%mm4\n"               /* 2 * v14 * -COS2/2 -> mm4 */
-      "pmulhw " ASMSYM "_mmx_cos, %%mm4\n"   /* 2 * v14 * -COS2/2 -> mm4 */
+      "pmulhw (%3), %%mm4\n"                 /* 2 * v14 * -COS2/2 -> mm4 */
       "psubsw %%mm5, %%mm4\n"                /* v14*-COS2 - va0 -> mm4 (v34) */
 #ifdef precision
       "psllw $0x01, %%mm1\n"                 /* precision(v16) += 1 bit */
 #endif
       "psubsw %%mm1, %%mm5\n"                /* va0 - v16 -> mm5 */
-    "paddw " ASMSYM "_mmx_1, %%mm4\n"   /* + 1 */
+    "paddw (%2), %%mm4\n"                    /* + 1 */
       //      "pmulhw 24(%1), %%mm1\n"               /* v16 * (COS8 - 1) -> mm1 */
-      "pmulhw " ASMSYM "_mmx_cos+24, %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
+      "pmulhw 24(%3), %%mm1\n"               /* v16 * (COS8 - 1) -> mm1 */
       "psubsw %%mm5, %%mm1\n"                /* v16 * COS8 - va0 -> mm1 (v36)*/
       ""                                     /* STEP 5 */
       "movq 0x70(%0), %%mm0\n"               /* retrieve v07 -> mm0 */
@@ -138,8 +141,8 @@ static void inline dct_aan_pass(dct_t *c
       "movq %%mm0, 0x30(%0)\n"               /* store line 3 */
       "movq %%mm4, 0x50(%0)\n"               /* store line 5 */
       "movq %%mm2, 0x70(%0)\n"               /* store line 7 */
-      : "=r"(cache)/*, "=r"(mmx_cos)*/
-      : "0"(cache)/*, "1"(mmx_cos)*/
+      : "=r"(cache)
+      : "0"(cache), "r"(_mmx_1), "r"(_mmx_cos)
       : "memory");
 }
 
diff -pruN libfame-0.9.1.orig/src/dequantize_mmx.h libfame-0.9.1/src/dequantize_mmx.h
--- libfame-0.9.1.orig/src/dequantize_mmx.h	2008-08-02 23:18:19.164140803 +1000
+++ libfame-0.9.1/src/dequantize_mmx.h	2008-08-02 23:19:00.555141217 +1000
@@ -27,8 +27,8 @@
     "pmullw 0x" #x "8(%3), %%mm5\n"   /* premultiply for iDCT */	 \
     "psrlw $0x0b, %%mm4\n"            /* keep 5 bits */ \
     "psrlw $0x0b, %%mm5\n"            /* keep 5 bits */ \
-    "paddw " ASMSYM "_mmx_1, %%mm4\n"   /* + 1 */               \
-    "paddw " ASMSYM "_mmx_1, %%mm5\n"   /* + 1 */               \
+    "paddw (%8), %%mm4\n"             /* + 1 */               \
+    "paddw (%8), %%mm5\n"             /* + 1 */               \
     "psrlw $0x01, %%mm4\n"            /* keep 4 bits rounded */ \
     "psrlw $0x01, %%mm5\n"            /* keep 4 bits rounded */ \
     "psllw $0x04, %%mm0\n"            /* multiply by 16 for iDCT */	 \
@@ -107,7 +107,7 @@ static void inline dequantize_intra_glob
 		DEQUANTIZE_GLOBAL_MISMATCH_CONTROL()
 		DEQUANTIZE_PRESCALE_STEP(7)
 	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
-		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
 		: "memory");
 
   asm volatile("movd %%mm6, %0\n"           /* export mismatch */
@@ -160,8 +160,8 @@ static void inline dequantize_intra_loca
     "pcmpeqw %%mm7, %%mm3\n"          /* invert sign */                  \
     "paddw %%mm2, %%mm0\n"            /* sub 1 if >0 */                  \
     "paddw %%mm3, %%mm1\n"            /* sub 1 if >0 */                  \
-    "por " ASMSYM "_mmx_1, %%mm0\n"   /* or 1 */               \
-    "por " ASMSYM "_mmx_1, %%mm1\n"   /* or 1 */              \
+    "por (%8), %%mm0\n"               /* or 1 */               \
+    "por (%8), %%mm1\n"               /* or 1 */              \
     "pand %%mm4, %%mm0\n"             /* [0-3]=0 if [0-3] was zero */	 \
     "pand %%mm5, %%mm1\n"             /* [4-7]=0 if [4-7] was zero */
 
@@ -184,7 +184,7 @@ static void inline dequantize_intra_loca
 		DEQUANTIZE_INTRA_LOCAL_STEP(7)
 		DEQUANTIZE_PRESCALE_STEP(7)
 	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
-		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
 		: "memory");
 }
 
@@ -256,7 +256,7 @@ static void inline dequantize_inter_glob
 		/* resetting the accumulator when the block is coded intra */
 		DEQUANTIZE_PRESCALE_STEP(7) 
 	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
-		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
 		: "memory");
 
   asm volatile("movd %%mm6, %0\n"           /* export mismatch */
@@ -324,8 +324,8 @@ static void inline dequantize_inter_loca
     "pcmpeqw %%mm7, %%mm3\n"          /* invert sign */                  \
     "paddw %%mm2, %%mm0\n"            /* sub 1 if >0 */                  \
     "paddw %%mm3, %%mm1\n"            /* sub 1 if >0 */                  \
-    "por " ASMSYM "_mmx_1, %%mm0\n"   /* or 1 */               \
-    "por " ASMSYM "_mmx_1, %%mm1\n"   /* or 1 */               \
+    "por (%8), %%mm0\n"               /* or 1 */               \
+    "por (%8), %%mm1\n"               /* or 1 */               \
     "pand %%mm4, %%mm0\n"             /* [0-3]=0 if [0-3] was zero */	 \
     "pand %%mm5, %%mm1\n"             /* [4-7]=0 if [4-7] was zero */
 
@@ -348,6 +348,6 @@ static void inline dequantize_inter_loca
 		DEQUANTIZE_INTER_LOCAL_STEP(7)
 		DEQUANTIZE_PRESCALE_STEP(7)
 	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
-		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
 		: "memory");
 }
diff -pruN libfame-0.9.1.orig/src/fame_syntax_mpeg1.c libfame-0.9.1/src/fame_syntax_mpeg1.c
--- libfame-0.9.1.orig/src/fame_syntax_mpeg1.c	2008-08-02 23:18:19.167141250 +1000
+++ libfame-0.9.1/src/fame_syntax_mpeg1.c	2008-08-02 23:19:00.556140239 +1000
@@ -469,89 +469,6 @@ static void mpeg1_block_intra(fame_synta
   fast_bitbuffer_write(data, shift, table[v+255].code, table[v+255].length);
 
   /* encode AC coefficients */
-#if defined(HAS_BSWAP)
-  {
-    unsigned long dummy1, dummy2;
-
-    /* Note:
-        movsx mpeg1_table_clip+4096(, %%eax ,2), %%eax
-       has been replaced by
-        movw mpeg1_table_clip+4096(, %%eax ,2), %%ax
-        movsx %%ax, %%eax
-       because the first instruction failed on a PIII!! (wrong sign extension)
-       whereas it worked well on my P75 :)
-    */
-    /* Ok, a bit of explanations for a couple of tricks:
-         The DC value of block is already coded and stored in v so we can use it to store something.
-	 We add one index to the zigzag table so that after coding block[63] we go to index 0. There
-	 we need to escape the zero counting loop (1), what we ensure by putting a non-zero value in
-	 the DC coefficient. Then we can test for index == 0 to exit.
-	 Now this non-zero value is a bit special :)
-	 In order to have one more 'half' register, we store sp value (16 less significant bit of the
-	 32 bit register esp) *plus one* in the DC coefficient. Since the stack is aligned at an 
-	 address multiple of 4 bytes (at least), we are sure that sp != 0xffff and thus sp+1 will
-	 never be zero. We then retrieve sp at the end for it is needed by 'pop' instructions.
-     */
-    /* TODO : echange the role of edx and esp */
-    __asm__ __volatile__ ("pushl %%ebx\n"             /* save ebx            */
-			  "pushl %%ebp\n"             /* save stack pointer  */
-			  "inc %%sp\n"                /* make sure sp != 0   */
-			  "movw %%sp, (%%edx)\n"      /* store sp+1 in DC ;) */
-			  "movl %%esi, %%ebp\n"       /* ebp = vlc_table     */
-			  "xorl %%eax, %%eax\n"       /* eax = 0             */
-			  "movl $" ASMSYM "mpeg1_zigzag_table+1, %%esi\n" /*esi = zigzag*/
-			  "lea 1(%%esi), %%ebx\n"     /* ebx = zigzag_table+1*/
-			  "neg %%ebx\n"               /* ebx = -(esi+1)      */
-			  ".p2align 4,,7\n"           /* align for jump      */
-			  "0: xorw %%sp, %%sp\n"      /* sp = 0              */
-			  "1: movb (%%esi), %%al\n"   /* eax = index in block*/
-			  "incl %%esi\n"              /* (faster than lodsb) */
-			  "addw (%%edx, %%eax, 2), %%sp\n" /* sp = unzig     */
-			  "jz 1b\n"                   /* coeff == 0 then loop*/
-			  "orl %%eax, %%eax\n"        /* index == 0 then quit*/
-			  "jz 2f\n"                   /* (faster than jcxz)  */
-			  "movsx %%sp, %%eax\n"       /* extend sign         */
-			  "movw " ASMSYM "mpeg1_table_clip_data+4096(, %%eax ,2), %%ax\n" /*clip*/
-			  "movsx %%ax, %%eax\n"       /* extend sign         */
-			  "addl %%esi, %%ebx\n"       /* ebx = run           */
-			  "shll $7, %%eax\n"          /* eax *= 128(indexing)*/
-			  "lea (%%eax, %%ebx, 2), %%eax\n" /*eax = 2 * offset*/
-			  "lea (%%ebp, %%eax, 4), %%ebx\n" /* ebx = &vlc     */
-			  "movl (%%ebx), %%eax\n"     /* eax = code          */
-			  "addl 4(%%ebx), %%ecx\n"    /* ecx = shift+=length */
-			  "xorl %%ebx, %%ebx\n"       /* ebx = 0             */
-			  "shrd %%cl, %%eax, %%ebx\n" /* adjust code to fit  */
-			  "shr %%cl, %%eax\n"         /* adjust code to fit  */
-			  "bswap %%eax\n"      /* reverse byte order of code */
-			  "bswap %%ebx\n"      /* reverse byte order of code */
-			  "or %%eax, (%%edi)\n"       /* put first 32 bits   */
-			  "movl %%ecx, %%eax\n"       /* eax = shift + length*/
-			  "shrl $5, %%eax\n"          /* get dword increment */
-			  "andl $31, %%ecx\n"         /* mask shift          */
-			  "lea   (%%edi, %%eax, 4), %%edi\n"/* data+=(ecx>32)*/
-			  "orl %%ebx, (%%edi)\n"      /* put last 32 bits    */
-			  "xorl %%eax, %%eax\n"       /* eax = 0             */
-			  "lea 1(%%esi), %%ebx\n"     /* ebx = esi + 1 (last)*/
-			  "neg %%ebx\n"               /* ebx = -(esi + 1)    */
-			  "jmp 0b\n"                  /* loop                */
-			  "2:\n"
-			  "movw (%%edx), %%sp\n"      /* retrieve sp+1       */
-			  "dec %%sp\n"                /* restore esp */
-			  "popl %%ebp\n"              /* reload stack pointer*/
-			  "popl %%ebx\n"              /* reload ebx          */
-			  : "=c"(shift),
-			    "=a"(dummy1),
-			    "=d"(block),
-			    "=D"(data),
-			    "=S"(dummy2)
-			  : "d"(block),
-			    "c"(shift),
-			    "D"(data),
-                            "S"(syntax_mpeg1->vlc_table)
-			  : "memory");
-    block[0] = v; /* restore DC value */
-  }
-#else
   {
     short i;
     unsigned long last;
@@ -573,7 +490,6 @@ static void mpeg1_block_intra(fame_synta
       }
     }
   }
-#endif /* HAS_BSWAP */
 
   /* mark end of block */
   fast_bitbuffer_write(data, shift, 2, 2);
diff -pruN libfame-0.9.1.orig/src/half_mmx.h libfame-0.9.1/src/half_mmx.h
--- libfame-0.9.1.orig/src/half_mmx.h	2008-08-02 23:18:19.161140026 +1000
+++ libfame-0.9.1/src/half_mmx.h	2008-08-02 23:19:00.556140239 +1000
@@ -68,8 +68,8 @@ static void inline mmx_interpolate(unsig
 		"paddw %%mm5, %%mm6\n"     /* mm6 = ref00+ref10+ref11+1-r 4-7*/
 		"psrlw $1, %%mm4\n"        /* divide by 2 */
 		"psrlw $1, %%mm5\n"        /* divide by 2 */
-		"paddw " ASMSYM "_mmx_one, %%mm3\n"  /* add 1 */
-		"paddw " ASMSYM "_mmx_one, %%mm6\n"  /* add 1 */
+		"paddw (%8), %%mm3\n"      /* add 1 */
+		"paddw (%8), %%mm6\n"      /* add 1 */
 		"packuswb %%mm5, %%mm4\n"  /* pack to byte and saturate */
 		"movq 1(%3), %%mm1\n"      /* mm1 = [ref+1] */
 		"movq %%mm1, %%mm2\n"      /* mm2 = mm1 */
@@ -87,7 +87,7 @@ static void inline mmx_interpolate(unsig
 		"movl 12(%0), %3\n"        /* %3 = ref[3] */
 		"movq %%mm3, (%3)\n"       /* store in frame */
 		: "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
-		: "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
+		: "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one)
 		: "memory");
 }
 
diff -pruN libfame-0.9.1.orig/src/half_sse.h libfame-0.9.1/src/half_sse.h
--- libfame-0.9.1.orig/src/half_sse.h	2008-08-02 23:18:19.160143502 +1000
+++ libfame-0.9.1/src/half_sse.h	2008-08-02 23:19:00.557139019 +1000
@@ -71,8 +71,8 @@ static void inline mmx_interpolate_signe
 		"paddw %%mm5, %%mm6\n"     /* mm6 = ref00+ref10+ref11+1-r 4-7*/
 		"psrlw $1, %%mm4\n"        /* divide by 2 */
 		"psrlw $1, %%mm5\n"        /* divide by 2 */
-		"paddw " ASMSYM "_mmx_one, %%mm3\n"  /* add 1 */
-		"paddw " ASMSYM "_mmx_one, %%mm6\n"  /* add 1 */
+		"paddw (%8), %%mm3\n"      /* add 1 */
+		"paddw (%8), %%mm6\n"      /* add 1 */
 		"packuswb %%mm5, %%mm4\n"  /* pack to byte and saturate */
 		"movq 1(%3), %%mm1\n"      /* mm1 = [ref+1] */
 		"movq %%mm1, %%mm2\n"      /* mm2 = mm1 */
@@ -90,7 +90,7 @@ static void inline mmx_interpolate_signe
 		"movl 12(%0), %3\n"        /* %3 = ref[3] */
 		"movq %%mm3, (%3)\n"       /* store in frame */
 		: "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
-		: "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
+		: "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one)
 		: "memory");
 }
 
diff -pruN libfame-0.9.1.orig/src/idct_mmx.h libfame-0.9.1/src/idct_mmx.h
--- libfame-0.9.1.orig/src/idct_mmx.h	2008-08-02 23:18:19.160143502 +1000
+++ libfame-0.9.1/src/idct_mmx.h	2008-08-02 23:19:00.557139019 +1000
@@ -18,6 +18,10 @@
 */
 /*************************** MMX accelerated iDCT ****************************/
 
+extern FAME_ALIGNED short const _mmx_1[];
+extern FAME_ALIGNED short const _mmx_cos[];
+extern FAME_ALIGNED short const _mmx_icos[];
+
 static void inline idct_aan_pass(dct_t * block)
 {
   //  register unsigned short const *mmx_icos = _mmx_icos;
@@ -65,9 +69,9 @@ static void inline idct_aan_pass(dct_t *
 	block[row*8+6] = v45;           - v71, v11, v44, v65, v24 -
       */
       "psllw $0x02, %%mm5\n"               /* adjust v22 for multiply      */
-      "paddw " ASMSYM "_mmx_1, %%mm5\n"   /* + 1 for rounding */
+      "paddw (%2), %%mm5\n"                /* + 1 for rounding */
       //      "pmulhw 8(%1), %%mm5\n"              /* 4*v15*ICOS4/4 -> mm5 (v23)   */
-      "pmulhw " ASMSYM "_mmx_icos+8, %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/
+      "pmulhw 8(%3), %%mm5\n"              /* 4*v15*ICOS4/4 -> mm5 (v23)*/
       "psubsw %%mm4, %%mm5\n"              /* v23 - v62 -> mm5 (v24)       */
       "movq %%mm3, %%mm6\n"                /* v44 -> mm6                   */
       "paddsw %%mm5, %%mm6\n"              /* v44 + v24 -> mm6 (v45)       */
@@ -125,25 +129,25 @@ static void inline idct_aan_pass(dct_t *
 	block[row*8+4] += v55;          - -
       */
       "psllw $0x02, %%mm0\n"               /* adjust v12 for multiply      */
-      "paddw " ASMSYM "_mmx_1, %%mm0\n"   /* + 1 for rounding */
+      "paddw (%2), %%mm0\n"                /* + 1 for rounding */
      //      "pmulhw 8(%1), %%mm0\n"              /* 4*v12*ICOS4/4 -> mm0 (v13)   */
-      "pmulhw " ASMSYM "_mmx_icos+8, %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13)  */
+      "pmulhw 8(%3), %%mm0\n"              /* 4*v12*ICOS4/4 -> mm0 (v13)  */
       "movq   %%mm2, %%mm6\n"              /* v51 -> mm6                   */
       "psubsw %%mm1, %%mm6\n"              /* v51 - v71 -> mm6 (va2)       */
       "psllw $0x03, %%mm2\n"               /* adjust v51 for multiply      */
-      "paddw " ASMSYM "_mmx_1, %%mm2\n"   /* + 1 for rounding */
+      "paddw (%2), %%mm2\n"                /* + 1 for rounding */
       /* should add another one here but it seems to look better without */
      //      "pmulhw 16(%1), %%mm2\n"             /* 8*v51*ICOS6/8 -> mm2 (v53)   */
-      "pmulhw " ASMSYM "_mmx_icos+16, %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
+      "pmulhw 16(%3), %%mm2\n"             /* 8*v51*ICOS6/8 -> mm2 (v53) */
       "psllw $0x02, %%mm1\n"               /* adjust v71 for multiply      */
-      "paddw " ASMSYM "_mmx_1, %%mm1\n"   /* + 1 for rounding */
+      "paddw (%2), %%mm1\n"                /* + 1 for rounding */
       /* should add another one here but it seems to look better without */
      //      "pmulhw 0(%1), %%mm1\n"              /* 4*v71*ICOS2/4 -> mm1 (v73)   */
-      "pmulhw " ASMSYM "_mmx_icos, %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73)   */
+      "pmulhw (%3), %%mm1\n"               /* 4*v71*ICOS2/4 -> mm1 (v73)   */
       "psllw $0x01, %%mm6\n"               /* adjust va2 for multiply      */
-      "paddw " ASMSYM "_mmx_1, %%mm6\n"   /* + 1 for rounding */
+      "paddw (%2), %%mm6\n"                /* + 1 for rounding */
      //      "pmulhw 24(%1), %%mm6\n"             /* 2*v12*ICOS8/2 -> mm6 (va3)   */
-      "pmulhw " ASMSYM "_mmx_icos+24, %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
+      "pmulhw 24(%3), %%mm6\n"             /* 2*v12*ICOS8/2 -> mm6 (va3) */
       "psubsw %%mm6, %%mm2\n"              /* v53 - va3 -> mm2 (v54)       */
       "psubsw %%mm6, %%mm1\n"              /* v73 - va3 -> mm1 (v74)       */
       "psubsw %%mm3, %%mm1\n"              /* v74 - v32 -> mm3 (v75)       */
@@ -167,8 +171,8 @@ static void inline idct_aan_pass(dct_t *
       "paddsw %%mm0, %%mm7\n"              /* v65 + v55 -> mm7             */
       "movq %%mm6, 0x30(%0)\n"             /* mm6 -> line 3                */
       "movq %%mm7, 0x40(%0)\n"             /* mm7 -> line 4                */
-      : "=r"(block)/*, "=r"(mmx_icos)*/
-      : "0"(block)/*, "1"(mmx_icos)*/
+      : "=r"(block)
+      : "0"(block), "r"(_mmx_1), "r"(_mmx_icos)
       : "memory");
 }
 
