[AD] Clear MMX, 16-bit

[ Thread Index | Date Index | More lists.liballeg.org/allegro-developers Archives ]



Finally IT WORKS !!!!

Ok, here's the code; it's a replacement for src/i386/iblit16.s

If MMX is disabled, it reverts to the non-MMX clear. If the destination is a video bitmap, it writes through an %es segment override; otherwise it omits the override and saves the decode of the extra prefix. It also checks for alignment and copies pixels until alignment has been reached.

It's not as optimized as I would like it to be. I'll start working on that, but I need people to test this one. Also, is it possible for memory or system bitmaps not to be in %ds ? If so, I'll need to change this code.

My early tests show the MMX function can clear 190.1 MB/sec of RAM, whereas the non-MMX version can do 189 MB/sec. But the Allegro test program reports 3980 versus 3820 in favor of the MMX version. (This is under Windows.) Under DOS (which is for some reason slower than DOS-in-Win95), my test shows the memory clear speed to be the same (170 MB/sec), but the video RAM clear speed *DOUBLES* with MMX, to 86 MB/sec from 42.5 MB/sec!


/*         ______   ___    ___ 
 *        /\  _  \ /\_ \  /\_ \ 
 *        \ \ \L\ \\//\ \ \//\ \      __     __   _ __   ___ 
 *         \ \  __ \ \ \ \  \ \ \   /'__`\ /'_ `\/\`'__\/ __`\
 *          \ \ \/\ \ \_\ \_ \_\ \_/\  __//\ \L\ \ \ \//\ \L\ \
 *           \ \_\ \_\/\____\/\____\ \____\ \____ \ \_\\ \____/
 *            \/_/\/_/\/____/\/____/\/____/\/___L\ \/_/ \/___/
 *                                           /\____/
 *                                           \_/__/
 *
 *      16 bit bitmap blitting (written for speed, not readability :-)
 *
 *      By Shawn Hargreaves.
 *
 *      See readme.txt for copyright information.
 */


#include "asmdefs.inc"
#include "blit.inc"

#ifdef ALLEGRO_COLOR16

.text

/* void _linear_clear_to_color16(BITMAP *bitmap, int color);
 *  Fills the clip rectangle (BMP_CL..BMP_CR, BMP_CT..BMP_CB) of a linear
 *  16 bit bitmap with the specified color.
 *
 *  When built with ALLEGRO_MMX and cpu_mmx is non-zero, lines that are at
 *  least four pixels wide are filled eight bytes at a time with MMX stores;
 *  every other case goes through the plain stosw/stosl loop.
 *
 *  Register use in all loops:
 *     edx = bitmap, ebx = current line, esi = pixels per line,
 *     edi = destination address, ecx = pixel/quadword counter,
 *     eax = color duplicated into both 16 bit halves.
 */
FUNC(_linear_clear_to_color16)
   pushl %ebp
   movl %esp, %ebp
   pushl %ebx
   pushl %esi
   pushl %edi
   pushw %es

   movl ARG1, %edx               /* edx = bmp */
   movl BMP_CT(%edx), %ebx       /* line to start at */

   movw BMP_SEG(%edx), %es       /* select segment */

   movl BMP_CR(%edx), %esi       /* width to clear, in pixels */
   subl BMP_CL(%edx), %esi
   cld

#ifdef ALLEGRO_MMX               /* only use MMX if the compiler supports it */

   movl GLOBL(cpu_mmx), %eax     /* and if it's been enabled (or rather, not disabled :) */
   testl %eax, %eax
   jz clear_no_mmx

   cmpl $4, %esi                 /* empty or narrower than one quadword? */
   jl clear_no_mmx               /* the plain loop copes with that safely */

   movl ARG2, %eax               /* duplicate color twice */
   movl ARG2, %ecx
   shll $16, %eax
   andl $0xFFFF, %ecx
   orl %ecx, %eax

   pushl %eax                    /* keep it at (%esp): WRITE_BANK() returns in eax */

   /* NOTE(review): 0x80000000 is presumably the video bitmap flag in the
    * bitmap id field - confirm against the BITMAP definition.
    */
   testl $0x80000000, BMP_ID(%edx)
   jnz clearMMXseg_loop          /* video memory: use the %es: prefixed loop */

   _align_
clearMMX_loop:
   movl %ebx, %eax
   WRITE_BANK()                  /* select bank */
   movl BMP_CL(%edx), %edi
   leal (%eax, %edi, 2), %edi    /* get line address */

   movl (%esp), %eax             /* get the doubled color back */
   movl %esi, %ecx               /* pixels to write on this line (>= 4) */

   movd %eax, %mm0               /* rebuild mm0 in case it's been clobbered by WRITE_BANK() */
   movd %eax, %mm1
   psllq $32, %mm0
   por %mm1, %mm0                /* mm0 = color x 4 */

   testl $3, %edi                /* is destination aligned on 32-bit ? */
   jz clearMMX_aligned

   movw %ax, (%edi)              /* else write one pixel */
   addl $2, %edi
   decl %ecx                     /* at least 3 pixels remain */

clearMMX_aligned:
   movd %ecx, %mm2               /* save pixel count for the tail */
   shrl $2, %ecx                 /* 4 pixels per 8-byte store */
   jz clearMMX_finish_line       /* only 3 left: no full quadword */

clearMMX_continue_line:
   movq %mm0, (%edi)
   addl $8, %edi
   decl %ecx
   jnz clearMMX_continue_line

clearMMX_finish_line:
   movd %mm2, %ecx               /* low two bits = pixels still to write */

   testl $2, %ecx                /* a pair left over? */
   jz clearMMX_finish_line2
   movl %eax, (%edi)             /* write it first so the 32-bit store keeps edi's alignment */
   addl $4, %edi

clearMMX_finish_line2:
   testl $1, %ecx                /* a single pixel left over? */
   jz clearMMX_no_long
   movw %ax, (%edi)

clearMMX_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)
   jg clearMMX_loop              /* and loop */

   jmp clearMMX_done

   _align_
clearMMXseg_loop:
   movl %ebx, %eax
   WRITE_BANK()                  /* select bank */
   movl BMP_CL(%edx), %edi
   leal (%eax, %edi, 2), %edi    /* get line address */

   movl (%esp), %eax             /* get the doubled color back */
   movl %esi, %ecx               /* pixels to write on this line (>= 4) */

   movd %eax, %mm0               /* rebuild mm0 in case it's been clobbered by WRITE_BANK() */
   movd %eax, %mm1
   psllq $32, %mm0
   por %mm1, %mm0                /* mm0 = color x 4 */

   testl $3, %edi                /* is destination aligned on 32-bit ? */
   jz clearMMXseg_aligned

   movw %ax, %es:(%edi)          /* else write one pixel */
   addl $2, %edi
   decl %ecx                     /* at least 3 pixels remain */

clearMMXseg_aligned:
   movd %ecx, %mm2               /* save pixel count for the tail */
   shrl $2, %ecx                 /* 4 pixels per 8-byte store */
   jz clearMMXseg_finish_line    /* only 3 left: no full quadword */

clearMMXseg_continue_line:
   movq %mm0, %es:(%edi)
   addl $8, %edi
   decl %ecx
   jnz clearMMXseg_continue_line

clearMMXseg_finish_line:
   movd %mm2, %ecx               /* low two bits = pixels still to write */

   testl $2, %ecx                /* a pair left over? */
   jz clearMMXseg_finish_line2
   movl %eax, %es:(%edi)         /* write it first so the 32-bit store keeps edi's alignment */
   addl $4, %edi

clearMMXseg_finish_line2:
   testl $1, %ecx                /* a single pixel left over? */
   jz clearMMXseg_no_long
   movw %ax, %es:(%edi)

clearMMXseg_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)
   jg clearMMXseg_loop           /* and loop */

clearMMX_done:
   addl $4, %esp                 /* drop the saved color */

   emms                          /* clear FPU tag word */

   jmp clear_end

#endif /* ALLEGRO_MMX */

clear_no_mmx:                    /* no MMX, MMX disabled, or line too narrow */

   _align_
clear_loop:
   movl %ebx, %eax
   WRITE_BANK()                  /* select bank */
   movl BMP_CL(%edx), %edi
   leal (%eax, %edi, 2), %edi    /* get line address */

   movw ARG2, %ax                /* duplicate color twice */
   shll $16, %eax
   movw ARG2, %ax

   movl %esi, %ecx               /* width to clear */
   shrl $1, %ecx                 /* halve for 32 bit clear */
   jnc clear_no_word
   stosw                         /* clear an odd word (leaves the flags alone) */

clear_no_word:
   jz clear_no_long              /* ZF still comes from the shrl above */

   rep ; stosl                   /* clear the line */

clear_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)
   jg clear_loop                 /* and loop */

clear_end:

   UNWRITE_BANK()                /* edx still holds the bitmap here */

   popw %es
   popl %edi
   popl %esi
   popl %ebx
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_clear_to_color16() */



/* void _linear_blit16(BITMAP *source, BITMAP *dest, int source_x, source_y, 
 *                                     int dest_x, dest_y, int width, height);
 *  Normal forwards blitting routine for linear bitmaps.
 *
 *  The width argument is halved in place, and the flags left by that
 *  shift pick one of three BLIT_LOOP bodies:
 *     - result zero: copy a single word per line
 *     - no carry (even width): copy whole longs only
 *     - carry (odd width): copy longs, then one trailing word
 *  The per-line setup (addresses, counters, segments) is done by the
 *  BLIT_LOOP macro from blit.inc, which is not visible here.
 */
FUNC(_linear_blit16)
   pushl %ebp
   movl %esp, %ebp
   pushw %es 
   pushl %edi
   pushl %esi
   pushl %ebx

   movl B_DEST, %edx
   movw BMP_SEG(%edx), %es       /* load destination segment */
   movw %ds, %bx                 /* save data segment selector */
   cld                           /* for forward copy */

   shrl $1, B_WIDTH              /* halve counter for long copies */
   jz blit_only_one_word         /* halved count is zero */
   jnc blit_even_words           /* bit shifted out was clear: even width */

   _align_
   BLIT_LOOP(longs_and_word, 2,  /* long at a time, plus leftover word */
      rep ; movsl
      movsw
   )
   jmp blit_done

   _align_
blit_even_words: 
   BLIT_LOOP(even_words, 2,      /* copy a long at a time */
      rep ; movsl
   )
   jmp blit_done

   _align_
blit_only_one_word: 
   BLIT_LOOP(only_one_word, 2,   /* copy just the one word */
      movsw
   )

   _align_
blit_done:
   movl B_SOURCE, %edx           /* release both bitmaps */
   UNWRITE_BANK()

   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   popw %es
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_blit16() */




/* void _linear_blit_backward16(BITMAP *source, BITMAP *dest, int source_x, 
 *                      int source_y, int dest_x, dest_y, int width, height);
 *  Reverse blitting routine, for overlapping linear bitmaps.
 *
 *  The x and y arguments are first moved to the last pixel of the last
 *  line; each line is then copied a word at a time with the direction
 *  flag set, and the y values and height are counted down in place.
 */
FUNC(_linear_blit_backward16)
   pushl %ebp
   movl %esp, %ebp
   pushw %es 
   pushl %edi
   pushl %esi
   pushl %ebx

   movl B_HEIGHT, %eax           /* y values go from high to low */
   decl %eax
   addl %eax, B_SOURCE_Y         /* start at y + height - 1 */
   addl %eax, B_DEST_Y

   movl B_WIDTH, %eax            /* x values go from high to low */
   decl %eax
   addl %eax, B_SOURCE_X         /* start at x + width - 1 */
   addl %eax, B_DEST_X

   movl B_DEST, %edx
   movw BMP_SEG(%edx), %es       /* load destination segment */
   movw %ds, %bx                 /* save data segment selector */

   _align_
blit_backwards_loop:
   movl B_DEST, %edx             /* destination bitmap */
   movl B_DEST_Y, %eax           /* line number */
   WRITE_BANK()                  /* select bank */
   movl B_DEST_X, %edi           /* x offset */
   leal (%eax, %edi, 2), %edi    /* edi = line address + 2 * x */

   movl B_SOURCE, %edx           /* source bitmap */
   movl B_SOURCE_Y, %eax         /* line number */
   READ_BANK()                   /* select bank */
   movl B_SOURCE_X, %esi         /* x offset */
   leal (%eax, %esi, 2), %esi    /* esi = line address + 2 * x */

   movl B_WIDTH, %ecx            /* x loop counter */
   movw BMP_SEG(%edx), %ds       /* load data segment */
   std                           /* backwards */
   rep ; movsw                   /* copy the line */

   movw %bx, %ds                 /* restore data segment */
   decl B_SOURCE_Y
   decl B_DEST_Y
   decl B_HEIGHT
   jg blit_backwards_loop        /* and loop */

   cld                           /* finished */

   movl B_SOURCE, %edx           /* release both bitmaps */
   UNWRITE_BANK()

   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   popw %es
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_blit_backward16() */

/* _linear_blit16_end:
 *  Symbol placed after the two plain blit routines; it only returns.
 *  NOTE(review): presumably marks the end of that code range for callers
 *  that need its extent - confirm against the users of this symbol.
 */
FUNC(_linear_blit16_end)
   ret




/* void _linear_masked_blit16(BITMAP *source, *dest, int source_x, source_y, 
 *                            int dest_x, dest_y, int width, height);
 *  Masked blitting routine for linear bitmaps: source pixels equal to the
 *  mask color read from the bitmap vtable are skipped, all others are
 *  copied to the destination.
 *
 *  The per-line setup (esi, edi, ecx, edx) comes from the BLIT_LOOP macro
 *  in blit.inc, which is not visible here.
 */
FUNC(_linear_masked_blit16)
   pushl %ebp
   movl %esp, %ebp
   pushw %es 
   pushl %edi
   pushl %esi
   pushl %ebx

   movl B_DEST, %edx
   movw BMP_SEG(%edx), %es       /* load destination segment */
   movw %ds, %bx                 /* save data segment selector */
   cld 

   _align_
   BLIT_LOOP(masked, 2,

      /* NOTE(review): edx is presumably a bitmap pointer left by BLIT_LOOP */
      movl BMP_VTABLE(%edx), %edx
      movl VTABLE_MASK_COLOR(%edx), %edx

      _align_
   masked_blit_x_loop:
      movw (%esi), %ax           /* read a 16 bit pixel */
      addl $2, %esi

      cmpw %ax, %dx              /* is it the mask color? */
      je masked_blit_skip

      movw %ax, %es:(%edi)       /* write the pixel */
      addl $2, %edi
      decl %ecx
      jg masked_blit_x_loop
      jmp masked_blit_x_loop_done

      _align_
   masked_blit_skip:
      addl $2, %edi              /* masked pixel: step over the destination */
      decl %ecx
      jg masked_blit_x_loop

   masked_blit_x_loop_done:
   )

   /* Only the destination is released here, whereas _linear_blit16 also
    * releases the source. NOTE(review): confirm this is intentional.
    */
   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   popw %es
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_masked_blit16() */




#endif      /* ifdef ALLEGRO_COLOR16 */


 - Robert J Ohannessian


There is always one more bug.


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/