Re: [AD] Bug in Allegro's color convertors? |
[ Thread Index | Date Index | More lists.liballeg.org/allegro-developers Archives ]
Eric Botcazou wrote:
[snip]
I also permuted the MMX registers so that they match the pixel index numbers.
Could you take a look and re-schedule the MMX instructions if needed? You
know much more about the topic than I do.
Done. I made some changes to the MMX code to get better timing on the i686.
I shaved another 2 cycles off it. I wasn't able to do anything for the
non-MMX code, however; it's already very tight given the Pentium pairing rules.
[snip]
- non-MMX code on a Pentium Classic/200:
Comparing test profile logs 32to24_noMMX.log and 32to24-2_noMMX.log
What's surprising is the lack of consistency between the various functions.
For example, circlefill is slower, but hline has the same speed; the same goes
for putpixel vs. circle. This is probably due to random noise in the system
(Windows). I wouldn't worry too much about it, especially since this is the
worst combination of color depths, speed-wise.
[snip]
--
- Robert J Ohannessian
"Microsoft code is probably O(n^20)" (my CS prof)
http://pages.infinit.net/voidstar/
Index: icolconv.s
===================================================================
RCS file: /cvsroot/alleg/allegro/src/misc/icolconv.s,v
retrieving revision 1.23
diff -u -b -r1.23 icolconv.s
--- icolconv.s 14 Jan 2002 11:05:27 -0000 1.23
+++ icolconv.s 25 Apr 2002 23:17:32 -0000
@@ -1250,6 +1250,10 @@
pushl %edi
/* init register values */
+ movl $0xFFFFFF, %eax /* get RGB mask */
+ movd %eax, %mm5 /* low RGB mask in mm5 */
+ movd %eax, %mm6
+ psllq $32, %mm6 /* high RGB mask in mm6 */
movl ARG1, %eax /* eax = src_rect */
movl GFXRECT_WIDTH(%eax), %ecx /* ecx = src_rect->width */
@@ -1287,29 +1291,37 @@
_align_
next_block_32_to_24:
- movq (%esi), %mm0 /* mm0 = [.RGB1][.RGB0] */
- movq 8(%esi), %mm1 /* mm1 = [.RGB3][.RGB2] */
- movq %mm0, %mm2
- movq %mm1, %mm3
- movq %mm1, %mm4
- psllq $48, %mm3
- psllq $40, %mm0
- psrlq $32, %mm2
- psrlq $40, %mm0
- psllq $24, %mm2
- por %mm3, %mm0
- por %mm2, %mm0
- psllq $8, %mm4
- psllq $40, %mm1
- psrlq $32, %mm4
- psrlq $56, %mm1
- por %mm4, %mm1
- movq %mm0, (%edi)
- movd %mm1, 8(%edi)
- addl $16, %esi
+ /* i686: 14 cycles/4 pixels, i586: 11 cycles/4 pixels */
+ movq (%esi), %mm0 /* mm0 = [ARGBARGB](1)(0) */
addl $12, %edi
+ movq 8(%esi), %mm2 /* mm2 = [ARGBARGB](3)(2) */
+ addl $16, %esi
+
+ movq %mm0, %mm1 /* mm1 = [ARGBARGB](1)(0) */
+ movq %mm2, %mm3 /* mm3 = [ARGBARGB](3)(2) */
+
+ pand %mm6, %mm1 /* mm1 = [.RGB....](1) */
+ pand %mm5, %mm2 /* mm2 = [.....RGB](2) */
+ psrlq $8, %mm1 /* mm1 = [..RGB...](1) */
+ pand %mm6, %mm3 /* mm3 = [.RGB....](3) */
+
+ movq %mm2, %mm4 /* mm4 = [.....RGB](2) */
+ pand %mm5, %mm0 /* mm0 = [.....RGB](0) */
+
+ psrlq $16, %mm2 /* mm2 = [.......R](2) */
+ por %mm1, %mm0 /* mm0 = [..RGBRGB](1)(0) */
+
+ psrlq $24, %mm3 /* mm3 = [....RGB.](3) */
decl %ecx
+
+ psllq $48, %mm4 /* mm4 = [GB......](2) */
+ por %mm3, %mm2 /* mm2 = [....RGBR](3)(2) */
+
+ por %mm4, %mm0 /* mm0 = [GBRGBRGB](2)(1)(0) */
+ movq %mm0, -12(%edi)
+ movd %mm2, -4(%edi)
+
jnz next_block_32_to_24
#ifndef ALLEGRO_COLORCONV_ALIGNED_WIDTH
@@ -1335,16 +1347,17 @@
shrl $1, %ecx
jnc end_of_line_32_to_24
- movq (%esi), %mm0 /* read 2 pixels */
+ /* 4 cycles/2 pixels */
+ movq (%esi), %mm0 /* mm0 = [.RGB.RGB](1)(0) */
- movq %mm0, %mm1
+ movq %mm0, %mm1 /* mm1 = [.RGB.RGB](1)(0) */
- psllq $40, %mm0
- psrlq $32, %mm1
- psrlq $40, %mm0
- psllq $24, %mm1
+ pand %mm5, %mm0 /* mm0 = [.....RGB](0) */
+ pand %mm6, %mm1 /* mm1 = [.RGB....](1) */
- por %mm1, %mm0
+ psrlq $8, %mm1 /* mm1 = [..RGB...](1) */
+
+ por %mm1, %mm0 /* mm0 = [..RGBRGB](1)(0) */
movd %mm0, (%edi)
psrlq $32, %mm0
@@ -2957,6 +2970,8 @@
INIT_REGISTERS_NO_MMX(SIZE_4, SIZE_3, LOOP_RATIO_1)
#endif
+ movl $0xFFFFFF, %ebp
+
_align_
next_line_32_to_24_no_mmx:
movl MYLOCAL1, %ecx
@@ -2970,27 +2985,31 @@
_align_
/* 100% Pentium pairable loop */
- /* 10 cycles = 9 cycles/4 pixels + 1 cycle loop */
+ /* 12 cycles = 11 cycles/4 pixels + 1 cycle loop */
next_block_32_to_24_no_mmx:
- movl 4(%esi), %ebx /* ebx = pixel2 */
+ movl 4(%esi), %ebx /* ebx = [ARGB](2) */
addl $12, %edi /* 4 pixels written */
- movl %ebx, %ebp /* ebp = pixel2 */
- movl 12(%esi), %edx /* edx = pixel4 */
- shll $8, %edx /* edx = pixel4 << 8 */
- movl (%esi), %eax /* eax = pixel1 */
- shll $24, %ebx /* ebx = b8 pixel2 << 24 */
- movb 10(%esi), %dl /* edx = pixel4 | r8 pixel3 */
- orl %eax, %ebx /* ebx = b8 pixel2 | pixel1 */
- movl %ebp, %eax /* eax = pixel2 */
- shrl $8, %eax /* eax = r8g8 pixel2 */
- movl %ebx, -12(%edi) /* write pixel1..b8 pixel2 */
- movl 8(%esi), %ebx /* ebx = pixel 3 */
- movl %edx, -4(%edi) /* write r8 pixel3..pixel4 */
- shll $16, %ebx /* ebx = g8b8 pixel3 << 16 */
+ movl (%esi), %eax /* eax = [ARGB](1) */
+ movl %ebx, %edx /* edx = [ARGB](2) */
+ shll $24, %edx /* edx = [B...](2) */
+ andl %ebp, %ebx /* ebx = [.RGB](2) */
+ shrl $8, %ebx /* ebx = [..RG](2) */
+ andl %ebp, %eax /* eax = [.RGB](1) */
+ orl %edx, %eax /* eax = [BRGB](2)(1) */
+ movl 8(%esi), %edx /* edx = [ARGB](3) */
+ movl %eax, -12(%edi) /* write [BRGB](2)(1) */
+ movl %edx, %eax /* eax = [ARGB](3) */
+ shll $16, %edx /* edx = [GB..](3) */
+ andl %ebp, %eax /* eax = [.RGB](3) */
+ shrl $16, %eax /* eax = [...R](3) */
+ orl %edx, %ebx /* ebx = [GBRG](3)(2) */
+ movl 12(%esi), %edx /* edx = [ARGB](4) */
+ movl %ebx, -8(%edi) /* write [GBRG](3)(2) */
+ shll $8, %edx /* edx = [RGB.](4) */
addl $16, %esi /* 4 pixels read */
- orl %ebx, %eax /* eax = g8b8 pixel3 | r8g8 pixel2 */
+ orl %edx, %eax /* eax = [RGBR](4)(3) */
decl %ecx
- movl %eax, -8(%edi) /* write g8r8 pixel2..b8g8 pixel3 */
+ movl %eax, -4(%edi) /* write [RGBR](4)(3) */
jnz next_block_32_to_24_no_mmx
popl %edx
@@ -3019,7 +3038,7 @@
movl 4(%esi), %ebx
addl $8, %esi
movl %ebx, %ecx
- andl $0xFFFFFF, %eax
+ andl %ebp, %eax
shll $24, %ebx
orl %ebx, %eax
shrl $8, %ecx