[hatari-devel] Structures in Hatari according to pahole

[ Thread Index | Date Index | More lists.tuxfamily.org/hatari-devel Archives ]


Hi,

I was looking at Hatari structures with pahole, because how the items
in frequently used structures are ordered, affects their memory usage
and performance.

Items in structures should be ordered so that they're on their natural
boundaries (this is easiest done by putting largest members first),
AND so that items used together are close together in the structure, on
the same cacheline if they fit one.  Cacheline size differs between
processors though, but I think typically they're still 32 or 64.


Hatari structures which might be changed:

----------------------------------------------
struct dsp_core_s {
        volatile int               running;              /*     0     4 */
        Uint16                     instr_cycle;          /*     4     2 */
        Uint16                     pc;                   /*     6     2 */
        Uint32                     registers[64];        /*     8   256 */
        /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
        Uint16                     stack[2][16];         /*   264    64 */
        /* --- cacheline 5 boundary (320 bytes) was 8 bytes ago --- */
        Uint32                     ramext[32768];        /*   328 131072 */
        /* --- cacheline 2053 boundary (131392 bytes) was 8 bytes ago --- */
        Uint32                     rom[2][512];          /* 131400  4096 */
        /* --- cacheline 2117 boundary (135488 bytes) was 8 bytes ago --- */
        Uint32                     ramint[3][512];       /* 135496  6144 */
        /* --- cacheline 2213 boundary (141632 bytes) was 8 bytes ago --- */
        volatile Uint32            periph;               /* 141640   512 */
        /* --- cacheline 2221 boundary (142144 bytes) was 8 bytes ago --- */
        volatile Uint32            dsp_host_htx;         /* 142152     4 */
        volatile Uint32            dsp_host_rtx;         /* 142156     4 */
        Uint16                     dsp_host_isr_HREQ;    /* 142160     2 */
        volatile Uint8             hostport;             /* 142162    12 */

        /* XXX 2 bytes hole, try to pack */

        dsp_core_ssi_t             ssi;                  /* 142176    52 */
        /* --- cacheline 2222 boundary (142208 bytes) was 20 bytes ago --- 
*/
        Uint32                     loop_rep;             /* 142228     4 */
        Uint32                     pc_on_rep;            /* 142232     4 */
        Uint16                     bootstrap_pos;        /* 142236     2 */
        Uint16                     interrupt_state;      /* 142238     2 */
        Uint16                     interrupt_instr_fetch; /* 142240     2 */
        Uint16                     interrupt_save_pc;    /* 142242     2 */
        Uint16                     interrupt_counter;    /* 142244     2 */
        Uint16                     interrupt_IplToRaise; /* 142246     2 */
        Uint16                     interrupt_pipeline_count; /* 142248     2 
*/
        Sint16                     interrupt_ipl[12];    /* 142250    24 */
        /* --- cacheline 2223 boundary (142272 bytes) was 2 bytes ago --- */
        Uint16                     interrupt_isPending[12]; /* 142274    24 
*/

        /* size: 142300, cachelines: 2224 */
        /* sum members: 142296, holes: 1, sum holes: 2 */
        /* padding: 2 */
        /* last cacheline: 28 bytes */
};      /* definitions: 4 */
----------------------------------------------

-> Interrupt handling might be better to be all together on same cacheline.
   Maybe also registers & stack would be better if they were fully on
   a cacheline of their own?


----------------------------------------------
struct dsp_interrupt_s {
        const Uint16               inter;                /*     0     2 */
        const Uint16               vectorAddr;           /*     2     2 */
        const Uint16               periph;               /*     4     2 */

        /* XXX 2 bytes hole, try to pack */

        const char  *              name;                 /*     8     4 */

        /* size: 12, cachelines: 1 */
        /* sum members: 10, holes: 1, sum holes: 2 */
        /* last cacheline: 12 bytes */
};      /* definitions: 1 */
----------------------------------------------

-> Moving "name" as first would remove hole, then it would be padding.
   (wouldn't help for accessing array of these)


---------------------------------------------
struct instr_def {
        unsigned int               bits;                 /*     0     4 */
        int                        n_variable;           /*     4     4 */
        char                       bitpos[16];           /*     8    16 */
        unsigned int               mask;                 /*    24     4 */
        int                        cpulevel;             /*    28     4 */
        int                        plevel;               /*    32     4 */
        struct {
                unsigned int       flaguse:3;            /*    36:29  4 */
                unsigned int       flagset:3;            /*    36:26  4 */
        } flaginfo[5]; /*    36    20 */
        unsigned char              sduse;                /*    56     1 */

        /* XXX 3 bytes hole, try to pack */

        const char  *              opcstr;               /*    60     4 */
        /* --- cacheline 1 boundary (64 bytes) --- */

        /* size: 64, cachelines: 1 */
        /* sum members: 61, holes: 1, sum holes: 3 */
};      /* definitions: 2 */
---------------------------------------------

-> Moving "sduse" to last would remove the hole, then that would be padding.
   (wouldn't help for accessing array of these)


----------------------------------------------
struct videl_s {
        _Bool                      bUseSTShifter;        /*     0     1 */
        Uint8                      reg_ffff8006_save;    /*     1     1 */
        Uint8                      monitor_type;         /*     2     1 */

        /* XXX 1 byte hole, try to pack */

        Uint32                     videoBaseAddr;        /*     4     4 */
        Sint16                     leftBorderSize;       /*     8     2 */
        Sint16                     rightBorderSize;      /*    10     2 */
        Sint16                     upperBorderSize;      /*    12     2 */
        Sint16                     lowerBorderSize;      /*    14     2 */
        Uint16                     XSize;                /*    16     2 */
        Uint16                     YSize;                /*    18     2 */
        Uint16                     save_scrWidth;        /*    20     2 */
        Uint16                     save_scrHeight;       /*    22     2 */
        Uint16                     save_scrBpp;          /*    24     2 */
        _Bool                      hostColorsSync;       /*    26     1 */

        /* size: 28, cachelines: 1 */
        /* sum members: 26, holes: 1, sum holes: 1 */
        /* padding: 1 */
        /* last cacheline: 28 bytes */
};      /* definitions: 1 */
----------------------------------------------

-> Moving the last _Bool to e.g. fourth position, would remove
   the hole & padding.  1 bytes memory saving. :-)


----------------
struct hd6301_opcode_t {
        Uint8                      op_value;             /*     0     1 */
        Uint8                      op_bytes;             /*     1     1 */

        /* XXX 2 bytes hole, try to pack */

        void                       (*op_func)(void);     /*     4     4 */
        Uint8                      op_n_cycles;          /*     8     1 */

        /* XXX 3 bytes hole, try to pack */

        const char  *              op_mnemonic;          /*    12     4 */
        Uint8                      op_disasm;            /*    16     1 */

        /* size: 20, cachelines: 1 */
        /* sum members: 12, holes: 2, sum holes: 5 */
        /* padding: 3 */
        /* last cacheline: 20 bytes */
};      /* definitions: 1 */
----------------

-> It would be better to put those 4 Uint8 members together,
   that would save noticeably memory and more of these items
   would fit into same cacheline.


Besides the potential DSP core changes, this could have largest effect,
if Hatari would actually use the HD6301 emulation (that seems to be
built-in, although I thought it was disabled as it's unused)...


	- Eero



Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/