[hatari-devel] Structures in Hatari according to pahole |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/hatari-devel Archives
]
Hi,
I was looking at Hatari structures with pahole, because how the items
in frequently used structures are ordered, affects their memory usage
and performance.
Items in structures should be ordered so that they're on their natural
boundaries (this is easiest done by putting largest members first),
AND so that items used together are close together in the structure, on
the same cacheline if they fit one. Cacheline size differs between
processors though, but I think typically they're still 32 or 64.
Hatari structures which might be changed:
----------------------------------------------
struct dsp_core_s {
volatile int running; /* 0 4 */
Uint16 instr_cycle; /* 4 2 */
Uint16 pc; /* 6 2 */
Uint32 registers[64]; /* 8 256 */
/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
Uint16 stack[2][16]; /* 264 64 */
/* --- cacheline 5 boundary (320 bytes) was 8 bytes ago --- */
Uint32 ramext[32768]; /* 328 131072 */
/* --- cacheline 2053 boundary (131392 bytes) was 8 bytes ago --- */
Uint32 rom[2][512]; /* 131400 4096 */
/* --- cacheline 2117 boundary (135488 bytes) was 8 bytes ago --- */
Uint32 ramint[3][512]; /* 135496 6144 */
/* --- cacheline 2213 boundary (141632 bytes) was 8 bytes ago --- */
volatile Uint32 periph; /* 141640 512 */
/* --- cacheline 2221 boundary (142144 bytes) was 8 bytes ago --- */
volatile Uint32 dsp_host_htx; /* 142152 4 */
volatile Uint32 dsp_host_rtx; /* 142156 4 */
Uint16 dsp_host_isr_HREQ; /* 142160 2 */
volatile Uint8 hostport; /* 142162 12 */
/* XXX 2 bytes hole, try to pack */
dsp_core_ssi_t ssi; /* 142176 52 */
/* --- cacheline 2222 boundary (142208 bytes) was 20 bytes ago ---
*/
Uint32 loop_rep; /* 142228 4 */
Uint32 pc_on_rep; /* 142232 4 */
Uint16 bootstrap_pos; /* 142236 2 */
Uint16 interrupt_state; /* 142238 2 */
Uint16 interrupt_instr_fetch; /* 142240 2 */
Uint16 interrupt_save_pc; /* 142242 2 */
Uint16 interrupt_counter; /* 142244 2 */
Uint16 interrupt_IplToRaise; /* 142246 2 */
Uint16 interrupt_pipeline_count; /* 142248 2
*/
Sint16 interrupt_ipl[12]; /* 142250 24 */
/* --- cacheline 2223 boundary (142272 bytes) was 2 bytes ago --- */
Uint16 interrupt_isPending[12]; /* 142274 24
*/
/* size: 142300, cachelines: 2224 */
/* sum members: 142296, holes: 1, sum holes: 2 */
/* padding: 2 */
/* last cacheline: 28 bytes */
}; /* definitions: 4 */
----------------------------------------------
-> Interrupt handling might be better to be all together on same cacheline.
Maybe also registers & stack would be better if they were fully on
a cacheline of their own?
----------------------------------------------
struct dsp_interrupt_s {
const Uint16 inter; /* 0 2 */
const Uint16 vectorAddr; /* 2 2 */
const Uint16 periph; /* 4 2 */
/* XXX 2 bytes hole, try to pack */
const char * name; /* 8 4 */
/* size: 12, cachelines: 1 */
/* sum members: 10, holes: 1, sum holes: 2 */
/* last cacheline: 12 bytes */
}; /* definitions: 1 */
----------------------------------------------
-> Moving "name" as first would remove hole, then it would be padding.
(wouldn't help for accessing array of these)
---------------------------------------------
struct instr_def {
unsigned int bits; /* 0 4 */
int n_variable; /* 4 4 */
char bitpos[16]; /* 8 16 */
unsigned int mask; /* 24 4 */
int cpulevel; /* 28 4 */
int plevel; /* 32 4 */
struct {
unsigned int flaguse:3; /* 36:29 4 */
unsigned int flagset:3; /* 36:26 4 */
} flaginfo[5]; /* 36 20 */
unsigned char sduse; /* 56 1 */
/* XXX 3 bytes hole, try to pack */
const char * opcstr; /* 60 4 */
/* --- cacheline 1 boundary (64 bytes) --- */
/* size: 64, cachelines: 1 */
/* sum members: 61, holes: 1, sum holes: 3 */
}; /* definitions: 2 */
---------------------------------------------
-> Moving "sduse" to last would remove the hole, then that would be padding.
(wouldn't help for accessing array of these)
----------------------------------------------
struct videl_s {
_Bool bUseSTShifter; /* 0 1 */
Uint8 reg_ffff8006_save; /* 1 1 */
Uint8 monitor_type; /* 2 1 */
/* XXX 1 byte hole, try to pack */
Uint32 videoBaseAddr; /* 4 4 */
Sint16 leftBorderSize; /* 8 2 */
Sint16 rightBorderSize; /* 10 2 */
Sint16 upperBorderSize; /* 12 2 */
Sint16 lowerBorderSize; /* 14 2 */
Uint16 XSize; /* 16 2 */
Uint16 YSize; /* 18 2 */
Uint16 save_scrWidth; /* 20 2 */
Uint16 save_scrHeight; /* 22 2 */
Uint16 save_scrBpp; /* 24 2 */
_Bool hostColorsSync; /* 26 1 */
/* size: 28, cachelines: 1 */
/* sum members: 26, holes: 1, sum holes: 1 */
/* padding: 1 */
/* last cacheline: 28 bytes */
}; /* definitions: 1 */
----------------------------------------------
-> Moving the last _Bool to e.g. fourth position, would remove
the hole & padding. 1 bytes memory saving. :-)
----------------
struct hd6301_opcode_t {
Uint8 op_value; /* 0 1 */
Uint8 op_bytes; /* 1 1 */
/* XXX 2 bytes hole, try to pack */
void (*op_func)(void); /* 4 4 */
Uint8 op_n_cycles; /* 8 1 */
/* XXX 3 bytes hole, try to pack */
const char * op_mnemonic; /* 12 4 */
Uint8 op_disasm; /* 16 1 */
/* size: 20, cachelines: 1 */
/* sum members: 12, holes: 2, sum holes: 5 */
/* padding: 3 */
/* last cacheline: 20 bytes */
}; /* definitions: 1 */
----------------
-> It would be better to put those 4 Uint8 members together,
that would save noticeably memory and more of these items
would fit into same cacheline.
Besides the potential DSP core changes, this could have largest effect,
if Hatari would actually use the HD6301 emulation (that seems to be
built-in, although I thought it was disabled as it's unused)...
- Eero