Re: [hatari-devel] WinUAE and 030 cache hits/misses?

[ Thread Index | Date Index | More lists.tuxfamily.org/hatari-devel Archives ]


Hi,

On maanantai 21 tammikuu 2013, laurent.sallafranque@xxxxxxx wrote:
> If you add it, I can quickly improve the cycle counting by taking the
> hit/miss flag into account. I'm sure this would increase the quality of
> the approximate cycle computation I'm currently doing (and maybe let
> some timing-critical programs run a bit better), especially the programs
> that deal with the DSP for music.

Attached is a preliminary patch for testing this, but I noticed some
weird things with it:

* There can be 0-3 cache misses *per* instruction (prefetch). I'm assuming
  this means that several instructions that were not in the cache can get
  fetched with the same prefetch.  Comments on that?

* With the WinUAE core, CPU instruction cycle counts are zero when the
  debugger is called.  With the old UAE core they're fine.  Are the debugger
  calls in WinUAE in similar places as they are in the old UAE CPU core?


	- Eero
diff -r dffe52d0aef1 src/cpu/newcpu.c
--- a/src/cpu/newcpu.c	Thu Jan 24 00:28:07 2013 +0200
+++ b/src/cpu/newcpu.c	Thu Jan 24 01:18:30 2013 +0200
@@ -4695,6 +4695,7 @@
 		return;
 	}
 	// cache miss
+	CpuInstruction.iCacheMisses++;
 	data = mem_access_delay_longi_read_ce020 (addr);
 	if (!(regs.cacr & 2)) {
 		c->tag = tag;
@@ -4710,6 +4711,7 @@
 	int i;
 	uae_u32 pc = m68k_getpc () + o;
 
+	CpuInstruction.iCacheMisses = 0;
 	for (;;) {
 		for (i = 0; i < 2; i++) {
 			if (pc == regs.prefetch020addr[0]) {
@@ -4779,6 +4781,7 @@
 		return;
 	}
 	// cache miss
+	CpuInstruction.iCacheMisses++;
 	data = mem_access_delay_longi_read_ce020 (addr);
 	if ((regs.cacr & 3) == 1) { // not frozen and enabled
 		update_cache030 (c, data, tag, lws);
@@ -4946,6 +4949,7 @@
 	int i;
 	uae_u32 pc = m68k_getpc () + o;
 
+	CpuInstruction.iCacheMisses = 0;
 	for (;;) {
 		for (i = 0; i < 2; i++) {
 			if (pc == regs.prefetch020addr[0]) {
diff -r dffe52d0aef1 src/debug/68kDisass.c
--- a/src/debug/68kDisass.c	Thu Jan 24 00:28:07 2013 +0200
+++ b/src/debug/68kDisass.c	Thu Jan 24 01:18:30 2013 +0200
@@ -2454,10 +2454,10 @@
 		{
 			/* assume comments are for things which aren't profiled */
 			float percentage;
-			Uint32 count, cycles;
-			if (Profile_CpuAddressData(addr, &percentage, &count, &cycles))
+			Uint32 count, cycles, misses;
+			if (Profile_CpuAddressData(addr, &percentage, &count, &cycles, &misses))
 			{
-				sprintf(commentBuffer, "%5.2f%% (%d, %d)", percentage, count, cycles);
+				sprintf(commentBuffer, "%5.2f%% (%u, %u, %u)", percentage, count, cycles, misses);
 				Disass68kComposeStr(lineBuffer, commentBuffer, optionPosComment+1, 0);
 			}
 		}
diff -r dffe52d0aef1 src/debug/profile.c
--- a/src/debug/profile.c	Thu Jan 24 00:28:07 2013 +0200
+++ b/src/debug/profile.c	Thu Jan 24 01:18:30 2013 +0200
@@ -25,18 +25,19 @@
 
 typedef struct {
 	Uint32 count;	/* how many times this address is used */
-	Uint32 cycles;	/* what address this is (for sorting) */
+	Uint32 cycles;	/* how many CPU cycles were spent at this address */
+	Uint32 misses;  /* how many CPU cache misses happened at this address */
 } profile_item_t;
 
 typedef struct {
-	unsigned long long all_cycles, all_count;
-	Uint32 max_cycles;	/* for overflow check (cycles > count) */
+	unsigned long long all_cycles, all_count, all_misses;
+	Uint32 max_cycles;	/* for overflow check (cycles > count or misses) */
 	Uint32 lowest, highest;	/* active address range within memory area */
 	Uint32 active;          /* number of active addresses */
 } profile_area_t;
 
 static struct {
-	unsigned long long all_cycles, all_count;
+	unsigned long long all_cycles, all_count, all_misses;
 	Uint32 size;          /* number of allocated profile data items */
 	profile_item_t *data; /* profile data items */
 	profile_area_t ram;   /* normal RAM stats */
@@ -93,13 +94,14 @@
  * Get CPU cycles, count and count percentage for given address.
  * Return true if data was available and non-zero, false otherwise.
  */
-bool Profile_CpuAddressData(Uint32 addr, float *percentage, Uint32 *count, Uint32 *cycles)
+bool Profile_CpuAddressData(Uint32 addr, float *percentage, Uint32 *count, Uint32 *cycles, Uint32 *misses)
 {
 	Uint32 idx;
 	if (!cpu_profile.data) {
 		return false;
 	}
 	idx = address2index(addr);
+	*misses = cpu_profile.data[idx].misses;
 	*cycles = cpu_profile.data[idx].cycles;
 	*count = cpu_profile.data[idx].count;
 	*percentage = 100.0*(*count)/cpu_profile.all_count;
@@ -148,6 +150,13 @@
 	fprintf(stderr, "- used cycles:\n  %llu (%.2f%% of all)\n",
 		area->all_cycles,
 		(float)area->all_cycles/cpu_profile.all_cycles*100);
+#if ENABLE_WINUAE_CPU
+	if (cpu_profile.all_misses) {	/* CPU cache in use? */
+		fprintf(stderr, "- icache misses:\n  %llu (%.2f%% of all)\n",
+			area->all_misses,
+			(float)area->all_misses/cpu_profile.all_misses*100);
+	}
+#endif
 	if (area->max_cycles == MAX_PROFILE_VALUE) {
 		fprintf(stderr, "- Counters OVERFLOW!\n");
 	}
@@ -225,7 +234,56 @@
 
 
 /**
- * compare function for qsort() to sort CPU profile data by descdending
+ * compare function for qsort() to sort CPU profile data by descending
+ * address icache misses.
+ */
+static int profile_by_cpu_misses(const void *p1, const void *p2)
+{
+	const Uint32 idx1 = *(const Uint32*)p1;
+	const Uint32 idx2 = *(const Uint32*)p2;
+	const Uint32 misses1 = cpu_profile.data[idx1].misses;
+	const Uint32 misses2 = cpu_profile.data[idx2].misses;
+	/* larger miss count sorts first (descending order) */
+	if (misses1 != misses2) {
+		return (misses1 > misses2) ? -1 : 1;
+	}
+	return 0;
+}
+
+/**
+ * Sort CPU profile data addresses by icache misses and show the results.
+ * 'show' is the max number of addresses to list (capped to active ones).
+ */
+static void Profile_CpuShowMisses(unsigned int show)
+{
+	unsigned int active;
+	Uint32 *sort_arr, *end, addr, count;
+	profile_item_t *data = cpu_profile.data;
+	float percentage;
+
+	if (!data) {
+		fprintf(stderr, "ERROR: no CPU profiling data available!\n");
+		return;
+	}
+
+	active = cpu_profile.active;
+	sort_arr = cpu_profile.sort_arr;
+	qsort(sort_arr, active, sizeof(*sort_arr), profile_by_cpu_misses);
+	printf("addr:\t\tmisses:\n");
+	show = (show < active ? show : active);
+	for (end = sort_arr + show; sort_arr < end; sort_arr++) {
+		addr = index2address(*sort_arr);
+		count = data[*sort_arr].misses;
+		/* without any misses, 0/0 would give NaN percentage */
+		percentage = cpu_profile.all_misses ? 100.0*count/cpu_profile.all_misses : 0.0;
+		printf("0x%06x\t%.2f%%\t%u%s\n", addr, percentage, count,
+		       count == MAX_PROFILE_VALUE ? " (OVERFLOW)" : "");
+	}
+	printf("%u CPU addresses listed.\n", show);
+}
+
+/**
+ * compare function for qsort() to sort CPU profile data by descending
  * address cycles counts.
  */
 static int profile_by_cpu_cycles(const void *p1, const void *p2)
@@ -397,19 +455,30 @@
  */
 void Profile_CpuUpdate(void)
 {
-	Uint32 idx, cycles;
+	Uint32 idx, cycles, misses;
+
 	idx = address2index(M68000_GetPC());
 	assert(idx <= cpu_profile.size);
 
 	if (likely(cpu_profile.data[idx].count < MAX_PROFILE_VALUE)) {
 		cpu_profile.data[idx].count++;
 	}
+
 	cycles = CurrentInstrCycles + nWaitStateCycles;
 	if (likely(cpu_profile.data[idx].cycles < MAX_PROFILE_VALUE - cycles)) {
 		cpu_profile.data[idx].cycles += cycles;
 	} else {
 		cpu_profile.data[idx].cycles = MAX_PROFILE_VALUE;
 	}
+
+#if ENABLE_WINUAE_CPU
+	misses = CpuInstruction.iCacheMisses;
+	if (likely(cpu_profile.data[idx].misses < MAX_PROFILE_VALUE - misses)) {
+		cpu_profile.data[idx].misses += misses;
+	} else {
+		cpu_profile.data[idx].misses = MAX_PROFILE_VALUE;
+	}
+#endif
 }
 
 
@@ -425,6 +494,7 @@
 		return;
 	}
 	area->all_count += count;
+	area->all_misses += item->misses;
 	area->all_cycles += cycles;
 
 	if (cycles > area->max_cycles) {
@@ -487,6 +557,7 @@
 		update_area(i, item, area);
 	}
 
+	cpu_profile.all_misses = cpu_profile.ram.all_misses + cpu_profile.rom.all_misses + cpu_profile.tos.all_misses;
 	cpu_profile.all_cycles = cpu_profile.ram.all_cycles + cpu_profile.rom.all_cycles + cpu_profile.tos.all_cycles;
 	cpu_profile.all_count = cpu_profile.ram.all_count + cpu_profile.rom.all_count + cpu_profile.tos.all_count;
 
@@ -871,7 +942,7 @@
 char *Profile_Match(const char *text, int state)
 {
 	static const char *names[] = {
-		"addresses", "counts", "cycles", "off", "on", "stats", "symbols"
+		"addresses", "counts", "cycles", "misses", "off", "on", "stats", "symbols"
 	};
 	static int i, len;
 	
@@ -890,12 +961,12 @@
 }
 
 const char Profile_Description[] =
-	  "<on|off|stats|counts|cycles|symbols|addresses> [show count]\n"
+	  "<on|off|stats|counts|cycles|misses|symbols|addresses> [show count]\n"
 	  "\t'on' & 'off' enable and disable profiling.  Data is collected\n"
 	  "\tuntil debugger is entered again at which point you get profiling\n"
 	  "\tstatistics ('stats') summary.  Then you can ask for list of the\n"
-	  "\tPC addresses, sorted either by execution 'counts' or 'cycles'\n"
-	  "\tthey used. Former can be limited just to addresses with 'symbols'.\n"
+	  "\tPC addresses, sorted either by execution 'counts', used 'cycles'\n"
+	  "\tor icache misses. First can be limited just to addresses with 'symbols'.\n"
 	  "\t'addresses' lists the profiled addresses in order, with\n"
 	  "\tthe instructions (currently) residing at them.\n"
 	  "\tYou can also give optional limit on how many will be shown.";
@@ -940,6 +1011,13 @@
 		} else {
 			Profile_CpuShowStats();
 		}
+	} else if (strcmp(psArgs[1], "misses") == 0) {
+		if (bForDsp) {
+			fprintf(stderr, "Cache misses are recorded only for CPU, not DSP.\n");
+			return false;
+		} else {
+			Profile_CpuShowMisses(show);
+		}
 	} else if (strcmp(psArgs[1], "cycles") == 0) {
 		if (bForDsp) {
 			Profile_DspShowCycles(show);
diff -r dffe52d0aef1 src/debug/profile.h
--- a/src/debug/profile.h	Thu Jan 24 00:28:07 2013 +0200
+++ b/src/debug/profile.h	Thu Jan 24 01:18:30 2013 +0200
@@ -18,7 +18,7 @@
 extern void Profile_CpuUpdate(void);
 extern void Profile_CpuStop(void);
 /* CPU profile results */
-extern bool Profile_CpuAddressData(Uint32 addr, float *percentage, Uint32 *count, Uint32 *cycles);
+extern bool Profile_CpuAddressData(Uint32 addr, float *percentage, Uint32 *count, Uint32 *cycles, Uint32 *misses);
 
 /* DSP profile control */
 extern bool Profile_DspStart(void);
diff -r dffe52d0aef1 src/includes/m68000.h
--- a/src/includes/m68000.h	Thu Jan 24 00:28:07 2013 +0200
+++ b/src/includes/m68000.h	Thu Jan 24 01:18:30 2013 +0200
@@ -138,6 +138,13 @@
 #define	BUS_MODE_CPU		0			/* bus is owned by the cpu */
 #define	BUS_MODE_BLITTER	1			/* bus is owned by the blitter */
 
+/* information about current CPU instruction (shared via the CpuInstruction global) */
+typedef struct {
+	/* TODO: move other instruction specific Hatari variables here */
+	int iCacheMisses;	/* zeroed and then incremented per cache miss by the newcpu.c hunks of this patch */
+} cpu_instruction_t;
+
+extern cpu_instruction_t CpuInstruction;
 
 extern Uint32 BusErrorAddress;
 extern Uint32 BusErrorPC;
diff -r dffe52d0aef1 src/m68000.c
--- a/src/m68000.c	Thu Jan 24 00:28:07 2013 +0200
+++ b/src/m68000.c	Thu Jan 24 01:18:30 2013 +0200
@@ -79,6 +79,9 @@
 #include "mmu_common.h"
 #endif
 
+/* information about current CPU instruction */
+cpu_instruction_t CpuInstruction;
+
 Uint32 BusErrorAddress;         /* Stores the offending address for bus-/address errors */
 Uint32 BusErrorPC;              /* Value of the PC when bus error occurs */
 bool bBusErrorReadWrite;        /* 0 for write error, 1 for read error */


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/