/*
* 6.37.c
*/
typedef int array_t[N][N];
int sumA(array_t a) {
int i, j;
int sum = 0;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum += a[i][j];
return sum;
}
int sumB(array_t a) {
int i, j;
int sum = 0;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum += a[j][i];
return sum;
}
int sumC(array_t a) {
int i, j;
int sum = 0;
for (i = 0; i < N; i+=2)
for (j = 0; j < N; j+=2)
sum += (a[j][i] + a[j][i+1] + a[j+1][i] + a[j+1][i+1])
}
C = 4096, B = 16, E = 1, S = 256
sizeof(array_t) == 64 * 64 == 4096 == 4*C
memory-cache graph
memory address start from 0 and end to 4096*4.
cell size is 16B. number(0-255) in cell means cache block number that the cell
will be cached.
0 +---------+
| 0 |
16 +---------+
| 1 |
32 +---------+
| 2 |
48 +---------+
| . |
| . |
| . |
| . |
| . |
4096-16 +---------+
| 255 |
4096 +---------+
| 0 |
4096+16 +---------+
| 1 |
4096+32 +---------+
| . |
| . |
| . |
| . |
| . |
| . |
4096*4-16+---------+
| 255 |
4096*4 +---------+
A. sumA
sum += a[i][j];
read memory address order:
0, 4, 8, 12, ….., 4096*4-4
read cache order:
0, 0, 0, 0, 1, 1, 1, 1,…..255, 255, 255, 255, 0, 0, 0, 0,…..255, 255, 255, 255
first cache read miss, next 3 time read hit.
miss rate: 25%
B. sumB
sum += a[j][i];
read memory address order:
0, 644, 6442, …. 64463, 4, 644+4, 6442+4, …. 4096*4-4
read cache order:
0, 16, 32, 48, … 240,(4 times) 1, 17, 33, … 241,(4 times) 15, 31, 47, … 255(4 times)
let’s see first read loop:
read cache order loop 4 times
0, 16, 32, 48, … 240,(4 times)
first loop all miss, next 3 loop all hit
so miss rate is 25%.
C. sumC
for (i = 0; i < N; i+=2)
for (j = 0; j < N; j+=2)
sum += (a[j][i] + a[j][i+1] + a[j+1][i] + a[j+1][i+1]);
easy to see that read a[j][i+1] and a[j+1][i+1] always hit
same like
for (i = 0; i < N; i+=2)
for (j = 0; j < N; j+=2)
sum += (a[j][i] + a[j+1][i]);
same like
for (i = 0; i < N; i+=2)
for (j = 0; j < N; j++)
sum += a[j][i];
total read count = 64*64
because of i+=2,
read cache order only loop 2 times
0, 16, 32, 48, … 240,(2 times)
so miss rate is 50%
totol read miss count = 64/2 * 64 * 50% = 64*64/4
so miss rate is still 25%.
A. sumA
sum += a[i][j];
read memory by step 1
miss rate 25%
B. sumB
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum += a[j][i];
it’s interesting.
let’s see first inner loop a[0][0] -> a[59][0]
read memory address order:
0, 604, 6042, …. 604*59
read cache order:
0, 15, 30, …., 225, (17 numbers) 255, 14, 29, ….., 224, (17 numbers) 254, 13, 28, ….., 223, (17 numbers) 253, 12, 27, 42, 57, 72, 87, 102, 117 (9 numbers)
all read miss and store into different blocks
next 3 loops: a[0][1] -> a[59][1], a[0][2] -> a[59][2], a[0][3] -> a[59][3]
all hit
althrough N is smaller and not power of 2, miss rate is still 25%
C. sumC
same as miss rate when N = 64
25%