assume matrix size N = 4, cache block size B = 8Byte, sizeof(int) = 4
matrix size 4*4
+--+--+--+--+
|0 |1 |2 |3 |
+--+--+--+--+
|4 |5 |6 |7 |
+--+--+--+--+
|8 |9 |10|11|
+--+--+--+--+
|12|13|14|15|
+--+--+--+--+
function transponse
void transpose(int* dst, int* src, int N) {
int i, j;
for (i = 0; i <= N-1; i++)
for (j = 0; j <= N-1; j++)
dst[j*N+i] = src[i*N+j];
}
every cache block can store 2 int numbers
traverse src by step 1, order:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
order 0: cache miss, load src[0],src[1] order 1: cache hit because of order 0 order 2: cache miss, load src[2],src[3] …. order 15: cache hit
if B is greater, hit rate will be greater too.
traverse dst by step 4, order:
0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
element 0: miss, load dst[0],dst[1] element 4: miss, load dst[4],dst[5] element 8: miss, load dst[8],dst[9] element 12: miss, load dst[12],dst[13] element 1: interesting, order 0 has loaded dst[1] into cache, hit. but in many cases, element 4/8/12 may use the same cache block with element 0. so miss is also possible. …. element 15: hit / miss
let’s assume worest case: all element miss cache
if we split matrix by 2*2
split matrix by size 2*2 block
+--+--+--+--+
| | |
+ 0 + 1 +
| | |
+--+--+--+--+
| | |
+ 2 + 3 +
| | |
+--+--+--+--+
transpose block 0 itself
transpose block 1 with 2
transpose block 3 itself
code
for (i = 0; i <= N-2; i+=2)
for (j = 0; j <= N-2; j+=2) {
dst[j*N+i] = src[i*N+j];
dst[j*N+i+1] = src[(i+1)*N+j];
dst[(j+1)*N+i] = src[i*N+j+1];
dst[(j+1)*N+i+1] = src[(i+1)*N+j+1];
}
when i = 0, j = 0, transpose block 0 itself
+--+--+ +--+--+
|0 |1 | |0 |4 |
+--+--+ => +--+--+
|4 |5 | |1 |5 |
+--+--+ +--+--+
dst[0] = src[0];
dst[1] = src[4];
dst[4] = src[1];
dst[5] = src[5];
if element 0 is miss, element 1 must hit; if element 4 is miss, element 5 must hit;
50% is the highest hit rate in such low cache block size.
if B is greater and cache size C is larger, we can split matrix into 44, 88 or more larger. theoretically we will archive the highest hit rate.
finally code:
/*
* transpose.c
*/
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
// a large prime number
#define MATRIX_N 9973
#define MEM_SIZE (sizeof(int) * MATRIX_N * MATRIX_N)
#define LOOP 1000
#define BLOCK 16
void randomize(void *mem, size_t size) {
int rnd = open("/dev/urandom", O_RDONLY);
read(rnd, mem, size);
close(rnd);
}
void transpose(int *dst, int *src, int N) {
int i, j;
for (i = 0; i <= N - 1; i++)
for (j = 0; j <= N - 1; j++)
dst[j * N + i] = src[i * N + j];
}
void effective_transpose(int *dst, int *src, int N) {
int i, j, a, b;
for (i = 0; i <= N - BLOCK; i += BLOCK)
for (j = 0; j <= N - BLOCK; j += BLOCK)
for (a = i; a < i + BLOCK; a++)
for (b = j; b < j + BLOCK; b++)
dst[b * N + a] = src[a * N + b];
int offset = i;
for (i = offset; i <= N - 1; i++)
for (j = 0; j < offset; j += BLOCK)
for (b = j; b < j + BLOCK; b++)
dst[b * N + i] = src[i * N + b];
for (i = 0; i <= N - 1; i++)
for (j = offset; j <= N - 1; j++)
dst[j * N + i] = src[i * N + j];
}
void test(void) {
int *d = (int *)malloc(MEM_SIZE);
int *s = (int *)malloc(MEM_SIZE);
randomize((void *)s, MEM_SIZE);
transpose(d, s, MATRIX_N);
for (int i = 0; i < MATRIX_N; i++)
for (int j = 0; j < MATRIX_N; j++)
assert(s[i * MATRIX_N + j] == d[j * MATRIX_N + i]);
memset(d, 0, MEM_SIZE);
effective_transpose(d, s, MATRIX_N);
for (int i = 0; i < MATRIX_N; i++)
for (int j = 0; j < MATRIX_N; j++)
assert(s[i * MATRIX_N + j] == d[j * MATRIX_N + i]);
free((void *)d);
free((void *)s);
}
void prof(void) {
int *d = (int *)malloc(MEM_SIZE);
int *s = (int *)malloc(MEM_SIZE);
for (int c = 0; c < LOOP; c++) transpose(d, s, MATRIX_N);
free((void *)d);
free((void *)s);
}
void prof_effect(void) {
int *d = (int *)malloc(MEM_SIZE);
int *s = (int *)malloc(MEM_SIZE);
for (int c = 0; c < LOOP; c++) effective_transpose(d, s, MATRIX_N);
free((void *)d);
free((void *)s);
}
int main(int argc, char *argv[]) {
test();
/* prof(); */
/* prof_effect(); */
return 0;
}
in code, matrix size 1024*1024, loop 1000 times to measure program run time.
change BLOCK from 2 to 16, record time statistics
origin function run time: 16.46s
BLOCK | time(s) |
---|---|
2 | 9.99 |
3 | 7.16 |
4 | 5.6 |
5 | 5.66 |
6 | 5.34 |
7 | 5.39 |
8 | 5.38 |
9 | 5.48 |
10 | 6.21 |
11 | 7.9 |
12 | 10.17 |
13 | 11.14 |
14 | 11.88 |
15 | 12.11 |
16 | 11.85 |
tip: look up cpu cache info
$ cat /sys/devices/system/cpu/cpu0/cache/*