6.45

assume matrix size N = 4, cache block size B = 8Byte, sizeof(int) = 4

matrix size 4*4
+--+--+--+--+
|0 |1 |2 |3 |
+--+--+--+--+
|4 |5 |6 |7 |
+--+--+--+--+
|8 |9 |10|11|
+--+--+--+--+
|12|13|14|15|
+--+--+--+--+

function transponse

void transpose(int* dst, int* src, int N) {
  int i, j;

  for (i = 0; i <= N-1; i++)
    for (j = 0; j <= N-1; j++)
      dst[j*N+i] = src[i*N+j];
}

every cache block can store 2 int numbers

traverse src by step 1, order:

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

order 0: cache miss, load src[0],src[1] order 1: cache hit because of order 0 order 2: cache miss, load src[2],src[3] …. order 15: cache hit

if B is greater, hit rate will be greater too.

traverse dst by step 4, order:

0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15

element 0: miss, load dst[0],dst[1] element 4: miss, load dst[4],dst[5] element 8: miss, load dst[8],dst[9] element 12: miss, load dst[12],dst[13] element 1: interesting, order 0 has loaded dst[1] into cache, hit. but in many cases, element 4/8/12 may use the same cache block with element 0. so miss is also possible. …. element 15: hit / miss

let’s assume worest case: all element miss cache

if we split matrix by 2*2

split matrix by size 2*2 block
+--+--+--+--+
|     |     |
+  0  +  1  +
|     |     |
+--+--+--+--+
|     |     |
+  2  +  3  +
|     |     |
+--+--+--+--+

transpose block 0 itself

transpose block 1 with 2

transpose block 3 itself

code

  for (i = 0; i <= N-2; i+=2)
    for (j = 0; j <= N-2; j+=2) {
      dst[j*N+i] = src[i*N+j];
      dst[j*N+i+1] = src[(i+1)*N+j];
      dst[(j+1)*N+i] = src[i*N+j+1];
      dst[(j+1)*N+i+1] = src[(i+1)*N+j+1];
    }

when i = 0, j = 0, transpose block 0 itself

+--+--+      +--+--+
|0 |1 |      |0 |4 |
+--+--+  =>  +--+--+
|4 |5 |      |1 |5 |
+--+--+      +--+--+

dst[0] = src[0];
dst[1] = src[4];
dst[4] = src[1];
dst[5] = src[5];

if element 0 is miss, element 1 must hit; if element 4 is miss, element 5 must hit;

50% is the highest hit rate in such low cache block size.

if B is greater and cache size C is larger, we can split matrix into 44, 88 or more larger. theoretically we will archive the highest hit rate.

finally code:

/*
 * transpose.c
 */
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// a large prime number
#define MATRIX_N 9973
#define MEM_SIZE (sizeof(int) * MATRIX_N * MATRIX_N)
#define LOOP 1000
#define BLOCK 16

void randomize(void *mem, size_t size) {
  int rnd = open("/dev/urandom", O_RDONLY);
  read(rnd, mem, size);
  close(rnd);
}

void transpose(int *dst, int *src, int N) {
  int i, j;

  for (i = 0; i <= N - 1; i++)
    for (j = 0; j <= N - 1; j++)
      dst[j * N + i] = src[i * N + j];
}

void effective_transpose(int *dst, int *src, int N) {
  int i, j, a, b;

  for (i = 0; i <= N - BLOCK; i += BLOCK)
    for (j = 0; j <= N - BLOCK; j += BLOCK)
      for (a = i; a < i + BLOCK; a++)
        for (b = j; b < j + BLOCK; b++)
          dst[b * N + a] = src[a * N + b];

  int offset = i;

  for (i = offset; i <= N - 1; i++)
    for (j = 0; j < offset; j += BLOCK)
      for (b = j; b < j + BLOCK; b++)
          dst[b * N + i] = src[i * N + b];

  for (i = 0; i <= N - 1; i++)
    for (j = offset; j <= N - 1; j++)
      dst[j * N + i] = src[i * N + j];
}

void test(void) {
  int *d = (int *)malloc(MEM_SIZE);
  int *s = (int *)malloc(MEM_SIZE);
  randomize((void *)s, MEM_SIZE);

  transpose(d, s, MATRIX_N);

  for (int i = 0; i < MATRIX_N; i++)
    for (int j = 0; j < MATRIX_N; j++)
      assert(s[i * MATRIX_N + j] == d[j * MATRIX_N + i]);

  memset(d, 0, MEM_SIZE);
  effective_transpose(d, s, MATRIX_N);

  for (int i = 0; i < MATRIX_N; i++)
    for (int j = 0; j < MATRIX_N; j++)
      assert(s[i * MATRIX_N + j] == d[j * MATRIX_N + i]);

  free((void *)d);
  free((void *)s);
}

void prof(void) {
  int *d = (int *)malloc(MEM_SIZE);
  int *s = (int *)malloc(MEM_SIZE);

  for (int c = 0; c < LOOP; c++) transpose(d, s, MATRIX_N);

  free((void *)d);
  free((void *)s);
}

void prof_effect(void) {
  int *d = (int *)malloc(MEM_SIZE);
  int *s = (int *)malloc(MEM_SIZE);

  for (int c = 0; c < LOOP; c++) effective_transpose(d, s, MATRIX_N);

  free((void *)d);
  free((void *)s);
}

int main(int argc, char *argv[]) {
  test();

  /* prof(); */
  /* prof_effect(); */

  return 0;
}

in code, matrix size 1024*1024, loop 1000 times to measure program run time.

change BLOCK from 2 to 16, record time statistics

origin function run time: 16.46s

BLOCKtime(s)
29.99
37.16
45.6
55.66
65.34
75.39
85.38
95.48
106.21
117.9
1210.17
1311.14
1411.88
1512.11
1611.85

tip: look up cpu cache info

$ cat /sys/devices/system/cpu/cpu0/cache/*
comments powered by Disqus