matrix size and thread number
/*
* 12.34.h
*/
#define N 640
#define M 640
#define THREAD (1<<4)
#define ROWS_PER_THREAD (N / THREAD)
non concurrent version
/*
* 12.34.non.concurrent.c
*/
#include <stdio.h>
#include "csapp.h"
#include "12.34.h"
int M1[N][M];
int M2[N][M];
int MUL12[N][M];
void non_concurrent_mul(void) {
int i, j, k;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++) {
int sum = 0;
for (k = 0; k < M; k++) {
sum += M1[i][k] * M2[k][j];
}
MUL12[i][j] = sum;
}
}
int main(int argc, char* argv[]) {
non_concurrent_mul();
return 0;
}
concurrent version
/*
* 12.34.concurrent.c
*/
#include <stdio.h>
#include "csapp.h"
#include "12.34.h"
int M1[N][M];
int M2[N][M];
int MUL12[N][M];
void *thread_mul(void *vargp) {
int idx = *(int*)vargp;
int start = ROWS_PER_THREAD * idx;
int i, j, k;
for (i = start; i < start+ROWS_PER_THREAD; i++)
for (j = 0; j < N; j++) {
int sum = 0;
for (k = 0; k < M; k++) {
sum += M1[i][k] * M2[k][j];
}
MUL12[i][j] = sum;
}
}
void concurrent_mul(void) {
pthread_t tid[THREAD];
int param[THREAD];
int i;
for (i = 0; i < THREAD; i++) {
param[i] = i;
Pthread_create(&tid[i], NULL, thread_mul, ¶m[i]);
}
for (i = 0; i < THREAD; i++) {
Pthread_join(tid[i], NULL);
}
}
int main(int argc, char* argv[]) {
concurrent_mul();
return 0;
}
measure performance
(cd ./site/content/chapter12/code; make clean && make && make measure)
output
(time ./12.34.non.concurrent)
0.90user 0.00system 0:00.90elapsed 99%CPU (0avgtext+0avgdata 3704maxresident)k
0inputs+0outputs (0major+756minor)pagefaults 0swaps
// single cpu run time 0.9s
(time ./12.34.concurrent)
2.20user 0.00system 0:00.64elapsed 341%CPU (0avgtext+0avgdata 3896maxresident)k
0inputs+0outputs (0major+1462minor)pagefaults 0swaps
// 4 cpu, total run time 2.2s(every cpu run 0.55s, concurrent!!)
more detialed
thread(t) | 1 | 2 | 4 | 8 | 16 |
---|---|---|---|---|---|
core(p) | 1 | 2 | 4 | 4 | 4 |
time(Tp) | 0.86 | 0.466 | 0.626 | 0.627 | 0.628 |
speedup(Sp) | 1 | 1.84 | 1.37 | 1.37 | 1.37 |
efficiency(Ep) | 100% | 92.2% | 34.3% | 34.3% | 34.3% |