2013-04-22 80 views
1

我有這段代碼 cache.c,用來測量緩存性能

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

/*
 * Benchmark parameters.  Array sizes and strides are counted in elements of
 * x[] (ints), not in bytes.
 */

/* Smallest array size swept by the benchmark. */
#define CACHE_MIN (1 << 10)

/* Largest array size swept by the benchmark; also the length of x[]. */
#define CACHE_MAX (1 << 20)

/* Smallest distance between two consecutive accesses. */
#define STRIDE_MIN (1)

/* Largest distance between two consecutive accesses. */
#define STRIDE_MAX (128)

/* Scales the repetition count of each timed run to get a larger time sample. */
#define SAMPLE (10)

/* Nominal number of clock ticks per second. */
#define CLK_TCK (60)

/* Array that the benchmark strides through. */
int x[CACHE_MAX];

/*
 * Return the user CPU time consumed by this process so far, in seconds.
 *
 * The tick count is read with times() and converted using the tick rate
 * reported by sysconf(_SC_CLK_TCK).  A hard-coded rate gives results that are
 * off by a constant factor on any system whose real rate differs from it.
 */
double get_seconds(void) { /* routine to read time */

    /* Looked up once and reused on later calls. */
    static long ticks_per_sec = 0;
    struct tms rusage;

    if (ticks_per_sec <= 0) {
        ticks_per_sec = sysconf(_SC_CLK_TCK);
        if (ticks_per_sec <= 0) {
            /* sysconf failed: keep the 60 ticks/s this file used to assume. */
            ticks_per_sec = 60;
        }
    }

    times(&rusage); /* UNIX utility: time in clock ticks */
    return (double) rusage.tms_utime / (double) ticks_per_sec;
}

/*
 * Cache benchmark driver.
 *
 * For every array size from CACHE_MIN to CACHE_MAX (doubling each time) and
 * every stride from STRIDE_MIN to STRIDE_MAX (doubling each time):
 *   1. repeat a read-modify-write pass over x[] until at least one second of
 *      time, as reported by get_seconds(), has been collected;
 *   2. run the same loop structure with a dummy body for the same number of
 *      iterations and subtract that time as loop overhead;
 *   3. print the remaining time divided by the number of array accesses, in
 *      nanoseconds.
 *
 * Returns 0.
 */
int main(void) {

    register int i, index, stride, limit;
    /* Unsigned and zero-initialized: the dummy loop reads it, and the running
     * sum of indices exceeds INT_MAX for the larger arrays. */
    register unsigned temp = 0;
    int steps, tsteps, csize;
    double sec0, sec; /* timing variables */
    double accesses;  /* number of array accesses in the timed phase */

    for (csize = CACHE_MIN; csize <= CACHE_MAX; csize = csize * 2) {
        for (stride = STRIDE_MIN; stride <= STRIDE_MAX; stride = stride * 2) {
            sec = 0; /* initialize timer */
            limit = csize - stride + 1; /* cache size this loop */
            steps = 0;
            do { /* repeat until collect 1 second */
                sec0 = get_seconds(); /* start timer */
                for (i = SAMPLE * stride; i != 0; i = i - 1) { /* larger sample */
                    for (index = 0; index < limit; index = index + stride) {
                        x[index] = x[index] + 1; /* cache access */
                    }
                }
                steps = steps + 1; /* count while loop iterations */
                sec = sec + (get_seconds() - sec0); /* end timer */
            } while (sec < 1.0); /* until collect 1 second */

            /* Repeat empty loop to subtract loop overhead. */
            /* NOTE(review): temp is never read after this loop, so an
             * optimizing compiler may remove it; check the generated code
             * when building with optimization enabled. */
            tsteps = 0; /* used to match number of while iterations */
            do { /* repeat until same number of iterations as above */
                sec0 = get_seconds(); /* start timer */
                for (i = SAMPLE * stride; i != 0; i = i - 1) { /* larger sample */
                    for (index = 0; index < limit; index = index + stride) {
                        temp = temp + (unsigned) index; /* dummy code */
                    }
                }
                tsteps = tsteps + 1; /* count while iterations */
                sec = sec - (get_seconds() - sec0); /* - overhead */
            } while (tsteps < steps); /* until equal to number of iterations */

            /* Computed in double: the product can exceed INT_MAX when many
             * iterations fit into the one-second sample. */
            accesses = (double) steps * SAMPLE * stride
                       * ((limit - 1) / stride + 1);

            if (stride == STRIDE_MIN) {
                printf("\n"); /* extra line to separate array sizes */
            }
            /* Both byte counts are size_t, so they need %zu rather than %d. */
            printf("Size(bytes): %7zu Stride(bytes): %4zu read+write: %4.0f ns\n",
                   csize * sizeof (int), stride * sizeof (int),
                   sec * 1e9 / accesses);
        }
    } /* end of both outer for loops */

    return 0;
}

運行時,我得到這個輸出

Size(bytes): 4096 Stride(bytes): 4 read+write: 1 ns 
Size(bytes): 4096 Stride(bytes): 8 read+write: 0 ns 
Size(bytes): 4096 Stride(bytes): 16 read+write: 0 ns 
Size(bytes): 4096 Stride(bytes): 32 read+write: 0 ns 
Size(bytes): 4096 Stride(bytes): 64 read+write: 0 ns 
Size(bytes): 4096 Stride(bytes): 128 read+write: 0 ns 
Size(bytes): 4096 Stride(bytes): 256 read+write: 0 ns 
Size(bytes): 4096 Stride(bytes): 512 read+write: 0 ns 

Size(bytes): 8192 Stride(bytes): 4 read+write: 0 ns 
Size(bytes): 8192 Stride(bytes): 8 read+write: 1 ns 
Size(bytes): 8192 Stride(bytes): 16 read+write: 0 ns 
Size(bytes): 8192 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 8192 Stride(bytes): 64 read+write: 0 ns 
Size(bytes): 8192 Stride(bytes): 128 read+write: 0 ns 
Size(bytes): 8192 Stride(bytes): 256 read+write: 1 ns 
Size(bytes): 8192 Stride(bytes): 512 read+write: 0 ns 

Size(bytes): 16384 Stride(bytes): 4 read+write: 1 ns 
Size(bytes): 16384 Stride(bytes): 8 read+write: 1 ns 
Size(bytes): 16384 Stride(bytes): 16 read+write: 1 ns 
Size(bytes): 16384 Stride(bytes): 32 read+write: 0 ns 
Size(bytes): 16384 Stride(bytes): 64 read+write: 1 ns 
Size(bytes): 16384 Stride(bytes): 128 read+write: 0 ns 
Size(bytes): 16384 Stride(bytes): 256 read+write: 0 ns 
Size(bytes): 16384 Stride(bytes): 512 read+write: 0 ns 

Size(bytes): 32768 Stride(bytes): 4 read+write: 1 ns 
Size(bytes): 32768 Stride(bytes): 8 read+write: 1 ns 
Size(bytes): 32768 Stride(bytes): 16 read+write: 0 ns 
Size(bytes): 32768 Stride(bytes): 32 read+write: 0 ns 
Size(bytes): 32768 Stride(bytes): 64 read+write: 1 ns 
Size(bytes): 32768 Stride(bytes): 128 read+write: 0 ns 
Size(bytes): 32768 Stride(bytes): 256 read+write: 0 ns 
Size(bytes): 32768 Stride(bytes): 512 read+write: 0 ns 

Size(bytes): 65536 Stride(bytes): 4 read+write: 0 ns 
Size(bytes): 65536 Stride(bytes): 8 read+write: 0 ns 
Size(bytes): 65536 Stride(bytes): 16 read+write: 1 ns 
Size(bytes): 65536 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 65536 Stride(bytes): 64 read+write: 2 ns 
Size(bytes): 65536 Stride(bytes): 128 read+write: 2 ns 
Size(bytes): 65536 Stride(bytes): 256 read+write: 1 ns 
Size(bytes): 65536 Stride(bytes): 512 read+write: 1 ns 

Size(bytes): 131072 Stride(bytes): 4 read+write: 0 ns 
Size(bytes): 131072 Stride(bytes): 8 read+write: 0 ns 
Size(bytes): 131072 Stride(bytes): 16 read+write: 0 ns 
Size(bytes): 131072 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 131072 Stride(bytes): 64 read+write: 2 ns 
Size(bytes): 131072 Stride(bytes): 128 read+write: 2 ns 
Size(bytes): 131072 Stride(bytes): 256 read+write: 2 ns 
Size(bytes): 131072 Stride(bytes): 512 read+write: 1 ns 

Size(bytes): 262144 Stride(bytes): 4 read+write: 0 ns 
Size(bytes): 262144 Stride(bytes): 8 read+write: 0 ns 
Size(bytes): 262144 Stride(bytes): 16 read+write: 0 ns 
Size(bytes): 262144 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 262144 Stride(bytes): 64 read+write: 2 ns 
Size(bytes): 262144 Stride(bytes): 128 read+write: 2 ns 
Size(bytes): 262144 Stride(bytes): 256 read+write: 2 ns 
Size(bytes): 262144 Stride(bytes): 512 read+write: 1 ns 

Size(bytes): 524288 Stride(bytes): 4 read+write: 0 ns 
Size(bytes): 524288 Stride(bytes): 8 read+write: 0 ns 
Size(bytes): 524288 Stride(bytes): 16 read+write: 1 ns 
Size(bytes): 524288 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 524288 Stride(bytes): 64 read+write: 3 ns 
Size(bytes): 524288 Stride(bytes): 128 read+write: 3 ns 
Size(bytes): 524288 Stride(bytes): 256 read+write: 3 ns 
Size(bytes): 524288 Stride(bytes): 512 read+write: 3 ns 

Size(bytes): 1048576 Stride(bytes): 4 read+write: 1 ns 
Size(bytes): 1048576 Stride(bytes): 8 read+write: 1 ns 
Size(bytes): 1048576 Stride(bytes): 16 read+write: 1 ns 
Size(bytes): 1048576 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 1048576 Stride(bytes): 64 read+write: 3 ns 
Size(bytes): 1048576 Stride(bytes): 128 read+write: 3 ns 
Size(bytes): 1048576 Stride(bytes): 256 read+write: 3 ns 
Size(bytes): 1048576 Stride(bytes): 512 read+write: 3 ns 

Size(bytes): 2097152 Stride(bytes): 4 read+write: 1 ns 
Size(bytes): 2097152 Stride(bytes): 8 read+write: 1 ns 
Size(bytes): 2097152 Stride(bytes): 16 read+write: 1 ns 
Size(bytes): 2097152 Stride(bytes): 32 read+write: 1 ns 
Size(bytes): 2097152 Stride(bytes): 64 read+write: 3 ns 
Size(bytes): 2097152 Stride(bytes): 128 read+write: 3 ns 
Size(bytes): 2097152 Stride(bytes): 256 read+write: 3 ns 
Size(bytes): 2097152 Stride(bytes): 512 read+write: 3 ns 

Size(bytes): 4194304 Stride(bytes): 4 read+write: 1 ns 
Size(bytes): 4194304 Stride(bytes): 8 read+write: 1 ns 
Size(bytes): 4194304 Stride(bytes): 16 read+write: 1 ns 
Size(bytes): 4194304 Stride(bytes): 32 read+write: 2 ns 
Size(bytes): 4194304 Stride(bytes): 64 read+write: 3 ns 
Size(bytes): 4194304 Stride(bytes): 128 read+write: 3 ns 
Size(bytes): 4194304 Stride(bytes): 256 read+write: 3 ns 
Size(bytes): 4194304 Stride(bytes): 512 read+write: 3 ns 

現在我想根據這些結果找出高速緩存命中和未命中各需要多少時間,以及第一級緩存的大小和塊大小。

是不是一級緩存大小和塊大小都只有 4KB? 我不確定如何求出速度,有什麼想法嗎?

回答

1
  1. 使用線程親和性(thread affinity)將執行性能測量的線程綁定到單個特定內核。這樣可以消除線程在不同處理器內核之間遷移的影響,這種遷移會導致錯誤的結果。
  2. 使用時間戳計數器(TSC),以 CPU 週期為單位測量開銷。這是 x86 CPU 上精度最高的計時器。
  3. 不要忘記從測量結果中減去計時操作本身所花費的時間。
  4. 在反彙編器中檢查編譯器生成的代碼,以確保編譯器沒有引入不需要的優化(例如,將變量放入 CPU 寄存器而不是內存中)。

CPU緩存和緩存行的大小高度依賴於特定的CPU型號,並且可能會有很大差異。檢查您使用的CPU的文檔。