[c] L2 HW 프리 페 처가 정말 도움이됩니까?

나는에있어 위스키 호수 i7-8565U 와 (두 번 더 L2 캐시 크기보다) 데이터의 512 킬로바이트를 복사 카운터 및 시간을 반환 한 분석 및 L2 HW 프리 페처의 작품에 대하여 직면 한 몇 가지 오해.

에서 인텔 매뉴얼 4 권 MSR MSR이 0x1A4비트 0 (비활성화 1) L2 HW 프리 페처를 controlloing위한의.

다음 벤치 마크를 고려하십시오.

memcopy.h:

void *avx_memcpy_forward_lsls(void *restrict, const void *restrict, size_t);

memcopy.S:

avx_memcpy_forward_lsls:
    shr rdx, 0x3
    xor rcx, rcx
avx_memcpy_forward_loop_lsls:
    vmovdqa ymm0, [rsi + 8*rcx]
    vmovdqa [rdi + rcx*8], ymm0
    vmovdqa ymm1, [rsi + 8*rcx + 0x20]
    vmovdqa [rdi + rcx*8 + 0x20], ymm1
    add rcx, 0x08
    cmp rdx, rcx
    ja avx_memcpy_forward_loop_lsls
    ret

main.c:

#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "memcopy.h"

#define ITERATIONS 1000
#define BUF_SIZE 512 * 1024

_Alignas(64) char src[BUF_SIZE];
_Alignas(64) char dest[BUF_SIZE];

static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz);

#define run_benchmark(runs, run_iterations, fn, dest, src, sz) \
    do{\
        printf("Benchmarking " #fn "\n");\
        __run_benchmark(runs, run_iterations, fn, dest, src, sz);\
    }while(0)

int main(void){
    int fd = open("/dev/urandom", O_RDONLY);
    read(fd, src, sizeof src);
    run_benchmark(20, ITERATIONS, avx_memcpy_forward_lsls, dest, src, BUF_SIZE);
}

static inline void benchmark_copy_function(unsigned iterations, void *(*fn)(void *, const void *, size_t),
                                               void *restrict dest, const void *restrict src, size_t sz){
    while(iterations --> 0){
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
    }
}

static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz){
    unsigned current_run = 1;
    while(current_run <= runs){
        benchmark_copy_function(run_iterations, fn, dest, src, sz);
        printf("Run %d finished\n", current_run);
        current_run++;
    }
}

컴파일 된 2 회 실행 고려 main.c

I .

MSR:

$ sudo rdmsr -p 0 0x1A4
0

Run:

$ taskset -c 0 sudo ../profile.sh ./bin

 Performance counter stats for './bin':

    10 486 164 071      L1-dcache-loads                                               (12,13%)
    10 461 354 384      L1-dcache-load-misses     #   99,76% of all L1-dcache hits    (12,05%)
    10 481 930 413      L1-dcache-stores                                              (12,05%)
    10 461 136 686      l1d.replacement                                               (12,12%)
    31 466 394 422      l1d_pend_miss.fb_full                                         (12,11%)
   211 853 643 294      l1d_pend_miss.pending                                         (12,09%)
     1 759 204 317      LLC-loads                                                     (12,16%)
            31 007      LLC-load-misses           #    0,00% of all LL-cache hits     (12,16%)
     3 154 901 630      LLC-stores                                                    (6,19%)
    15 867 315 545      l2_rqsts.all_pf                                               (9,22%)
                 0      sw_prefetch_access.t1_t2                                      (12,22%)
         1 393 306      l2_lines_out.useless_hwpf                                     (12,16%)
     3 549 170 919      l2_rqsts.pf_hit                                               (12,09%)
    12 356 247 643      l2_rqsts.pf_miss                                              (12,06%)
                 0      load_hit_pre.sw_pf                                            (12,09%)
     3 159 712 695      l2_rqsts.rfo_hit                                              (12,06%)
     1 207 642 335      l2_rqsts.rfo_miss                                             (12,02%)
     4 366 526 618      l2_rqsts.all_rfo                                              (12,06%)
     5 240 013 774      offcore_requests.all_data_rd                                     (12,06%)
    19 936 657 118      offcore_requests.all_requests                                     (12,09%)
     1 761 660 763      offcore_response.demand_data_rd.any_response                                     (12,12%)
       287 044 397      bus-cycles                                                    (12,15%)
    36 816 767 779      resource_stalls.any                                           (12,15%)
    36 553 997 653      resource_stalls.sb                                            (12,15%)
    38 035 066 210      uops_retired.stall_cycles                                     (12,12%)
    24 766 225 119      uops_executed.stall_cycles                                     (12,09%)
    40 478 455 041      uops_issued.stall_cycles                                      (12,05%)
    24 497 256 548      cycle_activity.stalls_l1d_miss                                     (12,02%)
    12 611 038 018      cycle_activity.stalls_l2_miss                                     (12,09%)
        10 228 869      cycle_activity.stalls_l3_miss                                     (12,12%)
    24 707 614 483      cycle_activity.stalls_mem_any                                     (12,22%)
    24 776 110 104      cycle_activity.stalls_total                                     (12,22%)
    48 914 478 241      cycles                                                        (12,19%)

      12,155774555 seconds time elapsed

      11,984577000 seconds user
       0,015984000 seconds sys

II.

MSR:

$ sudo rdmsr -p 0 0x1A4
1

Run:

$ taskset -c 0 sudo ../profile.sh ./bin

 Performance counter stats for './bin':

    10 508 027 832      L1-dcache-loads                                               (12,05%)
    10 463 643 206      L1-dcache-load-misses     #   99,58% of all L1-dcache hits    (12,09%)
    10 481 296 605      L1-dcache-stores                                              (12,12%)
    10 444 854 468      l1d.replacement                                               (12,15%)
    29 287 445 744      l1d_pend_miss.fb_full                                         (12,17%)
   205 569 630 707      l1d_pend_miss.pending                                         (12,17%)
     5 103 444 329      LLC-loads                                                     (12,17%)
            33 406      LLC-load-misses           #    0,00% of all LL-cache hits     (12,17%)
     9 567 917 742      LLC-stores                                                    (6,08%)
     1 157 237 980      l2_rqsts.all_pf                                               (9,12%)
                 0      sw_prefetch_access.t1_t2                                      (12,17%)
           301 471      l2_lines_out.useless_hwpf                                     (12,17%)
       218 528 985      l2_rqsts.pf_hit                                               (12,17%)
       938 735 722      l2_rqsts.pf_miss                                              (12,17%)
                 0      load_hit_pre.sw_pf                                            (12,17%)
         4 096 281      l2_rqsts.rfo_hit                                              (12,17%)
     4 972 640 931      l2_rqsts.rfo_miss                                             (12,17%)
     4 976 006 805      l2_rqsts.all_rfo                                              (12,17%)
     5 175 544 191      offcore_requests.all_data_rd                                     (12,17%)
    15 772 124 082      offcore_requests.all_requests                                     (12,17%)
     5 120 635 892      offcore_response.demand_data_rd.any_response                                     (12,17%)
       292 980 395      bus-cycles                                                    (12,17%)
    37 592 020 151      resource_stalls.any                                           (12,14%)
    37 317 091 982      resource_stalls.sb                                            (12,11%)
    38 121 826 730      uops_retired.stall_cycles                                     (12,08%)
    25 430 699 605      uops_executed.stall_cycles                                     (12,04%)
    41 416 190 037      uops_issued.stall_cycles                                      (12,04%)
    25 326 579 070      cycle_activity.stalls_l1d_miss                                     (12,04%)
    25 019 148 253      cycle_activity.stalls_l2_miss                                     (12,03%)
         7 384 770      cycle_activity.stalls_l3_miss                                     (12,03%)
    25 442 709 033      cycle_activity.stalls_mem_any                                     (12,03%)
    25 406 897 956      cycle_activity.stalls_total                                     (12,03%)
    49 877 044 086      cycles                                                        (12,03%)

      12,231406658 seconds time elapsed

      12,226386000 seconds user
       0,004000000 seconds sys

나는 카운터를 보았다 :

12 611 038 018 cycle_activity.stalls_l2_miss v / s
25 019 148 253 cycle_activity.stalls_l2_miss

MSR 비활성화 L2 HW 프리 페 처가 적용되고 있음을 나타냅니다. 또한 다른 l2 / LLC 관련 사항은 크게 다릅니다. 차이는 다른 실행에서 재현 할 수 있습니다 . 문제는 total time사이클과 차이가 거의 없다는 것입니다 .

48 914 478 241 cycles v / s
49 877 044 086 cycles

12,155774555 seconds time elapsed v / s
12,231406658 seconds time elapsed

질문 :
L2 미스가 다른 성능 제한에 의해 숨겨져 있습니까?
그렇다면 어떤 카운터를 이해해야하는지 제안 할 수 있습니까?

답변

예, L2 스 트리머는 많은 시간을 정말 도움이됩니다.

memcpy는 계산 대기 시간이 숨겨져 있지 않으므로 OoO exec 리소스 (ROB 크기)가 적어도 L2 누락에서 얻은 추가로드 대기 시간을 처리 할 수 있다고 생각합니다. L3에 맞는 중간 크기 작업 세트 (1MiB)를 사용하여 L3 적중을 발생시키는 데 프리 페치가 필요하지 않습니다.

그리고 유일한 지침은로드 / 저장 (및 루프 오버 헤드)이므로 OoO 창에는 훨씬 앞서서 요구되는로드가 포함됩니다.

L2 공간 프리 페처 및 L1d 프리 페 처가 여기서 도움을주는 경우 IDK.

이 가설을 테스트하기위한 예측 : 어레이를 더 크게 만들어 L3 미스를 얻으면 OoO exec가 DRAM으로가는로드 레이턴시를 감추기에 충분하지 않으면 전체 시간에 차이가있을 수 있습니다. HW 프리 페치 트리거가 더 멀리 진행될 수 있습니다.

HW의 다른 큰 장점은이 때 올 프리 페치 할 수 있습니다 귀하의 계산과 유지는 L2 히트를 얻을 수 있도록. (중간 길이이지만 루프 전달 종속성 체인이 아닌 계산을 갖는 루프에서)

요구로드와 OoO exec는 ROB 용량에 다른 압력이없는 경우 사용 가능한 (단일 스레드) 메모리 대역폭을 사용하는 한 많은 작업을 수행 할 수 있습니다.

또한 인텔 CPU에서 모든 캐시 미스의합니다 (RS / 스케줄러에서) 백 엔드 재생을 요할 수 있습니다 의존 마이크로 연산 데이터가 도달 할 것으로 예상 될 때, L1D 및 L2 미스 각각 하나씩 있습니다. 그 후, 핵심은 데이터가 L3에서 도착하기를 기다리는 동안 낙관적으로 스팸을 차단합니다.

(참조 https://chat.stackoverflow.com/rooms/206639/discussion-on-question-by-beeonrope-are-load-ops-deallocated-from-the-rs-when-th을 하고 로부터 해제로드 작전인가 그들이 파견, 완료 또는 다른 시간에 RS? )

캐시 미스로드 자체는 아닙니다. 이 경우 상점 지시 사항이됩니다. 보다 구체적으로, 포트 4에 대한 저장소 데이터 UOP. 여기서는 중요하지 않습니다. L3 대역폭에서 32 바이트 저장소 및 병목 현상을 사용하면 클럭 당 1 포트 4 uop에 가깝지 않습니다.

답변

예, L2 HW 프리 페처는 매우 유용합니다!

예를 들어, 실행 내 컴퓨터 (i7-6700HQ)에 대한 결과를 검색 할 tinymembench을 . 결과의 첫 번째 열은 모든 프리 페 처가 켜져 있고 두 번째 결과 열은 L2 스트리 머가 꺼져있는 상태입니다 (그러나 다른 모든 프리 페처는 여전히 켜져 있음).

이 테스트는 내 컴퓨터의 L3보다 훨씬 큰 32 개의 MiB 소스 및 대상 버퍼를 사용하므로 대부분 DRAM에 대한 누락을 테스트합니다.

==========================================================================
== Memory bandwidth tests                                               ==
==                                                                      ==
== Note 1: 1MB = 1000000 bytes                                          ==
== Note 2: Results for 'copy' tests show how many bytes can be          ==
==         copied per second (adding together read and writen           ==
==         bytes would have provided twice higher numbers)              ==
== Note 3: 2-pass copy means that we are using a small temporary buffer ==
==         to first fetch data into it, and only then write it to the   ==
==         destination (source -> L1 cache, L1 cache -> destination)    ==
== Note 4: If sample standard deviation exceeds 0.1%, it is shown in    ==
==         brackets                                                     ==
==========================================================================

                                                       L2 streamer ON            OFF
 C copy backwards                                     :   7962.4 MB/s    4430.5 MB/s
 C copy backwards (32 byte blocks)                    :   7993.5 MB/s    4467.0 MB/s
 C copy backwards (64 byte blocks)                    :   7989.9 MB/s    4438.0 MB/s
 C copy                                               :   8503.1 MB/s    4466.6 MB/s
 C copy prefetched (32 bytes step)                    :   8729.2 MB/s    4958.4 MB/s
 C copy prefetched (64 bytes step)                    :   8730.7 MB/s    4958.4 MB/s
 C 2-pass copy                                        :   6171.2 MB/s    3368.7 MB/s
 C 2-pass copy prefetched (32 bytes step)             :   6193.1 MB/s    4104.2 MB/s
 C 2-pass copy prefetched (64 bytes step)             :   6198.8 MB/s    4101.6 MB/s
 C fill                                               :  13372.4 MB/s   10610.5 MB/s
 C fill (shuffle within 16 byte blocks)               :  13379.4 MB/s   10547.5 MB/s
 C fill (shuffle within 32 byte blocks)               :  13365.8 MB/s   10636.9 MB/s
 C fill (shuffle within 64 byte blocks)               :  13588.7 MB/s   10588.3 MB/s
 -
 standard memcpy                                      :  11550.7 MB/s    8216.3 MB/s
 standard memset                                      :  23188.7 MB/s   22686.8 MB/s
 -
 MOVSB copy                                           :   9458.4 MB/s    6523.7 MB/s
 MOVSD copy                                           :   9474.5 MB/s    6510.7 MB/s
 STOSB fill                                           :  23329.0 MB/s   22901.5 MB/s
 SSE2 copy                                            :   9073.1 MB/s    4970.3 MB/s
 SSE2 nontemporal copy                                :  12647.1 MB/s    7492.5 MB/s
 SSE2 copy prefetched (32 bytes step)                 :   9106.0 MB/s    5069.8 MB/s
 SSE2 copy prefetched (64 bytes step)                 :   9113.5 MB/s    5063.1 MB/s
 SSE2 nontemporal copy prefetched (32 bytes step)     :  11770.8 MB/s    7453.4 MB/s
 SSE2 nontemporal copy prefetched (64 bytes step)     :  11937.1 MB/s    7712.1 MB/s
 SSE2 2-pass copy                                     :   7092.8 MB/s    4355.2 MB/s
 SSE2 2-pass copy prefetched (32 bytes step)          :   7001.4 MB/s    4585.1 MB/s
 SSE2 2-pass copy prefetched (64 bytes step)          :   7055.1 MB/s    4557.9 MB/s
 SSE2 2-pass nontemporal copy                         :   5043.2 MB/s    3263.3 MB/s
 SSE2 fill                                            :  14087.3 MB/s   10947.1 MB/s
 SSE2 nontemporal fill                                :  33134.5 MB/s   32774.3 MB/s

이 테스트에서 L2 스 트리머를 갖는 것은 결코 느리지 않으며 종종 거의 두 배 빠릅니다.

일반적으로 결과에 다음 패턴이 나타날 수 있습니다.

사본은 일반적으로 채우기보다 영향을 많이받는 것 같습니다.
standard memset및 STOSB fill(이 플랫폼에서 동일한 것으로 요약 ) 영향이 가장 적으며 프리 페치 된 결과가없는 것보다 몇 % 빠릅니다.
표준 memcpy은 아마도 여기에서 32 바이트 AVX 명령어를 사용하는 유일한 사본이며 사본의 영향을 가장 적게받는 것입니다.

또한 다른 세 프리 페처를 켜고 끄려고 시도했지만 일반적으로이 벤치 마크에는 거의 영향을 미치지 않았습니다.