2017-04-03 118 views


當不使用PACKET_MMAP時,在調用sendto時,將一個幀從用戶地址存儲器中的緩衝區複製到內核內存中的SK buf,然後內核必須將數據包複製到NIC訪問的存儲器中,向NIC發信號以將該幀DMA存入其自己的硬件緩衝區並將其排隊以供傳輸。當使用PACKET_MMAP套接字選項時,映射的內存由應用程序分配並鏈接到原始套接字。應用程序將數據包放入mmap緩衝區,調用sendto,此時內核不必再將數據包複製到SK buf中,而是可以直接從mmapped緩衝區讀取它們。也可以從環形緩衝區中讀取分組的「塊」,而不是單獨的分組/幀。因此,性能的提升來自兩方面:一次系統調用即可提交多個幀,並且每個幀進入NIC硬件緩衝區時少一次複製操作。



[email protected]:~/C/etherate10+$ sudo taskset -c 1 ./etherate_mt -I 1 
Using inteface lo (1) 
Running in Tx mode 
1. Rx Gbps 0.00 (0) pps 0 Tx Gbps 17.65 (2206128128) pps 1457152 
2. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.08 (2385579520) pps 1575680 
3. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.28 (2409609728) pps 1591552 
4. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.31 (2414260736) pps 1594624 
5. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.30 (2411935232) pps 1593088 


[email protected]:~/C/etherate10+$ sudo taskset -c 1 ./etherate_mt -I 1 
Using inteface lo (1) 
Running in Tx mode 
1. Rx Gbps 0.00 (0) pps 0 Tx Gbps 18.44 (2305001412) pps 1522458 
2. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.30 (2537520018) pps 1676037 
3. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.29 (2535744096) pps 1674864 
4. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.26 (2533014354) pps 1673061 
5. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.32 (2539476106) pps 1677329 


/*
 * Tx worker that transmits through a PACKET_MMAP Tx ring (TPACKET_V2).
 *
 * thd_opt_p points to this thread's struct thd_opt. The ring is created by
 * setup_socket_mmap(); the process exits if that fails. The worker then
 * loops forever: it copies thd_opt->tx_buffer into every frame of the ring,
 * marks each frame TP_STATUS_SEND_REQUEST and flushes the whole ring with a
 * single sendto() call, updating thd_opt->tx_pkts / thd_opt->tx_bytes.
 *
 * Never returns in practice; the trailing return only satisfies the
 * thread-entry signature.
 */
void *packet_tx_mmap(void* thd_opt_p) {

    struct thd_opt *thd_opt = thd_opt_p;
    int32_t sock_fd = setup_socket_mmap(thd_opt_p);
    if (sock_fd == EXIT_FAILURE) exit(EXIT_FAILURE);

    /*
     * In a Tx ring the kernel reads the frame payload starting at
     * TPACKET2_HDRLEN - sizeof(struct sockaddr_ll) bytes from the start of
     * the frame slot. The offset must be applied as a BYTE offset: adding
     * it to a struct tpacket2_hdr pointer would scale it by the struct size
     * and place the payload where the kernel never looks.
     */
    const size_t data_off = TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

    struct tpacket2_hdr *hdr;
    uint8_t *data;
    int32_t send_ret = 0;
    uint32_t i; /* tp_frame_nr is an unsigned int; a 16-bit index could wrap and never terminate */

    while(1) {

        /* Fill every slot in the ring and hand it to the kernel. */
        for (i = 0; i < thd_opt->tpacket_req.tp_frame_nr; i += 1) {

            hdr = (void*)(thd_opt->mmap_buf + (thd_opt->tpacket_req.tp_frame_size * i));
            data = (uint8_t*)hdr + data_off;

            memcpy(data, thd_opt->tx_buffer, thd_opt->frame_size);
            hdr->tp_len = thd_opt->frame_size;
            hdr->tp_status = TP_STATUS_SEND_REQUEST;

        }

        /* One system call queues every frame marked TP_STATUS_SEND_REQUEST. */
        send_ret = sendto(sock_fd, NULL, 0, 0, NULL, 0);
        if (send_ret == -1) {
            perror("sendto error");
            exit(EXIT_FAILURE);
        }

        thd_opt->tx_pkts += thd_opt->tpacket_req.tp_frame_nr;
        thd_opt->tx_bytes += send_ret;

    }

    return NULL;

}



/*
 * Tx worker that transmits one frame per sendto() call (no PACKET_MMAP).
 *
 * thd_opt_p points to this thread's struct thd_opt. The socket comes from
 * setup_socket(); the process exits if it cannot be created. The worker
 * then sends thd_opt->tx_buffer in an endless loop, accounting each
 * successful send in thd_opt->tx_bytes / thd_opt->tx_pkts.
 *
 * Never returns in practice; the trailing return only satisfies the
 * thread-entry signature.
 */
void *packet_tx(void* thd_opt_p) {

    struct thd_opt *thd_opt = thd_opt_p;

    int32_t sock_fd = setup_socket(thd_opt_p);

    if (sock_fd == EXIT_FAILURE) {
        printf("Can't create socket!\n");
        exit(EXIT_FAILURE);
    }

    while(1) {

        ssize_t sent = sendto(sock_fd, thd_opt->tx_buffer,
                              thd_opt->frame_size, 0,
                              (struct sockaddr*)&thd_opt->bind_addr,
                              sizeof(thd_opt->bind_addr));

        /* Do not fold -1 into the byte counter or count a frame that was
         * never sent; fail the same way packet_tx_mmap() does. */
        if (sent == -1) {
            perror("sendto error");
            exit(EXIT_FAILURE);
        }

        thd_opt->tx_bytes += sent;
        thd_opt->tx_pkts += 1;

    }

    return NULL;

}




// Set the TPACKET version, v2 for Tx and v3 for Rx 
// (v2 supports packet level send(), v3 supports block level read()) 
int32_t sock_pkt_ver = -1; 

if(thd_opt->sk_mode == SKT_TX) { 
    static const int32_t sock_ver = TPACKET_V2; 
    sock_pkt_ver = setsockopt(sock_fd, SOL_PACKET, PACKET_VERSION, &sock_ver, sizeof(sock_ver)); 
} else { 
    static const int32_t sock_ver = TPACKET_V3; 
    sock_pkt_ver = setsockopt(sock_fd, SOL_PACKET, PACKET_VERSION, &sock_ver, sizeof(sock_ver)); 

if (sock_pkt_ver < 0) { 
    perror("Can't set socket packet version"); 
    return EXIT_FAILURE; 

memset(&thd_opt->tpacket_req, 0, sizeof(struct tpacket_req)); 
memset(&thd_opt->tpacket_req3, 0, sizeof(struct tpacket_req3)); 

//thd_opt->block_sz = 4096; // These are set else where 
//thd_opt->block_nr = 256; 
//thd_opt->block_frame_sz = 4096; 

int32_t sock_mmap_ring = -1; 
if (thd_opt->sk_mode == SKT_TX) { 

    thd_opt->tpacket_req.tp_block_size = thd_opt->block_sz; 
    thd_opt->tpacket_req.tp_frame_size = thd_opt->block_sz; 
    thd_opt->tpacket_req.tp_block_nr = thd_opt->block_nr; 
    // Allocate per-frame blocks in Tx mode (TPACKET_V2) 
    thd_opt->tpacket_req.tp_frame_nr = thd_opt->block_nr; 

    sock_mmap_ring = setsockopt(sock_fd, SOL_PACKET , PACKET_TX_RING , (void*)&thd_opt->tpacket_req , sizeof(struct tpacket_req)); 

} else { 

    thd_opt->tpacket_req3.tp_block_size = thd_opt->block_sz; 
    thd_opt->tpacket_req3.tp_frame_size = thd_opt->block_frame_sz; 
    thd_opt->tpacket_req3.tp_block_nr = thd_opt->block_nr; 
    thd_opt->tpacket_req3.tp_frame_nr = (thd_opt->block_sz * thd_opt->block_nr)/thd_opt->block_frame_sz; 
    thd_opt->tpacket_req3.tp_retire_blk_tov = 1; 
    thd_opt->tpacket_req3.tp_feature_req_word = 0; 

    sock_mmap_ring = setsockopt(sock_fd, SOL_PACKET , PACKET_RX_RING , (void*)&thd_opt->tpacket_req3 , sizeof(thd_opt->tpacket_req3)); 

if (sock_mmap_ring == -1) { 
    perror("Can't enable Tx/Rx ring for socket"); 
    return EXIT_FAILURE; 

thd_opt->mmap_buf = NULL; 
thd_opt->rd = NULL; 

if (thd_opt->sk_mode == SKT_TX) { 

    thd_opt->mmap_buf = mmap(NULL, (thd_opt->block_sz * thd_opt->block_nr), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock_fd, 0); 

    if (thd_opt->mmap_buf == MAP_FAILED) { 
     perror("mmap failed"); 
     return EXIT_FAILURE; 

} else { 

    thd_opt->mmap_buf = mmap(NULL, (thd_opt->block_sz * thd_opt->block_nr), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock_fd, 0); 

    if (thd_opt->mmap_buf == MAP_FAILED) { 
     perror("mmap failed"); 
     return EXIT_FAILURE; 

    // Per bock rings in Rx mode (TPACKET_V3) 
    thd_opt->rd = (struct iovec*)calloc(thd_opt->tpacket_req3.tp_block_nr * sizeof(struct iovec), 1); 

    for (uint16_t i = 0; i < thd_opt->tpacket_req3.tp_block_nr; ++i) { 
     thd_opt->rd[i].iov_base = thd_opt->mmap_buf + (i * thd_opt->tpacket_req3.tp_block_size); 
     thd_opt->rd[i].iov_len = thd_opt->tpacket_req3.tp_block_size; 


更新1:結果針對物理接口(一個或多個) 有人提到,使用PACKET_MMAP時可能沒有看到性能差異的原因之一是我將流量發送到回送接口(其中,一方面,沒有QDISC)。由於運行packet_tx_mmap()或packet_tx()例程中的任何一個都可以生成超過10Gbps的數據,而我只有10Gbps的接口可用,所以我將兩個接口綁定(bond)在一起。這些結果與上面顯示的幾乎相同,兩個函數之間的速度差異很小:

packet_tx()到20G bond0

  • 1線程:平均10.77Gbps〜/ 889kfps〜
  • 2個線程:平均19.19Gbps〜/ 1.58Mfps〜
  • 3線程:平均19.67Gbps〜/ 1.62Mfps〜(這是bond所能達到的最快速度)

packet_tx_mmap()到20G bond0:

  • 1線程:平均11.08Gbps〜/ 913kfps〜
  • 2個線程:平均19.0Gbps〜/ 1.57Mfps〜
  • 3線程:平均19.66Gbps〜/ 1.62Mfps〜(這是bond所能達到的最快速度)


在所有上述測試中,軟IRQ的數量大致相同(使用this script測量)。在一個線程運行packet_tx()的情況下,CPU內核每秒鐘有大約40k個中斷。有2個和3個線程運行時,2個和3個核心上分別各有約40k個。使用packet_tx_mmap()時的結果相同:一個CPU內核上的單線程大約有40k個軟IRQ,運行2個和3個線程時每個核心也是40k。




你的網絡有多快?你的尺寸有多大?你可能只是讓你的鏈接飽和?你有沒有檢查實際(自動協商)比特率? – maxy


幀的大小是1514個八位字節(含頭部),我將流量發送到回送接口lo,如輸出中所示。我正在將流量發送到回送接口,以消除網卡問題。 – jwbensley


我的理解是,因爲'packet_tx_mmap'函數應該與內核共享一個緩衝區,這意味着多個數據包在一個sendto()'系統調用中從userland複製到kernelland,所以向loopback接口發送流量意味着我們正在測試並且不用擔心將數據包DMA傳給NIC,這對於packet_tx和packet_tx_mmap都是相同的過程,因爲這是進一步在內核堆棧之下。 – jwbensley










爲了找到相關的內核代碼,有幾個關鍵字可以搜索,但PACKET_TX_RING脫穎而出作爲此功能特有的套接字選項。在interwebs上搜索「PACKET_TX_RING linux交叉引用」會出現少量引用,其中包括af_packet.c,其中有一點檢查似乎是執行所有AF_PACKET功能,包括數據包mmap。


一個非常強大的工具,用於從內核中獲取這樣的信息是SystemTap。 (使用這需要你的內核安裝調試符號。我碰巧使用Ubuntu和this是獲得了SystemTap在Ubuntu上工作的祕訣。)


$ sudo stap -e 'probe kernel.function("tpacket_snd") { printf("W00T!\n"); }' & 
[1] 19961 
$ sudo ./packet_mmap -c 1 eth0 
data offset = 32 bytes 
start fill() thread 
send 1 packets (+150 bytes) 
end of task fill() 
Loop until queue empty (0) 
END (number of error:0) 





# This is specific to net/packet/af_packet.c 3.13.0-116 

# Print the current wall-clock time as a "[seconds.microseconds] " prefix.
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}

# 325 static void __packet_set_status(struct packet_sock *po, void *frame, int status) 
# 326 { 
# 327 union tpacket_uhdr h; 
# 328 
# 329 h.raw = frame; 
# 330 switch (po->tp_version) { 
# 331 case TPACKET_V1: 
# 332  h.h1->tp_status = status; 
# 333  flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 
# 334  break; 
# 335 case TPACKET_V2: 
# 336  h.h2->tp_status = status; 
# 337  flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 
# 338  break; 
# 339 case TPACKET_V3: 
# 340 default: 
# 341  WARN(1, "TPACKET version not supported.\n"); 
# 342  BUG(); 
# 343 } 
# 344 
# 345 smp_wmb(); 
# 346 } 

# Report every status value written to a ring frame; the probes sit on the
# `break` statements that follow the V1 and V2 stores.
probe kernel.statement("__packet_set_status@net/packet/af_packet.c:334") {
    print_ts();
    printf("SET(V1): %d (0x%.16x)\n", $status, $frame);
}

probe kernel.statement("__packet_set_status@net/packet/af_packet.c:338") {
    print_ts();
    printf("SET(V2): %d\n", $status);
}

# 348 static int __packet_get_status(struct packet_sock *po, void *frame) 
# 349 { 
# 350 union tpacket_uhdr h; 
# 351 
# 352 smp_rmb(); 
# 353 
# 354 h.raw = frame; 
# 355 switch (po->tp_version) { 
# 356 case TPACKET_V1: 
# 357  flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 
# 358  return h.h1->tp_status; 
# 359 case TPACKET_V2: 
# 360  flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 
# 361  return h.h2->tp_status; 
# 362 case TPACKET_V3: 
# 363 default: 
# 364  WARN(1, "TPACKET version not supported.\n"); 
# 365  BUG(); 
# 366  return 0; 
# 367 } 
# 368 } 

# Report every status value read from a ring frame; the probes sit on the
# V1 and V2 `return` statements.
probe kernel.statement("__packet_get_status@net/packet/af_packet.c:358") {
    print_ts();
    printf("GET(V1): %d (0x%.16x)\n", $h->h1->tp_status, $frame);
}

probe kernel.statement("__packet_get_status@net/packet/af_packet.c:361") {
    print_ts();
    printf("GET(V2): %d\n", $h->h2->tp_status);
}

# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 
# 2089 { 
# [...] 
# 2136 do { 
# 2137  ph = packet_current_frame(po, &po->tx_ring, 
# 2139 
# 2140  if (unlikely(ph == NULL)) { 
# 2141   schedule(); 
# 2142   continue; 
# 2143  } 
# 2144 
# 2145  status = TP_STATUS_SEND_REQUEST; 
# 2146  hlen = LL_RESERVED_SPACE(dev); 
# 2147  tlen = dev->needed_tailroom; 
# 2148  skb = sock_alloc_send_skb(&po->sk, 
# 2149    hlen + tlen + sizeof(struct sockaddr_ll), 
# 2150    0, &err); 
# 2151 
# 2152  if (unlikely(skb == NULL)) 
# 2153   goto out_status; 
# 2154 
# 2155  tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, 
# 2156     addr, hlen); 
# [...] 
# 2176  skb->destructor = tpacket_destruct_skb; 
# 2177  __packet_set_status(po, ph, TP_STATUS_SENDING); 
# 2178  atomic_inc(&po->tx_ring.pending); 
# 2179 
# 2180  status = TP_STATUS_SEND_REQUEST; 
# 2181  err = dev_queue_xmit(skb); 
# 2182  if (unlikely(err > 0)) { 
# [...] 
# 2195  } 
# 2196  packet_increment_head(&po->tx_ring); 
# 2197  len_sum += tp_len; 
# 2198 } while (likely((ph != NULL) || 
# 2199   ((!(msg->msg_flags & MSG_DONTWAIT)) && 
# 2200   (atomic_read(&po->tx_ring.pending)))) 
# 2201  ); 
# 2202 
# [...] 
# 2213 return err; 
# 2214 } 

# Trace tpacket_snd: entry, the key points inside its do { ... } while loop,
# and the final return value.
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

# Frame returned by packet_current_frame(); NULL when nothing is queued.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2140") {
    print_ts();
    printf("tpacket_snd:2140: current frame ph = 0x%.16x\n", $ph);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2141") {
    print_ts();
    printf("tpacket_snd:2141: (ph==NULL) --> schedule()\n");
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2142") {
    print_ts();
    printf("tpacket_snd:2142: flags 0x%x, pending %d\n",
    $msg->msg_flags, $po->tx_ring->pending->counter);
}

# Reached once per frame that has been handed to dev_queue_xmit().
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2197") {
    print_ts();
    printf("tpacket_snd:2197: flags 0x%x, pending %d\n",
    $msg->msg_flags, $po->tx_ring->pending->counter);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d)\n", $err);
}

# 1946 static void tpacket_destruct_skb(struct sk_buff *skb) 
# 1947 { 
# 1948 struct packet_sock *po = pkt_sk(skb->sk); 
# 1949 void *ph; 
# 1950 
# 1951 if (likely(po->tx_ring.pg_vec)) { 
# 1952  __u32 ts; 
# 1953 
# 1954  ph = skb_shinfo(skb)->destructor_arg; 
# 1955  BUG_ON(atomic_read(&po->tx_ring.pending) == 0); 
# 1956  atomic_dec(&po->tx_ring.pending); 
# 1957 
# 1958  ts = __packet_set_timestamp(po, ph, skb); 
# 1959  __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); 
# 1960 } 
# 1961 
# 1962 sock_wfree(skb); 
# 1963 } 

# Fires when the skb destructor hands a frame back to user space
# (TP_STATUS_AVAILABLE), after the pending counter has been decremented.
probe kernel.statement("tpacket_destruct_skb@net/packet/af_packet.c:1959") {
    print_ts();
    printf("tpacket_destruct_skb:1959: ph = 0x%.16x, ts = 0x%x, pending %d\n",
    $ph, $ts, $po->tx_ring->pending->counter);
}


首先我們定義探測器,當tx_ring中的數據包設置或讀取狀態時,打印出信息。接下來我們爲tpacket_snd的回調函數和do {...} while (...)迴路中的點定義探測器,處理tx_ring中的數據包。最後我們向skb析構函數添加一個探測器。

我們可以用sudo stap status.stp啓動SystemTap腳本。然後運行sudo packet_mmap -c 2 <interface>通過接口發送2幀。下面是我從SystemTap的腳本得到的輸出:

[1492581245.839850] tpacket_snd: args(po=0xffff88016720ee38 msg=0x14) 
[1492581245.839865] GET(V1): 1 (0xffff880241202000) 
[1492581245.839873] tpacket_snd:2140: current frame ph = 0xffff880241202000 
[1492581245.839887] SET(V1): 2 (0xffff880241202000) 
[1492581245.839918] tpacket_snd:2197: flags 0x40, pending 1 
[1492581245.839923] GET(V1): 1 (0xffff88013499c000) 
[1492581245.839929] tpacket_snd:2140: current frame ph = 0xffff88013499c000 
[1492581245.839935] SET(V1): 2 (0xffff88013499c000) 
[1492581245.839946] tpacket_snd:2197: flags 0x40, pending 2 
[1492581245.839951] GET(V1): 0 (0xffff88013499e000) 
[1492581245.839957] tpacket_snd:2140: current frame ph = 0x0000000000000000 
[1492581245.839961] tpacket_snd:2141: (ph==NULL) --> schedule() 
[1492581245.839977] tpacket_snd:2142: flags 0x40, pending 2 
[1492581245.839984] tpacket_snd: return(300) 
[1492581245.840077] tpacket_snd: args(po=0x0 msg=0x14) 
[1492581245.840089] GET(V1): 0 (0xffff88013499e000) 
[1492581245.840098] tpacket_snd:2140: current frame ph = 0x0000000000000000 
[1492581245.840093] tpacket_destruct_skb:1959: ph = 0xffff880241202000, ts = 0x0, pending 1 
[1492581245.840102] tpacket_snd:2141: (ph==NULL) --> schedule() 
[1492581245.840104] SET(V1): 0 (0xffff880241202000) 
[1492581245.840112] tpacket_snd:2142: flags 0x40, pending 1 
[1492581245.840116] tpacket_destruct_skb:1959: ph = 0xffff88013499c000, ts = 0x0, pending 0 
[1492581245.840119] tpacket_snd: return(0) 
[1492581245.840123] SET(V1): 0 (0xffff88013499c000) 


network capture of first run of packet_mmap

有很多在SystemTap中輸出的有用信息。我們可以看到tpacket_snd獲得了環中第一幀的狀態(TP_STATUS_SEND_REQUEST爲1),然後將其設置爲TP_STATUS_SENDING(2)。它與第二個一樣。下一幀的狀態爲TP_STATUS_AVAILABLE(0),它不是發送請求,所以它調用schedule()來產生並繼續循環。由於沒有更多的幀要發送(ph==NULL)並且已經請求了非阻塞(msg->msg_flags ==MSG_DONTWAIT),所以do {...} while (...)循環終止,並且tpacket_snd返回300,排隊傳輸的字節數。

接下來,packet_mmap再次調用sendto(通過「循環直到隊列空」代碼),但在tx環中沒有更多數據要發送,並且請求非阻塞,因此它立即返回0,因爲沒有數據被排隊。請注意,它檢查狀態的幀與上次調用時最後檢查的幀相同 - 它不是從tx環中的第一個幀開始,而是檢查head(在用戶空間中不可用)所指向的幀。

異步調用析構函數,首先在第一幀上,將幀的狀態設置爲TP_STATUS_AVAILABLE,然後遞減未決計數,然後在第二幀上調用。請注意,如果未請求非阻塞,則在do {...} while (...)循環結束時的測試將等到所有未完成的數據包在返回之前已傳輸到NIC(假設它支持分散的數據)。您可以通過運行packet_mmap並使用-t選項來觀察此選項,該選項用於使用阻塞I/O的「線程」(直到它進入「循環直到隊列爲空」)。

有幾件事要注意。首先,SystemTap輸出上的時間戳不會增加:從SystemTap輸出推斷時間順序並不安全。其次,請注意網絡捕獲(本地完成)上的時間戳是不同的。 FWIW,這個界面在便宜的塔式電腦中是廉價的1G。

所以在這一點上,我想我們或多或少知道af_packet是如何處理共享tx環的。接下來的內容是tx環中的幀如何到達網絡接口。查看linux網絡內核中的控制流的overview中的this section(處理層2傳輸的方式)可能會有幫助。 OK,所以如果你對2層傳輸的處理有一個基本的瞭解,看起來這個包mmap接口應該是一個巨大的消防水帶;加載一個帶有數據包的共享tx環,以MSG_DONTWAIT調用sendto(),然後tpacket_snd將遍歷tx環、創建skb並將它們排入qdisc。異步地,skb將從qdisc中出隊併發送到硬件tx環。 skb應該是non-linear,所以他們會引用tx環中的數據而不是複製,而一個不錯的現代NIC應該能夠處理分散的數據並引用tx環中的數據。當然,這些假設中的任何一個都可能是錯誤的,所以我們試着用這種消防水帶向qdisc猛灌數據。


這樣做的第一個問題是默認qdisc pfifo_fast不保留統計信息。所以讓我們用qdisc pfifo替換它。默認pfifo將隊列限制爲TXQUEUELEN幀(通常默認爲1000)。但是,因爲我們想演示壓垮qdisc的情形,讓我們明確地將其設置爲50:

$ sudo tc qdisc add dev eth0 root pfifo limit 50 
$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8004: root refcnt 2 limit 50p 
Sent 42 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) 
backlog 0b 0p requeues 0 


# This is specific to net/packet/af_packet.c 3.13.0-116 

# Print the current wall-clock time as a "[seconds.microseconds] " prefix.
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}

# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 
# 2089 { 
# [...] 
# 2213 return err; 
# 2214 } 

# Timestamp the entry to and the return from tpacket_snd so the time spent
# inside a single call can be read off the log.
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d)\n", $err);
}

用sudo stap call-return.stp啓動SystemTap腳本,然後向容量僅有50幀的qdisc猛灌8096個1500字節的幀:

$ sudo ./packet_mmap -c 8096 -s 1500 eth0 
data offset = 32 bytes 
start fill() thread 
send 8096 packets (+12144000 bytes) 
end of task fill() 
Loop until queue empty (0) 
END (number of error:0) 


$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8004: root refcnt 2 limit 50p 
Sent 25755333 bytes 8606 pkt (dropped 1, overlimits 0 requeues 265) 
backlog 0b 0p requeues 265 


[1492603552.938414] tpacket_snd: args(po=0xffff8801673ba338 msg=0x14) 
[1492603553.036601] tpacket_snd: return(12144000) 
[1492603553.036706] tpacket_snd: args(po=0x0 msg=0x14) 
[1492603553.036716] tpacket_snd: return(0) 

WAT?在tpacket_snd中處理8096幀花了將近100ms?讓我們來看看實際傳輸需要多長時間:8096幀 × 1500字節/幀,在1千兆比特/秒下約爲97毫秒。WAT?它聞起來像是有什麼東西阻塞了。

讓我們仔細看看tpacket_snd。 Groan:

skb = sock_alloc_send_skb(&po->sk, 
       hlen + tlen + sizeof(struct sockaddr_ll), 
       0, &err); 

0看起來非常無害,但實際上這是noblock參數。它應該是msg->msg_flags & MSG_DONTWAIT(原來這是fixed in 4.1)。這裏發生的事情是qdisc的大小不是唯一的限制性資源。如果爲skb分配空間將超出套接字sndbuf限制的大小,則此調用將阻止等待skb被釋放,或者將-EAGAIN返回給非阻塞調用方。在V4.1的修復中,如果請求非阻塞,它將返回寫入的字節數,如果非零,否則-EAGAIN給調用者,這幾乎看起來像某人不希望你弄清楚如何使用這個(例如你填寫一個80MB數據的tx環,調用sendto與MSG_DONTWAIT,你會得到一個結果,你發送150KB而不是EWOULDBLOCK)。所以如果你運行4.1之前的內核(我相信OP運行> 4.1並且不受這個bug的影響),你需要打補丁af_packet.c並構建一個新的內核或者升級到內核4.1或者更好。


# This is specific to net/packet/af_packet.c 3.13.0-116 

# Print the current wall-clock time as a "[seconds.microseconds] " prefix.
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}

# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 
# 2089 { 
# [...] 
# 2133 if (size_max > dev->mtu + reserve + VLAN_HLEN) 
# 2134  size_max = dev->mtu + reserve + VLAN_HLEN; 
# 2135 
# 2136 do { 
# [...] 
# 2148  skb = sock_alloc_send_skb(&po->sk, 
# 2149    hlen + tlen + sizeof(struct sockaddr_ll), 
# 2150    msg->msg_flags & MSG_DONTWAIT, &err); 
# 2151 
# 2152  if (unlikely(skb == NULL)) 
# 2153   goto out_status; 
# [...] 
# 2181  err = dev_queue_xmit(skb); 
# 2182  if (unlikely(err > 0)) { 
# 2183   err = net_xmit_errno(err); 
# 2184   if (err && __packet_get_status(po, ph) == 
# 2185     TP_STATUS_AVAILABLE) { 
# 2186    /* skb was destructed already */ 
# 2187    skb = NULL; 
# 2188    goto out_status; 
# 2189   } 
# 2190   /* 
# 2191   * skb was dropped but not destructed yet; 
# 2192   * let's treat it like congestion or err < 0 
# 2193   */ 
# 2194   err = 0; 
# 2195  } 
# 2196  packet_increment_head(&po->tx_ring); 
# 2197  len_sum += tp_len; 
# 2198 } while (likely((ph != NULL) || 
# 2199   ((!(msg->msg_flags & MSG_DONTWAIT)) && 
# 2200   (atomic_read(&po->tx_ring.pending)))) 
# 2201  ); 
# [...] 
# 2213 return err; 
# 2214 } 

# Trace tpacket_snd on the patched kernel: entry, send-buffer accounting,
# skb allocation failures, dev_queue_xmit errors and the return value.
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

# Send-buffer limit and current usage before the send loop starts.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2133") {
    print_ts();
    printf("tpacket_snd:2133: sk_sndbuf = %d sk_wmem_alloc = %d\n",
    $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

# Only reached when sock_alloc_send_skb() returned NULL.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2153") {
    print_ts();
    printf("tpacket_snd:2153: sock_alloc_send_skb err = %d, sk_sndbuf = %d sk_wmem_alloc = %d\n",
    $err, $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

# Log dev_queue_xmit() results only when they are non-zero.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2182") {
    if ($err != 0) {
        print_ts();
        printf("tpacket_snd:2182: dev_queue_xmit err = %d\n", $err);
    }
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2187") {
    print_ts();
    printf("tpacket_snd:2187: destructed: net_xmit_errno = %d\n", $err);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2194") {
    print_ts();
    printf("tpacket_snd:2194: *NOT* destructed: net_xmit_errno = %d\n", $err);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d) sk_sndbuf = %d sk_wmem_alloc = %d\n",
    $err, $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}


$ sudo tc qdisc add dev eth0 root pfifo limit 50 
$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8001: root refcnt 2 limit 50p 
Sent 2154 bytes 21 pkt (dropped 0, overlimits 0 requeues 0) 
backlog 0b 0p requeues 0 
$ sudo ./packet_mmap -c 200 -s 1500 eth0 
c_sndbuf_sz:  1228800 
data offset = 32 bytes 
send buff size = 1228800 
got buff size = 425984 
buff size smaller than desired, trying to force... 
got buff size = 2457600 
start fill() thread 
send: No buffer space available 
end of task fill() 
send: No buffer space available 
Loop until queue empty (-1) 
[repeated another 17 times] 
send 3 packets (+4500 bytes) 
Loop until queue empty (4500) 
Loop until queue empty (0) 
END (number of error:0) 
$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8001: root refcnt 2 limit 50p 
Sent 452850 bytes 335 pkt (dropped 19, overlimits 0 requeues 3) 
backlog 0b 0p requeues 3 


[1492759330.907151] tpacket_snd: args(po=0xffff880393246c38 msg=0x14) 
[1492759330.907162] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 1 
[1492759330.907491] tpacket_snd:2182: dev_queue_xmit err = 1 
[1492759330.907494] tpacket_snd:2187: destructed: net_xmit_errno = -105 
[1492759330.907500] tpacket_snd: return(-105) sk_sndbuf = 2457600 sk_wmem_alloc = 218639 
[1492759330.907646] tpacket_snd: args(po=0x0 msg=0x14) 
[1492759330.907653] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 189337 
[1492759330.907688] tpacket_snd:2182: dev_queue_xmit err = 1 
[1492759330.907691] tpacket_snd:2187: destructed: net_xmit_errno = -105 
[1492759330.907694] tpacket_snd: return(-105) sk_sndbuf = 2457600 sk_wmem_alloc = 189337 
[repeated 17 times] 
[1492759330.908541] tpacket_snd: args(po=0x0 msg=0x14) 
[1492759330.908543] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 189337 
[1492759330.908554] tpacket_snd: return(4500) sk_sndbuf = 2457600 sk_wmem_alloc = 196099 
[1492759330.908570] tpacket_snd: args(po=0x0 msg=0x14) 
[1492759330.908572] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 196099 
[1492759330.908576] tpacket_snd: return(0) sk_sndbuf = 2457600 sk_wmem_alloc = 196099 



無論如何,即使我們沒有一個完整的解決方案來有效地防止NIC餓死,我們現在已經知道了足以回答OP問題的方法。根據我們所瞭解的情況,我們知道當OP在阻塞模式下用一個tx環呼叫sendto時,tpacket_snd將啓動將skbs排入qdisc,直到超過sndbuf限制(並且默認通常很小,約213K ,並且進一步,我發現在共享tx環中引用的幀數據會被計數到這個時間),當它阻塞時(仍然持有pg_vec_lock)。隨着skb的釋放,更多的幀將被排隊,也許sndbuf會再次超出,我們將再次阻止。最終,所有的數據都將被排隊等待到qdisc,但是tpacket_snd將繼續阻塞,直到所有的幀都被髮送出去爲止(你不能將tx環中的幀標記爲可用,直到網卡已經收到它爲止驅動器環中的skb引用tx環中的幀),同時仍然保持pg_vec_lock。在這一點上,NIC被餓死,其他任何套接字編寫器都被鎖定。另一方面,當OP每次發佈一個數據包時,它將被packet_snd處理,如果在sndbuf中沒有空間,然後將該幀排入qdisc並立即返回,將會被阻止。它不等待幀被傳輸。當qdisc被排空時,可以將其他幀排入隊列。如果發行商能夠跟上,那麼NIC永遠不會餓死。



感謝您的所有幫助,我現在正在使用調試符號和未優化版本的'af_packet.c'構建一個內核。當我們等待時,只是一些食物而已。我沒有在我的應用程序中使用'MSG_DONTWAIT'標誌。我沒有試圖使用非阻塞調用,所以使用'sock_alloc_send_skb(x,x,0,x)'得到了上面的代碼片段 - 即使我們發現第三個參數沒有被正確傳遞的錯誤,無論如何都應該是零? – jwbensley


另外我有一個4.4.x的內核,但只要你有> = 3.14,如果你看看我已經放在Github上的代碼,有一個套接字選項來繞過QDISC層,並跨越傳輸。一旦我構建了我的調試內核,我將爲'packet_mmap.c'測試程序添加相同的套接字選項以查看其效果:'int bypass = 1;''int ret = setsockopt(sock_fd,SOL_PACKET,PACKET_QDISC_BYPASS,&bypass,sizeof(bypass));' – jwbensley


我的內核仍在編譯,但從快速掃描'af_packet.c'我期待在L2695'err = po-> xmit(skb);'指向'packet_direct_xmit()',而不是'dev_queue_xmit()',參見'af_packet.c'中的L3751。我在想[this](http://lxr.free-electrons.com/source/net/packet/af_packet.c#L3751)指向[this](http://lxr.free-electrons.com/source/net/packet/af_packet.c#L250),它指向[this](http://lxr.free-electrons.com/source/include/linux/netdevice.h#L3970),它指向[this](http://lxr.free-electrons.com/source/drivers/net/ethernet/intel/igb/igb_main.c#L2143)。 – jwbensley