Try   HackMD

Linux 核心專題筆記

eBPF TCP 伺服器

eBPF

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

  • 在使用者層級將程式碼編譯成 BPF bytecode
  • 透過 loader 載入到核心層級進行驗證、執行
  • 結束後有兩種輸出方式:perf-event data 與 eBPF map

perf-event data 又稱Perf Event Buffer

  • map型態BPF_MAP_TYPE_PERF_EVENT_ARRAY
  • 每個CPU維護一個自己的Buffer
  • 使用的bpf helper
long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)

bpf CO-RE

主要目標是解決不同核心版本下使用編譯過的bpf程式
透過BPF Type Format (BTF)定位現在核心版本下資料結構的offset
可以想像成提供一個虛擬索引去轉換不同核心版本下的實際位置

參考資料

eBPF 函式庫

BPF 函數定義

BPF_CALL_0 ~ BPF_CALL_5
定義BPF可以呼叫的函式
核心程式碼位置
例子:bpf_map_update_elem實作

/*
 * bpf_map_update_elem helper: forwards the update request to the
 * map_update_elem operation of the map's own ops table.
 *
 * Warns once when none of the three RCU read-side checks (plain, trace, bh)
 * reports a held lock, then returns whatever map->ops->map_update_elem()
 * returns for (map, key, value, flags).
 */
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
	   void *, value, u64, flags)
{
	WARN_ON_ONCE(!(rcu_read_lock_held() ||
		       rcu_read_lock_trace_held() ||
		       rcu_read_lock_bh_held()));

	return map->ops->map_update_elem(map, key, value, flags);
}

BPF Maps

Map 類型: 參見

透過bpf syscall 存取

int map_fd; union bpf_attr attr = { .map_type = BPF_MAP_TYPE_ARRAY; .key_size = sizeof(__u32); .value_size = sizeof(__u32); .max_entries = 256; }; map_fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));

靜態產生

/*
 * BPF_MAP - declare a map definition named _name in the ".maps" section.
 *
 * _type        map type constant (e.g. BPF_MAP_TYPE_ARRAY)
 * _key_type    C type of the key; only its size is recorded
 * _value_type  C type of the value; only its size is recorded
 * _max_entries maximum number of entries
 *
 * The expansion already ends with ';', so a trailing ';' at the use site is
 * optional.
 */
#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \
	struct { \
		__uint(type, _type); \
		__uint(key_size, sizeof(_key_type)); \
		__uint(value_size, sizeof(_value_type)); \
		__uint(max_entries, _max_entries); \
	} _name SEC(".maps");

/*
 * BPF_PERF_OUTPUT - declare a BPF_MAP_TYPE_PERF_EVENT_ARRAY map named _name
 * with int-sized keys and values and 2048 entries.
 *
 * No extra ';' is appended here: BPF_MAP already terminates the declaration,
 * and a second one would leave a stray empty declaration at file scope.
 */
#define BPF_PERF_OUTPUT(_name) \
	BPF_MAP(_name, BPF_MAP_TYPE_PERF_EVENT_ARRAY, int, int, 2048)

Environment

kernel config

cat /boot/config-6.11.0-17-generic | grep -i bpf
CONFIG_BPF=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y
# BPF subsystem
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_BPF_UNPRIV_DEFAULT_OFF=y
# CONFIG_BPF_PRELOAD is not set
CONFIG_BPF_LSM=y
# end of BPF subsystem
CONFIG_CGROUP_BPF=y
CONFIG_IPV6_SEG6_BPF=y
CONFIG_NETFILTER_BPF_LINK=y
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NET_CLS_BPF=m
CONFIG_NET_ACT_BPF=m
CONFIG_BPF_STREAM_PARSER=y
CONFIG_LWTUNNEL_BPF=y
# HID-BPF support
CONFIG_HID_BPF=y
# end of HID-BPF support
CONFIG_BPF_EVENTS=y
CONFIG_BPF_KPROBE_OVERRIDE=y
CONFIG_TEST_BPF=m

執行環境

uname -a
Linux brianpan-Aspire-A14-52MT 6.11.0-17-generic #17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2 x86_64 x86_64 x86_64 GNU/Linux

gcc --version
gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
Copyright (C) 2023 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

sudo lscpu
Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          42 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   8
  On-line CPU(s) list:    0-7
Vendor ID:                GenuineIntel
  BIOS Vendor ID:         Intel(R) Corporation
  Model name:             Intel(R) Core(TM) Ultra 5 226V
    BIOS Model name:      Intel(R) Core(TM) Ultra 5 226V To Be Filled By O.E.M. 
                          CPU @ 0.4GHz
    BIOS CPU family:      773
    CPU family:           6
    Model:                189
    Thread(s) per core:   1
    Core(s) per socket:   8
    Socket(s):            1
    Stepping:             1
    CPU(s) scaling MHz:   40%
    CPU max MHz:          4500.0000
    CPU min MHz:          400.0000
    BogoMIPS:             6220.80
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 s
                          s ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc 
                          art arch_perfmon pebs bts rep_good nopl xtopology nons
                          top_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq 
                          dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma c
                          x16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt t
                          sc_deadline_timer aes xsave avx f16c rdrand lahf_lm ab
                          m 3dnowprefetch cpuid_fault epb intel_ppin ssbd ibrs i
                          bpb stibp ibrs_enhanced tpr_shadow flexpriority ept vp
                          id ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms
                           invpcid rdt_a rdseed adx smap clflushopt clwb intel_p
                          t sha_ni xsaveopt xsavec xgetbv1 xsaves split_lock_det
                          ect user_shstk avx_vnni lam wbnoinvd dtherm ida arat p
                          ln pts hwp hwp_notify hwp_act_window hwp_epp hwp_pkg_r
                          eq hfi vnmi umip pku ospke waitpkg gfni vaes vpclmulqd
                          q rdpid bus_lock_detect movdiri movdir64b fsrm md_clea
                          r serialize pconfig arch_lbr ibt flush_l1d arch_capabi
                          lities
Virtualization features:  
  Virtualization:         VT-x
Caches (sum of all):      
  L1d:                    320 KiB (8 instances)
  L1i:                    512 KiB (8 instances)
  L2:                     14 MiB (5 instances)
  L3:                     8 MiB (1 instance)
NUMA:                     
  NUMA node(s):           1
  NUMA node0 CPU(s):      0-7
Vulnerabilities:          
  Gather data sampling:   Not affected
  Itlb multihit:          Not affected
  L1tf:                   Not affected
  Mds:                    Not affected
  Meltdown:               Not affected
  Mmio stale data:        Not affected
  Reg file data sampling: Not affected
  Retbleed:               Not affected
  Spec rstack overflow:   Not affected
  Spec store bypass:      Mitigation; Speculative Store Bypass disabled via prct
                          l
  Spectre v1:             Mitigation; usercopy/swapgs barriers and __user pointe
                          r sanitization
  Spectre v2:             Mitigation; Enhanced / Automatic IBRS; IBPB conditiona
                          l; RSB filling; PBRSB-eIBRS Not affected; BHI Not affe
                          cted
  Srbds:                  Not affected
  Tsx async abort:        Not affected

套件管理

sudo apt install -y linux-headers-$(uname -r) bpfcc-tools python3-bpfcc libbpfcc libbpfcc-dev clang

上學期專案

Project Link

Ideas

  • By sockops type program BPF_PROG_TYPE_SOCK_OPS, program will be called during lifetime of the socket: REF
  • By stream_verdict type program BPF_SK_SKB_VERDICT, our program skips the kernel network stack, REF

stream_verdict

解釋: https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_SK_SKB/?utm_source=chatgpt.com#as-bpf_sk_skb_stream_verdict-program
stream parser 流程過後會呼叫stream_verdict 來過濾封包

沒辦法從bpftrace找到sk_skb/stream_verdict

Can't find the entry point of sk_skb/stream_verdict in the list of available probes printed by bpftrace

sudo bpftrace -l

Linux source code Readme

compilation error fix

/usr/include/linux/types.h:5:10: fatal error: 'asm/types.h' file not found
    5 | #include <asm/types.h>
      |          ^~~~~~~~~~~~~
1 error generated.
sudo apt-get install -y gcc-multilib

載入程式

# load sockmap_ops to file system
sudo bpftool prog load bpf_sockops.o /sys/fs/bpf/bpf_sockops
sudo bpftool prog show pinned /sys/fs/bpf/bpf_sockops
127: sock_ops  name bpf_sockmap  tag d1bb5f447965262d  gpl
	loaded_at 2025-05-18T22:31:14-0500  uid 0
	xlated 376B  jited 216B  memlock 4096B  map_ids 5,7
	btf_id 204

mount a bpf fs on bpffs folder

mkdir bpffs
sudo mount -t bpf none bpffs

pin map to bpf filesystem

sudo bpftool map show name sockmap_ops
5: sockhash  name sockmap_ops  flags 0x0
	key 16B  value 4B  max_entries 65535  memlock 1048912B
sudo bpftool map pin name sockmap_ops bpffs/sockmap_ops

# dump the map by name
sudo bpftool map dump name sockmap_ops

attach bpf_sockops to cgroup

sudo bpftool cgroup attach /sys/fs/cgroup/ sock_ops pinned /sys/fs/bpf/bpf_sockops

load bpf_redir.o program with pinned map named sockmap_ops

sudo bpftool prog load bpf_redir.o /sys/fs/bpf/bpf_redir map name sockmap_ops pinned bpffs/sockmap_ops

check if map and bpf program are loaded

sudo bpftool prog list
127: sock_ops  name bpf_sockmap  tag d1bb5f447965262d  gpl
	loaded_at 2025-05-18T22:31:14-0500  uid 0
	xlated 376B  jited 216B  memlock 4096B  map_ids 5,7
	btf_id 204
136: sk_skb  name bpf_redir  tag 8aae03b571c7bc42  gpl
	loaded_at 2025-05-18T23:20:44-0500  uid 0
	xlated 536B  jited 307B  memlock 4096B  map_ids 5,11
	btf_id 215

attach the program to stream_verdict

sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir stream_verdict pinned bpffs/sockmap_ops

run the program

./ebpf-echo-server

ss -l | grep 12345
tcp   LISTEN 0      1024                                           0.0.0.0:12345                     0.0.0.0:*    

test echo

telnet 192.168.1.139 12345
Trying 192.168.1.139...
Connected to 192.168.1.139.
Escape character is '^]'.
xxx
xxx

output of bpf_sockops.o

List program sections
readelf -S  bpf_sockops.o
There are 28 section headers, starting at offset 0x2b68:

Section Headers:
  [Nr] Name              Type             Address           Offset
       Size              EntSize          Flags  Link  Info  Align
  [ 0]                   NULL             0000000000000000  00000000
       0000000000000000  0000000000000000           0     0     0
  [ 1] .strtab           STRTAB           0000000000000000  00002a05
       000000000000015c  0000000000000000           0     0     1
  [ 2] .text             PROGBITS         0000000000000000  00000040
       0000000000000000  0000000000000000  AX       0     0     4
  [ 3] sockops           PROGBITS         0000000000000000  00000040
       0000000000000148  0000000000000000  AX       0     0     8
  [ 4] .relsockops       REL              0000000000000000  00002010
       0000000000000030  0000000000000010   I      27     3     8
  [ 5] license           PROGBITS         0000000000000000  00000188
       0000000000000004  0000000000000000   A       0     0     1
  [ 6] .maps             PROGBITS         0000000000000000  00000190
       0000000000000028  0000000000000000  WA       0     0     8
  [ 7] .rodata           PROGBITS         0000000000000000  000001b8
       000000000000002c  0000000000000000   A       0     0     1
  [ 8] .debug_loclists   PROGBITS         0000000000000000  000001e4
       000000000000008d  0000000000000000           0     0     1
  [ 9] .debug_abbrev     PROGBITS         0000000000000000  00000271
       00000000000001c6  0000000000000000           0     0     1
  [10] .debug_info       PROGBITS         0000000000000000  00000437
       00000000000004fa  0000000000000000           0     0     1
  [11] .rel.debug_info   REL              0000000000000000  00002040
       0000000000000050  0000000000000010   I      27    10     8
  [12] .debug_str_o[...] PROGBITS         0000000000000000  00000931
       00000000000001b4  0000000000000000           0     0     1
  [13] .rel.debug_s[...] REL              0000000000000000  00002090
       00000000000006b0  0000000000000010   I      27    12     8
  [14] .debug_str        PROGBITS         0000000000000000  00000ae5
       0000000000000574  0000000000000001  MS       0     0     1
  [15] .debug_addr       PROGBITS         0000000000000000  00001059
       0000000000000038  0000000000000000           0     0     1
  [16] .rel.debug_addr   REL              0000000000000000  00002740
       0000000000000060  0000000000000010   I      27    15     8
  [17] .BTF              PROGBITS         0000000000000000  00001094
       0000000000000a53  0000000000000000           0     0     4
  [18] .rel.BTF          REL              0000000000000000  000027a0
       0000000000000040  0000000000000010   I      27    17     8
  [19] .BTF.ext          PROGBITS         0000000000000000  00001ae8
       0000000000000170  0000000000000000           0     0     4
  [20] .rel.BTF.ext      REL              0000000000000000  000027e0
       0000000000000140  0000000000000010   I      27    19     8
  [21] .debug_frame      PROGBITS         0000000000000000  00001c58
       0000000000000028  0000000000000000           0     0     8
  [22] .rel.debug_frame  REL              0000000000000000  00002920
       0000000000000020  0000000000000010   I      27    21     8
  [23] .debug_line       PROGBITS         0000000000000000  00001c80
       000000000000011a  0000000000000000           0     0     1
  [24] .rel.debug_line   REL              0000000000000000  00002940
       00000000000000c0  0000000000000010   I      27    23     8
  [25] .debug_line_str   PROGBITS         0000000000000000  00001d9a
       00000000000000ac  0000000000000001  MS       0     0     1
  [26] .llvm_addrsig     LOOS+0xfff4c03   0000000000000000  00002a00
       0000000000000005  0000000000000000   E      27     0     1
  [27] .symtab           SYMTAB           0000000000000000  00001e48
       00000000000001c8  0000000000000018           1    16     8

list symbol table & hex of .maps section

readelf -s bpf_sockops.o

Symbol table '.symtab' contains 19 entries:
   Num:    Value          Size Type    Bind   Vis      Ndx Name
     0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND 
     1: 0000000000000000     0 FILE    LOCAL  DEFAULT  ABS bpf_sockops.c
     2: 0000000000000000     0 SECTION LOCAL  DEFAULT    3 sockops
     3: 0000000000000138     0 NOTYPE  LOCAL  DEFAULT    3 LBB0_5
     4: 0000000000000118     0 NOTYPE  LOCAL  DEFAULT    3 LBB0_4
     5: 0000000000000000    23 OBJECT  LOCAL  DEFAULT    7 update_sockmap_o[...]
     6: 0000000000000017    21 OBJECT  LOCAL  DEFAULT    7 update_sockmap_o[...]
     7: 0000000000000000     0 SECTION LOCAL  DEFAULT    7 .rodata
     8: 0000000000000000     0 SECTION LOCAL  DEFAULT    8 .debug_loclists
     9: 0000000000000000     0 SECTION LOCAL  DEFAULT    9 .debug_abbrev
    10: 0000000000000000     0 SECTION LOCAL  DEFAULT   12 .debug_str_offsets
    11: 0000000000000000     0 SECTION LOCAL  DEFAULT   14 .debug_str
    12: 0000000000000000     0 SECTION LOCAL  DEFAULT   15 .debug_addr
    13: 0000000000000000     0 SECTION LOCAL  DEFAULT   21 .debug_frame
    14: 0000000000000000     0 SECTION LOCAL  DEFAULT   23 .debug_line
    15: 0000000000000000     0 SECTION LOCAL  DEFAULT   25 .debug_line_str
    16: 0000000000000000   328 FUNC    GLOBAL DEFAULT    3 bpf_sockmap
    17: 0000000000000000    40 OBJECT  GLOBAL DEFAULT    6 sockmap_ops
    18: 0000000000000000     4 OBJECT  GLOBAL DEFAULT    5 __license


readelf -x .maps  bpf_sockops.o

Hex dump of section '.maps':
  0x00000000 00000000 00000000 00000000 00000000 ................
  0x00000010 00000000 00000000 00000000 00000000 ................
  0x00000020 00000000 00000000                   ........

bpf_redir.o

can't objdump bpf_redir.o

objdump -d bpf_redir.o

bpf_redir.o:     file format elf64-little

objdump: can't disassemble for architecture UNKNOWN!

use llvm-objdump to retrieve the content

llvm-objdump-18 -d --section=sk_skb/stream_verdict  bpf_redir.o

bpf_redir.o:	file format elf64-bpf

Disassembly of section sk_skb/stream_verdict:

0000000000000000 <bpf_redir>:
       0:	b7 00 00 00 01 00 00 00	r0 = 0x1
       1:	61 12 58 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x58)
       2:	55 02 37 00 02 00 00 00	if r2 != 0x2 goto +0x37 <LBB0_9>
       3:	61 12 88 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x88)
       4:	55 02 35 00 39 30 00 00	if r2 != 0x3039 goto +0x35 <LBB0_9>
       5:	61 12 00 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x0)
       6:	15 02 33 00 00 00 00 00	if r2 == 0x0 goto +0x33 <LBB0_9>
       7:	61 13 60 00 00 00 00 00	r3 = *(u32 *)(r1 + 0x60)
       8:	61 12 5c 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x5c)
       9:	5d 32 0f 00 00 00 00 00	if r2 != r3 goto +0xf <LBB0_5>
      10:	b7 03 00 00 39 30 00 00	r3 = 0x3039
      11:	6b 3a fc ff 00 00 00 00	*(u16 *)(r10 - 0x4) = r3
      12:	63 2a f8 ff 00 00 00 00	*(u32 *)(r10 - 0x8) = r2
      13:	63 2a f4 ff 00 00 00 00	*(u32 *)(r10 - 0xc) = r2
      14:	b7 02 00 00 02 00 00 00	r2 = 0x2
      15:	63 2a f0 ff 00 00 00 00	*(u32 *)(r10 - 0x10) = r2
      16:	61 12 84 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x84)
      17:	dc 02 00 00 20 00 00 00	r2 = be32 r2
      18:	6b 2a fe ff 00 00 00 00	*(u16 *)(r10 - 0x2) = r2
      19:	bf a3 00 00 00 00 00 00	r3 = r10
      20:	07 03 00 00 f0 ff ff ff	r3 += -0x10
      21:	18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00	r2 = 0x0 ll
      23:	b7 04 00 00 01 00 00 00	r4 = 0x1
      24:	05 00 0e 00 00 00 00 00	goto +0xe <LBB0_6>

00000000000000c8 <LBB0_5>:
      25:	63 3a f8 ff 00 00 00 00	*(u32 *)(r10 - 0x8) = r3
      26:	63 2a f4 ff 00 00 00 00	*(u32 *)(r10 - 0xc) = r2
      27:	b7 02 00 00 02 00 00 00	r2 = 0x2
      28:	63 2a f0 ff 00 00 00 00	*(u32 *)(r10 - 0x10) = r2
      29:	61 12 84 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x84)
      30:	b7 03 00 00 39 30 00 00	r3 = 0x3039
      31:	6b 3a fe ff 00 00 00 00	*(u16 *)(r10 - 0x2) = r3
      32:	dc 02 00 00 20 00 00 00	r2 = be32 r2
      33:	6b 2a fc ff 00 00 00 00	*(u16 *)(r10 - 0x4) = r2
      34:	bf a3 00 00 00 00 00 00	r3 = r10
      35:	07 03 00 00 f0 ff ff ff	r3 += -0x10
      36:	18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00	r2 = 0x0 ll
      38:	b7 04 00 00 00 00 00 00	r4 = 0x0

0000000000000138 <LBB0_6>:
      39:	85 00 00 00 48 00 00 00	call 0x48
      40:	bf 01 00 00 00 00 00 00	r1 = r0
      41:	67 01 00 00 20 00 00 00	r1 <<= 0x20
      42:	77 01 00 00 20 00 00 00	r1 >>= 0x20
      43:	15 01 09 00 01 00 00 00	if r1 == 0x1 goto +0x9 <LBB0_8>
      44:	bf 03 00 00 00 00 00 00	r3 = r0
      45:	87 03 00 00 00 00 00 00	r3 = -r3
      46:	18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00	r1 = 0x0 ll
      48:	b7 02 00 00 2a 00 00 00	r2 = 0x2a
      49:	bf 06 00 00 00 00 00 00	r6 = r0
      50:	85 00 00 00 06 00 00 00	call 0x6
      51:	bf 60 00 00 00 00 00 00	r0 = r6
      52:	05 00 05 00 00 00 00 00	goto +0x5 <LBB0_9>

00000000000001a8 <LBB0_8>:
      53:	18 01 00 00 2a 00 00 00 00 00 00 00 00 00 00 00	r1 = 0x2a ll
      55:	b7 02 00 00 2c 00 00 00	r2 = 0x2c
      56:	85 00 00 00 06 00 00 00	call 0x6
      57:	b7 00 00 00 01 00 00 00	r0 = 0x1

00000000000001d0 <LBB0_9>:
      58:	95 00 00 00 00 00 00 00	exit

eBPF 暫存器使用

  • r0: 回傳值
  • r1-r5: hold the arguments passed to function (helper) calls; on program entry r1 holds the context pointer
    assembly
       1:	61 12 58 00 00 00 00 00	r2 = *(u32 *)(r1 + 0x58)

c code

skb->family
  • r6-r9: callee saved registers that will be preserved on helper function call
    Callee-saved registers (AKA non-volatile registers, or call-preserved) are used to hold long-lived values that should be preserved across calls.

  • r10: Read-only with frame pointer address

Experiments

Check bpf_printk messages

sudo cat /sys/kernel/debug/tracing/trace_pipe

Running bench

Timeout from modified bench

brianpan@brianpan-Aspire-A14-52MT:~/kernel/ebpf-tcp-server$ ./bench 
Generating String...
Connecting...
Getting the socket name...
Send & Recv...
Finish Sending.
recv timeout occurred

Cat debugging pipe

cat /sys/kernel/debug/tracing/trace_pipe
irq/181-iwlwifi-564     [004] ..s31 11110.660072: bpf_trace_printk: Update map success.

           bench-16639   [000] ...11 11158.604765: bpf_trace_printk: Update map success.

           bench-16639   [000] ..s31 11158.604794: bpf_trace_printk: Update map success.

           bench-16639   [000] ..s31 11159.621702: bpf_trace_printk: bpf_sk_redirect_hash() failed 0, error 

Debugging

Tried to look up the key with bpf_map_lookup_elem, but the BPF verifier rejects converting the returned address to uintptr_t

/*
 * Debug probe: look skm_key up in sockmap_ops and log when an entry exists.
 *
 * NOTE(review): the uintptr_t cast is the conversion this note reports the
 * verifier rejecting. The kernel sockmap documentation says a socket returned
 * by bpf_map_lookup_elem() must be released with bpf_sk_release(), so keeping
 * the result as a pointer and releasing it is presumably the accepted form;
 * confirm before reusing this snippet.
 */
uintptr_t r = (uintptr_t) bpf_map_lookup_elem(&sockmap_ops, &skm_key);
if (r) {
    /* A non-zero result means the lookup returned an entry. */
    bpf_printk("key found");
}

Failure tcpdump

22:51:03.777244 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [S], seq 3130719987, win 65495, options [mss 65495,sackOK,TS val 3153089712 ecr 0,nop,wscale 7], length 0
22:51:03.777275 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39950: Flags [S.], seq 4067401096, ack 3130719988, win 65483, options [mss 65495,sackOK,TS val 3153089712 ecr 3153089712,nop,wscale 7], length 0
22:51:03.777314 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [.], ack 1, win 512, options [nop,nop,TS val 3153089712 ecr 3153089712], length 0
22:51:03.777380 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [P.], seq 1:50, ack 1, win 512, options [nop,nop,TS val 3153089712 ecr 3153089712], length 49
22:51:03.777387 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39950: Flags [.], ack 50, win 512, options [nop,nop,TS val 3153089712 ecr 3153089712], length 0
22:51:04.824177 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [F.], seq 50, ack 1, win 512, options [nop,nop,TS val 3153090759 ecr 3153089712], length 0
22:51:04.864823 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39950: Flags [.], ack 51, win 512, options [nop,nop,TS val 3153090800 ecr 3153090759], length 0

Success call

22:54:11.986839 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [S], seq 4060196293, win 65495, options [mss 65495,sackOK,TS val 3153277922 ecr 0,nop,wscale 7], length 0
22:54:11.986882 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39392: Flags [S.], seq 1682861897, ack 4060196294, win 65483, options [mss 65495,sackOK,TS val 3153277922 ecr 3153277922,nop,wscale 7], length 0
22:54:11.986949 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [.], ack 1, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 0
22:54:11.987079 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [P.], seq 1:50, ack 1, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 49
22:54:11.987115 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39392: Flags [.], ack 50, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 0
22:54:11.987190 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [F.], seq 50, ack 1, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 0
22:54:12.027846 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39392: Flags [.], ack 51, win 512, options [nop,nop,TS val 3153277963 ecr 3153277922], length 0

Trace source

/*
 * Find the socket stored under @key in the sock hash backing @map.
 *
 * Hashes the key, selects the matching bucket and searches that bucket's
 * list. Warns once if rcu_read_lock_held() reports no held lock.
 *
 * Returns the element's socket, or NULL when no element matches.
 */
static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	struct bpf_shtab_bucket *bucket;
	struct bpf_shtab_elem *found;
	u32 klen = map->key_size;
	u32 hash;

	WARN_ON_ONCE(!rcu_read_lock_held());

	hash = sock_hash_bucket_hash(key, klen);
	bucket = sock_hash_select_bucket(htab, hash);
	found = sock_hash_lookup_elem_raw(&bucket->head, hash, key, klen);
	if (!found)
		return NULL;

	return found->sk;
}

/*
 * bpf_sock_hash_update helper: store the socket of the current sock_ops
 * context (@sops->sk) in the sock hash @map under @key.
 *
 * Warns once if rcu_read_lock_held() reports no held lock. The update is
 * only attempted when both sock_map_sk_is_suitable() and sock_map_op_okay()
 * accept the context; otherwise -EOPNOTSUPP is returned. On the accepted
 * path the result of sock_hash_update_common() is returned.
 */
BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Guard clause: reject the uncommon unsupported case first. */
	if (unlikely(!sock_map_sk_is_suitable(sops->sk) ||
		     !sock_map_op_okay(sops)))
		return -EOPNOTSUPP;

	return sock_hash_update_common(map, key, sops->sk, flags);
}

REF

/*
 * Insert or replace the element for @key in the sock hash @map so that it
 * refers to @sk.
 *
 * @flags selects the update mode: BPF_NOEXIST fails if the key is already
 * present, BPF_EXIST fails if it is absent; any value above BPF_EXIST is
 * rejected.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL  flags is larger than BPF_EXIST
 *   -ENOMEM  the psock link could not be allocated
 *   -EEXIST  key present and flags == BPF_NOEXIST
 *   -ENOENT  key absent and flags == BPF_EXIST
 *   or the error reported by sock_map_link() / sock_hash_alloc_elem().
 */
static int sock_hash_update_common(struct bpf_map *map, void *key,
				   struct sock *sk, u64 flags)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	u32 key_size = map->key_size, hash;
	struct bpf_shtab_elem *elem, *elem_new;
	struct bpf_shtab_bucket *bucket;
	struct sk_psock_link *link;
	struct sk_psock *psock;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	/* BPF_EXIST is the largest flag value accepted here. */
	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	/* Allocate the link up front, before any lock is taken. */
	link = sk_psock_init_link();
	if (!link)
		return -ENOMEM;

	ret = sock_map_link(map, sk);
	if (ret < 0)
		goto out_free;

	/* Presumably set up by sock_map_link(); a NULL psock only warns. */
	psock = sk_psock(sk);
	WARN_ON_ONCE(!psock);

	hash = sock_hash_bucket_hash(key, key_size);
	bucket = sock_hash_select_bucket(htab, hash);

	/* Lookup, allocation and list update all run under the bucket lock. */
	spin_lock_bh(&bucket->lock);
	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
	if (elem && flags == BPF_NOEXIST) {
		ret = -EEXIST;
		goto out_unlock;
	} else if (!elem && flags == BPF_EXIST) {
		ret = -ENOENT;
		goto out_unlock;
	}

	/* The old element (possibly NULL) is passed along to the allocator. */
	elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
	if (IS_ERR(elem_new)) {
		ret = PTR_ERR(elem_new);
		goto out_unlock;
	}

	sock_map_add_link(psock, link, map, elem_new);
	/* Add new element to the head of the list, so that
	 * concurrent search will find it before old elem.
	 */
	hlist_add_head_rcu(&elem_new->node, &bucket->head);
	if (elem) {
		/* Replacing: unlink the old element, then unref and free it. */
		hlist_del_rcu(&elem->node);
		sock_map_unref(elem->sk, elem);
		sock_hash_free_elem(htab, elem);
	}
	spin_unlock_bh(&bucket->lock);
	return 0;
out_unlock:
	/* Failure after sock_map_link() succeeded: unlock and put the psock. */
	spin_unlock_bh(&bucket->lock);
	sk_psock_put(sk, psock);
out_free:
	/* The link was never added to the psock, so free it here. */
	sk_psock_free_link(link);
	return ret;
}

Hypothesis: if sock_hash_lookup_elem and bpf_sock_hash_update run on different CPUs, the RCU reader may still see the old data and fail to find the element

Add usleep() between the connect() and send() calls in bench.c

Tested 100 times with no further errors

// Pause 0.1 s (100000 us) between connect() and the first send().
// NOTE(review): added as a workaround for the intermittent recv timeout;
// the underlying cause is only a hypothesis so far.
usleep(100000);
printf("Send & Recv...\n");

Benchmark

Setup

sudo sysctl net.core.somaxconn=4096
sudo sysctl net.ipv4.tcp_max_syn_backlog=4096
ulimit -n 32768

Running benchmark between ebpf, kecho, user

https://github.com/Brianpan/ebpf-tcp-server/blob/main/benchmark/echo_bench.png

kecho performs better than the eBPF echo server
Hypothesis: this may be caused by threads contending for the eBPF map