Learn More →
BPF_MAP_TYPE_PERF_EVENT_ARRAY
long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64
flags, void *data, u64 size)
主要目標是解決不同核心版本下使用編譯過的bpf程式
透過BPF Type Format (BTF)定位現在核心版本下資料結構的offset
可以想像成提供一個虛擬索引去轉換不同核心版本下的實際位置
BPF_CALL2_5
定義BPF可以呼叫的函式
核心程式碼位置
例子:bpf_map_update_elem實作
/*
 * Helper defined through BPF_CALL_4 so that BPF programs can call it.
 * Takes a map, a key, a value and flags, forwards them to the map's own
 * map_update_elem operation, and returns whatever that operation returns.
 */
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
/* Warn once if none of the three RCU read-side lock checks reports a held lock. */
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
/* Dispatch to the map-type-specific implementation through map->ops. */
return map->ops->map_update_elem(map, key, value, flags);
}
Map 類型: 參見
透過bpf syscall 存取
int map_fd;
union bpf_attr attr = {
.map_type = BPF_MAP_TYPE_ARRAY;
.key_size = sizeof(__u32);
.value_size = sizeof(__u32);
.max_entries = 256;
};
map_fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
靜態產生
/*
 * Declare a map definition named _name and place it in the ".maps" section.
 * The map type, the sizes of the key and value types, and the maximum number
 * of entries are recorded through __uint() members.
 * The expansion already ends with ';'.
 */
#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \
    struct { \
        __uint(type, _type); \
        __uint(key_size, sizeof(_key_type)); \
        __uint(value_size, sizeof(_value_type)); \
        __uint(max_entries, _max_entries); \
    } _name SEC(".maps");
/*
 * Declare a BPF_MAP_TYPE_PERF_EVENT_ARRAY map named _name with int keys,
 * int values and 2048 entries.
 * No trailing ';' here: BPF_MAP() already supplies one, and a second one
 * would leave a stray empty declaration at file scope.
 */
#define BPF_PERF_OUTPUT(_name) \
    BPF_MAP(_name, BPF_MAP_TYPE_PERF_EVENT_ARRAY, int, int, 2048)
cat /boot/config-6.11.0-17-generic | grep -i bpf
CONFIG_BPF=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y
# BPF subsystem
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_BPF_UNPRIV_DEFAULT_OFF=y
# CONFIG_BPF_PRELOAD is not set
CONFIG_BPF_LSM=y
# end of BPF subsystem
CONFIG_CGROUP_BPF=y
CONFIG_IPV6_SEG6_BPF=y
CONFIG_NETFILTER_BPF_LINK=y
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NET_CLS_BPF=m
CONFIG_NET_ACT_BPF=m
CONFIG_BPF_STREAM_PARSER=y
CONFIG_LWTUNNEL_BPF=y
# HID-BPF support
CONFIG_HID_BPF=y
# end of HID-BPF support
CONFIG_BPF_EVENTS=y
CONFIG_BPF_KPROBE_OVERRIDE=y
CONFIG_TEST_BPF=m
uname -a
Linux brianpan-Aspire-A14-52MT 6.11.0-17-generic #17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2 x86_64 x86_64 x86_64 GNU/Linux
gcc --version
gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
Copyright (C) 2023 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
sudo lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Address sizes: 42 bits physical, 48 bits virtual
Byte Order: Little Endian
CPU(s): 8
On-line CPU(s) list: 0-7
Vendor ID: GenuineIntel
BIOS Vendor ID: Intel(R) Corporation
Model name: Intel(R) Core(TM) Ultra 5 226V
BIOS Model name: Intel(R) Core(TM) Ultra 5 226V To Be Filled By O.E.M.
CPU @ 0.4GHz
BIOS CPU family: 773
CPU family: 6
Model: 189
Thread(s) per core: 1
Core(s) per socket: 8
Socket(s): 1
Stepping: 1
CPU(s) scaling MHz: 40%
CPU max MHz: 4500.0000
CPU min MHz: 400.0000
BogoMIPS: 6220.80
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
ca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 s
s ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
art arch_perfmon pebs bts rep_good nopl xtopology nons
top_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq
dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma c
x16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt t
sc_deadline_timer aes xsave avx f16c rdrand lahf_lm ab
m 3dnowprefetch cpuid_fault epb intel_ppin ssbd ibrs i
bpb stibp ibrs_enhanced tpr_shadow flexpriority ept vp
id ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms
invpcid rdt_a rdseed adx smap clflushopt clwb intel_p
t sha_ni xsaveopt xsavec xgetbv1 xsaves split_lock_det
ect user_shstk avx_vnni lam wbnoinvd dtherm ida arat p
ln pts hwp hwp_notify hwp_act_window hwp_epp hwp_pkg_r
eq hfi vnmi umip pku ospke waitpkg gfni vaes vpclmulqd
q rdpid bus_lock_detect movdiri movdir64b fsrm md_clea
r serialize pconfig arch_lbr ibt flush_l1d arch_capabi
lities
Virtualization features:
Virtualization: VT-x
Caches (sum of all):
L1d: 320 KiB (8 instances)
L1i: 512 KiB (8 instances)
L2: 14 MiB (5 instances)
L3: 8 MiB (1 instance)
NUMA:
NUMA node(s): 1
NUMA node0 CPU(s): 0-7
Vulnerabilities:
Gather data sampling: Not affected
Itlb multihit: Not affected
L1tf: Not affected
Mds: Not affected
Meltdown: Not affected
Mmio stale data: Not affected
Reg file data sampling: Not affected
Retbleed: Not affected
Spec rstack overflow: Not affected
Spec store bypass: Mitigation; Speculative Store Bypass disabled via prct
l
Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointe
r sanitization
Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditiona
l; RSB filling; PBRSB-eIBRS Not affected; BHI Not affe
cted
Srbds: Not affected
Tsx async abort: Not affected
sudo apt install -y linux-headers-$(uname -r) bpfcc-tools python3-bpfcc libbpfcc libbpfcc-dev clang
BPF_PROG_TYPE_SOCK_OPS: the program will be called during the lifetime of the socket (REF)
BPF_SK_SKB_VERDICT: our program skips the kernel network stack (REF)
解釋: https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_SK_SKB/?utm_source=chatgpt.com#as-bpf_sk_skb_stream_verdict-program
stream parser 流程過後會呼叫stream_verdict 來過濾封包
Couldn't find the sk_skb/stream_verdict entry point
when listing the available eBPF probes
sudo bpftrace -l
/usr/include/linux/types.h:5:10: fatal error: 'asm/types.h' file not found
5 | #include <asm/types.h>
| ^~~~~~~~~~~~~
1 error generated.
sudo apt-get install -y gcc-multilib
# load sockmap_ops to file system
sudo bpftool prog load bpf_sockops.o /sys/fs/bpf/bpf_sockops
sudo bpftool prog show pinned /sys/fs/bpf/bpf_sockops
127: sock_ops name bpf_sockmap tag d1bb5f447965262d gpl
loaded_at 2025-05-18T22:31:14-0500 uid 0
xlated 376B jited 216B memlock 4096B map_ids 5,7
btf_id 204
mount a bpf fs on bpffs folder
mkdir bpffs
sudo mount -t bpf none bpffs
pin the map to the bpf filesystem
sudo bpftool map show name sockmap_ops
5: sockhash name sockmap_ops flags 0x0
key 16B value 4B max_entries 65535 memlock 1048912B
sudo bpftool map pin name sockmap_ops bpffs/sockmap_ops
# dump the map by name
sudo bpftool map dump name sockmap_ops
attach bpf_sockops to cgroup
sudo bpftool cgroup attach /sys/fs/cgroup/ sock_ops pinned /sys/fs/bpf/bpf_sockops
load bpf_redir.o program with pinned map named sockmap_ops
sudo bpftool prog load bpf_redir.o /sys/fs/bpf/bpf_redir map name sockmap_ops pinned bpffs/sockmap_ops
check if map and bpf program are loaded
sudo bpftool prog list
127: sock_ops name bpf_sockmap tag d1bb5f447965262d gpl
loaded_at 2025-05-18T22:31:14-0500 uid 0
xlated 376B jited 216B memlock 4096B map_ids 5,7
btf_id 204
136: sk_skb name bpf_redir tag 8aae03b571c7bc42 gpl
loaded_at 2025-05-18T23:20:44-0500 uid 0
xlated 536B jited 307B memlock 4096B map_ids 5,11
btf_id 215
attach the program to stream_verdict
sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir stream_verdict pinned bpffs/sockmap_ops
run the program
./ebpf-echo-server
ss -l | grep 12345
tcp LISTEN 0 1024 0.0.0.0:12345 0.0.0.0:*
test echo
telnet 192.168.1.139 12345
Trying 192.168.1.139...
Connected to 192.168.1.139.
Escape character is '^]'.
xxx
xxx
readelf -S bpf_sockops.o
There are 28 section headers, starting at offset 0x2b68:
Section Headers:
[Nr] Name Type Address Offset
Size EntSize Flags Link Info Align
[ 0] NULL 0000000000000000 00000000
0000000000000000 0000000000000000 0 0 0
[ 1] .strtab STRTAB 0000000000000000 00002a05
000000000000015c 0000000000000000 0 0 1
[ 2] .text PROGBITS 0000000000000000 00000040
0000000000000000 0000000000000000 AX 0 0 4
[ 3] sockops PROGBITS 0000000000000000 00000040
0000000000000148 0000000000000000 AX 0 0 8
[ 4] .relsockops REL 0000000000000000 00002010
0000000000000030 0000000000000010 I 27 3 8
[ 5] license PROGBITS 0000000000000000 00000188
0000000000000004 0000000000000000 A 0 0 1
[ 6] .maps PROGBITS 0000000000000000 00000190
0000000000000028 0000000000000000 WA 0 0 8
[ 7] .rodata PROGBITS 0000000000000000 000001b8
000000000000002c 0000000000000000 A 0 0 1
[ 8] .debug_loclists PROGBITS 0000000000000000 000001e4
000000000000008d 0000000000000000 0 0 1
[ 9] .debug_abbrev PROGBITS 0000000000000000 00000271
00000000000001c6 0000000000000000 0 0 1
[10] .debug_info PROGBITS 0000000000000000 00000437
00000000000004fa 0000000000000000 0 0 1
[11] .rel.debug_info REL 0000000000000000 00002040
0000000000000050 0000000000000010 I 27 10 8
[12] .debug_str_o[...] PROGBITS 0000000000000000 00000931
00000000000001b4 0000000000000000 0 0 1
[13] .rel.debug_s[...] REL 0000000000000000 00002090
00000000000006b0 0000000000000010 I 27 12 8
[14] .debug_str PROGBITS 0000000000000000 00000ae5
0000000000000574 0000000000000001 MS 0 0 1
[15] .debug_addr PROGBITS 0000000000000000 00001059
0000000000000038 0000000000000000 0 0 1
[16] .rel.debug_addr REL 0000000000000000 00002740
0000000000000060 0000000000000010 I 27 15 8
[17] .BTF PROGBITS 0000000000000000 00001094
0000000000000a53 0000000000000000 0 0 4
[18] .rel.BTF REL 0000000000000000 000027a0
0000000000000040 0000000000000010 I 27 17 8
[19] .BTF.ext PROGBITS 0000000000000000 00001ae8
0000000000000170 0000000000000000 0 0 4
[20] .rel.BTF.ext REL 0000000000000000 000027e0
0000000000000140 0000000000000010 I 27 19 8
[21] .debug_frame PROGBITS 0000000000000000 00001c58
0000000000000028 0000000000000000 0 0 8
[22] .rel.debug_frame REL 0000000000000000 00002920
0000000000000020 0000000000000010 I 27 21 8
[23] .debug_line PROGBITS 0000000000000000 00001c80
000000000000011a 0000000000000000 0 0 1
[24] .rel.debug_line REL 0000000000000000 00002940
00000000000000c0 0000000000000010 I 27 23 8
[25] .debug_line_str PROGBITS 0000000000000000 00001d9a
00000000000000ac 0000000000000001 MS 0 0 1
[26] .llvm_addrsig LOOS+0xfff4c03 0000000000000000 00002a00
0000000000000005 0000000000000000 E 27 0 1
[27] .symtab SYMTAB 0000000000000000 00001e48
00000000000001c8 0000000000000018 1 16 8
readelf -s bpf_sockops.o
Symbol table '.symtab' contains 19 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
1: 0000000000000000 0 FILE LOCAL DEFAULT ABS bpf_sockops.c
2: 0000000000000000 0 SECTION LOCAL DEFAULT 3 sockops
3: 0000000000000138 0 NOTYPE LOCAL DEFAULT 3 LBB0_5
4: 0000000000000118 0 NOTYPE LOCAL DEFAULT 3 LBB0_4
5: 0000000000000000 23 OBJECT LOCAL DEFAULT 7 update_sockmap_o[...]
6: 0000000000000017 21 OBJECT LOCAL DEFAULT 7 update_sockmap_o[...]
7: 0000000000000000 0 SECTION LOCAL DEFAULT 7 .rodata
8: 0000000000000000 0 SECTION LOCAL DEFAULT 8 .debug_loclists
9: 0000000000000000 0 SECTION LOCAL DEFAULT 9 .debug_abbrev
10: 0000000000000000 0 SECTION LOCAL DEFAULT 12 .debug_str_offsets
11: 0000000000000000 0 SECTION LOCAL DEFAULT 14 .debug_str
12: 0000000000000000 0 SECTION LOCAL DEFAULT 15 .debug_addr
13: 0000000000000000 0 SECTION LOCAL DEFAULT 21 .debug_frame
14: 0000000000000000 0 SECTION LOCAL DEFAULT 23 .debug_line
15: 0000000000000000 0 SECTION LOCAL DEFAULT 25 .debug_line_str
16: 0000000000000000 328 FUNC GLOBAL DEFAULT 3 bpf_sockmap
17: 0000000000000000 40 OBJECT GLOBAL DEFAULT 6 sockmap_ops
18: 0000000000000000 4 OBJECT GLOBAL DEFAULT 5 __license
readelf -x .maps bpf_sockops.o
Hex dump of section '.maps':
0x00000000 00000000 00000000 00000000 00000000 ................
0x00000010 00000000 00000000 00000000 00000000 ................
0x00000020 00000000 00000000 ........
can't objdump bpf_redir.o
objdump -d bpf_redir.o
bpf_redir.o: file format elf64-little
objdump: can't disassemble for architecture UNKNOWN!
use llvm-objdump to retrieve the content
llvm-objdump-18 -d --section=sk_skb/stream_verdict bpf_redir.o
bpf_redir.o: file format elf64-bpf
Disassembly of section sk_skb/stream_verdict:
0000000000000000 <bpf_redir>:
0: b7 00 00 00 01 00 00 00 r0 = 0x1
1: 61 12 58 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x58)
2: 55 02 37 00 02 00 00 00 if r2 != 0x2 goto +0x37 <LBB0_9>
3: 61 12 88 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x88)
4: 55 02 35 00 39 30 00 00 if r2 != 0x3039 goto +0x35 <LBB0_9>
5: 61 12 00 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x0)
6: 15 02 33 00 00 00 00 00 if r2 == 0x0 goto +0x33 <LBB0_9>
7: 61 13 60 00 00 00 00 00 r3 = *(u32 *)(r1 + 0x60)
8: 61 12 5c 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x5c)
9: 5d 32 0f 00 00 00 00 00 if r2 != r3 goto +0xf <LBB0_5>
10: b7 03 00 00 39 30 00 00 r3 = 0x3039
11: 6b 3a fc ff 00 00 00 00 *(u16 *)(r10 - 0x4) = r3
12: 63 2a f8 ff 00 00 00 00 *(u32 *)(r10 - 0x8) = r2
13: 63 2a f4 ff 00 00 00 00 *(u32 *)(r10 - 0xc) = r2
14: b7 02 00 00 02 00 00 00 r2 = 0x2
15: 63 2a f0 ff 00 00 00 00 *(u32 *)(r10 - 0x10) = r2
16: 61 12 84 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x84)
17: dc 02 00 00 20 00 00 00 r2 = be32 r2
18: 6b 2a fe ff 00 00 00 00 *(u16 *)(r10 - 0x2) = r2
19: bf a3 00 00 00 00 00 00 r3 = r10
20: 07 03 00 00 f0 ff ff ff r3 += -0x10
21: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
23: b7 04 00 00 01 00 00 00 r4 = 0x1
24: 05 00 0e 00 00 00 00 00 goto +0xe <LBB0_6>
00000000000000c8 <LBB0_5>:
25: 63 3a f8 ff 00 00 00 00 *(u32 *)(r10 - 0x8) = r3
26: 63 2a f4 ff 00 00 00 00 *(u32 *)(r10 - 0xc) = r2
27: b7 02 00 00 02 00 00 00 r2 = 0x2
28: 63 2a f0 ff 00 00 00 00 *(u32 *)(r10 - 0x10) = r2
29: 61 12 84 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x84)
30: b7 03 00 00 39 30 00 00 r3 = 0x3039
31: 6b 3a fe ff 00 00 00 00 *(u16 *)(r10 - 0x2) = r3
32: dc 02 00 00 20 00 00 00 r2 = be32 r2
33: 6b 2a fc ff 00 00 00 00 *(u16 *)(r10 - 0x4) = r2
34: bf a3 00 00 00 00 00 00 r3 = r10
35: 07 03 00 00 f0 ff ff ff r3 += -0x10
36: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
38: b7 04 00 00 00 00 00 00 r4 = 0x0
0000000000000138 <LBB0_6>:
39: 85 00 00 00 48 00 00 00 call 0x48
40: bf 01 00 00 00 00 00 00 r1 = r0
41: 67 01 00 00 20 00 00 00 r1 <<= 0x20
42: 77 01 00 00 20 00 00 00 r1 >>= 0x20
43: 15 01 09 00 01 00 00 00 if r1 == 0x1 goto +0x9 <LBB0_8>
44: bf 03 00 00 00 00 00 00 r3 = r0
45: 87 03 00 00 00 00 00 00 r3 = -r3
46: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll
48: b7 02 00 00 2a 00 00 00 r2 = 0x2a
49: bf 06 00 00 00 00 00 00 r6 = r0
50: 85 00 00 00 06 00 00 00 call 0x6
51: bf 60 00 00 00 00 00 00 r0 = r6
52: 05 00 05 00 00 00 00 00 goto +0x5 <LBB0_9>
00000000000001a8 <LBB0_8>:
53: 18 01 00 00 2a 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x2a ll
55: b7 02 00 00 2c 00 00 00 r2 = 0x2c
56: 85 00 00 00 06 00 00 00 call 0x6
57: b7 00 00 00 01 00 00 00 r0 = 0x1
00000000000001d0 <LBB0_9>:
58: 95 00 00 00 00 00 00 00 exit
1: 61 12 58 00 00 00 00 00 r2 = *(u32 *)(r1 + 0x58)
c code
skb->family
r6-r10: callee saved registers that will be preserved on helper function call
Callee-saved registers (AKA non-volatile registers, or call-preserved) are used to hold long-lived values that should be preserved across calls.
r10: Read-only with frame pointer address
sudo cat /sys/kernel/debug/tracing/trace_pipe
Timeout from modified bench
brianpan@brianpan-Aspire-A14-52MT:~/kernel/ebpf-tcp-server$ ./bench
Generating String...
Connecting...
Getting the socket name...
Send & Recv...
Finish Sending.
recv timeout occurred
Cat debugging pipe
cat /sys/kernel/debug/tracing/trace_pipe
irq/181-iwlwifi-564 [004] ..s31 11110.660072: bpf_trace_printk: Update map success.
bench-16639 [000] ...11 11158.604765: bpf_trace_printk: Update map success.
bench-16639 [000] ..s31 11158.604794: bpf_trace_printk: Update map success.
bench-16639 [000] ..s31 11159.621702: bpf_trace_printk: bpf_sk_redirect_hash() failed 0, error
Tried looking up the key with bpf_map_lookup_elem, but the BPF verifier does not allow converting the returned address to uintptr_t
/*
 * The lookup result is non-zero when the key is present, so report
 * "key found" only in that case (the test was previously inverted).
 * NOTE(review): for sockmap/sockhash maps the returned socket reference
 * presumably has to be released with bpf_sk_release() -- confirm against
 * the kernel sockmap documentation.
 */
uintptr_t r = (uintptr_t) bpf_map_lookup_elem(&sockmap_ops, &skm_key);
if (r) {
    bpf_printk("key found");
}
22:51:03.777244 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [S], seq 3130719987, win 65495, options [mss 65495,sackOK,TS val 3153089712 ecr 0,nop,wscale 7], length 0
22:51:03.777275 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39950: Flags [S.], seq 4067401096, ack 3130719988, win 65483, options [mss 65495,sackOK,TS val 3153089712 ecr 3153089712,nop,wscale 7], length 0
22:51:03.777314 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [.], ack 1, win 512, options [nop,nop,TS val 3153089712 ecr 3153089712], length 0
22:51:03.777380 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [P.], seq 1:50, ack 1, win 512, options [nop,nop,TS val 3153089712 ecr 3153089712], length 49
22:51:03.777387 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39950: Flags [.], ack 50, win 512, options [nop,nop,TS val 3153089712 ecr 3153089712], length 0
22:51:04.824177 IP brianpan-Aspire-A14-52MT.lan.39950 > brianpan-Aspire-A14-52MT.lan.12345: Flags [F.], seq 50, ack 1, win 512, options [nop,nop,TS val 3153090759 ecr 3153089712], length 0
22:51:04.864823 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39950: Flags [.], ack 51, win 512, options [nop,nop,TS val 3153090800 ecr 3153090759], length 0
Success call
22:54:11.986839 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [S], seq 4060196293, win 65495, options [mss 65495,sackOK,TS val 3153277922 ecr 0,nop,wscale 7], length 0
22:54:11.986882 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39392: Flags [S.], seq 1682861897, ack 4060196294, win 65483, options [mss 65495,sackOK,TS val 3153277922 ecr 3153277922,nop,wscale 7], length 0
22:54:11.986949 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [.], ack 1, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 0
22:54:11.987079 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [P.], seq 1:50, ack 1, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 49
22:54:11.987115 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39392: Flags [.], ack 50, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 0
22:54:11.987190 IP brianpan-Aspire-A14-52MT.lan.39392 > brianpan-Aspire-A14-52MT.lan.12345: Flags [F.], seq 50, ack 1, win 512, options [nop,nop,TS val 3153277922 ecr 3153277922], length 0
22:54:12.027846 IP brianpan-Aspire-A14-52MT.lan.12345 > brianpan-Aspire-A14-52MT.lan.39392: Flags [.], ack 51, win 512, options [nop,nop,TS val 3153277963 ecr 3153277922], length 0
/*
 * Look up the socket stored under key in a sock hash map.
 *
 * Hashes the key, selects the matching bucket, and searches that bucket's
 * list for an element with the same hash and key.
 *
 * Returns the matching element's socket pointer, or NULL when no element
 * matches.
 */
static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
{
/* Recover the enclosing bpf_shtab from its embedded map member. */
struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
u32 key_size = map->key_size, hash;
struct bpf_shtab_bucket *bucket;
struct bpf_shtab_elem *elem;
/* Warn once if no RCU read lock is held; this function takes no bucket lock itself. */
WARN_ON_ONCE(!rcu_read_lock_held());
hash = sock_hash_bucket_hash(key, key_size);
bucket = sock_hash_select_bucket(htab, hash);
elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
return elem ? elem->sk : NULL;
}
/*
 * BPF helper taking a sock_ops context, a map, a key and flags.
 *
 * When both sock_map_sk_is_suitable() accepts the context's socket and
 * sock_map_op_okay() accepts the context, the socket sops->sk is stored in
 * the map under key through sock_hash_update_common() and that function's
 * result is returned. Otherwise returns -EOPNOTSUPP.
 */
BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
struct bpf_map *, map, void *, key, u64, flags)
{
/* Warn once if the caller does not hold the RCU read lock. */
WARN_ON_ONCE(!rcu_read_lock_held());
if (likely(sock_map_sk_is_suitable(sops->sk) &&
sock_map_op_okay(sops)))
return sock_hash_update_common(map, key, sops->sk, flags);
return -EOPNOTSUPP;
}
/*
 * Insert or replace the element stored under key in a sock hash map so that
 * it refers to sk.
 *
 * Returns 0 on success, or a negative error:
 *   -EINVAL  flags is greater than BPF_EXIST
 *   -ENOMEM  the psock link could not be allocated
 *   -EEXIST  flags is BPF_NOEXIST and the key is already present
 *   -ENOENT  flags is BPF_EXIST and the key is not present
 *   any error returned by sock_map_link() or carried by the pointer
 *   returned from sock_hash_alloc_elem()
 */
static int sock_hash_update_common(struct bpf_map *map, void *key,
struct sock *sk, u64 flags)
{
/* Recover the enclosing bpf_shtab from its embedded map member. */
struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
u32 key_size = map->key_size, hash;
struct bpf_shtab_elem *elem, *elem_new;
struct bpf_shtab_bucket *bucket;
struct sk_psock_link *link;
struct sk_psock *psock;
int ret;
/* Warn once if the caller does not hold the RCU read lock. */
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
/* Allocate the link before the bucket lock is taken. */
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
/* A psock is expected on sk once sock_map_link() has succeeded. */
psock = sk_psock(sk);
WARN_ON_ONCE(!psock);
hash = sock_hash_bucket_hash(key, key_size);
bucket = sock_hash_select_bucket(htab, hash);
/* The existence check and the list update below run under the bucket lock. */
spin_lock_bh(&bucket->lock);
elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
if (elem && flags == BPF_NOEXIST) {
ret = -EEXIST;
goto out_unlock;
} else if (!elem && flags == BPF_EXIST) {
ret = -ENOENT;
goto out_unlock;
}
elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
if (IS_ERR(elem_new)) {
ret = PTR_ERR(elem_new);
goto out_unlock;
}
sock_map_add_link(psock, link, map, elem_new);
/* Add new element to the head of the list, so that
 * concurrent search will find it before old elem.
 */
hlist_add_head_rcu(&elem_new->node, &bucket->head);
/* When replacing, unlink the old element, drop its socket reference and free it. */
if (elem) {
hlist_del_rcu(&elem->node);
sock_map_unref(elem->sk, elem);
sock_hash_free_elem(htab, elem);
}
spin_unlock_bh(&bucket->lock);
return 0;
/* Error unwinding: release the bucket lock and the psock, then free the unused link. */
out_unlock:
spin_unlock_bh(&bucket->lock);
sk_psock_put(sk, psock);
out_free:
sk_psock_free_link(link);
return ret;
}
Hypothesis: if sock_hash_lookup_elem and bpf_sock_hash_update are running on different CPUs, the RCU reader may see stale data and fail to find the element
Tested 100 times with no further errors
// Sleep for 0.1 s (100000 microseconds) before the send/receive phase is announced.
usleep(100000);
printf("Send & Recv...\n");
sudo sysctl net.core.somaxconn=4096
sudo sysctl net.ipv4.tcp_max_syn_backlog=4096
ulimit -n 32768
https://github.com/Brianpan/ebpf-tcp-server/blob/main/benchmark/echo_bench.png
kecho performs better than the eBPF echo server.
Hypothesis: this may be caused by threads contending for the eBPF map
or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up