contributed by < Julian-Chu >
linux2021
確認爲 Ubuntu 20.04
cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.2 LTS"
lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 39 bits physical, 48 bits virtual
CPU(s): 8
On-line CPU(s) list: 0-7
Thread(s) per core: 2
Core(s) per socket: 4
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 94
Model name: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
Stepping: 3
CPU MHz: 2600.479
CPU max MHz: 3500.0000
CPU min MHz: 800.0000
BogoMIPS: 5199.98
Virtualization: VT-x
L1d cache: 128 KiB
L1i cache: 128 KiB
L2 cache: 1 MiB
L3 cache: 6 MiB
NUMA node0 CPU(s): 0-7
Vulnerability Itlb multihit: KVM: Mitigation: Split huge pages
Vulnerability L1tf: Mitigation; PTE Inversion; VMX conditional cache flushes, SMT vulnerable
Vulnerability Mds: Mitigation; Clear CPU buffers; SMT vulnerable
Vulnerability Meltdown: Mitigation; PTI
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Mitigation; Full generic retpoline, IBPB conditional, IBRS_FW, STIBP conditional, RSB filling
Vulnerability Srbds: Mitigation; Microcode
Vulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT vulnerable
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm c
onstant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg
fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpc
id_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdsee
d adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d
使用 clock_gettime
來取得時間
使用 ktime 量測執行時間, 仿照作業說明改寫fib_write
輸出在 kernel 的執行時間
sudo taskset -c 0 ./client-plot
發現會比綁定之前有更多的雜訊
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=0"
還是會出現雜訊
避免設定 isolcpus=0
,而是設定為最後一個有效的 CPU,然後 IRQ affinity 也要調整
:notes: jserv
老師我想請教一下, 設定最後一個有效的 CPU 的原因是什麼, CPU 0 會有特殊用途嗎
前幾次執行都還有雜訊, 後續穩定下來(?
準備以下 shell script 來設定 CPU 以最高頻率執行所有 process
$ cat performance.sh
for i in /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
do
echo performance > /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
done
$ sudo sh performance.sh
關閉 Intel 處理器的 turbo mode
sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
開關 turbo mode 幾次, 發現 userspace 的效能在 turbo 關閉下需要的執行時間明顯增加, 但在 kernel 影響略小 ( turbo mode 對 userspace 跟 kernel 的影響差異? )
todo: cache/memory bound 與常見的 CPU/IO bound 差異
參考KYG-yaya573142 的報告, 透過 mlock 的系統呼叫, 確保 特定 page 不會被 swap out
與上一步相比, 前幾次的 userspace 執行約降低 200 ns
練習實作的 big number with xs 不夠優雅, 學習模仿 AdrianHuang的實作
知識點:
Reading from /dev/fibonacci at offset 90, returned the sequence 2880067194370816120.
Reading from /dev/fibonacci at offset 91, returned the sequence 4660046610375530309.
Reading from /dev/fibonacci at offset 92, returned the sequence 7540113804746346429.
-Reading from /dev/fibonacci at offset 93, returned the sequence 12200160415121876738.
-Reading from /dev/fibonacci at offset 94, returned the sequence 19740274219868223167.
-Reading from /dev/fibonacci at offset 95, returned the sequence 31940434634990099905.
-Reading from /dev/fibonacci at offset 96, returned the sequence 51680708854858323072.
-Reading from /dev/fibonacci at offset 97, returned the sequence 83621143489848422977.
-Reading from /dev/fibonacci at offset 98, returned the sequence 135301852344706746049.
-Reading from /dev/fibonacci at offset 99, returned the sequence 218922995834555169026.
-Reading from /dev/fibonacci at offset 100, returned the sequence 354224848179261915075.
offset 92 之後的數值正確
相較原來版本的效能, 降低非常多, 意外的是 kernel to userspace 的效能與原本差異不大。
todo: 了解 copy_to_user
初版實作, 利用 unsigned long long data[6]
代表big number, BigN_carry
代表每個元素需要進位的上限, 只需要計算對應 index 的加法跟進位, 缺點彈性差, 但實作較爲簡單。
#include <stdio.h>
#include <string.h>
/* Limb base: each element of data[] holds one base-10^19 "digit".
 * The ULL suffix is required — 10^19 does not fit in (signed) long long,
 * so the unsuffixed literal is not a valid standard C constant. */
#define BigN_carry 10000000000000000000ULL
/* Fixed-width big number: 6 limbs, little-endian (data[0] is the least
 * significant limb). 6 * 19 = 114 decimal digits, enough for F(500)
 * (105 digits). */
typedef struct {
    unsigned long long data[6];  /* original was missing the ';' here */
} BigN;
/* Reset every limb of *n to zero (all-bits-zero is the value 0 for
 * unsigned long long, so memset is exact here). */
void init_BigN(BigN *n)
{
    memset(n->data, 0, sizeof(n->data));
}
/* res = a + b, limb by limb with carry propagation.
 * A limb overflows when a->data[i] + b->data[i] + carry >= BigN_carry.
 * NOTE: the original used '>', which misses the exact-boundary case where
 * the sum equals BigN_carry — the limb must become 0 with carry 1, not
 * hold the out-of-range value 10^19. */
void BigN_add(BigN *a, BigN *b, BigN *res)
{
    int data_size = sizeof(res->data) / sizeof(res->data[0]);
    unsigned long long carry = 0;
    for (int i = 0; i < data_size; i++) {
        /* a->data[i] + carry cannot wrap: limbs are < 10^19 < ULLONG_MAX */
        if (a->data[i] + carry >= BigN_carry - b->data[i]) {
            /* the subtraction wraps modulo 2^64, but the final value is
             * exact because the true limb result lies in [0, BigN_carry) */
            res->data[i] = (a->data[i] - BigN_carry) + b->data[i] + carry;
            carry = 1;
        } else {
            res->data[i] = a->data[i] + b->data[i] + carry;
            carry = 0;
        }
    }
}
/* Return F(k) as a BigN.
 * The original kept the whole sequence in a VLA `BigN f[k + 2]` — O(k)
 * stack usage (~24 KB at k = 500), risky in general and fatal in kernel
 * context. Only the last two values are ever needed, so keep just those. */
BigN fib_sequence(long long k)
{
    /* FIXME: use clz/ctz and fast algorithms to speed up */
    BigN prev, curr;            /* prev = F(i-1), curr = F(i) */
    init_BigN(&prev);           /* F(0) = 0 */
    init_BigN(&curr);           /* F(1) = 1 */
    curr.data[0] = 1;
    if (k <= 0)
        return prev;
    for (long long i = 2; i <= k; i++) {
        BigN next;
        init_BigN(&next);
        BigN_add(&prev, &curr, &next);
        prev = curr;
        curr = next;
    }
    return curr;                /* k >= 1: curr holds F(k) */
}
/* Render *n as a decimal string appended to str (str must already be
 * NUL-terminated; 128 bytes fits the 114-digit maximum plus NUL).
 * Fixes over the original:
 *  - loop started at i = 6, but data[] has 6 limbs (valid indices 0..5),
 *    so data[6] was an out-of-bounds read;
 *  - "%lld" is the wrong conversion for unsigned long long;
 *  - every limb below the most significant one must be zero-padded to
 *    19 digits (the limb base is 10^19) — the original dropped leading
 *    zeros AND skipped interior zero limbs entirely. */
static void BigN_to_str(BigN *n, char str[128])
{
    int msd = 5;                /* index of most significant limb */
    while (msd > 0 && n->data[msd] == 0)
        msd--;                  /* skip leading zero limbs only */

    char substr[32];
    /* top limb: printed without padding */
    snprintf(substr, sizeof(substr), "%llu", n->data[msd]);
    strcat(str, substr);
    /* lower limbs: always 19 zero-padded digits */
    for (int i = msd - 1; i >= 0; i--) {
        snprintf(substr, sizeof(substr), "%019llu", n->data[i]);
        strcat(str, substr);
    }
}
/* Smoke test: print F(500) in decimal. */
int main(void)
{
    char buf[128] = "";
    BigN fib500 = fib_sequence(500);
    BigN_to_str(&fib500, buf);
    printf("ans: %s\n", buf);
    return 0;
}
ans: 13942322456169788013972438287407283950070256587697307264108962948325571622863290691557658876222521294125
以上測試程式碼可以印出 f(500)
但是移植到 fibdrv 時出現問題
/* Convert *n to a decimal string by concatenating the limbs.
 * NOTE(review): this is the broken first attempt discussed below; known
 * defects:
 *  - i starts at 6, but data[] has 6 limbs (valid indices 0..5), so
 *    data[6] is an out-of-bounds read;
 *  - "%lld" is the wrong conversion for unsigned long long ("%llu");
 *  - the `continue` drops ANY zero limb (not just leading ones), and
 *    lower limbs are not zero-padded to 19 digits, so concatenation
 *    produces wrong values whenever a lower limb has < 19 digits. */
static void BigN_to_str(BigN *n, char str[128]){
for(int i= 6; i >=0 ; i--){
if(n->data[i] == 0 && i != 0){
continue;
}
char substr[32];
sprintf(substr, "%lld", n->data[i]);
strcat(str, substr);
printk("substr: %s\n", substr);
}
printk("str: %s\n", str);
}
/* calculate the fibonacci number at given offset */
/* Reads F(*offset), formats it into a 128-byte string, and copies the
 * whole buffer to userspace. The module-level kt records the elapsed
 * kernel-side time (ktime_get() pair around the computation). */
static ssize_t fib_read(struct file *file,
char __user *user_buf,
size_t size,
loff_t *offset)
{
    char str[128] = "";

    kt = ktime_get();
    BigN result = fib_sequence(*offset);
    BigN_to_str(&result, str);
    printk("res: %s\n", str);
    /* copy_to_user returns the number of bytes NOT copied; nonzero means
     * the user buffer was bad or too small — the original ignored this. */
    if (copy_to_user(user_buf, str, sizeof(str)))
        return -EFAULT;
    kt = ktime_sub(ktime_get(), kt);
    return sizeof(str);   /* same 128 bytes as before */
}
利用 dmesg
觀察 printk 印出來的值, 發現 substr會重複印出兩次, 而且值都不相同
[18328.402830] substr: 2095270912
[18328.402831] substr: 0
[18328.402832] str: 20952709120
[18328.402832] res: 20952709120
[18328.402834] substr: 892928048
[18328.402835] substr: 1
[18328.402835] str: 8929280481
[18328.402835] res: 8929280481
發現是 static function
導致的問題, 且 sprintf
用錯格式, unsigned long long
應爲 "%llu"
, sprintf
修正為 snprintf
修正後
/* Convert *n to a decimal string appended to str, with debug printk of
 * each emitted chunk. Two bugs remained in the "fixed" version:
 *  - loop still started at i = 6 while data[] has valid indices 0..5
 *    (out-of-bounds read);
 *  - lower limbs were not zero-padded to 19 digits (limb base is 10^19),
 *    and interior zero limbs were skipped, corrupting the output. */
void BigN_to_str(BigN *n, char str[128]) {
    int msd = 5;                       /* highest valid limb index */
    while (msd > 0 && n->data[msd] == 0)
        msd--;                         /* skip leading zero limbs only */

    char substr[32];
    snprintf(substr, sizeof(substr), "%llu", n->data[msd]);
    strcat(str, substr);
    printk("substr: %s\n", substr);
    for (int i = msd - 1; i >= 0; i--) {
        /* always 19 digits for non-leading limbs */
        snprintf(substr, sizeof(substr), "%019llu", n->data[i]);
        strcat(str, substr);
        printk("substr: %s\n", substr);
    }
    printk("str: %s\n", str);
}
修正後用 dmesg 查看正常
[19155.534939] substr: 1
[19155.534939] str: 1
[19155.534940] res: 1
[19155.534948] substr: 0
[19155.534949] str: 0
[19155.534949] res: 0
todo: 確認 static function 的行爲以及函數內部變數的生命週期
上一節自定義結構在實作乘法時遇到困難, 需要改寫, 先以原始版本實作 fast doubling
/* Fast-doubling Fibonacci:
 *   F(2n)   = F(n) * (2*F(n+1) - F(n))
 *   F(2n+1) = F(n)^2 + F(n+1)^2
 * Walk the bits of k from the most significant down, doubling each step.
 * Valid while F(k) fits in long long (k <= 92). */
static long long fib_sequence(long long k)
{
    /* FIXME: use clz/ctz and fast algorithms to speed up */
    long long bits = 0;
    for (long long t = k; t; t >>= 1)
        bits++;

    long long fn = 0, fn1 = 1;          /* F(0), F(1) */
    for (long long step = bits; step > 0; step--) {
        long long f_even = fn * (2 * fn1 - fn);   /* F(2n)   */
        long long f_odd = fn * fn + fn1 * fn1;    /* F(2n+1) */
        if ((k >> (step - 1)) & 1) {
            fn = f_odd;
            fn1 = f_even + f_odd;
        } else {
            fn = f_even;
            fn1 = f_odd;
        }
    }
    return fn;
}
相較原始版本, 整體效能有提升
將上述程式碼中用以計算 msb 位數的迴圈用 clz
取代,
/* long long msb = 0; */
/* for(long long i = k; i; i>>=1){ */
/* msb++; */
/* } */
/* NOTE(review): __builtin_clz() operates on unsigned int (32 bits), so
 * 64 - __builtin_clz(k) is wrong for a long long k (k = 500 gives 42,
 * not the expected 9). Use 64 - __builtin_clzll(k) instead; also note
 * __builtin_clz*(0) is undefined behavior. This is the code error the
 * write-up later acknowledges. */
long long msb = 64 - __builtin_clz(k);
測試後效能沒有比原本用 loop 計算 msb 的作法好
單獨取出 loop 跟 clz 運算作比較,
/* First attempt at benchmarking loop-based vs clz-based MSB counting.
 * NOTE(review): flawed as the write-up later concedes:
 *  - both implementations are timed back-to-back inside the SAME loop
 *    iteration, so cache/branch-predictor warm-up from the first
 *    measurement contaminates the second;
 *  - __builtin_clz() is the 32-bit variant, so "64 - clz" computes a
 *    wrong msb for long long (should be __builtin_clzll);
 *  - the inner `long long i` shadows the outer loop counter `i`;
 *  - the msb results are dead stores the optimizer may remove at -O2. */
int main()
{
struct timespec t_start, t_end;
long long k = 500;
long long msb = 0;
for (int i = 0; i <= 1000; i++) {
long long elapsed_time, elapsed_time_clz;
msb = 0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
/* inner i shadows the outer loop variable */
for(long long i = k; i; i>>=1){
msb++;
}
clock_gettime(CLOCK_MONOTONIC, &t_end);
elapsed_time = (t_end.tv_sec * NANOSEC + t_end.tv_nsec) -
(t_start.tv_sec * NANOSEC + t_start.tv_nsec);
msb = 0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
/* BUG: 32-bit clz used with a long long operand */
msb = 64 - __builtin_clz(k);
clock_gettime(CLOCK_MONOTONIC, &t_end);
elapsed_time_clz = (t_end.tv_sec * NANOSEC + t_end.tv_nsec) -
(t_start.tv_sec * NANOSEC + t_start.tv_nsec);
printf("%d %lld %lld\n", i, elapsed_time, elapsed_time_clz);
}
return 0;
}
結果
gcc -O0
還是使用 loop 作運算較佳測試 optimization option 對 clz的影響
當 -O2
後, __builtin_clz
的效能已經與 loop
持平,
optimize options 對 __builtin_clz
在這個測試有顯著的影響
以上是錯誤結論, 原本程式碼有錯誤。 更新測試。
注意測試範圍和測試程式的撰寫方式,避免在同一個迴圈先後執行不同的實作 :notes: jserv
重新組織測試範圍跟程式碼:
/* Use an integer constant: the original "1e9" is a double and silently
 * converts the whole elapsed-time calculation to floating point. */
#define NANOSEC 1000000000LL
/* Benchmark loop-based vs clz-based MSB counting, each in its own pass.
 * Fixes over the original: loops now run i < size (i <= size wrote one
 * element past the end of both arrays), __builtin_clzll replaces the
 * 32-bit __builtin_clz, and the missing closing brace of main is added. */
int main()
{
    struct timespec t_start, t_end;
    long long k = 500;
    const int size = 1000;
    long long elapsed_time_clz[size], elapsed_time[size];

    /* Pass 1: shift-loop MSB count. */
    for (int i = 0; i < size; i++) {
        long long msb = 0;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        for (long long j = k; j; j >>= 1) {
            msb++;
        }
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        elapsed_time[i] = (t_end.tv_sec * NANOSEC + t_end.tv_nsec) -
                          (t_start.tv_sec * NANOSEC + t_start.tv_nsec);
        (void) msb;   /* NOTE: at -O2 the compiler may still discard the
                       * dead msb computation — compare results at -O0 */
    }

    /* Pass 2: clz-based MSB count. __builtin_clzll handles the full
     * 64-bit operand (clz(0) is UB, but k is fixed at 500 here). */
    for (int i = 0; i < size; i++) {
        long long msb = 0;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        msb = 64 - __builtin_clzll(k);
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        elapsed_time_clz[i] = (t_end.tv_sec * NANOSEC + t_end.tv_nsec) -
                              (t_start.tv_sec * NANOSEC + t_start.tv_nsec);
        (void) msb;
    }

    for (int i = 0; i < size; i++) {
        printf("%d %lld %lld \n", i, elapsed_time[i], elapsed_time_clz[i]);
    }
    return 0;
}
gcc -O0
clz 運算明顯勝過 loop
gcc -O1
兩者的差距已經拉近, clz 還是較佳, optimization option 對 clz 也有優化效果
gcc -O2
兩者幾乎相同
contributed by < Julian-Chu >
Oct 8, 2023
Aug 12, 2023contributed by < Julian-Chu > K08: ktcp 自我檢查清單 參照 Linux 核心模組掛載機制,解釋 $ sudo insmod khttpd.ko port=1999 這命令是如何讓 port=1999 傳遞到核心,作為核心模組初始化的參數呢? 首先會看到 main.c 裡面對 port 參數進行宣告跟初始化 static ushort port = DEFAULT_PORT; module_param(port, ushort, S_IRUGO);
Jun 8, 2022contributed by < Julian-Chu > GitHub Data type 主要使用的 struct, 與去年不同改以 Linux Doubly Linked Lists 實作 lab0 typedef struct { char *value; struct list_head list; } element_t;
May 8, 2022or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up