非root使用者無權限讀取/tmp/vgpu/cudevshr.cache問題
===

### 問題描述
當container以一般使用者（非root）身分執行時，存取與修改/tmp/vgpu/cudevshr.cache檔案失敗，錯誤訊息為：
multiprocess_memory_limit.c:645 Fail to open shrreg /tmp/vgpu/xxx.cache: errno=13（errno=13 即 EACCES，權限不足）

```
[4pdvGPU Msg(1:140596120348480:libvgpu.c:872)]: Initializing...
Matplotlib created a temporary config/cache directory at /tmp/matplotlib-qyzb67oo because the default path (/home/myUser/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: [2023-06-07 09:12:57.006051] inference on gpu: True
[4pdvGPU Msg(1:140596120348480:device.c:248)]: driver version=11060
[4pdvGPU Msg(1:140596120348480:hook.c:400)]: loaded nvml libraries
[4pdvGPU Msg(1:140596120348480:hook.c:408)]: initial_virtual_map
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:645)]: Fail to open shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=13
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:650)]: Fail to init shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:654)]: Fail to write shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:657)]: Fail to reseek shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:667)]: Fail to lock shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
```

![](https://hackmd.io/_uploads/ByhVYsaIn.png)

### 修改方法一
將nvcr.go中currentbundle的權限變更為777
[https://github.com/4paradigm/k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go](https://github.com/4paradigm/k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go)
路徑 k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go

```go=
// line 119

// SharedPath is the directory in which addMonitor creates a symlink named
// after ctrmsg[0].
const SharedPath = "/tmp/vgpu/containers/"

// addMonitor creates a "vgpucache/" directory under the current working
// directory, symlinks SharedPath+ctrmsg[0] to it, and appends a bind mount of
// that directory at /tmp/vgpu/ to spec.Mounts.
//
// It returns an error when ctrmsg is empty or when the symlink cannot be
// created; every other filesystem error in this function is discarded.
func (r nvidiaContainerRuntime) addMonitor(ctrmsg []string, spec *specs.Spec) error {
	if len(ctrmsg) == 0 {
		return errors.New("ctrmsg not matched")
	}
	// The perm argument of MkdirAll is applied before umask, so the directory
	// is not guaranteed to end up world-writable.
	// NOTE(review): presumably this is why a non-root container user hits
	// errno=13 — confirm against the umask the runtime runs with.
	os.MkdirAll(SharedPath, os.ModePerm) #line 128
	// Proposed fix: add 0777 permissions on SharedPath.
	// NOTE(review): the section heading names currentbundle, and currentbundle
	// (not SharedPath) is the Source of the /tmp/vgpu/ bind mount below —
	// confirm which directory actually needs the chmod.
	//os.Chmod(SharedPath, 0777)
	currentbundle, _ := os.Getwd()
	currentbundle = currentbundle + "/vgpucache/"
	os.MkdirAll(currentbundle, os.ModePerm)
	vgpupath := SharedPath + ctrmsg[0]
	// Remove whatever already exists at vgpupath before creating the symlink;
	// the result of Remove is ignored.
	os.Remove(vgpupath)
	err := os.Symlink(currentbundle, vgpupath)
	if err != nil {
		return errors.New("symbolic symbol creation failed")
	}
	// Repeats the MkdirAll on SharedPath already performed above.
	dpath := SharedPath
	os.MkdirAll(dpath, os.ModePerm)
	// Bind-mount the vgpucache directory at /tmp/vgpu/.
	sharedmnt := specs.Mount{
		Destination: "/tmp/vgpu/",
		Source:      currentbundle,
		Type:        "bind",
		Options:     []string{"rbind", "rw"},
	}
	spec.Mounts = append(spec.Mounts, sharedmnt)
	r.logger.Println("mounts=", spec.Mounts)
	dirname, _ := os.Getwd()
	r.logger.Println("pwd=", dirname)
	return nil
}
```

### 修改方法二
將CUDA_DEVICE_MEMORY_SHARED_CACHE使用權修改到user家目錄下
路徑 k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go

```go=
// addNVIDIAHook modifies the specified OCI specification in-place, inserting a
// prestart hook.
func (r nvidiaContainerRuntime) addNVIDIAHook(spec *specs.Spec) error {
	// Resolve the hook binary: look it up on PATH first, then fall back to
	// hookDefaultFilePath; if the fallback cannot be stat'ed, return that error.
	path, err := exec.LookPath("nvidia-container-runtime-hook")
	if err != nil {
		path = hookDefaultFilePath
		_, err = os.Stat(path)
		if err != nil {
			return err
		}
	}
	// NOTE(review): the format string has two %s verbs but only one argument.
	r.logger.Printf("prestart hook path: %s %s\n", path)
	// A GetNvidiaUUID failure is only logged; hook installation below still runs.
	envmap, newuuids, err := GetNvidiaUUID(r, spec.Process.Env)
	if err != nil {
		r.logger.Println("GetNvidiaUUID failed")
	} else {
		if len(envmap) > 0 {
			// restr accumulates the text later written to envfile.vgpu.
			restr := ""
			for idx, val := range envmap {
				restr = appendtofilestr(idx, val, restr)
				tmp1 := idx + "=" + val
				found := false
				// Overwrite every existing "idx=..." entry in the process
				// environment; append a new entry when none exists.
				for idx1, val1 := range spec.Process.Env {
					if strings.Compare(strings.Split(val1, "=")[0], idx) == 0 {
						spec.Process.Env[idx1] = tmp1
						found = true
						r.logger.Println("modified env", tmp1)
						// This continue is the last statement of the loop
						// body, so it does not change the iteration.
						continue
					}
				}
				if !found {
					spec.Process.Env = append(spec.Process.Env, tmp1)
					r.logger.Println("appended env", tmp1)
				}
			}
			restr = appendtofilestr("CUDA_DEVICE_MEMORY_SHARED_CACHE", "/tmp/vgpu/cudevshr.cache", restr) #L192
			// Proposed change: move the cache under the user's home directory.
			// NOTE(review): "~" is passed through as a literal string here;
			// presumably the consumer of CUDA_DEVICE_MEMORY_SHARED_CACHE does
			// not expand it — confirm, or use an absolute path.
			//restr = appendtofilestr("CUDA_DEVICE_MEMORY_SHARED_CACHE", "~/vgpu/cudevshr.cache", restr) #L192
			// Write the env file into the current working directory and
			// bind-mount it at /tmp/envfile.vgpu. The WriteFile and Getwd
			// errors are ignored.
			ioutil.WriteFile("envfile.vgpu", []byte(restr), os.ModePerm)
			dir, _ := os.Getwd()
			sharedmnt := specs.Mount{
				Destination: "/tmp/envfile.vgpu",
				Source:      dir + "/envfile.vgpu",
				Type:        "bind",
				Options:     []string{"rbind", "rw"},
			}
			spec.Mounts = append(spec.Mounts, sharedmnt)
			//spec.Mounts = append(spec.Mounts, )
		}
		if len(newuuids) > 0 {
			//r.logger.Println("Get new uuids", newuuids)
			//spec.Process.Env = append(spec.Process.Env, newuuids[0])
			// An addMonitor failure is logged and does not abort the hook setup.
			err1 := r.addMonitor(newuuids, spec)
			if err1 != nil {
				r.logger.Println("addMonitorPath failed", err1.Error())
			}
		}
	}
	args := []string{path}
	if spec.Hooks == nil {
		spec.Hooks = &specs.Hooks{}
	} else if len(spec.Hooks.Prestart) != 0 {
		// If a prestart hook whose path contains
		// "nvidia-container-runtime-hook" is already present, leave the hooks
		// untouched and return.
		for _, hook := range spec.Hooks.Prestart {
			if !strings.Contains(hook.Path, "nvidia-container-runtime-hook") {
				continue
			}
			r.logger.Println("existing nvidia prestart hook in OCI spec file")
			return nil
		}
	}
	// Register the hook with arguments [path, "prestart"].
	spec.Hooks.Prestart = append(spec.Hooks.Prestart, specs.Hook{
		Path: path,
		Args: append(args, "prestart"),
	})
	r.logger.Println("newEnvs=", spec.Process.Env)
	return nil
}
```

參考網址 [https://github.com/4paradigm/k8s-vgpu-scheduler/issues/12](https://github.com/4paradigm/k8s-vgpu-scheduler/issues/12)

### 4paradigm/k8s-vgpu-scheduler(中文版readme)
https://github.com/4paradigm/k8s-vgpu-scheduler/blob/master/README_cn.md
![](https://hackmd.io/_uploads/r1n0SR6Ln.png)
![](https://hackmd.io/_uploads/rkplwCpI3.png)