非root使用者無權限讀取/tmp/vgpu/cudevshr.cache問題
===
### 問題描述
當 container 內以一般使用者執行時,訪問與修改 /tmp/vgpu/cudevshr.cache 檔案會失敗,錯誤訊息如下:
multiprocess_memory_limit.c:645 Fail to open shrreg /tmp/vgpu/xxx.cache: errno=13
```
[4pdvGPU Msg(1:140596120348480:libvgpu.c:872)]: Initializing...
Matplotlib created a temporary config/cache directory at /tmp/matplotlib-qyzb67oo because the default path (/home/myUser/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: [2023-06-07 09:12:57.006051] inference on gpu: True
[4pdvGPU Msg(1:140596120348480:device.c:248)]: driver version=11060
[4pdvGPU Msg(1:140596120348480:hook.c:400)]: loaded nvml libraries
[4pdvGPU Msg(1:140596120348480:hook.c:408)]: initial_virtual_map
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:645)]: Fail to open shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=13
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:650)]: Fail to init shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:654)]: Fail to write shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:657)]: Fail to reseek shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
[4pdvGPU ERROR (pid:1 thread=140596120348480 multiprocess_memory_limit.c:667)]: Fail to lock shrreg /tmp/vgpu/21a4261c-d971-40f1-a864-a20033b9fb02.cache: errno=9
```

### 修改方法一
修改 nvcr.go,將 SharedPath 與 currentbundle 目錄的權限變更為 777
[https://github.com/4paradigm/k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go](https://github.com/4paradigm/k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go)
路徑 k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go
```go=
// line 119
const SharedPath = "/tmp/vgpu/containers/"
func (r nvidiaContainerRuntime) addMonitor(ctrmsg []string, spec *specs.Spec) error {
if len(ctrmsg) == 0 {
return errors.New("ctrmsg not matched")
}
os.MkdirAll(SharedPath, os.ModePerm) #line 128
//新增777權限在SharedPath
//os.Chmod(SharedPath, 0777)
currentbundle, _ := os.Getwd()
currentbundle = currentbundle + "/vgpucache/"
os.MkdirAll(currentbundle, os.ModePerm)
vgpupath := SharedPath + ctrmsg[0]
os.Remove(vgpupath)
err := os.Symlink(currentbundle, vgpupath)
if err != nil {
return errors.New("symbolic symbol creation failed")
}
dpath := SharedPath
os.MkdirAll(dpath, os.ModePerm)
sharedmnt := specs.Mount{
Destination: "/tmp/vgpu/",
Source: currentbundle,
Type: "bind",
Options: []string{"rbind", "rw"},
}
spec.Mounts = append(spec.Mounts, sharedmnt)
r.logger.Println("mounts=", spec.Mounts)
dirname, _ := os.Getwd()
r.logger.Println("pwd=", dirname)
return nil
}
```
### 修改方法二
將 CUDA_DEVICE_MEMORY_SHARED_CACHE 的快取檔案路徑修改到 user 家目錄下
路徑 k8s-vgpu-scheduler/blob/v1.1.1.5/cmd/nvidia-container-runtime/nvcr.go
```go=
// addNVIDIAHook modifies the specified OCI specification in-place, inserting a
// prestart hook.
func (r nvidiaContainerRuntime) addNVIDIAHook(spec *specs.Spec) error {
path, err := exec.LookPath("nvidia-container-runtime-hook")
if err != nil {
path = hookDefaultFilePath
_, err = os.Stat(path)
if err != nil {
return err
}
}
r.logger.Printf("prestart hook path: %s %s\n", path)
envmap, newuuids, err := GetNvidiaUUID(r, spec.Process.Env)
if err != nil {
r.logger.Println("GetNvidiaUUID failed")
} else {
if len(envmap) > 0 {
restr := ""
for idx, val := range envmap {
restr = appendtofilestr(idx, val, restr)
tmp1 := idx + "=" + val
found := false
for idx1, val1 := range spec.Process.Env {
if strings.Compare(strings.Split(val1, "=")[0], idx) == 0 {
spec.Process.Env[idx1] = tmp1
found = true
r.logger.Println("modified env", tmp1)
continue
}
}
if !found {
spec.Process.Env = append(spec.Process.Env, tmp1)
r.logger.Println("appended env", tmp1)
}
}
restr = appendtofilestr("CUDA_DEVICE_MEMORY_SHARED_CACHE", "/tmp/vgpu/cudevshr.cache", restr) #L192
//修改為使用者目錄下
//restr = appendtofilestr("CUDA_DEVICE_MEMORY_SHARED_CACHE", "~/vgpu/cudevshr.cache", restr) #L192
ioutil.WriteFile("envfile.vgpu", []byte(restr), os.ModePerm)
dir, _ := os.Getwd()
sharedmnt := specs.Mount{
Destination: "/tmp/envfile.vgpu",
Source: dir + "/envfile.vgpu",
Type: "bind",
Options: []string{"rbind", "rw"},
}
spec.Mounts = append(spec.Mounts, sharedmnt)
//spec.Mounts = append(spec.Mounts, )
}
if len(newuuids) > 0 {
//r.logger.Println("Get new uuids", newuuids)
//spec.Process.Env = append(spec.Process.Env, newuuids[0])
err1 := r.addMonitor(newuuids, spec)
if err1 != nil {
r.logger.Println("addMonitorPath failed", err1.Error())
}
}
}
args := []string{path}
if spec.Hooks == nil {
spec.Hooks = &specs.Hooks{}
} else if len(spec.Hooks.Prestart) != 0 {
for _, hook := range spec.Hooks.Prestart {
if !strings.Contains(hook.Path, "nvidia-container-runtime-hook") {
continue
}
r.logger.Println("existing nvidia prestart hook in OCI spec file")
return nil
}
}
spec.Hooks.Prestart = append(spec.Hooks.Prestart, specs.Hook{
Path: path,
Args: append(args, "prestart"),
})
r.logger.Println("newEnvs=", spec.Process.Env)
return nil
}
```
參考網址
[https://github.com/4paradigm/k8s-vgpu-scheduler/issues/12](https://github.com/4paradigm/k8s-vgpu-scheduler/issues/12)
### 4paradigm/k8s-vgpu-scheduler(中文版readme)
https://github.com/4paradigm/k8s-vgpu-scheduler/blob/master/README_cn.md

