## Install Helm (v2)

```
cd /var/cache/GMN_image_files/common/helm
tar -zxvf linux-amd64.tgz
cp linux-amd64/helm /usr/local/bin/
docker load -i tiller.tar
kubectl -n kube-system create sa tiller
kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller
helm init --service-account tiller --skip-refresh --force-upgrade --upgrade --tiller-image gcr.io/kubernetes-helm/tiller:v2.16.7
helm init --client-only
helm version
```

## Install Prometheus

Create the `monitoring` namespace:

```
kubectl create ns monitoring
```

Create the Helm values file `/tmp/prometheus.j2`:

```
hostname: node1
nodePort: 31002
port: 9090
tls_config:
  cert_file: member-node1.pem
  key_file: member-node1-key.pem
```

Install the chart with these values:

```
helm install -n prometheus --namespace=monitoring /var/cache/GMN_image_files/common/prometheus/helm_prometheus.tgz -f /tmp/prometheus.j2
```
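As an optional sanity check, you can confirm that the release deployed and that the server answers on the configured nodePort. This is only a sketch: the release name `prometheus` and the plain-HTTP `/-/ready` probe are assumptions based on the command and values above (the values file also references TLS material, so adjust the scheme if your deployment serves HTTPS).

```
# Confirm the Helm release and the Prometheus pod in the monitoring namespace
helm status prometheus
kubectl -n monitoring get pods

# The values file exposes Prometheus on nodePort 31002 of node1;
# an HTTP 200 from /-/ready means the server is ready to serve traffic
curl -s -o /dev/null -w '%{http_code}\n' http://node1:31002/-/ready
```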
Add the following scrape jobs to the Prometheus configuration:

```
...
scrape_configs:
- job_name: monitoring/kubeshare-collector/0
  honor_timestamps: true
  scrape_interval: 5s
  scrape_timeout: 5s
  metrics_path: /kubeshare-collector
  scheme: http
  kubernetes_sd_configs:
  - role: endpoints
    namespaces:
      names:
      - kube-system
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_label_app]
    separator: ;
    regex: kubeshare-collector
    replacement: $1
    action: keep
  - source_labels: [__meta_kubernetes_endpoint_port_name]
    separator: ;
    regex: collector
    replacement: $1
    action: keep
  - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name]
    separator: ;
    regex: Node;(.*)
    target_label: node
    replacement: ${1}
    action: replace
  - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name]
    separator: ;
    regex: Pod;(.*)
    target_label: pod
    replacement: ${1}
    action: replace
  - source_labels: [__meta_kubernetes_namespace]
    separator: ;
    regex: (.*)
    target_label: namespace
    replacement: $1
    action: replace
  - source_labels: [__meta_kubernetes_service_name]
    separator: ;
    regex: (.*)
    target_label: service
    replacement: $1
    action: replace
  - source_labels: [__meta_kubernetes_pod_name]
    separator: ;
    regex: (.*)
    target_label: pod
    replacement: $1
    action: replace
  - source_labels: [__meta_kubernetes_service_name]
    separator: ;
    regex: (.*)
    target_label: job
    replacement: ${1}
    action: replace
  - source_labels: [__meta_kubernetes_service_label_app]
    separator: ;
    regex: (.+)
    target_label: job
    replacement: ${1}
    action: replace
  - separator: ;
    regex: (.*)
    target_label: endpoint
    replacement: collector
    action: replace
- job_name: monitoring/kubeshare-aggregator/0
  honor_timestamps: true
  scrape_interval: 5s
  scrape_timeout: 5s
  metrics_path: /kubeshare-aggregator
  scheme: http
  kubernetes_sd_configs:
  - role: endpoints
    namespaces:
      names:
      - kube-system
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_label_app]
    separator: ;
    regex: kubeshare-aggregator
    replacement: $1
    action: keep
  - source_labels: [__meta_kubernetes_endpoint_port_name]
    separator: ;
    regex: aggregator
    replacement: $1
    action: keep
  - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name]
    separator: ;
    regex: Node;(.*)
    target_label: node
    replacement: ${1}
    action: replace
  - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name]
    separator: ;
    regex: Pod;(.*)
    target_label: pod
    replacement: ${1}
    action: replace
  - source_labels: [__meta_kubernetes_namespace]
    separator: ;
    regex: (.*)
    target_label: namespace
    replacement: $1
    action: replace
  - source_labels: [__meta_kubernetes_service_name]
    separator: ;
    regex: (.*)
    target_label: service
    replacement: $1
    action: replace
  - source_labels: [__meta_kubernetes_pod_name]
    separator: ;
    regex: (.*)
    target_label: pod
    replacement: $1
    action: replace
  - source_labels: [__meta_kubernetes_service_name]
    separator: ;
    regex: (.*)
    target_label: job
    replacement: ${1}
    action: replace
  - source_labels: [__meta_kubernetes_service_label_app]
    separator: ;
    regex: (.+)
    target_label: job
    replacement: ${1}
    action: replace
  - separator: ;
    regex: (.*)
    target_label: endpoint
    replacement: aggregator
    action: replace
...
```

Then restart Prometheus so the new scrape jobs take effect.

## Install KubeShare v2

Import the KubeShare images: `find /mnt/172.16.200.50/CAT/Kubeshare-Release/Release-v2.1.5/ -name "*.tar" | xargs -I {} docker load -i {}`

Add the `SharedGPU=true` label to each GPU node: `kubectl label nodes node1 SharedGPU=true`

The cluster manager (master node) needs to provide the physical GPU topology file at `/kubeshare/scheduler/kubeshare-config.yaml`:

```
cellTypes:
  2080-NODE:
    childCellType: "NVIDIA-GeForce-RTX-2080-Ti"
    childCellNumber: 2
    childCellPriority: 100
    isNodeLevel: true
cells:
- cellType: 2080-NODE
  cellId: default-k8s
```

**Important:** the `cellId` is the master node name; it must exactly match the name of the master node.

Use Helm to install KubeShare v2: `helm install -n kubeshare-v2 --namespace=kube-system /mnt/172.16.200.50/CAT/Kubeshare-Release/Release-v2.1.5/helm_kubeshare_Release-v2.1.5.tgz`

Check the `gpu_capacity` metric in Prometheus; it should have a value, because KubeShare needs to read it.
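One quick way to confirm the metric is being scraped is to query it through the Prometheus HTTP API. This is a sketch under the assumptions used earlier in this guide: the server is reachable over plain HTTP at `node1:31002` (from the values file above); adjust the URL if your deployment differs.

```
# Query the gpu_capacity metric through the Prometheus HTTP API
curl -s 'http://node1:31002/api/v1/query?query=gpu_capacity'

# A non-empty "result" array in the JSON response means the metric has values;
# an empty array means the kubeshare scrape jobs are not collecting data yet.
```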