Submarine

MISC

Request to submarine hitcount server

# Fire a large number of requests at the Submarine hit-count badge server.
# curl -s suppresses progress output; the SVG body is discarded per request.
for i in {1..1000000}; do
  curl -s https://hits.dwyl.com/apache/submarine.svg >/dev/null 2>&1
done

Create a python virtualenv

# Create a Python virtualenv with TensorFlow 1.13.1 and zip it for YARN shipping.
sudo apt-get install python3-distutils
wget https://files.pythonhosted.org/packages/33/bc/fa0b5347139cd9564f0d44ebd2b147ac97c36b2403943dbee8a25fd74012/virtualenv-16.0.0.tar.gz
tar xf virtualenv-16.0.0.tar.gz
python3 virtualenv-16.0.0/virtualenv.py venv
. venv/bin/activate
pip3 install tensorflow==1.13.1
zip -r myvenv.zip venv   # archive the whole env so YARN containers can unpack it
deactivate

Install protobuf

# Build and install protobuf 3.10.1 from source (python release tarball).
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.10.1/protobuf-python-3.10.1.tar.gz
tar -zxvf protobuf-python-3.10.1.tar.gz
cd protobuf-3.10.1
./configure
make
sudo make install
sudo ldconfig      # refresh shared-library cache so libprotoc is found
protoc --version   # expected output: libprotoc 3.10.1

compile protobuf

  • pip install grpcio
  • pip install grpcio-tools
python -m grpc_tools.protoc -I../../submarine-commons/commons-rpc/src/main/proto/ --python_out=./submarine/proto --grpc_python_out=./submarine/proto ../../submarine-commons/commons-rpc/src/main/proto/SubmarineServerProtocol.proto

Setup for TensorFlow to read/write Apache HDFS

# Environment required for TensorFlow to read/write Apache HDFS.
export JAVA_HOME=/path/to/java-version-openjdk
export HADOOP_HDFS_HOME=/path/to/hadoop   # normally where you extracted the hadoop tar file
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server
export PATH=$PATH:$HADOOP_HDFS_HOME/bin:$HADOOP_HDFS_HOME/sbin
export CLASSPATH="$(hadoop classpath --glob)"

Setup for TensorFlow to read/write Cloudera (CDH) HDFS

# On CDH, HADOOP_HOME must NOT be set when storing checkpoints in HDFS.
# (As originally pasted on one line, this leading comment commented out
#  every export below it — they must each be on their own line.)
export HADOOP_HDFS_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-hdfs
export JRE_HOME=/opt/jdk1.8.0_221/jre
export CLASSPATH="$(hadoop classpath --glob)"
export JAVA_HOME=/opt/jdk1.8.0_221
export PATH=$PATH:/opt/jdk1.8.0_221/bin:/opt/apache-maven-3.6.1/bin
export LD_LIBRARY_PATH=/opt/cloudera/parcels/CDH/lib64:$JAVA_HOME/jre/lib/amd64/server

mini-submarine save checkpoint to hdfs

# Launch a distributed TensorFlow job (2 workers + 1 PS) via the Submarine CLI;
# the PS writes checkpoints to HDFS (--working_dir hdfs://...).
${JAVA_CMD} -cp /opt/submarine-current/submarine-all-${SUBMARINE_VERSION}-hadoop-${HADOOP_VERSION}.jar:/usr/local/hadoop/etc/hadoop \
  org.apache.submarine.client.cli.Cli job run --name tf-job-001 \
  --framework tensorflow \
  --verbose \
  --input_path "" \
  --num_workers 2 \
  --env JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
  --env JRE_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre \
  --env LD_LIBRARY_PATH=/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server \
  --worker_resources memory=1G,vcores=1 \
  --num_ps 1 \
  --ps_resources memory=1G,vcores=1 \
  --worker_launch_cmd "${WORKER_CMD}" \
  --ps_launch_cmd "myvenv.zip/venv/bin/python mnist_distributed.py --steps 2 --data_dir /tmp/data --working_dir hdfs://localhost:9000/user/yarn" \
  --insecure \
  --conf tony.containers.resources=/home/yarn/submarine/myvenv.zip#archive,/home/yarn/submarine/mnist_distributed.py,/opt/submarine-current/submarine-all-${SUBMARINE_VERSION}-hadoop-${HADOOP_VERSION}.jar

Submarine Tracking

# Point the Submarine tracking client at the metadata MySQL database.
# NOTE(review): credentials are hardcoded in the URI — use env vars or a
# secrets store in real deployments.
submarine.set_tracking_uri("mysql+pymysql://submarine:password@192.168.103.9:3306/submarine")

Log network utilization

import psutil

# Total bytes sent + received since boot, taken from a single snapshot so the
# two counters are consistent with each other (the collapsed original called
# net_io_counters() twice). The metric key keeps the original "bandwith"
# spelling so existing dashboards keep working.
counters = psutil.net_io_counters()
bandwith = counters.bytes_sent + counters.bytes_recv
experiment.log_metric("bandwith", bandwith, step=my_step)

Submarine ML library

  • Built on TensorFlow and PyTorch so that users can easily build ML models

Submarine on Kubernetes

Submarine-site

<!-- Kubernetes access: kube config exported from the target cluster. -->
<property>
  <name>submarine.k8s.kube.config</name>
  <value>/home/submarine/.kube/config</value>
  <description>Kube config for kubernetes, you should get the config from cluster</description>
</property>

<!-- JDBC connection for the Submarine metadata database. -->
<property>
  <name>jdbc.url</name>
  <value>jdbc:mysql://127.0.0.1:3306/submarine?useUnicode=true&amp;characterEncoding=UTF-8&amp;autoReconnect=true&amp;failOverReadOnly=false&amp;zeroDateTimeBehavior=convertToNull&amp;useSSL=false&amp;allowPublicKeyRetrieval=true</value>
</property>
<property>
  <name>jdbc.username</name>
  <value>submarine</value>
</property>
<property>
  <name>jdbc.password</name>
  <value>password</value>
</property>

Install kind

curl -Lo ./kind "https://github.com/kubernetes-sigs/kind/releases/download/v0.7.0/kind-$(uname)-amd64" chmod +x ./kind mv ./kind /some-dir-in-your-PATH/kind

Install helm

# Install Helm from the official apt repository.
curl https://helm.baltorepo.com/organization/signing.asc | sudo apt-key add -
sudo apt-get install apt-transport-https --yes
echo "deb https://baltocdn.com/helm/stable/debian/ all main" \
  | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list
sudo apt-get update
sudo apt-get install helm

Create Cluster

# Option 1: kind cluster with an ingress-ready control plane and host ports
# 80/443 mapped into the node (needed for the ingress controller).
cat <<EOF | kind create cluster --image kindest/node:v1.15.6 --name k8s-submarine --config=-
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
  kubeadmConfigPatches:
  - |
    kind: InitConfiguration
    nodeRegistration:
      kubeletExtraArgs:
        node-labels: "ingress-ready=true"
  extraPortMappings:
  - containerPort: 80
    hostPort: 80
    protocol: TCP
  - containerPort: 443
    hostPort: 443
    protocol: TCP
EOF

# Option 2: plain kind cluster (no ingress port mappings).
kind create cluster --image kindest/node:v1.15.6 --name k8s-submarine

# Option 3: minikube instead of kind.
minikube start --kubernetes-version v1.14.3

# Create the submarine namespace and make it the default for this context.
kubectl create namespace submarine
kubectl config set-context --current --namespace=submarine

Load docker image into kind cluster

# Pre-load the Submarine server and database images into the kind cluster
# so the pods don't need to pull from a registry.
kind load docker-image apache/submarine:server-0.5.0 --name k8s-submarine
kind load docker-image apache/submarine:database-0.5.0 --name k8s-submarine

helm install submarine

# Install the Submarine helm chart, expose the server on localhost:8080,
# and (when done) uninstall the release.
helm install submarine ./helm-charts/submarine
kubectl port-forward svc/submarine-server 8080:8080
helm delete submarine

Create tf-operator

# Register the TFJob CRD, then apply the kustomized operator manifests.
kubectl apply -f ./dev-support/k8s/tfjob/crd.yaml
kubectl kustomize ./dev-support/k8s/tfjob/operator | kubectl apply -f -

Create pytorch-operator

# Apply all pytorch-operator manifests from the dev-support directory.
kubectl apply -f ./dev-support/k8s/pytorchjob/

Submit job

mnist job

curl -X POST -H "Content-Type: application/json" -d ' { "meta": { "name": "tf-mnist-json", "namespace": "default", "framework": "TensorFlow", "cmd": "python /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/log --learning_rate=0.01 --batch_size=150", "envVars": { "ENV_1": "ENV1" } }, "environment": { "image": "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0" }, "spec": { "Ps": { "replicas": 1, "resources": "cpu=1,memory=512M" }, "Worker": { "replicas": 1, "resources": "cpu=1,memory=512M" } } } ' http://127.0.0.1:32080/api/v1/experiment

DeepFM

curl -X POST -H "Content-Type: application/json" -d ' { "meta": { "name": "tf-deepfm", "namespace": "default", "framework": "TensorFlow", "cmd": "python3 run_deepfm.py -conf deepfm.json", "envVars": { "ENV1": "ENV1" } }, "environment": { "image": "docker.io/pingsutw/tf-deepfm:0.5.0" }, "spec": { "Worker": { "replicas": 1, "resources": "cpu=4,memory=4096M" } } } ' http://127.0.0.1:8080/api/v1/experiment

Tracking example

curl -X POST -H "Content-Type: application/json" -d ' { "meta": { "name": "tracking-example", "namespace": "default", "framework": "TensorFlow", "cmd": "python3 /var/example/tracking.py", "envVars": { "ENV_1": "ENV1" } }, "environment": { "image": "pingsutw/tracking-example:0.5.0" }, "spec": { "Ps": { "replicas": 1, "resources": "cpu=1,memory=1024M" }, "Worker": { "replicas": 1, "resources": "cpu=1,memory=1024M" } } } ' http://127.0.0.1:32080/api/v1/experiment

Tensorboard Example

curl -X POST -H "Content-Type: application/json" -d ' { "meta": { "name": "tensorflow-tensorboard-dist-mnist", "namespace": "default", "framework": "TensorFlow", "cmd": "python /var/tf_mnist/mnist_with_summaries.py --log_dir=$(SUBMARINE_TENSORBOARD_LOG_DIR) --learning_rate=0.01 --batch_size=20", "envVars": { "ENV_1": "ENV1" } }, "environment": { "image": "apache/submarine:tf-mnist-with-summaries-1.0" }, "spec": { "Worker": { "replicas": 1, "resources": "cpu=1,memory=512M" } } } ' http://127.0.0.1:32080/api/v1/experiment

Sync Code Example

curl -X POST -H "Content-Type: application/json" -d ' { "meta": { "name": "tf-mnist5", "namespace": "default", "framework": "TensorFlow", "cmd": "ls /code", "envVars": { "ENV_1": "ENV1" } }, "environment": { "image": "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0" }, "spec": { "Ps": { "replicas": 1, "resources": "cpu=1,memory=1024M" }, "Worker": { "replicas": 1, "resources": "cpu=1,memory=1024M" } }, "code": { "syncMode": "git", "url" : "https://github.com/apache/submarine.git" } } ' http://127.0.0.1:8080/api/v1/experiment

Create environment

curl -X POST -H "Content-Type: application/json" -d ' { "name": "my-submarine-env1", "dockerImage" : "apache/submarine:jupyter-notebook-0.5.0-SNAPSHOT", "kernelSpec" : { "name" : "team_default_python_3.7", "channels" : ["defaults"], "dependencies" : [""] } } ' http://127.0.0.1:8080/api/v1/environment

Get experiment metrics

curl -H "Content-Type:application/json" -d ' {"id": "experiment_1596257207157_0002", "workerIndex":"ps-0"} ' http://127.0.0.1:8080/api/metric/selective

set default namespace

# Make "submarine" the default namespace for the current kube context,
# then verify the change took effect.
kubectl config set-context --current --namespace=submarine
kubectl config view --minify | grep namespace:
# expected output: namespace: submarine

manage the submarine job

# Inspect Submarine-launched TFJobs and their pods.
kubectl -n submarine get tfjob
kubectl get pods
kubectl get tfjob mnist -o yaml
kubectl describe tfjob mnist

Clean up

# Tear down: delete the kind cluster and any remaining TFJobs.
kind delete cluster --name k8s-submarine
kubectl delete tfjob mnist
kubectl delete tfjob --all -n default   # original used the shell alias "k" for kubectl

Submarine environment

# Create the conda environment from the spec file, or export the active
# environment back into it (two independent commands, not a pipeline).
conda env create -f environment.yml
conda env export > environment.yml

Submarine web site

  • Install Jekyll on Ubuntu
# Install Ruby/Jekyll on Ubuntu; gems go under ~/gems (user-local install).
sudo apt-get install ruby-full build-essential zlib1g-dev
echo '# Install Ruby Gems to ~/gems' >> ~/.bashrc
echo 'export GEM_HOME="$HOME/gems"' >> ~/.bashrc
echo 'export PATH="$HOME/gems/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc
gem install jekyll bundler
  • clone apache/submarine-site
# Clone the website repo and serve it locally with Jekyll.
# (As originally pasted on one line, the "# docker run ..." comment swallowed
#  the cd and bundle commands — they must be separate lines.)
git clone https://github.com/apache/submarine-site.git
git checkout master
# docker run -it -p 4000:4000 -v $PWD/submarine-site:/submarine-site hadoopsubmarine/submarine-website:1.0.0 bash
cd /submarine-site
bundle exec jekyll serve --watch --host=0.0.0.0
tags: Submarine