# Hit the hits.dwyl.com badge endpoint 1,000,000 times (inflates the repo's
# visit counter). NOTE(review): this is deliberate load against a third-party
# service — confirm it is actually wanted before running.
# Use an arithmetic for-loop: the original {1..1000000} brace expansion
# materializes one million words in memory before the loop even starts.
for ((i = 1; i <= 1000000; i++)); do curl -s "https://hits.dwyl.com/apache/submarine.svg"; done > /dev/null 2>&1
# Build a relocatable Python virtualenv containing TensorFlow and zip it so it
# can be shipped to YARN containers with the job.
sudo apt-get install python3-distutils
# Fetch virtualenv 16.0.0 as a tarball so virtualenv.py can be run directly,
# without pip-installing virtualenv system-wide.
wget https://files.pythonhosted.org/packages/33/bc/fa0b5347139cd9564f0d44ebd2b147ac97c36b2403943dbee8a25fd74012/virtualenv-16.0.0.tar.gz
tar xf virtualenv-16.0.0.tar.gz
python3 virtualenv-16.0.0/virtualenv.py venv
# Activate the venv so the pip install below targets it, not system site-packages.
. venv/bin/activate
pip3 install tensorflow==1.13.1
# Archive the whole venv; myvenv.zip is referenced later by the job submission
# (tony.containers.resources) and by the worker/ps launch commands.
zip -r myvenv.zip venv
deactivate
# Build and install protobuf 3.10.1 from source (protoc compiler + Python
# runtime), then generate the Python gRPC stubs for Submarine's RPC protocol.
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.10.1/protobuf-python-3.10.1.tar.gz
tar -zxvf protobuf-python-3.10.1.tar.gz
cd protobuf-3.10.1
./configure
make
sudo make install
# Refresh the shared-library cache so the freshly installed libprotoc resolves.
sudo ldconfig
# Sanity check — expected output shown on the next line.
protoc --version
# libprotoc 3.10.1
# Generate *_pb2.py / *_pb2_grpc.py into ./submarine/proto from the server .proto.
python -m grpc_tools.protoc -I../../submarine-commons/commons-rpc/src/main/proto/ --python_out=./submarine/proto --grpc_python_out=./submarine/proto ../../submarine-commons/commons-rpc/src/main/proto/SubmarineServerProtocol.proto
# Environment needed for TensorFlow's HDFS support (libhdfs + JNI).
export JAVA_HOME=/path/to/java-version-openjdk
export HADOOP_HDFS_HOME=/path/to/hadoop #normally this is where you extracted the hadoop tar file.
# libjvm.so must be on the loader path for libhdfs to start a JVM.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server
export PATH=$PATH:$HADOOP_HDFS_HOME/bin:$HADOOP_HDFS_HOME/sbin
# --glob expands wildcard classpath entries into concrete jar paths.
export CLASSPATH="$(hadoop classpath --glob)"
# Equivalent environment for a Cloudera (CDH) parcel-based cluster.
# CDH HADOOP_HOME can not be set when store checkpoint in hdfs
export HADOOP_HDFS_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-hdfs
export JRE_HOME=/opt/jdk1.8.0_221/jre
export CLASSPATH="$(hadoop classpath --glob)"
export JAVA_HOME=/opt/jdk1.8.0_221
export PATH=$PATH:/opt/jdk1.8.0_221/bin:/opt/apache-maven-3.6.1/bin
# Note: unlike the generic setup above, this REPLACES LD_LIBRARY_PATH rather
# than appending to it — presumably intentional for CDH; verify if reused.
export LD_LIBRARY_PATH=/opt/cloudera/parcels/CDH/lib64:$JAVA_HOME/jre/lib/amd64/server
# Submit a distributed TensorFlow job through the Submarine CLI (TonY runtime):
# 2 workers + 1 parameter server, each with 1G / 1 vcore. Requires JAVA_CMD,
# SUBMARINE_VERSION, HADOOP_VERSION and WORKER_CMD to be set in the environment.
${JAVA_CMD} -cp /opt/submarine-current/submarine-all-${SUBMARINE_VERSION}-hadoop-${HADOOP_VERSION}.jar:/usr/local/hadoop/etc/hadoop \
org.apache.submarine.client.cli.Cli job run --name tf-job-001 \
--framework tensorflow \
--verbose \
--input_path "" \
--num_workers 2 \
--env JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
--env JRE_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre \
--env LD_LIBRARY_PATH=/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server \
--worker_resources memory=1G,vcores=1 \
--num_ps 1 \
--ps_resources memory=1G,vcores=1 \
--worker_launch_cmd "${WORKER_CMD}" \
--ps_launch_cmd "myvenv.zip/venv/bin/python mnist_distributed.py --steps 2 --data_dir /tmp/data --working_dir hdfs://localhost:9000/user/yarn" \
--insecure \
--conf tony.containers.resources=/home/yarn/submarine/myvenv.zip#archive,/home/yarn/submarine/mnist_distributed.py,/opt/submarine-current/submarine-all-${SUBMARINE_VERSION}-hadoop-${HADOOP_VERSION}.jar
# Point Submarine's tracking client at the metadata MySQL database.
submarine.set_tracking_uri("mysql+pymysql://submarine:password@192.168.103.9:3306/submarine")
import psutil
# Total bytes transferred over all NICs since boot (sent + received).
# Fixed typo: "bandwith" -> "bandwidth" in both the variable and metric name.
bandwidth = psutil.net_io_counters().bytes_sent + psutil.net_io_counters().bytes_recv
experiment.log_metric("bandwidth", bandwidth, step = my_step)
<property>
<name>submarine.k8s.kube.config</name>
<value>/home/submarine/.kube/config</value>
<description>Kube config for kubernetes, you should get the config from cluster</description>
</property>
<property>
<name>jdbc.url</name>
<value>jdbc:mysql://127.0.0.1:3306/submarine?useUnicode=true&amp;characterEncoding=UTF-8&amp;autoReconnect=true&amp;failOverReadOnly=false&amp;zeroDateTimeBehavior=convertToNull&amp;useSSL=false&amp;allowPublicKeyRetrieval=true</value>
</property>
<property>
<name>jdbc.username</name>
<value>submarine</value>
</property>
<property>
<name>jdbc.password</name>
<value>password</value>
</property>
curl -Lo ./kind "https://github.com/kubernetes-sigs/kind/releases/download/v0.7.0/kind-$(uname)-amd64"
chmod +x ./kind
mv ./kind /some-dir-in-your-PATH/kind
curl https://helm.baltorepo.com/organization/signing.asc | sudo apt-key add -
sudo apt-get install apt-transport-https --yes
echo "deb https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list
sudo apt-get update
sudo apt-get install helm
# Create a kind cluster prepared for ingress: the control-plane node is labeled
# ingress-ready and host ports 80/443 are mapped into it, so an ingress
# controller is reachable from the host. The config is fed via stdin (--config=-).
cat <<EOF | kind create cluster --image kindest/node:v1.15.6 --name k8s-submarine --config=-
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
kubeadmConfigPatches:
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "ingress-ready=true"
extraPortMappings:
- containerPort: 80
hostPort: 80
protocol: TCP
- containerPort: 443
hostPort: 443
protocol: TCP
EOF
# or, without the ingress port mappings:
kind create cluster --image kindest/node:v1.15.6 --name k8s-submarine
# or, using minikube instead of kind:
minikube start --kubernetes-version v1.14.3
# Create and switch to the "submarine" namespace.
kubectl create namespace submarine
kubectl config set-context --current --namespace=submarine
# Pre-load the Submarine images into the kind node(s) so pods don't pull
# from a remote registry.
kind load docker-image apache/submarine:server-0.5.0 --name k8s-submarine
kind load docker-image apache/submarine:database-0.5.0 --name k8s-submarine
# Deploy Submarine via its Helm chart.
helm install submarine ./helm-charts/submarine
# Expose the server on localhost:8080 for local testing.
kubectl port-forward svc/submarine-server 8080:8080
# Uninstall the release when done.
helm delete submarine
# Install the TF-operator CRD + controller and the PyTorch operator manifests.
kubectl apply -f ./dev-support/k8s/tfjob/crd.yaml
kubectl kustomize ./dev-support/k8s/tfjob/operator | kubectl apply -f -
kubectl apply -f ./dev-support/k8s/pytorchjob/
curl -X POST -H "Content-Type: application/json" -d '
{
"meta": {
"name": "tf-mnist-json",
"namespace": "default",
"framework": "TensorFlow",
"cmd": "python /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/log --learning_rate=0.01 --batch_size=150",
"envVars": {
"ENV_1": "ENV1"
}
},
"environment": {
"image": "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0"
},
"spec": {
"Ps": {
"replicas": 1,
"resources": "cpu=1,memory=512M"
},
"Worker": {
"replicas": 1,
"resources": "cpu=1,memory=512M"
}
}
}
' http://127.0.0.1:32080/api/v1/experiment
curl -X POST -H "Content-Type: application/json" -d '
{
"meta": {
"name": "tf-deepfm",
"namespace": "default",
"framework": "TensorFlow",
"cmd": "python3 run_deepfm.py -conf deepfm.json",
"envVars": {
"ENV1": "ENV1"
}
},
"environment": {
"image": "docker.io/pingsutw/tf-deepfm:0.5.0"
},
"spec": {
"Worker": {
"replicas": 1,
"resources": "cpu=4,memory=4096M"
}
}
}
' http://127.0.0.1:8080/api/v1/experiment
curl -X POST -H "Content-Type: application/json" -d '
{
"meta": {
"name": "tracking-example",
"namespace": "default",
"framework": "TensorFlow",
"cmd": "python3 /var/example/tracking.py",
"envVars": {
"ENV_1": "ENV1"
}
},
"environment": {
"image": "pingsutw/tracking-example:0.5.0"
},
"spec": {
"Ps": {
"replicas": 1,
"resources": "cpu=1,memory=1024M"
},
"Worker": {
"replicas": 1,
"resources": "cpu=1,memory=1024M"
}
}
}
' http://127.0.0.1:32080/api/v1/experiment
curl -X POST -H "Content-Type: application/json" -d '
{
"meta": {
"name": "tensorflow-tensorboard-dist-mnist",
"namespace": "default",
"framework": "TensorFlow",
"cmd": "python /var/tf_mnist/mnist_with_summaries.py --log_dir=$(SUBMARINE_TENSORBOARD_LOG_DIR) --learning_rate=0.01 --batch_size=20",
"envVars": {
"ENV_1": "ENV1"
}
},
"environment": {
"image": "apache/submarine:tf-mnist-with-summaries-1.0"
},
"spec": {
"Worker": {
"replicas": 1,
"resources": "cpu=1,memory=512M"
}
}
}
' http://127.0.0.1:32080/api/v1/experiment
curl -X POST -H "Content-Type: application/json" -d '
{
"meta": {
"name": "tf-mnist5",
"namespace": "default",
"framework": "TensorFlow",
"cmd": "ls /code",
"envVars": {
"ENV_1": "ENV1"
}
},
"environment": {
"image": "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0"
},
"spec": {
"Ps": {
"replicas": 1,
"resources": "cpu=1,memory=1024M"
},
"Worker": {
"replicas": 1,
"resources": "cpu=1,memory=1024M"
}
},
"code": {
"syncMode": "git",
"url" : "https://github.com/apache/submarine.git"
}
}
' http://127.0.0.1:8080/api/v1/experiment
curl -X POST -H "Content-Type: application/json" -d '
{
"name": "my-submarine-env1",
"dockerImage" : "apache/submarine:jupyter-notebook-0.5.0-SNAPSHOT",
"kernelSpec" : {
"name" : "team_default_python_3.7",
"channels" : ["defaults"],
"dependencies" :
[""]
}
}
' http://127.0.0.1:8080/api/v1/environment
curl -H "Content-Type:application/json" -d '
{"id": "experiment_1596257207157_0002", "workerIndex":"ps-0"}
' http://127.0.0.1:8080/api/metric/selective
# Inspection / teardown commands for TFJobs in the submarine namespace.
kubectl config set-context --current --namespace=submarine
# Confirm the active namespace — expected output on the next line.
kubectl config view --minify | grep namespace:
# namespace: submarine
kubectl -n submarine get tfjob
kubectl get pods
kubectl get tfjob mnist -o yaml
kubectl describe tfjob mnist
# Teardown.
kind delete cluster --name k8s-submarine
kubectl delete tfjob mnist
# Fixed: the original used "k", an alias this document never defines.
kubectl delete tfjob --all -n default
# Recreate a conda environment from a spec file / export the current one.
conda env create -f environment.yml
conda env export > environment.yml
# Build the submarine-site docs with Jekyll, installing RubyGems per-user.
sudo apt-get install ruby-full build-essential zlib1g-dev
echo '# Install Ruby Gems to ~/gems' >> ~/.bashrc
echo 'export GEM_HOME="$HOME/gems"' >> ~/.bashrc
echo 'export PATH="$HOME/gems/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc
gem install jekyll bundler
git clone https://github.com/apache/submarine-site.git
# Fixed: enter the checkout first — the original ran "git checkout master"
# in whatever directory the clone was issued from, not in the new repo.
cd submarine-site
git checkout master
# docker run -it -p 4000:4000 -v $PWD/submarine-site:/submarine-site hadoopsubmarine/submarine-website:1.0.0 bash
# The two lines below are meant to run INSIDE the container started above
# (/submarine-site is the bind-mounted repo) — presumably; confirm workflow.
cd /submarine-site
bundle exec jekyll serve --watch --host=0.0.0.0
Submarine
SkyPilot is a framework for running LLMs, AI, and batch jobs on any cloud, offering maximum cost savings, highest GPU availability, and managed execution.
Apr 17, 2024Motivation Currently it is hard to implement backend plugins, especially for data-scientists & MLE’s who do not have working knowledge of Golang. Also, performance requirements, maintenance and development is cumbersome. The document here proposes a path to make it possible to write plugins rapidly, while decoupling them from the core flytepropeller engine. Goals Plugins should be easy to author - no need of code generation, using tools that MLEs and Data Scientists are not accustomed to using. Most important plugins for Flyte today are plugins that communicate with external services. It should be possible to test these plugins independently and also deploy them privately. It should be possible for users to use backend plugins for local development, especially in flytekit and unionML
Dec 28, 2023PR: https://github.com/flyteorg/flytekit/pull/1782
Sep 8, 2023Issues Discussion Motivation: Why do you think this is important? Currently flyteadmin notifications are delivered using the PagerDuty, Github and Slack email APIs. On AWS deployments FlyteAdmin uses SES to trigger emails, for all others the only alternative email implementation is SendGrid integration. Setting up SES or SendGrid can be somewhat complicated. Furthermore, asking your Flyte users to configure the aforementioned services with email integrations adds even more overhead. It would be simpler as an alternative to provide webhook integration for notification so that users only have to configure existing API keys for PagerDuty/Github/Slack. Flyte currently only allows sending notifications by email and requires users to explicitly define notification rules in their launchplans. FlyteAdmin Webhook
Jun 5, 2023or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up