Model-Manager - HackMD

# MODEL MANAGER FLOW

**Step 1:** Training Event

Topic:

Training Event object:

```
ModelTrainingEvent {
    ModelDesignSchema design_schema = 1;
    string dataset_artifact_url = 2;
}
```

**Step 2:** Fetch the ModelDesignSchema from Redis by passing 'modelKeyToken'. Deserialize and unpack the ModelDesignSchema.

Redis Config:

- IP - 10.168.134.54:6379
- No auth

**Step 3:** Read the dataset from the shared Kubernetes (K8s) PVC path 'datasetPath'. 'datasetPath' will already be mounted.

**Step 4:** Initiate Training Process

- Create Docker Image of model training
- Create Kubernetes Job

**Step 5:** Save training result:

```
1. Send status of training to the Kafka queue
   Queue Details:
       Topic -
       Message - {
           string design_schema_id;
           ModelTrainingStatusEnum status; // Success/Failure
           repeated string errors;
       }
```

```
2. Send Training output to the Model Store
   Model Store Details:
       Artifact: retail_forecasting_model.pkl
       Output:
```

```
3. Send Training output to the Inference Store
   Blob storage (MinIO) is used to store the training result, and its path will be pushed to the inference queue.
   Inference Store Queue Details:
       Topic -
       Message - {
           string model_id;
           string dataset_artifact_url;
       }
```

# **Model Design Schema Proto:**

**ModelDesignSchema**

```
message ModelDesignSchema {
  core.base.Token design_token = 1;
  core.base.Token producer_token = 2;
  algorithms.ModelAlgorithms algorithm = 3;
  ModelDesignVersion version = 4;
  ModelDesignMetadata metadata = 5;
  ModelDesignInputSchema input = 6;
  ModelDesignOutputSchema output = 7;
  algorithms.ModelHyperParameter hyper_parameter = 8;
  ModelDesignPreprocessorSchema preprocessor = 9;
  ModelDesignPostprocessorSchema postprocessor = 10;
  ModelDesignCriteria criteria = 11;
  instance.ModelAssembly model_framework = 12;
  core.base.Token dataset_token = 13;
  algorithms.ModelLossCriteria loss_criteria = 14;
  algorithms.ModelOptimizer learning_function = 15;
  algorithms.ModelEvaluationCriteria eval_criteria = 16;
  ModelTrainingOutputStore output_store = 17;
}
```

**ModelAlgorithms**

```
message ModelAlgorithms {
  core.base.Token algorithm_token = 1;
  oneof cortex_model_algorithm_is {
    ClassificationModels classification = 10;
    // RegressionModels regression = 20;
    // ClusteringModels clustering = 30;
    // DensityModels density = 40;
  }
}

message ClassificationModels {
  oneof classification_model_type_is {
    BinaryClassificationModels binary = 1;
    // MulticlassClassificationModels multiclass = 2;
    // MultilabelClassificationModels multilabel = 3;
    // ExtremeClassificationModels extreme = 4;
  }
}

message BinaryClassificationModels {
  oneof binary_classification_model_sub_type_is {
    LogisticRegression logistic_regression = 1;
    //ArtificialNeuralNetworkClassifier neural_network = 2;
    //SupportVectorMachineClassifier svm = 2;
    //LinearDiscriminantAnalysis lda = 3;
    //QuadraticDiscriminantAnalysis qda = 4;
    //DecisionTreeClassifier decision_tree = 6;
    //RandomForestClassifier random_forest = 7;
    //GbdtClassifier gbdt = 8;
    //XgboostClassifier xgboost = 9;
  }
}

message LogisticRegression {
  ModelHyperParameter hyper_parameters = 1;
  model.instance.LogisticRegressionParameter parameters = 2;
  // repeated string attributes = 3;
}

message LogisticRegressionParameter {
  repeated string classes_ = 1;
  repeated double coef_ = 2; // feature weights e.g., m1, m2, m3, m4, etc.
  repeated double intercept_ = 3; // y intercept (m0)
  int64 n_features_in_ = 4;
  // repeated int64 feature_names_in_ = 5;
  repeated int32 n_iter_ = 5;
}
```

**ModelDesignInputSchema**

```
message ModelDesignInputSchema {
  common.ModelInputType input_type = 1;
  // instance.ModelFramework model_framework = 3;
  ModelDatasetFramework dataset_framework = 2;
  oneof input_schema_is_one_of {
    ModelDesignMultivariateInputSchema multivariate = 10;
    ModelDesignStreamInputSchema time_series = 11;
    ModelDesignImageInputSchema image = 12;
  }
}

enum ModelInputType {
  MODEL_INPUT_UNSPECIFIED = 0;
  MODEL_INPUT_MULTIVARIATE = 2100;
  MODEL_INPUT_BASKET = 2200;
  MODEL_INPUT_BAG = 2300;
  MODEL_INPUT_SERIES = 2400;
  MODEL_INPUT_SEQUENCE = 2500;
  MODEL_INPUT_TEXT = 2600;
  MODEL_INPUT_IMAGE = 2700;
  MODEL_INPUT_VIDEO = 2800;
  MODEL_INPUT_AUDIO = 2900;
}

message ModelDatasetFramework {
  ModelDatasetFrameworkType dataset_framework_type = 1; // "TORCHVISION"
  string framework_dataset_name = 2; // "CIFAR-10".
  string dataset_version = 3;
}

enum ModelDatasetFrameworkType {
  DATASET_FRAMEWORK_GENERIC_DEFAULT = 0;
  DATASET_FRAMEWORK_TORCHVISION = 1;
  DATASET_FRAMEWORK_KERAS = 2;
  DATASET_FRAMEWORK_HUGGINGFACE = 3;
}

message ModelDesignMultivariateInputSchema {
  // /healthcare/hrp/feature/bp or attribute token or model output/inference
  // /telecom/inference/customer/churn
  // /telecom/feature/customer/engagement
  // /telecom/knowledge/customer/gender
  // repeated telos.core.base.Token variable = 1; // same as instance input schema
  core.type.StrVector design_input = 2; // Goes to Model (Output of Preprocessor)
}

message StrVector{
  repeated string element = 1;
}
```

**ModelDesignOutputSchema**

```
message ModelDesignOutputSchema {
  common.ModelOutputType design_output_type = 1;
  oneof output_is_one_of {
    ModelDesignClassificationOutputSchema classification_schema = 10;
    ModelDesignRegressionOutputSchema regression_schema = 11;
    ModelDesignProjectionOutputSchema projection_schema = 12;
    ModelDesignClusteringOutputSchema clustering_schema = 13;
    ModelDesignDensityOutputSchema density_schema = 14;
    ModelDesignRecommendationOutputSchema recommendation_schema = 15;
    ModelDesignForecastingOutputSchema forecasting_schema = 16;
    ModelDesignAnomalyOutputSchema anomaly_schema = 17;
    ModelDesignOptimizationOutputSchema optimization_schema = 18;
    ModelDesignDetectionOutputSchema detection_schema = 19;
    ModelDesignAnnotationOutputSchema annotation_schema = 20;
  }
}

enum ModelOutputType {
  OUTPUT_TYPE_UNSPECIFIED = 0;
  OUTPUT_TYPE_CLASSIFICATION = 2;
  OUTPUT_TYPE_REGRESSION = 3;
  OUTPUT_TYPE_PROJECTION = 4;
  OUTPUT_TYPE_CLUSTERING = 5;
  OUTPUT_TYPE_DENSITY = 6;
  OUTPUT_TYPE_RECOMMENDATION = 7;
  OUTPUT_TYPE_FORECASTING = 8;
  OUTPUT_TYPE_ANOMALY = 9;
  OUTPUT_TYPE_OPTIMIZATION = 10;
  OUTPUT_TYPE_DETECTION = 11;
  OUTPUT_TYPE_ANNOTATION = 12;
}

message ModelDesignClassificationOutputSchema {
  oneof classification_output_schema_is_one_of {
    ModelDesignBinaryClassificationOutputSchema binary = 1;
    ModelDesignMultiClassClassificationOutputSchema multi_class = 10;
    //ModelMultiLabelClassificationOutputSchema multi_label = 11;
    //ModelExtremeClassificationOutputSchema extreme = 12;
  }
}

message ModelDesignBinaryClassificationOutputSchema {
  string positive = 1;
  string negative = 2;
  telos.core.base.Token target_variable = 3;
}

message ModelDesignMultiClassClassificationOutputSchema {
  telos.core.type.Collection class_dictionary = 1;
}

message ModelDesignRegressionOutputSchema {
  //core.quantity.QuantitySchema target_variable = 1; // atomic numeric
  string label_schema = 1;
  string inference_schema = 2;
}
```

**ModelHyperParameter**

```
message ModelHyperParameter {
  core.base.Token design_token = 1; // ModelDesignToken
  core.base.Token algorithm_token = 2;
  map<string, HyperParamater> hyper_parameter = 3; // TODO: can be changed to Quantity?
}

message HyperParamater {
  oneof hyperparameter_is_oneof {
    string str = 1;
    bool bl = 2;
    float flt = 3;
    int32 i32 = 4;
    core.type.F32Vector fvector = 5;
    core.type.StrVector svector = 6;
  }
}
```

**ModelDesignPreprocessorSchema**

```
message ModelDesignPreprocessorSchema {
  oneof preprocessor_is_one_of {
    ModelDesignMultivariatePreprocessorSchema multivariate = 1;
    // CortexImagePreprocessor image = 2;
    // CortexVideoPreprocessor video = 3;
    // CortexSeriesPreprocessor series = 4;
    // CortexSequencePreprocessor sequence = 5;
    // CortexBagPreprocessor bag = 6;
  }
}

message ModelDesignMultivariatePreprocessorSchema {
  //telos.core.expression.ExpressionProgram program = 1;
  repeated PreprocessorExpression preprocess = 1;
}

message PreprocessorExpression {
  string expression = 1;
  string output = 2;
  map<string, core.type.Primitive> constants = 3; // tbd: right place or not ??
}

message Primitive {
  oneof primitive_is_one_of {
    float float_data = 1;
    double double_data = 2;
    uint32 uint32_data = 3;
    sint32 sint32_data = 4;
    uint64 uint64_data = 5;
    sint64 sint64_data = 6;
    string string_data = 7;
    bool boolean_data = 8;
    Vector list_data = 9;
    //NumberVector number_list_data = 10;
    //StrStrMap mapdata = 10;
  }
}
```

**ModelDesignPostprocessorSchema**

```
message ModelDesignPostprocessorSchema {
  oneof postprocessor_schema_is {
    ModelDesignClassificationProbabilityPostprocessorSchema probability_postprocessor_schema = 1;
  }
}

message ModelDesignClassificationProbabilityPostprocessorSchema {
  telos.core.expression.ExpressionProgram program = 1;
}

message ExpressionProgram {
  ExpressionMap program = 1;
  ExpressionProgramInitialization initialization = 2;
  ExpressionProgramDependency dependency = 3;
  // gets computed at the time of validation/initialization
  repeated string topological_order = 4;
}
```

**ModelAssembly**

```
message ModelAssembly {
  ModelFramework model_framework = 1; //variant name(MODEL_FRAMEWORK_SKLEARN) & variantVersion(0.24.2)
  ModelDependencies dependencies = 2;
  ModelCompute compute_dependencies = 3;
  ModelDeploymentRPC deployment_rpc = 4; // can be removed - this was added for v1
}

message ModelFramework {
  ModelFrameworkType framework_type = 1; // "scikit-learn"
  string framework_version = 2; //0.24.2
  string framework_algorithm_name = 3; // algorithm name within framework.
}

enum ModelFrameworkType {
  MODEL_FRAMEWORK_SKLEARN_DEFAULT = 0;
  MODEL_FRAMEWORK_SKLEARN = 1;
  MODEL_FRAMEWORK_TENSORFLOW = 2;
  MODEL_FRAMEWORK_PYTORCH = 3;
  MODEL_FRAMEWORK_MXNET = 4;
  MODEL_FRAMEWORK_CAFFE = 5;
  MODEL_FRAMEWORK_TRITON_SERVER = 6;
  MODEL_FRAMEWORK_LIGHTGBM = 7;
  MODEL_FRAMEWORK_CUSTOM = 8;
  MODEL_FRAMEWORK_XGBOOST = 9;
  MODEL_FRAMEWORK_CORNAC = 10;
}

message ModelDependencies {
  string name = 1;
  string version = 2;
}

message ModelCompute {
  oneof compute_type {
    ModelCPUCompute cpu = 1;
    ModelGPUCompute gpu = 2;
  }
}

message ModelCPUCompute {
  CpuType cpu_type = 1;
  int32 no_of_cores = 2;
  float ram = 3;
}

enum GpuType {
  GPU_TYPE_NVIDIA_DEFAULT = 0;
  GPU_TYPE_NVIDIA = 1;
  GPU_TYPE_AMD = 2;
}

enum CpuType {
  CPU_TYPE_INTEL_DEFAULT = 0;
  CPU_TYPE_INTEL = 1;
  CPU_TYPE_AMD = 2;
  CPU_TYPE_M1 = 3;
  CPU_TYPE_M2 = 4;
}

message ModelGPUCompute {
  GpuType gpu_type = 1;
}

//TODO : This change is just for a v1 for cortex e2e. Going forward use ModelEndpoint
message ModelDeploymentRPC{
  string url = 1;
}
```

**ModelTrainingOutputStore**

```
message ModelTrainingOutputStore {
  oneof options {
    bool model_store = 1;
    bool inference_store = 2;
  }
}
```