diff --git a/.gitignore b/.gitignore
index d31c47a..9353809 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,21 @@
 values.yaml
 .idea/
+
+__pycache__/
+*.pyc
+*.log
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
\ No newline at end of file
diff --git a/agents/MANIFEST.in b/agents/MANIFEST.in
index facb665..55d440a 100644
--- a/agents/MANIFEST.in
+++ b/agents/MANIFEST.in
@@ -2,3 +2,4 @@ include README.md
 include LICENSE
 recursive-include mlsysops/templates *.j2
 recursive-include mlsysops/policies *.py
+recursive-include mlsysops/crds *.yaml
diff --git a/agents/Makefile b/agents/Makefile
index 546241b..2dd5e44 100644
--- a/agents/Makefile
+++ b/agents/Makefile
@@ -1,28 +1,33 @@
 # Variables
-PLATFORMS := linux/amd64#,linux/arm64/v8
+PLATFORMS := linux/amd64,linux/arm64/v8
 DOCKER_BUILDX=docker buildx build
-
+#REGISTRY=registry.mlsysops.eu/aug-private/aug-uc
+REGISTRY ?= harbor.nbfc.io/mlsysops
 # Default tags (can be overridden from CLI)
-NODE_AGENT_TAG ?= registry.mlsysops.eu/agent/agents/node
-CLUSTER_AGENT_TAG ?= registry.mlsysops.eu/agent/agents/cluster
-CONTINUUM_AGENT_TAG ?= registry.mlsysops.eu/agent/agents/continuum
-TEST_APP_TAG ?= registry.mlsysops.eu/agent/agents/test_app
+NODE_AGENT_TAG ?= $(REGISTRY)/node-agent
+CLUSTER_AGENT_TAG ?= $(REGISTRY)/cluster-agent
+CONTINUUM_AGENT_TAG ?= $(REGISTRY)/continuum-agent
+FITA_AGENT_TAG ?= $(REGISTRY)/fita-agent
+TEST_APP_TAG ?= $(REGISTRY)/test_app
 CI_COMMIT_TAG ?= 0.0.0
 
 module:
 	python3 -m build
 
 node_agent:
-	$(DOCKER_BUILDX) --platform ${PLATFORMS} --no-cache -t $(NODE_AGENT_TAG):$(CI_COMMIT_TAG) --push -f node/Dockerfile node
+	$(DOCKER_BUILDX) --platform ${PLATFORMS} -t $(NODE_AGENT_TAG):$(CI_COMMIT_TAG) --push -f node/Dockerfile .
 
 cluster_agent:
-	$(DOCKER_BUILDX) --platform ${PLATFORMS} --no-cache -t $(CLUSTER_AGENT_TAG):$(CI_COMMIT_TAG) --push -f cluster/Dockerfile cluster
+	$(DOCKER_BUILDX) --platform ${PLATFORMS} -t $(CLUSTER_AGENT_TAG):$(CI_COMMIT_TAG) --push -f cluster/Dockerfile .
 
 continuum_agent:
-	$(DOCKER_BUILDX) --platform ${PLATFORMS} --no-cache -t $(CONTINUUM_AGENT_TAG):$(CI_COMMIT_TAG) --push -f continuum/Dockerfile continuum
+	$(DOCKER_BUILDX) --platform ${PLATFORMS} -t $(CONTINUUM_AGENT_TAG):$(CI_COMMIT_TAG) --push -f continuum/Dockerfile .
+
+fita_agent:
+	$(DOCKER_BUILDX) --platform ${PLATFORMS} -t $(FITA_AGENT_TAG):$(CI_COMMIT_TAG) --push -f fita/Dockerfile .
 
 test_application:
 	$(DOCKER_BUILDX) --platform ${PLATFORMS} --no-cache -t $(TEST_APP_TAG):$(CI_COMMIT_TAG) -f tests/application/Dockerfile --push tests/application
 
 # Build all targets
-all: node_agent cluster_agent continuum_agent
\ No newline at end of file
+all: node_agent cluster_agent continuum_agent fita_agent
\ No newline at end of file
diff --git a/agents/cluster/.dockerignore b/agents/cluster/.dockerignore
new file mode 100644
index 0000000..f1b149f
--- /dev/null
+++ b/agents/cluster/.dockerignore
@@ -0,0 +1,3 @@
+.env
+*.log
+*.csv
\ No newline at end of file
diff --git a/agents/cluster/.env.augmenta b/agents/cluster/.env.augmenta
new file mode 100644
index 0000000..259793b
--- /dev/null
+++ b/agents/cluster/.env.augmenta
@@ -0,0 +1,13 @@
+NODE_NAME=mlsysops-2
+CLUSTER_NAME=mlsysops-2
+EJABBERD_DOMAIN=xmpp.aug.mlsysops.eu
+NODE_PASSWORD=1234
+MLSYSOPS_INSTALL_PATH=.
+CONFIG_PATH=config.yaml
+DESCRIPTION_PATH=descriptions
+REDIS_HOST=100.64.0.4
+REDIS_PORT=6379
+MLS_POLICY_DIRECTORY=policies
+TELEMETRY_ENDPOINT=100.64.0.1:43170
+LOCAL_OTEL_ENDPOINT=http://100.64.0.1:9999/metrics
+MLS_CORE_POLICIES_ENABLED=False
\ No newline at end of file
diff --git a/agents/cluster/.env.uth b/agents/cluster/.env.uth
new file mode 100644
index 0000000..4f3be8b
--- /dev/null
+++ b/agents/cluster/.env.uth
@@ -0,0 +1,10 @@
+NODE_NAME=mls-test-manage
+CLUSTER_NAME=mls-test-manage
+EJABBERD_DOMAIN=10.64.83.239
+NODE_PASSWORD=1234
+MLSYSOPS_INSTALL_PATH=.
+CONFIG_PATH=config.yaml
+DESCRIPTION_PATH=descriptions
+REDIS_HOST=10.64.83.239
+REDIS_PORT=6379
+MLS_POLICY_DIRECTORY=policies
\ No newline at end of file
diff --git a/agents/cluster/Dockerfile b/agents/cluster/Dockerfile
index 84cdaa4..b240e30 100644
--- a/agents/cluster/Dockerfile
+++ b/agents/cluster/Dockerfile
@@ -5,15 +5,20 @@ FROM harbor.nbfc.io/proxy_cache/library/python:3.10-slim
 WORKDIR /workdir
 
 # Copy all application files into the image
-COPY . /workdir
+COPY ./cluster /workdir/cluster
+COPY ./mlsysops /workdir/mlsysops
 
 # Install dependencies from requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install --no-cache-dir -r fluidity/requirements.txt
+RUN pip install --no-cache-dir -r /workdir/mlsysops/requirements.txt
+RUN pip install --no-cache-dir -r /workdir/cluster/requirements.txt
+RUN pip install --no-cache-dir -r /workdir/cluster/fluidity/requirements.txt
 
-# Export PYTHONPATH for the working directory
-ENV PYTHONPATH=/workdir
+# Set up a working directory
+WORKDIR /workdir/cluster
+
+# Export PYTHONPATH for the working directory
+ENV PYTHONPATH=/workdir:/workdir/cluster:/workdir/cluster/fluidity
 
 # Default command to start the application
 CMD ["python3", "main.py"]
\ No newline at end of file
diff --git a/agents/cluster/MLSClusterAgent.py b/agents/cluster/MLSClusterAgent.py
index 0ca9e8d..68c83df 100644
--- a/agents/cluster/MLSClusterAgent.py
+++ b/agents/cluster/MLSClusterAgent.py
@@ -125,16 +125,17 @@ async def message_queue_listener(self):
                         self.telemetry_controller.remote_remove_node_exporter_pod(data['node'])
                     case mlsysops.events.MessageEvents.NODE_STATE_SYNC.value:
                         # logger.debug(f"Going to send {self.nodes_state[data['node']]} to node {data['node']}")
-                        await self.send_message_to_node(
-                            data['node'],
-                            MessageEvents.NODE_STATE_SYNC.value,
-                            self.nodes_state[data['node']])
+                        if data['node'] in self.nodes_state.keys():
+                            await self.send_message_to_node(
+                                data['node'],
+                                MessageEvents.NODE_STATE_SYNC.value,
+                                self.nodes_state[data['node']])
                     case _:
                         logger.error(f"Unhandled event type: {event}")
             except Exception as e:
                 logger.error(f"Error processing message in message_queue_listener: {e}")
-        logger.debug("Started Message Queue Listener...")
+                logger.error(traceback.format_exc())
 
     async def fluidity_message_listener(self):
         """
diff --git a/agents/cluster/aug-kubeconfig.yaml b/agents/cluster/aug-kubeconfig.yaml
new file mode 100644
index 0000000..5deea3c
--- /dev/null
+++ b/agents/cluster/aug-kubeconfig.yaml
@@ -0,0 +1,43 @@
+apiVersion: v1
+clusters:
+- cluster:
+    certificate-authority-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUVtVENDQXdHZ0F3SUJBZ0lVU3Z3T0NaTHdyYnRZeUdiYjFWbmRlbE9jNkVjd0RRWUpLb1pJaHZjTkFRRUwKQlFBd1hERUxNQWtHQTFVRUJoTUNlSGd4Q2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVgpCQW9NQVhneENqQUlCZ05WQkFzTUFYZ3hDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0Ck1CNFhEVEkxTVRBeU5EQTNOVGN4TjFvWERUSTJNVEF5TkRBM05UY3hOMW93WERFTE1Ba0dBMVVFQmhNQ2VIZ3gKQ2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVkJBb01BWGd4Q2pBSUJnTlZCQXNNQVhneApDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0TUlJQm9qQU5CZ2txaGtpRzl3MEJBUUVGCkFBT0NBWThBTUlJQmlnS0NBWUVBd1JZODlMeVI0YWd6em95bUZFRHdNb1VmMktnbC9aSjlyak1EbFlGWnY4QVkKQ003QjJ6L0JvNUc0aWdxVzJLbkpUTUJlRFpDVXBZTEliWG1ub2gxODQ1ek1BOHcrK0IwYWRHaVNpK3o2dkxBdAptd3FzK1l3bHE3cFB5c2NIdENPUWxOUG9hblRxQWJBNmxYSWxvQXRzTEF5SXgxalg2K1h5WTh2WDk2MXM1UnNyCllRMjRDSnl5aHN2d29EU0JIdTVtTzZKVTJOTFhsNGJZTE1vbjAvemZnS3FiN3NxMnZkeFlYclhEZ3lpRlg1ZGIKcE9IVlVKbk0xMUNDVDhSNVJjVlQyczE1NkIySXdJMHFkTFBwQm02aWY3UWlBdmdBNWQ0RHNnVlRMMzJVVXFuOAprTTNLVkhoSU4rM0ovcnVCZktHTXZQWnphM3NSa2pkeHhVV3d2THdNYXFPcDZLbXM3NURIcmJuajM5bVZvcXRZCnB3WC96YUpoU2FUS2JIOUhLOUhJMjM3ZFNCVUJkUndFM1ZKUDRCSHJyVjBZUE5BTC9ldCtZV3RCOVVYKzNGM1UKdW5CU0pqbkNOU1lieStaRmFrMzJib2lac0xlTVMxSGNadmxNVGhQVWZOQmUzNmQxazlGMHRJSlN4NkE4QmlsegpYK0dHM1grOHdyZkIxMUQyTWYyL0FnTUJBQUdqVXpCUk1CMEdBMVVkRGdRV0JCU3FOUFp5V0dKQjRmVEtxRlplCmNVVzMzZmptU1RBZkJnTlZIU01FR0RBV2dCU3FOUFp5V0dKQjRmVEtxRlplY1VXMzNmam1TVEFQQmdOVkhSTUIKQWY4RUJUQURBUUgvTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCZ1FDNGhhSlkvSld5bnRpZGJLNXpJS2VlUzVrOQp6ZkNCNXJxRkJSOTkzTmdUNm1LT3ljRFFSYVJqeHR0aGJ6eWZUeXU2SEJ5M1BIaUFiWjlNU1E0R09IYmlKa1ZQCkNkbTBzVXVLOUJXNjdtWGdSRXJFdEF2VVlSSkpQSXZFNWFzN2tWY1d5YWNHSFpveTNRdXRLZGtOS1JrQkxBZ0QKTERvR1gvazBwREtIVTVmcTc4aEFCbjNYQTN0VVVpbEtUeDFtZlR6eFEwcVlUK0pmOFBlQ3J4V3VUYkJFSHlHRAp4VTkxSkdmcXNOcy90SEtXS1JiRS9LNldnRmRHWU1lWUljWFRGL25xZnlDYmUxOEVEekpGdzQyeW1obDFiZTFOCnpRMU9sRzRJRmNINWdSeFZQMkxrUDBpdldZVGZuOUE3ZFBFU242UlFPYjZUZFZvNXdhRG9nSWRyVEg3dmNLMXUKQlYrN0tzdjdqcGsycFNhenI0V3NpQlgrYkFxYUFZcjNoQlBaRVUzNjBDZ0NnU2Zoc01xMHY1VDhyaEtONGFJMQpxUzJzV2piN09JRVd0M3g0YzA2dEtUTVFkSExqRTJnbVg3cEt5TGhUdWtkQnpld0pCdGRtOUlIUVYxQ25hdk03CktnVkJqZDE4NHg5bUw3UGdvaXJWcU1zN3R4bXJNZEpIU0FyYmNsQT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + server: https://100.64.0.4:32644 + name: karmada-apiserver +- cluster: + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJlRENDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdGMyVnkKZG1WeUxXTmhRREUzTmpFeU9USTFOVEl3SGhjTk1qVXhNREkwTURjMU5UVXlXaGNOTXpVeE1ESXlNRGMxTlRVeQpXakFqTVNFd0h3WURWUVFEREJock0zTXRjMlZ5ZG1WeUxXTmhRREUzTmpFeU9USTFOVEl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFSQzBqaDZseDVlZmpoNjljOTNHb1MxakNNQ1grSTFwQXh0NzBtOHVuTWgKdGZtNmF1dWI1VHpBSXU0SGJoZi9wa083ck1FSGVNbXZUT3RqMHNnbVpjY2xvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVXJNMzluMnkzakx3KzZra0FrZ21kCnl0ZGl5TzR3Q2dZSUtvWkl6ajBFQXdJRFNRQXdSZ0loQVBpRFlldmdRY1EyVENvQ0dITy9qWGRCNGtXT1hucFkKdGxnazlVRkFWQ1RkQWlFQXdQcUxhWk9LeDVtby9KM3dCZnhFQWErOHVjUnVydjNBQUF2UVlHVFA0NjA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + server: https://100.64.0.4:6443 + name: karmada-host +- cluster: + certificate-authority-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkakNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdGMyVnkKZG1WeUxXTmhRREUzTmpFeU1Ua3pPVEl3SGhjTk1qVXhNREl6TVRFek5qTXlXaGNOTXpVeE1ESXhNVEV6TmpNeQpXakFqTVNFd0h3WURWUVFEREJock0zTXRjMlZ5ZG1WeUxXTmhRREUzTmpFeU1Ua3pPVEl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFSU1JvK2p0cHgrRzc3SUdhYW9hcjY1b2ZWOTBVdG85RUhxWmQyZk5NdjIKOGd6anErWDBqR1M2SVpSa3E1eTVoSHZheVBsSWNmbWFCS2RMZDJLdmQyMVZvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVTM4R1dKaWhibUdGYkM0YnZjY3JiCktHRTE0aTR3Q2dZSUtvWkl6ajBFQXdJRFJ3QXdSQUlnT3gxeVhRSUVsbHRJRkEyRlEyVmVUcVhjQVIzMUpKRWoKRk9XUVpnbVlkOHNDSUgwcWpnZjRSRW1hWWVaNThRUHN6Tm9TODFtRzQzVVJobGV0STdyK3FUM1IKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + server: https://100.64.0.1:6443 + name: mlsysops-2 +contexts: +- context: + cluster: karmada-apiserver + user: karmada-apiserver + name: karmada-apiserver +- context: + cluster: karmada-host + user: karmada-host + name: karmada-host +- context: + cluster: mlsysops-2 + user: mlsysops-2 + name: mlsysops-2 +current-context: mlsysops-2 +kind: Config +preferences: {} +users: +- name: karmada-apiserver + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZQVENDQTZXZ0F3SUJBZ0lVT1ZwUkhkeTdvNHJINkNYQ2c1aytDN0M2VkJVd0RRWUpLb1pJaHZjTkFRRU0KQlFBd1hERUxNQWtHQTFVRUJoTUNlSGd4Q2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVgpCQW9NQVhneENqQUlCZ05WQkFzTUFYZ3hDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0Ck1CNFhEVEkxTVRBeU5EQTNOVEl3TUZvWERUTXdNVEF5TXpBM05USXdNRm93TURFWE1CVUdBMVVFQ2hNT2MzbHoKZEdWdE9tMWhjM1JsY25NeEZUQVRCZ05WQkFNVERITjVjM1JsYlRwaFpHMXBiakNDQWFJd0RRWUpLb1pJaHZjTgpBUUVCQlFBRGdnR1BBRENDQVlvQ2dnR0JBTkxYa1BDODVsemU4ZTJtdU9HMVNCWUhXdkFYbWE0Y1V4S1loN0FqCnFHYnFlUlEwQURWYVBsRUtNK3NzRm9QeUZmeGxEQ3BHUndnbFR1LzVCRU5OazFJTGhQV3VHb1FFclhSQkhZdmsKQVdOT2lRRFBQRWRQemNUdHlYUGYrME43Z3owcDVXbTFZMnIvQVRFb295dEdpbzVNVEhXbDFEUTMzQVdROExmUgpoVmR0RHZDaTBIMTduSDNWdXAzZzZ2RTh2c3kxNUFhaEtrMFFYVmpNK1VHTDRmMDlXTkRxNGdOODNyeHVRWjlFClFPUkg3NlhvTFBZUVNNT09pSHVIUkwzcnBERTd3eWxEdndkZHNLdDdzVGh3VnpZRXRMOU9Ndm5ieUp4NWUrd2gKdnRaK1ZrTk42QnRRNWtHRUc3dk9lUFRTdDQwUFk5YlVKcDdWSXJrZGowbEhVTnZpM0ZrcGQ5aDBZVW54ZXBsagpTeDRQbVF5aHlFVHRCL0JQTy9HS0Mzb3NlVGt6RTREbVQ5R3VzbmZ5ekMyWjQzUjQxODd4eVdRWjVNMGpvSHN0CnZjNmR3UGtXTS9uWHBKYjZQSTM2aDliS3RSbFpjdmsxZFEwQ1RsRVVSUDNyTjAvc1ppUnA5L3dNSTdCTnRyNVcKd2FnNk1heXdhejhzYVh4aXM0MTRvZTJoNFFJREFRQUJvNElCSVRDQ0FSMHdEZ1lEVlIwUEFRSC9CQVFEQWdXZwpNQjBHQTFVZEpRUVdNQlFHQ0NzR0FRVUZCd01DQmdnckJnRUZCUWNEQVRBTUJnTlZIUk1CQWY4RUFqQUFNQjBHCkExVWREZ1FXQkJURGhwdVdiV1gxbHBRZElMdDFQNGM0TXJRM1R6QWZCZ05WSFNNRUdEQVdnQlNxTlBaeVdHSkIKNGZUS3FGWmVjVVczM2ZqbVNUQ0JuUVlEVlIwUkJJR1ZNSUdTZ2hacmRXSmxjbTVsZEdWekxtUmxabUYxYkhRdQpjM1pqZ2dsc2IyTmhiR2h2YzNTQ0p5b3VaWFJqWkM1cllYSnRZV1JoTFhONWMzUmxiUzV6ZG1NdVkyeDFjM1JsCmNpNXNiMk5oYklJaUtpNXJZWEp0WVdSaExYTjVjM1JsYlM1emRtTXVZMngxYzNSbGNpNXNiMk5oYklJVUtpNXIKWVhKdFlXUmhMWE41YzNSbGJTNXpkbU9IQkg4QUFBR0hCR1JBQUFRd0RRWUpLb1pJaHZjTkFRRU1CUUFEZ2dHQgpBQ0FMK08vdkJGNWhBMGF6eFFrYVI3QTZvVFRkMDZRWGVEaWxxQUV5WHFHek5VeDRXTnBVcG9VZ0NKcGFKVDVFCmgxenNzbWcvMmJFbXNidy8wdmhpTm9LcHNoSDR4aVU4ejYrQWw5WjZManBQRVZrNy9IZUhFTW5kdStKekZMSkMKZ0ZhL2p5TldRZU9mWkpjY0t3aGZ1a0M5dDlrZFc3MVh0WWU3UjMzVm9oNzBPTk5uQ3lHNWY4ZklzeHh0YThwQQp3N2tRMVJINEtIUCtFcGliYkFObHpXQTFQb0ZQcHhjVExyeGM2RDVUUUY0V2ZucjVrek5Jb21nR3RqNXVlTzRpCnExeEhIUGFKanJ3Nk1mY0JZZWY2TlZUTXp4TnErWmVTeTdpZG93eURGWG42MExnTjA2VitWS0pBWGNNUEFRbDAKSFJhVTlVNTVSWmx0Z2NTTWxKYWI1VTlnZTBCT3NRcnBSanhraC9sS3JiVE8wOGxYVm9zSlNacXIvVU9ZUXFOUQpUdkgwQWVORWlWdUUzSnlIamxSR0F4RlhYRkx4c3pTMnNQRXA1aUxWMVlJb2ltVGl6WEVBV
zJNQ21EM3I2T25LCjZtemRyZkw0VUEzM2JwMHFxajFwTE5RQUlsQURRTy94WHE0OVV4SXI5azJvcm1qZTRBMW1Qbm1uZUNjN3h4L1EKWWc9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== + client-key-data: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlHNUFJQkFBS0NBWUVBMHRlUThMem1YTjd4N2FhNDRiVklGZ2RhOEJlWnJoeFRFcGlIc0NPb1p1cDVGRFFBCk5WbytVUW96Nnl3V2cvSVYvR1VNS2taSENDVk83L2tFUTAyVFVndUU5YTRhaEFTdGRFRWRpK1FCWTA2SkFNODgKUjAvTnhPM0pjOS83UTN1RFBTbmxhYlZqYXY4Qk1TaWpLMGFLamt4TWRhWFVORGZjQlpEd3Q5R0ZWMjBPOEtMUQpmWHVjZmRXNm5lRHE4VHkrekxYa0JxRXFUUkJkV016NVFZdmgvVDFZME9yaUEzemV2RzVCbjBSQTVFZnZwZWdzCjloQkl3NDZJZTRkRXZldWtNVHZES1VPL0IxMndxM3V4T0hCWE5nUzB2MDR5K2R2SW5IbDc3Q0crMW41V1EwM28KRzFEbVFZUWJ1ODU0OU5LM2pROWoxdFFtbnRVaXVSMlBTVWRRMitMY1dTbDMySFJoU2ZGNm1XTkxIZytaREtISQpSTzBIOEU4NzhZb0xlaXg1T1RNVGdPWlAwYTZ5ZC9MTUxabmpkSGpYenZISlpCbmt6U09nZXkyOXpwM0ErUll6CitkZWtsdm84amZxSDFzcTFHVmx5K1RWMURRSk9VUlJFL2VzM1QreG1KR24zL0F3anNFMjJ2bGJCcURveHJMQnIKUHl4cGZHS3pqWGloN2FIaEFnTUJBQUVDZ2dHQUljWUtINllDdUttMTB6MjdxOXdnR2ZjL3Arbi80aFlEY0owaQpUeUxISFFVK00vQklnRU5lYkJqWVREbVp1YlVUdElSek9HcGRnbjZIY0lWUkczN1h0SExINzVyZUFIdkhWVElCCkM2R1FYMW43TkRmK0RlWklyZHp4UWpDRXpVQ0J1MUIwd1VRZkVzcWFaRWVBSzdSM0NXaExieVVMSXZvM3NtU1AKU3NLek8xZnE1cjBON0t2YVc0WGJDT000ZjBBZHlDanl4K1VWSTVjVmZONElwMmxDNEEvdHBEbWJ6cmMzN3JNeQpxOUtCR3BBellUNFp4YWpDSHlFNzE1M0pjeXJGcmQ2SmgxN25WWnI0Wmkya2RrcmVGVlZXTWxIS0pabHJTVVkrCjhxKzZsRHRrdkxtV0JQanRsdlFoTGd3dkY4dk5uV0lHeFU2bWV5bHRwUzVqVXFBMmRabkhicTlSc1ZtdER0R3YKRUFHRWYrWVM5T1hVeGYrbHA1MENwT1BKVDlMMlloTmtJTW9jWngzZzE2TVBoeVhDaWUrMWNTK3FXZ3NCSU9ydAoybXJROEZyWFdtbWxjelhGcTNHMW5rUXZRWEVjQkI5d0V2WTZTYVFrd1NrY3dsTWltK1hxb3hlQ2RKNGJnUDZ1CkIvTlNBdUFVekFhL3V4T2hwcmJXTGtZTVBGV0JBb0hCQVBSZTM1Y0ZRK2c1cDh6WFRjQUREL20rQkM0SWVYYjgKbnBHa2ZGWk9wZnJhMytTRkppV2NNVEVsTmZuZVN5eEZ2MTN4bERJUkdqVTFRNXloZ3EzNEdpL3JuWktveWpKNAp0N1VRZzcrYTNWRDFjRk5jbDlveXcxK3ovcDZZQXlIN1h1emp5OTkwNWIvblREOEs4dDRVTjR4UjR3OUYzd0k3ClkxYkpEMmFOUmxaYUhsMExlMElJL1dmbkk4QnJKc0F6UmJ6cC9Hc3p4bjdhVnNZVTQ3eisyV3RnZEd2OFFYZXAKZEJMUi94Z1BmQjlwSUdZekxMVCtxeFpJMThXMmxYTnRhUUtCd1FEYzREaFdmU0o5OGN6YVplV2tiMUZIQlJxYQp1L3ZCNVF0QU12bWN5RW10eTFWbDVaeXZXczdxWW9EWWZHcS9kUllBeXVWMTZDc2tuWTlDQ1o2bTFLeUI2MUhoCjVNdmV4S041SENCcFRXcG14Q2VVQ2VReEVkNk50bktWcDJoaWJWU0JiVkZoWm83Y25JZko1NkFZMFhnM2Z6TEQKaTlYYlV6eENCVmc2czJmVm5oemxxWG1pemtwTE94QTRScmpKdUIvNjlJR1JOOVlRSzJ1SnJ4cStnditMU0ZXVQpTUmpKdVhKdDRNdUlaY0VxSmtkY0o1dWw4ZTY1QUFsbkwzcVg2YmtDZ2NCYXNxdkIzWTVxcnB0OUI1d3YyL01kCitreUsyVkpidXo4UFIwRnV0eG9ibFFqc1JBNzRmcFF6YldBdk1UWXRaR1cyZEdkUWZkQ0FWeGlWRmN6dXpOckwKVm1QUmZ4d3N4MmxIcHExL2UvY1FpVWY3YUs1anFkdWh0YWlYTEV4ZFhITEs4WEZGcXBsck5HbTBMRmdmM05rUgpGNE55czdnbGwrMHkxTVVHaHdESUpBS0kvaTN0UUhEdEUzak5mZmVQOW5lM3VoQ1pxVW5PNHpLNXFoOHNrbG9YCjVHRFJnM3E0OHdmRnhITXRkdkhXcHFvRHhKS2ZTcll0T3pQNmlkM0dUYUVDZ2NFQXZicGd5bkpNK1pDRjNpaGYKUFlOZGRycjhvbVBObXoxTEQ4ZDV3T0lPNEJua2JsWHc1SFFGZnpKRDZUSGVaRDJJUEFBTllOMmdpYUJLR1hpMwo3aWlMVWtJNElPbHU4aGxKTHRRcFJ5cjN0ZlRTNW9na21yMzF4UmN1a0R5d0hyUTVQY0FRZ3JFZEVJWG80bGR6Cm56MzNuUHhITjgvYlVMM0w2ZzQ3UHBrekhUem5KNlFQVnZrS3VscXUwUmgyek5lN3FNNzBxVEFrVWRscVUyZTEKZW1aK2FqOGxPKzRhTzlWdlpORVBKRWY3eUYvL05qclRxWlVLOGFFRmNyeEcrMDBKQW9IQkFKSzlXbGVPYWpJMAo3T1I2WFhRTVRUY3JDUnlGbVdvazFYWmY5c1krMStUQ09RUnJzVFFZSmlGdmgrTXFZaXRhdmNCY3IzaVRncDlWCkRIM2w2S1Q0aUtvK0srbjErM1Rmc1h1Nmp3ejRzeCtVVG0vd0JsQm5hbUk4NjVsQlBOcnhwdzdOZkEwdzIyd3UKakthanNPREJqbDFIYmRSVm9KTXlwZit0UHNRc3J4SzVld1RMak1BV0pRV0dQVDdHeDF1L1Y4Z2p1QVZjVSs3bgo2c0VmbENVYmlyNHBndVpicHVCWmRldVJQTFA4NHVzS2c0aDRhL2JXRXhMQ2VXa3BTQ1Rkanc9PQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQo= +- name: karmada-host + user: + client-certificate-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrVENDQVRlZ0F3SUJBZ0lJQ3pUS2tBSzQzQjR3Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOell4TWpreU5UVXlNQjRYRFRJMU1UQXlOREEzTlRVMU1sb1hEVEkyTVRBeQpOREEzTlRVMU1sb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJDcDJXYjZXWDlmQTkzYi8KYkRuTEdFTXFYYTFsaGhlL0VCc0dkUmNJVlRPM3poUXVLV2FyMjJvRk0vM3ZyT1ZON1VyWVJ0cS9heW1MTkVPVwp5VmtLMTZlalNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCUVNZNFFjMlNtSEtSNnBhSTMrV01kc0hMMkQyekFLQmdncWhrak9QUVFEQWdOSUFEQkYKQWlBTEN2VURPbG5OZk5Tak05Z3hpdUVUL3l3QnBNdHl0VXNTYTZEREh2K3RWZ0loQU5zdHFrZ2h4YVI4clpwKwo0YnpxdHZubTZ5TkZTQ2xqZVFXSHdoNW1YS29qCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkakNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTmpFeU9USTFOVEl3SGhjTk1qVXhNREkwTURjMU5UVXlXaGNOTXpVeE1ESXlNRGMxTlRVeQpXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTmpFeU9USTFOVEl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFRSUd2a3o0UWozZmgyS3ZQTXBoR2Rrcm5ZN0dyUUlLUTBPc21QUFR3bHAKZ1o4VjRGNElpYmo4OVlBNSs5SUtjK3lpUUhmcXJhNE01c2ZsdFlTbExxUWdvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVUVtT0VITmtwaHlrZXFXaU4vbGpICmJCeTlnOXN3Q2dZSUtvWkl6ajBFQXdJRFJ3QXdSQUlnVHAxa2wwVFJVamFNaUJMK3dRVkVJWGFWaEw4MzJuNCsKSVpSVjZLMGppWThDSUV5OFA5UFNGYUhaQ3dhSFhsc2pEZjFpRWFRdmtiR0VUYnNad3pjaFJIeWQKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUxPc2FvYWVPVldKUXZGcGlwLzQrSmMvTHVFakg3MmRDaVNGcUhkNFkxWHNvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFS25aWnZwWmYxOEQzZHY5c09jc1lReXBkcldXR0Y3OFFHd1oxRndoVk03Zk9GQzRwWnF2YgphZ1V6L2UrczVVM3RTdGhHMnI5cktZczBRNWJKV1FyWHB3PT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo= +- name: mlsysops-2 + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrRENDQVRlZ0F3SUJBZ0lJWWlQRHIzQ21sZkl3Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOell4TWpFNU16a3lNQjRYRFRJMU1UQXlNekV4TXpZek1sb1hEVEkyTVRBeQpNekV4TXpZek1sb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJMZFJXVVExd2lXNWNNQ2YKUkQ1OG1uZ0pPdUsxcmlZeFVoa1ErUGxJcWhlWDJuV2RYMGhYMFNPTjJ6WVBiVUlrLzJya1pzZE84K0c1NnNwWApocWNCQVUyalNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCUXkxR2FHN2dHY3pKc3M5UFFjSEZ1YWlCR0R5REFLQmdncWhrak9QUVFEQWdOSEFEQkUKQWlBMnF1NURDbVp2NHpvR1VXUVJxWFVYbUdLMzNJV05SYWxaUHN1MG5PcDhXZ0lnRlJONElBM05NdmViOGhabwpvUHlTOXFTMDNSYUlYMlFNd1NzOWJ6VFdUd009Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkakNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTmpFeU1Ua3pPVEl3SGhjTk1qVXhNREl6TVRFek5qTXlXaGNOTXpVeE1ESXhNVEV6TmpNeQpXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTmpFeU1Ua3pPVEl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFUaUdOVUptNTFNWC9pdTFPSk53U0ozUFlvY1RkV1I4RmQrYzhXdWNjWDEKUHI1UUlVRDB0a1gxcGNuNktlQjdsaUZ2SkZ0bjkvNW9YR3pJUHdqd0p3WkFvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVU10Um1odTRCbk15YkxQVDBIQnhiCm1vZ1JnOGd3Q2dZSUtvWkl6ajBFQXdJRFJ3QXdSQUlnTUpGUXlVK3prS0o1SHNySlNNZTZzWHNwa1RDWVNrSVcKSGNidkZsQVJkQjRDSUNaUmgwV0l6MXIrQ3k5dmVVbWdjWW1md25wcDI5RlRrYjdZK0N4Z2I5emoKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + client-key-data: 
LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSVByK2gyMUUzVUp2bmZ1eE5EV2xiVU5qMlBiL0hWcURhV3FzRGVvZDRCUnpvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFdDFGWlJEWENKYmx3d0o5RVBueWFlQWs2NHJXdUpqRlNHUkQ0K1VpcUY1ZmFkWjFmU0ZmUgpJNDNiTmc5dFFpVC9hdVJteDA3ejRibnF5bGVHcHdFQlRRPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
diff --git a/agents/cluster/config.yaml b/agents/cluster/config.yaml
index 720895b..d27b1b7 100644
--- a/agents/cluster/config.yaml
+++ b/agents/cluster/config.yaml
@@ -1,7 +1,6 @@
 mechanisms:
   - "fluidity"
 default_telemetry_metrics: "None"
-policy_directory: "/etc/mlsysops/policies"
 mechanisms_directory: "mechanisms"
 continuum_layer: "cluster"
 
@@ -9,7 +8,6 @@ monitor_data_retention_time: 30
 node_exporter_scrape_interval: 10s
 
 managed_telemetry_enabled: true
-
 behaviours:
   APIPingBehaviour:
     enabled: False
@@ -26,9 +24,9 @@ behaviours:
   ManagementModeBehaviour:
     enabled: False
   ManageSubscriptionBehaviour:
-    enabled: True
-  MessageReceivingBehavior:
     enabled: False
+  MessageReceivingBehavior:
+    enabled: True
     message_queue: "message_queue"
   Subscribe:
-    enabled: True
+    enabled: False
diff --git a/agents/cluster/descriptions/mls-test-manage.yaml b/agents/cluster/descriptions/mls-test-manage.yaml
new file mode 100644
index 0000000..bd14b60
--- /dev/null
+++ b/agents/cluster/descriptions/mls-test-manage.yaml
@@ -0,0 +1,8 @@
+MLSysOpsCluster:
+  name: mls-test-manage
+  cluster_id: mls-test-manage
+  nodes:
+    - mls-compute-vm2
+    - mls-compute-vm3
+    - mls-compute-vm2-b1-node1
+    - mls-compute-vm2-b1-node2
\ No newline at end of file
diff --git a/agents/cluster/descriptions/mlsysops-2.yaml b/agents/cluster/descriptions/mlsysops-2.yaml
new file mode 100644
index 0000000..a05db77
--- /dev/null
+++ b/agents/cluster/descriptions/mlsysops-2.yaml
@@ -0,0 +1,6 @@
+MLSysOpsCluster:
+  name: mlsysops-2
+  cluster_id: mlsysops-2
+  nodes:
+    - 48b02d764794
+    - 48b02d15cf09
\ No newline at end of file
diff --git a/agents/cluster/descriptions/mlsysops-cluster-agent1.rni.dc.ubiwhere.lan.yaml b/agents/cluster/descriptions/mlsysops-cluster-agent1.rni.dc.ubiwhere.lan.yaml
new file mode 100644
index 0000000..875e7ef
--- /dev/null
+++ b/agents/cluster/descriptions/mlsysops-cluster-agent1.rni.dc.ubiwhere.lan.yaml
@@ -0,0 +1,7 @@
+MLSysOpsCluster:
+  name: mlsysops-cluster-agent1
+  cluster_id: mlsysops-cluster-agent1.rni.dc.ubiwhere.lan
+  nodes:
+    - mlsysops-worker1.rni.dc.ubiwhere.lan
+    - mlsysops-worker2.rni.dc.ubiwhere.lan
+    - mlsysops-worker3.rni.dc.ubiwhere.lan
\ No newline at end of file
diff --git a/agents/cluster/fluidity/controller.py b/agents/cluster/fluidity/controller.py
index c43be1f..ed20a98 100644
--- a/agents/cluster/fluidity/controller.py
+++ b/agents/cluster/fluidity/controller.py
@@ -60,6 +60,7 @@
     extend_pod_env_template, create_svc_object, create_svc_manifest, \
     create_pod_manifest, change_comp_spec, validate_host
+import traceback
 
 def check_diff(d1, d2):
     diff = DeepDiff(d1, d2)
@@ -210,14 +211,14 @@ def create_cr(cr_dict, cr_kind):
     updated_dict = {
         "apiVersion": "mlsysops.eu/v1",
         "kind": cr_kind,
+        "name": cr_name,
         "metadata": {
             "name": cr_name
         }
     }
     updated_dict.update(cr_dict)
-    logger.info(f'Updated dict {updated_dict}')
-
+
     resp = None
     try:
         logger.info('Trying to read cr_kind %s with name %s if already exists', cr_kind, cr_name)
@@ -281,6 +282,7 @@ def apply_cluster_description(fpath=None, file=None):
         updated_dict = {
             "apiVersion": "mlsysops.eu/v1",
             "kind": crd_info['kind'],
+            "name": cr_name,
             "metadata": {
                 "name": cr_name
             }
@@ -293,15 +295,18 @@ def apply_cluster_description(fpath=None, file=None):
         try:
             resp = cr_api.get_fluidity_object(plural, cr_name)
         except FluidityApiException:
-            logger.error('Retrieving %s failed', cr_name)
+            pass
 
         if resp:
+            logger.info('Retrieved %s', resp)
             return resp['metadata']['name']
 
         try:
+            logger.warning(f'Trying to create {updated_dict} plural {plural}')
             crs = cr_api.create_fluidity_object(plural, updated_dict)
         except FluidityApiException:
             logger.error('Creating %s failed', cr_name)
+            logger.error(traceback.format_exc())
             return None
 
         return updated_dict['metadata']['name']
@@ -522,7 +527,7 @@ async def _mls_inbound_monitor(self):
                 continue
 
             name = data.get("name")
-
+            logger.warning(f"Got {message}")
             if event is None or data is None or name is None:
                 logger.info('Ignoring message: One of event/data/name is missing.')
                 continue
@@ -698,7 +703,7 @@ async def _app_handler(self):
                 case 'MODIFIED':
                     match resource:
                         case 'mlsysopsapps':
-                            res, comp_dict = await self._handle_upd_app(name, spec)
+                            res, comp_dict = await self._handle_upd_app(name, spec, uid)
                             if res:
                                 status = Status.COMPLETED.value
                             else:
@@ -739,7 +744,7 @@ async def _app_handler(self):
                     if initial_plan:
                         plan.pop('initial_plan')
-
+                        logger.warning(f"Plan in fluidity: {plan}")
                         for comp_name in plan:
                             if comp_name not in self.apps_dict[name]['components']:
                                 logger.error(f"Component {comp_name} not in internal app structure. Ignoring")
@@ -753,7 +758,7 @@ async def _app_handler(self):
                             # Validate new host
                             if not validate_host(comp_spec['pod_template'], comp_spec,
                                                  action_entry['host'], self.nodes):
-                                logger.error(f"Host {action_entry['host']} did not pass eligibility check")
+                                logger.error(f"Host {action_entry['host']} did not pass eligibility check.-")
                                 status = Status.FAILED.value
                                 break
@@ -789,25 +794,25 @@ async def _app_handler(self):
                     # for all app components
                     initial_deployment_pending = False
-                    for comp_name in self.apps_dict[name]['components']:
-                        comp_spec = self.apps_dict[name]['components'][comp_name]
+                    # for comp_name in self.apps_dict[name]['components']:
+                    #     comp_spec = self.apps_dict[name]['components'][comp_name]
+                    #
+                    #     if 'hosts' not in comp_spec or comp_spec['hosts'] == []:
+                    #         initial_deployment_pending = True
+                    #         break
+                    #
+                    # if initial_deployment_pending:
+                    #     logger.info('Initial deployment not executed for all components - ignoring')
+                    #     status = Status.FAILED.value
+                    # else:
+                    plan.pop('initial_plan')
+                    self.apps_dict[name]['curr_plan']['curr_deployment'] = plan
 
-                        if 'hosts' not in comp_spec or comp_spec['hosts'] == []:
-                            initial_deployment_pending = True
-                            break
-
-                    if initial_deployment_pending:
-                        logger.info('Initial deployment not executed for all components - ignoring')
-                        status = Status.FAILED.value
+                    res, comp_dict = await self._handle_upd_app(name, spec, plan_uid)
+                    if res:
+                        status = Status.COMPLETED.value
                     else:
-                        plan.pop('initial_plan')
-                        self.apps_dict[name]['curr_plan']['curr_deployment'] = plan
-
-                        res, comp_dict = await self._handle_upd_app(name, spec, plan_uid)
-                        if res:
-                            status = Status.COMPLETED.value
-                        else:
-                            status = Status.FAILED.value
+                        status = Status.FAILED.value
 
                     # This event will include one or more entries that specify individual events
                     # for each aspect of the produced plan
@@ -1098,7 +1103,7 @@ async def _handle_upd_app(self, app_name, new_app_spec, plan_uid):
 
             # Validate new host
             if not validate_host(comp_spec['pod_template'], comp_spec, move_target_host, self.nodes):
-                logger.error(f"Host {move_target_host} did not pass eligibility check")
+                logger.error(f"Host {move_target_host} did not pass eligibility check.")
                 return False, {}
 
             res = append_host_to_list({'host': move_src_host, 'status': 'INACTIVE'},
                                       comp_spec['hosts'], remove=True)
@@ -1117,7 +1122,7 @@ async def _handle_upd_app(self, app_name, new_app_spec, plan_uid):
                 remove = False
             # Validate new host
             if not validate_host(comp_spec['pod_template'], comp_spec, host, self.nodes):
-                logger.error(f"Host {host} did not pass eligibility check")
+                logger.error(f"Host {host} did not pass eligibility check..")
                 return False, {}
 
             res = append_host_to_list({'host': host, 'status': status},
                                       comp_spec['hosts'], remove=remove)
@@ -1134,7 +1139,7 @@ async def _handle_upd_app(self, app_name, new_app_spec, plan_uid):
             # We also need
             # Validate new spec
             if not validate_host(entry['new_spec'], comp_spec, entry['host'], self.nodes):
-                logger.error(f"Host {entry['host']} did not pass eligibility check")
+                logger.error(f"Host {entry['host']} did not pass eligibility check...")
                 return False, {}
 
             result, updated_spec = change_comp_spec(self.apps_dict[app_name], entry, comp_spec,
@@ -1239,7 +1244,7 @@ async def main(inbound_queue=None, outbound_queue=None, cluster_description=None):
         logger.info('Namespace does not exist.')
         create_mls_namespace(cluster_config.NAMESPACE)
 
-    ensure_crds()
+    # ensure_crds()
 
     hostname = os.getenv("NODE_NAME",socket.gethostname())
     working_dir = os.getcwd()
@@ -1250,6 +1255,7 @@ async def main(inbound_queue=None, outbound_queue=None, cluster_description=None):
 
     if not cluster_config.CLUSTER_ID:
         logger.error("Error on applying cluster description")
+        logger.error(traceback.format_exc())
         sys.exit(0)
 
     logger.info(f'Current namespace {cluster_config.NAMESPACE}')
diff --git a/agents/cluster/fluidity/deploy.py b/agents/cluster/fluidity/deploy.py
index 7db9d48..32c6ad7 100644
--- a/agents/cluster/fluidity/deploy.py
+++ b/agents/cluster/fluidity/deploy.py
@@ -317,6 +317,12 @@ def create_pod_manifest(comp_spec, old_spec=None):
     if 'host_network' in comp_spec['spec']:
         manifest['spec']['hostNetwork'] = comp_spec['spec']['host_network']
 
+    if 'volumes' in comp_spec['spec']:
+        volumes = transform_dict_keys(comp_spec['spec']['volumes'])
+        manifest['spec']['volumes'] = volumes
+
+    logger.info(f"new pod manifest {manifest}")
+
     return manifest
diff --git a/agents/cluster/fluidity/fluidity_monitor.py b/agents/cluster/fluidity/fluidity_monitor.py
index f15e47f..3409ba5 100644
--- a/agents/cluster/fluidity/fluidity_monitor.py
+++ b/agents/cluster/fluidity/fluidity_monitor.py
@@ -81,10 +81,11 @@ async def _check_for_app_resources(self):
             watcher_task = asyncio.create_task(watcher_obj.run())
             await watcher_task
 
-        except Exception as e:
-            logger.error('Unexpected exception encountered: %s', e)
         except asyncio.CancelledError:
             logger.info("Watcher task cancelled cleanly.")
+            raise
+        except Exception as e:
+            logger.error('Unexpected exception encountered: %s', e)
 
         logger.info('Resource checker exiting.')
@@ -96,14 +97,14 @@ async def _check_for_system_resources(self, resource_description):
             crd_plural = resource_description
             resource_description = 'CRD'
             logger.info(f'resource_description {resource_description}, crd_plural {crd_plural}')
-
+
             list_func = lambda **kwargs: self.crd_api.list_namespaced_custom_object(
                 group=API_GROUP,
                 version=VERSION,
                 namespace=cluster_config.NAMESPACE,
                 plural=crd_plural,
                 **kwargs
-            )
+            )
         else:
             logger.info(f'resource_description {resource_description}')
@@ -120,15 +121,15 @@ async def _check_for_system_resources(self, resource_description):
             # Run watcher inside a cancellable task
             watcher_task = asyncio.create_task(watcher_obj.run())
             await watcher_task
+        except asyncio.CancelledError:
+            logger.info("Watcher task cancelled cleanly.")
+            raise
         except kubernetes_asyncio.client.exceptions.ApiException as exc:
             logger.error(f'exception for CRD {crd_plural} encountered: {exc}')
         except Exception as e:
             logger.error(f'Unexpected exception for CRD {crd_plural} encountered: {e}')
-        except asyncio.CancelledError:
-            logger.info("Watcher task cancelled cleanly.")
 
         logger.info('Resource checker exiting.')
 
-
     async def run(self):
         """Main thread function."""
         system_task_len = len(CRDS_INFO_LIST)
diff --git a/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsApplication.yaml b/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsApplication.yaml
index a754e4c..040f5e9 100644
--- a/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsApplication.yaml
+++ b/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsApplication.yaml
@@ -171,7 +171,13 @@ spec:
                           enum:
                             - containerd
                             - docker
-                            - emb_serve
+                            - embserve
+                            - kata
+                            - kata-qemu
+                            - kata-clh
+                            - kata-fc
+                            - urunc
+                            - nvidia
                         containers:
                           type: array
                           items:
@@ -210,6 +216,8 @@ spec:
                                 enum:
                                   - arm64
                                   - amd64
+                                  - arm-v7
+                                  - arm-v8
                               frequency:
                                 type: number
                                 description: required frequency in Hz.
diff --git a/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsNode.yaml b/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsNode.yaml
index 0d8ede1..a10a220 100644
--- a/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsNode.yaml
+++ b/agents/cluster/fluidity/manifests/templates/mvp/crds/MLSysOpsNode.yaml
@@ -111,7 +111,13 @@ spec:
               enum:
                 - containerd
                 - docker
-                - emb_serve
+                - embserve
+                - kata
+                - kata-qemu
+                - kata-clh
+                - kata-fc
+                - urunc
+                - nvidia
           hardware:
             type: object
             properties:
@@ -126,6 +132,8 @@ spec:
                   enum:
                     - amd64
                     - arm64
+                    - arm-v7
+                    - arm-v8
                   frequency:
                     type: array
                     description: All the possible CPU frequency values in Hz.
diff --git a/agents/cluster/fluidity/nodes.py b/agents/cluster/fluidity/nodes.py
index f9601ea..266beda 100644
--- a/agents/cluster/fluidity/nodes.py
+++ b/agents/cluster/fluidity/nodes.py
@@ -90,6 +90,7 @@ def append_host_to_list(entry_dict, hosts, remove=False):
 
     # At this point we did not find the entry, so we append it to the list.
     hosts.append(entry_dict)
+    return True
 
 def get_node_internal_ip(node_name):
     """Get the cluster-internal IP of a node.
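For orientation before the `objects_api.py` and `watcher.py` changes that follow: the watcher now resumes Kubernetes watches from the last seen `resourceVersion` and consumes `BOOKMARK` events, which carry only a fresh `resourceVersion` and no `name` or `uid`. The sketch below shows roughly that pattern with `kubernetes_asyncio`; it is illustrative only, and the namespace and resource type are assumptions, not taken from this patch.

```python
import asyncio
from kubernetes_asyncio import client, config, watch

async def watch_pods(namespace: str = "default") -> None:
    await config.load_kube_config()  # use load_incluster_config() inside a pod
    v1 = client.CoreV1Api()
    resource_version = None
    while True:
        w = watch.Watch()
        # allow_watch_bookmarks asks the API server to send periodic BOOKMARK
        # events so the watch can resume without a full re-list.
        async with w.stream(v1.list_namespaced_pod,
                            namespace=namespace,
                            resource_version=resource_version,
                            allow_watch_bookmarks=True,
                            timeout_seconds=60) as stream:
            async for event in stream:
                obj = event["object"]
                if event["type"] == "BOOKMARK":
                    # Bookmarks carry only metadata.resourceVersion.
                    resource_version = obj.metadata.resource_version
                    continue
                resource_version = obj.metadata.resource_version
                print(event["type"], obj.metadata.name)

if __name__ == "__main__":
    asyncio.run(watch_pods())
```

Resuming from a bookmarked `resourceVersion` avoids expensive re-lists after `410 Gone` on long-lived watches, which is what the added BOOKMARK branch in `watcher.py` is for.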
diff --git a/agents/cluster/fluidity/objects_api.py b/agents/cluster/fluidity/objects_api.py
index 0fd8241..c766104 100644
--- a/agents/cluster/fluidity/objects_api.py
+++ b/agents/cluster/fluidity/objects_api.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import logging
 import os
+import traceback
 
 from kubernetes import client, config
 from kubernetes.client.rest import ApiException
@@ -46,7 +47,6 @@ def __init__(self, api_client=None):
             config.load_incluster_config()
         else:
             config.load_kube_config()
-
         self.cr_api = client.CustomObjectsApi()  #: custom resources API client
 
     def list_fluidity_object(self, plural, field_select=None, label_select=None):
@@ -86,13 +86,13 @@ def create_fluidity_object(self, plural, cr_body):
                 cr_body)
         except ApiException as exc:
             logger.exception('%s creation failed: %s', crd_info['kind'], exc)
+            logger.exception(traceback.format_exc())
             raise FluidityApiException from exc
 
     def get_fluidity_object(self, plural, name):
         """Retrieve custom fluidity resource object"""
         _, crd_info = get_crd_info(plural)
         version = crd_info['version']
-
         try:
             cri = self.cr_api.get_namespaced_custom_object(
                 API_GROUP,
@@ -102,7 +102,7 @@ def get_fluidity_object(self, plural, name):
                 name)
             return cri
         except ApiException as exc:
-            logger.exception('%s retrieval failed: %s', crd_info['kind'], exc)
+            logger.error('%s retrieval failed: %s', crd_info['kind'], exc)
             raise FluidityApiException from exc
 
     def update_fluidity_object(self, plural, name, cr_body):
diff --git a/agents/cluster/fluidity/watcher.py b/agents/cluster/fluidity/watcher.py
index 5ead4cf..8574ba4 100644
--- a/agents/cluster/fluidity/watcher.py
+++ b/agents/cluster/fluidity/watcher.py
@@ -18,7 +18,7 @@
 import asyncio
 import signal
-import logging 
+import logging
 from kubernetes import client
 import kubernetes_asyncio
 from mlsysops.events import MessageEvents
@@ -26,6 +26,7 @@
 from mlsysops.logger_util import logger
 
+
 class ResourceWatcher:
     def __init__(self, list_func, resource_description, notification_queue, query_kwargs=None, crd_plural=None):
         self.list_func = list_func
@@ -33,33 +34,38 @@ class ResourceWatcher:
         self.notification_queue = notification_queue
         self.query_kwargs = query_kwargs or {}
         self.crd_plural = crd_plural
-        self._stop_event = asyncio.Event() 
+        self._stop_event = asyncio.Event()
         self._watch_stream = None
-
+
     async def get_resource_version(self):
         resp = None
-
+
         try:
             resp = await self.list_func(**self.query_kwargs)
-
+
         except kubernetes_asyncio.client.exceptions.ApiException as e:
             logger.debug(f"Unhandled ApiException: {e}")
             await asyncio.sleep(1)
+            return None
         except Exception as e:
             logger.debug(f"Unhandled Exception: {e}")
             await asyncio.sleep(1)
-
+            return None
+
         # Handle both dict (CustomObjectsApi) and model object (CoreV1Api)
-        if self.resource_description == 'CRD':
-            return resp['metadata']['resourceVersion']
-        else:
-            return resp.metadata.resource_version
+        try:
+            if self.resource_description == 'CRD':
+                return resp.get('metadata', {}).get('resourceVersion') if isinstance(resp, dict) else None
+            else:
+                return getattr(resp.metadata, 'resource_version', None)
+        except Exception:
+            return None
 
     async def run(self):
         resource_version = None
 
         if self.resource_description == 'CRD':
             logger.info(f'CRD plural {self.crd_plural}')
-
+
         while not self._stop_event.is_set():
             try:
                 w = kubernetes_asyncio.watch.Watch()
@@ -70,17 +76,36 @@ async def run(self):
                 stream_kwargs = dict(
                     resource_version=resource_version,
                     allow_watch_bookmarks=True,
                     timeout_seconds=60,
                     **self.query_kwargs
                 )
-
+
                 async with w.stream(self.list_func, **stream_kwargs) as stream:
                     async for event in stream:
                         operation = event['type']
                         obj = event['object']
+
+                        # Handle BOOKMARK events: only update resource_version, no name/uid present.
+                        if operation == 'BOOKMARK':
+                            try:
+                                if self.resource_description == 'CRD':
+                                    rv = obj.get('metadata', {}).get('resourceVersion')
+                                else:
+                                    rv = getattr(obj.metadata, 'resource_version', None)
+                                if rv:
+                                    resource_version = rv
+                            except Exception:
+                                pass
+                            continue
+
                         metadata = self._extract_metadata(obj)
+                        if not metadata or not metadata.get('resourceVersion'):
+                            logger.debug(f"Skipping event without usable metadata: type={operation}")
+                            continue
+
                         resource_version = metadata['resourceVersion']
                         name = metadata.get('name')
                         uid = metadata.get('uid')
-
-                        logger.info(f"Event: {operation} - {self.resource_description}: {name}, plural {self.crd_plural}")
+
+                        logger.info(
+                            f"Event: {operation} - {self.resource_description}: {name}, plural {self.crd_plural}")
 
                         msg = {
                             'operation': operation,
@@ -178,18 +203,25 @@
         logger.debug("Watcher shutting down...")
 
     def _extract_metadata(self, obj):
-        if self.resource_description == 'CRD':
-            metadata = obj['metadata']
-            return {
-                'name': metadata['name'],
-                'resourceVersion': metadata['resourceVersion']
-            }
-        else:
-            metadata = obj.metadata
-            return {
-                'name': metadata.name,
-                'resourceVersion': metadata.resource_version
-            }
+        try:
+            if self.resource_description == 'CRD':
+                # obj is dict-like when using CustomObjectsApi
+                metadata = obj.get('metadata', {}) if isinstance(obj, dict) else {}
+                return {
+                    'name': metadata.get('name'),
+                    'uid': metadata.get('uid'),
+                    'resourceVersion': metadata.get('resourceVersion')
+                }
+            else:
+                # obj is a model object when using CoreV1Api
+                md = getattr(obj, 'metadata', None)
+                return {
+                    'name': getattr(md, 'name', None) if md else None,
+                    'uid': getattr(md, 'uid', None) if md else None,
+                    'resourceVersion': getattr(md, 'resource_version', None) if md else None
+                }
+        except Exception:
+            return None
 
     async def close_watch(self):
         if self._watch_stream:
diff --git a/agents/cluster/kubeconfing-mls-test.yml b/agents/cluster/kubeconfing-mls-test.yml
new file mode 100644
index 0000000..073e92f
--- /dev/null
+++ b/agents/cluster/kubeconfing-mls-test.yml
@@ -0,0 +1,44 @@
+
+apiVersion: v1
+clusters:
+- cluster:
+    certificate-authority-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUVtVENDQXdHZ0F3SUJBZ0lVQXlZcnZDRXJhaXpFcER1bkU4by9CcThRbXFnd0RRWUpLb1pJaHZjTkFRRUwKQlFBd1hERUxNQWtHQTFVRUJoTUNlSGd4Q2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVgpCQW9NQVhneENqQUlCZ05WQkFzTUFYZ3hDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0Ck1CNFhEVEkxTURreU1EQTJOVGd4TUZvWERUSTJNRGt5TURBMk5UZ3hNRm93WERFTE1Ba0dBMVVFQmhNQ2VIZ3gKQ2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVkJBb01BWGd4Q2pBSUJnTlZCQXNNQVhneApDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0TUlJQm9qQU5CZ2txaGtpRzl3MEJBUUVGCkFBT0NBWThBTUlJQmlnS0NBWUVBekhqa1RZMmR2ZTByRS9EZGVNRzQyVUZHM0NqRERXbTVRUTZLaE1UVmZwaS8KUVFRQTZGM2wrd2tPalBZRW1CdjVwdEY4OFlRbTFBdUJiVjFQbFU1SU1JaTR2aWsxRk9sUi9ER2N1TVFVcmdiMwpua1pJL1I3UmpBK0dkTVFCbU9IM3dHVUVSQzF3WGFVeDY3bTVMZEVoNExMS2dsZjJZSERoTWxaYXJkcHppUXlZCmJYY1JydWl2ZEExbUhzTGZ4QW85R3QvYWY3N21XaUowbnloOEF0YXFQR2VGVk9TcTlsOEl1ZkQ0K1Brc1dJNkYKNVdFZW9xNS9raHJLR1QzVlRkOS9reG9OVTJkMUpTdnJpUHhFbStHTU10WEV3Z0VmNUhrN3FmZHg1SHVKcWZ1QwpSU0crdWFQb1JCOWpZWmVMYjBvQ3FlSTlGN0JseFpkVDNVckVxbGdXbTAvSDJJTytPKzR1eXN6RTYwOExyTUhXCkYzNzJsUUhrdkdmbWNGUy9mTTl5ak5zNWdvVTdGeXdsWEZMUFJHeVZEd1h6NUhFU1dkZXZvYWp5bGc3RlNSM3MKU1dBa2tEREMwazZxVHIvZ0RpTnRNakZsMHN6RHNRd2tqR1R6RExVby9PaG5raVJ4TkJQN0tnVHVtVEhsT1VaMAppVlY4TWt0eVpUMUlLVnFwd0FHL0FnTUJBQUdqVXpCUk1CMEdBMVVkRGdRV0JCUVpFeHUrU2c4TUxWQWtIMjNvCk1oY2R1ZmhTRGpBZkJnTlZIU01FR0RBV2dCUVpFeHUrU2c4TUxWQWtIMjNvTWhjZHVmaFNEakFQQmdOVkhSTUIKQWY4RUJUQURBUUgvTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCZ1FDQk56TktZVDkzL1o1OFJZUEYvTTNwRXF4cgppZ1BYd0hCOTNURlpkL25qL2lldGdMSi81VTZoSHFTYmZKa1lja204bEcyV21BbDh1UXpiVk1MZGFoSFlpUmMvCnlETmkwN3EvbE5VdDV4czlvMWl4RkVlMFVseVRjT0NpQUNFZGlkQVNCYU1WcjVmRDd4RURNa0cyRHhoS3hnVFcKT1cwMXFBZmRuaytyRUVQR0MxT2pRVlFBd0oxYVVXNzRCemh0dHBnWEN3Y0s5R09vUTQ0cVBUN2hBQ3dGM1NybQp5WGpxSGE4c0hGNzd6ZFZkOTRnbHhaMTZvRm1MaFNpem5GaVovenZudEQ1eDRBWjIwOFg5SjdKMDlJK2FGRnVPClkwa0I3cXIrOUdWU2F1L3JiY1cySXdqOEJVSnZncndpdjBmbzhmazRhbUx2Q1AvcjNaVVU3b2dZaWpGa1pMQUEKdHpLc2tzOHp0eXp3bmpwQWZZUWpHclZqdjJwWWtUK3p6YUo1cjI3RjFwdjBwbU5Xdzc0RDB4NGdLd0FYSUdhaApXRHRHSzQ1aEVETWZEam9iSTNHUldHVm5ZTWE5eThxSjBCVHYzZGZmMS8wTDRLTWp3ZitFeDNkNG1PWm9YcVJkClBsV2Y1RmNydDVFTCtpRHNoa0RGaiswYjhNaW4vRlFsZ2FCQzAxRT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + server: https://10.64.83.239:32644 + name: karmada-apiserver +- cluster: + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkekNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdGMyVnkKZG1WeUxXTmhRREUzTlRnek5UQTRPVGN3SGhjTk1qVXdPVEl3TURZME9ERTNXaGNOTXpVd09URTRNRFkwT0RFMwpXakFqTVNFd0h3WURWUVFEREJock0zTXRjMlZ5ZG1WeUxXTmhRREUzTlRnek5UQTRPVGN3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFTaUFSTGlnZEdLeHhHY3B1YUZ5SjhOWmVHRC9ZZW5Jdjc5RHI5Qk1YK08KUHdLRDFwWXYxUDlHZ0MwQjd5LysxaVFZdzlWVk5QL1hPenpJeVlwSVJKWVNvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVWJmUmFVdXdKU0RVNXBEY0pXajFuCmtNTWwvRFl3Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUlnTENzMDRkbTJFeFFyNHd4TnpLQ1pBS1ZTSWFIYmZVVWMKRXI0Z0lHNUU1K0lDSVFEVitRajFKSHdSTHRYTWtud0RVcWhuekdPS2h1TTU0aGJOcGc2eWVVV2toQT09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + server: https://10.64.83.230:6443 + name: mls-test-manage +- cluster: + certificate-authority-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkekNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdGMyVnkKZG1WeUxXTmhRREUzTlRnek5URXpNekl3SGhjTk1qVXdPVEl3TURZMU5UTXlXaGNOTXpVd09URTRNRFkxTlRNeQpXakFqTVNFd0h3WURWUVFEREJock0zTXRjMlZ5ZG1WeUxXTmhRREUzTlRnek5URXpNekl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFRUE9wWTF4andNYm5VYnl5VVdFbGxvNXJBTWlvd1ZiNjFNQlNBcE1IcC8KcjhZeEg2VzZRVlR0TWZrNmtBVlcvaTFTeDg3UzlDWlg0K3MwMGZkOUlaZEVvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVXVsS3U3cHFqVitWNmowS1pxZlJjCnVtQ3lRVUV3Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUloQU9iYkE1SlQwd2haWldVd0NibXdndkMrK2VpcDFCbGgKa2xJMXlUdmZUbjNnQWlCZi9kT0hVSEtIZC80cmVEVXQ5N0lsUU44U2ZIQlJPbERxdzlzTkJzWTBuUT09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + server: https://10.64.83.239:6443 + name: karmada-host +contexts: +- context: + cluster: karmada-host + user: karmada-host + name: karmada-host +- context: + cluster: karmada-apiserver + user: karmada-apiserver + name: karmada-apiserver +- context: + cluster: mls-test-manage + user: mls-test-manage + name: mls-test-manage +current-context: mls-test-manage +kind: Config +preferences: {} +users: +- name: karmada-host + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrRENDQVRlZ0F3SUJBZ0lJVmMwK3E4UDdGdm93Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOelU0TXpVeE16TXlNQjRYRFRJMU1Ea3lNREEyTlRVek1sb1hEVEkyTURreQpNREEyTlRVek1sb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJKZGE4U0JIS1U4WTRuR0UKS2lVZkpYR1dFbVp3ZldQaWdZck5sNDZySzZvbGYzQ0ZqNU9qRkN0c3loNVo5VDl6TGVTMExUb2Fta3hwU2ZUcgpzNmRJQy9palNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCVHI1MWdnSGFFMGVRWmY5dit3N1JKSTFrUDNZekFLQmdncWhrak9QUVFEQWdOSEFEQkUKQWlCbXZlYTFLcWZXQ1NCRm9PKzlNekVKT1ZqaUZWclFVVHFZTXZCNi9JQkF4UUlnZXRkZWFzeU0rcTNZWWduNgo2YVNiNTB2WWpnTnNtcVF0Rmg4NHYzMWVURDA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkekNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTlRnek5URXpNekl3SGhjTk1qVXdPVEl3TURZMU5UTXlXaGNOTXpVd09URTRNRFkxTlRNeQpXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTlRnek5URXpNekl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFSaTZMK0F0ZHN6eC9ZWGNRTWZ0eSt2eWFGTWFITmpGQTVDeE1PY3NET0gKKzJnQjhXY05VZHZXa3VnM25Tc2VWVVlsdWpFS0FuTW1RaWlieVNKMGVYMkFvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVTYrZFlJQjJoTkhrR1gvYi9zTzBTClNOWkQ5Mk13Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUloQU0zM3lTaHJaamxpWHYrNDh6d2dtdG52MGZMUWJMTUMKUTBFRCtUTmxNNDk4QWlCWGxBK253a3IxV1YrbWlrWkFSTm83NzUvYklSTUFnd1ZtTTFKK1hKaHliZz09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUhDWjlYRlBwRm5MbTduZU1LcWxxd0w4V1FONEpJV2ZJNzBKckxoUFd6MXhvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFbDFyeElFY3BUeGppY1lRcUpSOGxjWllTWm5COVkrS0JpczJYanFzcnFpVi9jSVdQazZNVQpLMnpLSGxuMVAzTXQ1TFF0T2hxYVRHbEo5T3V6cDBnTCtBPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo= +- name: karmada-apiserver + user: + client-certificate-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZQVENDQTZXZ0F3SUJBZ0lVVXFpVlJDeXB5WERoYm5RUDR0TFRoQktQNmg0d0RRWUpLb1pJaHZjTkFRRU0KQlFBd1hERUxNQWtHQTFVRUJoTUNlSGd4Q2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVgpCQW9NQVhneENqQUlCZ05WQkFzTUFYZ3hDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0Ck1CNFhEVEkxTURreU1EQTJOVE13TUZvWERUTXdNRGt4T1RBMk5UTXdNRm93TURFWE1CVUdBMVVFQ2hNT2MzbHoKZEdWdE9tMWhjM1JsY25NeEZUQVRCZ05WQkFNVERITjVjM1JsYlRwaFpHMXBiakNDQWFJd0RRWUpLb1pJaHZjTgpBUUVCQlFBRGdnR1BBRENDQVlvQ2dnR0JBTDZYbUF0VC9PMDlXRStOT2oxWEE5M1MrK293aDhWRHFCZWh6VmRaCmFJaGR5bDZZTkd1OVc2U0VMeGFFOFF6Z2ZRWmROdXRTQ0RxRUdtWmtIczBRRHU4d1lWbFhseGhiN2lLc2JyaFoKVExaMTlQQThuVDBEdHRvelo4aGNka2hYSHllNzRWRG5sNVNvRzZmNVN2bkxaL2Y0TVlVNWtJTTJndmVBMm5qcwpMdGwraVg2ZFpBSXZyd2pCM1J6R1lkNFN5UmswUERJQXdEYzFMS0svMWduNzI0K0hwSVhoelo4bEtYSFVqWUdKClNBS3VvUFVWNWdrSEN6MjhSN1JTdzZIZGVwYTk1NlI1MkJoSjlpUTkrcWhQL3pib0gzb2U4YUg2dmhDSXhYYUsKbVZpN1BPWVVmWEthZDYrc0VxTkxhMkpsT1hwaS9nSGFjK2lNUjdXU09DbE9kQzFodjcreFZvZy9kZWtxVFRGZApjcjc5TkhEMVdOalRON1lZdUV1R3h3aDEyR3RzVUc4RFJDc2RyZ2VKSCtGeEs4MWpjRzR1NFJKWHVXN2VKQVdKCis3MXlQYURmUkg5cEtMczl3RVpDOWtjZGwrL29QQWplUzdRa2YyQXZtcmorSjNYcndMRmNRcXZZd1NFajN6RzgKckhJY0lxLzB5QmZJRjJJbWNWQUI2K1o5K3dJREFRQUJvNElCSVRDQ0FSMHdEZ1lEVlIwUEFRSC9CQVFEQWdXZwpNQjBHQTFVZEpRUVdNQlFHQ0NzR0FRVUZCd01DQmdnckJnRUZCUWNEQVRBTUJnTlZIUk1CQWY4RUFqQUFNQjBHCkExVWREZ1FXQkJTSzZsOEVEK1ROelpLay9IZlhPRzYzWTNNRFlqQWZCZ05WSFNNRUdEQVdnQlFaRXh1K1NnOE0KTFZBa0gyM29NaGNkdWZoU0RqQ0JuUVlEVlIwUkJJR1ZNSUdTZ2hacmRXSmxjbTVsZEdWekxtUmxabUYxYkhRdQpjM1pqZ2dsc2IyTmhiR2h2YzNTQ0p5b3VaWFJqWkM1cllYSnRZV1JoTFhONWMzUmxiUzV6ZG1NdVkyeDFjM1JsCmNpNXNiMk5oYklJaUtpNXJZWEp0WVdSaExYTjVjM1JsYlM1emRtTXVZMngxYzNSbGNpNXNiMk5oYklJVUtpNXIKWVhKdFlXUmhMWE41YzNSbGJTNXpkbU9IQkg4QUFBR0hCQXBBVSs4d0RRWUpLb1pJaHZjTkFRRU1CUUFEZ2dHQgpBSVRmN21GZjh4UnM2dVdHMHVRTnBjUiszb0p3NnVvUENUYUQzdk9rTXEwR3c4YTBCQkk3K3dnVTFsYnBralVOCnVrUm85b0d6eEZDWmE0YzFUS3RvcURTZi9IdHh4SDRYM0FraDlJZkNWa0JzSnB1TDFVeVZXU0JYSWwxY08rWjEKMHlDd0lUSXB3NnZxQkVWNGptRWpIR1ZSS3NCRHVpU0ZCdEx0bEt1U0h5aEwyMzQwWlBkVlFxczQyY21icjE0dApEVkE5a2I1S3VFTlN0NUdMNlh4TGhQNkN5QzVMMzVQSDlZWnBHMGE4Nk1zL0VQTXNta0dZRExJTGdwUGRRNk5ZCmlDUUExR0NGNGNDL201SU55RkFmSHVjeUNNcmxRUk1KQmdTL2lwZHRHZXQvU1Frb1lFaVNKanAyejVxcE5WNjIKV0NMRWs2bE5oMmJqSUt3bkdtVDBhWkdKUU0xckZsWXR1R1kxcWx3aTVSaytIcGt2RjkrUGdrS2YxQnIybEFtZAo5eTkvdk04K0tkREgzR2c5ak9GRVJNRWplUktUaGRta2dMRXhLTjdkWURLa1FhTXIrWkVmUTBiZzVJUm5UUnRLClVqVXlpSkhQQWxIQUJIS29oL0ZIZFF2YUJyYzM3ZEdVSEduSkZuT3BYUlNBb2l5VkxNTHpZUXpMZWdBd1JBMFAKSGc9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== + client-key-data: 
LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlHNHdJQkFBS0NBWUVBdnBlWUMxUDg3VDFZVDQwNlBWY0QzZEw3NmpDSHhVT29GNkhOVjFsb2lGM0tYcGcwCmE3MWJwSVF2Rm9UeERPQjlCbDAyNjFJSU9vUWFabVFlelJBTzd6QmhXVmVYR0Z2dUlxeHV1RmxNdG5YMDhEeWQKUFFPMjJqTm55RngyU0ZjZko3dmhVT2VYbEtnYnAvbEsrY3RuOS9neGhUbVFnemFDOTREYWVPd3UyWDZKZnAxawpBaSt2Q01IZEhNWmgzaExKR1RROE1nREFOelVzb3IvV0NmdmJqNGVraGVITm55VXBjZFNOZ1lsSUFxNmc5UlhtCkNRY0xQYnhIdEZMRG9kMTZscjNucEhuWUdFbjJKRDM2cUUvL051Z2ZlaDd4b2ZxK0VJakZkb3FaV0xzODVoUjkKY3BwM3I2d1NvMHRyWW1VNWVtTCtBZHB6Nkl4SHRaSTRLVTUwTFdHL3Y3RldpRDkxNlNwTk1WMXl2djAwY1BWWQoyTk0zdGhpNFM0YkhDSFhZYTJ4UWJ3TkVLeDJ1QjRrZjRYRXJ6V053Ymk3aEVsZTVidDRrQlluN3ZYSTlvTjlFCmYya291ejNBUmtMMlJ4Mlg3K2c4Q041THRDUi9ZQythdVA0bmRldkFzVnhDcTlqQklTUGZNYnlzY2h3aXIvVEkKRjhnWFlpWnhVQUhyNW4zN0FnTUJBQUVDZ2dHQkFKdklEcWViczQwNTZmZkQ2SW1BbVJGenhTSnJIbDVYYVlvLwpYelo1UnQ3SFFvVEZCWHZXbjVROTByc21XRng5Rk5QaDF1VEsvUnpQSTBEenJ0VlB2ZFUxaHl4ZVVvcS9nVXBvCkJHdTcyZDc5VlFJVG1UWG8xa2xOWVJPOHQ1VnpXR3pDdld6Q2p2d2IxOWNaQlhaZEVwTDFFNHpnTzBWMGhlSEYKT1UxRFJ4dlF1Sk44RStWUmtsQ25ISkxMZCt6ckQrSUE5djZyN0hCemp4TkxsNmc3YXYvTEQrZTZZN1lqeXRkVwpzMzdnZGJKemgrNHJRd1czOHpFM3RxZ21TZThBdGl2RmtnYUl4VXN3UWZsODVkNzZtdXk0aEIyQUd1TkpmQ09rCi9tM0FJek9QcUZmNFVuaDFPVEdxd0ppSHFoRDdycWsvSGVoM1V0dDF6RGdCQTlmbnJ5VFd4N1V5UmtQaXA2ZnkKK1UwdXlKamdoY2V6d1ZPUEtkRUtZZHJuU3BsTlhUSERHTWdCMFNUNmJLdFpVem9Bd3RvU1FDbEUwbUpxQm9ORApGSTZEdVF4bEczcThhT1VqTm9DQTBBWURlUXUzcWd5RGdoeEJGZVRETTltZmtqSnR2RHpnY3F1Yk5IaGNEaW9YCnpaenJxc0lmSDdQMlI5RU5VbVl6K0R0MjVsWjJJUUtCd1FEUDgza2VGWTRFTWVYYk10VFBuZnZIb2lSWnREQm4KcUdVczI4UVozbU5oblE1WCtkbTZxMlRMZjQwSE9pd3B6NzB1UEZ2MnNMNXBnRCtLbk9IVmRza2hlaDFmZDRoQwo2alYyTlN0eUV4dFVPTHJMZXFYWDdCTlJjQjVicnJXYlZwaVJ2cUZYTkFSNUNVOTVJcEljdmkwQVR2L2NZVU1pCmRINFVldXVta1RNTU1tN1ExMk5WNnJocklidDl3VkRIMEpiV0x3SWFRazFEVmJyb290dm5ZYUNrRVd1TTBZengKOVlQbU40enpIS3RBVWdXSVlzS2xzU0NBYThGQnZMWVI3ekVDZ2NFQTZxRlN2bWg2QmJFcU5CR2daRW9iaGp3Ugo2Y0lCRitRN0YxN0tkT0t3QTdFNWQ2dzQwNFpkd0VUaVlsdmtud3NlZXVRbFhDRHhUK0pUbWZtT3o4Wjh1SFozCjREUlNhN0htblRoMmUrajFwMFpNenVSZnEyWDdQOURjVWt2NDViUG5qN0o2d3dRL2RTaDhjaElDYWdROWRKU24KODV5T2h6cWZRMFMzWUdGYlVEMERRUWNHbmlVL3NtZFgvMERFQVlINS9rQTJjd01uYm0vUzNNTjJNM0dsVFcwegpPYmtsekJWMGZwbDk4aGk0ZEJJbERjK3pianQ2b2hsYjNaK25xNnpyQW9IQVRLMzB6VmtaWXdhYlFYYSs3a2ZOCkhkSDJJWFhHRzVNR3E4SGx6N3duWFVreWp0SThyUTZpUGorc1d5dmlPUkMzSzZOcUxjOENoNWlZMVozLzBGemEKMFpOYjNXWVFRc3htSms1NDFMbWk3QzEzQTYwWUZrU2c0dmk2MlFmVlE1QnM5ZUViVVRLSEs3SjVNYXZEd1FHSApTb1BXNlFhY1lNb2Y2NkZmMzYzTzZ3SjF4bUNGNnBxQW9sQnYrSEVlUG0vMkRzaUt0L0FXMld0MVhtOGZNM1RXCmJXbFhCVGdJQnZpSFNJN1hWSllQdDhvRzdFR3VrTFo5eTV6d1NETXoxKzBoQW9IQVFHNXV0NFVmUlNMaXNRdksKSE9CUThLcmNUSElQTk0xakFNYnlGK2tNWnVjckhkYjRmL1hkRnVpVHRtai92a0VpYktMc000R0VZdWlIQnJpRwpaRDVnajVOUTFndm9PbFRQUEl0OTFnY2lObjVTcDFvRmY1N3d3UlNNL0RpS3NDR1lURmk1NTRYcUd6VXovMnM4CnFIcGptZmtjcWdSckMyMm93S05GTEQxd0F2Zk9SZlEwK0FiSW1DaFMvUEIvSXQ3YmxOdHpXR0FjV3ZKd1dGbUEKd3gzSlJCcmh2MDg5dHU2SllCNnNmR2dYZHlCb2RvTjhCYUJseTgvazcxWmhoLzVqQW9IQURjREgzOTkrbjljTAoyeDQ3K3ZteHh5VXBiSmtpamVGNWRhOUlvbHFmcmRIYmJ5REJaNUJQcURnQ2tFbkd3Zkg4eEhlQXNNb2JLS2pTCnFwbFp3cjFSQjlZRFNQT1djRHNqT0pvMWdMbi80YmxvNU05QnpLOEgvYXZKVTgxTGpTZkZkSUlWOXdDRlYySXkKSUQ5R3Z6Yk82UEdQbWt2dGRqTStRVitURVkvcHlmTWFMTUdWVUIrMVI4QlVTZE9mUUluZ0lKc0IvVG0zZ2dKVApqWU9LMjN2cEpzdDZ6ZHZPbFF5S3RwOWlqUXBVY096NUNMRlBGRklqdVM1TGxTOGREVTRkCi0tLS0tRU5EIFJTQSBQUklWQVRFIEtFWS0tLS0tCg== +- name: mls-test-manage + user: + client-certificate-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrVENDQVRlZ0F3SUJBZ0lJZStNWkJtdUtJUlV3Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOelU0TXpVd09EazNNQjRYRFRJMU1Ea3lNREEyTkRneE4xb1hEVEkyTURreQpNREEyTkRneE4xb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJFT3FQdkRVMzJZbWlmb1MKV0Zrd2tWZEU3Z0dBQVVGZ25zODdOOWcwUzJoMUxrSXJRd3VmRFl6VnpHb1VDQjZ3Q3Ftak9PN2V3ZEd3enBTeAprNHp5RXlxalNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCVDhPM1hiVnFNL2UwNE82dUJLK2l2TXRwcjlIekFLQmdncWhrak9QUVFEQWdOSUFEQkYKQWlFQXREV1JpSWRGclNCOXVuVW1UNGl0Qm0waTI3bCtISVlFT0tZd1VWb0t4cjhDSUJ3clBCM2ZmenRwdk9FSQplQm1NSmZ4WUQrOW16aGt2bjVVajlOeFRNNitwCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJlRENDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTlRnek5UQTRPVGN3SGhjTk1qVXdPVEl3TURZME9ERTNXaGNOTXpVd09URTRNRFkwT0RFMwpXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTlRnek5UQTRPVGN3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFSUUQ5RWN4RCs0QkJISEE0NTZZd3ViT0NFYWsvVnZFdTRwaTFnbUhOaU8KdlFkZ2VyY0QzeGRPQmFwc0lXemE3TG9EU0V2QUVuNjRuUEcxM1JzMVNlZlJvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVS9EdDEyMWFqUDN0T0R1cmdTdm9yCnpMYWEvUjh3Q2dZSUtvWkl6ajBFQXdJRFNRQXdSZ0loQU5ZWW1sUGtoa3U0YWNKQVZWSTc4Wi9TN0hXTXNUWk4KN3dpN3ZjajkzUkZVQWlFQThkOWpHUDNmUS9pTjlXVWJ4NEVLM2gxN3QvODZXNnFYaTJiT0tQY0pxdDA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUMzMDN2MGFnRkpoMmwxenlwTGxTdjVmdmNMS29meEhoUXhZTGJydHZnTmZvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFUTZvKzhOVGZaaWFKK2hKWVdUQ1JWMFR1QVlBQlFXQ2V6enMzMkRSTGFIVXVRaXREQzU4TgpqTlhNYWhRSUhyQUtxYU00N3Q3QjBiRE9sTEdUalBJVEtnPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo= diff --git a/agents/cluster/mechanisms/DroneControllerAPI/ControllerAPI.py b/agents/cluster/mechanisms/DroneControllerAPI/ControllerAPI.py new file mode 100644 index 0000000..b6fc85d --- /dev/null +++ b/agents/cluster/mechanisms/DroneControllerAPI/ControllerAPI.py @@ -0,0 +1,54 @@ +from DroneControllerAPI.UDP import UDP +from DroneControllerAPI.Messages import * +from DroneControllerAPI.DCException import DCException +import time + +#For simple testing +class ControllerAPI: + + def __init__(self, ctlr_ip, ctlr_port): + + + self.ctlr = UDP() + self.ctlr_addr = (ctlr_ip,ctlr_port) + self.Seqno = 1 + + + def StartFollow(self,VerticalOffset,HorizontalOffset): + + sfreq = StartFollowReq(Seqno=self.Seqno, + VerticalOffset=VerticalOffset, + HorizontalOffset=HorizontalOffset) + + self.ctlr.send(self.ctlr_addr,sfreq) + try: + sfrepl = self.ctlr.recv() + except Exception as e: + raise DCException(f"Timeout") + return + if sfrepl.Seqno != self.Seqno: + # print(f"error in sequence number: local {self.Seqno} received {sfrepl.Seqno}") + raise DCException(f"SEQ_NUMBER_INCONSISTENCY Local {self.Seqno} Received {sfrepl.Seqno}") + self.Seqno = self.Seqno + 1 + return sfrepl.StatusCode + + + + def StopFollow(self): + + sfreq = StopFollowReq(Seqno=self.Seqno) + + self.ctlr.send(self.ctlr_addr,sfreq) + try: + sfrepl = self.ctlr.recv() + except Exception as e: + raise DCException(f"Timeout") + return + + if sfrepl.Seqno != self.Seqno: + raise DCException(f"SEQ_NUMBER_INCONSISTENCY Local {self.Seqno} Received {sfrepl.Seqno}") + self.Seqno = self.Seqno + 1 + + return sfrepl.StatusCode + + diff --git a/agents/cluster/mechanisms/DroneControllerAPI/DCException.py 
b/agents/cluster/mechanisms/DroneControllerAPI/DCException.py new file mode 100644 index 0000000..4669c04 --- /dev/null +++ b/agents/cluster/mechanisms/DroneControllerAPI/DCException.py @@ -0,0 +1,4 @@ +class DCException(Exception): + + def __init__(self, message): + super().__init__(message) \ No newline at end of file diff --git a/agents/cluster/mechanisms/DroneControllerAPI/Messages.py b/agents/cluster/mechanisms/DroneControllerAPI/Messages.py new file mode 100644 index 0000000..581d7b0 --- /dev/null +++ b/agents/cluster/mechanisms/DroneControllerAPI/Messages.py @@ -0,0 +1,130 @@ +import struct +from dataclasses import dataclass, asdict, field +from dataclass_struct import STRUCT_TYPE, dataclass_struct +from enum import Enum + + +class MsgTypes(Enum): + TRACTOR_INFO = 1 + DRONE_STATUS = 2 + AP_SENSOR_INFO = 3 + IN_POSITION = 4 + START_FOLLOW_REQ = 5 + START_FOLLOW_RPL = 6 + STOP_FOLLOW_REQ = 7 + STOP_FOLLOW_RPL = 8 + SET_REPORTING_RATE_REQ = 9 + SET_REPORTING_RATE_RPL = 10 + UAV_STATUS = 11 + DOME_VALUES_UPDATE = 12 + + +@dataclass_struct +class Test: + Type: int = field(default = MsgTypes.TRACTOR_INFO.value, metadata={STRUCT_TYPE:'>i'}) + param1: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + param2: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + + +@dataclass +class UAVStatus: + Type: int = MsgTypes.UAV_STATUS.value + Status: int = 0 + + +@dataclass_struct +class TractorInfo: + Type: int = field(default = MsgTypes.TRACTOR_INFO.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + Timestamp: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + Longitude: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Latitude: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Altitude: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Heading: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Speed: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + DevID: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + lux_r: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_g: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_b: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_i: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + dome_timestamp: int = field(default = 0, metadata={STRUCT_TYPE:'>q'}) + + +@dataclass_struct +class DroneStatus: + Type: int = field(default = MsgTypes.DRONE_STATUS.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + Timestamp: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + State: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + + +@dataclass_struct +class AutopilotSensorInfo: + Type: int = field(default = MsgTypes.AP_SENSOR_INFO.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + Timestamp: int = field(default = 0, metadata={STRUCT_TYPE:'>q'}) + Longitude: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Latitude: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Altitude: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Heading: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Speed: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Pitch: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Roll: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + Yaw: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_r: float = field(default = 0, 
metadata={STRUCT_TYPE:'>f'}) + lux_g: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_b: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_i: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + dome_timestamp: int = field(default = 0, metadata={STRUCT_TYPE:'>q'}) + + +@dataclass_struct +class StartFollowReq: + Type: int = field(default = MsgTypes.START_FOLLOW_REQ.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + VerticalOffset: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + HorizontalOffset: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + +@dataclass_struct +class StartFollowRpl: + Type: int = field(default = MsgTypes.START_FOLLOW_RPL.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + StatusCode: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + +@dataclass_struct +class StopFollowReq: + Type: int = field(default = MsgTypes.STOP_FOLLOW_REQ.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + +@dataclass_struct +class StopFollowRpl: + Type: int = field(default = MsgTypes.STOP_FOLLOW_RPL.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + StatusCode: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + + +@dataclass_struct +class SetReportingRateReq: + Type: int = field(default = MsgTypes.SET_REPORTING_RATE_REQ.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + +@dataclass_struct +class SetReportingRateRpl: + Type: int = field(default = MsgTypes.SET_REPORTING_RATE_RPL.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + StatusCode: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + +#helper class +@dataclass_struct +class DomeValuesUpdate: + Type: int = field(default = MsgTypes.DOME_VALUES_UPDATE.value, metadata={STRUCT_TYPE:'>i'}) + Seqno: int = field(default = 0, metadata={STRUCT_TYPE:'>i'}) + lux_r: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_g: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_b: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + lux_i: float = field(default = 0, metadata={STRUCT_TYPE:'>f'}) + dome_timestamp: int = field(default = 0, metadata={STRUCT_TYPE:'>q'}) + + + + diff --git a/agents/cluster/mechanisms/DroneControllerAPI/UDP.py b/agents/cluster/mechanisms/DroneControllerAPI/UDP.py new file mode 100644 index 0000000..75720ef --- /dev/null +++ b/agents/cluster/mechanisms/DroneControllerAPI/UDP.py @@ -0,0 +1,90 @@ +import socket +import threading +import logging +import logging.config +from struct import * +import time +from DroneControllerAPI.Messages import * +from dataclass_struct import STRUCT_TYPE, dataclass_struct + +# from src.env import * + + +type_to_dataclass = { + MsgTypes.TRACTOR_INFO.value:TractorInfo, + MsgTypes.DRONE_STATUS.value:DroneStatus, + MsgTypes.AP_SENSOR_INFO.value:AutopilotSensorInfo, + MsgTypes.START_FOLLOW_REQ.value:StartFollowReq, + MsgTypes.START_FOLLOW_RPL.value:StartFollowRpl, + MsgTypes.STOP_FOLLOW_REQ.value:StopFollowReq, + MsgTypes.STOP_FOLLOW_RPL.value:StopFollowRpl, + MsgTypes.SET_REPORTING_RATE_REQ.value:SetReportingRateReq, + MsgTypes.SET_REPORTING_RATE_RPL.value:SetReportingRateRpl +} + +logger = logging.getLogger(__name__) + +class UDP(object): + + + def __init__(self,ip="",port=0): + self.sock = socket.socket(socket.AF_INET, + 
socket.SOCK_DGRAM) + + self.sock.bind((ip,port)) + + self.ip, self.port = self.sock.getsockname() + + self.recv_handler = self.__dumy_handler + + # logger.info(f"UDP started at {self.ip}:{self.port}") + + def ServeForever(self): + self.rcv_th = threading.Thread(target = self.__rcv_loop, + daemon=True) + self.rcv_th.start() + + def RegisterRecvHandler(self,handler): + self.recv_handler = handler + + def __bin_to_dataclass(self,msg): + header = msg[:4] + mtype = unpack('>i',header)[0] + + obj = type_to_dataclass[mtype].instance_from_buffer(msg) + return obj + + def __rcv_loop(self): + #return in blocking mode if recv was invoked before + # self.sock.settimeout(0) + while True: + + data , addr = self.sock.recvfrom(1024) + #print(f"received message _rcv {data} from {addr}") + obj = self.__bin_to_dataclass(data) + self.recv_handler(addr,obj) + + + def __dumy_handler(self,address,data): + pass + + def recv(self): + try: + self.sock.settimeout(2) + data , addr = self.sock.recvfrom(1024) + # print(f"received message recv {data} from {addr}") + except socket.timeout as e: + raise e + + obj = self.__bin_to_dataclass(data) + return obj + + def send(self,address,data): + self.sock.sendto(data.to_buffer(),address) + + + + + + + diff --git a/agents/cluster/mechanisms/DroneControllerAPI/__init__.py b/agents/cluster/mechanisms/DroneControllerAPI/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agents/cluster/mechanisms/drone_controller.py b/agents/cluster/mechanisms/drone_controller.py new file mode 100644 index 0000000..6b78014 --- /dev/null +++ b/agents/cluster/mechanisms/drone_controller.py @@ -0,0 +1,96 @@ +# Copyright (c) 2025. MLSysOps Consortium +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # +# # + +import json +import traceback +# from mlsysops.logger_util import logger +from mlsysops import MessageEvents +import mlsysops +from mlsysops.logger_util import logger +from DroneControllerAPI import ControllerAPI +import os +drone_ctrl_client = ControllerAPI.ControllerAPI(os.getenv("DRONE_CONTROLLER_ENDPOINT","100.64.0.2"),1980) + +queues = {"inbound": None, "outbound": None} +# AUG ADDITIONS -- START +from mlstelemetry import MLSTelemetry +mlsClient = MLSTelemetry("drone_controller_mechanism", "aug_cluster_mechanism") + +# drone command codes +DRONE_IDLE = 10 +UI_START_FOLLOW = 11 +UI_STOP_FOLLOW = 12 + +DRONE_CONTROLLER_DISABLE = os.getenv("DRONE_CONTROLLER_DISABLE", "false") + +async def send_last_drone_cmd(cmd): + mlsClient.pushMetric(f'fluidity_drone_command', "gauge", cmd) + +# TODO: REMOVE! This is for the desk setup testing only. +async def start_follow_tractor(): + try: + status=drone_ctrl_client.StartFollow(10,10) + logger.info(status) + except Exception as e: + logger.error(str(e)) + await send_last_drone_cmd(UI_START_FOLLOW) + +# TODO: REMOVE! This is for the desk setup testing only. 
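# ----------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original patch.
# The DroneControllerAPI stack above frames every UDP datagram with a 4-byte
# big-endian Type field; UDP.__bin_to_dataclass() unpacks that header and
# dispatches to the matching dataclass via type_to_dataclass. The demo below
# assumes only the dataclass_struct helpers already used in UDP.py
# (to_buffer() on instances, instance_from_buffer() on the classes).
from struct import unpack
from DroneControllerAPI.Messages import MsgTypes, StartFollowReq

def _wire_format_demo():
    req = StartFollowReq(Seqno=1, VerticalOffset=10.0, HorizontalOffset=5.0)
    wire = req.to_buffer()  # packed as '>i >i >f >f': Type, Seqno, two offsets
    assert unpack('>i', wire[:4])[0] == MsgTypes.START_FOLLOW_REQ.value
    decoded = StartFollowReq.instance_from_buffer(wire)
    assert decoded.Seqno == req.Seqno
# ControllerAPI builds on this framing: it sends a request, blocks in recv()
# under a 2-second socket timeout, and raises DCException when the reply's
# Seqno does not match, so callers can detect lost or reordered datagrams.
# ----------------------------------------------------------------------------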
+async def stop_follow_tractor(): + try: + status = drone_ctrl_client.StopFollow() + logger.info(status) + except Exception as e: + logger.error(str(e)) + await send_last_drone_cmd(UI_STOP_FOLLOW) + + +def initialize(inbound_queue=None, outbound_queue=None, agent_state=None): + global fluidity_mechanism_instance + + logger.debug("Initializing drone controller mechanism") + + queues["inbound"] = inbound_queue + queues["outbound"] = outbound_queue + + +async def apply(plan): + try: + # AUG ADDITIONS -- START + if "start_follow_tractor" in plan and plan["start_follow_tractor"]: + logger.info('Sending start follow tractor msg to drone controller.') + mlsClient.pushLogInfo('DRONE_COMMAND: START FOLLOW') + # NOTE: We should wait until the drone controller's status is 'FOLLOWING' before we deploy the drone app. + # We need this because we do not want the drone app to send invalid results based on an irrelevant region. + if DRONE_CONTROLLER_DISABLE.lower() == "false": + await start_follow_tractor() + elif "stop_follow_tractor" in plan and plan["stop_follow_tractor"]: + logger.info('Sending stop follow tractor msg to drone controller.') + mlsClient.pushLogInfo('DRONE_COMMAND: STOP FOLLOW') + if DRONE_CONTROLLER_DISABLE.lower() == "false": + await stop_follow_tractor() + except Exception as e: + logger.error("Error sending command to the drone controller") + logger.error(traceback.format_exc()) + + return False + +def get_state(): + return {} + + +def get_options(): + return {} \ No newline at end of file diff --git a/agents/cluster/mechanisms/fluidity.py b/agents/cluster/mechanisms/fluidity.py index 66991ac..850ad91 100644 --- a/agents/cluster/mechanisms/fluidity.py +++ b/agents/cluster/mechanisms/fluidity.py @@ -31,19 +31,22 @@ from mlsysops import MessageEvents import mlsysops from mlsysops.logger_util import logger +import mlstelemetry +import time queues = {"inbound": None, "outbound": None} class FluidityMechanism: - state: Dict = field(default_factory=dict) inbound_queue = None outbound_queue = None internal_queue_inbound = None internal_queue_outbound = None state = None + relocation_state = {} fluidity_proxy_plans = {} + mls_client = mlstelemetry.MLSTelemetry("cluster_agent", "fluidity_mechanism") def __init__(self, mlsysops_inbound_queue=None, mlsysops_outbound_queue=None, agent_state=None): self.inbound_queue = mlsysops_inbound_queue @@ -108,7 +111,7 @@ async def internal_message_listener(self): # Listen to fluidity messages message = await self.internal_queue_inbound.get() - # Log or save message for debugging + # # Log or save message for debugging with open("fluidity_dump.json", "w") as file: file.write(json.dumps(message, skipkeys=True, indent=4, default=str, ensure_ascii=False, sort_keys=True, separators=(',', ': '))) @@ -230,12 +233,14 @@ async def internal_message_listener(self): if app_name: # Build or merge components into the application components_data = {} + logger.debug(f"Building components for application: {components}") for component in components: metadata = component.get("metadata", {}) metadata_name = metadata.get("name") metadata_uid = metadata.get("uid") qos_metrics = component.get("qos_metrics", []) - if metadata_name and metadata_uid: + + if metadata_name: components_data[metadata_name] = { "uid": metadata_uid, "qos_metrics": qos_metrics, @@ -259,7 +264,32 @@ async def internal_message_listener(self): payload = message.get("payload", {}) app_name = payload.get("name") del self.state["applications"][app_name] + elif event == MessageEvents.POD_DELETED.value: + # Handle pod deleted event +
payload = message.get("payload", {}) + pod_spec = payload.get("spec", {}) + pod_metadata = pod_spec.get("metadata", {}) + pod_labels = pod_metadata.get("labels", {}) + pod_state = pod_spec.get("status", {}).get("phase", "Unknown") + node_name = pod_spec.get("spec", {}).get("node_name", None) + # Extract labels + app_name = pod_labels.get("mlsysops.eu/app") + component_name = pod_labels.get("mlsysops.eu/component") + if app_name and app_name in self.state["applications"] and component_name: + # Update the application component state + app = self.state["applications"][app_name] + components = app.get("components", {}) + + components.setdefault(component_name, {}).update({ + "labels": pod_labels, + "state": "Deleted", + "node_placed": None + }) + + # Test log + logger.test( + f"|10| Fluidity mechanism planuid:{pod_labels.get('mlsysops.eu/planUID', '-')} pod deleted status:Success") elif event == MessageEvents.POD_MODIFIED.value: # Handle pod modified event payload = message.get("payload", {}) @@ -273,6 +303,7 @@ async def internal_message_listener(self): app_name = pod_labels.get("mlsysops.eu/app") component_name = pod_labels.get("mlsysops.eu/component") component_uid = pod_labels.get("mlsysops.eu/componentUID") + plan_uid = pod_labels.get("mlsysops.eu/planUID", None) if app_name and app_name in self.state["applications"] and component_name: # Update the application component state @@ -285,6 +316,36 @@ async def internal_message_listener(self): "node_placed": node_name }) + if plan_uid and plan_uid in self.relocation_state and component_name in self.relocation_state[ + plan_uid]: + logger.debug(f"Pod name {pod_metadata['name']}") + logger.debug(f"Pod state {pod_state}") + logger.debug(f"Host name {node_name}") + + start_timestamp = self.relocation_state[plan_uid][component_name]['start'] + # Get timestamp of modification event of the new pod + curr_timestamp = time.perf_counter() + diff = curr_timestamp - start_timestamp + + # If the new Pod is deployed successfully on the new host, record the delay + if pod_state == 'Running' and node_name == self.relocation_state[plan_uid][component_name][ + 'dst']: + logger.info(f"Relocation delay is {diff}") + self.mls_client.pushMetric("relocation_delay", "gauge", diff, + {"comp_name": component_name}) + logger.debug(f"Removing {component_name} from relocation state of plan {plan_uid}") + + # Remove entry from dictionary + del self.relocation_state[plan_uid][component_name] + if not self.relocation_state[plan_uid]: + logger.debug(f"relocation_state for plan {plan_uid} is empty.") + del self.relocation_state[plan_uid] + # If the Pod is in Pending state, we measure the delay until the call to kubernetes is done + elif pod_state == 'Pending' and node_name == \ + self.relocation_state[plan_uid][component_name]['dst']: + logger.debug(f"New Pod start delay is {diff}") + self.mls_client.pushMetric("deployment_delay", "gauge", diff, + {"comp_name": component_name}) # Test log logger.test( f"|3| Fluidity mechanism planuid:{pod_labels.get('mlsysops.eu/planUID','-')} pod modification status:Success") @@ -311,6 +372,7 @@ async def internal_message_listener(self): logger.warning(f"Node name is missing for component '{component_name}' in app '{app_name}'.") else: logger.warning("Invalid or missing app/component labels in pod_modified event payload.") + except CancelledError: logger.debug("Cancelled error in internal_message_listener") break @@ -333,26 +395,26 @@ def initialize(inbound_queue=None, outbound_queue=None, agent_state=None): async def apply(plan):
global fluidity_mechanism_instance - try: - # Validate the payload using Pydantic - FluidityPlanPayload(**plan) - except ValidationError as e: - # Print validation errors if any - logger.error(f"Plan Validation failed: {e}") - - msg = { - "event": MessageEvents.PLAN_EXECUTED.value, - 'payload': { - 'name': plan['plan_uid'], - } - } - - # forward the message to MLS agent - await fluidity_mechanism_instance.inbound_queue.put(msg) - logger.test(f"|1| Fluidity mechanism planuid:{plan['plan_uid']} failed validation status:Failed") - logger.test(plan) - logger.test(e) - return + # try: + # # Validate the payload using Pydantic + # FluidityPlanPayload(**plan) + # except ValidationError as e: + # # Print validation errors if any + # logger.error(f"Plan Validation failed: {e}") + # + # msg = { + # "event": MessageEvents.PLAN_EXECUTED.value, + # 'payload': { + # 'name': plan['plan_uid'], + # } + # } + # + # # forward the message to MLS agent + # await fluidity_mechanism_instance.inbound_queue.put(msg) + # logger.test(f"|1| Fluidity mechanism planuid:{plan['plan_uid']} failed validation status:Failed") + # logger.test(plan) + # logger.test(e) + # return msg = { "event": MessageEvents.PLAN_SUBMITTED.value, @@ -361,6 +423,28 @@ async def apply(plan): try: await fluidity_mechanism_instance.internal_queue_outbound.put(msg) logger.test(f"|1| Fluidity mechanism forwarded planuid:{plan['plan_uid']} to Fluidity status:True") + for comp in plan['deployment_plan']: + if comp == 'initial_plan': + continue + # If 'move' action in a plan for a given component, + # Store the new plan info (plan_uid, plan creation timestamp, component names, src/dst nodes) + for action_entry in plan['deployment_plan'][comp]: + if action_entry['action'] == 'move': + if plan['plan_uid'] not in fluidity_mechanism_instance.relocation_state: + logger.debug(f"Creating entry for plan_uid {plan['plan_uid']}") + fluidity_mechanism_instance.relocation_state[plan['plan_uid']] = {} + + curr_timestamp = time.perf_counter() + + src = action_entry['src_host'] + dst = action_entry['target_host'] + + logger.debug(f"Found move action for component {comp} from {src} to {dst}") + fluidity_mechanism_instance.relocation_state[plan['plan_uid']][comp] = { + 'start': curr_timestamp, + 'src': src, + 'dst': dst + } except Exception as e: logger.debug("Error in sending message to fluidity") diff --git a/agents/cluster/mls-test-manage.yml b/agents/cluster/mls-test-manage.yml new file mode 100644 index 0000000..60da9f5 --- /dev/null +++ b/agents/cluster/mls-test-manage.yml @@ -0,0 +1,19 @@ +apiVersion: v1 +clusters: +- cluster: + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkekNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdGMyVnkKZG1WeUxXTmhRREUzTlRnek5UQTRPVGN3SGhjTk1qVXdPVEl3TURZME9ERTNXaGNOTXpVd09URTRNRFkwT0RFMwpXakFqTVNFd0h3WURWUVFEREJock0zTXRjMlZ5ZG1WeUxXTmhRREUzTlRnek5UQTRPVGN3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFTaUFSTGlnZEdLeHhHY3B1YUZ5SjhOWmVHRC9ZZW5Jdjc5RHI5Qk1YK08KUHdLRDFwWXYxUDlHZ0MwQjd5LysxaVFZdzlWVk5QL1hPenpJeVlwSVJKWVNvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVWJmUmFVdXdKU0RVNXBEY0pXajFuCmtNTWwvRFl3Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUlnTENzMDRkbTJFeFFyNHd4TnpLQ1pBS1ZTSWFIYmZVVWMKRXI0Z0lHNUU1K0lDSVFEVitRajFKSHdSTHRYTWtud0RVcWhuekdPS2h1TTU0aGJOcGc2eWVVV2toQT09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + server: https://10.64.83.230:6443 + name: mls-test-manage +contexts: +- context: + cluster: mls-test-manage + user: mls-test-manage + name: mls-test-manage 
+current-context: mls-test-manage +kind: Config +preferences: {} +users: +- name: mls-test-manage + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrVENDQVRlZ0F3SUJBZ0lJZStNWkJtdUtJUlV3Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOelU0TXpVd09EazNNQjRYRFRJMU1Ea3lNREEyTkRneE4xb1hEVEkyTURreQpNREEyTkRneE4xb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJFT3FQdkRVMzJZbWlmb1MKV0Zrd2tWZEU3Z0dBQVVGZ25zODdOOWcwUzJoMUxrSXJRd3VmRFl6VnpHb1VDQjZ3Q3Ftak9PN2V3ZEd3enBTeAprNHp5RXlxalNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCVDhPM1hiVnFNL2UwNE82dUJLK2l2TXRwcjlIekFLQmdncWhrak9QUVFEQWdOSUFEQkYKQWlFQXREV1JpSWRGclNCOXVuVW1UNGl0Qm0waTI3bCtISVlFT0tZd1VWb0t4cjhDSUJ3clBCM2ZmenRwdk9FSQplQm1NSmZ4WUQrOW16aGt2bjVVajlOeFRNNitwCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJlRENDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTlRnek5UQTRPVGN3SGhjTk1qVXdPVEl3TURZME9ERTNXaGNOTXpVd09URTRNRFkwT0RFMwpXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTlRnek5UQTRPVGN3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFSUUQ5RWN4RCs0QkJISEE0NTZZd3ViT0NFYWsvVnZFdTRwaTFnbUhOaU8KdlFkZ2VyY0QzeGRPQmFwc0lXemE3TG9EU0V2QUVuNjRuUEcxM1JzMVNlZlJvMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVS9EdDEyMWFqUDN0T0R1cmdTdm9yCnpMYWEvUjh3Q2dZSUtvWkl6ajBFQXdJRFNRQXdSZ0loQU5ZWW1sUGtoa3U0YWNKQVZWSTc4Wi9TN0hXTXNUWk4KN3dpN3ZjajkzUkZVQWlFQThkOWpHUDNmUS9pTjlXVWJ4NEVLM2gxN3QvODZXNnFYaTJiT0tQY0pxdDA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUMzMDN2MGFnRkpoMmwxenlwTGxTdjVmdmNMS29meEhoUXhZTGJydHZnTmZvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFUTZvKzhOVGZaaWFKK2hKWVdUQ1JWMFR1QVlBQlFXQ2V6enMzMkRSTGFIVXVRaXREQzU4TgpqTlhNYWhRSUhyQUtxYU00N3Q3QjBiRE9sTEdUalBJVEtnPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo= \ No newline at end of file diff --git a/agents/cluster/policies/Ubiwhere_FSM.yaml b/agents/cluster/policies/Ubiwhere_FSM.yaml new file mode 100644 index 0000000..4acd945 --- /dev/null +++ b/agents/cluster/policies/Ubiwhere_FSM.yaml @@ -0,0 +1,77 @@ +statechart: + name: Cluster Agent FSM + description: States of the cluster agent. 
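# ----------------------------------------------------------------------------
# Editor's note (not part of the original patch): operational summary of the
# transitions defined below, as currently uncommented:
#   S1 (idle)        --lamppost_appears-->              S2, start_image
#   S2 (imaging)     --noise_LP-->                      S5, stop_image
#   S2               --lamppost_disappears-->           S1, stop_image
#   S5 (noise only)  --noise_inference_from_LP_to_HP--> S2, start_image
#   S5               --lamppost_disappears-->           S1, NONE
# The commented-out S3/S4 states and the empty trailing S6 appear to be
# vestiges of an earlier design.
# ----------------------------------------------------------------------------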
+ preamble: | + timestamp = 0 + root state: + name: App + initial: S1 + + states: + - name: S1 + transitions: + - target: S2 + event: lamppost_appears + action: | + send_command("start_image") + + - name: S2 + transitions: + # - target: S3 + # event: image_LP + # action: | + # send_command("NONE") + - target: S5 + event: noise_LP + action: | + send_command("stop_image") + - target : S1 + event: lamppost_disappears + action: | + send_command("stop_image") + + # - name: S3 + # transitions: + # # - target: S2 + # # event: image_HP + # # action: | + # # send_command("NONE") + # - target: S5 + # event: noise_LP + # action: | + # send_command("stop_image") + # - target: S1 + # event: lamppost_disappears + # action: | + # send_command("stop_image") + + # - name: S4 + # transitions: + # - target: S2 + # event: noise_HP + # action: | + # send_command("NONE") + # - target: S5 + # event: image_LP + # action: | + # send_command("stop_image") + # - target: S1 + # event: lamppost_disappears + # action: | + # send_command("stop_image") + + - name: S5 + transitions: + - target: S1 + event: lamppost_disappears + action: | + send_command("NONE") + - target: S2 + event: noise_inference_from_LP_to_HP + action: | + send_command("start_image") + + + + - name: S6 #return to state 1 + diff --git a/agents/cluster/policies/policy-relocateComponents.py b/agents/cluster/policies/_policy-relocate.py similarity index 58% rename from agents/cluster/policies/policy-relocateComponents.py rename to agents/cluster/policies/_policy-relocate.py index 39f2f3e..d496067 100644 --- a/agents/cluster/policies/policy-relocateComponents.py +++ b/agents/cluster/policies/_policy-relocate.py @@ -8,52 +8,45 @@ import random from mlsysops.logger_util import logger -from mlsysops.utilities import evaluate_condition +from mlsysops.utilities import evaluate_condition, parse_analyze_interval def initialize(): print(f"Initializing policy {inspect.stack()[1].filename}") initialContext = { "telemetry": { - "metrics": ["node_load1"], - "system_scrape_interval": "5s" + "metrics": [], + "system_scrape_interval": "1s" }, - "mechanisms": ["fluidity"], + "mechanisms": ["fita"], "packages": [], "configuration": { "analyze_interval": "5s" }, + "latest_timestamp": None, + "core": False, "scope": "application", + "moving_interval": "30s" } return initialContext async def analyze(context, application_description, system_description, mechanisms, telemetry, ml_connector): + current_timestamp = time.time() - # policy handles single policy, always an array with a single application - application_spec = application_description[0]['spec'] - application_components = application_spec['components'] - - for application_component in application_components: - component_metrics = application_component['qos_metrics'] - for component_metric in component_metrics: - metric_name = component_metric['application_metric_id'] - # Get latest values from telemetry data - try: - latest_telemetry_df = await telemetry['query'](latest=True) - except Exception as e: - continue - component_metric_target = component_metric['target'] - component_measured_metric = latest_telemetry_df[metric_name].values[0] - logger.debug( - f"metric {metric_name} Target {component_metric_target} measurement {component_measured_metric} ") - - if component_measured_metric is None: - continue - - if evaluate_condition(component_metric_target,component_measured_metric, component_metric['relation']): - # even one telemetry metric is not fulfilled, return true - return True, context + # The first time called + 
if context['latest_timestamp'] is None: + context['latest_timestamp'] = current_timestamp + return True, context + + # On every subsequent call, fire only once the moving interval has elapsed + analyze_interval = parse_analyze_interval(context['moving_interval']) + logger.info( + f"{current_timestamp} - {context['latest_timestamp']} = {current_timestamp - context['latest_timestamp']} with interval {analyze_interval}") + + if current_timestamp - context['latest_timestamp'] > analyze_interval: + context['latest_timestamp'] = current_timestamp + return True, context return False, context @@ -61,10 +54,11 @@ async def analyze(context, application_description, system_description, mechanis async def plan(context, application_description, system_description, mechanisms, telemetry, ml_connector): application = application_description[0] - + # check if in the state the client app has been placed # use fluidity state for that components_state = mechanisms['fluidity']['state']['applications'][application_description[0]['name']]['components'] + logger.info('Called plan') context['name'] = application['name'] context['spec'] = application['spec'] @@ -74,18 +68,15 @@ async def plan(context, application_description, system_description, mechanisms, plan_result['deployment_plan'] = {} for component in application['spec']['components']: + logger.debug(f'component: {component}') comp_name = component['metadata']['name'] node_placement = component.get("node_placement") - if node_placement: - node_name = node_placement.get("node", None) - if node_name: # static placed component, do not touch - continue - current_node_placed = components_state[comp_name]['node_placed'] if current_node_placed is not None: # component is placed, move it to another - available_nodes = [node for node in system_description['MLSysOpsCluster']['nodes'] if node != current_node_placed] + available_nodes = [node for node in system_description['MLSysOpsCluster']['nodes'] if node != current_node_placed and "b1" in node] + node_to_place = random.choice(available_nodes) new_component_plan = { diff --git a/agents/cluster/policies/policy-changeCompSpec.py b/agents/cluster/policies/policy-changeCompSpec.py deleted file mode 100644 index 6399256..0000000 --- a/agents/cluster/policies/policy-changeCompSpec.py +++ /dev/null @@ -1,194 +0,0 @@ -"""Plugin module for custom policies - notify function.""" -from __future__ import print_function -import copy -import logging -import inspect -from itertools import cycle -import random -import time -import re -from mlsysops.logger_util import logger - - -def parse_analyze_interval(interval: str) -> int: - """ - Parses an analyze interval string in the format 'Xs|Xm|Xh|Xd' and converts it to seconds. - - Args: - interval (str): The analyze interval as a string (e.g., "5m", "2h", "1d"). - - Returns: - int: The interval in seconds. - - Raises: - ValueError: If the format of the interval string is invalid.
- """ - # Match the string using a regex: an integer followed by one of s/m/h/d - match = re.fullmatch(r"(\d+)([smhd])", interval) - if not match: - raise ValueError(f"Invalid analyze interval format: '{interval}'") - - # Extract the numeric value and the time unit - value, unit = int(match.group(1)), match.group(2) - - # Convert to seconds based on the unit - if unit == "s": # Seconds - return value - elif unit == "m": # Minutes - return value * 60 - elif unit == "h": # Hours - return value * 60 * 60 - elif unit == "d": # Days - return value * 24 * 60 * 60 - else: - raise ValueError(f"Unsupported time unit '{unit}' in interval: '{interval}'") - - -spec_changes = cycle([ - # {'runtime_class_name': cycle(['crun', 'nvidia'])}, - {'image': cycle(['harbor.nbfc.io/mlsysops/test-app:sha-90e0077', 'harbor.nbfc.io/mlsysops/test-app:latest'])}, - {'platform_requirements': { - 'cpu': { - 'requests': '', # in m - 'limits': '' # in m - }, - 'memory': { - 'requests': '', # in Mi - 'limits': '' # in Mi - } - } - } -]) - - -def initialize(): - print(f"Initializing policy {inspect.stack()[1].filename}") - - initialContext = { - "telemetry": { - "metrics": ["node_load1"], - "system_scrape_interval": "1s" - }, - "mechanisms": [ - "fluidity_proxy" - ], - "packages": [], - "configuration": { - "analyze_interval": "30s" - }, - "latest_timestamp": None, - "scope": "application", - "curr_comp_idx": 0, - "current_placement": None, - "initial_deployment_finished": False, - "moving_interval": "300s", - "dynamic_placement_comp": None - } - - return initialContext - - -def get_curr_container_img(comp_name, context): - for comp in context['spec']['components']: - if comp_name != comp['metadata']['name']: - continue - - container = comp.get("containers")[0]['image'] - - return container - - -async def analyze(context, application_description, system_description, mechanisms, telemetry, ml_connector): - current_timestamp = time.time() - - # The first time called - if context['latest_timestamp'] is None: - context['latest_timestamp'] = current_timestamp - return True, context - - # All the next ones, get it - analyze_interval = parse_analyze_interval(context['moving_interval']) - logger.info(f"{current_timestamp} - {context['latest_timestamp']} = {current_timestamp - context['latest_timestamp']} with interval {analyze_interval}") - - if current_timestamp - context['latest_timestamp'] > analyze_interval: - context['latest_timestamp'] = current_timestamp - return True, context - - return True, context - - -async def plan(context, application_description, system_description, mechanisms, telemetry, ml_connector): - plan_result = {} - plan_result['deployment_plan'] = {} - application = application_description[0] - description_changed = False - change_idx = cycle([0, 1, 2]) - curr_change = next(spec_changes) - - #logger.info(f'Curr change is {curr_change}') - component = application['spec']['components'][0] - comp_name = component['metadata']['name'] - # logger.info(f'component spec {component}') - if 'node_placement' in component and 'node' in component['node_placement']: - node = component['node_placement']['node'] - logger.info(f'Found static placement on {node} for comp {comp_name}') - else: - node = system_description['MLSysOpsCluster']['nodes'][0] - logger.info(f'Randomly select host {node} for {comp_name}') - - plan_result['deployment_plan'][comp_name] = [] - - for key in curr_change: - logger.info(f"key is {key}") - # logger.info(f"curr_change[key] is {curr_change[key]}") - # logger.info(f"next(curr_change[key]) is 
{next(curr_change[key])}") - if key == 'runtime_class_name': - component[key] = next(curr_change[key]) - else: - for container in component['containers']: - - if key == 'image': - container[key] = next(curr_change[key]) - continue - - request_cpu = str(random.randint(0, 300)) - limit_cpu = str(random.randint(301, 400)) - cpu_suffix = 'm' - - request_mem = str(random.randint(0, 300)) - limit_mem = str(random.randint(301, 400)) - mem_suffix = 'Mi' - logger.info(f'request_cpu+cpu_suffix {request_cpu+cpu_suffix}') - - if key not in container or 'cpu' not in container[key] or 'memory' not in container[key]: - container[key] = { - 'cpu': { - 'requests': '', - 'limits': '' - }, - 'memory': { - 'requests': '', - 'limits': '' - } - } - - container[key]['cpu']['requests'] = request_cpu+cpu_suffix - container[key]['cpu']['limits'] = limit_cpu+cpu_suffix - - container[key]['memory']['requests'] = request_mem+mem_suffix - container[key]['memory']['limits'] = limit_mem+mem_suffix - - plan_result['deployment_plan'][comp_name].append({'action': 'change_spec', 'new_spec': component, 'host': node}) - logger.info(f"Applying change type {key} to comp {comp_name}") - - - if plan_result: - plan_result['name'] = application['name'] - # This policy will only take effect after initial deployment is done. - plan_result['deployment_plan']['initial_plan'] = False - - new_plan = { - "fluidity": plan_result - } - - return new_plan, context diff --git a/agents/cluster/policies/policy-ubiw-cluster.py b/agents/cluster/policies/policy-ubiw-cluster.py new file mode 100644 index 0000000..3195838 --- /dev/null +++ b/agents/cluster/policies/policy-ubiw-cluster.py @@ -0,0 +1,597 @@ +# """Plugin module for custom policies - notify function.""" +from __future__ import print_function +import pprint +import copy +import logging +import os +import time +import inspect +import random + +from mlstelemetry import MLSTelemetry + +from mlsysops.logger_util import logger + +mlsClient = MLSTelemetry("fluidity_mechanism", "ubiwhere_policy") + +from sismic.io import import_from_yaml +from sismic.model import Statechart +from sismic.interpreter import Interpreter + +analyze_counter = 0 + +class ClusterAgentFSM: + internalState = None + stateInterperter = None + + def __init__(self, name, command_handler): + self.name = name + + self.internalState = import_from_yaml(filepath="policies/Ubiwhere_FSM.yaml") + assert isinstance(self.internalState, Statechart) + + # Create an interpreter for this statechart + self.stateInterperter = Interpreter(self.internalState, initial_context={'send_command':command_handler}) + initStep = self.stateInterperter.execute_once() + + def __str__(self): + return f"{self.name}: {self.internalState}" + + def print_status(self, step): + # possible values: ['event', 'transitions', 'entered_states', 'exited_states', 'sent_events'] + return_str = "" + previous_s = [] + actual_s = [] + for attribute in ['event', 'transitions', 'entered_states', 'exited_states', 'sent_events']: + print('{} current state: {}'.format(attribute, getattr(step, attribute))) + return_str = return_str + '{} current state: {}'.format(attribute, getattr(step, attribute)) + + actual_s = getattr(step, 'entered_states') + previous_s = getattr(step, 'exited_states') + if previous_s == []: + previous_s = None + else: + previous_s = previous_s[0] + if actual_s == []: + actual_s = None + else: + actual_s = actual_s[0] + return return_str,previous_s,actual_s + + def get_status(self): + if len(self.stateInterperter.configuration) > 1: + 
print(self.stateInterpreter.configuration) + return self.stateInterpreter.configuration[1] + else: + return "removed" + + def set_event(self, event): + step = self.stateInterpreter.queue(event).execute_once() + return self.print_status(step) + + +logger = logging.getLogger(__name__) +current_command = '' +# TODO: To be removed -- START +test_counter = 0 +# NOTE: To be removed -- END +noise_threshold = 10 +camera_threshold = 11 + +def update_command(command): + global current_command + + logger.debug('in update_command, prev_cmd %s', current_command) + logger.debug('in update_command, curr_cmd %s', command) + + current_command = command + +fsm = ClusterAgentFSM('ubi-app', update_command) + + +def generate_event(status, snapshot): + event = "" + if status == "S1": # Both NAVL + if snapshot['L'] == "AVL": #S2 + return "lamppost_appears" + else: + return "No event" + elif status == "S2": # T NAVL D AVL + if snapshot['L'] == "NAVL": #S1 + return "lamppost_disappears" + # elif snapshot['I'] == "LP": #S3 + # return "image_LP" + elif snapshot['N'] == "LP": #S5 + return "noise_LP" + else: + return "No event" + # elif status == "S3": # T OK D AVL + # if snapshot['L'] == "NAVL": # S1 + # return "lamppost_disappears" + # elif snapshot['I'] == "HP": # S2 + # return "image_HP" + # elif snapshot['N'] == "LP": # S5 + # return "noise_LP" + # else: + # return "No event" + # elif status == "S4": # T OK D NAVL + # if snapshot['L'] == "NAVL": # S1 + # return "lamppost_disappears" + # elif snapshot['N'] == "HP": # S2 + # return "image_HP" + # elif snapshot['I'] == "LP": # S5 + # return "image_LP" + # else: + # return "No event" + elif status == "S5": # T NOK D NAVL + if snapshot['L'] == "NAVL": # S1 + return "lamppost_disappears" + elif snapshot['N'] == "HP": # S2 + return "noise_inference_from_LP_to_HP" + else: + return "No event" + + + + +def parse_analyze_interval(interval: str) -> int: + """ + Parses an analyze interval string in the format 'Xs|Xm|Xh|Xd' and converts it to seconds. + + Args: + interval (str): The analyze interval as a string (e.g., "5m", "2h", "1d"). + + Returns: + int: The interval in seconds. + + Raises: + ValueError: If the format of the interval string is invalid.
+ """ + # Match the string using a regex: an integer followed by one of s/m/h/d + match = re.fullmatch(r"(\d+)([smhd])", interval) + if not match: + raise ValueError(f"Invalid analyze interval format: '{interval}'") + + # Extract the numeric value and the time unit + value, unit = int(match.group(1)), match.group(2) + + # Convert to seconds based on the unit + if unit == "s": # Seconds + return value + elif unit == "m": # Minutes + return value * 60 + elif unit == "h": # Hours + return value * 60 * 60 + elif unit == "d": # Days + return value * 24 * 60 * 60 + else: + raise ValueError(f"Unsupported time unit '{unit}' in interval: '{interval}'") + + +def initialize(): + print(f"Initializing policy {inspect.stack()[1].filename}") + + initialContext = { + "telemetry": { + "metrics": ["node_app_prediction"], + "system_scrape_interval": "1s" + }, + "mechanisms": [ + "fluidity" + ], + "packages": ["sismic"], + "configuration": { + "analyze_interval": "15s" + }, + "noise_app_deployed": False, + "noise_app_init_deployment": True, + "lamppost_ready": False, + "lamppost_hostname": None, + "camera_app_deployed": False, + "latest_timestamp": None, + "core": True, + "scope": "application", + "current_placement": None, + "initial_deployment_finished": False, + "moving_interval": "30s", + "dynamic_placement_comp": None + } + + return initialContext + +def get_first_node(cluster_description): + return cluster_description['nodes'][0] + + + +async def get_metric(metric_name, telemetry): + # Get latest values from telemetry data + component_measured_metric = None + try: + latest_telemetry_df = await telemetry['query'](latest=True) + #latest_telemetry_df = telemetry['query'](latest=True) + component_measured_metric = latest_telemetry_df[metric_name].values[0] + logger.debug(f"metric {metric_name} measurement {component_measured_metric} ") + except Exception as e: + logger.error(f"Failed to get metric {metric_name}") + + return component_measured_metric + + + +""" Plugin function to implement the initial deployment logic. +""" +async def initial_plan(context, app_desc, system_description, mechanisms, telemetry): + # NOTE: The parsing of app_desc which the fluidity handles, should be moved here. + logger.error('initial deployment phase') + global current_command + global fsm + global analyze_counter + analyze_counter = 0 + + components_state = mechanisms['fluidity']['state']['applications'][context['name']]['components'] + + context['component_names'] = [] + for component in app_desc['spec']['components']: + comp_name = component['metadata']['name'] + context['component_names'].append(comp_name) + if 'noise' in comp_name and comp_name in components_state: + context["noise_app_deployed"] = components_state[comp_name]['node_placed'] + elif 'proxy-cv' in comp_name and comp_name in components_state: + context["camera_app_deployed"] = components_state[comp_name]['node_placed'] + context["lamppost_hostname"] = component['node_placement']['node'] + + #logger.error('initial deployment phase ', app_desc) + logger.error(f"component_names {context['component_names']}") + + context['FSM'] = fsm + context['name'] = app_desc['name'] + context['spec'] = app_desc['spec'] + + logger.error('app_desc[spec] %s', app_desc['spec']) + snapshot = { + 'I': 'None', # Image-based detection component + 'N': 'None', # Noise-based detection component + 'L': 'None' # Lamppost node. 
+ } + plan = {} + + for node_name in mechanisms['fluidity']['state']['nodes']: + #logger.error(f"mechanisms for node {node_name}: {mechanisms['fluidity']['state']['nodes'][node_name]}") + + node = mechanisms['fluidity']['state']['nodes'][node_name] + if 'spec' not in node: + continue + if node_name == context["lamppost_hostname"]: + context["lamppost_ready"] = node['ready'] + if 'labels' not in node['spec'] or 'node-type:lamppost' not in node['spec']['labels']: + continue + if 'node-type:lamppost' in node['spec']['labels']: + context["lamppost_ready"] = node['ready'] + + logger.error(f"context['lamppost_ready'] {context['lamppost_ready']}") + + if context["lamppost_hostname"] == None: + logger.error('Did not find any candidate nodes. Going to return.') + return plan + if not context["lamppost_ready"] : + snapshot['L'] = 'NAVL' + logger.error('Lamppost is not ready. Going to return false.') + return plan + else: + logger.error('Lamppost is ready. ') + snapshot['L'] = 'AVL' + + try: + status = context["FSM"].get_status() + logger.error(f"Got status {status}") + event = generate_event(status, snapshot) + logger.error(f"event {event}") + context['prev_snapshot'] = snapshot + logger.error(f"prev_snapshot {context['prev_snapshot']}") + # if event != "" and event != "No event": + # mlsClient.pushLogInfo("TR Event: " + event) + # logger.error(f"TR Event: {event}") + context['transition_result'], context['previous_state'], context['actual_state'] = context["FSM"].set_event(event) + if context['previous_state'] == None: + logger.error('previous_state is None') + # if context['actual_state'] != None and context['previous_state'] != None: + # mlsClient.pushLogInfo("TR: " + context['previous_state'] + " -> " + context['actual_state']) + + status = context["FSM"].get_status() + logger.error(f"Got new status {status}") + # if event != "" and event != None: + # mlsClient.pushLogInfo("Caught event: " + event) + # mlsClient.pushLogInfo("CS: " + status) + except Exception as e: + logger.error(f"Caught exception {e}") + + + cmd = current_command + logger.error(f"cmd is {cmd}") + if cmd == "NONE": + logger.error('Empty cmd.') + #elif cmd == 'start_image_start_noise': + elif cmd == 'start_image' and not context["camera_app_deployed"]: + plan = {} + for component in context['component_names']: + plan[component] = [{'action': 'deploy', 'host': context["lamppost_hostname"]}] + #logger.error(f"Adding deploy for {plan[component]}") + context['initial_deployment_finished'] = True + logger.error('Initial plan returning plan %s' % plan) + logger.error(status) + return plan + + +async def analyze(context, application_description, system_description, mechanisms, telemetry, ml_connector): + global current_command + global noise_threshold + global camera_threshold + + application = application_description[0] + context['name'] = application['name'] + components_state = mechanisms['fluidity']['state']['applications'][context['name']]['components'] + + if 'initial_deployment_finished' in context and context['initial_deployment_finished'] == False: + return True, context + + for component in application['spec']['components']: + comp_name = component['metadata']['name'] + if 'proxy-cv' in comp_name and comp_name in components_state: + context["camera_app_deployed"] = components_state[comp_name]['node_placed'] + + + for node_name in mechanisms['fluidity']['state']['nodes']: + #logger.error(f"mechanisms for node {node_name}: {mechanisms['fluidity']['state']['nodes'][node_name]}") + + node = 
mechanisms['fluidity']['state']['nodes'][node_name] + if 'spec' not in node: + continue + if node_name == context["lamppost_hostname"]: + context["lamppost_ready"] = node['ready'] + if 'labels' not in node['spec'] or 'node-type:lamppost' not in node['spec']['labels']: + continue + if 'node-type:lamppost' in node['spec']['labels']: + context["lamppost_ready"] = node['ready'] + + snapshot = { + 'I': 'None', + 'N': 'None', + 'L': 'None' + } + # TODO: To be removed -- START + # time.sleep(5) + # global analyze_counter + # analyze_counter +=1 + # logger.error(f"analyze_counter {analyze_counter}") + + # if analyze_counter % 6 == 0: + # logger.error(f"returning true") + # return True, context + + # return False, context + + # NOTE: To be removed -- END + logger.error('Prev snapshot was %s', context['prev_snapshot']) + #if curr_deployment['proxy-cv'][0]['status'] == 'ACTIVE': + if context["camera_app_deployed"]: + snapshot['I'] = 'RUN' + + #if curr_deployment['noise-detection-app'][0]['status'] == 'ACTIVE': + snapshot['N'] = 'RUN' + + status = context["FSM"].get_status() + + #camera_host = curr_deployment['proxy-cv'][0]['name'] + camera_host = context["lamppost_hostname"] + #noise_host = curr_deployment['noise-detection-app'][0]['name'] + noise_host = camera_host + #collector_host = curr_deployment['collector-app'][0]['name'] + #logger.error('camera_host: %s, noise_host: %s, collector_host: %s', camera_host, noise_host, collector_host) + logger.error(f"camera_host: {camera_host}, noise_host: {noise_host}, lamposthostname {context['lamppost_hostname']}") + + # If the noise app is running, retrieve the app-level metric. + # if curr_deployment['noise-detection-app'][0]['status'] == 'ACTIVE' and get_node_availability(noise_host, + # nodes['k8snodes']): + #if get_node_availability(noise_host, nodes['k8snodes']): + if context["lamppost_ready"]: + #noise_prediction = mlsClient.get_metric_value_with_label(metric_name="node_app_prediction") + #noise_prediction = await get_metric("node_app_prediction", telemetry) + + metric_object = mlsClient.get_metric_value_with_label(metric_name="node_app_prediction",label_name="node_name", label_value=context["lamppost_hostname"]) + if metric_object: + noise_prediction = metric_object[0].get("value", 0) + logger.error(f"App {application['name']} received metric {noise_prediction} from host {context['lamppost_hostname']}") + else: + noise_prediction = None + + if noise_prediction == None or noise_prediction == 0: #or noise_prediction['value'] == 'NA': + logger.error('Did not receive any metrics from noise-app yet.') + snapshot['N'] = 'RUN' + else: + logger.error('App noise detection prediction is %s', noise_prediction) + #if noise_prediction['value'] >= noise_threshold: + + noise_prediction = float(noise_prediction) + random.uniform(-1, 1) + logger.error('MODIFIED App noise detection prediction is %s', noise_prediction) + if noise_prediction >= noise_threshold: + logger.error(f"Detected HIGH probability {noise_prediction}") + snapshot['N'] = 'HP' + else: + logger.error(f"Detected LOW probability {noise_prediction}") + snapshot['N'] = 'LP' + + snapshot['I'] = 'RUN' + # If the camera app is running, retrieve the app-level metric. 
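# --------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original patch.
# get_metric_value_with_label() is used in this file as if it returns a list
# of samples shaped like [{"value": <number>, ...}]; that shape is inferred
# from the surrounding code, not from mlstelemetry documentation. A
# defensive extraction helper in the same spirit:
#
#     def latest_labeled_value(client, metric, label, value):
#         samples = client.get_metric_value_with_label(
#             metric_name=metric, label_name=label, label_value=value)
#         if not samples:  # no data yet; caller keeps its previous state
#             return None
#         try:
#             return float(samples[0].get("value", 0))
#         except (TypeError, ValueError):
#             return None
# --------------------------------------------------------------------------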
+ # if curr_deployment['proxy-cv'][0]['status'] == 'ACTIVE' and get_node_availability(camera_host, + # nodes['k8snodes']): + # camera_prediction = mlsClient.get_metric_value_with_label(metric_name="camera_app_prediction") + # if camera_prediction == None or camera_prediction['value'] == 'NA': + # logger.error('Did not receive any metrics from proxy-cv yet.') + # snapshot['I'] = 'RUN' + # else: + # logger.error('App camera detection prediction is %s', camera_prediction['value']) + # if camera_prediction['value'] >= camera_threshold: + # snapshot['I'] = 'HP' + # else: + # snapshot['I'] = 'LP' + + #if get_node_availability(noise_host, nodes['k8snodes']) == False: + if not context["lamppost_ready"]: + snapshot['L'] = 'NAVL' + else: + snapshot['L'] = 'AVL' + + # logger.error('Previous snapshot %s', context['prev_snapshot']) + # logger.error('Current snapshot %s', snapshot) + + logger.error('Current status %s', status) + event = generate_event(status, snapshot) + # mlsClient.pushLogInfo("Previous noise state: " + context['prev_snapshot']['N']) + # mlsClient.pushLogInfo("Previous camera state: " + context['prev_snapshot']['I']) + # mlsClient.pushLogInfo("Previous lamppost state: " + context['prev_snapshot']['L']) + # mlsClient.pushLogInfo("Current noise state: " + snapshot['N']) + # mlsClient.pushLogInfo("Current camera state: " + snapshot['I']) + # mlsClient.pushLogInfo("Current lamppost state: " + snapshot['L']) + # if event != "" and event != "No event": + # mlsClient.pushLogInfo("TR Event: " + event) + + context['prev_snapshot'] = snapshot + logger.error(event) + + context['transition_result'], context['previous_state'], context['actual_state'] = context["FSM"].set_event(event) + if context['previous_state'] == None: + logger.error('previous_state is None') + # if context['actual_state'] != None and context['previous_state'] != None: + # mlsClient.pushLogInfo("TR: " + context['previous_state'] + " -> " + context['actual_state']) + + status = context["FSM"].get_status() + # if event != "" and event != None: + # mlsClient.pushLogInfo("Caught event: " + event) + # mlsClient.pushLogInfo("CS: " + status) + + # logger.error(status) + # time.sleep(10) + if context['previous_state'] != None and context['actual_state'] != None: + return True, context + else: + return False, context + + +async def plan(context, application_description, system_description, mechanisms, telemetry, ml_connector): + # Possible commands to Fluidity + # 1. 'start_image_start_noise' + # 2. 'start_image' + # 3. 'stop_image_stop_noise' + # 4. 'stop_image' + # 5. 
'stop_noise' + + global current_command + # TODO: To be removed -- START + global test_counter + # NOTE: To be removed -- END + # logger.error('Curr deployment: %s', curr_deployment) + # camera_host = curr_deployment['proxy-cv'][0]['name'] + # noise_host = curr_deployment['noise-detection-app'][0]['name'] + # collector_host = curr_deployment['collector-app'][0]['name'] + # logger.error('camera_host: %s, noise_host: %s, collector_host: %s', camera_host, noise_host, collector_host) + + application = application_description[0] + components_state = mechanisms['fluidity']['state']['applications'][context['name']]['components'] + logger.error(f"components_state {components_state}") + + # TODO: Check if this should be moved to else below + #status = context["FSM"].get_status() + plan_result = { + 'deployment_plan': {}, + } + + if 'initial_deployment_finished' in context and context['initial_deployment_finished'] == False: + # NOTE: Check if context is updated properly after the invocation + initial_plan_result = await initial_plan(context, application, system_description, mechanisms, telemetry) + if initial_plan_result: + plan_result['deployment_plan'] = initial_plan_result + plan_result['deployment_plan']['initial_plan'] = True + else: + logger.error(f"INITIAL PLAN FAILED") + logger.error(f"plan_result {plan_result}") + return plan_result, context + else: + plan_result['deployment_plan']['initial_plan'] = False + + cmd = current_command + + # TODO: To be removed -- START + #time.sleep(10) + # temp_result = test_counter % 6 + # # send_command("start_image_start_noise") + # # send_command("stop_image_stop_noise") + # # send_command("stop_image") + # # send_command("stop_noise") + # # send_command("start_image") + # if temp_result == 0: + # cmd = "stop_image" + # elif temp_result == 1: + # cmd = "start_image" + # elif temp_result == 2: + # #cmd = "stop_noise" + # cmd = "NONE" + # elif temp_result == 3: + # cmd = "NONE" + # #cmd = "start_noise" + # elif temp_result == 4: + # cmd = "stop_image" + # #cmd = "stop_image_stop_noise" + # elif temp_result == 5: + # cmd = "start_image" + # #cmd = "start_image_start_noise" + + # test_counter +=1 + # NOTE: To be removed -- END + # Creation of the plan_result as actions to be + # provided to Fluidity's internal mechanism. 
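# --------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original patch.
# How a command reaches this point, using only names defined in this file:
#
#     fsm = ClusterAgentFSM('ubi-app', update_command)  # interpreter in S1
#     fsm.set_event('lamppost_appears')
#     # S1 -> S2; the statechart action calls send_command("start_image"),
#     # i.e. update_command(), which sets the module-global current_command.
#     assert current_command == 'start_image'
#
# plan() then turns current_command into fluidity deploy/remove actions for
# the 'proxy-cv' component on the lamppost host, as the code below shows.
# --------------------------------------------------------------------------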
+ logger.error('cmd is %s', cmd) + if cmd == "NONE": + logger.error('Empty cmd.') + # mlsClient.pushLogInfo("Current app deployment: "+curr_deployment) + #mlsClient.pushLogInfo("Fluidity will execute command: " + cmd) + + for comp in application['spec']['components']: + comp_name = comp['metadata']['name'] + logger.error(comp_name) + # if comp_name == 'noise-detection-app': + # if 'start_noise' in cmd and (comp_name not in components_state or not components_state[comp_name]['node_placed']): + # plan_result['curr_deployment'][comp_name] = { + # 'action': 'deploy', + # 'host': context["lamppost_hostname"] + # } + # elif 'stop_noise' in cmd and (comp_name in components_state and components_state[comp_name]['node_placed']): + # plan_result['curr_deployment'][comp_name] = { + # 'action': 'remove', + # 'host': context["lamppost_hostname"] + # } + if 'proxy-cv' in comp_name: + if 'start_image' in cmd and (comp_name not in components_state or not components_state[comp_name]['node_placed']): + plan_result['deployment_plan'][comp_name] = [{ + 'action': 'deploy', + 'host': context["lamppost_hostname"] + }] + if 'stop_image' in cmd and (comp_name in components_state and components_state[comp_name]['node_placed']): + plan_result['deployment_plan'][comp_name] = [{ + 'action': 'remove', + 'host': context["lamppost_hostname"] + }] + logger.error('Actions to be returned to Fluidity %s' % plan_result) + + logger.error('plan: New plan %s', plan_result) + fluidity_dict = { + 'deployment_plan': plan_result['deployment_plan'], + 'name': context['name'] + } + + new_plan = { + "fluidity": fluidity_dict + } + + + return new_plan, context diff --git a/agents/cluster/requirements.txt b/agents/cluster/requirements.txt index 48c9411..72d56f9 100644 --- a/agents/cluster/requirements.txt +++ b/agents/cluster/requirements.txt @@ -1,4 +1,3 @@ -mlsysops mlstelemetry opentelemetry-exporter-otlp opentelemetry-api @@ -17,4 +16,6 @@ ruamel.yaml jsonschema redis watchdog -deepdiff \ No newline at end of file +deepdiff +dataclass_struct +sismic \ No newline at end of file diff --git a/agents/continuum/CR-newfeature.yaml b/agents/continuum/CR-newfeature.yaml deleted file mode 100644 index bf0ee26..0000000 --- a/agents/continuum/CR-newfeature.yaml +++ /dev/null @@ -1,91 +0,0 @@ -apiVersion: mlsysops.eu/v1 -cluster_placement: - cluster_id: [cluster1] -component_interactions: -- {component_name1: client-app, component_name2: server-app, type: egress} -components: -- containers: - - command: [python, TcpServer.py] - env: - - {name: OTEL_RESOURCE_ATTRIBUTES, value: 'service.name=server-app, service.version=0.0.0, - service.experimentid=test'} - - {name: OTEL_SERVICE_NAME, value: server-app} - - name: NODE_IP - value_from: - field_ref: {field_path: status.hostIP} - - {name: TELEMETRY_ENDPOINT, value: '"$(NODE_IP):43170"'} - image: registry.mlsysops.eu/agent/agents/test_app:0.0.0 - image_pull_policy: IfNotPresent - platform_requirements: - cpu: - architecture: [amd64] - frequency: 1.4 - limits: 500m - requests: 250m - disk: '120' - memory: {limits: 128Mi, requests: 64Mi} - performance_indicator: 30.0 - ports: - - {container_port: 10000, protocol: TCP} - host_network: false - metadata: {name: server-app, uid: a9jwduj9028uje} - node_placement: - continuum_layer: [edge] - qos_metrics: - - application_metric_id: metric-1 - relation: lower_or_equal - system_metrics_hints: [cpu_frequency] - target: 20.0 - restart_policy: OnFailure - sensors: - - camera: {camera_type: rgb, minimum_framerate: 20, model: d455, resolution: 1024x768} - 
endpoint_variable: CAMERA_ENDPOINT - instances: 1.0 -- containers: - - command: [python, TcpClient.py] - env: - - {name: OTEL_RESOURCE_ATTRIBUTES, value: 'service.name=server-app, service.version=0.0.0, - service.experimentid=test'} - - {name: OTEL_SERVICE_NAME, value: server-app} - - name: NODE_IP - value_from: - field_ref: {field_path: status.hostIP} - - {name: TELEMETRY_ENDPOINT, value: '"$(NODE_IP):43170"'} - - {name: TCP_SERVER_IP, value: server-app} - image: registry.mlsysops.eu/agent/agents/test_app:0.0.0 - image_pull_policy: IfNotPresent - platform_requirements: - cpu: - architecture: [arm64] - frequency: 1.4 - limits: 500m - requests: 250m - disk: '100' - gpu: {memory: '2', model: k80, utilization_request: '10'} - memory: {limits: 128Mi, requests: 64Mi} - performance_indicator: 320.0 - ports: - - {container_port: 10000, protocol: TCP} - metadata: {name: client-app, uid: jdaddwewed235uje} - node_placement: - continuum_layer: [edge] - labels: ["'node-type:rpi'"] - qos_metrics: - - {application_metric_id: metric-2, relation: equal, target: 30.0} - restart_policy: OnFailure - sensors: - - endpoint_variable: TEMPERATURE_ENDPOINT - instances: 1.0 - temperature: {accuracy: 0.4, measurement_max: 45.0, measurement_min: 13.0, measurement_unit: celsius, - model: sdc30, sampling_frequency: 10.0} -global_satisfaction: - achievement_weights: - - {metric_id: metric-1, weight: 0.5} - - {metric_id: metric-2, weight: 0.5} - relation: greater_than - threshold: 0.7 -kind: MLSysOpsApp -metadata: {name: newfeature} -permitted_actions: [component_relocation, traffic_redirection, change_container_image, - change_container_runtime_class, change_container_cpu_set, change_container_resource_requirements, - acceleration] diff --git a/agents/continuum/CR-test-application.yaml b/agents/continuum/CR-test-application.yaml deleted file mode 100644 index 646c139..0000000 --- a/agents/continuum/CR-test-application.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: mlsysops.eu/v1 -cluster_placement: - cluster_id: [uth-prod-cluster] -component_interactions: -- {component_name1: client-app, component_name2: server-app, type: egress} -components: -- containers: - - command: [python, TcpServer.py] - env: - - {name: OTEL_RESOURCE_ATTRIBUTES, value: 'service.name=server-app, service.version=0.0.0'} - - {name: OTEL_SERVICE_NAME, value: server-app} - - name: NODE_IP - value_from: - field_ref: {field_path: status.hostIP} - - {name: TELEMETRY_ENDPOINT, value: $(NODE_IP):43170} - - {name: TCP_SERVER_IP, value: 0.0.0.0} - image: harbor.nbfc.io/mlsysops/test-app:latest - image_pull_policy: IfNotPresent - ports: - - {container_port: 10000, protocol: TCP} - metadata: {name: server-app, uid: server-app-v1} - node_placement: {node: mls-ubiw-2} - qos_metrics: - - {application_metric_id: test_received_success_counter_total, relation: lower_or_equal, - target: 20.0} - restart_policy: OnFailure -- containers: - - command: [python, TcpClient.py] - env: - - {name: OTEL_RESOURCE_ATTRIBUTES, value: 'service.name=client-app, service.version=0.0.0'} - - {name: OTEL_SERVICE_NAME, value: client-app} - - name: NODE_IP - value_from: - field_ref: {field_path: status.hostIP} - - {name: TELEMETRY_ENDPOINT, value: $(NODE_IP):43170} - - {name: TCP_SERVER_IP, value: server-app} - image: harbor.nbfc.io/mlsysops/test-app:latest - image_pull_policy: IfNotPresent - ports: - - {container_port: 10000, protocol: TCP} - metadata: {name: client-app, uid: client-app-v1} - qos_metrics: - - {application_metric_id: metric-2, relation: equal, target: 30.0} - 
restart_policy: OnFailure
-global_satisfaction:
-  achievement_weights:
-  - {metric_id: test_received_success_counter, weight: 0.5}
-  - {metric_id: test_sent_success_counter, weight: 0.5}
-  relation: greater_than
-  threshold: 0.7
-kind: MLSysOpsApp
-metadata: {name: test-application}
-permitted_actions: [component_relocation, traffic_redirection, change_container_image,
-  change_container_runtime_class, change_container_cpu_set, change_container_resource_requirements,
-  acceleration]
diff --git a/agents/continuum/Dockerfile b/agents/continuum/Dockerfile
index 5f0cb9c..831eba6 100644
--- a/agents/continuum/Dockerfile
+++ b/agents/continuum/Dockerfile
@@ -4,17 +4,18 @@ FROM harbor.nbfc.io/proxy_cache/library/python:3.10-slim
 # Set up a working directory
 WORKDIR /workdir
 
-# Copy requirements to the image
-COPY requirements.txt /workdir
-
-# Install dependencies from requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
 # Export PYTHONPATH for the working directory
 ENV PYTHONPATH=/workdir
 
 # Copy all application files into the image
-COPY . /workdir
+COPY ./continuum /workdir/continuum
+COPY ./mlsysops /workdir/mlsysops
+
+# Install dependencies from requirements.txt
+RUN pip install --no-cache-dir -r /workdir/mlsysops/requirements.txt
+RUN pip install --no-cache-dir -r /workdir/continuum/requirements.txt
+
+WORKDIR /workdir/continuum
 
 # Default command to start the application
 CMD ["python3", "main.py"]
\ No newline at end of file
diff --git a/agents/continuum/MLSContinuumAgent.py b/agents/continuum/MLSContinuumAgent.py
index 34949d6..7642f34 100644
--- a/agents/continuum/MLSContinuumAgent.py
+++ b/agents/continuum/MLSContinuumAgent.py
@@ -130,7 +130,8 @@ async def apply_propagation_policies(self):
         """
         try:
-            # Extract cluster names where the cluster status is True (ready)
-            cluster_names = [name for name, status in self.clusters.items() if status.lower() == 'true']
+            # Collect all registered cluster names, regardless of readiness status
+            cluster_names = [name for name, status in self.clusters.items()]
+            logger.debug(f"Cluster names: {cluster_names}")
 
             logger.debug(f"Applying PropagationPolicy with cluster names: {cluster_names}")
 
@@ -179,8 +180,29 @@ async def apply_propagation_policies(self):
         except Exception as e:
             logger.error(f"Error applying Simple PropagationPolicy: {e}")
 
+        # Apply resource registry
+        try:
+            name = "mlsysops-resource-registry"
+            simple_template = env.get_template("resource-registry.yaml")
+            rendered_simple_policy = simple_template.render(name=name, cluster_names=cluster_names)
+
+            # Parse YAML to Python dictionary
+            yaml = YAML(typ='safe')
+            simple_policy_body = yaml.load(rendered_simple_policy)
+
+            # Apply the ResourceRegistry
+            await self._apply_resource_registry(
+                name=name,
+                body=simple_policy_body,
+                plural="resourceregistries"
+            )
+
+        except Exception as e:
+            logger.error(f"Error applying resource registries: {e}")
+
     except Exception as e:
         logger.error(f"Error applying PropagationPolicies: {e}")
+        logger.error(traceback.format_exc())
 
 async def _apply_policy(self, policy_name: str, policy_body: dict, plural: str, namespaced: bool = False, namespace: str = None):
     """
@@ -285,6 +307,71 @@ async def _apply_policy(self, policy_name: str, policy_body: dict, plural: str,
         except Exception as e:
             logger.error(f"Error applying resource '{policy_name}': {e}")
 
+    # ... existing code ...
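# Editor's illustration (not part of the patch): the render-then-load step used by
# the resource registry block above, shown standalone. The inline template mirrors
# agents/continuum/templates/resource-registry.yaml from this diff; the cluster
# names are hypothetical.
from jinja2 import Environment, DictLoader
from ruamel.yaml import YAML

_TEMPLATE = """\
apiVersion: search.karmada.io/v1alpha1
kind: ResourceRegistry
metadata:
  name: {{ name }}
spec:
  targetCluster:
    clusterNames:
    {% for cluster in cluster_names %}
    - {{ cluster }}
    {% endfor %}
"""

env = Environment(loader=DictLoader({"resource-registry.yaml": _TEMPLATE}))
rendered = env.get_template("resource-registry.yaml").render(
    name="mlsysops-resource-registry", cluster_names=["cluster-a", "cluster-b"])
body = YAML(typ="safe").load(rendered)  # plain dict, ready for the CustomObjectsApi
assert body["spec"]["targetCluster"]["clusterNames"] == ["cluster-a", "cluster-b"]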
+ async def _apply_resource_registry(self, name: str, body: dict, plural: str): + + try: + # Load the Kubernetes configuration + await kubernetes_asyncio.config.load_kube_config(config_file=self.karmada_api_kubeconfig, context='karmada-apiserver') + + async with kubernetes_asyncio.client.ApiClient() as api_client: + custom_api = kubernetes_asyncio.client.CustomObjectsApi(api_client) + + # Define API group and version for ResourceRegistry (cluster-scoped) + group = "search.karmada.io" + version = "v1alpha1" + + resource_name = name + resource_body = body + + logger.debug( + f"Applying resource '{resource_name}' with group: {group}, version: {version}, plural: {plural}" + ) + + try: + # Fetch the current cluster-scoped resource + current_resource = await custom_api.get_cluster_custom_object( + group=group, + version=version, + plural=plural, + name=resource_name + ) + + # Add the required resourceVersion field to the body + resource_version = current_resource["metadata"]["resourceVersion"] + resource_body["metadata"]["resourceVersion"] = resource_version + + logger.info(f"Resource '{resource_name}' exists. Updating it...") + + # Perform an update using replace + await custom_api.replace_cluster_custom_object( + group=group, + version=version, + plural=plural, + name=resource_name, + body=resource_body + ) + logger.info(f"Resource '{resource_name}' updated successfully.") + + except kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 404: + # If the resource doesn't exist, create a new one + logger.info(f"Resource '{resource_name}' not found. Creating a new one...") + + # Create the new cluster-scoped resource + await custom_api.create_cluster_custom_object( + group=group, + version=version, + plural=plural, + body=resource_body + ) + logger.info(f"New resource '{resource_name}' created successfully.") + else: + raise # Re-raise any non-404 exceptions + + except Exception as e: + logger.error(f"Error applying resource '{name}': {e}") + # ... existing code ... async def ensure_crds(self): """Ensure all MLSysOps CRDs are registered. 
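# Editor's sketch (not part of the patch): the create-or-replace idiom that
# _apply_resource_registry above implements, distilled into a reusable helper for
# any cluster-scoped custom object. The helper name and call site are hypothetical.
from kubernetes_asyncio.client.exceptions import ApiException

async def upsert_cluster_custom_object(api, group, version, plural, name, body):
    """Replace the object if it already exists, otherwise create it."""
    try:
        current = await api.get_cluster_custom_object(
            group=group, version=version, plural=plural, name=name)
        # replace() needs the live resourceVersion for optimistic concurrency control
        body.setdefault("metadata", {})["resourceVersion"] = current["metadata"]["resourceVersion"]
        return await api.replace_cluster_custom_object(
            group=group, version=version, plural=plural, name=name, body=body)
    except ApiException as exc:
        if exc.status != 404:
            raise  # anything other than "not found" is a real error
        return await api.create_cluster_custom_object(
            group=group, version=version, plural=plural, body=body)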
@@ -295,14 +382,34 @@ async def ensure_crds(self): #: the REST API group name API_GROUP = 'mlsysops.eu' #: System file directory of CRDs - _CRDS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates/')) + + # Use the packaged mlsysops.crds directory instead of local templates + try: + import importlib.resources as pkg_resources # Python 3.9+ + from mlsysops import crds as mlsysops_crds_pkg + has_pkg_resources = True + except Exception: + has_pkg_resources = False + + def _crd_file_path(filename: str) -> str: + if has_pkg_resources: + # Extract the CRD to a temp file to provide a filesystem path for YAML loader + import tempfile, shutil + with pkg_resources.files(mlsysops_crds_pkg).joinpath(filename).open('rb') as src: + tmp_dir = tempfile.mkdtemp(prefix="mlsysops-crds-") + dst_path = os.path.join(tmp_dir, filename) + with open(dst_path, 'wb') as dst: + shutil.copyfileobj(src, dst) + return dst_path + # Fallback: relative path resolution if resources package not available + return os.path.abspath(os.path.join(os.path.dirname(__file__), 'mlsysops', 'crds', filename)) mlsysops_node_dict = { 'singular': 'mlsysopsnode', 'plural': 'mlsysopsnodes', 'kind': 'MLSysOpsNode', 'crd_name': f'mlsysopsnodes.{API_GROUP}', - 'crd_file': f'{_CRDS_DIR}/MLSysOpsNode.yaml', + 'crd_file': _crd_file_path('MLSysOpsNode.yaml'), 'version': 'v1' } @@ -311,7 +418,7 @@ async def ensure_crds(self): 'plural': 'mlsysopsapps', 'kind': 'MLSysOpsApp', 'crd_name': f'mlsysopsapps.{API_GROUP}', - 'crd_file': f'{_CRDS_DIR}/MLSysOpsApplication.yaml', + 'crd_file': _crd_file_path('MLSysOpsApplication.yaml'), 'version': 'v1' } @@ -320,7 +427,7 @@ async def ensure_crds(self): 'plural': 'mlsysopscontinuums', 'kind': 'MLSysOpsContinuum', 'crd_name': f'mlsysopscontinuums.{API_GROUP}', - 'crd_file': f'{_CRDS_DIR}/MLSysOpsContinuum.yaml', + 'crd_file': _crd_file_path('MLSysOpsContinuum.yaml'), 'version': 'v1' } @@ -329,7 +436,7 @@ async def ensure_crds(self): 'plural': 'mlsysopsclusters', 'kind': 'MLSysOpsCluster', 'crd_name': f'mlsysopsclusters.{API_GROUP}', - 'crd_file': f'{_CRDS_DIR}/MLSysOpsCluster.yaml', + 'crd_file': _crd_file_path('MLSysOpsCluster.yaml'), 'version': 'v1' } @@ -355,8 +462,8 @@ async def ensure_crds(self): with open(crd_info['crd_file'], 'r') as data: body = yaml.load(data) except IOError: - logger.error('Resource definition not in dir %s.', - crd_info['crd_file']) + logger.error('Resource definition not accessible at %s.', crd_info['crd_file']) + continue try: await ext_api.create_custom_resource_definition(body) except ApiException as exc: @@ -371,7 +478,7 @@ async def get_karmada_clusters(self): """ try: # Load the kubeconfig file with the specified path - await kubernetes_asyncio.config.load_kube_config(config_file=self.karmada_api_kubeconfig) + await kubernetes_asyncio.config.load_kube_config(config_file=self.karmada_api_kubeconfig, context='karmada-apiserver') # Create an API client for the Custom Resources API api_client = kubernetes_asyncio.client.CustomObjectsApi() @@ -387,7 +494,6 @@ async def get_karmada_clusters(self): version=version, plural=plural ) - # Process the response to extract cluster names and details clusters = [] for item in response.get("items", []): @@ -398,16 +504,6 @@ async def get_karmada_clusters(self): return_object = {} for cluster in clusters: - # example - # [{'name': 'uth-dev-cluster', 'status': [ - # {'type': 'Ready', 'status': 'False', 'lastTransitionTime': '2025-04-07T10:24:31Z', - # 'reason': 'ClusterNotReachable', 'message': 'cluster is not reachable'}]}, 
{'name': 'uth-prod-cluster', - # 'status': [ - # {'type': 'Ready', - # 'status': 'True', - # 'lastTransitionTime': '2025-05-13T15:48:28Z', - # reason': 'ClusterReady', - # message': 'cluster is healthy and ready to accept workloads'}]}] return_object[cluster['name']] = cluster['status'][0]['status'] # true online, false offline return return_object diff --git a/agents/continuum/config.yaml b/agents/continuum/config.yaml index 279ea22..2302cac 100644 --- a/agents/continuum/config.yaml +++ b/agents/continuum/config.yaml @@ -3,7 +3,6 @@ mechanisms: mechanisms_directory: "mechanisms" default_telemetry_metrics: "None" -policy_directory: "policies" continuum_layer: "continuum" monitor_data_retention_time: 30 node_exporter_scrape_interval: 10s diff --git a/agents/continuum/descriptions/mls-test-karmada.yaml b/agents/continuum/descriptions/mls-test-karmada.yaml new file mode 100644 index 0000000..c901c13 --- /dev/null +++ b/agents/continuum/descriptions/mls-test-karmada.yaml @@ -0,0 +1,3 @@ +MLSysOpsContinuum: + continuum_id: mls-test-karmada + clusters: ["mls-test-manage"] \ No newline at end of file diff --git a/agents/continuum/requirements.txt b/agents/continuum/requirements.txt index 020786a..65b2ffe 100644 --- a/agents/continuum/requirements.txt +++ b/agents/continuum/requirements.txt @@ -1,4 +1,3 @@ -mlsysops mlstelemetry opentelemetry-exporter-otlp opentelemetry-api @@ -14,4 +13,5 @@ cachetools kubernetes redis kubernetes_asyncio -watchdog \ No newline at end of file +watchdog +ruamel.yaml \ No newline at end of file diff --git a/agents/continuum/templates/application-cr-propagation-policy.yaml b/agents/continuum/templates/application-cr-propagation-policy.yaml index afb8ace..80f7a6c 100644 --- a/agents/continuum/templates/application-cr-propagation-policy.yaml +++ b/agents/continuum/templates/application-cr-propagation-policy.yaml @@ -14,6 +14,9 @@ spec: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding namespace: default + - apiVersion: v1 + kind: Service + namespace: mlsysops-framework conflictResolution: Overwrite placement: clusterAffinity: diff --git a/agents/continuum/templates/resource-registry.yaml b/agents/continuum/templates/resource-registry.yaml index e40ece6..a2a1839 100644 --- a/agents/continuum/templates/resource-registry.yaml +++ b/agents/continuum/templates/resource-registry.yaml @@ -1,11 +1,13 @@ apiVersion: search.karmada.io/v1alpha1 kind: ResourceRegistry metadata: - name: proxy-resources + name: {{ name }} spec: targetCluster: clusterNames: - - uth-prod-cluster + {% for cluster in cluster_names %} + - {{ cluster }} + {% endfor %} resourceSelectors: - apiVersion: v1 kind: Pod diff --git a/agents/fita/Dockerfile b/agents/fita/Dockerfile new file mode 100644 index 0000000..44ce76b --- /dev/null +++ b/agents/fita/Dockerfile @@ -0,0 +1,21 @@ +# Base image +FROM harbor.nbfc.io/proxy_cache/library/python:3.10-slim + +# Export PYTHONPATH for the working directory +ENV PYTHONPATH=/workdir:/workdir/mechanisms + +WORKDIR /workdir + +# Copy all application files into the image +COPY ./fita /workdir/node +COPY ./mlsysops /workdir/mlsysops + +# Set up a working directory +WORKDIR /workdir/node + +# Install dependencies from requirements.txt +RUN pip install --no-cache-dir -r /workdir/mlsysops/requirements.txt +RUN pip install --no-cache-dir -r /workdir/node/requirements.txt + +# Default command to start the application +CMD ["python3", "main.py"] \ No newline at end of file diff --git a/agents/fita/MLSFITAAgent.py b/agents/fita/MLSFITAAgent.py new file mode 100644 index 
0000000..65f6b9b
--- /dev/null
+++ b/agents/fita/MLSFITAAgent.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2025. MLSysOps Consortium
+# #
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# #
+# http://www.apache.org/licenses/LICENSE-2.0
+# #
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import asyncio
+import traceback
+
+from mlsysops.agent import MLSAgent
+from mlsysops.events import MessageEvents
+from mlsysops.logger_util import logger
+
+
+class MLSFITAAgent(MLSAgent):
+
+    def __init__(self):
+        # Initialize base MLS agent class
+        logger.debug("Initializing MLSFITAAgent...")
+        super().__init__()
+
+        # { 'app_name' : { "components" : [component_name] } }
+        self.active_application = {}
+
+
+    async def run(self):
+        """
+        Main process of the MLSFITAAgent.
+        """
+        await super().run()
+
+        logger.info("Starting MLSFITAAgent process...")
+
+        # Start the message queue listener task
+        message_queue_task = asyncio.create_task(self.message_queue_listener())
+        self.running_tasks.append(message_queue_task)
+
+        fita_task = asyncio.create_task(self.fita_message_listener())
+        self.running_tasks.append(fita_task)
+
+        fluidity_proxy_task = asyncio.create_task(self.fluidity_proxy_message_listener())
+        self.running_tasks.append(fluidity_proxy_task)
+
+        # Send a state sync request to the cluster agent
+        await self.send_message_to_node(self.state.configuration.cluster, MessageEvents.NODE_STATE_SYNC.value, {"node": self.state.configuration.node})
+
+        try:
+            results = await asyncio.gather(*self.running_tasks, return_exceptions=True)
+            for result in results:
+                if isinstance(result, Exception):
+                    logger.error(f"Task raised an exception: {result}")
+        except Exception as e:
+            logger.error(f"Error in running tasks: {e}")
+
+        logger.info("MLSFITAAgent stopped.")
+
+    async def message_queue_listener(self):
+        """
+        Coroutine that listens for and processes messages from a queue. It manages the lifecycle
+        of applications and their components, handles telemetry updates, synchronizes node states,
+        and forwards specific events to other mechanisms when necessary. The coroutine operates in
+        an infinite loop, extracting and reacting to events received from the message queue.
+
+        Raises
+        ------
+        Exception
+            If an error occurs during the processing of a message, the resulting exception is logged
+            but not propagated, ensuring continuous operation of the infinite loop.
+ """ + logger.info("Starting Message Queue Listener...") + while True: + try: + # Wait for a message from the queue + message = await self.message_queue.get() + + # Extract event type and application details from the message + event = message.get("event") # Expected event field + data = message.get("payload") # Additional application-specific data + logger.debug(f"Received message: {event}") + + # Act upon the event type + if event == MessageEvents.COMPONENT_PLACED.value: + application_object = self.active_application.get(data['name'], None) + if application_object is None: + self.active_application[data['name']] = {"components" : []} + self.active_application[data['name']]['components'].append(data['component_name']) + logger.debug(f"Component {data['component_name']} placed in new application {data['name']}") + await self.application_controller.on_application_received(data) + await self.policy_controller.start_application_policies(data['name']) + else: + if data['component_name'] not in application_object['components']: + application_object['components'].append(data['component_name']) + logger.debug(f"Component {data['component_name']} placed in existing application {data['name']}") + await self.application_controller.on_application_updated(data) + elif event == MessageEvents.COMPONENT_UPDATED.value: + application_object = self.active_application.get(data['name'], None) + if application_object is not None: + logger.debug(f"Component {data['component_name']} updated in application {data['name']}") + await self.application_controller.on_application_updated(data) + + elif event == MessageEvents.COMPONENT_REMOVED.value: + application_object = self.active_application.get(data['name'], None) + if application_object is not None: + if data['component_name'] in application_object['components']: + application_object['components'].remove(data['component_name']) + logger.debug(f"Component {data['component_name']} removed from application {data['name']}") + await self.application_controller.on_application_updated(data) + if len(application_object['components']) == 0: + await self.application_controller.on_application_terminated(data['name']) + await self.policy_controller.delete_application_policies(data['name']) + del self.active_application[data['name']] + logger.debug(f"All components of application {data['name']} removed.") + elif event == MessageEvents.OTEL_NODE_INTERVAL_UPDATE.value: + await self.telemetry_controller.add_new_interval(id=self.state.configuration.cluster,new_interval=data[0]['interval']) + elif event == MessageEvents.NODE_STATE_SYNC.value: + logger.debug(f"Received NODE_STATE_SYNC msg from cluster ") + for application_name, application_data in data.items(): + for component_name,_ in application_data['components'].items(): + application_object = self.active_application.get(application_name, None) + + if application_object is None: + self.active_application[application_name] = {"components": []} + self.active_application[application_name]['components'].append(component_name) + logger.debug(f"Component {component_name} placed in new application {application_name}") + await self.application_controller.on_application_received(application_data) + await self.policy_controller.start_application_policies(application_name) + else: + if component_name not in application_object['components']: + application_object['components'].append(component_name) + logger.debug( + f"Component {component_name} placed in existing application {application_name}") + await 
self.application_controller.on_application_updated(application_data)
+                elif event == MessageEvents.MESSAGE_TO_FLUIDITY_PROXY.value:
+                    # forward to fluidity proxy
+                    logger.debug("Received message to fluidity proxy mechanism")
+                    if self.mechanisms_controller.is_mechanism_enabled("fluidity_proxy"):
+                        await self.mechanisms_controller.queues['fluidity_proxy']['outbound'].put(message)
+                else:
+                    logger.error(f"Unhandled event type: {event}")
+
+            except Exception as e:
+                logger.error(f"Error processing message: {traceback.format_exc()}")
+
+    async def fluidity_proxy_message_listener(self):
+        """
+        Handles incoming messages from the fluidity proxy message queue, processes the
+        received events, and executes corresponding actions based on event types.
+
+        Raises
+        ------
+        asyncio.CancelledError
+            Raised when the task is cancelled while awaiting.
+        Exception
+            General exception raised if an unexpected error occurs during message
+            processing.
+
+        Returns
+        -------
+        None
+        """
+        logger.debug("MLSFITAAgent:::: Starting fluidity proxy message listener....")
+        while True:
+            try:
+                msg = await self.mechanisms_controller.queues['fluidity_proxy']['inbound'].get()
+
+                event = msg.get("event")
+                data = msg.get("payload")
+                logger.debug(f"Received msg from fluidity_proxy event { event }: { data }")
+
+                match event:
+                    case MessageEvents.MESSAGE_TO_FLUIDITY.value:
+                        # Send the message to cluster fluidity
+                        data['node'] = self.state.configuration.node
+                        await self.send_message_to_node(self.state.configuration.cluster, event, data)
+                        continue
+                    case MessageEvents.PLAN_EXECUTED.value:
+                        await self.update_plan_status(data['plan_uid'], "fluidity_proxy", data['status'])
+                    case _:
+                        logger.error("Received msg from fluidity proxy with an unknown event")
+
+            except asyncio.CancelledError:
+                logger.debug("fluidity_proxy_message_listener: CancelledError")
+                break
+            except Exception as e:
+                logger.error(f"fluidity_proxy_message_listener: Error processing msg: {e}")
+                await asyncio.sleep(1)
+
+    async def fita_message_listener(self):
+        logger.debug(f"FITA Agent:::: Starting fita message listener.... 
") + while True: + try: + msg = await self.mechanisms_controller.queues['fita']['inbound'].get() + + event = msg.get("event") + data = msg.get("payload") + logger.debug(f"Received msg from fita event { event }: { data }") + + match event: + case _: + logger.error(f"Received msg from fita with no event") + + except asyncio.CancelledError: + logger.debug(f"fita listener: CancelledError") + break + except Exception as e: + logger.error(f"fita: Error processing msg: {e}") + logger.error(traceback.format_exc()) + await asyncio.sleep(1) diff --git a/agents/fita/config.yaml b/agents/fita/config.yaml new file mode 100644 index 0000000..b12a0e7 --- /dev/null +++ b/agents/fita/config.yaml @@ -0,0 +1,30 @@ + mechanisms: + - "fita" + - "fluidity_proxy" + default_telemetry_metrics: "None" + policy_directory: "policies" + mechanisms_directory: "mechanisms" + continuum_layer: "node" + monitor_data_retention_time: 30 + node_exporter_scrape_interval: 10s + + + behaviours: + APIPingBehaviour: + enabled: False + Check_ml_deployment_Behaviour: # This is oneshot behaviour should be False always + enabled: False + CheckInactiveClustersBehaviour: + enabled: False + period: 10 + HBReceiverBehaviour: + enabled: True + HeartbeatBehaviour: + enabled: False + period: 10 # Example parameter for PeriodicBehaviour + ManagementModeBehaviour: + enabled: False + ManageSubscriptionBehaviour: + enabled: False + Subscribe: + enabled: True diff --git a/agents/fita/descriptions/mls-compute-vm2-b1-node1.yaml b/agents/fita/descriptions/mls-compute-vm2-b1-node1.yaml new file mode 100644 index 0000000..cd244ec --- /dev/null +++ b/agents/fita/descriptions/mls-compute-vm2-b1-node1.yaml @@ -0,0 +1,17 @@ +MLSysOpsNode: + name: mls-compute-vm2-b1-node1 + cluster_id: mls-test-manage + continuum_layer: far_edge + sensors: + - temperature: + model: "dummy" # NOTE: fill in or remove + environment: + node_type: native + os: zephyr + container_runtime: + - embserve + hardware: + cpu: + architecture: arm-v7 + memory: "4" # Change if needed + disk: "120" # Change if needed \ No newline at end of file diff --git a/agents/fita/descriptions/mls-compute-vm2-b1-node2.yaml b/agents/fita/descriptions/mls-compute-vm2-b1-node2.yaml new file mode 100644 index 0000000..a01b1e2 --- /dev/null +++ b/agents/fita/descriptions/mls-compute-vm2-b1-node2.yaml @@ -0,0 +1,17 @@ +MLSysOpsNode: + name: mls-compute-vm2-b1-node2 + cluster_id: mls-test-manage + continuum_layer: far_edge + sensors: + - temperature: + model: "dummy" # NOTE: fill in or remove + environment: + node_type: native + os: zephyr + container_runtime: + - embserve + hardware: + cpu: + architecture: arm-v7 + memory: "4" # Change if needed + disk: "120" # Change if needed \ No newline at end of file diff --git a/agents/fita/main.py b/agents/fita/main.py new file mode 100644 index 0000000..4e37e02 --- /dev/null +++ b/agents/fita/main.py @@ -0,0 +1,98 @@ +# Copyright (c) 2025. MLSysOps Consortium +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+from __future__ import print_function
+
+import asyncio
+import os
+
+from dotenv import load_dotenv
+
+from mlsysops.logger_util import logger
+from MLSFITAAgent import MLSFITAAgent
+
+# Path to your .env file
+dotenv_path = '.env'
+
+# Check if the .env file exists
+if os.path.exists(dotenv_path):
+    load_dotenv(dotenv_path)  # Load the .env file into environment variables
+    logger.debug(f".env file found and loaded from: {dotenv_path}")
+else:
+    logger.debug(f"No .env file found at: {dotenv_path}")
+
+
+async def shutdown(signal_name, agent, all_tasks):
+    """
+    Gracefully shuts down the asyncio event loop.
+
+    Args:
+        signal_name (str): The name of the received signal (e.g., SIGTERM, SIGINT).
+        agent (MLSFITAAgent): The agent instance to be stopped or cleaned.
+        all_tasks (list): List of running asyncio tasks to cancel.
+    """
+    logger.info(f"Received {signal_name}. Shutting down gracefully...")
+
+    # Gracefully stop the MLSFITAAgent
+    try:
+        await agent.stop()  # Assuming `stop` is a method for cleanup in your agent class
+        logger.debug("Agent stopped successfully.")
+    except Exception as e:
+        logger.error(f"Error while stopping the agent: {e}")
+
+    # Cancel all running tasks
+    for task in all_tasks:
+        task.cancel()
+        try:
+            await task
+        except asyncio.CancelledError:
+            logger.debug(f"Task {task} cancelled successfully.")
+        except Exception as e:
+            logger.error(f"Error while cancelling the task {task}: {e}")
+
+    logger.info("Shutdown complete. Exiting process.")
+
+
+
+async def main():
+    """
+    Entry point for the FITA agent program.
+    This function initializes and runs the MLS FITA Agent.
+    """
+    global main_task
+
+    # Instantiate the MLSFITAAgent class
+    agent = MLSFITAAgent()
+
+    try:
+        # Run the MLSAgent's main process (update with the actual method name)
+        agent_task = asyncio.create_task(agent.run())
+        main_task = agent_task
+
+        await asyncio.gather(agent_task)
+
+    except asyncio.CancelledError:
+        logger.info("Agent stopped. Performing cleanup...")
+        if agent:
+            await agent.stop()  # Stop the agent during cleanup
+    except Exception as e:
+        logger.error(f"An error occurred in the main task: {e}")
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        logger.info("MLSFITAAgent stopped.")
diff --git a/agents/fita/mechanisms/example_usage.py b/agents/fita/mechanisms/example_usage.py
new file mode 100644
index 0000000..d11aec9
--- /dev/null
+++ b/agents/fita/mechanisms/example_usage.py
@@ -0,0 +1,32 @@
+import asyncio
+import json
+
+import fita
+
+queue_inbound = asyncio.Queue()
+queue_outbound = asyncio.Queue()
+
+async def tester_listener():
+    while True:
+        message = await queue_inbound.get()
+
+        print(message)
+
+async def main():
+
+    asyncio.create_task(tester_listener())
+    fita.initialize(inbound_queue=queue_outbound, outbound_queue=queue_inbound)
+
+    print("Going into loop")
+    # Loop forever
+    while True:
+        try:
+            await asyncio.sleep(10)
+            print("Applying control knob")
+            # apply() is a coroutine and expects a plan_id, so await it with one set
+            await fita.apply({"event":"CONTROL_KNOB_EVENT","plan_id":"example-plan","payload":{"control_knob":"TX_PWR","action":"SET","value":"MED"}})
+        except KeyboardInterrupt:
+            break
+
+if __name__ == '__main__':
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(main())
\ No newline at end of file
diff --git a/agents/fita/mechanisms/fita.py b/agents/fita/mechanisms/fita.py
new file mode 100644
index 0000000..266453d
--- /dev/null
+++ b/agents/fita/mechanisms/fita.py
@@ -0,0 +1,227 @@
+# Copyright (c) 2025. MLSysOps Consortium
+# #
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# #
+# http://www.apache.org/licenses/LICENSE-2.0
+# #
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# #
+# #
+
+import os
+import asyncio
+import json
+import traceback
+import uuid
+from asyncio import CancelledError
+
+from nextgengw_mqtt_bridge import Mqtt_Bridge
+from paho.mqtt.packettypes import PacketTypes
+import paho.mqtt.client as mqtt
+import mlstelemetry
+
+# MQTT broker information
+mqtt_broker_address = os.getenv('MQTT_IP', 'mqtt-broker.fita.svc.cluster.local')
+mqtt_broker_port = int(os.getenv('MQTT_PORT', '1883'))
+node_id = os.getenv('NODE_ID', "b1_node1")
+# Extract the trailing bx_nodek part from NODE_ID values like NODENAME-bx_nodek
+# Fall back to the whole node_id if the pattern is not present.
+try:
+    # Split on the last hyphen and take the suffix
+    bx_suffix = node_id.rsplit("-", 1)[-1]
+except Exception:
+    # In case of unexpected issues, keep the original value
+    bx_suffix = node_id
+
+# Use bx_suffix wherever the short node identifier is needed
+node_id = bx_suffix
+
+telemetry_config_list = os.getenv('TELEMETRY_ENDPOINTS', f"{node_id}/Temperature/0/Property/Sensor_Value")
+
+queues = {"inbound": None, "outbound": None}
+
+mlsTelemetry = mlstelemetry.MLSTelemetry("fita", "fita_mechanism")
+
+def test_msg(client, userdata, msg):
+    print(f"Binaryappdata msg: {msg.payload}")
+
+
+class FitaMechanism:
+
+    inbound_queue = None
+    outbound_queue = None
+    state = None
+    fita_proxy_plans = {}
+    mqtt_bridge = None
+
+    def __init__(self, mlsysops_inbound_queue=None, mlsysops_outbound_queue=None, agent_state=None):
+        self.inbound_queue = mlsysops_inbound_queue
+        self.outbound_queue = mlsysops_outbound_queue
+        self.loop = asyncio.get_running_loop()  # must be created inside an async context
+
+        self.state = {"applications": {}, "nodes": {}, "submittedPlans": {}}
+
+        self.mqtt_bridge = Mqtt_Bridge(mqtt_broker_address, mqtt_broker_port, node_id)
+
+        print(f"Initializing FITA mechanism of node {node_id} {mqtt_broker_address} : {mqtt_broker_port}")
+
+        self.mqtt_bridge.start(self.fita_on_mqtt_connect)
+
+        self.mqtt_bridge.publish(f"{node_id}/BinaryAppDataContainer/0/Property/Data", '{"operation":"START_OBSERVE"}')
+        self.mqtt_bridge.subscribe(f"{node_id}/BinaryAppDataContainer/0/Property/Data", test_msg)
+
+
+    def fita_on_telemetry_message(self, client, userdata, msg):
+        print(f"Telemetry message arrived: {msg.payload}")
+
+        payload = json.loads(msg.payload)
+
+        if "operation" in payload.keys():
+            return
+        try:
+            telemetry_endpoints = telemetry_config_list.split(";")
+            for endpoint in telemetry_endpoints:
+                if endpoint.replace("/Property", "") in msg.topic:
+
+                    keys = endpoint.split("/")  # 0 node id, 1 object name, 2 object instance, 3 ignore, 4 property name
+                    value = payload[keys[0]]["sdfObject"][keys[1]][int(keys[2])]["sdfProperty"][keys[4]]
+
+                    # Send telemetry to mlsysops agent
+                    telemetry_event = {"event": "TELEMETRY_EVENT", "endpoint": endpoint, "value": value}
+                    # {node_id}_{object_name}_{object_instance}_{property_name}
+                    telemetry_key = endpoint.replace("/", "_")
+                    # TODO : the value format might vary
+                    for key, extracted_value in value.items():
+                        if extracted_value is None or extracted_value == "":
+                            continue
+                        print(f"Sending telemetry {telemetry_key}: {key} : {extracted_value}")
+                        mlsTelemetry.pushMetric(f"{telemetry_key.replace(f'{node_id}_','')}",
+                                                "gauge",
+                                                float(extracted_value),
+                                                attributes={"fita_node_id": node_id})
+                    # self.loop.call_soon_threadsafe(self.outbound_queue.put_nowait, telemetry_event)
+        except Exception as e:
+            print(e)
+            print(traceback.format_exc())
+
+
+    # def fita_on_control_knob_message(self, client, userdata, msg):
+    #     try:
+    #         print(f"Binary message arrived: {msg.payload}")
+
+    #         payload_json = json.loads(msg.payload)
+    #         operation = bytearray.fromhex(payload_json[node_id]["sdfObject"]["BinaryAppDataContainer"][0]["sdfProperty"]["Data"]["0"]).decode()
+    #         print(f"Operation: {operation}")
+
+    #         # CHECK WHAT LAST OPERATION WAS
+    #         if operation == "set\0":
+    #             # Report event
+    #             control_knob_event = {}
+    #             self.outbound_queue.put(control_knob_event)
+    #     except Exception as e:
+    #         logger.exception(e)
+
+    # Callback function for when the client receives a CONNACK response from the broker
+    def fita_on_mqtt_connect(self, client, userdata, flags, rc, properties):
+        try:
+            if rc == 0:
+                print("Connected to MQTT broker")
+
+                # Subscribe control knob object, not necessary if we don't care about success of action
+                #self.mqtt_bridge.subscribe(f"{node_id}/BinaryAppDataContainer/0/Data", self.fita_on_control_knob_message)
+                #self.mqtt_bridge.publish(f"{node_id}/BinaryAppDataContainer/0/Property/Data", '{"operation":"START_OBSERVE"}')
+
+                # Parse Telemetry Configuration
+                telemetry_endpoints = telemetry_config_list.split(";")
+                for endpoint in telemetry_endpoints:
+                    print(endpoint)
+                    self.mqtt_bridge.publish(endpoint, '{"operation":"START_OBSERVE"}')
+                    self.mqtt_bridge.subscribe(endpoint.replace("/Property", ""), self.fita_on_telemetry_message)  # .replace because nextgen has a different path due to a bug
+                    self.mqtt_bridge.subscribe(endpoint, self.fita_on_telemetry_message)
+            else:
+                print("Connection failed with result code " + str(rc))
+        except BaseException as e:
+            print(e)
+            print(traceback.format_exc())
+
+
+fita_mechanism_instance = None
+
+
+def initialize(inbound_queue=None, outbound_queue=None, agent_state=None):
+    global fita_mechanism_instance
+
+    print("Initializing fita mechanism")
+
+    queues["inbound"] = inbound_queue
+    queues["outbound"] = outbound_queue
+
+    fita_mechanism_instance = FitaMechanism(inbound_queue, outbound_queue, agent_state)
+
+
+async def apply(plan):
+    global fita_mechanism_instance
+
+    print(f'Received from MLSysOps queue msg: {plan}')
+    event = plan.get("event", None)
+    data = plan.get("payload", None)
+
+    if event is None or data is None:
+        print('Ignoring message: One of event/data is missing.')
+        return False
+
+    # Detect whether the event is a control knob action
+    if event == "CONTROL_KNOB_EVENT":
+
+        response_topic = str(uuid.uuid4().hex)
+        fita_mechanism_instance.state['submittedPlans'][response_topic] = \
+            {"planid": plan['plan_id'], "status": "PENDING"}
+        publish_property = mqtt.Properties(PacketTypes.PUBLISH)
+        publish_property.ResponseTopic = response_topic
+
+        if data["action"] == "SET":
+            inner_data = {
+                "sdfProperty":{
+                    "Data":{
+                        "0":f'mlsysops knobs set {data["control_knob"]} {data["value"]}'
+                    }
+                },
+                "sdfAction":{},
+                "sdfEvent": {}
+            }
+            payload = {
+                "operation": "POST",
+                "data":
json.dumps(inner_data) + } + + #Publish command to set control knob value + fita_mechanism_instance.mqtt_bridge.publish( + f"{node_id}/BinaryAppDataContainer/0", + json.dumps(payload), + properties=publish_property, + ) + + return True + + return False + + +def get_state(): + global fita_mechanism_instance + if fita_mechanism_instance is not None: + return fita_mechanism_instance.state + else: + return {} + + +def get_options(): + global fita_mechanism_instance + return {} \ No newline at end of file diff --git a/agents/fita/mechanisms/fluidity_proxy.py b/agents/fita/mechanisms/fluidity_proxy.py new file mode 100644 index 0000000..9f60773 --- /dev/null +++ b/agents/fita/mechanisms/fluidity_proxy.py @@ -0,0 +1,83 @@ +# Copyright (c) 2025. MLSysOps Consortium +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # +# # + + +from mlsysops.events import MessageEvents +import asyncio + +queues = {"inbound": None, "outbound": None} +state = None +node_name = None +async def fluidity_proxy_loop(): + global state + global queues + while True: + message = await queues['inbound'].get() + + event = message['event'] + payload = message['payload'] + match (event): + case MessageEvents.MESSAGE_TO_FLUIDITY_PROXY.value: + fluidity_internal_payload = payload.get("payload", {}) + fluidity_internal_event = payload.get("event", None) + match fluidity_internal_event: + case MessageEvents.FLUIDITY_INTERNAL_PLAN_UPDATE.value: + await queues['outbound'].put(fluidity_internal_payload) + case MessageEvents.FLUIDITY_INTERNAL_STATE_UPDATE.value: + # update internal state + state = payload + case "NETWORK_REDIRECT": + continue # TODO + case _: + print("Unknown event in fluidity proxy") + pass + + return False # async + +def initialize(inbound_queue=None, outbound_queue=None, agent_state=None): + global node_name + + queues["inbound"] = outbound_queue + queues["outbound"] = inbound_queue + + node_name = agent_state.configuration.node + + asyncio.create_task(fluidity_proxy_loop()) + +async def apply(plan): + print("--------------------------Applying fluidity plan", plan) + global node_name + # This mechanism uses the messaging interface to send to cluster fluidity + await queues['outbound'].put({ + "event": MessageEvents.MESSAGE_TO_FLUIDITY.value, + "payload": { + "event": MessageEvents.FLUIDITY_INTERNAL_PLAN_SUBMITTED.value, + "payload": { + "event": MessageEvents.PLAN_SUBMITTED.value, + "payload": plan + }, + "node": node_name + } + }) + + +def get_state(): + global state + return state + + +def get_options(): + pass \ No newline at end of file diff --git a/agents/fita/mechanisms/nextgengw_mqtt_bridge.py b/agents/fita/mechanisms/nextgengw_mqtt_bridge.py new file mode 100644 index 0000000..9a9e82f --- /dev/null +++ b/agents/fita/mechanisms/nextgengw_mqtt_bridge.py @@ -0,0 +1,40 @@ +import paho.mqtt.client as mqtt +from paho.mqtt.subscribeoptions import SubscribeOptions +from mlsysops.logger_util import logger + +class Mqtt_Bridge: + def __init__(self, ip,port,device_id): + self.broker_address = ip + self.broker_port = 
port
+        self.device_id = device_id
+
+    def publish(self, topic, message, properties=None):
+        self.client.publish(topic, payload=message, properties=properties)
+
+    def subscribe(self, topic, callback):
+        self.client.subscribe(topic, options=SubscribeOptions(noLocal=True))
+        self.client.message_callback_add(topic, callback)
+
+    # Callback function for when a message is received from the broker
+    def on_message(self, client, userdata, msg):
+        try:
+            logger.info("Unfiltered message arrived : " + str(msg.payload))
+        except Exception:
+            logger.exception("Error while logging unfiltered MQTT message")
+
+    def start(self, connect_cb):
+        # Create MQTT client instance
+        self.client = mqtt.Client(protocol=5)
+
+        # Set callback functions
+        self.client.on_connect = connect_cb
+        self.client.on_message = self.on_message
+
+        # Set username and password
+        self.client.username_pw_set(f"NextGen_Agent_{self.device_id}", "Fr4unh0f3r")
+
+        # Connect to MQTT broker
+        self.client.connect(self.broker_address, self.broker_port, 60)
+
+        # Loop to maintain network traffic flow, handles reconnecting, etc.
+        self.client.loop_start()
\ No newline at end of file
diff --git a/agents/fita/policies/policy-fita-power.py b/agents/fita/policies/policy-fita-power.py
new file mode 100644
index 0000000..50283a1
--- /dev/null
+++ b/agents/fita/policies/policy-fita-power.py
@@ -0,0 +1,118 @@
+"""Plugin module for custom policies - notify function."""
+from __future__ import print_function
+
+import inspect
+import pprint
+import copy
+import json
+import logging
+import os
+import queue
+import re
+import sys
+import threading
+import time
+import random
+
+from mlstelemetry import MLSTelemetry
+
+mlsClient = MLSTelemetry("policy_fita_power", "policy_fita_power")
+
+
+def parse_analyze_interval(interval: str) -> int:
+    """
+    Parses an analyze interval string in the format 'Xs|Xm|Xh|Xd' and converts it to seconds.
+
+    Args:
+        interval (str): The analyze interval as a string (e.g., "5m", "2h", "1d").
+
+    Returns:
+        int: The interval in seconds.
+
+    Raises:
+        ValueError: If the format of the interval string is invalid.
+ """ + # Match the string using a regex: an integer followed by one of s/m/h/d + match = re.fullmatch(r"(\d+)([smhd])", interval) + if not match: + raise ValueError(f"Invalid analyze interval format: '{interval}'") + + # Extract the numeric value and the time unit + value, unit = int(match.group(1)), match.group(2) + + # Convert to seconds based on the unit + if unit == "s": # Seconds + return value + elif unit == "m": # Minutes + return value * 60 + elif unit == "h": # Hours + return value * 60 * 60 + elif unit == "d": # Days + return value * 24 * 60 * 60 + else: + raise ValueError(f"Unsupported time unit '{unit}' in interval: '{interval}'") + + +def initialize(): + initialContext = { + "telemetry": { + "metrics": ["b1_node1_temperature_0_property_sensor_value"], + "system_scrape_interval": "1s" + }, + "mechanisms": [ + "CPUFrequencyConfigurator" + ], + "packages": [ + + ], + "configuration": { + "analyze_interval": "10s" + }, + "latest_timestamp": None, + "core": False, + "scope": "application" + } + + return initialContext + + +async def analyze(context, application_description, system_description, mechanisms, telemetry, ml_connector): + # a simple policy that periodically changes the frequency of the node + # Analyze + current_timestamp = time.time() + + # The first time called + if context['latest_timestamp'] is None: + context['latest_timestamp'] = current_timestamp + return False, context + + # All the next ones, get it + analyze_interval = parse_analyze_interval(context['configuration']['analyze_interval']) + if current_timestamp - context['latest_timestamp'] > analyze_interval: + context['latest_timestamp'] = current_timestamp + return True, context + + return False, context + + + +async def plan(context, application_description, system_description, mechanisms, telemetry, ml_connector): + current_state = "LOW" + print(telemetry) + # FITA command options: LOW,MED, HIGH + if current_state == "LOW": + fita_command = { + "event":"CONTROL_KNOB_EVENT", + "payload":{"control_knob":"TX_PWR","action":"SET","value":"MED"} + } + if current_state == "MED": + fita_command = { + "event": "CONTROL_KNOB_EVENT", + "payload": {"control_knob": "TX_PWR", "action": "SET", "value": "LOW"} + } + + print(f"Fita policy Sending {fita_command}") + new_plan = { + "fita": fita_command + } + return new_plan, context diff --git a/agents/fita/requirements.txt b/agents/fita/requirements.txt new file mode 100644 index 0000000..1801a8b --- /dev/null +++ b/agents/fita/requirements.txt @@ -0,0 +1,17 @@ +mlstelemetry +opentelemetry-exporter-otlp +opentelemetry-api +opentelemetry-sdk +prometheus_client +cachetools +python-dotenv +asyncio +pandas +pynvml +nvidia-pyindex +cpufreq +kubernetes +redis +watchdog +kubernetes_asyncio +paho-mqtt==1.6.1 \ No newline at end of file diff --git a/agents/mlsysops/agent.py b/agents/mlsysops/agent.py index 0efe7c7..202a627 100644 --- a/agents/mlsysops/agent.py +++ b/agents/mlsysops/agent.py @@ -30,6 +30,7 @@ from mlsysops.spade.mls_spade import MLSSpade from mlsysops.tasks.monitor import MonitorTask from mlsysops.data.monitor import MonitorData +from mlsysops.tasks.watchdog import WatchdogTask from mlsysops.logger_util import logger @@ -56,14 +57,11 @@ def __init__(self): # ## -------- SPADE ------------------# logger.debug("Initializing SPADE...") try: - logger.debug("in try...") self.message_queue = asyncio.Queue() - logger.debug("after queue...") self.spade_instance = MLSSpade(self.state, self.message_queue) except Exception as e: logger.error(f"Error initializing SPADE: {e}") - 
print("blahblahblah") # Telemetry self.telemetry_controller = TelemetryController(self) @@ -87,6 +85,11 @@ def __init__(self): monitor_async_task = asyncio.create_task(self.monitor_task.run()) self.running_tasks.append(monitor_async_task) + # Watchdog task (monitors task_log for TTL expiry) + self.watchdog_task = WatchdogTask(self.state) + watchdog_async_task = asyncio.create_task(self.watchdog_task.run()) + self.running_tasks.append(watchdog_async_task) + # ##--------- Scheduler --------------# logger.debug("Initializing scheduler...") self.scheduler = PlanScheduler(self.state) @@ -133,15 +136,13 @@ async def message_queue_listener(self): """ Task to listen for messages from the message queue and act upon them. """ - print("Starting default Message Queue Listener...") while True: try: # Wait for a message from the queue (default behavior) message = await self.message_queue.get() - print(f"Received message: {message}") - # Default handling logic (can be extended in subclasses) + # Default handling logic (can be overloaded in children classes) except Exception as e: - print(f"Error in message listener: {e}") + logger.error(f"Error in message listener: {e}") async def send_message_to_node(self, recipient, event, payload): """ @@ -185,8 +186,7 @@ async def run(self): """ Main process of the MLSAgent. """ - # Apply MLS System description - print("In RUN of AGENT") + try: if self.state.configuration.continuum_layer == 'cluster': logger.debug(f"Applying system description") @@ -204,13 +204,13 @@ async def run(self): logger.error(f"Error executing command: {e}") await self.policy_controller.load_policy_modules() - await self.policy_controller.load_core_policy_modules() + if self.state.configuration.enable_core_policies: + await self.policy_controller.load_core_policy_modules() await self.telemetry_controller.apply_configuration_telemetry() await self.telemetry_controller.initialize() try: - print("In spade_instance_start") await self.spade_instance.start(auto_register=True) except Exception as e: logger.error(f"Error starting SPADE: {traceback.format_exc()}") @@ -219,5 +219,4 @@ async def run(self): await self.policy_controller.start_global_policies() self.policy_controller.start_policy_directory_monitor() - return True - + return True \ No newline at end of file diff --git a/agents/mlsysops/controllers/application.py b/agents/mlsysops/controllers/application.py index 76532a4..27166a7 100644 --- a/agents/mlsysops/controllers/application.py +++ b/agents/mlsysops/controllers/application.py @@ -36,7 +36,7 @@ def __init__(self, agent): self.agent = agent self.application_tasks_running = {} - def __del__(self): + def __del__(self): """ Cancels and clears all running application tasks upon deletion. 
@@ -64,7 +64,7 @@ async def on_application_received(self, application_data: Dict): application_id=application_data["name"], application_description=application_data ) - + self.agent.state.add_application(new_application.application_id, new_application) # Update the monitoring list for the application's metrics @@ -76,7 +76,7 @@ async def on_application_received(self, application_data: Dict): await self.agent.monitor_task.add_metric(metric_name) # Start an analyze task for this application - analyze_object = AnalyzeTask(new_application.application_id,self.agent.state, "application") + analyze_object = AnalyzeTask(new_application.application_id, self.agent.state, "application") analyze_task = asyncio.create_task(analyze_object.run()) self.application_tasks_running[new_application.application_id] = analyze_task @@ -112,17 +112,9 @@ async def on_application_updated(self, data): None """ if data['name'] in self.application_tasks_running: - self.agent.state.update_application(data['name'],data) + self.agent.state.update_application(data['name'], data) else: logger.error(f'No application {data["name"]} found.') async def run(self): - """ - Continuously checks the state for new applications and handles them. - """ - while True: - for app_id, app_object in MLSState.applications.items(): - print(f'Application {app_id}') - - # Check periodically (adjust the sleep interval as needed) - await asyncio.sleep(10) \ No newline at end of file + pass diff --git a/agents/mlsysops/controllers/libs/otel_pods.py b/agents/mlsysops/controllers/libs/otel_pods.py index 8524761..c6427de 100644 --- a/agents/mlsysops/controllers/libs/otel_pods.py +++ b/agents/mlsysops/controllers/libs/otel_pods.py @@ -17,7 +17,7 @@ import string import traceback -from kubernetes import client , config , watch +from kubernetes import client, config, watch from enum import Enum from ruamel.yaml import YAML @@ -38,10 +38,12 @@ client_handler = None -class STATUS(Enum): # i use it to check if a node has an otel collector pod deployed and if not we should deploy it + +class STATUS(Enum): # i use it to check if a node has an otel collector pod deployed and if not we should deploy it NOT_DEPLOYED = 0 DEPLOYED = 1 + def get_api_handler(): global client_handler if client_handler is None: @@ -54,7 +56,7 @@ def get_api_handler(): def set_node_dict(v1: client.CoreV1Api) -> None: - global node_list_dict # List of dictionaries + global node_list_dict # List of dictionaries global task_list """ [dict1 , dict2, dict3] @@ -71,23 +73,21 @@ def set_node_dict(v1: client.CoreV1Api) -> None: try: node_list_dict = [] initial_list = [] - http_response = v1.list_node() # http GET , returns a V1NodeList object + http_response = v1.list_node() # http GET , returns a V1NodeList object # Note, the responce is not an ordinary list , it contains V1Node objects item_list = http_response.items - for item in item_list: # item represents a node dictionary , item : V1Node + for item in item_list: # item represents a node dictionary , item : V1Node - initial_list.append(item) # append V1Nodes , i use it later - key = item.metadata.name # Get the key + initial_list.append(item) # append V1Nodes , i use it later + key = item.metadata.name # Get the key assigned_pod_name = pod_name + str(node_counter) - label_value = item.metadata.labels # Get the labels + label_value = item.metadata.labels # Get the labels config_name = configmap_name + str(node_counter) - - - val = [assigned_pod_name , config_name , STATUS.NOT_DEPLOYED , label_value] - node = {key : val} + val = 
[assigned_pod_name, config_name, STATUS.NOT_DEPLOYED, label_value] + node = {key: val} node_list_dict.append(node) node_counter += 1 task_list = [None] * node_counter @@ -141,11 +141,9 @@ def create_pod_spec(pod_name: str, node_name: str, configmap_name: str) -> str: async def create_pod(v1: client.CoreV1Api, pod_name: str, node_name: str, configmap_name: str) -> None: # Define the pod spec - pod_spec = create_pod_spec(pod_name,node_name, configmap_name) - logger.debug(f'Pod spec is {pod_spec}') + pod_spec = create_pod_spec(pod_name, node_name, configmap_name) try: http_response = v1.create_namespaced_pod(namespace=namespace, body=pod_spec) # HTTP POST - logger.info(f"Pod {pod_name} created successfully on node {node_name} in namespace {namespace}.") except client.exceptions.ApiException as ex: if ex.status == 404: logger.error(f"Status 404: Pod creation failed for pod {pod_name} in namespace {namespace}.") @@ -197,9 +195,10 @@ def create_node_exporter_pod_spec(pod_name: str, node_name: str, flags: str, por return manifest_dict -async def create_node_exporter_pod(v1: client.CoreV1Api, pod_name: str, node_name: str,flags: str, port: int) -> None: + +async def create_node_exporter_pod(v1: client.CoreV1Api, pod_name: str, node_name: str, flags: str, port: int) -> None: # Define the pod spec - pod_spec = create_node_exporter_pod_spec(pod_name,node_name,flags,port) + pod_spec = create_node_exporter_pod_spec(pod_name, node_name, flags, port) logger.debug(f'Pod spec is {pod_spec}') try: http_response = v1.create_namespaced_pod(namespace=namespace, body=pod_spec) # HTTP POST @@ -217,12 +216,11 @@ async def create_node_exporter_pod(v1: client.CoreV1Api, pod_name: str, node_nam logger.error(str(e)) return None -def delete_pod(v1:client.CoreV1Api , pod_name:str) -> None: +def delete_pod(v1: client.CoreV1Api, pod_name: str) -> None: try: - http_response = v1.delete_namespaced_pod(name = pod_name, namespace= namespace,body = client.V1DeleteOptions(grace_period_seconds = 0)) - logger.debug(f'Pod with name {pod_name} from {namespace} namespace has been deleted') - + http_response = v1.delete_namespaced_pod(name=pod_name, namespace=namespace, + body=client.V1DeleteOptions(grace_period_seconds=0)) except client.exceptions.ApiException as e: logger.error(traceback.format_exc()) if e.status == 404: @@ -232,14 +230,14 @@ def delete_pod(v1:client.CoreV1Api , pod_name:str) -> None: return None -async def create_configmap(v1: client.CoreV1Api, configmap_name: str, otel_specs :str , verbose=False) -> client.V1ConfigMap: +async def create_configmap(v1: client.CoreV1Api, configmap_name: str, otel_specs: str, + verbose=False) -> client.V1ConfigMap: try: configmap = client.V1ConfigMap( metadata=client.V1ObjectMeta(name=configmap_name), data={"otel-collector-config.yaml": otel_specs} ) - # Run the synchronous API call in a separate thread created_configmap = v1.create_namespaced_config_map(namespace, configmap) @@ -258,13 +256,14 @@ async def create_configmap(v1: client.CoreV1Api, configmap_name: str, otel_specs def remove_configmap(v1: client.CoreV1Api, configmap_name: str) -> None: try: - http_response = v1.delete_namespaced_config_map( name=configmap_name, namespace=namespace) + http_response = v1.delete_namespaced_config_map(name=configmap_name, namespace=namespace) except client.exceptions.ApiException as ex: logger.error(f"Error removing ConfigMap due to API '{configmap_name}': {ex.reason}") except Exception as ex: logger.error(f"Error removing ConfigMap '{configmap_name}': {ex}") + def remove_service() -> None: 
""" Removes a specified Kubernetes service from a namespace. @@ -288,16 +287,18 @@ def remove_service() -> None: logger.error(f"Error removing Service '{service_name}': {ex}") -async def read_configmap(v1: client.CoreV1Api , configmap_name: str) -> client.V1ConfigMap : # Return the configmap object not the dict +async def read_configmap(v1: client.CoreV1Api, + configmap_name: str) -> client.V1ConfigMap: # Return the configmap object not the dict try: - configmap_obj = v1.read_namespaced_config_map( name=configmap_name, namespace=namespace) - return(configmap_obj) + configmap_obj = v1.read_namespaced_config_map(name=configmap_name, namespace=namespace) + return (configmap_obj) except Exception as ex: logger.error(ex) return None -async def redeploy_configmap(v1:client.CoreV1Api, otel_specs: str,configmap: client.V1ConfigMap) -> None: - try : + +async def redeploy_configmap(v1: client.CoreV1Api, otel_specs: str, configmap: client.V1ConfigMap) -> None: + try: """ Configmap is a V1ConfigMap obj , we want to change the .data field with the new otel specs We cannot access the configmap.data[key] like a list , because the .keys method returns a dictionary with keys and not a list we also could use the key name (see above) but i want to add more abstraction @@ -306,9 +307,10 @@ async def redeploy_configmap(v1:client.CoreV1Api, otel_specs: str,configmap: cli for key in keys: configmap.data[key] = otel_specs - configmap_name = configmap.metadata.name # str + configmap_name = configmap.metadata.name # str - http_response = v1.replace_namespaced_config_map(name = configmap_name, namespace = namespace,body = configmap) # http PUT + http_response = v1.replace_namespaced_config_map(name=configmap_name, namespace=namespace, + body=configmap) # http PUT # The body argument is a V1ConfigMap obj @@ -318,8 +320,8 @@ async def redeploy_configmap(v1:client.CoreV1Api, otel_specs: str,configmap: cli logger.error(e) return None -async def deploy_node_exporter_pod(node_name: str, flags: str,port: int) -> bool : +async def deploy_node_exporter_pod(node_name: str, flags: str, port: int) -> bool: v1 = get_api_handler() logger.debug(f'Node exporter Pod with name:{node_name} is been created') @@ -329,13 +331,15 @@ async def deploy_node_exporter_pod(node_name: str, flags: str,port: int) -> bool except Exception as e: logger.error(f'Error creating pod for node {node_name} : {e}') logger.error(traceback.format_exc()) - return None,None + return None, None return final_pod_name -async def create_otel_pod(node_name: str , otel_yaml) -> bool : + +async def create_otel_pod(node_name: str, otel_yaml) -> bool: """ Creates an OpenTelemetry (OTEL) pod and its associated ConfigMap on the provided node. + If the pod or ConfigMap already exist, they are deleted first before recreation. This asynchronous function is responsible for setting up the necessary ConfigMap and pod to enable OpenTelemetry functionality for a specific node in a Kubernetes cluster. @@ -357,15 +361,66 @@ async def create_otel_pod(node_name: str , otel_yaml) -> bool : logger.debug(f'OTEL Pod with name:{node_name} is been created') final_config_name = f"{base_configmap_name}-{node_name}" final_pod_name = f"{base_pod_name}-{node_name}" + try: + # Check if the OTEL pod already exists + existing_pod = None + try: + existing_pod = v1.read_namespaced_pod(name=final_pod_name, namespace=namespace) + logger.info(f"OTEL Pod {final_pod_name} already exists in namespace {namespace}. 
It will be deleted and recreated.") + except client.exceptions.ApiException as ex: + if ex.status == 404: + logger.debug(f"OTEL Pod {final_pod_name} does not exist in namespace {namespace}.") + else: + logger.error(f"Error while checking OTEL pod existence: {ex.reason} (code: {ex.status})") + logger.error(traceback.format_exc()) + return None, None + + # Check if the ConfigMap already exists + existing_configmap = None + try: + existing_configmap = v1.read_namespaced_config_map(name=final_config_name, namespace=namespace) + logger.info(f"ConfigMap {final_config_name} already exists in namespace {namespace}. It will be deleted and recreated.") + except client.exceptions.ApiException as ex: + if ex.status == 404: + logger.debug(f"ConfigMap {final_config_name} does not exist in namespace {namespace}.") + else: + logger.error(f"Error while checking ConfigMap existence: {ex.reason} (code: {ex.status})") + logger.error(traceback.format_exc()) + return None, None + + # If the pod exists, delete it + if existing_pod: + try: + delete_pod(v1, final_pod_name) + logger.info(f"Pod {final_pod_name} deleted successfully.") + except Exception as e: + logger.error(f"Error while deleting existing pod {final_pod_name}: {e}") + logger.error(traceback.format_exc()) + return None, None + + # If the ConfigMap exists, delete it + if existing_configmap: + try: + remove_configmap(v1, final_config_name) + logger.info(f"ConfigMap {final_config_name} deleted successfully.") + except Exception as e: + logger.error(f"Error while deleting existing ConfigMap {final_config_name}: {e}") + logger.error(traceback.format_exc()) + return None, None + + # Create the ConfigMap and OTEL pod await create_configmap(v1, final_config_name, otel_yaml) await create_pod(v1, final_pod_name, node_name, final_config_name) + logger.info(f"Successfully created OTEL ConfigMap {final_config_name} and Pod {final_pod_name}.") + except Exception as e: logger.error(f'Error creating pod for node {node_name} : {e}') logger.error(traceback.format_exc()) - return None,None + return None, None + + return final_pod_name, final_config_name - return final_pod_name , final_config_name def delete_otel_pod(node_name: str) -> bool: """ @@ -397,6 +452,7 @@ def delete_otel_pod(node_name: str) -> bool: return True + def delete_node_exporter_pod(node_name: str) -> bool: v1 = get_api_handler() @@ -410,7 +466,7 @@ def delete_node_exporter_pod(node_name: str) -> bool: return True -def create_svc_manifest(name_prefix=None,selector="otel-collector"): +def create_svc_manifest(name_prefix=None, selector="otel-collector"): """Create manifest for service-providing component using Jinja template. Returns: manifest (str): The rendered service manifest as a string. 
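The existence checks added to create_otel_pod above treat a 404 on the probe as "safe to create" and abort on any other API error. A minimal standalone sketch of the same probe-delete-recreate pattern, assuming a configured CoreV1Api handler and namespace; the helper name ensure_pod_recreated is illustrative and not part of this module:

    from kubernetes import client

    def ensure_pod_recreated(v1: client.CoreV1Api, pod_name: str, namespace: str, pod_spec: dict) -> None:
        # Probe for an existing pod; a 404 means it is safe to create directly.
        try:
            v1.read_namespaced_pod(name=pod_name, namespace=namespace)
            exists = True
        except client.exceptions.ApiException as ex:
            if ex.status != 404:
                raise  # any other API error: surface it to the caller
            exists = False
        if exists:
            # grace_period_seconds=0 mirrors delete_pod() above; deletion is
            # asynchronous, so a robust version would poll until the old pod
            # is actually gone before recreating it.
            v1.delete_namespaced_pod(name=pod_name, namespace=namespace,
                                     body=client.V1DeleteOptions(grace_period_seconds=0))
        v1.create_namespaced_pod(namespace=namespace, body=pod_spec)
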
@@ -430,18 +486,18 @@ def create_svc_manifest(name_prefix=None,selector="otel-collector"): 'name': name, 'type': "ClusterIP", 'selector': selector, - "otlp_grpc_port": int(os.getenv("MLS_OTEL_GRPC_PORT","43170")), - "otlp_http_port": int(os.getenv("MLS_OTEL_HTTP_PORT","43180")), - "otlp_prometheus_port": int(os.getenv("MLS_OTEL_PROM_PORT","9999")) + "otlp_grpc_port": int(os.getenv("MLS_OTEL_GRPC_PORT", "43170")), + "otlp_http_port": int(os.getenv("MLS_OTEL_HTTP_PORT", "43180")), + "otlp_prometheus_port": int(os.getenv("MLS_OTEL_PROM_PORT", "9999")) }) - yaml = YAML(typ='safe',pure=True) + yaml = YAML(typ='safe', pure=True) manifest_dict = yaml.load(manifest) return manifest_dict -async def create_svc(name_prefix=None,svc_manifest=None,selector=None): +async def create_svc(name_prefix=None, svc_manifest=None, selector=None): """Create a Kubernetes service. Note: For testing it deletes the service if already exists. @@ -454,14 +510,13 @@ async def create_svc(name_prefix=None,svc_manifest=None,selector=None): """ core_api = get_api_handler() if svc_manifest is None: - svc_manifest = create_svc_manifest(name_prefix,selector) + svc_manifest = create_svc_manifest(name_prefix, selector) resp = None try: logger.info('Trying to read service if already exists') resp = core_api.read_namespaced_service( name=svc_manifest['metadata']['name'], namespace=namespace) - #print(resp) except ApiException as exc: if exc.status != 404: logger.error('Unknown error reading service: %s', exc) @@ -472,22 +527,22 @@ async def create_svc(name_prefix=None,svc_manifest=None,selector=None): resp = core_api.delete_namespaced_service( name=svc_manifest['metadata']['name'], namespace=namespace) - #print(resp) except ApiException as exc: logger.error('Failed to delete service: %s', exc) try: svc_obj = core_api.create_namespaced_service(body=svc_manifest, namespace=namespace) - #print(svc_obj) return svc_obj except ApiException as exc: logger.error('Failed to create service: %s', exc) return None -async def create_node_exporter_pod_with_restart(v1: client.CoreV1Api, pod_name: str, node_name: str, flags: str, port: int) -> None: + +async def create_node_exporter_pod_with_restart(v1: client.CoreV1Api, pod_name: str, node_name: str, flags: str, + port: int) -> None: """ Checks if a pod already exists. If it exists, deletes the pod and recreates it. - + Args: v1: Kubernetes CoreV1Api client. pod_name: Name of the pod to create or restart. @@ -530,6 +585,7 @@ async def create_node_exporter_pod_with_restart(v1: client.CoreV1Api, pod_name: logger.error(traceback.format_exc()) return None + async def create_otel_pod_with_restart(node_name: str, otel_yaml: dict): """ Checks if an OpenTelemetry (OTEL) pod exists. 
If it does not exist, deletes the associated @@ -564,11 +620,10 @@ async def create_otel_pod_with_restart(node_name: str, otel_yaml: dict): # If the pod exists, delete it and its associated ConfigMap if existing_pod: try: - logger.info(f"Deleting existing OTEL Pod {final_pod_name} and ConfigMap {final_config_name}.") delete_otel_pod(node_name) - logger.info(f"Deleted OTEL Pod {final_pod_name} and ConfigMap {final_config_name} successfully.") except Exception as e: - logger.error(f"Error while deleting existing OTEL pod {final_pod_name} or ConfigMap {final_config_name}: {e}") + logger.error( + f"Error while deleting existing OTEL pod {final_pod_name} or ConfigMap {final_config_name}: {e}") logger.error(traceback.format_exc()) return final_pod_name, final_config_name # Stop on delete error diff --git a/agents/mlsysops/controllers/mechanisms.py b/agents/mlsysops/controllers/mechanisms.py index 6c74bcc..85d2dff 100644 --- a/agents/mlsysops/controllers/mechanisms.py +++ b/agents/mlsysops/controllers/mechanisms.py @@ -16,6 +16,7 @@ import importlib import os import asyncio +import traceback from mlsysops.data.state import MLSState from mlsysops.logger_util import logger @@ -113,4 +114,5 @@ def load_mechanisms_modules(self, agent_state): logger.debug(f"{self._state.active_mechanisms[mechanism_name]}") except Exception as e: - logger.error(f"Failed to load module {mechanism_name} from {file_path}: {e}") \ No newline at end of file + logger.error(f"Failed to load module {mechanism_name} from {file_path}: {e}") + logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/agents/mlsysops/controllers/policy.py b/agents/mlsysops/controllers/policy.py index 0cd3b8e..6b023c5 100644 --- a/agents/mlsysops/controllers/policy.py +++ b/agents/mlsysops/controllers/policy.py @@ -21,6 +21,7 @@ from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler +from watchdog.observers.polling import PollingObserver import asyncio @@ -77,7 +78,7 @@ def get_policy_instance(self, scope: str, id: str, policy_name: str = None ): None if no matching policy exists or an error occurs. 
""" try: - # logger.debug(f"Getting policy instance for scope: {scope} and id: {id} name {policy_name}") + logger.debug(f"Getting policy instance for scope: {scope} and id: {id} name {policy_name} active policies {self.active_policies}") if policy_name is None: # analyze calls if scope == PolicyScopes.APPLICATION.value: return self.active_policies[scope][id].items() @@ -93,7 +94,7 @@ def get_policy_instance(self, scope: str, id: str, policy_name: str = None ): return self.active_policies[scope][policy_name] except Exception as e: logger.error(f"Invalid policy instance: {e}") - logger.error(f"active_policies {traceback.format_exc()}") + logger.error(f"{traceback.format_exc()}") return None async def start_global_policies(self): @@ -103,7 +104,8 @@ async def start_global_policies(self): if policy_template.scope == PolicyScopes.GLOBAL.value: new_policy_object = policy_template.clone() new_policy_object.load_module() - new_policy_object.initialize(self.agent) + if not new_policy_object.initialize(self.agent): + return # TODO put some check, if the policies handle mechanism that are not available new_analyze_task = AnalyzeClass.AnalyzeTask( id=new_policy_object.name, @@ -123,6 +125,9 @@ async def start_application_policies(self,application_id): new_policy_object = policy_template.clone() new_policy_object.load_module() new_policy_object.initialize(self.agent) + if not new_policy_object.initialize(self.agent): + continue + if not self.active_policies[PolicyScopes.APPLICATION.value].get(application_id): self.active_policies[PolicyScopes.APPLICATION.value][application_id] = {} @@ -184,12 +189,13 @@ async def load_policy_modules(self): policy_object = Policy(policy_name, file_path) policy_object.load_module() policy_object.validate() - policy_object.initialize(self.agent) + if not policy_object.initialize(self.agent): + continue # Add the policy in the module self.state.add_policy(policy_name,policy_object) # add the global policies as templates - logger.info(f"Loaded module {policy_name} from {file_path}") + logger.info(f"Loaded policy module {policy_name} from {file_path}") except Exception as e: logger.error(f"Failed to load policy modules: {e}") @@ -228,7 +234,8 @@ async def load_core_policy_modules(self): policy_object = Policy(policy_name, file_path, core=True) policy_object.load_module() policy_object.validate() - policy_object.initialize(self.agent) + if not policy_object.initialize(self.agent): + continue # Add the policy in the module self.state.add_policy(policy_name, policy_object) # add the global policies as templates @@ -239,6 +246,7 @@ async def load_core_policy_modules(self): def handle_policy_change(self,file_path: str, event: FileEvents): filename = os.path.basename(file_path) + logger.warning(f"Policy change detected: {filename} {event}") if filename.startswith("policy-") and filename.endswith(".py"): policy_name = filename.split('-')[1].rsplit('.py', 1)[0] match event: @@ -247,11 +255,12 @@ def handle_policy_change(self,file_path: str, event: FileEvents): policy_object = Policy(policy_name, file_path) policy_object.load_module() policy_object.validate() - policy_object.initialize(self.agent) + if not policy_object.initialize(self.agent): + return # Add the policy in the module self.state.add_policy(policy_name, policy_object) # add the global policies as templates - + logger.warning(f"Added new policy to state: {policy_name} {self.state.policies} and active apps {self.active_policies}") # activate the policy new_policy_object = policy_object.clone() 
new_policy_object.load_module() @@ -271,6 +280,11 @@ def handle_policy_change(self,file_path: str, event: FileEvents): for running_application_id in self.active_policies[PolicyScopes.APPLICATION.value].keys(): self.active_policies[PolicyScopes.APPLICATION.value][running_application_id][ new_policy_object.name] = new_policy_object + if len(self.active_policies[PolicyScopes.APPLICATION.value].keys()) == 0: + # no policies add it + for application_id in self.state.applications.keys(): + logger.warning(f"starting application policy for application_id: {application_id}") + self.agent.current_loop.create_task(self.start_application_policies(application_id)) logger.debug(f"Started new Application Policy {new_policy_object.name}") logger.info(f"Loaded new policy from file: {policy_name} {file_path}") @@ -287,7 +301,7 @@ def handle_policy_change(self,file_path: str, event: FileEvents): policy_object.load_module() policy_object.validate() policy_object.initialize(self.agent) - logger.info(f"Reloaded module application {policy_name}") + logger.info(f"Reloaded application policy {policy_name}") except Exception as e: logger.error(f"Error while reloading policy modules: {e}") case FileEvents.DELETED: @@ -324,14 +338,36 @@ def start_policy_directory_monitor(self): """ directory = self.state.configuration.policy_directory + # Decide on observer type: + # - Use PollingObserver when running in Kubernetes or when files are symlinks, + # to properly detect updates to symlink targets (ConfigMap mounts). + use_polling = False + try: + # Heuristic: in Kubernetes, this env var is present + if os.environ.get("KUBERNETES_SERVICE_HOST"): + use_polling = True + else: + # If any policy file is a symlink, prefer polling for reliability + for filename in os.listdir(directory): + if filename.startswith("policy-") and filename.endswith(".py"): + file_path = os.path.join(directory, filename) + if os.path.islink(file_path): + use_polling = True + break + except Exception as e: + logger.warning(f"Falling back to default observer due to detection error: {e}") + # Set up the event handler and observer event_handler = PolicyDirectoryHandler(callback=self.handle_policy_change) - self.observer = Observer() + self.observer = PollingObserver() if use_polling else Observer() self.observer.schedule(event_handler, directory, recursive=False) # Start the observer in the background (non-blocking) self.observer.start() - logger.info(f"Started monitoring the policy directory: {directory}") + logger.info( + f"Started monitoring the policy directory: {directory} " + f"(observer={'PollingObserver' if use_polling else 'Observer'})" + ) def stop_policy_directory_monitor(self): """ @@ -363,3 +399,10 @@ def on_created(self, event): def on_deleted(self, event): if not event.is_directory: self.callback(event.src_path, FileEvents.DELETED) + + def on_moved(self, event): + # Handle atomic replace/rename patterns often used for updates + # Prefer the destination path if available. 
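+        # Note: Kubernetes delivers ConfigMap volume updates by atomically
+        # re-pointing an internal ..data symlink, which file watchers may
+        # surface as a move rather than an in-place modify - hence moves are
+        # mapped to MODIFIED here.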
+ if not event.is_directory: + new_path = getattr(event, "dest_path", event.src_path) + self.callback(new_path, FileEvents.MODIFIED) diff --git a/agents/mlsysops/controllers/telemetry.py b/agents/mlsysops/controllers/telemetry.py index 731ae2c..7769810 100644 --- a/agents/mlsysops/controllers/telemetry.py +++ b/agents/mlsysops/controllers/telemetry.py @@ -159,6 +159,10 @@ async def initialize(self): if self.agent.state.configuration.node_exporter_enabled: node_exporter_pod_port = int(os.getenv("MLS_NODE_EXPORTER_PORT", "9200")) node_exporter_flags = os.getenv("MLS_OTEL_NODE_EXPORTER_FLAGS", "os") + if node_exporter_flags == "os": + # check for agent configuration + if self.agent.state.configuration.node_exporter_collectors is not None: + node_exporter_flags = self.agent.state.configuration.node_exporter_collectors pod_name = await deploy_node_exporter_pod(self.agent.state.hostname,node_exporter_flags,node_exporter_pod_port) self.node_exporter_pod_list.append({ "node": self.agent.state.hostname, @@ -196,6 +200,10 @@ async def initialize(self): if self.agent.state.configuration.node_exporter_enabled: node_exporter_pod_port = int(os.getenv("MLS_NODE_EXPORTER_PORT", "9200")) node_exporter_flags = os.getenv("MLS_OTEL_NODE_EXPORTER_FLAGS", "os") + if node_exporter_flags == "os": + # check for agent configuration + if self.agent.state.configuration.node_exporter_collectors is not None: + node_exporter_flags = self.agent.state.configuration.node_exporter_collectors payload = {"node": self.agent.state.hostname, "port": node_exporter_pod_port, "flags": node_exporter_flags} await self.agent.send_message_to_node(self.agent.state.configuration.cluster,mlsysops.events.MessageEvents.NODE_EXPORTER_DEPLOY.value,payload) return @@ -216,7 +224,6 @@ async def initialize(self): tempo_export_endpoint=os.getenv("MLS_OTEL_TEMPO_EXPORT_ENDPOINT"), local_endpoint_metrics_expiration=str(scrape_interval + 5) + "s", otel_collector_selector="continuum-otel-collector" - ) self.local_config = parsed_otel_config @@ -230,6 +237,10 @@ async def initialize(self): if self.agent.state.configuration.node_exporter_enabled: node_exporter_pod_port = int(os.getenv("MLS_NODE_EXPORTER_PORT", "9200")) node_exporter_flags = os.getenv("MLS_OTEL_NODE_EXPORTER_FLAGS", "os") + if node_exporter_flags == "os": + # check for agent configuration + if self.agent.state.configuration.node_exporter_collectors is not None: + node_exporter_flags = self.agent.state.configuration.node_exporter_collectors pod_name = await deploy_node_exporter_pod(self.agent.state.hostname, node_exporter_flags, node_exporter_pod_port) self.node_exporter_pod_list.append({ diff --git a/agents/mlsysops/crds/MLSysOpsApplication.yaml b/agents/mlsysops/crds/MLSysOpsApplication.yaml new file mode 100644 index 0000000..a5aaf24 --- /dev/null +++ b/agents/mlsysops/crds/MLSysOpsApplication.yaml @@ -0,0 +1,331 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + # name must match the spec fields below, and be in the form: . + name: mlsysopsapps.mlsysops.eu +spec: + # group name to use for REST API: /apis// + group: mlsysops.eu + scope: Namespaced + names: + plural: mlsysopsapps + singular: mlsysopsapp + kind: MLSysOpsApp + shortNames: + - mlsapp + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + name: + type: string + description: The application name. 
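+        # Instances of this CRD are created by the continuum agent via the
+        # Kubernetes CustomObjectsApi (group mlsysops.eu, version v1, plural
+        # mlsysopsapps, namespace mlsysops); see ML_process_Behaviour further
+        # down in this diff.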
+ cluster_placement: + type: object + properties: + cluster_id: + type: array + items: + type: string + description: Array of clusters that can host the application. + components: + type: array + items: + type: object + properties: + metadata: + type: object + properties: + name: + type: string + description: The unique name of the component + uid: + type: string + description: The unique identifier of the component (not given by app provider). + node_placement: + type: object + properties: + continuum_layer: + type: array + items: + type: string + enum: + - cloud + - far_edge + - edge_infrastructure + - edge + - "*" + description: The required component placement on the continuum. "*" symbol means "anywhere on the continuum". + mobile: + type: boolean + description: Specify if the component needs to be deployed on a mobile node (optional) + labels: + type: array + items: + type: string + description: The required labels for filtering. + node: + type: string + description: The required node name to be the host of the component (optional). + sensors: + type: array + items: + type: object + properties: + camera: + type: object + properties: + model: + type: string + enum: + - d455 + - imx477 + - picamera-v2 + description: The model name of the camera sensor + camera_type: + type: string + enum: + - rgb + - nir + - thermal + - monocular + description: The camera sensor type. + minimum_framerate: + type: integer + resolution: + type: string + enum: + - 1024x768 + - 4056x3040 + temperature: + type: object + properties: + model: + type: string + enum: + - sdc30 + - ds18b20 + description: The model name of the temperature sensor + qos_metrics: + type: array + items: + type: object + properties: + application_metric_id: + type: string + description: App metric id. + target: + type: number + relation: + type: string + enum: + - lower_or_equal + - greater_or_equal + - equal + - lower_than + - greater_than + host_network: + type: boolean + description: Host networking requested for this component. + Use the host's network namespace. If this option is set, + the ports that will be used must be specified. Default to false. + runtime_class_name: + type: string + enum: + - nvidia + - default + - kata-fc + - kata-dragon + - urunc + - crun + - lunatic + - nvidia-experimental + - spin + - wasmedge + - slight + - runc + restart_policy: + type: string + enum: + - Always + - OnFailure + - Never + description: Restart policy for the container. Default to Always. + os: + type: string + enum: + - ubuntu + - kali + - zephyr + node_type: + type: string + enum: + - virtualized # In the form of a Virtual Machine + - native # Non-virtualized, including OS + - bare_metal # Non-virtualized, without OS + container_runtime: + type: string + enum: + - containerd + - docker + - embserve + - kata + - kata-qemu + - kata-clh + - kata-fc + - urunc + - nvidia + containers: + type: array + items: + type: object + properties: + image: + type: string + description: The name of the container image. + command: + type: array + items: + type: string + image_pull_policy: + type: string + enum: + - Always + - Never + - IfNotPresent + description: Image pull policy. Defaults to Always if :latest tag is specified, + or IfNotPresent otherwise. + platform_requirements: + type: object + description: The resource requirements of the container. 
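+                              # requests/limits are plain strings, presumably
+                              # following the usual Kubernetes resource syntax,
+                              # e.g. cpu: "500m", memory: "512Mi" (illustrative values).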
+ properties: + cpu: + type: object + properties: + requests: + type: string + limits: + type: string + architecture: + type: array + items: + type: string + enum: + - arm64 + - amd64 + - arm-v7 + - arm-v8 + frequency: + type: number + description: required frequency in Hz. + performance_indicator: + type: number + description: This field assists MLSysOps with an initial hint in order to + filter out nodes based on their performance capabilities. + memory: + type: object + properties: + requests: + type: string + limits: + type: string + disk: + type: string + description: required Disk space (in GB). + gpu: + type: object + properties: + model: + type: string + enum: + - k80 + - k40 + memory: + type: string + performance_indicator: + type: number + description: This field assists MLSysOps with an initial hint in order to + filter out nodes based on their performance capabilities. + ports: + type: array + items: + type: object + properties: + container_port: + type: integer + description: Number of port to expose on the component's IP address. + This must be a valid port number, 0 < x < 65536. + protocol: + type: string + enum: + - UDP + - TCP + - SCTP + description: Protocol for port. Defaults to "TCP". + description: Environment variables for the container. + env: + type: array + items: + type: object + properties: + name: + type: string + description: Name of the environment variable. + value_from: + type: object + properties: + field_ref: + type: object + properties: + field_path: + type: string + value: + type: string + description: Value of the environment variable. + description: Environment variables for the container. + required: + - containers + - metadata + component_interactions: + type: array + items: + type: object + properties: + component_name1: + type: string + description: The "source" component. + type: + type: string + enum: + - ingress + - egress + component_name2: + type: string + description: The "destination" component. + global_satisfaction: + type: object + properties: + threshold: + type: number + description: Happiness minimum required value (range (0-1]) + relation: + type: string + enum: + - greater_or_equal + - equal + - greater_than + achievement_weights: + type: array + items: + type: object + properties: + metric_id: + type: string + weight: + type: number + required: + - components diff --git a/agents/mlsysops/crds/MLSysOpsCluster.yaml b/agents/mlsysops/crds/MLSysOpsCluster.yaml new file mode 100644 index 0000000..f40171e --- /dev/null +++ b/agents/mlsysops/crds/MLSysOpsCluster.yaml @@ -0,0 +1,39 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + # name must match the spec fields below, and be in the form: . + name: mlsysopsclusters.mlsysops.eu +spec: + # group name to use for REST API: /apis// + group: mlsysops.eu + scope: Namespaced #Cluster + names: + plural: mlsysopsclusters + singular: mlsysopscluster + kind: MLSysOpsCluster + shortNames: + - mlscluster + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + description: MLSysOps cluster formal specification. + properties: + name: + type: string + description: The cluster name. + cluster_id: + type: string + description: The unique cluster identifier. + nodes: + type: array + items: + type: string + description: The set of registered nodes. 
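+          # An illustrative conforming object (hypothetical values):
+          #   apiVersion: mlsysops.eu/v1
+          #   kind: MLSysOpsCluster
+          #   metadata:
+          #     name: cluster-a
+          #   name: cluster-a
+          #   cluster_id: cluster-a
+          #   nodes: [node-1, node-2]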
+ required: + - name + - cluster_id + - nodes diff --git a/agents/mlsysops/crds/MLSysOpsContinuum.yaml b/agents/mlsysops/crds/MLSysOpsContinuum.yaml new file mode 100644 index 0000000..25a4a87 --- /dev/null +++ b/agents/mlsysops/crds/MLSysOpsContinuum.yaml @@ -0,0 +1,39 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + # name must match the spec fields below, and be in the form: . + name: mlsysopscontinuums.mlsysops.eu +spec: + # group name to use for REST API: /apis// + group: mlsysops.eu + scope: Namespaced #Cluster + names: + plural: mlsysopscontinuums + singular: mlsysopscontinuum + kind: MLSysOpsContinuum + shortNames: + - mlscontinuum + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + description: MLSysOps continuum formal specification. + properties: + name: + type: string + description: The continuum slice name. + continuum_id: + type: string + description: The unique continuum identifier. + clusters: + type: array + items: + type: string + description: The set of registered clusters. + required: + - name + - continuum_id + - clusters diff --git a/agents/mlsysops/crds/MLSysOpsNode.yaml b/agents/mlsysops/crds/MLSysOpsNode.yaml new file mode 100644 index 0000000..a10a220 --- /dev/null +++ b/agents/mlsysops/crds/MLSysOpsNode.yaml @@ -0,0 +1,167 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + # name must match the spec fields below, and be in the form: . + name: mlsysopsnodes.mlsysops.eu +spec: + # group name to use for REST API: /apis// + group: mlsysops.eu + scope: Namespaced #Cluster + names: + plural: mlsysopsnodes + singular: mlsysopsnode + kind: MLSysOpsNode + shortNames: + - mlsnode + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + name: + type: string + description: The name of the node. + labels: + type: array + items: + type: string + description: The required labels for filtering. + continuum_layer: + type: string + enum: + - cloud + - edge_infrastructure + - edge + - far_edge + cluster_id: + type: string + description: The unique cluster identifier that the node resides. + mobile: + type: boolean + description: Specify if the node is mobile or stationary. + location: + type: array + description: + This is used for fixed nodes. We assume that mobile node's + location is telemetry data which is not captured via these descriptions. + We can also assume that for mobile nodes this refers to base station's coordinates (lon, lat). + items: + type: number + sensors: + type: array + description: Available sensors on a node are presented as services provided by MLSysOps. + items: + type: object + properties: + camera: + type: object + properties: + model: + type: string + enum: + - imx415 + - imx219 + - d455 + - imx477 + - picamera-v2 + description: The model name of the camera sensor. + camera_type: + type: string + description: The camera sensor type. 
+ framerate: + type: integer + supported_resolutions: + type: array + items: + type: string + enum: + - 1024x768 + - 4056x3040 + temperature: + type: object + properties: + model: + type: string + enum: + - sdc30 + - ds18b20 + description: The model name of the temperature sensor + environment: + type: object + properties: + node_type: + type: string + enum: + - virtualized # In the form of a Virtual Machine + - native # Non-virtualized, including OS + - bare_metal # Non-virtualized, without OS + os: + type: string + enum: + - ubuntu + - kali + - zephyr + container_runtime: + type: array + items: + type: string + enum: + - containerd + - docker + - embserve + - kata + - kata-qemu + - kata-clh + - kata-fc + - urunc + - nvidia + hardware: + type: object + properties: + cpu: + type: object + properties: + model: + type: string + description: CPU model name. + architecture: + type: string + enum: + - amd64 + - arm64 + - arm-v7 + - arm-v8 + frequency: + type: array + description: All the possible CPU frequency values in Hz. + items: + type: number + performance_indicator: + type: number + description: Quantifies the processing capabilities of the platform. + memory: + type: string + description: Memory size (in GB). + disk: + type: string + description: Disk space in GB (local storage). + gpu: + type: object + properties: + model: + type: string + enum: + - k80 + - k40 + memory: + type: string + performance_indicator: + type: number + description: Quantifies the processing capabilities of the platform. + required: + - continuum_layer + - environment + - hardware \ No newline at end of file diff --git a/agents/mlsysops/data/configuration.py b/agents/mlsysops/data/configuration.py index 9b23008..5bd809d 100644 --- a/agents/mlsysops/data/configuration.py +++ b/agents/mlsysops/data/configuration.py @@ -19,27 +19,58 @@ import yaml import os +def _get_mechanisms_list(): + """ + Get mechanisms list from MLS_MECHANISM_ENABLED environment variable. + + Returns: + List[str]: List of mechanisms from comma-separated env variable, or empty list if not set. + """ + env_value = os.getenv("MLS_MECHANISM_ENABLED") + if env_value is None: + return [] + return [m.strip() for m in env_value.split(",") if m.strip()] + + +def _get_bool_env(env_var: str, default: bool): + """ + Get boolean value from environment variable. + + Args: + env_var: Environment variable name + default: Default value if env variable not set + + Returns: + bool: True if env var is 'true', '1', 'yes', 'on' (case-insensitive), False otherwise + """ + env_value = os.getenv(env_var) + if env_value is None: + return default + return env_value.lower() in ('true', '1', 'yes', 'on') + @dataclass class AgentConfig: """ Dataclass representing the agent configuration. 
""" - mechanisms: List[str] = field(default_factory=list) + mechanisms: List[str] = field(default_factory=lambda: _get_mechanisms_list()) default_telemetry_metrics: List[str] = field(default_factory=list) - policy_directory: str = "" - mechanisms_directory: str = "" + policy_directory: str = field(default_factory=lambda: os.getenv("MLS_POLICY_DIRECTORY", "/etc/mlsysops/policies")) + mechanisms_directory: str = field(default_factory=lambda: os.getenv("MLS_MECHANISMS_DIRECTORY", "/etc/mlsysops/mechanisms")) continuum_layer: str = "" behaviours: Dict[str, bool] = field(default_factory=dict) system_description: dict = field(default_factory=dict) # Telemetry - node_exporter_scrape_interval: str = "5s" - monitoring_interval: str = "5s" + node_exporter_scrape_interval: str = field(default_factory=lambda: os.getenv("MLS_NODE_EXPORTER_SCRAPE_INTERVAL", "5s")) + monitoring_interval: str = field(default_factory=lambda: os.getenv("MLS_MONITORING_INTERVAL", "5s")) - node_exporter_enabled: bool = True - otel_deploy_enabled: bool = True + node_exporter_enabled: bool = field(default_factory=lambda: _get_bool_env("MLS_NODE_EXPORTER_ENABLED", True)) + otel_deploy_enabled: bool = field(default_factory=lambda: _get_bool_env("MLS_OTEL_DEPLOY_ENABLED", True)) + node_exporter_collectors: str = field(default_factory=lambda: os.getenv("MLS_NODE_EXPORTER_METRICS", "os")) + enable_core_policies: bool = field(default_factory=lambda: _get_bool_env("MLS_CORE_POLICIES_ENABLED", True)) node: str = field(default_factory=lambda: os.getenv("NODE_NAME", socket.gethostname())) cluster: str = field(default_factory=lambda: os.getenv("CLUSTER_NAME", "")) @@ -48,6 +79,14 @@ class AgentConfig: n_jid: str = field(init=False) c_jid: str = field(init=False) + # Watchdog / TTL configuration (can be overridden by configuration or policy context) + task_ttl: float = field( + default_factory=lambda: float(os.getenv("MLS_TASK_TTL_SECONDS", "60.0")) + ) + watchdog_interval: float = field( + default_factory=lambda: float(os.getenv("MLS_WATCHDOG_INTERVAL_SECONDS", "1.0")) + ) + def __post_init__(self): """ Calculate derived fields after initialization, e.g., JIDs. @@ -57,11 +96,40 @@ def __post_init__(self): def update(self, **kwargs): """ - Updates the configuration object with new values. - :param kwargs: Key-value pairs to update the configuration. + Updates the attributes of an object based on provided keyword arguments. + Certain attributes are protected from updates if their respective environment + variables are set. If keys related to derived fields are modified, + recalculation is triggered. + + Parameters: + **kwargs: dict + Keyword arguments containing keys and corresponding values to + update the attributes of the object. + + Raises: + KeyError + If an invalid configuration key is provided that does not match + any existing attribute of the object. 
""" + # Keys that should not be updated if their corresponding env variable is set + env_protected_keys = { + "enable_core_policies": "MLS_CORE_POLICIES", + "policy_directory": "MLS_POLICY_DIRECTORY", + "mechanisms_directory": "MLS_MECHANISM_DIRECTORY", + "mechanisms": "MLS_MECHANISM_ENABLED", + "node_exporter_enabled": "MLS_NODE_EXPORTER_ENABLED", + "otel_deploy_enabled": "MLS_OTEL_DEPLOY_ENABLED", + "node_exporter_collectors": "MLS_NODE_EXPORTER_METRICS", + "monitoring_interval": "MLS_MONITORING_INTERVAL", + "node_exporter_scrape_interval": "MLS_NODE_EXPORTER_SCRAPE_INTERVAL" + } + for key, value in kwargs.items(): if hasattr(self, key): + # Skip update if this key has an env variable set + if key in env_protected_keys and os.getenv(env_protected_keys[key]) is not None: + continue + setattr(self, key, value) # Recalculate derived fields if needed if key in {"node", "cluster", "domain"}: diff --git a/agents/mlsysops/policies/policy-cluster-dynamicPlacement.py b/agents/mlsysops/policies/policy-cluster-dynamicPlacement.py index 8558805..07e8fe4 100644 --- a/agents/mlsysops/policies/policy-cluster-dynamicPlacement.py +++ b/agents/mlsysops/policies/policy-cluster-dynamicPlacement.py @@ -33,7 +33,7 @@ def initialize(): -def initial_plan(context, app_desc, system_description, components_state): +def initial_plan(context, app_desc, system_description, components_state, nodes_desc): """ Generates an initial deployment plan for application components in a distributed system. @@ -78,16 +78,31 @@ def initial_plan(context, app_desc, system_description, components_state): if node_name: logger.info(f'Component {comp_name} is static placed. No initial plan needed') continue - if components_state[comp_name]['node_placed'] is not None: - logger.info(f'Component {comp_name} already placed in {components_state[comp_name]["node_placed"]}. No initial plan needed') - continue + + # get continuum layer + continuum_layer = node_placement.get("continuum_layer", None) + candidate_nodes = [] + for node_name, node_desc in nodes_desc.items(): + logger.debug(f'node_name {node_name} node_desc {node_desc}') + if 'spec' not in node_desc: + logger.warning(f'node {node_name} does not have spec') + continue + if 'continuum_layer' in node_desc['spec'] and node_desc['spec']['continuum_layer'] == continuum_layer[0]: + logger.warning(f'node {node_name} matches continuum_layer {continuum_layer}') + candidate_nodes.append(node_name) + + # if components_state[comp_name]['node_placed'] is not None: + # logger.info(f'Component {comp_name} already placed in {components_state[comp_name]["node_placed"]}. 
No initial plan needed') + # continue # Initial deployment needed for this component + if len(candidate_nodes) > 0: + context["current_placement"] = random.choice(candidate_nodes) plan[comp_name] = [{'action': 'deploy', 'host': context["current_placement"]}] return plan, context async def analyze(context, application_description, system_description, mechanisms, telemetry, ml_connector): components_state = mechanisms['fluidity']['state']['applications'][application_description[0]['name']]['components'] - + logger.info('analyze: components_state %s', components_state) for component in application_description[0]['spec']['components']: comp_name = component['metadata']['name'] node_placement = component.get("node_placement") @@ -118,8 +133,8 @@ async def plan(context, application_description, system_description, mechanisms, # check if in the state the client app has been placed # use fluidity state for that components_state = mechanisms['fluidity']['state']['applications'][application_description[0]['name']]['components'] - - initial_plan_result, new_context = initial_plan(context, application, system_description,components_state) + nodes_desc = mechanisms['fluidity']['state']['nodes'] + initial_plan_result, new_context = initial_plan(context, application, system_description,components_state,nodes_desc) if len(initial_plan_result.keys()) > 0: # in case an initial plan exists for at least one component, we cannot send non-initial plan payload plan_result['deployment_plan'] = initial_plan_result diff --git a/agents/mlsysops/policy.py b/agents/mlsysops/policy.py index c960a05..097b25f 100644 --- a/agents/mlsysops/policy.py +++ b/agents/mlsysops/policy.py @@ -20,6 +20,7 @@ import ast import asyncio +import traceback from .logger_util import logger @@ -37,6 +38,11 @@ def initialize(self, agent): # Check if it was initialized try: policy_initial_context = self.module.initialize().copy() + node_name = policy_initial_context.get("node_name",None) + if node_name is not None: + if node_name != agent.state.configuration.node: + logger.debug(f"Policy {self.name} is not configured for node {agent.state.configuration.node}") + return False self.context.update(policy_initial_context) # Add telemetry metrics @@ -54,8 +60,10 @@ def initialize(self, agent): logger.debug(f"Policy {self.name} initialized {self.context}") self.scope = self.context['scope'] + return True except Exception as e: logger.error(f"Failed to initialize policy {self.name}: {e}") + return False def update_context(self,context): self.context = context @@ -69,6 +77,7 @@ async def analyze(self,application_description, system_description, mechanisms, analyze_result,updated_context = await self.module.analyze(self.context,application_description, system_description, mechanisms, telemetry, ml_connector) except Exception as e: logger.error(f"Error in policy analyze {self.name}: {e}") + logger.error(traceback.format_exc()) return False self.update_context(updated_context) self.last_analyze_run = time.time() @@ -80,6 +89,7 @@ async def plan(self,application_description, system_description, mechanisms, tel new_plan, updated_context = await self.module.plan(self.context,application_description, system_description, mechanisms, telemetry, ml_connector) except Exception as e: logger.error(f"Error in policy plan {self.name}: {e}") + logger.error(traceback.format_exc()) return {} self.update_context(updated_context) return new_plan diff --git a/agents/mlsysops/requirements.txt b/agents/mlsysops/requirements.txt index 1427cbd..43e37e7 100644 --- 
a/agents/mlsysops/requirements.txt +++ b/agents/mlsysops/requirements.txt @@ -1,4 +1,4 @@ -attrs==21.2.0 +attrs==25.4.0 Jinja2==3.0.3 kubernetes==32.0.1 mlstelemetry==0.3.2 @@ -6,4 +6,8 @@ pandas==2.2.3 python-dotenv==1.1.0 PyYAML==6.0.2 redis -watchdog \ No newline at end of file +watchdog +spade==3.3.3 +numpy>=1.20 +kubernetes_asyncio==32.0.0 +ruamel.yaml \ No newline at end of file diff --git a/agents/mlsysops/scheduler.py b/agents/mlsysops/scheduler.py index d9aa94a..2ce9f72 100644 --- a/agents/mlsysops/scheduler.py +++ b/agents/mlsysops/scheduler.py @@ -74,27 +74,34 @@ async def run(self): should_discard = False - # if was executed a plan earlier, then discard it. - if asset in mechanisms_touched: + # if was executed a plan earlier, for a specific mechanism and a specific application, then discard it. + # TODO: double check, touched by the same policy + if asset in mechanisms_touched and mechanisms_touched[asset]['application_id'] == plan.application_id: should_discard = True task_log = self.state.get_task_log(plan.uuid) # Check if there is a pending task log from previous runs if task_log: + logger.test( + f"|1| Debug:planuid:{str(plan.uuid)} {task_log} for plan {plan.application_id}") if (task_log['status'] == Status.PENDING.value - and task_log['mechanism'][asset] == Status.PENDING.value): + and task_log['mechanism'][asset] == Status.PENDING.value + and task_log['application_id'] == plan.application_id): + logger.test(f"|2| Debug2:planuid:{str(plan.uuid)} {task_log} for plan {plan.application_id}") should_discard = True # check if the application has been removed for this application scoped plan if (plan.application_id not in self.state.applications and plan.application_id not in self.state.active_mechanisms): # TODO easy way to do for now. 
different mechanism scope + logger.error("should discard") should_discard = True # TODO: check for fluidity debug # Check if it is core, should override the discard mechanism if not plan.core and should_discard: logger.test(f"|1| Plan planuid:{str(plan.uuid)} status:Discarded") + logger.error(f"True is: {plan.core} should discard: {should_discard}") self.state.update_task_log(plan.uuid,updates={"status": "Discarded"}) continue @@ -106,7 +113,8 @@ async def run(self): mechanisms_touched[asset] = { "timestamp": time.time(), "plan_uid": plan.uuid, - "plan": command + "plan": command, + "application_id": plan.application_id } # start execution task diff --git a/agents/mlsysops/spade/behaviors/Check_ml_deployment_Behaviour.py b/agents/mlsysops/spade/behaviors/Check_ml_deployment_Behaviour.py index 9345c59..cc796a9 100644 --- a/agents/mlsysops/spade/behaviors/Check_ml_deployment_Behaviour.py +++ b/agents/mlsysops/spade/behaviors/Check_ml_deployment_Behaviour.py @@ -25,19 +25,21 @@ def get_pod_info(comp_name, model_id, api_client): """Query Karmada proxy API to find the pod with the given component name.""" - path = "/apis/search.karmada.io/v1alpha1/proxying/karmada/proxy/api/v1/namespaces/default/pods" + path = "/apis/search.karmada.io/v1alpha1/proxying/karmada/proxy/api/v1/namespaces/mlsysops/pods" try: response = api_client.call_api( resource_path=path, method="GET", auth_settings=["BearerToken"], response_type="json", _preload_content=False ) pods = json.loads(response[0].data.decode("utf-8")) - + logger.debug(f"Found {len(pods.get('items', []))} pods in the namespace.") for pod in pods.get("items", []): - if pod["metadata"]["name"].startswith(comp_name) and pod["status"]["phase"] == "Running" \ - and model_id in pod["metadata"]["labels"].get("mlsysops.eu/app"): - logger.debug(f"Found running pod: {pod['metadata']['name']} on host: {pod['spec']['nodeName']}") - return pod["metadata"]["name"], pod["spec"]["nodeName"], pod['metadata']['labels']['mlsysops.eu/app'] + logger.debug(f"Checking pod: {pod['metadata']['name']}") + if 'mlsysops.eu/app' in pod["metadata"]["labels"]: + if pod["status"]["phase"] == "Running" \ + and model_id in pod["metadata"]["labels"].get("mlsysops.eu/app"): + logger.debug(f"Found running pod: {pod['metadata']['name']} on host: {pod['spec']['nodeName']}") + return pod["metadata"]["name"], pod["spec"]["nodeName"], pod['metadata']['labels']['mlsysops.eu/app'] except ApiException as exc: logger.error(f"Failed to fetch pods: {exc}") @@ -84,38 +86,6 @@ def get_node_ip(host, api_client): logger.error(f"Failed to resolve IP for node: {host}") return node_ip - -# def get_node_ip(host): -# # Get a list of the nodes -# nodes = get_k8s_nodes() -# node_ip = None -# for node in nodes: -# node_name = node.metadata.name -# if node.metadata.name == host: -# internal_ip = None -# external_ip = None -# addresses = node.status.addresses -# print('Addresses ' + addresses) -# for address in addresses: -# if address.type == "ExternalIP": -# external_ip = address.address -# print(f"Node: {node_name}, External IP: {external_ip}") -# elif address.type == "InternalIP": -# internal_ip = address.address -# print(f"Node: {node_name}, Internal IP: {internal_ip}") -# if external_ip == None: -# print('External IP not found for node that should be accessible externally.') -# if internal_ip == None: -# print('Internal IP not found for node that should be accessible externally.') -# else: -# node_ip = internal_ip -# else: -# node_ip = external_ip -# break -# return node_ip - - - class 
Check_ml_deployment_Behaviour(OneShotBehaviour): def __init__(self, redis_manager, model_id, comp_name, core_api): @@ -131,6 +101,7 @@ async def run(self): # Load Karmada kubeconfig and create Kubernetes API client karmada_api_kubeconfig = os.getenv("KARMADA_API_KUBECONFIG", "kubeconfigs/karmada-api.kubeconfig") + logger.debug(f"Loading Karmada kubeconfig: {karmada_api_kubeconfig}") try: config.load_kube_config(config_file=karmada_api_kubeconfig) api_client = client.ApiClient() @@ -148,7 +119,7 @@ async def run(self): logger.debug(f"Found pod: {pod_name} running on host: {host}") break - svc_path = f"/apis/search.karmada.io/v1alpha1/proxying/karmada/proxy/api/v1/namespaces/default/services/{self.comp_name}" + svc_path = f"/apis/search.karmada.io/v1alpha1/proxying/karmada/proxy/api/v1/namespaces/mlsysops/services/ml-{self.model_id}" logger.debug(f"Fetching service details from Karmada proxy API: {svc_path}") try: response = api_client.call_api( @@ -184,55 +155,4 @@ async def run(self): logger.debug(f"Pushing endpoint details to Redis: {info}") self.r.update_dict_value("endpoint_hash", self.model_id, str(info)) - await asyncio.sleep(2) - - # while True: - # pod_name = None - # # Waits until it reads a pod with the given name - # pod_name, host = get_pod_name(self.comp_name) - # # Retrieve svc endpoint info - # if pod_name is None: - # logger.debug('Failed to get status of comp with name ' + str(self.comp_name)) - # await asyncio.sleep(5) - # else: - # break - # - # svc_obj = None - # try: - # svc_obj = self.core_api.read_namespaced_service( - # name=self.comp_name, - # namespace=config.NAMESPACE) - # except ApiException as exc: - # if exc.status != 404: - # print('Unknown error reading service: ' + exc)n - # return None - # - # # Retrieve svc endpoint info - # if svc_obj is None: - # print('Failed to read svc with name ' + self.comp_name) - # # Add handling - # - # # Retrieve the assigned VIP:port - # local_endpoint = svc_obj.spec.cluster_ip + ':' + str(svc_obj.spec.ports[0].port) - # if svc_obj.spec.ports[0].node_port: - # global_endpoint_port = str(svc_obj.spec.ports[0].node_port) - # else: - # global_endpoint_port = None - # - # if self.model_id != None: - # timestamp = datetime.now() - # info = { - # 'status': 'deployed', - # 'timestamp': str(timestamp), - # 'local_endpoint': local_endpoint - # } - # - # node_ip = get_node_ip(host) - # if global_endpoint_port and node_ip: - # info['global_endpoint'] = node_ip + ':' + global_endpoint_port - # - # print('Going to push to redis_conf endpoint_queue the value ' + str(info)) - # # NOTE: PLACEHOLDER FOR REDIS - YOU CAN CHANGE THIS WITH ANOTHER TYPE OF COMMUNICATION - # self.r.update_dict_value('endpoint_hash', self.model_id, str(info)) - # - # await asyncio.sleep(2) \ No newline at end of file + await asyncio.sleep(2) \ No newline at end of file diff --git a/agents/mlsysops/spade/behaviors/HBRecieverBehaviour.py b/agents/mlsysops/spade/behaviors/HBRecieverBehaviour.py index 1d8ce23..ef8960a 100644 --- a/agents/mlsysops/spade/behaviors/HBRecieverBehaviour.py +++ b/agents/mlsysops/spade/behaviors/HBRecieverBehaviour.py @@ -29,7 +29,6 @@ def __init__(self, redis_manager): self.r = redis_manager async def run(self): - logger.debug(f"HBReceiverBehaviour") msg = await self.receive(timeout=5) if msg and msg.get_metadata("performative") == "clus_hb": node_jid = str(msg.sender).split("/")[0] diff --git a/agents/mlsysops/spade/behaviors/ML_process_Behaviour.py b/agents/mlsysops/spade/behaviors/ML_process_Behaviour.py index f33ab4a..2819604 100644 --- 
a/agents/mlsysops/spade/behaviors/ML_process_Behaviour.py +++ b/agents/mlsysops/spade/behaviors/ML_process_Behaviour.py @@ -18,6 +18,8 @@ import os import time import yaml +from ruamel.yaml import YAML + from spade.behaviour import OneShotBehaviour # Make sure to import the ML check behavior from its module. from .Check_ml_deployment_Behaviour import Check_ml_deployment_Behaviour @@ -25,15 +27,14 @@ from mlstelemetry import MLSTelemetry from ...logger_util import logger +from jinja2 import Template, PackageLoader, Environment, select_autoescape import kubernetes_asyncio from kubernetes_asyncio.client.api import CustomObjectsApi from kubernetes_asyncio.client import ApiException - +import traceback mlsTelemetryClient = MLSTelemetry("continuum", "agent") -os.environ['TELEMETRY_ENDPOINT'] = "karmada.mlsysops.eu:4317" - sleep_time = 1 @@ -41,7 +42,7 @@ def transform_description(input_dict): # Extract the name and other fields under "MLSysOpsApplication" - ml_sys_ops_data = input_dict.pop("MLSysOpsApplication", {}) + ml_sys_ops_data = input_dict.pop("MLSysOpsApp", {}) app_name = ml_sys_ops_data.pop("name", "") # Create a new dictionary with the desired structure @@ -59,7 +60,78 @@ def transform_description(input_dict): # Convert the updated dictionary to a YAML-formatted string yaml_output = yaml.dump(updated_dict, default_flow_style=False) - return yaml_output + return yaml_output, updated_dict + +def create_svc_manifest(name_suffix=None,selector=""): + """Create manifest for service-providing component using Jinja template. + Returns: + manifest (str): The rendered service manifest as a string. + """ + + loader = PackageLoader("mlsysops", "templates") + env = Environment( + loader=loader, + autoescape=select_autoescape(enabled_extensions=("j2")) + ) + template = env.get_template('ml-component-service.j2') + name = f"ml-{name_suffix}" + # Render the template with the context data + manifest = template.render({ + 'name': name, + 'type': "ClusterIP", + 'selector': selector, + "ml_comp_port": "8000", + }) + + yaml = YAML(typ='safe',pure=True) + manifest_dict = yaml.load(manifest) + + return manifest_dict + +async def create_svc(name_suffix=None,svc_manifest=None,selector=None): + """Create a Kubernetes service. + + Note: For testing it deletes the service if already exists. + + Args: + svc_manifest (dict): The Service manifest. + + Returns: + svc (obj): The instantiated V1Service object. 
+ """ + async with kubernetes_asyncio.client.ApiClient() as api_client: + namespace = "mlsysops" + core_api = kubernetes_asyncio.client.CoreV1Api(api_client) + + if svc_manifest is None: + svc_manifest = create_svc_manifest(name_suffix,selector) + resp = None + try: + logger.info('Trying to read service if already exists') + resp = await core_api.read_namespaced_service( + name=svc_manifest['metadata']['name'], + namespace=namespace) + except ApiException as exc: + if exc.status != 404: + logger.error('Unknown error reading service: %s', exc) + return None + if resp: + try: + logger.info('Trying to delete service if already exists') + await core_api.delete_namespaced_service( + name=svc_manifest['metadata']['name'], + namespace=namespace) + except ApiException as exc: + logger.error('Failed to delete service: %s', exc) + try: + logger.info(f'Trying to create service {namespace}') + logger.debug(svc_manifest) + svc_obj = await core_api.create_namespaced_service(body=svc_manifest, + namespace=namespace) + return svc_obj + except ApiException as exc: + logger.error('Failed to create service: %s', exc) + return None class ML_process_Behaviour(CyclicBehaviour): """ @@ -79,6 +151,7 @@ async def run(self): try: await kubernetes_asyncio.config.load_kube_config(config_file=karmada_api_kubeconfig) + logger.info(f" Karmada api config Loaded with external kubeconfig: {karmada_api_kubeconfig}") except kubernetes_asyncio.config.ConfigException: logger.error(f"Error loading karmada api config with external kubeconfig: {karmada_api_kubeconfig}") return @@ -94,55 +167,54 @@ async def run(self): q_info = self.r.pop(self.r.ml_q) q_info = q_info.replace("'", '"') - print(q_info) data_queue = json.loads(q_info) - if 'MLSysOpsApplication' not in data_queue: - # probably it is removal - print(f"fffff {data_queue.keys()}") - for key in data_queue.keys(): - model_id = key + logger.debug(data_queue) + if 'MLSysOpsApp' not in data_queue: + model_id = list(data_queue.keys())[0] else: - model_id = data_queue["MLSysOpsApplication"]["mlsysops-id"] - data_queue['MLSysOpsApplication']['name'] = data_queue['MLSysOpsApplication']['name'] + "-" + model_id + model_id = data_queue["MLSysOpsApp"]["components"][0]["metadata"]["uid"] + # data_queue['MLSysOpsApp']['name'] = data_queue['MLSysOpsApp']['name'] + "-" + model_id + data_queue['MLSysOpsApp']['name'] = model_id + try: - comp_name = data_queue["MLSysOpsApplication"]["components"][0]["Component"]["name"] - cluster_id = data_queue["MLSysOpsApplication"]["clusterPlacement"]["clusterID"][0] + comp_name = data_queue["MLSysOpsApp"]["components"][0]["metadata"]["name"] + cluster_id = data_queue["MLSysOpsApp"]["cluster_placement"]["cluster_id"][0] self.r.update_dict_value("ml_location", model_id, cluster_id) except KeyError: cluster_id = self.r.get_dict_value("ml_location", model_id) - print("CLUSTER ID " + str(cluster_id)) group = "mlsysops.eu" version = "v1" plural = "mlsysopsapps" - namespace = "default" + namespace = "mlsysops" name = model_id - - if self.r.get_dict_value("endpoint_hash", model_id) == "To_be_removed": + queue_state = self.r.get_dict_value("endpoint_hash", model_id) + logger.debug(f"Queue state: {queue_state}") + if queue_state == "To_be_removed": try: # Delete the existing custom resource - logger.debug(f"Deleting Custom Resource: {name}") await custom_api.delete_namespaced_custom_object( group=group, version=version, namespace=namespace, plural=plural, - name="ml-app-" + model_id - ) - logger.debug(f"Custom Resource '{name}' deleted successfully.") - await 
self.message_queue.put({ - "event": "application_removed", - "payload": data_dict - } + name=name ) + # await self.message_queue.put({ + # "event": "application_removed", + # "payload": data_queue + # } + # ) self.r.update_dict_value("endpoint_hash", model_id, "Removed") self.r.remove_key("endpoint_hash", model_id) + logger.debug(f"Custom Resource '{name}' deleted successfully.") + except ApiException as e: if e.status == 404: - print(f"Custom Resource '{name}' not found. Skipping deletion.") + logger.debug(f"Custom Resource '{name}' not found. Skipping deletion.") else: - print(f"Error deleting Custom Resource '{name}': {e}") + logger.debug(f"Error deleting Custom Resource '{name}': {e}") raise else: try: @@ -154,17 +226,17 @@ async def run(self): self.r.update_dict_value("endpoint_hash", model_id, str(info)) # Transform and parse the description - file_content = transform_description(data_queue) + file_content, updated_dict = transform_description(data_queue) yaml_handler = yaml.safe_load(file_content) cr_spec = yaml_handler - await self.message_queue.put( - { - "event": "application_submitted", - "payload": file_content - } - ) - + # await self.message_queue.put( + # { + # "event": "application_submitted", + # "payload": data_queue + # } + # ) + logger.debug(f"Creating or updating Custom Resource: {name}") try: current_resource = await custom_api.get_namespaced_custom_object( @@ -186,7 +258,6 @@ async def run(self): ) logger.debug(f"Custom Resource '{name}' updated successfully.") except ApiException as e: - logger.debug(f"Error processing Custom Resource: {e}") if e.status == 404: logger.debug(f"creating Custom Resource: {name} {group} {version} {namespace} {plural} {cr_spec}") # Resource does not exist; create it @@ -198,6 +269,10 @@ async def run(self): body=cr_spec ) logger.debug(f"Custom Resource '{name}' created successfully.") + # Create ML Component Service + await create_svc(name_suffix=model_id, + selector=f"{model_id}") + else: logger.error(f"Error processing Custom Resource: {e}") @@ -207,6 +282,7 @@ async def run(self): except Exception as e: logger.error(f"Error during deployment of '{name}': {e}") + logger.error(traceback.format_exc()) self.r.update_dict_value("endpoint_hash", model_id, "Deployment_Failed") await asyncio.sleep(1) \ No newline at end of file diff --git a/agents/mlsysops/spade/behaviors/MessageReceivingBehavior.py b/agents/mlsysops/spade/behaviors/MessageReceivingBehavior.py index 901810d..36ee831 100644 --- a/agents/mlsysops/spade/behaviors/MessageReceivingBehavior.py +++ b/agents/mlsysops/spade/behaviors/MessageReceivingBehavior.py @@ -33,7 +33,8 @@ def __init__(self, message_queue: asyncio.Queue): async def run(self): - msg = await self.receive(timeout=10) # wait for a message for 10 seconds + msg = await self.receive(timeout=5) + # logger.debug(f"Received message: {msg}") if msg: sender = str(msg._sender).split("/")[0] @@ -44,10 +45,9 @@ async def run(self): resp = Message(to=sender) resp.thread = msg.thread - logger.debug(f"Received {event} from {sender} of performative {performative}") + # logger.debug(f"Received {event} from {sender} of performative {performative}") match (performative, event): case ("request", MessageEvents.COMPONENT_PLACED.value): - logger.debug("Application Component Placed") # Decode payload payload = { "event": event, @@ -56,7 +56,6 @@ async def run(self): # inform agent for receiving await self.message_queue.put(payload) case ("request", MessageEvents.COMPONENT_REMOVED.value): - logger.debug("Application Component Removed") # 
Decode payload payload = { "event": event, @@ -65,7 +64,6 @@ async def run(self): # inform agent for receiving await self.message_queue.put(payload) case ("request", MessageEvents.OTEL_DEPLOY.value): - logger.debug(f"Received OTEL Create from {sender}") # Decode payload payload = { "event": event, @@ -73,7 +71,6 @@ async def run(self): } await self.message_queue.put(payload) case ("request", MessageEvents.OTEL_REMOVE.value): - logger.debug(f"Received OTEL remove from {sender}") # Decode payload payload = { "event": event, @@ -81,28 +78,25 @@ async def run(self): } await self.message_queue.put(payload) case ("request", MessageEvents.NODE_EXPORTER_DEPLOY.value): - logger.debug(f"Received {event} from {sender}") payload = { "event": event, "payload": json.loads(msg.body) } await self.message_queue.put(payload) case ("request", MessageEvents.NODE_EXPORTER_REMOVE.value): - logger.debug(f"Received {event} from {sender}") payload = { "event": event, "payload": json.loads(msg.body) } await self.message_queue.put(payload) case ("request", MessageEvents.NODE_SYSTEM_DESCRIPTION_SUBMITTED.value): - logger.debug(f"Received node sys desc update from {sender}") + logger.debug(f"Received {event} from {sender} of performative {msg.body}") payload = { "event": event, "payload": json.loads(msg.body) } await self.message_queue.put(payload) case ("request", MessageEvents.MESSAGE_TO_FLUIDITY.value): - logger.debug(f"Received {event} from {sender}") payload = { "event": event, "payload": json.loads(msg.body) @@ -110,13 +104,10 @@ async def run(self): await self.message_queue.put(payload) case _: try: - logger.debug(f"Received unknown event {event} from {sender} - forwarding to MLSAgent") payload = { "event": event, "payload": json.loads(msg.body) } await self.message_queue.put(payload) - except Exception: - print("Exception ;-)") - else: - logger.debug("Did not received any message after 10 seconds") + except Exception: + pass diff --git a/agents/mlsysops/spade/mls_spade.py b/agents/mlsysops/spade/mls_spade.py index acff375..574e372 100644 --- a/agents/mlsysops/spade/mls_spade.py +++ b/agents/mlsysops/spade/mls_spade.py @@ -22,30 +22,93 @@ from spade.message import Message from spade.template import Template -from .behaviors.CheckInactiveClustersBehaviour import CheckInactiveClustersBehaviour -from .behaviors.Check_ml_deployment_Behaviour import Check_ml_deployment_Behaviour -from .behaviors.HBRecieverBehaviour import HBReceiverBehaviour -from .behaviors.ML_process_Behaviour import ML_process_Behaviour -from .behaviors.ManagementModeBehaviour import ManagementModeBehaviour -from .behaviors.FailoverBehaviour import FailoverBehavior -from .behaviors.ProcessBehaviour import ProcessBehaviour from ..logger_util import logger -from .behaviors.HeartbeatBehavior import HeartbeatBehaviour -from .behaviors.MessageReceivingBehavior import MessageReceivingBehavior -from .behaviors.MessageSendingBehavior import MessageSendingBehavior -from .behaviors.SubscribeBehavior import Subscribe -from .behaviors.APIPingBehaviour import APIPingBehaviour -from .behaviors.ManageSubscriptionBehaviour import ManageSubscriptionBehaviour + +try: + from .behaviors.CheckInactiveClustersBehaviour import CheckInactiveClustersBehaviour +except Exception as e: + logger.warning(f"Failed to import CheckInactiveClustersBehaviour: {e}") + CheckInactiveClustersBehaviour = None + +try: + from .behaviors.Check_ml_deployment_Behaviour import Check_ml_deployment_Behaviour +except Exception as e: + logger.warning(f"Failed to import 
Check_ml_deployment_Behaviour: {e}") + Check_ml_deployment_Behaviour = None + +try: + from .behaviors.HBRecieverBehaviour import HBReceiverBehaviour +except Exception as e: + logger.warning(f"Failed to import HBReceiverBehaviour: {e}") + HBReceiverBehaviour = None + +try: + from .behaviors.ML_process_Behaviour import ML_process_Behaviour +except Exception as e: + logger.warning(f"Failed to import ML_process_Behaviour: {e}") + ML_process_Behaviour = None + +try: + from .behaviors.ManagementModeBehaviour import ManagementModeBehaviour +except Exception as e: + logger.warning(f"Failed to import ManagementModeBehaviour: {e}") + ManagementModeBehaviour = None + +try: + from .behaviors.FailoverBehaviour import FailoverBehavior +except Exception as e: + logger.warning(f"Failed to import FailoverBehavior: {e}") + FailoverBehavior = None + +try: + from .behaviors.ProcessBehaviour import ProcessBehaviour +except Exception as e: + logger.warning(f"Failed to import ProcessBehaviour: {e}") + ProcessBehaviour = None + +try: + from .behaviors.HeartbeatBehavior import HeartbeatBehaviour +except Exception as e: + logger.warning(f"Failed to import HeartbeatBehaviour: {e}") + HeartbeatBehaviour = None + +try: + from .behaviors.MessageReceivingBehavior import MessageReceivingBehavior +except Exception as e: + logger.warning(f"Failed to import MessageReceivingBehavior: {e}") + MessageReceivingBehavior = None + +try: + from .behaviors.MessageSendingBehavior import MessageSendingBehavior +except Exception as e: + logger.warning(f"Failed to import MessageSendingBehavior: {e}") + MessageSendingBehavior = None + +try: + from .behaviors.SubscribeBehavior import Subscribe +except Exception as e: + logger.warning(f"Failed to import Subscribe: {e}") + Subscribe = None + +try: + from .behaviors.APIPingBehaviour import APIPingBehaviour +except Exception as e: + logger.warning(f"Failed to import APIPingBehaviour: {e}") + APIPingBehaviour = None + +try: + from .behaviors.ManageSubscriptionBehaviour import ManageSubscriptionBehaviour +except Exception as e: + logger.warning(f"Failed to import ManageSubscriptionBehaviour: {e}") + ManageSubscriptionBehaviour = None from ..data.state import MLSState from mlsysops.spade.redis_mgt import RedisManager + class MLSSpade(Agent): def __init__(self, state: MLSState, message_queue: asyncio.Queue): - print(state.configuration) - print(state) super().__init__(state.configuration.n_jid, state.configuration.n_pass) - print("AFTER INIT") self.is_subscribed = None self.cluster = state.configuration.cluster @@ -53,80 +116,131 @@ def __init__(self, state: MLSState, message_queue: asyncio.Queue): self.snapshot_queue = Queue() self.message_queue = message_queue self.redis = RedisManager() - print("BEFORE redis connect") - self.redis.connect() - print("AFTER redis connect") + + if state.configuration.continuum_layer != "node": + self.redis.connect() self.state = state self.behaviours_config = state.configuration.behaviours - self.behaviour_classes = { - "APIPingBehaviour": APIPingBehaviour, - "CheckInactiveClustersBehaviour": CheckInactiveClustersBehaviour, - "Check_ml_deployment_Behaviour": Check_ml_deployment_Behaviour, - "HBReceiverBehaviour": HBReceiverBehaviour, - "HeartbeatBehaviour": HeartbeatBehaviour, - "ML_process_Behaviour": ML_process_Behaviour, - "ManagementModeBehaviour": ManagementModeBehaviour, - "ManageSubscriptionBehaviour": ManageSubscriptionBehaviour, - "MessageReceivingBehavior": MessageReceivingBehavior, - "MessageSendingBehavior": MessageSendingBehavior, - 
"ProcessBehaviour": ProcessBehaviour, - "FailoverBehaviour": FailoverBehavior, - "Subscribe": Subscribe - } - print("AFTER INIT END") + self.behaviour_classes = {} + + try: + self.behaviour_classes["APIPingBehaviour"] = APIPingBehaviour + except Exception as e: + logger.warning(f"Failed to add APIPingBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["CheckInactiveClustersBehaviour"] = CheckInactiveClustersBehaviour + except Exception as e: + logger.warning(f"Failed to add CheckInactiveClustersBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["Check_ml_deployment_Behaviour"] = Check_ml_deployment_Behaviour + except Exception as e: + logger.warning(f"Failed to add Check_ml_deployment_Behaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["HBReceiverBehaviour"] = HBReceiverBehaviour + except Exception as e: + logger.warning(f"Failed to add HBReceiverBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["HeartbeatBehaviour"] = HeartbeatBehaviour + except Exception as e: + logger.warning(f"Failed to add HeartbeatBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["ML_process_Behaviour"] = ML_process_Behaviour + except Exception as e: + logger.warning(f"Failed to add ML_process_Behaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["ManagementModeBehaviour"] = ManagementModeBehaviour + except Exception as e: + logger.warning(f"Failed to add ManagementModeBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["ManageSubscriptionBehaviour"] = ManageSubscriptionBehaviour + except Exception as e: + logger.warning(f"Failed to add ManageSubscriptionBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["MessageReceivingBehavior"] = MessageReceivingBehavior + except Exception as e: + logger.warning(f"Failed to add MessageReceivingBehavior to behaviour_classes: {e}") + + try: + self.behaviour_classes["MessageSendingBehavior"] = MessageSendingBehavior + except Exception as e: + logger.warning(f"Failed to add MessageSendingBehavior to behaviour_classes: {e}") + + try: + self.behaviour_classes["ProcessBehaviour"] = ProcessBehaviour + except Exception as e: + logger.warning(f"Failed to add ProcessBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["FailoverBehaviour"] = FailoverBehavior + except Exception as e: + logger.warning(f"Failed to add FailoverBehaviour to behaviour_classes: {e}") + + try: + self.behaviour_classes["Subscribe"] = Subscribe + except Exception as e: + logger.warning(f"Failed to add Subscribe to behaviour_classes: {e}") async def send_message(self, recipient: str, event: str, payload: dict): behavior = MessageSendingBehavior(recipient, event, payload) self.add_behaviour(behavior) - async def new_agent_appeared(self,agent_jid): + async def new_agent_appeared(self, agent_jid): pass async def setup(self): self.is_subscribed = False logger.debug("MLSSpade agent setup") - logger.debug(f"Configured behaviors: {self.behaviours_config}") for behaviour_name, config in self.behaviours_config.items(): - if not config.get("enabled", False): - continue + try: + if not config.get("enabled", False): + break - behaviour_class = self.behaviour_classes.get(behaviour_name) - if not behaviour_class: - logger.warning(f"No behavior class found for {behaviour_name}") - continue + behaviour_class = self.behaviour_classes.get(behaviour_name) + if not behaviour_class: + logger.warning(f"No behavior class found for {behaviour_name}") + continue - 
# Exclude the 'enabled' flag and get the rest of the parameters from the config - config_params = {k: v for k, v in config.items() if k != "enabled"} + # Exclude the 'enabled' flag and get the rest of the parameters from the config + config_params = {k: v for k, v in config.items() if k != "enabled"} - try: - sig = inspect.signature(behaviour_class.__init__) - except Exception as e: - logger.error(f"Cannot inspect __init__ for {behaviour_name}: {e}") - continue + try: + sig = inspect.signature(behaviour_class.__init__) + except Exception as e: + logger.error(f"Cannot inspect __init__ for {behaviour_name}: {e}") + continue - # The first parameter is 'self'; skip it. - valid_params = list(sig.parameters.keys())[1:] - # Filter config_params to include only valid constructor parameters - filtered_params = {k: v for k, v in config_params.items() if k in valid_params} + # The first parameter is 'self'; skip it. + valid_params = list(sig.parameters.keys())[1:] + # Filter config_params to include only valid constructor parameters + filtered_params = {k: v for k, v in config_params.items() if k in valid_params} - # Inject required parameters if they are missing - if "redis_manager" in valid_params and "redis_manager" not in filtered_params: - filtered_params["redis_manager"] = self.redis + # Inject required parameters if they are missing + if "redis_manager" in valid_params and "redis_manager" not in filtered_params: + filtered_params["redis_manager"] = self.redis - if "message_queue" in valid_params and "message_queue" not in filtered_params: - filtered_params["message_queue"] = self.message_queue - - if "agent_to_subscribe" in valid_params and "agent_to_subscribe" not in filtered_params: - filtered_params["agent_to_subscribe"] = self.subscription_manager + if "message_queue" in valid_params and "message_queue" not in filtered_params: + filtered_params["message_queue"] = self.message_queue + + if "agent_to_subscribe" in valid_params and "agent_to_subscribe" not in filtered_params: + filtered_params["agent_to_subscribe"] = self.subscription_manager - try: # Instantiate the behavior with the filtered (and possibly injected) parameters behaviour_instance = behaviour_class(**filtered_params) self.add_behaviour(behaviour_instance) logger.debug(f"Added behavior: {behaviour_name} with params {filtered_params}") + break except Exception as e: - logger.error(f"Error instantiating {behaviour_name} with params {filtered_params}: {e}") + logger.warning(f"Error instantiating {behaviour_name}: {e}") + continue agent_exec_ins_behaviour = MessageReceivingBehavior(self.message_queue) self.add_behaviour(agent_exec_ins_behaviour) diff --git a/agents/mlsysops/spade/redis_mgt.py b/agents/mlsysops/spade/redis_mgt.py index cb7c1b2..41d4c26 100644 --- a/agents/mlsysops/spade/redis_mgt.py +++ b/agents/mlsysops/spade/redis_mgt.py @@ -19,7 +19,7 @@ from ..logger_util import logger # Fetching environment variables with default values if not set -redis_host = os.getenv('REDIS_HOST', '172.25.27.72') # Default to '10.96.12.155' +redis_host = os.getenv('REDIS_HOST', '10.64.83.239') # Default to '10.96.12.155' redis_port = int(os.getenv('REDIS_PORT', 6379)) # Default to 6379 redis_db_number = int(os.getenv('REDIS_DB_NUMBER', 0)) # Default to 0 redis_password = os.getenv('REDIS_PASSWORD', 'secret') # Uncomment if password is needed diff --git a/agents/mlsysops/tasks/analyze.py b/agents/mlsysops/tasks/analyze.py index 01ce325..997426a 100644 --- a/agents/mlsysops/tasks/analyze.py +++ b/agents/mlsysops/tasks/analyze.py @@ -82,40 
+82,41 @@ async def process_analyze(self, active_policy: Policy): async def run(self): # TODO put some standard checks. - while True: - logger.debug(f"Analyze task for {self.id} and scope {self.scope}") - active_policies = PolicyController().get_policy_instance(self.scope, self.id) - - try: - if active_policies is not None: - for app_policy_name, app_policy in active_policies: - # logger.debug(f"Active Policy {app_policy_name} for application {self.id} calling analyze period {self.analyze_period}") - - analyze_interval = parse_interval_string(app_policy.get_analyze_period_from_context()) - self.analyze_periods.append(analyze_interval) - if analyze_interval == 0: - # run once and exit - await self.process_analyze(app_policy) - break - - # Check if we need to run analyze - if time.time() - app_policy.last_analyze_run > analyze_interval: - await self.process_analyze(app_policy) - + while True: + logger.debug(f"Analyze task for {self.id} and scope {self.scope}") + active_policies = PolicyController().get_policy_instance(self.scope, self.id) + + try: + if active_policies is not None: + for app_policy_name, app_policy in active_policies: + logger.debug(f"Active Policy {app_policy_name} for application {self.id} calling analyze period {self.analyze_period}") + + analyze_interval = parse_interval_string(app_policy.get_analyze_period_from_context()) + self.analyze_periods.append(analyze_interval) + if analyze_interval == 0: + # run once and exit + await self.process_analyze(app_policy) + break + + # Check if we need to run analyze + if time.time() - app_policy.last_analyze_run > analyze_interval: + await self.process_analyze(app_policy) + + if len(self.analyze_periods) > 0: self.analyze_period = min(self.analyze_periods) self.analyze_periods = [] - else: - logger.warn(f"No policy for {self.id}") - - await asyncio.sleep(self.analyze_period) - except asyncio.CancelledError: - # Handle task cancellation logic here (clean up if necessary) - logger.debug(f"Analyze Task for {self.id} {self.scope} has been cancelled") - return # Propagate the cancellation so the task actually stops - except Exception as e: - # Handle other exceptions - logger.error(f"Unexpected exception in AnalyzeTask: {e}") - logger.error(traceback.format_exc()) - await asyncio.sleep(self.analyze_period) + else: + logger.warn(f"No policy for {self.id}") + + await asyncio.sleep(self.analyze_period) + except asyncio.CancelledError: + # Handle task cancellation logic here (clean up if necessary) + logger.debug(f"Analyze Task for {self.id} {self.scope} has been cancelled") + return # Propagate the cancellation so the task actually stops + except Exception as e: + # Handle other exceptions + logger.error(f"Unexpected exception in AnalyzeTask: {e}") + logger.error(traceback.format_exc()) + await asyncio.sleep(self.analyze_period) diff --git a/agents/mlsysops/tasks/monitor.py b/agents/mlsysops/tasks/monitor.py index 964889e..d2b2110 100644 --- a/agents/mlsysops/tasks/monitor.py +++ b/agents/mlsysops/tasks/monitor.py @@ -15,6 +15,7 @@ import asyncio import time +import traceback from datetime import datetime from typing import Any, Optional, Dict, List from mlstelemetry import MLSTelemetry @@ -136,8 +137,9 @@ async def run(self): await self.__data.add_entry(entry) except Exception as e: - #logger.error(f"Error fetching telemetry for metric '{metric_name}': {str(e)}") - pass + logger.warning(f"Error fetching telemetry for metric '{metric_name}'") + logger.warning(traceback.format_exc()) + # Fetch mechanisms state for mechanism_key, 
mechanism_object in self.state.active_mechanisms.items(): mechanism_object['state'] = mechanism_object['module'].get_state() diff --git a/agents/mlsysops/tasks/watchdog.py b/agents/mlsysops/tasks/watchdog.py new file mode 100644 index 0000000..87be495 --- /dev/null +++ b/agents/mlsysops/tasks/watchdog.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025. MLSysOps Consortium +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time + +from mlsysops.data.state import MLSState +from mlsysops.data.task_log import Status +from mlsysops.logger_util import logger + + +class WatchdogTask: + """ + Periodically scans MLSState.task_log and marks Pending tasks as Failed + when they exceed the configured TTL. + + TTL source order: + 1. Per-task override in 'arguments' (if provided by policy) under key 'ttl'. + 2. Global state.task_ttl (can be set from environment or policy context). + """ + + def __init__(self, state: MLSState): + self.state = state + self.interval_seconds = float(self.state.configuration.watchdog_interval) + + async def run(self): + while True: + try: + await asyncio.sleep(self.interval_seconds) + self._check_for_expired_tasks() + except asyncio.CancelledError: + logger.debug("WatchdogTask cancelled, stopping.") + break + except Exception as exc: + logger.warning(f"WatchdogTask encountered an error: {exc}") + + def _check_for_expired_tasks(self): + df = self.state.task_log + if df.empty: + return + + now = time.time() + + # Only Pending tasks + pending_mask = df["status"] == Status.PENDING.value + if not pending_mask.any(): + return + + pending_df = df[pending_mask] + + logger.debug(f"Found {len(pending_df)} pending tasks to check for expiration.") + + for _, row in pending_df.iterrows(): + try: + start_time = float(row["start_time"]) + except Exception: + # If start_time is malformed, skip this row + continue + + # Per-task TTL from arguments if provided by policy + per_task_ttl = None + arguments = row.get("arguments") + if isinstance(arguments, dict): + per_task_ttl = arguments.get("ttl") + + try: + ttl = float(per_task_ttl) if per_task_ttl is not None else float(self.state.configuration.task_ttl) + except Exception: + ttl = float(self.state.configuration.task_ttl) + + if now - start_time > ttl: + plan_uid = row["uuid"] + logger.debug( + f"Watchdog expiring task {plan_uid}: " + f"start_time={start_time}, now={now}, ttl={ttl}" + ) + # Mark as Failed and update end_time + self.state.update_task_log( + plan_uid, + updates={ + "status": Status.FAILED.value, + "end_time": now, + }, + ) diff --git a/agents/mlsysops/templates/ml-component-service.j2 b/agents/mlsysops/templates/ml-component-service.j2 new file mode 100644 index 0000000..912d060 --- /dev/null +++ b/agents/mlsysops/templates/ml-component-service.j2 @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: "{{ name }}" + namespace: "mlsysops" +spec: + type: "{{ type }}" + ports: + - name: "ml-component" + port: {{ ml_comp_port }} + protocol: "TCP" + targetPort: {{ ml_comp_port }} + selector: + 
mlsysops.eu/app: "{{ selector }}" \ No newline at end of file diff --git a/agents/mlsysops/templates/otel-config.yaml.j2 b/agents/mlsysops/templates/otel-config.yaml.j2 index 4af33ea..7b389f6 100644 --- a/agents/mlsysops/templates/otel-config.yaml.j2 +++ b/agents/mlsysops/templates/otel-config.yaml.j2 @@ -69,6 +69,8 @@ processors: - key: node_name action: insert value: '${env:NODE_HOSTNAME}' + batch: + timeout: 1s service: pipelines: @@ -76,18 +78,22 @@ service: receivers: [otlp,prometheus {%- if k8s_cluster_receiver is not none -%} ,k8s_cluster{%- endif -%} ] - processors: [attributes/add_labels] + processors: [attributes/add_labels,batch] exporters: [ prometheus, {%- if otlp_export_endpoint is not none -%}otlp{%- if mimir_export_endpoint is not none or tempo_export_endpoint is not none -%}, {%- endif -%}{%- endif -%} {%- if mimir_export_endpoint is not none -%}prometheusremotewrite{%- if tempo_export_endpoint is not none -%}, {%- endif -%}{%- endif -%} {%- if tempo_export_endpoint is not none -%}tempo{%- endif -%} - ] {% if loki_export_endpoint is not none %} + ] + {%- if otlp_export_endpoint is not none or loki_export_endpoint is not none %} logs: receivers: [ otlp {%- if k8s_cluster_receiver is not none -%} ,k8s_cluster, k8s_events{%- endif -%} ] processors: [ ] - exporters: [ loki,otlp ] - {%- endif -%} + exporters: [ + {%- if otlp_export_endpoint is not none -%}otlp{%- if loki_export_endpoint is not none -%}, {% endif -%}{%- endif -%} + {%- if loki_export_endpoint is not none -%}loki{%- endif -%} + ] + {%- endif %} diff --git a/agents/mlsysops/utilities.py b/agents/mlsysops/utilities.py index b4a26b9..30cd01c 100644 --- a/agents/mlsysops/utilities.py +++ b/agents/mlsysops/utilities.py @@ -16,6 +16,7 @@ from .logger_util import logger import operator +import re def evaluate_condition(a, b, operator: str) -> bool: """ @@ -250,4 +251,37 @@ def node_matches_requirements(node, comp_spec): if not cmp_fields(gpu_perf, node_gpu.get("performance_indicator", None), operator.gt, "gpu perf indicator"): return False - return True \ No newline at end of file + return True + +def parse_analyze_interval(interval: str) -> int: + """ + Parses an analyze interval string in the format 'Xs|Xm|Xh|Xd' and converts it to seconds. + + Args: + interval (str): The analyze interval as a string (e.g., "5m", "2h", "1d"). + + Returns: + int: The interval in seconds. + + Raises: + ValueError: If the format of the interval string is invalid. 
+ """ + # Match the string using a regex: an integer followed by one of s/m/h/d + match = re.fullmatch(r"(\d+)([smhd])", interval) + if not match: + raise ValueError(f"Invalid analyze interval format: '{interval}'") + + # Extract the numeric value and the time unit + value, unit = int(match.group(1)), match.group(2) + + # Convert to seconds based on the unit + if unit == "s": # Seconds + return value + elif unit == "m": # Minutes + return value * 60 + elif unit == "h": # Hours + return value * 60 * 60 + elif unit == "d": # Days + return value * 24 * 60 * 60 + else: + raise ValueError(f"Unsupported time unit '{unit}' in interval: '{interval}'") diff --git a/agents/node/.dockerignore b/agents/node/.dockerignore new file mode 100644 index 0000000..f1b149f --- /dev/null +++ b/agents/node/.dockerignore @@ -0,0 +1,3 @@ +.env +*.log +*.csv \ No newline at end of file diff --git a/agents/node/Dockerfile b/agents/node/Dockerfile index 5f0cb9c..abeb041 100644 --- a/agents/node/Dockerfile +++ b/agents/node/Dockerfile @@ -1,20 +1,21 @@ # Base image FROM harbor.nbfc.io/proxy_cache/library/python:3.10-slim -# Set up a working directory -WORKDIR /workdir - -# Copy requirements to the image -COPY requirements.txt /workdir - -# Install dependencies from requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - # Export PYTHONPATH for the working directory ENV PYTHONPATH=/workdir +WORKDIR /workdir + # Copy all application files into the image -COPY . /workdir +COPY ./node /workdir/node +COPY ./mlsysops /workdir/mlsysops + +# Set up a working directory +WORKDIR /workdir/node + +# Install dependencies from requirements.txt +RUN pip install --no-cache-dir -r /workdir/mlsysops/requirements.txt +RUN pip install --no-cache-dir -r /workdir/node/requirements.txt # Default command to start the application CMD ["python3", "main.py"] \ No newline at end of file diff --git a/agents/node/MLSNodeAgent.py b/agents/node/MLSNodeAgent.py index 90fc557..0d09e31 100644 --- a/agents/node/MLSNodeAgent.py +++ b/agents/node/MLSNodeAgent.py @@ -48,6 +48,9 @@ async def run(self): # TODO make it optional fluidity_proxy_task = asyncio.create_task(self.fluidity_proxy_message_listener()) self.running_tasks.append(fluidity_proxy_task) + # TODO make it optional + vaccel_task = asyncio.create_task(self.vaccel_message_listener()) + self.running_tasks.append(vaccel_task) # sending sync request await self.send_message_to_node(self.state.configuration.cluster, MessageEvents.NODE_STATE_SYNC.value, {"node": self.state.configuration.node}) @@ -194,3 +197,45 @@ async def fluidity_proxy_message_listener(self): logger.error(f"fluidityproxy_message_listener: Error processing msg: {e}") await asyncio.sleep(1) print(f"MLSAGENT:::: stopping fluidity message listener.... ") + + async def vaccel_message_listener(self): + """ + Handles incoming messages from the fluidity proxy message queue, processes the + received events, and executes corresponding actions based on event types. + + Raises + ------ + asyncio.CancelledError + Raised when the task is cancelled while awaiting. + Exception + General exception raised if an unexpected error occurs during message + processing. + + Returns + ------- + None + """ + logger.debug(f"MLSAGENT Node:::: Starting vaccel proxy message listener.... 
") + while True: + try: + msg = await self.mechanisms_controller.queues['vaccel']['outbound'].get() + + event = msg.get("event") + data = msg.get("payload") + logger.debug(f"Received msg from vaccel event { event }: { data }") + + match event: + case MessageEvents.PLAN_EXECUTED.value: + await self.update_plan_status(data['plan_uid'], "vaccel", data['status']) + case _: + logger.error(f"Received msg from vaccel mechanism with wrong event") + + except asyncio.CancelledError: + logger.debug(f"vaccel: CancelledError") + break + except Exception as e: + logger.error(f"vaccel: Error processing msg: {e}") + print(self.mechanisms_controller.queues) + print(traceback.format_exc()) + await asyncio.sleep(1) + print(f"MLSAGENT:::: stopping vaccel listener.... ") diff --git a/agents/node/config.yaml b/agents/node/config.yaml index 54bfb9c..e05bd2b 100644 --- a/agents/node/config.yaml +++ b/agents/node/config.yaml @@ -1,29 +1,31 @@ - mechanisms: - - "fluidity_proxy" - default_telemetry_metrics: "None" - policy_directory: "/etc/mlsysops/policies" - mechanisms_directory: "mechanisms" - continuum_layer: "node" - monitor_data_retention_time: 30 - node_exporter_scrape_interval: 10s - node_description: descriptions/csl-rpi5-1 - - behaviours: - APIPingBehaviour: - enabled: False - Check_ml_deployment_Behaviour: # This is oneshot behaviour should be False always - enabled: False - CheckInactiveClustersBehaviour: - enabled: False - period: 10 - HBReceiverBehaviour: - enabled: True - HeartbeatBehaviour: - enabled: False - period: 10 # Example parameter for PeriodicBehaviour - ManagementModeBehaviour: - enabled: False - ManageSubscriptionBehaviour: - enabled: False - Subscribe: - enabled: True +mechanisms: + - "fluidity_proxy" + - "vaccel" +default_telemetry_metrics: "None" +mechanisms_directory: "mechanisms" +continuum_layer: "node" +monitor_data_retention_time: 30 +node_exporter_scrape_interval: 10s +policy_directory: "policies" +behaviours: + APIPingBehaviour: + enabled: False + Check_ml_deployment_Behaviour: # This is oneshot behaviour should be False always + enabled: False + CheckInactiveClustersBehaviour: + enabled: False + period: 10 + HBReceiverBehaviour: + enabled: True + HeartbeatBehaviour: + enabled: False + period: 10 # Example parameter for PeriodicBehaviour + ManagementModeBehaviour: + enabled: False + ManageSubscriptionBehaviour: + enabled: False + Subscribe: + enabled: False + MessageReceivingBehavior: + enabled: True + message_queue: "message_queue" diff --git a/agents/node/descriptions/mls-compute-vm2.yaml b/agents/node/descriptions/mls-compute-vm2.yaml new file mode 100644 index 0000000..b7a9728 --- /dev/null +++ b/agents/node/descriptions/mls-compute-vm2.yaml @@ -0,0 +1,17 @@ +MLSysOpsNode: + name: mls-compute-vm2 # Change this accordingly. E.g. 
labnuc05-b1-node8 + cluster_id: mls-test-manage + continuum_layer: far_edge + sensors: + - temperature: + model: "dummy" # NOTE: fill in or remove + environment: + node_type: native + os: zephyr + container_runtime: + - containerd + hardware: + cpu: + architecture: amd64 + memory: "4" # Change if needed + disk: "120" # Change if needed \ No newline at end of file diff --git a/agents/node/mechanisms/vaccel.py b/agents/node/mechanisms/vaccel.py new file mode 100644 index 0000000..e41e072 --- /dev/null +++ b/agents/node/mechanisms/vaccel.py @@ -0,0 +1,117 @@ +import os +import asyncio +from typing import Any, Dict + +import requests + +from mlsysops.events import MessageEvents +from mlsysops.data.task_log import Status + +BACKEND_API_URL = os.getenv("BACKEND_API_URL", "http://10.64.82.70:5000") + +inbound_queue = None +self_outbound_queue = None + +def initialize(inbound_queue, outbound_queue, agent_state=None): + """ + Optional initialization hook for the agent. + + Currently a no-op, but you can wire inbound/outbound queues or agent_state + here if needed later. + """ + global self_outbound_queue + # Placeholder for future use + print(f"ini {outbound_queue}") + self_outbound_queue = outbound_queue + return + + +async def apply(value: Dict[str, Any]) -> bool: + """ + Apply a backend configuration by calling the /set-backend endpoint. + + Expected `value` structure: + { + "backend": "", # e.g. "stock", "vaccel-local", "vaccel-bf" + "remote_address": "" # optional, for remote backends + } + """ + global self_outbound_queue + backend_name = value.get("backend") + remote_address = value.get("remote_address", None) + + if not backend_name: + print("apply() called without 'backend' in value") + return False + + payload = { + "name": backend_name, + "remote_address": remote_address, + } + + url = f"{BACKEND_API_URL}/set-backend" + + def _post(): + return requests.post(url, json=payload, timeout=5) + + try: + response = await asyncio.to_thread(_post) + if not response.ok: + print(f"Failed to set backend: {response.status_code} {response.text}") + await self_outbound_queue.put({ + "event": MessageEvents.PLAN_EXECUTED.value, + "payload": { + "plan_uid": value["plan_uid"], + "status": Status.FAILED.value + } + }) + return False + print(f"Successfully set backend to {backend_name} {response.json()}") + await self_outbound_queue.put({ + "event": MessageEvents.PLAN_EXECUTED.value, + "payload": { + "plan_uid": value["plan_uid"], + "status": Status.COMPLETED.value + } + }) + return True + except Exception as e: + print(f"Error calling {url}: {e}") + return False + + +def get_options() -> Dict[str, Any]: + """ + Fetch available backend options from /get-backends endpoint. + + Returns the JSON response on success, or {} on failure. + """ + url = f"{BACKEND_API_URL}/get-backends" + try: + response = requests.get(url, timeout=5) + if not response.ok: + print(f"Failed to fetch backends: {response.status_code} {response.text}") + return {} + return response.json() + except Exception as e: + print(f"Error calling {url}: {e}") + return {} + + +def get_state() -> Dict[str, Any]: + """ + Return current state for this agent by querying the active backend + from /get-active-backend. + + Returns the JSON response on success, or {} on failure. 
+ """ + url = f"{BACKEND_API_URL}/get-active-backend" + try: + response = requests.get(url, timeout=5) + if not response.ok: + print(f"Failed to fetch active backend: {response.status_code} {response.text}") + return {} + return response.json() + except Exception as e: + print(f"Error calling {url}: {e}") + return {} \ No newline at end of file diff --git a/agents/node/policies/policy-CPUFrequencyConfigurator.py b/agents/node/policies/policy-CPUFrequencyConfigurator.py index bcb8d5d..35058b4 100644 --- a/agents/node/policies/policy-CPUFrequencyConfigurator.py +++ b/agents/node/policies/policy-CPUFrequencyConfigurator.py @@ -70,7 +70,8 @@ def initialize(): }, "latest_timestamp": None, "core": False, - "scope": "global" + "scope": "global", + "node_name": "mls-compute-vm3" } return initialContext diff --git a/agents/node/policies/policy-vaccel.py b/agents/node/policies/policy-vaccel.py new file mode 100644 index 0000000..2e2468f --- /dev/null +++ b/agents/node/policies/policy-vaccel.py @@ -0,0 +1,136 @@ +"""Plugin module for custom policies - notify function.""" +from __future__ import print_function +import copy +import logging +import inspect +from itertools import cycle +import random +import time +import re +from mlsysops.logger_util import logger +from mlstelemetry import MLSTelemetry + +mlsClient = MLSTelemetry("policy", "policy_vaccel") + +def parse_analyze_interval(interval: str) -> int: + """ + Parses an analyze interval string in the format 'Xs|Xm|Xh|Xd' and converts it to seconds. + + Args: + interval (str): The analyze interval as a string (e.g., "5m", "2h", "1d"). + + Returns: + int: The interval in seconds. + + Raises: + ValueError: If the format of the interval string is invalid. + """ + # Match the string using a regex: an integer followed by one of s/m/h/d + match = re.fullmatch(r"(\d+)([smhd])", interval) + if not match: + raise ValueError(f"Invalid analyze interval format: '{interval}'") + + # Extract the numeric value and the time unit + value, unit = int(match.group(1)), match.group(2) + + # Convert to seconds based on the unit + if unit == "s": # Seconds + return value + elif unit == "m": # Minutes + return value * 60 + elif unit == "h": # Hours + return value * 60 * 60 + elif unit == "d": # Days + return value * 24 * 60 * 60 + else: + raise ValueError(f"Unsupported time unit '{unit}' in interval: '{interval}'") + +def initialize(): + initialContext = { + "telemetry": { + "metrics": ["vaccel_dynamicity_enabled","vaccel_dynamicity_threshold", "processing_time", "inference_time"], + "system_scrape_interval": "1s" + }, + "mechanisms": [ + "vaccel" + ], + "packages": [], + "configuration": { + "analyze_interval": "5s" + }, + "latest_timestamp": None, + "core": False, + "scope": "global", + "curr_comp_idx": 0, + "current_placement": None, + "initial_deployment_finished": False, + "moving_interval": "600s", + "dynamic_placement_comp": None, + "threshold_epsilon": 50 + } + + return initialContext + + +async def get_metric(metric_name, telemetry): + # Get latest values from telemetry data + component_measured_metric = None + try: + latest_telemetry_df = await telemetry['query'](latest=True) + # latest_telemetry_df = telemetry['query'](latest=True) + component_measured_metric = latest_telemetry_df[metric_name].values[0] + logger.debug(f"metric {metric_name} measurement {component_measured_metric} ") + except Exception as e: + logger.error(f"Failed to get metric {metric_name}") + + return component_measured_metric + +async def analyze(context, application_description, 
system_description, mechanisms, telemetry, ml_connector):
+    is_enabled = await get_metric("vaccel_dynamicity_enabled", telemetry)
+    threshold = await get_metric("vaccel_dynamicity_threshold", telemetry)
+    processing_time = await get_metric("processing_time", telemetry)
+    inference_time = await get_metric("inference_time", telemetry)
+
+    if is_enabled and processing_time > threshold:
+        logger.info(f"Processing time {processing_time} is above threshold {threshold}")
+        return True, context
+
+    if is_enabled and processing_time < (threshold - context['threshold_epsilon']):
+        logger.info(f"Processing time {processing_time} is below threshold {threshold} with epsilon {context['threshold_epsilon']}")
+        return True, context
+
+    logger.info(f"No adaptation needed: processing time {processing_time}, threshold {threshold}")
+    return False, context
+
+
+async def plan(context, application_description, system_description, mechanisms, telemetry, ml_connector):
+    threshold = await get_metric("vaccel_dynamicity_threshold", telemetry)
+    processing_time = await get_metric("processing_time", telemetry)
+    inference_time = await get_metric("inference_time", telemetry)
+    logger.info(f"Processing time {processing_time} and threshold {threshold}")
+
+    current_backend = mechanisms['vaccel']['state']['backend']
+    current_options = mechanisms['vaccel']['options']
+    logger.info(f"Current backend {current_backend} and options {current_options}")
+
+    if processing_time > threshold:
+        plan_result = {
+            "backend": "vaccel-bf"
+        }
+    elif processing_time < (threshold - context['threshold_epsilon']):
+        plan_result = {
+            "backend": "vaccel-cpu"
+        }
+    else:
+        # Within the hysteresis band: keep the current backend.
+        return {}, context
+
+    if current_backend == plan_result['backend']:
+        return {}, context
+
+    new_plan = {
+        "vaccel": plan_result
+    }
+
+    logger.info('plan: New plan %s', new_plan)
+
+    return new_plan, context
diff --git a/agents/node/requirements.txt b/agents/node/requirements.txt
index 496e12d..4ca805f 100644
--- a/agents/node/requirements.txt
+++ b/agents/node/requirements.txt
@@ -1,4 +1,3 @@
-mlsysops
 mlstelemetry
 opentelemetry-exporter-otlp
 opentelemetry-api
diff --git a/agents/setup.py b/agents/setup.py
index 10596ee..49925a7 100644
--- a/agents/setup.py
+++ b/agents/setup.py
@@ -28,7 +28,7 @@
         "watchdog"
     ],
     package_data={
-        "mlsysops": ["templates/*.j2","policies/*.py"],
+        "mlsysops": ["templates/*.j2","policies/*.py", "crds/*.yaml"],
     },
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/agents/tests/application/test_CR.yaml b/agents/tests/application/test_CR.yaml
index cc93bf4..13a37e7 100644
--- a/agents/tests/application/test_CR.yaml
+++ b/agents/tests/application/test_CR.yaml
@@ -1,78 +1,62 @@
 apiVersion: mlsysops.eu/v1
-kind: MLSysOpsApp
-metadata:
-  name: test-application
-clusterPlacement:
-  clusterID:
-    - "uth-prod-cluster"
-  instances: 1
+cluster_placement:
+  cluster_id:
+    - mls-test-manage
+component_interactions:
+  - component_name1: client-app
+    component_name2: server-app
+    type: egress
 components:
-  - Component:
+  - containers:
+      - command:
+          - python
+          - TcpServer.py
+        env:
+          - name: OTEL_RESOURCE_ATTRIBUTES
+            value: >-
+              service.name=server-app, service.version=0.0.0,
+              service.experimentid=test
+          - name: OTEL_SERVICE_NAME
+            value: server-app
+          - name: NODE_IP
+          - name: TELEMETRY_ENDPOINT
+            value: $(NODE_IP):43170
+          - name: TCP_SERVER_IP
+            value: 0.0.0.0
+        image: registry.mlsysops.eu/agent/agents/test_app:0.0.1
+        image_pull_policy: Always
+        ports:
+          - container_port: 10000
+            protocol: TCP
+    metadata:
       name: server-app
-      nodePlacement:
-        continuumLayer:
-          - Edge
-        node: csl-rpi5-1
-
labels: - - "node-type:edge" - Scaling: - scalingMode: manual - instances: 1 - restartPolicy: OnFailure - containers: - - image: registry.mlsysops.eu/agent/agents/test_app:0.0.0 - imagePullPolicy: IfNotPresent - command: ["python", "TcpServer.py"] - env: - - name: OTEL_RESOURCE_ATTRIBUTES - value: "service.name=server-app, service.version=0.0.0, service.experimentid=test" - - name: OTEL_SERVICE_NAME - value: "server-app" - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - - name: TELEMETRY_ENDPOINT - value: "$(NODE_IP):43170" - - name: TCP_SERVER_IP - value: "0.0.0.0" - ports: - - containerPort: 10000 - protocol: TCP - - Component: + uid: 1asd23123 + restart_policy: OnFailure + - containers: + - command: + - python + - TcpClient.py + env: + - name: OTEL_RESOURCE_ATTRIBUTES + value: >- + service.name=server-app, service.version=0.0.0, + service.experimentid=test + - name: OTEL_SERVICE_NAME + value: server-app + - name: NODE_IP + - name: TELEMETRY_ENDPOINT + value: $(NODE_IP):43170 + - name: TCP_SERVER_IP + value: server-app + image: registry.mlsysops.eu/agent/agents/test_app:0.0.1 + image_pull_policy: Always + ports: + - container_port: 10000 + protocol: TCP + metadata: name: client-app - nodePlacement: - continuumLayer: - - Edge - node: csl-vader - labels: - - "node-type:edge" - Scaling: - scalingMode: manual - instances: 1 - restartPolicy: OnFailure - containers: - - image: registry.mlsysops.eu/agent/agents/test_app:0.0.0 - imagePullPolicy: IfNotPresent - command: ["python", "TcpClient.py"] - env: - - name: OTEL_RESOURCE_ATTRIBUTES - value: "service.name=server-app, service.version=0.0.0, service.experimentid=test" - - name: OTEL_SERVICE_NAME - value: "server-app" - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - - name: TELEMETRY_ENDPOINT - value: "$(NODE_IP):43170" - - name: TCP_SERVER_IP - value: "server-app" - ports: - - containerPort: 10000 - protocol: TCP -componentInteractions: - - componentName1: client-app - type: egress - componentName2: server-app - + restart_policy: OnFailure +kind: MLSysOpsApp +metadata: + name: test-application + namespace: mlsysops \ No newline at end of file diff --git a/mlconnector/README.md b/mlconnector/README.md index 80377fe..3de85bf 100644 --- a/mlconnector/README.md +++ b/mlconnector/README.md @@ -34,8 +34,6 @@ This is used for internal communication of the varrious services. You can setup - `POSTGRES_DB`: PostgreSQL database name (default, `mlmodel`) - `POSTGRES_USER`: PostgreSQL username (default, `postgres`) - `POSTGRES_PASSWORD`: PostgreSQL password (default, `strongpassword`) -- `PGADMIN_DEFAULT_EMAIL`: pgAdmin default login email (default, `user@mail.com`) -- `PGADMIN_DEFAULT_PASSWORD`: pgAdmin default login password (default, `strongpassword`) - `DB_HOST_NAME`: Database host (e.g., `database`, This corresponds to the name of the container) - `DB_PORT`: Database port (default: `5432`) - `DB_DRIVER`: Database driver string (default, `postgresql+asyncpg`) **NOTE:** Only use an async driver diff --git a/mlconnector/api_full_documentation.md b/mlconnector/api_full_documentation.md index 0e3d7b6..8477e52 100644 --- a/mlconnector/api_full_documentation.md +++ b/mlconnector/api_full_documentation.md @@ -647,11 +647,12 @@ reg = Ridge(alpha=1.0, random_state=0) reg.fit(X, y) ... 
-# It is important that all models are saved with a .pkl extension
-# Serialize with pickle to a .pkl file
+# Serialize with pickle to a .pkl file or any other format
 output_path = "diabetes_ridge.pkl"
 with open(output_path, "wb") as f:
     pickle.dump(reg, f)
+# Alternatively, serialize with joblib: joblib.dump(reg, output_path)
+# or load the model inside a custom predict function (see the inference section)
```

## 2. Register ML model with
@@ -719,9 +720,12 @@ The above step should return a model_id that will be used in the next steps. Her
- Model file (pickled file saved in step one above)
- Training data. This will be used for explainability and drift detection. (Note, it has to be the exact same data used to train the model, otherwise you will get wrong results)
- Requirements file that defines the environment the model was trained in.
+- A custom predict function, if you will use a different one (see the inference section).

Upload these one by one using the example below; Note: file_kind can be `model`, `data`, `code`, and `env`
+
+
```python
import requests

files = {
    "file": open("model.pkl", "rb"),
    "file_kind": (None, "model")
}
-resp = requests.post("BASE_URL/model/1234/upload", files=files)
+resp = requests.post("BASE_URL/model/{model_id}/upload", files=files)
print(resp.json())
```

## 3. Deployment
@@ -775,20 +779,59 @@ curl -X GET "BASE_URL/deployment/get/status/dep-iris-001"

## 4. Inference Endpoint (including Explainability)

-### 4.1 Predict Call
-
-Assuming deployment created with `deployment_id = dep-iris-001`:
+### 4.1 Inference

-```bash
-curl -X POST "BASE_URL/deployment/dep-iris-001/predict" \
-     -H "Content-Type: application/json" \
-     -d '{
-       "data": [[5.1, 3.5, 1.4, 0.2]],
-       "explain": true
-     }'
-```
+Once the ML application is ready, the response will contain the inference endpoint.

-**Response:**
+```python
+url = "BASE_URL/prediction"
+headers = {
+    "accept": "application/json",
+    "Content-Type": "application/json",
+}
+payload = {
+    "data": [{…}],
+    "is_fun": False,
+    "explanation": False
+}
+resp = requests.post(url, json=payload, headers=headers)
+```
+ - `data` is a list of dictionaries in the format `feature: value`
+ - `is_fun` If set to `True`, the inference application will use a custom predict function. This has to be specified by the application owner. See the examples below.
+
+**scikit-learn**
+
+```python
+import joblib
+
+def predict(path, df):
+    """Minimal sklearn: load bundle & predict."""
+    b = joblib.load(path)              # {'pipeline': fitted_estimator, ...}
+    return b["pipeline"].predict(df).tolist()
+```
+
+**PyTorch**
+
+```python
+import torch, numpy as np
+
+def predict(path, df, feats=None, mean=None, scale=None):
+    """Minimal PyTorch (TorchScript)."""
+    m = torch.jit.load(path, map_location="cpu").eval()   # one-file scripted model
+    X = df[feats].to_numpy(np.float32) if feats else df.to_numpy(np.float32)
+    if mean is not None and scale is not None:              # optional scaling
+        X = (X - np.asarray(mean, np.float32)) / np.asarray(scale, np.float32)
+    with torch.no_grad():
+        y = m(torch.from_numpy(X)).argmax(1).cpu().numpy()
+    return y.tolist()
+```
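+
+To use a custom predict function at inference time, it has to be uploaded to the model first, like the other artifacts. The snippet below is a minimal sketch, assuming the function above is saved in a file named `predict.py` (a hypothetical name) and that `code` is the matching `file_kind` for it:
+
+```python
+import requests
+
+# Upload the custom predict function as a "code" artifact.
+# "predict.py" is an illustrative file name for the predict() defined above.
+files = {
+    "file": open("predict.py", "rb"),
+    "file_kind": (None, "code")
+}
+resp = requests.post("BASE_URL/model/{model_id}/upload", files=files)
+print(resp.json())
+```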
+ + - `explanation` If set to True, then the response includes explanations. +**Example response:** ```json { "prediction": [0], diff --git a/mlconnector/db/Dockerfile b/mlconnector/db/Dockerfile index ba325ea..51839df 100644 --- a/mlconnector/db/Dockerfile +++ b/mlconnector/db/Dockerfile @@ -1,5 +1,9 @@ -FROM harbor.nbfc.io/proxy_cache/library/postgres -USER root -RUN export LANGUAGE=en_US.UTF-8 -COPY configs/init-my-db.sh /docker-entrypoint-initdb.d/init-user-db.sh -# COPY configs/drift_metrics_mmd.csv /docker-entrypoint-initdb.d/drift_metrics_mmd.csv +FROM postgres:16-bookworm + +ENV LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8 + +# Copy init assets. Number prefix enforces order if you have multiple files. +COPY configs/init-my-db.sh /docker-entrypoint-initdb.d/10-init-my-db.sh + +# Normalize Windows line endings just in case (no harm if already LF) +RUN sed -i 's/\r$//' /docker-entrypoint-initdb.d/10-init-my-db.sh diff --git a/mlconnector/db/configs/data.py b/mlconnector/db/configs/data.py new file mode 100644 index 0000000..4e9db97 --- /dev/null +++ b/mlconnector/db/configs/data.py @@ -0,0 +1,63 @@ +import pandas as pd +from sqlalchemy import text +from dotenv import load_dotenv +import os + +# Load environment variables +load_dotenv(override=True) + +# Database config +db_config = { + "DB_DRIVER": "postgresql+psycopg2", # e.g. postgresql+asyncpg + "DB_USER": os.getenv("POSTGRES_USER"), + "DB_PASSWORD": os.getenv("POSTGRES_PASSWORD"), + "DB_HOST": "localhost", + "DB_PORT": os.getenv("DB_PORT"), + "DB_NAME": os.getenv("POSTGRES_DB") +} + +# Build connection string +DATABASE_URL = ( + f"{db_config['DB_DRIVER']}://{db_config['DB_USER']}:{db_config['DB_PASSWORD']}" + f"@{db_config['DB_HOST']}:{db_config['DB_PORT']}/{db_config['DB_NAME']}" +) +print(f"Connecting to database at {DATABASE_URL}") +"""# Create async engine and session +engine = create_async_engine(DATABASE_URL, echo=False) +AsyncSessionLocal = sessionmaker(bind=engine, expire_on_commit=False, class_=AsyncSession) + +# Main async logic +async def insert_drift_metrics(): + df = pd.read_csv("drift_metrics_mmd.csv") + + # Add required fields + df["rowid"] = [str(uuid.uuid4()) for _ in range(len(df))] + df["timestamp"] = datetime.utcnow() + + async with AsyncSessionLocal() as session: + for _, row in df.iterrows(): + await session.execute(text("" + INSERT INTO drift_metrics ( + rowid, feature, type, statistic, p_value, + method, drift_detected, timestamp, modelid + ) VALUES ( + :rowid, :feature, :type, :statistic, :p_value, + :method, :drift_detected, :timestamp, :modelid + ) + ""), { + "rowid": row["rowid"], + "feature": row["feature"], + "type": row["type"], + "statistic": float(row["statistic"]), + "p_value": float(row["p_value"]), + "method": row["method"], + "drift_detected": str(row["drift_detected"]), + "timestamp": row["timestamp"], + "modelid": row["modelid"] + }) + await session.commit() + +# Entry point +if __name__ == "__main__": + asyncio.run(insert_drift_metrics()) +""" \ No newline at end of file diff --git a/mlconnector/db/configs/drift_metrics_mmd.csv b/mlconnector/db/configs/drift_metrics_mmd.csv index 9969e25..f236460 100644 --- a/mlconnector/db/configs/drift_metrics_mmd.csv +++ b/mlconnector/db/configs/drift_metrics_mmd.csv @@ -1,78 +1,78 @@ rowid,feature,type,statistic,p_value,method,drift_detected,timestamp,modelid -21274428-311b-4619-9656-8921a7daaaf4,size,numerical,0.00033121,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d222 
-0cec2f3b-b59e-4233-80ae-18fecc90d27f,download_time_ms,numerical,0.00065948,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d223 -f5b93d40-2dcd-4fce-9af8-6b373cad7e28,hour,numerical,0.00733468,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d224 -f479d2f2-d400-450b-b554-66271ccc116b,minute,numerical,0.00500349,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d225 -aafb90e2-6f68-4621-9c2a-d1e013523363,second,numerical,0.00543424,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d226 -004c8188-2da4-45b8-b9b8-ac6e2fc29f07,time_of_day,categorical,0.02429509,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d227 -ed64f40d-57e4-4b06-a538-955edef9fa2a,day_of_week,categorical,0.84222516,0.999995148,mmd,FALSE,20/12/2024 10:24,b7869631-438a-457e-b13e-7aeec243d228 -df56eb3d-e2a5-4b65-ba30-1b90cca4e77b,size,numerical,0.00033076,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d229 -f6b014b1-f791-4bfa-b561-a259c1c595e6,download_time_ms,numerical,0.00072822,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d230 -08f74bd5-61cc-4301-ae35-d1f2cd66bc2d,hour,numerical,0.00392956,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d231 -63b7a404-bfd6-44d9-940e-b4ad03f56d8e,minute,numerical,0.00431306,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d232 -fd6826c8-f993-4513-9e02-371405eb009f,second,numerical,0.00406827,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d233 -426c8e3d-8488-4980-9959-7c9acaa747b2,time_of_day,categorical,0.011688597,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d234 -07b02fc2-f127-40c4-823f-e00cc9f8d367,day_of_week,categorical,0.8779275,0.999995148,mmd,FALSE,22/12/2024 14:39,b7869631-438a-457e-b13e-7aeec243d235 -1c6591db-8d4a-419b-a2d1-7c3abb430274,size,numerical,0.00033153,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d236 -c0600fd3-f697-49b0-987e-148661d76ff4,download_time_ms,numerical,0.0007217,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d237 -e549b12f-b759-4531-a6ed-5ae4fe08b284,hour,numerical,0.00493939,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d238 -f81bce9b-5054-417a-ac68-add034f894d7,minute,numerical,0.00488431,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d239 -9cb58b7b-42f3-4023-92c4-7c4940575f25,second,numerical,0.00546426,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d240 -d2b7c48f-74ce-48e8-8a09-5bc9054adf8e,time_of_day,categorical,0.01934626,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d241 -f2d88df4-a9ce-4b00-8d75-0a59c25e9287,day_of_week,categorical,0.865846182,0.999995148,mmd,FALSE,24/12/2024 16:46,b7869631-438a-457e-b13e-7aeec243d242 -9b285b07-e39c-4df7-af39-758a777f786d,size,numerical,0.00033068,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d243 -7e0ac9c5-a484-4b7f-9315-8c071828e0cc,download_time_ms,numerical,0.00061516,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d244 -7f1c1a9e-4c2a-4388-8c96-e12e7fbafb3f,hour,numerical,0.00655878,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d245 -45dbe2af-7d09-4bec-b027-323e97079188,minute,numerical,0.00592618,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d246 
-53455c8c-78d9-472a-8a7b-40ea8214cf47,second,numerical,0.00412502,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d247 -cc3f33dc-3a3b-46bc-bede-17f42c42e5e2,time_of_day,categorical,0.023638868,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d248 -fbc1ae73-d305-4227-a593-57675a2f2973,day_of_week,categorical,0.93799827,0.999995148,mmd,FALSE,26/12/2024 19:40,b7869631-438a-457e-b13e-7aeec243d249 -d1822bbf-7733-4c07-a016-330b8e0a4638,size,numerical,0.00033203,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d250 -f42ca950-c45d-4646-b60d-0ed02fd1ea4f,download_time_ms,numerical,0.00064752,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d251 -ca586559-49b7-4f2f-8392-c93a018b25dd,hour,numerical,0.00636723,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d252 -9b9a15d0-8cda-486b-9d31-956a6c8e22f6,minute,numerical,0.00425282,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d253 -e94ac054-16de-43d2-8c19-47768c2b7e3b,second,numerical,0.0074355,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d254 -a6c58883-68f5-46dc-a7e5-b969bb5d4077,time_of_day,categorical,0.02095412,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d255 -0ab4da28-0661-4525-a16b-486f6a65581a,day_of_week,categorical,0.924213,0.999995148,mmd,FALSE,28/12/2024 22:48,b7869631-438a-457e-b13e-7aeec243d256 -b8b9f0b2-8b69-405b-8336-1cfc4e725444,size,numerical,0.00024079,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d257 -9192f13a-9ca5-4c54-93e4-a29c34fb6cc1,download_time_ms,numerical,0.00046964,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d258 -25c874b8-8508-4aea-8127-f50254e3b4c2,hour,numerical,0.0064353,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d259 -3ab2ec8a-6390-4e62-b019-2162b6c9e513,minute,numerical,0.00327469,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d260 -31c254e5-e782-486b-b1ea-2339b7e0e2d5,second,numerical,0.00465031,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d261 -73594a11-2c4c-407a-b8b9-427afe8bdce4,time_of_day,categorical,0.014303058,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d262 -2868ed21-dab3-42e4-9e0c-b4e6b1281ac5,day_of_week,categorical,0.65481213,0.999995148,mmd,FALSE,31/12/2024 01:52,b7869631-438a-457e-b13e-7aeec243d263 -d094eae6-fc25-4986-bb21-866749bdd126,size,numerical,0.00021075,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d264 -88460a29-5647-496b-8543-5f4fa2d0a677,download_time_ms,numerical,0.00051039,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d265 -0636f25c-dc3e-4728-bfa2-a8d6504a7d34,hour,numerical,0.00434996,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d266 -8099d768-ca8a-447d-93dd-337620f0b7e9,minute,numerical,0.00326226,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d267 -d3ee3573-6e41-4ff2-98bb-d1c2c3a39e65,second,numerical,0.00281693,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d268 -2e8052bc-ac51-44e7-9994-aa9c27c592d3,time_of_day,categorical,0.01493261,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d269 -2f6e9413-ddef-44bb-8106-98559f571aa1,day_of_week,categorical,0.41776616,0.999995148,mmd,FALSE,03/01/2025 13:00,b7869631-438a-457e-b13e-7aeec243d270 
-9e299cb2-cee0-4597-beed-c8567f6988c7,size,numerical,0.00018964,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d271 -1227b344-b159-4a39-8098-17161a2dc959,download_time_ms,numerical,0.00059779,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d272 -4fe50563-18a9-4a5d-a97e-fecc298d0ff3,hour,numerical,0.00281148,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d273 -8b7b82c7-7b4b-4680-8bf0-318e6d099656,minute,numerical,0.00337089,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d274 -68c1267e-043d-41b0-af38-221a1ad04a23,second,numerical,0.00314416,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d275 -52b222da-514d-42c9-a816-1a9b006dbc38,time_of_day,categorical,0.01187764,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d276 -4beb5f3d-61c0-48af-8f14-fc8a5dfce3db,day_of_week,categorical,0.38304824,0.999995148,mmd,FALSE,07/01/2025 09:04,b7869631-438a-457e-b13e-7aeec243d277 -45fc7348-8387-47cb-bb71-3a333d0fddc1,size,numerical,0.00018414,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d278 -0c6ef3b5-b3e8-41c7-aac4-018019d0ddbf,download_time_ms,numerical,0.00070718,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d279 -7cc01923-5d7a-42a6-aca4-1d719fb7499c,hour,numerical,0.00375158,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d280 -76183889-071f-4a31-bb78-2878c323bee0,minute,numerical,0.00347721,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d281 -43447b3f-10f3-4e24-971e-c136724de226,second,numerical,0.00221131,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d282 -8e25690b-b269-460e-97eb-bf4e3f603d1f,time_of_day,categorical,0.00820001,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d283 -98f9da83-bc8f-4d8e-b3ae-eb891c6c25ce,day_of_week,categorical,0.44187321,0.999995148,mmd,FALSE,11/01/2025 14:01,b7869631-438a-457e-b13e-7aeec243d284 -aea40d1f-dcaf-4f63-b98c-217f7c51fdde,size,numerical,0.00018419,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d285 -6366a4bc-4544-4e15-8c19-050631ca9483,download_time_ms,numerical,0.00079163,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d286 -31f2490a-4df5-404b-9480-3dd3dbc28c45,hour,numerical,0.00255881,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d287 -ac343a44-01c9-4bad-ba5f-39f84074d60b,minute,numerical,0.0023392,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d288 -b490c840-32ef-48b2-8cad-c79f8cb2db21,second,numerical,0.00342082,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d289 -13cd7d7f-10bd-4e67-af22-3bef7187e16e,time_of_day,categorical,0.01074746,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d290 -46d4799a-903b-4115-87d0-069406a57305,day_of_week,categorical,0.35549921,0.999995148,mmd,FALSE,15/01/2025 23:35,b7869631-438a-457e-b13e-7aeec243d291 -a06005d4-a5e0-4ffb-87e3-e809d27fce18,size,numerical,0.00018536,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d292 -bd064b48-f384-4eac-be4d-e2d729c89e06,download_time_ms,numerical,0.00041902,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d293 -db394877-9b81-4fef-a146-77d747673f33,hour,numerical,0.00433487,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d294 
-0e934049-8929-49aa-9f13-580c7a7d4f24,minute,numerical,0.00237785,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d295 -50f8b627-1fd5-418f-a2de-0915b66bdd6d,second,numerical,0.00324063,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d296 -c5530b94-d029-44a6-bc8a-597b19c6f86a,time_of_day,categorical,0.00567503,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d297 -bf2be45d-cc4d-43da-9ac6-6c28859e9cf5,day_of_week,categorical,0.38945563,0.999995148,mmd,FALSE,20/01/2025 04:45,b7869631-438a-457e-b13e-7aeec243d298 \ No newline at end of file +21274428-311b-4619-9656-8921a7daaaf4,size,numerical,0.00033121,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +0cec2f3b-b59e-4233-80ae-18fecc90d27f,download_time_ms,numerical,0.00065948,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c6 +f5b93d40-2dcd-4fce-9af8-6b373cad7e28,hour,numerical,0.00733468,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c7 +f479d2f2-d400-450b-b554-66271ccc116b,minute,numerical,0.00500349,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c8 +aafb90e2-6f68-4621-9c2a-d1e013523363,second,numerical,0.00543424,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c9 +004c8188-2da4-45b8-b9b8-ac6e2fc29f07,time_of_day,categorical,0.02429509,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c10 +ed64f40d-57e4-4b06-a538-955edef9fa2a,day_of_week,categorical,0.84222516,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c11 +df56eb3d-e2a5-4b65-ba30-1b90cca4e77b,size,numerical,0.00033076,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c12 +f6b014b1-f791-4bfa-b561-a259c1c595e6,download_time_ms,numerical,0.00072822,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c13 +08f74bd5-61cc-4301-ae35-d1f2cd66bc2d,hour,numerical,0.00392956,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c14 +63b7a404-bfd6-44d9-940e-b4ad03f56d8e,minute,numerical,0.00431306,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c15 +fd6826c8-f993-4513-9e02-371405eb009f,second,numerical,0.00406827,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c16 +426c8e3d-8488-4980-9959-7c9acaa747b2,time_of_day,categorical,0.011688597,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c17 +07b02fc2-f127-40c4-823f-e00cc9f8d367,day_of_week,categorical,0.8779275,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c18 +1c6591db-8d4a-419b-a2d1-7c3abb430274,size,numerical,0.00033153,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c19 +c0600fd3-f697-49b0-987e-148661d76ff4,download_time_ms,numerical,0.0007217,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c20 +e549b12f-b759-4531-a6ed-5ae4fe08b284,hour,numerical,0.00493939,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c21 +f81bce9b-5054-417a-ac68-add034f894d7,minute,numerical,0.00488431,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c22 +9cb58b7b-42f3-4023-92c4-7c4940575f25,second,numerical,0.00546426,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c23 +d2b7c48f-74ce-48e8-8a09-5bc9054adf8e,time_of_day,categorical,0.01934626,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c24 
+f2d88df4-a9ce-4b00-8d75-0a59c25e9287,day_of_week,categorical,0.865846182,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c25 +9b285b07-e39c-4df7-af39-758a777f786d,size,numerical,0.00033068,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c26 +7e0ac9c5-a484-4b7f-9315-8c071828e0cc,download_time_ms,numerical,0.00061516,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c27 +7f1c1a9e-4c2a-4388-8c96-e12e7fbafb3f,hour,numerical,0.00655878,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c28 +45dbe2af-7d09-4bec-b027-323e97079188,minute,numerical,0.00592618,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c29 +53455c8c-78d9-472a-8a7b-40ea8214cf47,second,numerical,0.00412502,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c30 +cc3f33dc-3a3b-46bc-bede-17f42c42e5e2,time_of_day,categorical,0.023638868,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c31 +fbc1ae73-d305-4227-a593-57675a2f2973,day_of_week,categorical,0.93799827,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c32 +d1822bbf-7733-4c07-a016-330b8e0a4638,size,numerical,0.00033203,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c33 +f42ca950-c45d-4646-b60d-0ed02fd1ea4f,download_time_ms,numerical,0.00064752,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c34 +ca586559-49b7-4f2f-8392-c93a018b25dd,hour,numerical,0.00636723,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c35 +9b9a15d0-8cda-486b-9d31-956a6c8e22f6,minute,numerical,0.00425282,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c36 +e94ac054-16de-43d2-8c19-47768c2b7e3b,second,numerical,0.0074355,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c37 +a6c58883-68f5-46dc-a7e5-b969bb5d4077,time_of_day,categorical,0.02095412,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c38 +0ab4da28-0661-4525-a16b-486f6a65581a,day_of_week,categorical,0.924213,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c39 +b8b9f0b2-8b69-405b-8336-1cfc4e725444,size,numerical,0.00024079,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c40 +9192f13a-9ca5-4c54-93e4-a29c34fb6cc1,download_time_ms,numerical,0.00046964,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c41 +25c874b8-8508-4aea-8127-f50254e3b4c2,hour,numerical,0.0064353,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c42 +3ab2ec8a-6390-4e62-b019-2162b6c9e513,minute,numerical,0.00327469,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c43 +31c254e5-e782-486b-b1ea-2339b7e0e2d5,second,numerical,0.00465031,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c44 +73594a11-2c4c-407a-b8b9-427afe8bdce4,time_of_day,categorical,0.014303058,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c45 +2868ed21-dab3-42e4-9e0c-b4e6b1281ac5,day_of_week,categorical,0.65481213,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c46 +d094eae6-fc25-4986-bb21-866749bdd126,size,numerical,0.00021075,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c47 +88460a29-5647-496b-8543-5f4fa2d0a677,download_time_ms,numerical,0.00051039,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c48 
+0636f25c-dc3e-4728-bfa2-a8d6504a7d34,hour,numerical,0.00434996,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c49 +8099d768-ca8a-447d-93dd-337620f0b7e9,minute,numerical,0.00326226,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c50 +d3ee3573-6e41-4ff2-98bb-d1c2c3a39e65,second,numerical,0.00281693,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c51 +2e8052bc-ac51-44e7-9994-aa9c27c592d3,time_of_day,categorical,0.01493261,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c52 +2f6e9413-ddef-44bb-8106-98559f571aa1,day_of_week,categorical,0.41776616,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c53 +9e299cb2-cee0-4597-beed-c8567f6988c7,size,numerical,0.00018964,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c54 +1227b344-b159-4a39-8098-17161a2dc959,download_time_ms,numerical,0.00059779,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c55 +4fe50563-18a9-4a5d-a97e-fecc298d0ff3,hour,numerical,0.00281148,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c56 +8b7b82c7-7b4b-4680-8bf0-318e6d099656,minute,numerical,0.00337089,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c57 +68c1267e-043d-41b0-af38-221a1ad04a23,second,numerical,0.00314416,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c58 +52b222da-514d-42c9-a816-1a9b006dbc38,time_of_day,categorical,0.01187764,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c59 +4beb5f3d-61c0-48af-8f14-fc8a5dfce3db,day_of_week,categorical,0.38304824,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c60 +45fc7348-8387-47cb-bb71-3a333d0fddc1,size,numerical,0.00018414,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c61 +0c6ef3b5-b3e8-41c7-aac4-018019d0ddbf,download_time_ms,numerical,0.00070718,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c62 +7cc01923-5d7a-42a6-aca4-1d719fb7499c,hour,numerical,0.00375158,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c63 +76183889-071f-4a31-bb78-2878c323bee0,minute,numerical,0.00347721,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c64 +43447b3f-10f3-4e24-971e-c136724de226,second,numerical,0.00221131,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c65 +8e25690b-b269-460e-97eb-bf4e3f603d1f,time_of_day,categorical,0.00820001,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c66 +98f9da83-bc8f-4d8e-b3ae-eb891c6c25ce,day_of_week,categorical,0.44187321,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c67 +aea40d1f-dcaf-4f63-b98c-217f7c51fdde,size,numerical,0.00018419,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c68 +6366a4bc-4544-4e15-8c19-050631ca9483,download_time_ms,numerical,0.00079163,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c69 +31f2490a-4df5-404b-9480-3dd3dbc28c45,hour,numerical,0.00255881,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c70 +ac343a44-01c9-4bad-ba5f-39f84074d60b,minute,numerical,0.0023392,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c71 +b490c840-32ef-48b2-8cad-c79f8cb2db21,second,numerical,0.00342082,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c72 
+13cd7d7f-10bd-4e67-af22-3bef7187e16e,time_of_day,categorical,0.01074746,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c73 +46d4799a-903b-4115-87d0-069406a57305,day_of_week,categorical,0.35549921,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c74 +a06005d4-a5e0-4ffb-87e3-e809d27fce18,size,numerical,0.00018536,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c75 +bd064b48-f384-4eac-be4d-e2d729c89e06,download_time_ms,numerical,0.00041902,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c76 +db394877-9b81-4fef-a146-77d747673f33,hour,numerical,0.00433487,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c77 +0e934049-8929-49aa-9f13-580c7a7d4f24,minute,numerical,0.00237785,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c78 +50f8b627-1fd5-418f-a2de-0915b66bdd6d,second,numerical,0.00324063,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c79 +c5530b94-d029-44a6-bc8a-597b19c6f86a,time_of_day,categorical,0.00567503,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c80 +bf2be45d-cc4d-43da-9ac6-6c28859e9cf5,day_of_week,categorical,0.38945563,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c81 \ No newline at end of file diff --git a/mlconnector/db/configs/init-my-db.sh b/mlconnector/db/configs/init-my-db.sh index 7c0ca52..c13813f 100644 --- a/mlconnector/db/configs/init-my-db.sh +++ b/mlconnector/db/configs/init-my-db.sh @@ -5,4 +5,4 @@ psql -v ON_ERROR_STOP=1 \ --username "$POSTGRES_USER" \ --dbname "$POSTGRES_DB" <<-EOSQL -EOSQL +EOSQL \ No newline at end of file diff --git a/mlconnector/docker-compose.yaml b/mlconnector/docker-compose.yaml index 0d901c6..9d65f37 100644 --- a/mlconnector/docker-compose.yaml +++ b/mlconnector/docker-compose.yaml @@ -2,7 +2,7 @@ version: '3.8' services: db: - #image: registry.mlsysops.eu/usecases/augmenta-demo-testbed/side-db:0.0.1 + # image: registry.hackathon.mlsysops.eu/hackathon/mlconnector-db:0.0.1 build: ./db container_name: database env_file: @@ -27,44 +27,8 @@ services: timeout: 5s retries: 5 - pgadmin: - image: dpage/pgadmin4 - container_name: pgadmin - restart: always - ports: - - "23456:80" - environment: - PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL} - PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} - - networks: - - api_network - - #redis: - # image: redis:latest - # container_name: deployment_queue - # restart: always - # ports: - # - "6379:6379" - # networks: - # - api_network - # command: /bin/sh -c "redis-server --save 20 1 --loglevel warning --requirepass $$REDIS_HOST_PASSWORD" - # command: redis-server --save 20 1 --loglevel warning --requirepass secret - # env_file: - # - .env - - #redisinsight: - # image: redislabs/redisinsight:latest - # container_name: redisinsight - # ports: - # - "5540:5540" - # depends_on: - # - redis - # networks: - # - api_network - app: - #image: registry.mlsysops.eu/usecases/augmenta-demo-testbed/side-api:0.0.1 + #image: registry.hackathon.mlsysops.eu/hackathon/mlconnector-api:0.0.1 build: ./src container_name: api env_file: @@ -88,31 +52,6 @@ services: networks: - api_network - #drift: - # build: ./drift_app - # container_name: drfit_detection - # - # restart: always - # ports: - # - "8050:8050" - # depends_on: - # app: - # condition: service_healthy - # restart: true - # networks: - # - api_network - - #xai: - #image: registry.mlsysops.eu/usecases/augmenta-demo-testbed/side-api:0.0.1 - # 
build: ./xai-server-app - # container_name: xai - #env_file: - # - .env - # restart: always - # ports: - # - "34567:8091" - # networks: - # - api_network volumes: db_data: diff --git a/mlconnector/src/Dockerfile b/mlconnector/src/Dockerfile index 16eeee6..83c5d66 100644 --- a/mlconnector/src/Dockerfile +++ b/mlconnector/src/Dockerfile @@ -1,5 +1,5 @@ -FROM harbor.nbfc.io/proxy_cache/library/python:3.11.5-slim-bookworm +FROM python:3.11.5-slim-bookworm # Add curl for healthcheck RUN apt-get update && \ @@ -39,8 +39,12 @@ COPY . /code EXPOSE 8090 -RUN chmod +x /code/startup.sh +# RUN chmod +x /code/startup.sh -CMD ["/code/startup.sh"] -# CMD ["/bin/bash","-c","sudo ./startup.sh"] -# CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8090"] +# normalize line endings + exec bit +RUN sed -i 's/\r$//' /code/startup.sh && \ + awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' /code/startup.sh > /tmp/s.sh && mv /tmp/s.sh /code/startup.sh && \ + chmod +x /code/startup.sh + +# call the interpreter explicitly +CMD ["/bin/sh", "/code/startup.sh"] \ No newline at end of file diff --git a/mlconnector/src/db/data.py b/mlconnector/src/db/data.py new file mode 100644 index 0000000..ffb0eb9 --- /dev/null +++ b/mlconnector/src/db/data.py @@ -0,0 +1,67 @@ +import pandas as pd +import uuid +from datetime import datetime +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession +from sqlalchemy import text +from sqlalchemy.orm import sessionmaker +from dotenv import load_dotenv +import os +import asyncio + +# Load environment variables +load_dotenv(override=True) + +# Database config +db_config = { + "DB_DRIVER": os.getenv("DB_DRIVER"), # e.g. postgresql+asyncpg + "DB_USER": os.getenv("POSTGRES_USER"), + "DB_PASSWORD": os.getenv("POSTGRES_PASSWORD"), + "DB_HOST": os.getenv("DB_HOST_NAME"), + "DB_PORT": os.getenv("DB_PORT"), + "DB_NAME": os.getenv("POSTGRES_DB") +} + +# Build connection string +DATABASE_URL = ( + f"{db_config['DB_DRIVER']}://{db_config['DB_USER']}:{db_config['DB_PASSWORD']}" + f"@{db_config['DB_HOST']}:{db_config['DB_PORT']}/{db_config['DB_NAME']}" +) + +# Create async engine and session +engine = create_async_engine(DATABASE_URL, echo=False) +AsyncSessionLocal = sessionmaker(bind=engine, expire_on_commit=False, class_=AsyncSession) + +# Main async logic +async def insert_drift_metrics(): + df = pd.read_csv("drift_metrics_mmd.csv") + + # Add required fields + df["rowid"] = [str(uuid.uuid4()) for _ in range(len(df))] + #df["timestamp"] = datetime.utcnow() + + async with AsyncSessionLocal() as session: + for _, row in df.iterrows(): + await session.execute(text(""" + INSERT INTO drift_metrics ( + rowid, feature, type, statistic, p_value, + method, drift_detected, timestamp, modelid + ) VALUES ( + :rowid, :feature, :type, :statistic, :p_value, + :method, :drift_detected, :timestamp, :modelid + ) + """), { + "rowid": row["rowid"], + "feature": row["feature"], + "type": row["type"], + "statistic": float(row["statistic"]), + "p_value": float(row["p_value"]), + "method": row["method"], + "drift_detected": str(row["drift_detected"]), + "timestamp": datetime.strptime(row["timestamp"], '%d/%m/%Y %H:%M'), + "modelid": row["modelid"] + }) + await session.commit() + +# Entry point +if __name__ == "__main__": + asyncio.run(insert_drift_metrics()) \ No newline at end of file diff --git a/mlconnector/src/db/drift_metrics_mmd.csv b/mlconnector/src/db/drift_metrics_mmd.csv new file mode 100644 index 0000000..f25363b --- /dev/null +++ b/mlconnector/src/db/drift_metrics_mmd.csv @@ -0,0 +1,78 @@
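The seeder above re-keys every row with a fresh rowid and stores drift_detected as a string; the only formats it relies on are the day-first timestamp and the driver://user:password@host:port/dbname URL shape. A minimal standalone sketch of those two assumptions (all values here are placeholders, not the project's real settings):

from datetime import datetime

# drift_metrics_mmd.csv timestamps are day-first, e.g. "20/12/2024 10:24"
ts = datetime.strptime("20/12/2024 10:24", "%d/%m/%Y %H:%M")
assert (ts.day, ts.month, ts.hour) == (20, 12, 10)  # day-first, not US-style

# URL shape expected by create_async_engine; credentials are placeholders
url = "postgresql+asyncpg://postgres:change-me@localhost:5432/mlconnector"
print(url)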
+rowid,feature,type,statistic,p_value,method,drift_detected,timestamp,modelid +21274428-311b-4619-9656-8921a7daaaf,size,numerical,0.00033121,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +0cec2f3b-b59e-4233-80ae-18fecc90d27f,download_time_ms,numerical,0.00065948,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +f5b93d40-2dcd-4fce-9af8-6b373cad7e28,hour,numerical,0.00733468,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +f479d2f2-d400-450b-b554-66271ccc116b,minute,numerical,0.00500349,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +aafb90e2-6f68-4621-9c2a-d1e013523363,second,numerical,0.00543424,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +004c8188-2da4-45b8-b9b8-ac6e2fc29f07,time_of_day,categorical,0.02429509,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +ed64f40d-57e4-4b06-a538-955edef9fa2a,day_of_week,categorical,0.84222516,0.999995148,mmd,FALSE,20/12/2024 10:24,17c8d8ee-93c7-43c0-8774-55903680a6c5 +df56eb3d-e2a5-4b65-ba30-1b90cca4e77b,size,numerical,0.00033076,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +f6b014b1-f791-4bfa-b561-a259c1c595e6,download_time_ms,numerical,0.00072822,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +08f74bd5-61cc-4301-ae35-d1f2cd66bc2d,hour,numerical,0.00392956,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +63b7a404-bfd6-44d9-940e-b4ad03f56d8e,minute,numerical,0.00431306,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +fd6826c8-f993-4513-9e02-371405eb009f,second,numerical,0.00406827,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +426c8e3d-8488-4980-9959-7c9acaa747b2,time_of_day,categorical,0.011688597,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +07b02fc2-f127-40c4-823f-e00cc9f8d367,day_of_week,categorical,0.8779275,0.999995148,mmd,FALSE,22/12/2024 14:39,17c8d8ee-93c7-43c0-8774-55903680a6c5 +1c6591db-8d4a-419b-a2d1-7c3abb430274,size,numerical,0.00033153,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +c0600fd3-f697-49b0-987e-148661d76ff4,download_time_ms,numerical,0.0007217,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +e549b12f-b759-4531-a6ed-5ae4fe08b284,hour,numerical,0.00493939,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +f81bce9b-5054-417a-ac68-add034f894d7,minute,numerical,0.00488431,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +9cb58b7b-42f3-4023-92c4-7c4940575f25,second,numerical,0.00546426,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +d2b7c48f-74ce-48e8-8a09-5bc9054adf8e,time_of_day,categorical,0.01934626,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +f2d88df4-a9ce-4b00-8d75-0a59c25e9287,day_of_week,categorical,0.865846182,0.999995148,mmd,FALSE,24/12/2024 16:46,17c8d8ee-93c7-43c0-8774-55903680a6c5 +9b285b07-e39c-4df7-af39-758a777f786d,size,numerical,0.00033068,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 +7e0ac9c5-a484-4b7f-9315-8c071828e0cc,download_time_ms,numerical,0.00061516,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 +7f1c1a9e-4c2a-4388-8c96-e12e7fbafb3f,hour,numerical,0.00655878,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 
+45dbe2af-7d09-4bec-b027-323e97079188,minute,numerical,0.00592618,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 +53455c8c-78d9-472a-8a7b-40ea8214cf47,second,numerical,0.00412502,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 +cc3f33dc-3a3b-46bc-bede-17f42c42e5e2,time_of_day,categorical,0.023638868,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 +fbc1ae73-d305-4227-a593-57675a2f2973,day_of_week,categorical,0.93799827,0.999995148,mmd,FALSE,26/12/2024 19:40,17c8d8ee-93c7-43c0-8774-55903680a6c5 +d1822bbf-7733-4c07-a016-330b8e0a4638,size,numerical,0.00033203,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +f42ca950-c45d-4646-b60d-0ed02fd1ea4f,download_time_ms,numerical,0.00064752,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +ca586559-49b7-4f2f-8392-c93a018b25dd,hour,numerical,0.00636723,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +9b9a15d0-8cda-486b-9d31-956a6c8e22f6,minute,numerical,0.00425282,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +e94ac054-16de-43d2-8c19-47768c2b7e3b,second,numerical,0.0074355,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +a6c58883-68f5-46dc-a7e5-b969bb5d4077,time_of_day,categorical,0.02095412,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +0ab4da28-0661-4525-a16b-486f6a65581a,day_of_week,categorical,0.924213,0.999995148,mmd,FALSE,28/12/2024 22:48,17c8d8ee-93c7-43c0-8774-55903680a6c5 +b8b9f0b2-8b69-405b-8336-1cfc4e725444,size,numerical,0.00024079,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +9192f13a-9ca5-4c54-93e4-a29c34fb6cc1,download_time_ms,numerical,0.00046964,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +25c874b8-8508-4aea-8127-f50254e3b4c2,hour,numerical,0.0064353,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +3ab2ec8a-6390-4e62-b019-2162b6c9e513,minute,numerical,0.00327469,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +31c254e5-e782-486b-b1ea-2339b7e0e2d5,second,numerical,0.00465031,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +73594a11-2c4c-407a-b8b9-427afe8bdce4,time_of_day,categorical,0.014303058,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +2868ed21-dab3-42e4-9e0c-b4e6b1281ac5,day_of_week,categorical,0.65481213,0.999995148,mmd,FALSE,31/12/2024 01:52,17c8d8ee-93c7-43c0-8774-55903680a6c5 +d094eae6-fc25-4986-bb21-866749bdd126,size,numerical,0.00021075,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 +88460a29-5647-496b-8543-5f4fa2d0a677,download_time_ms,numerical,0.00051039,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 +0636f25c-dc3e-4728-bfa2-a8d6504a7d34,hour,numerical,0.00434996,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 +8099d768-ca8a-447d-93dd-337620f0b7e9,minute,numerical,0.00326226,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 +d3ee3573-6e41-4ff2-98bb-d1c2c3a39e65,second,numerical,0.00281693,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 +2e8052bc-ac51-44e7-9994-aa9c27c592d3,time_of_day,categorical,0.01493261,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 
+2f6e9413-ddef-44bb-8106-98559f571aa1,day_of_week,categorical,0.41776616,0.999995148,mmd,FALSE,03/01/2025 13:00,17c8d8ee-93c7-43c0-8774-55903680a6c5 +9e299cb2-cee0-4597-beed-c8567f6988c7,size,numerical,0.00018964,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +1227b344-b159-4a39-8098-17161a2dc959,download_time_ms,numerical,0.00059779,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +4fe50563-18a9-4a5d-a97e-fecc298d0ff3,hour,numerical,0.00281148,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +8b7b82c7-7b4b-4680-8bf0-318e6d099656,minute,numerical,0.00337089,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +68c1267e-043d-41b0-af38-221a1ad04a23,second,numerical,0.00314416,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +52b222da-514d-42c9-a816-1a9b006dbc38,time_of_day,categorical,0.01187764,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +4beb5f3d-61c0-48af-8f14-fc8a5dfce3db,day_of_week,categorical,0.38304824,0.999995148,mmd,FALSE,07/01/2025 09:04,17c8d8ee-93c7-43c0-8774-55903680a6c5 +45fc7348-8387-47cb-bb71-3a333d0fddc1,size,numerical,0.00018414,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +0c6ef3b5-b3e8-41c7-aac4-018019d0ddbf,download_time_ms,numerical,0.00070718,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +7cc01923-5d7a-42a6-aca4-1d719fb7499c,hour,numerical,0.00375158,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +76183889-071f-4a31-bb78-2878c323bee0,minute,numerical,0.00347721,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +43447b3f-10f3-4e24-971e-c136724de226,second,numerical,0.00221131,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +8e25690b-b269-460e-97eb-bf4e3f603d1f,time_of_day,categorical,0.00820001,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +98f9da83-bc8f-4d8e-b3ae-eb891c6c25ce,day_of_week,categorical,0.44187321,0.999995148,mmd,FALSE,11/01/2025 14:01,17c8d8ee-93c7-43c0-8774-55903680a6c5 +aea40d1f-dcaf-4f63-b98c-217f7c51fdde,size,numerical,0.00018419,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +6366a4bc-4544-4e15-8c19-050631ca9483,download_time_ms,numerical,0.00079163,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +31f2490a-4df5-404b-9480-3dd3dbc28c45,hour,numerical,0.00255881,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +ac343a44-01c9-4bad-ba5f-39f84074d60b,minute,numerical,0.0023392,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +b490c840-32ef-48b2-8cad-c79f8cb2db21,second,numerical,0.00342082,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +13cd7d7f-10bd-4e67-af22-3bef7187e16e,time_of_day,categorical,0.01074746,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +46d4799a-903b-4115-87d0-069406a57305,day_of_week,categorical,0.35549921,0.999995148,mmd,FALSE,15/01/2025 23:35,17c8d8ee-93c7-43c0-8774-55903680a6c5 +a06005d4-a5e0-4ffb-87e3-e809d27fce18,size,numerical,0.00018536,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 +bd064b48-f384-4eac-be4d-e2d729c89e06,download_time_ms,numerical,0.00041902,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 
+db394877-9b81-4fef-a146-77d747673f33,hour,numerical,0.00433487,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 +0e934049-8929-49aa-9f13-580c7a7d4f24,minute,numerical,0.00237785,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 +50f8b627-1fd5-418f-a2de-0915b66bdd6d,second,numerical,0.00324063,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 +c5530b94-d029-44a6-bc8a-597b19c6f86a,time_of_day,categorical,0.00567503,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 +bf2be45d-cc4d-43da-9ac6-6c28859e9cf5,day_of_week,categorical,0.38945563,0.999995148,mmd,FALSE,20/01/2025 04:45,17c8d8ee-93c7-43c0-8774-55903680a6c5 \ No newline at end of file diff --git a/mlconnector/src/endpoints/mldeployment.py b/mlconnector/src/endpoints/mldeployment.py index 3839519..e4c9fae 100644 --- a/mlconnector/src/endpoints/mldeployment.py +++ b/mlconnector/src/endpoints/mldeployment.py @@ -100,8 +100,10 @@ async def delete_deployment( if existing_deployment is None: raise HTTPException(status_code=404, detail="Deployment was not found") - await db.delete(existing_deployment) - await db.commit() + + await utl.delete_deployments(db=db, deployment_id=deployment_id) + #await db.delete(existing_deployment) + #await db.commit() # Return a message indicating successful deletion return {"message": "Deployment was deleted successfully"} \ No newline at end of file diff --git a/mlconnector/src/migrations/versions/file.txt b/mlconnector/src/migrations/versions/file.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/mlconnector/src/migrations/versions/file.txt @@ -0,0 +1 @@ + diff --git a/mlconnector/src/startup.sh b/mlconnector/src/startup.sh index c9d6ac4..055f998 100644 --- a/mlconnector/src/startup.sh +++ b/mlconnector/src/startup.sh @@ -5,4 +5,4 @@ alembic revision --autogenerate -m "Initial load" alembic upgrade head # start server -uvicorn main:app --host "0.0.0.0" --port "8090" \ No newline at end of file +uvicorn main:app --host "0.0.0.0" --port "8090" diff --git a/mlconnector/src/utils/api/generate_dockerfile.py b/mlconnector/src/utils/api/generate_dockerfile.py index 2da4b3b..105480c 100644 --- a/mlconnector/src/utils/api/generate_dockerfile.py +++ b/mlconnector/src/utils/api/generate_dockerfile.py @@ -6,6 +6,7 @@ import pickle from pkg_resources import Requirement from io import StringIO +from typing import Any, Dict from utils.manage_s3 import S3Manager @@ -43,6 +44,10 @@ import base64 import urllib.parse from datetime import datetime, timezone +import importlib.util +import sys +from pathlib import Path +from predict import predict from mlstelemetry import MLSTelemetry @@ -105,12 +110,16 @@ async def make_prediction(request: DynamicSchema): }} current_timestamp = datetime.now(timezone.utc).isoformat(timespec='milliseconds').replace('+00:00', 'Z') try: - loaded_model = joblib.load("{model}") - print("Model loaded successfully!") if data_source == 0: - data_dict = request.data.dict() - df = pd.DataFrame([data_dict]) - result_pred = loaded_model.predict(df) + #data_dict = request.data.dict() + data_dict = [m.model_dump(by_alias=True) for m in request.data] + df = pd.DataFrame(data_dict) + if request.is_fun: + result_pred = predict("{model}", df) + else: + loaded_model = joblib.load("{model}") + print("Model loaded successfully!") + result_pred = loaded_model.predict(df) data = {{ "ownerid": owner, "deploymentid": "{deploymentid}", @@ -126,7 +135,7 @@ async def make_prediction(request: 
DynamicSchema): except ValueError: print("No JSON response returned.") #if request.explanation: - # explanation_res = get_single_explanation(model_id,data_dict) + # explanation_res = get_single_explanation(model_id,request.data) # if explanation_res: # return {{"inference": str(result_pred), "explanation":explanation_res}} # else: @@ -318,65 +327,69 @@ def build_and_push_image(model, registry_url, image_name, registry_username, reg app["MLSysOpsApplication"]["components"] = [{"Component": comp}] return app""" + def generate_json( deployment_id: str, image: str, - placement: dict, + placement: Dict[str, Any], app_name: str = "ml-app-1", - port: int = 8000 -): - app = { - "MLSysOpsApplication": { + port: int = 8000, +) -> Dict[str, Any]: + app: Dict[str, Any] = { + "MLSysOpsApp": { "name": app_name, - "mlsysops-id": deployment_id } } - # Only add clusterPlacement if clusterID is not a wildcard - cluster_id = placement.get("clusterID", "") + # cluster_placement (optional) + cluster_id = placement.get("clusterID") or placement.get("cluster_id") or "" if cluster_id and cluster_id != "*": - app["MLSysOpsApplication"]["clusterPlacement"] = { - "clusterID": [cluster_id], - "instances": 1 + app["MLSysOpsApp"]["cluster_placement"] = { + "cluster_id": [cluster_id] } - # Build the component block - component = { - "Component": { + # component base + component: Dict[str, Any] = { + "metadata": { "name": "ml-comp", - "uid": deployment_id - } + "uid": deployment_id, + }, + "restart_policy": "OnFailure", + "containers": [ + { + "image": image, + "image_pull_policy": "IfNotPresent", + "platform_requirements": { + "cpu": { + "requests": "1", + "limits": "2", + "architecture": ["amd64"], + } + }, + "ports": [{"container_port": port}], + } + ], } - # Always consider continuumLayer, but only add node if it's not "*" - node_conf = {} + # node_placement (optional) node_name = placement.get("node", "") + continuum = placement.get("continuum") or placement.get("continuum_layer") or "" + + node_placement: Dict[str, Any] = {} if node_name and node_name != "*": - node_conf["node"] = node_name + node_placement["node"] = node_name + if continuum and continuum != "*": + node_placement["continuum_layer"] = [continuum] - continuum = placement.get("continuum", "") - if continuum: - node_conf["continuumLayer"] = [continuum] - - if node_conf: - component["nodePlacement"] = node_conf - - # Add the remaining fields - component["restartPolicy"] = "OnFailure" - component["containers"] = [ - { - "image": image, - "imagePullPolicy": "IfNotPresent", - "ports": [ - {"containerPort": port} - ] - } - ] + if node_placement: + component["node_placement"] = node_placement - app["MLSysOpsApplication"]["components"] = [component] + # attach component + app["MLSysOpsApp"]["components"] = [component] return app + def generate_yaml( deployment_id: str, image: str, diff --git a/mlconnector/src/utils/mldeployments.py b/mlconnector/src/utils/mldeployments.py index 138394b..8cde3f1 100644 --- a/mlconnector/src/utils/mldeployments.py +++ b/mlconnector/src/utils/mldeployments.py @@ -5,6 +5,9 @@ from sqlalchemy.orm import Session from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select +from pathlib import Path + +import asyncio from fastapi import HTTPException from models.mldeployment import MLDeployment @@ -24,6 +27,9 @@ from utils.manage_s3 import S3Manager from sqlalchemy import update #myuuid = uuid.uuid4() +from pathlib import Path +from datetime import datetime, timezone +import re load_dotenv(verbose=True, 
override=True) @@ -34,7 +40,6 @@ os.getenv("AWS_SECRET_ACCESS_KEY"), os.getenv("AWS_ACCESS_URL") ) - def extract_feature_names(feature_list): type_mapping = { 'cont': "float", @@ -99,11 +104,11 @@ def generate_schema_code(flag=0, feature_list_str=None): for key, val in feature_dict.items() }} DataModel = create_model("DataModel", **fields) - DynamicSchema = create_model("DynamicSchema", data=(DataModel, ...), explanation=(bool, ...)) + DynamicSchema = create_model("DynamicSchema", data=(List[DataModel], ...), is_fun=(bool, False), explanation=(bool, False)) """) elif flag == 1: schema_code = dedent(""" - DynamicSchema = create_model("DynamicSchema", data_link=(str, ...), explanation=(bool, ...)) + DynamicSchema = create_model("DynamicSchema", data_link=(str, ...), is_fun=(bool, False), explanation=(bool, False)) """) return schema_code @@ -176,10 +181,34 @@ async def update_deployment( await db.refresh(existing_deployment) return existing_deployment +async def delete_deployments(db: AsyncSession, deployment_id: str) -> bool: + existing_deployment = await get_deployment_by_id(db=db, deployment_id=deployment_id) + if not existing_deployment: + return False + + await db.delete(existing_deployment) + await db.commit() + + base = os.getenv("NORTHBOUND_API_ENDPOINT") or os.getenv("NOTHBOUND_API_ENDPOINT") + url = f"{base.rstrip('/')}/ml/remove/{deployment_id}" + headers = {"Accept": "application/json"} + + try: + # Run blocking requests.delete in a worker thread + r = await asyncio.to_thread(requests.delete, url, headers=headers, timeout=20) + if r.status_code == 200: + print(f"Deployment {deployment_id} removed from Northbound.") + else: + print(f"Northbound delete failed ({r.status_code}): {r.text[:500]}") + except requests.RequestException: + print(f"Error calling Northbound delete for {deployment_id} at {url}") + + return True + async def create_deployment(db: AsyncSession, deployment: MLDeploymentCreate, create_new=False): model = await get_model_by_id(db, model_id=deployment.modelid) file_model = await get_model_files(db, modelid=deployment.modelid, filekind="model") - #file_require = await get_model_files(db, modelid=deployment.modelid, filekind="data") + file_code = await get_model_files(db, modelid=deployment.modelid, filekind="code") if(deployment.deployment_id ==""): deployment_id = str(uuid.uuid4()) else: @@ -196,14 +225,23 @@ async def create_deployment(db: AsyncSession, deployment: MLDeploymentCreate, cr if model is None: raise HTTPException(status_code=404, detail="No model details found with that model_id") else: - image_name = "registry.mlsysops.eu/usecases/augmenta-demo-testbed/"+deployment.modelid+":0.0.1" + image_name = os.getenv("DOCKER_REGISTRY_URL")+"/hackathon/"+deployment.modelid+":0.0.1" # download model file...
+ if file_model is None or len(file_model) == 0: + raise HTTPException(status_code=404, detail="No model file found for that model_id") local_model_path = prepare_model_artifact(s3_manager, file_model[0].filename) + # print(file_code) + if file_code: + #raise HTTPException(status_code=404, detail="No function file found for that model_id") + # download from S3 + s3_manager.download_file(object_name=file_code[0].filename, download_path=os.path.join("/code/utils/api","predict.py")) + #print(f"Local code path: {local_code_path}") + #print(f"Local model path: {file_code[0].filename}") build_and_push_image( #model.trained_model[0]['modelname'], file_model[0].filename, - "registry.mlsysops.eu", + os.getenv("DOCKER_REGISTRY_URL"), image_name, os.getenv("DOCKER_USERNAME"), os.getenv("DOCKER_PASSWORD"), @@ -225,8 +263,8 @@ async def create_deployment(db: AsyncSession, deployment: MLDeploymentCreate, cr port=8000 ) - #deployment_json = json.dumps(new_deployment) - #print(str(new_deployment)) + deployment_json = json.dumps(new_deployment) + print(str(new_deployment)) #con = await create_redis_connection() #await con.rpush(os.getenv("DEPLOYMENT_QUEUE"), [str(deployment_json)]) diff --git a/mlsysops-cli/descriptions/cluster/mls-test-manage.yaml b/mlsysops-cli/descriptions/cluster/mls-test-manage.yaml new file mode 100644 index 0000000..4481305 --- /dev/null +++ b/mlsysops-cli/descriptions/cluster/mls-test-manage.yaml @@ -0,0 +1,6 @@ +MLSysOpsCluster: + name: mls-test-manage + cluster_id: mls-test-manage + nodes: + - mls-compute-vm2 + - mls-compute-vm3 \ No newline at end of file diff --git a/mlsysops-cli/descriptions/cluster/mls01.yaml b/mlsysops-cli/descriptions/cluster/mls01.yaml new file mode 100644 index 0000000..dcf9245 --- /dev/null +++ b/mlsysops-cli/descriptions/cluster/mls01.yaml @@ -0,0 +1,6 @@ +MLSysOpsCluster: + name: mls01 + cluster_id: mls01 + nodes: + - mls02 + - mls03 \ No newline at end of file diff --git a/mlsysops-cli/descriptions/continuum/mls-test-karmada.yaml b/mlsysops-cli/descriptions/continuum/mls-test-karmada.yaml new file mode 100644 index 0000000..63b42dc --- /dev/null +++ b/mlsysops-cli/descriptions/continuum/mls-test-karmada.yaml @@ -0,0 +1,4 @@ +MLSysOpsContinuum: + continuum_id: mls-test-karmada + clusters: + - mls-test-manage \ No newline at end of file diff --git a/mlsysops-cli/descriptions/continuum/mls00.yaml b/mlsysops-cli/descriptions/continuum/mls00.yaml new file mode 100644 index 0000000..e0ebd22 --- /dev/null +++ b/mlsysops-cli/descriptions/continuum/mls00.yaml @@ -0,0 +1,4 @@ +MLSysOpsContinuum: + continuum_id: mls00 + clusters: + - mls01 \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mls-compute-vm2.yaml b/mlsysops-cli/descriptions/nodes/mls-compute-vm2.yaml new file mode 100644 index 0000000..00cff65 --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mls-compute-vm2.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mls-compute-vm2 + cluster_id: mls-test-manage + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mls-compute-vm3.yaml b/mlsysops-cli/descriptions/nodes/mls-compute-vm3.yaml new file mode 100644 
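Tying the generate_schema_code and make_prediction changes together: the regenerated service now accepts a batch of records, and is_fun routes inference to the bundled predict() instead of the joblib model. A hedged sketch of a client call — the feature names, host, and /predict path are assumptions for illustration, not taken from this diff:

# Hypothetical request against a service built with the updated schema:
# `data` is a list of records, `is_fun` and `explanation` default to False.
import requests

payload = {
    "data": [
        {"size": 1024.0, "download_time_ms": 35.2, "hour": 10},
        {"size": 2048.0, "download_time_ms": 61.7, "hour": 11},
    ],
    "is_fun": False,  # True would call the shipped predict() entry point
}
r = requests.post("http://localhost:8000/predict", json=payload, timeout=10)
print(r.status_code, r.json())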
index 0000000..674ae7b --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mls-compute-vm3.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mls-compute-vm3 + cluster_id: mls-test-manage + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mls02.yaml b/mlsysops-cli/descriptions/nodes/mls02.yaml new file mode 100644 index 0000000..dc6ac1a --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mls02.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mls02 + cluster_id: mls01 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: "100" + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mls03.yaml b/mlsysops-cli/descriptions/nodes/mls03.yaml new file mode 100644 index 0000000..6162a04 --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mls03.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mls03 + cluster_id: mls01 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: "100" + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mlsysops-worker1.rni.dc.ubiwhere.lan.yaml b/mlsysops-cli/descriptions/nodes/mlsysops-worker1.rni.dc.ubiwhere.lan.yaml new file mode 100644 index 0000000..b3fd9ea --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mlsysops-worker1.rni.dc.ubiwhere.lan.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mlsysops-worker1.rni.dc.ubiwhere.lan + cluster_id: mlsysops-cluster-agent1 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mlsysops-worker2.rni.dc.ubiwhere.lan.yaml b/mlsysops-cli/descriptions/nodes/mlsysops-worker2.rni.dc.ubiwhere.lan.yaml new file mode 100644 index 0000000..bb1aa03 --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mlsysops-worker2.rni.dc.ubiwhere.lan.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mlsysops-worker2.rni.dc.ubiwhere.lan + cluster_id: mlsysops-cluster-agent1 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: 
virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mlsysops-worker3.rni.dc.ubiwhere.lan.yaml b/mlsysops-cli/descriptions/nodes/mlsysops-worker3.rni.dc.ubiwhere.lan.yaml new file mode 100644 index 0000000..88fed0b --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mlsysops-worker3.rni.dc.ubiwhere.lan.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mlsysops-worker3.rni.dc.ubiwhere.lan + cluster_id: mlsysops-cluster-agent1 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mlsysops-worker4.rni.dc.ubiwhere.lan.yaml b/mlsysops-cli/descriptions/nodes/mlsysops-worker4.rni.dc.ubiwhere.lan.yaml new file mode 100644 index 0000000..b85bfa0 --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mlsysops-worker4.rni.dc.ubiwhere.lan.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mlsysops-worker4.rni.dc.ubiwhere.lan + cluster_id: mlsysops-cluster-agent2 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/descriptions/nodes/mlsysops-worker5.rni.dc.ubiwhere.lan.yaml b/mlsysops-cli/descriptions/nodes/mlsysops-worker5.rni.dc.ubiwhere.lan.yaml new file mode 100644 index 0000000..74ad4e3 --- /dev/null +++ b/mlsysops-cli/descriptions/nodes/mlsysops-worker5.rni.dc.ubiwhere.lan.yaml @@ -0,0 +1,26 @@ +MLSysOpsNode: + name: mlsysops-worker5.rni.dc.ubiwhere.lan + cluster_id: mlsysops-cluster-agent2 + continuum_layer: edge + permitted_actions: + - traffic_redirection + - acceleration + - cpu_frequency + - gpu_frequency + - change_container_cpu_set + - change_container_image + environment: + node_type: virtualized + os: ubuntu + container_runtime: + - containerd + hardware: + cpu: + cores: 1 + architecture: amd64 + frequency: + - 14000 + - 17000 + performance_indicator: 1 # BogoMIPS + memory: 100 + disk: "1" \ No newline at end of file diff --git a/mlsysops-cli/inventory.yaml b/mlsysops-cli/inventory.yaml new file mode 100644 index 0000000..6358e74 --- /dev/null +++ b/mlsysops-cli/inventory.yaml @@ -0,0 +1,55 @@ +all: + children: + management_cluster: + hosts: + mls-test-karmada: + ansible_host: 10.64.83.239 + ansible_user: runner + ansible_ssh_private_key_file: /home/runner/.ssh/runner + ansible_python_interpreter: /usr/bin/python3 + k3s_cluster_name: management + pod_cidr: "10.10.0.0/16" + service_cidr: "10.11.0.0/16" + labels: + is_vm: true + mlsysops.eu/continuumLayer: continuum + vaccel: "false" + + cluster1: + children: + master_nodes: + hosts: + mls-test-manage: + ansible_host: 10.64.83.230 + ansible_user: runner + ansible_ssh_private_key_file: 
/home/runner/.ssh/runner + ansible_python_interpreter: /usr/bin/python3 + k3s_cluster_name: mls-test-manage + pod_cidr: "10.12.0.0/16" + service_cidr: "10.13.0.0/16" + labels: + is_vm: true + mlsysops.eu/continuumLayer: cluster + vaccel: "false" + worker_nodes: + hosts: + mls-compute-vm2: + ansible_host: 10.64.83.170 + ansible_user: runner + ansible_ssh_private_key_file: /home/runner/.ssh/runner + ansible_python_interpreter: /usr/bin/python3 + k3s_cluster_name: mls-test-manage + labels: + is_vm: true + mlsysops.eu/continuumLayer: node + vaccel: "false" + mls-compute-vm3: + ansible_host: 10.64.83.241 + ansible_user: runner + ansible_ssh_private_key_file: /home/runner/.ssh/runner + ansible_python_interpreter: /usr/bin/python3 + k3s_cluster_name: mls-test-manage + labels: + is_vm: true + mlsysops.eu/continuumLayer: node + vaccel: "false" \ No newline at end of file diff --git a/mlsysops-cli/karmada.kubeconfig b/mlsysops-cli/karmada.kubeconfig deleted file mode 100644 index aa205f9..0000000 --- a/mlsysops-cli/karmada.kubeconfig +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: v1 -clusters: -- cluster: - server: https://lab1.nubificus.co.uk:32644 - insecure-skip-tls-verify: true - - name: karmada-apiserver -- cluster: - server: https://lab1.nubificus.co.uk:6443 - insecure-skip-tls-verify: true - name: karmada-host -- cluster: - certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkakNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdGMyVnkKZG1WeUxXTmhRREUzTlRJMk1UVXdOREV3SGhjTk1qVXdOekUxTWpFek1EUXhXaGNOTXpVd056RXpNakV6TURReApXakFqTVNFd0h3WURWUVFEREJock0zTXRjMlZ5ZG1WeUxXTmhRREUzTlRJMk1UVXdOREV3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFUdUdNS2JqaTBiV0tpbUxkWndsMDNnYUJtUFZ6TzZ6TFA5SVZ6Uk1XN3kKNXlacHhJOG9NL1JxNXd6MStMcElLanRCZGJZK0lmNm9SbkorbGxLTkFoWG1vMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVW1pY3g1bmxqeElxb1J4MDVnanlzCnJkbFQrWjh3Q2dZSUtvWkl6ajBFQXdJRFJ3QXdSQUlnT3NyRWY3RTRmSkxGN3ZpdWxmRkxDV2VabzViSzVBRysKVHg3SnhhTm9YMThDSUVOUE94TEJEMU5oT004aW1RM2xlQVRnMkN5SndPd3lnZmJhQWhRSFRQT1gKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= - server: https://192.168.5.78:6443 - name: mls01 -contexts: -- context: - cluster: karmada-apiserver - user: karmada-apiserver - name: karmada-apiserver -- context: - cluster: karmada-host - user: karmada-host - name: karmada-host -- context: - cluster: mls01 - user: mls01 - name: mls01 -current-context: karmada-host -kind: Config -preferences: {} -users: -- name: karmada-apiserver - user: - client-certificate-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZQVENDQTZXZ0F3SUJBZ0lVWHlXTFN1YWZ0Z2hlb3BNNzV0RTNweW84Ymo0d0RRWUpLb1pJaHZjTkFRRU0KQlFBd1hERUxNQWtHQTFVRUJoTUNlSGd4Q2pBSUJnTlZCQWdNQVhneENqQUlCZ05WQkFjTUFYZ3hDakFJQmdOVgpCQW9NQVhneENqQUlCZ05WQkFzTUFYZ3hDekFKQmdOVkJBTU1BbU5oTVJBd0RnWUpLb1pJaHZjTkFRa0JGZ0Y0Ck1CNFhEVEkxTURjeE5USXhNamN3TUZvWERUTXdNRGN4TkRJeE1qY3dNRm93TURFWE1CVUdBMVVFQ2hNT2MzbHoKZEdWdE9tMWhjM1JsY25NeEZUQVRCZ05WQkFNVERITjVjM1JsYlRwaFpHMXBiakNDQWFJd0RRWUpLb1pJaHZjTgpBUUVCQlFBRGdnR1BBRENDQVlvQ2dnR0JBTHNyT25TR2RFRXNsUlJtdVZPRVN0ZjZWWDdmUjczWERoMDRiaUp3CnJna21zYTNUM1ZIaE1keWsybE1CNWhOY0U0U2llNUZiTER5WDZ3cHZSQTh3THZqRGZ3N0VzbFp3RGU5MVVWSGEKa2c4c2JEOGYrMVE5VU5JWjJmOS9ZUlYrSUdCam9FMGdtOVVmRnRwY2pvNkZGd3Z2dXBHN2pNMWRISXZ1cmFySAorbllRYkZGM1J3WmNFY2RpRytGdUJLeUQxK3VzT2lQTkxzYTFuVU8rdHFkQVozV1paTWVDSHVQcWF0QUtOMzJ5Cnc2ZnVjMHBEaFFnc2VLTGRqR0szTzNtckVwZzRUYXFjVWgyckdTbFZEMDVrMHdHTlZDT2h1Y3crbUp5c2E0b0UKNjJVSEdzNVVKa1V4bElOTGZJN0w2MFRjQlpMME56djVQT0RIRmhxbW9SdUpWZjNMbVZhaEpZdmhmSW5EZXJxRgpIbGJaSzBxY3JSLzZtN1JWU3VEYXZsV205YjNZQlloUWdYcU12UzkzdHhCdDVaR2trQWtUZXR0c3JNYW1zRnp2CkJZQS8vNUhuRXVOWkU2c2RpVlllbzVteHM5R2FPWjVOTjRqKzVUMkxvL2VEUHJxZjZwdXNMemI5WVlHUEtOdkQKNzlCc0FNRG1Tak9ybjAyN3kxUXdxeGNiU3dJREFRQUJvNElCSVRDQ0FSMHdEZ1lEVlIwUEFRSC9CQVFEQWdXZwpNQjBHQTFVZEpRUVdNQlFHQ0NzR0FRVUZCd01DQmdnckJnRUZCUWNEQVRBTUJnTlZIUk1CQWY4RUFqQUFNQjBHCkExVWREZ1FXQkJRSGdYSkhnTFNlRG1JNExub0JvTlFpQTlkdkR6QWZCZ05WSFNNRUdEQVdnQlQrdG1ud1lhbDkKdWJadndabGMzeWc0cDBNbXdqQ0JuUVlEVlIwUkJJR1ZNSUdTZ2hacmRXSmxjbTVsZEdWekxtUmxabUYxYkhRdQpjM1pqZ2dsc2IyTmhiR2h2YzNTQ0p5b3VaWFJqWkM1cllYSnRZV1JoTFhONWMzUmxiUzV6ZG1NdVkyeDFjM1JsCmNpNXNiMk5oYklJaUtpNXJZWEp0WVdSaExYTjVjM1JsYlM1emRtTXVZMngxYzNSbGNpNXNiMk5oYklJVUtpNXIKWVhKdFlXUmhMWE41YzNSbGJTNXpkbU9IQkg4QUFBR0hCTUNvQlVVd0RRWUpLb1pJaHZjTkFRRU1CUUFEZ2dHQgpBRXVONkIyWXVIUDNHQUNoZjNMYnIyb0p0M3I4NXptT21neWY2YVkzaDZLVWd5aHZXYWFqdmJzY1FUM2ozNmRiCmwwSVk5aWxPdVZvczZBM1RJSnlIVmcvalJJSW5nb1RyODZzcjErNStsQVdJTVJhOWYrM3pWeUFYUTVIcjhibnYKazFvbmxRcDI3VVVISjZRamRycTZPdmlBeGNmUnhaRFRyd3cvRlVyMHZ4d240Z3N0a1BZdUhvbWNiaS94YzVRago5N3FGTUh3bzZhSUQ4V3ZCM0NPTXJiL3k4bzg3aUlRVkdmZkZxWXVuY1d5aVZwTGtyM2V0UlRZa2dJY0JXUk12ClhxeXFheUNpYUY1cVk4VFlFYmpOYkdtbmdxNlJ4aTVmbXVXd21mM1N3S09najZuaGJGUUdRN3A3ZnNkTlM3eGIKSXoxZnlrd2xqK2ZQZ2xBTzBiSVNIR1dhYjlOcGpGRm1MMS95QWJpaEtUcUo0WGJTVmlmMjNWQUozUmx3SlQvLwpjM3FGS0RhUFJoWnZDcjNpcVlSMk0rak5nYXB2YklTR2EyaERCNyt6RW9yTmNQRjIxM3h0NjBUaGlhZlhaVUtmCnQyMkg2YnBZdmpUelJEV0tOMzFRcmhDZGZLVTB2R1h3TWlDaW9zWitOalI4cHZEMmVPdjhGa2lvNDdFZ1dkeXIKOXc9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== - client-key-data: 
LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlHNVFJQkFBS0NBWUVBdXlzNmRJWjBRU3lWRkdhNVU0UksxL3BWZnQ5SHZkY09IVGh1SW5DdUNTYXhyZFBkClVlRXgzS1RhVXdIbUUxd1RoS0o3a1Zzc1BKZnJDbTlFRHpBdStNTi9Ec1N5Vm5BTjczVlJVZHFTRHl4c1B4LzcKVkQxUTBoblovMzloRlg0Z1lHT2dUU0NiMVI4VzJseU9qb1VYQysrNmtidU16VjBjaSs2dHFzZjZkaEJzVVhkSApCbHdSeDJJYjRXNEVySVBYNjZ3Nkk4MHV4cldkUTc2MnAwQm5kWmxreDRJZTQrcHEwQW8zZmJMRHArNXpTa09GCkNDeDRvdDJNWXJjN2Vhc1NtRGhOcXB4U0hhc1pLVlVQVG1UVEFZMVVJNkc1ekQ2WW5LeHJpZ1RyWlFjYXpsUW0KUlRHVWcwdDhqc3ZyUk53Rmt2UTNPL2s4NE1jV0dxYWhHNGxWL2N1WlZxRWxpK0Y4aWNONnVvVWVWdGtyU3B5dApIL3FidEZWSzROcStWYWIxdmRnRmlGQ0Jlb3k5TDNlM0VHM2xrYVNRQ1JONjIyeXN4cWF3WE84RmdELy9rZWNTCjQxa1RxeDJKVmg2am1iR3owWm81bmswM2lQN2xQWXVqOTRNK3VwL3FtNnd2TnYxaGdZOG8yOFB2MEd3QXdPWksKTTZ1ZlRidkxWRENyRnh0TEFnTUJBQUVDZ2dHQWVVMTQ0eEJJdy95SzJlQUJlRGtpOEhyVGNoaHZQSC9tSzlYbwp1dlZnTlpvWVNWb244K3NKRDhEZnBqQnkxbFUwYVIrTkNoV1d3K2FKZEFmTWlqTm1IZDlGdDJLUFdpbmhQakpBCmM5WStGd3RwVkRmOEFRcUVHTWFjWEZSSHJWQkMxM1Y1N01LWGZEWmI3MFZSTnRqV3FPdkRkTE9pZTI0L0ZBbXEKRUNidkVobHV1aFdkdUdkdk5sTkZVb0tsRU5HWjNLbVFQS2U4Q3hpbFJjU1NTQzhiWGtIOTZMZFNJZEJKVFc5eApxMHR5TnFINW9rWEVNQm55VFV0Y3NwV2lQQ1k4R1JHU09HLzk5ekNNci83Rk02cEZpL2J4UUdsSGhTSHdjNmFVCi9FY1RzNHR0S25UcjFCNDRUdFJlcHJLOW1heUIvK0hFQ1ExZjBNUDlERWZOQ1A4eFh0dGoya0tSNjhrbktXUU4KeGkva1ZFQkpEVzhyOC82RFlTYXp2UWRsYVhwK1JOWGF3TDJoLzQ4emIvKzJKTzMyVlh3U3hERVIrbytsZERTQgp1T3dvRVFzcFVmazdqVW8rRFNqWW1pTmFseWFKRUQwT0VPeXdTcUVoT0ZzMWViUWNsemtESktJRklYd3dsWjJqCmNlUXYxMXdXQjNKZHFLekdwUmVLbitZNHFub0JBb0hCQU4vVnVFWlRweG9HVDBJNXpsVHd5ZEJlbmlKQ3REb3MKcitIdkdRMzNyYjJJa0I3QlJ6ai9oTVUvV2UwZjBaQ0wrZ0RteEZsZWhBZjFoQ1Vwa3A5WjRYcUlWRTRocFR3dApsK2dNVWlKMUwxanFjcG12OWh4OWpLNmJYWXlPT05CbHp5YlNkNEx3MDFldm45UDUrN1IxTmRwaVZDUkxuVXNNCkxESkFtT213RnhsWGhhRFo5ZzUvL1lmSjNMRzJhT3FEVmdSZHZFbGxzYlZjVzZ5eThGZW5DbFhjRGk4REpCNEgKc1Axd2M0em9HYnkxek5kQ1hyS3J1RDhiNklMQ054ZTZnUUtCd1FEV0VLcDVraG5vYS9Qb3Y4Y05vclFQOGdpZQpSOUZBZWd6RE8wK3gzOWdPUnBHWC9xWHptcTNxRU95TWhUa2FpSnRIOTZla2VybHYwRXNqTkxLc2VWdlFHdTVtCmJIUkJPQWN3UUhhdVlJT3JvaGg2TWpsMXJXYVdlYlV1YmRselJSVCtsN2VlQkVkdjhkRjBsL1lpQUVMUnhxbVcKWVRKa3Ezc1NXeFlhZ0hjYXBQVVBDaERTYzAyRkNPRElwSjRZRzdGYnZqckhmVUZ2RjZXbjNpem1DeWVuOVhtUgpkSnVsS3o4M3Q3VnhoNDJoOFVMWjFhWmRqcFU3Z0lpOUdrTDB0OHNDZ2NFQWtCWFljRlRyRFVLZ2hLWUhYd0E5Ck9VZWZEYmpZb1ZZZ3hoU09rVUl0Y2dOVm1xYWFQV09tU1A2MTJZR1EralpROFlHYlJWZ2xMNlpvc1M5b3RmRTYKeHhDNURDNTJXaDJUV29tTDJhaUxtUFlhWFRvNitDNE5nRng4eEx6TWI4SUpyaGp2VzdmQXBBSm1wU0tYVjV2WAoxY05VSEtVY0ZMSTZSUUwySy8veGJwdTdaa2JJTUo5TlNicTB1U1VOdG1CaTM1NmpWTEFFekJVdTRpV3h0d3JOCkNXWUZMMWdiU01WMHRrQkw4Wmw2NzlJcmtTbWxJZWlPU3ZtaEc0TC95ZU9CQW9IQkFMVU1rWEtTL0RDVXBVMTAKYmlGOU8vZkVpRDkrbEYrSkdxUkExTTJuZWUvUkJiRGRJblNmblhvN2FDWU5RMWZLdERLeEptdTVsSDhhVjRKbQpQcndiUkdrRmJoN1ltWmJUa1ZqaEhMK3RSNC94WCt4eDhDRTBVSER5YlNmZ3BxcVVnZ1kyUVlsM0t3aG1LUFlJCjFKYTdTOGhxMHVJcVFObmVJeEJLVTY2RWVyZmhUcGF5dDdtVzViaWY0cVl3ZkxuOUtndGpJQW03U1kxeEhaMlcKRHJsZU0rTWRmZjFrOTROQnlZMFJabmVuNDBWVmJXb2FCa1JZcEtVWHVBN2dXRUxyOHdLQndRREtRampoRlVpdwpaeGk1VVpDdXErZ1RwdWF3dlcxbnRIR1VCWFc5V2lyN201OGdxUjNEY2lTbzJNUWhWN2s2ZUJvbzFxbGI5SzhmCnI5ZktPc0FaRU92TU5UMW9SeFFHU3lGdTY5R3cyTCt1Z2h4L1VBdW1TeUZGTSs2RjBMQ21rbld5QndEd3UwZUQKQURNc0h6bGhRYm5MMEdwaTJybFg4R3p1MkEzQ2NTck1sWXJDT1RxbEpHc01QTHJpQnBwYXdvKzUxUWlxM09WagpxaDZKK2lqWHpHdjc0akNSODlKakNHUTNKQ2VNY2N2N0t5MnU2TlZ6QjZPOEZ2QWdkYnpoRlRNPQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQo= -- name: karmada-host - user: - client-certificate-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrRENDQVRlZ0F3SUJBZ0lJZUJaWEhnRWdMdWN3Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOelV5TmpFMU1EUXlNQjRYRFRJMU1EY3hOVEl4TXpBME1sb1hEVEkyTURjeApOVEl4TXpBME1sb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJEdExjdkxSV094ZTBjd1MKRitiTnpCdlhySGxjYTNQMlFCRkJTZHg1YkxDZWpjWmEzMzFsTnpXZ1JTd2Nhdmkzd05Fc0VjMnhuUTd2UXBYMQplL1d2TVZLalNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCUkpLZWNOQmNnVXVDaXoyZml3K2NpcjBiQUVPekFLQmdncWhrak9QUVFEQWdOSEFEQkUKQWlCaGtwanBLbzhMZURnYVRtekIyM2RiSlh5SURycytBRCtaMDNUVHVIeHpmd0lnTzZiQzNOOHZnQnN2dWpjVQp0TVJoOE9VY3RPcDEvRG5IekxGc0dCbWM1WlE9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkekNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTlRJMk1UVXdOREl3SGhjTk1qVXdOekUxTWpFek1EUXlXaGNOTXpVd056RXpNakV6TURReQpXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTlRJMk1UVXdOREl3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFSUTRBdVJHYVk1RUVLYWZ0V2p2VE8zS293czh3TjdoVUVObXEzUG5Bd0cKUmR4WlJxUVJUVFc5NGIvcDZ1empCeVp5M0JhTTlrNjAzOUtsOWpZYmpLZ09vMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVVNTbm5EUVhJRkxnb3M5bjRzUG5JCnE5R3dCRHN3Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUlnYTRUWnpnSW5XWU9Na0JkUzFqektld1lCQXE4a3B3YVUKclcwQ1Z6ZWt5VjhDSVFEeThSYUFWVFl2RnZhenBvdUh0NWwvd1RVbEJWZytjUktJRHVLc2JMMklEZz09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K - client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSU03ZnR0M1JKNms3OE5LSjM5ZkVwMlZ0Zm9RY2xSV1hqT1dFZlI2RG5mWG5vQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFTzB0eTh0Rlk3RjdSekJJWDVzM01HOWVzZVZ4cmMvWkFFVUZKM0hsc3NKNk54bHJmZldVMwpOYUJGTEJ4cStMZkEwU3dSemJHZER1OUNsZlY3OWE4eFVnPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo= -- name: mls01 - user: - client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJrRENDQVRlZ0F3SUJBZ0lJZnR0TTRYL3RreDR3Q2dZSUtvWkl6ajBFQXdJd0l6RWhNQjhHQTFVRUF3d1kKYXpOekxXTnNhV1Z1ZEMxallVQXhOelV5TmpFMU1EUXhNQjRYRFRJMU1EY3hOVEl4TXpBME1Wb1hEVEkyTURjeApOVEl4TXpBME1Wb3dNREVYTUJVR0ExVUVDaE1PYzNsemRHVnRPbTFoYzNSbGNuTXhGVEFUQmdOVkJBTVRESE41CmMzUmxiVHBoWkcxcGJqQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJHRlkzbGdxTEQ5R1pIQkMKWlJvN1BaeVRlNUdieFJNeGQxY0d0bXJsWVZWRUtzUnYvYzQ4djFhb3dneXlmNlBnU0I2Nk9xanRWdzNHNW5sdwpYYWovVmE2alNEQkdNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBakFmCkJnTlZIU01FR0RBV2dCVDFkRWRwNm5RbnFhQ20xNE9qSjYwQUxHemZ1ekFLQmdncWhrak9QUVFEQWdOSEFEQkUKQWlCWlRjUEFhNTVJZmFTRXlMY0QyNS9qU1EyY0tRYVFqSm4vMHo4RkJFUXpvd0lnQ04wNThLRUF5bjZGWENOTgpXRFlWeks0VHUzcDFSY28xaTR4dG92R0x3TXM9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0KLS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkekNDQVIyZ0F3SUJBZ0lCQURBS0JnZ3Foa2pPUFFRREFqQWpNU0V3SHdZRFZRUUREQmhyTTNNdFkyeHAKWlc1MExXTmhRREUzTlRJMk1UVXdOREV3SGhjTk1qVXdOekUxTWpFek1EUXhXaGNOTXpVd056RXpNakV6TURReApXakFqTVNFd0h3WURWUVFEREJock0zTXRZMnhwWlc1MExXTmhRREUzTlRJMk1UVXdOREV3V1RBVEJnY3Foa2pPClBRSUJCZ2dxaGtqT1BRTUJCd05DQUFUUVorSjZMOTZ0Y0JxU1J1Z09maGdOZ3lPanhSbFVXQkIwVDhhTmErd08KcE53UFNvQXdpQ3ZMbWF2Mlh6d2wxS0dXUi9nNTNuOU5OOGNBNi90TVBvWk1vMEl3UURBT0JnTlZIUThCQWY4RQpCQU1DQXFRd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFRmdRVTlYUkhhZXAwSjZtZ3B0ZURveWV0CkFDeHMzN3N3Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUloQVA1Z3NvY1N4dXVwKzRnYnVGOGFDOXRxRU9UcjJXd08KS0RwZ3kveGJBZ3JlQWlCeUpiYzZRZWFvS0pQcWoxS3JULzdJcXdZc2lZekd2WEdRN2RCMTMxdFdMZz09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K - client-key-data: 
LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUxyMG9SNkk0cm1JMG9jS1ZEdTc2aksrWDVGdU5QZjF6NmFodWNCYy9hbFhvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFWVZqZVdDb3NQMFprY0VKbEdqczluSk43a1p2RkV6RjNWd2EyYXVWaFZVUXF4Ry85emp5LwpWcWpDRExKL28rQklIcm82cU8xWERjYm1lWEJkcVA5VnJnPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo= \ No newline at end of file diff --git a/mlsysops-cli/mlsysops_cli/cli.py b/mlsysops-cli/mlsysops_cli/cli.py index ee70ac8..7149033 100644 --- a/mlsysops-cli/mlsysops_cli/cli.py +++ b/mlsysops-cli/mlsysops_cli/cli.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +import asyncio +import sys import traceback import click @@ -12,6 +14,7 @@ deploy_continuum_agents, deploy_cluster_agents, deploy_node_agents, + deploy_fita_agents ) from mlsysops_cli.deployment.descriptions_util import create_app_yaml @@ -82,6 +85,7 @@ def list_all(): except Exception as e: click.secho(f"Connection Error: {e}", fg='red') + @click.command(help='Takes as Input a ticket and returns the performance of the app') @click.argument('app_id') def get_app_performance(app_id): @@ -128,7 +132,6 @@ def get_app_performance(app_id): click.echo(click.style(f"❌ ERROR. Reason: {error_message}", fg='red')) - @click.command(help='Takes as Input a ticket and returns the deployment status') @click.argument('app_id') def get_app_status(app_id): @@ -225,6 +228,7 @@ def remove_app(app_id): error_message = response.json().get("detail", "Unknown error") click.echo(click.style(f"ERROR. Reason: {error_message}", fg='red')) + apps.add_command(deploy_app) apps.add_command(list_all) apps.add_command(get_app_status) @@ -235,7 +239,6 @@ def remove_app(app_id): cli.add_command(apps) - # ----------------------------------------------------------------------------- # Infrastructure commands # ----------------------------------------------------------------------------- @@ -317,8 +320,10 @@ def framework(): @click.command(help="Deploy all components (core services, continuum, clusters, nodes)") -@click.option('--path', type=click.Path(exists=True), required=False, help='Path to the desriptions directory. It MUST include path/continuum,path/cluster,path/node') -@click.option('--inventory', type=click.Path(exists=True), required=False, help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.') +@click.option('--path', type=click.Path(exists=True), required=False, + help='Path to the desriptions directory. It MUST include path/continuum,path/cluster,path/node') +@click.option('--inventory', type=click.Path(exists=True), required=False, + help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.') def deploy_all(path, inventory): # Ensure only one of the --path or --uri options is provided if path and inventory: @@ -339,8 +344,10 @@ def deploy_services(): @click.command(help="Deploy the continuum agent") -@click.option('--path', type=click.Path(exists=True), required=False, help='Path to the desriptions directory. It MUST include path/continuum,path/cluster,path/node') -@click.option('--inventory', type=click.Path(exists=True), required=False, help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.') +@click.option('--path', type=click.Path(exists=True), required=False, + help='Path to the desriptions directory. 
It MUST include path/continuum,path/cluster,path/node')
+@click.option('--inventory', type=click.Path(exists=True), required=False,
+              help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.')
 def deploy_continuum(path, inventory):
     # Ensure only one of the --path or --uri options is provided
     if path and inventory:
@@ -353,8 +360,10 @@ def deploy_continuum(path, inventory):
 @click.command(help="Deploy the cluster agents")
-@click.option('--path', type=click.Path(exists=True), required=False, help='Path to the desriptions directory. It MUST include path/continuum,path/cluster,path/node')
-@click.option('--inventory', type=click.Path(exists=True), required=False, help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.')
+@click.option('--path', type=click.Path(exists=True), required=False,
+              help='Path to the descriptions directory. It MUST include path/continuum,path/cluster,path/node')
+@click.option('--inventory', type=click.Path(exists=True), required=False,
+              help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.')
 def deploy_cluster(path, inventory):
     # Ensure only one of the --path or --uri options is provided
     if path and inventory:
@@ -367,8 +376,10 @@ def deploy_cluster(path, inventory):
 @click.command(help="Deploy the node agents")
-@click.option('--path', type=click.Path(exists=True), required=False, help='Path to the desriptions directory. It MUST include path/continuum,path/cluster,path/node')
-@click.option('--inventory', type=click.Path(exists=True), required=False, help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.')
+@click.option('--path', type=click.Path(exists=True), required=False,
+              help='Path to the descriptions directory. 
It MUST include path/continuum,path/cluster,path/node') +@click.option('--inventory', type=click.Path(exists=True), required=False, + help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.') def deploy_node(path, inventory): # Ensure only one of the --path or --uri options is provided if path and inventory: @@ -380,16 +391,103 @@ def deploy_node(path, inventory): except Exception as e: click.secho(f"❌ Error during node agents deployment: {e}", fg='red') -@click.command(help="Create a test application description using an inventory YAML.") -@click.option('--inventory', type=click.Path(exists=True), required=True, help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.') -@click.option('--cluster', type=str, required=False, help='Cluster name to prepare the test application description for.') +@click.command(help="Create a test application description using an inventory YAML.") +@click.option('--inventory', type=click.Path(exists=True), required=True, + help='Path to the inventory YAML that was used from cluster/karmada setup ansible script.') +@click.option('--cluster', type=str, required=False, + help='Cluster name to prepare the test application description for.') def create_test_app_description(inventory, cluster): try: - create_app_yaml(inventory,cluster) + create_app_yaml(inventory, cluster) except Exception as e: click.secho(f"❌ Error during test application descriptions creation: {e}", fg='red') +# ----------------------------------------------------------------------------- +# Deploy FITA command +# ----------------------------------------------------------------------------- + +@framework.command('deploy-fita') +@click.option('--release-name', required=True, help='The name for the Helm release.') +@click.option('--chart-path', required=True, help='The path to the Helm chart directory.') +@click.option('--namespace', default='default', show_default=True, help='The Kubernetes namespace to deploy into.') +@click.option('--create-namespace', is_flag=True, help='Create the namespace if it does not exist.') +@click.option('--values', '-f', multiple=True, type=click.Path(exists=True), help='Path to a Helm values YAML file. Can be specified multiple times.') +async def deploy_fita(release_name, chart_path, namespace, create_namespace, values): + """ + Deploys FITA components using a Helm chart with asyncio. + + This command wraps 'helm install' to provide a standardized deployment method. + """ + click.echo(f"🚀 Starting FITA deployment for release '{release_name}'...") + + # --- Construct the Helm Command --- + # We start with the base command. + helm_command = [ + 'helm', 'install', release_name, chart_path, + '--namespace', namespace + ] + + # If the --create-namespace flag is used, add it to the command. + if create_namespace: + helm_command.append('--create-namespace') + + # Add any specified values files to the command. + for value_file in values: + helm_command.extend(['--values', value_file]) + + click.echo("--------------------------------------------------") + click.echo(f"Executing command: {' '.join(helm_command)}") + click.echo("--------------------------------------------------") + + # --- Execute the Helm Command with asyncio --- + # We use asyncio.create_subprocess_exec to run the command asynchronously. 
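+    # Note: plain Click does not await coroutine callbacks, so this async command
+    # needs an asyncio bridge to actually run; a minimal sketch (assuming the body
+    # below is factored into an async helper) would be:
+    #
+    #     @framework.command('deploy-fita')
+    #     def deploy_fita(...):
+    #         asyncio.run(_deploy_fita_async(...))
+    #
+    # Alternatively, a drop-in replacement such as asyncclick provides this
+    # bridging natively.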
+ try: + process = await asyncio.create_subprocess_exec( + *helm_command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT # Redirect stderr to stdout + ) + + # Asynchronously read the output line by line and print it. + # This provides real-time feedback to the user. + if process.stdout: + while True: + line = await process.stdout.readline() + if not line: + break + # The output is in bytes, so we decode it to a string. + sys.stdout.write(line.decode('utf-8')) + sys.stdout.flush() + + # Wait for the subprocess to complete and get the return code. + return_code = await process.wait() + + if return_code: + # If the command fails, we report an error. + click.secho(f"\n❌ Error: Helm command failed with exit code {return_code}.", fg='red') + sys.exit(return_code) + + click.echo("\n--------------------------------------------------") + click.secho(f"✅ FITA deployed successfully!", fg='green') + + except FileNotFoundError: + click.secho("\n❌ Error: 'helm' command not found.", fg='red') + click.echo("Please ensure Helm is installed and in your system's PATH.") + sys.exit(1) + except Exception as e: + click.secho(f"\n❌ An unexpected error occurred: {e}", fg='red') + sys.exit(1) + +@framework.command(help="fita-agents") +@click.option('--descriptions', type=click.Path(exists=True), required=False, + help='Fita Node descriptions.') +def fita_agents(descriptions): + try: + deploy_fita_agents(descriptions) + except Exception as e: + click.secho(f"❌ Error during fita node agents deployment: {e}", fg='red') + # Add commands to the 'framework' group framework.add_command(deploy_all) framework.add_command(deploy_services) @@ -397,6 +495,8 @@ def create_test_app_description(inventory, cluster): framework.add_command(deploy_cluster) framework.add_command(deploy_node) framework.add_command(create_test_app_description) +framework.add_command(deploy_fita) +framework.add_command(fita_agents) cli.add_command(framework) @@ -439,6 +539,7 @@ def set_mode(mode): manage.add_command(set_mode) cli.add_command(manage) + # ----------------------------------------------------------------------------- # Agent commands (set-policy, delete-policy) # -----------------------------------------------------------------------------@click.group() @@ -447,11 +548,13 @@ def agent(): """Manage policies for agents.""" pass + policies_configmap = { - "cluster": "cluster-agents-policies", - "continuum": "continuum-policies", - "node": "node-agents-policies" - } + "cluster": "cluster-agents-policies", + "continuum": "continuum-policies", + "node": "node-agents-policies" +} + @agent.command(name="set-policy") @click.option('--agent', type=click.Choice(['cluster', 'continuum', 'node']), required=True, @@ -483,9 +586,10 @@ def policy_add_or_update(agent, policy_file): except Exception as e: click.echo(f"Failed to update ConfigMap: {e}") + @agent.command("delete-policy") @click.option("--agent", type=click.Choice(["cluster", "continuum", "node"]), - required=True, help="Agent type to remove a policy from") + required=True, help="Agent type to remove a policy from") @click.argument("name", type=str) def policy_delete(agent: str, name: str): """Delete policy by name for the given agent.""" @@ -523,6 +627,7 @@ def policy_delete(agent: str, name: str): "node": "node-agents-config", } + @agent.command("set-config") @click.option("--agent", type=click.Choice(["cluster", "continuum", "node"])) @click.option("--file", "config_file", type=click.Path(exists=True), required=True, help="Path to the config file") @@ -552,6 +657,7 @@ 
def set_config(agent: str, config_file: str): except Exception as e: click.echo(f"Failed to update ConfigMap: {e}") + @agent.command("delete-config") @click.option("--agent", type=click.Choice(["cluster", "continuum", "node"])) @click.argument("key", type=str) diff --git a/mlsysops-cli/mlsysops_cli/deployment/api-service-deployment.yaml b/mlsysops-cli/mlsysops_cli/deployment/api-service-deployment.yaml index f691f17..29f015a 100644 --- a/mlsysops-cli/mlsysops_cli/deployment/api-service-deployment.yaml +++ b/mlsysops-cli/mlsysops_cli/deployment/api-service-deployment.yaml @@ -21,7 +21,7 @@ spec: command: ["sh", "-c", "echo 'Waiting...' && sleep 4"] containers: - name: nb-api-service - image: harbor.nbfc.io/mlsysops/northbound-api + image: {{ CONTAINER_IMAGE }} ports: - containerPort: 8000 name: http-port diff --git a/mlsysops-cli/mlsysops_cli/deployment/cluster-agents-daemonset.yaml b/mlsysops-cli/mlsysops_cli/deployment/cluster-agents-daemonset.yaml index 1199c00..d554b86 100644 --- a/mlsysops-cli/mlsysops_cli/deployment/cluster-agents-daemonset.yaml +++ b/mlsysops-cli/mlsysops_cli/deployment/cluster-agents-daemonset.yaml @@ -22,7 +22,7 @@ spec: command: ["sh", "-c", "echo 'Waiting...' && sleep 20"] containers: - name: cluster-agent - image: harbor.nbfc.io/mlsysops/cluster-agent + image: {{ CONTAINER_IMAGE }} imagePullPolicy: Always env: - name: NODE_NAME @@ -39,8 +39,6 @@ spec: value: {{ KARMADA_HOST_IP }} - name: NODE_PASSWORD value: "1234" - - name: PYTHONPATH - value: "/workdir/fluidity" - name: DESCRIPTION_PATH value: "/etc/mlsysops/descriptions" - name: LOCAL_OTEL_ENDPOINT diff --git a/mlsysops-cli/mlsysops_cli/deployment/continuum-agent-daemonset.yaml b/mlsysops-cli/mlsysops_cli/deployment/continuum-agent-daemonset.yaml index fd2c6b1..594c60d 100644 --- a/mlsysops-cli/mlsysops_cli/deployment/continuum-agent-daemonset.yaml +++ b/mlsysops-cli/mlsysops_cli/deployment/continuum-agent-daemonset.yaml @@ -24,7 +24,7 @@ spec: containers: - name: continuum-agent imagePullPolicy: Always - image: harbor.nbfc.io/mlsysops/continuum-agent + image: {{ CONTAINER_IMAGE }} env: - name: KARMADA_API_KUBECONFIG value: /etc/kubeconfigs/karmada-api.kubeconfig @@ -41,13 +41,17 @@ spec: fieldRef: fieldPath: status.hostIP - name: EJABBERD_DOMAIN - valueFrom: - fieldRef: - fieldPath: status.hostIP + value: {{ KARMADA_HOST_IP }} - name: NODE_PASSWORD value: "1234" - name: DESCRIPTION_PATH value: "/etc/mlsysops/descriptions" + {% if OTEL_EXPORT_IP and OTEL_EXPORT_PORT %} + - name: MLS_OTEL_CONTINUUM_EXPORT_IP + value: "{{ OTEL_EXPORT_IP }}" + - name: MLS_OTEL_CONTINUUM_EXPORT_PORT + value: "{{ OTEL_EXPORT_PORT }}" + {% endif %} resources: requests: memory: "128Mi" diff --git a/mlsysops-cli/mlsysops_cli/deployment/deploy.py b/mlsysops-cli/mlsysops_cli/deployment/deploy.py index c605aa9..7b8a464 100644 --- a/mlsysops-cli/mlsysops_cli/deployment/deploy.py +++ b/mlsysops-cli/mlsysops_cli/deployment/deploy.py @@ -13,8 +13,8 @@ import subprocess from mlsysops_cli import deployment from mlsysops_cli.deployment.descriptions_util import create_cluster_yaml, create_worker_node_yaml,create_continuum_yaml - - +import glob +import traceback def parse_yaml_from_file(path_obj: Path, template_variables: dict = {}) -> list | None: """ Parses a YAML file from a Path object (e.g. 
importlib.resources.files(...)) using Jinja2 templates @@ -610,8 +610,8 @@ def run_deploy_all(path, inventory_path): def deploy_core_services(): print("🔧 Deploying core services (ejabberd, redis, API service)...") _check_required_env_vars("KARMADA_HOST_IP", "KUBECONFIG") - client_k8s = KubernetesLibrary("apps", "v1", os.getenv("KUBECONFIG", "/etc/rancher/k3s/k3s.yaml"), - context="karmada-host") + client_k8s = KubernetesLibrary("apps", "v1", + os.getenv("KUBECONFIG", "/etc/rancher/k3s/k3s.yaml"), context="karmada-host") _apply_namespace_and_rbac(client_k8s) xmpp_path = files(deployment).joinpath("ejabberd-deployment.yaml") @@ -619,7 +619,9 @@ def deploy_core_services(): client_k8s.create_or_update(r) api_path = files(deployment).joinpath("api-service-deployment.yaml") - for r in parse_yaml_from_file(api_path, {"KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP")}): + for r in parse_yaml_from_file(api_path, {"KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP"), + "CONTAINER_IMAGE": os.getenv("API_CONTAINER_IMAGE","harbor.nbfc.io/mlsysops/northbound-api") + }): client_k8s.create_or_update(r) redis_path = files(deployment).joinpath("redis-stack-deployment.yaml") @@ -656,7 +658,13 @@ def deploy_continuum_agents(path, inventory_path): # DaemonSet YAML daemonset_path = files(deployment).joinpath("continuum-agent-daemonset.yaml") - for r in parse_yaml_from_file(daemonset_path, {"KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP")}): + for r in parse_yaml_from_file(daemonset_path, + { + "OTEL_EXPORT_IP": os.getenv("MLS_OTEL_CONTINUUM_EXPORT_IP", None), + "OTEL_EXPORT_PORT": os.getenv("MLS_OTEL_CONTINUUM_EXPORT_PORT", "4317"), + "CONTAINER_IMAGE": os.getenv("CONTINUUM_CONTAINER_IMAGE","harbor.nbfc.io/mlsysops/continuum-agent"), + "KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP") + }): client_k8s.create_or_update(r) def deploy_cluster_agents(path, inventory_path): @@ -695,7 +703,10 @@ def deploy_cluster_agents(path, inventory_path): # DaemonSet YAML daemonset_path = files(deployment).joinpath("cluster-agents-daemonset.yaml") - for r in parse_yaml_from_file(daemonset_path, {"KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP")}): + for r in parse_yaml_from_file(daemonset_path, { + "CONTAINER_IMAGE": os.getenv("CLUSTER_CONTAINER_IMAGE", "harbor.nbfc.io/mlsysops/cluster-agent"), + "KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP") + }): client_karmada.create_or_update(r) def deploy_node_agents(path, inventory_path): @@ -733,7 +744,10 @@ def deploy_node_agents(path, inventory_path): # DaemonSet YAML daemonset_path = files(deployment).joinpath("node-agents-daemonset.yaml") - for r in parse_yaml_from_file(daemonset_path, {"KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP"), "REDIS_HOST": os.getenv("KARMADA_HOST_IP")}): + for r in parse_yaml_from_file(daemonset_path, { + "CONTAINER_IMAGE": os.getenv("NODE_CONTAINER_IMAGE", "harbor.nbfc.io/mlsysops/node-agent"), + "KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP"), "REDIS_HOST": os.getenv("KARMADA_HOST_IP") + }): client_karmada.create_or_update(r) def _apply_namespace_and_rbac(client_instance): @@ -746,3 +760,69 @@ def _apply_namespace_and_rbac(client_instance): rbac_path = files(deployment).joinpath("mlsysops-rbac.yaml") for r in parse_yaml_from_file(rbac_path): client_instance.create_or_update(r) + + + ####### FITA Extension ######## +def deploy_fita_agents(fita_description_path: str): + """ + Deploy FITA agents using a daemonset YAML file and node descriptions, following deploy.py pattern. 
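+    Assumes the KARMADA_HOST_IP and KUBECONFIG environment variables are set,
+    matching the _check_required_env_vars guard used by the other deploy_* helpers.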
+
+    - Iterates over fita_description_path, reading *.yaml files with top-level MLSysOpsNode entries.
+    - For each valid file, renders the FITA DaemonSet template (from file) replacing FITA_NODE_NAME and
+      KARMADA_HOST_IP, and applies resources using the same synchronous client approach.
+    - Ensures required ConfigMaps exist (from directory or empty as needed).
+    """
+    try:
+        _check_required_env_vars("KARMADA_HOST_IP", "KUBECONFIG")
+
+        # Validate path
+        if not fita_description_path or not os.path.isdir(fita_description_path):
+            raise RuntimeError(f"Invalid FITA descriptions path: {fita_description_path}")
+
+        # Load kubeconfig for the karmada-apiserver context (sync client, same pattern)
+        client_karmada = KubernetesLibrary("apps", "v1", os.getenv("KUBECONFIG", "/etc/rancher/k3s/k3s.yaml"),
+                                           context="karmada-apiserver")
+
+        # ConfigMap
+        print(f"Using FITA node system descriptions from {fita_description_path}")
+        client_karmada.create_configmap_from_file(fita_description_path, "mlsysops-framework",
+                                                  "fita-system-descriptions")
+
+        # Parse node description files
+        yaml_parser = YAML(typ='safe')
+        node_names = []
+        for file_path in glob.glob(os.path.join(fita_description_path, "*.yaml")):
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:
+                    doc = yaml_parser.load(f)
+                if isinstance(doc, dict) and "MLSysOpsNode" in doc:
+                    node = doc.get("MLSysOpsNode") or {}
+                    node_name = node.get("name")
+                    if node_name:
+                        node_names.append(node_name)
+                    else:
+                        print(f"Skipping {file_path}: MLSysOpsNode.name missing")
+                else:
+                    print(f"Skipping {file_path}: not an MLSysOpsNode document")
+            except Exception as e:
+                print(f"Failed parsing {file_path}: {e}")
+
+        if not node_names:
+            print("No valid MLSysOpsNode descriptions found; nothing to deploy.")
+            return
+
+        for node_name in node_names:
+            # DaemonSet YAML
+            daemonset_path = files(deployment).joinpath("fita-agent-daemonset.yaml")
+            for r in parse_yaml_from_file(daemonset_path, {
+                "KARMADA_HOST_IP": os.getenv("KARMADA_HOST_IP"),
+                "REDIS_HOST": os.getenv("KARMADA_HOST_IP"),
+                "FITA_NODE_NAME": node_name
+            }):
+                client_karmada.create_or_update(r)
+
+        print("FITA agents deployment finished.")
+    except Exception as e:
+        print(f"Failed to deploy FITA agents: {e}")
+        print(traceback.format_exc())
\ No newline at end of file
diff --git a/mlsysops-cli/mlsysops_cli/deployment/ejabberd-deployment.yaml b/mlsysops-cli/mlsysops_cli/deployment/ejabberd-deployment.yaml
index 23b89e7..f2526eb 100644
--- a/mlsysops-cli/mlsysops_cli/deployment/ejabberd-deployment.yaml
+++ b/mlsysops-cli/mlsysops_cli/deployment/ejabberd-deployment.yaml
@@ -42,9 +42,7 @@ spec:
           protocol: "TCP"
       env:
         - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: status.hostIP
+          value: {{ POD_IP }}
       command: ["/bin/sh", "-c"]
       args:
         - |
diff --git a/mlsysops-cli/mlsysops_cli/deployment/fita-agent-daemonset.yaml b/mlsysops-cli/mlsysops_cli/deployment/fita-agent-daemonset.yaml
new file mode 100644
index 0000000..e567dea
--- /dev/null
+++ b/mlsysops-cli/mlsysops_cli/deployment/fita-agent-daemonset.yaml
@@ -0,0 +1,80 @@
+---
+# DaemonSet for the FITA Agent
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: mlsysops-node-agent-{{ FITA_NODE_NAME }}
+  namespace: mlsysops-framework
+spec:
+  selector:
+    matchLabels:
+      app: node-agent-{{ FITA_NODE_NAME }}
+  template:
+    metadata:
+      labels:
+        app: node-agent-{{ FITA_NODE_NAME }}
+    spec:
+      # Use nodeSelector to target nodes with mlsysops.eu/continuumLayer=cluster
+      nodeSelector:
+        mlsysops.eu/continuumLayer: "cluster"
+      initContainers:
+        - name: delay-startup
+          
image: busybox
+          command: ["sh", "-c", "echo 'Waiting...' && sleep 35"]
+      containers:
+        - name: fita-agent
+          image: harbor.nbfc.io/mlsysops/fita-agent:fhp0.1
+          imagePullPolicy: Always
+          env:
+            - name: NODE_NAME
+              value: {{ FITA_NODE_NAME }}
+            - name: EJABBERD_DOMAIN
+              value: {{ KARMADA_HOST_IP }}
+            - name: REDIS_HOST
+              value: {{ KARMADA_HOST_IP }}
+            - name: NODE_PASSWORD
+              value: "mysecret"
+            - name: DESCRIPTION_PATH
+              value: "/etc/mlsysops/descriptions"
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "250m"
+            limits:
+              memory: "256Mi"
+              cpu: "500m"
+          volumeMounts:
+            - name: description-volume
+              mountPath: /etc/mlsysops/descriptions
+            - name: fita-config-volume
+              mountPath: /etc/mlsysops/config
+            - name: policies-volume
+              mountPath: /etc/mlsysops/policies
+      volumes:
+        - name: fita-config-volume
+          configMap:
+            name: fita-agents-config
+        - name: description-volume
+          configMap:
+            name: fita-system-descriptions
+        - name: policies-volume
+          configMap:
+            name: fita-agents-policies
+
+---
+# Empty ConfigMap for FITA Policies
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: fita-agents-policies
+  namespace: mlsysops-framework
+data: {}
+
+---
+# Empty ConfigMap for agent configuration
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: fita-agents-config
+  namespace: mlsysops-framework
+data: {}
diff --git a/mlsysops-cli/mlsysops_cli/deployment/node-agents-daemonset.yaml b/mlsysops-cli/mlsysops_cli/deployment/node-agents-daemonset.yaml
index a34314d..f9b9b8d 100644
--- a/mlsysops-cli/mlsysops_cli/deployment/node-agents-daemonset.yaml
+++ b/mlsysops-cli/mlsysops_cli/deployment/node-agents-daemonset.yaml
@@ -23,7 +23,7 @@ spec:
           command: ["sh", "-c", "echo 'Waiting...' && sleep 35"]
       containers:
         - name: node-agent
-          image: harbor.nbfc.io/mlsysops/node-agent
+          image: {{ CONTAINER_IMAGE }}
           imagePullPolicy: Always
           env:
             - name: NODE_NAME
diff --git a/mlsysops-cli/mlsysops_cli/templates/node.yaml.j2 b/mlsysops-cli/mlsysops_cli/templates/node.yaml.j2
index 49785ee..bd1788c 100644
--- a/mlsysops-cli/mlsysops_cli/templates/node.yaml.j2
+++ b/mlsysops-cli/mlsysops_cli/templates/node.yaml.j2
@@ -18,5 +18,5 @@ MLSysOpsNode:
         - 14000
         - 17000
     performance_indicator: 1 # BogoMIPS
-    memory: 100
+    memory: "100"
     disk: "1"
diff --git a/mlsysops-cli/requirements.txt b/mlsysops-cli/requirements.txt
index 6ad443f..67df6cb 100644
--- a/mlsysops-cli/requirements.txt
+++ b/mlsysops-cli/requirements.txt
@@ -1,3 +1,5 @@
 click
 requests
 PyYAML
+jinja2
+kubernetes
\ No newline at end of file
diff --git a/northbound-api/Dockerfile b/northbound-api/Dockerfile
index 9058c8a..5262849 100644
--- a/northbound-api/Dockerfile
+++ b/northbound-api/Dockerfile
@@ -1,14 +1,16 @@
 # Use the official Python image
-FROM python:3.9-slim
+FROM harbor.nbfc.io/proxy_cache/library/python:3.10-slim
 
 # Set the working directory inside the container
 WORKDIR /app
 
 # Copy the current directory contents into the container
-COPY . 
/app +COPY ./northbound-api /app +COPY ./agents/mlsysops /app/mlsysops # Install the Python dependencies -RUN pip install -r requirements.txt +RUN pip install --no-cache-dir -r requirements.txt +RUN pip install --no-cache-dir -r mlsysops/requirements.txt # Install curl and kubectl RUN apt-get update && \ @@ -19,6 +21,8 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +ENV PYTHONPATH=/app + # Expose the port that FastAPI will run on EXPOSE 8000 diff --git a/northbound-api/MLSysOps-CRD.yaml b/northbound-api/MLSysOps-CRD.yaml deleted file mode 100644 index de654e8..0000000 --- a/northbound-api/MLSysOps-CRD.yaml +++ /dev/null @@ -1,770 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - # name must match the spec fields below, and be in the form: . - name: mlsysopsapps.fluidity.gr -spec: - # group name to use for REST API: /apis// - group: fluidity.gr - scope: Namespaced #Cluster - names: - plural: mlsysopsapps - singular: mlsysopsapp - kind: MLSysOpsApp - shortNames: - - mlsapp - versions: - - name: v1 - served: true - storage: true - schema: - openAPIV3Schema: - type: object - properties: - name: - type: string - mlsysops-id: - type: string - clusterPlacement: - type: object - properties: - clusterID: - type: array - items: - type: string - description: Array of clusters that can host the application. - instances: - type: integer - description: The desired number of replicas (e.g. deploy 2 instances of a given app). - location: - type: object - properties: - continent: - type: array - items: - type: string - enum: - - Europe - - Asia - description: The required continent (optional) - country: # See reference list - type: array - items: - type: string - enum: - - EL # Greece - - IT # Italy - - FR # France - - NL # Netherlands - - IE # Ireland - - PT # Portugal - - DK # Denmark - - IL # Israel - description: The required country (optional) - city: - type: array - items: - type: string - enum: - - Volos - - Athens - - Rende - - Milan - - Lille - - Delft - - Dublin - - Aveiro - - Porto - - Aarhus - - Jerusalem - description: The required city (optional) - cloudProvider: - type: string - enum: - - private - - AWS - - MicrosoftAzure - - GCP - accelerators: - type: array - items: - type: string - enum: - - GPU - - FPGA - - TPU - description: The candidate clusters should have nodes containing at least one instance of the specified accelerators. - nodeTypes: - type: array - items: - type: string - enum: - - Cloud - - FarEdge - - EdgeInfrastructure - - Edge - description: The required - description: The candidate clusters should contain at least one node of each specified type. - singleNode: - type: array - items: - type: object - description: The application deployment should occur within a single node with the required characteristics. - If the involved components have multiple instances, the framework will do the same for all the replicas. - This field will override nodePlacement information (if specified) for any of the components. - properties: - components: - type: array - items: - type: string - description: The array of components to be deployed on the same Node. - continuumLayer: - type: array - items: - type: string - enum: - - Cloud - - FarEdge - - EdgeInfrastructure - - Edge - - "*" - description: The required component placement on the continuum. "*" symbol means "anywhere on the continuum". 
- Scaling: - type: object - properties: - scalingMode: - type: string - enum: - - manual - - auto - instances: - type: integer - description: In case of manual scaling of the component, specify the number of instances. - scalingCriteria: - type: string - enum: - - MinCPUutilization - - MaxCPUutilization - - MinMemoryPercent - - MaxMemoryPercent - - MinRequestsPerSec - - MaxRequestPerSec - - MinNumberOfInstances - - MaxNumberOfInstances - description: Scaling criteria for the component, related to the "auto" scaling type. - description: The component scaling information. - mobility: - type: boolean - description: Specify if the component needs to be deployed on a mobile node (optional) - labels: - type: array - items: - type: string - description: The required labels for filtering. - node: - type: string - description: The required node name to be the host of the component (optional). - components: - type: array - items: - type: object - properties: - Component: - type: object - properties: - name: - type: string - description: The unique name of the component - uid: - type: string - description: The unique identifier of the component (not given by app provider). - nodePlacement: - type: object - properties: - continuumLayer: - type: array - items: - type: string - enum: - - Cloud - - FarEdge - - EdgeInfrastructure - - Edge - - "*" - description: The required component placement on the continuum. "*" symbol means "anywhere on the continuum". - Scaling: - type: object - properties: - scalingMode: - type: string - enum: - - manual - - auto - instances: - type: integer - description: In case of manual scaling of the component, specify the number of instances. - scalingCriteria: - type: string - enum: - - MinCPUutilization - - MaxCPUutilization - - MinMemoryPercent - - MaxMemoryPercent - - MinRequestsPerSec - - MaxRequestPerSec - - MinNumberOfInstances - - MaxNumberOfInstances - description: Scaling criteria for the component, related to the "auto" scaling type. - description: The component scaling information. - mobility: - type: boolean - description: Specify if the component needs to be deployed on a mobile node (optional) - labels: - type: array - items: - type: string - description: The required labels for filtering. - node: - type: string - description: The required node name to be the host of the component (optional). - DependsOn: - type: array - items: - type: string - description: The name of the related components. - description: The given component should be deployed after all the components - specified in the Components list have already started running. - runtimeConfiguration: - type: object - description: Enables runtime (node-level) configuration for app components. - properties: - configSpecificationFile: - type: string - description: The actual specification file describing the available runtime - configuration knobs (expected in json format). - This file is provided by the app developer. - configFilePath: - type: string - description: The absolute path inside the container where the application code - expects to find the configSpecificationFile. - sensors: - type: array - items: - type: object - properties: - camera: - type: object - properties: - methods: - type: array - items: - type: object - description: The list of camera service methods. 
- properties: - methodName: - type: string - enum: - - CaptureImage - - CaptureImagePeriodic - - CaptureVideo - - ListImages - - GetImageInfo - - RetrieveImage - - GetCameraInfo - model: - type: string - enum: - - D455 - - IMX477 - - picamera-v2 - description: The model name of the camera sensor - cameraType: - type: string - enum: - - RGB - - NIR - - Thermal - - Monocular - description: The camera sensor type. - minimumFramerate: - type: integer - resolutions: - type: array - items: - type: string - enum: - - 1024x768 - - 4056x3040 - temperature: - type: object - properties: - methods: - type: array - items: - type: object - description: The list of temperature service methods. - properties: - methodName: - type: string - enum: - - GetTemperature - model: - type: string - enum: - - SDC30 - - DS18B20 - description: The model name of the temperature sensor - measurementMin: - type: number - measurementMax: - type: number - measurementUnit: - type: string - enum: - - Celsius - - Fahrenheit - accuracy: - type: number - samplingFrequency: - type: number - accelerometer: - type: object - properties: - methods: - type: array - items: - type: object - description: The list of accelerometer service methods. - properties: - methodName: - type: string - enum: - - GetAcceleration - model: - type: string - enum: - - 3038-SMT - measurementMin: - type: number - measurementMax: - type: number - measurementUnit: - type: string - enum: - - "m/s^2" - accuracy: - type: number - samplingFrequency: - type: number - barometer: - type: object - properties: - methods: - type: array - items: - type: object - description: The list of barometer service methods. - properties: - methodName: - type: string - enum: - - GetAtmosphericPressure - model: - type: string - enum: - - SB-100 - measurementMin: - type: number - measurementMax: - type: number - measurementUnit: - type: string - enum: - - "Pa" # Pascal - accuracy: - type: number - samplingFrequency: - type: number - airQuality: - type: object - properties: - methods: - type: array - items: - type: object - description: The list of airQuality service methods. - properties: - methodName: - type: string - enum: - - DetectAirContaminants - model: - type: string - enum: - - MQ-135 - measurementMin: - type: number - measurementMax: - type: number - measurementUnit: - type: string - enum: - - "μg/m^3" # micrograms per cubic meter - accuracy: - type: number - samplingFrequency: - type: number - QoS-Metrics: - type: array - items: - type: object - properties: - ApplicationMetricID: - type: string - description: This is an indicative list of metrics. It can be extended as needed. - target: - type: number - relation: - type: string - enum: - - LowerOrEqual - - GreaterOrEqual - - Equal - - LowerThan - - GreaterThan - systemMetricsHints: - type: array - description: System-level metrics affecting the application metric. - items: - type: string - enum: - - CPUFrequency - storage: # Refers to CC storage service. - type: object - properties: - buckets: - type: array - items: - type: object - properties: - bucketID: - type: string - description: The bucket's unique identifier. - policyUpdateToken: - type: string - description: The required token for the MLSysOps to update the bucket's policy at runtime. - locationRestrictions: - type: object - description: These restrictions are used to exclude storage locations that host data of the application. - properties: - GDPR: - type: boolean - description: For EU citizens only GDPR-compliant storage locations can legally be used. 
- reduncancy: - type: string - enum: - - High - - One - - None - maxLatency: - type: number - minDownloadSpeed: - type: number - serverSideEncryption: - type: string - enum: - - "ON" - - "OFF" - required: - - bucketID - dataSensitivity: - type: boolean - description: The indication to specify whether a component has sensitive data or not (useful for the data storage). - dataCriticality: - type: string - enum: - - Low - - Medium - - High - description: Used to provide information referring to the trust aspect for a given component. - externalComponent: - type: boolean - description: This property indicates whether the component can be managed by MLSysOps or not. - If not the MLSysOps platform merely deploys the component(s), based on the provided instances, - and subsequently deletes it whenever the application needs to be removed. - externalAccess: - type: boolean - description: This property indicates whether the component can be accessed outside of its cluster. - hostNetwork: - type: boolean - description: Host networking requested for this component. - Use the host's network namespace. If this option is set, - the ports that will be used must be specified. Default to false. - restartPolicy: - type: string - enum: - - always - - on_failure - - never - description: Restart policy for the container. Default to Always. - containers: - type: array - items: - type: object - properties: - image: - type: string - description: The name of the container image. - command: - type: array - items: - type: string - imagePullPolicy: - type: string - enum: - - Always - - Never - - IfNotPresent - description: Image pull policy. Defaults to Always if :latest tag is specified, - or IfNotPresent otherwise. - accelerationAPI: - type: array - items: - type: object - properties: - callName: - type: string - description: The (unique) API call name. - enum: - - CalcOpticalFlow - - ImageInference - requiredFramework: - type: string - description: Asterisk means any of the available frameworks. - enum: - - PyTorch - - TensorFlow - - OpenCL - - "*" - required: - - callName - resourceRequirements: - type: object - description: The resource requirements of the container. - properties: - CPU: - type: object - properties: - architecture: - type: array - items: - type: string - enum: - - x86 - - arm64 - cores: - type: integer - description: required cores - frequency: - type: number - description: required frequency in GHz. - MCU: - type: object - properties: - architecture: - type: array - items: - type: string - enum: - - arm-M4 - Flash: - type: string - description: Flash memory size (related to far edge devices) - cores: - type: integer - description: required cores - frequency: - type: number - description: required frequency in GHz. - RAM: - type: string - description: required RAM (in GB). - Disk: - type: string - description: required Disk space (in GB). - GPU: - type: object - properties: - model: - type: string - enum: - - K80 - - K40 - memory: - type: string - utilizationRequest: - type: string - description: Percentage of expected utilization. - FPGA: - type: object - properties: - model: - type: string - enum: - - ZCU102 - memory: - type: string - utilizationRequest: - type: string - description: Percentage of expected utilization. - performanceIndicator: - type: number - description: This field assists MLSysOps with an initial hint in order to - filter out nodes based on their performance capabilities. 
- environmentRequirements: - type: object - properties: - nodeType: - type: string - enum: - - Virtualized # In the form of a Virtual Machine - - Native # Non-virtualized, including OS - - BareMetal # Non-virtualized, without OS - OS: - type: string - enum: - - Ubuntu - - Kali - - Zephyr - container-runtime: - type: string - enum: - - containerd - - Docker - - embServe - ports: - type: array - items: - type: object - properties: - containerPort: - type: integer - description: Number of port to expose on the component's IP address. - This must be a valid port number, 0 < x < 65536. - hostIP: - type: string - description: What host IP to bind the external port to. - hostPort: - type: integer - description: Number of port to expose on the host. - If specified, this must be a valid port number, 0 < x < 65536. - If HostNetwork is specified, this must match ContainerPort. - name: - type: string - description: Each named port in a component must have a unique name. - Name for the port that can be referred to by services. - protocol: - type: string - enum: - - UDP - - TCP - - SCTP - description: Protocol for port. Defaults to "TCP". - description: Environment variables for the container. - env: - type: array - items: - type: object - properties: - name: - type: string - description: Name of the environment variable. - valueFrom: - type: object - properties: - fieldRef: - type: object - properties: - fieldPath: - type: string - value: - type: string - description: Value of the environment variable. - description: Environment variables for the container. - required: - - containers - - Component - componentInteractions: - type: array - items: - type: object - properties: - componentName1: - type: string - description: The "source" component. - type: - type: string - enum: - - ingress - - egress - componentName2: - type: string - description: The "destination" component. - interactionCriticality: - type: string - enum: - - Low - - Medium - - High - description: Used to provide information referring to the trust aspect for a given interaction. - interactionMetrics: - type: array - items: - type: object - properties: - SystemMetricID: - type: string - enum: - - Latency - - Bandwidth - - End2EndInvocationDelay - description: The unique identifier of the system-level metric related to this interaction. 
- target: - type: number - measurementUnit: - type: string - enum: - - milliseconds # latency, E2E invocation delay - - Mbps # Bandwidth - - seconds # latency, E2E invocation delay - relation: - type: string - enum: - - LowerOrEqual - - GreaterOrEqual - - Equal - - LowerThan - - GreaterThan - globalSatisfaction: - type: object - properties: - threshold: - type: number - description: Happiness minimum required value (range (0-1]) - relation: - type: string - enum: - - GreaterOrEqual - - Equal - - GreaterThan - achievementWeights: - type: array - items: - type: object - properties: - metricID: - type: string - weight: - type: number - required: - - components diff --git a/northbound-api/MLSysOps_Schemas/mlsysops_model.py b/northbound-api/MLSysOps_Schemas/mlsysops_model.py deleted file mode 100644 index 2ee98fd..0000000 --- a/northbound-api/MLSysOps_Schemas/mlsysops_model.py +++ /dev/null @@ -1,729 +0,0 @@ -# generated by datamodel-codegen: -# filename: tmp_dv03s1k_schema.json -# timestamp: 2025-07-04T08:40:07+00:00 - -from __future__ import annotations - -from enum import Enum -from typing import List, Optional - -from pydantic import BaseModel, Field - - -class ContinentEnum(Enum): - Europe = 'Europe' - Asia = 'Asia' - - -class CountryEnum(Enum): - el = 'el' - it = 'it' - fr = 'fr' - nl = 'nl' - ie = 'ie' - pt = 'pt' - dk = 'dk' - il = 'il' - - -class CityEnum(Enum): - volos = 'volos' - athens = 'athens' - rende = 'rende' - milan = 'milan' - lille = 'lille' - delft = 'delft' - dublin = 'dublin' - aveiro = 'aveiro' - porto = 'porto' - aarhus = 'aarhus' - jerusalem = 'jerusalem' - - -class Location(BaseModel): - continent: Optional[List[ContinentEnum]] = None - country: Optional[List[CountryEnum]] = Field( - None, description='The required country (optional)' - ) - city: Optional[List[CityEnum]] = None - - -class CloudProvider(Enum): - private = 'private' - aws = 'aws' - microsoft_azure = 'microsoft_azure' - gcp = 'gcp' - - -class Accelerator(Enum): - gpu = 'gpu' - fpga = 'fpga' - tpu = 'tpu' - - -class NodeType(Enum): - cloud = 'cloud' - far_edge = 'far_edge' - edge_infrastructure = 'edge_infrastructure' - edge = 'edge' - - -class ClusterPlacement(BaseModel): - cluster_id: Optional[List[str]] = Field( - None, description='Array of clusters that can host the application.' - ) - instances: Optional[int] = Field( - None, - description='The desired number of replicas (e.g. 
deploy 2 instances of a given app).', - ) - location: Optional[Location] = None - cloud_provider: Optional[CloudProvider] = None - accelerators: Optional[List[Accelerator]] = Field( - None, - description='The candidate clusters should have nodes containing at least one instance of the specified accelerators.', - ) - node_types: Optional[List[NodeType]] = Field( - None, - description='The candidate clusters should contain at least one node of each specified type.', - ) - - -class ContinuumLayerEnum(Enum): - cloud = 'cloud' - far_edge = 'far_edge' - edge_infrastructure = 'edge_infrastructure' - edge = 'edge' - field_ = '*' - - -class ScalingMode(Enum): - manual = 'manual' - auto = 'auto' - - -class ScalingCriteria(Enum): - min_cpu_utilization = 'min_cpu_utilization' - max_cpu_utilization = 'max_cpu_utilization' - min_memory_percent = 'min_memory_percent' - max_memory_percent = 'max_memory_percent' - min_requests_per_sec = 'min_requests_per_sec' - max_request_per_sec = 'max_request_per_sec' - min_number_of_instances = 'min_number_of_instances' - max_number_of_instances = 'max_number_of_instances' - - -class Scaling(BaseModel): - scaling_mode: Optional[ScalingMode] = None - instances: Optional[int] = Field( - None, - description='In case of manual scaling of the component, specify the number of instances.', - ) - scaling_criteria: Optional[ScalingCriteria] = Field( - None, - description='Scaling criteria for the component, related to the "auto" scaling type.', - ) - - -class SingleNodeItem(BaseModel): - components: Optional[List[str]] = Field( - None, description='The array of components to be deployed on the same Node.' - ) - continuum_layer: Optional[List[ContinuumLayerEnum]] = None - scaling: Optional[Scaling] = Field( - None, description='The component scaling information.' - ) - mobile: Optional[bool] = Field( - None, - description='Specify if the component must be deployed on a mobile node (optional)', - ) - labels: Optional[List[str]] = Field( - None, description='The required labels for filtering.' - ) - node: Optional[str] = Field( - None, - description='The required node name to be the host of the component (optional).', - ) - - -class Metadata(BaseModel): - name: Optional[str] = Field(None, description='The unique name of the component') - uid: Optional[str] = Field( - None, - description='The unique identifier of the component (not given by app provider).', - ) - - -class Scaling1(BaseModel): - scaling_mode: Optional[ScalingMode] = None - instances: Optional[int] = Field( - None, - description='In case of manual scaling of the component, specify the number of instances.', - ) - scaling_criteria: Optional[ScalingCriteria] = Field( - None, - description='Scaling criteria for the component, related to the "auto" scaling type.', - ) - - -class NodePlacement(BaseModel): - continuum_layer: Optional[List[ContinuumLayerEnum]] = None - scaling: Optional[Scaling1] = Field( - None, description='The component scaling information.' - ) - mobile: Optional[bool] = Field( - None, - description='Specify if the component needs to be deployed on a mobile node (optional)', - ) - labels: Optional[List[str]] = Field( - None, description='The required labels for filtering.' 
- ) - node: Optional[str] = Field( - None, - description='The required node name to be the host of the component (optional).', - ) - - -class RuntimeConfiguration(BaseModel): - config_specification_file: Optional[str] = Field( - None, - description='The actual specification file describing the available runtime configuration knobs (expected in json format). This file is provided by the app developer.', - ) - config_file_path: Optional[str] = Field( - None, - description='The absolute path inside the container where the application code expects to find the configSpecificationFile.', - ) - - -class Model(Enum): - d455 = 'd455' - imx477 = 'imx477' - picamera_v2 = 'picamera-v2' - - -class CameraType(Enum): - rgb = 'rgb' - nir = 'nir' - thermal = 'thermal' - monocular = 'monocular' - - -class Resolution(Enum): - field_1024x768 = '1024x768' - field_4056x3040 = '4056x3040' - - -class Camera(BaseModel): - model: Optional[Model] = Field( - None, description='The model name of the camera sensor' - ) - camera_type: Optional[CameraType] = Field( - None, description='The camera sensor type.' - ) - minimum_framerate: Optional[int] = None - resolution: Optional[Resolution] = None - - -class Model1(Enum): - sdc30 = 'sdc30' - ds18b20 = 'ds18b20' - - -class MeasurementUnit(Enum): - celsius = 'celsius' - fahrenheit = 'fahrenheit' - - -class Temperature(BaseModel): - model: Optional[Model1] = Field( - None, description='The model name of the temperature sensor' - ) - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Model2(Enum): - field_3038_smt = '3038-smt' - - -class MeasurementUnit1(Enum): - m_s_2 = 'm/s^2' - - -class Accelerometer(BaseModel): - model: Optional[Model2] = None - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit1] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Model3(Enum): - sb_100 = 'sb-100' - - -class MeasurementUnit2(Enum): - pa = 'pa' - - -class Barometer(BaseModel): - model: Optional[Model3] = None - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit2] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Model4(Enum): - mq_135 = 'mq-135' - - -class MeasurementUnit3(Enum): - ug_m_3 = 'ug/m^3' - - -class AirQuality(BaseModel): - model: Optional[Model4] = None - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit3] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Protocol(Enum): - MQTT = 'MQTT' - RTSP = 'RTSP' - - -class Sensor(BaseModel): - camera: Optional[Camera] = None - temperature: Optional[Temperature] = None - accelerometer: Optional[Accelerometer] = None - barometer: Optional[Barometer] = None - air_quality: Optional[AirQuality] = None - endpoint_variable: Optional[str] = Field( - None, - description='The env variable that the app will retrieve the endpoint to get the sensor measurements.', - ) - protocol: Optional[Protocol] = Field( - None, description='The protocol of the sensor service.' - ) - instances: Optional[float] = Field( - None, description='The number of required sensor instances.' 
- ) - - -class Relation(Enum): - lower_or_equal = 'lower_or_equal' - greater_or_equal = 'greater_or_equal' - equal = 'equal' - lower_than = 'lower_than' - greater_than = 'greater_than' - - -class SystemMetricsHint(Enum): - cpu_frequency = 'cpu_frequency' - - -class QosMetric(BaseModel): - application_metric_id: Optional[str] = Field( - None, - description='This is an indicative list of metrics. It can be extended as needed.', - ) - target: Optional[float] = None - relation: Optional[Relation] = None - system_metrics_hints: Optional[List[SystemMetricsHint]] = Field( - None, description='System-level metrics affecting the application metric.' - ) - - -class LocationRestrictions(BaseModel): - gdpr: Optional[bool] = Field( - None, - description='For EU citizens only GDPR-compliant storage locations can legally be used.', - ) - - -class Reduncancy(Enum): - high = 'high' - one = 'one' - none = 'none' - - -class ServerSideEncryption(Enum): - True_ = True - False_ = False - - -class Bucket(BaseModel): - bucket_id: str = Field(..., description="The bucket's unique identifier.") - policy_update_token: Optional[str] = Field( - None, - description="The required token for the MLSysOps to update the bucket's policy at runtime.", - ) - location_restrictions: Optional[LocationRestrictions] = Field( - None, - description='These restrictions are used to exclude storage locations that host data of the application.', - ) - reduncancy: Optional[Reduncancy] = None - max_latency: Optional[float] = None - min_download_speed: Optional[float] = None - server_side_encryption: Optional[ServerSideEncryption] = None - - -class Storage(BaseModel): - buckets: Optional[List[Bucket]] = None - - -class DataCriticality(Enum): - low = 'low' - medium = 'medium' - high = 'high' - - -class RuntimeClassName(Enum): - nvidia = 'nvidia' - default = 'default' - kata_fc = 'kata-fc' - kata_dragon = 'kata-dragon' - urunc = 'urunc' - crun = 'crun' - lunatic = 'lunatic' - nvidia_experimental = 'nvidia-experimental' - spin = 'spin' - wasmedge = 'wasmedge' - slight = 'slight' - - -class RestartPolicy(Enum): - Always = 'Always' - OnFailure = 'OnFailure' - Never = 'Never' - - -class Os(Enum): - ubuntu = 'ubuntu' - kali = 'kali' - zephyr = 'zephyr' - - -class NodeType1(Enum): - virtualized = 'virtualized' - native = 'native' - bare_metal = 'bare_metal' - - -class ContainerRuntime(Enum): - containerd = 'containerd' - docker = 'docker' - emb_serve = 'emb_serve' - - -class ImagePullPolicy(Enum): - Always = 'Always' - Never = 'Never' - IfNotPresent = 'IfNotPresent' - - -class CallName(Enum): - calc_optical_flow = 'calc_optical_flow' - image_inference = 'image_inference' - - -class RequiredFramework(Enum): - pytorch = 'pytorch' - tensorflow = 'tensorflow' - opencl = 'opencl' - field_ = '*' - - -class AccelerationApiItem(BaseModel): - call_name: CallName = Field(..., description='The (unique) API call name.') - required_framework: Optional[RequiredFramework] = Field( - None, description='Asterisk means any of the available frameworks.' 
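The `Relation` enum is what a policy would use to compare an observed metric against a `QosMetric` target. A sketch of that evaluation; the `check_metric` helper is illustrative and not part of the codebase:

```python
# Hedged sketch: evaluating a QosMetric-style constraint against a live
# measurement. Relation mirrors the generated enum above; check_metric is a
# hypothetical helper shown only to make the semantics concrete.
import operator
from enum import Enum


class Relation(Enum):
    lower_or_equal = 'lower_or_equal'
    greater_or_equal = 'greater_or_equal'
    equal = 'equal'
    lower_than = 'lower_than'
    greater_than = 'greater_than'


_OPS = {
    Relation.lower_or_equal: operator.le,
    Relation.greater_or_equal: operator.ge,
    Relation.equal: operator.eq,
    Relation.lower_than: operator.lt,
    Relation.greater_than: operator.gt,
}


def check_metric(observed: float, target: float, relation: Relation) -> bool:
    """True if the observed value satisfies `observed <relation> target`."""
    return _OPS[relation](observed, target)


assert check_metric(12.0, 20.0, Relation.lower_or_equal)      # latency under budget
assert not check_metric(0.4, 0.9, Relation.greater_or_equal)  # below target
```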
- ) - - -class ArchitectureEnum(Enum): - arm64 = 'arm64' - amd64 = 'amd64' - - -class Cpu(BaseModel): - requests: Optional[str] = None - limits: Optional[str] = None - architecture: Optional[List[ArchitectureEnum]] = None - frequency: Optional[float] = Field(None, description='required frequency in Hz.') - performance_indicator: Optional[float] = Field( - None, - description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', - ) - - -class Memory(BaseModel): - requests: Optional[str] = None - limits: Optional[str] = None - - -class ArchitectureEnum1(Enum): - arm_m4 = 'arm-m4' - - -class Mcu(BaseModel): - requests: Optional[str] = None - limits: Optional[str] = None - architecture: Optional[List[ArchitectureEnum1]] = None - flash: Optional[str] = Field( - None, description='Flash memory size (related to far edge devices)' - ) - frequency: Optional[float] = Field(None, description='required frequency in GHz.') - performance_indicator: Optional[float] = Field( - None, - description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', - ) - - -class Model5(Enum): - k80 = 'k80' - k40 = 'k40' - - -class Gpu(BaseModel): - requests: Optional[int] = None - limits: Optional[int] = None - model: Optional[Model5] = None - memory: Optional[int] = None - utilization_request: Optional[str] = Field( - None, description='Percentage of expected utilization.' - ) - performance_indicator: Optional[float] = Field( - None, - description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', - ) - - -class Model6(Enum): - zcu102 = 'zcu102' - - -class Fpga(BaseModel): - model: Optional[Model6] = None - memory: Optional[int] = None - utilization_request: Optional[str] = Field( - None, description='Percentage of expected utilization.' - ) - performance_indicator: Optional[float] = Field( - None, - description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', - ) - - -class PlatformRequirements(BaseModel): - cpu: Optional[Cpu] = None - memory: Optional[Memory] = None - mcu: Optional[Mcu] = None - disk: Optional[str] = Field(None, description='required Disk space (in GB).') - gpu: Optional[Gpu] = None - fpga: Optional[Fpga] = None - - -class Protocol1(Enum): - UDP = 'UDP' - TCP = 'TCP' - SCTP = 'SCTP' - - -class Port(BaseModel): - container_port: Optional[int] = Field( - None, - description="Number of port to expose on the component's IP address. This must be a valid port number, 0 < x < 65536.", - ) - host_ip: Optional[str] = Field( - None, description='What host IP to bind the external port to.' - ) - host_port: Optional[int] = Field( - None, - description='Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort.', - ) - name: Optional[str] = Field( - None, - description='Each named port in a component must have a unique name. Name for the port that can be referred to by services.', - ) - protocol: Optional[Protocol1] = Field( - None, description='Protocol for port. Defaults to "TCP".' 
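Worth noting: constraints such as the port range live only in field descriptions, because datamodel-codegen does not turn prose into runtime checks. A hedged sketch of enforcing the documented `0 < x < 65536` rule on a trimmed `Port`, assuming pydantic v2's `field_validator`:

```python
# Hedged sketch (assumption: pydantic v2). The generated Port model above
# states the port-range constraint only in descriptions; this trimmed copy
# adds an explicit runtime validator for it.
from typing import Optional

from pydantic import BaseModel, field_validator


class Port(BaseModel):
    container_port: Optional[int] = None
    host_port: Optional[int] = None

    @field_validator('container_port', 'host_port')
    @classmethod
    def _valid_port(cls, v: Optional[int]) -> Optional[int]:
        # None is allowed (field unset); set values must fall in (0, 65536).
        if v is not None and not 0 < v < 65536:
            raise ValueError(f'port {v} outside (0, 65536)')
        return v


Port(container_port=8080)        # ok
# Port(container_port=70000)    # would raise a ValidationError
```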
- ) - - -class FieldRef(BaseModel): - field_path: Optional[str] = None - - -class ValueFrom(BaseModel): - field_ref: Optional[FieldRef] = None - - -class EnvItem(BaseModel): - name: Optional[str] = Field(None, description='Name of the environment variable.') - value_from: Optional[ValueFrom] = None - value: Optional[str] = Field(None, description='Value of the environment variable.') - - -class Container(BaseModel): - image: Optional[str] = Field(None, description='The name of the container image.') - command: Optional[List[str]] = None - image_pull_policy: Optional[ImagePullPolicy] = Field( - None, - description='Image pull policy. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise.', - ) - acceleration_api: Optional[List[AccelerationApiItem]] = None - platform_requirements: Optional[PlatformRequirements] = Field( - None, description='The resource requirements of the container.' - ) - ports: Optional[List[Port]] = Field( - None, description='Environment variables for the container.' - ) - env: Optional[List[EnvItem]] = Field( - None, description='Environment variables for the container.' - ) - - -class Component(BaseModel): - metadata: Metadata - node_placement: Optional[NodePlacement] = None - depends_on: Optional[List[str]] = Field( - None, - description='The given component should be deployed after all the components specified in the Components list have already started running.', - ) - runtime_configuration: Optional[RuntimeConfiguration] = Field( - None, - description='Enables runtime (node-level) configuration for app components.', - ) - sensors: Optional[List[Sensor]] = None - qos_metrics: Optional[List[QosMetric]] = None - storage: Optional[Storage] = None - data_sensitivity: Optional[bool] = Field( - None, - description='The indication to specify whether a component has sensitive data or not (useful for the data storage).', - ) - data_criticality: Optional[DataCriticality] = Field( - None, - description='Used to provide information referring to the trust aspect for a given component.', - ) - external_component: Optional[bool] = Field( - None, - description='This property indicates whether the component can be managed by MLSysOps or not. If not the MLSysOps platform merely deploys the component(s), based on the provided instances, and subsequently deletes it whenever the application needs to be removed.', - ) - external_access: Optional[bool] = Field( - None, - description='This property indicates whether the component can be accessed outside of its cluster.', - ) - host_network: Optional[bool] = Field( - None, - description="Host networking requested for this component. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false.", - ) - runtime_class_name: Optional[RuntimeClassName] = None - restart_policy: Optional[RestartPolicy] = Field( - None, description='Restart policy for the container. Default to Always.' 
- ) - os: Optional[Os] = None - node_type: Optional[NodeType1] = None - container_runtime: Optional[ContainerRuntime] = None - containers: List[Container] - - -class Type(Enum): - ingress = 'ingress' - egress = 'egress' - - -class InteractionCriticality(Enum): - low = 'low' - medium = 'medium' - high = 'high' - - -class SystemMetricId(Enum): - latency = 'latency' - bandwidth = 'bandwidth' - end_to_end_invocation_delay = 'end_to_end_invocation_delay' - - -class MeasurementUnit4(Enum): - milliseconds = 'milliseconds' - Mbps = 'Mbps' - seconds = 'seconds' - - -class InteractionMetric(BaseModel): - system_metric_id: Optional[SystemMetricId] = Field( - None, - description='The unique identifier of the system-level metric related to this interaction.', - ) - target: Optional[float] = None - measurement_unit: Optional[MeasurementUnit4] = None - relation: Optional[Relation] = None - - -class ComponentInteraction(BaseModel): - component_name1: Optional[str] = Field(None, description='The "source" component.') - type: Optional[Type] = None - component_name2: Optional[str] = Field( - None, description='The "destination" component.' - ) - interaction_criticality: Optional[InteractionCriticality] = Field( - None, - description='Used to provide information referring to the trust aspect for a given interaction.', - ) - interaction_metrics: Optional[List[InteractionMetric]] = None - - -class PermittedAction(Enum): - component_relocation = 'component_relocation' - traffic_redirection = 'traffic_redirection' - change_container_image = 'change_container_image' - change_container_runtime_class = 'change_container_runtime_class' - change_container_cpu_set = 'change_container_cpu_set' - change_container_resource_requirements = 'change_container_resource_requirements' - acceleration = 'acceleration' - field_ = '*' - - -class Relation2(Enum): - greater_or_equal = 'greater_or_equal' - equal = 'equal' - greater_than = 'greater_than' - - -class AchievementWeight(BaseModel): - metric_id: Optional[str] = None - weight: Optional[float] = None - - -class GlobalSatisfaction(BaseModel): - threshold: Optional[float] = Field( - None, description='Happiness minimum required value (range (0-1])' - ) - relation: Optional[Relation2] = None - achievement_weights: Optional[List[AchievementWeight]] = None - - -class MLSysOpsApp(BaseModel): - name: Optional[str] = None - cluster_placement: Optional[ClusterPlacement] = None - single_node: Optional[List[SingleNodeItem]] = None - components: List[Component] - component_interactions: Optional[List[ComponentInteraction]] = None - permitted_actions: Optional[List[PermittedAction]] = Field( - None, - description='List of desired actions that can be performed by the MLSysOps agents. For traffic redirection, it must be enabled in the respective node description in order for MLSysOps to apply it.', - ) - global_satisfaction: Optional[GlobalSatisfaction] = None - - -class MlsysopsappSchema(BaseModel): - MLSysOpsApp: MLSysOpsApp diff --git a/northbound-api/MLSysOps_Schemas/mlsysops_schemas.py b/northbound-api/MLSysOps_Schemas/mlsysops_schemas.py deleted file mode 100644 index 698986d..0000000 --- a/northbound-api/MLSysOps_Schemas/mlsysops_schemas.py +++ /dev/null @@ -1,1979 +0,0 @@ -node_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "MLSysOpsNode Schema", - "type": "object", - "properties": { - "MLSysOpsNode": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "The name of the node." 
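The hand-written draft-07 schemas that follow (also removed by this diff) were consumed directly with the jsonschema package, the same library the schema-generator script further below imports. A sketch with a trimmed fragment of `node_schema`:

```python
# Hedged sketch of how the draft-07 schemas in the removed mlsysops_schemas.py
# were meant to be used: plain jsonschema validation of a parsed description.
# node_schema is trimmed to a fragment here for brevity.
from jsonschema import ValidationError, validate

node_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "MLSysOpsNode": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "mobility": {"type": "boolean"},
            },
        }
    },
    "required": ["MLSysOpsNode"],
}

# A conforming description passes silently.
description = {"MLSysOpsNode": {"name": "edge-worker-1", "mobility": False}}
validate(instance=description, schema=node_schema)

# A type mismatch raises ValidationError with a readable message.
try:
    validate(instance={"MLSysOpsNode": {"mobility": "no"}}, schema=node_schema)
except ValidationError as err:
    print("rejected:", err.message)
```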
- }, - "labels": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The required labels for filtering." - }, - "continuumLayer": { - "type": "string", - "enum": [ - "Cloud", - "EdgeInfrastructure", - "Edge", - "FarEdge" - ] - }, - "clusterID": { - "type": "string", - "description": "The unique cluster identifier that the node resides." - }, - "datacenterID": { - "type": "string", - "description": "The unique datacenter identifier that the node belongs to (if any). If no datacenterID is provided, the node is considered as standalone and will be characterized using its location (geocoordinates)." - }, - "availability": { - "type": "string", - "enum": [ - "transient", - "stable" - ], - "description": "Depicts the level of a node's availability. It is not related to addition/removal to/from a cluster. However, it relates to possible status transitions, for example due to energy outage (e.g. a node using a battery)." - }, - "mobility": { - "type": "boolean", - "description": "Specify if the node is mobile or stationary." - }, - "location": { - "type": "array", - "description": "This is used for fixed nodes. We assume that mobile node's location is telemetry data which is not captured via these descriptions. We can also assume that for mobile nodes this refers to base station's coordinates (lon, lat).", - "items": { - "type": "number" - } - }, - "sensors": { - "type": "array", - "description": "Available sensors on a node are presented as services provided by MLSysOps. Thus, each sensor has several methods that can be transparently accessed by application components without requiring extra effort from the developer to directly communicate with the device from within the container and enabling easy component relocation while abstracting out the node peculiarities following a FaaS approach.", - "items": { - "type": "object", - "properties": { - "camera": { - "type": "object", - "properties": { - "instances": { - "type": "integer", - "description": "Define the number of identical models (for multi-camera nodes)." - }, - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of camera service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "CaptureImage", - "CaptureImagePeriodic", - "CaptureVideo", - "ListImages", - "GetImageInfo", - "RetrieveImage", - "GetCameraInfo" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "D455", - "IMX477", - "picamera-v2" - ], - "description": "The model name of the camera sensor." - }, - "cameraType": { - "type": "string", - "description": "The camera sensor type." - }, - "framerate": { - "type": "integer" - }, - "resolution": { - "type": "string", - "enum": [ - "1024x768", - "4056x3040" - ] - } - }, - "required": [ - "methods" - ] - }, - "temperature": { - "type": "object", - "properties": { - "instances": { - "type": "integer", - "description": "Define the number of identical models." 
- }, - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of temperature service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "GetTemperature" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "SDC30", - "DS18B20" - ], - "description": "The model name of the temperature sensor" - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "Celsius", - "Fahrenheit" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - }, - "required": [ - "methods" - ] - }, - "accelerometer": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of accelerometer service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "GetAcceleration" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "3038-SMT" - ] - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "m/s^2" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - }, - "required": [ - "methods" - ] - }, - "humidity": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "properties": { - "methodName": { - "type": "string" - } - } - } - }, - "model": { - "type": "string" - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string" - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "object", - "properties": { - "min": { - "type": "number" - }, - "max": { - "type": "number" - } - } - } - } - }, - "sound": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "properties": { - "methodName": { - "type": "string" - } - } - } - }, - "model": { - "type": "string" - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string" - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "object", - "properties": { - "min": { - "type": "number" - }, - "max": { - "type": "number" - } - } - } - } - }, - "CO2": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "properties": { - "methodName": { - "type": "string" - } - } - } - }, - "model": { - "type": "string" - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string" - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "object", - "properties": { - "min": { - "type": "number" - }, - "max": { - "type": "number" - } - } - } - } - }, - "barometer": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of barometer service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "GetAtmosphericPressure" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "SB-100" - ] - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "Pa" - ] - }, - 
"accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - }, - "required": [ - "methods" - ] - }, - "airQuality": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of airQuality service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "DetectAirContaminants" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "MQ-135" - ] - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "m^3" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - }, - "required": [ - "methods" - ] - } - } - } - }, - "distributedStorageService": { - "type": "object", - "properties": { - "member": { - "type": "boolean" - }, - "availableSpace": { - "type": "number", - "description": "Available space to be used by the storage service in GB." - } - }, - "description": "This property specifies whether the node is part of the distributed storage service or not." - }, - "environment": { - "type": "object", - "properties": { - "nodeType": { - "type": "string", - "enum": [ - "Virtualized", - "Native", - "BareMetal" - ] - }, - "OS": { - "type": "string", - "enum": [ - "Ubuntu", - "Kali", - "Zephyr" - ] - }, - "container-runtime": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "containerd", - "Docker", - "embServe" - ] - } - } - } - }, - "accelerationAPI": { - "type": "array", - "items": { - "type": "object", - "properties": { - "callName": { - "type": "string", - "enum": [ - "CalcOpticalFlow", - "ImageInference" - ], - "description": "The (unique) API call name." - }, - "supportedPlatforms": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "enum": [ - "CPU", - "GPU", - "FPGA", - "TPU" - ] - }, - "suportedFrameworks": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "PyTorch", - "TensorFlow", - "OpenCL" - ] - } - } - } - } - } - }, - "required": [ - "callName", - "supportedPlatforms" - ] - } - }, - "hardware": { - "type": "object", - "properties": { - "CPU": { - "type": "object", - "properties": { - "model": { - "type": "string", - "description": "CPU model name." - }, - "architecture": { - "type": "string", - "enum": [ - "x86", - "arm64" - ] - }, - "cores": { - "type": "integer" - }, - "frequency": { - "type": "array", - "description": "All the possible CPU frequency values.", - "items": { - "type": "number" - } - }, - "performanceIndicator": { - "type": "number", - "description": "Quantifies the processing capabilities of the platform." - } - } - }, - "MCU": { - "type": "object", - "properties": { - "architecture": { - "type": "string", - "enum": [ - "arm-M4" - ] - }, - "Flash": { - "type": "string", - "description": "Flash memory size (related to far edge devices)" - }, - "cores": { - "type": "integer" - }, - "frequency": { - "type": "number" - }, - "performanceIndicator": { - "type": "number", - "description": "Quantifies the processing capabilities of the platform." - } - } - }, - "RAM": { - "type": "string", - "description": "RAM size (in GB)." - }, - "Disk": { - "type": "string", - "description": "Disk space in GB (local storage)." 
- }, - "GPU": { - "type": "object", - "properties": { - "model": { - "type": "string", - "enum": [ - "NVIDIA", - "K80", - "K40" - ] - }, - "memory": { - "type": "string" - }, - "instances": { - "type": "number" - }, - "performanceIndicator": { - "type": "number", - "description": "Quantifies the processing capabilities of the platform." - } - } - }, - "FPGA": { - "type": "object", - "properties": { - "model": { - "type": "string", - "enum": [ - "ZCU102" - ] - }, - "memory": { - "type": "string" - }, - "performanceIndicator": { - "type": "number", - "description": "Quantifies the processing capabilities of the platform." - } - } - } - } - }, - "networkResources": { - "type": "object", - "properties": { - "BasicInterface": { - "type": "object", - "properties": { - "name": { - "type": "string", - "enum": [ - "4G", - "5G", - "WiFi", - "Bluetooth", - "LoRa", - "ZigBee", - "Ethernet" - ], - "description": "It is the (only) interface used for control traffic. Also, it is the default interface for application-level traffic." - }, - "range": { - "type": "number", - "description": "The communication range if the given interface is wireless." - }, - "interfaceName": { - "type": "string" - } - } - }, - "redirectionInterface": { - "type": "array", - "description": "The redirection only refers to application traffic.", - "items": { - "type": "object", - "properties": { - "connectionType": { - "type": "string", - "enum": [ - "4G", - "5G", - "WiFi", - "Bluetooth", - "LoRa", - "ZigBee", - "Ethernet" - ], - "description": "The networking interface technology" - }, - "range": { - "type": "number", - "description": "The communication range if the given interface is wireless." - }, - "interfaceName": { - "type": "string" - }, - "mode": { - "type": "string", - "enum": [ - "infrastructure", - "adhoc" - ], - "description": "The connectivity." 
- }, - "hardwareAddress": { - "type": "string" - }, - "pairingInfo": { - "type": "object", - "properties": { - "ipAddress": { - "type": "string" - }, - "networkSSID": { - "type": "string", - "description": "The network id (SSID or PAN ID)" - }, - "netKey": { - "type": "string", - "description": "The network connection password" - } - } - } - }, - "required": [ - "connectionType" - ] - } - } - } - }, - "powerSources": { - "type": "object", - "properties": { - "grid": { - "type": "boolean" - }, - "renewable": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "enum": [ - "solarPanel" - ], - "Production": { - "type": "object", - "properties": { - "value": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "mAh", - "Wh" - ] - } - } - } - } - } - } - }, - "battery": { - "type": "object", - "properties": { - "capacity": { - "type": "object", - "properties": { - "value": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "mAh", - "Wh" - ] - } - } - } - } - } - } - } - } - } - }, "required": ["MLSysOpsNode"] -} -cluster_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "MLSysOpsCluster Schema", - "type": "object", - "properties": { - "MLSysOpsCluster": { - "type": "object", - "properties": { - "clusterID": {"type": "string", - "description": " str"}, - "nodes": { - "type": "array", - "items": { - "type": "string", - "description": "string" - } - }, - }, "required": ["clusterID", "nodes"] - } - }, "required": ["MLSysOpsCluster"] -} -continuum_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "MLSysOpsContinuum Schema", - "type": "object", - "properties": { - "MLSysOpsContinuum": { - "type": "object", - "properties": { - "continuumID": {"type": "string", - "description": " str"}, - "clusters": { - "type": "array", - "items": { - "type": "string", - "description": "string" - } - }, - }, "required": ["continuumID", "clusters"] - } - }, "required": ["MLSysOpsContinuum"] -} -datacenter_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "MLSysOpsDatacenter Schema", - "type": "object", - "properties": { - "MLSysOpsDatacenter": { - "type": "object", - "description": "string", - "properties": { - "datacenterID": { - "type": "string", - "description": "The unique datacenter identifier." - }, - "clusterID": { - "type": "string", - "description": "The clusterID that the given datacenter is a member." - }, - "continuum": { - "type": "string", - "description": "The continuum layer that the datacenter belongs to.", - "enum": ["Cloud", "EdgeInfrastructure", "Edge", "FarEdge"] - }, - "nodes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The set of registered nodes." - }, - "continent": { - "type": "string", - "description": "The desired continent (optional).", - "enum": ["Europe", "Asia", "Africa", "Australia", "North America", "South America", "Antarctica"] - }, - "country": { - "type": "string", - "description": "The desired country (optional).", - "enum": ["GR", "IT", "FRA", "ENG", "POR"] - }, - "city": { - "type": "string", - "description": "The desired city (optional).", - "enum": ["Volos", "Milan", "Paris", "London", "Lisbon"] - }, - "location": { - "type": "array", - "description": "The location of the datacenter.", - "items": { - "type": "number" - } - }, - "cloudProvider": { - "type": "string", - "description": "The cloud provider (optional)." 
- } - }, "required": ["clusterID", "datacenterID", "continuum", "nodes"] - - } - - }, "required": ["MLSysOpsDatacenter"] -} -app_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "MLSysOpsApplication Schema", - "type": "object", - "properties": { - "MLSysOpsApplication": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "mlsysops-id": { - "type": "string" - }, - "clusterPlacement": { - "type": "object", - "properties": { - "clusterID": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of clusters that can host the application." - }, - "instances": { - "type": "integer", - "description": "The desired number of replicas (e.g. deploy 2 instances of a given app)." - }, - "location": { - "type": "object", - "properties": { - "continent": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "Europe", - "Asia" - ], - "description": "The required continent (optional)" - } - }, - "country": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "EL", - "IT", - "FR", - "NL", - "IE", - "PT", - "DK", - "IL" - ] - }, - "description": "The required country (optional)" - }, - "city": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "Volos", - "Athens", - "Rende", - "Milan", - "Lille", - "Delft", - "Dublin", - "Aveiro", - "Porto", - "Aarhus", - "Jerusalem" - ], - "description": "The required city (optional)" - } - } - } - }, - "cloudProvider": { - "type": "string", - "enum": [ - "private", - "AWS", - "MicrosoftAzure", - "GCP" - ] - }, - "accelerators": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "GPU", - "FPGA", - "TPU" - ] - }, - "description": "The candidate clusters should have nodes containing at least one instance of the specified accelerators." - }, - "nodeTypes": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "Cloud", - "FarEdge", - "EdgeInfrastructure", - "Edge" - ], - "description": "The required" - }, - "description": "The candidate clusters should contain at least one node of each specified type." - } - } - }, - "singleNode": { - "type": "array", - "items": { - "type": "object", - "description": "The application deployment should occur within a single node with the required characteristics. If the involved components have multiple instances, the framework will do the same for all the replicas. This field will override nodePlacement information (if specified) for any of the components.", - "properties": { - "components": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The array of components to be deployed on the same Node." - }, - "continuumLayer": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "Cloud", - "FarEdge", - "EdgeInfrastructure", - "Edge", - "*" - ], - "description": "The required component placement on the continuum. \"*\" symbol means \"anywhere on the continuum\"." - } - }, - "Scaling": { - "type": "object", - "properties": { - "scalingMode": { - "type": "string", - "enum": [ - "manual", - "auto" - ] - }, - "instances": { - "type": "integer", - "description": "In case of manual scaling of the component, specify the number of instances." 
- }, - "scalingCriteria": { - "type": "string", - "enum": [ - "MinCPUutilization", - "MaxCPUutilization", - "MinMemoryPercent", - "MaxMemoryPercent", - "MinRequestsPerSec", - "MaxRequestPerSec", - "MinNumberOfInstances", - "MaxNumberOfInstances" - ], - "description": "Scaling criteria for the component, related to the \"auto\" scaling type." - } - }, - "description": "The component scaling information." - }, - "mobility": { - "type": "boolean", - "description": "Specify if the component needs to be deployed on a mobile node (optional)" - }, - "labels": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The required labels for filtering." - }, - "node": { - "type": "string", - "description": "The required node name to be the host of the component (optional)." - } - } - } - }, - "components": { - "type": "array", - "items": { - "type": "object", - "properties": { - "Component": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "The unique name of the component" - }, - "uid": { - "type": "string", - "description": "The unique identifier of the component (not given by app provider)." - } - } - }, - "nodePlacement": { - "type": "object", - "properties": { - "continuumLayer": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "Cloud", - "FarEdge", - "EdgeInfrastructure", - "Edge", - "*" - ], - "description": "The required component placement on the continuum. \"*\" symbol means \"anywhere on the continuum\"." - } - }, - "Scaling": { - "type": "object", - "properties": { - "scalingMode": { - "type": "string", - "enum": [ - "manual", - "auto" - ] - }, - "instances": { - "type": "integer", - "description": "In case of manual scaling of the component, specify the number of instances." - }, - "scalingCriteria": { - "type": "string", - "enum": [ - "MinCPUutilization", - "MaxCPUutilization", - "MinMemoryPercent", - "MaxMemoryPercent", - "MinRequestsPerSec", - "MaxRequestPerSec", - "MinNumberOfInstances", - "MaxNumberOfInstances" - ], - "description": "Scaling criteria for the component, related to the \"auto\" scaling type." - } - }, - "description": "The component scaling information." - }, - "mobility": { - "type": "boolean", - "description": "Specify if the component needs to be deployed on a mobile node (optional)" - }, - "labels": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The required labels for filtering." - }, - "node": { - "type": "string", - "description": "The required node name to be the host of the component (optional)." - } - } - }, - "DependsOn": { - "type": "array", - "items": { - "type": "string", - "description": "The name of the related components." - }, - "description": "The given component should be deployed after all the components specified in the Components list have already started running." - }, - "runtimeConfiguration": { - "type": "object", - "description": "Enables runtime (node-level) configuration for app components.", - "properties": { - "configSpecificationFile": { - "type": "string", - "description": "The actual specification file describing the available runtime configuration knobs (expected in json format). This file is provided by the app developer." - }, - "configFilePath": { - "type": "string", - "description": "The absolute path inside the container where the application code expects to find the configSpecificationFile." 
- } - } - }, - "sensors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "camera": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of camera service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "CaptureImage", - "CaptureImagePeriodic", - "CaptureVideo", - "ListImages", - "GetImageInfo", - "RetrieveImage", - "GetCameraInfo" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "D455", - "IMX477", - "picamera-v2" - ], - "description": "The model name of the camera sensor" - }, - "cameraType": { - "type": "string", - "enum": [ - "RGB", - "NIR", - "Thermal", - "Monocular" - ], - "description": "The camera sensor type." - }, - "minimumFramerate": { - "type": "integer" - }, - "resolutions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "1024x768", - "4056x3040" - ] - } - } - } - }, - "temperature": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of temperature service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "GetTemperature" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "SDC30", - "DS18B20" - ], - "description": "The model name of the temperature sensor" - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "Celsius", - "Fahrenheit" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - } - }, - "accelerometer": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of accelerometer service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "GetAcceleration" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "3038-SMT" - ] - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "m/s^2" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - } - }, - "barometer": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of barometer service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "GetAtmosphericPressure" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "SB-100" - ] - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "Pa" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - } - }, - "airQuality": { - "type": "object", - "properties": { - "methods": { - "type": "array", - "items": { - "type": "object", - "description": "The list of airQuality service methods.", - "properties": { - "methodName": { - "type": "string", - "enum": [ - "DetectAirContaminants" - ] - } - } - } - }, - "model": { - "type": "string", - "enum": [ - "MQ-135" - ] - }, - "measurementMin": { - "type": "number" - }, - "measurementMax": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "μg/m^3" - ] - }, - "accuracy": { - "type": "number" - }, - "samplingFrequency": { - "type": "number" - } - 
} - } - } - } - }, - "QoS-Metrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "ApplicationMetricID": { - "type": "string", - "description": "This is an indicative list of metrics. It can be extended as needed." - }, - "target": { - "type": "number" - }, - "relation": { - "type": "string", - "enum": [ - "LowerOrEqual", - "GreaterOrEqual", - "Equal", - "LowerThan", - "GreaterThan" - ] - }, - "systemMetricsHints": { - "type": "array", - "description": "System-level metrics affecting the application metric.", - "items": { - "type": "string", - "enum": [ - "CPUFrequency" - ] - } - } - } - } - }, - "storage": { - "type": "object", - "properties": { - "buckets": { - "type": "array", - "items": { - "type": "object", - "properties": { - "bucketID": { - "type": "string", - "description": "The bucket's unique identifier." - }, - "policyUpdateToken": { - "type": "string", - "description": "The required token for the MLSysOps to update the bucket's policy at runtime." - }, - "locationRestrictions": { - "type": "object", - "description": "These restrictions are used to exclude storage locations that host data of the application.", - "properties": { - "GDPR": { - "type": "boolean", - "description": "For EU citizens only GDPR-compliant storage locations can legally be used." - } - } - }, - "reduncancy": { - "type": "string", - "enum": [ - "High", - "One", - "None" - ] - }, - "maxLatency": { - "type": "number" - }, - "minDownloadSpeed": { - "type": "number" - }, - "serverSideEncryption": { - "type": "string", - "enum": [ - "ON", - "OFF" - ] - } - }, - "required": [ - "bucketID" - ] - } - } - } - }, - "dataSensitivity": { - "type": "boolean", - "description": "The indication to specify whether a component has sensitive data or not (useful for the data storage)." - }, - "dataCriticality": { - "type": "string", - "enum": [ - "Low", - "Medium", - "High" - ], - "description": "Used to provide information referring to the trust aspect for a given component." - }, - "externalComponent": { - "type": "boolean", - "description": "This property indicates whether the component can be managed by MLSysOps or not. If not the MLSysOps platform merely deploys the component(s), based on the provided instances, and subsequently deletes it whenever the application needs to be removed." - }, - "externalAccess": { - "type": "boolean", - "description": "This property indicates whether the component can be accessed outside of its cluster." - }, - "hostNetwork": { - "type": "boolean", - "description": "Host networking requested for this component. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false." - }, - "runtimeClassName": { - "type": "string", - "enum": [ - "nvidia" - ] }, - "restartPolicy": { - "type": "string", - "enum": [ - "Always", - "OnFailure", - "Never" - ], - "description": "Restart policy for the container. Default to Always." - }, - "containers": { - "type": "array", - "items": { - "type": "object", - "properties": { - "image": { - "type": "string", - "description": "The name of the container image." 
- }, - "resources": { - "type": "object", - "properties": { - "requests": { - "type": "object", - "properties": { - "cpu": { - "type": "string" - }, - "memory": { - "type": "string" - }, - "nvidia.com/gpu": { - "type": "integer" - } - } - }, - "limits": { - "type": "object", - "properties": { - "cpu": { - "type": "string" - }, - "memory": { - "type": "string" - }, - "nvidia.com/gpu": { - "type": "integer" - } - } - } - } - }, - "securityContext": { - "type": "object", - "properties": { - "allowPrivilegeEscalation": { - "type": "boolean" - } - }, - "description": "SecurityContext defines the security options the container should be run with." - }, - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "imagePullPolicy": { - "type": "string", - "enum": [ - "Always", - "Never", - "IfNotPresent" - ], - "description": "Image pull policy. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise." - }, - "accelerationAPI": { - "type": "array", - "items": { - "type": "object", - "properties": { - "callName": { - "type": "string", - "description": "The (unique) API call name.", - "enum": [ - "CalcOpticalFlow", - "ImageInference" - ] - }, - "requiredFramework": { - "type": "string", - "description": "Asterisk means any of the available frameworks.", - "enum": [ - "PyTorch", - "TensorFlow", - "OpenCL", - "*" - ] - } - }, - "required": [ - "callName" - ] - } - }, - "resourceRequirements": { - "type": "object", - "description": "The resource requirements of the container.", - "properties": { - "CPU": { - "type": "object", - "properties": { - "architecture": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "x86", - "arm64" - ] - } - }, - "cores": { - "type": "integer", - "description": "required cores" - }, - "frequency": { - "type": "number", - "description": "required frequency in GHz." - } - } - }, - "MCU": { - "type": "object", - "properties": { - "architecture": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "arm-M4" - ] - } - }, - "Flash": { - "type": "string", - "description": "Flash memory size (related to far edge devices)" - }, - "cores": { - "type": "integer", - "description": "required cores" - }, - "frequency": { - "type": "number", - "description": "required frequency in GHz." - } - } - }, - "RAM": { - "type": "string", - "description": "required RAM (in GB)." - }, - "Disk": { - "type": "string", - "description": "required Disk space (in GB)." - }, - "GPU": { - "type": "object", - "properties": { - "model": { - "type": "string", - "enum": [ - "K80", - "K40" - ] - }, - "memory": { - "type": "string" - }, - "utilizationRequest": { - "type": "string", - "description": "Percentage of expected utilization." - } - } - }, - "FPGA": { - "type": "object", - "properties": { - "model": { - "type": "string", - "enum": [ - "ZCU102" - ] - }, - "memory": { - "type": "string" - }, - "utilizationRequest": { - "type": "string", - "description": "Percentage of expected utilization." - } - } - }, - "performanceIndicator": { - "type": "number", - "description": "This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities." 
- } - } - }, - "environmentRequirements": { - "type": "object", - "properties": { - "nodeType": { - "type": "string", - "enum": [ - "Virtualized", - "Native", - "BareMetal" - ] - }, - "OS": { - "type": "string", - "enum": [ - "Ubuntu", - "Kali", - "Zephyr" - ] - }, - "container-runtime": { - "type": "string", - "enum": [ - "containerd", - "Docker", - "embServe" - ] - } - } - }, - "ports": { - "type": "array", - "items": { - "type": "object", - "properties": { - "containerPort": { - "type": "integer", - "description": "Number of port to expose on the component's IP address. This must be a valid port number, 0 < x < 65536." - }, - "hostIP": { - "type": "string", - "description": "What host IP to bind the external port to." - }, - "hostPort": { - "type": "integer", - "description": "Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort." - }, - "name": { - "type": "string", - "description": "Each named port in a component must have a unique name. Name for the port that can be referred to by services." - }, - "protocol": { - "type": "string", - "enum": [ - "UDP", - "TCP", - "SCTP" - ], - "description": "Protocol for port. Defaults to \"TCP\"." - } - } - }, - "description": "Environment variables for the container." - }, - "env": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "Name of the environment variable." - }, - "valueFrom": { - "type": "object", - "properties": { - "fieldRef": { - "type": "object", - "properties": { - "fieldPath": { - "type": "string" - } - } - } - } - }, - "value": { - "type": "string", - "description": "Value of the environment variable." - } - } - }, - "description": "Environment variables for the container." - } - } - } - } - }, - "required": [ - "Component" - ] - } - }, - "componentInteractions": { - "type": "array", - "items": { - "type": "object", - "properties": { - "componentName1": { - "type": "string", - "description": "The \"source\" component." - }, - "type": { - "type": "string", - "enum": [ - "ingress", - "egress" - ] - }, - "componentName2": { - "type": "string", - "description": "The \"destination\" component." - }, - "interactionCriticality": { - "type": "string", - "enum": [ - "Low", - "Medium", - "High" - ], - "description": "Used to provide information referring to the trust aspect for a given interaction." - }, - "interactionMetrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "SystemMetricID": { - "type": "string", - "enum": [ - "Latency", - "Bandwidth", - "End2EndInvocationDelay" - ], - "description": "The unique identifier of the system-level metric related to this interaction." 
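The `env`/`valueFrom`/`fieldRef` fragment earlier in this hunk mirrors the Kubernetes downward API. A hedged sketch of an env list that shape accepts, checked against a trimmed schema copy (the variable names echo the agent env files elsewhere in this diff; the exact values are illustrative):

```python
# Hedged sketch: an env list in the downward-API style accepted by the schema
# fragment above, validated with jsonschema against a trimmed copy.
from jsonschema import validate

env_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "value": {"type": "string"},
            "valueFrom": {
                "type": "object",
                "properties": {
                    "fieldRef": {
                        "type": "object",
                        "properties": {"fieldPath": {"type": "string"}},
                    }
                },
            },
        },
    },
}

env = [
    # Resolved by Kubernetes at pod start via the downward API.
    {"name": "NODE_NAME", "valueFrom": {"fieldRef": {"fieldPath": "spec.nodeName"}}},
    # Plain literal value.
    {"name": "REDIS_PORT", "value": "6379"},
]
validate(instance=env, schema=env_schema)
```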
- }, - "target": { - "type": "number" - }, - "measurementUnit": { - "type": "string", - "enum": [ - "milliseconds", - "Mbps", - "seconds" - ] - }, - "relation": { - "type": "string", - "enum": [ - "LowerOrEqual", - "GreaterOrEqual", - "Equal", - "LowerThan", - "GreaterThan" - ] - } - } - } - } - } - } - }, - "globalSatisfaction": { - "type": "object", - "properties": { - "threshold": { - "type": "number", - "description": "Happiness minimum required value (range (0-1])" - }, - "relation": { - "type": "string", - "enum": [ - "GreaterOrEqual", - "Equal", - "GreaterThan" - ] - }, - "achievementWeights": { - "type": "array", - "items": { - "type": "object", - "properties": { - "metricID": { - "type": "string" - }, - "weight": { - "type": "number" - } - } - } - } - } - } - }, - "required": [ - "components" - ] - } - - }, "required": ["MLSysOpsApplication"] -} diff --git a/northbound-api/MLSysOps_Schemas/schema generator/generate_model.py b/northbound-api/MLSysOps_Schemas/schema generator/generate_model.py deleted file mode 100644 index 39fdad7..0000000 --- a/northbound-api/MLSysOps_Schemas/schema generator/generate_model.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import glob -import tempfile -import yaml -import json -import subprocess -from jsonschema import validate, ValidationError - -def find_single_crd_yaml(crd_dir: str): - """ - Look for exactly one .yaml or .yml file inside crd_dir. - Return its full path. Exit if none (or zero). - If more than one, pick the first and print a warning. - """ - patterns = [os.path.join(crd_dir, "*.yaml"), os.path.join(crd_dir, "*.yml")] - matches = [] - for p in patterns: - matches.extend(glob.glob(p)) - - if not matches: - print(f"[✗] No .yaml/.yml file found under '{crd_dir}'.") - sys.exit(1) - if len(matches) > 1: - print(f"[!] Multiple CRD files found under '{crd_dir}'. Using the first:\n {matches[0]}") - return matches[0] - -def convert_yaml_crd_to_json(yaml_file: str, json_file: str): - """ - Converts a YAML CRD schema into a JSON schema file and adds necessary metadata, - including a root key derived from the CRD schema. 
- """ - try: - with open(yaml_file, 'r') as f: - yaml_content = yaml.safe_load(f) - # Extract kind from spec.names.kind - root_key = None - if ( - isinstance(yaml_content, dict) - and 'spec' in yaml_content - and isinstance(yaml_content['spec'], dict) - and 'names' in yaml_content['spec'] - and isinstance(yaml_content['spec']['names'], dict) - and 'kind' in yaml_content['spec']['names'] - ): - root_key = yaml_content['spec']['names']['kind'] - - if root_key is None: - raise ValueError("Could not determine 'kind' from CRD → spec.names.kind.") - - # Find openAPIV3Schema under spec.versions[*].schema.openAPIV3Schema - openapi_schema = None - if ( - 'spec' in yaml_content - and isinstance(yaml_content['spec'], dict) - and 'versions' in yaml_content['spec'] - and isinstance(yaml_content['spec']['versions'], list) - ): - for version in yaml_content['spec']['versions']: - if ( - isinstance(version, dict) - and 'schema' in version - and isinstance(version['schema'], dict) - and 'openAPIV3Schema' in version['schema'] - ): - openapi_schema = version['schema']['openAPIV3Schema'] - break - - if openapi_schema is None: - raise ValueError("No valid 'openAPIV3Schema' found in the CRD under spec.versions[].schema.") - - # Build full JSON Schema - full_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": f"{root_key} Schema", - "type": "object", - "properties": { - root_key: openapi_schema - }, - "required": [root_key] - } - - with open(json_file, 'w') as f: - json.dump(full_schema, f, indent=4) - print(f"[✓] JSON Schema written to: {json_file}") - - except Exception as e: - print(f"[✗] Error converting YAML→JSON: {e}") - sys.exit(1) - -def run_datamodel_codegen(json_schema_file: str, output_model_file: str): - """ - Invoke datamodel-codegen to turn the JSON Schema into a Pydantic model. - """ - cmd = [ - "datamodel-codegen", - "--input", json_schema_file, - "--input-file-type", "jsonschema", - "--output", output_model_file - ] - try: - print(f"[>] Running: {' '.join(cmd)}") - subprocess.check_call(cmd) - print(f"[✓] Model written to: {output_model_file}") - except FileNotFoundError: - print("[✗] 'datamodel-codegen' not found. Please install it (pip install datamodel-code-generator).") - sys.exit(1) - except subprocess.CalledProcessError as e: - print(f"[✗] datamodel-codegen failed: {e}") - sys.exit(1) - -if __name__ == "__main__": - SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__)) - CRD_DIR = os.path.join(SCRIPT_DIR, "input") - OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output") - - # 1) Find the single .yaml/.yml in CRD_DIR - crd_yaml_path = find_single_crd_yaml(CRD_DIR) - - # 2) Ensure output folder exists - os.makedirs(OUTPUT_DIR, exist_ok=True) - - # 3) Create a temporary file for the JSON Schema - with tempfile.NamedTemporaryFile(suffix="_schema.json", delete=False) as tmp: - json_schema_path = tmp.name - - # 4) Convert CRD YAML → JSON Schema - convert_yaml_crd_to_json(crd_yaml_path, json_schema_path) - - # 5) Run datamodel-codegen → output/model.py - model_py_path = os.path.join(OUTPUT_DIR, "model.py") - run_datamodel_codegen(json_schema_path, model_py_path) - - # 6) Remove the temporary JSON Schema file - try: - os.remove(json_schema_path) - except OSError: - pass - - print("\nDone. 
Your Pydantic model is here:") - print(f" {model_py_path}") diff --git a/northbound-api/MLSysOps_Schemas/schema generator/input/MLSysOpsAppCRD.yaml b/northbound-api/MLSysOps_Schemas/schema generator/input/MLSysOpsAppCRD.yaml deleted file mode 100644 index 696ed65..0000000 --- a/northbound-api/MLSysOps_Schemas/schema generator/input/MLSysOpsAppCRD.yaml +++ /dev/null @@ -1,774 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - # name must match the spec fields below, and be in the form: . - name: mlsysopsapps.mlsysops.eu -spec: - # group name to use for REST API: /apis// - group: mlsysops.eu - scope: Namespaced #Cluster - names: - plural: mlsysopsapps - singular: mlsysopsapp - kind: MLSysOpsApp - shortNames: - - mlsapp - versions: - - name: v1 - served: true - storage: true - schema: - openAPIV3Schema: - type: object - properties: - name: - type: string - cluster_placement: - type: object - properties: - cluster_id: - type: array - items: - type: string - description: Array of clusters that can host the application. - instances: - type: integer - description: The desired number of replicas (e.g. deploy 2 instances of a given app). - location: - type: object - properties: - continent: - type: array - items: - type: string - enum: - - Europe - - Asia - description: The required continent (optional) - country: # See reference list - type: array - items: - type: string - enum: - - el # Greece - - it # Italy - - fr # France - - nl # Netherlands - - ie # Ireland - - pt # Portugal - - dk # Denmark - - il # Israel - description: The required country (optional) - city: - type: array - items: - type: string - enum: - - volos - - athens - - rende - - milan - - lille - - delft - - dublin - - aveiro - - porto - - aarhus - - jerusalem - description: The required city (optional) - cloud_provider: - type: string - enum: - - private - - aws - - microsoft_azure - - gcp - accelerators: - type: array - items: - type: string - enum: - - gpu - - fpga - - tpu - description: The candidate clusters should have nodes containing at least one instance of the specified accelerators. - node_types: - type: array - items: - type: string - enum: - - cloud - - far_edge - - edge_infrastructure - - edge - description: The required - description: The candidate clusters should contain at least one node of each specified type. - single_node: - type: array - items: - type: object - description: The application deployment should occur within a single node with the required characteristics. - If the involved components have multiple instances, the framework will do the same for all the replicas. - This field will override nodePlacement information (if specified) for any of the components. - properties: - components: - type: array - items: - type: string - description: The array of components to be deployed on the same Node. - continuum_layer: - type: array - items: - type: string - enum: - - cloud - - far_edge - - edge_infrastructure - - edge - - "*" - description: The required component placement on the continuum. "*" symbol means "anywhere on the continuum". - scaling: - type: object - properties: - scaling_mode: - type: string - enum: - - manual - - auto - instances: - type: integer - description: In case of manual scaling of the component, specify the number of instances. 
- scaling_criteria: - type: string - enum: - - min_cpu_utilization - - max_cpu_utilization - - min_memory_percent - - max_memory_percent - - min_requests_per_sec - - max_request_per_sec - - min_number_of_instances - - max_number_of_instances - description: Scaling criteria for the component, related to the "auto" scaling type. - description: The component scaling information. - mobile: - type: boolean - description: Specify if the component must be deployed on a mobile node (optional) - labels: - type: array - items: - type: string - description: The required labels for filtering. - node: - type: string - description: The required node name to be the host of the component (optional). - components: - type: array - items: - type: object - properties: - metadata: - type: object - properties: - name: - type: string - description: The unique name of the component - uid: - type: string - description: The unique identifier of the component (not given by app provider). - node_placement: - type: object - properties: - continuum_layer: - type: array - items: - type: string - enum: - - cloud - - far_edge - - edge_infrastructure - - edge - - "*" - description: The required component placement on the continuum. "*" symbol means "anywhere on the continuum". - scaling: - type: object - properties: - scaling_mode: - type: string - enum: - - manual - - auto - instances: - type: integer - description: In case of manual scaling of the component, specify the number of instances. - scaling_criteria: - type: string - enum: - - min_cpu_utilization - - max_cpu_utilization - - min_memory_percent - - max_memory_percent - - min_requests_per_sec - - max_request_per_sec - - min_number_of_instances - - max_number_of_instances - description: Scaling criteria for the component, related to the "auto" scaling type. - description: The component scaling information. - mobile: - type: boolean - description: Specify if the component needs to be deployed on a mobile node (optional) - labels: - type: array - items: - type: string - description: The required labels for filtering. - node: - type: string - description: The required node name to be the host of the component (optional). - depends_on: - type: array - items: - type: string - description: The name of the related components. - description: The given component should be deployed after all the components - specified in the Components list have already started running. - runtime_configuration: - type: object - description: Enables runtime (node-level) configuration for app components. - properties: - config_specification_file: - type: string - description: The actual specification file describing the available runtime - configuration knobs (expected in json format). - This file is provided by the app developer. - config_file_path: - type: string - description: The absolute path inside the container where the application code - expects to find the configSpecificationFile. - sensors: - type: array - items: - type: object - properties: - camera: - type: object - properties: - model: - type: string - enum: - - d455 - - imx477 - - picamera-v2 - description: The model name of the camera sensor - camera_type: - type: string - enum: - - rgb - - nir - - thermal - - monocular - description: The camera sensor type. 
- minimum_framerate: - type: integer - resolution: - type: string - enum: - - 1024x768 - - 4056x3040 - temperature: - type: object - properties: - model: - type: string - enum: - - sdc30 - - ds18b20 - description: The model name of the temperature sensor - measurement_min: - type: number - measurement_max: - type: number - measurement_unit: - type: string - enum: - - celsius - - fahrenheit - accuracy: - type: number - sampling_frequency: - type: number - accelerometer: - type: object - properties: - model: - type: string - enum: - - 3038-smt - measurement_min: - type: number - measurement_max: - type: number - measurement_unit: - type: string - enum: - - m/s^2 - accuracy: - type: number - sampling_frequency: - type: number - barometer: - type: object - properties: - model: - type: string - enum: - - sb-100 - measurement_min: - type: number - measurement_max: - type: number - measurement_unit: - type: string - enum: - - pa # Pascal - accuracy: - type: number - sampling_frequency: - type: number - air_quality: - type: object - properties: - model: - type: string - enum: - - mq-135 - measurement_min: - type: number - measurement_max: - type: number - measurement_unit: - type: string - enum: - - ug/m^3 # micrograms per cubic meter - accuracy: - type: number - sampling_frequency: - type: number - endpoint_variable: - type: string - description: The env variable that the app will retrieve the endpoint - to get the sensor measurements. - protocol: - type: string - enum: - - MQTT - - RTSP - description: The protocol of the sensor service. - instances: - type: number - description: The number of required sensor instances. - qos_metrics: - type: array - items: - type: object - properties: - application_metric_id: - type: string - description: This is an indicative list of metrics. It can be extended as needed. - target: - type: number - relation: - type: string - enum: - - lower_or_equal - - greater_or_equal - - equal - - lower_than - - greater_than - system_metrics_hints: - type: array - description: System-level metrics affecting the application metric. - items: - type: string - enum: - - cpu_frequency - storage: # Refers to CC storage service. - type: object - properties: - buckets: - type: array - items: - type: object - properties: - bucket_id: - type: string - description: The bucket's unique identifier. - policy_update_token: - type: string - description: The required token for the MLSysOps to update the bucket's policy at runtime. - location_restrictions: - type: object - description: These restrictions are used to exclude storage locations that host data of the application. - properties: - gdpr: - type: boolean - description: For EU citizens only GDPR-compliant storage locations can legally be used. - reduncancy: - type: string - enum: - - high - - one - - none - max_latency: - type: number - min_download_speed: - type: number - server_side_encryption: - type: string - enum: - - on - - off - required: - - bucket_id - data_sensitivity: - type: boolean - description: The indication to specify whether a component has sensitive data or not (useful for the data storage). - data_criticality: - type: string - enum: - - low - - medium - - high - description: Used to provide information referring to the trust aspect for a given component. - external_component: - type: boolean - description: This property indicates whether the component can be managed by MLSysOps or not. 
- If not the MLSysOps platform merely deploys the component(s), based on the provided instances, - and subsequently deletes it whenever the application needs to be removed. - external_access: - type: boolean - description: This property indicates whether the component can be accessed outside of its cluster. - host_network: - type: boolean - description: Host networking requested for this component. - Use the host's network namespace. If this option is set, - the ports that will be used must be specified. Default to false. - runtime_class_name: - type: string - enum: - - nvidia - - default - - kata-fc - - kata-dragon - - urunc - - crun - - lunatic - - nvidia-experimental - - spin - - wasmedge - - slight - restart_policy: - type: string - enum: - - always - - on_failure - - never - description: Restart policy for the container. Default to Always. - os: - type: string - enum: - - ubuntu - - kali - - zephyr - node_type: - type: string - enum: - - virtualized # In the form of a Virtual Machine - - native # Non-virtualized, including OS - - bare_metal # Non-virtualized, without OS - container_runtime: - type: string - enum: - - containerd - - docker - - emb_serve - containers: - type: array - items: - type: object - properties: - image: - type: string - description: The name of the container image. - command: - type: array - items: - type: string - image_pull_policy: - type: string - enum: - - Always - - Never - - IfNotPresent - description: Image pull policy. Defaults to Always if :latest tag is specified, - or IfNotPresent otherwise. - acceleration_api: - type: array - items: - type: object - properties: - call_name: - type: string - description: The (unique) API call name. - enum: - - calc_optical_flow - - image_inference - required_framework: - type: string - description: Asterisk means any of the available frameworks. - enum: - - pytorch - - tensorflow - - opencl - - "*" - required: - - call_name - platform_requirements: - type: object - description: The resource requirements of the container. - properties: - cpu: - type: object - properties: - requests: - type: string - limits: - type: string - architecture: - type: array - items: - type: string - enum: - - arm64 - - amd64 - frequency: - type: number - description: required frequency in Hz. - performance_indicator: - type: number - description: This field assists MLSysOps with an initial hint in order to - filter out nodes based on their performance capabilities. - memory: - type: object - properties: - requests: - type: string - limits: - type: string - mcu: - type: object - properties: - requests: - type: string - limits: - type: string - architecture: - type: array - items: - type: string - enum: - - arm-m4 - flash: - type: string - description: Flash memory size (related to far edge devices) - frequency: - type: number - description: required frequency in GHz. - performance_indicator: - type: number - description: This field assists MLSysOps with an initial hint in order to - filter out nodes based on their performance capabilities. - disk: - type: string - description: required Disk space (in GB). - gpu: - type: object - properties: - requests: - type: integer - limits: - type: integer - model: - type: string - enum: - - k80 - - k40 - memory: - type: integer - utilization_request: - type: string - description: Percentage of expected utilization. - performance_indicator: - type: number - description: This field assists MLSysOps with an initial hint in order to - filter out nodes based on their performance capabilities. 
- fpga: - type: object - properties: - model: - type: string - enum: - - zcu102 - memory: - type: integer - utilization_request: - type: string - description: Percentage of expected utilization. - performance_indicator: - type: number - description: This field assists MLSysOps with an initial hint in order to - filter out nodes based on their performance capabilities. - ports: - type: array - items: - type: object - properties: - container_port: - type: integer - description: Number of port to expose on the component's IP address. - This must be a valid port number, 0 < x < 65536. - host_ip: - type: string - description: What host IP to bind the external port to. - host_port: - type: integer - description: Number of port to expose on the host. - If specified, this must be a valid port number, 0 < x < 65536. - If HostNetwork is specified, this must match ContainerPort. - name: - type: string - description: Each named port in a component must have a unique name. - Name for the port that can be referred to by services. - protocol: - type: string - enum: - - UDP - - TCP - - SCTP - description: Protocol for port. Defaults to "TCP". - description: Environment variables for the container. - env: - type: array - items: - type: object - properties: - name: - type: string - description: Name of the environment variable. - value_from: - type: object - properties: - field_ref: - type: object - properties: - field_path: - type: string - value: - type: string - description: Value of the environment variable. - description: Environment variables for the container. - required: - - containers - - metadata - component_interactions: - type: array - items: - type: object - properties: - component_name1: - type: string - description: The "source" component. - type: - type: string - enum: - - ingress - - egress - component_name2: - type: string - description: The "destination" component. - interaction_criticality: - type: string - enum: - - low - - medium - - high - description: Used to provide information referring to the trust aspect for a given interaction. - interaction_metrics: - type: array - items: - type: object - properties: - system_metric_id: - type: string - enum: - - latency - - bandwidth - - end_to_end_invocation_delay - description: The unique identifier of the system-level metric related to this interaction. - target: - type: number - measurement_unit: - type: string - enum: - - milliseconds # latency, E2E invocation delay - - Mbps # Bandwidth - - seconds # latency, E2E invocation delay - relation: - type: string - enum: - - lower_or_equal - - greater_or_equal - - equal - - lower_than - - greater_than - permitted_actions: - type: array - items: - type: string - enum: - - component_relocation - - traffic_redirection - - change_container_image - - change_container_runtime_class - - change_container_cpu_set - - change_container_resource_requirements - - acceleration - - "*" - description: List of desired actions that can be performed by - the MLSysOps agents. For traffic redirection, it must - be enabled in the respective node description in order - for MLSysOps to apply it. 
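Taken together, the openAPIV3Schema above is the validation contract for submitted application descriptions. As a minimal illustrative sketch (not part of this changeset; the file name and field values are assumed), the same structure can be checked with the jsonschema package the northbound API already uses elsewhere:

    import yaml
    from jsonschema import validate, ValidationError

    # Load the CRD and pull out the embedded OpenAPI v3 schema.
    with open("MLSysOpsAppCRD.yaml") as f:
        crd = yaml.safe_load(f)
    schema = crd["spec"]["versions"][0]["schema"]["openAPIV3Schema"]

    # Minimal description: 'components' is the only required top-level field,
    # and each component requires 'metadata' and 'containers'.
    app_doc = {
        "name": "demo-app",
        "components": [{
            "metadata": {"name": "server"},
            "containers": [{"image": "example.org/demo:latest"}],
        }],
        "permitted_actions": ["component_relocation"],
    }

    try:
        validate(instance=app_doc, schema=schema)
    except ValidationError as err:
        print(f"description rejected: {err.message}")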
- global_satisfaction: - type: object - properties: - threshold: - type: number - description: Happiness minimum required value (range (0-1]) - relation: - type: string - enum: - - greater_or_equal - - equal - - greater_than - achievement_weights: - type: array - items: - type: object - properties: - metric_id: - type: string - weight: - type: number - required: - - components - diff --git a/northbound-api/MLSysOps_Schemas/schema generator/output/model.py b/northbound-api/MLSysOps_Schemas/schema generator/output/model.py deleted file mode 100644 index 343e8e7..0000000 --- a/northbound-api/MLSysOps_Schemas/schema generator/output/model.py +++ /dev/null @@ -1,707 +0,0 @@ -# generated by datamodel-codegen: -# filename: tmpdlptvm7l_schema.json -# timestamp: 2025-06-06T15:03:46+00:00 - -from __future__ import annotations - -from enum import Enum -from typing import List, Optional - -from pydantic import BaseModel, Field - - -class ContinentEnum(Enum): - Europe = 'Europe' - Asia = 'Asia' - - -class CountryEnum(Enum): - el = 'el' - it = 'it' - fr = 'fr' - nl = 'nl' - ie = 'ie' - pt = 'pt' - dk = 'dk' - il = 'il' - - -class CityEnum(Enum): - volos = 'volos' - athens = 'athens' - rende = 'rende' - milan = 'milan' - lille = 'lille' - delft = 'delft' - dublin = 'dublin' - aveiro = 'aveiro' - porto = 'porto' - aarhus = 'aarhus' - jerusalem = 'jerusalem' - - -class Location(BaseModel): - continent: Optional[List[ContinentEnum]] = None - country: Optional[List[CountryEnum]] = Field( - None, description='The required country (optional)' - ) - city: Optional[List[CityEnum]] = None - - -class CloudProvider(Enum): - private = 'private' - aws = 'aws' - microsoft_azure = 'microsoft_azure' - gcp = 'gcp' - - -class Accelerator(Enum): - gpu = 'gpu' - fpga = 'fpga' - tpu = 'tpu' - - -class NodeType(Enum): - cloud = 'cloud' - far_edge = 'far_edge' - edge_infrastructure = 'edge_infrastructure' - edge = 'edge' - - -class ClusterPlacement(BaseModel): - cluster_id: Optional[List[str]] = Field( - None, description='Array of clusters that can host the application.' - ) - instances: Optional[int] = Field( - None, - description='The desired number of replicas (e.g. 
deploy 2 instances of a given app).', - ) - location: Optional[Location] = None - cloud_provider: Optional[CloudProvider] = None - accelerators: Optional[List[Accelerator]] = Field( - None, - description='The candidate clusters should have nodes containing at least one instance of the specified accelerators.', - ) - node_types: Optional[List[NodeType]] = Field( - None, - description='The candidate clusters should contain at least one node of each specified type.', - ) - - -class ContinuumLayerEnum(Enum): - cloud = 'cloud' - far_edge = 'far_edge' - edge_infrastructure = 'edge_infrastructure' - edge = 'edge' - field_ = '*' - - -class ScalingMode(Enum): - manual = 'manual' - auto = 'auto' - - -class ScalingCriteria(Enum): - min_cpu_utilization = 'min_cpu_utilization' - max_cpu_utilization = 'max_cpu_utilization' - min_memory_percent = 'min_memory_percent' - max_memory_percent = 'max_memory_percent' - min_requests_per_sec = 'min_requests_per_sec' - max_request_per_sec = 'max_request_per_sec' - min_number_of_instances = 'min_number_of_instances' - max_number_of_instances = 'max_number_of_instances' - - -class Scaling(BaseModel): - scaling_mode: Optional[ScalingMode] = None - instances: Optional[int] = Field( - None, - description='In case of manual scaling of the component, specify the number of instances.', - ) - scaling_criteria: Optional[ScalingCriteria] = Field( - None, - description='Scaling criteria for the component, related to the "auto" scaling type.', - ) - - -class SingleNodeItem(BaseModel): - components: Optional[List[str]] = Field( - None, description='The array of components to be deployed on the same Node.' - ) - continuum_layer: Optional[List[ContinuumLayerEnum]] = None - scaling: Optional[Scaling] = Field( - None, description='The component scaling information.' - ) - mobile: Optional[bool] = Field( - None, - description='Specify if the component must be deployed on a mobile node (optional)', - ) - labels: Optional[List[str]] = Field( - None, description='The required labels for filtering.' - ) - node: Optional[str] = Field( - None, - description='The required node name to be the host of the component (optional).', - ) - - -class Metadata(BaseModel): - name: Optional[str] = Field(None, description='The unique name of the component') - uid: Optional[str] = Field( - None, - description='The unique identifier of the component (not given by app provider).', - ) - - -class Scaling1(BaseModel): - scaling_mode: Optional[ScalingMode] = None - instances: Optional[int] = Field( - None, - description='In case of manual scaling of the component, specify the number of instances.', - ) - scaling_criteria: Optional[ScalingCriteria] = Field( - None, - description='Scaling criteria for the component, related to the "auto" scaling type.', - ) - - -class NodePlacement(BaseModel): - continuum_layer: Optional[List[ContinuumLayerEnum]] = None - scaling: Optional[Scaling1] = Field( - None, description='The component scaling information.' - ) - mobile: Optional[bool] = Field( - None, - description='Specify if the component needs to be deployed on a mobile node (optional)', - ) - labels: Optional[List[str]] = Field( - None, description='The required labels for filtering.' 
- ) - node: Optional[str] = Field( - None, - description='The required node name to be the host of the component (optional).', - ) - - -class RuntimeConfiguration(BaseModel): - config_specification_file: Optional[str] = Field( - None, - description='The actual specification file describing the available runtime configuration knobs (expected in json format). This file is provided by the app developer.', - ) - config_file_path: Optional[str] = Field( - None, - description='The absolute path inside the container where the application code expects to find the configSpecificationFile.', - ) - - -class Model(Enum): - d455 = 'd455' - imx477 = 'imx477' - picamera_v2 = 'picamera-v2' - - -class CameraType(Enum): - rgb = 'rgb' - nir = 'nir' - thermal = 'thermal' - monocular = 'monocular' - - -class Resolution(Enum): - field_1024x768 = '1024x768' - field_4056x3040 = '4056x3040' - - -class Camera(BaseModel): - model: Optional[Model] = Field( - None, description='The model name of the camera sensor' - ) - camera_type: Optional[CameraType] = Field( - None, description='The camera sensor type.' - ) - minimum_framerate: Optional[int] = None - resolution: Optional[Resolution] = None - - -class Model1(Enum): - sdc30 = 'sdc30' - ds18b20 = 'ds18b20' - - -class MeasurementUnit(Enum): - celsius = 'celsius' - fahrenheit = 'fahrenheit' - - -class Temperature(BaseModel): - model: Optional[Model1] = Field( - None, description='The model name of the temperature sensor' - ) - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Model2(Enum): - field_3038_smt = '3038-smt' - - -class MeasurementUnit1(Enum): - m_s_2 = 'm/s^2' - - -class Accelerometer(BaseModel): - model: Optional[Model2] = None - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit1] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Model3(Enum): - sb_100 = 'sb-100' - - -class MeasurementUnit2(Enum): - pa = 'pa' - - -class Barometer(BaseModel): - model: Optional[Model3] = None - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit2] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Model4(Enum): - mq_135 = 'mq-135' - - -class MeasurementUnit3(Enum): - ug_m_3 = 'ug/m^3' - - -class AirQuality(BaseModel): - model: Optional[Model4] = None - measurement_min: Optional[float] = None - measurement_max: Optional[float] = None - measurement_unit: Optional[MeasurementUnit3] = None - accuracy: Optional[float] = None - sampling_frequency: Optional[float] = None - - -class Protocol(Enum): - MQTT = 'MQTT' - RTSP = 'RTSP' - - -class Sensor(BaseModel): - camera: Optional[Camera] = None - temperature: Optional[Temperature] = None - accelerometer: Optional[Accelerometer] = None - barometer: Optional[Barometer] = None - air_quality: Optional[AirQuality] = None - endpoint_variable: Optional[str] = Field( - None, - description='The env variable that the app will retrieve the endpoint to get the sensor measurements.', - ) - protocol: Optional[Protocol] = Field( - None, description='The protocol of the sensor service.' - ) - instances: Optional[float] = Field( - None, description='The number of required sensor instances.' 
- ) - - -class Relation(Enum): - lower_or_equal = 'lower_or_equal' - greater_or_equal = 'greater_or_equal' - equal = 'equal' - lower_than = 'lower_than' - greater_than = 'greater_than' - - -class SystemMetricsHint(Enum): - cpu_frequency = 'cpu_frequency' - - -class QosMetric(BaseModel): - application_metric_id: Optional[str] = Field( - None, - description='This is an indicative list of metrics. It can be extended as needed.', - ) - target: Optional[float] = None - relation: Optional[Relation] = None - system_metrics_hints: Optional[List[SystemMetricsHint]] = Field( - None, description='System-level metrics affecting the application metric.' - ) - - -class LocationRestrictions(BaseModel): - gdpr: Optional[bool] = Field( - None, - description='For EU citizens only GDPR-compliant storage locations can legally be used.', - ) - - -class Reduncancy(Enum): - high = 'high' - one = 'one' - none = 'none' - - -class ServerSideEncryption(Enum): - True_ = True - False_ = False - - -class Bucket(BaseModel): - bucket_id: str = Field(..., description="The bucket's unique identifier.") - policy_update_token: Optional[str] = Field( - None, - description="The required token for the MLSysOps to update the bucket's policy at runtime.", - ) - location_restrictions: Optional[LocationRestrictions] = Field( - None, - description='These restrictions are used to exclude storage locations that host data of the application.', - ) - reduncancy: Optional[Reduncancy] = None - max_latency: Optional[float] = None - min_download_speed: Optional[float] = None - server_side_encryption: Optional[ServerSideEncryption] = None - - -class Storage(BaseModel): - buckets: Optional[List[Bucket]] = None - - -class DataCriticality(Enum): - low = 'low' - medium = 'medium' - high = 'high' - - -class RuntimeClassName(Enum): - nvidia = 'nvidia' - - -class RestartPolicy(Enum): - always = 'always' - on_failure = 'on_failure' - never = 'never' - - -class Os(Enum): - ubuntu = 'ubuntu' - kali = 'kali' - zephyr = 'zephyr' - - -class NodeType1(Enum): - virtualized = 'virtualized' - native = 'native' - bare_metal = 'bare_metal' - - -class ContainerRuntime(Enum): - containerd = 'containerd' - docker = 'docker' - emb_serve = 'emb_serve' - - -class ImagePullPolicy(Enum): - Always = 'Always' - Never = 'Never' - IfNotPresent = 'IfNotPresent' - - -class CallName(Enum): - calc_optical_flow = 'calc_optical_flow' - image_inference = 'image_inference' - - -class RequiredFramework(Enum): - pytorch = 'pytorch' - tensorflow = 'tensorflow' - opencl = 'opencl' - field_ = '*' - - -class AccelerationApiItem(BaseModel): - call_name: CallName = Field(..., description='The (unique) API call name.') - required_framework: Optional[RequiredFramework] = Field( - None, description='Asterisk means any of the available frameworks.' 
- ) - - -class ArchitectureEnum(Enum): - arm64 = 'arm64' - amd64 = 'amd64' - - -class Cpu(BaseModel): - requests: Optional[str] = None - limits: Optional[str] = None - architecture: Optional[List[ArchitectureEnum]] = None - frequency: Optional[float] = Field(None, description='required frequency in Hz.') - - -class Memory(BaseModel): - requests: Optional[str] = None - limits: Optional[str] = None - - -class ArchitectureEnum1(Enum): - arm_m4 = 'arm-m4' - - -class Mcu(BaseModel): - requests: Optional[str] = None - limits: Optional[str] = None - architecture: Optional[List[ArchitectureEnum1]] = None - flash: Optional[str] = Field( - None, description='Flash memory size (related to far edge devices)' - ) - frequency: Optional[float] = Field(None, description='required frequency in GHz.') - - -class Model5(Enum): - k80 = 'k80' - k40 = 'k40' - - -class Gpu(BaseModel): - requests: Optional[int] = None - limits: Optional[int] = None - model: Optional[Model5] = None - memory: Optional[str] = None - utilization_request: Optional[str] = Field( - None, description='Percentage of expected utilization.' - ) - - -class Model6(Enum): - zcu102 = 'zcu102' - - -class Fpga(BaseModel): - model: Optional[Model6] = None - memory: Optional[str] = None - utilization_request: Optional[str] = Field( - None, description='Percentage of expected utilization.' - ) - - -class PlatformRequirements(BaseModel): - cpu: Optional[Cpu] = None - memory: Optional[Memory] = None - mcu: Optional[Mcu] = None - disk: Optional[str] = Field(None, description='required Disk space (in GB).') - gpu: Optional[Gpu] = None - fpga: Optional[Fpga] = None - performance_indicator: Optional[float] = Field( - None, - description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', - ) - - -class Protocol1(Enum): - UDP = 'UDP' - TCP = 'TCP' - SCTP = 'SCTP' - - -class Port(BaseModel): - container_port: Optional[int] = Field( - None, - description="Number of port to expose on the component's IP address. This must be a valid port number, 0 < x < 65536.", - ) - host_ip: Optional[str] = Field( - None, description='What host IP to bind the external port to.' - ) - host_port: Optional[int] = Field( - None, - description='Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort.', - ) - name: Optional[str] = Field( - None, - description='Each named port in a component must have a unique name. Name for the port that can be referred to by services.', - ) - protocol: Optional[Protocol1] = Field( - None, description='Protocol for port. Defaults to "TCP".' - ) - - -class FieldRef(BaseModel): - field_path: Optional[str] = None - - -class ValueFrom(BaseModel): - field_ref: Optional[FieldRef] = None - - -class EnvItem(BaseModel): - name: Optional[str] = Field(None, description='Name of the environment variable.') - value_from: Optional[ValueFrom] = None - value: Optional[str] = Field(None, description='Value of the environment variable.') - - -class Container(BaseModel): - image: Optional[str] = Field(None, description='The name of the container image.') - command: Optional[List[str]] = None - image_pull_policy: Optional[ImagePullPolicy] = Field( - None, - description='Image pull policy. 
Defaults to Always if :latest tag is specified, or IfNotPresent otherwise.', - ) - acceleration_api: Optional[List[AccelerationApiItem]] = None - platform_requirements: Optional[PlatformRequirements] = Field( - None, description='The resource requirements of the container.' - ) - ports: Optional[List[Port]] = Field( - None, description='Environment variables for the container.' - ) - env: Optional[List[EnvItem]] = Field( - None, description='Environment variables for the container.' - ) - - -class Component(BaseModel): - metadata: Metadata - node_placement: Optional[NodePlacement] = None - depends_on: Optional[List[str]] = Field( - None, - description='The given component should be deployed after all the components specified in the Components list have already started running.', - ) - runtime_configuration: Optional[RuntimeConfiguration] = Field( - None, - description='Enables runtime (node-level) configuration for app components.', - ) - sensors: Optional[List[Sensor]] = None - qos_metrics: Optional[List[QosMetric]] = None - storage: Optional[Storage] = None - data_sensitivity: Optional[bool] = Field( - None, - description='The indication to specify whether a component has sensitive data or not (useful for the data storage).', - ) - data_criticality: Optional[DataCriticality] = Field( - None, - description='Used to provide information referring to the trust aspect for a given component.', - ) - external_component: Optional[bool] = Field( - None, - description='This property indicates whether the component can be managed by MLSysOps or not. If not the MLSysOps platform merely deploys the component(s), based on the provided instances, and subsequently deletes it whenever the application needs to be removed.', - ) - external_access: Optional[bool] = Field( - None, - description='This property indicates whether the component can be accessed outside of its cluster.', - ) - host_network: Optional[bool] = Field( - None, - description="Host networking requested for this component. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false.", - ) - runtime_class_name: Optional[RuntimeClassName] = None - restart_policy: Optional[RestartPolicy] = Field( - None, description='Restart policy for the container. Default to Always.' - ) - os: Optional[Os] = None - node_type: Optional[NodeType1] = None - container_runtime: Optional[ContainerRuntime] = None - containers: List[Container] - - -class Type(Enum): - ingress = 'ingress' - egress = 'egress' - - -class InteractionCriticality(Enum): - low = 'low' - medium = 'medium' - high = 'high' - - -class SystemMetricId(Enum): - latency = 'latency' - bandwidth = 'bandwidth' - end_to_end_invocation_delay = 'end_to_end_invocation_delay' - - -class MeasurementUnit4(Enum): - milliseconds = 'milliseconds' - Mbps = 'Mbps' - seconds = 'seconds' - - -class InteractionMetric(BaseModel): - system_metric_id: Optional[SystemMetricId] = Field( - None, - description='The unique identifier of the system-level metric related to this interaction.', - ) - target: Optional[float] = None - measurement_unit: Optional[MeasurementUnit4] = None - relation: Optional[Relation] = None - - -class ComponentInteraction(BaseModel): - component_name1: Optional[str] = Field(None, description='The "source" component.') - type: Optional[Type] = None - component_name2: Optional[str] = Field( - None, description='The "destination" component.' 
- ) - interaction_criticality: Optional[InteractionCriticality] = Field( - None, - description='Used to provide information referring to the trust aspect for a given interaction.', - ) - interaction_metrics: Optional[List[InteractionMetric]] = None - - -class PermittedAction(Enum): - component_relocation = 'component_relocation' - traffic_redirection = 'traffic_redirection' - change_container_image = 'change_container_image' - change_container_runtime_class = 'change_container_runtime_class' - change_container_cpu_set = 'change_container_cpu_set' - change_container_resource_requirements = 'change_container_resource_requirements' - acceleration = 'acceleration' - field_ = '*' - - -class Relation2(Enum): - greater_or_equal = 'greater_or_equal' - equal = 'equal' - greater_than = 'greater_than' - - -class AchievementWeight(BaseModel): - metric_id: Optional[str] = None - weight: Optional[float] = None - - -class GlobalSatisfaction(BaseModel): - threshold: Optional[float] = Field( - None, description='Happiness minimum required value (range (0-1])' - ) - relation: Optional[Relation2] = None - achievement_weights: Optional[List[AchievementWeight]] = None - - -class MLSysOpsApp(BaseModel): - name: Optional[str] = None - cluster_placement: Optional[ClusterPlacement] = None - single_node: Optional[List[SingleNodeItem]] = None - components: List[Component] - component_interactions: Optional[List[ComponentInteraction]] = None - permitted_actions: Optional[List[PermittedAction]] = Field( - None, - description='List of desired actions that can be performed by the MLSysOps agents. For traffic redirection, it must be enabled in the respective node description in order for MLSysOps to apply it.', - ) - global_satisfaction: Optional[GlobalSatisfaction] = None - - -class MlsysopsappSchema(BaseModel): - MLSysOpsApp: MLSysOpsApp diff --git a/northbound-api/Makefile b/northbound-api/Makefile index 03a5cb1..359f785 100644 --- a/northbound-api/Makefile +++ b/northbound-api/Makefile @@ -4,7 +4,7 @@ PLATFORMS := linux/arm64/v8,linux/amd64 DOCKER_BUILDX=docker buildx build # Default tags (can be overridden from CLI) -NB_API_TAG ?= registry.mlsysops.eu/agent/northbound-api +NB_API_TAG ?= harbor.nbfc.io/mlsysops/northbound-api CI_COMMIT_TAG ?= 0.0.0 # Individual targets diff --git a/northbound-api/endpoints/applications.py b/northbound-api/endpoints/applications.py index 177c72b..be70999 100644 --- a/northbound-api/endpoints/applications.py +++ b/northbound-api/endpoints/applications.py @@ -12,7 +12,7 @@ from fastapi.encoders import jsonable_encoder from kubernetes.client import ApiException -from MLSysOps_Schemas.mlsysops_model import MlsysopsappSchema, Component +from schemas.mlsysops_application import MlsysopsappSchema, Component from redis_setup import redis_mgt as rm # Your RedisManager class router = APIRouter() diff --git a/northbound-api/endpoints/infrastructure.py b/northbound-api/endpoints/infrastructure.py deleted file mode 100644 index dc9b57c..0000000 --- a/northbound-api/endpoints/infrastructure.py +++ /dev/null @@ -1,216 +0,0 @@ -import json - -import requests -import yaml -from fastapi import APIRouter, HTTPException, Request -from jsonschema import validate, ValidationError -from MLSysOps_Schemas.mlsysops_schemas import node_schema, cluster_schema, datacenter_schema, continuum_schema -from redis_setup import redis_mgt as rm - -# JSON schema with enum validation for the city -# Update the required fields in the JSON schema - -# Dictionary to map types to schemas -schema_map = { - "MLSysOpsNode": 
node_schema,
-    "MLSysOpsCluster": cluster_schema,
-    "MLSysOpsDatacenter": datacenter_schema,
-    "MLSysOpsContinuum": continuum_schema,
-}
-
-
-def validate_infrastructure_file(json_data):
-    # Extract the infrastructure type from the top-level key of the JSON data
-    infrastructure_type = next(iter(json_data), None)
-
-    # Get the corresponding schema from the map
-    schema = schema_map.get(infrastructure_type)
-
-    if schema is None:
-        return f"Invalid file type: {infrastructure_type}, please submit a valid infrastructure file."
-    # Perform validation
-    try:
-        validate(instance=json_data, schema=schema)
-        return None  # Validation passed
-    except ValidationError as e:
-        return f"Validation error for {infrastructure_type}: {e.message}"
-    except Exception as e:
-        return f"Error validating {infrastructure_type}: {str(e)}"
-
-
-router = APIRouter()
-
-# Connect to Redis using RedisManager
-r = rm.RedisManager()
-r.connect()
-
-# Global variable to store the last connection time
-last_connection_time = None
-
-"----------------------------------------------------------------------------------------"
-"                                  REGISTER INFRA                                         "
-"----------------------------------------------------------------------------------------"
-
-
-@router.post("/register", tags=["Infra"])
-async def deploy_infra(request: Request):
-    try:
-        data = await request.json()
-        #print(data)
-    except json.JSONDecodeError:
-        print("error")
-        raise HTTPException(status_code=400, detail="Invalid JSON payload")
-
-    if 'uri' in data:
-        # Retrieve and process the application configuration from the URI
-        uri = data['uri']
-        print(f"The path uri received is {uri}")
-
-        try:
-            response = requests.get(uri)
-            response.raise_for_status()
-            yaml_data = response.text
-            json_data = yaml.safe_load(yaml_data)
-        except requests.RequestException as e:
-            raise HTTPException(status_code=400, detail=f"Failed to fetch data from URI: {e}")
-    else:
-        # the YAML data are inline in the request body
-        json_data = data['yaml']
-
-    validation_error = validate_infrastructure_file(json_data)
-
-    if validation_error is None:
-        print("Now execute the validation with the actual infrastructure and save in a datastructure")
-    else:
-        raise HTTPException(status_code=400, detail=validation_error)
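For clarity, the register handler above accepts two payload shapes. A hypothetical client-side sketch (host, port, and file names are illustrative; the /infra prefix comes from the router registration in main.py):

    import requests
    import yaml

    base = "http://localhost:8000/infra"  # assumed deployment address

    # 1) hand the API a URI and let it fetch and parse the YAML itself
    requests.post(f"{base}/register", json={"uri": "https://example.org/node.yaml"})

    # 2) send the already-parsed description inline under the 'yaml' key
    with open("node.yaml") as f:
        requests.post(f"{base}/register", json={"yaml": yaml.safe_load(f)})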
-
-"----------------------------------------------------------------------------------------"
-"                               LIST INFRASTRUCTURE                                       "
-"----------------------------------------------------------------------------------------"
-
-
-@router.get("/list/", tags=["Infra"])
-async def list_infra(id_type: str, id_value: str):
-    print(f"requested info : {id_type}, with id {id_value}")
-
-
-"----------------------------------------------------------------------------------------"
-"                                  GET NODE STATE                                         "
-"----------------------------------------------------------------------------------------"
-
-
-@router.get("/node/{node_id}", tags=["Infra"])
-async def get_node_state(node_id: str):
-    print("return the state of node ", node_id)
-    return (json.dumps(node_id))
-
-
-#IMPLEMENT THE LOGIC TO RETURN THE NODE METRICS
-
-"----------------------------------------------------------------------------------------"
-"                                GET CLUSTER STATE                                        "
-"----------------------------------------------------------------------------------------"
-
-
-@router.get("/cluster/{clus_id}", tags=["Infra"])
-async def get_cluster_state(clus_id: str):
-    """
-    Endpoint to update the status of an application to 'removed' based on its ID.
-
-    Args:
-        clus_id (str): The ID to look up in Redis.
-
-    Returns:
-        dict: A success message if the application was updated to 'removed',
-              or an error message if the ID was not found.
-    """
-    try:
-        # Check if the ID exists in the Redis dictionary
-        app_status = r.get_dict_value('system_app_hash', clus_id)
-
-        if app_status is None:
-            # If the ID doesn't exist, return a 404 error
-            raise HTTPException(status_code=404, detail=f"App ID '{clus_id}' not found in the system.")
-
-        # If the ID exists, update the status to 'removed'
-        r.update_dict_value('system_app_hash', clus_id, "To_be_removed")
-        json_data = {"MLSysOpsApplication": {"name": clus_id}}
-        r.push('valid_descriptions_queue', json.dumps(json_data))
-        return {"app_id": clus_id, "message": "Application status updated to 'To_be_removed'."}
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error updating status for app_id '{clus_id}': {e}")
-
-
-"----------------------------------------------------------------------------------------"
-"                              GET DATACENTER STATE                                       "
-"----------------------------------------------------------------------------------------"
-
-
-@router.get("/datacenter/{dc_id}", tags=["Infra"])
-async def get_dc_state(dc_id: str):
-    """
-    Endpoint to update the status of an application to 'removed' based on its ID.
-
-    Args:
-        dc_id (str): The ID to look up in Redis.
-
-    Returns:
-        dict: A success message if the application was updated to 'removed',
-              or an error message if the ID was not found.
-    """
-    try:
-        # Check if the ID exists in the Redis dictionary
-        app_status = r.get_dict_value('system_app_hash', dc_id)
-
-        if app_status is None:
-            # If the ID doesn't exist, return a 404 error
-            raise HTTPException(status_code=404, detail=f"App ID '{dc_id}' not found in the system.")
-
-        # If the ID exists, update the status to 'removed'
-        r.update_dict_value('system_app_hash', dc_id, "To_be_removed")
-        json_data = {"MLSysOpsApplication": {"name": dc_id}}
-        r.push('valid_descriptions_queue', json.dumps(json_data))
-        return {"app_id": dc_id, "message": "Application status updated to 'To_be_removed'."}
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error updating status for app_id '{dc_id}': {e}")
-
-
-"----------------------------------------------------------------------------------------"
-"                                 UNREGISTER INFRA                                        "
-"----------------------------------------------------------------------------------------"
-
-
-@router.delete("/unregister/{app_id}", tags=["Infra"])
-async def unregister_infra(app_id: str):
-    """
-    Endpoint to check the status of a specific application based on its app_id.
-
-    Args:
-        app_id (str): The application ID to look up in Redis.
-
-    Returns:
-        dict: The status of the application, or an error message if not found.
- """ - try: - - # Fetch the value of the given app_id from Redis - app_status = r.get_dict_value('system_app_hash', app_id) - - if app_status is None: - # If the app_id doesn't exist in Redis, return a 404 error - raise HTTPException(status_code=404, detail=f"App ID '{app_id}' not found in the system.") - - # Return the app status - return {"app_id": app_id, "status": app_status} - - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error retrieving status for app_id '{app_id}': {e}") diff --git a/northbound-api/endpoints/ml_models.py b/northbound-api/endpoints/ml_models.py index 70fee22..039d68d 100644 --- a/northbound-api/endpoints/ml_models.py +++ b/northbound-api/endpoints/ml_models.py @@ -1,314 +1,307 @@ -from datetime import datetime - -import yaml -from fastapi import APIRouter, HTTPException, Request, UploadFile, File -import json -from redis_setup import redis_mgt as rm -from jsonschema import validate, ValidationError -import requests -from MLSysOps_Schemas.mlsysops_schemas import app_schema -import os -import subprocess -from kubernetes import client, utils, config -from kubernetes.utils.create_from_yaml import FailToCreateError -from kubernetes.client.rest import ApiException - -from typing import Annotated, List, Optional, Dict, Any -from pydantic import BaseModel, Field - -# JSON schema with enum validation for the city -schema = app_schema - -os.environ["LOCAL_OTEL_ENDPOINT"] = "http://172.25.27.4:9464/metrics" -os.environ["TELEMETRY_ENDPOINT"] = "172.25.27.4:4317" - -karmada_api_kubeconfig = "/home/runner/karmada_management/karmada-api.kubeconfig" -uth_dev_kubeconfig = "/home/runner/karmada_management/uth-dev.kubeconfig" -uth_prod_kubeconfig = "/home/runner/karmada_management/uth-prod.kubeconfig" -kubeconfigs = [ - os.getenv("KARMADA_KUBECONFIG", "/root/.kube/karmada-api.kubeconfig"), - os.getenv("UTH_DEV_KUBECONFIG", "/root/.kube/uth-dev.kubeconfig"), - os.getenv("UTH_PROD_KUBECONFIG", "/root/.kube/uth-prod.kubeconfig"), -] - - -# Define Pydantic Model for Validation using v2 syntax -class ComponentModel(BaseModel): - Component: Dict[str, Any] - externalAccess: Optional[bool] = None - nodePlacement: Optional[Dict[str, Any]] = None - restartPolicy: Optional[str] = None - containers: Optional[Annotated[List[Dict[str, Any]], Field(min_items=1)]] = None - - -class MLSysOpsApplicationModel(BaseModel): - name: str = Field(..., title="Application Name") - mlsysops_id: str = Field(..., alias="mlsysops-id", title="MLSysOps ID") - clusterPlacement: Optional[Dict[str, Any]] = None - components: Annotated[List[ComponentModel], Field(min_items=1)] - - -class RootModel(BaseModel): - MLSysOpsApplication: MLSysOpsApplicationModel - - -class RootModel(BaseModel): - MLSysOpsApplication: MLSysOpsApplicationModel - - -def get_pods_from_kubeconfigs(): - """ - Retrieve all pods from multiple clusters using specified kubeconfig files. - - Returns: - list: A list of dictionaries containing pod details (name, status, node, cluster). 
- """ - pod_details = [] - - for kubeconfig in kubeconfigs: - try: - # Retrieve contexts for the given kubeconfig - result = subprocess.run( - ["kubectl", "config", "get-contexts", "-o", "name", "--kubeconfig", kubeconfig], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - if result.returncode != 0: - pod_details.append({"error": f"Error fetching contexts from {kubeconfig}: {result.stderr.strip()}"}) - continue - - contexts = result.stdout.strip().split("\n") - - # Fetch pod information for each context - for context_name in contexts: - try: - pod_result = subprocess.run( - [ - "kubectl", - "get", - "pods", - "--context", - context_name, - "--kubeconfig", - kubeconfig, - "-o", - "custom-columns=NAME:.metadata.name,STATUS:.status.phase,NODE:.spec.nodeName", - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - if pod_result.returncode != 0: - pod_details.append({ - "error": f"Error fetching pods for context {context_name} ({kubeconfig}): {pod_result.stderr.strip()}"}) - continue - - # Parse the output and collect pod details - lines = pod_result.stdout.strip().split("\n") - if len(lines) > 1: # Skip header row - for line in lines[1:]: - parts = line.split() - if len(parts) >= 3: # Ensure there are enough columns - name, status, node = parts - pod_details.append({ - "pod_name": name, - "pod_status": status, - "node_name": node, - "cluster_name": context_name, - "kubeconfig": kubeconfig, - }) - - except Exception as e: - pod_details.append( - {"error": f"Error while listing pods for context {context_name} ({kubeconfig}): {str(e)}"}) - - except Exception as e: - pod_details.append({"error": f"Error while retrieving contexts from {kubeconfig}: {str(e)}"}) - - return pod_details - - -def get_yaml_info(data): - """ - Extracts the application name and components from a dictionary. - - Parameters: - data (dict): Dictionary containing the application data. - - Returns: - tuple: A tuple with the application name and a list of component names. 
- """ - try: - # Extract the application name - app_name = data.get("MLSysOpsApplication", {}).get("name", None) - - # Extract the component names - components = [ - component.get("Component", {}).get("name", None) - for component in data.get("MLSysOpsApplication", {}).get("components", []) - if component.get("Component", {}).get("name", None) - ] - - return app_name, components - except Exception as e: - print(f"Error processing the data: {e}") - return None, [] - - -def remove_none_fields(data): - if isinstance(data, dict): - return { - k: remove_none_fields(v) - for k, v in data.items() - if v is not None - } - elif isinstance(data, list): - return [remove_none_fields(item) for item in data if item is not None] - else: - return data - - -def validate_yaml(json_data): - try: - validate(instance=json_data, schema=schema) - return None - except ValidationError as e: - return e.message - except Exception as e: - return str(e) - - -router = APIRouter() - -# Connect to Redis using RedisManager -r = rm.RedisManager() -r.connect() - -# Global variable to store the last connection time -last_connection_time = None - -"----------------------------------------------------------------------------------------" -" DEPLOY ML MODEL " -"----------------------------------------------------------------------------------------" - - -@router.post("/deploy_ml", tags=["ML-models"]) -async def deploy_ml(payload: RootModel): - # Convert Pydantic object to dict - - parsed_data = payload.dict(by_alias=True) - parsed_data = remove_none_fields(parsed_data) - - # Validate YAML structure - validation_error = validate_yaml(parsed_data) - - try: - internal_uid = parsed_data["MLSysOpsApplication"]["mlsysops-id"] - except KeyError: - print("The mlsysops-id is not specified in the model description") - - if validation_error is None and internal_uid != "0": - - try: - r.push("ml_deployment_queue", json.dumps(parsed_data)) - timestamp = datetime.now() - info = { - 'status': 'pending', - 'timestamp': str(timestamp) - } - r.update_dict_value('endpoint_hash', internal_uid, str(info)) - return {"status": "success", "message": "Deployment request added to queue"} - except Exception as e: - print(f"Error checking the app in Redis: {e}") - raise HTTPException(status_code=500, detail=str(e)) - else: - raise HTTPException(status_code=400, detail=validation_error) - - -"----------------------------------------------------------------------------------------" -" LIST ALL APPS " -"----------------------------------------------------------------------------------------" - - -@router.get("/list_all/", tags=["ML-models"]) -async def list_all(): - """ - Endpoint to return the current Redis dictionary values. - """ - try: - redis_data = r.get_dict('endpoint_hash') - if redis_data is None: - return {"status": "No data in the system."} - return {"System_status": redis_data} - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error retrieving ml model status: {e}") - - -"----------------------------------------------------------------------------------------" -" ML DEPLOYMENT STATUS " -"----------------------------------------------------------------------------------------" - - -@router.get("/status/{model_uid}", tags=["ML-models"]) -async def get_ml_status(model_uid: str): - """ - Endpoint to check the status of a specific application based on its app_id. - - Args: - app_id (str): The application ID to look up in Redis. - - Returns: - dict: The status of the application, or an error message if not found. 
- """ - try: - # Fetch the value of the given app_id from Redis - app_status = r.get_dict_value('endpoint_hash', model_uid) - print(app_status) - if app_status is None: - # If the app_id doesn't exist in Redis, return a 404 error - raise HTTPException(status_code=404, detail=f"Model ID '{model_uid}' not found in the system.") - - # Return the app status - return {"Model ": model_uid, "status": app_status} - - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error retrieving status for app_id '{model_uid}': {e}") - - -"----------------------------------------------------------------------------------------" -" REMOVE APP " -"----------------------------------------------------------------------------------------" - - -@router.delete("/remove/{model_uid}", tags=["ML-models"]) -async def remove_ml_model(model_uid: str): - """ - Endpoint to update the status of an application to 'removed' based on its app_id. - - Args: - app_id (str): The application ID to look up in Redis. - - Returns: - dict: A success message if the application was updated to 'removed', - or an error message if the app_id was not found. - :param model_uid: - """ - try: - # Check if the app_id exists in the Redis dictionary - app_status = r.get_dict_value('endpoint_hash', model_uid) - - if app_status is None: - # If the app_id doesn't exist, return a 404 error - raise HTTPException(status_code=404, detail=f"App ID '{model_uid}' not found in the system.") - - # If the app_id exists, update the status to 'removed' - r.update_dict_value('endpoint_hash', model_uid, "To_be_removed") - delete_msg = str({model_uid: "delete"}) - r.push("ml_deployment_queue", delete_msg) - return {"model_uid": model_uid, "message": "Application status updated to 'To_be_removed'."} - - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error updating status for app_id '{model_uid}': {e}") +from datetime import datetime + +import yaml +from fastapi import APIRouter, HTTPException, Request, UploadFile, File +import json +from fastapi.encoders import jsonable_encoder +from mlsysops import logger +from redis_setup import redis_mgt as rm +from jsonschema import validate, ValidationError +import requests +from schemas.mlsysops_application import MlsysopsappSchema, Component +import os +import subprocess +from kubernetes import client, utils, config +from kubernetes.utils.create_from_yaml import FailToCreateError +from kubernetes.client.rest import ApiException + +from typing import Annotated, List, Optional, Dict, Any +from pydantic import BaseModel, Field +from starlette import status +# JSON schema with enum validation for the city + + +os.environ["LOCAL_OTEL_ENDPOINT"] = "http://172.25.27.4:9464/metrics" +os.environ["TELEMETRY_ENDPOINT"] = "172.25.27.4:4317" + +karmada_api_kubeconfig = "/home/runner/karmada_management/karmada-api.kubeconfig" +uth_dev_kubeconfig = "/home/runner/karmada_management/uth-dev.kubeconfig" +uth_prod_kubeconfig = "/home/runner/karmada_management/uth-prod.kubeconfig" +kubeconfigs = [ + os.getenv("KARMADA_KUBECONFIG", "/root/.kube/karmada-api.kubeconfig"), + os.getenv("UTH_DEV_KUBECONFIG", "/root/.kube/uth-dev.kubeconfig"), + os.getenv("UTH_PROD_KUBECONFIG", "/root/.kube/uth-prod.kubeconfig"), +] + + + +def get_pods_from_kubeconfigs(): + """ + Retrieve all pods from multiple clusters using specified kubeconfig files. + + Returns: + list: A list of dictionaries containing pod details (name, status, node, cluster). 
+ """ + pod_details = [] + + for kubeconfig in kubeconfigs: + try: + # Retrieve contexts for the given kubeconfig + result = subprocess.run( + ["kubectl", "config", "get-contexts", "-o", "name", "--kubeconfig", kubeconfig], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + if result.returncode != 0: + pod_details.append({"error": f"Error fetching contexts from {kubeconfig}: {result.stderr.strip()}"}) + continue + + contexts = result.stdout.strip().split("\n") + + # Fetch pod information for each context + for context_name in contexts: + try: + pod_result = subprocess.run( + [ + "kubectl", + "get", + "pods", + "--context", + context_name, + "--kubeconfig", + kubeconfig, + "-o", + "custom-columns=NAME:.metadata.name,STATUS:.status.phase,NODE:.spec.nodeName", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + if pod_result.returncode != 0: + pod_details.append({ + "error": f"Error fetching pods for context {context_name} ({kubeconfig}): {pod_result.stderr.strip()}"}) + continue + + # Parse the output and collect pod details + lines = pod_result.stdout.strip().split("\n") + if len(lines) > 1: # Skip header row + for line in lines[1:]: + parts = line.split() + if len(parts) >= 3: # Ensure there are enough columns + name, status, node = parts + pod_details.append({ + "pod_name": name, + "pod_status": status, + "node_name": node, + "cluster_name": context_name, + "kubeconfig": kubeconfig, + }) + + except Exception as e: + pod_details.append( + {"error": f"Error while listing pods for context {context_name} ({kubeconfig}): {str(e)}"}) + + except Exception as e: + pod_details.append({"error": f"Error while retrieving contexts from {kubeconfig}: {str(e)}"}) + + return pod_details + + +def get_yaml_info(data): + """ + Extracts the application name and components from a dictionary. + + Parameters: + data (dict): Dictionary containing the application data. + + Returns: + tuple: A tuple with the application name and a list of component names. 
+ """ + try: + # Extract the application name + app_name = data.get("MLSysOpsApplication", {}).get("name", None) + + # Extract the component names + components = [ + component.get("Component", {}).get("name", None) + for component in data.get("MLSysOpsApplication", {}).get("components", []) + if component.get("Component", {}).get("name", None) + ] + + return app_name, components + except Exception as e: + print(f"Error processing the data: {e}") + return None, [] + + +def remove_none_fields(data): + if isinstance(data, dict): + return { + k: remove_none_fields(v) + for k, v in data.items() + if v is not None + } + elif isinstance(data, list): + return [remove_none_fields(item) for item in data if item is not None] + else: + return data + + + + +router = APIRouter() + +# Connect to Redis using RedisManager +r = rm.RedisManager() +r.connect() + +# Global variable to store the last connection time +last_connection_time = None + +"----------------------------------------------------------------------------------------" +" DEPLOY ML MODEL " +"----------------------------------------------------------------------------------------" + + +@router.post("/deploy_ml", tags=["ML-models"]) +async def deploy_ml(request: Request, payload: MlsysopsappSchema): + # Convert Pydantic object to dict + + redis_mgr: rm.RedisManager = request.app.state.redis + try: + app_id = payload.MLSysOpsApp.name + + components: List[Component] = payload.MLSysOpsApp.components or [] + comp_names = [comp.metadata.name for comp in components] + logger.debug("Deploying app_id=%s with components=%s", app_id, comp_names) + + except Exception as exc: + logger.error("Error converting payload to dict: %s", exc) + + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid payload format: {exc}" + ) + + try: + encoded = jsonable_encoder(payload) + encoded_clean = _remove_none_fields(encoded) + payload_json = json.dumps(encoded_clean) + + redis_mgr.push("ml_deployment_queue", payload_json) + timestamp = datetime.now() + info = { + 'status': 'pending', + 'timestamp': str(timestamp) + } + + redis_mgr.update_dict_value("app_data_hash", app_id, str(info)) + + + except Exception as exc: + logger.error("Error storing application in Redis: %s", exc) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Redis store error: {exc}" + ) + + return {"app_id": app_id, "status": "ML model queued successfully"} + + +"----------------------------------------------------------------------------------------" +" LIST ALL APPS " +"----------------------------------------------------------------------------------------" + + +@router.get("/list_all/", tags=["ML-models"]) +async def list_all(): + """ + Endpoint to return the current Redis dictionary values. + """ + try: + redis_data = r.get_dict('endpoint_hash') + if redis_data is None: + return {"status": "No data in the system."} + return {"System_status": redis_data} + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error retrieving ml model status: {e}") + + +"----------------------------------------------------------------------------------------" +" ML DEPLOYMENT STATUS " +"----------------------------------------------------------------------------------------" + + +@router.get("/status/{model_uid}", tags=["ML-models"]) +async def get_ml_status(model_uid: str): + """ + Endpoint to check the status of a specific application based on its app_id. + + Args: + app_id (str): The application ID to look up in Redis. 
+ + Returns: + dict: The status of the application, or an error message if not found. + """ + try: + # Fetch the value of the given app_id from Redis + app_status = r.get_dict_value('endpoint_hash', model_uid) + print(app_status) + if app_status is None: + # If the app_id doesn't exist in Redis, return a 404 error + raise HTTPException(status_code=404, detail=f"Model ID '{model_uid}' not found in the system.") + + # Return the app status + return {"Model ": model_uid, "status": app_status} + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error retrieving status for app_id '{model_uid}': {e}") + + +"----------------------------------------------------------------------------------------" +" REMOVE APP " +"----------------------------------------------------------------------------------------" + + +@router.delete("/remove/{model_uid}", tags=["ML-models"]) +async def remove_ml_model(model_uid: str): + """ + Endpoint to update the status of an application to 'removed' based on its app_id. + + Args: + app_id (str): The application ID to look up in Redis. + + Returns: + dict: A success message if the application was updated to 'removed', + or an error message if the app_id was not found. + :param model_uid: + """ + try: + # Check if the app_id exists in the Redis dictionary + app_status = r.get_dict_value('endpoint_hash', model_uid) + + if app_status is None: + # If the app_id doesn't exist, return a 404 error + raise HTTPException(status_code=404, detail=f"App ID '{model_uid}' not found in the system.") + + # If the app_id exists, update the status to 'removed' + r.update_dict_value('endpoint_hash', model_uid, "To_be_removed") + delete_msg = str({model_uid: "delete"}) + r.push("ml_deployment_queue", delete_msg) + return {"model_uid": model_uid, "message": "Application status updated to 'To_be_removed'."} + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error updating status for app_id '{model_uid}': {e}") + +def _remove_none_fields(obj: Any) -> Any: + """ + Recursively drop keys/values that are None in dicts or None items in lists. + """ + if isinstance(obj, dict): + return {k: _remove_none_fields(v) for k, v in obj.items() if v is not None} + if isinstance(obj, list): + return [_remove_none_fields(item) for item in obj if item is not None] + return obj diff --git a/northbound-api/generate_model.py b/northbound-api/generate_model.py new file mode 100644 index 0000000..40d13b4 --- /dev/null +++ b/northbound-api/generate_model.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import json +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Optional + +import yaml + +try: + # Python 3.9+ + from importlib.resources import files as res_files +except ImportError: # pragma: no cover + # Python 3.8 fallback + from importlib_resources import files as res_files # type: ignore + + +CRD_PACKAGE = "mlsysops.crds" +CRD_FILENAME = "MLSysOpsApplication.yaml" +CRD_FILENAME_CLUSTER = "MLSysOpsCluster.yaml" + + +def _load_crd_yaml_from_package() -> dict: + """ + Load CRD YAML from python package resources: mlsysops.crds/MLSysOpsApplication.yaml + Returns parsed YAML as dict. 
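The same resource-loading pattern would work for the cluster CRD that ships in the package (CRD_FILENAME_CLUSTER is declared above but not yet used by this script); a hypothetical sketch:

    resource = res_files(CRD_PACKAGE).joinpath(CRD_FILENAME_CLUSTER)
    if resource.is_file():
        with resource.open("r", encoding="utf-8") as f:
            cluster_crd = yaml.safe_load(f)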
+ """ + resource = res_files(CRD_PACKAGE).joinpath(CRD_FILENAME) + if not resource.is_file(): + raise FileNotFoundError(f"CRD resource not found: {CRD_PACKAGE}/{CRD_FILENAME}") + + with resource.open("r", encoding="utf-8") as f: + return yaml.safe_load(f) + + +def _convert_crd_dict_to_jsonschema(crd: dict) -> tuple[str, dict]: + """ + Convert CRD dict to a JSON Schema dict, wrapped under a root key based on spec.names.kind. + Returns (root_key, full_schema_dict). + """ + # Extract kind from spec.names.kind + root_key: Optional[str] = None + if ( + isinstance(crd, dict) + and isinstance(crd.get("spec"), dict) + and isinstance(crd["spec"].get("names"), dict) + and isinstance(crd["spec"]["names"].get("kind"), str) + ): + root_key = crd["spec"]["names"]["kind"] + + if not root_key: + raise ValueError("Could not determine 'kind' from CRD → spec.names.kind.") + + # Find openAPIV3Schema under spec.versions[*].schema.openAPIV3Schema + openapi_schema = None + versions = crd.get("spec", {}).get("versions") + if isinstance(versions, list): + for version in versions: + if ( + isinstance(version, dict) + and isinstance(version.get("schema"), dict) + and "openAPIV3Schema" in version["schema"] + ): + openapi_schema = version["schema"]["openAPIV3Schema"] + break + + if openapi_schema is None: + raise ValueError("No valid 'openAPIV3Schema' found in the CRD under spec.versions[].schema.") + + # Build full JSON Schema + full_schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": f"{root_key} Schema", + "type": "object", + "properties": { + root_key: openapi_schema + }, + "required": [root_key], + } + return root_key, full_schema + + +def _run_datamodel_codegen(json_schema_file: str, output_model_file: str) -> None: + """ + Invoke datamodel-codegen to turn the JSON Schema into a Pydantic model. + """ + cmd = [ + "datamodel-codegen", + "--input", json_schema_file, + "--input-file-type", "jsonschema", + "--output", output_model_file, + ] + try: + subprocess.check_call(cmd) + except FileNotFoundError as e: + raise RuntimeError( + "'datamodel-codegen' not found. Install it with: pip install datamodel-code-generator" + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError(f"datamodel-codegen failed: {e}") from e + + +def generate_pydantic_schemas( + schemas_dir: str | os.PathLike = "schemas", + output_filename: str = "mlsysops_application.py", +) -> Path: + """ + Public method to be called from your bootstrap/main. + + - Loads CRD from mlsysops.crds.MLSysOpsApplication.yaml + - Generates JSON Schema (temp file) + - Runs datamodel-codegen + - Writes model into `schemas_dir/output_filename` + - Ensures schemas_dir exists + Returns the Path of the generated file. 
+ """ + crd = _load_crd_yaml_from_package() + _, schema = _convert_crd_dict_to_jsonschema(crd) + + schemas_path = Path(schemas_dir) + schemas_path.mkdir(parents=True, exist_ok=True) + + out_file = schemas_path / output_filename + + with tempfile.NamedTemporaryFile(suffix="_schema.json", delete=False) as tmp: + json_schema_path = tmp.name + + try: + with open(json_schema_path, "w", encoding="utf-8") as f: + json.dump(schema, f, indent=2) + + _run_datamodel_codegen(json_schema_path, str(out_file)) + return out_file + finally: + try: + os.remove(json_schema_path) + except OSError: + pass diff --git a/northbound-api/main.py b/northbound-api/main.py index dd23318..2ad649f 100644 --- a/northbound-api/main.py +++ b/northbound-api/main.py @@ -1,22 +1,33 @@ -import uvicorn -from fastapi import FastAPI -from endpoints import applications, infrastructure, management, ml_models -from redis_setup import redis_mgt as rm - -app = FastAPI(title="MLSysOps NorthBound API", - description="Is API that the application owners and infrastructure administrators can use to interact " - "with the MLSysOps platform.", - version="1.0") - -# 1) Create & connect your Redis client exactly once: -redis_client = rm.RedisManager() -redis_client.connect() - -# 2) Attach it to `app.state` so that any route handler (or router) can fetch it later: -app.state.redis = redis_client - -# Register each router with a prefix to organize routes -app.include_router(applications.router, prefix="/apps") -app.include_router(ml_models.router, prefix="/ml") -app.include_router(infrastructure.router, prefix="/infra") -app.include_router(management.router, prefix="/manage") +import uvicorn +from fastapi import FastAPI + + +# generate MLSysOps CRD model +from generate_model import generate_pydantic_schemas + +print("Generating MLSysOps CRD model...") +generated = generate_pydantic_schemas(schemas_dir="schemas") +print(f"Generated Pydantic models at: {generated}") + + + + +from endpoints import applications, management, ml_models +from redis_setup import redis_mgt as rm + +app = FastAPI(title="MLSysOps NorthBound API", + description="Is API that the application owners and infrastructure administrators can use to interact " + "with the MLSysOps platform.", + version="1.0") + +# 1) Create & connect your Redis client exactly once: +redis_client = rm.RedisManager() +redis_client.connect() + +# 2) Attach it to `app.state` so that any route handler (or router) can fetch it later: +app.state.redis = redis_client + +# Register each router with a prefix to organize routes +app.include_router(applications.router, prefix="/apps") +app.include_router(ml_models.router, prefix="/ml") +app.include_router(management.router, prefix="/manage") diff --git a/northbound-api/mlsysops-test-app-description.yaml b/northbound-api/mlsysops-test-app-description.yaml deleted file mode 100644 index ea08a13..0000000 --- a/northbound-api/mlsysops-test-app-description.yaml +++ /dev/null @@ -1,84 +0,0 @@ -MLSysOpsApp: - name: test-application - cluster_placement: - cluster_id: - - mls04 - components: - - metadata: - name: server-app - uid: server-app-v1 - node_placement: - node: mls04 - restart_policy: on_failure - containers: - - image: harbor.nbfc.io/mlsysops/test-app:latest - image_pull_policy: IfNotPresent - command: ["python", "TcpServer.py"] - env: - - name: OTEL_RESOURCE_ATTRIBUTES - value: "service.name=server-app, service.version=0.0.0" - - name: OTEL_SERVICE_NAME - value: "server-app" - - name: NODE_IP - value_from: - field_ref: - field_path: status.hostIP - - name: 
TELEMETRY_ENDPOINT - value: "$(NODE_IP):43170" - - name: TCP_SERVER_IP - value: "0.0.0.0" - ports: - - container_port: 10000 - protocol: TCP - qos_metrics: - - application_metric_id: test_received_success_counter_total - target: 20 - relation: lower_or_equal - - metadata: - name: client-app - uid: client-app-v1 - restart_policy: on_failure - containers: - - image: harbor.nbfc.io/mlsysops/test-app:latest - image_pull_policy: IfNotPresent - command: ["python", "TcpClient.py"] - env: - - name: OTEL_RESOURCE_ATTRIBUTES - value: "service.name=client-app, service.version=0.0.0" - - name: OTEL_SERVICE_NAME - value: "client-app" - - name: NODE_IP - value_from: - field_ref: - field_path: status.hostIP - - name: TELEMETRY_ENDPOINT - value: "$(NODE_IP):43170" - - name: TCP_SERVER_IP - value: "server-app" - ports: - - container_port: 10000 - protocol: TCP - qos_metrics: - - application_metric_id: metric-2 - target: 30 - relation: equal - component_interactions: - - component_name1: client-app - type: egress - component_name2: server-app - permitted_actions: - - component_relocation - - traffic_redirection - - change_container_image - - change_container_runtime_class - - change_container_cpu_set - - change_container_resource_requirements - - acceleration - global_satisfaction: - threshold: 0.7 - relation: greater_than - achievement_weights: - - metric_id: test_received_success_counter - weight: 0.5 - - metric_id: test_sent_success_counter - weight: 0.5 diff --git a/northbound-api/redis_setup/testredis.py b/northbound-api/redis_setup/testredis.py index 018970b..f03f097 100644 --- a/northbound-api/redis_setup/testredis.py +++ b/northbound-api/redis_setup/testredis.py @@ -1,5 +1,5 @@ -import redis_mgt as rm - -r = rm.RedisManager() -r.connect() - +import redis_mgt as rm + +r = rm.RedisManager() +r.connect() + diff --git a/northbound-api/requirements.txt b/northbound-api/requirements.txt index 2b19df9..b217cd7 100644 --- a/northbound-api/requirements.txt +++ b/northbound-api/requirements.txt @@ -36,3 +36,5 @@ urllib3==2.2.3 uvicorn==0.32.0 kubernetes~=32.0.0 python-multipart +datamodel-code-generator +attrs diff --git a/northbound-api/schemas/mlsysops_application.py b/northbound-api/schemas/mlsysops_application.py new file mode 100644 index 0000000..63bc2e9 --- /dev/null +++ b/northbound-api/schemas/mlsysops_application.py @@ -0,0 +1,312 @@ +# generated by datamodel-codegen: +# filename: tmpy427nwqa_schema.json +# timestamp: 2025-12-17T13:09:15+00:00 + +from __future__ import annotations + +from enum import Enum +from typing import List, Optional + +from pydantic import BaseModel, Field + + +class ClusterPlacement(BaseModel): + cluster_id: Optional[List[str]] = Field( + None, description='Array of clusters that can host the application.' + ) + + +class Metadata(BaseModel): + name: Optional[str] = Field(None, description='The unique name of the component') + uid: Optional[str] = Field( + None, + description='The unique identifier of the component (not given by app provider).', + ) + + +class ContinuumLayerEnum(Enum): + cloud = 'cloud' + far_edge = 'far_edge' + edge_infrastructure = 'edge_infrastructure' + edge = 'edge' + field_ = '*' + + +class NodePlacement(BaseModel): + continuum_layer: Optional[List[ContinuumLayerEnum]] = None + mobile: Optional[bool] = Field( + None, + description='Specify if the component needs to be deployed on a mobile node (optional)', + ) + labels: Optional[List[str]] = Field( + None, description='The required labels for filtering.' 
+ ) + node: Optional[str] = Field( + None, + description='The required node name to be the host of the component (optional).', + ) + + +class Model(Enum): + d455 = 'd455' + imx477 = 'imx477' + picamera_v2 = 'picamera-v2' + + +class CameraType(Enum): + rgb = 'rgb' + nir = 'nir' + thermal = 'thermal' + monocular = 'monocular' + + +class Resolution(Enum): + field_1024x768 = '1024x768' + field_4056x3040 = '4056x3040' + + +class Camera(BaseModel): + model: Optional[Model] = Field( + None, description='The model name of the camera sensor' + ) + camera_type: Optional[CameraType] = Field( + None, description='The camera sensor type.' + ) + minimum_framerate: Optional[int] = None + resolution: Optional[Resolution] = None + + +class Model1(Enum): + sdc30 = 'sdc30' + ds18b20 = 'ds18b20' + + +class Temperature(BaseModel): + model: Optional[Model1] = Field( + None, description='The model name of the temperature sensor' + ) + + +class Sensor(BaseModel): + camera: Optional[Camera] = None + temperature: Optional[Temperature] = None + + +class Relation(Enum): + lower_or_equal = 'lower_or_equal' + greater_or_equal = 'greater_or_equal' + equal = 'equal' + lower_than = 'lower_than' + greater_than = 'greater_than' + + +class QosMetric(BaseModel): + application_metric_id: Optional[str] = Field(None, description='App metric id.') + target: Optional[float] = None + relation: Optional[Relation] = None + + +class RuntimeClassName(Enum): + nvidia = 'nvidia' + default = 'default' + kata_fc = 'kata-fc' + kata_dragon = 'kata-dragon' + urunc = 'urunc' + crun = 'crun' + lunatic = 'lunatic' + nvidia_experimental = 'nvidia-experimental' + spin = 'spin' + wasmedge = 'wasmedge' + slight = 'slight' + runc = 'runc' + + +class RestartPolicy(Enum): + Always = 'Always' + OnFailure = 'OnFailure' + Never = 'Never' + + +class Os(Enum): + ubuntu = 'ubuntu' + kali = 'kali' + zephyr = 'zephyr' + + +class NodeType(Enum): + virtualized = 'virtualized' + native = 'native' + bare_metal = 'bare_metal' + + +class ContainerRuntime(Enum): + containerd = 'containerd' + docker = 'docker' + embserve = 'embserve' + kata = 'kata' + kata_qemu = 'kata-qemu' + kata_clh = 'kata-clh' + kata_fc = 'kata-fc' + urunc = 'urunc' + nvidia = 'nvidia' + + +class ImagePullPolicy(Enum): + Always = 'Always' + Never = 'Never' + IfNotPresent = 'IfNotPresent' + + +class ArchitectureEnum(Enum): + arm64 = 'arm64' + amd64 = 'amd64' + arm_v7 = 'arm-v7' + arm_v8 = 'arm-v8' + + +class Cpu(BaseModel): + requests: Optional[str] = None + limits: Optional[str] = None + architecture: Optional[List[ArchitectureEnum]] = None + frequency: Optional[float] = Field(None, description='required frequency in Hz.') + performance_indicator: Optional[float] = Field( + None, + description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', + ) + + +class Memory(BaseModel): + requests: Optional[str] = None + limits: Optional[str] = None + + +class Model2(Enum): + k80 = 'k80' + k40 = 'k40' + + +class Gpu(BaseModel): + model: Optional[Model2] = None + memory: Optional[str] = None + performance_indicator: Optional[float] = Field( + None, + description='This field assists MLSysOps with an initial hint in order to filter out nodes based on their performance capabilities.', + ) + + +class PlatformRequirements(BaseModel): + cpu: Optional[Cpu] = None + memory: Optional[Memory] = None + disk: Optional[str] = Field(None, description='required Disk space (in GB).') + gpu: Optional[Gpu] = None + + +class Protocol(Enum): + UDP = 'UDP' + 
TCP = 'TCP'
+    SCTP = 'SCTP'
+
+
+class Port(BaseModel):
+    container_port: Optional[int] = Field(
+        None,
+        description="Number of port to expose on the component's IP address. This must be a valid port number, 0 < x < 65536.",
+    )
+    protocol: Optional[Protocol] = Field(
+        None, description='Protocol for port. Defaults to "TCP".'
+    )
+
+
+class FieldRef(BaseModel):
+    field_path: Optional[str] = None
+
+
+class ValueFrom(BaseModel):
+    field_ref: Optional[FieldRef] = None
+
+
+class EnvItem(BaseModel):
+    name: Optional[str] = Field(None, description='Name of the environment variable.')
+    value_from: Optional[ValueFrom] = None
+    value: Optional[str] = Field(None, description='Value of the environment variable.')
+
+
+class Container(BaseModel):
+    image: Optional[str] = Field(None, description='The name of the container image.')
+    command: Optional[List[str]] = None
+    image_pull_policy: Optional[ImagePullPolicy] = Field(
+        None,
+        description='Image pull policy. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise.',
+    )
+    platform_requirements: Optional[PlatformRequirements] = Field(
+        None, description='The resource requirements of the container.'
+    )
+    ports: Optional[List[Port]] = Field(
+        None, description='Ports to expose from the container.'
+    )
+    env: Optional[List[EnvItem]] = Field(
+        None, description='Environment variables for the container.'
+    )
+
+
+class Component(BaseModel):
+    metadata: Metadata
+    node_placement: Optional[NodePlacement] = None
+    sensors: Optional[List[Sensor]] = None
+    qos_metrics: Optional[List[QosMetric]] = None
+    host_network: Optional[bool] = Field(
+        None,
+        description="Host networking requested for this component. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false.",
+    )
+    runtime_class_name: Optional[RuntimeClassName] = None
+    restart_policy: Optional[RestartPolicy] = Field(
+        None, description='Restart policy for the container. Default to Always.'
+    )
+    os: Optional[Os] = None
+    node_type: Optional[NodeType] = None
+    container_runtime: Optional[ContainerRuntime] = None
+    containers: List[Container]
+
+
+class Type(Enum):
+    ingress = 'ingress'
+    egress = 'egress'
+
+
+class ComponentInteraction(BaseModel):
+    component_name1: Optional[str] = Field(None, description='The "source" component.')
+    type: Optional[Type] = None
+    component_name2: Optional[str] = Field(
+        None, description='The "destination" component.'
+    )
+
+
+class Relation1(Enum):
+    greater_or_equal = 'greater_or_equal'
+    equal = 'equal'
+    greater_than = 'greater_than'
+
+
+class AchievementWeight(BaseModel):
+    metric_id: Optional[str] = None
+    weight: Optional[float] = None
+
+
+class GlobalSatisfaction(BaseModel):
+    threshold: Optional[float] = Field(
+        None, description='Happiness minimum required value (range (0-1])'
+    )
+    relation: Optional[Relation1] = None
+    achievement_weights: Optional[List[AchievementWeight]] = None
+
+
+class MLSysOpsApp(BaseModel):
+    name: Optional[str] = Field(None, description='The application name.')
+    cluster_placement: Optional[ClusterPlacement] = None
+    components: List[Component]
+    component_interactions: Optional[List[ComponentInteraction]] = None
+    global_satisfaction: Optional[GlobalSatisfaction] = None
+
+
+class MlsysopsappSchema(BaseModel):
+    MLSysOpsApp: MLSysOpsApp
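
The new ML-model endpoints in northbound-api/endpoints/ml_models.py can be exercised end to end with a short client script. This is a minimal sketch rather than part of the patch: it assumes the API is served locally (e.g. `uvicorn main:app --port 8000`), that the `requests` package is installed, and that Redis is reachable as configured in `redis_setup`; the payload is a hypothetical minimal application that satisfies the generated `MlsysopsappSchema` (only `components`, each with `metadata` and `containers`, is required).

    import requests

    BASE = "http://localhost:8000/ml"  # the /ml prefix registered in main.py

    # Hypothetical minimal MLSysOpsApp payload; every omitted field is Optional.
    payload = {
        "MLSysOpsApp": {
            "name": "demo-app",
            "components": [
                {
                    "metadata": {"name": "demo-component"},
                    "containers": [{"image": "harbor.nbfc.io/mlsysops/test-app:latest"}],
                }
            ],
        }
    }

    # Queue the deployment; the API pushes the None-stripped JSON onto "ml_deployment_queue".
    print(requests.post(f"{BASE}/deploy_ml", json=payload).json())

    # Check deployment status, then request removal.
    print(requests.get(f"{BASE}/status/demo-app").json())
    print(requests.delete(f"{BASE}/remove/demo-app").json())

Note that /deploy_ml records its 'pending' entry under the "app_data_hash" Redis key, while /status/{model_uid} and /remove/{model_uid} read "endpoint_hash"; the status call therefore returns 404 until whatever consumes "ml_deployment_queue" populates "endpoint_hash".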
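
The model generation added in northbound-api/generate_model.py can also be run on its own, e.g. when the packaged CRD changes and schemas/mlsysops_application.py needs refreshing without starting the API. A sketch, assuming the `mlsysops` package (which now ships its CRDs via the MANIFEST.in change) and `datamodel-code-generator` are installed:

    # Hypothetical one-off refresh script, run from northbound-api/
    from generate_model import generate_pydantic_schemas

    # Loads mlsysops/crds/MLSysOpsApplication.yaml, wraps its openAPIV3Schema
    # under the CRD kind, and emits a Pydantic model via datamodel-codegen.
    path = generate_pydantic_schemas(schemas_dir="schemas",
                                     output_filename="mlsysops_application.py")
    print(f"Model written to {path}")

Running the same call at import time in main.py keeps the endpoints' schema in lockstep with the packaged CRD; the trade-off is a slower cold start and a runtime dependency on the datamodel-codegen CLI inside the API image.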
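
Once generated, the model can validate an application description offline, before it is ever POSTed to /ml/deploy_ml. A sketch, assuming Pydantic v2 (on v1, `parse_obj()` replaces `model_validate()`) and a hypothetical description file path:

    import yaml
    from schemas.mlsysops_application import MlsysopsappSchema

    # app-description.yaml is a placeholder path for an MLSysOpsApp description.
    with open("app-description.yaml", "r", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # Raises pydantic.ValidationError if the description violates the CRD-derived schema.
    app = MlsysopsappSchema.model_validate(raw)
    print(app.MLSysOpsApp.name,
          [c.metadata.name for c in app.MLSysOpsApp.components])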