#!/usr/bin/env bash
#
# setup-amlfs-aks.sh
#
# Wires Azure Managed Lustre (AMLFS) into an existing AKS cluster with
# dynamic provisioning through the Azure Lustre CSI driver.
#
# IMPORTANT: AMLFS needs its own subnet, separate from the AKS node subnet;
# the script creates one automatically when it is missing. Sharing the node
# subnet makes provisioning extremely slow (90+ min instead of ~15 min).
#
# Prerequisites:
#   - Azure CLI (az) installed and logged in
#   - kubectl pointing at the target AKS cluster
#   - Enough subscription quota for AMLFS
#
# Usage:
#   export RESOURCE_GROUP="my-resource-group"
#   export CLUSTER_NAME="my-aks-cluster"
#   ./setup-amlfs-aks.sh
#
# Optional overrides (all read from the environment):
#   AMLFS_SKU            AMLFS SKU              (default: AMLFS-Durable-Premium-125)
#   AMLFS_STORAGE_SIZE   Storage size           (default: 4Ti)
#   AMLFS_ZONE           Availability zone      (default: 1)
#   AMLFS_SUBNET_NAME    Dedicated AMLFS subnet (default: amlfs-subnet)
#   AMLFS_SUBNET_PREFIX  Subnet CIDR            (default: auto-calculated)
#   PVC_NAME             PVC name               (default: shared-amlfs-storage)
#   STORAGECLASS_NAME    StorageClass name      (default: amlfs-lustre)

set -euo pipefail

# Hard requirements — abort immediately with a usage hint when unset/empty.
: "${RESOURCE_GROUP:?โŒ RESOURCE_GROUP must be set (e.g., export RESOURCE_GROUP=my-rg)}"
: "${CLUSTER_NAME:?โŒ CLUSTER_NAME must be set (e.g., export CLUSTER_NAME=my-aks)}"

# Tunables — keep any caller-provided value, otherwise fall back to a default.
AMLFS_SKU="${AMLFS_SKU:-AMLFS-Durable-Premium-125}"
AMLFS_STORAGE_SIZE="${AMLFS_STORAGE_SIZE:-4Ti}"
AMLFS_ZONE="${AMLFS_ZONE:-1}"
AMLFS_SUBNET_NAME="${AMLFS_SUBNET_NAME:-amlfs-subnet}"
# May legitimately stay empty — Step 4 then auto-calculates a free /24.
AMLFS_SUBNET_PREFIX="${AMLFS_SUBNET_PREFIX:-}"
PVC_NAME="${PVC_NAME:-shared-amlfs-storage}"
STORAGECLASS_NAME="${STORAGECLASS_NAME:-amlfs-lustre}"

# Banner: echo back the effective configuration before doing anything.
cat <<EOF
============================================
 AMLFS Setup for Existing AKS Cluster
============================================

Cluster: ${CLUSTER_NAME}
Resource Group: ${RESOURCE_GROUP}
AMLFS SKU: ${AMLFS_SKU}
Storage Size: ${AMLFS_STORAGE_SIZE}
Zone: ${AMLFS_ZONE}
EOF
echo "AMLFS Subnet: ${AMLFS_SUBNET_NAME}"
echo "PVC Name: ${PVC_NAME}"
echo ""

# ------------------------------------------------------------------
# Step 1: Register Microsoft.StorageCache resource provider
# ------------------------------------------------------------------
# AMLFS file systems live under the Microsoft.StorageCache namespace; the
# subscription must have that provider registered before any can be created.
echo "๐Ÿ“ฆ Step 1: Registering Microsoft.StorageCache resource provider..."

# The "NotRegistered" fallback covers both an unregistered provider and a
# failing 'az provider show' call (stderr is suppressed deliberately).
PROVIDER_STATE=$(az provider show -n Microsoft.StorageCache --query registrationState -o tsv 2>/dev/null || echo "NotRegistered")

if [[ "${PROVIDER_STATE}" == "Registered" ]]; then
  echo "โœ… Microsoft.StorageCache is already registered."
else
  az provider register --namespace Microsoft.StorageCache
  echo "โณ Waiting for Microsoft.StorageCache to be registered..."
  # Registration is asynchronous; poll until the state flips.
  # NOTE(review): no upper bound on this loop — consider adding a timeout.
  while true; do
    STATE=$(az provider show -n Microsoft.StorageCache --query registrationState -o tsv)
    if [[ "${STATE}" == "Registered" ]]; then
      echo "โœ… Microsoft.StorageCache registered successfully."
      break
    fi
    echo " Current state: ${STATE}. Waiting..."
    sleep 10
  done
fi

# ------------------------------------------------------------------
# Step 2: Install the Azure Lustre CSI driver
# ------------------------------------------------------------------
echo ""
echo "๐Ÿ”ง Step 2: Installing Azure Lustre CSI driver..."

# Presence of the controller deployment is the "already installed" probe.
if kubectl get -n kube-system deployment csi-azurelustre-controller >/dev/null 2>&1; then
  echo "โœ… Azure Lustre CSI driver is already installed, skipping."
else
  # NOTE(review): '-k' disables TLS certificate verification on a curl|bash
  # install — drop it (use 'curl -fsSL') so the installer cannot be tampered
  # with in transit.
  curl -skSL https://raw.githubusercontent.com/kubernetes-sigs/azurelustre-csi-driver/main/deploy/install-driver.sh | bash
  echo "โณ Waiting for CSI controller pods to be ready..."
  while true; do
    # NOTE(review): the '|| echo "0"' fallback only fires when kubectl itself
    # fails; a successful call can still print an empty string (field not yet
    # populated), which makes the numeric '-gt' test below raise an arithmetic
    # error under 'set -e'. Consider READY=${READY:-0} after each query.
    READY=$(kubectl get -n kube-system deployment csi-azurelustre-controller -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
    DESIRED=$(kubectl get -n kube-system deployment csi-azurelustre-controller -o jsonpath='{.status.replicas}' 2>/dev/null || echo "0")
    if [[ "${READY}" -gt 0 && "${READY}" == "${DESIRED}" ]]; then
      break
    fi
    echo " Controller pods: ${READY}/${DESIRED} ready..."
    sleep 5
  done
  echo "โณ Waiting for CSI node pods to be ready..."
  # Same readiness poll for the per-node daemonset (same empty-jsonpath caveat).
  while true; do
    READY=$(kubectl get -n kube-system daemonset csi-azurelustre-node -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0")
    DESIRED=$(kubectl get -n kube-system daemonset csi-azurelustre-node -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0")
    if [[ "${READY}" -gt 0 && "${READY}" == "${DESIRED}" ]]; then
      break
    fi
    echo " Node pods: ${READY}/${DESIRED} ready..."
    sleep 5
  done
  echo "โœ… Azure Lustre CSI driver installed successfully."
fi

# ------------------------------------------------------------------
# Step 3: Assign RBAC roles to kubelet identity
# ------------------------------------------------------------------
# The CSI driver acts through the cluster's kubelet managed identity, so it
# needs Contributor on the node resource group and Reader on the subscription.
echo ""
echo "๐Ÿ”‘ Step 3: Assigning RBAC roles to kubelet identity..."

SUBSCRIPTION_ID=$(az account show --query id -o tsv)
# Object ID of the kubelet managed identity.
OBJECT_ID=$(az aks show \
  --name "${CLUSTER_NAME}" \
  --resource-group "${RESOURCE_GROUP}" \
  --query identityProfile.kubeletidentity.objectId \
  -o tsv)
# AKS-managed resource group holding the node resources; Step 4 assumes the
# cluster VNet lives here too.
NODE_RESOURCE_GROUP=$(az aks show \
  --name "${CLUSTER_NAME}" \
  --resource-group "${RESOURCE_GROUP}" \
  --query nodeResourceGroup \
  -o tsv)

echo " Kubelet Identity: ${OBJECT_ID}"
echo " Node RG: ${NODE_RESOURCE_GROUP}"

# Assign Contributor on node resource group (skipped when already present,
# so re-runs are idempotent).
EXISTING_CONTRIBUTOR=$(az role assignment list \
  --assignee "${OBJECT_ID}" \
  --role "Contributor" \
  --scope "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${NODE_RESOURCE_GROUP}" \
  --query "[].roleDefinitionName" -o tsv)

if [[ -n "${EXISTING_CONTRIBUTOR}" ]]; then
  echo "โœ… Contributor role already assigned on node resource group."
else
  echo " Assigning Contributor role on node resource group..."
  az role assignment create \
    --assignee-object-id "${OBJECT_ID}" \
    --assignee-principal-type ServicePrincipal \
    --role "Contributor" \
    --scope "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${NODE_RESOURCE_GROUP}" >/dev/null
  echo "โœ… Contributor role assigned."
fi

# Assign Reader on subscription (same idempotent check-then-create pattern).
EXISTING_READER=$(az role assignment list \
  --assignee "${OBJECT_ID}" \
  --role "Reader" \
  --scope "/subscriptions/${SUBSCRIPTION_ID}" \
  --query "[].roleDefinitionName" -o tsv)

if [[ -n "${EXISTING_READER}" ]]; then
  echo "โœ… Reader role already assigned on subscription."
else
  echo " Assigning Reader role on subscription..."
  az role assignment create \
    --assignee-object-id "${OBJECT_ID}" \
    --assignee-principal-type ServicePrincipal \
    --role "Reader" \
    --scope "/subscriptions/${SUBSCRIPTION_ID}" >/dev/null
  echo "โœ… Reader role assigned."
fi

# ------------------------------------------------------------------
# Step 4: Create dedicated AMLFS subnet
# ------------------------------------------------------------------
echo ""
echo "๐ŸŒ Step 4: Creating dedicated subnet for AMLFS..."
echo " โš ๏ธ AMLFS must NOT share the AKS node subnet โ€” this causes"
echo " extremely slow provisioning (90+ min vs ~15 min with a dedicated subnet)."

# Discover the AKS VNet. Takes the FIRST VNet in the node resource group —
# fine for AKS-managed networking, wrong if the RG has several VNets.
# NOTE(review): 'jq' is required here but never checked for; consider a
# 'command -v jq' guard near the top of the script.
VNET_INFO=$(az network vnet list \
  --resource-group "${NODE_RESOURCE_GROUP}" \
  --query "[0].{name:name, prefixes:addressSpace.addressPrefixes[0]}" \
  -o json)
VNET_NAME=$(echo "${VNET_INFO}" | jq -r '.name')
echo " VNet: ${VNET_NAME}"

# Check if AMLFS subnet already exists (reuse makes re-runs idempotent).
if az network vnet subnet show \
  --resource-group "${NODE_RESOURCE_GROUP}" \
  --vnet-name "${VNET_NAME}" \
  --name "${AMLFS_SUBNET_NAME}" &>/dev/null; then
  echo "โœ… Subnet '${AMLFS_SUBNET_NAME}' already exists, reusing it."
else
  # Auto-calculate subnet prefix if not provided
  if [[ -z "${AMLFS_SUBNET_PREFIX}" ]]; then
    # Get existing subnets and find an unused /24 range
    # Start from the second octet block after the AKS subnet
    EXISTING_PREFIXES=$(az network vnet subnet list \
      --resource-group "${NODE_RESOURCE_GROUP}" \
      --vnet-name "${VNET_NAME}" \
      --query "[].addressPrefix" -o tsv)
    # Get the VNet base address (e.g., 10.224.0.0/12 -> 10)
    VNET_PREFIX=$(echo "${VNET_INFO}" | jq -r '.prefixes')
    VNET_BASE=$(echo "${VNET_PREFIX}" | cut -d'.' -f1)
    # Try to find an unused /24 in the VNet range.
    # NOTE(review): the 225..237 second-octet range assumes a
    # 10.224.0.0/12-style address space — other VNet layouts never match.
    # The grep is also an unanchored REGEX substring test (the '.' matches
    # any character); prefer 'grep -qF -- "${VNET_BASE}.${SECOND_OCTET}."'
    # to avoid false positives/negatives.
    for SECOND_OCTET in $(seq 225 237); do
      CANDIDATE="${VNET_BASE}.${SECOND_OCTET}.0.0/24"
      if ! echo "${EXISTING_PREFIXES}" | grep -q "${VNET_BASE}.${SECOND_OCTET}"; then
        AMLFS_SUBNET_PREFIX="${CANDIDATE}"
        break
      fi
    done
    if [[ -z "${AMLFS_SUBNET_PREFIX}" ]]; then
      echo "โŒ Could not auto-calculate a free /24 subnet. Please set AMLFS_SUBNET_PREFIX manually."
      exit 1
    fi
  fi
  echo " Creating subnet '${AMLFS_SUBNET_NAME}' with prefix '${AMLFS_SUBNET_PREFIX}'..."
  az network vnet subnet create \
    --resource-group "${NODE_RESOURCE_GROUP}" \
    --vnet-name "${VNET_NAME}" \
    --name "${AMLFS_SUBNET_NAME}" \
    --address-prefixes "${AMLFS_SUBNET_PREFIX}" >/dev/null
  echo "โœ… Subnet '${AMLFS_SUBNET_NAME}' created (${AMLFS_SUBNET_PREFIX})."
fi

# ------------------------------------------------------------------
# Step 5: Deploy StorageClass and PVC with dedicated subnet
# ------------------------------------------------------------------
echo ""
echo "๐Ÿ“€ Step 5: Deploying StorageClass and PVC with dedicated AMLFS subnet..."

# NOTE(review): the text from here down to the PVC_STATUS check is corrupted —
# the here-doc carrying the StorageClass/PVC manifest, plus the start of the
# Step 6 wait loop ('while true; do', the MAX_WAIT / SECONDS_WAITED
# initialisation, and the PVC_STATUS kubectl query), has been lost and its
# edges fused into the fragment below. Restore this section from version
# control; do not attempt to run the file as-is.
cat </dev/null || echo "Unknown")
if [[ "${PVC_STATUS}" == "Bound" ]]; then
  echo ""
  echo "โœ… PVC '${PVC_NAME}' is Bound!"
  kubectl get pvc "${PVC_NAME}"
  break
fi

# Check for non-timeout errors (ignore DeadlineExceeded as the driver retries)
LATEST_ERROR=$(kubectl describe pvc "${PVC_NAME}" 2>/dev/null | grep "ProvisioningFailed" | grep -v "DeadlineExceeded" | tail -1)
if [[ -n "${LATEST_ERROR}" ]]; then
  echo " โš ๏ธ Provisioning error: ${LATEST_ERROR}"
fi
# Give up after MAX_WAIT seconds but leave the PVC in place — the external
# provisioner keeps retrying on its own.
if [[ ${SECONDS_WAITED} -ge ${MAX_WAIT} ]]; then
  echo ""
  echo "โŒ Timed out after $((MAX_WAIT / 60)) minutes. PVC is still ${PVC_STATUS}."
  echo " The CSI driver will continue retrying in the background."
  echo " Monitor with: kubectl describe pvc ${PVC_NAME}"
  echo " Check Azure: az resource list --resource-group ${NODE_RESOURCE_GROUP} --resource-type Microsoft.StorageCache/amlFileSystems -o table"
  exit 1
fi
echo " [$(date +%H:%M:%S)] PVC status: ${PVC_STATUS} (${SECONDS_WAITED}s elapsed)"
sleep 30
SECONDS_WAITED=$((SECONDS_WAITED + 30))
done  # closes the Step 6 wait loop whose opener was lost in the corruption above

# ------------------------------------------------------------------
# Step 7: Verify with a test pod
# ------------------------------------------------------------------
echo ""
echo "๐Ÿงช Step 7: Deploying test pod to verify Lustre storage..."

# Remove any leftover pod from a previous run before re-creating it.
kubectl delete pod lustre-test --ignore-not-found=true --wait=false >/dev/null 2>&1 || true
sleep 2

# NOTE(review): this here-doc is also corrupted — its opener (presumably
# 'cat <<EOF | kubectl apply -f -') and the head of the pod manifest are
# missing; only the tail of the container command and the volume wiring
# survive. Restore from version control.
cat < /mnt/lustre/test.txt && cat /mnt/lustre/test.txt && df -h /mnt/lustre && echo 'SUCCESS' && sleep 30"]
    volumeMounts:
    - mountPath: /mnt/lustre
      name: lustre-data
  restartPolicy: Never
  volumes:
  - name: lustre-data
    persistentVolumeClaim:
      claimName: ${PVC_NAME}
EOF

echo "โณ Waiting for test pod to complete..."
# Best-effort wait: a fast-completing pod may never report Ready, hence '|| true'.
kubectl wait --for=condition=Ready pod/lustre-test --timeout=120s 2>/dev/null || true
sleep 5

echo ""
echo "๐Ÿ“‹ Test pod logs:"
# NOTE(review): under 'set -e' this aborts the whole script when the pod never
# started (no logs available); consider appending '|| true' with a warning.
kubectl logs lustre-test

echo ""
echo "๐Ÿงน Cleaning up test pod..."
kubectl delete pod lustre-test --wait=false >/dev/null 2>&1 || true

echo ""
echo "============================================"
echo " โœ… AMLFS Setup Complete!"
# Closing summary: list what was provisioned and show a ready-to-paste
# volume stanza for consuming the PVC from a workload spec.
cat <<EOF
============================================

Summary:
 - CSI Driver: azurelustre.csi.azure.com (installed)
 - StorageClass: ${STORAGECLASS_NAME}
 - PVC: ${PVC_NAME} (Bound)
 - Storage: ${AMLFS_STORAGE_SIZE} (${AMLFS_SKU})
 - AMLFS Subnet: ${AMLFS_SUBNET_NAME}

Usage in your deployments:
 volumes:
 - name: lustre-data
 persistentVolumeClaim:
 claimName: ${PVC_NAME}

EOF