- Prerequisites
- Environment Setup
- EKS Cluster Creation
- Neuron Node Group Setup
- Neuron Components Installation
- Deploy Simple Training Job
- Deploy Inference Workload
- Neuron Monitoring Setup
- Monitoring and Observability
- Troubleshooting
- Cleanup
- AWS CLI v2.x
- kubectl
- eksctl
- helm
- jq
- docker (for building custom containers)
Your AWS user/role needs permissions for:
- EKS cluster management
- EC2 instance management
- CloudFormation stack operations
- ECR repository access
- IAM role creation
- Inf1: inf1.xlarge, inf1.2xlarge, inf1.6xlarge, inf1.24xlarge
- Inf2: inf2.xlarge, inf2.8xlarge, inf2.24xlarge, inf2.48xlarge
- Trn1: trn1.2xlarge, trn1.32xlarge
- Trn2: trn2.4xlarge, trn2.8xlarge, trn2.48xlarge
# --- Environment setup -----------------------------------------------------
# Set your cluster and region
export CLUSTER_NAME="neuron-eks-cluster"
export REGION_CODE="us-west-2" # Choose your preferred region
# Declare and assign separately so a failed `aws` call is not masked (SC2155)
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export AWS_ACCOUNT_ID

# Verify AWS CLI is configured
aws sts get-caller-identity

# Install eksctl (if not already installed)
# NOTE: eksctl moved from the weaveworks org to eksctl-io; the old URL only
# works via a redirect.
curl --silent --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin

# Install kubectl (if not already installed)
curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.28.3/2023-11-14/bin/linux/amd64/kubectl
chmod +x ./kubectl
sudo mv ./kubectl /usr/local/bin

# Install helm (if not already installed)
# NOTE(review): piping a remote script to bash is unaudited execution —
# download and inspect the script first in security-sensitive environments.
curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

# Verify installations
eksctl version
kubectl version --client
helm version

# Create cluster configuration file
# Write the eksctl cluster config. The heredoc is unquoted so that
# ${CLUSTER_NAME}/${REGION_CODE} expand now; the YAML indentation below is
# required (the flat form is not valid YAML).
cat <<EOF > cluster-config.yaml
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${REGION_CODE}
  version: "1.28"
iam:
  withOIDC: true
managedNodeGroups:
  - name: worker-nodes
    instanceType: m5.large
    minSize: 1
    desiredCapacity: 2
    maxSize: 4
    privateNetworking: true
cloudWatch:
  clusterLogging:
    enableTypes: ["*"]
EOF

# Create the cluster (typically takes 15-20 minutes)
eksctl create cluster -f cluster-config.yaml

# Update kubeconfig
aws eks update-kubeconfig --region "${REGION_CODE}" --name "${CLUSTER_NAME}"

# Verify cluster is running
kubectl get nodes
kubectl get namespaces

# Create parameter extraction script
# Generate create_cfn_params.sh. The outer heredoc delimiter is quoted
# ('EOF') so the script body is written literally; its own $-expansions
# happen when the script runs.
cat <<'EOF' > create_cfn_params.sh
#!/bin/bash
# Extract the cluster security group and VPC id of the given EKS cluster
# and emit them as CloudFormation parameters (cfn_params.json).
# Usage: create_cfn_params.sh CLUSTER_NAME
set -euo pipefail
CLUSTER_NAME=$1
CLUSTER_SG=$(eksctl get cluster "$CLUSTER_NAME" -o json | jq -r '.[0].ResourcesVpcConfig.ClusterSecurityGroupId')
VPC_ID=$(eksctl get cluster "$CLUSTER_NAME" -o json | jq -r '.[0].ResourcesVpcConfig.VpcId')
cat <<EOL > cfn_params.json
[
  {
    "ParameterKey": "ClusterName",
    "ParameterValue": "$CLUSTER_NAME"
  },
  {
    "ParameterKey": "ClusterControlPlaneSecurityGroup",
    "ParameterValue": "$CLUSTER_SG"
  },
  {
    "ParameterKey": "VpcId",
    "ParameterValue": "$VPC_ID"
  }
]
EOL
echo "CloudFormation parameters created in cfn_params.json"
EOF
chmod +x create_cfn_params.sh
./create_cfn_params.sh "${CLUSTER_NAME}"

# Download the CloudFormation template
wget https://raw.githubusercontent.com/aws-neuron/aws-neuron-eks-samples/master/dp_bert_hf_pretrain/cfn/eks_trn1_ng_stack.yaml
# Create CloudFormation stack for Neuron nodes
aws cloudformation create-stack \
--stack-name eks-neuron-ng-stack \
--template-body file://eks_trn1_ng_stack.yaml \
--parameters file://cfn_params.json \
--capabilities CAPABILITY_IAM
# Wait for stack creation to complete
aws cloudformation wait stack-create-complete --stack-name eks-neuron-ng-stack
echo "CloudFormation stack created successfully"# Create node group yaml generation script
# Generate create_ng_yaml.sh (outer delimiter quoted -> literal body).
cat <<'EOF' > create_ng_yaml.sh
#!/bin/bash
# Build trn1_nodegroup.yaml from the launch template exported by the
# CloudFormation stack.
# Usage: create_ng_yaml.sh REGION AZ1 AZ2 CLUSTER_NAME STACK_NAME
set -euo pipefail
REGION_CODE=$1
EKSAZ1=$2
EKSAZ2=$3
CLUSTER_NAME=$4
STACKNAME=$5
LT_ID_TRN1=$(aws cloudformation describe-stacks --stack-name "$STACKNAME" \
  --query "Stacks[0].Outputs[?OutputKey=='LaunchTemplateIdTrn1'].OutputValue" \
  --output text)
cat <<EOL > trn1_nodegroup.yaml
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: $CLUSTER_NAME
  region: $REGION_CODE
  version: "1.28"
iam:
  withOIDC: true
availabilityZones: ["$EKSAZ1","$EKSAZ2"]
managedNodeGroups:
  - name: trn1-32xl-ng1
    launchTemplate:
      id: $LT_ID_TRN1
    minSize: 1
    desiredCapacity: 1
    maxSize: 1
    availabilityZones: ["$EKSAZ1"]
    privateNetworking: true
    efaEnabled: true
EOL
echo "Node group configuration created: trn1_nodegroup.yaml"
EOF
chmod +x create_ng_yaml.sh

# Get availability zones
ZONE1=$(aws ec2 describe-availability-zones --region "${REGION_CODE}" --query "AvailabilityZones[0].ZoneName" --output text)
ZONE2=$(aws ec2 describe-availability-zones --region "${REGION_CODE}" --query "AvailabilityZones[1].ZoneName" --output text)
echo "Using zones: $ZONE1, $ZONE2"

# Generate node group configuration
./create_ng_yaml.sh "${REGION_CODE}" "${ZONE1}" "${ZONE2}" "${CLUSTER_NAME}" eks-neuron-ng-stack

# Create the node group
eksctl create nodegroup -f trn1_nodegroup.yaml

# Verify nodes are ready
kubectl get nodes -o wide

# Install Neuron Device Plugin using Helm
# Install the Neuron Device Plugin (npd disabled here; it is configured with
# its IAM role in a later step)
helm upgrade --install neuron-helm-chart oci://public.ecr.aws/neuron/neuron-helm-chart \
  --set "npd.enabled=false"

# Verify device plugin is running
kubectl get ds neuron-device-plugin -n kube-system

# Check Neuron cores/devices are advertised as allocatable node resources
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,NeuronCore:.status.allocatable.aws\.amazon\.com/neuroncore"
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,NeuronDevice:.status.allocatable.aws\.amazon\.com/neuron"

# Install Neuron Scheduler Extension
helm upgrade --install neuron-helm-chart oci://public.ecr.aws/neuron/neuron-helm-chart \
  --set "scheduler.enabled=true" \
  --set "npd.enabled=false"

# Verify scheduler is running
kubectl get pods -n kube-system | grep scheduler
kubectl logs -n kube-system -l app=my-scheduler

# Create namespace for neuron health check
kubectl create ns neuron-healthcheck-system

# IAM policy for the node problem detector: mark ASG instances unhealthy,
# describe instances, and publish metrics restricted to the
# NeuronHealthCheck CloudWatch namespace.
cat <<EOF > npd-policy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": [
        "autoscaling:SetInstanceHealth",
        "autoscaling:DescribeAutoScalingInstances"
      ],
      "Effect": "Allow",
      "Resource": "*"
    },
    {
      "Action": [
        "ec2:DescribeInstances"
      ],
      "Effect": "Allow",
      "Resource": "*"
    },
    {
      "Action": [
        "cloudwatch:PutMetricData"
      ],
      "Effect": "Allow",
      "Resource": "*",
      "Condition": {
        "StringEquals": {
          "cloudwatch:Namespace": "NeuronHealthCheck"
        }
      }
    }
  ]
}
EOF

# Create the policy
aws iam create-policy \
  --policy-name NeuronProblemDetectorPolicy \
  --policy-document file://npd-policy.json

# Create service account bound to an IAM role (IRSA)
POLICY_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:policy/NeuronProblemDetectorPolicy"
eksctl create iamserviceaccount \
  --name node-problem-detector \
  --namespace neuron-healthcheck-system \
  --cluster "${CLUSTER_NAME}" \
  --attach-policy-arn "${POLICY_ARN}" \
  --approve \
  --role-name "neuron-problem-detector-role-${CLUSTER_NAME}" \
  --region "${REGION_CODE}" \
  --override-existing-serviceaccounts

# Install node problem detector with automatic node recovery
helm upgrade --install neuron-helm-chart oci://public.ecr.aws/neuron/neuron-helm-chart \
  --set "npd.nodeRecovery.enabled=true"

# Create MLP training job configuration
# Pod manifest for a single-node MLP training run on trn1.32xlarge.
# NOTE: the heredoc terminator must be alone on its line — the original
# 'EOF# Deploy...' never terminated the heredoc.
cat <<EOF > mlp_train_job.yaml
apiVersion: v1
kind: Pod
metadata:
  name: trn1-mlp-training
  labels:
    app: neuron-training
spec:
  restartPolicy: Never
  schedulerName: my-scheduler
  hostNetwork: true
  nodeSelector:
    # beta.kubernetes.io/instance-type is deprecated; use the GA label
    node.kubernetes.io/instance-type: trn1.32xlarge
  containers:
    - name: trn1-mlp
      # Replace with your ECR image URL
      image: 647554078242.dkr.ecr.us-east-1.amazonaws.com/neuron-training:latest
      command: ["/usr/local/bin/python3"]
      args: ["/opt/ml/mlp_train.py"]
      imagePullPolicy: IfNotPresent
      env:
        - name: NEURON_RT_LOG_LEVEL
          value: "INFO"
        - name: NEURON_FRAMEWORK_DEBUG
          value: "1"
      resources:
        limits:
          aws.amazon.com/neuron: 2
          memory: "8Gi"
          cpu: "4"
        requests:
          aws.amazon.com/neuron: 2
          memory: "4Gi"
          cpu: "2"
      volumeMounts:
        - name: neuron-logs
          mountPath: /tmp/neuron_logs
  volumes:
    - name: neuron-logs
      emptyDir: {}
EOF

# Deploy the training job
kubectl apply -f mlp_train_job.yaml

# Monitor job status
kubectl get pods -l app=neuron-training -w

# Check job logs
kubectl logs trn1-mlp-training -f

# Check if training completed successfully
kubectl logs trn1-mlp-training | grep "Final loss"

# Batch (Job) variant of the training workload.
# backoffLimit belongs at Job.spec level, not inside the pod template.
cat <<EOF > batch_training_job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: neuron-batch-training
spec:
  backoffLimit: 3
  template:
    metadata:
      labels:
        app: neuron-batch-training
    spec:
      restartPolicy: Never
      schedulerName: my-scheduler
      nodeSelector:
        # beta.kubernetes.io/instance-type is deprecated; use the GA label
        node.kubernetes.io/instance-type: trn1.32xlarge
      containers:
        - name: training-container
          image: 647554078242.dkr.ecr.us-east-1.amazonaws.com/neuron-training:latest
          command: ["/usr/local/bin/python3"]
          args: ["/opt/ml/batch_train.py"]
          env:
            - name: NEURON_RT_LOG_LEVEL
              value: "INFO"
          resources:
            limits:
              aws.amazon.com/neuron: 4
              memory: "16Gi"
            requests:
              aws.amazon.com/neuron: 4
              memory: "8Gi"
EOF

# Deploy batch job
kubectl apply -f batch_training_job.yaml

# Monitor batch job
kubectl get jobs
kubectl describe job neuron-batch-training

# Inference Deployment (2 replicas on inf2.xlarge) plus a LoadBalancer
# Service exposing port 80 -> container 8080.
cat <<EOF > inference_deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: neuron-inference
  labels:
    app: neuron-inference
spec:
  replicas: 2
  selector:
    matchLabels:
      app: neuron-inference
  template:
    metadata:
      labels:
        app: neuron-inference
    spec:
      schedulerName: my-scheduler
      nodeSelector:
        # beta.kubernetes.io/instance-type is deprecated; use the GA label
        node.kubernetes.io/instance-type: inf2.xlarge
      containers:
        - name: inference-server
          image: 647554078242.dkr.ecr.us-east-1.amazonaws.com/neuron-inference:latest
          ports:
            - containerPort: 8080
          resources:
            limits:
              aws.amazon.com/neuroncore: 2
              memory: "4Gi"
            requests:
              aws.amazon.com/neuroncore: 2
              memory: "2Gi"
          env:
            - name: NEURON_RT_LOG_LEVEL
              value: "INFO"
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 20
---
apiVersion: v1
kind: Service
metadata:
  name: neuron-inference-service
spec:
  selector:
    app: neuron-inference
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
  type: LoadBalancer
EOF

# Deploy inference workload
kubectl apply -f inference_deployment.yaml

# Check deployment status
kubectl get deployments
kubectl get pods -l app=neuron-inference
kubectl get services

# Get the external hostname of the load balancer (AWS ELBs expose a
# hostname, not an IP)
EXTERNAL_IP=$(kubectl get service neuron-inference-service -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
echo "Inference service available at: http://${EXTERNAL_IP}"

# Test the inference endpoint (once the load balancer is provisioned)
curl -X POST "http://${EXTERNAL_IP}/predict" \
  -H "Content-Type: application/json" \
  -d '{"input": "test data"}'

# Download neuron monitor configuration
wget https://awsdocs-neuron.readthedocs-hosted.com/en/latest/_downloads/c53c93938d5dacb9c2ce4358a92d0792/k8s-neuron-monitor-daemonset.yml -O k8s-neuron-monitor.yml

# Apply neuron monitor
kubectl apply -f k8s-neuron-monitor.yml

# Verify neuron monitor is running
kubectl get ds neuron-monitor --namespace neuron-monitor
kubectl get pods -n neuron-monitor

# Get neuron monitor pod names (space-separated; intentionally unquoted in
# the for-loop so the list word-splits into one pod per iteration)
MONITOR_PODS=$(kubectl get pods -n neuron-monitor -o jsonpath='{.items[*].metadata.name}')

# Check prometheus endpoint on each monitor pod
for pod in $MONITOR_PODS; do
  echo "Testing metrics endpoint for pod: $pod"
  kubectl exec -n neuron-monitor "$pod" -- wget -q --output-document - http://127.0.0.1:8000 | head -20
done

# Add Prometheus Helm repository
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

# Install Prometheus; disable the Helm-values-only ServiceMonitor selector
# so our hand-written ServiceMonitor below is picked up
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  --create-namespace \
  --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false

# Create ServiceMonitor for Neuron metrics
cat <<EOF > neuron-servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: neuron-monitor
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: neuron-monitor
  endpoints:
    - port: metrics
      interval: 30s
      path: /
EOF
kubectl apply -f neuron-servicemonitor.yaml

# Create monitoring script
# Generate monitor_neuron.sh: one-shot status overview of all
# Neuron-related cluster components.
cat <<'EOF' > monitor_neuron.sh
#!/bin/bash
echo "=== Neuron Device Status ==="
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,NeuronCore:.status.allocatable.aws\.amazon\.com/neuroncore,NeuronDevice:.status.allocatable.aws\.amazon\.com/neuron"
echo -e "\n=== Running Neuron Workloads ==="
kubectl get pods -A -o wide | grep -E "(neuron|inference|training)"
echo -e "\n=== Neuron Device Plugin Status ==="
kubectl get ds neuron-device-plugin -n kube-system
echo -e "\n=== Neuron Scheduler Status ==="
kubectl get pods -n kube-system -l app=my-scheduler
echo -e "\n=== Node Problem Detector Status ==="
kubectl get pods -n neuron-healthcheck-system
echo -e "\n=== Neuron Monitor Status ==="
kubectl get pods -n neuron-monitor
echo -e "\n=== Recent Events ==="
kubectl get events --sort-by=.metadata.creationTimestamp | tail -10
EOF
chmod +x monitor_neuron.sh
./monitor_neuron.sh

# Generate neuron_performance.sh: per-pod resource utilization snapshot.
cat <<'EOF' > neuron_performance.sh
#!/bin/bash
echo "=== Neuron Resource Utilization ==="
for pod in $(kubectl get pods -l app=neuron-inference -o jsonpath='{.items[*].metadata.name}'); do
  echo "Pod: $pod"
  kubectl top pod "$pod" --containers || echo "Metrics not available yet"
done
echo -e "\n=== Neuron Core Allocation ==="
kubectl get pods -o wide | grep -E "(inference|training)" | while read -r line; do
  pod_name=$(echo "$line" | awk '{print $1}')
  echo "Checking allocation for pod: $pod_name"
  kubectl describe pod "$pod_name" | grep -A 5 -B 5 neuron
done
EOF
chmod +x neuron_performance.sh
./neuron_performance.sh

# Monitor training logs
kubectl logs -f trn1-mlp-training

# Monitor inference logs
kubectl logs -f deployment/neuron-inference

# Monitor neuron device plugin logs
kubectl logs -n kube-system daemonset/neuron-device-plugin -f

# Monitor scheduler logs
kubectl logs -n kube-system -l app=my-scheduler -f

# Check if Neuron devices are detected
kubectl describe node | grep -A 10 -B 10 neuron

# Verify Neuron runtime (replace <pod-name> with a real pod name)
kubectl exec -it <pod-name> -- neuron-ls

# Check for resource allocation issues
kubectl describe pod <pod-name> | grep -A 20 Events

# Debug scheduler issues
kubectl get events --field-selector reason=FailedScheduling

# Check node conditions
kubectl get nodes -o wide
kubectl describe node <node-name> | grep Conditions -A 10

# Generate diagnose_neuron.sh: comprehensive health snapshot of the
# cluster and all Neuron components.
cat <<'EOF' > diagnose_neuron.sh
#!/bin/bash
echo "=== Cluster Health Check ==="
kubectl get nodes
kubectl get pods --all-namespaces | grep -E "(Error|CrashLoopBackOff|Evicted)"
echo -e "\n=== Neuron Component Health ==="
echo "Device Plugin:"
kubectl get ds neuron-device-plugin -n kube-system
echo "Scheduler:"
kubectl get pods -n kube-system -l app=my-scheduler
echo "Monitor:"
kubectl get pods -n neuron-monitor
echo -e "\n=== Resource Availability ==="
kubectl describe nodes | grep -E "(neuron|Allocatable|Capacity)" -A 2 -B 2
echo -e "\n=== Failed Pods ==="
kubectl get pods --all-namespaces --field-selector=status.phase=Failed
echo -e "\n=== Recent Events ==="
kubectl get events --sort-by=.metadata.creationTimestamp | tail -20
EOF
chmod +x diagnose_neuron.sh
./diagnose_neuron.sh

# Check Neuron utilization in pods (replace <pod-name>)
kubectl exec -it <pod-name> -- neuron-top

# Monitor system resources
kubectl top nodes
kubectl top pods --all-namespaces

# Check for throttling or resource constraints
kubectl describe pod <pod-name> | grep -E "(limits|requests|throttl)"

# Delete inference deployment
kubectl delete -f inference_deployment.yaml

# Delete training jobs
kubectl delete -f mlp_train_job.yaml
kubectl delete -f batch_training_job.yaml

# Delete monitoring components
kubectl delete -f k8s-neuron-monitor.yml
kubectl delete namespace neuron-monitor

# Delete prometheus (if installed)
helm uninstall prometheus -n monitoring
kubectl delete namespace monitoring

# Remove Neuron helm chart
helm uninstall neuron-helm-chart

# Remove IAM service account
eksctl delete iamserviceaccount \
  --name node-problem-detector \
  --namespace neuron-healthcheck-system \
  --cluster "${CLUSTER_NAME}" \
  --region "${REGION_CODE}"

# Delete namespace
kubectl delete namespace neuron-healthcheck-system

# Delete IAM policy
aws iam delete-policy --policy-arn "arn:aws:iam::${AWS_ACCOUNT_ID}:policy/NeuronProblemDetectorPolicy"

# Delete CloudFormation stack
aws cloudformation delete-stack --stack-name eks-neuron-ng-stack
aws cloudformation wait stack-delete-complete --stack-name eks-neuron-ng-stack

# Delete EKS cluster
eksctl delete cluster --name "${CLUSTER_NAME}" --region "${REGION_CODE}"

# Clean up local files. List the generated scripts explicitly instead of
# 'rm -f *.sh', which would delete any unrelated shell script in the
# working directory.
rm -f cluster-config.yaml cfn_params.json eks_trn1_ng_stack.yaml trn1_nodegroup.yaml
rm -f mlp_train_job.yaml batch_training_job.yaml inference_deployment.yaml
rm -f k8s-neuron-monitor.yml neuron-servicemonitor.yaml npd-policy.json
rm -f create_cfn_params.sh create_ng_yaml.sh monitor_neuron.sh neuron_performance.sh diagnose_neuron.sh