Production Deployment Guide
A practical reference for running the incident management platform in production: architecture, tuning, security, monitoring, backup, and day-2 operations.
This guide covers production deployment considerations, performance tuning, security hardening, monitoring strategies, and operational procedures for the incident management platform.
Production Architecture Overview:
┌─────────────────────────────────────────────────────────┐
│ Load Balancer │
└─────────────────────┬───────────────────────────────────┘
│
┌─────────────────────▼───────────────────────────────────┐
│ Ingress Controller │
│ (nginx/traefik) │
└─────────────────────┬───────────────────────────────────┘
│
┌─────────────────┼─────────────────┐
│ │ │
┌───▼────┐ ┌───▼────┐ ┌───▼────┐
│IM Pod 1│ │IM Pod 2│ │IM Pod N│
│ │ │ │ │ │
└────────┘ └────────┘ └────────┘
│ │ │
└─────────────────┼─────────────────┘
│
┌─────────────────┼─────────────────┐
│ │ │
┌───▼──────┐ ┌────▼─────┐ ┌──────▼──┐
│PostgreSQL│ │ Redis │ │ Gorush │
│ Primary │ │ Cluster │ │ (Push) │
└──────────┘ └──────────┘ └─────────┘
│
┌───▼──────┐
│PostgreSQL│
│ Replica │
└──────────┘
| Component | Replicas | Resource Allocation | Purpose |
|---|---|---|---|
| Incident Server | 3-10 (auto-scaled) | 500m CPU, 1Gi RAM | Main application |
| PostgreSQL Primary | 1 | 2 CPU, 4Gi RAM | Primary database |
| PostgreSQL Replica | 1-2 | 1 CPU, 2Gi RAM | Read replicas |
| Redis Cluster | 3 | 500m CPU, 512Mi RAM | Caching and sessions |
| Push Service | 2 | 250m CPU, 256Mi RAM | Mobile notifications |
apiVersion: apps/v1
kind: Deployment
metadata:
  name: incident-server
spec:
  replicas: 5
  # selector is required for apps/v1 Deployments; labels match the PDB
  # and anti-affinity rules used elsewhere in this guide.
  selector:
    matchLabels:
      app: incident-server
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
  template:
    metadata:
      labels:
        app: incident-server
    spec:
      affinity:
        podAntiAffinity:
          # Prefer spreading replicas across nodes for availability.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - incident-server
                topologyKey: kubernetes.io/hostname
# Probe configuration (attach under spec.template.spec.containers[]):
livenessProbe:
  httpGet:
    path: /health
    port: 8080
  initialDelaySeconds: 30
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 3
readinessProbe:
  httpGet:
    path: /health/ready
    port: 8080
  initialDelaySeconds: 5
  periodSeconds: 5
  timeoutSeconds: 3
  failureThreshold: 3
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: incident-server-pdb
spec:
  # Keep at least 60% of replicas available during voluntary disruptions.
  minAvailable: 60%
  selector:
    matchLabels:
      app: incident-server
# Primary
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres-cluster
spec:
  # 1 primary + 2 replicas managed by CloudNativePG.
  instances: 3
  postgresql:
    parameters:
      max_connections: "300"
      shared_buffers: "256MB"
      effective_cache_size: "1GB"
      maintenance_work_mem: "64MB"
      wal_buffers: "16MB"
      max_wal_size: "1GB"
      min_wal_size: "80MB"
  storage:
    size: 100Gi
    storageClass: fast-ssd
  monitoring:
    enabled: true
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pgbouncer
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: pgbouncer
          # NOTE(review): pin a specific release instead of :latest for
          # reproducible deployments.
          image: pgbouncer/pgbouncer:latest
          env:
            - name: DATABASES_HOST
              value: postgres-cluster-rw
            - name: DATABASES_PORT
              value: "5432"
            # Transaction pooling maximizes connection reuse.
            - name: POOL_MODE
              value: transaction
            - name: MAX_CLIENT_CONN
              value: "1000"
            - name: DEFAULT_POOL_SIZE
              value: "25"
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: redis-cluster
spec:
  serviceName: redis-cluster
  # 6 nodes: 3 masters + 3 replicas for Redis Cluster mode.
  replicas: 6
  template:
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          command:
            - redis-server
            - /etc/redis/redis.conf
            - --cluster-enabled
            - "yes"
            - --cluster-config-file
            - /data/nodes.conf
            - --cluster-node-timeout
            - "5000"
          volumeMounts:
            - name: redis-data
              mountPath: /data
            - name: redis-config
              mountPath: /etc/redis
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"# Environment variables for Go runtime optimization
GOGC=100 # Garbage collection target percentage
GOMAXPROCS=4 # Number of OS threads
GOMEMLIMIT=1073741824 # Memory limit in bytes (1GB)server:
max_connections: 1000
read_timeout: 30s
write_timeout: 30s
idle_timeout: 120s
database:
max_open_conns: 25
max_idle_conns: 10
conn_max_lifetime: 5m
cache:
default_expiration: 10m
cleanup_interval: 5m-- Performance tuning parameters
ALTER SYSTEM SET shared_buffers = '2GB';
ALTER SYSTEM SET effective_cache_size = '6GB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET checkpoint_completion_target = 0.9;
ALTER SYSTEM SET wal_buffers = '16MB';
ALTER SYSTEM SET default_statistics_target = 100;
ALTER SYSTEM SET random_page_cost = 1.1;          -- tuned for SSD storage
ALTER SYSTEM SET effective_io_concurrency = 200;  -- tuned for SSD storage
ALTER SYSTEM SET work_mem = '4MB';
ALTER SYSTEM SET min_wal_size = '1GB';
ALTER SYSTEM SET max_wal_size = '4GB';
-- pg_reload_conf() applies reloadable settings only; shared_buffers and
-- wal_buffers additionally require a full server restart to take effect.
SELECT pg_reload_conf();
-- Incident management optimized indexes
CREATE INDEX CONCURRENTLY idx_incidents_status_created ON incidents (status, created_at);
CREATE INDEX CONCURRENTLY idx_incidents_severity_service ON incidents (severity, service);
CREATE INDEX CONCURRENTLY idx_timeline_incident_time ON timeline_events (incident_id, event_time);
CREATE INDEX CONCURRENTLY idx_timeline_event_type ON timeline_events (event_type, event_time);
-- Partial index keeps the hot "active incidents" lookup small.
CREATE INDEX CONCURRENTLY idx_incidents_active ON incidents (created_at DESC)
    WHERE status IN ('open', 'mitigated');
redis:
# Cache layers
incident_cache_ttl: 300s # 5 minutes
user_cache_ttl: 3600s # 1 hour
config_cache_ttl: 86400s # 24 hours
# Connection settings
max_idle: 20
max_active: 100
idle_timeout: 240s
# Cluster settings
read_from_replicas: true
route_by_latency: truecache_strategies:
incidents:
strategy: write_through
ttl: 300s
users:
strategy: lazy_loading
ttl: 3600s
policies:
strategy: refresh_ahead
ttl: 86400ssecurityContext:
runAsNonRoot: true
runAsUser: 65534 # nobody user
runAsGroup: 65534
fsGroup: 65534
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
add:
- NET_BIND_SERVICE# .github/workflows/security-scan.yml
name: Security Scan
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
jobs:
  security-scan:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Build image
        run: docker build -t incident-management:${{ github.sha }} .
      - name: Run Trivy vulnerability scanner
        # NOTE(review): pin trivy-action to a release tag (not @master) so the
        # scan step cannot change underneath you.
        uses: aquasecurity/trivy-action@master
        with:
          image-ref: "incident-management:${{ github.sha }}"
          format: "sarif"
          output: "trivy-results.sarif"
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: incident-management-deny-all
spec:
  # Empty podSelector matches every pod in the namespace; with both policy
  # types listed and no rules, all traffic is denied by default.
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: incident-server-network-policy
spec:
  podSelector:
    matchLabels:
      app: incident-server
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Only the ingress controller namespace may reach the app port.
    - from:
        - namespaceSelector:
            matchLabels:
              name: ingress-nginx
      ports:
        - protocol: TCP
          port: 8080
  egress:
    - to:
        - podSelector:
            matchLabels:
              app: postgres
      ports:
        - protocol: TCP
          port: 5432
    - to:
        - podSelector:
            matchLabels:
              app: redis
      ports:
        - protocol: TCP
          port: 6379
    - ports:
        - protocol: TCP
          port: 443  # HTTPS outbound
        - protocol: TCP
          port: 53   # DNS
        - protocol: UDP
          port: 53   # DNS
---
apiVersion: v1
kind: Secret
metadata:
  name: tls-secret
type: kubernetes.io/tls
data:
  tls.crt: <base64-encoded-cert>
  tls.key: <base64-encoded-key>
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: incident-management-ingress
  annotations:
    nginx.ingress.kubernetes.io/ssl-protocols: "TLSv1.2 TLSv1.3"
    nginx.ingress.kubernetes.io/ssl-ciphers: "ECDHE-ECDSA-AES256-GCM-SHA384,ECDHE-RSA-AES256-GCM-SHA384"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
  tls:
    - hosts:
        - incidents.yourdomain.com
      secretName: tls-secret
---
apiVersion: external-secrets.io/v1beta1
kind: SecretStore
metadata:
  name: vault-backend
spec:
  provider:
    vault:
      server: "https://vault.company.com"
      path: "secret"
      version: "v2"
      auth:
        kubernetes:
          mountPath: "kubernetes"
          role: "incident-management"
---
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: incident-management-secrets
spec:
  # Re-sync secrets from Vault every hour.
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: SecretStore
  target:
    name: incident-management-secrets
    creationPolicy: Owner
  data:
    - secretKey: postgres-password
      remoteRef:
        key: incident-management
        property: postgres-password
    - secretKey: jwt-secret
      remoteRef:
        key: incident-management
        property: jwt-secret
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: incident-server-metrics
spec:
  selector:
    matchLabels:
      app: incident-server
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s
# Key business metrics to monitor
custom_metrics:
  - name: incidents_total
    description: Total number of incidents by severity
    labels: [severity, service]
  - name: incident_response_time_seconds
    description: Time from incident creation to acknowledgment
    type: histogram
    buckets: [30, 60, 300, 600, 1800, 3600]
  - name: incident_resolution_time_seconds
    description: Time from incident creation to resolution
    type: histogram
    buckets: [300, 900, 1800, 3600, 14400, 86400]
  - name: sla_breaches_total
    description: Number of SLA breaches by severity
    labels: [severity, sla_type]
groups:
- name: incident-management-critical
rules:
- alert: ServiceDown
expr: up{job="incident-server-metrics"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Incident Management service is down"
description: "Service has been down for more than 2 minutes"
runbook_url: "https://runbooks.company.com/incident-management/service-down"
- alert: DatabaseDown
expr: pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "PostgreSQL database is down"
description: "Primary database connection failed"
- alert: HighErrorRate
expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.1
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }}"- name: incident-management-performance
rules:
- alert: HighResponseTime
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 2
for: 10m
labels:
severity: warning
annotations:
summary: "High response time detected"
description: "95th percentile response time is {{ $value }}s"
- alert: HighMemoryUsage
expr: container_memory_usage_bytes{pod=~"incident-server-.*"} / container_spec_memory_limit_bytes > 0.8
for: 15m
labels:
severity: warning
annotations:
summary: "High memory usage"
description: "Memory usage is {{ $value | humanizePercentage }}"logging:
format: json
level: info
fields:
service: incident-management
version: ${VERSION}
environment: ${ENVIRONMENT}
# Log levels by component
components:
database: warn
cache: warn
connectors: info
api: info
websocket: debugapiVersion: logging.coreos.com/v1
kind: ClusterLogForwarder
metadata:
  name: instance
spec:
  outputs:
    - name: elasticsearch-incident-management
      type: elasticsearch
      url: https://elasticsearch.company.com
      elasticsearch:
        # Daily indices per log type.
        index: incident-management-{.log_type}-{+yyyy.MM.dd}
  pipelines:
    - name: incident-management-logs
      inputRefs:
        - application
      filterRefs:
        - incident-management-filter
      outputRefs:
        - elasticsearch-incident-management
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          # restartPolicy is required for Job pod templates.
          restartPolicy: OnFailure
          containers:
            - name: postgres-backup
              # NOTE(review): postgres:15-alpine does not ship the aws CLI used
              # below — build a custom image or install it in an init step.
              image: postgres:15-alpine
              command:
                - /bin/bash
                - -c
                - |
                  set -e
                  # Create timestamp
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  BACKUP_FILE="incidents_${TIMESTAMP}.sql"
                  # Create backup with compression
                  pg_dump -h postgres-cluster-rw -U incidents -d incidents \
                    --verbose --clean --no-owner --no-acl \
                    | gzip > /backup/${BACKUP_FILE}.gz
                  # Upload to S3
                  aws s3 cp /backup/${BACKUP_FILE}.gz \
                    s3://company-backups/incident-management/database/
                  # Cleanup local files older than 3 days
                  find /backup -name "incidents_*.sql.gz" -mtime +3 -delete
              env:
                - name: PGPASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: incident-management-secrets
                      key: postgres-password
              volumeMounts:
                - name: backup-storage
                  mountPath: /backup
# Configure continuous archiving
archive_mode = on
archive_command = 'aws s3 cp %p s3://company-backups/incident-management/wal/%f'
wal_level = replica
max_wal_senders = 3
# wal_keep_segments was removed in PostgreSQL 13; the PG 15 deployment used in
# this guide needs wal_keep_size (32 segments x 16MB = 512MB).
wal_keep_size = '512MB'
apiVersion: batch/v1
kind: CronJob
metadata:
  name: application-state-backup
spec:
  schedule: "0 6 * * *"  # Daily at 6 AM
  jobTemplate:
    spec:
      template:
        spec:
          # restartPolicy is required for Job pod templates.
          restartPolicy: OnFailure
          containers:
            - name: app-backup
              image: incidents:latest
              command:
                - /bin/sh
                - -c
                - |
                  set -e
                  # Export configuration
                  ./incident-server export config > /backup/config_$(date +%Y%m%d).json
                  # Export policies
                  ./incident-server export policies > /backup/policies_$(date +%Y%m%d).json
                  # Upload to S3
                  aws s3 sync /backup/ s3://company-backups/incident-management/application/
#!/bin/bash
# disaster-recovery.sh — rebuild the platform from the latest S3 backup.
set -e
# 1. Restore database from latest backup
LATEST_BACKUP=$(aws s3 ls s3://company-backups/incident-management/database/ | tail -n 1 | awk '{print $4}')
aws s3 cp "s3://company-backups/incident-management/database/${LATEST_BACKUP}" /tmp/
gunzip "/tmp/${LATEST_BACKUP}"
# 2. Create new database cluster
kubectl apply -f k8s/postgres.yaml
# 3. Restore data (create the DB as superuser, restore as the app role)
kubectl exec -i postgres-cluster-rw-0 -- psql -U postgres -c "CREATE DATABASE incidents;"
kubectl exec -i postgres-cluster-rw-0 -- psql -U incidents -d incidents < "/tmp/${LATEST_BACKUP%%.gz}"
# 4. Deploy application
kubectl apply -k k8s/
# 5. Verify services
kubectl wait --for=condition=ready pod -l app=incident-server --timeout=300s
kubectl exec -it deployment/incident-server -- /app/incident-server health
#!/bin/bash
# deploy.sh — zero-downtime rolling deploy with automatic rollback on failure.
set -e
NEW_IMAGE="$1"
if [ -z "$NEW_IMAGE" ]; then
  echo "Usage: $0 <new-image>"
  exit 1
fi
echo "Starting zero-downtime deployment to $NEW_IMAGE"
# 1. Update image (quoted to survive tags/digests with special characters)
kubectl set image deployment/incident-server incident-server="$NEW_IMAGE" -n incident-management
# 2. Wait for rollout
kubectl rollout status deployment/incident-server -n incident-management --timeout=600s
# 3. Verify deployment
READY_REPLICAS=$(kubectl get deployment incident-server -n incident-management -o jsonpath='{.status.readyReplicas}')
DESIRED_REPLICAS=$(kubectl get deployment incident-server -n incident-management -o jsonpath='{.spec.replicas}')
if [ "$READY_REPLICAS" = "$DESIRED_REPLICAS" ]; then
  echo "Deployment successful: $READY_REPLICAS/$DESIRED_REPLICAS replicas ready"
else
  echo "Deployment failed: only $READY_REPLICAS/$DESIRED_REPLICAS replicas ready"
  kubectl rollout undo deployment/incident-server -n incident-management
  exit 1
fi
# 4. Run smoke tests
./scripts/smoke-tests.sh
echo "Deployment completed successfully"
#!/bin/bash
# rollback.sh — roll the deployment back to the previous revision.
set -e
echo "Rolling back incident-server deployment"
# The last line of `rollout history` is the current revision, so the
# second-to-last line IS the previous revision. (The original script
# subtracted 1 from this value, which skipped back two revisions.)
PREVIOUS_REVISION=$(kubectl rollout history deployment/incident-server -n incident-management | tail -n 2 | head -n 1 | awk '{print $1}')
# Rollback to previous version
kubectl rollout undo deployment/incident-server -n incident-management --to-revision="$PREVIOUS_REVISION"
# Wait for rollback to complete
kubectl rollout status deployment/incident-server -n incident-management --timeout=300s
# Verify rollback
./scripts/smoke-tests.sh
echo "Rollback completed successfully"
#!/bin/bash
# db-maintenance.sh — routine PostgreSQL housekeeping.
set -e
# 1. Update statistics
kubectl exec -it postgres-cluster-rw-0 -- psql -U incidents -d incidents -c "ANALYZE;"
# 2. Vacuum tables
kubectl exec -it postgres-cluster-rw-0 -- psql -U incidents -d incidents -c "VACUUM (VERBOSE, ANALYZE);"
# 3. Reindex if needed (REINDEX DATABASE takes heavy locks; run in a maintenance window)
kubectl exec -it postgres-cluster-rw-0 -- psql -U incidents -d incidents -c "REINDEX DATABASE incidents;"
# 4. Check for unused indexes. (The original query inspected pg_stats column
# statistics, which says nothing about index usage; pg_stat_user_indexes
# with idx_scan = 0 identifies indexes that have never been scanned.)
kubectl exec -it postgres-cluster-rw-0 -- psql -U incidents -d incidents -c "
SELECT schemaname, relname, indexrelname, idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY schemaname, relname;
"
#!/bin/bash
# log-rotation.sh — compress, expire, and re-ingest platform logs.
set -e
# Archive old logs
find /var/log/incident-management -name "*.log" -mtime +7 -exec gzip {} \;
# Remove logs older than 30 days
find /var/log/incident-management -name "*.log.gz" -mtime +30 -delete
# Restart log forwarder to pick up new files
kubectl rollout restart daemonset/fluentd -n logging
#!/bin/bash
# capacity-check.sh — quick cluster capacity snapshot.
echo "=== CPU Usage ==="
kubectl top nodes
echo "=== Memory Usage ==="
kubectl top pods -n incident-management --sort-by=memory
echo "=== Storage Usage ==="
# The original printed .status.capacity.storage for both CAPACITY and USED;
# kubectl cannot report actual usage, so show requested vs provisioned instead.
kubectl get pvc -n incident-management -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,REQUESTED:.spec.resources.requests.storage,CAPACITY:.status.capacity.storage
echo "=== Network Usage ==="
kubectl exec -it deployment/incident-server -n incident-management -- ss -tuln
# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: incident-server-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: incident-server
  minReplicas: 5
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    # NOTE(review): Pods metrics require a custom metrics adapter
    # (e.g. prometheus-adapter) exposing active_incidents — confirm one is installed.
    - type: Pods
      pods:
        metric:
          name: active_incidents
        target:
          type: AverageValue
          averageValue: "10"
#!/bin/bash
# incident-response.sh — first-response runbook automation.
# Usage: incident-response.sh <critical|warning> <issue-description>
SEVERITY="$1"
ISSUE="$2"
case "$SEVERITY" in
  "critical")
    # Immediate response for critical issues
    echo "CRITICAL ISSUE: $ISSUE"
    # Scale up immediately
    kubectl scale deployment incident-server --replicas=10 -n incident-management
    # Check resource usage
    kubectl top pods -n incident-management
    # Check logs for errors
    kubectl logs -l app=incident-server -n incident-management --tail=100 | grep ERROR
    # Notify on-call team
    curl -X POST "$PAGERDUTY_WEBHOOK" -d "{\"event_action\":\"trigger\",\"payload\":{\"summary\":\"Critical issue: $ISSUE\",\"severity\":\"critical\"}}"
    ;;
  "warning")
    # Standard response for warnings
    echo "WARNING: $ISSUE"
    # Gather diagnostics
    kubectl get pods -n incident-management
    kubectl get events -n incident-management --sort-by='.lastTimestamp' | tail -20
    ;;
  *)
    # Reject unknown severities instead of silently doing nothing.
    echo "Usage: $0 <critical|warning> <issue-description>" >&2
    exit 1
    ;;
esac
This production guide provides a comprehensive framework for deploying, monitoring, and operating the incident management platform in enterprise environments with high reliability and performance requirements.