Published: March 8, 2023
Author: Fernando McKenzie
Tags: Terraform, Infrastructure as Code, DevOps, Cloud, Automation
Building on our successful Kubernetes implementation in 2022, we faced a new challenge: managing infrastructure at scale across multiple environments and cloud providers. This article details our comprehensive Terraform implementation, establishing Infrastructure as Code (IaC) practices that reduced provisioning time by 85% and eliminated configuration drift across 200+ resources.
terraform-infrastructure/
├── environments/
│ ├── dev/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ ├── terraform.tfvars
│ │ └── outputs.tf
│ ├── staging/
│ └── production/
├── modules/
│ ├── networking/
│ │ ├── vpc/
│ │ ├── security-groups/
│ │ └── load-balancers/
│ ├── compute/
│ │ ├── eks/
│ │ ├── ec2/
│ │ └── autoscaling/
│ ├── data/
│ │ ├── rds/
│ │ ├── elasticache/
│ │ └── s3/
│ └── monitoring/
│ ├── cloudwatch/
│ ├── prometheus/
│ └── grafana/
├── policies/
│ ├── security/
│ ├── compliance/
│ └── cost-management/
├── scripts/
│ ├── deploy.sh
│ ├── validate.sh
│ └── cleanup.sh
└── docs/
├── architecture.md
├── runbooks/
└── decisions/
# environments/production/main.tf
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
azurerm = {
source = "hashicorp/azurerm"
version = "~> 3.0"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.0"
}
}
backend "s3" {
bucket = "supply-chain-terraform-state"
key = "production/terraform.tfstate"
region = "us-west-2"
encrypt = true
dynamodb_table = "terraform-locks"
# Workspace-based state isolation
workspace_key_prefix = "environments"
}
}
# Configure providers
provider "aws" {
region = var.aws_region
default_tags {
tags = {
Environment = var.environment
Project = "supply-chain"
ManagedBy = "terraform"
CostCenter = var.cost_center
Owner = var.owner_team
BackupPolicy = var.backup_policy
}
}
}
provider "azurerm" {
features {
resource_group {
prevent_deletion_if_contains_resources = true
}
key_vault {
purge_soft_delete_on_destroy = false
recover_soft_deleted_key_vaults = false
}
}
}
# Data sources for existing resources
data "aws_caller_identity" "current" {}
data "aws_region" "current" {}
# Local values for computed configurations
locals {
account_id = data.aws_caller_identity.current.account_id
region = data.aws_region.current.name
common_tags = {
Environment = var.environment
Project = "supply-chain"
ManagedBy = "terraform"
Repository = "terraform-infrastructure"
# Compliance tags
DataClassification = "internal"
BackupRequired = "true"
MonitoringEnabled = "true"
}
# Environment-specific configurations
config = {
dev = {
instance_count = 1
instance_size = "small"
enable_backup = false
multi_az = false
}
staging = {
instance_count = 2
instance_size = "medium"
enable_backup = true
multi_az = false
}
production = {
instance_count = 3
instance_size = "large"
enable_backup = true
multi_az = true
}
}
env_config = local.config[var.environment]
}
# Core infrastructure modules
module "networking" {
source = "../../modules/networking/vpc"
environment = var.environment
cidr_block = var.vpc_cidr
availability_zones = var.availability_zones
enable_nat_gateway = local.env_config.multi_az
enable_vpn_gateway = var.environment == "production"
tags = local.common_tags
}
module "security_groups" {
source = "../../modules/networking/security-groups"
vpc_id = module.networking.vpc_id
environment = var.environment
allowed_cidr_blocks = var.allowed_cidr_blocks
tags = local.common_tags
}
module "eks_cluster" {
source = "../../modules/compute/eks"
cluster_name = "${var.project_name}-${var.environment}"
cluster_version = var.kubernetes_version
vpc_id = module.networking.vpc_id
subnet_ids = module.networking.private_subnet_ids
node_groups = {
general = {
instance_types = ["t3.medium", "t3.large"]
scaling_config = {
desired_size = local.env_config.instance_count
min_size = 1
max_size = local.env_config.instance_count * 3
}
labels = {
workload = "general"
}
}
ml_workload = {
instance_types = ["c5.xlarge", "c5.2xlarge"]
capacity_type = "SPOT"
scaling_config = {
desired_size = var.environment == "production" ? 2 : 1
min_size = 0
max_size = 10
}
labels = {
workload = "ml-processing"
}
taints = [
{
key = "workload"
value = "ml"
effect = "NO_SCHEDULE"
}
]
}
}
tags = local.common_tags
}
module "rds_cluster" {
source = "../../modules/data/rds"
cluster_identifier = "${var.project_name}-${var.environment}"
engine = "aurora-postgresql"
engine_version = "14.6"
database_name = var.database_name
vpc_id = module.networking.vpc_id
subnet_ids = module.networking.database_subnet_ids
security_group_ids = [module.security_groups.database_sg_id]
instance_count = local.env_config.instance_count
instance_class = local.env_config.instance_size == "small" ? "db.t3.medium" :
local.env_config.instance_size == "medium" ? "db.r5.large" : "db.r5.xlarge"
backup_retention_period = local.env_config.enable_backup ? 30 : 7
backup_window = "03:00-04:00"
maintenance_window = "sun:04:00-sun:05:00"
monitoring_enabled = true
performance_insights_enabled = var.environment == "production"
tags = local.common_tags
}
module "monitoring" {
source = "../../modules/monitoring/cloudwatch"
environment = var.environment
eks_cluster_name = module.eks_cluster.cluster_name
rds_cluster_id = module.rds_cluster.cluster_identifier
notification_endpoints = var.notification_endpoints
tags = local.common_tags
}
# Output values for other configurations
output "vpc_id" {
description = "VPC ID"
value = module.networking.vpc_id
}
output "eks_cluster_endpoint" {
description = "EKS cluster endpoint"
value = module.eks_cluster.cluster_endpoint
sensitive = true
}
output "rds_cluster_endpoint" {
description = "RDS cluster endpoint"
value = module.rds_cluster.cluster_endpoint
sensitive = true
}
VPC Module Example:
# modules/networking/vpc/main.tf
variable "environment" {
description = "Environment name"
type = string
}
variable "cidr_block" {
description = "CIDR block for VPC"
type = string
default = "10.0.0.0/16"
}
variable "availability_zones" {
description = "Availability zones"
type = list(string)
}
variable "enable_nat_gateway" {
description = "Enable NAT gateway"
type = bool
default = true
}
variable "enable_vpn_gateway" {
description = "Enable VPN gateway"
type = bool
default = false
}
variable "tags" {
description = "Tags to apply to resources"
type = map(string)
default = {}
}
# Local calculations
locals {
azs = length(var.availability_zones) > 0 ? var.availability_zones : data.aws_availability_zones.available.names
# Calculate subnet CIDRs automatically
public_subnets = [for i, az in local.azs : cidrsubnet(var.cidr_block, 8, i)]
private_subnets = [for i, az in local.azs : cidrsubnet(var.cidr_block, 8, i + 10)]
database_subnets = [for i, az in local.azs : cidrsubnet(var.cidr_block, 8, i + 20)]
}
data "aws_availability_zones" "available" {
state = "available"
}
# VPC
resource "aws_vpc" "main" {
cidr_block = var.cidr_block
enable_dns_hostnames = true
enable_dns_support = true
tags = merge(var.tags, {
Name = "${var.environment}-vpc"
Type = "vpc"
})
}
# Internet Gateway
resource "aws_internet_gateway" "main" {
vpc_id = aws_vpc.main.id
tags = merge(var.tags, {
Name = "${var.environment}-igw"
Type = "internet-gateway"
})
}
# Public Subnets
resource "aws_subnet" "public" {
count = length(local.azs)
vpc_id = aws_vpc.main.id
cidr_block = local.public_subnets[count.index]
availability_zone = local.azs[count.index]
map_public_ip_on_launch = true
tags = merge(var.tags, {
Name = "${var.environment}-public-${local.azs[count.index]}"
Type = "public-subnet"
Tier = "public"
# EKS tags
"kubernetes.io/role/elb" = "1"
"kubernetes.io/cluster/${var.environment}" = "shared"
})
}
# Private Subnets
resource "aws_subnet" "private" {
count = length(local.azs)
vpc_id = aws_vpc.main.id
cidr_block = local.private_subnets[count.index]
availability_zone = local.azs[count.index]
tags = merge(var.tags, {
Name = "${var.environment}-private-${local.azs[count.index]}"
Type = "private-subnet"
Tier = "private"
# EKS tags
"kubernetes.io/role/internal-elb" = "1"
"kubernetes.io/cluster/${var.environment}" = "shared"
})
}
# Database Subnets
resource "aws_subnet" "database" {
count = length(local.azs)
vpc_id = aws_vpc.main.id
cidr_block = local.database_subnets[count.index]
availability_zone = local.azs[count.index]
tags = merge(var.tags, {
Name = "${var.environment}-database-${local.azs[count.index]}"
Type = "database-subnet"
Tier = "database"
})
}
# NAT Gateways (conditional)
resource "aws_eip" "nat" {
count = var.enable_nat_gateway ? length(local.azs) : 0
domain = "vpc"
tags = merge(var.tags, {
Name = "${var.environment}-nat-eip-${count.index + 1}"
Type = "elastic-ip"
})
depends_on = [aws_internet_gateway.main]
}
resource "aws_nat_gateway" "main" {
count = var.enable_nat_gateway ? length(local.azs) : 0
allocation_id = aws_eip.nat[count.index].id
subnet_id = aws_subnet.public[count.index].id
tags = merge(var.tags, {
Name = "${var.environment}-nat-${local.azs[count.index]}"
Type = "nat-gateway"
})
depends_on = [aws_internet_gateway.main]
}
# Route Tables
resource "aws_route_table" "public" {
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.main.id
}
tags = merge(var.tags, {
Name = "${var.environment}-public-rt"
Type = "route-table"
Tier = "public"
})
}
resource "aws_route_table" "private" {
count = length(local.azs)
vpc_id = aws_vpc.main.id
dynamic "route" {
for_each = var.enable_nat_gateway ? [1] : []
content {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.main[count.index].id
}
}
tags = merge(var.tags, {
Name = "${var.environment}-private-rt-${local.azs[count.index]}"
Type = "route-table"
Tier = "private"
})
}
# Route Table Associations
resource "aws_route_table_association" "public" {
count = length(aws_subnet.public)
subnet_id = aws_subnet.public[count.index].id
route_table_id = aws_route_table.public.id
}
resource "aws_route_table_association" "private" {
count = length(aws_subnet.private)
subnet_id = aws_subnet.private[count.index].id
route_table_id = aws_route_table.private[count.index].id
}
# VPN Gateway (conditional)
resource "aws_vpn_gateway" "main" {
count = var.enable_vpn_gateway ? 1 : 0
vpc_id = aws_vpc.main.id
tags = merge(var.tags, {
Name = "${var.environment}-vpn-gw"
Type = "vpn-gateway"
})
}
# Outputs
output "vpc_id" {
description = "ID of the VPC"
value = aws_vpc.main.id
}
output "vpc_cidr_block" {
description = "CIDR block of the VPC"
value = aws_vpc.main.cidr_block
}
output "public_subnet_ids" {
description = "List of public subnet IDs"
value = aws_subnet.public[*].id
}
output "private_subnet_ids" {
description = "List of private subnet IDs"
value = aws_subnet.private[*].id
}
output "database_subnet_ids" {
description = "List of database subnet IDs"
value = aws_subnet.database[*].id
}
# S3 backend with state locking
terraform {
backend "s3" {
bucket = "supply-chain-terraform-state"
key = "production/terraform.tfstate"
region = "us-west-2"
encrypt = true
dynamodb_table = "terraform-locks"
# Versioning enabled for state history
versioning = true
# Server-side encryption
server_side_encryption_configuration {
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
}
}
# State bucket configuration (bootstrap)
resource "aws_s3_bucket" "terraform_state" {
bucket = "supply-chain-terraform-state"
# Prevent accidental deletion
lifecycle {
prevent_destroy = true
}
tags = {
Name = "Terraform State"
Environment = "shared"
Purpose = "terraform-backend"
}
}
resource "aws_s3_bucket_versioning" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
resource "aws_s3_bucket_public_access_block" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id
block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
}
# DynamoDB table for state locking
resource "aws_dynamodb_table" "terraform_locks" {
name = "terraform-locks"
billing_mode = "PAY_PER_REQUEST"
hash_key = "LockID"
attribute {
name = "LockID"
type = "S"
}
tags = {
Name = "Terraform State Lock"
Environment = "shared"
Purpose = "terraform-locking"
}
}
#!/bin/bash
# scripts/workspace-management.sh
# Workspace management utilities
create_workspace() {
local env=$1
echo "Creating workspace for environment: $env"
terraform workspace new $env 2>/dev/null || terraform workspace select $env
# Initialize with environment-specific variables
cp environments/$env/terraform.tfvars .
echo "Workspace $env ready"
}
switch_workspace() {
local env=$1
echo "Switching to environment: $env"
terraform workspace select $env
cp environments/$env/terraform.tfvars .
echo "Current workspace: $(terraform workspace show)"
}
list_workspaces() {
echo "Available workspaces:"
terraform workspace list
echo "Current workspace: $(terraform workspace show)"
}
# Usage examples
case "$1" in
create)
create_workspace $2
;;
switch)
switch_workspace $2
;;
list)
list_workspaces
;;
*)
echo "Usage: $0 {create|switch|list} [environment]"
exit 1
;;
esac
# .gitlab-ci.yml
stages:
- validate
- plan
- apply
- test
variables:
TF_ROOT: ${CI_PROJECT_DIR}
TF_VERSION: 1.5.0
cache:
paths:
- ${TF_ROOT}/.terraform
before_script:
- cd ${TF_ROOT}
- terraform --version
- terraform init
# Validation stage
terraform:validate:
stage: validate
script:
- terraform validate
- terraform fmt -check
rules:
- changes:
- "**/*.tf"
- "**/*.tfvars"
terraform:security-scan:
stage: validate
image: bridgecrew/checkov:latest
script:
- checkov -d . --framework terraform
allow_failure: true
rules:
- changes:
- "**/*.tf"
terraform:cost-estimation:
stage: validate
image: infracost/infracost:ci-0.10
script:
- infracost breakdown --path . --format json --out-file infracost.json
- infracost comment gitlab --path infracost.json --repo $CI_PROJECT_PATH --merge-request $CI_MERGE_REQUEST_IID --gitlab-token $GITLAB_TOKEN
rules:
- if: '$CI_MERGE_REQUEST_ID'
# Planning stage
terraform:plan:dev:
stage: plan
environment:
name: dev
script:
- terraform workspace select dev
- terraform plan -var-file="environments/dev/terraform.tfvars" -out=tfplan
artifacts:
paths:
- tfplan
expire_in: 1 week
rules:
- if: '$CI_COMMIT_BRANCH == "develop"'
terraform:plan:staging:
stage: plan
environment:
name: staging
script:
- terraform workspace select staging
- terraform plan -var-file="environments/staging/terraform.tfvars" -out=tfplan
artifacts:
paths:
- tfplan
expire_in: 1 week
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
terraform:plan:production:
stage: plan
environment:
name: production
script:
- terraform workspace select production
- terraform plan -var-file="environments/production/terraform.tfvars" -out=tfplan
artifacts:
paths:
- tfplan
expire_in: 1 week
rules:
- if: '$CI_COMMIT_TAG'
# Apply stage
terraform:apply:dev:
stage: apply
environment:
name: dev
script:
- terraform workspace select dev
- terraform apply -auto-approve tfplan
dependencies:
- terraform:plan:dev
rules:
- if: '$CI_COMMIT_BRANCH == "develop"'
when: manual
terraform:apply:staging:
stage: apply
environment:
name: staging
script:
- terraform workspace select staging
- terraform apply -auto-approve tfplan
dependencies:
- terraform:plan:staging
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: manual
terraform:apply:production:
stage: apply
environment:
name: production
script:
- terraform workspace select production
- terraform apply -auto-approve tfplan
dependencies:
- terraform:plan:production
rules:
- if: '$CI_COMMIT_TAG'
when: manual
# Testing stage
terraform:test:
stage: test
image: golang:1.19
script:
- go mod download
- go test ./test/... -v -timeout 30m
rules:
- changes:
- "**/*.tf"
- "test/**/*"
# Compliance verification
terraform:compliance:
stage: test
script:
- terraform show -json tfplan > plan.json
- opa eval -d policies/ -i plan.json "data.terraform.deny[x]"
dependencies:
- terraform:plan:production
rules:
- if: '$CI_COMMIT_TAG'
# policies/security.rego
package terraform.security
# Deny if S3 buckets are not encrypted
deny[msg] {
resource := input.planned_values.root_module.resources[_]
resource.type == "aws_s3_bucket"
not resource.values.server_side_encryption_configuration
msg := sprintf("S3 bucket '%s' must have encryption enabled", [resource.name])
}
# Deny if RDS instances are not encrypted
deny[msg] {
resource := input.planned_values.root_module.resources[_]
resource.type == "aws_db_instance"
resource.values.storage_encrypted != true
msg := sprintf("RDS instance '%s' must have storage encryption enabled", [resource.name])
}
# Deny if security groups allow unrestricted access
deny[msg] {
resource := input.planned_values.root_module.resources[_]
resource.type == "aws_security_group"
rule := resource.values.ingress[_]
rule.cidr_blocks[_] == "0.0.0.0/0"
rule.from_port == 22
msg := sprintf("Security group '%s' allows SSH access from anywhere", [resource.name])
}
# Require specific tags
required_tags := ["Environment", "Project", "Owner"]
deny[msg] {
resource := input.planned_values.root_module.resources[_]
resource.type in ["aws_instance", "aws_s3_bucket", "aws_db_instance"]
tag := required_tags[_]
not resource.values.tags[tag]
msg := sprintf("Resource '%s' missing required tag: %s", [resource.name, tag])
}
# Dynamic configuration based on environment
locals {
# Environment-specific settings
environments = {
dev = {
instance_count = 1
instance_type = "t3.micro"
enable_monitoring = false
backup_retention = 7
multi_az = false
deletion_protection = false
}
staging = {
instance_count = 2
instance_type = "t3.small"
enable_monitoring = true
backup_retention = 14
multi_az = false
deletion_protection = false
}
production = {
instance_count = 3
instance_type = "t3.medium"
enable_monitoring = true
backup_retention = 30
multi_az = true
deletion_protection = true
}
}
# Current environment configuration
env = local.environments[var.environment]
# Dynamic security group rules
security_group_rules = {
web = [
{
type = "ingress"
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
},
{
type = "ingress"
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
]
app = [
{
type = "ingress"
from_port = 8080
to_port = 8080
protocol = "tcp"
source_security_group_id = aws_security_group.web.id
}
]
database = [
{
type = "ingress"
from_port = 5432
to_port = 5432
protocol = "tcp"
source_security_group_id = aws_security_group.app.id
}
]
}
# Dynamic monitoring configuration
cloudwatch_alarms = var.environment == "production" ? {
cpu_high = {
alarm_name = "high-cpu-utilization"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "CPUUtilization"
namespace = "AWS/EC2"
period = "300"
statistic = "Average"
threshold = "80"
alarm_description = "This metric monitors ec2 cpu utilization"
}
memory_high = {
alarm_name = "high-memory-utilization"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "MemoryUtilization"
namespace = "CWAgent"
period = "300"
statistic = "Average"
threshold = "85"
alarm_description = "This metric monitors memory utilization"
}
} : {}
}
# Use dynamic blocks with locals
resource "aws_security_group_rule" "web_rules" {
for_each = {
for idx, rule in local.security_group_rules.web :
"${rule.type}-${rule.from_port}-${rule.to_port}" => rule
}
security_group_id = aws_security_group.web.id
type = each.value.type
from_port = each.value.from_port
to_port = each.value.to_port
protocol = each.value.protocol
cidr_blocks = lookup(each.value, "cidr_blocks", null)
source_security_group_id = lookup(each.value, "source_security_group_id", null)
}
# Dynamic CloudWatch alarms
resource "aws_cloudwatch_metric_alarm" "alarms" {
for_each = local.cloudwatch_alarms
alarm_name = each.value.alarm_name
comparison_operator = each.value.comparison_operator
evaluation_periods = each.value.evaluation_periods
metric_name = each.value.metric_name
namespace = each.value.namespace
period = each.value.period
statistic = each.value.statistic
threshold = each.value.threshold
alarm_description = each.value.alarm_description
alarm_actions = [aws_sns_topic.alerts.arn]
tags = merge(local.common_tags, {
Name = each.value.alarm_name
})
}
// Custom provider for internal API
package main
import (
"context"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-plugin-sdk/v2/plugin"
)
func main() {
plugin.Serve(&plugin.ServeOpts{
ProviderFunc: func() *schema.Provider {
return &schema.Provider{
Schema: map[string]*schema.Schema{
"api_url": {
Type: schema.TypeString,
Required: true,
Description: "API URL for internal services",
},
"api_token": {
Type: schema.TypeString,
Required: true,
Sensitive: true,
Description: "API token for authentication",
},
},
ResourcesMap: map[string]*schema.Resource{
"internal_service": resourceService(),
"internal_config": resourceConfig(),
},
ConfigureContextFunc: providerConfigure,
}
},
})
}
func resourceService() *schema.Resource {
return &schema.Resource{
CreateContext: resourceServiceCreate,
ReadContext: resourceServiceRead,
UpdateContext: resourceServiceUpdate,
DeleteContext: resourceServiceDelete,
Schema: map[string]*schema.Schema{
"name": {
Type: schema.TypeString,
Required: true,
Description: "Service name",
},
"version": {
Type: schema.TypeString,
Required: true,
Description: "Service version",
},
"config": {
Type: schema.TypeMap,
Optional: true,
Elem: &schema.Schema{Type: schema.TypeString},
Description: "Service configuration",
},
"endpoints": {
Type: schema.TypeList,
Computed: true,
Elem: &schema.Schema{Type: schema.TypeString},
Description: "Service endpoints",
},
},
}
}
func resourceServiceCreate(ctx context.Context, d *schema.ResourceData, m interface{}) diag.Diagnostics {
// Implementation for creating internal service
client := m.(*APIClient)
service := &Service{
Name: d.Get("name").(string),
Version: d.Get("version").(string),
Config: d.Get("config").(map[string]interface{}),
}
createdService, err := client.CreateService(service)
if err != nil {
return diag.FromErr(err)
}
d.SetId(createdService.ID)
d.Set("endpoints", createdService.Endpoints)
return nil
}
// test/terraform_test.go
package test
import (
"testing"
"time"
"github.com/gruntwork-io/terratest/modules/terraform"
"github.com/gruntwork-io/terratest/modules/test-structure"
"github.com/stretchr/testify/assert"
)
func TestTerraformInfrastructure(t *testing.T) {
t.Parallel()
// Setup test directory
testDir := test_structure.CopyTerraformFolderToTemp(t, "../", "environments/test")
// Configure Terraform options
terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
TerraformDir: testDir,
Vars: map[string]interface{}{
"environment": "test",
"aws_region": "us-west-2",
"vpc_cidr": "10.100.0.0/16",
},
BackendConfig: map[string]interface{}{
"bucket": "test-terraform-state",
"key": "test/terraform.tfstate",
"region": "us-west-2",
},
})
// Clean up resources with "terraform destroy" at the end of the test
defer terraform.Destroy(t, terraformOptions)
// Run "terraform init" and "terraform apply"
terraform.InitAndApply(t, terraformOptions)
// Validate outputs
vpcID := terraform.Output(t, terraformOptions, "vpc_id")
assert.NotEmpty(t, vpcID)
eksEndpoint := terraform.Output(t, terraformOptions, "eks_cluster_endpoint")
assert.NotEmpty(t, eksEndpoint)
rdsEndpoint := terraform.Output(t, terraformOptions, "rds_cluster_endpoint")
assert.NotEmpty(t, rdsEndpoint)
}
func TestVPCModule(t *testing.T) {
t.Parallel()
terraformOptions := &terraform.Options{
TerraformDir: "../modules/networking/vpc",
Vars: map[string]interface{}{
"environment": "test",
"cidr_block": "10.200.0.0/16",
"availability_zones": []string{"us-west-2a", "us-west-2b"},
"enable_nat_gateway": true,
},
}
defer terraform.Destroy(t, terraformOptions)
terraform.InitAndApply(t, terraformOptions)
// Test VPC creation
vpcID := terraform.Output(t, terraformOptions, "vpc_id")
assert.Regexp(t, "^vpc-", vpcID)
// Test subnet creation
publicSubnets := terraform.OutputList(t, terraformOptions, "public_subnet_ids")
assert.Len(t, publicSubnets, 2)
privateSubnets := terraform.OutputList(t, terraformOptions, "private_subnet_ids")
assert.Len(t, privateSubnets, 2)
}
func TestSecurityCompliance(t *testing.T) {
t.Parallel()
terraformOptions := &terraform.Options{
TerraformDir: "../environments/production",
}
// Run plan to generate plan file
terraform.Init(t, terraformOptions)
planOut := terraform.Plan(t, terraformOptions)
// Check for security violations
assert.NotContains(t, planOut, "0.0.0.0/0", "Security groups should not allow unrestricted access")
assert.Contains(t, planOut, "encrypted = true", "Resources should be encrypted")
}
#!/bin/bash
# scripts/compliance-scan.sh
# Run multiple compliance tools
echo "Running Terraform compliance scans..."
# Checkov security scanning
echo "Running Checkov security scan..."
checkov -d . --framework terraform \
--output cli \
--output json \
--output-file-path ./reports/checkov-report.json
# TFSec security scanning
echo "Running TFSec security scan..."
tfsec . --format json > ./reports/tfsec-report.json
# Infracost analysis
echo "Running cost analysis..."
infracost breakdown --path . \
--format json \
--out-file ./reports/cost-analysis.json
# Custom policy validation with OPA
echo "Running OPA policy validation..."
terraform show -json tfplan > ./reports/plan.json
opa eval -d policies/ -i ./reports/plan.json \
"data.terraform.deny[x]" > ./reports/policy-violations.json
# Generate summary report
echo "Generating compliance summary..."
python3 scripts/generate-compliance-report.py \
--checkov ./reports/checkov-report.json \
--tfsec ./reports/tfsec-report.json \
--cost ./reports/cost-analysis.json \
--policy ./reports/policy-violations.json \
--output ./reports/compliance-summary.html
echo "Compliance scan complete. Reports available in ./reports/"
Infrastructure Provisioning:
Provisioning Improvements:
├── Time to provision new environment: 4-6 hours → 12 minutes (96% reduction)
├── Configuration consistency: Manual/inconsistent → 100% identical across environments
├── Infrastructure drift detection: Manual → Automated daily checks
├── Resource tagging compliance: 45% → 98% compliant
└── Documentation accuracy: Often outdated → Always current (generated)
Operational Efficiency:
operational_metrics = {
'deployment_frequency': {
'before': 12, # changes per year
'after': 156, # changes per year (13x increase)
'improvement': 'Weekly → Multiple daily deployments'
},
'mean_time_to_recovery': {
'before': 240, # 4 hours average
'after': 15, # 15 minutes average
'improvement': '94% faster incident recovery'
},
'infrastructure_costs': {
'before': 145000, # $145K monthly
'after': 89000, # $89K monthly
'savings': 56000 # $56K monthly savings (39% reduction)
},
'human_errors': {
'before': 23, # configuration errors per quarter
'after': 2, # configuration errors per quarter
'improvement': '91% reduction in manual errors'
}
}
Cost Optimization Results:
Annual Cost Analysis:
Infrastructure Costs (Before IaC):
├── Over-provisioned resources: $45,000/month
├── Orphaned resources: $12,000/month
├── Manual management overhead: $28,000/month
├── Inconsistent resource sizing: $18,000/month
└── Total wastage: $103,000/month
Infrastructure Costs (After IaC):
├── Right-sized resources: $72,000/month
├── Automated cleanup: $0/month (orphaned resources)
├── Automated management: $8,000/month
├── Optimized resource allocation: $9,000/month
└── Total optimized cost: $89,000/month
Annual Savings: $672,000 (39% reduction)
Learning: State management and module design are critical from day one
Implementation:
# Always use remote state with locking
terraform {
backend "s3" {
bucket = "company-terraform-state"
key = "path/to/terraform.tfstate"
region = "us-west-2"
encrypt = true
dynamodb_table = "terraform-locks"
}
}
# Design modules for reusability
module "standardized_vpc" {
source = "git::https://github.com/company/terraform-modules.git//vpc?ref=v1.2.0"
environment = var.environment
# Standardized sizing based on environment
size = var.environment == "production" ? "large" : "small"
tags = local.standard_tags
}
Learning: Automated compliance prevents security and cost issues
Policy Examples:
# Require encryption for all data stores
package terraform.encryption
deny[msg] {
resource := input.planned_values.root_module.resources[_]
resource.type in ["aws_s3_bucket", "aws_ebs_volume", "aws_db_instance"]
not is_encrypted(resource)
msg := sprintf("%s '%s' must be encrypted", [resource.type, resource.name])
}
is_encrypted(resource) {
resource.values.encrypted == true
}
is_encrypted(resource) {
resource.values.server_side_encryption_configuration
}
Learning: Infrastructure testing prevents production issues
Testing Strategy:
Learning: Infrastructure as Code enables rapid disaster recovery
DR Implementation:
# Multi-region configuration
module "primary_region" {
source = "./modules/environment"
region = "us-west-2"
environment = var.environment
is_primary = true
}
module "dr_region" {
source = "./modules/environment"
region = "us-east-1"
environment = var.environment
is_primary = false
# Only deploy DR in production
count = var.environment == "production" ? 1 : 0
}
# Cross-region replication
resource "aws_s3_bucket_replication_configuration" "replication" {
count = var.environment == "production" ? 1 : 0
role = aws_iam_role.replication[0].arn
bucket = module.primary_region.s3_bucket_id
rule {
id = "replicate-to-dr"
status = "Enabled"
destination {
bucket = module.dr_region[0].s3_bucket_arn
storage_class = "STANDARD_IA"
}
}
}
# ArgoCD for Kubernetes + Terraform
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: infrastructure-as-code
spec:
source:
repoURL: https://github.com/company/terraform-infrastructure
path: environments/production
targetRevision: HEAD
destination:
server: https://terraform-cloud-api
syncPolicy:
automated:
prune: true
selfHeal: true
# ML-based resource optimization
def optimize_infrastructure():
"""Use ML to recommend infrastructure optimizations"""
usage_patterns = get_usage_metrics()
cost_analysis = get_cost_data()
recommendations = ml_model.predict_optimizations(
usage_patterns=usage_patterns,
cost_data=cost_analysis,
constraints={
'max_cost_increase': 0.1,
'min_performance_level': 0.95
}
)
return create_terraform_changes(recommendations)
Our Terraform Infrastructure as Code implementation transformed infrastructure management from a manual, error-prone process to an automated, reliable system. The 85% reduction in provisioning time, 39% cost savings, and 91% reduction in configuration errors demonstrate the business value of treating infrastructure as software.
Critical Success Factors:
The IaC foundation now enables rapid innovation while maintaining operational excellence. We can provision complete environments in minutes, ensure consistent configurations across all stages, and respond to changing business requirements with confidence.
2023 taught us: Infrastructure as Code is not just about automation—it’s about creating a foundation for innovation. When infrastructure becomes code, it becomes testable, reviewable, and improvable using the same practices that make software development successful.
Implementing Infrastructure as Code? Let’s connect on LinkedIn to discuss your Terraform strategy.