AWS EMR Terraform module

Terraform module which creates AWS EMR resources.

This module supports the creation of:

EMR clusters using instance fleets or instance groups deployed in public or private subnets
EMR Virtual clusters that run on Amazon EKS
EMR Serverless clusters
EMR Studios
Security groups for master, core, and task nodes
Security group for EMR service to support private clusters
IAM roles for autoscaling, EMR service, and EC2 instance profiles

[!IMPORTANT] The appropriate resources have been tagged with { "for-use-with-amazon-emr-managed-policies" = true } to support the use of the recommended IAM policy "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2". Users are required to tag the appropriate VPC resources (VPC and subnets) as needed. See the Amazon EMR Management Guide section on managed policies for more details regarding v2 of managed EMR policies and their usage requirements.

Usage

Private Cluster w/ Instance Fleet

# Example: private EMR cluster whose master, core, and task nodes are
# provisioned as instance fleets.
module "emr" {
  source = "terraform-aws-modules/emr/aws"

  name = "example-instance-fleet"

  # EMR release and the applications to install on the cluster
  release_label = "emr-7.9.0"
  applications  = ["spark", "trino"]
  auto_termination_policy = {
    idle_timeout = 3600
  }

  # Placeholder bootstrap action: runs /bin/echo with a single argument
  bootstrap_action = [
    {
      path = "file:/bin/echo",
      name = "Just an example",
      args = ["Hello World!"]
    }
  ]

  # Cluster configuration supplied as a JSON-encoded string. This example sets
  # JAVA_HOME through the "export" classification nested under "spark-env".
  configurations_json = jsonencode([
    {
      "Classification" : "spark-env",
      "Configurations" : [
        {
          "Classification" : "export",
          "Properties" : {
            "JAVA_HOME" : "/usr/lib/jvm/java-1.8.0"
          }
        }
      ],
      "Properties" : {}
    }
  ])

  # Master fleet: on-demand capacity only, single instance type
  master_instance_fleet = {
    name                      = "master-fleet"
    target_on_demand_capacity = 1
    instance_type_configs = [
      {
        instance_type = "m5.xlarge"
      }
    ]
  }

  # Core fleet: targets both on-demand and spot capacity, drawn from three
  # candidate instance types with different weights
  core_instance_fleet = {
    name                      = "core-fleet"
    target_on_demand_capacity = 2
    target_spot_capacity      = 2
    instance_type_configs = [
      {
        instance_type     = "c4.large"
        weighted_capacity = 1
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        ebs_config = [{
          size                 = 256
          type                 = "gp3"
          volumes_per_instance = 1
        }]
        instance_type     = "c5.xlarge"
        weighted_capacity = 2
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        instance_type                              = "c6i.xlarge"
        weighted_capacity                          = 2
      }
    ]
    # Spot provisioning settings for this fleet.
    # NOTE(review): `timeout_action` presumably applies when spot capacity is
    # not fulfilled within `timeout_duration_minutes` - confirm against the
    # provider documentation.
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  # Task fleet: same shape as the core fleet, with two candidate instance types
  task_instance_fleet = {
    name                      = "task-fleet"
    target_on_demand_capacity = 1
    target_spot_capacity      = 2
    instance_type_configs = [
      {
        instance_type     = "c4.large"
        weighted_capacity = 1
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        ebs_config = [{
          size                 = 256
          type                 = "gp3"
          volumes_per_instance = 1
        }]
        instance_type     = "c5.xlarge"
        weighted_capacity = 2
      }
    ]
    # Same spot provisioning settings as the core fleet
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  # Networking and root volume settings
  ebs_root_volume_size = 64
  ec2_attributes = {
    # Subnets should be private subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_ids = ["subnet-abcde012", "subnet-bcde012a", "subnet-fghi345a"]
  }
  vpc_id = "vpc-1234556abcdef"

  # Step listing filter and S3 location for cluster logs
  list_steps_states  = ["PENDING", "RUNNING", "FAILED", "INTERRUPTED"]
  log_uri            = "s3://my-elasticmapreduce-bucket/"

  # Cluster behavior settings
  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

Public Cluster w/ Instance Fleet

Configuration is the same as the private version shown above except for the following changes noted below. Users should utilize S3 and EMR VPC endpoints for private connectivity and avoid data transfer charges across NAT gateways.

...
  ec2_attributes = {
    # Subnets should be public subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_ids = ["subnet-xyzde987", "subnet-slkjf456", "subnet-qeiru789"]
  }

  # Required for creating public cluster
  is_private_cluster = false
...

Private Cluster w/ Instance Group

# Example: private EMR cluster whose master, core, and task nodes are
# provisioned as instance groups (single subnet).
module "emr" {
  source = "terraform-aws-modules/emr/aws"

  name = "example-instance-group"

  # EMR release and the applications to install on the cluster
  release_label = "emr-7.9.0"
  applications  = ["spark", "trino"]
  auto_termination_policy = {
    idle_timeout = 3600
  }

  # Placeholder bootstrap action: runs /bin/echo with a single argument
  bootstrap_action = [
    {
      name = "Just an example",
      path = "file:/bin/echo",
      args = ["Hello World!"]
    }
  ]

  # Cluster configuration supplied as a JSON-encoded string. This example sets
  # JAVA_HOME through the "export" classification nested under "spark-env".
  configurations_json = jsonencode([
    {
      "Classification" : "spark-env",
      "Configurations" : [
        {
          "Classification" : "export",
          "Properties" : {
            "JAVA_HOME" : "/usr/lib/jvm/java-1.8.0"
          }
        }
      ],
      "Properties" : {}
    }
  ])

  # Master group: a single m5.xlarge instance
  master_instance_group = {
    name           = "master-group"
    instance_count = 1
    instance_type  = "m5.xlarge"
  }

  # Core group: two c4.large instances
  core_instance_group = {
    name           = "core-group"
    instance_count = 2
    instance_type  = "c4.large"
  }

  # Task group: two c5.xlarge instances with a bid price and an attached
  # 256 (size) gp3 EBS volume per instance
  task_instance_group = {
    name           = "task-group"
    instance_count = 2
    instance_type  = "c5.xlarge"
    bid_price      = "0.1"

    ebs_config = [{
      size                 = 256
      type                 = "gp3"
      volumes_per_instance = 1
    }]
    ebs_optimized = true
  }

  # Networking and root volume settings
  ebs_root_volume_size = 64
  ec2_attributes = {
    # Instance groups only support one Subnet/AZ
    # Subnets should be private subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_id = "subnet-abcde012"
  }
  vpc_id = "vpc-1234556abcdef"

  # Step listing filter and S3 location for cluster logs
  list_steps_states  = ["PENDING", "RUNNING", "FAILED", "INTERRUPTED"]
  log_uri            = "s3://my-elasticmapreduce-bucket/"

  # Cluster behavior settings
  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

Public Cluster w/ Instance Group

...
  ec2_attributes = {
    # Instance groups only support one Subnet/AZ
    # Subnets should be public subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_id = "subnet-xyzde987"
  }

  # Required for creating public cluster
  is_private_cluster = false
...

Conditional Creation

The following values are provided to toggle on/off creation of the associated resources as desired:

# Example: the toggles that enable/disable creation of the resources managed
# by this module.
# NOTE(review): with `create = false` the remaining toggles presumably have no
# effect; they appear together here to illustrate each available switch.
module "emr" {
  source = "terraform-aws-modules/emr/aws"

  # Disables all resources from being created
  create = false

  # Enables the creation of a security configuration for the cluster
  # Configuration should be supplied via the `security_configuration` variable
  create_security_configuration = true

  # Disables the creation of the role used by the service
  # An externally created role must be supplied via the `service_iam_role_arn` variable
  create_service_iam_role = false

  # Disables the creation of the role used for autoscaling
  # An externally created role can be supplied via the `autoscaling_iam_role_arn` variable
  create_autoscaling_iam_role = false

  # Disables the creation of the IAM role/instance profile used by the EC2 instances
  # An externally created IAM instance profile must be supplied
  # via the `iam_instance_profile_name` variable
  create_iam_instance_profile = false

  # Disables the creation of the security groups used by the EC2 instances. Users can supply
  # security groups for `master`, `slave`, and `service` security groups via the
  # `ec2_attributes` map variable. If not, the EMR service will create and associate
  # the necessary security groups. Note - the VPC will need to be tagged with
  # { "for-use-with-amazon-emr-managed-policies" = true } for EMR to create security groups
  # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html
  create_managed_security_groups = false

  # Set to false when creating a public cluster
  is_private_cluster = false
}

Examples

Examples codified under the examples directory are intended to give users references for how to use the module(s) as well as testing/validating changes to the source code of the module. If contributing to the project, please be sure to make any appropriate updates to the relevant examples to allow maintainers to test your changes and to keep the examples up to date for users. Thank you!

Private clusters using instance fleet or instance group
Public clusters using instance fleet or instance group
Serverless clusters running Spark or Hive
Studios with either IAM or SSO authentication
Virtual cluster running on Amazon EKS

License

Apache-2.0 Licensed. See LICENSE.

AWS EMR Terraform module

Terraform Module Source

AWS EMR Terraform module

Usage

Private Cluster w/ Instance Fleet

Public Cluster w/ Instance Fleet

Private Cluster w/ Instance Group

Public Cluster w/ Instance Group

Conditional Creation

Examples

License

Show terraform-docs details