This is an automated email from the ASF dual-hosted git repository.
arvindsh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/fluo-muchos.git
The following commit(s) were added to refs/heads/master by this push:
new dd898d7 Add optional support for Azure ADLS Gen2 (#304)
dd898d7 is described below
commit dd898d79344a1df1f3990697717de6944b7a06a8
Author: Shan <[email protected]>
AuthorDate: Wed Jan 8 17:00:09 2020 -0500
Add optional support for Azure ADLS Gen2 (#304)
Add optional support for Azure ADLS Gen2
- Muchos launch can now create Azure ADLS Gen2 accounts
- Muchos setup can use Azure ADLS Gen2 storage accounts for Accumulo
- Volume chooser is configured to use local HDFS for WALs and ADLS Gen2 for
tables
---
README.md | 3 +
ansible/accumulo.yml | 10 +
.../handlers/init-adlsgen2.yml} | 6 +-
.../main.yml => accumulo/tasks/add-adlsgen2.yml} | 10 +-
.../main.yml => accumulo/tasks/init-adlsgen2.yml} | 11 +-
ansible/roles/accumulo/templates/accumulo-env.sh | 7 +
.../roles/accumulo/templates/accumulo.properties | 6 +
ansible/roles/azure/tasks/create_adlsgen2.yml | 235 +++++++++++++++++++++
ansible/roles/azure/tasks/main.yml | 2 +
ansible/roles/hadoop-ha/tasks/main.yml | 8 +
ansible/roles/hadoop-ha/templates/core-site.xml | 30 +++
ansible/roles/hadoop-ha/templates/mapred-site.xml | 6 +
ansible/roles/hadoop-ha/templates/yarn-site.xml | 6 +
ansible/roles/hadoop/tasks/main.yml | 8 +
ansible/roles/hadoop/templates/core-site.xml | 30 +++
ansible/roles/hadoop/templates/mapred-site.xml | 6 +
ansible/roles/hadoop/templates/yarn-site.xml | 6 +
conf/muchos.props.example | 25 +++
lib/muchos/config/azure.py | 27 ++-
19 files changed, 427 insertions(+), 15 deletions(-)
diff --git a/README.md b/README.md
index 4339ae8..4dd1fcc 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,9 @@ Under the `azure` section, edit following values as per
your configuration
* `numnodes` to change the cluster size in terms of number of nodes deployed
* `vm_sku` to specify the VM size to use. You can choose from the
[available VM
sizes](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general).
+* `use_adlsg2` to use Azure Data Lake Storage (ADLS) Gen2 as the datastore for
Accumulo
+ [ADLS Gen2
Doc](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction).
+ [Setup ADLS Gen2 as datastore for
Accumulo](https://accumulo.apache.org/blog/2019/10/15/accumulo-adlsgen2-notes.html).
Within Azure the `nodes` section is auto populated with the hostnames and
their default roles.
diff --git a/ansible/accumulo.yml b/ansible/accumulo.yml
index 2af9d67..2352c85 100644
--- a/ansible/accumulo.yml
+++ b/ansible/accumulo.yml
@@ -27,6 +27,16 @@
- import_tasks: roles/accumulo/tasks/init-accumulo.yml
handlers:
- import_tasks: roles/accumulo/handlers/init-accumulo.yml
+- hosts: all:!{{ azure_proxy_host }}
+ tasks:
+ - import_tasks: roles/accumulo/tasks/add-adlsgen2.yml
+ when: accumulo_major_version == '2' and use_adlsg2 == True
+- hosts: accumulomaster[0]
+ tasks:
+ - import_tasks: roles/accumulo/tasks/init-adlsgen2.yml
+ when: accumulo_major_version == '2' and use_adlsg2 == True
+ handlers:
+ - import_tasks: roles/accumulo/handlers/init-adlsgen2.yml
- hosts: accumulo
tasks:
- name: "start accumulo 1.0"
diff --git a/ansible/roles/azure/tasks/main.yml
b/ansible/roles/accumulo/handlers/init-adlsgen2.yml
similarity index 86%
copy from ansible/roles/azure/tasks/main.yml
copy to ansible/roles/accumulo/handlers/init-adlsgen2.yml
index 6ec80d7..06f67b5 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/accumulo/handlers/init-adlsgen2.yml
@@ -1,5 +1,3 @@
----
-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -17,5 +15,5 @@
# limitations under the License.
#
-# tasks file for azure
-- import_tasks: create_vmss.yml
+- name: "Initialize Apache Accumulo on ADLS Gen2 volume"
+ command: "{{ accumulo_home }}/bin/accumulo init --add-volumes"
diff --git a/ansible/roles/azure/tasks/main.yml
b/ansible/roles/accumulo/tasks/add-adlsgen2.yml
similarity index 78%
copy from ansible/roles/azure/tasks/main.yml
copy to ansible/roles/accumulo/tasks/add-adlsgen2.yml
index 6ec80d7..8056f2d 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/accumulo/tasks/add-adlsgen2.yml
@@ -1,5 +1,3 @@
----
-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
-# tasks file for azure
-- import_tasks: create_vmss.yml
+- name: Add ADLS Gen2 volume
+ lineinfile:
+ path: "{{ accumulo_home }}/conf/accumulo.properties"
+ regexp: '^instance.volumes='
+ line: "instance.volumes={{ hdfs_root }}/accumulo,{{
instance_volumes_preferred }}"
diff --git a/ansible/roles/azure/tasks/main.yml
b/ansible/roles/accumulo/tasks/init-adlsgen2.yml
similarity index 67%
copy from ansible/roles/azure/tasks/main.yml
copy to ansible/roles/accumulo/tasks/init-adlsgen2.yml
index 6ec80d7..505b23d 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/accumulo/tasks/init-adlsgen2.yml
@@ -1,5 +1,3 @@
----
-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
-# tasks file for azure
-- import_tasks: create_vmss.yml
+- name: "determine if accumulo needs to be initialized on adlsgen2"
+ command: "{{ hadoop_home }}/bin/hdfs dfs -stat {{
instance_volumes_preferred[0] }}"
+ register: adlsgen2_stat
+ changed_when: adlsgen2_stat.rc != 0
+ failed_when: adlsgen2_stat.rc != 0 and 'No such file or directory' not in
adlsgen2_stat.stderr
+ notify: Initialize Apache Accumulo on ADLS Gen2 volume
diff --git a/ansible/roles/accumulo/templates/accumulo-env.sh
b/ansible/roles/accumulo/templates/accumulo-env.sh
index a6a1bc6..083007b 100755
--- a/ansible/roles/accumulo/templates/accumulo-env.sh
+++ b/ansible/roles/accumulo/templates/accumulo-env.sh
@@ -41,6 +41,10 @@ export HADOOP_HOME={{ hadoop_home }}
export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
CLASSPATH="${conf}:${lib}/*:${HADOOP_CONF_DIR}:${ZOOKEEPER_HOME}/*:${HADOOP_HOME}/share/hadoop/client/*"
+{% if use_adlsg2 == True %}
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*"
+{% endif %}
export CLASSPATH
JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}"
@@ -50,6 +54,9 @@ JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}"
'-XX:OnOutOfMemoryError=kill -9 %p'
'-XX:-OmitStackTraceInFastThrow'
'-Djava.net.preferIPv4Stack=true'
+{% if use_adlsg2 == True %}
+ '-Dorg.wildfly.openssl.path=/usr/lib64'
+{% endif %}
"-Daccumulo.native.lib.path=${lib}/native")
case "$cmd" in
diff --git a/ansible/roles/accumulo/templates/accumulo.properties
b/ansible/roles/accumulo/templates/accumulo.properties
index 895cc99..eac3ddf 100644
--- a/ansible/roles/accumulo/templates/accumulo.properties
+++ b/ansible/roles/accumulo/templates/accumulo.properties
@@ -42,3 +42,9 @@ tserver.server.threads.minimum=64
## The maximum size for each write-ahead log
tserver.walog.max.size=512M
+
+{% if use_adlsg2 == True %}
+general.volume.chooser=org.apache.accumulo.server.fs.PreferredVolumeChooser
+general.custom.volume.preferred.default={{ instance_volumes_preferred }}
+general.custom.volume.preferred.logger={{ hdfs_root }}/accumulo
+{% endif %}
diff --git a/ansible/roles/azure/tasks/create_adlsgen2.yml
b/ansible/roles/azure/tasks/create_adlsgen2.yml
new file mode 100644
index 0000000..cd674dd
--- /dev/null
+++ b/ansible/roles/azure/tasks/create_adlsgen2.yml
@@ -0,0 +1,235 @@
+---
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# These Ansible tasks only run on the client machine where Muchos runs
+# At a high level, the various sections in this file do the following:
+# 1. Create an Azure ADLS Gen2 storage account.
+# 2. Create User Assigned Identity.
+# 3. Assign roles to storage accounts.
+# 4. Create filesystem/container in storage accounts.
+# 5. Update tenant_id, client_id and instance_volumes_preferred in
muchos.props.
+# 6. Assign User Assigned Identity to VMSS.
+
+- name: Generate MD5 checksum based on resource_group name, vmss_name and
cluster name
+ shell: echo -n {{ resource_group + vmss_name + location }}|md5sum|tr -cd
"[:alnum:]"|cut -c 1-16|tr '[:upper:]' '[:lower:]'
+ register: StorageAccountMD5
+
+- name: Generate random names for storage account names
+ set_fact:
+ StorageAccountName: "{{ StorageAccountMD5.stdout +
99|random(seed=resource_group)|string + 99|random(seed=vmss_name)|string +
9|random(seed=location)|string }}"
+
+- name: Initialize instance variables
+ set_fact:
+ InstanceVolumesAuto: []
+ InstanceVolumesManual: []
+
+- name: Validate instance_volumes_input
+ fail: msg="Variable instance_volumes_input incorrectly specified, Both
Manual and Auto cannot be specified at same time"
+ when: instance_volumes_input.split('|')[0].split(',') != [''] and
instance_volumes_input.split('|')[1].split(',') != ['']
+
+- name: Assign manual or autogenerated volumes
+ set_fact:
+ InstanceVolumesTemp: "{{
instance_volumes_input.split('|')[0].split(',')|list if
instance_volumes_input.split('|')[0].split(',') != [''] else
instance_volumes_input.split('|')[1].split(',')|list }}"
+
+- name: Retrieve sequence end number to get the number of storage accounts
+ set_fact:
+ InstanceVolumesEndSequence: "{{ '1' if
instance_volumes_input.split('|')[0].split(',') == [''] else
InstanceVolumesTemp[0]|int }}"
+
+- name: Generate names for Storage Accounts
+ set_fact:
+ InstanceVolumesAuto: "{{ InstanceVolumesAuto +
['abfss://'+'accumulodata'+'@'+StorageAccountName+item+'.'+InstanceVolumesTemp[1]+'/accumulo']
}}"
+ with_sequence: start=1 end={{ InstanceVolumesEndSequence|int }}
+ when: InstanceVolumesTemp[0]|int != 0
+
+- name: Retrieve ABFSS values when specified manually
+ set_fact:
+ InstanceVolumesManual: "{{ InstanceVolumesManual + [ item ] }}"
+ loop:
+ "{{ InstanceVolumesTemp }}"
+ when: item.split('://')[0] == 'abfss' and
instance_volumes_input.split('|')[0].split(',') == ['']
+
+# This is final list of instance volumes
+- name: Assign variables for autogeneration or manual for storage account
creation
+ set_fact:
+ InstanceVolumes: "{{ InstanceVolumesManual if
instance_volumes_input.split('|')[0].split(',') == [''] else
InstanceVolumesAuto }}"
+
+- name: Update instance_volumes_preferred in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp:
'^instance_volumes_preferred\s*=\s*|^[#]instance_volumes_preferred\s*=\s*'
+ line: "instance_volumes_preferred = {{ InstanceVolumes|join(',') }}"
+
+# Not registering variable because storage values are not visible immediately
+- name: Create ADLS Gen2 storage acount using REST API
+ azure_rm_resource:
+ resource_group: "{{ resource_group }}"
+ provider: Storage
+ resource_type: storageAccounts
+ resource_name: "{{ item.split('@')[1].split('.')[0] }}"
+ api_version: '2019-04-01'
+ idempotency: yes
+ state: present
+ body:
+ sku:
+ name: "{{ adls_storage_type }}"
+ kind: StorageV2
+ properties:
+ isHnsEnabled: yes
+ location: "{{ location }}"
+ loop:
+ "{{ InstanceVolumes }}"
+
+# Creating User Assigned identity with vmss_name suffixed by ua-msi if not
specified in muchos.props
+# Not registering variable because user identity values are not visible
immediately
+- name: Create User Assigned Identity
+ azure_rm_resource:
+ resource_group: "{{ resource_group }}"
+ provider: ManagedIdentity
+ resource_type: userAssignedIdentities
+ resource_name: "{{ user_assigned_identity if user_assigned_identity !=''
else vmss_name + '-ua-msi' }}"
+ api_version: '2018-11-30'
+ idempotency: yes
+ state: present
+ body:
+ location: "{{ location }}"
+
+# Retrieving facts about User Assigned Identity
+- name: Get facts for User Assigned Identity
+ azure_rm_resource_facts:
+ resource_group: "{{ resource_group }}"
+ provider: ManagedIdentity
+ resource_type: userAssignedIdentities
+ resource_name: "{{ user_assigned_identity if user_assigned_identity !=''
else vmss_name + '-ua-msi' }}"
+ api_version: '2018-11-30'
+ register: UserAssignedIdentityInfo
+ retries: 20
+ delay: 15
+ until:
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('')
is defined
+
+- name: Update principal_id in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^principal_id\s*=\s*|^[#]principal_id\s*=\s*'
+ line: "principal_id = {{
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('')
}}"
+
+# This will be used to assign the MSI for VMSS
+- name: Format User Assigned Identity for API
+ set_fact:
+ UserAssignedIdentityArr: "{{
UserAssignedIdentityInfo.response|default({})|map(attribute='id')|map('regex_replace','^(.*)$','{\"\\1\":{}}')|list}}"
+
+# Retrieve facts about role assignment
+- name: Get role definition id for "Storage Blob Data Contributor"
+ azure_rm_resource_facts:
+ resource_group: "{{ resource_group }}"
+ provider: Authorization
+ resource_type: roleDefinitions
+ resource_name: ba92f5b4-2d11-453d-a403-e96b0029c9fe
+ api_version: '2015-07-01'
+ register: RoleDefinitionInfo
+
+# Retrieve storage account information.
+- name: Check if the storage accounts is visible
+ azure_rm_storageaccount_facts:
+ resource_group: "{{ resource_group }}"
+ name: "{{ item.split('@')[1].split('.')[0] }}"
+ register: StorageAccountsInfo
+ retries: 20
+ delay: 15
+ until:
StorageAccountsInfo.storageaccounts|sum(start=[])|map(attribute='id')|join('')
is defined
+ loop:
+ "{{ InstanceVolumes }}"
+
+# Retrieve the ids of the storage accounts created -- used for role assignments
+- name: Get the id of storage accounts created
+ set_fact:
+ StorageAccountsId:
"{{StorageAccountsInfo.results|map(attribute='ansible_facts')|map(attribute='azure_storageaccounts')|sum(start=[])|map(attribute='id')|list|unique
}}"
+
+# Adding this module since role assignment fails if it already exists.
+- name: Get facts about role assignment
+ azure_rm_roleassignment_facts:
+ scope: "{{ item }}"
+ assignee: "{{
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('')
}}"
+ role_definition_id: "{{
RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}"
+ register: RoleAssignmentResults
+ retries: 20
+ delay: 15
+ until:
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('')
is defined and RoleDefinitionInfo.response|map(attribute='id')|join('') is
defined
+ loop:
+ "{{ StorageAccountsId }}"
+
+- name: Set fact for getting storage accounts that have assigned roles
+ set_fact:
+ StorageAccountRoles: "{{ item|map(attribute='scope')|list|unique }}"
+ no_log: True
+ loop:
+ "{{RoleAssignmentResults.results|map(attribute='roleassignments')|list
}}"
+
+# This retry logic is needed due to race condition between storage account
create complete and role assignment
+- name: Create a role assignment
+ azure_rm_roleassignment:
+ scope: "{{ item }}"
+ assignee_object_id: "{{
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('')
}}"
+ role_definition_id: "{{
RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}"
+ state: present
+ retries: 30
+ delay: 15
+ register: roleassignresult
+ until: roleassignresult is succeeded
+ loop:
+ "{{ StorageAccountsId }}"
+ when: item not in StorageAccountRoles
+
+# This retry logic is needed due to race condition between storage account
creation and creating filesystem
+- name: Create container/Filesystem on ADLS Gen2
+ azure_rm_storageblob:
+ resource_group: "{{ resource_group }}"
+ storage_account_name: "{{ item.split('@')[1].split('.')[0] }}"
+ container: "{{ item.split('@')[0].split('://')[1] }}"
+ retries: 30
+ delay: 15
+ register: createfsresult
+ until: createfsresult is succeeded and (createfsresult.changed == False or
(createfsresult.changed == True and createfsresult.container|length > 0))
+ loop:
+ "{{ InstanceVolumes }}"
+
+# Retrieve tenantId for core-site.xml
+- name: Update tenantId in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^azure_tenant_id\s*=\s*|^[#]azure_tenant_id\s*=\s*'
+ line: "azure_tenant_id = {{
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='tenantId')|list|join('')
}}"
+
+# Retrieve clientId for core-site.xml
+- name: Update clientid in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^azure_client_id\s*=\s*|^[#]azure_client_id\s*=\s*'
+ line: "azure_client_id = {{
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='clientId')|list|join('')
}}"
+
+- name: Assign User Assigned Identity to VMSS
+ azure_rm_resource:
+ resource_group: "{{ resource_group }}"
+ provider: Compute
+ resource_type: virtualMachineScaleSets
+ resource_name: "{{ vmss_name }}"
+ api_version: '2019-03-01'
+ body:
+ location: "{{ location }}"
+ identity:
+ type: UserAssigned
+ userAssignedIdentities: "{{ UserAssignedIdentityArr|join('') }}"
diff --git a/ansible/roles/azure/tasks/main.yml
b/ansible/roles/azure/tasks/main.yml
index 6ec80d7..a846779 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/azure/tasks/main.yml
@@ -19,3 +19,5 @@
# tasks file for azure
- import_tasks: create_vmss.yml
+- import_tasks: create_adlsgen2.yml
+ when: use_adlsg2 == True
diff --git a/ansible/roles/hadoop-ha/tasks/main.yml
b/ansible/roles/hadoop-ha/tasks/main.yml
index 7f456c8..dd92ae1 100644
--- a/ansible/roles/hadoop-ha/tasks/main.yml
+++ b/ansible/roles/hadoop-ha/tasks/main.yml
@@ -54,3 +54,11 @@
replace: "export HADOOP_LOG_DIR={{ worker_data_dirs[0] }}/logs/hadoop"
- name: "Create hadoop log dir"
file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
+- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh
+ blockinfile:
+ path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+ insertafter: EOF
+ block: |
+ export HADOOP_OPTIONAL_TOOLS=hadoop-azure
+ export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64
${HADOOP_OPTS}"
+ when: hadoop_major_version == '3' and use_adlsg2 == True
diff --git a/ansible/roles/hadoop-ha/templates/core-site.xml
b/ansible/roles/hadoop-ha/templates/core-site.xml
index dd54827..d717c5c 100644
--- a/ansible/roles/hadoop-ha/templates/core-site.xml
+++ b/ansible/roles/hadoop-ha/templates/core-site.xml
@@ -38,4 +38,34 @@
<name>ha.zookeeper.quorum</name>
<value>{{ zookeeper_connect }}</value>
</property>
+{% if use_adlsg2 == True %}
+ <property>
+ <name>fs.azure.account.auth.type</name>
+ <value>OAuth</value>
+ </property>
+ <property>
+ <name>fs.azure.account.oauth.provider.type</name>
+ <value>org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider</value>
+ </property>
+ <property>
+ <name>fs.azure.account.oauth2.msi.tenant</name>
+ <value>{{ azure_tenant_id}}</value>
+ </property>
+ <property>
+ <name>fs.azure.account.oauth2.client.id</name>
+ <value>{{ azure_client_id }}</value>
+ </property>
+ <property>
+ <name>fs.azure.use.upn</name>
+ <value>true</value>
+ </property>
+ <property>
+
<name>fs.azure.identity.transformer.service.principal.substitution.list</name>
+ <value>*</value>
+ </property>
+ <property>
+ <name>fs.azure.identity.transformer.service.principal.id</name>
+ <value>{{ principal_id }}</value>
+ </property>
+{% endif %}
</configuration>
diff --git a/ansible/roles/hadoop-ha/templates/mapred-site.xml
b/ansible/roles/hadoop-ha/templates/mapred-site.xml
index c6be0ce..c3def16 100644
--- a/ansible/roles/hadoop-ha/templates/mapred-site.xml
+++ b/ansible/roles/hadoop-ha/templates/mapred-site.xml
@@ -54,4 +54,10 @@
<value>HADOOP_MAPRED_HOME={{ hadoop_home }}</value>
</property>
{% endif %}
+{% if use_adlsg2 == True %}
+ <property>
+ <name>mapreduce.application.classpath</name>
+
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+ </property>
+{% endif %}
</configuration>
diff --git a/ansible/roles/hadoop-ha/templates/yarn-site.xml
b/ansible/roles/hadoop-ha/templates/yarn-site.xml
index 85033a6..eb45896 100644
--- a/ansible/roles/hadoop-ha/templates/yarn-site.xml
+++ b/ansible/roles/hadoop-ha/templates/yarn-site.xml
@@ -93,4 +93,10 @@
<name>twill.java.reserved.memory.mb</name>
<value>{{ twill_reserve_mem_mb }}</value>
</property>
+ {% if use_adlsg2 == True %}
+ <property>
+ <name>yarn.application.classpath</name>
+
<value>${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+ </property>
+ {% endif %}
</configuration>
diff --git a/ansible/roles/hadoop/tasks/main.yml
b/ansible/roles/hadoop/tasks/main.yml
index d0219b3..a6733a9 100644
--- a/ansible/roles/hadoop/tasks/main.yml
+++ b/ansible/roles/hadoop/tasks/main.yml
@@ -55,3 +55,11 @@
- name: "Create hadoop log dir"
file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
+- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh
+ blockinfile:
+ path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+ insertafter: EOF
+ block: |
+ export HADOOP_OPTIONAL_TOOLS=hadoop-azure
+ export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64
${HADOOP_OPTS}"
+ when: hadoop_major_version == '3' and use_adlsg2 == True
diff --git a/ansible/roles/hadoop/templates/core-site.xml
b/ansible/roles/hadoop/templates/core-site.xml
index 56232aa..c5f1597 100644
--- a/ansible/roles/hadoop/templates/core-site.xml
+++ b/ansible/roles/hadoop/templates/core-site.xml
@@ -36,4 +36,34 @@
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
+{% if use_adlsg2 == True %}
+ <property>
+ <name>fs.azure.account.auth.type</name>
+ <value>OAuth</value>
+ </property>
+ <property>
+ <name>fs.azure.account.oauth.provider.type</name>
+ <value>org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider</value>
+ </property>
+ <property>
+ <name>fs.azure.account.oauth2.msi.tenant</name>
+ <value>{{ azure_tenant_id}}</value>
+ </property>
+ <property>
+ <name>fs.azure.account.oauth2.client.id</name>
+ <value>{{ azure_client_id }}</value>
+ </property>
+ <property>
+ <name>fs.azure.use.upn</name>
+ <value>true</value>
+ </property>
+ <property>
+
<name>fs.azure.identity.transformer.service.principal.substitution.list</name>
+ <value>*</value>
+ </property>
+ <property>
+ <name>fs.azure.identity.transformer.service.principal.id</name>
+ <value>{{ principal_id }}</value>
+ </property>
+{% endif %}
</configuration>
diff --git a/ansible/roles/hadoop/templates/mapred-site.xml
b/ansible/roles/hadoop/templates/mapred-site.xml
index a95eb77..7ecf751 100644
--- a/ansible/roles/hadoop/templates/mapred-site.xml
+++ b/ansible/roles/hadoop/templates/mapred-site.xml
@@ -56,4 +56,10 @@
<value>HADOOP_MAPRED_HOME={{ hadoop_home }}</value>
</property>
{% endif %}
+{% if use_adlsg2 == True %}
+ <property>
+ <name>mapreduce.application.classpath</name>
+
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+ </property>
+{% endif %}
</configuration>
diff --git a/ansible/roles/hadoop/templates/yarn-site.xml
b/ansible/roles/hadoop/templates/yarn-site.xml
index ac62174..847f98b 100644
--- a/ansible/roles/hadoop/templates/yarn-site.xml
+++ b/ansible/roles/hadoop/templates/yarn-site.xml
@@ -82,4 +82,10 @@
<name>twill.java.reserved.memory.mb</name>
<value>{{ twill_reserve_mem_mb }}</value>
</property>
+ {% if use_adlsg2 == True %}
+ <property>
+ <name>yarn.application.classpath</name>
+
<value>${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+ </property>
+ {% endif %}
</configuration>
diff --git a/conf/muchos.props.example b/conf/muchos.props.example
index b34a437..41cb5d5 100644
--- a/conf/muchos.props.example
+++ b/conf/muchos.props.example
@@ -129,6 +129,31 @@ metrics_drive_root = var-data
# Optional proxy VM. If not set, the first node of the cluster will be
selected as the proxy.
azure_proxy_host =
location = westus2
+# Enable ADLS Gen2 storage configuration. Muchos parameters
instance_volumes_input, instance_volumes_preferred & adls_storage_type are not
required if use_adlsg2 is false.
+use_adlsg2 = False
+# Storage accounts can be auto generated or manually specified. "|" is used as
separator between manual and auto generated storage account names and must be
specified
+# Manual and Auto generated names are mutually exclusive
+#
+# Specifying storage accounts manually:
+#
|abfss://<container-name>@<storage-account-name>.<domain-name>/<folder-name>
(use a comma to separate multiple entries)
+#
Example:|abfss://[email protected]/accumulo,abfss://[email protected]/accumulo
+#
+# Specifying auto-generated storage accounts:
+# <Number-of-Storage-Accounts>,<domain-name>|
+# Example: 3,dfs.core.windows.net|
+instance_volumes_input = 1,dfs.core.windows.net|
+# Do not update "instance_volumes_preferred", it will be populated dynamically
during launch phase of muchos
+instance_volumes_preferred =
+# Type of storage for ADLS Gen2 storage accounts
+adls_storage_type = Standard_LRS
+# Specify user assigned identity name. "{{ vmss_name }}-ua-msi" will be
created if value is not provided
+user_assigned_identity =
+# Do not update "azure_tenant_id", it will be populated dynamically during
launch phase of muchos
+azure_tenant_id =
+# Do not update "azure_client_id", it will be populated dynamically during
launch phase of muchos
+azure_client_id =
+# Do not update "principal_id", it will be populated dynamically during launch
phase of muchos when "use_adlsg2 = True"
+principal_id =
# Optional Azure fileshare to mount on all nodes.
# Path and credentials must be updated to enable this.
#azure_fileshare_mount = /mnt/azure-fileshare
diff --git a/lib/muchos/config/azure.py b/lib/muchos/config/azure.py
index fe93b55..86c584c 100644
--- a/lib/muchos/config/azure.py
+++ b/lib/muchos/config/azure.py
@@ -104,4 +104,29 @@ class AzureDeployConfig(BaseConfig):
@ansible_host_var(name='az_logs_key')
@default(None)
def logs_key(self):
- return self.get('azure', 'az_logs_key')
\ No newline at end of file
+ return self.get('azure', 'az_logs_key')
+
+ @ansible_host_var(name='use_adlsg2')
+ @default(None)
+ def use_adlsg2(self):
+ return self.get('azure', 'use_adlsg2')
+
+ @ansible_host_var(name='azure_tenant_id')
+ @default(None)
+ def azure_tenant_id(self):
+ return self.get('azure', 'azure_tenant_id')
+
+ @ansible_host_var(name='azure_client_id')
+ @default(None)
+ def azure_client_id(self):
+ return self.get('azure', 'azure_client_id')
+
+ @ansible_host_var(name='principal_id')
+ @default(None)
+ def principal_id(self):
+ return self.get('azure', 'principal_id')
+
+ @ansible_host_var(name='instance_volumes_preferred')
+ @default(None)
+ def instance_volumes_preferred(self):
+ return self.get('azure', 'instance_volumes_preferred')