// Source: torrust-tracker-deployer/src/application/command_handlers/provision/handler.rs
// (branch `272-add-https-support-with-caddy`, repository torrust/torrust-tracker-deployer on GitHub)

//! Provision command handler implementation

use std::net::{IpAddr, SocketAddr};
use std::sync::Arc;

use tracing::{error, info, instrument};

use super::errors::ProvisionCommandHandlerError;
use crate::adapters::ansible::AnsibleClient;
use crate::adapters::ssh::{SshConfig, SshCredentials};
use crate::adapters::tofu::client::InstanceInfo;
use crate::adapters::OpenTofuClient;
use crate::application::command_handlers::common::StepResult;
use crate::application::services::AnsibleTemplateService;
use crate::application::steps::{
    ApplyInfrastructureStep, GetInstanceInfoStep, InitializeInfrastructureStep,
    PlanInfrastructureStep, RenderOpenTofuTemplatesStep, ValidateInfrastructureStep,
    WaitForCloudInitStep, WaitForSSHConnectivityStep,
};
use crate::domain::environment::repository::{EnvironmentRepository, TypedEnvironmentRepository};
use crate::domain::environment::runtime_outputs::ProvisionMethod;
use crate::domain::environment::state::{ProvisionFailureContext, ProvisionStep};
use crate::domain::environment::{Environment, Provisioned, Provisioning};
use crate::domain::EnvironmentName;
use crate::infrastructure::templating::tofu::TofuProjectGenerator;
use crate::shared::error::Traceable;

/// `ProvisionCommandHandler` orchestrates the complete infrastructure provisioning workflow
///
/// This command handler handles all steps required to provision infrastructure:
/// 1. Render `OpenTofu` templates
/// 2. Initialize `OpenTofu`
/// 3. Validate configuration syntax and consistency
/// 4. Plan infrastructure
/// 5. Apply infrastructure
/// 6. Get instance information
/// 7. Render `Ansible` templates (with runtime IP address)
/// 8. Wait for SSH connectivity
/// 9. Wait for cloud-init completion
///
/// # State Management
///
/// The command handler integrates with the type-state pattern for environment lifecycle:
/// - Loads the environment by name and requires it to be in the `Created` state
/// - Transitions to `Environment<Provisioning>` at start
/// - Returns `Environment<Provisioned>` on success
/// - Transitions to `Environment<ProvisionFailed>` on error
///
/// State is persisted after each transition using the injected repository.
/// A failure to persist the `Provisioning` or `Provisioned` state is returned
/// to the caller as an error.
pub struct ProvisionCommandHandler {
    /// Time source used for the workflow start timestamp, failure contexts and trace files.
    clock: Arc<dyn crate::shared::Clock>,
    /// Typed wrapper around the environment repository, used to load and save state.
    repository: TypedEnvironmentRepository,
}

impl ProvisionCommandHandler {
    /// Build a `ProvisionCommandHandler` from its collaborators
    ///
    /// # Arguments
    ///
    /// * `clock` - Time source stored for timestamps and failure contexts
    /// * `repository` - Environment repository; it is wrapped in a
    ///   `TypedEnvironmentRepository` before being stored
    #[must_use]
    pub fn new(
        clock: Arc<dyn crate::shared::Clock>,
        repository: Arc<dyn EnvironmentRepository>,
    ) -> Self {
        let repository = TypedEnvironmentRepository::new(repository);

        Self { clock, repository }
    }

    /// Execute the complete provisioning workflow
    ///
    /// The environment is loaded in the `Created` state, moved to `Provisioning`
    /// (and persisted), and then the provisioning workflow is run. The outcome
    /// decides the final state transition.
    ///
    /// # Arguments
    ///
    /// * `env_name` - The name of the environment to provision
    ///
    /// # Returns
    ///
    /// Returns the provisioned environment
    ///
    /// # Errors
    ///
    /// Returns an error if any step in the provisioning workflow fails:
    /// * Environment not found or not in `Created` state
    /// * Persisting the `Provisioning` or `Provisioned` state fails
    /// * Template rendering fails
    /// * `OpenTofu` initialization, planning, or apply fails
    /// * Unable to retrieve instance information
    /// * SSH connectivity cannot be established
    /// * Cloud-init does not complete successfully
    ///
    /// When the workflow itself fails, the environment transitions to the
    /// `ProvisionFailed` state and an attempt is made to persist it. If that
    /// persistence attempt fails too, the failure is logged and the original
    /// workflow error is still the one returned, so the root cause is never
    /// hidden behind a storage error.
    #[instrument(
        name = "provision_command",
        skip_all,
        fields(
            command_type = "provision",
            environment = %env_name
        )
    )]
    pub async fn execute(
        &self,
        env_name: &EnvironmentName,
    ) -> Result<Environment<Provisioned>, ProvisionCommandHandlerError> {
        let environment = self.load_created_environment(env_name)?;

        // Captured before any work starts so the failure context can report timing.
        let started_at = self.clock.now();

        let environment = environment.start_provisioning();

        self.repository.save_provisioning(&environment)?;

        // Execute provisioning workflow with explicit step tracking
        // This allows us to know exactly which step failed if an error occurs
        match self.execute_provisioning_workflow(&environment).await {
            Ok(provisioned) => {
                info!(
                    command = "provision",
                    environment = %provisioned.name(),
                    instance_ip = ?provisioned.instance_ip(),
                    "Infrastructure provisioning completed successfully"
                );

                self.repository.save_provisioned(&provisioned)?;

                Ok(provisioned)
            }
            Err((e, current_step)) => {
                error!(
                    command = "provision",
                    environment = %environment.name(),
                    error = %e,
                    step = ?current_step,
                    "Infrastructure provisioning failed"
                );

                let context =
                    self.build_failure_context(&environment, &e, current_step, started_at);
                let failed = environment.provision_failed(context);

                // Saving the failed state is best-effort: propagating a storage
                // error here would replace the provisioning error the caller
                // actually needs to see.
                if let Err(persist_err) = self.repository.save_provision_failed(&failed) {
                    let persist_err: ProvisionCommandHandlerError = persist_err.into();
                    error!(
                        command = "provision",
                        environment = %env_name,
                        error = %persist_err,
                        "Failed to persist provision-failed environment state"
                    );
                }

                Err(e)
            }
        }
    }

    /// Run every provisioning phase in order
    ///
    /// The phases are:
    /// 1. Infrastructure provisioning (`OpenTofu`), which yields the instance IP
    /// 2. Configuration preparation (Ansible template rendering)
    /// 3. Waiting for the instance to become ready
    ///
    /// Once all phases succeed, a clone of the environment is transitioned to
    /// `Provisioned` with the instance IP and `ProvisionMethod::Provisioned`.
    ///
    /// # Errors
    ///
    /// Returns a tuple of (error, `current_step`) from the first phase that
    /// fails, so the caller can build an accurate failure context.
    ///
    /// # Returns
    ///
    /// Returns the provisioned environment with instance IP and provision method set
    async fn execute_provisioning_workflow(
        &self,
        environment: &Environment<Provisioning>,
    ) -> StepResult<Environment<Provisioned>, ProvisionCommandHandlerError, ProvisionStep> {
        // Phase 1: create the infrastructure and learn the instance address.
        let ip = self.provision_infrastructure(environment).await?;

        // Phase 2: render the Ansible templates against that address.
        self.prepare_for_configuration(environment, ip).await?;

        // Phase 3: block until the instance is reachable and initialised.
        self.wait_for_system_readiness(environment, ip).await?;

        Ok(environment
            .clone()
            .provisioned(ip, ProvisionMethod::Provisioned))
    }

    // Private helper methods - organized from higher to lower level of abstraction

    /// Provision infrastructure using `OpenTofu`
    ///
    /// Runs, in order:
    /// - rendering of the `OpenTofu` templates
    /// - instance creation (`create_instance`)
    /// - retrieval of the instance information
    ///
    /// Each stage's error is paired with the `ProvisionStep` it is attributed to.
    ///
    /// NOTE(review): any error coming out of `create_instance` is attributed to
    /// `ProvisionStep::OpenTofuInit`, although that helper also runs validate,
    /// plan and apply. Confirm whether finer-grained step reporting is wanted.
    ///
    /// # Arguments
    ///
    /// * `environment` - The environment in Provisioning state
    ///
    /// # Returns
    ///
    /// Returns the IP address of the provisioned instance
    ///
    /// # Errors
    ///
    /// Returns a tuple of (error, `current_step`) if any provisioning step fails
    async fn provision_infrastructure(
        &self,
        environment: &Environment<Provisioning>,
    ) -> StepResult<IpAddr, ProvisionCommandHandlerError, ProvisionStep> {
        let (renderer, opentofu) = Self::build_infrastructure_dependencies(environment);

        self.render_opentofu_templates(&renderer)
            .await
            .map_err(|e| (e, ProvisionStep::RenderOpenTofuTemplates))?;

        Self::create_instance(&opentofu).map_err(|e| (e, ProvisionStep::OpenTofuInit))?;

        let instance_info = Self::get_instance_info(&opentofu)
            .map_err(|e| (e, ProvisionStep::GetInstanceInfo))?;

        Ok(instance_info.ip_address)
    }

    /// Assemble the collaborators needed for infrastructure provisioning
    ///
    /// The `OpenTofu` client is created for the environment's tofu build
    /// directory. The project generator is created from a template manager
    /// (rooted at the environment's templates directory) plus the build
    /// directory, SSH credentials, SSH port, instance name and provider
    /// configuration read from the environment.
    ///
    /// # Arguments
    ///
    /// * `environment` - The environment in Provisioning state
    ///
    /// # Returns
    ///
    /// Returns a tuple of:
    /// - `TofuProjectGenerator` - For rendering `OpenTofu` templates
    /// - `OpenTofuClient` - For executing `OpenTofu` operations
    fn build_infrastructure_dependencies(
        environment: &Environment<Provisioning>,
    ) -> (Arc<TofuProjectGenerator>, Arc<OpenTofuClient>) {
        let client = OpenTofuClient::new(environment.tofu_build_dir());

        let templates = crate::domain::TemplateManager::new(environment.templates_dir());

        let generator = TofuProjectGenerator::new(
            Arc::new(templates),
            environment.build_dir(),
            environment.ssh_credentials().clone(),
            environment.ssh_port(),
            environment.instance_name().clone(),
            environment.provider_config().clone(),
        );

        (Arc::new(generator), Arc::new(client))
    }

    /// Prepare for configuration stages
    ///
    /// Renders the Ansible templates from the environment's user inputs and the
    /// runtime instance IP, using a template service built from the
    /// environment's templates and build directories.
    ///
    /// # Arguments
    ///
    /// * `environment` - The environment in Provisioning state
    /// * `instance_ip` - IP address of the provisioned instance
    ///
    /// # Errors
    ///
    /// Returns a tuple of (error, `current_step`) if rendering fails. The
    /// rendering error is converted to its string form and wrapped in
    /// `ProvisionCommandHandlerError::TemplateRendering`; the step is
    /// `ProvisionStep::RenderAnsibleTemplates`.
    async fn prepare_for_configuration(
        &self,
        environment: &Environment<Provisioning>,
        instance_ip: IpAddr,
    ) -> StepResult<(), ProvisionCommandHandlerError, ProvisionStep> {
        let service = AnsibleTemplateService::from_paths(
            environment.templates_dir(),
            environment.build_dir().clone(),
        );

        let rendered = service
            .render_templates(&environment.context().user_inputs, instance_ip, None)
            .await;

        match rendered {
            Ok(_) => Ok(()),
            Err(e) => Err((
                ProvisionCommandHandlerError::TemplateRendering(e.to_string()),
                ProvisionStep::RenderAnsibleTemplates,
            )),
        }
    }

    /// Wait for system readiness
    ///
    /// Builds the Ansible client for the environment and delegates to
    /// `wait_for_readiness`, passing the environment's SSH credentials and
    /// configured SSH port together with the instance IP.
    ///
    /// NOTE(review): every error from `wait_for_readiness` is attributed to
    /// `ProvisionStep::WaitSshConnectivity`, including a failure of the
    /// cloud-init wait it performs. Confirm whether that is intended.
    ///
    /// # Arguments
    ///
    /// * `environment` - The environment in Provisioning state
    /// * `instance_ip` - IP address of the provisioned instance
    ///
    /// # Errors
    ///
    /// Returns a tuple of (error, `current_step`) if any readiness check fails
    async fn wait_for_system_readiness(
        &self,
        environment: &Environment<Provisioning>,
        instance_ip: IpAddr,
    ) -> StepResult<(), ProvisionCommandHandlerError, ProvisionStep> {
        let client = Self::build_ansible_client(environment);

        self.wait_for_readiness(
            &client,
            environment.ssh_credentials(),
            instance_ip,
            environment.ssh_port(),
        )
        .await
        .map_err(|e| (e, ProvisionStep::WaitSshConnectivity))
    }

    /// Build the Ansible client for an environment
    ///
    /// The client is created for the environment's Ansible build directory and
    /// returned behind an `Arc` so it can be shared with steps.
    ///
    /// # Arguments
    ///
    /// * `environment` - The environment in Provisioning state
    ///
    /// # Returns
    ///
    /// Returns `AnsibleClient` for executing Ansible playbooks
    fn build_ansible_client(environment: &Environment<Provisioning>) -> Arc<AnsibleClient> {
        let client = AnsibleClient::new(environment.ansible_build_dir());

        Arc::new(client)
    }

    /// Render `OpenTofu` templates
    ///
    /// Runs `RenderOpenTofuTemplatesStep` with a shared handle to the given
    /// project generator.
    ///
    /// # Errors
    ///
    /// Returns an error if the rendering step fails
    async fn render_opentofu_templates(
        &self,
        tofu_template_renderer: &Arc<TofuProjectGenerator>,
    ) -> Result<(), ProvisionCommandHandlerError> {
        // The step takes its own handle; the caller keeps the original.
        let renderer = Arc::clone(tofu_template_renderer);

        RenderOpenTofuTemplatesStep::new(renderer).execute().await?;

        Ok(())
    }

    /// Create the infrastructure instance using `OpenTofu`
    ///
    /// Runs the four `OpenTofu` steps in sequence, stopping at the first failure:
    /// 1. Initialize
    /// 2. Validate
    /// 3. Plan
    /// 4. Apply
    ///
    /// # Errors
    ///
    /// Returns an error if any `OpenTofu` operation fails
    fn create_instance(
        opentofu_client: &Arc<OpenTofuClient>,
    ) -> Result<(), ProvisionCommandHandlerError> {
        // Each step receives its own handle to the shared client.
        let client = || Arc::clone(opentofu_client);

        InitializeInfrastructureStep::new(client()).execute()?;
        ValidateInfrastructureStep::new(client()).execute()?;
        PlanInfrastructureStep::new(client()).execute()?;
        ApplyInfrastructureStep::new(client()).execute()?;

        Ok(())
    }

    /// Get instance information from `OpenTofu`
    ///
    /// Runs `GetInstanceInfoStep` with a shared handle to the client and returns
    /// the resulting `InstanceInfo`.
    ///
    /// # Errors
    ///
    /// Returns an error if instance information cannot be retrieved
    fn get_instance_info(
        opentofu_client: &Arc<OpenTofuClient>,
    ) -> Result<InstanceInfo, ProvisionCommandHandlerError> {
        let fetched = GetInstanceInfoStep::new(Arc::clone(opentofu_client)).execute();

        Ok(fetched?)
    }

    /// Wait for SSH connectivity, then for cloud-init completion
    ///
    /// An SSH configuration is built from a copy of the credentials and the
    /// socket address formed by `instance_ip` and `ssh_port`. The SSH
    /// connectivity step runs first; the cloud-init step runs only if it succeeds.
    ///
    /// # Arguments
    ///
    /// * `ansible_client` - Ansible client handed to the cloud-init wait step
    /// * `ssh_credentials` - SSH credentials for connecting to the instance
    /// * `instance_ip` - IP address of the provisioned instance
    /// * `ssh_port` - The configured SSH port to wait for
    ///
    /// # Errors
    ///
    /// Returns an error if SSH connectivity fails or cloud-init does not complete
    async fn wait_for_readiness(
        &self,
        ansible_client: &Arc<AnsibleClient>,
        ssh_credentials: &SshCredentials,
        instance_ip: IpAddr,
        ssh_port: u16,
    ) -> Result<(), ProvisionCommandHandlerError> {
        let target = SshConfig::new(
            ssh_credentials.clone(),
            SocketAddr::new(instance_ip, ssh_port),
        );
        WaitForSSHConnectivityStep::new(target).execute().await?;

        let client = Arc::clone(ansible_client);
        WaitForCloudInitStep::new(client).execute()?;

        Ok(())
    }

    /// Build failure context for a provisioning error and generate trace file
    ///
    /// The context combines the failed step, the error kind reported by the
    /// error itself, and a base context built from the clock, the start
    /// timestamp and the error message. A trace file is then written to the
    /// environment's traces directory; when that succeeds its path is recorded
    /// in the context. When it fails, the context is returned as built, without
    /// changing the trace file path.
    ///
    /// # Arguments
    ///
    /// * `environment` - The environment being provisioned (for trace directory path)
    /// * `error` - The provisioning error that occurred
    /// * `current_step` - The step that was executing when the error occurred
    /// * `started_at` - The timestamp when provisioning execution started
    ///
    /// # Returns
    ///
    /// A `ProvisionFailureContext` with the failure metadata and, if the trace
    /// was written, the trace file path
    fn build_failure_context(
        &self,
        environment: &Environment<Provisioning>,
        error: &ProvisionCommandHandlerError,
        current_step: ProvisionStep,
        started_at: chrono::DateTime<chrono::Utc>,
    ) -> ProvisionFailureContext {
        use crate::application::command_handlers::common::failure_context::build_base_failure_context;
        use crate::infrastructure::trace::ProvisionTraceWriter;

        // The failed step is supplied by the caller and the error classifies
        // itself, so nothing needs to be inferred here.
        let mut context = ProvisionFailureContext {
            failed_step: current_step,
            error_kind: error.error_kind(),
            base: build_base_failure_context(&self.clock, started_at, error.to_string()),
        };

        // Trace generation is best-effort (logging handled by trace writer).
        let trace_writer =
            ProvisionTraceWriter::new(environment.traces_dir(), Arc::clone(&self.clock));

        if let Ok(trace_path) = trace_writer.write_trace(&context, error) {
            context.base.trace_file_path = Some(trace_path);
        }

        context
    }

    /// Load an environment by name and require it to be in the `Created` state
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// * The repository load fails (wrapped as `StatePersistence`)
    /// * No environment with that name exists (`EnvironmentNotFound`)
    /// * The environment cannot be converted to the `Created` state
    fn load_created_environment(
        &self,
        env_name: &EnvironmentName,
    ) -> Result<Environment<crate::domain::environment::Created>, ProvisionCommandHandlerError>
    {
        let loaded = self
            .repository
            .inner()
            .load(env_name)
            .map_err(ProvisionCommandHandlerError::StatePersistence)?;

        match loaded {
            Some(any_env) => Ok(any_env.try_into_created()?),
            None => Err(ProvisionCommandHandlerError::EnvironmentNotFound {
                name: env_name.to_string(),
            }),
        }
    }
}