Source code for ironic.conductor.servicing

#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

"""Functionality related to servicing."""

from oslo_log import log

from ironic.common import async_steps
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.drivers import utils as driver_utils
from ironic import objects

LOG = log.getLogger(__name__)



[docs]
@task_manager.require_exclusive_lock
def do_node_service(task, service_steps=None, disable_ramdisk=False):
    """Internal RPC method to perform servicing of a node.

    Validates the node, records the requested steps in the node's
    driver_internal_info, prepares the node for service (unless the
    ramdisk is disabled) and then starts executing the service steps.

    :param task: a TaskManager instance with an exclusive lock on its node
    :param service_steps: The list of service steps to perform. If none, step
                          validation will fail.
    :param disable_ramdisk: Whether to skip booting ramdisk for servicing.
    """
    node = task.node
    use_ramdisk = not disable_ramdisk

    # Servicing needs valid power settings and, when a ramdisk is going to
    # be booted, valid network settings as well.
    try:
        task.driver.power.validate(task)
        if use_ramdisk:
            task.driver.network.validate(task)
    except (exception.InvalidParameterValue, exception.NetworkError) as exc:
        error = (_('Validation of node %(node)s for service failed: %(msg)s')
                 % {'node': node.uuid, 'msg': exc})
        return utils.servicing_error_handler(task, error)

    # Drop any previous service bookkeeping, then persist this request.
    utils.wipe_service_internal_info(task)
    for key, value in (('service_steps', service_steps),
                       ('service_disable_ramdisk', disable_ramdisk)):
        node.set_driver_internal_info(key, value)
    task.node.save()

    # Allow the deploy driver to set up the ramdisk again (necessary for IPA)
    prepare_result = None
    try:
        if use_ramdisk:
            prepare_result = task.driver.deploy.prepare_service(task)
        else:
            LOG.info('Skipping preparing for service in-band service since '
                     'out-of-band only service has been requested for node '
                     '%s', node.uuid)
    except Exception as exc:
        error = (_('Failed to prepare node %(node)s for service: %(e)s')
                 % {'node': node.uuid, 'e': exc})
        return utils.servicing_error_handler(task, error, traceback=True)

    if prepare_result == states.SERVICEWAIT:
        # Prepare is asynchronous, the deploy driver will need to
        # set node.driver_internal_info['service_steps'] and
        # node.service_step and then make an RPC call to
        # continue_node_service to start service operations.
        task.process_event('wait')
        return

    try:
        conductor_steps.set_node_service_steps(
            task, disable_ramdisk=disable_ramdisk)
    except Exception as exc:
        # Any failure here goes through the common error handling path so
        # that things are cleaned up properly.
        error = (_('Cannot service node %(node)s: %(msg)s')
                 % {'node': node.uuid, 'msg': exc})
        return utils.servicing_error_handler(task, error)

    # Start at the first step, or pass None when there is nothing to run.
    has_steps = bool(node.driver_internal_info.get('service_steps', []))
    do_next_service_step(task, 0 if has_steps else None,
                         disable_ramdisk=disable_ramdisk)




[docs]
@utils.fail_on_error(utils.servicing_error_handler,
                     _("Unexpected error when processing next service step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def do_next_service_step(task, step_index, disable_ramdisk=None):
    """Do service, starting from the specified service step.

    Steps are executed one after another until one of them reports
    states.SERVICEWAIT (the node is moved to a wait state and this function
    returns), a reserved step handler asks to exit, a step fails, or no
    steps remain. When all steps have completed the node is torn down from
    service.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first service step in the list to execute. This
        is the index (from 0) into the list of service steps in the node's
        driver_internal_info['service_steps']. Is None if there are no steps
        to execute.
    :param disable_ramdisk: Whether to skip booting ramdisk for service.
        When None, the value stored in the node's driver_internal_info
        under 'service_disable_ramdisk' is used (False if absent).
    """
    node = task.node
    # A step_index of None means there is nothing to execute; otherwise
    # run the remaining steps starting at step_index.
    if step_index is None:
        steps = []
    else:
        assert node.driver_internal_info.get('service_steps') is not None, \
            f"BUG: No steps for {node.uuid}, step index is {step_index}"
        steps = node.driver_internal_info['service_steps'][step_index:]

    if disable_ramdisk is None:
        disable_ramdisk = node.driver_internal_info.get(
            'service_disable_ramdisk', False)

    LOG.info('Executing service on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps})
    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.service_step = step
        node.set_driver_internal_info('service_step_index', step_index + ind)
        node.save()
        eocn = step.get('execute_on_child_nodes', False)
        result = None
        try:
            if not eocn:
                LOG.info('Executing %(step)s on node %(node)s',
                         {'step': step, 'node': node.uuid})
                use_step_handler = conductor_steps.use_reserved_step_handler(
                    task, step)
                if use_step_handler:
                    if use_step_handler == conductor_steps.EXIT_STEPS:
                        # Exit the step, i.e. hold step
                        return
                    # if use_step_handler == conductor_steps.USED_HANDLER
                    # Then we have completed the needful in the handler,
                    # but since there is no other value to check now,
                    # we know we just need to skip execute_service_step
                else:
                    interface = getattr(task.driver, step.get('interface'))
                    result = interface.execute_service_step(task, step)
            else:
                LOG.info('Executing %(step)s on child nodes for node '
                         '%(node)s.',
                         {'step': step, 'node': node.uuid})
                # Returns None on success or an error message string, which
                # is reported below as an invalid step result.
                result = execute_step_on_child_nodes(task, step)

        except Exception as e:
            if isinstance(e, exception.AgentConnectionFailed):
                # Only tolerated while a servicing reboot is flagged in
                # driver_internal_info; otherwise it falls through to the
                # generic failure handling below.
                if task.node.driver_internal_info.get(
                        async_steps.SERVICING_REBOOT):
                    LOG.info('Agent is not yet running on node %(node)s '
                             'after service reboot, waiting for agent to '
                             'come up to run next service step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    node.set_driver_internal_info(
                        async_steps.SKIP_CURRENT_SERVICE_STEP, False)
                    task.process_event('wait')
                    return
            if isinstance(e, exception.AgentInProgress):
                LOG.info('Conductor attempted to process service step for '
                         'node %(node)s. Agent indicated it is presently '
                         'executing a command. Error: %(error)s',
                         {'node': task.node.uuid,
                          'error': e})
                node.set_driver_internal_info(
                    async_steps.SKIP_CURRENT_SERVICE_STEP, False)
                task.process_event('wait')
                return

            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
                    'step': node.service_step})
            # Ramdisk logs are only collected when a ramdisk was in use.
            if not disable_ramdisk:
                driver_utils.collect_ramdisk_logs(task.node, label='service')
            utils.servicing_error_handler(task, msg, traceback=True)
            return

        # Check if the step is done or not. The step should return
        # states.SERVICEWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.SERVICEWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_service to continue service
            LOG.info('Service step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            task.process_event('wait')
            return
        elif result is not None:
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': node.uuid, 'val': result})
            return utils.servicing_error_handler(task, msg)
        LOG.info('Node %(node)s finished service step %(step)s',
                 {'node': node.uuid, 'step': step})
    # All steps finished: clear the service bookkeeping, optionally collect
    # ramdisk logs, then tear the node down from service.
    utils.wipe_service_internal_info(task)
    if CONF.agent.deploy_logs_collect == 'always' and not disable_ramdisk:
        driver_utils.collect_ramdisk_logs(task.node, label='service')
    _tear_down_node_service(task, disable_ramdisk)



def _tear_down_node_service(task, disable_ramdisk):
    """Clean up a node from service.

    Clears the node's current service step and service internal info,
    optionally asks the deploy interface to tear down service, and then
    fires the 'done' event.

    :param task: A Taskmanager object.
    :param disable_ramdisk: When true, the deploy interface's
        tear_down_service is not called.
    :returns: None on success; on a tear down failure, the result of
        utils.servicing_error_handler.
    """
    task.node.service_step = None
    utils.wipe_service_internal_info(task)
    task.node.save()

    ramdisk_in_use = not disable_ramdisk
    if ramdisk_in_use:
        try:
            task.driver.deploy.tear_down_service(task)
        except Exception as exc:
            template = _('Failed to tear down from service for node '
                         '%(node)s, reason: %(err)s')
            failure = template % {'node': task.node.uuid, 'err': exc}
            # tear_down_service=False: tear down is what just failed.
            return utils.servicing_error_handler(
                task, failure, traceback=True, tear_down_service=False)

    LOG.info('Node %s service complete.', task.node.uuid)
    task.process_event('done')



[docs]
def execute_step_on_child_nodes(task, step):
    """Execute a requested step against the child nodes of a node.

    Child nodes are looked up with a ``parent_node`` filter. When the step
    sets ``execute_on_child_nodes`` and carries a non-empty
    ``limit_child_node_execution`` list, that list is additionally passed
    as the ``uuid_in`` filter. Each child node is processed inside its own
    ``task_manager.acquire`` context.

    :param task: The TaskManager object for the parent node.
    :param step: The requested step to be executed.
    :returns: None on Success, the resulting error message if a
              failure has occurred.
    """
    # NOTE(TheJulia): We could just use nodeinfo list calls against
    # dbapi.
    # NOTE(TheJulia): We validate the data in advance in the API
    # with the original request context.
    filters = {'parent_node': task.node.uuid}
    # A missing, empty or None limit list means "no restriction".
    limit_to = step.get('limit_child_node_execution') or []
    if step.get('execute_on_child_nodes') and limit_to:
        filters['uuid_in'] = limit_to
    child_nodes = objects.Node.list(
        task.context,
        filters=filters,
        fields=['uuid']
    )
    for child_node in child_nodes:
        LOG.info('Executing step %(step)s on child node %(node)s for parent '
                 'node %(parent_node)s',
                 {'step': step,
                  'node': child_node.uuid,
                  'parent_node': task.node.uuid})
        with task_manager.acquire(task.context,
                                  child_node.uuid,
                                  purpose='execute step') as child_task:
            LOG.info('Executing %(step)s on node %(node)s',
                     {'step': step, 'node': child_task.node.uuid})
            result = None
            if not conductor_steps.use_reserved_step_handler(child_task, step):
                # Resolve the interface only when the step really has to be
                # run through it, matching do_next_service_step. A step
                # fully handled by a reserved handler must not fail merely
                # because its 'interface' value cannot be resolved.
                interface = getattr(child_task.driver, step.get('interface'))
                result = interface.execute_service_step(child_task, step)
            if result is None:
                # Step completed on this child node; move to the next one.
                continue
            if (result == states.SERVICEWAIT
                    and CONF.conductor.permit_child_node_step_async_result):
                # Operator has chosen to permit this due to some reason
                # NOTE(TheJulia): This is where we would likely wire agent
                # error handling if we ever implicitly allowed child node
                # deploys to take place with the agent from a parent node
                # being deployed.
                continue
            msg = (_('While executing step %(step)s on child node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': child_task.node.uuid,
                      'val': result})
            LOG.error(msg)
            # Only None or states.SERVICEWAIT are possible paths forward
            # in the parent step execution code, so returning the message
            # means it will be logged.
            return msg




[docs]
def get_last_error(node):
    """Build the error text describing an aborted service operation.

    :param node: the node whose service operation was aborted; its
        service_step, when set, is named in the message.
    :returns: the message text.
    """
    message = _('By request, the service operation was aborted')
    if not node.service_step:
        return message
    # Append which step the abort happened during or after.
    message += (
        _(' during or after the completion of step "%s"')
        % conductor_steps.step_id(node.service_step)
    )
    return message




[docs]
@task_manager.require_exclusive_lock
def do_node_service_abort(task):
    """Internal method to abort an ongoing operation.

    Tears down service through the deploy interface, then records the
    abort reason in the node's last_error and clears its service step
    state. If the tear down fails, the failure is passed to the servicing
    error handler and nothing else is changed here.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node
    try:
        task.driver.deploy.tear_down_service(task)
    except Exception as exc:
        log_msg = (_('Failed to tear down service for node %(node)s '
                     'after aborting the operation. Error: %(err)s') %
                   {'node': node.uuid, 'err': exc})
        error_msg = _('Failed to tear down service after aborting '
                      'the operation')
        handler_options = {
            'errmsg': error_msg,
            'traceback': True,
            'tear_down_service': False,
            'set_fail_state': False,
        }
        utils.servicing_error_handler(task, log_msg, **handler_options)
        return

    abort_reason = get_last_error(node)
    log_line = _('Service operation aborted for node %s') % node.uuid
    current_step = node.service_step
    if current_step:
        log_line += (
            _(' during or after the completion of step "%s"')
            % current_step
        )

    # Record why servicing stopped and forget the in-progress step.
    node.last_error = abort_reason
    node.service_step = None
    utils.wipe_service_internal_info(task)
    node.save()
    LOG.info(log_line)




[docs]
@utils.fail_on_error(utils.servicing_error_handler,
                     _("Unexpected error when processing next service step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def continue_node_service(task):
    """Continue servicing after finishing an async service step.

    This function calculates which step has to run next and passes control
    into do_next_service_step. If the step that just finished is flagged
    with 'abort_after' and further steps remain, the operation is aborted
    instead of continuing.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node

    next_step_index = utils.update_next_step_index(task, 'service')

    # If this isn't the final service step in the service operation
    # and it is flagged to abort after the service step that just
    # finished, we abort the operation.
    if node.service_step.get('abort_after'):
        step_name = node.service_step['step']
        if next_step_index is not None:
            LOG.debug('The service operation for node %(node)s was '
                      'marked to be aborted after step "%(step)s" '
                      'completed. Aborting now that it has completed.',
                      {'node': task.node.uuid, 'step': step_name})

            task.process_event('fail')
            do_node_service_abort(task)
            return

        # Nothing left to run, so there is nothing to abort; fall through
        # and let do_next_service_step finish the operation.
        LOG.debug('The service operation for node %(node)s was '
                  'marked to be aborted after step "%(step)s" '
                  'completed. However, since there are no more '
                  'service steps after this, the abort is not going '
                  'to be done.', {'node': node.uuid,
                                  'step': step_name})

    do_next_service_step(task, next_step_index)
ironic.conductor.servicing

Source code for ironic.conductor.servicing

ironic 24.2.0.dev114