Guia de instalação do Automation Suite

Última atualização 24 de fev de 2025

Pod do Rook Ceph ou Looker travado no estado Init

Detalhes do Erro

O código de erro para esse problema é 30001.

Description

Ocasionalmente, ao reiniciar um nó, um problema faz com que o pod do Looker ou do Rook Ceph fique travado no estado Init, já que o volume necessário para anexar o PVC a um pod está ausente.

Verifique se o problema está realmente relacionado ao Longhorn executando o seguinte comando:

kubectl get events -A -o json | jq -r '.items[] | select(.message != null) | select(.message | contains("cannot get resource \"volumeattachments\" in API group \"storage.k8s.io\""))'kubectl get events -A -o json | jq -r '.items[] | select(.message != null) | select(.message | contains("cannot get resource \"volumeattachments\" in API group \"storage.k8s.io\""))'

Se estiver relacionado ao Longhorn, esse comando deve retornar uma lista de nomes de pods afetados pelo problema. Se o comando não retornar nada, a causa do problema é diferente.

Solução

Execute o seguinte script para corrigir os pods problemáticos se o comando anterior retornar uma saída não vazia:

#!/bin/bash


function wait_till_rollout() {
    local namespace=$1
    local object_type=$2
    local deploy=$3

    local try=0
    local maxtry=2
    local status="notready"

    while [[ ${status} == "notready" ]]  && (( try != maxtry )) ; do
        kubectl -n "$namespace" rollout status "$deploy" -w --timeout=600s; 
        # shellcheck disable=SC2181
        if [[ "$?" -ne 0 ]]; 
        then
            status="notready"
            try=$((try+1))
        else
            status="ready"
        fi
    done
    if [[ $status == "notready" ]]; then 
        echo "$deploy of type $object_type failed in namespace $namespace. Plz re-run the script once again to verify that it's not a transient issue !!!"
        exit 1
    fi
}

function fix_pv_deployments() {
    for pod_name in $(kubectl get events -A -o json | jq -r '.items[]  | select(.message | contains("cannot get resource \"volumeattachments\" in API group \"storage.k8s.io\"")) | select(.involvedObject.kind == "Pod") | .involvedObject.name + "/" + .involvedObject.namespace' | sort | uniq)
    do
        POD_NAME=$(echo "${pod_name}" | cut -d '/' -f1)
        NS=$(echo "${pod_name}" | cut -d '/' -f2)
        controller_data=$(kubectl -n "${NS}" get po "${POD_NAME}" -o json | jq -r '[.metadata.ownerReferences[] | select(.controller==true)][0] | .kind + "=" + .name')
        [[ $controller_data == "" ]] && error "Error: Could not determine owner for pod: ${POD_NAME}" && exit 1
        CONTROLLER_KIND=$(echo "${controller_data}" | cut -d'=' -f1)
        CONTROLLER_NAME=$(echo "${controller_data}" | cut -d'=' -f2)
        if [[ $CONTROLLER_KIND == "ReplicaSet" ]]
        then
            controller_data=$(kubectl  -n "${NS}" get "${CONTROLLER_KIND}" "${CONTROLLER_NAME}" -o json | jq -r '[.metadata.ownerReferences[] | select(.controller==true)][0] | .kind + "=" + .name')
            CONTROLLER_KIND=$(echo "${controller_data}" | cut -d'=' -f1)
            CONTROLLER_NAME=$(echo "${controller_data}" | cut -d'=' -f2)

            replicas=$(kubectl -n "${NS}" get "$CONTROLLER_KIND" "$CONTROLLER_NAME" -o json | jq -r '.status.replicas')
            unavailable_replicas=$(kubectl -n "${NS}" get "$CONTROLLER_KIND" "$CONTROLLER_NAME" -o json | jq -r '.status.unavailableReplicas')

            if [ -n "$unavailable_replicas" ]; then 
                available_replicas=$((replicas - unavailable_replicas))
                if [ $available_replicas -eq 0 ]; then
                    kubectl -n "$NS" scale "$CONTROLLER_KIND" "$CONTROLLER_NAME" --replicas=0
                    sleep 15
                    kubectl -n "$NS" scale "$CONTROLLER_KIND" "$CONTROLLER_NAME" --replicas="$replicas"
                    deployment_name="$CONTROLLER_KIND/$CONTROLLER_NAME"
                    wait_till_rollout "$NS" "deploy" "$deployment_name"
                fi 
            fi
        fi
    done
}

fix_pv_deployments#!/bin/bash


function wait_till_rollout() {
    local namespace=$1
    local object_type=$2
    local deploy=$3

    local try=0
    local maxtry=2
    local status="notready"

    while [[ ${status} == "notready" ]]  && (( try != maxtry )) ; do
        kubectl -n "$namespace" rollout status "$deploy" -w --timeout=600s; 
        # shellcheck disable=SC2181
        if [[ "$?" -ne 0 ]]; 
        then
            status="notready"
            try=$((try+1))
        else
            status="ready"
        fi
    done
    if [[ $status == "notready" ]]; then 
        echo "$deploy of type $object_type failed in namespace $namespace. Plz re-run the script once again to verify that it's not a transient issue !!!"
        exit 1
    fi
}

function fix_pv_deployments() {
    for pod_name in $(kubectl get events -A -o json | jq -r '.items[]  | select(.message | contains("cannot get resource \"volumeattachments\" in API group \"storage.k8s.io\"")) | select(.involvedObject.kind == "Pod") | .involvedObject.name + "/" + .involvedObject.namespace' | sort | uniq)
    do
        POD_NAME=$(echo "${pod_name}" | cut -d '/' -f1)
        NS=$(echo "${pod_name}" | cut -d '/' -f2)
        controller_data=$(kubectl -n "${NS}" get po "${POD_NAME}" -o json | jq -r '[.metadata.ownerReferences[] | select(.controller==true)][0] | .kind + "=" + .name')
        [[ $controller_data == "" ]] && error "Error: Could not determine owner for pod: ${POD_NAME}" && exit 1
        CONTROLLER_KIND=$(echo "${controller_data}" | cut -d'=' -f1)
        CONTROLLER_NAME=$(echo "${controller_data}" | cut -d'=' -f2)
        if [[ $CONTROLLER_KIND == "ReplicaSet" ]]
        then
            controller_data=$(kubectl  -n "${NS}" get "${CONTROLLER_KIND}" "${CONTROLLER_NAME}" -o json | jq -r '[.metadata.ownerReferences[] | select(.controller==true)][0] | .kind + "=" + .name')
            CONTROLLER_KIND=$(echo "${controller_data}" | cut -d'=' -f1)
            CONTROLLER_NAME=$(echo "${controller_data}" | cut -d'=' -f2)

            replicas=$(kubectl -n "${NS}" get "$CONTROLLER_KIND" "$CONTROLLER_NAME" -o json | jq -r '.status.replicas')
            unavailable_replicas=$(kubectl -n "${NS}" get "$CONTROLLER_KIND" "$CONTROLLER_NAME" -o json | jq -r '.status.unavailableReplicas')

            if [ -n "$unavailable_replicas" ]; then 
                available_replicas=$((replicas - unavailable_replicas))
                if [ $available_replicas -eq 0 ]; then
                    kubectl -n "$NS" scale "$CONTROLLER_KIND" "$CONTROLLER_NAME" --replicas=0
                    sleep 15
                    kubectl -n "$NS" scale "$CONTROLLER_KIND" "$CONTROLLER_NAME" --replicas="$replicas"
                    deployment_name="$CONTROLLER_KIND/$CONTROLLER_NAME"
                    wait_till_rollout "$NS" "deploy" "$deployment_name"
                fi 
            fi
        fi
    done
}

fix_pv_deployments

Nesta página

Detalhes do Erro
Description
Solução

Esta página foi útil?

AnteriorFalha no redimensionamento do PVC do Objectstore

AvançarErro de anexo de volume StatefulSet