Automation Suite
2022.4
false
重要 :
请注意此内容已使用机器翻译进行了部分本地化。
Automation Suite 安装指南
Last updated 2024年7月12日

集群还原后 MongoDB 或业务应用程序降级

描述

有时，在集群还原/回滚到版本 2022.4.x 或 2021.10.x 后，某个问题会导致 MongoDB 或业务应用程序 (Apps) 的 Pod 卡在 Init 状态。出现这种情况的原因是缺少将 PVC 附加到 Pod 所需的卷。

解决方案

  1. 验证问题是否确实与 MongoDB 卷附加 (volume attachment) 问题有关：

    # fetch all mongodb pods
    kubectl -n mongodb get pods
    # describe pods stuck in init state
    # kubectl -n mongodb describe pods mongodb-replica-set-<replica index number>
    kubectl -n mongodb describe pods mongodb-replica-set-0

    如果问题与 MongoDB 卷附加有关，则会显示以下事件：

    Events:
      Type     Reason              Age                   From                     Message
      ----     ------              ----                  ----                     -------
      Warning  FailedAttachVolume  3m9s (x65 over 133m)  attachdetach-controller  AttachVolume.Attach failed for volume "pvc-66897693-e52d-4b89-aac6-ca0cc5ae9e07" : rpc error: code = Aborted desc = volume pvc-66897693-e52d-4b89-aac6-ca0cc5ae9e07 is not ready for workloads
      Warning  FailedMount         103s (x50 over 112m)  kubelet                  (combined from similar events): Unable to attach or mount volumes: unmounted volumes=[logs-volume], unattached volumes=[hooks mongodb-replica-set-keyfile tls-secret data-volume healthstatus tls-ca kube-api-access-45qcl agent-scripts logs-volume automation-config]: timed out waiting for the condition
  2. 通过运行以下脚本来修复有问题的 MongoDB Pod:

    #!/bin/bash
    #
    # Restores Longhorn-backed PVCs of statefulsets from their backups.
    # External tools used by the functions below: kubectl, jq, curl.

    set -eu

    # Accumulates the PV names whose restore failed (comma-prefixed entries).
    FAILED_PVC_LIST=""

    # Storage classes whose PVCs are eligible for a restore from backup.
    STORAGE_CLASS="longhorn-backup"
    STORAGE_CLASS_SINGLE_REPLICA="longhorn-backup-single-replica"

    # Directory (relative to the current working directory) where the manifests
    # of the objects about to be deleted are saved.
    LOCAL_RESTORE_PATH="restoredata"
    mkdir -p "${LOCAL_RESTORE_PATH}"
    export LOCAL_RESTORE_PATH
    
    
    # Force-deletes the cluster objects backing one PVC: the PV, the Longhorn
    # volume of the same name and, when given, the VolumeAttachment.
    # Arguments:
    #   $1 - PV name (also used as the Longhorn volume name)
    #   $2 - VolumeAttachment name; empty means "nothing to delete"
    function delete_pvc_resources(){
      local pv_name=$1
      local volumeattachment_name=${2:-}

      echo "deleting pv & volumes forcefully"
      delete_pv_forcefully "${pv_name}"
      sleep 2
      delete_longhornvolume_forcefully "${pv_name}"
      sleep 2
      if [ -n "${volumeattachment_name}" ]; then
        echo "deleting volumeattachments forcefully"
        delete_volumeattachment_forcefully "${volumeattachment_name}"
        sleep 5
      fi
    }

    # Force-deletes one VolumeAttachment, clearing its finalizers so the delete
    # can complete. Mirrors the PV / Longhorn-volume force-delete helpers; it was
    # called by delete_pvc_resources but not defined in this script, which made
    # the script abort under `set -e` as soon as an attachment existed.
    # Arguments:
    #   $1 - VolumeAttachment name
    # Always returns 0; failure to delete is only reported on stdout.
    function delete_volumeattachment_forcefully() {
      local volumeattachment_name=$1

      if ! kubectl get volumeattachments "${volumeattachment_name}" >/dev/null 2>&1; then
        echo "Volumeattachment ${volumeattachment_name} not found, skip deletion."
        return
      fi

      local try=0
      local maxtry=10
      while (( try < maxtry )); do
        # Run the delete in the background so a delete that waits on finalizers
        # cannot stall this loop.
        kubectl delete volumeattachments "${volumeattachment_name}" --grace-period=0 --force >/dev/null 2>&1 &
        # Clear the finalizers so the pending delete can go through.
        kubectl patch volumeattachments "${volumeattachment_name}" --type merge -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1 || true

        if ! kubectl get volumeattachments "${volumeattachment_name}" >/dev/null 2>&1; then
          echo "Volumeattachment ${volumeattachment_name} deleted."
          return
        fi
        echo "Waiting to delete volumeattachment ${volumeattachment_name} ${try}/${maxtry}..."; sleep 10
        try=$(( try + 1 ))
      done

      echo "Failed to delete volumeattachment ${volumeattachment_name}."
    }
    
    # Force-deletes a Longhorn volume (volumes.longhorn.io in longhorn-system).
    # Each round nulls the object's finalizers, then checks whether the object
    # still exists; up to 10 rounds, 10s apart. The outcome is only printed.
    # Arguments:
    #   $1 - Longhorn volume name (same as the PV name)
    function delete_longhornvolume_forcefully() {
      local pv_name=$1
      local output=""
      local attempt
      local attempt_limit=10
      local deleted=0

      if ! kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" >/dev/null 2>&1 ; then
        echo "Volume ${pv_name} not found, skip deletion."
        return
      fi

      # The delete runs in the background; the loop below does not wait for it.
      kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &

      for (( attempt = 0; attempt < attempt_limit; attempt++ )); do
        # Re-apply the object with its finalizers set to null.
        output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
        echo "${output}"

        # Empty stdout from `get` is taken to mean the volume is gone.
        output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}") || true
        echo "${output}"
        if [[ -z "${output}" ]]; then
          deleted=1
          break
        fi

        kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &
        echo "Waiting to delete volume ${pv_name} ${attempt}/${attempt_limit}..."; sleep 10
      done

      if (( deleted == 0 )); then
        echo "Failed to delete volume ${pv_name}."
      fi
    }
    
    # Force-deletes a PersistentVolume. Each round nulls the PV's finalizers and
    # then checks whether it still exists; up to 10 rounds, 10s apart. The
    # outcome is only printed.
    # Globals:
    #   result - written (intentionally not local: other functions in this
    #            script echo it afterwards)
    # Arguments:
    #   $1 - PV name
    function delete_pv_forcefully() {
      local pv_name=$1
      local attempt
      local attempt_limit=10
      local deleted=0

      # The delete runs in the background; the loop below does not wait for it.
      kubectl delete pv "${pv_name}" --grace-period=0 --force &

      for (( attempt = 0; attempt < attempt_limit; attempt++ )); do
        # Re-apply the PV with its finalizers set to null.
        result=$(kubectl get pv "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
        echo "${result}"

        # Empty stdout from `get` is taken to mean the PV is gone.
        result=$(kubectl get pv "${pv_name}") || true
        echo "${result}"
        if [[ -z "${result}" ]]; then
          deleted=1
          break
        fi

        kubectl delete pv "${pv_name}" --grace-period=0 --force &
        echo "Waiting to delete pv ${pv_name} ${attempt}/${attempt_limit}..."; sleep 10
      done

      if (( deleted == 0 )); then
        echo "Failed to delete pv ${pv_name}."
      fi
    }
    
    # Asks the Longhorn REST API whether a usable backup volume exists for a PV.
    # Tries up to 3 times, 10s apart.
    # Globals:
    #   LONGHORN_URL        - read; host:port of the Longhorn backend
    #   IS_BACKUP_AVAILABLE - exported; 1 when a backup is available, else 0
    # Arguments:
    #   $1 - PV name
    function validate_pv_backup_available(){
      local pv_name=$1
      local validate_pv_backup_available_resp=0
      local backup_resp=""
      local resp_status_code=""
      local resp_message=""

      local try=0
      local maxtry=3
      while (( try != maxtry )) ; do
        backup_resp=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?") || true
        echo "Backup Response: ${backup_resp}"
        if [[ -n "${backup_resp}" && "${backup_resp}" != " " ]]; then
          # -r drops JSON quotes so a string status ("404") is compared like a
          # number. A body jq cannot parse counts as a failed attempt instead of
          # aborting the script (set -e) or being mistaken for "available".
          if ! resp_status_code=$(echo "${backup_resp}" | jq -r '.status // empty' 2>/dev/null); then
            resp_status_code="unparsable"
          fi

          if [[ "${resp_status_code}" == "unparsable" ]]; then
            validate_pv_backup_available_resp=0
          elif [[ "${resp_status_code}" =~ ^[0-9]+$ ]] && (( 10#${resp_status_code} > 200 )); then
            # Error object returned by the API (HTTP-like status above 200).
            validate_pv_backup_available_resp=0
          else
            resp_message=$(echo "${backup_resp}" | jq -c '.messages.error' 2>/dev/null) || resp_message="unparsable"
            if [[ -z "${resp_message}" || "${resp_message}" == "null" ]]; then
              echo "PVC Backup is available for ${pv_name}"
              validate_pv_backup_available_resp=1
              break
            fi
          fi
        fi
        try=$((try+1))
        sleep 10
      done
      export IS_BACKUP_AVAILABLE="${validate_pv_backup_available_resp}"
    }
    
    # Saves the JSON manifests of a PVC's objects under LOCAL_RESTORE_PATH:
    #   <volumeattachment>.json (only when a name is given), <pv>-pv.json,
    #   <pv>-volume.json (Longhorn volume) and <pvc>-pvc.json.
    # Globals:
    #   LOCAL_RESTORE_PATH - read
    #   result             - written when a VolumeAttachment name is given
    # Arguments:
    #   $1 - PVC name   $2 - namespace   $3 - PV name   $4 - VolumeAttachment name
    function store_pvc_resources(){
      local pvc_name=$1
      local namespace=$2
      local pv_name=$3
      local volumeattachment_name=$4
      local dest="${LOCAL_RESTORE_PATH}"

      case "${volumeattachment_name}" in
        ""|" ")
          ;;
        *)
          # stdout goes to the file, so `result` stays empty; a failed lookup
          # is tolerated.
          result=$(kubectl get volumeattachments "${volumeattachment_name}" -o json > "${dest}/${volumeattachment_name}.json") || true
          echo "${result}"
          ;;
      esac

      kubectl get pv "${pv_name}" -o json > "${dest}/${pv_name}-pv.json"
      kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o json > "${dest}/${pv_name}-volume.json"
      kubectl get pvc "${pvc_name}" -n "${namespace}" -o json > "${dest}/${pvc_name}-pvc.json"
    }
    
    # Polls a PVC until its phase is "Bound": up to 30 checks, 30s apart.
    # Always returns 0; the outcome is only printed.
    # Arguments:
    #   $1 - namespace
    #   $2 - PVC name
    function wait_pvc_bound() {
      local namespace=$1
      local pvc_name=$2
      # Counters are local so they cannot overwrite a caller's `try`/`maxtry`
      # (bash locals are dynamically scoped).
      local try=0
      local maxtry=30
      local status=""

      while (( try < maxtry )); do
        status=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".status | select( has(\"phase\")).phase")
        if [[ "${status}"  == "Bound" ]]; then
          echo "PVC ${pvc_name} Bouned successfully."
          return 0
        fi
        echo "waiting for PVC to bound...${try}/${maxtry}"; sleep 30
        try=$((try+1))
      done

      echo "PVC ${pvc_name} is not Bound after ${maxtry} attempts (last phase: ${status})."
    }
    
    # Creates a Longhorn volume from a backup through the REST API and waits for
    # it to reach the "detached" state (polled every 30s, at most 480 times).
    # On success the volume gets the uipath-backup recurring-job label. When the
    # create request is rejected, the saved volume manifest is re-created as a
    # best-effort fallback.
    # Globals:
    #   LONGHORN_URL, LOCAL_RESTORE_PATH - read
    #   create_volume_status             - written: "succeed" or "failed"
    # Arguments:
    #   $1 - PV / Longhorn volume name (its <name>-volume.json must already be
    #        saved under LOCAL_RESTORE_PATH)
    #   $2 - backup URL to restore from
    # Outputs: progress messages, then the final status on the last line.
    function create_volume() {
      local pv_name=$1
      local backup_path=$2
      local response=""
      local volume_id=""
      local status=""
      local volume_manifest="${LOCAL_RESTORE_PATH}/${pv_name}-volume.json"
      create_volume_status="succeed"
      echo "Creating Volume with PV: ${pv_name} and BackupPath: ${backup_path}"
      local accessmode
      local replicacount
      accessmode=$(jq -r ".spec.accessMode" "${volume_manifest}")
      replicacount=$(jq -r ".spec.numberOfReplicas" "${volume_manifest}")

      # create volume from backup pv
      # A failed request must reach the fallback branch below rather than abort
      # the script (set -e) after the old volume has already been deleted.
      response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/volumes" -H 'Accept: application/json' -H 'Content-Type: application/json;charset=UTF-8' --data-raw "{\"name\":\"${pv_name}\", \"accessMode\":\"${accessmode}\", \"fromBackup\": \"${backup_path}\", \"numberOfReplicas\": ${replicacount}}") || response=""

      sleep 5

      # `.id // empty` yields nothing for a missing/null id; plain `.id` would
      # print the string "null" and make an error response look like a success.
      volume_id=$(echo "${response}" | jq -r '.id // empty' 2>/dev/null) || volume_id=""

      if [[ -n "${volume_id}" ]]; then
        # wait for volume to be detached , max wait 4hr
        local try=0
        local maxtry=480
        local success=0
        while (( try < maxtry ));do
          status=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}"  -o json |jq -r ".status.state") || true
          echo "volume ${pv_name} status: ${status}"
          if [[ "${status}" == "detached" ]]; then
            # update label
            kubectl -n longhorn-system label volumes.longhorn.io/"${pv_name}" recurring-job-group.longhorn.io/uipath-backup=enabled
            success=1
            echo "Volume ready to use"
            break
          fi
          echo "waiting for volume to be ready to use${try}/${maxtry}..."; sleep 30
          try=$(( try + 1 ))
        done

        if [ "${success}" -eq 0 ]; then
          create_volume_status="failed"
          echo "${pv_name} Volume is not ready to use with status ${status}"
        fi
      else
        create_volume_status="failed"
        # Best effort: put the original volume object back.
        kubectl create -f "${volume_manifest}" || true
        echo "${pv_name} Volume creation failed ${response} "
      fi
      echo "${create_volume_status}"
    }
    
    # Restores one PVC from its Longhorn backup: saves the current manifests,
    # force-deletes PV / volume / attachment, recreates the volume from backup,
    # recreates the PV from the saved manifest and rebinds it to the PVC.
    # Globals:
    #   LOCAL_RESTORE_PATH   - read
    #   create_volume_status - reset here, then set by create_volume
    #   restore_pvc_status   - set to "failed" when the volume restore failed;
    #                          otherwise left as the caller set it
    # Arguments:
    #   $1 - PVC name   $2 - namespace   $3 - PV name
    #   $4 - VolumeAttachment name (may be empty)   $5 - backup URL
    # Outputs: progress messages, then restore_pvc_status on the last line.
    function restore_with_backupvolume() {
      local pvc_name=$1
      local namespace=$2
      local pv_name=$3
      local volumeattachment_name=$4
      local backup_path=$5
      local pvc_uid=""
      create_volume_status="succeed"
      store_pvc_resources "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}"
      sleep 5
      delete_pvc_resources "${pv_name}" "${volumeattachment_name}"
      sleep 5
      create_volume "${pv_name}" "${backup_path}"
      sleep 10

      # The PV is put back from the saved manifest in both outcomes.
      kubectl create -f "${LOCAL_RESTORE_PATH}/${pv_name}-pv.json"
      if [[ -n "${create_volume_status}" && "${create_volume_status}" != "succeed" ]]; then
        echo "Backup volume restore failed for pvc ${pvc_name} in namespace ${namespace}"
        restore_pvc_status="failed"
      else
        pvc_uid=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".metadata.uid")

        # update pv with pvc uid
        kubectl patch pv "${pv_name}" --type json -p "[{\"op\": \"add\", \"path\": \"/spec/claimRef/uid\", \"value\": \"${pvc_uid}\"}]"

        wait_pvc_bound "${namespace}" "${pvc_name}"
        # No `echo "${result}"` here: wait_pvc_bound prints its own outcome and
        # never sets `result`, so that echo only replayed a stale global and
        # aborted under `set -u` when nothing had set it.
      fi
      echo "${restore_pvc_status:-}"
    }
    
    # Entry point for restoring a single PVC: marks the restore as "succeed",
    # delegates to restore_with_backupvolume (which may flip the status to
    # "failed") and prints the final status.
    # Globals:
    #   restore_pvc_status - written
    # Arguments:
    #   $1 - PVC name   $2 - namespace   $3 - PV name
    #   $4 - VolumeAttachment name   $5 - backup URL
    function restore_pvc(){
      local pvc_name=$1 namespace=$2 pv_name=$3
      local volumeattachment_name=$4 backup_path=$5

      restore_pvc_status="succeed"
      restore_with_backupvolume \
        "${pvc_name}" \
        "${namespace}" \
        "${pv_name}" \
        "${volumeattachment_name}" \
        "${backup_path}"
      # Left unquoted on purpose to keep the original output rules.
      # shellcheck disable=SC2086
      echo ${restore_pvc_status}
      # attach_volume "${pv_name}"
    }
    
    # Fetches the backup list of a PV's backup volume from the Longhorn REST API
    # and picks the first entry that has a non-empty url. Tries up to 3 times,
    # 10s apart.
    # Globals:
    #   LONGHORN_URL   - read
    #   PV_BACKUP_PATH - exported; the selected backup as compact JSON, or empty
    #                    when no list could be fetched
    # Arguments:
    #   $1 - PV name
    function get_backup_list_by_pvname(){
      local pv_name=$1
      local get_backup_list_by_pvname_resp=""
      local backup_list_response=""

      local try=0
      local maxtry=3
      while (( try != maxtry )) ; do

        # `|| true`: a failed request has to fall through to the retry below;
        # without it the first transient curl error aborts the script (set -e).
        backup_list_response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?action=backupList" -X 'POST' -H 'Content-Length: 0' -H 'Accept: application/json') || true
        if [[ -n "${backup_list_response}" && "${backup_list_response}" != " " && -n $( echo "${backup_list_response}"|jq ".data" )  && -n $( echo "${backup_list_response}"|jq ".data[]" ) ]]; then
          echo "Backup List Response: ${backup_list_response}"
          # pick first backup data with url non empty
          get_backup_list_by_pvname_resp=$(echo "${backup_list_response}"|jq -c '[.data[]|select(.url | . != null and . != "")][0]')
          break;
        fi
        try=$((try+1))
        sleep 10
      done
      export PV_BACKUP_PATH="${get_backup_list_by_pvname_resp}"
    }
    
    # Collects what is needed to restore one PVC: its PV name, the matching
    # VolumeAttachment name (if any) and the backup URL (if a backup exists).
    # Globals:
    #   get_pvc_resources_resp - written; a JSON object with the keys pvc_name,
    #                            pv_name, volumeattachment_name, backup_path
    #   PV_NAME, VOLUME_ATTACHMENT_LIST, VOLUME_ATTACHMENT, VOLUME_ATTACHMENT_NAME,
    #   PV_CLAIM_NAME, BACKUP_PATH - written as working variables (not local)
    #   IS_BACKUP_AVAILABLE, PV_BACKUP_PATH - consumed from the helpers and unset
    # Arguments:
    #   $1 - PVC name
    #   $2 - namespace
    # Outputs: helper messages, then the JSON object on the last line.
    function get_pvc_resources() {
      local PVC_NAME=$1
      local PVC_NAMESPACE=$2
      local backup_flag

      get_pvc_resources_resp=""

      # PV have one to one mapping with PVC
      PV_NAME=$(kubectl -n "${PVC_NAMESPACE}" get pvc "${PVC_NAME}" -o json | jq -r ".spec.volumeName")

      # One compact JSON object per attachment that references a PV.
      VOLUME_ATTACHMENT_LIST=$(kubectl get volumeattachments -n "${PVC_NAMESPACE}" -o=json | jq -c '.items[]|{name: .metadata.name, pvClaimName:.spec.source| select( has ("persistentVolumeName")).persistentVolumeName}')

      VOLUME_ATTACHMENT_NAME=""
      for VOLUME_ATTACHMENT in ${VOLUME_ATTACHMENT_LIST}; do
        PV_CLAIM_NAME=$(jq -r ".pvClaimName" <<< "${VOLUME_ATTACHMENT}")
        if [[ "${PV_NAME}" != "${PV_CLAIM_NAME}" ]]; then
          continue
        fi
        VOLUME_ATTACHMENT_NAME=$(jq -r ".name" <<< "${VOLUME_ATTACHMENT}")
        break
      done

      BACKUP_PATH=""
      validate_pv_backup_available "${PV_NAME}"
      backup_flag=${IS_BACKUP_AVAILABLE:- 0}
      unset IS_BACKUP_AVAILABLE

      if (( backup_flag == 1 )); then
        echo "Backup is available for PVC ${PVC_NAME}"

        get_backup_list_by_pvname "${PV_NAME}"
        BACKUP_PATH=$(jq -r ".url" <<< "${PV_BACKUP_PATH}")
        unset PV_BACKUP_PATH
      fi

      printf -v get_pvc_resources_resp \
        '{"pvc_name": "%s", "pv_name": "%s", "volumeattachment_name": "%s", "backup_path": "%s"}' \
        "${PVC_NAME}" "${PV_NAME}" "${VOLUME_ATTACHMENT_NAME}" "${BACKUP_PATH}"
      echo "${get_pvc_resources_resp}"
    }
    
    # Patches /spec/members of every owner object of a statefulset to the given
    # replica count. Owners that cannot be fetched in the namespace are skipped.
    # Arguments:
    #   $1 - metadata.ownerReferences as a JSON array ("" or "null" = no owners)
    #   $2 - namespace
    #   $3 - value to write into /spec/members
    function scale_ownerreferences() {
      local ownerReferences=$1
      local namespace=$2
      local replicas=$3
      local resourceKind
      local resourceName

      # no operation required
      case "${ownerReferences}" in
        ""|"null") return ;;
      esac

      # One compact JSON object per owner; iterated by word splitting.
      ownerReferences=$(jq -c ".[]" <<< "${ownerReferences}")
      for ownerReference in ${ownerReferences}; do
        echo "Owner: ${ownerReference}"
        resourceKind=$(jq -r ".kind" <<< "${ownerReference}")
        resourceName=$(jq -r ".name" <<< "${ownerReference}")

        kubectl -n "${namespace}" get "${resourceKind}" "${resourceName}" >/dev/null 2>&1 || continue

        # scale replicas
        kubectl -n "${namespace}" patch "${resourceKind}" "${resourceName}" --type json -p "[{\"op\": \"replace\", \"path\": \"/spec/members\", \"value\": ${replicas} }]"
      done
    }
    
    # Scales a statefulset (and first its owner objects) down to 0 replicas and
    # waits until `kubectl get` shows 0/0: up to 30 rounds, 30s apart. The
    # outcome is only printed.
    # Globals:
    #   success, result, scaledown - written (not local)
    # Arguments:
    #   $1 - statefulset name
    #   $2 - namespace
    #   $3 - metadata.ownerReferences of the statefulset (JSON array or "null")
    function scale_down_statefulset() {
      local statefulset_name=$1
      local namespace=$2
      local ownerReferences=$3
      local try=0
      local maxtry=30

      echo "Start Scale Down statefulset ${statefulset_name} under namespace ${namespace}..."

      # validate and scale down ownerreference
      scale_ownerreferences "${ownerReferences}" "${namespace}" 0

      success=0
      until (( try == maxtry )); do
        result=$(kubectl scale statefulset "${statefulset_name}" --replicas=0 -n "${namespace}") || true
        echo "${result}"
        scaledown=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep 0/0) || true
        case "${scaledown}" in
          ""|" ")
            try=$((try+1))
            echo "waiting for the statefulset ${statefulset_name} to scale down...${try}/${maxtry}";sleep 30
            ;;
          *)
            echo "Statefulset scaled down successfully."
            success=1
            break
            ;;
        esac
      done

      if (( success == 0 )); then
        echo "Statefulset ${statefulset_name} scaled down failed"
      fi
    }
    
    # Scales a statefulset (and first its owner objects) back up and waits until
    # `kubectl get` shows <replica>/<replica>: up to 14 rounds, 30s apart. The
    # outcome is only printed.
    # Globals:
    #   scaleup - written (not local)
    # Arguments:
    #   $1 - statefulset name
    #   $2 - namespace
    #   $3 - replica count to scale to
    #   $4 - metadata.ownerReferences of the statefulset (JSON array or "null")
    function scale_up_statefulset() {
      local statefulset_name=$1
      local namespace=$2
      local replica=$3
      local ownerReferences=$4
      # The counter starts at 1, so the loop runs at most attempt_limit - 1 times.
      local attempt=1
      local attempt_limit=15
      local scaled=0

      # Scale up statefulsets using PVCs
      echo "Start Scale Up statefulset ${statefulset_name}..."

      # validate and scale up ownerreference
      scale_ownerreferences "${ownerReferences}" "${namespace}" "${replica}"

      echo "Waiting to scale up statefulset..."

      until (( attempt == attempt_limit )); do
        kubectl scale statefulset "${statefulset_name}" --replicas="${replica}" -n "${namespace}"
        kubectl get statefulset "${statefulset_name}" -n "${namespace}"

        scaleup=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep "${replica}/${replica}") || true
        if [[ -n "${scaleup}" && "${scaleup}" != " " ]]; then
          echo "Statefulset scaled up successfully."
          scaled=1
          break
        fi

        attempt=$((attempt+1))
        echo "waiting for the statefulset ${statefulset_name} to scale up...${attempt}/${attempt_limit}"; sleep 30
      done

      if (( scaled == 0 )); then
        echo "Statefulset scaled up failed ${statefulset_name}."
      fi
    }
    
    # Restores from backup every PVC that belongs to a statefulset with
    # volumeClaimTemplates in the given namespace. Per statefulset: wait for a
    # non-zero replica count, scale down, restore each
    # <claimTemplate>-<statefulset>-<index> PVC whose storage class is one of the
    # backup-enabled classes, then scale back up to the recorded replica count.
    # Globals:
    #   STORAGE_CLASS, STORAGE_CLASS_SINGLE_REPLICA - read
    #   FAILED_PVC_LIST        - appended with ",<pv name>" for each failure
    #   restore_pvc_status     - written here and by restore_pvc
    #   get_pvc_resources_resp - reset here, filled by get_pvc_resources
    # Arguments:
    #   $1 - namespace
    function restore_pvc_attached_to_statefulsets() {
      local namespace
      namespace="${1}"
      local statefulset_list
      local statefulset_name

      # list of all statefulset using PVC
      statefulset_list=$(kubectl get statefulset -n "${namespace}" -o=json | jq -r ".items[] | select(.spec.volumeClaimTemplates).metadata.name")

      for statefulset_name in ${statefulset_list};
      do
        local replica
        local ownerReferences
        local claimTemplatesName
        local pvc_restore_failed
        local name
        local i
        local try=0
        local maxtry=5
        local status="notready"
        pvc_restore_failed=""
        restore_pvc_status=""

        # check if statefulset is ready
        while [[ "${status}" == "notready" ]] && (( try < maxtry )); do
          echo "fetch statefulset ${statefulset_name} metadata...  ${try}/${maxtry}"
          try=$(( try + 1 ))
          replica=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec.replicas")

          if [[ "${replica}" != 0 ]]; then
            status="ready"
          else
            echo "statefulset ${statefulset_name} replica is not ready. Wait and retry"; sleep 30
          fi
        done

        if [[ "${status}" != "ready" ]]; then
          echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}. Please retrigger volume restore step."
          # Skip this statefulset. Carrying on with replica=0 would restore
          # nothing and would patch the owner objects' /spec/members to 0 twice
          # (on "scale down" and on "scale up").
          continue
        fi

        # Fetch ownerReferences and claim name
        ownerReferences=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".metadata.ownerReferences")
        claimTemplatesName=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec | select( has (\"volumeClaimTemplates\") ).volumeClaimTemplates[].metadata.name " | xargs)

        echo "Scaling down Statefulset ${statefulset_name} with ${replica} under namespace ${namespace}"
        scale_down_statefulset "${statefulset_name}" "${namespace}" "${ownerReferences}"
        for name in ${claimTemplatesName}; do
          local pvc_prefix
          pvc_prefix="${name}-${statefulset_name}"

          for (( i = 0; i < replica; i++ )); do
            local pvc_name
            local pvc_exist
            pvc_name="${pvc_prefix}-${i}"

            pvc_exist=$(kubectl -n "${namespace}" get pvc "${pvc_name}") || true
            if [[ -z "${pvc_exist}" || "${pvc_exist}" == " " ]]; then
              echo "PVC not available for the statefulset ${statefulset_name}, skipping restore."
              continue;
            fi

            local pvc_storageclass
            pvc_storageclass=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".spec.storageClassName")
            if [[ ! ( "${pvc_storageclass}" == "${STORAGE_CLASS}" || "${pvc_storageclass}" == "${STORAGE_CLASS_SINGLE_REPLICA}" ) ]]; then
              echo "backup not available for pvc ${pvc_name}, storageclass: ${pvc_storageclass} "
              continue;
            fi

            # get pv, volumeattachments for pvc
            get_pvc_resources_resp=""
            get_pvc_resources "${pvc_name}" "${namespace}"

            local pv_name
            local volumeattachment_name
            local backup_path
            pv_name=$(echo "${get_pvc_resources_resp}"| jq -r ".pv_name")
            volumeattachment_name=$(echo "${get_pvc_resources_resp}"| jq -r ".volumeattachment_name")
            backup_path=$(echo "${get_pvc_resources_resp}"| jq -r ".backup_path")
            # No backup URL: nothing to restore from, record the PV as failed.
            if [[ -z "${backup_path}" || "${backup_path}" == " " || "${backup_path}" == "null" ]]; then
              pvc_restore_failed="error"
              FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
              continue;
            fi

            restore_pvc_status="succeed"
            restore_pvc "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}" "${backup_path}"
            if [[ -n "${restore_pvc_status}" && "${restore_pvc_status}" != "succeed" ]]; then
              pvc_restore_failed="error"
              FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
              continue;
            fi
          done
        done

        sleep 10
        scale_up_statefulset "${statefulset_name}" "${namespace}" "${replica}" "${ownerReferences}"
        sleep 5

        if [[ -n "${pvc_restore_failed}" && "${pvc_restore_failed}" == "error" ]]; then
            echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}."
        fi
      done
    }
    
    # Longhorn backend endpoint (<clusterIP>:9500) used by the REST calls in the
    # functions of this script.
    LONGHORN_URL=$(kubectl -n longhorn-system get svc longhorn-backend -o jsonpath="{.spec.clusterIP}"):9500

    # Entry point: restore the PVCs of the statefulsets in the "mongodb" namespace.
    restore_pvc_attached_to_statefulsets "mongodb"

    # NOTE(review): everything from here on repeats the script above. It was
    # fused onto the previous line ('"mongodb"#!/bin/bash'), which turned the
    # namespace argument into 'mongodb#!/bin/bash'. It looks like a
    # page-extraction duplicate - confirm and drop it if so.
    #!/bin/bash

    set -eu


    FAILED_PVC_LIST=""
    STORAGE_CLASS_SINGLE_REPLICA="longhorn-backup-single-replica"
    STORAGE_CLASS="longhorn-backup"
    LOCAL_RESTORE_PATH="restoredata"
    mkdir -p "${LOCAL_RESTORE_PATH}"
    export LOCAL_RESTORE_PATH="${LOCAL_RESTORE_PATH}"
    
    
    # Force-deletes the cluster objects backing one PVC: the PV, the Longhorn
    # volume of the same name and, when given, the VolumeAttachment.
    # Arguments:
    #   $1 - PV name (also used as the Longhorn volume name)
    #   $2 - VolumeAttachment name; empty means "nothing to delete"
    function delete_pvc_resources(){
      local pv_name=$1
      local volumeattachment_name=${2:-}

      echo "deleting pv & volumes forcefully"
      delete_pv_forcefully "${pv_name}"
      sleep 2
      delete_longhornvolume_forcefully "${pv_name}"
      sleep 2
      if [ -n "${volumeattachment_name}" ]; then
        echo "deleting volumeattachments forcefully"
        delete_volumeattachment_forcefully "${volumeattachment_name}"
        sleep 5
      fi
    }

    # Force-deletes one VolumeAttachment, clearing its finalizers so the delete
    # can complete. Mirrors the PV / Longhorn-volume force-delete helpers; it was
    # called by delete_pvc_resources but not defined in this script, which made
    # the script abort under `set -e` as soon as an attachment existed.
    # Arguments:
    #   $1 - VolumeAttachment name
    # Always returns 0; failure to delete is only reported on stdout.
    function delete_volumeattachment_forcefully() {
      local volumeattachment_name=$1

      if ! kubectl get volumeattachments "${volumeattachment_name}" >/dev/null 2>&1; then
        echo "Volumeattachment ${volumeattachment_name} not found, skip deletion."
        return
      fi

      local try=0
      local maxtry=10
      while (( try < maxtry )); do
        # Run the delete in the background so a delete that waits on finalizers
        # cannot stall this loop.
        kubectl delete volumeattachments "${volumeattachment_name}" --grace-period=0 --force >/dev/null 2>&1 &
        # Clear the finalizers so the pending delete can go through.
        kubectl patch volumeattachments "${volumeattachment_name}" --type merge -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1 || true

        if ! kubectl get volumeattachments "${volumeattachment_name}" >/dev/null 2>&1; then
          echo "Volumeattachment ${volumeattachment_name} deleted."
          return
        fi
        echo "Waiting to delete volumeattachment ${volumeattachment_name} ${try}/${maxtry}..."; sleep 10
        try=$(( try + 1 ))
      done

      echo "Failed to delete volumeattachment ${volumeattachment_name}."
    }
    
    # Force-deletes a Longhorn volume (volumes.longhorn.io in longhorn-system).
    # Each round nulls the object's finalizers, then checks whether the object
    # still exists; up to 10 rounds, 10s apart. The outcome is only printed.
    # Arguments:
    #   $1 - Longhorn volume name (same as the PV name)
    function delete_longhornvolume_forcefully() {
      local pv_name=$1
      local output=""
      local attempt
      local attempt_limit=10
      local deleted=0

      if ! kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" >/dev/null 2>&1 ; then
        echo "Volume ${pv_name} not found, skip deletion."
        return
      fi

      # The delete runs in the background; the loop below does not wait for it.
      kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &

      for (( attempt = 0; attempt < attempt_limit; attempt++ )); do
        # Re-apply the object with its finalizers set to null.
        output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
        echo "${output}"

        # Empty stdout from `get` is taken to mean the volume is gone.
        output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}") || true
        echo "${output}"
        if [[ -z "${output}" ]]; then
          deleted=1
          break
        fi

        kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &
        echo "Waiting to delete volume ${pv_name} ${attempt}/${attempt_limit}..."; sleep 10
      done

      if (( deleted == 0 )); then
        echo "Failed to delete volume ${pv_name}."
      fi
    }
    
    # Force-deletes a PersistentVolume. Each round nulls the PV's finalizers and
    # then checks whether it still exists; up to 10 rounds, 10s apart. The
    # outcome is only printed.
    # Globals:
    #   result - written (intentionally not local: other functions in this
    #            script echo it afterwards)
    # Arguments:
    #   $1 - PV name
    function delete_pv_forcefully() {
      local pv_name=$1
      local attempt
      local attempt_limit=10
      local deleted=0

      # The delete runs in the background; the loop below does not wait for it.
      kubectl delete pv "${pv_name}" --grace-period=0 --force &

      for (( attempt = 0; attempt < attempt_limit; attempt++ )); do
        # Re-apply the PV with its finalizers set to null.
        result=$(kubectl get pv "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
        echo "${result}"

        # Empty stdout from `get` is taken to mean the PV is gone.
        result=$(kubectl get pv "${pv_name}") || true
        echo "${result}"
        if [[ -z "${result}" ]]; then
          deleted=1
          break
        fi

        kubectl delete pv "${pv_name}" --grace-period=0 --force &
        echo "Waiting to delete pv ${pv_name} ${attempt}/${attempt_limit}..."; sleep 10
      done

      if (( deleted == 0 )); then
        echo "Failed to delete pv ${pv_name}."
      fi
    }
    
    # Asks the Longhorn REST API whether a usable backup volume exists for a PV.
    # Tries up to 3 times, 10s apart.
    # Globals:
    #   LONGHORN_URL        - read; host:port of the Longhorn backend
    #   IS_BACKUP_AVAILABLE - exported; 1 when a backup is available, else 0
    # Arguments:
    #   $1 - PV name
    function validate_pv_backup_available(){
      local pv_name=$1
      local validate_pv_backup_available_resp=0
      local backup_resp=""
      local resp_status_code=""
      local resp_message=""

      local try=0
      local maxtry=3
      while (( try != maxtry )) ; do
        backup_resp=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?") || true
        echo "Backup Response: ${backup_resp}"
        if [[ -n "${backup_resp}" && "${backup_resp}" != " " ]]; then
          # -r drops JSON quotes so a string status ("404") is compared like a
          # number. A body jq cannot parse counts as a failed attempt instead of
          # aborting the script (set -e) or being mistaken for "available".
          if ! resp_status_code=$(echo "${backup_resp}" | jq -r '.status // empty' 2>/dev/null); then
            resp_status_code="unparsable"
          fi

          if [[ "${resp_status_code}" == "unparsable" ]]; then
            validate_pv_backup_available_resp=0
          elif [[ "${resp_status_code}" =~ ^[0-9]+$ ]] && (( 10#${resp_status_code} > 200 )); then
            # Error object returned by the API (HTTP-like status above 200).
            validate_pv_backup_available_resp=0
          else
            resp_message=$(echo "${backup_resp}" | jq -c '.messages.error' 2>/dev/null) || resp_message="unparsable"
            if [[ -z "${resp_message}" || "${resp_message}" == "null" ]]; then
              echo "PVC Backup is available for ${pv_name}"
              validate_pv_backup_available_resp=1
              break
            fi
          fi
        fi
        try=$((try+1))
        sleep 10
      done
      export IS_BACKUP_AVAILABLE="${validate_pv_backup_available_resp}"
    }
    
    # Saves the JSON manifests of a PVC's objects under LOCAL_RESTORE_PATH:
    #   <volumeattachment>.json (only when a name is given), <pv>-pv.json,
    #   <pv>-volume.json (Longhorn volume) and <pvc>-pvc.json.
    # Globals:
    #   LOCAL_RESTORE_PATH - read
    #   result             - written when a VolumeAttachment name is given
    # Arguments:
    #   $1 - PVC name   $2 - namespace   $3 - PV name   $4 - VolumeAttachment name
    function store_pvc_resources(){
      local pvc_name=$1
      local namespace=$2
      local pv_name=$3
      local volumeattachment_name=$4
      local dest="${LOCAL_RESTORE_PATH}"

      case "${volumeattachment_name}" in
        ""|" ")
          ;;
        *)
          # stdout goes to the file, so `result` stays empty; a failed lookup
          # is tolerated.
          result=$(kubectl get volumeattachments "${volumeattachment_name}" -o json > "${dest}/${volumeattachment_name}.json") || true
          echo "${result}"
          ;;
      esac

      kubectl get pv "${pv_name}" -o json > "${dest}/${pv_name}-pv.json"
      kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o json > "${dest}/${pv_name}-volume.json"
      kubectl get pvc "${pvc_name}" -n "${namespace}" -o json > "${dest}/${pvc_name}-pvc.json"
    }
    
    # Polls a PVC until its phase is "Bound": up to 30 checks, 30s apart.
    # Always returns 0; the outcome is only printed.
    # Arguments:
    #   $1 - namespace
    #   $2 - PVC name
    function wait_pvc_bound() {
      local namespace=$1
      local pvc_name=$2
      # Counters are local so they cannot overwrite a caller's `try`/`maxtry`
      # (bash locals are dynamically scoped).
      local try=0
      local maxtry=30
      local status=""

      while (( try < maxtry )); do
        status=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".status | select( has(\"phase\")).phase")
        if [[ "${status}"  == "Bound" ]]; then
          echo "PVC ${pvc_name} Bouned successfully."
          return 0
        fi
        echo "waiting for PVC to bound...${try}/${maxtry}"; sleep 30
        try=$((try+1))
      done

      echo "PVC ${pvc_name} is not Bound after ${maxtry} attempts (last phase: ${status})."
    }
    
    # Creates a Longhorn volume from a backup through the REST API and waits for
    # it to reach the "detached" state (polled every 30s, at most 480 times).
    # On success the volume gets the uipath-backup recurring-job label. When the
    # create request is rejected, the saved volume manifest is re-created as a
    # best-effort fallback.
    # Globals:
    #   LONGHORN_URL, LOCAL_RESTORE_PATH - read
    #   create_volume_status             - written: "succeed" or "failed"
    # Arguments:
    #   $1 - PV / Longhorn volume name (its <name>-volume.json must already be
    #        saved under LOCAL_RESTORE_PATH)
    #   $2 - backup URL to restore from
    # Outputs: progress messages, then the final status on the last line.
    function create_volume() {
      local pv_name=$1
      local backup_path=$2
      local response=""
      local volume_id=""
      local status=""
      local volume_manifest="${LOCAL_RESTORE_PATH}/${pv_name}-volume.json"
      create_volume_status="succeed"
      echo "Creating Volume with PV: ${pv_name} and BackupPath: ${backup_path}"
      local accessmode
      local replicacount
      accessmode=$(jq -r ".spec.accessMode" "${volume_manifest}")
      replicacount=$(jq -r ".spec.numberOfReplicas" "${volume_manifest}")

      # create volume from backup pv
      # A failed request must reach the fallback branch below rather than abort
      # the script (set -e) after the old volume has already been deleted.
      response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/volumes" -H 'Accept: application/json' -H 'Content-Type: application/json;charset=UTF-8' --data-raw "{\"name\":\"${pv_name}\", \"accessMode\":\"${accessmode}\", \"fromBackup\": \"${backup_path}\", \"numberOfReplicas\": ${replicacount}}") || response=""

      sleep 5

      # `.id // empty` yields nothing for a missing/null id; plain `.id` would
      # print the string "null" and make an error response look like a success.
      volume_id=$(echo "${response}" | jq -r '.id // empty' 2>/dev/null) || volume_id=""

      if [[ -n "${volume_id}" ]]; then
        # wait for volume to be detached , max wait 4hr
        local try=0
        local maxtry=480
        local success=0
        while (( try < maxtry ));do
          status=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}"  -o json |jq -r ".status.state") || true
          echo "volume ${pv_name} status: ${status}"
          if [[ "${status}" == "detached" ]]; then
            # update label
            kubectl -n longhorn-system label volumes.longhorn.io/"${pv_name}" recurring-job-group.longhorn.io/uipath-backup=enabled
            success=1
            echo "Volume ready to use"
            break
          fi
          echo "waiting for volume to be ready to use${try}/${maxtry}..."; sleep 30
          try=$(( try + 1 ))
        done

        if [ "${success}" -eq 0 ]; then
          create_volume_status="failed"
          echo "${pv_name} Volume is not ready to use with status ${status}"
        fi
      else
        create_volume_status="failed"
        # Best effort: put the original volume object back.
        kubectl create -f "${volume_manifest}" || true
        echo "${pv_name} Volume creation failed ${response} "
      fi
      echo "${create_volume_status}"
    }
    
    # Restores one PVC from its Longhorn backup: saves the current manifests,
    # force-deletes PV / volume / attachment, recreates the volume from backup,
    # recreates the PV from the saved manifest and rebinds it to the PVC.
    # Globals:
    #   LOCAL_RESTORE_PATH   - read
    #   create_volume_status - reset here, then set by create_volume
    #   restore_pvc_status   - set to "failed" when the volume restore failed;
    #                          otherwise left as the caller set it
    # Arguments:
    #   $1 - PVC name   $2 - namespace   $3 - PV name
    #   $4 - VolumeAttachment name (may be empty)   $5 - backup URL
    # Outputs: progress messages, then restore_pvc_status on the last line.
    function restore_with_backupvolume() {
      local pvc_name=$1
      local namespace=$2
      local pv_name=$3
      local volumeattachment_name=$4
      local backup_path=$5
      local pvc_uid=""
      create_volume_status="succeed"
      store_pvc_resources "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}"
      sleep 5
      delete_pvc_resources "${pv_name}" "${volumeattachment_name}"
      sleep 5
      create_volume "${pv_name}" "${backup_path}"
      sleep 10

      # The PV is put back from the saved manifest in both outcomes.
      kubectl create -f "${LOCAL_RESTORE_PATH}/${pv_name}-pv.json"
      if [[ -n "${create_volume_status}" && "${create_volume_status}" != "succeed" ]]; then
        echo "Backup volume restore failed for pvc ${pvc_name} in namespace ${namespace}"
        restore_pvc_status="failed"
      else
        pvc_uid=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".metadata.uid")

        # update pv with pvc uid
        kubectl patch pv "${pv_name}" --type json -p "[{\"op\": \"add\", \"path\": \"/spec/claimRef/uid\", \"value\": \"${pvc_uid}\"}]"

        wait_pvc_bound "${namespace}" "${pvc_name}"
        # No `echo "${result}"` here: wait_pvc_bound prints its own outcome and
        # never sets `result`, so that echo only replayed a stale global and
        # aborted under `set -u` when nothing had set it.
      fi
      echo "${restore_pvc_status:-}"
    }
    
    # Entry point for restoring a single PVC: marks the restore as "succeed",
    # delegates to restore_with_backupvolume (which may flip the status to
    # "failed") and prints the final status.
    # Globals:
    #   restore_pvc_status - written
    # Arguments:
    #   $1 - PVC name   $2 - namespace   $3 - PV name
    #   $4 - VolumeAttachment name   $5 - backup URL
    function restore_pvc(){
      local pvc_name=$1 namespace=$2 pv_name=$3
      local volumeattachment_name=$4 backup_path=$5

      restore_pvc_status="succeed"
      restore_with_backupvolume \
        "${pvc_name}" \
        "${namespace}" \
        "${pv_name}" \
        "${volumeattachment_name}" \
        "${backup_path}"
      # Left unquoted on purpose to keep the original output rules.
      # shellcheck disable=SC2086
      echo ${restore_pvc_status}
      # attach_volume "${pv_name}"
    }
    
    # Fetches the backup list of a PV's backup volume from the Longhorn REST API
    # and picks the first entry that has a non-empty url. Tries up to 3 times,
    # 10s apart.
    # Globals:
    #   LONGHORN_URL   - read
    #   PV_BACKUP_PATH - exported; the selected backup as compact JSON, or empty
    #                    when no list could be fetched
    # Arguments:
    #   $1 - PV name
    function get_backup_list_by_pvname(){
      local pv_name=$1
      local get_backup_list_by_pvname_resp=""
      local backup_list_response=""

      local try=0
      local maxtry=3
      while (( try != maxtry )) ; do

        # `|| true`: a failed request has to fall through to the retry below;
        # without it the first transient curl error aborts the script (set -e).
        backup_list_response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?action=backupList" -X 'POST' -H 'Content-Length: 0' -H 'Accept: application/json') || true
        if [[ -n "${backup_list_response}" && "${backup_list_response}" != " " && -n $( echo "${backup_list_response}"|jq ".data" )  && -n $( echo "${backup_list_response}"|jq ".data[]" ) ]]; then
          echo "Backup List Response: ${backup_list_response}"
          # pick first backup data with url non empty
          get_backup_list_by_pvname_resp=$(echo "${backup_list_response}"|jq -c '[.data[]|select(.url | . != null and . != "")][0]')
          break;
        fi
        try=$((try+1))
        sleep 10
      done
      export PV_BACKUP_PATH="${get_backup_list_by_pvname_resp}"
    }
    
    # Collect the resources tied to a PVC: its PV, the volume attachment that
    # references that PV, and (when a backup exists) the backup url.
    #
    # Globals written:
    #   PV_NAME, VOLUME_ATTACHMENT_LIST, VOLUME_ATTACHMENT, PV_CLAIM_NAME,
    #   VOLUME_ATTACHMENT_NAME, BACKUP_PATH - intermediate lookup results
    #   get_pvc_resources_resp - JSON object with the keys pvc_name, pv_name,
    #                            volumeattachment_name and backup_path
    # Globals consumed and unset:
    #   IS_BACKUP_AVAILABLE - read after validate_pv_backup_available runs
    #   PV_BACKUP_PATH      - read after get_backup_list_by_pvname runs
    # Arguments:
    #   $1 - PVC name
    #   $2 - namespace of the PVC
    # Outputs:
    #   A notice when a backup is available, then get_pvc_resources_resp.
    function get_pvc_resources() {
      local PVC_NAME=$1
      local PVC_NAMESPACE=$2

      get_pvc_resources_resp=""

      # PV have one to one mapping with PVC
      PV_NAME=$(kubectl -n "${PVC_NAMESPACE}" get pvc "${PVC_NAME}" -o json | jq -r ".spec.volumeName")

      # One compact {name, pvClaimName} object per volume attachment that
      # references a persistent volume.
      VOLUME_ATTACHMENT_LIST=$(kubectl get volumeattachments -n "${PVC_NAMESPACE}" -o=json \
        | jq -c '.items[]|{name: .metadata.name, pvClaimName:.spec.source| select( has ("persistentVolumeName")).persistentVolumeName}')

      # Pick the first attachment whose PV matches the PV of this PVC.
      VOLUME_ATTACHMENT_NAME=""
      for VOLUME_ATTACHMENT in ${VOLUME_ATTACHMENT_LIST}; do
        PV_CLAIM_NAME=$(printf '%s\n' "${VOLUME_ATTACHMENT}" | jq -r ".pvClaimName")
        [[ "${PV_NAME}" = "${PV_CLAIM_NAME}" ]] || continue
        VOLUME_ATTACHMENT_NAME=$(printf '%s\n' "${VOLUME_ATTACHMENT}" | jq -r ".name")
        break
      done

      BACKUP_PATH=""
      validate_pv_backup_available "${PV_NAME}"
      # Treat a missing flag as "no backup".
      local is_backup_available="${IS_BACKUP_AVAILABLE:-0}"
      unset IS_BACKUP_AVAILABLE

      if [ "${is_backup_available}" -eq 1 ]; then
        echo "Backup is available for PVC ${PVC_NAME}"

        get_backup_list_by_pvname "${PV_NAME}"
        BACKUP_PATH=$(printf '%s\n' "${PV_BACKUP_PATH}" | jq -r ".url")
        unset PV_BACKUP_PATH
      fi

      printf -v get_pvc_resources_resp \
        '{"pvc_name": "%s", "pv_name": "%s", "volumeattachment_name": "%s", "backup_path": "%s"}' \
        "${PVC_NAME}" "${PV_NAME}" "${VOLUME_ATTACHMENT_NAME}" "${BACKUP_PATH}"
      echo "${get_pvc_resources_resp}"
    }
    
    # Set "/spec/members" on every owner of a statefulset.
    #
    # Each entry of the ownerReferences array is looked up with kubectl; owners
    # that exist in the namespace get a JSON patch replacing /spec/members with
    # the requested count. Owners that cannot be fetched are skipped.
    #
    # Arguments:
    #   $1 - ownerReferences as a JSON array ("" or "null" means nothing to do)
    #   $2 - namespace
    #   $3 - member count to set
    # Outputs:
    #   One "Owner: <json>" line per owner, plus kubectl patch output.
    function scale_ownerreferences() {
      local owner_refs=$1
      local namespace=$2
      local replicas=$3
      local owner_items
      local resourceKind
      local resourceName
      local patch_body

      # no operation required
      [[ -n "${owner_refs}" && "${owner_refs}" != "null" ]] || return 0

      printf -v patch_body '[{"op": "replace", "path": "/spec/members", "value": %s }]' "${replicas}"

      # one compact JSON object per owner
      owner_items=$(printf '%s\n' "${owner_refs}" | jq -c ".[]")
      for ownerReference in ${owner_items}; do
        echo "Owner: ${ownerReference}"
        resourceKind=$(printf '%s\n' "${ownerReference}" | jq -r ".kind")
        resourceName=$(printf '%s\n' "${ownerReference}" | jq -r ".name")

        if kubectl -n "${namespace}" get "${resourceKind}" "${resourceName}" >/dev/null 2>&1; then
          # scale replicas
          kubectl -n "${namespace}" patch "${resourceKind}" "${resourceName}" --type json -p "${patch_body}"
        fi
      done
    }
    
    # Scale a statefulset (and its owners) down to zero replicas.
    #
    # Owners are scaled first through scale_ownerreferences. The statefulset is
    # then scaled to 0 and polled, up to 30 times with a 30 second pause, until
    # "kubectl get statefulset" reports 0/0.
    #
    # Globals:
    #   result (write) - output of the last "kubectl scale" call. Left global on
    #                    purpose: restore_with_backupvolume in this file echoes
    #                    a global of the same name.
    # Arguments:
    #   $1 - statefulset name
    #   $2 - namespace
    #   $3 - ownerReferences JSON array of the statefulset (may be "" or "null")
    # Outputs:
    #   Progress messages and a final success/failure line.
    function scale_down_statefulset() {
      local statefulset_name=$1
      local namespace=$2
      local ownerReferences=$3

      echo "Start Scale Down statefulset ${statefulset_name} under namespace ${namespace}..."

      # validate and scale down ownerreference
      scale_ownerreferences "${ownerReferences}" "${namespace}" 0

      local try=0
      local maxtry=30
      # Kept local (as in scale_up_statefulset) so the caller's variables are
      # not clobbered.
      local success=0
      local scaledown=""
      while (( try != maxtry )) ; do
        result=$(kubectl scale statefulset "${statefulset_name}" --replicas=0 -n "${namespace}") || true
        echo "${result}"
        # -F/-w: match the READY column "0/0" as a whole token, so values such
        # as "10/0" are not mistaken for a completed scale down.
        scaledown=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep -Fw -- "0/0") || true
        if [[ -n "${scaledown}" && "${scaledown}" != " " ]]; then
          echo "Statefulset scaled down successfully."
          success=1
          break
        else
          try=$((try+1))
          echo "waiting for the statefulset ${statefulset_name} to scale down...${try}/${maxtry}";sleep 30
        fi
      done

      if [[ "${success}" -eq 0 ]]; then
        echo "Statefulset ${statefulset_name} scaled down failed"
      fi
    }
    
    # Scale a statefulset (and its owners) back up to the given replica count.
    #
    # Owners are scaled first through scale_ownerreferences. The statefulset is
    # then scaled and polled, up to 15 times with a 30 second pause, until
    # "kubectl get statefulset" reports <replica>/<replica>.
    #
    # Arguments:
    #   $1 - statefulset name
    #   $2 - namespace
    #   $3 - desired replica count
    #   $4 - ownerReferences JSON array of the statefulset (may be "" or "null")
    # Outputs:
    #   kubectl output, progress messages and a final success/failure line.
    function scale_up_statefulset() {
      local statefulset_name=$1
      local namespace=$2
      local replica=$3
      local ownerReferences=$4

      # Scale up statefulsets using PVCs
      echo "Start Scale Up statefulset ${statefulset_name}..."

      # validate and scale up ownerreference
      scale_ownerreferences "${ownerReferences}" "${namespace}" "${replica}"

      echo "Waiting to scale up statefulset..."

      # Start at 0 (like scale_down_statefulset) so that all ${maxtry} attempts
      # run and the progress counter goes from 1/${maxtry} to ${maxtry}/${maxtry}.
      local try=0
      local maxtry=15
      local success=0
      local scaleup=""
      while (( try != maxtry )) ; do

        kubectl scale statefulset "${statefulset_name}" --replicas="${replica}" -n "${namespace}"
        kubectl get statefulset "${statefulset_name}" -n "${namespace}"

        # -F/-w: match the READY column as a whole token, so e.g. "11/1" is not
        # mistaken for "1/1".
        scaleup=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep -Fw -- "${replica}/${replica}") || true
        if [[ -z "${scaleup}" || "${scaleup}" == " " ]]; then
          try=$((try+1))
          echo "waiting for the statefulset ${statefulset_name} to scale up...${try}/${maxtry}"; sleep 30
        else
          echo "Statefulset scaled up successfully."
          success=1
          break
        fi
      done

      if [[ "${success}" -eq 0 ]]; then
        echo "Statefulset scaled up failed ${statefulset_name}."
      fi
    }
    
    # Restore, from backup, the PVCs of every statefulset in a namespace that
    # declares volumeClaimTemplates.
    #
    # For each such statefulset: wait until it reports a positive replica count,
    # scale it (and its owners) down, restore every PVC named
    # "<claimTemplate>-<statefulset>-<index>" that uses one of the supported
    # storage classes and has a backup, then scale it back up to the original
    # replica count.
    #
    # Globals:
    #   STORAGE_CLASS, STORAGE_CLASS_SINGLE_REPLICA (read) - storage classes whose
    #       PVCs are restored; PVCs of any other class are skipped
    #   get_pvc_resources_resp (read/write) - result of get_pvc_resources
    #   restore_pvc_status     (read/write) - result of restore_pvc
    #   FAILED_PVC_LIST        (write) - ",<pv_name>" is appended for every PVC
    #       that has no backup path or whose restore failed
    # Arguments:
    #   $1 - namespace
    # Outputs:
    #   Progress and failure messages.
    function restore_pvc_attached_to_statefulsets() {
      local namespace
      namespace="${1}"
      local statefulset_list
      local statefulset_name

      # list of all statefulset using PVC
      statefulset_list=$(kubectl get statefulset -n "${namespace}" -o=json | jq -r ".items[] | select(.spec.volumeClaimTemplates).metadata.name")

      for statefulset_name in ${statefulset_list}; do
        local replica=""
        local ownerReferences=""
        local claimTemplatesName=""
        local pvc_restore_failed=""
        local try=0
        local maxtry=5
        local status="notready"
        local name
        local i
        restore_pvc_status=""

        # check if statefulset is ready
        while [[ "${status}" == "notready" ]] && (( try < maxtry )); do
          echo "fetch statefulset ${statefulset_name} metadata...  ${try}/${maxtry}"
          try=$(( try + 1 ))
          replica=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec.replicas")

          # Only a positive integer counts as ready; 0, "null" and an empty
          # value (failed lookup) are retried.
          if [[ "${replica}" =~ ^[1-9][0-9]*$ ]]; then
            status="ready"
          else
            echo "statefulset ${statefulset_name} replica is not ready. Wait and retry"; sleep 30
          fi
        done

        if [[ "${status}" != "ready" ]]; then
          echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}. Please retrigger volume restore step."
          # Without a usable replica count there is nothing to restore, and
          # scaling would leave the statefulset and its owners at 0 members.
          continue
        fi

        # Fetch ownerReferences and claim name
        ownerReferences=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".metadata.ownerReferences")
        claimTemplatesName=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec | select( has (\"volumeClaimTemplates\") ).volumeClaimTemplates[].metadata.name " | xargs)

        echo "Scaling down Statefulset ${statefulset_name} with ${replica} under namespace ${namespace}"
        scale_down_statefulset "${statefulset_name}" "${namespace}" "${ownerReferences}"

        for name in ${claimTemplatesName}; do
          local pvc_prefix="${name}-${statefulset_name}"

          for (( i = 0; i < replica; i++ )); do
            local pvc_name="${pvc_prefix}-${i}"
            local pvc_exist

            pvc_exist=$(kubectl -n "${namespace}" get pvc "${pvc_name}") || true
            if [[ -z "${pvc_exist}" || "${pvc_exist}" == " " ]]; then
              echo "PVC not available for the statefulset ${statefulset_name}, skipping restore."
              continue
            fi

            local pvc_storageclass
            pvc_storageclass=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json | jq -r ".spec.storageClassName")
            if [[ "${pvc_storageclass}" != "${STORAGE_CLASS}" && "${pvc_storageclass}" != "${STORAGE_CLASS_SINGLE_REPLICA}" ]]; then
              echo "backup not available for pvc ${pvc_name}, storageclass: ${pvc_storageclass} "
              continue
            fi

            # get pv, volumeattachments for pvc
            get_pvc_resources_resp=""
            get_pvc_resources "${pvc_name}" "${namespace}"

            local pv_name
            local volumeattachment_name
            local backup_path
            pv_name=$(echo "${get_pvc_resources_resp}" | jq -r ".pv_name")
            volumeattachment_name=$(echo "${get_pvc_resources_resp}" | jq -r ".volumeattachment_name")
            backup_path=$(echo "${get_pvc_resources_resp}" | jq -r ".backup_path")
            if [[ -z "${backup_path}" || "${backup_path}" == " " || "${backup_path}" == "null" ]]; then
              # no backup to restore from
              pvc_restore_failed="error"
              FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
              continue
            fi

            restore_pvc_status="succeed"
            restore_pvc "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}" "${backup_path}"
            if [[ -n "${restore_pvc_status}" && "${restore_pvc_status}" != "succeed" ]]; then
              pvc_restore_failed="error"
              FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
            fi
          done
        done

        sleep 10
        scale_up_statefulset "${statefulset_name}" "${namespace}" "${replica}" "${ownerReferences}"
        sleep 5

        if [[ "${pvc_restore_failed}" == "error" ]]; then
            echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}."
        fi
      done
    }
    
    # Longhorn backend endpoint, "<cluster IP of svc/longhorn-backend>:9500";
    # get_backup_list_by_pvname sends its backup-list requests to this address.
    LONGHORN_URL=$(kubectl -n longhorn-system get svc longhorn-backend -o jsonpath="{.spec.clusterIP}"):9500
    
    # Entry point: restore the PVCs of all statefulsets in the "mongodb" namespace.
    restore_pvc_attached_to_statefulsets "mongodb"
  • 描述
  • 解决方案

此页面有帮助吗?

获取您需要的帮助
了解 RPA - 自动化课程
UiPath Community 论坛
Uipath Logo White
信任与安全
© 2005-2024 UiPath。保留所有权利。