MongoDB o aplicaciones empresariales degradadas después de la restauración del clúster

Descripción

En ocasiones, después de la restauración o reversión del clúster a la versión 2022.4.x o 2021.10.x, un problema hace que los pods de MongoDB o de las aplicaciones empresariales se queden bloqueados en el estado Init. Esto sucede porque falta el adjunto de volumen (VolumeAttachment) necesario para conectar el PVC a un pod.
Solución

Verifique si el problema está realmente relacionado con el adjunto de volumen (VolumeAttachment) de MongoDB:
```
# fetch all mongodb pods
kubectl -n mongodb get pods

# describe pods stuck in init state
# kubectl -n mongodb describe pods mongodb-replica-set-<replica index number>
kubectl -n mongodb describe pods mongodb-replica-set-0
```
Si el problema está relacionado con el adjunto de volumen (VolumeAttachment) de MongoDB, se muestran los siguientes eventos:
```
Events:
  Type     Reason              Age                   From                     Message
  ----     ------              ----                  ----                     -------
  Warning  FailedAttachVolume  3m9s (x65 over 133m)  attachdetach-controller  AttachVolume.Attach failed for volume "pvc-66897693-e52d-4b89-aac6-ca0cc5ae9e07" : rpc error: code = Aborted desc = volume pvc-66897693-e52d-4b89-aac6-ca0cc5ae9e07 is not ready for workloads
  Warning  FailedMount         103s (x50 over 112m)  kubelet                  (combined from similar events): Unable to attach or mount volumes: unmounted volumes=[logs-volume], unattached volumes=[hooks mongodb-replica-set-keyfile tls-secret data-volume healthstatus tls-ca kube-api-access-45qcl agent-scripts logs-volume automation-config]: timed out waiting for the condition
```
Corrija los pods problemáticos de MongoDB ejecutando el siguiente script:
#!/bin/bash

# Abort on any unhandled command failure (-e) and on use of unset variables (-u).
set -eu

# Accumulates the names of PVs whose restore failed (comma separated).
FAILED_PVC_LIST=""
# Only PVCs that use one of these two storage classes are restored.
STORAGE_CLASS="longhorn-backup"
STORAGE_CLASS_SINGLE_REPLICA="longhorn-backup-single-replica"
# Directory (relative to the current working directory) where the JSON
# manifests of the resources are saved before they are deleted.
export LOCAL_RESTORE_PATH="restoredata"
mkdir -p "${LOCAL_RESTORE_PATH}"


# Force-deletes the PV, the Longhorn volume and (optionally) the
# volumeattachment that belong to one PVC.
# Arguments:
#   $1 - PV name (the Longhorn volume is looked up under the same name)
#   $2 - volumeattachment name; empty or omitted skips that step
# Outputs: progress messages on stdout.
function delete_pvc_resources(){
  local pv_name=$1
  local volumeattachment_name=${2:-}

  echo "deleting pv & volumes forcefully"
  delete_pv_forcefully "${pv_name}"
  sleep 2
  delete_longhornvolume_forcefully "${pv_name}"
  sleep 2
  if [ -n "${volumeattachment_name}" ]; then
    echo "deleting volumeattachments forcefully"
    if declare -F delete_volumeattachment_forcefully >/dev/null; then
      delete_volumeattachment_forcefully "${volumeattachment_name}"
    else
      # The helper is not defined in this script. Calling it unconditionally
      # would abort the run under `set -e` after the PV and the volume have
      # already been deleted, so fall back to a best-effort inline deletion:
      # clear the finalizers, then force-delete the object.
      kubectl patch volumeattachment "${volumeattachment_name}" \
        --type merge -p '{"metadata":{"finalizers":null}}' || true
      kubectl delete volumeattachment "${volumeattachment_name}" \
        --grace-period=0 --force --ignore-not-found || true
    fi
    sleep 5
  fi
}

# Force-removes the Longhorn volume object that carries the given name.
# Arguments: $1 - volume name (the callers pass the PV name)
# Outputs:   kubectl output and progress messages on stdout.
# Notes:     Returns right away when the volume does not exist. When the
#            volume is still listed after 10 attempts a failure message is
#            printed, but no error status is raised.
function delete_longhornvolume_forcefully() {
  local pv_name=$1
  local attempt
  local max_attempts=10
  local deleted=0
  local output=""

  if ! kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" >/dev/null 2>&1 ; then
    echo "Volume ${pv_name} not found, skip deletion."
    return
  fi

  # The delete is started in the background; the loop below keeps clearing
  # the finalizers until the object no longer shows up.
  kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &

  for (( attempt = 0; attempt < max_attempts; attempt += 1 )); do
    # Set the finalizers field to null and re-apply the object.
    output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
    echo "${output}"

    # An empty listing is treated as "volume deleted".
    output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}")|| true
    echo "${output}"
    if [[ -z "${output}" ]]; then
      deleted=1
      break
    fi

    kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &
    echo "Waiting to delete volume ${pv_name} ${attempt}/${max_attempts}..."
    sleep 10
  done

  if (( deleted == 0 )); then
    echo "Failed to delete volume ${pv_name}."
  fi
}

# Force-removes a PersistentVolume.
# Arguments: $1 - PV name
# Globals:   result (written) - holds the output of the last kubectl call;
#            it is not declared local, exactly as in the original script.
# Outputs:   kubectl output and progress messages on stdout.
# Notes:     When the PV is still listed after 10 attempts a failure message
#            is printed, but no error status is raised.
function delete_pv_forcefully() {
  local pv_name=$1
  local attempt
  local max_attempts=10
  local deleted=0

  # The delete is started in the background; the loop below keeps clearing
  # the finalizers until the PV no longer shows up.
  kubectl delete pv "${pv_name}" --grace-period=0 --force &

  for (( attempt = 0; attempt < max_attempts; attempt += 1 )); do
    # Set the finalizers field to null and re-apply the object.
    result=$(kubectl get pv "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
    echo "${result}"

    # An empty listing is treated as "PV deleted".
    result=$(kubectl get pv "${pv_name}")|| true
    echo "${result}"
    if [[ -z "${result}" ]]; then
      deleted=1
      break
    fi

    kubectl delete pv "${pv_name}" --grace-period=0 --force &
    echo "Waiting to delete pv ${pv_name} ${attempt}/${max_attempts}..."
    sleep 10
  done

  if (( deleted == 0 )); then
    echo "Failed to delete pv ${pv_name}."
  fi
}

# Asks the Longhorn API whether a backup volume exists for the given PV.
# Arguments: $1 - PV name
# Globals:   LONGHORN_URL (read); IS_BACKUP_AVAILABLE (exported: 1 when a
#            backup volume without an error message was found, otherwise 0)
# Outputs:   the raw API response and progress messages on stdout.
# Notes:     Up to 3 attempts, 10 seconds apart. A response that is not valid
#            JSON is treated as "not available yet" and retried instead of
#            aborting the whole script under `set -e`.
function validate_pv_backup_available(){
  local pv_name=$1
  local backup_available=0
  local backup_resp=""
  local resp_status_code=""
  local resp_message=""

  local try=0
  local maxtry=3
  while (( try != maxtry )) ; do
    backup_resp=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?") || true
    echo "Backup Response: ${backup_resp}"
    if [[ -n "${backup_resp}" && "${backup_resp}" != " " ]]; then
      # "invalid" marks a response that jq could not parse.
      resp_status_code=$(echo "${backup_resp}" | jq -c ".status" 2>/dev/null) || resp_status_code="invalid"
      if [[ "${resp_status_code}" == "invalid" ]]; then
        echo "Backup response for ${pv_name} is not valid JSON, retrying."
      elif [[ "${resp_status_code}" =~ ^[0-9]+$ ]] && (( resp_status_code > 200 )); then
        # A numeric status above 200 in the payload is treated as an error reply.
        backup_available=0
      else
        resp_message=$(echo "${backup_resp}" | jq -c ".messages.error" 2>/dev/null) || resp_message="invalid"
        if [[ -z "${resp_message}" || "${resp_message}" == "null" ]]; then
          echo "PVC Backup is available for ${pv_name}"
          backup_available=1
          break
        fi
      fi
    fi
    try=$((try+1))
    sleep 10
  done
  export IS_BACKUP_AVAILABLE="${backup_available}"
}

# Saves the JSON manifests of the objects behind one PVC into
# ${LOCAL_RESTORE_PATH} before they are deleted.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
#   $3 - PV name (also used as the Longhorn volume name)
#   $4 - volumeattachment name; empty or a single blank skips that dump
# Globals: LOCAL_RESTORE_PATH (read); result (written, not local - as in the
#          original script)
# Files written: <volumeattachment>.json, <pv>-pv.json, <pv>-volume.json,
#                <pvc>-pvc.json
function store_pvc_resources(){
  local pvc_name=$1
  local namespace=$2
  local pv_name=$3
  local volumeattachment_name=$4

  case "${volumeattachment_name}" in
    ""|" ")
      ;;
    *)
      # Best effort: a failure to dump the volumeattachment is ignored.
      result=$(kubectl get volumeattachments "${volumeattachment_name}" -o json > "${LOCAL_RESTORE_PATH}/${volumeattachment_name}.json") || true
      echo "${result}"
      ;;
  esac

  kubectl get pv "${pv_name}" -o json \
    > "${LOCAL_RESTORE_PATH}/${pv_name}-pv.json"
  kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o json \
    > "${LOCAL_RESTORE_PATH}/${pv_name}-volume.json"
  kubectl get pvc "${pvc_name}" -n "${namespace}" -o json \
    > "${LOCAL_RESTORE_PATH}/${pvc_name}-pvc.json"
}

# Polls a PVC until its status phase is "Bound".
# Arguments:
#   $1 - namespace of the PVC
#   $2 - PVC name
# Outputs: progress messages on stdout.
# Notes:   Up to 30 polls, 30 seconds apart. A timeout is reported on stdout
#          but the function still returns 0, so callers running under
#          `set -e` keep going as before.
function wait_pvc_bound() {
  local namespace=$1
  local pvc_name=$2
  # Loop counters are local so they cannot overwrite same-named variables of
  # the calling functions (bash uses dynamic scoping).
  local try=0
  local maxtry=30
  local status=""
  local bound=0

  while (( try < maxtry )); do
    status=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".status | select( has(\"phase\")).phase")
    if [[ "${status}"  == "Bound" ]]; then
      echo "PVC ${pvc_name} Bouned successfully."
      bound=1
      break
    fi
    echo "waiting for PVC to bound...${try}/${maxtry}"; sleep 30
    try=$((try+1))
  done

  if (( bound == 0 )); then
    echo "PVC ${pvc_name} is not Bound after ${maxtry} attempts."
  fi
}

# Creates a Longhorn volume from a backup and waits until it is detached.
# Arguments:
#   $1 - PV name (used as the volume name)
#   $2 - backup URL passed as "fromBackup"
# Globals: LONGHORN_URL, LOCAL_RESTORE_PATH (read);
#          create_volume_status (written: "succeed" or "failed")
# Outputs: progress messages and, as last line, the final status on stdout.
# Notes:   Access mode and replica count are taken from the saved
#          <pv>-volume.json. When the API call fails or the reply carries no
#          volume id, the saved volume manifest is re-created (best effort)
#          and the status is "failed".
function create_volume() {
  local pv_name=$1
  local backup_path=$2
  local response=""
  local volume_id=""
  local status=""
  local accessmode
  local replicacount
  create_volume_status="succeed"
  echo "Creating Volume with PV: ${pv_name} and BackupPath: ${backup_path}"
  accessmode=$(< "${LOCAL_RESTORE_PATH}/${pv_name}-volume.json" jq -r ".spec.accessMode")
  replicacount=$(< "${LOCAL_RESTORE_PATH}/${pv_name}-volume.json" jq -r ".spec.numberOfReplicas")

  # create volume from backup pv; a curl failure is routed to the failure
  # branch below instead of aborting the script after the old volume is gone
  response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/volumes" -H 'Accept: application/json' -H 'Content-Type: application/json;charset=UTF-8' --data-raw "{\"name\":\"${pv_name}\", \"accessMode\":\"${accessmode}\", \"fromBackup\": \"${backup_path}\", \"numberOfReplicas\": ${replicacount}}") || response=""

  sleep 5

  # `jq -r .id` prints the string "null" when the field is missing, which
  # would make an error reply look like a success; `// empty` yields nothing.
  if [[ -n "${response}" && "${response}" != "null" && "${response}" != " " ]]; then
    volume_id=$(echo "${response}" | jq -r '.id // empty' 2>/dev/null) || volume_id=""
  fi

  if [[ -n "${volume_id}" ]]; then
    # wait for volume to be detached, max wait 4hr (480 polls x 30s)
    local try=0
    local maxtry=480
    local success=0
    while (( try < maxtry )); do
      status=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}"  -o json |jq -r ".status.state") || true
      echo "volume ${pv_name} status: ${status}"
      if [[ "${status}" == "detached" ]]; then
        # update label
        kubectl -n longhorn-system label volumes.longhorn.io/"${pv_name}" recurring-job-group.longhorn.io/uipath-backup=enabled
        success=1
        echo "Volume ready to use"
        break
      fi
      echo "waiting for volume to be ready to use${try}/${maxtry}..."; sleep 30
      try=$(( try + 1 ))
    done

    if [ "${success}" -eq 0 ]; then
      create_volume_status="failed"
      echo "${pv_name} Volume is not ready to use with status ${status}"
    fi
  else
    create_volume_status="failed"
    # put the previously saved volume manifest back (best effort)
    kubectl create -f "${LOCAL_RESTORE_PATH}/${pv_name}"-volume.json || true
    echo "${pv_name} Volume creation failed ${response} "
  fi
  echo "${create_volume_status}"
}

# Replaces the PV and Longhorn volume behind a PVC with a volume restored
# from backup and rebinds the PVC to the re-created PV.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
#   $3 - PV name
#   $4 - volumeattachment name (may be empty)
#   $5 - backup URL to restore from
# Globals: LOCAL_RESTORE_PATH (read); create_volume_status (written);
#          restore_pvc_status (set to "failed" when the volume restore fails)
# Outputs: progress messages and, as last line, restore_pvc_status.
function restore_with_backupvolume() {
  local pvc_name=$1
  local namespace=$2
  local pv_name=$3
  local volumeattachment_name=$4
  local backup_path=$5
  local pvc_uid=""
  create_volume_status="succeed"
  # Normally initialised by the caller; default it so a direct call does not
  # trip `set -u` when the status is printed at the end.
  restore_pvc_status="${restore_pvc_status:-succeed}"

  store_pvc_resources "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}"
  sleep 5
  delete_pvc_resources "${pv_name}" "${volumeattachment_name}"
  sleep 5
  create_volume "${pv_name}" "${backup_path}"
  sleep 10

  # The saved PV manifest is re-created whether or not the volume restore
  # succeeded.
  kubectl create -f "${LOCAL_RESTORE_PATH}/${pv_name}"-pv.json
  if [[ -n "${create_volume_status}" && "${create_volume_status}" != "succeed" ]]; then
    echo "Backup volume restore failed for pvc ${pvc_name} in namespace ${namespace}"
    restore_pvc_status="failed"
  else
    pvc_uid=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".metadata.uid")

    # update pv with pvc uid
    kubectl patch pv "${pv_name}" --type json -p "[{\"op\": \"add\", \"path\": \"/spec/claimRef/uid\", \"value\": \"${pvc_uid}\"}]"

    # The original echoed a global "result" here that this function never
    # sets; that read could abort under `set -u`, so it was removed.
    wait_pvc_bound "${namespace}" "${pvc_name}"
  fi
  echo "${restore_pvc_status}"
}

# Entry point for restoring a single PVC from its backup.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
#   $3 - PV name
#   $4 - volumeattachment name (may be empty)
#   $5 - backup URL
# Globals: restore_pvc_status (written) - reset to "succeed" here;
#          restore_with_backupvolume may change it.
# Outputs: the output of restore_with_backupvolume followed by the status.
function restore_pvc(){
  local -a restore_args=("$1" "$2" "$3" "$4" "$5")

  restore_pvc_status="succeed"
  restore_with_backupvolume "${restore_args[@]}"
  echo ${restore_pvc_status}
  # attach_volume "${pv_name}"   (left disabled, as in the original script)
}

# Fetches the backup list of a backup volume from the Longhorn API and picks
# the first entry that has a non-empty url.
# Arguments: $1 - PV name (backup volume name)
# Globals:   LONGHORN_URL (read); PV_BACKUP_PATH (exported: the selected
#            backup as compact JSON, or empty when none was found)
# Outputs:   the raw API response on stdout when it contains data.
# Notes:     Up to 3 attempts, 10 seconds apart. curl and jq failures are
#            tolerated so that `set -e` cannot bypass the retry loop.
function get_backup_list_by_pvname(){
  local pv_name=$1
  local selected_backup=""
  local backup_list_response=""

  local try=0
  local maxtry=3
  while (( try != maxtry )) ; do

    backup_list_response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?action=backupList" -X 'POST' -H 'Content-Length: 0' -H 'Accept: application/json') || true
    # A usable response is non-empty, has a "data" field and at least one item.
    if [[ -n "${backup_list_response}" && "${backup_list_response}" != " " && -n $( echo "${backup_list_response}"|jq ".data" )  && -n $( echo "${backup_list_response}"|jq ".data[]" ) ]]; then
      echo "Backup List Response: ${backup_list_response}"
      # pick first backup data with url non empty
      selected_backup=$(echo "${backup_list_response}"|jq -c '[.data[]|select(.url | . != null and . != "")][0]') || true
      break;
    fi
    try=$((try+1))
    sleep 10
  done
  export PV_BACKUP_PATH="${selected_backup}"
}

# Collects the PV name, the matching volumeattachment name and the backup url
# for one PVC.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
# Globals written (none are local, as in the original script):
#   get_pvc_resources_resp - JSON object with the keys pvc_name, pv_name,
#                            volumeattachment_name and backup_path
#   PV_NAME, VOLUME_ATTACHMENT_LIST, VOLUME_ATTACHMENT, PV_CLAIM_NAME,
#   VOLUME_ATTACHMENT_NAME, BACKUP_PATH - intermediate values
# Outputs: progress messages and the resulting JSON on stdout.
function get_pvc_resources() {
  local PVC_NAME=$1
  local PVC_NAMESPACE=$2
  local is_backup_available

  get_pvc_resources_resp=""

  # PV have one to one mapping with PVC
  PV_NAME=$(kubectl -n "${PVC_NAMESPACE}" get pvc "${PVC_NAME}" -o json | jq -r ".spec.volumeName")

  # One compact JSON object per volumeattachment that references a PV.
  VOLUME_ATTACHMENT_LIST=$(kubectl get volumeattachments -n "${PVC_NAMESPACE}" -o=json \
    | jq -c '.items[]|{name: .metadata.name, pvClaimName:.spec.source| select( has ("persistentVolumeName")).persistentVolumeName}')

  # Look for the attachment whose PV is the one bound to this PVC.
  VOLUME_ATTACHMENT_NAME=""
  for VOLUME_ATTACHMENT in ${VOLUME_ATTACHMENT_LIST}; do
    PV_CLAIM_NAME=$(jq -r ".pvClaimName" <<< "${VOLUME_ATTACHMENT}")
    if [[ "${PV_NAME}" = "${PV_CLAIM_NAME}" ]]; then
      VOLUME_ATTACHMENT_NAME=$(jq -r ".name" <<< "${VOLUME_ATTACHMENT}")
      break
    fi
  done

  BACKUP_PATH=""
  # The helper reports its result through the exported IS_BACKUP_AVAILABLE.
  validate_pv_backup_available "${PV_NAME}"
  is_backup_available=${IS_BACKUP_AVAILABLE:- 0}
  unset IS_BACKUP_AVAILABLE

  if [ "${is_backup_available}" -eq 1 ]; then
    echo "Backup is available for PVC ${PVC_NAME}"

    # The helper reports the selected backup through the exported PV_BACKUP_PATH.
    get_backup_list_by_pvname "${PV_NAME}"
    BACKUP_PATH=$(jq -r ".url" <<< "${PV_BACKUP_PATH}")
    unset PV_BACKUP_PATH
  fi

  printf -v get_pvc_resources_resp \
    '{"pvc_name": "%s", "pv_name": "%s", "volumeattachment_name": "%s", "backup_path": "%s"}' \
    "${PVC_NAME}" "${PV_NAME}" "${VOLUME_ATTACHMENT_NAME}" "${BACKUP_PATH}"
  echo "${get_pvc_resources_resp}"
}

# Sets /spec/members on every owner of a statefulset to the given value.
# Arguments:
#   $1 - JSON array of ownerReferences (empty or "null" means nothing to do)
#   $2 - namespace
#   $3 - value written to /spec/members
# Globals: ownerReference (loop variable, not local - as in the original)
# Outputs: one "Owner: ..." line per reference plus kubectl output.
# Notes:   Owners that cannot be fetched with kubectl are skipped.
function scale_ownerreferences() {
  local ownerReferences=$1
  local namespace=$2
  local replicas=$3
  local owner_kind
  local owner_name

  # no operation required
  case "${ownerReferences}" in
    ""|"null") return ;;
  esac

  # Flatten the array into one compact JSON object per line.
  ownerReferences=$(jq -c ".[]" <<< "${ownerReferences}")
  for ownerReference in ${ownerReferences}; do
    echo "Owner: ${ownerReference}"
    owner_kind=$(jq -r ".kind" <<< "${ownerReference}")
    owner_name=$(jq -r ".name" <<< "${ownerReference}")

    kubectl -n "${namespace}" get "${owner_kind}" "${owner_name}" >/dev/null 2>&1 || continue

    # scale replicas
    kubectl  -n "${namespace}" patch "${owner_kind}" "${owner_name}" --type json -p "[{\"op\": \"replace\", \"path\": \"/spec/members\", \"value\": ${replicas} }]"
  done
}

# Scales a statefulset (and its owners) down to zero and waits for it.
# Arguments:
#   $1 - statefulset name
#   $2 - namespace
#   $3 - JSON array of ownerReferences (or "null")
# Globals: result (written; kept global as in the original script)
# Outputs: kubectl output and progress messages on stdout.
# Notes:   Up to 30 attempts, 30 seconds apart. A failure is reported on
#          stdout only; no error status is raised.
function scale_down_statefulset() {
  local statefulset_name=$1
  local namespace=$2
  local ownerReferences=$3
  local try=0
  local maxtry=30
  local success=0
  local scaledown=""

  echo "Start Scale Down statefulset ${statefulset_name} under namespace ${namespace}..."

  # validate and scale down ownerreference
  scale_ownerreferences "${ownerReferences}" "${namespace}" 0

  while (( try != maxtry )) ; do
    result=$(kubectl scale statefulset "${statefulset_name}" --replicas=0 -n "${namespace}") || true
    echo "${result}"
    # Match READY as a whole column: a plain `grep 0/0` also matches values
    # such as "10/0" or "10/10" and would report success too early.
    scaledown=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep -E '(^|[[:space:]])0/0([[:space:]]|$)') || true
    if [[ -n "${scaledown}" && "${scaledown}" != " " ]]; then
      echo "Statefulset scaled down successfully."
      success=1
      break
    else
      try=$((try+1))
      echo "waiting for the statefulset ${statefulset_name} to scale down...${try}/${maxtry}";sleep 30
    fi
  done

  if [ "${success}" -eq 0 ]; then
    echo "Statefulset ${statefulset_name} scaled down failed"
  fi
}

# Scales a statefulset (and its owners) back up and waits until all replicas
# are reported ready.
# Arguments:
#   $1 - statefulset name
#   $2 - namespace
#   $3 - target replica count
#   $4 - JSON array of ownerReferences (or "null")
# Outputs: kubectl output and progress messages on stdout.
# Notes:   The counter starts at 1 and stops at 15, i.e. 14 attempts,
#          30 seconds apart. A failure is reported on stdout only.
function scale_up_statefulset() {
  local statefulset_name=$1
  local namespace=$2
  local replica=$3
  local ownerReferences=$4
  local try=1
  local maxtry=15
  local success=0
  local scaleup=""
  # READY must equal "<replica>/<replica>" as a whole column.
  local ready_pattern="(^|[[:space:]])${replica}/${replica}([[:space:]]|\$)"

  # Scale up statefulsets using PVCs
  echo "Start Scale Up statefulset ${statefulset_name}..."

  # validate and scale up ownerreference
  scale_ownerreferences "${ownerReferences}" "${namespace}" "${replica}"

  echo "Waiting to scale up statefulset..."

  while (( try != maxtry )) ; do

    # Tolerate transient kubectl failures, as scale_down_statefulset does;
    # under `set -e` they would otherwise abort the script while the
    # statefulset is still scaled down.
    kubectl scale statefulset "${statefulset_name}" --replicas="${replica}" -n "${namespace}" || true
    kubectl get statefulset "${statefulset_name}" -n "${namespace}" || true

    scaleup=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep -E "${ready_pattern}") || true
    if [[ -n "${scaleup}" && "${scaleup}" != " " ]]; then
      echo "Statefulset scaled up successfully."
      success=1
      break
    fi
    try=$((try+1))
    echo "waiting for the statefulset ${statefulset_name} to scale up...${try}/${maxtry}"; sleep 30
  done

  if [ "${success}" -eq 0 ]; then
    echo "Statefulset scaled up failed ${statefulset_name}."
  fi
}

# Restores, from backup, the PVCs of every statefulset in a namespace that
# declares volumeClaimTemplates.
# Arguments: $1 - namespace
# Globals:   STORAGE_CLASS, STORAGE_CLASS_SINGLE_REPLICA (read);
#            FAILED_PVC_LIST (appended with ",<pv name>" for each failure);
#            restore_pvc_status, get_pvc_resources_resp, claimTemplatesName,
#            pvc_exist (written)
# Outputs:   progress messages on stdout.
# Flow per statefulset: wait for a non-zero replica count, scale down,
#            restore each PVC named <claim template>-<statefulset>-<index>,
#            then scale back up to the recorded replica count.
function restore_pvc_attached_to_statefulsets() {
  local namespace
  namespace="${1}"
  local statefulset_list

  # list of all statefulset using PVC
  statefulset_list=$(kubectl get statefulset -n "${namespace}" -o=json | jq -r ".items[] | select(.spec.volumeClaimTemplates).metadata.name")

  for statefulset_name in ${statefulset_list};
  do
    local replica
    local ownerReferences
    local pvc_restore_failed
    local try=0
    local maxtry=5
    local status="notready"
    pvc_restore_failed=""
    restore_pvc_status=""

    # check if statefulset is ready: poll spec.replicas until it is not 0
    # (at most 5 polls, 30 seconds apart)
    while [[ "${status}" == "notready" ]] && (( try < maxtry )); do
      echo "fetch statefulset ${statefulset_name} metadata...  ${try}/${maxtry}"
      try=$(( try + 1 ))
      replica=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec.replicas")

      if [[ "${replica}" != 0 ]]; then
        status="ready"
      else
        echo "statefulset ${statefulset_name} replica is not ready. Wait and retry"; sleep 30
      fi
    done

    # NOTE(review): only a message is printed here; execution continues with
    # the steps below even when the replica count stayed at 0 - confirm that
    # this is intended.
    if [[ "${status}" != "ready" ]]; then
      echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}. Please retrigger volume restore step."
    fi

    # Fetch ownerReferences and claim name
    # (xargs joins the claim template names onto one line and strips the
    # JSON quotes)
    ownerReferences=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".metadata.ownerReferences")
    claimTemplatesName=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec | select( has (\"volumeClaimTemplates\") ).volumeClaimTemplates[].metadata.name " | xargs)
    
    echo "Scaling down Statefulset ${statefulset_name} with ${replica} under namespace ${namespace}"
    scale_down_statefulset "${statefulset_name}" "${namespace}" "${ownerReferences}"
    for name in ${claimTemplatesName}; do
      local pvc_prefix
      pvc_prefix="${name}-${statefulset_name}"

      # one PVC per replica index: <claim template>-<statefulset>-<index>
      for((i=0;i<"${replica}";i++)); do
        local pvc_name
        pvc_name="${pvc_prefix}-${i}"

        # an empty listing means the PVC does not exist
        pvc_exist=$(kubectl -n "${namespace}" get pvc "${pvc_name}") || true
        if [[ -z "${pvc_exist}" || "${pvc_exist}" == " " ]]; then
          echo "PVC not available for the statefulset ${statefulset_name}, skipping restore."
          continue;
        fi

        # only PVCs that use one of the two backup storage classes are restored
        local pvc_storageclass
        pvc_storageclass=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".spec.storageClassName")
        if [[ ! ( "${pvc_storageclass}" == "${STORAGE_CLASS}" || "${pvc_storageclass}" == "${STORAGE_CLASS_SINGLE_REPLICA}" ) ]]; then
          echo "backup not available for pvc ${pvc_name}, storageclass: ${pvc_storageclass} "
          continue;
        fi

        # get pv, volumeattachments for pvc
        # (the result is delivered through the global get_pvc_resources_resp)
        get_pvc_resources_resp=""
        get_pvc_resources "${pvc_name}" "${namespace}"

        local pv_name
        local volumeattachment_name
        local backup_path
        pv_name=$(echo "${get_pvc_resources_resp}"| jq -r ".pv_name")
        volumeattachment_name=$(echo "${get_pvc_resources_resp}"| jq -r ".volumeattachment_name")
        backup_path=$(echo "${get_pvc_resources_resp}"| jq -r ".backup_path")
        # without a backup url the PVC is recorded as failed and left untouched
        if [[ -z "${backup_path}" || "${backup_path}" == " " || "${backup_path}" == "null" ]]; then
          pvc_restore_failed="error"
          FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
          continue;
        fi

        # restore_pvc reports its outcome through the global restore_pvc_status
        restore_pvc_status="succeed"
        restore_pvc "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}" "${backup_path}"
        if [[ -n "${restore_pvc_status}" && "${restore_pvc_status}" != "succeed" ]]; then
          pvc_restore_failed="error"
          FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
          continue;
        fi
      done
    done

    # the statefulset is scaled back up even when a PVC restore failed
    sleep 10
    scale_up_statefulset "${statefulset_name}" "${namespace}" "${replica}" "${ownerReferences}"
    sleep 5

    if [[ -n "${pvc_restore_failed}" && "${pvc_restore_failed}" == "error" ]]; then
        echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}."
    fi    
  done
}

# Address of the Longhorn API used by the restore helpers: cluster IP of the
# longhorn-backend service in the longhorn-system namespace, port 9500.
LONGHORN_URL=$(kubectl -n longhorn-system get svc longhorn-backend -o jsonpath="{.spec.clusterIP}"):9500

# Restore the PVCs of all statefulsets in the mongodb namespace. The shebang
# of the repeated script copy had been fused onto this line, which turned the
# namespace argument into "mongodb#!/bin/bash"; it now sits on its own line.
restore_pvc_attached_to_statefulsets "mongodb"
#!/bin/bash

# Abort on any unhandled command failure (-e) and on use of unset variables (-u).
set -eu

# Accumulates the names of PVs whose restore failed (comma separated).
FAILED_PVC_LIST=""
# Only PVCs that use one of these two storage classes are restored.
STORAGE_CLASS="longhorn-backup"
STORAGE_CLASS_SINGLE_REPLICA="longhorn-backup-single-replica"
# Directory (relative to the current working directory) where the JSON
# manifests of the resources are saved before they are deleted.
export LOCAL_RESTORE_PATH="restoredata"
mkdir -p "${LOCAL_RESTORE_PATH}"


# Force-deletes the PV, the Longhorn volume and (optionally) the
# volumeattachment that belong to one PVC.
# Arguments:
#   $1 - PV name (the Longhorn volume is looked up under the same name)
#   $2 - volumeattachment name; empty or omitted skips that step
# Outputs: progress messages on stdout.
function delete_pvc_resources(){
  local pv_name=$1
  local volumeattachment_name=${2:-}

  echo "deleting pv & volumes forcefully"
  delete_pv_forcefully "${pv_name}"
  sleep 2
  delete_longhornvolume_forcefully "${pv_name}"
  sleep 2
  if [ -n "${volumeattachment_name}" ]; then
    echo "deleting volumeattachments forcefully"
    if declare -F delete_volumeattachment_forcefully >/dev/null; then
      delete_volumeattachment_forcefully "${volumeattachment_name}"
    else
      # The helper is not defined in this script. Calling it unconditionally
      # would abort the run under `set -e` after the PV and the volume have
      # already been deleted, so fall back to a best-effort inline deletion:
      # clear the finalizers, then force-delete the object.
      kubectl patch volumeattachment "${volumeattachment_name}" \
        --type merge -p '{"metadata":{"finalizers":null}}' || true
      kubectl delete volumeattachment "${volumeattachment_name}" \
        --grace-period=0 --force --ignore-not-found || true
    fi
    sleep 5
  fi
}

# Force-removes the Longhorn volume object that carries the given name.
# Arguments: $1 - volume name (the callers pass the PV name)
# Outputs:   kubectl output and progress messages on stdout.
# Notes:     Returns right away when the volume does not exist. When the
#            volume is still listed after 10 attempts a failure message is
#            printed, but no error status is raised.
function delete_longhornvolume_forcefully() {
  local pv_name=$1
  local attempt
  local max_attempts=10
  local deleted=0
  local output=""

  if ! kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" >/dev/null 2>&1 ; then
    echo "Volume ${pv_name} not found, skip deletion."
    return
  fi

  # The delete is started in the background; the loop below keeps clearing
  # the finalizers until the object no longer shows up.
  kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &

  for (( attempt = 0; attempt < max_attempts; attempt += 1 )); do
    # Set the finalizers field to null and re-apply the object.
    output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
    echo "${output}"

    # An empty listing is treated as "volume deleted".
    output=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}")|| true
    echo "${output}"
    if [[ -z "${output}" ]]; then
      deleted=1
      break
    fi

    kubectl -n longhorn-system delete volumes.longhorn.io "${pv_name}" --grace-period=0 --force &
    echo "Waiting to delete volume ${pv_name} ${attempt}/${max_attempts}..."
    sleep 10
  done

  if (( deleted == 0 )); then
    echo "Failed to delete volume ${pv_name}."
  fi
}

# Force-removes a PersistentVolume.
# Arguments: $1 - PV name
# Globals:   result (written) - holds the output of the last kubectl call;
#            it is not declared local, exactly as in the original script.
# Outputs:   kubectl output and progress messages on stdout.
# Notes:     When the PV is still listed after 10 attempts a failure message
#            is printed, but no error status is raised.
function delete_pv_forcefully() {
  local pv_name=$1
  local attempt
  local max_attempts=10
  local deleted=0

  # The delete is started in the background; the loop below keeps clearing
  # the finalizers until the PV no longer shows up.
  kubectl delete pv "${pv_name}" --grace-period=0 --force &

  for (( attempt = 0; attempt < max_attempts; attempt += 1 )); do
    # Set the finalizers field to null and re-apply the object.
    result=$(kubectl get pv "${pv_name}" -o=json | jq '.metadata.finalizers = null' | kubectl apply -f -) || true
    echo "${result}"

    # An empty listing is treated as "PV deleted".
    result=$(kubectl get pv "${pv_name}")|| true
    echo "${result}"
    if [[ -z "${result}" ]]; then
      deleted=1
      break
    fi

    kubectl delete pv "${pv_name}" --grace-period=0 --force &
    echo "Waiting to delete pv ${pv_name} ${attempt}/${max_attempts}..."
    sleep 10
  done

  if (( deleted == 0 )); then
    echo "Failed to delete pv ${pv_name}."
  fi
}

# Asks the Longhorn API whether a backup volume exists for the given PV.
# Arguments: $1 - PV name
# Globals:   LONGHORN_URL (read); IS_BACKUP_AVAILABLE (exported: 1 when a
#            backup volume without an error message was found, otherwise 0)
# Outputs:   the raw API response and progress messages on stdout.
# Notes:     Up to 3 attempts, 10 seconds apart. A response that is not valid
#            JSON is treated as "not available yet" and retried instead of
#            aborting the whole script under `set -e`.
function validate_pv_backup_available(){
  local pv_name=$1
  local backup_available=0
  local backup_resp=""
  local resp_status_code=""
  local resp_message=""

  local try=0
  local maxtry=3
  while (( try != maxtry )) ; do
    backup_resp=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?") || true
    echo "Backup Response: ${backup_resp}"
    if [[ -n "${backup_resp}" && "${backup_resp}" != " " ]]; then
      # "invalid" marks a response that jq could not parse.
      resp_status_code=$(echo "${backup_resp}" | jq -c ".status" 2>/dev/null) || resp_status_code="invalid"
      if [[ "${resp_status_code}" == "invalid" ]]; then
        echo "Backup response for ${pv_name} is not valid JSON, retrying."
      elif [[ "${resp_status_code}" =~ ^[0-9]+$ ]] && (( resp_status_code > 200 )); then
        # A numeric status above 200 in the payload is treated as an error reply.
        backup_available=0
      else
        resp_message=$(echo "${backup_resp}" | jq -c ".messages.error" 2>/dev/null) || resp_message="invalid"
        if [[ -z "${resp_message}" || "${resp_message}" == "null" ]]; then
          echo "PVC Backup is available for ${pv_name}"
          backup_available=1
          break
        fi
      fi
    fi
    try=$((try+1))
    sleep 10
  done
  export IS_BACKUP_AVAILABLE="${backup_available}"
}

# Saves the JSON manifests of the objects behind one PVC into
# ${LOCAL_RESTORE_PATH} before they are deleted.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
#   $3 - PV name (also used as the Longhorn volume name)
#   $4 - volumeattachment name; empty or a single blank skips that dump
# Globals: LOCAL_RESTORE_PATH (read); result (written, not local - as in the
#          original script)
# Files written: <volumeattachment>.json, <pv>-pv.json, <pv>-volume.json,
#                <pvc>-pvc.json
function store_pvc_resources(){
  local pvc_name=$1
  local namespace=$2
  local pv_name=$3
  local volumeattachment_name=$4

  case "${volumeattachment_name}" in
    ""|" ")
      ;;
    *)
      # Best effort: a failure to dump the volumeattachment is ignored.
      result=$(kubectl get volumeattachments "${volumeattachment_name}" -o json > "${LOCAL_RESTORE_PATH}/${volumeattachment_name}.json") || true
      echo "${result}"
      ;;
  esac

  kubectl get pv "${pv_name}" -o json \
    > "${LOCAL_RESTORE_PATH}/${pv_name}-pv.json"
  kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}" -o json \
    > "${LOCAL_RESTORE_PATH}/${pv_name}-volume.json"
  kubectl get pvc "${pvc_name}" -n "${namespace}" -o json \
    > "${LOCAL_RESTORE_PATH}/${pvc_name}-pvc.json"
}

# Polls a PVC until its status phase is "Bound".
# Arguments:
#   $1 - namespace of the PVC
#   $2 - PVC name
# Outputs: progress messages on stdout.
# Notes:   Up to 30 polls, 30 seconds apart. A timeout is reported on stdout
#          but the function still returns 0, so callers running under
#          `set -e` keep going as before.
function wait_pvc_bound() {
  local namespace=$1
  local pvc_name=$2
  # Loop counters are local so they cannot overwrite same-named variables of
  # the calling functions (bash uses dynamic scoping).
  local try=0
  local maxtry=30
  local status=""
  local bound=0

  while (( try < maxtry )); do
    status=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".status | select( has(\"phase\")).phase")
    if [[ "${status}"  == "Bound" ]]; then
      echo "PVC ${pvc_name} Bouned successfully."
      bound=1
      break
    fi
    echo "waiting for PVC to bound...${try}/${maxtry}"; sleep 30
    try=$((try+1))
  done

  if (( bound == 0 )); then
    echo "PVC ${pvc_name} is not Bound after ${maxtry} attempts."
  fi
}

# Creates a Longhorn volume from a backup and waits until it is detached.
# Arguments:
#   $1 - PV name (used as the volume name)
#   $2 - backup URL passed as "fromBackup"
# Globals: LONGHORN_URL, LOCAL_RESTORE_PATH (read);
#          create_volume_status (written: "succeed" or "failed")
# Outputs: progress messages and, as last line, the final status on stdout.
# Notes:   Access mode and replica count are taken from the saved
#          <pv>-volume.json. When the API call fails or the reply carries no
#          volume id, the saved volume manifest is re-created (best effort)
#          and the status is "failed".
function create_volume() {
  local pv_name=$1
  local backup_path=$2
  local response=""
  local volume_id=""
  local status=""
  local accessmode
  local replicacount
  create_volume_status="succeed"
  echo "Creating Volume with PV: ${pv_name} and BackupPath: ${backup_path}"
  accessmode=$(< "${LOCAL_RESTORE_PATH}/${pv_name}-volume.json" jq -r ".spec.accessMode")
  replicacount=$(< "${LOCAL_RESTORE_PATH}/${pv_name}-volume.json" jq -r ".spec.numberOfReplicas")

  # create volume from backup pv; a curl failure is routed to the failure
  # branch below instead of aborting the script after the old volume is gone
  response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/volumes" -H 'Accept: application/json' -H 'Content-Type: application/json;charset=UTF-8' --data-raw "{\"name\":\"${pv_name}\", \"accessMode\":\"${accessmode}\", \"fromBackup\": \"${backup_path}\", \"numberOfReplicas\": ${replicacount}}") || response=""

  sleep 5

  # `jq -r .id` prints the string "null" when the field is missing, which
  # would make an error reply look like a success; `// empty` yields nothing.
  if [[ -n "${response}" && "${response}" != "null" && "${response}" != " " ]]; then
    volume_id=$(echo "${response}" | jq -r '.id // empty' 2>/dev/null) || volume_id=""
  fi

  if [[ -n "${volume_id}" ]]; then
    # wait for volume to be detached, max wait 4hr (480 polls x 30s)
    local try=0
    local maxtry=480
    local success=0
    while (( try < maxtry )); do
      status=$(kubectl -n longhorn-system get volumes.longhorn.io "${pv_name}"  -o json |jq -r ".status.state") || true
      echo "volume ${pv_name} status: ${status}"
      if [[ "${status}" == "detached" ]]; then
        # update label
        kubectl -n longhorn-system label volumes.longhorn.io/"${pv_name}" recurring-job-group.longhorn.io/uipath-backup=enabled
        success=1
        echo "Volume ready to use"
        break
      fi
      echo "waiting for volume to be ready to use${try}/${maxtry}..."; sleep 30
      try=$(( try + 1 ))
    done

    if [ "${success}" -eq 0 ]; then
      create_volume_status="failed"
      echo "${pv_name} Volume is not ready to use with status ${status}"
    fi
  else
    create_volume_status="failed"
    # put the previously saved volume manifest back (best effort)
    kubectl create -f "${LOCAL_RESTORE_PATH}/${pv_name}"-volume.json || true
    echo "${pv_name} Volume creation failed ${response} "
  fi
  echo "${create_volume_status}"
}

# Replaces the PV and Longhorn volume behind a PVC with a volume restored
# from backup and rebinds the PVC to the re-created PV.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
#   $3 - PV name
#   $4 - volumeattachment name (may be empty)
#   $5 - backup URL to restore from
# Globals: LOCAL_RESTORE_PATH (read); create_volume_status (written);
#          restore_pvc_status (set to "failed" when the volume restore fails)
# Outputs: progress messages and, as last line, restore_pvc_status.
function restore_with_backupvolume() {
  local pvc_name=$1
  local namespace=$2
  local pv_name=$3
  local volumeattachment_name=$4
  local backup_path=$5
  local pvc_uid=""
  create_volume_status="succeed"
  # Normally initialised by the caller; default it so a direct call does not
  # trip `set -u` when the status is printed at the end.
  restore_pvc_status="${restore_pvc_status:-succeed}"

  store_pvc_resources "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}"
  sleep 5
  delete_pvc_resources "${pv_name}" "${volumeattachment_name}"
  sleep 5
  create_volume "${pv_name}" "${backup_path}"
  sleep 10

  # The saved PV manifest is re-created whether or not the volume restore
  # succeeded.
  kubectl create -f "${LOCAL_RESTORE_PATH}/${pv_name}"-pv.json
  if [[ -n "${create_volume_status}" && "${create_volume_status}" != "succeed" ]]; then
    echo "Backup volume restore failed for pvc ${pvc_name} in namespace ${namespace}"
    restore_pvc_status="failed"
  else
    pvc_uid=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json| jq -r ".metadata.uid")

    # update pv with pvc uid
    kubectl patch pv "${pv_name}" --type json -p "[{\"op\": \"add\", \"path\": \"/spec/claimRef/uid\", \"value\": \"${pvc_uid}\"}]"

    # The original echoed a global "result" here that this function never
    # sets; that read could abort under `set -u`, so it was removed.
    wait_pvc_bound "${namespace}" "${pvc_name}"
  fi
  echo "${restore_pvc_status}"
}

# Entry point for restoring a single PVC from its backup.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
#   $3 - PV name
#   $4 - volumeattachment name (may be empty)
#   $5 - backup URL
# Globals: restore_pvc_status (written) - reset to "succeed" here;
#          restore_with_backupvolume may change it.
# Outputs: the output of restore_with_backupvolume followed by the status.
function restore_pvc(){
  local -a restore_args=("$1" "$2" "$3" "$4" "$5")

  restore_pvc_status="succeed"
  restore_with_backupvolume "${restore_args[@]}"
  echo ${restore_pvc_status}
  # attach_volume "${pv_name}"   (left disabled, as in the original script)
}

# Fetches the backup list of a backup volume from the Longhorn API and picks
# the first entry that has a non-empty url.
# Arguments: $1 - PV name (backup volume name)
# Globals:   LONGHORN_URL (read); PV_BACKUP_PATH (exported: the selected
#            backup as compact JSON, or empty when none was found)
# Outputs:   the raw API response on stdout when it contains data.
# Notes:     Up to 3 attempts, 10 seconds apart. curl and jq failures are
#            tolerated so that `set -e` cannot bypass the retry loop.
function get_backup_list_by_pvname(){
  local pv_name=$1
  local selected_backup=""
  local backup_list_response=""

  local try=0
  local maxtry=3
  while (( try != maxtry )) ; do

    backup_list_response=$(curl --noproxy "*" "${LONGHORN_URL}/v1/backupvolumes/${pv_name}?action=backupList" -X 'POST' -H 'Content-Length: 0' -H 'Accept: application/json') || true
    # A usable response is non-empty, has a "data" field and at least one item.
    if [[ -n "${backup_list_response}" && "${backup_list_response}" != " " && -n $( echo "${backup_list_response}"|jq ".data" )  && -n $( echo "${backup_list_response}"|jq ".data[]" ) ]]; then
      echo "Backup List Response: ${backup_list_response}"
      # pick first backup data with url non empty
      selected_backup=$(echo "${backup_list_response}"|jq -c '[.data[]|select(.url | . != null and . != "")][0]') || true
      break;
    fi
    try=$((try+1))
    sleep 10
  done
  export PV_BACKUP_PATH="${selected_backup}"
}

# Collects the PV name, the matching volumeattachment name and the backup url
# for one PVC.
# Arguments:
#   $1 - PVC name
#   $2 - namespace of the PVC
# Globals written (none are local, as in the original script):
#   get_pvc_resources_resp - JSON object with the keys pvc_name, pv_name,
#                            volumeattachment_name and backup_path
#   PV_NAME, VOLUME_ATTACHMENT_LIST, VOLUME_ATTACHMENT, PV_CLAIM_NAME,
#   VOLUME_ATTACHMENT_NAME, BACKUP_PATH - intermediate values
# Outputs: progress messages and the resulting JSON on stdout.
function get_pvc_resources() {
  local PVC_NAME=$1
  local PVC_NAMESPACE=$2
  local is_backup_available

  get_pvc_resources_resp=""

  # PV have one to one mapping with PVC
  PV_NAME=$(kubectl -n "${PVC_NAMESPACE}" get pvc "${PVC_NAME}" -o json | jq -r ".spec.volumeName")

  # One compact JSON object per volumeattachment that references a PV.
  VOLUME_ATTACHMENT_LIST=$(kubectl get volumeattachments -n "${PVC_NAMESPACE}" -o=json \
    | jq -c '.items[]|{name: .metadata.name, pvClaimName:.spec.source| select( has ("persistentVolumeName")).persistentVolumeName}')

  # Look for the attachment whose PV is the one bound to this PVC.
  VOLUME_ATTACHMENT_NAME=""
  for VOLUME_ATTACHMENT in ${VOLUME_ATTACHMENT_LIST}; do
    PV_CLAIM_NAME=$(jq -r ".pvClaimName" <<< "${VOLUME_ATTACHMENT}")
    if [[ "${PV_NAME}" = "${PV_CLAIM_NAME}" ]]; then
      VOLUME_ATTACHMENT_NAME=$(jq -r ".name" <<< "${VOLUME_ATTACHMENT}")
      break
    fi
  done

  BACKUP_PATH=""
  # The helper reports its result through the exported IS_BACKUP_AVAILABLE.
  validate_pv_backup_available "${PV_NAME}"
  is_backup_available=${IS_BACKUP_AVAILABLE:- 0}
  unset IS_BACKUP_AVAILABLE

  if [ "${is_backup_available}" -eq 1 ]; then
    echo "Backup is available for PVC ${PVC_NAME}"

    # The helper reports the selected backup through the exported PV_BACKUP_PATH.
    get_backup_list_by_pvname "${PV_NAME}"
    BACKUP_PATH=$(jq -r ".url" <<< "${PV_BACKUP_PATH}")
    unset PV_BACKUP_PATH
  fi

  printf -v get_pvc_resources_resp \
    '{"pvc_name": "%s", "pv_name": "%s", "volumeattachment_name": "%s", "backup_path": "%s"}' \
    "${PVC_NAME}" "${PV_NAME}" "${VOLUME_ATTACHMENT_NAME}" "${BACKUP_PATH}"
  echo "${get_pvc_resources_resp}"
}

# Sets /spec/members on every owner of a statefulset to the given value.
# Arguments:
#   $1 - JSON array of ownerReferences (empty or "null" means nothing to do)
#   $2 - namespace
#   $3 - value written to /spec/members
# Globals: ownerReference (loop variable, not local - as in the original)
# Outputs: one "Owner: ..." line per reference plus kubectl output.
# Notes:   Owners that cannot be fetched with kubectl are skipped.
function scale_ownerreferences() {
  local ownerReferences=$1
  local namespace=$2
  local replicas=$3
  local owner_kind
  local owner_name

  # no operation required
  case "${ownerReferences}" in
    ""|"null") return ;;
  esac

  # Flatten the array into one compact JSON object per line.
  ownerReferences=$(jq -c ".[]" <<< "${ownerReferences}")
  for ownerReference in ${ownerReferences}; do
    echo "Owner: ${ownerReference}"
    owner_kind=$(jq -r ".kind" <<< "${ownerReference}")
    owner_name=$(jq -r ".name" <<< "${ownerReference}")

    kubectl -n "${namespace}" get "${owner_kind}" "${owner_name}" >/dev/null 2>&1 || continue

    # scale replicas
    kubectl  -n "${namespace}" patch "${owner_kind}" "${owner_name}" --type json -p "[{\"op\": \"replace\", \"path\": \"/spec/members\", \"value\": ${replicas} }]"
  done
}

#######################################
# Scale a statefulset (and the owner resources that manage it) down to zero
# replicas and wait until `kubectl get statefulset` reports READY 0/0.
# The scale request is re-issued on every attempt; up to 30 attempts are made
# with a 30 second pause after each unsuccessful one.
# Arguments:
#   $1 - statefulset name
#   $2 - namespace
#   $3 - JSON array of the statefulset's ownerReferences (may be empty/"null")
# Outputs:
#   Progress messages on stdout; a failure message if the statefulset never
#   reaches 0/0. The function does not signal the failure via its exit status.
#######################################
function scale_down_statefulset() {
  local statefulset_name=$1
  local namespace=$2
  local ownerReferences=$3

  echo "Start Scale Down statefulset ${statefulset_name} under namespace ${namespace}..."

  # validate and scale down ownerreference
  scale_ownerreferences "${ownerReferences}" "${namespace}" 0

  local try=0
  local maxtry=30
  local success=0
  local result
  local scaledown
  while (( try != maxtry )); do
    # Failures are tolerated here: the request is simply repeated next round.
    result=$(kubectl scale statefulset "${statefulset_name}" --replicas=0 -n "${namespace}") || true
    echo "${result}"

    # -w matches "0/0" only as a whole word, so a READY column such as "10/0"
    # (pods still running) is not mistaken for a finished scale down.
    scaledown=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" | grep -w -- "0/0") || true
    if [[ -n "${scaledown}" && "${scaledown}" != " " ]]; then
      echo "Statefulset scaled down successfully."
      success=1
      break
    fi

    try=$((try + 1))
    echo "waiting for the statefulset ${statefulset_name} to scale down...${try}/${maxtry}"
    sleep 30
  done

  if (( success == 0 )); then
    echo "Statefulset ${statefulset_name} scaled down failed"
  fi
}

#######################################
# Scale a statefulset (and the owner resources that manage it) up to the given
# replica count and wait until `kubectl get statefulset` reports READY N/N.
# The scale request is re-issued on every attempt; up to 15 attempts are made
# with a 30 second pause after each unsuccessful one.
# Arguments:
#   $1 - statefulset name
#   $2 - namespace
#   $3 - desired replica count
#   $4 - JSON array of the statefulset's ownerReferences (may be empty/"null")
# Outputs:
#   kubectl output and progress messages on stdout; a failure message if the
#   statefulset never becomes fully ready. The function does not signal the
#   failure via its exit status.
#######################################
function scale_up_statefulset() {
  local statefulset_name=$1
  local namespace=$2
  local replica=$3
  local ownerReferences=$4

  # Scale up statefulsets using PVCs
  echo "Start Scale Up statefulset ${statefulset_name}..."

  # validate and scale up ownerreference
  scale_ownerreferences "${ownerReferences}" "${namespace}" "${replica}"

  echo "Waiting to scale up statefulset..."

  # Start counting at 0 so that all ${maxtry} attempts are really made.
  local try=0
  local maxtry=15
  local success=0
  local current
  local scaleup
  while (( try != maxtry )); do
    kubectl scale statefulset "${statefulset_name}" --replicas="${replica}" -n "${namespace}"

    # Query the state once, show it, and evaluate that same snapshot.
    current=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}") || true
    if [[ -n "${current}" ]]; then
      printf '%s\n' "${current}"
    fi

    # -w -F: literal whole-word match, so e.g. "11/1" does not count as "1/1".
    scaleup=$(grep -w -F -- "${replica}/${replica}" <<<"${current}") || true
    if [[ -n "${scaleup}" && "${scaleup}" != " " ]]; then
      echo "Statefulset scaled up successfully."
      success=1
      break
    fi

    try=$((try + 1))
    echo "waiting for the statefulset ${statefulset_name} to scale up...${try}/${maxtry}"
    sleep 30
  done

  if (( success == 0 )); then
    echo "Statefulset scaled up failed ${statefulset_name}."
  fi
}

#######################################
# Restore the PVCs of every statefulset in a namespace that declares
# volumeClaimTemplates. For each such statefulset: read its replica count,
# scale it down, restore each PVC named
# "<claimTemplate>-<statefulset>-<ordinal>", then scale it back up.
# Globals:
#   STORAGE_CLASS, STORAGE_CLASS_SINGLE_REPLICA (read) - only PVCs using one
#       of these storage classes are restored
#   FAILED_PVC_LIST (appended) - comma separated PV names whose restore failed
#   get_pvc_resources_resp (written/read) - cleared here, then read after
#       calling get_pvc_resources (expected to hold a JSON object with
#       pv_name, volumeattachment_name and backup_path)
#   restore_pvc_status (written/read) - preset to "succeed" before calling
#       restore_pvc; any other non-empty value afterwards counts as a failure
#       (presumably set by restore_pvc - confirm against its definition)
# Arguments:
#   $1 - namespace to process
# Outputs:
#   Progress and failure messages on stdout.
#######################################
function restore_pvc_attached_to_statefulsets() {
  local namespace
  namespace="${1}"
  local statefulset_list
  local statefulset_name

  # list of all statefulset using PVC
  statefulset_list=$(kubectl get statefulset -n "${namespace}" -o=json | jq -r ".items[] | select(.spec.volumeClaimTemplates).metadata.name")

  # Word-splitting is intended here: one statefulset name per word.
  for statefulset_name in ${statefulset_list}; do
    local replica=""
    local ownerReferences
    local claimTemplatesName
    local pvc_restore_failed=""
    local try=0
    local maxtry=5
    local status="notready"
    restore_pvc_status=""

    # check if statefulset is ready, i.e. reports a positive replica count
    while [[ "${status}" == "notready" ]] && (( try < maxtry )); do
      echo "fetch statefulset ${statefulset_name} metadata...  ${try}/${maxtry}"
      try=$(( try + 1 ))
      replica=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".spec.replicas")

      # An empty or non-numeric answer (e.g. kubectl failed) is treated like
      # 0: it cannot drive the per-replica loop below.
      if [[ "${replica}" =~ ^[1-9][0-9]*$ ]]; then
        status="ready"
      else
        echo "statefulset ${statefulset_name} replica is not ready. Wait and retry"; sleep 30
      fi
    done

    if [[ "${status}" != "ready" ]]; then
      echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}. Please retrigger volume restore step."
      # Without a usable replica count there is nothing to restore; do not
      # scale this statefulset (or its owner) down and up for no reason.
      continue
    fi

    # Fetch ownerReferences and claim name
    ownerReferences=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -c ".metadata.ownerReferences")
    claimTemplatesName=$(kubectl get statefulset "${statefulset_name}" -n "${namespace}" -o=json | jq -r ".spec | select( has (\"volumeClaimTemplates\") ).volumeClaimTemplates[].metadata.name")

    echo "Scaling down Statefulset ${statefulset_name} with ${replica} under namespace ${namespace}"
    scale_down_statefulset "${statefulset_name}" "${namespace}" "${ownerReferences}"

    local name
    local i
    # Word-splitting is intended here: one claim template name per word.
    for name in ${claimTemplatesName}; do
      local pvc_prefix
      pvc_prefix="${name}-${statefulset_name}"

      for (( i = 0; i < replica; i++ )); do
        local pvc_name
        local pvc_exist
        pvc_name="${pvc_prefix}-${i}"

        pvc_exist=$(kubectl -n "${namespace}" get pvc "${pvc_name}") || true
        if [[ -z "${pvc_exist}" || "${pvc_exist}" == " " ]]; then
          echo "PVC not available for the statefulset ${statefulset_name}, skipping restore."
          continue
        fi

        # Only PVCs on one of the two configured storage classes are handled.
        local pvc_storageclass
        pvc_storageclass=$(kubectl -n "${namespace}" get pvc "${pvc_name}" -o json | jq -r ".spec.storageClassName")
        if [[ "${pvc_storageclass}" != "${STORAGE_CLASS}" && "${pvc_storageclass}" != "${STORAGE_CLASS_SINGLE_REPLICA}" ]]; then
          echo "backup not available for pvc ${pvc_name}, storageclass: ${pvc_storageclass} "
          continue
        fi

        # get pv, volumeattachments for pvc
        get_pvc_resources_resp=""
        get_pvc_resources "${pvc_name}" "${namespace}"

        local pv_name
        local volumeattachment_name
        local backup_path
        pv_name=$(jq -r ".pv_name" <<<"${get_pvc_resources_resp}")
        volumeattachment_name=$(jq -r ".volumeattachment_name" <<<"${get_pvc_resources_resp}")
        backup_path=$(jq -r ".backup_path" <<<"${get_pvc_resources_resp}")

        # No backup location known for this volume: record it and move on.
        if [[ -z "${backup_path}" || "${backup_path}" == " " || "${backup_path}" == "null" ]]; then
          pvc_restore_failed="error"
          FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
          continue
        fi

        restore_pvc_status="succeed"
        restore_pvc "${pvc_name}" "${namespace}" "${pv_name}" "${volumeattachment_name}" "${backup_path}"
        if [[ -n "${restore_pvc_status}" && "${restore_pvc_status}" != "succeed" ]]; then
          pvc_restore_failed="error"
          FAILED_PVC_LIST="${FAILED_PVC_LIST},${pv_name}"
          continue
        fi
      done
    done

    sleep 10
    scale_up_statefulset "${statefulset_name}" "${namespace}" "${replica}" "${ownerReferences}"
    sleep 5

    if [[ "${pvc_restore_failed}" == "error" ]]; then
      echo "Failed to restore pvc for Statefulset ${statefulset_name} in namespace ${namespace}."
    fi
  done
}

# Address of the longhorn-backend service: its cluster IP followed by port 9500.
# NOTE(review): presumably consumed by the backup helper functions defined
# earlier in this script - confirm against their definitions.
LONGHORN_URL="$(
  kubectl --namespace longhorn-system get service longhorn-backend \
    --output jsonpath="{.spec.clusterIP}"
):9500"

# Entry point: run the PVC restore for the statefulsets in the mongodb namespace.
restore_pvc_attached_to_statefulsets mongodb