You can deploy a deep learning VM with an NVIDIA RAG workload that uses a pgvector PostgreSQL database managed by VMware Data Services Manager.

Prerequisites

Verify that a pgvector PostgreSQL database is provisioned by using VMware Data Services Manager and that you have its connection details.

Procedure

  1. If you are deploying the deep learning VM directly on the vSphere cluster or by using the kubectl command, create a cloud-init script and deploy the deep learning VM.
    1. Create a cloud-init script for NVIDIA RAG that connects to the pgvector PostgreSQL database you have created.
      You can modify the initial version of the cloud-init script for NVIDIA RAG. For example, the following script is for NVIDIA RAG 24.03 and a pgvector PostgreSQL database with the connection details postgres://pgvector_db_admin:encoded_pgvector_db_admin_password@pgvector_db_ip_address:5432/pgvector_db_name.
      #cloud-config
      write_files:
      - path: /opt/dlvm/dl_app.sh
        permissions: '0755'
        content: |
          #!/bin/bash
          error_exit() {
            echo "Error: $1" >&2
            exit 1
          }
      
          cat <<EOF > /opt/dlvm/config.json
          {
            "_comment": "This provides default support for RAG: TensorRT inference, llama2-13b model, and H100x2 GPU",
            "rag": {
              "org_name": "cocfwga8jq2c",
              "org_team_name": "no-team",
              "rag_repo_name": "nvidia/paif",
              "llm_repo_name": "nvidia/nim",
              "embed_repo_name": "nvidia/nemo-retriever",
              "rag_name": "rag-docker-compose",
              "rag_version": "24.03",
              "embed_name": "nv-embed-qa",
              "embed_type": "NV-Embed-QA",
              "embed_version": "4",
              "inference_type": "trt",
              "llm_name": "llama2-13b-chat",
              "llm_version": "h100x2_fp16_24.02",
              "num_gpu": "2",
              "hf_token": "huggingface token to pull llm model, update when using vllm inference",
              "hf_repo": "huggingface llm model repository, update when using vllm inference"
            }
          }
          EOF
          CONFIG_JSON=$(cat "/opt/dlvm/config.json")
          INFERENCE_TYPE=$(echo "${CONFIG_JSON}" | jq -r '.rag.inference_type')
          if [ "${INFERENCE_TYPE}" = "trt" ]; then
            required_vars=("ORG_NAME" "ORG_TEAM_NAME" "RAG_REPO_NAME" "LLM_REPO_NAME" "EMBED_REPO_NAME" "RAG_NAME" "RAG_VERSION" "EMBED_NAME" "EMBED_TYPE" "EMBED_VERSION" "LLM_NAME" "LLM_VERSION" "NUM_GPU")
          elif [ "${INFERENCE_TYPE}" = "vllm" ]; then
            required_vars=("ORG_NAME" "ORG_TEAM_NAME" "RAG_REPO_NAME" "LLM_REPO_NAME" "EMBED_REPO_NAME" "RAG_NAME" "RAG_VERSION" "EMBED_NAME" "EMBED_TYPE" "EMBED_VERSION" "LLM_NAME" "NUM_GPU" "HF_TOKEN" "HF_REPO")
          else
            error_exit "Inference type '${INFERENCE_TYPE}' is not recognized. No action will be taken."
          fi
          for index in "${!required_vars[@]}"; do
            key="${required_vars[$index]}"
            jq_query=".rag.${key,,} | select (.!=null)"
            value=$(echo "${CONFIG_JSON}" | jq -r "${jq_query}")
            if [[ -z "${value}" ]]; then 
              error_exit "${key} is required but not set."
            else
              eval ${key}=\""${value}"\"
            fi
          done
      
          RAG_URI="${RAG_REPO_NAME}/${RAG_NAME}:${RAG_VERSION}"
          LLM_MODEL_URI="${LLM_REPO_NAME}/${LLM_NAME}:${LLM_VERSION}"
          EMBED_MODEL_URI="${EMBED_REPO_NAME}/${EMBED_NAME}:${EMBED_VERSION}"
      
          NGC_CLI_VERSION="3.41.2"
          NGC_CLI_URL="https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/${NGC_CLI_VERSION}/files/ngccli_linux.zip"
      
          mkdir -p /opt/data
          cd /opt/data
      
          if [ ! -f .file_downloaded ]; then
            # clean up
            rm -rf compose.env ${RAG_NAME}* ${LLM_NAME}* ngc* ${EMBED_NAME}* *.json .file_downloaded
      
            # install ngc-cli
            wget --content-disposition ${NGC_CLI_URL} -O ngccli_linux.zip && unzip ngccli_linux.zip
            export PATH=`pwd`/ngc-cli:${PATH}
      
            APIKEY=""
            REG_URI="nvcr.io"
      
            if [[ "$(grep registry-uri /opt/dlvm/ovf-env.xml | sed -n 's/.*oe:value="\([^"]*\).*/\1/p')" == *"${REG_URI}"* ]]; then
              APIKEY=$(grep registry-passwd /opt/dlvm/ovf-env.xml | sed -n 's/.*oe:value="\([^"]*\).*/\1/p')
            fi
      
            if [ -z "${APIKEY}" ]; then
                error_exit "No APIKEY found"
            fi
      
            # config ngc-cli
            mkdir -p ~/.ngc
      
            cat << EOF > ~/.ngc/config
            [CURRENT]
            apikey = ${APIKEY}
            format_type = ascii
            org = ${ORG_NAME}
            team = ${ORG_TEAM_NAME}
            ace = no-ace
          EOF
      
            # ngc docker login
            docker login nvcr.io -u \$oauthtoken -p ${APIKEY}
      
            # dockerhub login for general components, e.g. minio
            DOCKERHUB_URI=$(grep registry-2-uri /opt/dlvm/ovf-env.xml | sed -n 's/.*oe:value="\([^"]*\).*/\1/p')
            DOCKERHUB_USERNAME=$(grep registry-2-user /opt/dlvm/ovf-env.xml | sed -n 's/.*oe:value="\([^"]*\).*/\1/p')
            DOCKERHUB_PASSWORD=$(grep registry-2-passwd /opt/dlvm/ovf-env.xml | sed -n 's/.*oe:value="\([^"]*\).*/\1/p')
      
            if [[ -n "${DOCKERHUB_USERNAME}" && -n "${DOCKERHUB_PASSWORD}" ]]; then
              docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
            else
              echo "Warning: DockerHub not login"
            fi
      
            # get RAG files
            ngc registry resource download-version ${RAG_URI}
      
            # get llm model
            if [ "${INFERENCE_TYPE}" = "trt" ]; then
              ngc registry model download-version ${LLM_MODEL_URI}
              chmod -R o+rX ${LLM_NAME}_v${LLM_VERSION}
              LLM_MODEL_FOLDER="/opt/data/${LLM_NAME}_v${LLM_VERSION}"
            elif [ "${INFERENCE_TYPE}" = "vllm" ]; then
              pip install huggingface_hub
              huggingface-cli login --token ${HF_TOKEN}
              huggingface-cli download --resume-download ${HF_REPO}/${LLM_NAME} --local-dir ${LLM_NAME} --local-dir-use-symlinks False
              LLM_MODEL_FOLDER="/opt/data/${LLM_NAME}"
              cat << EOF > ${LLM_MODEL_FOLDER}/model_config.yaml 
              engine:
                model: /model-store
                enforce_eager: false
                max_context_len_to_capture: 8192
                max_num_seqs: 256
                dtype: float16
                tensor_parallel_size: ${NUM_GPU}
                gpu_memory_utilization: 0.8
          EOF
              chmod -R o+rX ${LLM_MODEL_FOLDER}
              python3 -c "import yaml, json, sys; print(json.dumps(yaml.safe_load(sys.stdin.read())))" < "${RAG_NAME}_v${RAG_VERSION}/rag-app-text-chatbot.yaml"> rag-app-text-chatbot.json
              jq '.services."nemollm-inference".image = "nvcr.io/nvidia/nim/nim_llm:24.02-day0" |
                  .services."nemollm-inference".command = "nim_vllm --model_name ${MODEL_NAME} --model_config /model-store/model_config.yaml" |
                  .services."nemollm-inference".ports += ["8000:8000"] |
                  .services."nemollm-inference".expose += ["8000"]' rag-app-text-chatbot.json > temp.json && mv temp.json rag-app-text-chatbot.json
              python3 -c "import yaml, json, sys; print(yaml.safe_dump(json.load(sys.stdin), default_flow_style=False, sort_keys=False))" < rag-app-text-chatbot.json > "${RAG_NAME}_v${RAG_VERSION}/rag-app-text-chatbot.yaml"
            fi
      
            # get embedding models
            ngc registry model download-version ${EMBED_MODEL_URI}
            chmod -R o+rX ${EMBED_NAME}_v${EMBED_VERSION}
      
            # config compose.env
            cat << EOF > compose.env
            export MODEL_DIRECTORY="${LLM_MODEL_FOLDER}"
            export MODEL_NAME=${LLM_NAME}
            export NUM_GPU=${NUM_GPU}
            export APP_CONFIG_FILE=/dev/null
            export EMBEDDING_MODEL_DIRECTORY="/opt/data/${EMBED_NAME}_v${EMBED_VERSION}"
            export EMBEDDING_MODEL_NAME=${EMBED_TYPE}
            export EMBEDDING_MODEL_CKPT_NAME="${EMBED_TYPE}-${EMBED_VERSION}.nemo"
            export POSTGRES_HOST_IP=pgvector_db_ip_address
            export POSTGRES_PORT_NUMBER=5432
            export POSTGRES_DB=pgvector_db_name
            export POSTGRES_USER=pgvector_db_admin
            export POSTGRES_PASSWORD=encoded_pgvector_db_admin_password
          EOF
      
            touch .file_downloaded
          fi
      
          # start NGC RAG
          docker compose -f ${RAG_NAME}_v${RAG_VERSION}/docker-compose-vectordb.yaml up -d pgvector
          source compose.env; docker compose -f ${RAG_NAME}_v${RAG_VERSION}/rag-app-text-chatbot.yaml up -d
    2. Encode the cloud-init script to base64 format.
      You use a base64 encoding tool, such as https://decode64base.com/, to generate the encoded version of your cloud-init script.
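      Alternatively, you can generate the encoding locally. The following command is a minimal sketch that assumes the cloud-init script is saved in a file named cloud-init.yaml, which is a hypothetical file name.
      # Encode the script without line wrapping (GNU coreutils base64)
      base64 -w 0 cloud-init.yaml > cloud-init.b64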
    3. Deploy the deep learning VM, passing the base64 value of the cloud-init script to the user-data input parameter.
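      For example, if you deploy the image directly to the vSphere cluster by using the govc CLI, you can pass the value through the OVF property mapping. The following commands are an illustrative sketch only and assume that the deep learning VM image is available locally as dlvm.ova and that the OVF property key is user-data; adjust them to your deployment method and environment.
      # Generate a deployment options file and inspect the OVF properties of the image
      govc import.spec dlvm.ova > dlvm-options.json

      # Edit dlvm-options.json so that the PropertyMapping entry whose Key is user-data
      # holds the base64 value of the cloud-init script, then deploy with the options file
      govc import.ova -name my-dlvm -options dlvm-options.json dlvm.ova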
  2. If you are deploying the deep learning VM by using a catalog item in VMware Aria Automation, you provide the details of the pgvector PostgreSQL database after you deploy the virtual machine.
    1. Deploy the deep learning VM from Automation Service Broker.
    2. Navigate to Consume > Deployments > Deployments and locate the deep learning VM deployment.
    3. In the Workstation VM section, save the details for SSH login to the virtual machine.
    4. Log in to the deep learning VM over SSH by using the credentials available in Automation Service Broker.
    5. Add the following pgvector variables to the /opt/data/compose.env file:
      POSTGRES_HOST_IP=pgvector_db_ip_address
      POSTGRES_PORT_NUMBER=5432
      POSTGRES_DB=pgvector_db_name
      POSTGRES_USER=pgvector_db_admin
      POSTGRES_PASSWORD=encoded_pgvector_db_admin_password
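      For example, you can append the variables from the SSH session. The following sketch assumes that the placeholder values are replaced with the connection details of your database and follows the export format that the cloud-init script in step 1 writes to compose.env.
      # Append the pgvector connection settings to the environment file used by docker compose
      {
        echo 'export POSTGRES_HOST_IP=pgvector_db_ip_address'
        echo 'export POSTGRES_PORT_NUMBER=5432'
        echo 'export POSTGRES_DB=pgvector_db_name'
        echo 'export POSTGRES_USER=pgvector_db_admin'
        echo 'export POSTGRES_PASSWORD=encoded_pgvector_db_admin_password'
      } >> /opt/data/compose.env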
    6. Restart the NVIDIA RAG multi-container application by running the following commands.
      For example, for NVIDIA RAG 24.03:
      cd /opt/data
      docker compose -f rag-docker-compose_v24.03/rag-app-text-chatbot.yaml down
      docker compose -f rag-docker-compose_v24.03/docker-compose-vectordb.yaml down
      docker compose -f rag-docker-compose_v24.03/docker-compose-vectordb.yaml up -d
      source compose.env; docker compose -f rag-docker-compose_v24.03/rag-app-text-chatbot.yaml up -d
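      One way to verify that the containers are running with the new database settings is to list the services and check the logs of the inference service. The service name nemollm-inference appears in the compose file referenced above; the service you want to inspect might differ.
      # List the services of the RAG application and tail the inference service logs
      docker compose -f rag-docker-compose_v24.03/rag-app-text-chatbot.yaml ps
      docker compose -f rag-docker-compose_v24.03/rag-app-text-chatbot.yaml logs --tail 50 nemollm-inference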