#!/bin/bash
# turf-deploy — deploy script for turf-tracker.
#
# Implements the prod-side half of turf-tracker's deploy contract
# (https://github.com/TylerVigario/turf-tracker/blob/main/docs/deployment.md).
# The contract says: build-on-prod from a tagged commit, run
# `npm ci && npm run build`, satisfy `src/lib/required-env.json`,
# probe liveness at `GET /api/health`, honour SIGTERM with a 30-second
# drain window. Everything else in this script (release directory
# layout, pg_dump cadence, pre-swap smoke, atomic swap mechanics,
# rollback policy) is the prod-side fill-in the contract leaves to
# us — the contract calls those out as out-of-scope.
#
# Triggered by /run/turf/deploy-trigger (written by the GitHub release
# webhook on release.published). To deploy a specific tag instead, pass it
# as a positional argument (vX.Y.Z, or X.Y.Z — the "v" prefix is added).
# Roll back to a release that is already on disk with --rollback <tag>.
#
# Usage:
#   sudo turf-deploy                       # consume trigger file
#   sudo turf-deploy v0.5.0                # deploy a specific tag
#   sudo turf-deploy --skip-health         # bypass the post-restart health check
#   sudo turf-deploy --rollback v0.4.0     # roll back to an on-disk release
#   sudo turf-deploy --rollback=v0.4.0     # same, single-argument form

source /usr/lib/server-admin/admin-common.sh

# ----------------------------------------------------------------------------
# Config — every value here is either contract-mandated (commented as such)
# or a prod-side choice we own. No magic numbers without justification.
# ----------------------------------------------------------------------------

# Where the app lives on disk, and the systemd unit / account that runs it.
APP_DIR=/opt/turf-tracker
RELEASES_DIR="${APP_DIR}/releases"
CURRENT_LINK="${APP_DIR}/current"
ENV_FILE="${APP_DIR}/.env"
DATA_DIR="${APP_DIR}/data"
VERSION_FILE="${APP_DIR}/.deployed-version"
SERVICE_USER=turf
SERVICE_NAME=turf.service
DROPIN_DIR="/etc/systemd/system/${SERVICE_NAME}.d"
DROPIN_FILE="${DROPIN_DIR}/exec.conf"
TRIGGER_FILE=/run/turf/deploy-trigger

# Upstream repository (the clone URL is fixed by the contract) and the SSH
# deploy key used to reach it.
REPO_URL='git@github.com:TylerVigario/turf-tracker.git'
DEPLOY_KEY=/etc/turf/deploy_key

# Operational CLI wrapper (bin/turf.js, see docs/cli.md). Regenerated on
# every deploy so it resolves the current release's node ABI.
CLI_SHIM=/usr/bin/turf

# Retention and disk budget. KEEP_RELEASES counts non-failed releases only;
# failed ones are pruned eagerly. npm ci + next build needs ~1.5 GB, so the
# pre-check asks for 2 GB to leave headroom.
KEEP_RELEASES=5
BUILD_DISK_MB=2000

# Liveness probe. The path is contract-fixed (GET /api/health, 200/503 via
# SELECT 1); retry count and per-request timeout are prod-side choices.
HEALTH_PATH=/api/health
HEALTH_RETRIES=6
HEALTH_TIMEOUT=5

# Budget for the pre-swap smoke, which boots the new bundle against the real
# env on an ephemeral port to validate the migration backward-compatibility
# invariant (new code must boot against the previous release's schema).
# Contract section §Database.
SMOKE_BUDGET=30

# Default for the --skip-health flag; argument parsing may flip it to true.
SKIP_HEALTH=false

# ----------------------------------------------------------------------------
# Argument parsing
# ----------------------------------------------------------------------------
TARGET_TAG=""
ROLLBACK_TAG=""

# parse_args ARG...
#
# Fills the globals TARGET_TAG, ROLLBACK_TAG and SKIP_HEALTH from the command
# line. Recognised arguments:
#   --skip-health            set SKIP_HEALTH=true
#   --rollback TAG           set ROLLBACK_TAG (also --rollback=TAG)
#   vX.Y.Z | X.Y.Z           set TARGET_TAG (a missing "v" prefix is added)
# Anything else is ignored with a warning on stderr. Only the rough shape of
# a tag is matched here; strict validation happens later in the script.
#
# Exits with status 2 when --rollback is given without a tag: a rollback
# request must never degrade into a forward deploy, and `shift 2` with a
# single remaining argument fails without shifting, which would otherwise
# leave this loop spinning forever.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --skip-health)
                SKIP_HEALTH=true
                shift
                ;;
            --rollback)
                if [[ $# -lt 2 || -z "$2" ]]; then
                    printf 'turf-deploy: --rollback requires a tag (vX.Y.Z)\n' >&2
                    exit 2
                fi
                ROLLBACK_TAG="$2"
                shift 2
                ;;
            --rollback=*)
                ROLLBACK_TAG="${1#--rollback=}"
                if [[ -z "$ROLLBACK_TAG" ]]; then
                    printf 'turf-deploy: --rollback requires a tag (vX.Y.Z)\n' >&2
                    exit 2
                fi
                shift
                ;;
            v[0-9]*.[0-9]*.[0-9]*)
                TARGET_TAG="$1"
                shift
                ;;
            [0-9]*.[0-9]*.[0-9]*)
                TARGET_TAG="v$1"
                shift
                ;;
            *)
                # Still ignored (as before), but no longer silently: a typo
                # such as "v1.2" would otherwise fall through to the trigger file.
                printf 'turf-deploy: ignoring unrecognised argument: %s\n' "$1" >&2
                shift
                ;;
        esac
    done

    # Accept a bare "X.Y.Z" rollback tag by adding the "v" prefix.
    if [[ -n "$ROLLBACK_TAG" && "$ROLLBACK_TAG" =~ ^[0-9] ]]; then
        ROLLBACK_TAG="v$ROLLBACK_TAG"
    fi
    return 0
}

parse_args "$@"

require_root
# pg_dump takes the prod-side DB snapshot. Migrations go through npm (which
# resolves prisma from the release's node_modules) and the seed is a plain
# node script, so the prisma CLI is not preflighted here — npm ci is the proof.
require_command git jq curl pg_dump systemctl npm node
lock_or_exit "turf-deploy"
enable_error_trap

# Version currently recorded on disk, used for log lines only.
if [[ -f "$VERSION_FILE" ]]; then
    CURRENT_VERSION=$(cat "$VERSION_FILE")
else
    CURRENT_VERSION="none"
fi

# Strict vX.Y.Z shape shared by the rollback tag and the deploy tag.
TAG_PATTERN='^v[0-9]+\.[0-9]+\.[0-9]+$'

if [[ -n "$ROLLBACK_TAG" ]]; then
    if [[ ! "$ROLLBACK_TAG" =~ $TAG_PATTERN ]]; then
        log_error "Invalid rollback tag (expected vX.Y.Z): $ROLLBACK_TAG"
        exit 1
    fi
else
    # No tag on the command line: fall back to the trigger file, which is
    # consumed (removed) as soon as it has been read.
    if [[ -z "$TARGET_TAG" ]]; then
        if [[ ! -s "$TRIGGER_FILE" ]]; then
            log_info "No tag argument and no trigger file — nothing to deploy"
            exit 0
        fi
        TARGET_TAG=$(cat "$TRIGGER_FILE")
        rm -f "$TRIGGER_FILE"
    fi
    if [[ ! "$TARGET_TAG" =~ $TAG_PATTERN ]]; then
        log_error "Invalid tag (expected vX.Y.Z): $TARGET_TAG"
        exit 1
    fi
    log_start "Turf deploy $CURRENT_VERSION -> $TARGET_TAG"
fi

# ----------------------------------------------------------------------------
# App-specific helpers
# ----------------------------------------------------------------------------

# write_cli_shim RELEASE_DIR
#
# (Re)generates the CLI shim at $CLI_SHIM. The shim re-executes itself via
# sudo so any caller (root, tyler, anyone in the turf-cli sudoers rule) lands
# as the turf service user, and it runs bin/turf.js with the node binary
# resolved for RELEASE_DIR so native modules (prisma engines, argon2) share
# their ABI with the service.
#
# Globals read: CLI_SHIM, SERVICE_USER, CURRENT_LINK.
# Returns 0 on success. Returns 1, leaving any existing shim untouched, when
# the node binary or the service user's uid cannot be resolved or the file
# cannot be written.
#
# Every step is checked explicitly because callers invoke this as
# `write_cli_shim ... || ...`, a context in which bash ignores errexit.
write_cli_shim() {
    local release_dir="$1"
    local node_bin svc_uid tmp_shim

    node_bin=$(resolve_node_binary "$release_dir")
    [[ -n "$node_bin" ]] || { log_error "Could not resolve node binary for CLI shim"; return 1; }

    # An empty uid would render `[[ $(id -u) -ne  ]]`, a syntax error in the
    # generated shim, so refuse to write anything in that case.
    svc_uid=$(id -u "$SERVICE_USER" 2>/dev/null) || svc_uid=""
    [[ "$svc_uid" =~ ^[0-9]+$ ]] \
        || { log_error "Could not resolve uid for service user: $SERVICE_USER"; return 1; }

    # Build the shim next to its destination and rename it into place, so a
    # concurrent `turf` invocation never reads a half-written script.
    tmp_shim=$(mktemp "${CLI_SHIM}.XXXXXX") \
        || { log_error "Could not create temp file for CLI shim next to $CLI_SHIM"; return 1; }

    if ! cat > "$tmp_shim" <<EOF
#!/bin/bash
# Generated by turf-deploy from $release_dir/bin/turf.js
# Do not edit by hand — rewritten on every deploy and rollback.
if [[ \$(id -u) -ne $svc_uid ]]; then
    exec sudo -u $SERVICE_USER -- "\$0" "\$@"
fi
exec "$node_bin" "$CURRENT_LINK/bin/turf.js" "\$@"
EOF
    then
        rm -f -- "$tmp_shim"
        log_error "Could not write CLI shim contents to $tmp_shim"
        return 1
    fi

    if ! { chmod 755 "$tmp_shim" && mv -f -- "$tmp_shim" "$CLI_SHIM"; }; then
        rm -f -- "$tmp_shim"
        log_error "Could not install CLI shim at $CLI_SHIM"
        return 1
    fi
    log_ok "Wrote CLI shim: $CLI_SHIM"
}

# backup_database
#
# Pre-deploy database snapshot. The contract explicitly leaves "Database
# snapshot/backup strategy" to prod; this is the strategy we picked: a
# custom-format pg_dump written to $DATA_DIR as the service user, keeping
# the newest 5 turf.pre-deploy-*.dump files (by mtime).
#
# Globals read: DATA_DIR, SERVICE_USER, ENV_FILE, ADMIN_LOG.
# Returns 0 when the dump was written, 1 otherwise. On failure the partial
# dump file is removed so it can neither be mistaken for a usable snapshot
# nor push a good one out of the retention window.
backup_database() {
    local keep=5
    local stamp backup_path
    local -a dumps=()

    stamp=$(date +%Y%m%d-%H%M%S) || { log_error "pg_dump failed"; return 1; }
    backup_path="$DATA_DIR/turf.pre-deploy-${stamp}.dump"
    log_info "pg_dump → $backup_path"

    # Paths are passed as positional parameters rather than spliced into the
    # script text. The inner script stops if the env file cannot be sourced
    # or DATABASE_URL is empty, since pg_dump with an empty dbname would
    # not target the app database. The query string (everything from the
    # first "?") is stripped from the URL before it is handed to pg_dump.
    if ! sudo -u "$SERVICE_USER" bash -c '
        set -a
        source "$1" || exit 1
        set +a
        : "${DATABASE_URL:?DATABASE_URL is not set in the env file}"
        pg_dump -Fc "${DATABASE_URL%%\?*}" -f "$2"
    ' _ "$ENV_FILE" "$backup_path" >> "$ADMIN_LOG" 2>&1; then
        rm -f -- "$backup_path"
        log_error "pg_dump failed"
        return 1
    fi

    # Retention: newest first by mtime, delete everything past the first $keep.
    mapfile -t dumps < <(ls -t -- "$DATA_DIR"/turf.pre-deploy-*.dump 2>/dev/null)
    if (( ${#dumps[@]} > keep )); then
        rm -f -- "${dumps[@]:keep}"
    fi
    log_ok "Database snapshot written"
}

# apply_migrations_and_seed RELEASE_DIR
#
# Runs the two database steps for a release, in order, as the service user
# with $ENV_FILE exported: first `npm run db:migrate` (schema change), then
# `node bin/seed.js` (lookup-row upserts that may depend on the new schema).
# Both are contract-specified as idempotent. Command output is appended to
# $ADMIN_LOG. Stops at the first failing step and returns 1.
apply_migrations_and_seed() {
    local release_dir="$1"
    local -a step_cmd=("npm run db:migrate" "node bin/seed.js")
    local -a step_fail=("migrations failed" "seed failed")
    local -a step_done=("Migrations applied" "Seed applied")
    local idx

    for idx in 0 1; do
        log_info "${step_cmd[idx]}"
        if ! sudo -u "$SERVICE_USER" bash -c "
        set -a && source '$ENV_FILE' && set +a
        cd '$release_dir' && ${step_cmd[idx]}
    " >> "$ADMIN_LOG" 2>&1; then
            log_error "${step_fail[idx]}"
            return 1
        fi
        log_ok "${step_done[idx]}"
    done
}

# rollback_to TAG
#
# Roll back to a build-on-prod release dir already on disk: re-point
# $CURRENT_LINK at $RELEASES_DIR/TAG, record TAG in $VERSION_FILE, rewrite
# the systemd drop-in and the CLI shim, daemon-reload, restart the service
# and (unless SKIP_HEALTH is "true") probe the health endpoint.
#
# Returns 0 on success. Returns 1 when the target is missing, is not a
# build-on-prod release (no package.json), carries a .failed-deploy marker,
# the symlink swap fails, or the restart / health check fails; in the last
# two cases the target is also marked failed.
#
# The swap is checked explicitly: the forward-deploy failure path calls this
# function inside a `||` list, where bash ignores errexit, so an unchecked
# `ln`/`mv` failure would go on to rewrite the version file, restart the
# service and could report a rollback that never happened.
rollback_to() {
    local tag="$1"
    local release_dir="$RELEASES_DIR/$tag"
    local port

    [[ -d "$release_dir" ]] \
        || { log_error "Rollback target not on disk: $release_dir"; return 1; }
    [[ -f "$release_dir/package.json" ]] \
        || { log_error "Rollback target is not a build-on-prod release (no package.json): $tag"; return 1; }
    if [[ -f "$release_dir/.failed-deploy" ]]; then
        log_error "Rollback target is marked failed: $tag"
        return 1
    fi

    log_start "Rollback $CURRENT_VERSION -> $tag"

    # Build the new link beside the old one, then rename over it so readers
    # never observe a missing `current`.
    log_info "Symlink swap: current -> $tag"
    if ! { ln -sfn "$release_dir" "$CURRENT_LINK.tmp" \
            && mv -Tf "$CURRENT_LINK.tmp" "$CURRENT_LINK"; }; then
        rm -f -- "$CURRENT_LINK.tmp"
        log_error "Symlink swap failed: $CURRENT_LINK -> $release_dir"
        return 1
    fi
    # Best effort: restorecon may be absent or fail; errors are ignored.
    restorecon "$CURRENT_LINK" 2>/dev/null || true
    echo "$tag" > "$VERSION_FILE"

    write_dropin "$release_dir" "$DROPIN_FILE" "$CURRENT_LINK" "$DATA_DIR" "$CURRENT_LINK/.next/cache" \
        || return 1
    write_cli_shim "$release_dir" || return 1
    systemctl daemon-reload

    if ! restart_service "$SERVICE_NAME"; then
        log_error "Rollback target failed to restart — manual investigation required"
        mark_failed_release "$release_dir" "rollback target failed to restart"
        return 1
    fi

    if [[ "$SKIP_HEALTH" != "true" ]]; then
        # PORT comes from the service's env file; 3000 when it is not set there.
        port=$(sudo -u "$SERVICE_USER" bash -c "set -a; source '$ENV_FILE'; set +a; printf '%s' \"\${PORT:-3000}\"")
        log_info "Health check on http://localhost:${port}${HEALTH_PATH}"
        if retry "$HEALTH_RETRIES" curl -fsS --max-time "$HEALTH_TIMEOUT" \
                "http://localhost:${port}${HEALTH_PATH}" -o /dev/null; then
            log_ok "Service healthy"
        else
            log_error "Health check failed after rollback to $tag"
            mark_failed_release "$release_dir" "rollback target also failed health check"
            return 1
        fi
    fi

    log_ok "Rolled back to $tag"
    log_end "Rollback"
}

# dispatch_rollback PREV_TAG
#
# Failure path of a forward deploy. PREV_TAG is the previous build-on-prod
# release chosen by the caller; when it is empty there is nothing to roll
# back to and that is only logged. Otherwise rollback_to is attempted, and a
# failed rollback is logged as a degraded state rather than propagated —
# intentionally non-fatal, because the rollback already left a marker.
dispatch_rollback() {
    local prev_tag="$1"

    if [[ -z "$prev_tag" ]]; then
        log_error "No previous build-on-prod release to roll back to"
        return
    fi

    if ! rollback_to "$prev_tag"; then
        log_error "Rollback to $prev_tag did not converge — service may be in a degraded state"
    fi
}

# ----------------------------------------------------------------------------
# Rollback short-circuit
# ----------------------------------------------------------------------------
# When --rollback was given, perform only the rollback and exit with its
# status; none of the forward-deploy steps below run.
# NOTE(review): rollback_to is deliberately left as a plain statement here
# (not wrapped in `if` / `||`). If enable_error_trap turns on errexit or an
# ERR trap — not visible in this file — wrapping the call would disable it
# for everything inside rollback_to. Confirm before restructuring.
if [[ -n "$ROLLBACK_TAG" ]]; then
    rollback_to "$ROLLBACK_TAG"
    exit $?
fi

# ----------------------------------------------------------------------------
# Forward deploy
# ----------------------------------------------------------------------------

# 1. Disk pre-check + git clone the tagged commit.
require_disk_space "$RELEASES_DIR" "$BUILD_DISK_MB"

RELEASE_DIR="$RELEASES_DIR/$TARGET_TAG"
if [[ -d "$RELEASE_DIR" ]]; then
    # Never wipe the tree the running service is served from. Re-deploying
    # the tag that `current` already points at (for example a redelivered
    # webhook) would otherwise delete the live release before the new build
    # has even started. `-ef` compares the resolved targets, so it is false
    # when the symlink does not exist yet.
    if [[ "$CURRENT_LINK" -ef "$RELEASE_DIR" ]]; then
        log_error "Refusing to rebuild $TARGET_TAG: it is the live release ($CURRENT_LINK -> $RELEASE_DIR). Roll back to another release first."
        exit 1
    fi
    log_warn "Release dir exists; removing for clean clone: $RELEASE_DIR"
    # `:?` aborts instead of expanding to an empty path.
    rm -rf -- "${RELEASE_DIR:?}"
fi
clone_tag "$REPO_URL" "$TARGET_TAG" "$RELEASE_DIR" "$DEPLOY_KEY" || exit 6
fix_ownership "$RELEASE_DIR" "$SERVICE_USER"

# 2. Validate required env from src/lib/required-env.json. Turf's set per
# the contract: DATABASE_URL, BETTER_AUTH_SECRET, BETTER_AUTH_URL,
# AUTH_PASSWORD_PEPPER. Source of truth is the file in the release tree,
# so adding a new required-env in the source repo is automatically enforced
# on next deploy.
validate_required_env "$RELEASE_DIR" "$ENV_FILE" "$SERVICE_USER" || exit 9

# 3. Build. `npm run build` runs prebuild (check:public-env, build:seed,
# build:cli, build:server), then `next build && serwist build`, then the
# postbuild real-boot smoke against a hermetic stub env. Contract §Build.
# A failed build is marked so it is neither reused nor chosen for rollback.
build_release "$RELEASE_DIR" "$SERVICE_USER" "$ENV_FILE" || {
    mark_failed_release "$RELEASE_DIR" "build failed (npm ci or npm run build)"
    exit 12
}
fix_selinux "$RELEASE_DIR"

# 4. Decide rollback target now, before we touch anything destructive.
PREV_TAG=$(select_prev_tag "$RELEASES_DIR" "$TARGET_TAG")
if [[ -n "$PREV_TAG" ]]; then
    log_info "Rollback target if needed: $PREV_TAG"
fi

# 5. DB snapshot, taken before the smoke test and migrations so there is a
# forensic anchor even if a later step fails — the snapshot doesn't cost
# much. A failed snapshot aborts the deploy: migrations must never run
# without a restore point. Nothing has been swapped at this stage, so the
# running service is unaffected. Exit status 12 matches the other pre-swap
# failures (build, migrations).
if ! backup_database; then
    log_error "Aborting deploy — no pre-deploy snapshot; current symlink untouched, no migrations attempted"
    exit 12
fi

# 6. Pre-swap smoke: boot the new bundle with the real env on an ephemeral
# port. This enforces the migration backward-compatibility invariant — the
# new code has to come up against the CURRENT (pre-migration) schema — which
# contract §Database leaves to prod. Any failure here exits 16 before the
# symlink or the database has been touched.
if ! SMOKE_START=$(read_start_command "$RELEASE_DIR"); then
    exit 16
fi
NODE_BIN=$(resolve_node_binary "$RELEASE_DIR")
if [[ -z "$NODE_BIN" ]]; then
    log_error "No node binary resolvable for smoke"
    exit 16
fi
SMOKE_RENDERED=$(render_exec_start "$NODE_BIN" "$SMOKE_START")
smoke_test_release "$RELEASE_DIR" "$SERVICE_USER" "$ENV_FILE" "$SMOKE_RENDERED" "$SMOKE_BUDGET" || {
    mark_failed_release "$RELEASE_DIR" "pre-swap smoke test failed"
    log_error "Aborting deploy — current symlink untouched, no migrations attempted"
    exit 16
}

# 7. Migrations + seed run only once the smoke has shown that the new bundle
# boots against the live schema. Should they fail, the symlink has not been
# swapped yet, so the service stays on the previous release until someone
# remediates manually; the release is marked failed and the deploy exits 12.
if ! apply_migrations_and_seed "$RELEASE_DIR"; then
    mark_failed_release "$RELEASE_DIR" "migrations or seed failed"
    exit 12
fi

# 8. Atomic swap. Symlink first (so the next service start picks up the new
# tree), drop-in next (ExecStart derived from package.json#scripts.start),
# CLI shim next (matches the just-swapped tree's node ABI), then
# daemon-reload + restart.
log_info "Atomic symlink swap: current -> $TARGET_TAG"
# The temp link is renamed over `current`; if either command fails, `current`
# has not been modified and there is nothing to roll back.
if ! { ln -sfn "$RELEASE_DIR" "$CURRENT_LINK.tmp" \
        && mv -Tf "$CURRENT_LINK.tmp" "$CURRENT_LINK"; }; then
    rm -f -- "$CURRENT_LINK.tmp"
    log_error "Symlink swap failed; $CURRENT_LINK was not changed"
    exit 13
fi
# Best effort: restorecon may be absent or fail; errors are ignored.
restorecon "$CURRENT_LINK" 2>/dev/null || true
echo "$TARGET_TAG" > "$VERSION_FILE"

# From here on `current` and the version file already name the new release.
# A failure before the restart must therefore go through the rollback path
# too; exiting directly would leave the symlink on a release the service
# was never started from.
if ! write_dropin "$RELEASE_DIR" "$DROPIN_FILE" "$CURRENT_LINK" "$DATA_DIR" "$CURRENT_LINK/.next/cache"; then
    log_error "Could not write systemd drop-in after swap to $TARGET_TAG"
    dispatch_rollback "$PREV_TAG"
    exit 13
fi
if ! write_cli_shim "$RELEASE_DIR"; then
    log_error "Could not write CLI shim after swap to $TARGET_TAG"
    dispatch_rollback "$PREV_TAG"
    exit 13
fi
systemctl daemon-reload

if ! restart_service "$SERVICE_NAME"; then
    log_error "Service failed to restart cleanly after swap to $TARGET_TAG"
    mark_failed_release "$RELEASE_DIR" "service failed to restart after swap"
    dispatch_rollback "$PREV_TAG"
    exit 14
fi

# 9. Post-restart health check. PORT is read from the service's env file
# (3000 when it is not set there). A failing probe marks the release failed,
# attempts a rollback to PREV_TAG and exits 14. --skip-health bypasses the
# probe entirely.
PORT=$(sudo -u "$SERVICE_USER" bash -c "set -a; source '$ENV_FILE'; set +a; printf '%s' \"\${PORT:-3000}\"")
if [[ "$SKIP_HEALTH" != "true" ]]; then
    HEALTH_URL="http://localhost:${PORT}${HEALTH_PATH}"
    log_info "Health check on $HEALTH_URL"
    if ! retry "$HEALTH_RETRIES" curl -fsS --max-time "$HEALTH_TIMEOUT" \
            "$HEALTH_URL" -o /dev/null; then
        log_error "Health check failed"
        mark_failed_release "$RELEASE_DIR" "post-restart health check failed at ${HEALTH_PATH}"
        dispatch_rollback "$PREV_TAG"
        exit 14
    fi
    log_ok "Service healthy"
else
    log_info "Skipping health check (--skip-health)"
fi

# 10. Garbage-collect old releases: the last $KEEP_RELEASES non-failed ones
# are kept, every failed one is pruned.
gc_releases "$RELEASES_DIR" "$CURRENT_LINK" "$KEEP_RELEASES"

log_ok "Deployed $TARGET_TAG"
log_end "Turf deploy"
