#!/bin/bash
# dailies-deploy — build-on-prod deploy for vis-daily-tracker.
#
# Triggered by /run/dailies/deploy-trigger (written by the GitHub release
# webhook on release.published). Clones the tagged commit, runs npm ci +
# npm run build (which includes source-side postbuild smoke), runs prod's
# pre-swap smoke against real env on an ephemeral port, applies migrations
# + seed, atomic-swaps the `current` symlink, and restarts dailies.service.
#
# Per the source-side deploy contract (vis-daily-tracker/docs/deployment.md),
# the artifact IS the source tree at the tagged commit + everything
# `npm ci && npm run build` produces. No tarball, no MANIFEST, no
# requiredTools preflight — prod is the canonical build environment.
#
# Usage:
#   sudo dailies-deploy                   # consume trigger file
#   sudo dailies-deploy v2.84.0           # deploy a specific tag (leading "v" optional)
#   sudo dailies-deploy --skip-health     # bypass post-restart health check
#   sudo dailies-deploy --rollback v2.84.0  # roll back to a build-on-prod release on disk
#
# Rollback contract: --rollback targets a release dir that was deployed by
# this build-on-prod flow (has package.json). Legacy artifact releases
# (MANIFEST-based, e.g. v2.79.0 and earlier) cannot be rolled back to via
# this script — operator handles those manually if ever needed. After
# v2.84.0 deploys cleanly, every rollback target is build-on-prod-shaped.

source /usr/lib/server-admin/admin-common.sh

# --- App-specific config ---

# Application tree: one directory per release, with `current` as the symlink
# that gets swapped on deploy and rollback.
APP_DIR="/opt/vis-daily-tracker"
RELEASES_DIR="${APP_DIR}/releases"
CURRENT_LINK="${APP_DIR}/current"
DATA_DIR="${APP_DIR}/data"
ENV_FILE="${APP_DIR}/.env"
VERSION_FILE="${APP_DIR}/.deployed-version"

# Tag hand-off file written by the GitHub release webhook.
TRIGGER_FILE="/run/dailies/deploy-trigger"

# systemd units and the drop-in regenerated on every deploy/rollback.
SERVICE_USER="dailies"
SERVICE_NAME="dailies.service"
DROPIN_DIR="/etc/systemd/system/${SERVICE_NAME}.d"
DROPIN_FILE="${DROPIN_DIR}/exec.conf"
TIMERS=(
    dailies-notifications-tick.timer
    dailies-uploads-sweep.timer
    dailies-submissions-auto-lock.timer
)

# Operator-facing CLI wrapper.
CLI_SHIM="/usr/bin/dailies"

# Where the tagged source is cloned from, and the key used to do it.
REPO_URL="git@github.com:TylerVigario/vis-daily-tracker.git"
DEPLOY_KEY="/etc/dailies/deploy_key"

# Tunables.
KEEP_RELEASES=5             # passed to gc_releases
BUILD_DISK_MB=2000          # build needs ~1.5 GB; 2 GB for headroom
SMOKE_BUDGET=30             # passed to smoke_test_release
HEALTH_PATH="/api/health"   # hardcoded per source-side contract
HEALTH_RETRIES=6            # attempts handed to `retry` for the health probe
HEALTH_TIMEOUT=5            # curl --max-time per health probe
SKIP_HEALTH=false           # flipped to true by --skip-health

# --- Argument parsing ---
TARGET_TAG=""
ROLLBACK_TAG=""

# Parse the command line into the globals TARGET_TAG, ROLLBACK_TAG and
# SKIP_HEALTH.
#
# Accepted forms:
#   --skip-health            set SKIP_HEALTH=true
#   --rollback TAG           set ROLLBACK_TAG (also --rollback=TAG)
#   vX.Y.Z | X.Y.Z           set TARGET_TAG (a missing leading "v" is added)
#
# A bare-numeric rollback tag gets the "v" prefix as well. Strict vX.Y.Z
# validation happens later in the script; this function only sorts arguments.
#
# Exits 2 when --rollback is given without a tag. Previously that case either
# spun forever (`shift 2` with a single remaining argument fails WITHOUT
# shifting, so the loop condition never changed) or left ROLLBACK_TAG empty,
# which silently turned a requested rollback into a forward deploy.
# Unrecognized arguments are still skipped, but now with a warning on stderr.
_dailies_parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            --skip-health)
                SKIP_HEALTH=true
                shift
                ;;
            --rollback)
                if (( $# < 2 )) || [[ -z "$2" ]]; then
                    printf 'dailies-deploy: --rollback requires a tag (vX.Y.Z)\n' >&2
                    exit 2
                fi
                ROLLBACK_TAG="$2"
                shift 2
                ;;
            --rollback=*)
                ROLLBACK_TAG="${1#--rollback=}"
                if [[ -z "$ROLLBACK_TAG" ]]; then
                    printf 'dailies-deploy: --rollback requires a tag (vX.Y.Z)\n' >&2
                    exit 2
                fi
                shift
                ;;
            v[0-9]*.[0-9]*.[0-9]*)
                TARGET_TAG="$1"
                shift
                ;;
            [0-9]*.[0-9]*.[0-9]*)
                TARGET_TAG="v$1"
                shift
                ;;
            *)
                printf 'dailies-deploy: ignoring unrecognized argument: %s\n' "$1" >&2
                shift
                ;;
        esac
    done

    # Normalize "2.84.0" to "v2.84.0" for rollbacks, mirroring TARGET_TAG.
    if [[ "$ROLLBACK_TAG" =~ ^[0-9] ]]; then
        ROLLBACK_TAG="v$ROLLBACK_TAG"
    fi
    return 0
}
_dailies_parse_args "$@"

# Preconditions: root, required tools on PATH, single-instance lock, then
# switch on the shared error trap from admin-common.sh.
require_root
require_command git jq curl pg_dump systemctl npm node
lock_or_exit "dailies-deploy"
enable_error_trap

# Version recorded by the last deploy/rollback; "none" when nothing is recorded.
CURRENT_VERSION="none"
if [[ -f "$VERSION_FILE" ]]; then
    CURRENT_VERSION=$(<"$VERSION_FILE")
fi

if [[ -n "$ROLLBACK_TAG" ]]; then
    # Rollback mode: only validate the tag shape here; the work happens later.
    if [[ ! "$ROLLBACK_TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
        log_error "Invalid rollback tag (expected vX.Y.Z): $ROLLBACK_TAG"
        exit 1
    fi
else
    # Forward mode: an explicit tag wins; otherwise fall back to the trigger file.
    if [[ -z "$TARGET_TAG" && ! -s "$TRIGGER_FILE" ]]; then
        log_info "No tag argument and no trigger file — nothing to deploy"
        exit 0
    fi
    if [[ -z "$TARGET_TAG" ]]; then
        # The trigger is consumed before validation, so an invalid tag is dropped.
        TARGET_TAG=$(<"$TRIGGER_FILE")
        rm -f "$TRIGGER_FILE"
    fi
    if [[ ! "$TARGET_TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
        log_error "Invalid tag (expected vX.Y.Z): $TARGET_TAG"
        exit 1
    fi
    log_start "Dailies build-on-prod deploy $CURRENT_VERSION -> $TARGET_TAG"
fi

# ----------------------------------------------------------------------
# App-specific helpers (everything else lives in admin-common.sh)
# ----------------------------------------------------------------------

# Per-release CLI shim at /usr/bin/dailies. Self-reexec via sudo so
# non-dailies callers (tyler, etc.) auto-drop privileges via the existing
# /etc/sudoers.d/dailies-cli rule. Resolves the same node binary the
# service uses, so CLI and server share the same native-module ABI.
#
# Globals:   CLI_SHIM, CURRENT_LINK, SERVICE_USER (read)
# Arguments: $1 - release directory used to resolve the node binary
# Returns:   0 once the shim is installed; 1 if the node binary or the
#            service user's uid cannot be resolved, or the shim cannot be
#            written. On failure an existing shim is left untouched.
write_cli_shim() {
    local release_dir="$1"
    local node_bin svc_uid tmp_shim

    node_bin=$(resolve_node_binary "$release_dir")
    [[ -n "$node_bin" ]] || { log_error "Could not resolve node binary for CLI shim"; return 1; }

    # An empty uid would render `[[ $(id -u) -ne ]]` into the shim, which is a
    # syntax error at every CLI invocation — refuse to generate that.
    svc_uid=$(id -u "$SERVICE_USER" 2>/dev/null) || svc_uid=""
    [[ "$svc_uid" =~ ^[0-9]+$ ]] \
        || { log_error "Could not resolve uid for service user: $SERVICE_USER"; return 1; }

    # Build the shim next to its destination and rename it into place, so a
    # concurrent `dailies` invocation never executes a half-written file.
    tmp_shim=$(mktemp "${CLI_SHIM}.XXXXXX") \
        || { log_error "Could not create temp file for CLI shim: $CLI_SHIM"; return 1; }

    if ! cat > "$tmp_shim" <<EOF
#!/bin/bash
# Generated by dailies-deploy from $release_dir/bin/dailies.js
# Do not edit by hand — rewritten on every deploy and rollback.
if [[ \$(id -u) -ne $svc_uid ]]; then
    exec sudo -u $SERVICE_USER -- "\$0" "\$@"
fi
exec "$node_bin" "$CURRENT_LINK/bin/dailies.js" "\$@"
EOF
    then
        rm -f -- "$tmp_shim"
        log_error "Could not write CLI shim: $CLI_SHIM"
        return 1
    fi

    if ! chmod 755 "$tmp_shim" || ! mv -f -- "$tmp_shim" "$CLI_SHIM"; then
        rm -f -- "$tmp_shim"
        log_error "Could not install CLI shim: $CLI_SHIM"
        return 1
    fi
    log_ok "Wrote CLI shim: $CLI_SHIM"
}

# Pre-deploy database snapshot via pg_dump (custom format). Retains last 5.
#
# The dump runs as SERVICE_USER with ENV_FILE sourced; any "?..." query suffix
# is stripped from DATABASE_URL before it is handed to pg_dump (presumably
# because the app's URL carries client-specific parameters — confirm).
#
# Globals:   DATA_DIR, ENV_FILE, SERVICE_USER, ADMIN_LOG (read)
# Returns:   0 when the snapshot was written; 1 when pg_dump failed. A failed
#            run removes its partial dump so it can neither be mistaken for a
#            usable snapshot nor push a good one out of the retention window.
backup_database() {
    local keep=5
    local backup_path
    backup_path="$DATA_DIR/dailies.pre-deploy-$(date +%Y%m%d-%H%M%S).dump"
    log_info "pg_dump → $backup_path"

    # Paths travel as positional parameters rather than being spliced into
    # the script text, so quoting inside them cannot alter the command.
    if ! sudo -u "$SERVICE_USER" bash -c '
        set -a && source "$1" && set +a
        pg_dump -Fc "${DATABASE_URL%%\?*}" -f "$2"
    ' _ "$ENV_FILE" "$backup_path" >> "$ADMIN_LOG" 2>&1; then
        rm -f -- "$backup_path"
        log_error "pg_dump failed"
        return 1
    fi

    # Newest-first by mtime; everything past the first $keep entries goes.
    local -a dumps=()
    mapfile -t dumps < <(ls -t -- "$DATA_DIR/dailies.pre-deploy-"*.dump 2>/dev/null)
    if (( ${#dumps[@]} > keep )); then
        rm -f -- "${dumps[@]:keep}"
    fi
    log_ok "Database snapshot written"
}

# Run the release's schema migrations, then its seed script, both as
# SERVICE_USER with ENV_FILE exported into the environment. Output of both
# steps is appended to ADMIN_LOG. The source-side contract states both steps
# are idempotent.
#
# Arguments: $1 - release directory to run from
# Returns:   1 as soon as either step fails (the seed is skipped when
#            migrations fail); otherwise the status of the final log_ok.
apply_migrations_and_seed() {
    local rel="$1"

    # Step 1: migrations.
    log_info "npm run db:migrate"
    if ! sudo -u "$SERVICE_USER" bash -c "
        set -a && source '$ENV_FILE' && set +a
        cd '$rel' && npm run db:migrate
    " >> "$ADMIN_LOG" 2>&1; then
        log_error "migrations failed"
        return 1
    fi
    log_ok "Migrations applied"

    # Step 2: seed, only reached once migrations succeeded.
    log_info "node bin/seed.js"
    if ! sudo -u "$SERVICE_USER" bash -c "
        set -a && source '$ENV_FILE' && set +a
        cd '$rel' && node bin/seed.js
    " >> "$ADMIN_LOG" 2>&1; then
        log_error "seed failed"
        return 1
    fi
    log_ok "Seed applied"
}

# Roll back to a build-on-prod release dir already on disk. Refuses legacy
# (MANIFEST-shaped) targets — operator handles those manually.
#
# Sequence: validate the target, stop the timers, swap `current`, record the
# tag in VERSION_FILE, regenerate the systemd drop-in and the CLI shim,
# daemon-reload, restart the service, health-check it (unless SKIP_HEALTH is
# "true"), then start the timers again.
#
# Globals:   RELEASES_DIR, CURRENT_LINK, VERSION_FILE, DROPIN_FILE, DATA_DIR,
#            ENV_FILE, SERVICE_USER, SERVICE_NAME, TIMERS, SKIP_HEALTH,
#            HEALTH_PATH, HEALTH_RETRIES, HEALTH_TIMEOUT, CURRENT_VERSION (read)
# Arguments: $1 - tag (vX.Y.Z) naming a directory under RELEASES_DIR
# Returns:   0 when the service is running on the target; 1 when the target
#            is rejected or any later step fails. Validation failures happen
#            before anything is modified. Failures after the timers were
#            stopped leave them stopped (dispatch_rollback restarts them for
#            the automatic path).
rollback_to() {
    local tag="$1"
    local release_dir="$RELEASES_DIR/$tag"

    [[ -d "$release_dir" ]] \
        || { log_error "Rollback target not on disk: $release_dir"; return 1; }
    # Enforce the documented contract: a build-on-prod release is a source
    # tree and therefore has package.json. Without this check a legacy
    # artifact release would be swapped in and only fail at restart time.
    [[ -f "$release_dir/package.json" ]] \
        || { log_error "Rollback target is not a build-on-prod release (no package.json): $tag"; return 1; }
    if [[ -f "$release_dir/.failed-deploy" ]]; then
        log_error "Rollback target is marked failed: $tag"
        return 1
    fi

    log_start "Rollback $CURRENT_VERSION -> $tag"

    # Best-effort: a timer that is already stopped must not block the rollback.
    systemctl stop "${TIMERS[@]}" 2>/dev/null || true

    log_info "Symlink swap: current -> $tag"
    # Create the new link under a temp name, then rename over `current`, so
    # `current` never dangles.
    ln -sfn "$release_dir" "$CURRENT_LINK.tmp"
    mv -Tf "$CURRENT_LINK.tmp" "$CURRENT_LINK"
    restorecon "$CURRENT_LINK" 2>/dev/null || true
    echo "$tag" > "$VERSION_FILE"

    write_dropin "$release_dir" "$DROPIN_FILE" "$CURRENT_LINK" "$DATA_DIR" "$CURRENT_LINK/.next/cache" \
        || return 1
    write_cli_shim "$release_dir" || return 1
    systemctl daemon-reload

    if ! restart_service "$SERVICE_NAME"; then
        log_error "Rollback target failed to restart — manual investigation required"
        log_error "Current symlink: $(readlink "$CURRENT_LINK")"
        mark_failed_release "$release_dir" "rollback target failed to restart"
        return 1
    fi

    if [[ "$SKIP_HEALTH" != "true" ]]; then
        local port
        # Read PORT the way the service sees it: ENV_FILE sourced as
        # SERVICE_USER, defaulting to 3000 when unset.
        port=$(sudo -u "$SERVICE_USER" bash -c "set -a; source '$ENV_FILE'; set +a; printf '%s' \"\${PORT:-3000}\"")
        log_info "Health check on http://localhost:${port}${HEALTH_PATH}"
        if retry "$HEALTH_RETRIES" curl -fsS --max-time "$HEALTH_TIMEOUT" \
                "http://localhost:${port}${HEALTH_PATH}" -o /dev/null; then
            log_ok "Service healthy"
        else
            log_error "Health check failed after rollback to $tag"
            mark_failed_release "$release_dir" "rollback target also failed health check"
            return 1
        fi
    fi

    systemctl start "${TIMERS[@]}" 2>/dev/null || true
    log_ok "Rolled back to $tag"
    log_end "Rollback"
}

# Shared failure handler for the forward-deploy flow: attempt a rollback to
# the given tag when there is one, log when there is not (or when the
# rollback itself fails), and in every case try to start the timers again.
#
# Arguments: $1 - previous release tag; may be empty
# Returns:   always 0 — timer start failures are deliberately ignored.
dispatch_rollback() {
    local fallback_tag="$1"

    if [[ -z "$fallback_tag" ]]; then
        log_error "No previous build-on-prod release to roll back to"
    elif ! rollback_to "$fallback_tag"; then
        log_error "Rollback to $fallback_tag did not converge — service may be in a degraded state"
    fi

    systemctl start "${TIMERS[@]}" 2>/dev/null || true
}

# ----------------------------------------------------------------------
# Rollback short-circuit
# ----------------------------------------------------------------------
# With --rollback the forward-deploy steps below (clone, build, DB snapshot,
# smoke, migrations) are skipped entirely: hand off to rollback_to and exit
# with its status.
if [[ -n "$ROLLBACK_TAG" ]]; then
    rollback_to "$ROLLBACK_TAG"
    exit $?
fi

# ----------------------------------------------------------------------
# Forward deploy
# ----------------------------------------------------------------------

# Exit codes used by the forward flow:
#    6  clone failed, or the target is the live release of a running service
#    9  required env validation failed
#   10  STORAGE_PATH not absolute        11  STORAGE_PATH not writable
#   12  build, migrations or seed failed
#   13  drop-in / CLI shim generation failed after the swap (rollback attempted)
#   14  restart or health check failed after the swap (rollback attempted)
#   15  pre-deploy database snapshot failed
#   16  pre-swap smoke could not be prepared or failed

# 1. Disk pre-check + git clone the tagged commit
require_disk_space "$RELEASES_DIR" "$BUILD_DISK_MB"

RELEASE_DIR="$RELEASES_DIR/$TARGET_TAG"
if [[ -d "$RELEASE_DIR" ]]; then
    # If `current` resolves to this directory and the service is up, wiping it
    # for a re-clone would remove the live release tree for the whole duration
    # of the build (e.g. on a re-delivered webhook for the deployed tag).
    if [[ "$RELEASE_DIR" -ef "$CURRENT_LINK" ]] \
            && systemctl is-active --quiet "$SERVICE_NAME"; then
        log_error "$TARGET_TAG is the live release; refusing to delete it for a re-clone"
        log_error "Stop $SERVICE_NAME or roll back to another release first"
        exit 6
    fi
    log_warn "Release dir exists; removing for clean clone: $RELEASE_DIR"
    rm -rf -- "$RELEASE_DIR"
fi
clone_tag "$REPO_URL" "$TARGET_TAG" "$RELEASE_DIR" "$DEPLOY_KEY" || exit 6
fix_ownership "$RELEASE_DIR" "$SERVICE_USER"

# 2. Validate required env (from src/lib/required-env.json) + STORAGE_PATH sanity
validate_required_env "$RELEASE_DIR" "$ENV_FILE" "$SERVICE_USER" || exit 9

STORAGE_PATH=$(sudo -u "$SERVICE_USER" bash -c "set -a; source '$ENV_FILE'; set +a; printf '%s' \"\$STORAGE_PATH\"")
[[ "$STORAGE_PATH" == /* ]] \
    || { log_error "STORAGE_PATH must be absolute: $STORAGE_PATH"; exit 10; }
sudo -u "$SERVICE_USER" test -w "$STORAGE_PATH" \
    || { log_error "STORAGE_PATH not writable by $SERVICE_USER: $STORAGE_PATH"; exit 11; }

# 3. Build (npm ci + npm run build, includes the source-side postbuild smoke)
build_release "$RELEASE_DIR" "$SERVICE_USER" "$ENV_FILE" || {
    mark_failed_release "$RELEASE_DIR" "build failed (npm ci or npm run build)"
    exit 12
}
fix_selinux "$RELEASE_DIR"

# 4. PREV_TAG selection (admin-common's marker-aware helper)
PREV_TAG=$(select_prev_tag "$RELEASES_DIR" "$TARGET_TAG")
[[ -n "$PREV_TAG" ]] && log_info "Rollback target if needed: $PREV_TAG"

# 5. DB snapshot — mandatory. Checked explicitly instead of relying on the
#    error trap, so migrations can never run without a snapshot to restore.
backup_database || {
    log_error "Aborting deploy — no pre-deploy database snapshot, current symlink untouched"
    exit 15
}

# 6. Pre-swap smoke against real env on an ephemeral port
SMOKE_START=$(read_start_command "$RELEASE_DIR") || exit 16
NODE_BIN=$(resolve_node_binary "$RELEASE_DIR")
[[ -n "$NODE_BIN" ]] || { log_error "No node binary resolvable for smoke"; exit 16; }
SMOKE_RENDERED=$(render_exec_start "$NODE_BIN" "$SMOKE_START")
if ! smoke_test_release "$RELEASE_DIR" "$SERVICE_USER" "$ENV_FILE" "$SMOKE_RENDERED" "$SMOKE_BUDGET"; then
    mark_failed_release "$RELEASE_DIR" "pre-swap smoke test failed"
    log_error "Aborting deploy — current symlink untouched, no migrations attempted"
    exit 16
fi

# 7. Migrations + seed (post-smoke; smoke already proved boot under real env)
apply_migrations_and_seed "$RELEASE_DIR" || {
    mark_failed_release "$RELEASE_DIR" "migrations or seed failed"
    exit 12
}

# 8. Stop timers, atomic swap, drop-in + CLI shim, daemon-reload, restart
systemctl stop "${TIMERS[@]}" 2>/dev/null || true

log_info "Atomic symlink swap: current -> $TARGET_TAG"
ln -sfn "$RELEASE_DIR" "$CURRENT_LINK.tmp"
mv -Tf "$CURRENT_LINK.tmp" "$CURRENT_LINK"
restorecon "$CURRENT_LINK" 2>/dev/null || true
echo "$TARGET_TAG" > "$VERSION_FILE"

# From here on `current` already points at the new release and the timers are
# stopped, so every failure goes through dispatch_rollback (which also
# restarts the timers) instead of exiting with the system half-switched.
if ! write_dropin "$RELEASE_DIR" "$DROPIN_FILE" "$CURRENT_LINK" "$DATA_DIR" "$CURRENT_LINK/.next/cache"; then
    log_error "Could not write systemd drop-in after swap to $TARGET_TAG"
    mark_failed_release "$RELEASE_DIR" "drop-in generation failed after swap"
    dispatch_rollback "$PREV_TAG"
    exit 13
fi
if ! write_cli_shim "$RELEASE_DIR"; then
    log_error "Could not write CLI shim after swap to $TARGET_TAG"
    mark_failed_release "$RELEASE_DIR" "CLI shim generation failed after swap"
    dispatch_rollback "$PREV_TAG"
    exit 13
fi
systemctl daemon-reload

if ! restart_service "$SERVICE_NAME"; then
    log_error "Service failed to restart cleanly after swap to $TARGET_TAG"
    mark_failed_release "$RELEASE_DIR" "service failed to restart after swap"
    dispatch_rollback "$PREV_TAG"
    exit 14
fi

# 9. Post-restart health check
PORT=$(sudo -u "$SERVICE_USER" bash -c "set -a; source '$ENV_FILE'; set +a; printf '%s' \"\${PORT:-3000}\"")
if [[ "$SKIP_HEALTH" == "true" ]]; then
    log_info "Skipping health check (--skip-health)"
else
    log_info "Health check on http://localhost:${PORT}${HEALTH_PATH}"
    if retry "$HEALTH_RETRIES" curl -fsS --max-time "$HEALTH_TIMEOUT" \
            "http://localhost:${PORT}${HEALTH_PATH}" -o /dev/null; then
        log_ok "Service healthy"
    else
        log_error "Health check failed"
        mark_failed_release "$RELEASE_DIR" "post-restart health check failed at ${HEALTH_PATH}"
        dispatch_rollback "$PREV_TAG"
        exit 14
    fi
fi

# 10. Restart timers + verify
systemctl start "${TIMERS[@]}" 2>/dev/null || true
for timer in "${TIMERS[@]}"; do
    if systemctl is-active --quiet "$timer"; then
        log_ok "$timer active"
    else
        log_warn "$timer not active"
    fi
done

# 11. GC: keep last $KEEP_RELEASES non-failed releases, prune all failed.
gc_releases "$RELEASES_DIR" "$CURRENT_LINK" "$KEEP_RELEASES"

log_ok "Deployed $TARGET_TAG"
log_end "Dailies deploy"
