#!/bin/bash
# Protein symmetry type prediction script
# Function: Input protein sequence, compute ESM2 and Path homology features, predict symmetry type using trained model

# Resolve the directory that contains this script and make it the working
# directory, so the relative default paths used below (model checkpoint,
# config file, Python entry point) resolve no matter where the script is
# invoked from.
# NOTE: because of this cd, relative paths given on the command line are
# resolved against the script directory as well, not the caller's directory.
# CDPATH is cleared for the lookup so cd cannot print or pick another directory.
SCRIPT_DIR="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null && pwd)" || {
    echo "Error: Unable to determine the script directory" >&2
    exit 1
}
# Abort instead of silently continuing in the wrong directory.
cd -- "$SCRIPT_DIR" || {
    echo "Error: Unable to change to script directory: $SCRIPT_DIR" >&2
    exit 1
}

# Warn when no conda environment is active at all (only the presence of
# CONDA_DEFAULT_ENV is tested, not its name) and let the user decide whether
# to carry on. Any answer other than a single "y"/"Y" aborts with status 1.
case "$CONDA_DEFAULT_ENV" in
    "")
        echo "Warning: No conda environment detected. Make sure to activate PHsymm environment."
        echo "You can activate it with: conda activate PHsymm"
        # Read exactly one raw character into REPLY.
        read -p "Continue anyway? (y/n) " -n 1 -r
        echo
        case "$REPLY" in
            [Yy]) ;;
            *) exit 1 ;;
        esac
        ;;
esac

# Built-in defaults; each can be replaced by the matching command-line option.
MODEL_PATH="trained_models/protein_classifier_unified/best_checkpoint_1_macro_auc_pr=0.5918.pt"
CONFIG_PATH="config/protein_classifier_config.yaml"
DEVICE="cpu"
USE_GPU=false

# Values filled in by option parsing; an empty string means "not supplied".
FASTA_FILE="" SEQUENCE=""
ESM2_DIR="" ESM2_FILE=""
OUTPUT_FILE=""

# Print the help text (synopsis, options, examples) to stdout and terminate
# the script with exit status 1.
# Globals: MODEL_PATH, CONFIG_PATH (read, shown as the current defaults)
usage() {
    # One argument per output line; printf reuses the format for each of them.
    printf '%s\n' \
        "Usage: $0 -f <fasta_file> [OPTIONS]" \
        "   or: $0 -s <sequence> [OPTIONS]" \
        "" \
        "Options:" \
        "  -f, --fasta <file>        FASTA file containing protein sequence(s) (required if -s not used)" \
        "  -s, --sequence <seq>      Protein sequence string (required if -f not used)" \
        "  -m, --model <path>        Model checkpoint path (default: $MODEL_PATH)" \
        "  -c, --config <path>       Config file path (default: $CONFIG_PATH)" \
        "  -e, --esm2-dir <dir>      Directory containing ESM2 feature files (optional)" \
        "  -f2, --esm2-file <file>   Path to ESM2 feature CSV file (optional, for single sequence)" \
        "  -o, --output <file>       Output CSV file for batch prediction results (optional)" \
        "  -g, --gpu                 Use GPU for computation (requires CUDA)" \
        "  -h, --help                Show this help message" \
        "" \
        "Examples:" \
        "  # Predict from FASTA file (recommended)" \
        "  $0 -f protein.fasta" \
        "  $0 -f protein.fasta --output results.csv" \
        "  $0 -f protein.fasta --gpu" \
        "" \
        "  # Predict from sequence string" \
        "  $0 -s 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL'" \
        "  $0 -s 'MKTAYIAKQR' --esm2-file esm2_features.csv"
    exit 1
}

# Called with no arguments at all: show the help text (usage exits).
(( $# > 0 )) || usage

# Parse command-line options into the global option variables.
# Globals written: FASTA_FILE, SEQUENCE, MODEL_PATH, CONFIG_PATH, ESM2_DIR,
#                  ESM2_FILE, OUTPUT_FILE, USE_GPU
# Arguments: the script's positional parameters ("$@")
# Every option except -g/--gpu and -h/--help takes the following word as its
# value. A value-taking option given as the last word is reported as an error;
# unknown options and -h/--help print the usage text (usage exits).
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Guard value-taking options first: `shift 2` fails and shifts nothing
        # when only one parameter is left, which would make this loop spin
        # forever on e.g. a trailing "-f".
        case "$1" in
            -f|--fasta|-s|--sequence|-m|--model|-c|--config|-e|--esm2-dir|-f2|--esm2-file|-o|--output)
                if [[ $# -lt 2 ]]; then
                    echo "Error: Option $1 requires an argument" >&2
                    usage
                    # usage already exits; this is only a safeguard so the
                    # loop can never continue without consuming a word.
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            -f|--fasta)
                FASTA_FILE="$2"
                shift 2
                ;;
            -s|--sequence)
                SEQUENCE="$2"
                shift 2
                ;;
            -m|--model)
                MODEL_PATH="$2"
                shift 2
                ;;
            -c|--config)
                CONFIG_PATH="$2"
                shift 2
                ;;
            -e|--esm2-dir)
                ESM2_DIR="$2"
                shift 2
                ;;
            -f2|--esm2-file)
                ESM2_FILE="$2"
                shift 2
                ;;
            -o|--output)
                OUTPUT_FILE="$2"
                shift 2
                ;;
            -g|--gpu)
                USE_GPU=true
                shift
                ;;
            -h|--help)
                usage
                ;;
            *)
                echo "Unknown option: $1"
                usage
                ;;
        esac
    done
}

parse_args "$@"

# Exactly one input source is accepted: a FASTA file or a literal sequence.
# Both missing and both present are reported, followed by the usage text.
if [[ -z "$FASTA_FILE" && -z "$SEQUENCE" ]]; then
    echo "Error: Either FASTA file (-f) or sequence (-s) must be provided"
    usage
elif [[ -n "$FASTA_FILE" && -n "$SEQUENCE" ]]; then
    echo "Error: Cannot specify both FASTA file and sequence. Use one or the other."
    usage
fi

# The model checkpoint must exist as a regular file.
[[ -f "$MODEL_PATH" ]] || {
    echo "Error: Model file not found: $MODEL_PATH"
    exit 1
}

# So must the configuration file.
[[ -f "$CONFIG_PATH" ]] || {
    echo "Error: Config file not found: $CONFIG_PATH"
    exit 1
}

# Choose the compute device. "cuda" is selected only when the GPU was
# requested and an nvidia-smi executable can be found; a GPU request without
# nvidia-smi falls back to "cpu" with a warning. Without a GPU request DEVICE
# is left untouched.
if [[ "$USE_GPU" == true ]]; then
    if command -v nvidia-smi > /dev/null 2>&1; then
        DEVICE="cuda"
    else
        echo "Warning: GPU requested but nvidia-smi not found. Falling back to CPU."
        DEVICE="cpu"
    fi
fi

# Environment for the Python process; these settings are intended to avoid
# MKL errors and segmentation faults. MKL and OpenMP are limited to one thread.
MKL_SERVICE_FORCE_INTEL=1
KMP_DUPLICATE_LIB_OK=TRUE
MKL_NUM_THREADS=1
OMP_NUM_THREADS=1
export MKL_SERVICE_FORCE_INTEL KMP_DUPLICATE_LIB_OK MKL_NUM_THREADS OMP_NUM_THREADS

# Build the Python command as an argv array. Each value is handed to Python as
# exactly one argument and is never re-parsed by the shell, so paths or
# sequences containing spaces, quotes, `$`, backticks, globs or `;` stay
# literal (the previous string + eval approach executed such text as shell
# code).
PYTHON_CMD=(python predict_single_protein.py)

# Exactly one input source is forwarded; the FASTA file takes precedence.
if [[ -n "$FASTA_FILE" ]]; then
    PYTHON_CMD+=(--fasta "$FASTA_FILE")
elif [[ -n "$SEQUENCE" ]]; then
    PYTHON_CMD+=(--sequence "$SEQUENCE")
fi

PYTHON_CMD+=(--model "$MODEL_PATH")
PYTHON_CMD+=(--config "$CONFIG_PATH")
PYTHON_CMD+=(--device "$DEVICE")

# Optional arguments are only passed when they were supplied.
if [[ -n "$ESM2_DIR" ]]; then
    PYTHON_CMD+=(--esm2_dir "$ESM2_DIR")
fi

if [[ -n "$ESM2_FILE" ]]; then
    PYTHON_CMD+=(--esm2_file "$ESM2_FILE")
fi

if [[ -n "$OUTPUT_FILE" ]]; then
    PYTHON_CMD+=(--output "$OUTPUT_FILE")
fi

# Print a summary of the run configuration
echo "=========================================="
echo "Protein Symmetry Type Prediction"
echo "=========================================="
if [[ -n "$FASTA_FILE" ]]; then
    echo "FASTA file: $FASTA_FILE"
else
    # Only the length is shown; the sequence itself may be very long.
    echo "Sequence length: ${#SEQUENCE}"
fi
echo "Model: $MODEL_PATH"
echo "Config: $CONFIG_PATH"
echo "Device: $DEVICE"
if [[ -n "$ESM2_DIR" ]]; then
    echo "ESM2 directory: $ESM2_DIR"
fi
if [[ -n "$ESM2_FILE" ]]; then
    echo "ESM2 file: $ESM2_FILE"
fi
if [[ -n "$OUTPUT_FILE" ]]; then
    echo "Output file: $OUTPUT_FILE"
fi
echo "=========================================="
echo ""

# Execute the prediction directly from the array (no eval) and report the
# outcome; a failing Python process makes this script exit with status 1.
if "${PYTHON_CMD[@]}"; then
    echo ""
    echo "Prediction completed successfully!"
else
    echo ""
    echo "Error: Prediction failed. Please check the error messages above."
    exit 1
fi
