# -*- coding: utf-8 -*-
"""io.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1PI5VL7o_z4X9TLPTv9UNY8FtjkiUqpTp
"""
!pip install pandas matplotlib seaborn networkx shap scikit-learn plotly scipy kaleido  # kaleido is needed for plotly's static image export (fig.write_image)
"""Targeted Anxiety Intervention Analysis with Subgroup Discovery
This notebook enhances the MoE framework to incorporate subgroup discovery
techniques. It aims to identify specific subgroups within intervention groups
that show particularly strong or weak responses to the intervention. This
allows for a more targeted analysis of intervention effectiveness and
personalized insights.
Workflow:
1. Data Loading and Validation: Load synthetic anxiety intervention data, validate its structure, content, and data types. Handle potential errors gracefully.
2. Data Preprocessing: One-hot encode the group column and scale numerical features.
3. Subgroup Discovery: Implement a flexible subgroup discovery method to identify response-based subgroups.
4. SHAP Value Analysis: Quantify feature importance within discovered subgroups.
5. Data Visualization: Generate KDE, Violin, Parallel Coordinates, and Hypergraph plots, highlighting subgroups.
6. Statistical Summary: Perform bootstrap analysis and generate summary statistics for subgroups.
7. LLM Insights Report: Synthesize findings using Grok, Claude, and Grok-Enhanced, emphasizing subgroup-specific insights, validating LLM outputs, and handling potential LLM API errors.
Keywords: Subgroup Discovery, Targeted Analysis, Personalized Intervention, Anxiety, LLMs, SHAP, Data Visualization, Machine Learning
"""
# Suppress warnings (with caution - better to handle specific warnings)
import warnings
import logging # Use logging for more informative error/warning messages
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="plotly")
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import shap
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from io import StringIO
import plotly.express as px
from scipy.stats import bootstrap
from matplotlib.colors import LinearSegmentedColormap
# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Google Colab environment check
try:
    from google.colab import drive
    drive.mount("/content/drive")
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False
    logging.info("Not running in Google Colab environment.")

# Constants
OUTPUT_PATH = "./output_anxiety_subgroup_discovery/" if not COLAB_ENV else "/content/drive/MyDrive/output_anxiety_subgroup_discovery/"
PARTICIPANT_ID_COLUMN = "participant_id"
GROUP_COLUMN = "group" # Keep this for the initial loading and validation
ANXIETY_PRE_COLUMN = "anxiety_pre"
ANXIETY_POST_COLUMN = "anxiety_post"
MODEL_GROK_NAME = "grok-base"
MODEL_CLAUDE_NAME = "claude-3.7-sonnet"
MODEL_GROK_ENHANCED_NAME = "grok-enhanced"
LINE_WIDTH = 2.5
BOOTSTRAP_RESAMPLES = 500
# Placeholder API Keys (Security Warning)
GROK_API_KEY = "YOUR_GROK_API_KEY" # Placeholder
CLAUDE_API_KEY = "YOUR_CLAUDE_API_KEY" # Placeholder
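# A minimal sketch of the safer alternative (variable names are illustrative):
# read the keys from environment variables instead of committing them in source.
# GROK_API_KEY = os.environ.get("GROK_API_KEY", "")
# CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY", "")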
# --- Functions ---
def create_output_directory(path):
    """Creates the output directory if it doesn't exist, handling errors."""
    try:
        os.makedirs(path, exist_ok=True)
        logging.info(f"Output directory created/exists: {path}")
        return True
    except OSError as e:
        logging.error(f"Failed to create output directory: {path}. Error: {e}")
        return False

def load_data_from_synthetic_string(csv_string):
    """Loads data from a synthetic CSV string, handling errors."""
    try:
        csv_file = StringIO(csv_string)
        df = pd.read_csv(csv_file)
        logging.info(f"Data loaded successfully. First 5 rows:\n{df.head()}")
        logging.info(f"Data types:\n{df.dtypes}")
        return df
    except pd.errors.ParserError as e:
        logging.error(f"Error parsing CSV data: {e}")
        return None
    except Exception as e:
        logging.error(f"Error loading data: {e}")
        return None

def validate_dataframe(df, required_columns):
    """Validates the DataFrame against required columns and data types, handling errors."""
    if df is None:
        logging.error("DataFrame is None. Cannot validate.")
        return False
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logging.error(f"Missing columns: {missing_columns}")
        return False
    for col in required_columns:
        if col not in (PARTICIPANT_ID_COLUMN, GROUP_COLUMN):
            if not pd.api.types.is_numeric_dtype(df[col]):
                logging.error(f"Non-numeric values found in column: {col}")
                return False
    if df[PARTICIPANT_ID_COLUMN].duplicated().any():
        logging.error("Duplicate participant IDs found.")
        return False
    valid_groups = ["Group A", "Group B", "Control"]
    if not df[GROUP_COLUMN].isin(valid_groups).all():
        logging.error(f"Invalid group labels found. Must be one of: {valid_groups}")
        return False
    for col in [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]:
        if df[col].min() < 0 or df[col].max() > 10:
            logging.error(f"Anxiety scores in column '{col}' are out of range (0-10).")
            return False
    logging.info("DataFrame validation successful.")
    return True

def analyze_text_with_llm(text, model_name):  # Placeholder LLM analysis
    """Placeholder for LLM analysis. Replace with actual API calls."""
    text_lower = text.lower()
    logging.info(f"Calling {model_name} with text: {text[:50]}...")  # Log first 50 chars
    if model_name == MODEL_GROK_NAME:
        if "subgroup analysis" in text_lower:
            return "Grok-base: Subgroup analysis reveals varied responses to the intervention, with some subgroups showing significant improvement while others show minimal change."
        elif "shap summary" in text_lower:
            return "Grok-base: SHAP values highlight feature importance across subgroups, indicating that pre-anxiety is a strong predictor of post-anxiety in all subgroups, but group membership has varying effects."
        else:
            return f"Grok-base: General analysis on '{text}'."
    elif model_name == MODEL_CLAUDE_NAME:
        if "subgroup analysis" in text_lower:
            return "Claude 3.7: Subgroup discovery shows distinct patterns of response to the intervention, identifying groups with strong, weak, and typical responses based on pre- and post-anxiety levels."
        elif "violin plot" in text_lower:
            return "Claude 3.7: Violin plots detail subgroup distributions, clearly showing the differences in anxiety levels and variability between the identified subgroups."
        else:
            return f"Claude 3.7: Enhanced subgroup analysis on '{text}'."
    elif model_name == MODEL_GROK_ENHANCED_NAME:
        if "subgroup analysis" in text_lower:
            return "Grok-Enhanced: Subgroup analysis provides nuanced insights into targeted interventions, revealing specific characteristics of participants who respond differently to the intervention."
        elif "parallel coordinates" in text_lower:
            return "Grok-Enhanced: Parallel coordinates visualize subgroup-specific trajectories, showing how individual participants within each subgroup change from pre- to post-intervention anxiety levels."
        else:
            return f"Grok-Enhanced: In-depth subgroup-focused analysis on '{text}'."
    return f"Model '{model_name}' not supported."
def scale_data(df, columns):
    """Scales specified columns of the DataFrame using MinMaxScaler, handling errors."""
    try:
        scaler = MinMaxScaler()
        df[columns] = scaler.fit_transform(df[columns])
        logging.info(f"Data scaled successfully. Description:\n{df[columns].describe()}")
        return df
    except Exception as e:
        logging.error(f"Error scaling data: {e}")
        return None  # Return None on error

def discover_subgroups(df, encoded_group_cols, output_path):
    """Identifies subgroups based on intervention response, handling errors.

    Args:
        df: DataFrame with one-hot encoded group columns.
        encoded_group_cols: List of the one-hot encoded group column names.
        output_path: Path for output (not used here, but good practice).

    Returns:
        DataFrame with 'response_level' column, and subgroup description.
        Returns (None, error_message) on failure.
    """
    try:
        df['response_level'] = 'typical'  # Default response level
        # Construct conditions using the encoded columns
        for group_col in encoded_group_cols:
            if 'Group A' in group_col:  # Check if this encoded column represents Group A
                # Strong responders in Group A: post-anxiety is less than the *overall* mean of pre-anxiety
                df.loc[(df[group_col] == 1) & (df[ANXIETY_POST_COLUMN] < df[ANXIETY_PRE_COLUMN].mean()), 'response_level'] = 'strong'
            elif 'Group B' in group_col:  # Check if this encoded column represents Group B
                # Weak responders in Group B: post-anxiety is *greater* than the *overall* mean of pre-anxiety
                df.loc[(df[group_col] == 1) & (df[ANXIETY_POST_COLUMN] > df[ANXIETY_PRE_COLUMN].mean()), 'response_level'] = 'weak'
        subgroup_desc = (
            "Subgroups identified based on response to intervention:\n"
            "- Strong Responders (Group A, anxiety_post < mean(anxiety_pre)):\n"
            "  Participants in Group A showing a strong decrease in post-intervention anxiety.\n"
            "- Weak Responders (Group B, anxiety_post > mean(anxiety_pre)):\n"
            "  Participants in Group B showing a weak or no decrease in post-intervention anxiety.\n"
            "- Typical Responders: Participants not classified as strong or weak responders.\n"
        )
        logging.info(f"Subgroup Discovery Placeholder Output:\n{subgroup_desc}")
        logging.info(f"Response level value counts:\n{df['response_level'].value_counts()}")
        return df, subgroup_desc
    except Exception as e:
        logging.error(f"Error during subgroup discovery: {e}")
        return None, str(e)
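
# An illustrative alternative (not what the function above uses): classify by each
# participant's own pre-to-post change rather than the overall pre-anxiety mean.
# The function name and threshold value are assumptions for this sketch.
#
# def discover_subgroups_by_change(df, threshold=0.2):
#     change = df[ANXIETY_PRE_COLUMN] - df[ANXIETY_POST_COLUMN]
#     df['response_level'] = 'typical'
#     df.loc[change >= threshold, 'response_level'] = 'strong'
#     df.loc[change <= 0, 'response_level'] = 'weak'
#     return df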
def calculate_shap_values(df, feature_columns, target_column, output_path):
    """Calculates SHAP values using a RandomForestRegressor, handling errors."""
    try:
        model_rf = RandomForestRegressor(random_state=42).fit(df[feature_columns], df[target_column])  # Added random_state
        explainer = shap.TreeExplainer(model_rf)
        shap_values = explainer.shap_values(df[feature_columns])
        plt.figure(figsize=(10, 8))
        plt.style.use('dark_background')
        shap.summary_plot(shap_values, df[feature_columns], show=False, color_bar=True)
        plt.savefig(os.path.join(output_path, 'shap_summary.png'))
        plt.close()
        logging.info(f"SHAP summary plot saved to {output_path}")
        return f"SHAP summary for features {feature_columns} predicting {target_column}"
    except Exception as e:
        logging.error(f"Error calculating SHAP values: {e}")
        return "Error calculating SHAP values."

def create_kde_plot(df, column1, column2, output_path, colors):
    """Creates a KDE plot of two columns, handling errors."""
    try:
        plt.figure(figsize=(10, 6))
        plt.style.use('dark_background')
        sns.kdeplot(
            data=df[column1], color=colors[0], label=column1.capitalize(), linewidth=LINE_WIDTH
        )
        sns.kdeplot(
            data=df[column2], color=colors[1], label=column2.capitalize(), linewidth=LINE_WIDTH
        )
        plt.title("KDE Plot of Anxiety Levels", fontsize=16, color="white")
        plt.legend(facecolor="black", edgecolor="white", labelcolor="white")
        plt.grid(alpha=0.2, linestyle='--')
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, "kde_plot.png"))
        plt.close()
        return f"KDE plot visualizing distributions of {column1} and {column2}"
    except Exception as e:
        logging.error(f"Error creating KDE plot: {e}")
        return "Error creating KDE plot."

def create_violin_plot(df, group_column, y_column, output_path, colors):
    """Creates a violin plot, handling errors."""
    try:
        plt.figure(figsize=(10, 6))
        plt.style.use('dark_background')
        # Assign x to hue as well: newer seaborn deprecates `palette` without `hue`.
        sns.violinplot(data=df, x=group_column, y=y_column, hue=group_column,
                       palette=colors, legend=False, linewidth=LINE_WIDTH)
        plt.title('Violin Plot of Anxiety Distribution by Group', color='white')
        plt.savefig(os.path.join(output_path, 'violin_plot.png'))
        plt.close()
        logging.info(f"Violin plot saved to {output_path}")
        return f"Violin plot showing {y_column} across {group_column}"
    except Exception as e:
        logging.error(f"Error creating violin plot: {e}")
        return "Error creating violin plot."

def create_parallel_coordinates_plot(df, group_column, anxiety_pre_column, anxiety_post_column, output_path, colors):
    """Creates a parallel coordinates plot, handling errors."""
    try:
        # Deduplicate the column list: group_column may itself be 'response_level',
        # and selecting duplicate labels would break the .map() call below.
        plot_cols = list(dict.fromkeys([group_column, anxiety_pre_column, anxiety_post_column, 'response_level']))
        plot_df = df[plot_cols].copy()
        # px.parallel_coordinates requires a numeric color column, so map each
        # response level to an integer code rather than a hex color string.
        # (The `colors` argument is kept for interface consistency; the plot
        # itself uses a continuous color scale.)
        unique_levels = plot_df['response_level'].unique()
        level_codes = {level: i for i, level in enumerate(unique_levels)}
        plot_df['response_code'] = plot_df['response_level'].map(level_codes)
        fig = px.parallel_coordinates(
            plot_df,
            color='response_code',
            dimensions=[anxiety_pre_column, anxiety_post_column],
            title="Anxiety Levels: Pre- vs Post-Intervention by Response Subgroup",
            color_continuous_scale=px.colors.sequential.Viridis
        )
        fig.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white', title_font_size=16)
        fig.write_image(os.path.join(output_path, 'parallel_coordinates_plot_subgroups.png'))  # Requires kaleido
        logging.info(f"Parallel coordinates plot saved to {output_path}")
        return "Parallel coordinates plot of anxiety pre vs post intervention by response subgroup"
    except Exception as e:
        logging.error(f"Error creating parallel coordinates plot: {e}")
        return "Error creating parallel coordinates plot."

def visualize_hypergraph(df, anxiety_pre_column, anxiety_post_column, output_path, colors):
    """Visualizes a hypergraph, handling errors."""
    try:
        G = nx.Graph()
        participant_ids = df[PARTICIPANT_ID_COLUMN].tolist()
        G.add_nodes_from(participant_ids, bipartite=0)
        feature_sets = {
            "anxiety_pre": df[PARTICIPANT_ID_COLUMN][df[anxiety_pre_column] > df[anxiety_pre_column].mean()].tolist(),
            "anxiety_post": df[PARTICIPANT_ID_COLUMN][df[anxiety_post_column] > df[anxiety_post_column].mean()].tolist(),
            "strong_response": df[PARTICIPANT_ID_COLUMN][df['response_level'] == 'strong'].tolist()
        }
        feature_nodes = list(feature_sets.keys())
        G.add_nodes_from(feature_nodes, bipartite=1)
        for feature, participants in feature_sets.items():
            for participant in participants:
                G.add_edge(participant, feature)
        pos = nx.bipartite_layout(G, participant_ids)
        color_map = [colors[0] if node in participant_ids else colors[1] for node in G]
        plt.figure(figsize=(12, 10))
        plt.style.use('dark_background')
        nx.draw(G, pos, with_labels=True, node_color=color_map, font_color="white", edge_color="gray",
                width=LINE_WIDTH, node_size=700, font_size=10)
        plt.title("Hypergraph Representation of Anxiety Patterns with Subgroups", color="white")
        plt.savefig(os.path.join(output_path, "hypergraph_subgroups.png"))
        plt.close()
        logging.info(f"Hypergraph saved to {output_path}")
        return "Hypergraph visualizing participant relationships, highlighting response subgroups"
    except Exception as e:
        logging.error(f"Error creating hypergraph: {e}")
        return "Error creating hypergraph."

def perform_bootstrap(data, statistic, n_resamples=BOOTSTRAP_RESAMPLES):
    """Performs bootstrap analysis, handling errors."""
    try:
        bootstrap_result = bootstrap((data,), statistic, n_resamples=n_resamples, method='percentile', random_state=42)  # Added random_state
        logging.info(f"Bootstrap CI: {bootstrap_result.confidence_interval}")
        return bootstrap_result.confidence_interval
    except Exception as e:
        logging.error(f"Error performing bootstrap: {e}")
        return None
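
# Illustrative usage with a different statistic (values are assumptions):
# median_ci = perform_bootstrap(df[ANXIETY_POST_COLUMN], np.median, n_resamples=1000)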
def save_summary(df, bootstrap_ci, output_path):
    """Saves summary statistics, handling errors."""
    try:
        summary_text = (
            df.describe().to_string() +
            f"\nBootstrap CI for anxiety_post mean (all participants): {bootstrap_ci}\n\n"
            f"Summary by Response Subgroup:\n"
            f"{df.groupby('response_level')[[ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]].describe().to_string()}"
        )
        with open(os.path.join(output_path, 'summary.txt'), 'w') as f:
            f.write(summary_text)
        logging.info(f"Summary statistics saved to {output_path}")
        return summary_text
    except Exception as e:
        logging.error(f"Error saving summary: {e}")
        return "Error saving summary."

def generate_insights_report(summary_stats_text, subgroup_desc, shap_analysis_info, kde_plot_desc, violin_plot_desc, parallel_coords_desc, hypergraph_desc, output_path):
    """Generates a combined insights report using (simulated) LLM calls."""
    try:
        grok_insights = (
            analyze_text_with_llm(f"Analyze summary statistics including subgroup analysis:\n{summary_stats_text}",
                                  MODEL_GROK_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret SHAP summary for subgroups: {shap_analysis_info}", MODEL_GROK_NAME) + "\n\n" +
            analyze_text_with_llm(f"Describe the identified subgroups: {subgroup_desc}", MODEL_GROK_NAME) + "\n\n"
        )
        claude_insights = (
            analyze_text_with_llm(f"Interpret KDE plot for subgroups: {kde_plot_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret Violin plot for subgroups: {violin_plot_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret Parallel Coordinates Plot for subgroups: {parallel_coords_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret Hypergraph highlighting subgroups: {hypergraph_desc}", MODEL_CLAUDE_NAME) + "\n\n"
        )
        grok_enhanced_insights = analyze_text_with_llm(
            "Provide enhanced insights on anxiety intervention effectiveness based on subgroup analysis, SHAP, and Parallel Coordinates, focusing on differences between subgroups.",
            MODEL_GROK_ENHANCED_NAME)
        combined_insights = f"""
Combined Insights Report: Anxiety Intervention Analysis with Subgroup Discovery

Grok-base Analysis:
{grok_insights}

Claude 3.7 Sonnet Analysis:
{claude_insights}

Grok-Enhanced Analysis (Subgroup Focused):
{grok_enhanced_insights}

Synthesized Summary:
This report synthesizes insights from Grok-base, Claude 3.7 Sonnet, and Grok-Enhanced, focusing on subgroup discovery to refine the analysis of anxiety intervention effectiveness. Grok-base provides a statistical overview, initial subgroup interpretations, and feature importances across subgroups, noting the strong influence of pre-anxiety. Claude 3.7 Sonnet details visual patterns and distributions, highlighting subgroup-specific variations and the shift towards lower anxiety in the 'strong responders' subgroup. Grok-Enhanced, with a focus on subgroups, delivers nuanced interpretations and actionable recommendations tailored to different response patterns, revealing specific characteristics of participants. The combined expert analyses, enhanced by subgroup discovery, provide a targeted and personalized understanding of the anxiety intervention, enabling tailored strategies for different responder profiles. The identified subgroups ('strong responders', 'weak responders', and 'typical responders') show distinct patterns in their response to the intervention.
"""
        with open(os.path.join(output_path, 'insights.txt'), 'w') as f:
            f.write(combined_insights)
        logging.info(f"Insights report saved to {output_path}")
        return "Insights report generated successfully."
    except Exception as e:
        logging.error(f"Error generating insights report: {e}")
        return "Error generating insights report."

# --- Main Script ---
if __name__ == "__main__":
    # Create output directory
    if not create_output_directory(OUTPUT_PATH):
        exit()

    # Synthetic dataset (small, embedded in code)
    synthetic_dataset = """
participant_id,group,anxiety_pre,anxiety_post
P001,Group A,4,2
P002,Group A,3,1
P003,Group A,5,3
P004,Group B,6,5
P005,Group B,5,4
P006,Group B,7,6
P007,Control,3,3
P008,Control,4,4
P009,Control,2,2
P010,Control,5,5
"""

    # Load and validate data
    df = load_data_from_synthetic_string(synthetic_dataset)
    if df is None:
        exit()
    required_columns = [PARTICIPANT_ID_COLUMN, GROUP_COLUMN, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]
    if not validate_dataframe(df, required_columns):
        exit()

    # Keep a copy of the original dataframe for visualizations
    df_original = df.copy()

    # One-hot encode 'group' *before* subgroup discovery and scaling
    df = pd.get_dummies(df, columns=[GROUP_COLUMN], prefix=GROUP_COLUMN, drop_first=False)  # One-hot encode, keep all groups
    encoded_group_cols = [col for col in df.columns if col.startswith(f"{GROUP_COLUMN}_")]

    # Scale data
    df = scale_data(df, [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN] + encoded_group_cols)
    if df is None:
        exit()

    # Subgroup Discovery (using the encoded group columns)
    df, subgroup_desc = discover_subgroups(df, encoded_group_cols, OUTPUT_PATH)
    if df is None:
        exit()

    # SHAP analysis (using the encoded group columns)
    shap_feature_columns = encoded_group_cols + [ANXIETY_PRE_COLUMN]
    shap_analysis_info = calculate_shap_values(df.copy(), shap_feature_columns, ANXIETY_POST_COLUMN, OUTPUT_PATH)

    # Visualization colors
    neon_colors = ["#FF00FF", "#00FFFF", "#FFFF00", "#00FF00"]

    # Create visualizations
    kde_plot_desc = create_kde_plot(
        df, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors[:2]
    )  # Use scaled, encoded df
    violin_plot_desc = create_violin_plot(
        df, 'response_level', ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors
    )  # Use the new 'response_level' column
    parallel_coords_desc = create_parallel_coordinates_plot(
        df, 'response_level', ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors
    )  # Use 'response_level'
    hypergraph_desc = visualize_hypergraph(
        df, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors[:2]
    )  # Use the modified df

    # Bootstrap analysis
    bootstrap_ci = perform_bootstrap(df[ANXIETY_POST_COLUMN], np.mean)

    # Save summary statistics
    summary_stats_text = save_summary(df, bootstrap_ci, OUTPUT_PATH)

    # Generate insights report
    generate_insights_report(summary_stats_text, subgroup_desc, shap_analysis_info, kde_plot_desc, violin_plot_desc, parallel_coords_desc, hypergraph_desc, OUTPUT_PATH)

    print("Execution completed successfully - Subgroup Discovery Enhanced Notebook.")