Source code for OCD_modeling.mcmc.inference_analysis

### Analyse inferred simulations w.r.t. observed data
##  Author: Sebastien Naze
#   QIMR 2023

import argparse
from datetime import datetime 
from concurrent.futures import ProcessPoolExecutor
import copy
import itertools
from joblib import Parallel, delayed
import matplotlib as mpl
from matplotlib import pyplot as plt
import multiprocessing
import numpy as np
import os 
import pandas as pd
import pickle
import pingouin as pg
import pyabc
import scipy
import seaborn as sbn
import sklearn
from sklearn.model_selection import KFold, RepeatedKFold, ShuffleSplit, LeaveOneOut
from sklearn.metrics import make_scorer, r2_score
from sklearn.inspection import permutation_importance
import sqlite3 as sl
from statsmodels.stats.weightstats import ztest

# import most relevant environment and project variable
from OCD_modeling.utils.utils import proj_dir, today, rmse, emd, cohen_d, paired_euclidian
from OCD_modeling.mcmc.history_analysis import import_results, compute_kdes
from OCD_modeling.analysis.fc_data_analysis import drop_single_session

# mapping of parameter names from numbered to lettered indices (e.g. C_13 to C_OA)
param_mapping = {4: {'C_12':'C_OL', 'C_13':'C_OA', 'C_21':'C_LO', 'C_24':'C_LP', 'C_31':'C_AO', 'C_34':'C_AP', 'C_42':'C_PL', 'C_43':'C_PA',
                 'eta_C_13':'eta_OA', 'eta_C_24':'eta_LP', 'sigma_C_13':'sigma_OA', 'sigma_C_24':'sigma_LP', 'sigma':'sigma', 'G':'G',
                 'patients':'patients', 'controls':'controls'},
                 6: {'C_12':'C_OL', 'C_15':'C_OD', 'C_16':'C_OV', 'C_21':'C_LO', 'C_25':'C_LD', 'C_26':'C_LV', 'C_31':'C_AO', 'C_32':'C_AL', 'C_34':'C_AP', 'C_41':'C_PO', 'C_42':'C_PL', 'C_43':'C_PA',
                 'C_53':'C_DA', 'C_54':'C_DP', 'C_63':'C_VA', 'C_64':'C_VP', 
                 'eta_C_53':'eta_DA', 'eta_C_54':'eta_DP', 'eta_C_63':'eta_VA', 'eta_C_64':'eta_VP', 'sigma_C_53':'sigma_DA', 'sigma_C_54':'sigma_DP', 'sigma_C_63':'sigma_VA', 'sigma_C_64':'sigma_VP', 'sigma':'sigma', 'G':'G',
                 'patients':'patients', 'controls':'controls'}
                }

metric = {'rmse': rmse, 'emd': emd}

def load_df_sims(args):
    """ Load infered simulations from database """
    if type(args.db_names)==list:
        dfs = []
        for db_name in args.db_names:
            with sl.connect(os.path.join(proj_dir, 'postprocessing', db_name+'.db')) as conn:
                df = pd.read_sql(''' SELECT * FROM SIMTEST ''', conn)
            conn.close()
            dfs.append(df)
        return pd.concat(dfs, ignore_index=True, copy=False)
    else:
        with sl.connect(os.path.join(proj_dir, 'postprocessing', args.db_names+'.db')) as conn:
            df = pd.read_sql(''' SELECT * FROM SIMTEST ''', conn)
        conn.close()
        return df

[docs] def load_df_data(args): """ Loads clinical FC data in pandas Dataframe """ with open(os.path.join(proj_dir, 'postprocessing', 'df_roi_corr_avg_2023.pkl'), 'rb') as f: df_data = pickle.load(f) return df_data
def load_kdes(args): """ Load Kernel Density Estimates from Optimization outcomes (see history_analysis.py) """ #fname = 'kdes_rww4D_OU_HPC_20230510_rww4D_OU_HPC_20230605_20230919.pkl' # <-- compatible scikit-learn v0.24.3 fname = 'kdes_rww4D_OU_HPC_20230510_rww4D_OU_HPC_20230605_20240417.pkl' # <-- compatible scikit-learn v1.1.3 with open(os.path.join(proj_dir, 'postprocessing', fname), 'rb') as f: kdes = pickle.load(f) return kdes def load_dist_to_FC_controls(args): """ Load distances in FC space of data (control & patients) to controls from clinical trial (PRE and POST) """ fname = os.path.join(proj_dir, 'postprocessing', 'distances_to_FC_controls_20240418.pkl') with open(fname, 'rb') as f: distances = pickle.load(f) df_fc_pre_post = distances['indiv'] df_fc_pre_post = df_fc_pre_post.reset_index(drop=True) df_fc_pre_post.index.name = None return df_fc_pre_post #-------------------------# # Digital twin analysis # #-------------------------# def get_sim_vector(sim): sim_vec = np.array(sim[pathways]) return (sim['subj'],sim_vec) def compute_sim_vecs(df_sims): """ Compute vector of each simulation in FC space in parallel """ sim_vecs = Parallel(n_jobs=args.n_jobs, verbose=10)(delayed(get_sim_vector)(sim) for i,sim in df_sims.iterrows()) return sim_vecs
[docs] def compute_distances(df_data, df_sims, ses, args): """ Compute Euclidian distances between single empirical functional connectivity (FC) and simulated FC. Parameters ---------- df_data: pandas.DataFrame Empirical FC. df_sims: pandas.DataFrame Simulated FC. ses: string Session (i.e. point in time): initial ("ses-pre") or follow-up ("ses-post") appointment. args: argparse.Namespace Extra arguments with options. Returns ------- assoc: dict Unique pairing between individual OCD subjects and simulated subjects (digital twins). """ print("Computing distances between patients and simulations") pathways = np.sort(df_data.pathway.unique()) patients = np.sort(df_data[df_data.cohort=='patients'].subj.unique()) sim_vecs = df_sims[pathways] sim_vecs = zip(df_sims['subj'], sim_vecs) def compute_dist(patient, df_data, pathways, args): """ compute distance between single patient to each simulation """ pat_vec = np.array([df_data[(df_data.subj==patient)&(df_data.pathway==p)]['corr'] for p in pathways]) sims = [] for _,sim_row in df_sims.iterrows(): sim_vec = np.array(sim_row[pathways]) sim_name = sim_row['subj'] d = np.sqrt(sum((a-b)**2 for a,b in zip(pat_vec,sim_vec)))[0] if d < args.tolerance: sims.append({'sim':sim_name, 'distance':d}) if len(sims)>0: # keep only n_assoc closest. inds = pd.DataFrame(sims).sort_values('distance').index[:args.n_assoc] sims = list(np.array(sims)[inds]) # returns list of dict for backward compatibility return sims def compute_cdist(patient, df_data, pathways, args): """ Refactored compute_dist """ pat_vec = np.array([df_data[(df_data.subj==patient)&(df_data.pathway==p)]['corr'] for p in pathways]) sim_vecs = np.array(df_sims[pathways]) sim_names = np.array(df_sims['subj']) dists = scipy.spatial.distance.cdist(pat_vec.T, sim_vecs, 'euclidean') # if any distance below threshold, save the n_assoc closests if sum(dists.squeeze()<args.tolerance)>0: tmp = pd.DataFrame({'sim':sim_names, 'distance':dists.squeeze()}) inds = tmp.sort_values('distance').index[:args.n_assoc] out = [] for i in inds: out.append(tmp.loc[i].to_dict()) return out else: return [] assoc = dict() #with ProcessPoolExecutor(max_workers=args.n_jobs, mp_context=multiprocessing.get_context('spawn')) as pool: #with ProcessPoolExecutor(max_workers=args.n_jobs) as pool: # pt_sims = pool.map(compute_dist, [(patient, sim_vecs, df_data, pathways, args) for patient in patients]) # pt_sims = list(pt_sims) pt_sims = Parallel(n_jobs=args.n_jobs, verbose=10)(delayed(compute_cdist)(patient, df_data, pathways, args) for patient in patients) for patient, sims in zip(patients, pt_sims): assoc[patient] = sims if args.save_distances: if len(args.db_names)==1: fname = os.path.join(proj_dir, 'postprocessing', args.db_names[0]+'_distances100eps'+str(int(args.tolerance*100))+"_"+ses+today()+".pkl") else: fname = os.path.join(proj_dir, 'postprocessing', 'assoc_digital_twins_distances100eps'+str(int(args.tolerance*100))+"_"+ses+today()+".pkl") with open(fname, 'wb') as f: pickle.dump(assoc, f) return assoc
def merge_data_sim_dfs(df_pat, df_sims, assoc, args): """ Create single dataframe concatenating all patients relevant variables """ dfs = [] for pat in df_pat.subj.unique(): if pat in assoc.keys(): if len(assoc[pat]): df_sim_pat = pd.DataFrame(assoc[pat]).sort_values('distance') df_sim_pat['subj'] = pat df_sim_pat = df_pat[df_pat.subj==pat].merge(df_sim_pat) # keep n-th distance entry only (can be changed to n-th closest...) for i,sim_row in df_sim_pat.iloc[:args.n_closest].iterrows(): if sim_row['distance'] < args.tolerance_plot: df_sim = df_sims[df_sims.subj==sim_row['sim']] line = sim_row.to_frame().transpose().merge(df_sim.rename(columns={'subj':'sim'}), on='sim') dfs.append(line) if args.verbose: print('{}: merged data and sim.'.format(pat)) return pd.concat(dfs, ignore_index=True)
[docs] def plot_param_behav(df_sim_pat, params=['C_12', 'C_13', 'C_24', 'C_31', 'C_34', 'C_42'], behavs=['YBOCS_Total', 'OCIR_Total', 'OBQ_Total', 'MADRS_Total', 'HAMA_Total'], args=None): """ Plot association between simulation parameters and behavioral/clinical measures """ for behav in behavs: fig = plt.figure(figsize=[4*len(params),4]) for i,param in enumerate(params): ax = plt.subplot(1,len(params),i+1) sbn.scatterplot(data=df_sim_pat, x=behav, y=param, ax=ax) r,p = scipy.stats.pearsonr(df_sim_pat[behav], df_sim_pat[param]) plt.title("r={:.2f} p={:.3f}".format(r,p)) plt.tight_layout() plt.show(block=False)
#---------------------------# # Multivariate Analysis # #---------------------------# def fix_df_sims_names(df_sims, args): """ Fix duplicate sim names in dataframe (sometimes sqlite3 locking mechanism does fail, the db is not corrupted but simulation indices may need fixing (duplicate names) """ df_sims['subj'] = df_sims.apply(lambda row: "sim{:06d}".format(int(row.name)+1), axis=1) df_sims['n_test_params'] = df_sims.test_param.apply(lambda pars_str: 0 if pars_str=='none' else len(pars_str.split(' '))) def score_pval(y, y_pred): """ scoring based on correlation p-value """ res = scipy.stats.pearsonr(y, y_pred) return res.pvalue
[docs] def cross_validation(model, X, y, args): """ Run cross validation using CV type given in argument """ # home made p-value scorer p_score = make_scorer(score_pval, greater_is_better=False) cv_types = {'RepeatedKFold': {'cv': RepeatedKFold(n_splits=args.n_splits, n_repeats=args.n_repeats), 'scoring': {'p':p_score, 'r2':make_scorer(r2_score)}}, 'KFold': {'cv': KFold(n_splits=args.n_splits), 'scoring': {'p':p_score, 'r2':make_scorer(r2_score)}}, 'ShuffleSplit': {'cv': ShuffleSplit(n_splits=args.n_splits, test_size=args.test_size), 'scoring': {'p':p_score, 'r2':make_scorer(r2_score)}}, 'LeaveOneOut': {'cv': LeaveOneOut(), 'scoring': 'neg_mean_absolute_error'} } cv_results = sklearn.model_selection.cross_validate( copy.deepcopy(model), X, y, cv=cv_types[args.cv_type]['cv'], scoring=cv_types[args.cv_type]['scoring'], return_estimator=True, return_train_score=True, return_indices=True, n_jobs=args.n_jobs, ) return cv_results
[docs] def multivariate_analysis(df_sim_pat, params=['C_12', 'C_13', 'C_24', 'C_31', 'C_34', 'C_42'], behavs=['YBOCS_Total', 'OCIR_Total', 'OBQ_Total', 'MADRS_Total', 'HAMA_Total', 'Anx_total', 'Dep_Total'], models={'Ridge':sklearn.linear_model.Ridge(alpha=0.01)}, null=False, args=None): """ Multivariate regression of parameters to predict behaviors """ output = dict() for i,behav in enumerate(behavs): output[behav] = dict() # Linear regression X = df_sim_pat[params] y = df_sim_pat[behav] for model_name,model in models.items(): # create null couplings if null: # Null based on full matrix permutation #data=np.array(df_sim_pat[params]) #data = np.random.permutation(data.ravel()).reshape(data.shape) #X = pd.DataFrame(data=data, columns=params) # Null based on column-wise permutations data=np.array([np.random.permutation(df_sim_pat[param]) for param in params]).T X = pd.DataFrame(data=data, columns=params) model.fit(X,y) y_pred = model.predict(X) mae = sklearn.metrics.median_absolute_error(y, y_pred) print("Median Abs Error (MAE) = {:.5f}".format(mae)) r2 = sklearn.metrics.r2_score(y, y_pred) # stats on prediction r,p = scipy.stats.pearsonr(y,y_pred) # cross validation cv_results = cross_validation(model, X, y, args) output[behav][model_name] = {'model':copy.deepcopy(model), 'X':X, 'y':y, 'y_pred':y_pred, 'r2':r2, 'r':r, 'p':p, 'cv_results':copy.deepcopy(cv_results)} return output
[docs] def plot_multivariate_results(multivar, models=['Ridge'], args=None): """ Display results of the multivariate analysis """ behavs = multivar.keys() for model_name in models: fig = plt.figure(figsize=[3*len(behavs),12]) gs = plt.GridSpec(4,len(behavs)) for i,behav in enumerate(behavs): y, y_pred = multivar[behav][model_name]['y'], multivar[behav][model_name]['y_pred'] r2, r, p = multivar[behav][model_name]['r2'], multivar[behav][model_name]['r'], multivar[behav][model_name]['p'] model = multivar[behav][model_name]['model'] X = multivar[behav][model_name]['X'] # PLOTTING # -------- # linear regression ax = fig.add_subplot(gs[0,i]) ax.scatter(y, y_pred, alpha=0.3, color='k') ax.set_xlabel(behav) ax.set_ylabel("Prediction") ax.set_title("r2={:.2f} r={:.2} p={:.3}".format(r2,r,p)) # coefficients ax = fig.add_subplot(gs[1,i]) coefs = pd.DataFrame(data=[model.coef_], columns=model.feature_names_in_) sbn.barplot(data=coefs, orient='h', alpha=0.5, ax=ax) plt.title('coefs') # normalized coefficients ax = fig.add_subplot(gs[2,i]) coefs = pd.DataFrame(data=[model.coef_ * np.std(X)], columns=model.feature_names_in_) sbn.barplot(data=coefs, orient='h', alpha=0.5, ax=ax) plt.title('normalized coefs') if model_name=='Ridge': var_coefs = pd.DataFrame( [ est.coef_ for est in multivar[behav][model_name]['cv_results']['estimator'] ], columns=model.feature_names_in_ ) # coefficients' variability ax = fig.add_subplot(gs[3,i]) sbn.stripplot(data=var_coefs, orient="h", palette="dark:k", alpha=0.5) sbn.boxplot(data=var_coefs, orient="h", saturation=0.7, whis=10) plt.axvline(x=0, color=".5") plt.xlabel("Coefficients") plt.title("Coefficients' variability") plt.tight_layout() plt.show()
[docs] def plot_cv_regression(multivar, df_sim_pat, args=None): """ Scatter plots of cross-validated regression """ behavs = multivar.keys() params = multivar['YBOCS_Total']['Ridge']['model'].feature_names_in_ for behav in behavs: train_ids = multivar[behav]['Ridge']['cv_results']['indices']['train'] test_ids = multivar[behav]['Ridge']['cv_results']['indices']['test'] # Plot training and test in each fold n_folds = len(train_ids) n_rows = int(np.ceil(n_folds/5)) plt.figure(figsize=[15,n_rows*3]) tests = {'y':[], 'y_pred':[]} for i,train_idx in enumerate(train_ids): test_idx = test_ids[i] est = multivar[behav]['Ridge']['cv_results']['estimator'][i] X_train = df_sim_pat.iloc[train_idx][params] X_test = df_sim_pat.iloc[test_idx][params] y_train = df_sim_pat.iloc[train_idx][behav] y_test = df_sim_pat.iloc[test_idx][behav] train_pred = est.predict(X_train) test_pred = est.predict(X_test) tests['y'].append(y_test) tests['y_pred'].append(test_pred) plt.subplot(n_rows,5,i+1) plt.scatter(y_train, train_pred, label='train') plt.scatter(y_test, test_pred, label='test') plt.xlabel(behav+' ground truth') plt.ylabel(behav+' prediction') r2 = sklearn.metrics.r2_score(y_train, train_pred) r,p = scipy.stats.pearsonr(y_train, train_pred) plt.title("training\nr2={:.2f} r={:.2f} p={:.3f}".format(r2,r,p)) plt.tight_layout() plt.show(block=False) # Plot all tests aggregated over folds plt.figure(figsize=[4,3]) y = np.array(tests['y']).ravel().squeeze() y_pred = np.array(tests['y_pred']).ravel().squeeze() r, p = scipy.stats.pearsonr(y, y_pred) r2 = sklearn.metrics.r2_score(y, y_pred) plt.scatter(y, y_pred, color='orange') plt.xlabel(behav+' ground truth') plt.ylabel(behav+' prediction') plt.title("testing\nr2={:.2f} r={:.2f} p={:.3f}".format(r2,r,p)); plt.tight_layout() plt.show(block=False)
def plot_multivar_svd(multivar, behavs=None, args=None): """ dimensionality reduction on linear regression parameters/coefficients """ behavs = list(multivar.keys()) if behavs==None else behavs coefs = [] for behav in behavs: coefs.append(multivar[behav]['Ridge']['model'].coef_) params = multivar[behav]['Ridge']['model'].feature_names_in_ U,S,V = scipy.linalg.svd(np.array(coefs)) # normalized eigenvalues plt.figure(figsize=[4,3]) plt.bar(np.arange(len(S))+1, S/np.sum(S)) plt.ylim([0,1]) plt.xlabel('SV') plt.ylabel('contribution') plt.show() # eigenvectors plt.figure(figsize=[21,3]) for i in range(len(S)): plt.subplot(1,len(coefs),i+1) plt.barh(-np.arange(len(params)),V[i,:]) plt.yticks(-np.arange(len(params)), params) plt.title('SV'+str(i+1)) plt.tight_layout() plt.show() # low dim projections fig = plt.figure(figsize=[12,4]) # 2-d ax = fig.add_subplot(1,3,1) for i,(x,y) in enumerate((coefs@V)[:,:2]): ax.scatter(x,y, label=behavs[i]) plt.xlabel('SV1') plt.ylabel('SV2') plt.legend(behavs) ax = fig.add_subplot(1,3,2) for i,(x,y) in enumerate((coefs@V)[:,1:3]): ax.scatter(x,y, label=behavs[i]) plt.xlabel('SV2') plt.ylabel('SV3') plt.legend(behavs) ax = fig.add_subplot(1,3,3) for i,(x,y) in enumerate((coefs@V)[:,[0,2]]): ax.scatter(x,y, label=behavs[i]) plt.xlabel('SV1') plt.ylabel('SV3') plt.legend(behavs) plt.tight_layout() plt.show() # 3-d fig = plt.figure(figsize=[8,4]) ax = fig.add_subplot(projection='3d') for i,(x,y,z) in enumerate((coefs@V)[:,:3]): ax.scatter(x,y,z, label=behavs[i]) plt.xlabel('SV1') plt.ylabel('SV2') ax.set_zlabel('SV3') plt.legend(behavs) plt.tight_layout() plt.show()
[docs] def create_df_null(multivar, multivar_null): """ create pandas dataframe of regression coefficients w.r.t. null distribution """ behavs = multivar.keys() for behav in behavs: params = multivar[behav]['Ridge']['model'].feature_names_in_ nulls = [] for model_name in multivar_null[behav].keys(): model = multivar_null[behav][model_name]['model'] nulls.append(dict((f,v) for f,v in zip(model.feature_names_in_, model.coef_))) df_null = pd.DataFrame(nulls) df_null['null'] = True # create df of regression coefficients from full dataset (index=0) and all CV folds (index=1:) coefs = [dict((f,v) for f,v in zip(multivar[behav]['Ridge']['model'].feature_names_in_, multivar[behav]['Ridge']['model'].coef_))] for est in multivar[behav]['Ridge']['cv_results']['estimator']: coefs.append(dict((f,v) for f,v in zip(multivar[behav]['Ridge']['model'].feature_names_in_, est.coef_))) df_sim = pd.DataFrame(coefs) df_sim['null'] = False df = pd.concat([df_null, df_sim], ignore_index=True) multivar[behav]['Ridge']['df_null'] = df
def get_param_stats_title(df, params, behav): """ stats on null distribs with two samples """ ttl = behav+"\n" for par in params: x = df[~df.null].iloc[1:][par] y = df[df.null][par] x_norm, y_norm = scipy.stats.normaltest(x), scipy.stats.normaltest(y) utest = scipy.stats.mannwhitneyu(x,y) U, p = utest.statistic, utest.pvalue ttl += "U={}, p={:.4f} ".format(int(U), p) return ttl
[docs] def plot_null_distrib(multivar, args=None): """ plot null distributions of regression coefficients and stats """ plt.rcParams['svg.fonttype'] = 'none' plt.rcParams.update({'font.size':11, 'axes.titlesize':'medium', 'mathtext.default': 'regular'}) behavs = list(multivar.keys()) params = multivar[behavs[0]]['Ridge']['model'].feature_names_in_ ## COEFFICIENTS FROM FULL REGRESSION # first figure: individual axes for each params + stats for j,behav in enumerate(behavs): fig = plt.figure(figsize=[2*len(params),3]) df = multivar[behav]['Ridge']['df_null'] for i,par in enumerate(params): ax = fig.add_subplot(1, len(params), i+1) n, mu, sigma = len(df[df.null][par]), df[df.null][par].mean(), df[df.null][par].std() t = (float(df[~df.null].iloc[0][par]) - mu) / sigma p = scipy.stats.t.sf(np.abs(t), n-1) sbn.swarmplot(df[df.null][par], ax=ax, color='black', alpha=0.1, size=1) sbn.swarmplot([df[~df.null].iloc[0][par]], ax=ax, color='red', alpha=1) plt.title("t={:.2f} p={:.3f}".format(t,p)) plt.suptitle(behav) plt.tight_layout() if args.save_figs: fname = os.path.join(proj_dir, 'img', 'null_plots_indiv_stats_'+behav+today()+'.svg') plt.savefig(fname) if args.plot_figs: plt.show(block=False) else: plt.close() if args.plot_cv_null_distrib: # second figure: single axe for all params, no stats fig = plt.figure(figsize=[15,5*len(behavs)]) for j,behav in enumerate(behavs): df = multivar[behav]['Ridge']['df_null'] ax = fig.add_subplot(len(behavs), 1, j+1) sbn.swarmplot(df.melt(id_vars='null'), x='variable', y='value', hue='null', dodge=True, palette=['black', 'gainsboro'], ax=ax) ax.get_legend().set_visible(False) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ttl = get_param_stats_title(df, params, behav) plt.title(ttl) plt.tight_layout() if args.save_figs: fname = os.path.join(proj_dir, 'img', 'null_plots_all_behavs'+today()+'.svg') plt.savefig(fname) if args.plot_figs: plt.show(block=False) else: plt.close() plt.show(block=False)
def plot_behavs_distrib(df_sim_pat, behavs): """ Plot behavioral score distribution with median line""" plt.figure(figsize=[20,4]) for i,behav in enumerate(behavs): plt.subplot(1,len(behavs),i+1) plt.hist(df_sim_pat[behav], bins=20, alpha=0.5) counts, bins = np.histogram(df_sim_pat[behav], bins=20) plt.vlines(np.median(df_sim_pat[behav]), min(counts), max(counts), color='black') plt.xlabel(behav) plt.tight_layout() plt.show() def discretize_behavs(df_sim_pat, behavs): """ """ discretize = lambda x: 0 if x else 1 behavs_discrete = [] for behav in behavs: new_behav = behav[:-5]+'_discrete' df_sim_pat[new_behav] = [discretize(v) for v in df_sim_pat[behav]<df_sim_pat[behav].median()] behavs_discrete.append(new_behav) return behavs_discrete #--------------------------# # RESTORATION ANALYSIS # #--------------------------#
[docs] def get_df_base(args): """ Import simulations from infered parameters for controls and patients without restoration """ if args.use_optim_params: fname = 'sim_base_optim_20240326.db' else: fname = 'sim_base_20240320.db' #with sl.connect(os.path.join(proj_dir, 'postprocessing', 'sim_test_20230614.db')) as conn: with sl.connect(os.path.join(proj_dir, 'postprocessing', fname)) as conn: #df = pd.read_sql("SELECT * FROM SIMTEST WHERE test_param='None'", conn) df = pd.read_sql("SELECT * FROM SIMTEST", conn) df.test_param = 'None' conn.close() return df
def get_df_other_base(args): """ Import simulations from infered parameters for controls and patients without restoration (2nd batch to be able to compare cntrols to controls and patients to patients with same n) """ if args.use_optim_params: fname = 'sim_base_optim_20240327.db' else: fname = 'sim_base_20240321.db' #with sl.connect(os.path.join(proj_dir, 'postprocessing', 'sim_test_20230614.db')) as conn: with sl.connect(os.path.join(proj_dir, 'postprocessing', fname)) as conn: #df = pd.read_sql("SELECT * FROM SIMTEST WHERE test_param='None'", conn) df = pd.read_sql("SELECT * FROM SIMTEST", conn) df.test_param = 'None' conn.close() return df def get_df_3rd_base(args): """ Import simulations from infered parameters for controls and patients without restoration (2nd batch to be able to compare cntrols to controls and patients to patients with same n) """ fname = 'sim_base_optim_20240328.db' with sl.connect(os.path.join(proj_dir, 'postprocessing', fname)) as conn: df = pd.read_sql("SELECT * FROM SIMTEST", conn) df.test_param = 'None' conn.close() return df
[docs] def fix_df_base(df_base): """ fix duplicate entries in df_base """ pathways = df_base.pathway.unique() def iter_df(): inds = list(itertools.islice(range(len(df_base)), 0, None, len(pathways))) for i in inds: df_line = df_base.iloc[i].to_frame().transpose() yield df_base.iloc[i:i+len(pathways)].pivot(index='subj', columns='pathway', values='corr').reset_index().merge(df_line, on='subj') new_rows = [row for row in iter_df()] new_df = pd.concat(new_rows).reset_index().drop(columns=['index', 'level_0', 'pathway', 'corr']) new_df['subj'] = new_df.apply(lambda row: "sim-test{:06d}".format(int(row.name)+1), axis=1) return new_df
def get_restoration_suffix(args): suffix = '_'+args.distance_metric if args.use_optim_params: suffix += '_optim' suffix += '_'+args.efficacy_base return suffix def compute_rmse_restore_data(df_data, df_sims, args): """ [DEPRECATED] Compute Root Mean Squared Error between the data and batches from the simulation """ pathways = df_data.pathway.unique() n_folds = int(np.floor(len(df_sims) / args.n_sims)) RMSEs = [] for i in np.arange(0,n_folds*args.n_sims, args.n_sims): RMSE = [] for pathway in pathways: mu_data, sigma_data = df_data[df_data.pathway==pathway]['corr'].mean(), df_data[df_data.pathway==pathway]['corr'].std() mu_sims, sigma_sims = df_sims.iloc[i:i+args.n_sims][pathway].mean(), df_sims.iloc[i:i+args.n_sims][pathway].std() RMSE.append((mu_data-mu_sims)**2 + (sigma_data-sigma_sims)**2) RMSEs.append(np.sqrt(np.sum(RMSE))) return np.array(RMSEs)
[docs] def compute_distance_restore_sims(df_base, df_sims, args): """ Compute distance metric between batches of simulations in controls, patients and continuous restoration from patients to controls. Parameters ---------- df_base: pandas.DataFrame Simulated data from virtual OCD subjects (patients) and healthy controls. No virtual interventions were performed. Corresponds to simulation using OCD and healthy posteriors. df_sims: pandas.DataFrame Simulation outputs loaded from the virtual interventions database. args: argparse.Namespace Extra arguments with options. Returns ------- outputs: list of dictionanries Distances between simulated interventions and healthy controls cohorts, with normalized parameters values w.r.t original OCD parameter posterior distribution. """ max_len = np.min([len(df_sims), len(df_base[df_base.base_cohort=='controls'])]) n_folds = int(np.floor(max_len/ args.n_sims)) i_s = np.arange(0,n_folds*args.n_sims, args.n_sims) outputs = [] indices = itertools.product(i_s, i_s) df_other_base = get_df_other_base(args) if args.use_optim_params: indices = zip(i_s, i_s) for i,j in indices: # get FC distance from controls or patients FC sim = df_sims.iloc[i:i+args.n_sims] base_cons = df_base[df_base.base_cohort=='controls'].iloc[j:j+args.n_sims] base_pats = df_base[df_base.base_cohort=='patients'].iloc[j:j+args.n_sims] base_ocons = df_other_base[df_other_base.base_cohort=='controls'].iloc[i:i+args.n_sims] base_opats = df_other_base[df_other_base.base_cohort=='patients'].iloc[i:i+args.n_sims] #distance = metric[args.distance_metric](sim[args.pathways], base_cons[args.pathways]) distance_post_pre = metric[args.distance_metric](sim[args.pathways], base_pats[args.pathways]) distance_pre_hc = metric[args.distance_metric](base_cons[args.pathways], base_pats[args.pathways]) distance_post_hc = metric[args.distance_metric](sim[args.pathways], base_cons[args.pathways]) distance_hc_hc = metric[args.distance_metric](base_ocons[args.pathways], base_cons[args.pathways]) distance_pre_pre = metric[args.distance_metric](base_opats[args.pathways], base_pats[args.pathways]) # get parameter difference from patients parameters output = dict(('z_'+param, ztest(x1=sim[param], x2=base_pats[param])[0]) for param in args.params) output['dist'] = distance_post_hc output['dist_pre_hc'] = distance_pre_hc output['dist_post_pre'] = distance_post_pre output['dist_pre_pre'] = distance_pre_pre output['dist_hc_hc'] = distance_hc_hc #output['efficacy'] = distance_post_pre/distance_pre_hc if args.use_optim_params: euclidian_post_hc = paired_euclidian(sim[args.pathways], base_cons[args.pathways]) euclidian_hc_hc = paired_euclidian(base_ocons[args.pathways], base_cons[args.pathways]) euclidian_pre_hc = paired_euclidian(base_pats[args.pathways], base_cons[args.pathways]) #output['paired_tstat_T'], output['paired_tstat_pval'] = scipy.stats.ttest_rel(euclidian_post_hc, euclidian_hc_hc) #output['paired_wilcoxon'], output['paired_wilcoxon_pval'] = scipy.stats.wilcoxon(euclidian_post_hc, euclidian_hc_hc) output['paired_tstat_T'], output['paired_tstat_pval'] = scipy.stats.ttest_rel(euclidian_pre_hc, euclidian_post_hc) output['paired_wilcoxon'], output['paired_wilcoxon_pval'] = scipy.stats.wilcoxon(euclidian_pre_hc, euclidian_post_hc) outputs.append(output) return outputs
[docs] def compute_distance_restore(df_sims, args): """ Compare base simulations (from OCD subjects and healthy controls) to simulated interventions by calculating distances in parallel. Parameters ---------- df_sims: pandas.DataFrame Simulation outputs loaded from the database. args: argparse.Namespace Extra arguments with options. Returns ------- df_restore: pandas.DataFrame Copy of df_sims with efficacy measures (including cohorts first level statistics). """ # Test params are the parameters to be permuted. test_params = df_sims[df_sims.n_test_params<=args.n_test_params].test_param.unique() # df_base are the simulated using OCD subjects and helthy controls' posterior (no invervention). df_base = get_df_base(args) #df_base = fix_df_base(df_base) distance_outputs = Parallel(n_jobs=args.n_jobs, backend='loky',verbose=3)(delayed(compute_distance_restore_sims) (df_base=df_base, df_sims=df_sims[(df_sims.base_cohort=='patients') & (df_sims.test_cohort=='controls') & (df_sims.test_param==test_param)], args=args) for test_param in test_params) # Add first order statistics of the cohort's distance to base FC lines = [] for tp,outputs in zip(test_params, distance_outputs): df_test_param = pd.DataFrame(outputs) df_test_param['test_param'] = tp df_test_param['mean'] = np.mean(df_test_param.dist) df_test_param['median'] = np.median(df_test_param.dist) df_test_param['std'] = np.std(df_test_param.dist) df_test_param['tstat'], df_test_param['pval'] = scipy.stats.ttest_ind(df_test_param.dist_pre_hc, df_test_param.dist) df_test_param['ustat'], df_test_param['upval'] = scipy.stats.mannwhitneyu( np.array(df_test_param.dist_pre_hc), np.array(df_test_param.dist), alternative='greater') # non-paramteric effect size (https://aakinshin.net/posts/nonparametric-effect-size/) Q_x = np.median(np.array(df_test_param.dist_pre_hc)) n_x = np.array(df_test_param.dist_pre_hc).shape[0] Q_y = np.median(np.array(df_test_param.dist)) n_y = np.array(df_test_param.dist).shape[0] MAD_x = scipy.stats.median_abs_deviation(np.array(df_test_param.dist_pre_hc)) MAD_y = scipy.stats.median_abs_deviation(np.array(df_test_param.dist)) PMAD_xy = np.sqrt(((n_x-1)*MAD_x**2 + (n_y-1)*MAD_y**2) / (n_x + n_y -2)) df_test_param['gamma_es'] = (Q_x - Q_y) / PMAD_xy lines.append(df_test_param) df_restore = pd.concat(lines, ignore_index=True) # add a column with number of targeted parameters df_restore['n_test_params'] = df_restore.test_param.apply(lambda pars_str: len(pars_str.split(' ')) if pars_str!='None' else 0) return df_restore
def get_max_distance_data(args): """ Compute the distance between controls and patients in real data """ df_fc_pre_post = load_dist_to_FC_controls(args) df_fc_pre = df_fc_pre_post[df_fc_pre_post.ses=='ses-pre'] #pathways = np.sort(df_data.pathway.unique()) #df_tmp = df_data.pivot(index=['subj', 'cohort'], columns='pathway', values='corr').reset_index() cons,pats = df_fc_pre[df_fc_pre.cohort=='controls'],df_fc_pre[df_fc_pre.cohort=='patients'] # compute euclidian distances to mean controls con_ref = cons[args.pathways].apply(np.mean, axis=0) pat_ref = pats[args.pathways].apply(np.mean, axis=0) def dist_con(x): return np.sqrt(np.sum((np.array(x)-np.array(con_ref))**2)) def dist_pat(x): return np.sqrt(np.sum((np.array(x)-np.array(pat_ref))**2)) distances = {'con':[], 'pat':[], 'con_pat':[]} distances['con'] = cons[args.pathways].apply(dist_con, axis=1) distances['pat'] = pats[args.pathways].apply(dist_pat, axis=1) distances['con_pat'] = pats[args.pathways].apply(dist_con, axis=1) return distances def get_max_distance_sims(args): """ Compute the distance between controls and patients in simulated dataset. This function is also used to compute the null distributions. """ df_base = get_df_base(args) # 2nd base simulation to compute the patients' null df_other_base = get_df_other_base(args) #df_base = fix_df_base(df_base) #n_pathways = len(df.pathway.unique()) # 3rd base simulation to compute the controls' null df_3rd_base = get_df_3rd_base(args) cons = df_base[df_base.base_cohort=='controls'] pats = df_base[df_base.base_cohort=='patients'] o_cons = df_other_base[df_other_base.base_cohort=='controls'] o_pats = df_other_base[df_other_base.base_cohort=='patients'] oo_cons = df_3rd_base[df_3rd_base.base_cohort=='controls'] oo_pats = df_3rd_base[df_3rd_base.base_cohort=='patients'] distances = {'con':[], 'pat':[], 'con_pat':[], 'con_pat_centroid':0, 'con_con':[], 'pat_pat':[], 'pat_con':[], 'con_ocon':[], \ 'conpat':[],'conopat':[], 'oconpat':[], 'conocon':[], 'conoocon':[], 'opatpat':[], 'tstat':0, 'pval':0, 'tstat_':0, 'pval_':0, 'ustat':0, 'upval':0, 'ustat_':0, 'upval_':0, 'gamma_es':0, 'gamma_es_':0, 'paired_tstat_T_concon':[], 'paired_tstat_pval_concon':[], 'paired_tstat_T_patpat':[], 'paired_tstat_pval_patpat':[], 'paired_tstat_T_conpat':[], 'paired_tstat_pval_conpat':[]} i_s = list(itertools.islice(range(len(cons)), 0, None, args.n_sims)) # within-group distances (using same base) for i,j in itertools.combinations(i_s, 2): cons_i = cons.iloc[i:i+args.n_sims] cons_j = cons.iloc[j:j+args.n_sims] pats_i = pats.iloc[i:i+args.n_sims] pats_j = pats.iloc[j:j+args.n_sims] distances['con'].append(metric[args.distance_metric](cons_i[args.pathways], cons_j[args.pathways])) distances['pat'].append(metric[args.distance_metric](pats_i[args.pathways], pats_j[args.pathways])) # between-group distances for i,j in itertools.product(i_s, i_s): pats_i = pats.iloc[i:i+args.n_sims] cons_j = cons.iloc[j:j+args.n_sims] distances['con_pat'].append(metric[args.distance_metric](pats_i[args.pathways], cons_j[args.pathways])) pats_i = o_pats.iloc[i:i+args.n_sims] distances['pat_con'].append(metric[args.distance_metric](pats_i[args.pathways], cons_j[args.pathways])) distances['con_pat_centroid'] = metric[args.distance_metric](cons[args.pathways], pats[args.pathways]) # within-group distances (using other base) for i,j in itertools.product(i_s, i_s): cons_i = cons.iloc[i:i+args.n_sims] cons_j = o_cons.iloc[j:j+args.n_sims] distances['con_con'].append(metric[args.distance_metric](cons_i[args.pathways], cons_j[args.pathways])) cons_i = oo_cons.iloc[i:i+args.n_sims] distances['con_ocon'].append(metric[args.distance_metric](cons_i[args.pathways], cons_j[args.pathways])) pats_i = pats.iloc[i:i+args.n_sims] pats_j = o_pats.iloc[j:j+args.n_sims] distances['pat_pat'].append(metric[args.distance_metric](pats_i[args.pathways], pats_j[args.pathways])) distances['tstat'], distances['pval'] = scipy.stats.ttest_ind(np.array(distances['con_pat']), np.array(distances['con_con'])) distances['tstat_'], distances['pval_'] = scipy.stats.ttest_ind(np.array(distances['con_pat']), np.array(distances['pat_con'])) distances['ustat'], distances['upval'] = scipy.stats.mannwhitneyu(np.array(distances['con_pat']), np.array(distances['con_con']), alternative='greater') distances['ustat_'], distances['upval_'] = scipy.stats.mannwhitneyu(np.array(distances['con_pat']), np.array(distances['pat_con']), alternative='greater') # non-parametric effect size Q_x = np.median(np.array(distances['con_pat'])) n_x = np.array(np.array(distances['con_pat'])).shape[0] Q_y = np.median(np.array(distances['con_con'])) n_y = np.array(distances['con_con']).shape[0] MAD_x = scipy.stats.median_abs_deviation(np.array(distances['con_pat'])) MAD_y = scipy.stats.median_abs_deviation(np.array(distances['con_con'])) PMAD_xy = np.sqrt(((n_x-1)*MAD_x**2 + (n_y-1)*MAD_y**2) / (n_x + n_y -2)) distances['gamma_es'] = (Q_y - Q_x) / PMAD_xy Q_y = np.median(np.array(distances['pat_con'])) n_y = np.array(distances['pat_con']).shape[0] MAD_y = scipy.stats.median_abs_deviation(np.array(distances['pat_con'])) PMAD_xy = np.sqrt(((n_x-1)*MAD_x**2 + (n_y-1)*MAD_y**2) / (n_x + n_y -2)) distances['gamma_es_'] = (Q_y - Q_x) / PMAD_xy # paired distances (optim) for i,j in zip(i_s, i_s): cons_ = cons.iloc[i:i+args.n_sims] pats_ = pats.iloc[i:i+args.n_sims] ocons_ = o_cons.iloc[j:j+args.n_sims] opats_ = o_pats.iloc[j:j+args.n_sims] oocons_ = oo_cons.iloc[j:j+args.n_sims] oopats_ = oo_pats.iloc[j:j+args.n_sims] distances['conpat'].append(metric[args.distance_metric](pats_[args.pathways], cons_[args.pathways])) distances['conopat'].append(metric[args.distance_metric](opats_[args.pathways], cons_[args.pathways])) distances['oconpat'].append(metric[args.distance_metric](pats_[args.pathways], ocons_[args.pathways])) distances['conocon'].append(metric[args.distance_metric](ocons_[args.pathways], cons_[args.pathways])) distances['conoocon'].append(metric[args.distance_metric](oocons_[args.pathways], cons_[args.pathways])) distances['opatpat'].append(metric[args.distance_metric](pats_[args.pathways], opats_[args.pathways])) d_conocon = paired_euclidian(ocons_[args.pathways], cons_[args.pathways]) d_conoocon = paired_euclidian(oocons_[args.pathways], cons_[args.pathways]) T, p = scipy.stats.ttest_rel(d_conocon, d_conoocon) distances['paired_tstat_T_concon'].append(T) distances['paired_tstat_pval_concon'].append(p) d_patopat = paired_euclidian(opats_[args.pathways], pats_[args.pathways]) d_patoopat = paired_euclidian(oopats_[args.pathways], pats_[args.pathways]) T, p = scipy.stats.ttest_rel(d_patopat, d_patoopat) distances['paired_tstat_T_patpat'].append(T) distances['paired_tstat_pval_patpat'].append(p) d_conpat = paired_euclidian(pats_[args.pathways], cons_[args.pathways], ) T, p = scipy.stats.ttest_rel(d_conpat, d_patopat) distances['paired_tstat_T_conpat'].append(T) distances['paired_tstat_pval_conpat'].append(p) return distances def plot_efficacy_transform(args): """ Plot distance and efficacy distrib and KDEs""" distances = get_max_distance_sims(args) def get_KDE(values): vmin, vmax = np.min(values), np.max(values) bw = (vmax-vmin)/20 kde = sklearn.neighbors.KernelDensity(kernel='gaussian', bandwidth=bw).fit(np.array(values).reshape(-1,1)) X = np.linspace(vmin-3*bw, vmax+3*bw, 100).reshape(-1,1) pdf = kde.score_samples(X) return {'kde':kde, 'pdf':pdf, 'X':X} bias_con = np.mean(distances['con']) bias_pat = np.mean(distances['pat']) bias_con_pat = np.mean(distances['con_pat']) bias = np.mean([bias_con, bias_pat]) offset = np.mean(distances['con_pat']) scale = np.mean(distances['con_pat']) kde_con = get_KDE(1-((distances['con']-bias_con)/scale)*100) kde_pat = get_KDE(1-((distances['con_pat']-bias_con)/scale)*100) plt.figure(figsize=[10,12]) ax = plt.subplot(5,1,1) plt.hist(distances['con'], bins=np.linspace(0,0.3,30), color='lightblue', alpha=0.3, density=False) plt.hist(distances['pat'], bins=np.linspace(0,0.3,30), color='orange', alpha=0.3, density=False) plt.hist(distances['con_pat']-bias_con-bias_pat, bins=np.linspace(0,0.3,30), color='magenta', alpha=0.3, density=False) plt.vlines(np.mean(distances['con']), ymin=0, ymax=50, color='lightblue') plt.vlines(np.mean(distances['pat']), ymin=0, ymax=50, color='orange') plt.vlines(np.mean(distances['con_pat']), ymin=0, ymax=50, color='magenta') plt.xlabel('$d$', fontsize=12) plt.ylabel('counts', fontsize=12) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False) ax = plt.subplot(5,1,2) plt.hist(distances['con']-bias_con, bins=np.linspace(-0.1,0.3,40), color='lightblue', alpha=0.5, density=False) plt.hist(distances['con_pat'], bins=np.linspace(-0.1,0.3,40), color='orange', alpha=0.5, density=False) plt.xlabel('$d-\mu_{d_{XX}}$', fontsize=12) plt.ylabel('counts', fontsize=12) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False) ax = plt.subplot(5,1,3) plt.hist((distances['con']-bias_con)/scale, bins=np.linspace(-2,2,40), color='lightblue', alpha=0.5, density=False) plt.hist((distances['con_pat']-bias_con)/scale, bins=np.linspace(-2,2,40), color='orange', alpha=0.5, density=False) plt.xlabel('$ \\frac{d-\mu_{d_{XX}}}{\mu_{d_{XY}} - \mu_{d_{XX}}}$', fontsize=12) plt.ylabel('counts', fontsize=12) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False) ax = plt.subplot(5,1,4) plt.hist(1-((distances['con']-bias_con)/scale), bins=np.linspace(-2,2,40), color='lightblue', alpha=0.5, density=False) plt.hist(1-((distances['con_pat']-bias_con)/scale), bins=np.linspace(-2,2,40), color='orange', alpha=0.5, density=False) plt.xlabel('$1- \\frac{d-\mu_{d_{XX}}}{\mu_{d_{XY}} - \mu_{d_{XX}}}$', fontsize=12) plt.ylabel('counts', fontsize=12) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False) ax = plt.subplot(5,1,5) plt.plot(kde_con['X'], kde_con['pdf'], color='lightblue') plt.plot(kde_pat['X'], kde_pat['pdf'], color='orange') plt.xlabel('$E_{ff} \, (\%)$', fontsize=12) plt.ylabel('density', fontsize=12) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False) plt.tight_layout() if args.save_figs: fname = 'efficacy_trasnform'+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) fname = 'efficacy_trasnform'+today()+'.png' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show() def format_param(param, N=4): """ return LaTeX formated string of parameter (without dollar signs) """ param = param_mapping[N][param] par_ = param.split('_') # handle greek letters if (param.startswith('sigma') or param.startswith('eta')): par_[0] = r"\\"[0]+par_[0] # handle C_ if len(par_)==2: par_[1] = r'{'+par_[1]+r'}' elif len(par_)==3: par_[1] = r'{'+par_[1] par_[2] = r'{'+par_[2]+r'}}' formatted_param = '_'.join(par_) return formatted_param def format_labels(labels, N=4): new_labels = [] for label in labels: new_label = [] params = label.get_text().split(' ') for param in params: formatted_param = format_param(param, N=N) new_label.append(formatted_param) new_label = '${}$'.format(r'\quad'.join(new_label)) #label.set_text(new_label) new_labels.append(new_label) return new_labels
[docs] def compute_efficacy(df_restore, args=None): """ Add new column to input DataFrame with treatment efficacy based on distance to healthy controls in functional connectivity space. .. note:: Simulation metric is the Wasserstein distance and the data metric is the Euclidean distance. Comparison in FC space is automatically performed using the correct metric (default: Wasserstein). Parameters ---------- df_restore: pandas.DataFrame Virtual intervention simulation outputs with distance precomputed. args: argparse.Namespace Extra arguments with options. Important option in this function is `args.efficacy_base` which informs how treatment efficacy is computed (e.g. retained was "ustat") Returns ------- df_restore: pandas.DataFrame Input DataFrame with new column 'efficacy'. """ if args.efficacy_base=='sims': distances = get_max_distance_sims(args) # mu_d offset = np.mean([np.mean(distances['pat']), np.mean(distances['con'])]) # mu_d_XY #scale = np.mean(distances['con_pat']) scale = distances['con_pat_centroid'] # E_ff #df_restore['efficacy'] = (1-((df_restore['dist']-offset))/scale)*100 # make % of restoration df_restore['efficacy'] = (1-df_restore['dist']/scale)*100 # make % of restoration elif args.efficacy_base=='paired': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = (1-np.divide(df_restore.loc[ids].dist, distances['con_pat']))*100 elif args.efficacy_base=='paired_A': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = np.divide(df_restore.loc[ids].dist, distances['con_pat']) elif args.efficacy_base=='paired_B': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = np.divide(distances['con_pat'],df_restore.loc[ids].dist) elif args.efficacy_base=='paired_C': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index if args.use_optim_params: df_restore.loc[ids, 'efficacy'] = 1-np.divide(df_restore.loc[ids].dist, distances['conpat']) else: df_restore.loc[ids, 'efficacy'] = 1-np.divide(df_restore.loc[ids].dist, distances['con_pat']) elif args.efficacy_base=='paired_D': distances = get_max_distance_sims(args) # transpose indices to matched "pat" (pre) pairs instead of "con" (hc) n = int(np.sqrt(len(distances['con_pat']))) con_pat = np.array(distances['con_pat']).reshape((n,n)).T.ravel() for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = np.divide(df_restore.loc[ids].dist_post_pre, con_pat) elif args.efficacy_base=='paired_E': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = df_restore.loc[ids].dist elif args.efficacy_base=='paired_F': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index if args.use_optim_params: df_restore.loc[ids, 'efficacy'] = np.divide(distances['conpat'] - df_restore.loc[ids].dist, distances['conpat']) else: df_restore.loc[ids, 'efficacy'] = np.divide(distances['con_pat'] - df_restore.loc[ids].dist, distances['con_pat']) elif args.efficacy_base=='paired_G': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index if args.use_optim_params: df_restore.loc[ids, 'efficacy'] = np.divide(np.mean(distances['conpat']) - df_restore.loc[ids].dist, np.mean(distances['conpat'])) else: df_restore.loc[ids, 'efficacy'] = np.divide(distances['con_pat_centroid'] - df_restore.loc[ids].dist, distances['con_pat_centroid']) # using T stat elif args.efficacy_base=='paired_H': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index if args.use_optim_params: df_restore.loc[ids, 'efficacy'] = df_restore.loc[ids, 'paired_tstat_T'] elif args.efficacy_base=='tstat': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = df_restore.loc[ids, 'tstat'] elif args.efficacy_base=='ustat': distances = get_max_distance_sims(args) for test_param in df_restore.test_param.unique(): ids = df_restore[df_restore.test_param==test_param].index df_restore.loc[ids, 'efficacy'] = np.array(df_restore.loc[ids, 'ustat'])/(400*400) # AUC1 else: distances = get_max_distance_data(args) offset = np.mean([np.mean(distances['pat']), np.mean(distances['con'])]) scale = np.mean(distances['con_pat']) df_restore['efficacy'] = (1-((df_restore['dist']))/scale)*100 # make % of restoration # add a column with number of targeted parameters df_restore['n_test_params'] = df_restore.test_param.apply(lambda pars_str: len(pars_str.split(' ')) if pars_str!='None' else 0) return df_restore
def get_df_top(sub_df_restore, args): # get top parameters that restore FC top_params = dict() top_params['all'] = sub_df_restore.sort_values('mean').test_param.unique()[:args.n_restore] if args.efficacy_base in ['paired_D', 'paired_E']: top_params['by_n'] = [sub_df_restore[sub_df_restore.n_test_params==n].sort_values('median').test_param.unique()[:args.n_tops] for n in np.arange(1,args.n_test_params+1)] else: top_params['by_n'] = [] for n in np.arange(1,args.n_test_params+1): for test_param in sub_df_restore[sub_df_restore.n_test_params==n].test_param.unique(): inds = sub_df_restore[sub_df_restore.test_param==test_param].index sub_df_restore.loc[inds, 'median_Eff'] = np.median(sub_df_restore.iloc[inds]['efficacy']) top_params['by_n'].append(sub_df_restore[sub_df_restore.n_test_params==n].sort_values('median_Eff', ascending=False).test_param.unique()[:args.n_tops]) top_params['by_n'] = np.concatenate(top_params['by_n']) df_top = sub_df_restore[sub_df_restore.test_param.apply(lambda x: x in top_params[args.sort_style])] return df_top, top_params
[docs] def plot_distance_restore(df_restore, args, gs=None): """ Plot horizontal box plot of best virtual interventions outcomes, sorted by number of target points. Parameters ---------- df_restore: pandas.DataFrame Virtual intervention simulation outputs with distance and efficacy precomputed. args: (argparse.Namespace) Extra arguments with options. An important options here is args.n_tops which defines how many of the best interventions to display by number of target points. gs: matplotlib.GridSpec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. """ # adds n_test_params column to dataframe sub_df_restore = df_restore[df_restore.n_test_params<=args.n_test_params] # get top parameters that restore FC df_top, top_params = get_df_top(df_restore, args) # normalize distances to get efficacy distances = get_max_distance_sims(args) # Patients "NULLS" df_pat = pd.DataFrame({'dist': distances['con_pat'], 'test_param':'patients', 'n_test_params': 0}) n = int(np.sqrt(len(distances['con_pat']))) inds, = np.where(np.tril(np.ones((n,n)), k=-1).ravel()) if args.efficacy_base=='paired_C': if args.use_optim_params: df_pat = pd.DataFrame({'dist': distances['conpat'], 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = 1 - np.divide(np.array(distances['conopat']), np.array(distances['conpat'])) else: # get the con_pat matrice shifted so different patient to same control con_pat = np.roll(np.array(distances['con_pat']).reshape(n,n), 1, axis=0).ravel() df_pat['efficacy'] = np.divide(df_pat['dist'] - con_pat, df_pat['dist']) elif args.efficacy_base=='paired_D': df_pat = pd.DataFrame({'dist': distances['pat'], 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = np.divide(df_pat['dist'], np.array(distances['con_pat'])[inds]) elif args.efficacy_base=='paired_E': df_pat = pd.DataFrame({'dist': distances['con_pat'], 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = df_pat['dist'] elif args.efficacy_base=='paired_F': if args.use_optim_params: df_pat = pd.DataFrame({'dist': distances['conpat'], 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = np.divide(np.array(distances['conpat']) - np.array(distances['conopat']), np.array(distances['conpat'])) else: # get the con_pat matrice shifted so different patient to same control con_pat = np.roll(np.array(distances['con_pat']).reshape(n,n), 1, axis=0).ravel() #df_pat['efficacy'] = np.divide(df_pat['dist'] - con_pat, df_pat['dist']) df_pat['efficacy'] = np.divide(np.array(distances['con_pat']) - np.array(distances['pat_con']), np.array(distances['con_pat'])) elif args.efficacy_base=='paired_G': if args.use_optim_params: df_pat = pd.DataFrame({'dist': distances['conpat'], 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = np.divide(np.mean(distances['conpat']) - distances['conpat'], np.mean(distances['conpat'])) else: df_pat = pd.DataFrame({'dist': distances['con_pat'], 'test_param':'patients', 'n_test_params': 0}) # get the con_pat matrice shifted so different patient to same control df_pat['efficacy'] = np.divide(distances['con_pat_centroid'] - df_pat['dist'], distances['con_pat_centroid']) elif args.efficacy_base=='paired_H': if args.use_optim_params: df_pat = pd.DataFrame({'dist': distances['conpat'], 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = distances['paired_tstat_T_patpat'] elif args.efficacy_base=='tstat': df_pat = pd.DataFrame({'dist': np.unique(distances['tstat_']), 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = np.unique(distances['tstat_']) elif args.efficacy_base=='ustat': df_pat = pd.DataFrame({'dist': np.unique(distances['ustat_']), 'test_param':'patients', 'n_test_params': 0}) df_pat['efficacy'] = np.unique(distances['ustat_']/(400*400)) # AUC else: df_pat['test_param'] = 'None' df_pat = compute_efficacy(df_pat, args=args) df_pat['test_param'] = 'patients' # Controls' NULLS df_con = pd.DataFrame({'dist': distances['con'], 'test_param':'controls', 'n_test_params': 0}) # test_param=controls for legend label n = int(np.sqrt(len(distances['con_pat']))) inds, = np.where(np.triu(np.ones((n,n)), k=1).ravel()) if args.efficacy_base=='paired_A': df_con['efficacy'] = np.divide(df_con['dist'], np.array(distances['con_pat'])[inds]) elif args.efficacy_base=='paired_B': df_con['efficacy'] = np.divide(np.array(distances['con_pat'])[inds], df_con['dist']) elif args.efficacy_base=='paired_C': if args.use_optim_params: df_con = pd.DataFrame({'dist': distances['conpat'], 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = 1-np.divide(distances['conocon'], np.array(distances['conpat'])) else: df_con['efficacy'] = 1-np.divide(df_con['dist'], np.array(distances['con_pat'])[inds]) elif args.efficacy_base=='paired_D': df_con = pd.DataFrame({'dist': distances['con_pat'], 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = np.divide(np.roll(np.array(distances['con_pat']).reshape(n,n),1).ravel(), df_con['dist']) elif args.efficacy_base=='paired_E': df_con = pd.DataFrame({'dist': distances['conocon'], 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = distances['conocon'] elif args.efficacy_base=='paired_F': if args.use_optim_params: df_con = pd.DataFrame({'dist': distances['conocon'], 'test_param':'controls', 'n_test_params': 0}) #df_con['efficacy'] = 1 - np.array(distances['conocon']) #np.divide(np.array(distances['conpat']) - np.array(distances['conopat']), np.array(distances['conpat'])) df_con['efficacy'] = 1 - np.divide(np.array(distances['conpat']) - np.array(distances['oconpat']), np.array(distances['conpat'])) else: df_con = pd.DataFrame({'dist': distances['con_con'], 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = 1 - np.divide(np.array(distances['con_pat']) - np.array(distances['con_con']), np.array(distances['con_pat']) ) elif args.efficacy_base=='paired_G': if args.use_optim_params: df_con = pd.DataFrame({'dist': distances['conpat'], 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = np.divide(np.mean(distances['conocon']) - distances['conocon'], np.mean(distances['conocon'])) else: # get the con matrice shifted so different control to same control df_con['efficacy'] = np.divide(distances['con_pat_centroid'] - df_con['dist'], distances['con_pat_centroid']) elif args.efficacy_base=='paired_H': if args.use_optim_params: df_con = pd.DataFrame({'dist': distances['conocon'], 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = distances['paired_tstat_T_conpat'] elif args.efficacy_base=='tstat': df_con = pd.DataFrame({'dist': np.unique(distances['tstat']), 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = np.unique(distances['tstat']) # check for normality for test_param in df_top.test_param.unique(): df_ = df_top[df_top.test_param==test_param] stat,p = scipy.stats.normaltest(np.array(df_.dist)) if p<0.05: print(test_param+": d(post,hc) is not normally distributed.") elif args.efficacy_base=='ustat': df_con = pd.DataFrame({'dist': np.unique(distances['ustat']), 'test_param':'controls', 'n_test_params': 0}) df_con['efficacy'] = np.unique(distances['ustat']/(400*400)) # AUC # merge base and restore dataframes df_top = pd.concat([df_pat, df_con, df_top], ignore_index=True) top_params[args.sort_style] = np.concatenate([['patients','controls'],top_params[args.sort_style]]) # plotting palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} plt.rcParams.update({'mathtext.default': 'regular', 'font.size':10}) plt.rcParams.update({'text.usetex': False}) lw=1.5 if gs==None: if args.sort_style == 'all': fig = plt.figure(figsize=[5, int(args.n_restore/3)]) ax = plt.subplot(1,1,1) else: fig = plt.figure(figsize=[5, int(args.n_tops*args.n_test_params/4)]) ax = plt.subplot(1,1,1) else: ax = plt.subplot(gs) if 'stat' in args.efficacy_base: sbn.boxplot(data=df_top, x='efficacy', y='test_param', order=top_params[args.sort_style], hue='n_test_params', orient='h', saturation=3, width=0.5, whis=2, palette=palette, dodge=False, linewidth=lw, fliersize=2, ax=ax) #fliersize=2 else: sbn.boxplot(data=df_top, x='efficacy', y='test_param', order=top_params[args.sort_style], hue='n_test_params', orient='h', saturation=3, width=0.5, whis=2, palette=palette, dodge=False, linewidth=lw, fliersize=2, ax=ax) #fliersize=2 # data and simulated references xmin,xmax = ax.get_xlim() ymin,ymax = ax.get_ylim() # Vertical lines for visual indication if args.efficacy_base=='sims': plt.vlines(0, ymin=ymin, ymax=ymax, linestyle='dashed', color='red', alpha=0.75, linewidth=lw) plt.vlines(100, ymin=ymin, ymax=ymax, linestyle='dashed', color='blue', alpha=0.75, linewidth=lw) plt.vlines((np.std(distances['con_pat'])/np.mean(distances['con_pat_centroid']))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='red', alpha=0.5, linewidth=lw) plt.vlines((2*np.std(distances['con_pat'])/np.mean(distances['con_pat_centroid']))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='red', alpha=0.25, linewidth=lw) plt.vlines((1 - (np.std(distances['con'])/np.mean(distances['con_pat_centroid'])))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='blue', alpha=0.5, linewidth=lw) plt.vlines((1 - (2*np.std(distances['con'])/np.mean(distances['con_pat_centroid'])))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='blue', alpha=0.25, linewidth=lw) elif args.efficacy_base=='paired': plt.vlines((1 - (np.mean(distances['con_pat'])/np.mean(distances['con_pat'])))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='red', alpha=0.75, linewidth=lw) plt.vlines((1 - (np.mean(distances['con'])/np.mean(distances['con_pat'])))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='blue', alpha=0.75, linewidth=lw) plt.vlines((1 - (np.mean(distances['con_pat'])-np.std(distances['con_pat']))/np.mean(distances['con_pat']))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='red', alpha=0.5, linewidth=lw) plt.vlines((1 - (np.mean(distances['con_pat'])-2*np.std(distances['con_pat']))/np.mean(distances['con_pat']))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='red', alpha=0.25, linewidth=lw) plt.vlines((1 - (np.mean(distances['con'])+np.std(distances['con']))/np.mean(distances['con_pat']))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='blue', alpha=0.5, linewidth=lw) plt.vlines((1 - (np.mean(distances['con'])+2*np.std(distances['con']))/np.mean(distances['con_pat']))*100, ymin=ymin, ymax=ymax, linestyle='dashed', color='blue', alpha=0.25, linewidth=lw) elif args.efficacy_base=='paired_A': plt.vlines(1, ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) plt.vlines(((np.mean(distances['con_pat'])+np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.5, linewidth=lw) plt.vlines(((np.mean(distances['con_pat'])-np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.5, linewidth=lw) plt.vlines(((np.mean(distances['con_pat'])+2*np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.25, linewidth=lw) plt.vlines(((np.mean(distances['con_pat'])-2*np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.25, linewidth=lw) elif args.efficacy_base=='paired_B': plt.vlines(1, ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) plt.vlines((np.mean(distances['con_pat'])/(np.mean(distances['con_pat'])+np.std(distances['con_pat']))), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.5, linewidth=lw) plt.vlines((np.mean(distances['con_pat'])/(np.mean(distances['con_pat'])-np.std(distances['con_pat']))), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.5, linewidth=lw) plt.vlines((np.mean(distances['con_pat'])/(np.mean(distances['con_pat'])+2*np.std(distances['con_pat']))), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.25, linewidth=lw) plt.vlines((np.mean(distances['con_pat'])/(np.mean(distances['con_pat'])-2*np.std(distances['con_pat']))), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.25, linewidth=lw) elif args.efficacy_base=='paired_C': plt.vlines(0, ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) plt.vlines((1-(np.mean(distances['con_pat'])+np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.5, linewidth=lw) plt.vlines((1-(np.mean(distances['con_pat'])-np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.5, linewidth=lw) plt.vlines((1-(np.mean(distances['con_pat'])+2*np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.25, linewidth=lw) plt.vlines((1-(np.mean(distances['con_pat'])-2*np.std(distances['con_pat']))/np.mean(distances['con_pat'])), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.25, linewidth=lw) elif args.efficacy_base=='tstat': plt.vlines(5, ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) elif args.efficacy_base=='ustat': sig_ustats = df_restore[(df_restore.upval*1485<0.051) & (df_restore.upval*1485>0.049)].ustat.unique() #plt.vlines(np.abs(sig_ustats).mean()/(400*400), ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) plt.vlines(0.578, ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) else: plt.vlines(0, ymin=ymin, ymax=ymax, linestyle='dashed', color='gray', alpha=0.75, linewidth=lw) # legends and labels labels = plt.gca().get_yticklabels() new_labels = format_labels(labels) ax.set_yticklabels(new_labels) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False) ax.set_xlabel("$Efficacy \quad (E_{ff}, \\ \%)$", fontsize=10) if 'tstat' in args.efficacy_base: ax.set_xlabel("$T \, statistic$", fontsize=10) if 'ustat' in args.efficacy_base: ax.set_xlabel("AUC", fontsize=10) #ticks = ax.get_xticks() #ax.set_xticklabels(labels=["{:.1f}".format(i/100000) for i in ticks]) ax.set_ylabel("Target points", fontsize=10) sbn.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) ax.get_legend().set_visible(False) if ((gs==None) and (args.save_figs)): fname = 'restoration_best_efficacy_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname))
#plt.show(block=False)
[docs] def plot_efficacy_by_number_of_target(df_top, gs=None, args=None): """ Swarm plots of efficacy score (y-axis) by number of targets (x-axis), with means projected in log-linear scale. Parameters ---------- df_top: pandas.DataFrame Subset of df_restore with only virtual intervention resulting in positive outcomes. gs: matplotlib.GridSpec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. args: argparse.Namespace Extra arguments with options. """ palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} px = 1/plt.rcParams['figure.dpi'] # pixel in inches if gs==None: fig = plt.figure(figsize=[500*px,200*px]) gsp = plt.GridSpec(nrows=1, ncols=5) ax1 = plt.subplot(gsp[0,0:3]) ax2 = plt.subplot(gsp[0,3:]) else: ax1 = plt.subplot(gs[0,0:3]) ax2 = plt.subplot(gs[0,4]) plt.sca(ax1) plt.tight_layout() if args.use_optim_params: sbn.swarmplot(data=df_top, x='n_test_params', y='efficacy', ax=ax1, size=2, palette=palette, alpha=0.6) #else: #sbn.stripplot(data=df_top, x='n_test_params', y='efficacy', ax=ax1, size=0.5, palette=palette, alpha=0.6) sbn.boxplot(data=df_top, x='n_test_params', y='efficacy', ax=ax1, width=0.1, palette=palette, fliersize=0, linewidth=1.5, showcaps=False) ax1.spines.top.set_visible(False) ax1.spines.right.set_visible(False) plt.xlabel("$n_t$", fontsize=10) ticks = ax1.get_yticks() plt.ylabel("AUC", fontsize=10) plt.sca(ax2) plt.tight_layout() lines = [] for i,j in enumerate(np.sort(df_top.n_test_params.unique())): if i==0: mu_t0 = 0 else: mu_t0 = df_top[df_top.n_test_params==i].efficacy.mean() mu_t1 = df_top[df_top.n_test_params==j].efficacy.mean() plt.plot(np.log(j), mu_t1, 'o', color=palette[j], ms=6) lines.append({'x':np.log(j), 'y':mu_t1}) sbn.regplot(data=pd.DataFrame(lines), x='x', y='y', ci=95, color='gray', ax=ax2) ax2.spines.top.set_visible(False) ax2.spines.right.set_visible(False) ax2.set_xticks(np.log(np.arange(1,7))) ax2.set_xticklabels(np.arange(1,7)) plt.xlabel("$n_t$", fontsize=10) plt.ylabel("$\widehat{AUC}$", fontsize=10) if gs==None: plt.tight_layout() if args.save_figs: fname = 'avg_efficacy_by_n_targets_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname))
#-----------------------------# # FEATURE IMPORTANCE ANALYSIS # #-----------------------------# def get_X_y(df_restore, params): """ Extract parameter features for hierarchical clustering of test parameters in restoration analysis Parameters: ----------- df_restore: pandas DataFrame Simulation data with a test_param and a distance column to be transformed to X and y params: list Parameters of model Results: -------- X: np.array input feature vector y: np.array output vector """ # template feature set all feature params to 0 template_features = dict((par,0) for par in params) def transform_test_param(test_param): """ return a binary array of param features """ out = copy.deepcopy(template_features) for par in test_param.split(' '): out[par] = 1 return list(out.values()) X = df_restore.test_param.apply(transform_test_param) y = df_restore['efficacy'] return np.array(list(X)), np.array(y) def decision_tree(df_restore, params, args): """ create a tree from restoration output Parameters: ----------- df_restore pandas DataFrame of restoration output params list of model parameters args: argparse Namespace global arguments, used to get number of tested parameters """ # create a line in df for each number of params tested lines = [] decision_trees = [] feat_imps = dict() # feature importances for n_test_params in np.arange(args.n_test_params)+1: X,y = get_X_y(df_restore[(df_restore.n_test_params==n_test_params) & (df_restore.efficacy>0)], params) dt = sklearn.tree.DecisionTreeRegressor(max_depth=args.max_depth) dt.fit(X, y) y_pred = dt.predict(X) decision_trees.append({'dt':dt, 'y':y, 'y_pred':y_pred}) # basic feature importances line = dict((feat, imp) for feat,imp in zip(params, dt.feature_importances_)) line['n_test_params'] = n_test_params lines.append(line) # more detailed feature importances feat_imps[n_test_params] = permutation_importance(dt, X, y, scoring='neg_mean_absolute_error', n_repeats=10) df_feature_importance = pd.DataFrame(lines) return df_feature_importance, decision_trees, feat_imps def plot_decision_tree(df_dt, decision_trees, params): """ Plot a bar plot of basic feature importance and a tree diagram """ plt.rcParams.update({'font.size':12}) for i,row in df_dt.iterrows(): plt.figure(figsize=[6,4]) sbn.barplot(row[params].to_frame().transpose()) plt.xticks(rotation=30) plt.show() plt.figure(figsize=[60,20]) sklearn.tree.plot_tree(decision_trees[i]['dt'], max_depth=6, feature_names=params, proportion=True, impurity=False) plt.show() def plot_feature_importances(feat_imps, params): """ Plot feature importance using permutations (not bulit-in basic) """ fig = plt.figure(figsize=[16,6]) for i,(n_test_params, feat_imp) in enumerate(feat_imps.items()): fi = dict() for feat, imp in zip(params, feat_imp.importances_mean): fi[feat] = imp df_featimp = pd.DataFrame([fi]) ax = plt.subplot(2,3,i+1) sbn.barplot(data=df_featimp, ax=ax) plt.title(f'n_test_params = {n_test_params}') plt.xticks(rotation=30) plt.tight_layout() plt.show() def plot_dt_prediction(decision_trees): """ Plot decision tree prediction vs ground truth for sanity check """ n_dts = len(decision_trees) fig = plt.figure(figsize=[n_dts*5, 5]) for i,dt in enumerate(decision_trees): plt.subplot(1,n_dts, i+1) plt.scatter(dt['y'], dt['y_pred'], alpha=0.005, s=10) plt.xlabel('y') plt.ylabel('y_pred') r,p = scipy.stats.pearsonr(dt['y'], dt['y_pred']) r2 = r2_score(dt['y'], dt['y_pred']) plt.title('r={:.2f} p={:.3f} r2={:.3f}'.format(r,p,r2)) plt.show() def plot_feature_windrose(df_dt, params, rscale='linear', args=None): """ Make windrose vizualisation of decision tree feature importances """ theta = np.arange(len(params))/(len(params)) * 2*np.pi theta = np.append(theta, theta[0]) palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} fig, axes = plt.subplots(3,2, subplot_kw={'projection': 'polar'}, figsize=[5,8]) for i in np.arange(len(df_dt)): j = int(np.floor(i/2)) k = i%2 r = np.array(df_dt.iloc[i][params]) r = np.append(r, r[0]) for r_, theta_ in zip(r,theta): if r_<0: r__ = np.log10(np.abs(r_)) if rscale=='log' else np.abs(r_) axes[j,k].bar(theta_, r__, facecolor=palette[df_dt.iloc[i].n_test_params], lw=1, linestyle='--', width=0.5, edgecolor='black') else: r__ = np.log10(r_) if rscale=='log' else r_ axes[j,k].bar(theta_, r__, facecolor=palette[df_dt.iloc[i].n_test_params], lw=1, linestyle='-', width=0.5, edgecolor='black') axes[j,k].set_xticks(theta[:-1]) axes[j,k].set_xticklabels(params) lbls = axes[j,k].get_xticklabels() new_lbls = format_labels(lbls) axes[j,k].set_xticklabels(new_lbls) axes[j,k].set_yticks([-1,0,1,2]) axes[j,k].set_yticklabels([]) axes[j,k].spines.polar.set_visible(False) axes[j,k].xaxis.grid(linewidth=0.5, linestyle='--') plt.tight_layout() if args.save_figs: fname = 'restoration_param_importances_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show() def compute_dt_best_paths(decision_tree, args): """ Find best path and extract corresponding features Returns ------- None. The paths are added in given decision_tree dictionary """ dt = decision_tree['dt'] n_bests = 5 best_leaf = np.argsort(dt.tree_.value.squeeze() * np.array(dt.tree_.feature==-2, dtype=int))[::-1][:n_bests] depth = dt.tree_.compute_node_depths()[best_leaf] def get_parent(node_id): for node in range(dt.tree_.node_count): if dt.tree_.children_left[node]==node_id: return {node:False} elif dt.tree_.children_right[node]==node_id: return {node:True} return {} # get list of parents parents = [] for i,p in enumerate(best_leaf): branch = [] for _ in range(depth[i]): parent = get_parent(p) if parent!={}: branch.append(parent) p = list(parent.keys())[0] parents.append(branch) decision_tree['paths'] = parents decision_tree['best_leaves'] = best_leaf def get_feat_scores(decision_tree, params): """ Compute custom scores of features Returns: -------- feat_scores: dict (key,value) = (feature_name, feature_score) """ dt = decision_tree['dt'] parents = decision_tree['paths'] best_leaves = decision_tree['best_leaves'] pars = np.array(params) feat_scores = dict((param,0) for param in params) for i,branch in enumerate(parents): for node in branch: if list(node.values())[0]: node_id = list(node.keys())[0] feat_name = pars[dt.tree_.feature[node_id]] score = dt.tree_.value[best_leaves[i]].squeeze() feat_scores[feat_name] += score return feat_scores def compute_custom_feature_scores(decision_trees, params, args): lines= [] for i,decision_tree in enumerate(decision_trees): compute_dt_best_paths(decision_tree, args) feat_scores = get_feat_scores(decision_tree, params) feat_scores['n_test_params'] = i+1 lines.append(feat_scores) df_custom_feat_imps = pd.DataFrame(lines) return df_custom_feat_imps def compute_simple_feature_scores(df_top, params, args): """ Compute feature score simply based on top params weighted by efficacy """ lines = [] for n_test_params in np.arange(args.n_test_params)+1: df_ = df_top[df_top.n_test_params==n_test_params] Xs, ys = get_X_y(df_, params) feat_imp = np.zeros((len(params),)) for X,y in zip(Xs, ys): feat_imp += np.array(X*y).squeeze() line = dict((feat,imp) for feat,imp in zip(params,feat_imp)) line['n_test_params'] = n_test_params lines.append(line) df_simple_feat_imps = pd.DataFrame(lines) return df_simple_feat_imps def scale_efficacy_to_kdes(df_row, params, kdes, scaling): normalized = dict() for param in params: #val = (df_row[param] - kdes['patients'][param]['vals'].mean()) / kdes['patients'][param]['vals'].std() if scaling=='contribution': #normalized[param] = df_row['efficacy']*val normalized[param] = df_row['efficacy']*df_row['z_'+param] else: # sensitivity #normalized[param] = df_row['efficacy']/val normalized[param] = df_row['efficacy']/df_row['z_'+param] return normalized
[docs] def compute_scaled_feature_score(df_top, params, kdes, scaling='dot_product_correlation', args=None): """ Compute feature scores (i.e. parameter contribution) as the dot-product between their normalized location on the KDEs distribution (using z-statistic) and their resulting efficacy (AUC). Parameters ---------- df_top: pandas.DataFrame Subset of df_restore with significantly positive virtual interventions params: list Individual intervention targets (i.e. model parameters). kdes: dict Kernel Density Estimates of posterior distributions of OCD subjects and healthy controls. scaling: string How to scale the efficacy of the virtual intervention by the z-score normalized parameter. "dot_product_correlation" (default) multiplies the normalized parameter by the efficacy (AUC). Other values can be "pearson_correlation", "spearman_correlation" and "covariance_correlation" but those measures distort the results and interpretation due to the mean-centering of variables. Other legacy values are "contribution" (same as dot-product) and "sensitivity" which divides the normalized parameter by the AUC efficacy (giving a sense of "sensitivity" of the parameter). args: argparse.Namespace (optional) Extra arguments with options. """ lines = [] for n_test_params in np.arange(args.n_test_params)+1: df_ = df_top[df_top.n_test_params==n_test_params] print("Compute param sensitivity for n_test_params={}".format(n_test_params)) if 'correlation' not in scaling: rows = [] for i,row in df_.iterrows(): new_row = scale_efficacy_to_kdes(row, params, kdes, scaling) rows.append(new_row) line = pd.DataFrame(rows).mean(axis=0).to_frame().transpose() line['n_test_params'] = n_test_params lines.append(line) else: line = dict() line['n_test_params'] = n_test_params diff_fc = np.array(df_['dist_pre_hc']) - np.array(df_['dist']) for param in params: inds, = np.where([param in test_param.split(' ') for test_param in df_.test_param]) if scaling=='pearson_correlation': R,p = scipy.stats.pearsonr(diff_fc, np.array(df_['z_'+param])) line[param] = R line['p_'+param] = p elif scaling=='spearman_correlation': R,p = scipy.stats.spearmanr(diff_fc, np.array(df_['z_'+param])) line[param] = R line['p_'+param] = p elif scaling == 'covariance_correlation': R = np.cov(diff_fc, np.array(df_['z_'+param]))[0,1] line[param] = R line['p_'+param] = None elif scaling == 'cross_correlation': R = np.correlate(diff_fc, np.array(df_['z_'+param]))[0] line[param] = R R = R / (np.std(diff_fc)*np.std(df_['z_'+param])) / len(diff_fc) line['R_'+param] = R elif scaling == 'dot_product_correlation': R = np.dot(diff_fc, np.array(df_['z_'+param])) line[param] = R R = R / (np.std(diff_fc)*np.std(df_['z_'+param])) / len(diff_fc) line['R_'+param] = R line['n'] = len(df_) lines.append(pd.DataFrame([line])) df_kdes_scaled_feats = pd.concat(lines, ignore_index=True) return df_kdes_scaled_feats
def compute_feature_reliability(df_top, params, kdes, args=None): """ Compute reliability of feature using z-score """ out = [] for n_test_params in np.arange(args.n_test_params)+1: df_ = df_top[df_top.n_test_params==n_test_params] print("Compute param reliability for n_test_params={}".format(n_test_params)) for param in params: df_['z_'+param] = (df_[param] - kdes['patients'][param]['vals'].mean()) / kdes['patients'][param]['vals'].std() out.append(df_) df_reliability = pd.concat(out) return df_reliability def plot_contribution_windrose(df_params_contribution, params, args=None): """ Make windrose vizualisation of (normalized) parameter contribution in log-scaled """ theta = np.arange(len(params))/(len(params)) * 2*np.pi theta = np.append(theta, theta[0]) palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} fig, axes = plt.subplots(3,2, subplot_kw={'projection': 'polar'}, figsize=[4,6]) for i in np.arange(len(df_params_contribution)): j = int(np.floor(i/2)) k = i%2 r = np.array(df_params_contribution.iloc[i][params]) r = np.append(r, r[0]) for r_, theta_ in zip(r,theta): if r_<0: axes[j,k].bar(theta_, np.abs(r_), facecolor=palette[df_params_contribution.iloc[i].n_test_params], lw=0.75, linestyle='--', width=0.5, edgecolor='black', zorder=3) else: axes[j,k].bar(theta_, r_, facecolor=palette[df_params_contribution.iloc[i].n_test_params], lw=0.75, linestyle='-', width=0.5, edgecolor='black', zorder=3) axes[j,k].set_xticks(theta[:-1]) axes[j,k].set_xticklabels(args.params) lbls = axes[j,k].get_xticklabels() new_lbls = format_labels(lbls) axes[j,k].set_xticklabels(new_lbls, fontsize=8) r_max = np.max([np.abs(r).max(), 101]) axes[j,k].set_yticks(np.arange(0,r_max, 100)) axes[j,k].set_yticklabels([]) axes[j,k].grid(zorder=0) axes[j,k].spines.polar.set_visible(False) axes[j,k].xaxis.grid(linewidth=0.5, linestyle='--') plt.tight_layout() if args.save_figs: #plt.rcParams['svg.fonttype'] = 'none' fname = 'normalized_params_contribution_'+args.distance_metric+'_'+args.efficacy_base+today()+'_small.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show()
[docs] def plot_single_contribution_windrose(df, params, theta, palette, ax): """ A single windrose of parameters' contribution. Parameters ---------- df: pandas.DataFrame Contribution data. theta: list Angles at which to put polar bars. palette: dict Color palette used by matplotlib. ax: matplotlib.Axes Axes to plot the windrose (must be polar type). """ r = np.array(df[params]) r = np.append(r, r[0]) for r_, theta_ in zip(r,theta): if r_<0: ax.bar(theta_, np.abs(r_), facecolor=palette[df['n_test_params'].iloc[0]], lw=0.75, linestyle='--', width=0.5, edgecolor='black', zorder=3) else: ax.bar(theta_, r_, facecolor=palette[df['n_test_params'].iloc[0]], lw=0.75, linestyle='-', width=0.5, edgecolor='black', zorder=3) ax.set_xticks(theta[:-1]) ax.set_xticklabels(params) lbls = ax.get_xticklabels() new_lbls = format_labels(lbls) ax.set_xticklabels(new_lbls, fontsize=10) ax.set_yticks([]) ax.set_yticklabels([]) ax.grid(zorder=0) ax.spines.polar.set_visible(False) ax.xaxis.grid(linewidth=0.5, linestyle='--')
[docs] def plot_parameters_contribution(df_params_contribution, params, gs=None, args=None): """ Polar plots of parameters contribution across virtual interventions, colorcoded by number of intervention targets :math:`n_t`. Each polar plot corresponds to a number of target :math:`n_t`. Parameters ---------- df_params_contribution: pandas.DataFrame Contribution of parameters (z-score normalized parameters times efficacies of virtual interventions). params: list Individual intervention targets (i.e. model parameters). gs: matplotlib.GridSpec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. args: argparse.Namespace (optional) Extra arguments with options. """ theta = np.arange(len(params))/(len(params)) * 2*np.pi theta = np.append(theta, theta[0]) palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} if gs==None: #fig, axes = plt.subplots(3,2, subplot_kw={'projection': 'polar'}, figsize=[4,6]) fig = plt.figure(figsize=[4,6]) gs = plt.GridSpec(nrows=3, ncols=2) for i,n_test_params in enumerate(np.sort((df_params_contribution.n_test_params.unique()))): df = df_params_contribution[df_params_contribution.n_test_params==n_test_params] ax = plt.subplot(gs[i//2,i%2], polar=True) plot_single_contribution_windrose(df, params, theta, palette, ax=ax) plt.tight_layout()
def plot_sensitivity_windrose(df_params_sensitivity, params, args=None): """ Make windrose vizualisation of (normalized) parameter sensitivity in log-scale """ theta = np.arange(len(params))/(len(params)) * 2*np.pi theta = np.append(theta, theta[0]) palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} fig, axes = plt.subplots(3,2, subplot_kw={'projection': 'polar'}, figsize=[5,8]) for i in np.arange(len(df_params_sensitivity)): j = int(np.floor(i/2)) k = i%2 r = np.array(df_params_sensitivity.iloc[i][params]) r = np.append(r, r[0]) for r_, theta_ in zip(r,theta): if r_<0: axes[j,k].bar(theta_, np.log10(np.abs(r_)), facecolor=palette[df_params_sensitivity.iloc[i].n_test_params], lw=0, linestyle='--', width=0.5, edgecolor='black') else: axes[j,k].bar(theta_, np.log10(r_), facecolor=palette[df_params_sensitivity.iloc[i].n_test_params], lw=0, linestyle='-', width=0.5, edgecolor='black') axes[j,k].set_xticks(theta[:-1]) axes[j,k].set_xticklabels(params) lbls = axes[j,k].get_xticklabels() new_lbls = format_labels(lbls) axes[j,k].set_xticklabels(new_lbls) axes[j,k].set_yticks([0,1,2,3]) axes[j,k].set_yticklabels([]) axes[j,k].set_rmax(3.5) axes[j,k].spines.polar.set_visible(False) axes[j,k].xaxis.grid(linewidth=0.5, linestyle='--') plt.tight_layout() if args.save_figs: #plt.rcParams['svg.fonttype'] = 'none' fname = 'normalized_params_log_sensitivity_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show() #---------------------# # PREDICTIVE ANALYSIS # #---------------------# def plot_fc_dist_pre_post_behav(df_summary, feature='dist', args=None): """ plot behavioral relationship to distance to FC controls """ behavs=['YBOCS_Total', 'OCIR_Total', 'OBQ_Total', 'MADRS_Total', 'HAMA_Total', 'Dep_Total', 'Anx_total'] colors={'group1':'orange', 'group2':'green'} plt.figure(figsize=[21,5]) for i,behav in enumerate(behavs): plt.subplot(1,len(behavs), i+1) feat_diffs = {'group1':[], 'group2':[], 'both':[]} behav_diffs = {'group1':[], 'group2':[], 'both':[]} responders = {feature:copy.deepcopy(feat_diffs), 'behav':copy.deepcopy(behav_diffs)} for subj in df_summary.subj.unique(): df_subj = df_summary[df_summary.subj==subj] group = df_subj['group'].unique()[0] diff = df_subj[df_subj.ses=='ses-post'][feature].iloc[0] - df_subj[df_subj.ses=='ses-pre'][feature].iloc[0] behav_diff = df_subj[df_subj.ses=='ses-post'][behav].iloc[0] - df_subj[df_subj.ses=='ses-pre'][behav].iloc[0] feat_diffs[group].append(diff) behav_diffs[group].append(behav_diff) feat_diffs['both'].append(diff) behav_diffs['both'].append(behav_diff) if diff < 0: plt.scatter(behav_diff, diff, color=colors[group], alpha=0.5) responders[feature][group].append(diff) responders['behav'][group].append(behav_diff) responders[feature]['both'].append(diff) responders['behav']['both'].append(behav_diff) else: plt.scatter(behav_diff, diff, color=colors[group], alpha=0.2) r,p = scipy.stats.pearsonr(feat_diffs['both'], behav_diffs['both']) r1,p1 = scipy.stats.pearsonr(feat_diffs['group1'], behav_diffs['group1']) r2,p2 = scipy.stats.pearsonr(feat_diffs['group2'], behav_diffs['group2']) rr,pr = scipy.stats.pearsonr(responders[feature]['both'], responders['behav']['both']) r1r,p1r = scipy.stats.pearsonr(responders[feature]['group1'], responders['behav']['group1']) r2r,p2r = scipy.stats.pearsonr(responders[feature]['group2'], responders['behav']['group2']) plt.title("n={} r={:.2f} p={:.3f}\ng1: n={} r={:.2f} p={:.3f}\ng2: n={} r={:.2f} p={:.3f}\n\ Responders:\n n={} r={:.2f} p={:.3f}\ng1: n={} r={:.2f} p={:.3f}\ng2: n={} r={:.2f} p={:.3f}".format( len(feat_diffs['both']),r,p, len(feat_diffs['group1']), r1, p1, len(feat_diffs['group2']), r2, p2, len(responders[feature]['both']),rr,pr, len(responders[feature]['group1']), r1r, p1r, len(responders[feature]['group2']), r2r, p2r)) plt.xlabel("$\Delta \, {}$".format(behav.split('_')[0])) plt.ylabel("$\Delta \, {} \, (FC)$".format(feature)) plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) plt.tight_layout() def plot_fc_dist_pre_post_params(df_summary, args=None): """ plot behavioral relationship to distance to FC controls """ #behav = 'YBOCS_Total' #'OCIR_Total' params=['C_12', 'C_13', 'C_24', 'C_31', 'C_34', 'C_42', 'eta_C_13', 'eta_C_24', 'sigma', 'sigma_C_13', 'sigma_C_24'] colors={'group1':'orange', 'group2':'green'} plt.figure(figsize=[18,10]) for i,param in enumerate(params): plt.subplot(2,int(np.ceil(len(params)/2)), i+1) dist_diffs = {'group1':[], 'group2':[], 'both':[]} param_diffs = {'group1':[], 'group2':[], 'both':[]} responders = {'dist':copy.deepcopy(dist_diffs), 'param':copy.deepcopy(param_diffs)} for subj in df_summary.subj.unique(): df_subj = df_summary[df_summary.subj==subj] group = df_subj['group'].unique()[0] diff = df_subj[df_subj.ses=='ses-post'].dist.iloc[0] - df_subj[df_subj.ses=='ses-pre'].dist.iloc[0] param_diff = df_subj[df_subj.ses=='ses-post'][param].iloc[0] - df_subj[df_subj.ses=='ses-pre'][param].iloc[0] dist_diffs[group].append(diff) param_diffs[group].append(param_diff) dist_diffs['both'].append(diff) param_diffs['both'].append(param_diff) if diff < 0: plt.scatter(param_diff, diff, color=colors[group], alpha=0.5) responders['dist'][group].append(diff) responders['param'][group].append(param_diff) responders['dist']['both'].append(diff) responders['param']['both'].append(param_diff) else: plt.scatter(param_diff, diff, color=colors[group], alpha=0.2) r,p = scipy.stats.pearsonr(dist_diffs['both'], param_diffs['both']) r1,p1 = scipy.stats.pearsonr(dist_diffs['group1'], param_diffs['group1']) r2,p2 = scipy.stats.pearsonr(dist_diffs['group2'], param_diffs['group2']) rr,pr = scipy.stats.pearsonr(responders['dist']['both'], responders['param']['both']) r1r,p1r = scipy.stats.pearsonr(responders['dist']['group1'], responders['param']['group1']) r2r,p2r = scipy.stats.pearsonr(responders['dist']['group2'], responders['param']['group2']) plt.title("n={} r={:.2f} p={:.3f}\ng1: n={} r={:.2f} p={:.3f}\ng2: n={} r={:.2f} p={:.3f}\n\ Responders:\n n={} r={:.2f} p={:.3f}\ng1: n={} r={:.2f} p={:.3f}\ng2: n={} r={:.2f} p={:.3f}".format( len(dist_diffs['both']),r,p, len(dist_diffs['group1']), r1, p1, len(dist_diffs['group2']), r2, p2, len(responders['dist']['both']),rr,pr, len(responders['dist']['group1']), r1r, p1r, len(responders['dist']['group2']), r2r, p2r)) plt.xlabel("$\Delta \, {}$".format(format_param(param))) plt.ylabel("$\Delta \, distance \, (FC)$") plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) plt.tight_layout() def plot_pre_post_params_behavs(df_summary, args=None): """ plot behavioral relationship to distance to FC controls """ params=['dist', 'efficacy', 'C_12', 'C_13', 'C_24', 'C_31', 'C_34', 'C_42', 'eta_C_13', 'eta_C_24', 'sigma', 'sigma_C_13', 'sigma_C_24'] behavs=['YBOCS_Total', 'OCIR_Total', 'OBQ_Total', 'MADRS_Total', 'HAMA_Total', 'Dep_Total', 'Anx_total'] colors={'group1':'orange', 'group2':'green'} fig = plt.figure(figsize=[len(behavs)*3,len(params)*5]) gs = plt.GridSpec(nrows=len(params), ncols=len(behavs)) for i,param in enumerate(params): for j,behav in enumerate(behavs): ax = fig.add_subplot(gs[i,j]) behav_diffs = {'group1':[], 'group2':[], 'both':[]} param_diffs = {'group1':[], 'group2':[], 'both':[]} responders = {'behav':copy.deepcopy(behav_diffs), 'param':copy.deepcopy(param_diffs)} for subj in df_summary.subj.unique(): df_subj = df_summary[df_summary.subj==subj] group = df_subj['group'].unique()[0] behav_diff = df_subj[df_subj.ses=='ses-post'][behav].iloc[0] - df_subj[df_subj.ses=='ses-pre'][behav].iloc[0] param_diff = df_subj[df_subj.ses=='ses-post'][param].iloc[0] - df_subj[df_subj.ses=='ses-pre'][param].iloc[0] behav_diffs[group].append(behav_diff) param_diffs[group].append(param_diff) behav_diffs['both'].append(behav_diff) param_diffs['both'].append(param_diff) if behav_diff < -1: plt.scatter(param_diff, behav_diff, color=colors[group], alpha=0.5) responders['behav'][group].append(behav_diff) responders['param'][group].append(param_diff) responders['behav']['both'].append(behav_diff) responders['param']['both'].append(param_diff) else: plt.scatter(param_diff, behav_diff, color=colors[group], alpha=0.2) r,p = scipy.stats.pearsonr(behav_diffs['both'], param_diffs['both']) r1,p1 = scipy.stats.pearsonr(behav_diffs['group1'], param_diffs['group1']) r2,p2 = scipy.stats.pearsonr(behav_diffs['group2'], param_diffs['group2']) rr,pr = scipy.stats.pearsonr(responders['behav']['both'], responders['param']['both']) r1r,p1r = scipy.stats.pearsonr(responders['behav']['group1'], responders['param']['group1']) r2r,p2r = scipy.stats.pearsonr(responders['behav']['group2'], responders['param']['group2']) plt.title("n={} r={:.2f} p={:.3f}\ng1: n={} r={:.2f} p={:.3f}\ng2: n={} r={:.2f} p={:.3f}\n\ Responders:\n n={} r={:.2f} p={:.3f}\ng1: n={} r={:.2f} p={:.3f}\ng2: n={} r={:.2f} p={:.3f}".format( len(behav_diffs['both']),r,p, len(behav_diffs['group1']), r1, p1, len(behav_diffs['group2']), r2, p2, len(responders['behav']['both']),rr,pr, len(responders['behav']['group1']), r1r, p1r, len(responders['behav']['group2']), r2r, p2r)) plt.xlabel("$\Delta \, {}$".format(format_param(param))) plt.ylabel("$\Delta \, {}$".format(behav.split('_')[0])) plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) plt.tight_layout()
[docs] def plot_pre_post_dist_ybocs(df_summary, behav = 'YBOCS_Total', gs=None, args=None): """ Plot improvement in behavioral measure of symptoms severity (Y-BOCS) of subjects, and their association to functional improvement (via their distance to healthy functional connectivity). Parameters ---------- df_summary: pandas.DataFrame Summary measures of empirical analysis, inluding subject's functional connectivity across frontostriatal circuits, behavioral measures (e.g. Y-BOCS score, IQ, etc.), digital pairing and values of the digital twin parameters. gs: matplotlib.Gridspec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. args: argparse.Namespace Extra arguments with options. """ param = 'dist' if gs==None: fig = plt.figure(figsize=[3,3]) ax = fig.add_subplot(1,1,1) else: ax = plt.subplot(gs) behav_diffs = {'group1':[], 'group2':[], 'both':[]} param_diffs = {'group1':[], 'group2':[], 'both':[]} responders = {'behav':copy.deepcopy(behav_diffs), 'param':copy.deepcopy(param_diffs)} lines=[] for subj in df_summary.subj.unique(): df_subj = df_summary[df_summary.subj==subj] group = df_subj['group'].unique()[0] behav_diff = df_subj[df_subj.ses=='ses-pre'][behav].iloc[0] - df_subj[df_subj.ses=='ses-post'][behav].iloc[0] param_diff = df_subj[df_subj.ses=='ses-pre'][param].iloc[0] - df_subj[df_subj.ses=='ses-post'][param].iloc[0] behav_diffs[group].append(behav_diff) param_diffs[group].append(param_diff) behav_diffs['both'].append(behav_diff) param_diffs['both'].append(param_diff) responders['behav'][group].append(behav_diff) responders['param'][group].append(param_diff) responders['behav']['both'].append(behav_diff) responders['param']['both'].append(param_diff) lines.append({'subj':subj, 'group':group, 'param':param, 'behav':behav, 'param_diff':param_diff, 'behav_diff':behav_diff}) rr,pr = scipy.stats.pearsonr(responders['behav']['both'], responders['param']['both']) sbn.regplot(data=pd.DataFrame(lines), x='param_diff', y='behav_diff', ax=ax, color='gray', scatter_kws={'s':10}) plt.title("n={} r={:.2f} p={:.3f}".format(len(responders['behav']['both']),rr,pr), fontsize=10) plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) plt.xlabel("$\Delta \, FC$") plt.ylabel("$\Delta \, {}$".format(behav.split('_')[0])) if gs==None: plt.tight_layout() if args.save_figs: fname = 'FC_YBOCS_scatter'+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show()
def plot_restoration_figure_paper(df_restore, df_top, df_params_contribution, args): """ Concatenates restoration plots for paper """ fig = plt.figure(figsize=[12,10]) gs = plt.GridSpec(nrows=12, ncols=14) plt.tight_layout() # top 5 efficacy by targets plot_distance_restore(df_restore, args, gs[:9, 3:9]) # efficacy by number of targets + log scaling sub_gs = gs[10:,:10].subgridspec(nrows=1, ncols=5) plot_efficacy_by_number_of_target(df_top, sub_gs, args=args) # parameters' contribution params = args.params sub_gs = gs[:9,10:].subgridspec(nrows=3, ncols=2) plot_parameters_contribution(df_params_contribution, params, sub_gs, args) if args.save_figs: fname = 'restoration_combined_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show() #----------------------# # IMPROVEMENT ANALYSIS # #----------------------# def get_param_zscore(df, params, kdes): """ score each parameter based on kdes distributions """ score = [] for param in params: mean, std = kdes['patients'][param]['vals'].mean(), kdes['patients'][param]['vals'].std() zpar = (df[param].iloc[0] - mean) / std score.append(zpar) return np.array(score)
[docs] def score_improvement(df, params, kdes, behav='YBOCS_Total'): """ Score parameters based on improvement they induce in functional connectivity (FC) space across virtual interventions. Parameters ---------- df: pandas.DataFrame Summary data (empirical data paired with digital twins) params: list Individual intervention targets (i.e. model parameters). kdes: dict Kernel Density Estimates of posterior distributions of OCD subjects and healthy controls. behav: string Behavioral measure. Default: Y-BOCS score. Returns ------- df_improvement: pandas.DataFrame Normalized differences between initial (pre) and follow-up (post) parameters of digital twins for each number of targets in virtual interventions. """ lines = [] for subj in df.subj.unique(): df_subj = df[df.subj==subj] pre = get_param_zscore(df_subj[df_subj.ses=='ses-pre'], params, kdes) post = get_param_zscore(df_subj[df_subj.ses=='ses-post'], params, kdes) diff_params = post-pre pre = df_subj[df_subj.ses=='ses-pre'][behav] post = df_subj[df_subj.ses=='ses-post'][behav] diff_behav = np.array(pre) - np.array(post) # we want behavioral improvements to be postive pre = df_subj[df_subj.ses=='ses-pre']['dist'] post = df_subj[df_subj.ses=='ses-post']['dist'] diff_fc = pre.iloc[0] - post.iloc[0] # we want functional improvements to be positive score = diff_params score_dict = dict((par,val) for par,val in zip(params,score)) score_dict['subj'] = subj score_dict['group'] = df_subj.group.unique()[0] score_dict['diff_behav'] = diff_behav[0] score_dict['diff_fc'] = diff_fc for param in params: pre = df_subj[df_subj.ses=='ses-pre'][param].iloc[0] post = df_subj[df_subj.ses=='ses-post'][param].iloc[0] score_dict['diff_'+param] = post - pre lines.append(score_dict) df_improvement = pd.DataFrame(lines) return df_improvement
[docs] def plot_improvement_windrose(df_improvement, params, gs=None, args=None): """ Make windrose vizualisation of mean improvement in parameter space. Parameters ---------- df_improvement: pandas.DataFrame Z-score normalized differences between initial (pre) and follow-up (post) parameters of digital twins for number of targets in virtual interventions. params: list Individual intervention targets (i.e. model parameters). gs: matplotlib.Gridspec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. args: argparse.Namespace Extra arguments with options. """ theta = np.arange(len(params))/(len(params)) * 2*np.pi theta = np.append(theta, theta[0]) palette = {'group1': 'orange', 'group2': 'green'} if gs==None: fig, ax = plt.subplots(1,1, subplot_kw={'projection': 'polar'}, figsize=[3,3]) else: ax = plt.subplot(gs, projection='polar') sub_df = df_improvement line = dict() for param in params: df_tmp = df_improvement line[param]= np.dot(np.array(df_tmp[param]), np.array(df_tmp['diff_behav'])) sub_df = pd.DataFrame([line]) # mean improvemet across subjects r = np.array(sub_df[params].sum(axis=0))/len(sub_df) r = np.append(r, r[0]) for theta_, r_ in zip(theta, r): if r_ > 0: # increase of parameter due to treatment #ax.bar(theta_, np.abs(r_), color='red', lw=5, width=0.5, alpha=0.2+np.abs(r_)/2) ax.bar(theta_, np.abs(r_), color='goldenrod', width=0.5, lw=1, edgecolor='k', linestyle='-', alpha=0.2+np.abs(r_)/1000) else: # decreae of paramter due to treatment #ax.bar(theta_, np.abs(r_), color='blue', lw=5, width=0.5, alpha=0.2+np.abs(r_)/2) ax.bar(theta_, np.abs(r_), color='goldenrod', width=0.5, lw=1, edgecolor='k', linestyle='--', alpha=0.2+np.abs(r_)/1000) #axes[i].fill_between(theta, rmin, rmax) ax.set_xticks(theta[:-1]) ax.set_xticklabels(params) lbls = ax.get_xticklabels() new_lbls = format_labels(lbls) ax.set_xticklabels(new_lbls) #axes[i].set_rticks([0.1], labels=[]) ax.set_yticklabels([]) ax.set_yticks([1]) #ax.set_rmin(-1) #ax.set_rmax(1) ax.spines.polar.set_visible(False) ax.xaxis.grid(linewidth=0.5, linestyle='--') if gs==None: plt.tight_layout() if args.save_figs: #plt.rcParams['svg.fonttype'] = 'none' fname = 'improvement_params_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show()
def plot_improvement_bars(df_improvement, params, gs=None, args=None): """ Make barplot vizualisation of improvements in parameter space. Parameters ---------- df_improvement: pandas.DataFrame Z-score normalized differences between initial (pre) and follow-up (post) parameters of digital twins for number of targets in virtual interventions. params: list Individual intervention targets (i.e. model parameters). gs: matplotlib.Gridspec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. args: argparse.Namespace Extra arguments with options. """ if gs==None: fig, ax = plt.subplots(1,1, figsize=[3,6]) else: ax = plt.subplot(gs) # 1) behavioral improvement during treatment sub_df = df_improvement[sub_df.diff_behav>0] #[df_improvement.group==group] for i,param in enumerate(params): mean_param_diff = sub_df[param].mean() # increased params if mean_param_diff>0: ax.bar(i, np.abs(mean_param_diff)*sub_df['diff_behav'], color='goldenrod', width=0.5, alpha=0.3, lw=1, edgecolor='k', linestyle='-') # behavioral recession during treatment else: ax.bar(i, np.abs(mean_param_diff)*sub_df['diff_behav'], color='goldenrod', width=0.5, alpha=0.3, lw=1, edgecolor='k', linestyle='-') #axes[i].fill_between(theta, rmin, rmax) ax.set_xticks(theta[:-1]) ax.set_xticklabels(params) lbls = ax.get_xticklabels() new_lbls = format_labels(lbls) ax.set_xticklabels(new_lbls) #axes[i].set_rticks([0.1], labels=[]) ax.set_yticklabels([]) ax.set_yticks([1]) #ax.set_rmin(-1) #ax.set_rmax(1) ax.spines.polar.set_visible(False) ax.xaxis.grid(linewidth=0.5, linestyle='--') if gs==None: plt.tight_layout() if args.save_figs: #plt.rcParams['svg.fonttype'] = 'none' fname = 'improvement_params_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show() def plot_improvement_distrib(df_improvement, param_imp=None): plt.figure(figsize=[12,3]) df_imp = df_improvement.melt(id_vars=['subj', 'group'], value_vars=params, var_name='param', value_name='diff', ignore_index=True) sbn.swarmplot(data=df_imp, x='param', y='diff', hue='group', dodge=True, palette={'group1':'orange', 'group2':'green'}, alpha=0.5, size=3) plt.hlines(0, -1, len(params), color='black', linestyle='--', lw=0.5, alpha=0.5) plt.legend().set_visible(False) plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) if param_imp==None: plt.title('Normalized improvement') else: plt.title('Normalized improvement scaled by '+param_imp) plt.show() def print_stats_improvement(df_improvement, params, param_imp=None): print("STATS FOR NORMALIZED PARAMETER IMPROVEMENT SCALED BY "+str(param_imp)) for param in params: gp1 = df_improvement[df_improvement.group=='group1'][param] gp2 = df_improvement[df_improvement.group=='group2'][param] t,p = scipy.stats.ttest_ind(gp1, gp2) print("{:12} \t t={:.2f} \t p={:.3f}".format(param, t, p)) def plot_improvement_pre_post_params(df_summary, params, args): """ Plot initial (pre) and follow-up (post) distributions of parameters from digital twin analysis. Parameters ---------- df_summary: pandas.DataFrame Summary measures of empirical analysis, inluding subject's functional connectivity across frontostriatal circuits, behavioral measures (e.g. Y-BOCS score, IQ, etc.), digital pairing and values of the digital twin parameters. params: list Individual intervention targets (i.e. model parameters). args: argparse.Namespace Extra arguments with options. """ plt.figure(figsize=[22,3]) for i,param in enumerate(params): plt.subplot(1, len(params), i+1) df_sum = df_summary.melt(id_vars=['subj', 'ses'], value_vars=param, var_name='param', ignore_index=True) sbn.swarmplot(data=df_sum, x='param', y='value', hue='ses', hue_order=['ses-pre', 'ses-post'], dodge=True, palette={'ses-pre':'orange', 'ses-post':'purple'}, size=3.5, alpha=0.5) plt.legend().set_visible(False) plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) #plt.gca().spines.bottom.set_visible(False) s_pre, p_pre = scipy.stats.normaltest(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value) s_post, p_post = scipy.stats.normaltest(df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) u,p = scipy.stats.mannwhitneyu(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) d = cohen_d(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) t,pt = scipy.stats.ttest_rel(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) T,pT = scipy.stats.wilcoxon(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) KS,pKS = scipy.stats.ks_2samp(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) ES,pES = scipy.stats.epps_singleton_2samp(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) print("{:15}: normality={:1}/{:1} d={: 1.2} u={:5} p={:.4f} t={: .2f} p={:.4f} T={} p={:.4f} KS={:.3f} p={:.4f} ES={:.3f} p={:.4f}".format(param, p_pre>0.05, p_post>0.05,d, int(u), p, t, pt,int(T), pT, KS, pKS, ES, pES)) #plt.title("u={} p={:.3f}".format(int(u),p)) ttl = '' #if ((p_pre>0.05) and (p_post>0.05)): # if p<.05: # ttl = ttl+'*' # if p*len(params)<0.05: # ttl = ttl+'*' #else: if pT<.05: ttl = ttl+'$\star$' if pT*len(params)<0.05: ttl = ttl+'$\star$' plt.title(ttl, fontsize=14) #plt.ylabel("${}$".format(format_param(param)), fontsize=12) plt.xticks([0], labels=[]) plt.xlabel("${}$".format(format_param(param)), fontsize=12) plt.ylabel('') if param.startswith('C_'): plt.ylim([-0.12, 0.55]) if i>1: plt.gca().spines.left.set_visible(False) plt.yticks([], labels=None) elif param.startswith('eta_'): plt.ylim([-0.01, 0.11]) if args.save_figs: fname = 'distrib_params_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.tight_layout() plt.show() def get_kde(data, mn, mx, smoothing_factor=10): """ create kernel density estimate for the data (used in violin-like plots) """ b = (mx-mn)/smoothing_factor mn -= (mx-mn)*0.1 mx += (mx-mn)*0.1 model = sklearn.neighbors.KernelDensity(bandwidth=b) xtrain = np.array(data)[:, np.newaxis] model.fit(xtrain) xtest = np.linspace(mn,mx,100)[:, np.newaxis] log_dens = model.score_samples(xtest) mu = model.score_samples(np.mean(data).reshape(1,1)) return xtest, np.exp(log_dens), np.exp(mu)
[docs] def plot_improvement_pre_post_params_paper(df_summary, params, gs=None, args=None): """ Plot initial (pre) and follow-up (post) distributions of parameters from digital twin analysis. (only relevant parameters are shown for manuscript). Parameters ---------- df_summary: pandas.DataFrame Summary measures of empirical analysis, inluding subject's functional connectivity across frontostriatal circuits, behavioral measures (e.g. Y-BOCS score, IQ, etc.), digital pairing and values of the digital twin parameters. params: list Individual intervention targets (i.e. model parameters). gs: matplotlib.Gridspec (optional) A GridSpec object that can be used to embbed axes when this figure is a subplot of a larger figure. args: argparse.Namespace Extra arguments with options. """ #i_params = [1,2,5,7] # indices of params to plot i_params = [1,3,5] # indices of params to plot #i_params = [1,2,3,5] # indices of params to plot #i_params = [1,3,5,8] # indices of params to plot if gs==None: fig = plt.figure(figsize=[6,2]) sub_gs = plt.GridSpec(nrows=1, ncols=len(i_params)+1) else: #sub_gs = gs.subgridspec(nrows=1, ncols=len(i_params)+1, width_ratios=[1,1,1,0.5,1]) sub_gs = gs.subgridspec(nrows=1, ncols=len(i_params))#, width_ratios=[1,1,1,0.5,1]) for i,i_param in enumerate(i_params): param = params[i_param] #plt.subplot(1, 4, i+1) if (param.startswith('eta_') | param.startswith('sigma')): ax = plt.subplot(sub_gs[0,i+1]) else: ax = plt.subplot(sub_gs[0,i]) df_sum = df_summary.melt(id_vars=['subj', 'ses'], value_vars=param, var_name='param', ignore_index=True) #sbn.swarmplot(data=df_sum, x='param', y='value', hue='ses', hue_order=['ses-pre', 'ses-post'], dodge=True, # palette={'ses-pre':'orange', 'ses-post':'purple'}, size=3, alpha=0.5, ax=ax) for subj in df_sum.subj.unique(): sbn.pointplot(data = df_sum[df_sum.subj==subj], y='value', x='ses', order=['ses-pre', 'ses-post'], dodge=True, color='gray', linewidth=0.5, marker={'size':1}, size=0.5, alpha=0.5, ax=ax) kde_scale = 0.1 kde_pre_x, kde_pre_logdensity, kde_pre_exp_mu = get_kde(np.array(df_sum[df_sum.ses=='ses-pre'].value), mn=df_sum.value.min(), mx=df_sum.value.max()) kde_post_x, kde_post_logdensity, kde_post_exp_mu = get_kde(np.array(df_sum[df_sum.ses=='ses-post'].value), mn=df_sum.value.min(), mx=df_sum.value.max()) ax.fill(-kde_pre_logdensity/kde_pre_logdensity.std()*kde_scale,kde_pre_x, color='goldenrod', alpha=0.2) #ax.plot([-kde_pre_exp_mu/kde_pre_logdensity.std()*kde_scale,0],[df_sum[df_sum.ses=='ses-pre'].value.mean(), df_sum[df_sum.ses=='ses-pre'].value.mean()], # '-', color='gray') ax.fill(1+kde_post_logdensity/kde_post_logdensity.std()*kde_scale,kde_post_x, color='goldenrod', alpha=0.2) #ax.plot([1+kde_post_exp_mu/kde_post_logdensity.std()*kde_scale,0],[df_sum[df_sum.ses=='ses-post'].value.mean(), df_sum[df_sum.ses=='ses-post'].value.mean()], # '-', color='gray') plt.setp(ax.collections, sizes=[6], linewidth=[0.1]) plt.setp(ax.lines, linewidth=0.3) plt.legend().set_visible(False) plt.gca().spines.top.set_visible(False) plt.gca().spines.right.set_visible(False) #plt.gca().spines.bottom.set_visible(False) u,p = scipy.stats.mannwhitneyu(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) d = cohen_d(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) t,pt = scipy.stats.ttest_rel(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) T,pT = scipy.stats.wilcoxon(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) ES,pES = scipy.stats.epps_singleton_2samp(df_sum[df_sum.ses=='ses-pre'].sort_values('subj').value, df_sum[df_sum.ses=='ses-post'].sort_values('subj').value) print("{:15}: d={: 1.2} u={:5} p={:.4f} t={: .2f} p={:.4f} T={: } p={:.4f} ES={:.3f} pES={:.4f}".format(param, d, int(u), p, t, pt,int(T), pT, ES, pES)) #plt.title("u={} p={:.3f}".format(int(u),p)) ttl = '' ft = 10 if pT<.05: ttl = ttl+'*' if pT*len(i_params)<0.05: ttl = ttl+'*' ft=14 #if pES<0.05: # ttl += '#' # if pES*len(i_params)<0.05: # ttl += '#' ttl = plt.title(ttl, fontsize=ft) x,y = ttl.get_position() ttl.set_position([x,y-0.2]) #plt.ylabel("${}$".format(format_param(param)), fontsize=12) plt.xticks([0,1], labels=['baseline', 'post']) plt.xlabel("${}$".format(format_param(param)), fontsize=12) if i==0: plt.ylabel('value') else: plt.ylabel('') if param.startswith('C_'): #ax.set_ylim([-0.12, 0.55]) ax.set_ylim([-0.17, 0.55]) if i>0: plt.gca().spines.left.set_visible(False) plt.yticks([], labels=None) plt.subplots_adjust(left=0, right=0.8) elif param.startswith('eta_'): ax.set_ylim([-0.01, 0.11]) #ax.set_ylim([-0.03, 0.13]) plt.yticks([0, 0.05, 0.1]) plt.subplots_adjust(left=0.2, right=1) elif param.startswith('sigma'): ax.set_ylim([0.05, 0.1]) #ax.set_ylim([-0.03, 0.13]) plt.yticks([0.06, 0.08]) plt.subplots_adjust(left=0.2, right=1) plt.tight_layout() if gs==None: plt.tight_layout() if args.save_figs: fname = 'distrib_params_paper_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show()
def drop_unimproved(df_summary, df_improvement, feature='dist', threshold=0.1): """ Remove subjects whos feature do not improve between pre and post. Parameters ---------- df_summary: pandas.DataFrame Summary output of the analysis df_improvement: pandas.DataFrame Summary improvement of the virtual interventions feature: string Column of the dataframe to apply filtering on. Returns ------- df_summary: pandas.DataFrame Summary output of the analysis df_improvement: pandas.DataFrame Summary improvement of the virtual interventions """ for subj in df_summary.subj.unique(): df = df_summary[df_summary.subj==subj] if feature=='distance': if ((df[df.ses=='ses-post'][feature].iloc[0]>threshold) | (df[df.ses=='ses-pre'][feature].iloc[0] > threshold)): df_summary.drop(df_summary[df_summary.subj==subj].index[:], inplace=True) df_improvement.drop(df_improvement[df_improvement.subj==subj].index[:], inplace=True) else: if (df[df.ses=='ses-post'][feature].iloc[0] - df[df.ses=='ses-pre'][feature].iloc[0] > threshold): df_summary.drop(df_summary[df_summary.subj==subj].index[:], inplace=True) df_improvement.drop(df_improvement[df_improvement.subj==subj].index[:], inplace=True) return df_summary, df_improvement def plot_summary_improvement(df_summary, df_improvement, params, feature=None, args=None): """ Just a layout organization to have all plots in same figure """ fig = plt.figure(figsize=[6,5]) gs = plt.GridSpec(nrows=2,ncols=4, height_ratios=[1,1]) #df_summary, df_improvement = drop_unimproved(df_summary, df_improvement, threshold=args.fc_thresh) # FC - YBOCS scatter + regrression plot plot_pre_post_dist_ybocs(df_summary, gs=gs[0,0:2], args=args) # Param contribution plot_improvement_windrose(df_improvement, params=params, gs=gs[0,2:4], args=args) # pre-post change in param space plot_improvement_pre_post_params_paper(df_summary, params=params, gs=gs[1,:], args=args) plt.tight_layout() #plot_improvement_pre_post_params(df_summary, params=params, args=args) if args.save_figs: fname = os.path.join(proj_dir, 'img', 'summary_improvement_'+args.distance_metric+'_'+args.efficacy_base+today()+'.svg') plt.savefig(fname) plt.show() def linear_regression_sims(df_restore, params): """ Linear model between change in parameters and change in distance to healthy controls """ df_rest = df_restore[(df_restore.ustat>96200)]# & (df_restore.dist<df_restore.dist_pre_hc)] X = [] inds = dict((i,[]) for i in np.arange(0,7)) y = [] for j,row in df_rest.iterrows(): x = np.zeros(len(params),) for i,par in enumerate(params): #if par in tp.split(' '): x[i] = row['z_'+par] X.append(x) inds[row.n_test_params].append(j) #y.append(row['ustat']) y.append(row['dist_pre_hc'] - row['dist']) X = np.array(X) y = np.array(y) model = sklearn.linear_model.LinearRegression(positive=True) model.fit(X,y) score = model.score(X,y) return model, score def plot_regression_coefs(model, score, params): plt.bar(np.arange(1,12), model.coef_) plt.xticks(np.arange(1,12), labels=params, rotation=60) plt.title('$R^2={:.2f}$'.format(score)) def linear_regression_digital_twins(df_improvement, params): """ Linear model between change in parameters and improvement in YBOCS """ df_imp = df_improvement[df_improvement['diff_behav']>0] #df_imp = df_improvement#[df_improvement['diff_fc']>0] X = np.array(df_imp[params]) y = np.array(df_imp['diff_behav']) #y = np.array(df_imp['diff_fc']) model = sklearn.linear_model.LinearRegression(positive=True) model.fit(X,y) score = model.score(X,y) return model def plot_hcp_DD(df_sim_hcp, params=['C_13'], gs=None, args=None): """ Plot linear regression between NAcc-OFC coupling and delay-discounting score in HCP dataset """ DD = 'DDisc_AUC_200' # filter out digital twins not satisfying proximity requirement of FC distance < 0.05 df = df_sim_hcp[df_sim_hcp.distance<0.05] if gs==None: fig = plt.figure(figsize=[3,3]) ax = fig.add_subplot(1,1,1) else: ax = plt.subplot(gs) for i,param in enumerate(params): plt.scatter(df[beh], df[param]) r,p = scipy.stats.pearsonr(df[beh], df[param]) plt.title("{:12} r={:.2f} p={:.4f}".format(param, r,p)) plt.tight_layout() def plot_param_distance_correlations(df_five_top, color='blue', gs=None, row=None): """ Show association between parameter values and distance to healthy FC, for top parameters of restoration. This function can be called to plot single or multiple intervention. Note that in the Supplementary Figure 4, we call this function once for each best interventions per number of targets whereby every call uses a different color. Parameters ---------- df_five_top: pandas.DataFrame Best(s) interventions per number of targets :math:`n_t`. color: str Name of color that correspond to number of targets :math:`n_t`. gs: matplotlib.GridSpec GridSpec object if the plot is embedded in another plot. row: int Row of the GridSpec on which to plot the current correlations """ top_test_params = df_five_top.test_param.unique() ncols = df_five_top.n_test_params.unique()[0] if gs==None: fig = plt.figure(figsize=[ncols*2,10]) gs = plt.GridSpec(nrows=5, ncols=ncols) for i,test_param in enumerate(top_test_params): pars = test_param.split(' ') df_ = df_five_top[df_five_top.test_param==test_param] #df_restore[df_restore.test_param==test_param] for j,par in enumerate(pars): if row==None: ax = plt.subplot(gs[i,j]) else: ax = plt.subplot(gs[row,j]) df_['pre_post'] = df_['dist_pre_hc'] - df_['dist'] data = df_ sbn.regplot(data=data, y='pre_post', x='z_'+par, ax=ax, marker='.', scatter_kws={'s':5, 'alpha':0.3}, color=color) ax.set_xlabel('${'+OCD_modeling.mcmc.inference_analysis.format_param(par)+'}^z$', fontsize=14) if j==0: ax.set_ylabel('$\Delta \; FC$', fontsize=12) else: ax.set_ylabel('', visible=False) r,p = scipy.stats.pearsonr(np.array(data['pre_post']), np.array(data['z_'+par])) ax.set_title('r={:.2f} p={:.3f}'.format(r,p)) ax.spines.top.set_visible(False) ax.spines.right.set_visible(False)
[docs] def plot_five_top_params_distance_correlations(df_top, args): """ Plot intermediate visualization of parameter change correlation to change of distance in FC space. Parameters ---------- df_top: pandas.DataFrame Five best parameter combinations (i.e. virtual interventions) per number of target (number of target :math:`n_t=1 \cdots 6`). args: argparse.Namespace Optional arguments. """ top_test_params = ['C_42', 'C_12 C_42', 'C_12 C_42 eta_C_13', 'C_12 C_42 eta_C_13 sigma_C_24', 'C_13 C_24 C_31 C_34 C_42', 'C_12 C_13 C_24 C_31 C_34 C_42'] palette = {0: 'white', 1: 'lightpink', 2: 'plum', 3: 'mediumpurple', 4: 'lightsteelblue', 5:'skyblue', 6:'royalblue'} args.save_figs = True fig = plt.figure(figsize=[12,12]) gs = plt.GridSpec(nrows=6, ncols=6) for j,n_test_params in enumerate(np.sort(df_top.n_test_params.unique())): df_ = df_top[df_top.n_test_params==n_test_params] df__ = df_[[df_.iloc[i].test_param in top_test_params for i in range(len(df_))]] if len(df__): plot_param_distance_correlations(df__, color=palette[n_test_params], gs=gs, row=j) plt.tight_layout() if args.save_figs: fname = 'diff_fc_vs_param_n_test_params_'+today()+'.svg' plt.savefig(os.path.join(proj_dir, 'img', fname)) plt.show()
def parse_arguments(): " Script arguments when ran as main " parser = argparse.ArgumentParser() # global parameters parser.add_argument('--save_figs', default=False, action='store_true', help='save figures') parser.add_argument('--save_outputs', default=False, action='store_true', help='save outputs') parser.add_argument('--save_summary', default=False, action='store_true', help='save output summary') parser.add_argument('--n_jobs', type=int, default=10, action='store', help="number of parallel processes launched") parser.add_argument('--n_sims', type=int, default=50, action='store', help="number of simulations ran with the same parameters (e.g. to get distribution that can be campared to clinical observations)") parser.add_argument('--n_batch', type=int, default=10, action='store', help="batch size") parser.add_argument('--plot_figs', default=False, action='store_true', help='plot figures') # loading parameters parser.add_argument('--db_names', type=str, nargs='+', default=["sim_digital_twins_20240327", "sim_interventions_20240323"], help="identifier of the sqlite3 database (use sim_pat_20230628 for digital twin, sim_pat_20230721 for restoration analysis") parser.add_argument('--load_sims', default=False, action='store_true', help='Load simulations based on db_names') # digital twin parser.add_argument('--tolerance', type=float, default=0.3, action='store', help="maximal distance allow to take into consideration 'digital sigling'") parser.add_argument('--tolerance_plot', type=float, default=0.3, action='store', help="maximal distance allow to take into consideration 'digital sigling' (for plotting) ") parser.add_argument('--save_distances', default=False, action='store_true', help='save distances between patients and simulations') parser.add_argument('--load_distances', default=False, action='store_true', help='load distances between patients and simulations from previously saved') parser.add_argument('--compute_distances', default=False, action='store_true', help='compute distances between patients and simulations') parser.add_argument('--compute_post_distances', default=False, action='store_true', help='Compute distances for digital twins analysis using post-TMS FC') parser.add_argument('--n_closest', type=int, default=1, action='store', help="number of digital twins to average from, sorted by increasing distance, needs to be < than n_assoc") parser.add_argument('--n_assoc', type=int, default=10, action='store', help="number of digital twins to save sorted by increasing distance") parser.add_argument('--plot_param_behav', default=False, action='store_true', help='plot param-behavioral relationship') parser.add_argument('--verbose', default=False, action='store_true', help='print extra processing info') parser.add_argument('--session', default=None, action='store', help='which session (ses-pre or ses-post) for behavioral scores (default:None => both are used') # multivariate analysis (not reported in paper) parser.add_argument('--multivariate_analysis', default=False, action='store_true', help='perform multivariate analysis on simulations parameters') parser.add_argument('--multivar_fc', default=False, action='store_true', help='perform multivariate analysis on FC variables') parser.add_argument('--cv_type', type=str, default='LeaveOneOut', help="which cross-validation scheme to apply (KFold, RepeatedKFold, ShuffleSplit, LeaveOneOut)") parser.add_argument('--n_splits', type=int, default=10, help="number of splits used for cross-validation") parser.add_argument('--n_repeats', type=int, default=5, help="number of repetitions for the RepeatedKFold cross-validation") parser.add_argument('--max_depth', type=int, default=3, action='store', help="max depth of the decision tree") parser.add_argument('--test_size', type=float, default=0.3, help="ratio of test data (over all data) used for ShuffleSplit cross-validation") parser.add_argument('--plot_cv_regression', default=False, action='store_true', help='plot cross validation regression scatters') parser.add_argument('--plot_multivar_svd', default=False, action='store_true', help='plot dimensionality reduction on regression coefficients') parser.add_argument('--plot_multivariate_results', default=False, action='store_true', help='plot multivariate linear regression coefficients') parser.add_argument('--null', default=False, action='store_true', help='shuffle coupling weights to create null hypothesis for regression coefficients') parser.add_argument('--n_null', type=int, default=100, action='store', help="number of elements to make null distribution") parser.add_argument('--plot_null_distrib', default=False, action='store_true', help='plot null distribution analysis of linear regression coefficients') parser.add_argument('--plot_cv_null_distrib', default=False, action='store_true', help='plot null distribution analysis of linear regression coefficients for all CV folds') parser.add_argument('--print_ANOVA', default=False, action='store_true', help='print stats for mixed and multiple one-way ANOVAs') # restoration analysis parameters parser.add_argument('--restore_analysis', default=False, action='store_true', help='perform retoration analys of test parameters to move from patient to controls FC') parser.add_argument('--base_cohort', type=str, default='controls', help="Cohort from which to infer posterior as default") parser.add_argument('--test_cohort', type=str, default='patients', help="Cohort from which to infer posterior of individual params") parser.add_argument('--plot_efficacy_transform', default=False, action='store_true', help='plot the transformation from distance from controls to efficacy') parser.add_argument('--n_restore', type=int, default=10, action='store', help="number of best restorations for plotting") parser.add_argument('--n_tops', type=int, default=5, action='store', help="number of best restorations for each n_test_param for plotting") parser.add_argument('--n_test_params', type=int, default=6, action='store', help="max number of parameter combinations for plotting restoration outputs") parser.add_argument('--distance_metric', type=str, default='emd', help="distance used in restoration metric (rmse or emd)") parser.add_argument('--efficacy_base', type=str, default='ustat', help="use simulated centroid (sims) or paired (paired) or data (anything else) group difference in efficacy denominator") parser.add_argument('--use_optim_params', default=False, action='store_true', help='flag if using best optimization outputs directly (default: No, i.e. draw new params from posteriors.') parser.add_argument('--sort_style', type=str, default='by_n', help="how to sort distances for visualization: 'by_n' or 'all' (default)") parser.add_argument('--save_restoration', default=False, action='store_true', help='save outputs of the restoration analysis (df_restore)') parser.add_argument('--load_restoration', default=False, action='store_true', help='load outputs from previously saved restoration analysis (df_restore)') parser.add_argument('--plot_distance_restore', default=False, action='store_true', help='plot efficacy horizontal box plots') parser.add_argument('--plot_restoration_figure_paper', default=False, action='store_true', help='plot figure for paper with layout containing sub-figures') parser.add_argument('--contribution_analysis', default=False, action='store_true', help='run parameter contribution analysis') parser.add_argument('--sensitivity_analysis', default=False, action='store_true', help='run paramater sensitivity analysis') parser.add_argument('--load_param_contribution', default=False, action='store_true', help='load parameter contributions from previously computed restoration analysis') parser.add_argument('--load_param_sensitivity', default=False, action='store_true', help='load parameter sensitivity from previously computed restoration analysis') parser.add_argument('--plot_params_contribution_sensitivity', default=False, action='store_true', help='plot contribution and sensitivity of parameters in restoration analysis') parser.add_argument('--predictive_analysis', default=False, action='store_true', help='Analyse predictive power of model based on distance to controls FC') parser.add_argument('--plot_fc_dist_pre_post_behav', default=False, action='store_true', help='plot FC distance to controls in pre-post TMS data') parser.add_argument('--compute_sim_vecs', default=False, action='store_true', help='Compute simulation vectors in FC space (otherwise load pre-computed)') parser.add_argument('--save_sim_vecs', default=False, action='store_true', help='Save simulation vectors in FC space after being computed') parser.add_argument('--load_sim_vecs', default=False, action='store_true', help='Load simulation vectors in FC space') parser.add_argument('--load_post_distances', default=False, action='store_true', help='load distances in post-TMS inference') parser.add_argument('--plot_pre_post_associations', default=False, action='store_true', help='plot pre-post TMS associations between FC, params and behavioral measures') parser.add_argument('--load_summary', default=False, action='store_true', help='load summary of digital twin analysis') parser.add_argument('--improvement_analysis', default=False, action='store_true', help='Analyse improvement of models parameters pre-post TMS') parser.add_argument('--param_improvement', type=str, default=None, help="defines which parameter to scale improvement with") parser.add_argument('--fc_thresh', type=float, default=0, action='store', help="fc-ybocs reg plot -- theshold for fc improvement") args = parser.parse_args() return args if __name__=='__main__': args = parse_arguments() behavs=['YBOCS_Total', 'OCIR_Total', 'OBQ_Total', 'MADRS_Total', 'HAMA_Total', 'Dep_Total', 'Anx_total'] params=['C_12', 'C_13', 'C_24', 'C_31', 'C_34', 'C_42', 'eta_C_13', 'eta_C_24', 'sigma', 'sigma_C_13', 'sigma_C_24'] if args.load_sims: print("Loading simulations...") df_sims = load_df_sims(args) print("Fixing simulations indices...") fix_df_sims_names(df_sims, args) print("Loading functional connectivity data...") df_data = load_df_data(args) pathways = np.sort(df_data.pathway.unique()) print("Loading behavioral data...") with open(os.path.join(proj_dir, 'postprocessing', 'df_pat_.pkl'), 'rb') as f: df_pat = pickle.load(f) if args.session!=None: df_pat = df_pat[df_pat.ses==args.session] df_fc_pat = df_data.pivot(columns='pathway', values='corr', index='subj').reset_index().merge(df_pat) kdes = load_kdes(args) if args.compute_sim_vecs: print("Computing simulation vectors...") sim_vecs = compute_sim_vecs(df_sims) if args.save_sim_vecs: print("Saving simulation vectors...") fname = os.path.join(proj_dir, 'postprocessing', 'sim_vecs'+today()+'.pkl') with open(fname, 'wb') as f: pickle.dump(sim_vecs, f) elif args.load_sim_vecs: print("Loading simulation vectors...") fname = os.path.join(proj_dir, 'postprocessing', 'sim_vecs.pkl') #sim_vecs.pkl without restoration, sim_vecs_20230915.pkl with restoration with open(fname, 'rb') as f: sim_vecs = pickle.load(f) if args.load_distances: print("Loading distances...") fname = os.path.join(proj_dir, 'postprocessing', 'assoc_digital_twins_distances100eps'+str(int(args.tolerance*100))+"_pre_20240329.pkl") with open(fname, 'rb') as f: assoc = pickle.load(f) df_sim_pre = merge_data_sim_dfs(df_pat[df_pat.ses=='ses-pre'], df_sims, assoc, args) elif args.compute_distances: print("Computing distances...") assoc = compute_distances(df_data, df_sims[df_sims.test_param=='None'], ses='pre', args=args) df_sim_pre = merge_data_sim_dfs(df_pat[df_pat.ses=='ses-pre'], df_sims, assoc, args) # univariate analysis if args.plot_param_behav: plot_param_behav(df_sim_pre, params=params, args=args) # multivariate analysis if args.multivariate_analysis: print("Running multivariate analysis...") if args.multivar_fc: params=['Acc_OFC', 'Acc_PFC', 'Acc_dPut', 'OFC_PFC', 'dPut_PFC'] multivar = multivariate_analysis(df_fc_pat, params=params, args=args) else: multivar = multivariate_analysis(df_sim_pre, params=params, args=args) if args.plot_multivariate_results: plot_multivariate_results(multivar, args=args) if args.plot_cv_regression: plot_cv_regression(multivar, df_sim_pre, args=args) if args.plot_multivar_svd: plot_multivar_svd(multivar, behavs=behavs, args=args) if args.null: print("Creating Null distribution...") models = dict(('null_model{:04d}'.format(i), sklearn.linear_model.Ridge(alpha=0.01)) for i in range(args.n_null)) multivar_null = multivariate_analysis(df_sim_pre, params=params, models=models, null=True, args=args) create_df_null(multivar, multivar_null) if args.plot_null_distrib: plot_null_distrib(multivar, args) if args.print_ANOVA: print_ANOVA(df_sim_pre, behavs, params) # restoration analysis if args.restore_analysis: print("Restoration analysis...") args.pathways = pathways args.params = params if args.plot_efficacy_transform: plot_efficacy_transform(args) if args.load_restoration: optim='' date = '_20240516' if args.use_optim_params: optim += '_optim' date = '_20240329' restoration_file = 'df_restore_'+args.distance_metric+optim+'_'+args.efficacy_base+date+'.pkl' with open(os.path.join(proj_dir, 'postprocessing', restoration_file), 'rb') as f: df_restore = pickle.load(f) else: df_restore = compute_distance_restore(df_sims[df_sims.test_param!='None'], args) if args.save_restoration: fname = 'df_restore'+ get_restoration_suffix(args) + today() + '.pkl' with open(os.path.join(proj_dir, 'postprocessing', fname), 'wb') as f: pickle.dump(df_restore, f) df_restore = compute_efficacy(df_restore, args=args) if args.plot_distance_restore: plot_distance_restore(df_restore, args=args) df_top, top_params = get_df_top(df_restore, args) # Parameters contribution if args.contribution_analysis: if args.load_param_contribution: with open(os.path.join(proj_dir, 'postprocessing', 'df_param_contribution.pkl'), 'rb') as f: df_params_contribution = pickle.load(f) else: df_params_contribution = compute_scaled_feature_score(df_restore[df_restore.efficacy>0.5], params, kdes, scaling='dot_product_correlation', args=args) if args.save_outputs: with open(os.path.join(proj_dir, 'postprocessing', 'df_param_contribution_'+args.distance_metric+'_'+args.efficacy_base+today()+'.pkl'), 'wb') as f: pickle.dump(df_params_contribution, f) if args.plot_params_contribution_sensitivity: plot_contribution_windrose(df_params_contribution, params, args=args) if args.plot_restoration_figure_paper: plot_restoration_figure_paper(df_restore, df_top, df_params_contribution, args) # Parameter sensitivity (deprecated alternative to contribution) if args.sensitivity_analysis: # Parameters sensitivity if args.load_param_sensitivity: with open(os.path.join(proj_dir, 'postprocessing', 'df_param_sensitivity.pkl'), 'rb') as f: df_param_sensitivity = pickle.load(f) else: df_param_sensitivity = compute_scaled_feature_score(df_restore[df_restore.efficacy>0], params, kdes, scaling='sensitivity', args=args) if args.save_outputs: with open(os.path.join(proj_dir, 'postprocessing', 'df_param_sensitivity_'+args.distance_metric+'_'+args.efficacy_base+today()+'.pkl'), 'wb') as f: pickle.dump(df_param_sensitivity, f) if args.plot_params_contribution_sensitivity: plot_sensitivity_windrose(df_param_sensitivity, params, args=args) # prediction if args.predictive_analysis: print('Load functional distances (data)...') df_fc_pre_post = load_dist_to_FC_controls(args) # delta FC vs delta YBOCS df_dist_fc = df_fc_pre_post.merge(df_fc_pat, on=['subj', 'ses', 'group', *pathways], how='inner') #df_dist_fc = drop_single_session(df_dist_fc) if args.plot_fc_dist_pre_post_behav: plot_fc_dist_pre_post_behav(df_dist_fc, args=args) # Load or compute distances (post) if args.compute_post_distances: print("Computing functional distances (post)...") pathways = np.sort(df_data.pathway.unique()) df_post_data = df_fc_pre_post[df_fc_pre_post.ses=='ses-post'].melt(id_vars=['subj', 'cohort', 'group'], value_vars=pathways, var_name='pathway', value_name='corr') assoc_post = compute_distances(df_post_data, df_sims, ses='post', args=args) elif args.load_post_distances: print("Loading functional distances (post)...") fname = os.path.join(proj_dir, 'postprocessing', 'assoc_digital_twins_distances100eps'+str(int(args.tolerance*100))+"_post_20240329.pkl") with open(fname, 'rb') as f: assoc_post = pickle.load(f) df_sim_post = merge_data_sim_dfs(df_pat[df_pat.ses=='ses-post'], df_sims, assoc_post, args) df_post = pd.merge(df_fc_pre_post[df_fc_pre_post.ses=='ses-post'], df_sim_post, on=['subj', 'ses', 'group'], how='outer', suffixes=[None, '_sim']) # pre-post analysis summary df_pre = pd.merge(df_fc_pre_post[df_fc_pre_post.ses=='ses-pre'], df_sim_pre, on=['subj', 'ses', 'group'], how='outer', suffixes=[None, '_sim']) df_summary = df_post[df_post.cohort=='patients'].merge(df_pre[df_pre.cohort=='patients'], how='outer') if args.save_summary: with open(os.path.join(proj_dir, 'postprocessing', 'df_pre_'+args.distance_metric+'_'+args.efficacy_base+today()+".pkl"), 'wb') as f: pickle.dump(df_pre, f) with open(os.path.join(proj_dir, 'postprocessing', 'df_post_'+args.distance_metric+'_'+args.efficacy_base+today()+".pkl"), 'wb') as f: pickle.dump(df_post, f) with open(os.path.join(proj_dir, 'postprocessing', 'df_summary_'+args.distance_metric+'_'+args.efficacy_base+today()+".pkl"), 'wb') as f: pickle.dump(df_summary, f) if args.plot_pre_post_associations: plot_pre_post_dist_ybocs(df_summary) if args.load_summary: # df_summary_20230915.pkl : pre and post with controls/patients param swaps # df_summary_20230918.pkl : only post with param swaps with open(os.path.join(proj_dir, 'postprocessing', 'df_summary_20230918.pkl'), 'rb') as f: df_summary = pickle.load(f) if args.improvement_analysis: df_improvement = score_improvement(df_summary, params, kdes) plot_summary_improvement(df_summary, df_improvement, params, args=args)