import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import Counter
5 Estimate Participant Stages
In this chapter, we will do some exercise to have a deeper understanding of the math equations in Section 2.3.
5.1 Challenge
Suppos we know \(S, \theta, \phi\). How could we estimate participant stages?
This is the data we have. And we want to know fill the missing column of k_j
.
= 'data'
output_dir = pd.read_csv(f"{output_dir}/100|200_3.csv")
df = dict(zip(df.participant, df.k_j))
real_stages_dic 'k_j', 'affected_or_not'], axis = 1, inplace=True)
df.drop([ df.head()
participant | biomarker | measurement | S_n | diseased | |
---|---|---|---|---|---|
0 | 0 | HIP-FCI | -8.908479 | 1 | True |
1 | 1 | HIP-FCI | -1.095464 | 1 | False |
2 | 2 | HIP-FCI | 0.470754 | 1 | True |
3 | 3 | HIP-FCI | 2.633455 | 1 | True |
4 | 4 | HIP-FCI | 4.070208 | 1 | False |
5.2 Solution
One possible solution looks like this:
For each diseased participant, we iterate through all possible disease stages and calculate the likelihood using Equation 2.1.
We normalize all the likelihoods, construct an array, and randomly sample one possible stage according to that array.
Run multiple times, for each diseased participant, the mode of the sampled stages will be their stage.
Code
def compute_single_measurement_likelihood(
theta_phi,
biomarker,
affected,
measurement):'''Computes the likelihood of the measurement value of a single biomarker
We know the normal distribution defined by either theta or phi
and we know the measurement. This will give us the probability
of this given measurement value.
input:
- theta_phi: the dictionary containing theta and phi values for each biomarker
- biomarker: an integer between 0 and 9
- affected: boolean
- measurement: the observed value for a biomarker in a specific participant
output: a scalar
'''
= theta_phi[biomarker]
biomarker_dict = biomarker_dict['theta_mean'] if affected else biomarker_dict['phi_mean']
mu = biomarker_dict['theta_std'] if affected else biomarker_dict['phi_std']
std = std**2
var if var <= int(0) or np.isnan(measurement) or np.isnan(mu):
print(f"Invalid values: measurement: {measurement}, mu: {mu}, var: {var}")
= np.exp(-(measurement - mu)**2 /
likelihood 2 * var)) / np.sqrt(2 * np.pi * var)
(else:
= np.exp(-(measurement - mu)**2 /
likelihood 2 * var)) / np.sqrt(2 * np.pi * var)
(return likelihood
def fill_up_kj_and_affected(pdata, k_j):
'''Fill up a single participant's data using k_j; basically add two columns:
k_j and affected
Note that this function assumes that pdata already has the S_n column
Input:
- pdata: a dataframe of ten biomarker values for a specific participant
- k_j: a scalar
'''
= pdata.copy()
data 'k_j'] = k_j
data['affected'] = data.apply(lambda row: row.k_j >= row.S_n, axis=1)
data[return data
def compute_likelihood(pdata, k_j, theta_phi):
'''
This function computes the likelihood of seeing this sequence of biomarker values
for a specific participant, assuming that this participant is at stage k_j
'''
= fill_up_kj_and_affected(pdata, k_j)
data = 1
likelihood for i, row in data.iterrows():
= row['biomarker']
biomarker = row['measurement']
measurement = row['affected']
affected *= compute_single_measurement_likelihood(
likelihood
theta_phi, biomarker, affected, measurement)return likelihood
We first look at the known \(\theta, \phi\):
with open('files/real_theta_phi.json', 'r') as f:
= json.load(f)
truth = pd.DataFrame.from_dict(truth, orient='index')
truth_df = 'biomarker', inplace=True)
truth_df.reset_index(names truth_df
biomarker | theta_mean | theta_std | phi_mean | phi_std | |
---|---|---|---|---|---|
0 | MMSE | 22.0 | 2.666667 | 28.0 | 0.666667 |
1 | ADAS | -20.0 | 4.000000 | -6.0 | 1.333333 |
2 | AB | 150.0 | 16.666667 | 250.0 | 50.000000 |
3 | P-Tau | -50.0 | 33.333333 | -25.0 | 16.666667 |
4 | HIP-FCI | -5.0 | 6.666667 | 5.0 | 1.666667 |
5 | HIP-GMI | 0.3 | 0.333333 | 0.4 | 0.233333 |
6 | AVLT-Sum | 20.0 | 6.666667 | 40.0 | 15.000000 |
7 | PCC-FCI | 5.0 | 3.333333 | 12.0 | 4.000000 |
8 | FUS-GMI | 0.5 | 0.066667 | 0.6 | 0.066667 |
9 | FUS-FCI | -20.0 | 6.000000 | -10.0 | 3.333333 |
5.3 Implementation
We then implement the algorithm mentioned above:
= truth.copy()
theta_phi_estimates = df.S_n.unique()
disease_stages = df[df.diseased==True]['participant'].unique() diseased_participants
def update_participant_stages_dic(
data,
p,
disease_stages,
theta_phi_estimates,# participant stage dic:
psdic,= 20
sample_iterations
):"""
Inputs:
- data: pd.dataframe, e.g., 100|200_3.csv
- p: int
- disease_stages: a list of integers
- theta_phi_estimates: a hashmap of dictionaries
- psdic: a dictionary
- sample_iteration: int. How many times we sample
Output:
no outputs. Simply update psdic
"""
= data[data.participant == p]
pdata = {}
stage_likelihood_dict for k_j in disease_stages:
= compute_likelihood(
kj_likelihood
pdata, k_j, theta_phi_estimates)# update each stage likelihood for this participant
= kj_likelihood
stage_likelihood_dict[k_j] # Add a small epsilon to avoid division by zero
= sum(stage_likelihood_dict.values())
likelihood_sum = 1e-10
epsilon if likelihood_sum == 0:
# print("Invalid likelihood_sum: zero encountered.")
= epsilon # Handle the case accordingly
likelihood_sum = [
normalized_stage_likelihood /likelihood_sum for l in stage_likelihood_dict.values()]
l= np.random.choice(
sampled_stages
disease_stages, = sample_iterations,
size =normalized_stage_likelihood,
p=True
replace
)= Counter(sampled_stages).most_common(1)[0][0]
mode_result = mode_result psdic[p]
= df.participant.unique()
participants = {}
psdic for p in participants:
if p not in diseased_participants:
= 0
psdic[p] else:
update_participant_stages_dic(
df,
p,
disease_stages,
theta_phi_estimates,# participant stage dic:
psdic,= 10
sample_iterations )
5.4 Result
Then we compare our results with the actual participants’ stages:
= np.array(list(psdic.values())) - np.array(list(real_stages_dic.values())) diff
Code
def scatter_plot_of_stage_differences(stage_differences):
'''Scatter Plot of the Difference at each index
Input:
- stage_differences: estimated_stages - actual stages. Result should be a 1-dim np array
'''
=(10, 6))
plt.figure(figsizerange(len(diff)), stage_differences, alpha=0.6)
plt.scatter(=0, color='red', linestyle='--')
plt.axhline(y"Scatter Plot of Stage Difference for Each Participant")
plt.title("Participant")
plt.xlabel("Difference (Estimated Stage - True Stage)")
plt.ylabel(True)
plt.grid( plt.show()
scatter_plot_of_stage_differences(diff)
5.5 Discussion
From the above result, we can see how challenging it is to accurately estimate participant stages, even if we know exactly the \(\theta\) and \(\phi\).
What if we know only \(S\), but not \(\theta\) or \(\phi\)?
The first step for us is to estimate \(\theta, \phi\) and then follow the above procedures. To do that, refer back to Chapter 4.