import numpy as np
import utils
import json
import pandas as pd
from scipy.stats import kendalltau
import sys
import os
7 Estimate \(S\) with Hard KMeans
The basic idea of using hard k-means to estimate \(S\) is:
- We first estimate the distribution parameters using hard k-means, following the exact procedure covered in Section 4.1 (a minimal sketch follows this list).
- We then use the Metropolis–Hastings algorithm to accept or reject a proposed \(S\).
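
Section 4.1 describes the estimation procedure in full; purely as a reminder, the sketch below shows one way the per-biomarker parameter estimation could look. It assumes a long-format table with participant, biomarker, measurement, and diseased columns (the measurement column name is an assumption here) and uses scikit-learn's KMeans; the actual implementation lives in utils.get_theta_phi_estimates and may differ in its details.

# Illustrative sketch only (see the assumptions stated above); this is not
# the book's actual utils.get_theta_phi_estimates implementation.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

def estimate_theta_phi_hard_kmeans_sketch(data):
    """For each biomarker, split the measurements into two clusters with
    hard k-means, call the cluster dominated by non-diseased participants
    'healthy' (phi) and the other 'affected' (theta), and record the mean
    and standard deviation of each cluster."""
    estimates = {}
    for biomarker, bdata in data.groupby('biomarker'):
        measurements = bdata['measurement'].to_numpy().reshape(-1, 1)
        labels = KMeans(n_clusters=2, n_init=10).fit_predict(measurements)
        # the cluster that most non-diseased participants fall into is 'healthy'
        non_diseased_mask = (bdata['diseased'] == False).to_numpy()
        healthy_label = np.bincount(labels[non_diseased_mask]).argmax()
        healthy = measurements[labels == healthy_label].ravel()
        affected = measurements[labels != healthy_label].ravel()
        estimates[biomarker] = {
            'theta_mean': affected.mean(), 'theta_std': affected.std(),
            'phi_mean': healthy.mean(), 'phi_std': healthy.std(),
        }
    return estimates
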
7.1 Implementation
Code
def calculate_all_participant_ln_likelihood(
        iteration,
        data_we_have,
        current_order_dict,
        n_participants,
        non_diseased_participant_ids,
        theta_phi_estimates,
        diseased_stages,
):
    data = data_we_have.copy()
    data['S_n'] = data.apply(
        lambda row: current_order_dict[row['biomarker']], axis=1)
    all_participant_ln_likelihood = 0
    for p in range(n_participants):
        pdata = data[data.participant == p].reset_index(drop=True)
        if p in non_diseased_participant_ids:
            # non-diseased participants are assumed to be at stage 0
            this_participant_likelihood = utils.compute_likelihood(
                pdata, k_j=0, theta_phi=theta_phi_estimates)
            this_participant_ln_likelihood = np.log(
                this_participant_likelihood + 1e-10)
        else:
            # initialize the stage likelihoods
            stage_likelihood_dict = {}
            for k_j in diseased_stages:
                # compute each stage's likelihood for this participant
                kj_likelihood = utils.compute_likelihood(
                    pdata, k_j, theta_phi_estimates)
                stage_likelihood_dict[k_j] = kj_likelihood
            # aggregate the likelihood across the diseased stages
            likelihood_sum = sum(stage_likelihood_dict.values())
            this_participant_likelihood = np.mean(likelihood_sum)
            # add a small epsilon to avoid taking the log of zero
            this_participant_ln_likelihood = np.log(
                this_participant_likelihood + 1e-10)
        all_participant_ln_likelihood += this_participant_ln_likelihood
    return all_participant_ln_likelihood

def metropolis_hastings_hard_kmeans(
        data_we_have,
        iterations,
        n_shuffle,
):
    '''Implement the Metropolis-Hastings algorithm using hard k-means estimates
    Inputs:
        - data_we_have: the biomarker data
        - iterations: number of iterations
        - n_shuffle: how many positions to shuffle when proposing a new order
    Outputs:
        - all_current_accepted_order_dicts: the accepted order dict at each iteration
    '''
    n_participants = len(data_we_have.participant.unique())
    biomarkers = data_we_have.biomarker.unique()
    n_biomarkers = len(biomarkers)
    n_stages = n_biomarkers + 1
    non_diseased_participant_ids = data_we_have.loc[
        data_we_have.diseased == False].participant.unique()
    diseased_stages = np.arange(start=1, stop=n_stages, step=1)
    # obtain the initial theta and phi estimates
    theta_phi_estimates = utils.get_theta_phi_estimates(
        data_we_have)

    # initialize the acceptance counter and the list of accepted orders
    acceptance_count = 0
    all_current_accepted_order_dicts = []

    # start from a random order
    current_accepted_order = np.random.permutation(np.arange(1, n_stages))
    current_accepted_order_dict = dict(zip(biomarkers, current_accepted_order))
    current_accepted_likelihood = -np.inf

    for _ in range(iterations):
        # in each iteration, propose a new order by shuffling the current one;
        # theta_phi_estimates stay fixed in the hard k-means version
        new_order = current_accepted_order.copy()
        utils.shuffle_order(new_order, n_shuffle)
        current_order_dict = dict(zip(biomarkers, new_order))
        all_participant_ln_likelihood = calculate_all_participant_ln_likelihood(
            _,
            data_we_have,
            current_order_dict,
            n_participants,
            non_diseased_participant_ids,
            theta_phi_estimates,
            diseased_stages,
        )

        # Log-Sum-Exp trick for numerical stability
        max_likelihood = max(all_participant_ln_likelihood,
                             current_accepted_likelihood)
        prob_of_accepting_new_order = np.exp(
            (all_participant_ln_likelihood - max_likelihood) -
            (current_accepted_likelihood - max_likelihood)
        )
        # equivalent to np.exp(
        #     all_participant_ln_likelihood - current_accepted_likelihood)
        # because np.exp(a)/np.exp(b) = np.exp(a - b);
        # if a > b, then np.exp(a - b) > 1, so the proposal is
        # definitely accepted at the first iteration
        if np.random.rand() < prob_of_accepting_new_order:
            acceptance_count += 1
            current_accepted_order = new_order
            current_accepted_likelihood = all_participant_ln_likelihood
            current_accepted_order_dict = current_order_dict

        acceptance_ratio = acceptance_count * 100 / (_ + 1)
        all_current_accepted_order_dicts.append(current_accepted_order_dict)

        if (_ + 1) % 10 == 0:
            formatted_string = (
                f"iteration {_ + 1} done, "
                f"current accepted likelihood: {current_accepted_likelihood}, "
                f"current acceptance ratio is {acceptance_ratio:.2f} %, "
                f"current accepted order is {current_accepted_order_dict.values()}, "
            )
            print(formatted_string)

    return all_current_accepted_order_dicts

n_shuffle = 2
iterations = 10
burn_in = 2
thining = 2

base_dir = os.getcwd()
print(f"Current working directory: {base_dir}")
data_dir = os.path.join(base_dir, "data")

cop_kmeans_dir = os.path.join(base_dir, 'hard_kmeans')
temp_results_dir = os.path.join(cop_kmeans_dir, "temp_json_results")
img_dir = os.path.join(cop_kmeans_dir, 'img')
results_file = os.path.join(cop_kmeans_dir, "results.json")

os.makedirs(cop_kmeans_dir, exist_ok=True)
os.makedirs(temp_results_dir, exist_ok=True)
os.makedirs(img_dir, exist_ok=True)

print(f"Data directory: {data_dir}")
print(f"Temp results directory: {temp_results_dir}")
print(f"Image directory: {img_dir}")

if __name__ == "__main__":
    # experiment parameters (hard-coded here rather than read from command line arguments)
    j = 200
    r = 0.75
    m = 3

    print(f"Processing with j={j}, r={r}, m={m}")
    combstr = f"{int(j*r)}|{j}"
    heatmap_folder = img_dir

    img_filename = f"{int(j*r)}-{j}_{m}"
    filename = f"{combstr}_{m}"
    data_file = f"{data_dir}/{filename}.csv"

    if not os.path.exists(data_file):
        print(f"Data file not found: {data_file}")
        sys.exit(1)  # exit early if the file doesn't exist
    else:
        print(f"Data file found: {data_file}")

    data_we_have = pd.read_csv(data_file)
    n_biomarkers = len(data_we_have.biomarker.unique())

    # define the temporary result file
    temp_result_file = os.path.join(
        temp_results_dir, f"temp_results_{j}_{r}_{m}.json")

    dic = {}
    if combstr not in dic:
        dic[combstr] = []

    accepted_order_dicts = metropolis_hastings_hard_kmeans(
        data_we_have,
        iterations,
        n_shuffle,
    )

    utils.save_heatmap(
        accepted_order_dicts,
        burn_in,
        thining,
        folder_name=heatmap_folder,
        file_name=f"{img_filename}",
        title=f'heatmap of {filename}')

    most_likely_order_dic = utils.obtain_most_likely_order_dic(
        accepted_order_dicts, burn_in, thining)
    most_likely_order = list(most_likely_order_dic.values())
    tau, p_value = kendalltau(most_likely_order, range(1, n_biomarkers + 1))

    dic[combstr].append(tau)

    # write the results to a unique temporary file inside the temp folder
    with open(temp_result_file, "w") as file:
        json.dump(dic, file, indent=4)
    print(f"{filename} is done! Results written to {temp_result_file}")
Current working directory: /Users/hongtaoh/Desktop/github/ebmBook
Data directory: /Users/hongtaoh/Desktop/github/ebmBook/data
Temp results directory: /Users/hongtaoh/Desktop/github/ebmBook/hard_kmeans/temp_json_results
Image directory: /Users/hongtaoh/Desktop/github/ebmBook/hard_kmeans/img
Processing with j=200, r=0.75, m=3
Data file found: /Users/hongtaoh/Desktop/github/ebmBook/data/150|200_3.csv
iteration 10 done, current accepted likelihood: -4491.553963056519, current acceptance ratio is 70.00 %, current accepted order is dict_values([1, 5, 2, 3, 9, 8, 10, 6, 4, 7]),
150|200_3 is done! Results written to /Users/hongtaoh/Desktop/github/ebmBook/hard_kmeans/temp_json_results/temp_results_200_0.75_3.json
7.2 Result
We plot the resulting \(S\) probabilistically using a heatmap. We also quantify the difference between our result and the real \(S\) using Kendall's Tau, which ranges from \(-1\) (completely different) to \(1\) (exactly the same), with \(0\) indicating complete randomness.
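As a quick, self-contained illustration of how Kendall's Tau behaves on toy orderings (unrelated to our experiment):

from scipy.stats import kendalltau

ground_truth = [1, 2, 3, 4, 5]
tau, pval = kendalltau([1, 2, 3, 4, 5], ground_truth)  # identical order
print(tau)  # 1.0
tau, pval = kendalltau([5, 4, 3, 2, 1], ground_truth)  # completely reversed order
print(tau)  # -1.0
# In the script above, most_likely_order is compared against
# range(1, n_biomarkers + 1) in the same way.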
dic
{'150|200': [0.3333333333333333]}