import numpy as np 
import utils 
import json 
import pandas as pd 
from scipy.stats import kendalltau
import sys
import os
7 Estimate \(S\) with Hard KMeans
The basic idea of using hard K-Means to estimate \(S\) is as follows:
- We first estimate the distribution parameters (\(\theta\) and \(\phi\)) using hard K-Means, following the exact procedure covered in Section 4.1.
- We then use the Metropolis–Hastings algorithm to accept or reject each proposed \(S\); the acceptance rule is shown right after this list.
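Concretely, write \(\ell(S)\) for the total log-likelihood of the data under an ordering \(S\), summed over all participants. A proposed ordering \(S'\) is then accepted with probability

\[
A(S' \mid S) = \min\Big\{1,\ \exp\big(\ell(S') - \ell(S)\big)\Big\}.
\]

Because the proposal (shuffling a few positions of the current order) is symmetric, the proposal densities cancel and only the likelihood ratio remains; this is the accept/reject step implemented in the code below.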
 
7.1 Implementation
Code
def calculate_all_participant_ln_likelihood(
        iteration,
        data_we_have,
        current_order_dict,
        n_participants,
        non_diseased_participant_ids,
        theta_phi_estimates,
        diseased_stages,
):
    data = data_we_have.copy()
    data['S_n'] = data.apply(
        lambda row: current_order_dict[row['biomarker']], axis=1)
    all_participant_ln_likelihood = 0
    for p in range(n_participants):
        pdata = data[data.participant == p].reset_index(drop=True)
        if p in non_diseased_participant_ids:
            this_participant_likelihood = utils.compute_likelihood(
                pdata, k_j=0, theta_phi=theta_phi_estimates)
            this_participant_ln_likelihood = np.log(
                this_participant_likelihood + 1e-10)
        else:
            # initialize the per-stage likelihood dictionary
            stage_likelihood_dict = {}
            for k_j in diseased_stages:
                kj_likelihood = utils.compute_likelihood(
                    pdata, k_j, theta_phi_estimates)
                # update each stage likelihood for this participant
                stage_likelihood_dict[k_j] = kj_likelihood
            # marginalize the likelihood over all possible diseased stages
            # (summing corresponds to a uniform prior over stages, up to a
            # constant factor that does not affect the Metropolis-Hastings comparison)
            this_participant_likelihood = sum(stage_likelihood_dict.values())
            # add a small epsilon to avoid taking the log of zero
            this_participant_ln_likelihood = np.log(
                this_participant_likelihood + 1e-10)
        all_participant_ln_likelihood += this_participant_ln_likelihood
    return all_participant_ln_likelihood
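# The per-stage likelihood above comes from utils.compute_likelihood, which is
# defined elsewhere in the book. As a rough, hypothetical sketch (NOT the actual
# utils implementation; the column name 'measurement' and the theta_phi keys
# below are assumptions): for a participant at stage k_j, each biomarker whose
# event position S_n is at or before k_j is scored under the diseased
# distribution (theta), and every other biomarker under the healthy
# distribution (phi).
from scipy.stats import norm

def compute_likelihood_sketch(pdata, k_j, theta_phi):
    likelihood = 1.0
    for _, row in pdata.iterrows():
        params = theta_phi[row['biomarker']]
        if row['S_n'] <= k_j:
            # event assumed to have occurred by stage k_j
            mu, sigma = params['theta_mean'], params['theta_std']
        else:
            # event assumed not to have occurred yet
            mu, sigma = params['phi_mean'], params['phi_std']
        likelihood *= norm.pdf(row['measurement'], loc=mu, scale=sigma)
    return likelihood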
def metropolis_hastings_hard_kmeans(
    data_we_have,
    iterations,
    n_shuffle,
):
    '''Implement the Metropolis-Hastings algorithm using hard k-means
    parameter estimates.
    Inputs:
        - data_we_have: the biomarker measurement data
        - iterations: number of iterations
        - n_shuffle: how many positions to shuffle when proposing a new order
    Outputs:
        - all_current_accepted_order_dicts: the accepted order dict from
          each iteration
    '''
    n_participants = len(data_we_have.participant.unique())
    biomarkers = data_we_have.biomarker.unique()
    n_biomarkers = len(biomarkers)
    n_stages = n_biomarkers + 1
    non_diseased_participant_ids = data_we_have.loc[
        data_we_have.diseased == False].participant.unique()
    diseased_stages = np.arange(start=1, stop=n_stages, step=1)
    # obtain the initial theta and phi estimates (they stay fixed throughout the iterations)
    theta_phi_estimates = utils.get_theta_phi_estimates(
        data_we_have)
    # initialize the acceptance counter and the list of accepted order dicts
    acceptance_count = 0
    all_current_accepted_order_dicts = []
    current_accepted_order = np.random.permutation(np.arange(1, n_stages))
    current_accepted_order_dict = dict(zip(biomarkers, current_accepted_order))
    current_accepted_likelihood = -np.inf
    for _ in range(iterations):
        # propose a new order by shuffling n_shuffle positions of the current accepted order
        new_order = current_accepted_order.copy()
        utils.shuffle_order(new_order, n_shuffle)
        current_order_dict = dict(zip(biomarkers, new_order))
        all_participant_ln_likelihood = calculate_all_participant_ln_likelihood(
                _,
                data_we_have,
                current_order_dict,
                n_participants,
                non_diseased_participant_ids,
                theta_phi_estimates,
                diseased_stages,
            )
        # acceptance probability: the likelihood ratio exp(new - current)
        # (subtracting the shared maximum leaves the exponent unchanged)
        max_likelihood = max(all_participant_ln_likelihood,
                             current_accepted_likelihood)
        prob_of_accepting_new_order = np.exp(
            (all_participant_ln_likelihood - max_likelihood) -
            (current_accepted_likelihood - max_likelihood)
        )
        # np.exp(a)/np.exp(b) = np.exp(a - b); if a > b, the ratio exceeds 1
        # and the proposal is always accepted. Since current_accepted_likelihood
        # starts at -np.inf, the first proposal is always accepted.
        if np.random.rand() < prob_of_accepting_new_order:
            acceptance_count += 1
            current_accepted_order = new_order
            current_accepted_likelihood = all_participant_ln_likelihood
            current_accepted_order_dict = current_order_dict
        acceptance_ratio = acceptance_count*100/(_+1)
        all_current_accepted_order_dicts.append(current_accepted_order_dict)
        if (_+1) % 10 == 0:
            formatted_string = (
                f"iteration {_ + 1} done, "
                f"current accepted likelihood: {current_accepted_likelihood}, "
                f"current acceptance ratio is {acceptance_ratio:.2f} %, "
                f"current accepted order is {current_accepted_order_dict.values()}, "
            )
            print(formatted_string)
    # print("done!")
    return all_current_accepted_order_dictsn_shuffle = 2
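# The proposal move above relies on utils.shuffle_order, which modifies
# new_order in place. A minimal, hypothetical sketch of such a proposal
# (NOT necessarily the actual utils implementation): permute the values at
# n_shuffle randomly chosen positions.
import numpy as np

def shuffle_order_sketch(arr, n_shuffle):
    positions = np.random.choice(len(arr), size=n_shuffle, replace=False)
    arr[positions] = arr[np.random.permutation(positions)]

# Because such a move is symmetric (proposing S' from S is as likely as
# proposing S from S'), the proposal densities cancel and the acceptance
# probability reduces to the likelihood ratio used above.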
n_shuffle = 2
iterations = 10
burn_in = 2
thinning = 2
base_dir = os.getcwd()
print(f"Current working directory: {base_dir}")
data_dir = os.path.join(base_dir, "data")
hard_kmeans_dir = os.path.join(base_dir, 'hard_kmeans')
temp_results_dir = os.path.join(hard_kmeans_dir, "temp_json_results")
img_dir = os.path.join(hard_kmeans_dir, 'img')
results_file = os.path.join(hard_kmeans_dir, "results.json")
os.makedirs(hard_kmeans_dir, exist_ok=True)
os.makedirs(temp_results_dir, exist_ok=True)
os.makedirs(img_dir, exist_ok=True)
print(f"Data directory: {data_dir}")
print(f"Temp results directory: {temp_results_dir}")
print(f"Image directory: {img_dir}")
if __name__ == "__main__":
    # parameters for this run (hard-coded here; they could also be read from command-line arguments)
    j = 200
    r = 0.75
    m = 3
    print(f"Processing with j={j}, r={r}, m={m}")
    combstr = f"{int(j*r)}|{j}"
    heatmap_folder = img_dir
    
    img_filename = f"{int(j*r)}-{j}_{m}"
    filename = f"{combstr}_{m}"
    data_file = f"{data_dir}/{filename}.csv"
    # check that the data file exists before trying to read it
    if not os.path.exists(data_file):
        print(f"Data file not found: {data_file}")
        sys.exit(1)  # exit early if the file doesn't exist
    else:
        print(f"Data file found: {data_file}")
    data_we_have = pd.read_csv(data_file)
    n_biomarkers = len(data_we_have.biomarker.unique())
    # Define the temporary result file
    temp_result_file = os.path.join(temp_results_dir, f"temp_results_{j}_{r}_{m}.json")
    
    dic = {combstr: []}
    accepted_order_dicts = metropolis_hastings_hard_kmeans(
        data_we_have,
        iterations,
        n_shuffle,
    )
    utils.save_heatmap(
        accepted_order_dicts,
        burn_in,
        thinning,
        folder_name=heatmap_folder,
        file_name=img_filename,
        title=f'heatmap of {filename}')
    
    most_likely_order_dic = utils.obtain_most_likely_order_dic(
        accepted_order_dicts, burn_in, thinning)
    most_likely_order = list(most_likely_order_dic.values())
    tau, p_value = kendalltau(most_likely_order, range(1, n_biomarkers + 1))
    
    dic[combstr].append(tau)
    
    # Write the results to a unique temporary file inside the temp folder
    with open(temp_result_file, "w") as file:
        json.dump(dic, file, indent=4)
    print(f"{filename} is done! Results written to {temp_result_file}")Current working directory: /Users/hongtaoh/Desktop/github/ebmBook
Data directory: /Users/hongtaoh/Desktop/github/ebmBook/data
Temp results directory: /Users/hongtaoh/Desktop/github/ebmBook/hard_kmeans/temp_json_results
Image directory: /Users/hongtaoh/Desktop/github/ebmBook/hard_kmeans/img
Processing with j=200, r=0.75, m=3
Data file found: /Users/hongtaoh/Desktop/github/ebmBook/data/150|200_3.csv
iteration 10 done, current accepted likelihood: -4491.553963056519, current acceptance ratio is 70.00 %, current accepted order is dict_values([1, 5, 2, 3, 9, 8, 10, 6, 4, 7]), 
150|200_3 is done! Results written to /Users/hongtaoh/Desktop/github/ebmBook/hard_kmeans/temp_json_results/temp_results_200_0.75_3.json
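The driver above also relies on utils.obtain_most_likely_order_dic to turn the chain of accepted orderings into a single point estimate. A plausible sketch, assuming the helper simply drops the burn-in samples, keeps every thinning-th sample after that, and assigns each biomarker the stage it received most often (the actual implementation in utils may differ, for example in how it handles ties):

from collections import Counter, defaultdict

def obtain_most_likely_order_dic_sketch(accepted_order_dicts, burn_in, thinning):
    kept = accepted_order_dicts[burn_in::thinning]
    stage_counts = defaultdict(Counter)
    for order_dict in kept:
        for biomarker, stage in order_dict.items():
            stage_counts[biomarker][stage] += 1
    # most frequent stage per biomarker; note this need not be a valid permutation
    return {b: counts.most_common(1)[0][0] for b, counts in stage_counts.items()}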
7.2 Result
We plot the resulting \(S\) probabilistically using a heatmap. We also quantify the difference between our result and the true \(S\) using Kendall's Tau, which ranges from \(-1\) (completely reversed) to \(1\) (exactly the same); \(0\) indicates no association between the two orderings.
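As a quick sanity check of those boundary values, scipy.stats.kendalltau (already imported above) returns \(1\) for an identical ordering and \(-1\) for a fully reversed one:

from scipy.stats import kendalltau

true_order = [1, 2, 3, 4, 5]
tau_same, _ = kendalltau(true_order, [1, 2, 3, 4, 5])      # tau_same == 1.0
tau_reversed, _ = kendalltau(true_order, [5, 4, 3, 2, 1])  # tau_reversed == -1.0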
dic
{'150|200': [0.3333333333333333]}

