from google.colab import drive
drive.mount('/content/drive')

CSV_PATH = "/content/drive/MyDrive/Predicting-Fracture-Risk-in-Osteoporosis-Patients/bmd.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', None)

df = pd.read_csv(CSV_PATH)
display(df.head())
display(df.isna().sum())

if 'id' in df.columns:
    df = df.drop(columns=['id'])


X_init = df[['age', 'weight_kg', 'height_cm', 'bmd']].copy()
y_init = df['fracture'].copy()

display(df.head())

num_cols = ['age', 'weight_kg', 'height_cm', 'bmd']
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
display(df.head())

print("Original class distribution:")
print(df['fracture'].value_counts())

Original class distribution:
fracture
no fracture    119
fracture        50
Name: count, dtype: int64

df_majority = df[df['fracture'] == 'no fracture']
df_minority = df[df['fracture'] == 'fracture']

df_majority_down = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=96
)
df_bal = pd.concat([df_majority_down, df_minority]).reset_index(drop=True)

print("Balanced class distribution:")
print(df_bal['fracture'].value_counts())
display(df_bal.head())

Balanced class distribution:
fracture
no fracture    50
fracture       50
Name: count, dtype: int64

df_encoded = pd.get_dummies(df_bal, columns=['fracture'])
corr = df_encoded.corr(numeric_only=True)

plt.figure(figsize=(14, 7))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation heatmap")
plt.show()

# bmd vs age
plt.figure(figsize=(8,5))
sns.scatterplot(x=df_bal['bmd'], y=df_bal['age'], hue=df_bal['fracture'])
plt.title("BMD vs Age by fracture")
plt.show()

# bmd vs weight_kg
plt.figure(figsize=(8,5))
sns.scatterplot(x=df_bal['bmd'], y=df_bal['weight_kg'], hue=df_bal['fracture'])
plt.title("BMD vs Weight by fracture")
plt.show()

X = df_bal[['age', 'weight_kg', 'height_cm', 'bmd']].copy()
y = df_bal['fracture'].copy()

# k=10
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
knn.fit(X, y)
y_pred = knn.predict(X)

cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(cm, annot=True, fmt="d")
plt.title('KNN — k=10')
plt.show()

print('Accuracy for k = 10 :', accuracy_score(y, y_pred))

# k=9
knn = KNeighborsClassifier(n_neighbors=9, metric='euclidean')
knn.fit(X, y)
y_pred = knn.predict(X)

print('Accuracy for k = 9 :', accuracy_score(y, y_pred))
cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(cm, annot=True, fmt="d")
plt.title('KNN — k=9')
plt.show()

# k=11
knn = KNeighborsClassifier(n_neighbors=11, metric='euclidean')
knn.fit(X, y)
y_pred = knn.predict(X)

print('Accuracy for k = 11 :', accuracy_score(y, y_pred))
cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(cm, annot=True, fmt="d")
plt.title('KNN — k=11')
plt.show()

Accuracy for k = 10 : 0.81
Accuracy for k = 9 : 0.79

Accuracy for k = 11 : 0.8

error_rate = []
ks = range(1, 12)
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    knn.fit(X, y)
    pred_k = knn.predict(X)
    error_rate.append(np.mean(pred_k != y))

plt.figure(figsize=(10,5))
plt.plot(list(ks), error_rate, linestyle='dashed', marker='*', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.xticks(list(ks))
plt.show()

knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
knn.fit(X, y)
y_pred = knn.predict(X)

print('Accuracy for k = 7 :', accuracy_score(y, y_pred))
cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(cm, annot=True, fmt="d")
plt.title('KNN — k=7')
plt.show()

Accuracy for k = 7 : 0.88

# Save comparison
y_pred_knn = y_pred
cm_knn = cm.copy()
acc_knn = accuracy_score(y, y_pred)

correct_knn = cm_knn.values.trace()
incorrect_knn = cm_knn.values.sum() - correct_knn

print(f"[KNN k=7] acc={acc_knn:.2f} | correct={correct_knn} | incorrect={incorrect_knn}")

[KNN k=7] acc=0.88 | correct=88 | incorrect=12

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.utils import resample

# Load dataset
df_lr = pd.read_csv(CSV_PATH)
print("Shape:", df_lr.shape)
display(df_lr.head())
print("\nMissing values:\n", df_lr.isna().sum())

# Quick EDA — scatterplots
sns.scatterplot(x=df_lr["bmd"], y=df_lr["age"], hue=df_lr["fracture"])
plt.title("BMD vs Age by fracture")
plt.show()

sns.scatterplot(x=df_lr["weight_kg"], y=df_lr["height_cm"], hue=df_lr["fracture"])
plt.title("Weight vs Height by fracture")
plt.show()

# Class distribution before balancing
print("\nOriginal class distribution:\n", df_lr["fracture"].value_counts())

# Downsample majority class to match minority class
df_majority = df_lr[df_lr["fracture"] == "no fracture"]
df_minority = df_lr[df_lr["fracture"] == "fracture"]

df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=96
)

df_bal_lr = pd.concat([df_majority_downsampled, df_minority]).reset_index(drop=True)

print("\nBalanced class distribution:\n", df_bal_lr["fracture"].value_counts())

# EDA after balancing
sns.scatterplot(x=df_bal_lr["bmd"], y=df_bal_lr["age"], hue=df_bal_lr["fracture"])
plt.title("BMD vs Age (Balanced Data)")
plt.show()

Shape: (169, 9)

Missing values:
 id              0
age             0
sex             0
fracture        0
weight_kg       0
height_cm       0
medication      0
waiting_time    0
bmd             0
dtype: int64

Original class distribution:
 fracture
no fracture    119
fracture        50
Name: count, dtype: int64

Balanced class distribution:
 fracture
no fracture    50
fracture       50
Name: count, dtype: int64

# Features & Target
X_lr = df_bal_lr[["age", "weight_kg", "height_cm", "bmd"]]
y_lr = df_bal_lr["fracture"]

# Train/Test split (Random Sampling)
x_train, x_test, y_train, y_test = train_test_split(
    X_lr, y_lr, test_size=0.30, random_state=0, stratify=y_lr
)
print("Train size:", x_train.shape[0], " | Test size:", x_test.shape[0])

# Create Logistic Regression model (L1 penalty, liblinear solver)
model_LR = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000)
model_LR.fit(x_train, y_train)

# Show model parameters
print("Classes:", model_LR.classes_)
print("Intercept:", model_LR.intercept_)
print("Coefficients [age, weight_kg, height_cm, bmd]:")
print(model_LR.coef_)

Train size: 70  | Test size: 30
Classes: ['fracture' 'no fracture']
Intercept: [0.]
Coefficients [age, weight_kg, height_cm, bmd]:
[[-0.04736055  0.06461802 -0.02866587  5.03344623]]

# Predict on test set
y_pred = model_LR.predict(x_test)

# Confusion matrix
cm = pd.crosstab(y_test, y_pred, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Logistic Regression — Confusion Matrix (Test Set)")
plt.show()

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)

# Classification report
cr = classification_report(y_test, y_pred)
print("\nClassification Report:\n", cr)

# Correct vs Incorrect counts
correct = cm.values.trace()
incorrect = cm.values.sum() - correct
print(f"Correct predictions: {correct} | Incorrect predictions: {incorrect}")

Test Accuracy: 0.7666666666666667

Classification Report:
               precision    recall  f1-score   support

    fracture       0.90      0.60      0.72        15
 no fracture       0.70      0.93      0.80        15

    accuracy                           0.77        30
   macro avg       0.80      0.77      0.76        30
weighted avg       0.80      0.77      0.76        30

Correct predictions: 23 | Incorrect predictions: 7

cv_scores = cross_val_score(
    LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000),
    X_lr, y_lr, cv=5, scoring="accuracy"
)
print("5-fold CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

5-fold CV Accuracy Scores: [0.9  0.8  0.9  0.85 0.45]
Mean CV Accuracy: 0.78

# Comparison metrics
y_pred_lr   = y_pred
cm_lr       = cm.copy()
acc_lr      = acc
correct_lr  = correct
incorrect_lr= incorrect

print(f"[Logistic Regression] acc={acc_lr:.2f} | correct={correct_lr} | incorrect={incorrect_lr}")

[Logistic Regression] acc=0.77 | correct=23 | incorrect=7

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.utils import resample

# Load
df = pd.read_csv(CSV_PATH)
print("Raw shape:", df.shape)
display(df.head())
print("\nMissing values:\n", df.isna().sum())

# Normalise numeric features (kept consistent with your earlier steps)
num_cols = ["age", "weight_kg", "height_cm", "bmd"]
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Class balance (downsample majority to minority count)
df_majority = df[df["fracture"] == "no fracture"]
df_minority = df[df["fracture"] == "fracture"]

df_majority_down = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=96
)
df_bal = pd.concat([df_majority_down, df_minority]).reset_index(drop=True)

print("\nBalanced class distribution:\n", df_bal["fracture"].value_counts())
display(df_bal.head())

# Final X, y
X = df_bal[["age", "weight_kg", "height_cm", "bmd"]].copy()
y = df_bal["fracture"].copy()

# Train/Test split (stratify to keep 50/50 in both sets)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
print("\nTrain size:", X_train.shape[0], " | Test size:", X_test.shape[0])

Raw shape: (169, 9)

Missing values:
 id              0
age             0
sex             0
fracture        0
weight_kg       0
height_cm       0
medication      0
waiting_time    0
bmd             0
dtype: int64

Balanced class distribution:
 fracture
no fracture    50
fracture       50
Name: count, dtype: int64

Train size: 70  | Test size: 30

# Train RF with 10 trees
rf10 = RandomForestClassifier(n_estimators=10, random_state=58)
rf10.fit(X_train, y_train)

# Predict on test set
y_pred10 = rf10.predict(X_test)

# Confusion matrix
cm10 = pd.crosstab(y_test, y_pred10, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(cm10, annot=True, fmt="d", cmap="Blues")
plt.title("Random Forest — Test Set (n_estimators=10)")
plt.show()

# Accuracy + classification report
acc10 = accuracy_score(y_test, y_pred10)
print("Test Accuracy (10 trees):", acc10)
print("\nClassification Report (10 trees):\n", classification_report(y_test, y_pred10))

# Quick counts
correct10 = cm10.values.trace()
incorrect10 = cm10.values.sum() - correct10
print(f"Correct: {correct10} | Incorrect: {incorrect10}")

Test Accuracy (10 trees): 0.8333333333333334

Classification Report (10 trees):
               precision    recall  f1-score   support

    fracture       0.81      0.87      0.84        15
 no fracture       0.86      0.80      0.83        15

    accuracy                           0.83        30
   macro avg       0.83      0.83      0.83        30
weighted avg       0.83      0.83      0.83        30

Correct: 25 | Incorrect: 5

# Train RF with 20 trees
rf20 = RandomForestClassifier(n_estimators=20, random_state=52)
rf20.fit(X_train, y_train)

# Predict on test set
y_pred20 = rf20.predict(X_test)

# Confusion matrix
cm20 = pd.crosstab(y_test, y_pred20, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(cm20, annot=True, fmt="d", cmap="Greens")
plt.title("Random Forest — Test Set (n_estimators=20)")
plt.show()

# Accuracy + classification report
acc20 = accuracy_score(y_test, y_pred20)
print("Test Accuracy (20 trees):", acc20)
print("\nClassification Report (20 trees):\n", classification_report(y_test, y_pred20))

# Quick counts
correct20 = cm20.values.trace()
incorrect20 = cm20.values.sum() - correct20
print(f"Correct: {correct20} | Incorrect: {incorrect20}")

# Feature importance (from the 20-tree model)
importances = pd.Series(rf20.feature_importances_, index=X_train.columns).sort_values(ascending=False)
ax = importances.plot(kind="bar")
plt.title("Random Forest Feature Importances (n_estimators=20)")
plt.ylabel("Importance")
plt.show()

# Summary
print("\n Random Forest Summary")
print(f"10 trees -> accuracy: {acc10:.2f} | correct={correct10} | incorrect={incorrect10}")
print(f"20 trees -> accuracy: {acc20:.2f} | correct={correct20} | incorrect={incorrect20}")

Test Accuracy (20 trees): 0.8333333333333334

Classification Report (20 trees):
               precision    recall  f1-score   support

    fracture       0.81      0.87      0.84        15
 no fracture       0.86      0.80      0.83        15

    accuracy                           0.83        30
   macro avg       0.83      0.83      0.83        30
weighted avg       0.83      0.83      0.83        30

Correct: 25 | Incorrect: 5

 Random Forest Summary
10 trees -> accuracy: 0.83 | correct=25 | incorrect=5
20 trees -> accuracy: 0.83 | correct=25 | incorrect=5

acc_rf10       = acc10
correct_rf10   = correct10
incorrect_rf10 = incorrect10
cm_rf10        = cm10.copy()

# Comparison metrics for Random Forest (20 trees)
acc_rf20       = acc20
correct_rf20   = correct20
incorrect_rf20 = incorrect20
cm_rf20        = cm20.copy()

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import svm

# Load
df = pd.read_csv(CSV_PATH)
print("Raw shape:", df.shape)
display(df.head())
print("\nMissing values:\n", df.isna().sum())

# Normalise numeric features
num_cols = ["age", "weight_kg", "height_cm", "bmd"]
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Balance (downsample majority to match minority)
df_majority = df[df["fracture"] == "no fracture"]
df_minority = df[df["fracture"] == "fracture"]

df_majority_down = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=96
)
df_bal = pd.concat([df_majority_down, df_minority]).reset_index(drop=True)

print("\nBalanced class distribution:\n", df_bal["fracture"].value_counts())

# Final features/target
X = df_bal[["age", "weight_kg", "height_cm", "bmd"]].copy()
y = df_bal["fracture"].copy()

# Train/Test split (hold-out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
print("Train size:", X_train.shape[0], " | Test size:", X_test.shape[0])

Raw shape: (169, 9)

Missing values:
 id              0
age             0
sex             0
fracture        0
weight_kg       0
height_cm       0
medication      0
waiting_time    0
bmd             0
dtype: int64

Balanced class distribution:
 fracture
no fracture    50
fracture       50
Name: count, dtype: int64
Train size: 70  | Test size: 30

# Linear SVM
linear_svm = svm.SVC(kernel="linear", C=1.0)
linear_svm.fit(X_train, y_train)

y_pred_lin = linear_svm.predict(X_test)

cm_lin = pd.crosstab(y_test, y_pred_lin, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(cm_lin, annot=True, fmt="d", cmap="Blues")
plt.title("SVM (Linear) — Test Set")
plt.show()

acc_lin = accuracy_score(y_test, y_pred_lin)
print("Test Accuracy (Linear):", acc_lin)
print("\nClassification Report (Linear):\n", classification_report(y_test, y_pred_lin))

# RBF SVM
rbf_svm = svm.SVC(kernel="rbf", gamma=0.7, C=1.0)
rbf_svm.fit(X_train, y_train)

y_pred_rbf = rbf_svm.predict(X_test)

cm_rbf = pd.crosstab(y_test, y_pred_rbf, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(cm_rbf, annot=True, fmt="d", cmap="Greens")
plt.title("SVM (RBF) — Test Set")
plt.show()

acc_rbf = accuracy_score(y_test, y_pred_rbf)
print("Test Accuracy (RBF):", acc_rbf)
print("\nClassification Report (RBF):\n", classification_report(y_test, y_pred_rbf))

# Summary
correct_lin = cm_lin.values.trace(); incorrect_lin = cm_lin.values.sum() - correct_lin
correct_rbf = cm_rbf.values.trace(); incorrect_rbf = cm_rbf.values.sum() - correct_rbf

print("\n=== SVM (hold-out test) Summary ===")
print(f"Linear -> accuracy: {acc_lin:.2f} | correct={correct_lin} | incorrect={incorrect_lin}")
print(f"RBF    -> accuracy: {acc_rbf:.2f} | correct={correct_rbf} | incorrect={incorrect_rbf}")

Test Accuracy (Linear): 0.7666666666666667

Classification Report (Linear):
               precision    recall  f1-score   support

    fracture       0.83      0.67      0.74        15
 no fracture       0.72      0.87      0.79        15

    accuracy                           0.77        30
   macro avg       0.78      0.77      0.76        30
weighted avg       0.78      0.77      0.76        30

Test Accuracy (RBF): 0.7666666666666667

Classification Report (RBF):
               precision    recall  f1-score   support

    fracture       0.83      0.67      0.74        15
 no fracture       0.72      0.87      0.79        15

    accuracy                           0.77        30
   macro avg       0.78      0.77      0.76        30
weighted avg       0.78      0.77      0.76        30


=== SVM (hold-out test) Summary ===
Linear -> accuracy: 0.77 | correct=23 | incorrect=7
RBF    -> accuracy: 0.77 | correct=23 | incorrect=7

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Grid (kept small enough to run fast but still useful)
param_grid = {
    "C": [0.1, 1, 3, 10, 30, 100],
    "gamma": ["scale", 0.01, 0.03, 0.1, 0.3, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rbf = svm.SVC(kernel="rbf")

grid = GridSearchCV(
    estimator=rbf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=0,
    refit=True,
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

# Evaluate best model on hold-out test set
best_rbf = grid.best_estimator_
y_pred_gs = best_rbf.predict(X_test)

cm = pd.crosstab(y_test, y_pred_gs, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(cm, annot=True, fmt="d", cmap="Purples")
plt.title("SVM (RBF) — Best GridSearchCV Model (Test Set)")
plt.show()

acc = accuracy_score(y_test, y_pred_gs)
print("Test Accuracy (best RBF):", acc)
print("\nClassification Report (best RBF):\n", classification_report(y_test, y_pred_gs))

Best params: {'C': 10, 'gamma': 0.3}
Best CV accuracy: 0.8428571428571429

Test Accuracy (best RBF): 0.7666666666666667

Classification Report (best RBF):
               precision    recall  f1-score   support

    fracture       0.83      0.67      0.74        15
 no fracture       0.72      0.87      0.79        15

    accuracy                           0.77        30
   macro avg       0.78      0.77      0.76        30
weighted avg       0.78      0.77      0.76        30

# Build a pivot table for mean CV accuracy (only numeric gamma values)
cv_results = pd.DataFrame(grid.cv_results_)
plot_df = cv_results[cv_results["param_gamma"] != "scale"].copy()
plot_df["C"] = plot_df["param_C"].astype(float)
plot_df["gamma"] = plot_df["param_gamma"].astype(float)

pivot = plot_df.pivot_table(
    index="gamma", columns="C", values="mean_test_score"
).sort_index(ascending=True)

sns.heatmap(pivot, annot=True, fmt=".3f", cmap="viridis")
plt.title("GridSearchCV mean CV accuracy (RBF SVM)")
plt.ylabel("gamma")
plt.xlabel("C")
plt.show()

# Comparison metrics for SVM (Linear)
cm_svm_lin       = cm_lin.copy()
acc_svm_lin      = acc_lin
correct_svm_lin  = correct_lin
incorrect_svm_lin= incorrect_lin

# Comparison metrics for SVM (RBF)
cm_svm_rbf       = cm_rbf.copy()
acc_svm_rbf      = acc_rbf
correct_svm_rbf  = correct_rbf
incorrect_svm_rbf= incorrect_rbf

# Comparison metrics for SVM (Best RBF GridSearchCV)
cm_svm_gs        = cm.copy()
acc_svm_gs       = acc
correct_svm_gs   = cm_svm_gs.values.trace()
incorrect_svm_gs = cm_svm_gs.values.sum() - correct_svm_gs

# Final Model Comparison
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rows = []

def add_row(name, acc_var, cm_var=None, correct_var=None, incorrect_var=None):
    """Add a model row if its accuracy variable exists."""
    try:
        acc = globals()[acc_var]
    except KeyError:
        return  # model not run / variable not defined -> skip

    # correct/incorrect: prefer provided vars, else compute from cm if available
    if correct_var and incorrect_var and correct_var in globals() and incorrect_var in globals():
        correct = globals()[correct_var]
        incorrect = globals()[incorrect_var]
    elif cm_var and cm_var in globals():
        cm = globals()[cm_var]
        correct = cm.values.trace()
        incorrect = cm.values.sum() - correct
    else:
        correct = None
        incorrect = None

    rows.append({
        "Model": name,
        "Accuracy": float(acc),
        "Correct": correct,
        "Incorrect": incorrect
    })

# KNN (k=7) — from your KNN block
add_row("KNN (k=7)", "acc_knn", cm_var="cm_knn", correct_var="correct_knn", incorrect_var="incorrect_knn")

# Logistic Regression
add_row("Logistic Regression (L1, liblinear)", "acc_lr", cm_var="cm_lr", correct_var="correct_lr", incorrect_var="incorrect_lr")

# Random Forest
add_row("Random Forest (10 trees)", "acc_rf10", cm_var="cm_rf10", correct_var="correct_rf10", incorrect_var="incorrect_rf10")
add_row("Random Forest (20 trees)", "acc_rf20", cm_var="cm_rf20", correct_var="correct_rf20", incorrect_var="incorrect_rf20")

# SVM
add_row("SVM (Linear, C=1)", "acc_svm_lin", cm_var="cm_svm_lin", correct_var="correct_svm_lin", incorrect_var="incorrect_svm_lin")
add_row("SVM (RBF, C=1, γ=0.7)", "acc_svm_rbf", cm_var="cm_svm_rbf", correct_var="correct_svm_rbf", incorrect_var="incorrect_svm_rbf")
add_row("SVM (RBF, GridSearch best)", "acc_svm_gs", cm_var="cm_svm_gs", correct_var="correct_svm_gs", incorrect_var="incorrect_svm_gs")

# Build & show table
df_results = pd.DataFrame(rows)
if df_results.empty:
    raise RuntimeError("No model metrics found. Make sure you ran the model cells and the 'save comparison metrics' lines.")

df_results = df_results.sort_values("Accuracy", ascending=False).reset_index(drop=True)
print("Model Comparison")
display(df_results)

# Plot
plt.figure(figsize=(8, 4.5))
sns.barplot(data=df_results, x="Accuracy", y="Model")
plt.title("Model Accuracy Comparison")
plt.xlim(0, 1)
plt.xlabel("Accuracy")
plt.ylabel("")
plt.show()

Model Comparison

!pip -q install nbconvert

!jupyter nbconvert --to html "/content/drive/MyDrive/Predicting-Fracture-Risk-in-Osteoporosis-Patients/fracture_risk_hamza.ipynb" --output osteoporosis.html

from google.colab import files
files.download('osteoporosis.html')

[NbConvertApp] Converting notebook /content/drive/MyDrive/Predicting-Fracture-Risk-in-Osteoporosis-Patients/fracture_risk_hamza.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 20 image(s).
[NbConvertApp] Writing 1255887 bytes to /content/drive/MyDrive/Predicting-Fracture-Risk-in-Osteoporosis-Patients/fracture_risk_hamza.html

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
/tmp/ipython-input-1507752353.py in <cell line: 0>()
      4 
      5 from google.colab import files
----> 6 files.download('osteoporosis.html')

/usr/local/lib/python3.11/dist-packages/google/colab/files.py in download(filename)
    231   if not _os.path.exists(filename):
    232     msg = 'Cannot find file: {}'.format(filename)
--> 233     raise FileNotFoundError(msg)  # pylint: disable=undefined-variable
    234 
    235   comm_manager = _IPython.get_ipython().kernel.comm_manager

FileNotFoundError: Cannot find file: osteoporosis.html

	id	age	sex	fracture	weight_kg	height_cm	medication	waiting_time	bmd
0	469	57.052768	F	no fracture	64.0	155.5	Anticonvulsant	18	0.8793
1	8724	75.741225	F	no fracture	78.0	162.0	No medication	56	0.7946
2	6736	70.778900	M	no fracture	73.0	170.5	No medication	10	0.9067
3	24180	78.247175	F	no fracture	60.0	148.0	No medication	14	0.7112
4	17072	54.191877	M	no fracture	55.0	161.0	No medication	20	0.7909

	age	sex	fracture	weight_kg	height_cm	medication	waiting_time	bmd
0	57.052768	F	no fracture	64.0	155.5	Anticonvulsant	18	0.8793
1	75.741225	F	no fracture	78.0	162.0	No medication	56	0.7946
2	70.778900	M	no fracture	73.0	170.5	No medication	10	0.9067
3	78.247175	F	no fracture	60.0	148.0	No medication	14	0.7112
4	54.191877	M	no fracture	55.0	161.0	No medication	20	0.7909

	age	sex	fracture	weight_kg	height_cm	medication	waiting_time	bmd
0	0.401187	F	no fracture	0.466667	0.385714	Anticonvulsant	18	0.494030
1	0.754200	F	no fracture	0.700000	0.571429	No medication	56	0.405320
2	0.660465	M	no fracture	0.616667	0.814286	No medication	10	0.522727
3	0.801536	F	no fracture	0.400000	0.171429	No medication	14	0.317972
4	0.347146	M	no fracture	0.316667	0.542857	No medication	20	0.401445

	age	sex	fracture	weight_kg	height_cm	medication	waiting_time	bmd
0	0.298122	F	no fracture	0.300000	0.342857	No medication	13	0.385316
1	0.314778	F	no fracture	0.383333	0.242857	No medication	6	0.339129
2	0.706773	F	no fracture	0.266667	0.314286	No medication	89	0.319648
3	0.775407	M	no fracture	0.866667	0.714286	No medication	8	0.622015
4	0.461327	F	no fracture	0.566667	0.457143	Glucocorticoids	8	0.448890

	id	age	sex	fracture	weight_kg	height_cm	medication	waiting_time	bmd
0	469	57.052768	F	no fracture	64.0	155.5	Anticonvulsant	18	0.8793
1	8724	75.741225	F	no fracture	78.0	162.0	No medication	56	0.7946
2	6736	70.778900	M	no fracture	73.0	170.5	No medication	10	0.9067
3	24180	78.247175	F	no fracture	60.0	148.0	No medication	14	0.7112
4	17072	54.191877	M	no fracture	55.0	161.0	No medication	20	0.7909

Introduction¶

Logistic Regression¶

Random Forest¶

SVM¶

Model Performance Summary¶

Key Insights¶

	id	age	sex	fracture	weight_kg	height_cm	medication	waiting_time	bmd
0	8896	0.298122	F	no fracture	0.300000	0.342857	No medication	13	0.385316
1	23790	0.314778	F	no fracture	0.383333	0.242857	No medication	6	0.339129
2	55	0.706773	F	no fracture	0.266667	0.314286	No medication	89	0.319648
3	5509	0.775407	M	no fracture	0.866667	0.714286	No medication	8	0.622015
4	8999	0.461327	F	no fracture	0.566667	0.457143	Glucocorticoids	8	0.448890

	Model	Accuracy	Correct	Incorrect
0	KNN (k=7)	0.880000	88	12
1	Random Forest (10 trees)	0.833333	25	5
2	Random Forest (20 trees)	0.833333	25	5
3	Logistic Regression (L1, liblinear)	0.766667	23	7
4	SVM (Linear, C=1)	0.766667	23	7
5	SVM (RBF, C=1, γ=0.7)	0.766667	23	7
6	SVM (RBF, GridSearch best)	0.766667	23	7