## Solution to Assignment 5

In [1]:
%pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.25-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->pgmpy)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->pgmpy)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->pgmpy)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->pgmpy)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->pgmpy)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->pgmpy)
  Usi

In [2]:
# Q1) Build your Bayesian Network using pgmpy as shown in class. Make use of the Variable Elimination method.

from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Network structure: each tuple is a directed edge (parent, child).
model_diabetes = BayesianNetwork(
    [
        # Risk factors that are parents of Diabetes
        ('Age', 'Diabetes'),
        ('Lifestyle', 'Diabetes'),
        ('FamilyHistory', 'Diabetes'),
        # Children of Diabetes
        ('Diabetes', 'BloodTests'),
        ('Diabetes', 'CardioRisk'),
        # Remaining parents of CardioRisk
        ('Cholesterol', 'CardioRisk'),
        ('Hypertension', 'CardioRisk'),
    ]
)
# Define the Conditional Probability Distributions (CPDs)

# --- Prior distributions of the root nodes (no parents) ---
# Each `values` entry is a single-element row: one probability per state,
# listed in the same order as the matching `state_names` entry.
cpd_age = TabularCPD(
    variable='Age',
    variable_card=3,
    values=[[0.4], [0.4], [0.2]],
    state_names={'Age': ['Young', 'Middle-aged', 'Elderly']},
)

cpd_lifestyle = TabularCPD(
    variable='Lifestyle',
    variable_card=2,
    values=[[0.7], [0.3]],
    state_names={'Lifestyle': ['Unhealthy', 'Healthy']},
)

cpd_familyhistory = TabularCPD(
    variable='FamilyHistory',
    variable_card=2,
    values=[[0.85], [0.15]],
    state_names={'FamilyHistory': ['Absent', 'Present']},
)

cpd_cholesterol = TabularCPD(
    variable='Cholesterol',
    variable_card=2,
    values=[[0.5], [0.5]],
    state_names={'Cholesterol': ['Normal', 'High']},
)

cpd_hypertension = TabularCPD(
    variable='Hypertension',
    variable_card=2,
    values=[[0.6], [0.4]],
    state_names={'Hypertension': ['No', 'Yes']},
)


# P(Diabetes | Age, Lifestyle, FamilyHistory).
# The 12 columns enumerate the parent states with the LAST evidence variable
# (FamilyHistory) changing fastest and the FIRST (Age) changing slowest, i.e.
# within every Age group the columns are:
#   (Unhealthy, Absent), (Unhealthy, Present), (Healthy, Absent), (Healthy, Present)
cpd_diabetes = TabularCPD(
    variable='Diabetes',
    variable_card=2,
    evidence=['Age', 'Lifestyle', 'FamilyHistory'],
    evidence_card=[3, 2, 2],
    values=[
        # Young (4 cols)        Middle-aged (4 cols)    Elderly (4 cols)
        [0.97, 0.93, 0.99, 0.95, 0.81, 0.77, 0.88, 0.81, 0.65, 0.61, 0.78, 0.70],  # Diabetes = No
        [0.03, 0.07, 0.01, 0.05, 0.19, 0.23, 0.12, 0.19, 0.35, 0.39, 0.22, 0.30],  # Diabetes = Yes
    ],
    state_names={
        'Diabetes': ['No', 'Yes'],
        'Age': ['Young', 'Middle-aged', 'Elderly'],
        'Lifestyle': ['Unhealthy', 'Healthy'],
        'FamilyHistory': ['Absent', 'Present'],
    },
)

# P(BloodTests | Diabetes): one column per Diabetes state, in the order (No, Yes).
cpd_bloodtests = TabularCPD(
    variable='BloodTests',
    variable_card=2,
    evidence=['Diabetes'],
    evidence_card=[2],
    values=[
        [0.7, 0.3],  # BloodTests = Normal
        [0.3, 0.7],  # BloodTests = Abnormal
    ],
    state_names={
        'BloodTests': ['Normal', 'Abnormal'],
        'Diabetes': ['No', 'Yes'],
    },
)


# P(CardioRisk | Diabetes, Cholesterol, Hypertension).
# The 8 columns enumerate the parent states with Hypertension changing fastest,
# then Cholesterol, then Diabetes: the first four columns are Diabetes = No,
# the last four are Diabetes = Yes.
cpd_cardiorisk = TabularCPD(
    variable='CardioRisk',
    variable_card=2,
    evidence=['Diabetes', 'Cholesterol', 'Hypertension'],
    evidence_card=[2, 2, 2],
    values=[
        # Diabetes = No        Diabetes = Yes
        [0.9, 0.7, 0.8, 0.6, 0.85, 0.65, 0.75, 0.55],  # CardioRisk = Low
        [0.1, 0.3, 0.2, 0.4, 0.15, 0.35, 0.25, 0.45],  # CardioRisk = High
    ],
    state_names={
        'CardioRisk': ['Low', 'High'],
        'Diabetes': ['No', 'Yes'],
        'Cholesterol': ['Normal', 'High'],
        'Hypertension': ['No', 'Yes'],
    },
)


# Attach every CPD to the network: the five root priors first, then the
# three conditional tables.
model_diabetes.add_cpds(
    cpd_age,
    cpd_lifestyle,
    cpd_familyhistory,
    cpd_cholesterol,
    cpd_hypertension,
    cpd_diabetes,
    cpd_bloodtests,
    cpd_cardiorisk,
)

# Stop here if pgmpy's consistency check on the model does not pass.
assert model_diabetes.check_model()


In [3]:
# Build the Variable Elimination inference object over the validated network.
# The query cells that follow all reuse this single `infer` instance.
infer = VariableElimination(model_diabetes)


In [5]:
# Q2a) What is the probability of diabetes given being elderly, unhealthy lifestyle, and with family history present?
# All three parents of Diabetes are observed, so the posterior equals the
# matching column of cpd_diabetes.
query_result = infer.query(
    variables=['Diabetes'],
    evidence={
        'Age': 'Elderly',
        'Lifestyle': 'Unhealthy',
        'FamilyHistory': 'Present',
    },
)
print(query_result)

+---------------+-----------------+
| Diabetes      |   phi(Diabetes) |
| Diabetes(No)  |          0.6100 |
+---------------+-----------------+
| Diabetes(Yes) |          0.3900 |
+---------------+-----------------+


In [6]:
# Q2b)  What is the probability of diabetes given being elderly, unhealthy lifestyle, family history present and cardio risk being high?
# Same evidence as Q2a plus an observation on CardioRisk, a child of Diabetes.
query_result = infer.query(
    variables=['Diabetes'],
    evidence={
        'Age': 'Elderly',
        'Lifestyle': 'Unhealthy',
        'FamilyHistory': 'Present',
        'CardioRisk': 'High',
    },
)
print(query_result)

+---------------+-----------------+
| Diabetes      |   phi(Diabetes) |
| Diabetes(No)  |          0.5623 |
+---------------+-----------------+
| Diabetes(Yes) |          0.4377 |
+---------------+-----------------+


In [7]:
# Q2c) What is the probability of diabetes given being elderly, unhealthy lifestyle, family history present, cardio risk being high and knowing that cholesterol is high and hypertension is present?
# Same evidence as Q2b plus the two other parents of CardioRisk
# (Cholesterol and Hypertension).
query_result = infer.query(
    variables=['Diabetes'],
    evidence={
        'Age': 'Elderly',
        'Lifestyle': 'Unhealthy',
        'FamilyHistory': 'Present',
        'CardioRisk': 'High',
        'Cholesterol': 'High',
        'Hypertension': 'Yes',
    },
)
print(query_result)


+---------------+-----------------+
| Diabetes      |   phi(Diabetes) |
| Diabetes(No)  |          0.5816 |
+---------------+-----------------+
| Diabetes(Yes) |          0.4184 |
+---------------+-----------------+


In [8]:
# Bonus1) What is the probability of diabetes given being elderly, unhealthy lifestyle, family history present, knowing that cholesterol is high and hypertension is present? Does it change compared to Question 2c?
# Same evidence as Q2c except that CardioRisk is NOT observed here.
query_result = infer.query(
    variables=['Diabetes'],
    evidence={
        'Age': 'Elderly',
        'Lifestyle': 'Unhealthy',
        'FamilyHistory': 'Present',
        'Cholesterol': 'High',
        'Hypertension': 'Yes',
    },
)
print(query_result)


+---------------+-----------------+
| Diabetes      |   phi(Diabetes) |
| Diabetes(No)  |          0.6100 |
+---------------+-----------------+
| Diabetes(Yes) |          0.3900 |
+---------------+-----------------+


In [None]:
# Bonus 2) Is the probability of Blood Tests conditioned by Hypertension? Please motivate your answer.
# First half of the comparison: distribution of BloodTests when Hypertension is 'Yes'.
query_result = infer.query(
    variables=['BloodTests'],
    evidence={'Hypertension': 'Yes'},
)
print(query_result)

+----------------------+-------------------+
| BloodTests           |   phi(BloodTests) |
| BloodTests(Normal)   |            0.6415 |
+----------------------+-------------------+
| BloodTests(Abnormal) |            0.3585 |
+----------------------+-------------------+


In [9]:
# Second half of the Bonus 2 comparison: distribution of BloodTests when
# Hypertension is 'No'.
query_result = infer.query(
    variables=['BloodTests'],
    evidence={'Hypertension': 'No'},
)
print(query_result)

+----------------------+-------------------+
| BloodTests           |   phi(BloodTests) |
| BloodTests(Normal)   |            0.6415 |
+----------------------+-------------------+
| BloodTests(Abnormal) |            0.3585 |
+----------------------+-------------------+


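We see that the results of blood tests are independent of hypertension: the distribution of BloodTests is identical (Normal 0.6415, Abnormal 0.3585) whether Hypertension is 'Yes' or 'No'. This follows from the network structure: the only path between the two variables is BloodTests ← Diabetes → CardioRisk ← Hypertension, and it is blocked at the collider CardioRisk as long as CardioRisk (which has no descendants) is not observed. If CardioRisk were given as evidence, the path would open and BloodTests would become dependent on Hypertension.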

In [10]:
# Bonus 3) What sanity checks can you perform to ensure that your network is properly encoded? Please expand upon and justify your strategies.