Capstone Project: Responsible AI Audit System
The capstone of this course is to build a Responsible AI Audit System: a tool that evaluates any ML model or AI application across six dimensions (fairness, robustness, interpretability, privacy, safety, and EU AI Act compliance) and generates a detailed audit report with actionable recommendations.
Responsible AI Audit Framework
import json
from datetime import datetime

import numpy as np
from openai import OpenAI

client = OpenAI()

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# RESPONSIBLE AI AUDIT DIMENSIONS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
AUDIT_FRAMEWORK = {
    "fairness": {
        "checks": ["Demographic parity", "Equal opportunity", "Calibration", "Individual fairness"],
        "threshold": 0.8,  # disparate impact >= 0.8 (four-fifths rule)
    },
    "robustness": {
        "checks": ["Adversarial examples", "Out-of-distribution data", "Missing value handling", "Noise sensitivity"],
        "threshold": 0.9,  # accuracy under attack >= 90% of clean accuracy
    },
    "interpretability": {
        "checks": ["SHAP explanations available", "Feature importance documented", "Decision boundary interpretable", "Local explanations for edge cases"],
    },
    "privacy": {
        "checks": ["PII not in training data", "Differential privacy applied", "GDPR compliance", "Data minimization"],
    },
    "safety": {
        "checks": ["Input validation", "Output filtering", "Rate limiting", "Abuse monitoring", "Incident response plan"],
    },
    "eu_ai_act": {
        "checks": ["Risk classification done", "Technical documentation", "Human oversight", "Logging enabled", "Bias testing done"],
        "risk_level": "high",  # set based on the use case
    },
}
class ResponsibleAIAuditor:
    '''Comprehensive AI system audit tool.'''

    def __init__(self, system_name: str, system_description: str):
        self.system_name = system_name
        self.system_description = system_description
        self.audit_results = {}
        self.start_time = datetime.now()

    def audit_dimension(self, dimension: str, evidence: dict) -> dict:
        '''Use GPT-4o to evaluate a specific audit dimension.'''
        framework = AUDIT_FRAMEWORK[dimension]
        checks_str = "\n".join(f"  - {c}" for c in framework["checks"])
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f'''You are a Responsible AI auditor evaluating an AI system.

System name: {self.system_name}
System description: {self.system_description}

Audit dimension: {dimension.upper()}

Checks to evaluate:
{checks_str}

Evidence provided:
{json.dumps(evidence, indent=2)}

Evaluate each check as PASS/FAIL/PARTIAL with a brief reason.
Calculate an overall score (0-100).
Provide specific, actionable recommendations.

Respond with JSON:
{{
  "checks": {{"check_name": {{"status": "PASS|FAIL|PARTIAL", "reason": "..."}}, ...}},
  "overall_score": 0-100,
  "critical_issues": ["..."],
  "recommendations": ["..."]
}}'''
            }],
            response_format={"type": "json_object"},
            temperature=0,
        )
        return json.loads(response.choices[0].message.content)
    def generate_report(self, evidence_per_dimension: dict) -> dict:
        '''Run the full audit across all dimensions.'''
        print(f"Starting Responsible AI Audit for: {self.system_name}")
        print("=" * 60)

        for dimension in AUDIT_FRAMEWORK:
            print(f"Auditing {dimension}...")
            evidence = evidence_per_dimension.get(dimension, {})
            self.audit_results[dimension] = self.audit_dimension(dimension, evidence)

        # Calculate the overall score as the mean across dimensions
        scores = [r.get("overall_score", 50) for r in self.audit_results.values()]
        overall_score = float(np.mean(scores))

        # Determine the deployment risk level
        if overall_score >= 80:
            risk_level = "LOW -- Suitable for deployment"
        elif overall_score >= 60:
            risk_level = "MEDIUM -- Address critical issues before deployment"
        else:
            risk_level = "HIGH -- Significant issues, deployment not recommended"

        report = {
            "system": self.system_name,
            "audit_date": self.start_time.isoformat(),
            "overall_score": round(overall_score, 1),
            "risk_level": risk_level,
            "dimension_scores": {d: r.get("overall_score", 0) for d, r in self.audit_results.items()},
            "critical_issues": [item for r in self.audit_results.values() for item in r.get("critical_issues", [])],
            "recommendations": [item for r in self.audit_results.values() for item in r.get("recommendations", [])],
            "full_results": self.audit_results,
        }

        # Print a summary
        print(f"\n{'=' * 60}")
        print(f"AUDIT COMPLETE: {self.system_name}")
        print(f"Overall Score: {overall_score:.1f}/100")
        print(f"Risk Level: {risk_level}")
        print("\nDimension Scores:")
        for dim, score in report["dimension_scores"].items():
            bar = "=" * int(score / 5)
            print(f"  {dim:20s}: {score:3.0f}/100 [{bar:20s}]")
        return report
# Example usage:
auditor = ResponsibleAIAuditor(
    system_name="Loan Approval AI System v2.3",
    system_description="ML model that predicts creditworthiness for personal loan applications at a UK bank",
)

evidence = {
    "fairness": {
        "disparate_impact_gender": 0.76,  # BELOW the 0.8 threshold -- FAIL
        "disparate_impact_race": 0.88,    # above the threshold -- PASS
        "equal_opportunity_gap": 0.08,
        "calibration_tested": True,
    },
    "eu_ai_act": {
        "risk_classification": "High risk (credit scoring -- Annex III)",
        "technical_documentation": "Partially complete, missing bias testing section",
        "human_override": True,
        "logging": "All decisions logged with reason codes",
        "registered_in_eu_database": False,
    },
    "privacy": {
        "sensitive_attributes_in_training": "Gender, age (via DOB) -- removed in v2.3",
        "gdpr_dpia_completed": True,
        "data_retention_policy": "3 years",
    },
}

report = auditor.generate_report(evidence)

print("\nCritical Issues Found:")
for issue in report["critical_issues"][:5]:
    print(f"  * {issue}")
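The fairness evidence above assumes metrics such as disparate impact have already been measured. A minimal sketch of how one of those numbers could be computed from model outputs (the prediction and group arrays here are hypothetical):

import numpy as np

# Hypothetical outcomes: 1 = approved. Group 0 is the unprivileged group,
# group 1 the privileged group; real data would come from your evaluation set.
predictions = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])
group = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

# Disparate impact = selection rate of the unprivileged group
# divided by the selection rate of the privileged group.
rate_unprivileged = predictions[group == 0].mean()
rate_privileged = predictions[group == 1].mean()
disparate_impact = rate_unprivileged / rate_privileged
print(f"Disparate impact: {disparate_impact:.2f}")  # flag for review if < 0.8

Under the four-fifths rule, a ratio below 0.8 (like the 0.76 reported for gender in the evidence) signals potential adverse impact; that is what the fairness threshold in AUDIT_FRAMEWORK encodes.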
Tip
Practice the Responsible AI Audit System in small, isolated experiments before integrating it into larger projects; breaking concepts into small experiments builds genuine understanding faster than reading alone. Prompt quality matters just as much here: structure, examples, and explicit constraints all improve the model's audit output.
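One way to start small is to exercise a single audit dimension on its own before wiring up the full report. A minimal sketch reusing the classes above (the system details and the evidence value are made up):

mini_auditor = ResponsibleAIAuditor(
    system_name="Toy Churn Model",
    system_description="Gradient-boosted classifier predicting customer churn",
)
# Audit only the fairness dimension with one piece of evidence.
result = mini_auditor.audit_dimension("fairness", {"disparate_impact_gender": 0.92})
print(result.get("overall_score"), result.get("critical_issues", []))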
Practice Task
(1) Write a working version of the Responsible AI Audit System from scratch without looking at your notes. (2) Modify it to handle an edge case (an empty input, a null value, or an error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with a system like this is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
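As a starting point for that kind of validation, here is a minimal sketch of guard clauses around the auditor (the wrapper name and its behaviour for empty evidence are assumptions, not part of the original design):

# Hypothetical guard-clause wrapper around audit_dimension (illustrative only).
def audit_dimension_safely(auditor: ResponsibleAIAuditor, dimension: str, evidence) -> dict:
    if dimension not in AUDIT_FRAMEWORK:
        raise ValueError(f"Unknown audit dimension: {dimension!r}")
    if not isinstance(evidence, dict):
        raise TypeError(f"evidence must be a dict, got {type(evidence).__name__}")
    if not evidence:
        # Empty evidence is allowed, but the score will reflect the missing data.
        print(f"Warning: no evidence supplied for '{dimension}'")
    return auditor.audit_dimension(dimension, evidence)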