Skip to main content
CloudExplain

Adult Dataset Example

Learn how to use CloudExplain with the famous Adult dataset to predict income levels and gain deep insights into what factors drive salary predictions.

Income Prediction · Binary Classification · Feature Importance

What You'll Learn

Data Preprocessing

Handle categorical variables, missing values, and feature encoding for the Adult dataset

Model Training

Build a Random Forest classifier to predict income levels based on demographic features

Explainability

Generate comprehensive explanations and understand which factors most influence income predictions

About the Adult Dataset

The Adult dataset, also known as the "Census Income" dataset, is a classic machine learning benchmark.

Dataset Details

  • Task: Binary classification (income > $50K or ≤ $50K)
  • Features: 14 demographic and employment attributes
  • Samples: ~48,000 individuals from the 1994 US Census
  • Source: UCI Machine Learning Repository

Key Features

  • Age, Education: Basic demographics
  • Workclass, Occupation: Employment details
  • Marital Status, Relationship: Family situation
  • Race, Sex, Country: Additional demographics
  • Capital Gain/Loss: Financial information

Why This Example is Perfect for Explainable AI

Socially Relevant

Income prediction has real-world implications for fairness and bias detection. Understanding which factors influence predictions is crucial for ethical AI.

Mixed Data Types

Combines numerical (age, hours worked) and categorical features (occupation, education), demonstrating how CloudExplain handles diverse feature types.

Interpretable Results

Results are easy to understand and validate against domain knowledge, making it perfect for learning explainable AI concepts.

Bias Detection

Ideal for exploring potential biases related to gender, race, and other sensitive attributes. CloudExplain's bias detection features shine with this dataset.

Complete Example Code

Copy and run this complete example to see CloudExplain in action with the Adult dataset

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import cloudexplain
import datetime

# Fetch the Adult (Census Income) data from the copy kept in the SHAP
# GitHub repository's data folder.
github_data_url = "https://github.com/shap/shap/raw/master/data/"

# (column name, pandas dtype) pairs, in the order the columns appear in the
# header-less CSV file.
dtypes = [
    ("Age", "float32"),
    ("Workclass", "category"),
    ("fnlwgt", "float32"),
    ("Education", "category"),
    ("Education-Num", "float32"),
    ("Marital Status", "category"),
    ("Occupation", "category"),
    ("Relationship", "category"),
    ("Race", "category"),
    ("Sex", "category"),
    ("Capital Gain", "float32"),
    ("Capital Loss", "float32"),
    ("Hours per week", "float32"),
    ("Country", "category"),
    ("Target", "category"),
]
column_names = [name for name, _ in dtypes]

# Read the file, supplying the column names and per-column dtypes ourselves
# because the CSV has no header row.
raw_data = pd.read_csv(
    github_data_url + "adult.data",
    names=column_names,
    na_values="?",
    dtype=dict(dtypes),
)

# Clean and prepare the data
data = raw_data.drop(["Education"], axis=1)  # redundant with Education-Num
# Note the leading space in the label being matched.
data["Target"] = data["Target"] == " >50K"  # Convert to binary

# Encode categorical variables
columns_to_encode = [
    'Workclass', 'Marital Status', 'Occupation',
    'Relationship', 'Race', 'Sex', 'Country'
]

# Fit every encoder on the FULL dataset, *before* sampling. LabelEncoder
# numbers categories by their sorted position among the values it sees, so
# fitting on a random subset that happens to miss a rare category would shift
# all later codes and silently misalign them with the fixed integer -> label
# tables declared in `feature_descriptions`.
for col in columns_to_encode:
    le = LabelEncoder()
    data[col + '_encoded'] = le.fit_transform(data[col].astype(str))

# Sample a subset for faster processing. The seed keeps the example
# reproducible, in line with the random_state used for the split and model.
data = data.sample(1000, random_state=42)

# Human-readable metadata for every model input, passed to CloudExplain so the
# explanations can show friendly names instead of raw column names and codes.


def _numeric_feature(description, display_description):
    """Return the metadata entry for a numeric (non-categorical) feature."""
    return {
        "description": description,
        "display_description": display_description,
        "is_categorical": False,
    }


def _categorical_feature(description, display_description, labels):
    """Return the metadata entry for an integer-encoded categorical feature.

    ``labels[i]`` is the display label for integer code ``i``; the list is
    turned into the ``{code: label}`` mapping stored under ``"encoding"``.
    """
    return {
        "description": description,
        "display_description": display_description,
        "is_categorical": True,
        "encoding": dict(enumerate(labels)),
    }


feature_descriptions = {
    'Age': _numeric_feature('Age', "Age in years"),
    'Workclass_encoded': _categorical_feature(
        'Work Class',
        "Type of employment",
        [
            'Unknown',
            'Federal Government',
            'Local Government',
            'Never Worked',
            'Private Sector',
            'Self-Employed (Incorporated)',
            'Self-Employed (Not Incorporated)',
            'State Government',
            'Without Pay',
        ],
    ),
    'Education-Num': _numeric_feature(
        'Education Level', "Years of formal education"
    ),
    'Marital Status_encoded': _categorical_feature(
        'Marital Status',
        "Current marital status",
        [
            'Divorced',
            'Married (Armed Forces Spouse)',
            'Married (Civilian Spouse)',
            'Married (Spouse Absent)',
            'Never Married',
            'Separated',
            'Widowed',
        ],
    ),
    'Occupation_encoded': _categorical_feature(
        'Occupation',
        "Type of job/profession",
        [
            'Unknown',
            'Administrative/Clerical',
            'Armed Forces',
            'Craft/Repair',
            'Executive/Managerial',
            'Farming/Fishing',
            'Handlers/Cleaners',
            'Machine Operator/Inspector',
            'Other Service',
            'Private Household Service',
            'Professional Specialty',
            'Protective Service',
            'Sales',
            'Tech Support',
            'Transportation/Moving',
        ],
    ),
    'Relationship_encoded': _categorical_feature(
        'Family Relationship',
        "Relationship within household",
        [
            'Husband',
            'Not in Family',
            'Other Relative',
            'Own Child',
            'Unmarried Partner',
            'Wife',
        ],
    ),
    'Race_encoded': _categorical_feature(
        'Race',
        "Racial background",
        [
            'American Indian/Eskimo',
            'Asian/Pacific Islander',
            'Black',
            'Other',
            'White',
        ],
    ),
    'Sex_encoded': _categorical_feature('Gender', "Gender", ['Female', 'Male']),
    'Capital Gain': _numeric_feature('Capital Gains', "Capital gains income"),
    'Capital Loss': _numeric_feature('Capital Losses', "Capital losses"),
    'Hours per week': _numeric_feature(
        'Hours per Week', "Hours worked per week"
    ),
    'Country_encoded': _categorical_feature(
        'Country',
        "Country of origin",
        [
            'Unknown',                      # 0
            'Cambodia',
            'Canada',
            'China',
            'Columbia',
            'Cuba',                         # 5
            'Dominican Republic',
            'Ecuador',
            'El Salvador',
            'England',
            'France',                       # 10
            'Germany',
            'Greece',
            'Guatemala',
            'Haiti',
            'Netherlands',                  # 15
            'Honduras',
            'Hong Kong',
            'Hungary',
            'India',
            'Iran',                         # 20
            'Ireland',
            'Italy',
            'Jamaica',
            'Japan',
            'Laos',                         # 25
            'Mexico',
            'Nicaragua',
            'US Territories (Guam/USVI)',
            'Peru',
            'Philippines',                  # 30
            'Poland',
            'Portugal',
            'Puerto Rico',
            'Scotland',
            'South Korea',                  # 35
            'Taiwan',
            'Thailand',
            'Trinidad & Tobago',
            'United States',
            'Vietnam',                      # 40
            'Yugoslavia',
        ],
    ),
}

# Model inputs: the numeric columns as loaded, plus the integer-encoded
# version of each categorical column.
feature_columns = [
    'Age',
    'Workclass_encoded',
    'Education-Num',
    'Marital Status_encoded',
    'Occupation_encoded',
    'Relationship_encoded',
    'Race_encoded',
    'Sex_encoded',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
    'Country_encoded',
]
X = data[feature_columns]
y = data["Target"]  # boolean income label

# Hold out half of the rows for evaluation and explanation; the fixed seed
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Fit a Random Forest with default hyper-parameters (fit() returns the
# estimator itself, so construction and training can be chained).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Explain with CloudExplain. The settings are gathered in one place and then
# forwarded unchanged as keyword arguments to cloudexplain.azure.explain().
run_date = datetime.datetime.today().strftime('%Y-%m-%d')
explain_settings = dict(
    model=model,
    X=X_test,
    y=y_test,
    model_version="0.1.0",
    model_name="IncomePredictionModel",
    model_description="Predicting whether an individual earns more than $50K annually based on demographic and employment features.",
    explanation_name=f"Income prediction model {run_date}",
    explanation_env="prod",
    data_source="adult dataset",
    resource_group_name="p1-cloudexplain-tf",
    ml_type="binary_classification",
    is_higher_output_better=True,
    feature_descriptions=feature_descriptions,
    baseline_data=X_train,
    api_token="your_api_token_here",  # Get this from /dashboards/analytics/tokens
    function_url="https://your-env-execute-containers.azurewebsites.net/api/upload_via_token",
)

with cloudexplain.azure.explain(**explain_settings) as run:
    # Hard class predictions and the predicted probability of the positive
    # class (column 1 of predict_proba) on the held-out set.
    y_preds = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Report the run identifier followed by the evaluation metrics.
    print(f"Explanation ID: {run.run_uuid}")
    print("Model Performance:")

    # (label, metric function, scores it is evaluated on). ROC AUC takes the
    # probabilities; every other metric takes the hard predictions.
    metric_table = [
        ("Accuracy", accuracy_score, y_preds),
        ("ROC AUC", roc_auc_score, y_proba),
        ("F1 Score", f1_score, y_preds),
        ("Precision", precision_score, y_preds),
        ("Recall", recall_score, y_preds),
    ]
    for label, metric, scores in metric_table:
        print(f"{label}: {metric(y_test, scores):.3f}")

    # Point the reader at the dashboard for the generated explanations.
    print("\nView your detailed explanations in the CloudExplain dashboard!")
Adult Dataset v1.0 Updated: June 2025 View Original Dataset