Adult Dataset Example
Learn how to use CloudExplain with the famous Adult dataset to predict income levels and gain deep insights into what factors drive salary predictions.
What You'll Learn
Data Preprocessing
Handle categorical variables, missing values, and feature encoding for the Adult dataset
Model Training
Build a Random Forest classifier to predict income levels based on demographic features
Explainability
Generate comprehensive explanations and understand which factors most influence income predictions
The Adult dataset, also known as the "Census Income" dataset, is a classic machine learning benchmark.
Dataset Details
- Task: Binary classification (income > $50K or ≤ $50K)
- Features: 14 demographic and employment attributes
- Samples: ~48,000 individuals from the 1994 US Census
- Source: UCI Machine Learning Repository
Key Features
- Age, Education: Basic demographics
- Workclass, Occupation: Employment details
- Marital Status, Relationship: Family situation
- Race, Sex, Country: Additional demographics
- Capital Gain/Loss: Financial information
Why This Example is Perfect for Explainable AI
Income prediction has real-world implications for fairness and bias detection. Understanding which factors influence predictions is crucial for ethical AI.
Combines numerical (age, hours worked) and categorical features (occupation, education), demonstrating how CloudExplain handles diverse feature types.
Results are easy to understand and validate against domain knowledge, making it perfect for learning explainable AI concepts.
Ideal for exploring potential biases related to gender, race, and other sensitive attributes. CloudExplain's bias detection features shine with this dataset.
Complete Example Code
Copy and run this complete example to see CloudExplain in action with the Adult dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import cloudexplain
import datetime
# Fetch the Adult (Census Income) dataset from the copy hosted in the SHAP
# GitHub repository.
github_data_url = "https://github.com/shap/shap/raw/master/data/"

# Column schema, in file order: (column name, pandas dtype).
dtypes = [
    ("Age", "float32"),
    ("Workclass", "category"),
    ("fnlwgt", "float32"),
    ("Education", "category"),
    ("Education-Num", "float32"),
    ("Marital Status", "category"),
    ("Occupation", "category"),
    ("Relationship", "category"),
    ("Race", "category"),
    ("Sex", "category"),
    ("Capital Gain", "float32"),
    ("Capital Loss", "float32"),
    ("Hours per week", "float32"),
    ("Country", "category"),
    ("Target", "category"),
]

# The file has no header row, so column names and dtypes both come from the
# schema above.
# NOTE(review): the target is later compared against " >50K" (leading space),
# which suggests fields keep their leading space; if so, na_values="?" will
# not match " ?" and unknowns stay as ordinary categories - confirm.
raw_data = pd.read_csv(
    f"{github_data_url}adult.data",
    names=[column for column, _ in dtypes],
    na_values="?",
    dtype={column: kind for column, kind in dtypes},
)
# Clean and prepare the data
data = raw_data.drop(["Education"], axis=1)  # redundant with Education-Num
data["Target"] = data["Target"] == " >50K"  # Convert to binary (True = income > $50K)

# Encode categorical variables.
# This runs on the full dataset, *before* sampling: LabelEncoder assigns each
# integer code by the category's sorted position among the values it sees, so
# fitting on a small sample that happens to miss a rare category would shift
# the codes and they would no longer line up with the hard-coded "encoding"
# maps in feature_descriptions.
columns_to_encode = [
    'Workclass', 'Marital Status', 'Occupation',
    'Relationship', 'Race', 'Sex', 'Country'
]
for col in columns_to_encode:
    le = LabelEncoder()
    data[col + '_encoded'] = le.fit_transform(data[col].astype(str))

# Sample a subset for faster processing; the fixed seed makes the run
# reproducible, consistent with the seeded split and model below.
data = data.sample(1000, random_state=42)
# Human-readable metadata for every model feature, used to label explanations.
def _numeric_feature(description, display_description):
    """Return the description entry for a numerical feature."""
    return {
        "description": description,
        "display_description": display_description,
        "is_categorical": False,
    }


def _categorical_feature(description, display_description, labels):
    """Return the description entry for an integer-encoded categorical feature.

    ``labels`` is given in code order: the label at position ``i`` is the
    display name for integer code ``i``.
    """
    return {
        "description": description,
        "display_description": display_description,
        "is_categorical": True,
        "encoding": dict(enumerate(labels)),
    }


feature_descriptions = {
    'Age': _numeric_feature('Age', "Age in years"),
    'Workclass_encoded': _categorical_feature(
        'Work Class',
        "Type of employment",
        [
            'Unknown',                           # 0
            'Federal Government',                # 1
            'Local Government',                  # 2
            'Never Worked',                      # 3
            'Private Sector',                    # 4
            'Self-Employed (Incorporated)',      # 5
            'Self-Employed (Not Incorporated)',  # 6
            'State Government',                  # 7
            'Without Pay',                       # 8
        ],
    ),
    'Education-Num': _numeric_feature('Education Level', "Years of formal education"),
    'Marital Status_encoded': _categorical_feature(
        'Marital Status',
        "Current marital status",
        [
            'Divorced',                       # 0
            'Married (Armed Forces Spouse)',  # 1
            'Married (Civilian Spouse)',      # 2
            'Married (Spouse Absent)',        # 3
            'Never Married',                  # 4
            'Separated',                      # 5
            'Widowed',                        # 6
        ],
    ),
    'Occupation_encoded': _categorical_feature(
        'Occupation',
        "Type of job/profession",
        [
            'Unknown',                     # 0
            'Administrative/Clerical',     # 1
            'Armed Forces',                # 2
            'Craft/Repair',                # 3
            'Executive/Managerial',        # 4
            'Farming/Fishing',             # 5
            'Handlers/Cleaners',           # 6
            'Machine Operator/Inspector',  # 7
            'Other Service',               # 8
            'Private Household Service',   # 9
            'Professional Specialty',      # 10
            'Protective Service',          # 11
            'Sales',                       # 12
            'Tech Support',                # 13
            'Transportation/Moving',       # 14
        ],
    ),
    'Relationship_encoded': _categorical_feature(
        'Family Relationship',
        "Relationship within household",
        [
            'Husband',            # 0
            'Not in Family',      # 1
            'Other Relative',     # 2
            'Own Child',          # 3
            'Unmarried Partner',  # 4
            'Wife',               # 5
        ],
    ),
    'Race_encoded': _categorical_feature(
        'Race',
        "Racial background",
        [
            'American Indian/Eskimo',  # 0
            'Asian/Pacific Islander',  # 1
            'Black',                   # 2
            'Other',                   # 3
            'White',                   # 4
        ],
    ),
    'Sex_encoded': _categorical_feature('Gender', "Gender", ['Female', 'Male']),
    'Capital Gain': _numeric_feature('Capital Gains', "Capital gains income"),
    'Capital Loss': _numeric_feature('Capital Losses', "Capital losses"),
    'Hours per week': _numeric_feature('Hours per Week', "Hours worked per week"),
    'Country_encoded': _categorical_feature(
        'Country',
        "Country of origin",
        [
            'Unknown', 'Cambodia', 'Canada', 'China',                 # 0-3
            'Columbia', 'Cuba', 'Dominican Republic', 'Ecuador',      # 4-7
            'El Salvador', 'England', 'France', 'Germany',            # 8-11
            'Greece', 'Guatemala', 'Haiti', 'Netherlands',            # 12-15
            'Honduras', 'Hong Kong', 'Hungary', 'India',              # 16-19
            'Iran', 'Ireland', 'Italy', 'Jamaica',                    # 20-23
            'Japan', 'Laos', 'Mexico', 'Nicaragua',                   # 24-27
            'US Territories (Guam/USVI)', 'Peru', 'Philippines',      # 28-30
            'Poland', 'Portugal', 'Puerto Rico', 'Scotland',          # 31-34
            'South Korea', 'Taiwan', 'Thailand',                      # 35-37
            'Trinidad & Tobago', 'United States', 'Vietnam',          # 38-40
            'Yugoslavia',                                             # 41
        ],
    ),
}
# Model inputs: the numeric columns plus the label-encoded categorical columns
feature_columns = [
    'Age',
    'Workclass_encoded',
    'Education-Num',
    'Marital Status_encoded',
    'Occupation_encoded',
    'Relationship_encoded',
    'Race_encoded',
    'Sex_encoded',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
    'Country_encoded',
]
X = data[feature_columns]
y = data["Target"]

# Hold out half of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Fit a Random Forest (only the seed is set; other hyperparameters are defaults)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Explain with CloudExplain - This is where the magic happens!
# The trained model and the held-out test split are handed to CloudExplain,
# with the training split supplied as baseline data.
run_date = datetime.datetime.today().strftime('%Y-%m-%d')
with cloudexplain.azure.explain(
    model=model,
    X=X_test,
    y=y_test,
    model_version="0.1.0",
    model_name="IncomePredictionModel",
    model_description="Predicting whether an individual earns more than $50K annually based on demographic and employment features.",
    explanation_name=f"Income prediction model {run_date}",
    explanation_env="prod",
    data_source="adult dataset",
    resource_group_name="p1-cloudexplain-tf",
    ml_type="binary_classification",
    is_higher_output_better=True,
    feature_descriptions=feature_descriptions,
    baseline_data=X_train,
    api_token="your_api_token_here",  # Get this from /dashboards/analytics/tokens
    function_url="https://your-env-execute-containers.azurewebsites.net/api/upload_via_token",
) as run:
    # Hard class predictions, plus the probability of the positive class
    # (needed for ROC AUC)
    y_preds = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Report the run identifier and the evaluation metrics on the test split
    print(f"Explanation ID: {run.run_uuid}")
    print("Model Performance:")
    report = (
        ("Accuracy", accuracy_score, y_preds),
        ("ROC AUC", roc_auc_score, y_proba),
        ("F1 Score", f1_score, y_preds),
        ("Precision", precision_score, y_preds),
        ("Recall", recall_score, y_preds),
    )
    for label, metric, predicted in report:
        print(f"{label}: {metric(y_test, predicted):.3f}")

# Your model is now fully explained!
# NOTE(review): the original indentation was lost; this closing message is
# assumed to belong after the `with` block - confirm.
print("\nView your detailed explanations in the CloudExplain dashboard!")
Before running this example, make sure you have installed CloudExplain (`pip install cloudexplain[azure]`) and have obtained your API token from the tokens page.