Data Types & Datasets
The type of your data determines which algorithms and preprocessing steps to use: numeric data is ready for most models, categorical data must be encoded, and time-series data needs specialized handling. Text and image data belong to the AI course. Scikit-learn ships built-in datasets that mirror real-world ML problems and are ideal for learning.
Data Types and Scikit-learn Datasets
import pandas as pd
import numpy as np
from sklearn.datasets import (load_iris, load_breast_cancer, load_diabetes,
                              fetch_california_housing, make_classification)
# DATA TYPES IN ML
data_types = {
    "Numerical (continuous)": "Price, temperature, age, income -- use as-is after scaling",
    "Numerical (discrete)": "Count of rooms, number of purchases -- treat as numeric",
    "Ordinal categorical": "Small/Medium/Large, 1-5 star rating -- encode as 1,2,3...",
    "Nominal categorical": "Color, city, gender -- one-hot encode or target encode",
    "Boolean": "Is_fraud, has_children -- already 0/1",
    "Datetime": "Extract: hour, day of week, month, days since event",
    "Text": "Reviews, tweets -- belongs to NLP (AI course)",
    "Image": "Pixels -- belongs to Computer Vision (AI course)",
}
for dtype, handling in data_types.items():
    print(f" {dtype:30s}: {handling}")
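The ordinal vs. nominal rules above can be sketched with pandas; the tiny `size`/`color` frame below is hypothetical, just for illustration:

```python
import pandas as pd

demo = pd.DataFrame({"size": ["Small", "Large", "Medium"],
                     "color": ["red", "blue", "red"]})

# Ordinal: an explicit mapping preserves the natural order
size_order = {"Small": 1, "Medium": 2, "Large": 3}
demo["size_encoded"] = demo["size"].map(size_order)

# Nominal: one-hot encoding implies no order between categories
demo = pd.get_dummies(demo, columns=["color"])
print(demo.columns.tolist())
```

One-hot encoding adds one indicator column per category, so prefer it only for low-cardinality nominal features; an ordinal mapping keeps a single column.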
# BUILT-IN SCIKIT-LEARN DATASETS -- great for learning
print("\nScikit-learn built-in datasets:")
# Classification datasets
iris = load_iris() # 150 samples, 4 features, 3 classes
cancer = load_breast_cancer() # 569 samples, 30 features, binary
X_cls, y_cls = make_classification(n_samples=1000, n_features=20, n_informative=10, random_state=42)
# Regression dataset
diabetes = load_diabetes() # 442 patients, predict disease progression
housing = fetch_california_housing() # California census, predict median house price
datasets_info = [
    ("Iris", iris.data.shape, iris.target_names.tolist(), "multi-class classification"),
    ("Breast Cancer", cancer.data.shape, cancer.target_names.tolist(), "binary classification"),
    ("Diabetes", diabetes.data.shape, ["continuous"], "regression"),
    ("California Housing", housing.data.shape, ["continuous"], "regression"),
]
for name, shape, target, task in datasets_info:
    print(f" {name:20s}: shape={str(shape):12s} target={str(target):30s} task={task}")
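As a side note (assuming scikit-learn 0.23 or newer), each loader also accepts `as_frame=True` to return pandas objects instead of NumPy arrays, which keeps feature names attached:

```python
from sklearn.datasets import load_iris

# .frame holds the 4 features plus the target as one DataFrame
iris_frame = load_iris(as_frame=True).frame
print(iris_frame.shape)
```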
# CREATING A REALISTIC SYNTHETIC DATASET
def make_loan_dataset(n: int = 2000, seed: int = 42) -> pd.DataFrame:
    rng = np.random.RandomState(seed)
    income = rng.normal(55000, 20000, n).clip(15000, 200000)
    credit_score = rng.normal(680, 80, n).clip(300, 850)
    loan_amount = rng.normal(15000, 8000, n).clip(1000, 50000)
    employment = rng.choice(["full-time", "part-time", "self-employed"], n, p=[0.6, 0.2, 0.2])
    # Logistic link: higher income/credit score lowers default risk, larger loans raise it
    default_prob = 1 / (1 + np.exp(0.00001*income + 0.005*credit_score - 0.00002*loan_amount - 2))
    default = rng.binomial(1, default_prob, n)
    return pd.DataFrame({
        "income": income.round(0), "credit_score": credit_score.round(0),
        "loan_amount": loan_amount.round(0), "employment": employment, "default": default,
    })
df = make_loan_dataset()
print(f"\nLoan dataset: {df.shape}")
print(df.dtypes)
print(f"Default rate: {df['default'].mean():.1%}")
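The datetime row of the table above (extract hour, day of week, month, days since event) can be sketched with pandas; the timestamps and reference date below are made up for illustration:

```python
import pandas as pd

events = pd.DataFrame({"ts": pd.to_datetime(["2024-01-15 09:30", "2024-06-01 22:05"])})
events["hour"] = events["ts"].dt.hour
events["dayofweek"] = events["ts"].dt.dayofweek          # Monday = 0
events["month"] = events["ts"].dt.month
events["days_since"] = (pd.Timestamp("2024-07-01") - events["ts"]).dt.days
print(events[["hour", "dayofweek", "month", "days_since"]])
```

These derived numeric columns can then feed any standard model, whereas a raw timestamp cannot.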
Tip
Practice data types and datasets in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Everything in Python is an object; use type() to check what you are working with.
Practice Task
(1) Write a working example covering data types and datasets from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with data types and datasets is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.
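One hedged sketch of such boundary-condition checks for the loan dataset above (`validate_loan_frame` is an illustrative helper, not part of the lesson):

```python
import pandas as pd

def validate_loan_frame(df: pd.DataFrame) -> None:
    """Raise early on inputs that would silently corrupt a model fit."""
    if df.empty:
        raise ValueError("empty DataFrame: nothing to train on")
    required = {"income", "credit_score", "loan_amount", "employment", "default"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"missing columns: {sorted(missing)}")
    numeric_cols = sorted(required - {"employment"})
    if df[numeric_cols].isnull().any().any():
        raise ValueError("null values in numeric columns")

# An empty frame fails fast instead of producing a degenerate model
try:
    validate_loan_frame(pd.DataFrame())
except ValueError as e:
    print(f"rejected: {e}")
```

Calling such a check right after loading data turns silent failures into immediate, readable errors.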