Learn unsupervised learning techniques including K-Means clustering, hierarchical clustering, PCA, and dimensionality reduction for pattern discovery.
Learn K-Means clustering, a popular algorithm for partitioning data into clusters.
Content by: Nirav Khanpara
AI/ML Engineer
K-Means is one of the most widely used unsupervised learning algorithms for clustering. It partitions the data into K clusters by repeatedly assigning each point to the cluster with the nearest mean (centroid) and then recomputing the centroids, until the assignments stop changing.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Generate sample data
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Create and fit the model
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
# Get cluster labels and centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_
# Visualize the clusters
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.8, marker='x')
plt.title('K-Means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
# Evaluate clustering
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(X, labels)
print(f"Silhouette Score: {silhouette_avg:.4f}")
Explore hierarchical clustering for building a tree of clusters.
Hierarchical clustering builds a tree of clusters, either bottom-up by merging smaller clusters into larger ones (agglomerative) or top-down by splitting larger clusters into smaller ones (divisive). The tree can be visualized as a dendrogram.
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Generate data
X, y_true = make_blobs(n_samples=50, centers=3, cluster_std=0.60, random_state=0)
# Agglomerative clustering
clustering = AgglomerativeClustering(n_clusters=3, linkage='ward')  # same linkage as the dendrogram below
y_pred = clustering.fit_predict(X)
# Create linkage matrix for dendrogram
linkage_matrix = linkage(X, method='ward')
# Plot dendrogram
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()
# Visualize clusters
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap='viridis')
plt.title('Hierarchical Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
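The dendrogram is also useful for choosing the number of clusters: cutting the tree at a given height yields a flat clustering. As a small sketch (the threshold of 5 below is an arbitrary choice for this toy data, not a recommended value), SciPy's fcluster can cut the linkage matrix directly:
from scipy.cluster.hierarchy import fcluster
# Cut the dendrogram at a distance threshold to get flat cluster labels
flat_labels = fcluster(linkage_matrix, t=5, criterion='distance')
print(f"Clusters found at threshold 5: {len(set(flat_labels))}")
# Alternatively, request an exact number of clusters
flat_labels_3 = fcluster(linkage_matrix, t=3, criterion='maxclust')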
Understand PCA for reducing dimensionality while preserving information.
Principal Component Analysis (PCA) is a dimensionality reduction technique that projects high-dimensional data onto a smaller set of orthogonal axes (the principal components), chosen so that as much of the data's variance as possible is preserved.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
np.random.seed(42)
X = np.random.randn(100, 10)
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {explained_variance_ratio}")
print(f"Total explained variance: {sum(explained_variance_ratio):.4f}")
# Visualize the results
plt.figure(figsize=(12, 4))
# Original data (first 2 dimensions)
plt.subplot(1, 2, 1)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1])
plt.title('Original Data (First 2 Dimensions)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# PCA transformed data
plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.title('PCA Transformed Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.tight_layout()
plt.show()
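In practice the number of components is usually chosen from the cumulative explained variance rather than fixed in advance. A minimal sketch follows; the 95% threshold is a common rule of thumb, not a requirement.
import numpy as np
from sklearn.decomposition import PCA
# Fit PCA with all components to inspect the variance spectrum
pca_full = PCA().fit(X_scaled)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
# Smallest number of components explaining at least 95% of the variance
n_components_95 = int(np.argmax(cumulative >= 0.95)) + 1
print(f"Components needed for 95% variance: {n_components_95}")
# Equivalently, scikit-learn accepts a variance fraction directly
pca_95 = PCA(n_components=0.95).fit(X_scaled)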
Learn various dimensionality reduction techniques for data analysis.
Dimensionality reduction reduces the number of features while preserving the structure that matters for analysis and visualization. Common techniques include PCA (a linear projection) and the non-linear manifold methods t-SNE and UMAP (the latter provided by the umap-learn package).
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import umap  # provided by the umap-learn package
# Generate labelled data so each embedding can be coloured by cluster
X, y = make_blobs(n_samples=300, centers=4, n_features=10, cluster_std=1.5, random_state=42)
# PCA
X_pca = PCA(n_components=2).fit_transform(X)
# t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
# UMAP
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X)
# Compare visualizations
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# PCA
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
axes[0].set_title('PCA')
# t-SNE
axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis')
axes[1].set_title('t-SNE')
# UMAP
axes[2].scatter(X_umap[:, 0], X_umap[:, 1], c=y, cmap='viridis')
axes[2].set_title('UMAP')
plt.tight_layout()
plt.show()
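Visual comparison is subjective, so it can help to attach a number. One option among several (the choice of n_neighbors=5 below is arbitrary) is scikit-learn's trustworthiness score, which measures how well local neighbourhoods from the original space are preserved in each embedding:
from sklearn.manifold import trustworthiness
for name, embedding in [('PCA', X_pca), ('t-SNE', X_tsne), ('UMAP', X_umap)]:
    score = trustworthiness(X, embedding, n_neighbors=5)
    print(f"{name} trustworthiness: {score:.3f}")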