Neural Network Implementation (PyTorch)
Goal
Implement MLP, CNN, and RNN neural networks in PyTorch with complete training loops, validation, and serialisation.
Conceptual Counterpart
- Deep Learning Architectures — MLP, CNN, RNN inductive biases and design choices
- Deep Learning Theory — backpropagation, activation functions, initialisation
- Document Summarization — downstream NLP application
Purpose
Practical reference for implementing and training feedforward, convolutional, and recurrent neural networks with PyTorch.
Examples
Tabular classification: MLP with BatchNorm and Dropout for structured tabular data.
Image classification: CNN encoder with fully-connected classification head.
Sequence modeling: LSTM applied to time-series or NLP token sequences.
Architecture
Standard training workflow:
Dataset → DataLoader → Model (nn.Module)
   ↑                        ↓
Transforms             Forward pass
                            ↓
                      Loss function
                            ↓
                      Backward pass
                            ↓
                     Optimizer.step()
                            ↓
                 Validation loop (no_grad)
                            ↓
                  Checkpoint best model
MLP with Full Training Loop
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
# --- Model definition ---
class MLP(nn.Module):
    """Feed-forward classifier made of Linear -> BatchNorm1d -> ReLU -> Dropout stages.

    Args:
        input_size: Number of input features per sample.
        hidden_sizes: Width of each hidden stage, in order. An empty list
            yields a single linear layer from ``input_size`` to ``num_classes``.
        num_classes: Number of output logits.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_size: int, hidden_sizes: list[int], num_classes: int, dropout: float = 0.3):
        super().__init__()
        layers: list[nn.Module] = []
        in_size = input_size
        for h in hidden_sizes:
            layers += [nn.Linear(in_size, h), nn.BatchNorm1d(h), nn.ReLU(), nn.Dropout(dropout)]
            # The next stage consumes this stage's output width.
            in_size = h
        # Output projection: raw logits, no softmax applied here.
        layers.append(nn.Linear(in_size, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the stacked layers and return the raw class logits."""
        return self.net(x)
# --- Dataset setup ---
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters for the run.
BATCH_SIZE = 64
LR = 1e-3
N_EPOCHS = 50
PATIENCE = 7

# Synthetic stand-in data: 5000 samples x 20 features, integer labels in [0, 3).
X = torch.randn(5000, 20)  # replace with real features
y = torch.randint(0, 3, (5000,))
dataset = TensorDataset(X, y)

# 80/20 train/validation split; only the training loader shuffles.
n_train = int(0.8 * len(dataset))
train_ds, val_ds = random_split(dataset, [n_train, len(dataset) - n_train])
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

# --- Model / optimizer / scheduler ---
model = MLP(input_size=20, hidden_sizes=[128, 64], num_classes=3, dropout=0.3)
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-2)
# The cosine schedule spans the full epoch budget.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EPOCHS)
# --- Training loop with early stopping ---
# Tracks the best validation loss, checkpoints on improvement, and stops after
# PATIENCE consecutive epochs without improvement.
best_val_loss, no_improve, train_losses, val_losses = float("inf"), 0, [], []
for epoch in range(N_EPOCHS):
    # Train
    model.train()
    epoch_loss, n_train_seen = 0.0, 0
    for X_b, y_b in train_loader:
        X_b, y_b = X_b.to(DEVICE), y_b.to(DEVICE)
        optimizer.zero_grad()
        loss = criterion(model(X_b), y_b)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # criterion returns a per-batch mean; weight it by the batch size so a
        # smaller final batch does not count as much as a full one.
        epoch_loss += loss.item() * X_b.size(0)
        n_train_seen += X_b.size(0)
    train_losses.append(epoch_loss / n_train_seen)
    # Validate
    model.eval()
    val_loss, n_val_seen = 0.0, 0
    with torch.no_grad():
        for X_b, y_b in val_loader:
            X_b, y_b = X_b.to(DEVICE), y_b.to(DEVICE)
            val_loss += criterion(model(X_b), y_b).item() * X_b.size(0)
            n_val_seen += X_b.size(0)
    # Per-sample mean over the whole validation set.
    val_loss /= n_val_seen
    val_losses.append(val_loss)
    scheduler.step()
    # Early stopping + best checkpoint
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve = 0
        torch.save({"epoch": epoch, "model_state": model.state_dict(),
                    "optimizer_state": optimizer.state_dict(), "val_loss": val_loss},
                   "best_model.pt")
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print(f"Early stopping at epoch {epoch}")
            break
# --- Loss curve logging (matplotlib) ---
import matplotlib.pyplot as plt

# One line per split, indexed by epoch.
plt.plot(train_losses, label="train")
plt.plot(val_losses, label="val")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.savefig("loss_curve.png")
# --- Load best model for inference ---
# map_location remaps stored tensors onto the active DEVICE, so a checkpoint
# written on a GPU machine can still be restored on a CPU-only host.
ckpt = torch.load("best_model.pt", map_location=DEVICE, weights_only=True)
model.load_state_dict(ckpt["model_state"])
model.eval()
CNN for Image Classification
class CNN(nn.Module):
    """Three-stage convolutional encoder followed by a fully-connected classifier.

    The encoder ends in ``AdaptiveAvgPool2d((4, 4))``, so the feature map fed
    to the classifier always has 128 * 4 * 4 values per sample.

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of output logits.
    """

    def __init__(self, in_channels: int = 3, num_classes: int = 10):
        super().__init__()
        # Channel widths grow 32 -> 64 -> 128; the two max-pools each halve H and W.
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((4, 4)),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 256), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(256, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` with the convolutional stack and return raw class logits."""
        return self.classifier(self.features(x))
# Usage with torchvision datasets:
from torchvision import datasets, transforms

# ToTensor followed by Normalize with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)
# CIFAR-10 training split, downloaded into ./data.
train_data = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, num_workers=4)
LSTM for Sequence Modeling
class LSTMClassifier(nn.Module):
    """Sequence classifier: a stacked LSTM followed by a linear layer on its final hidden state.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability between LSTM layers; forced to 0.0 when
            ``num_layers`` is 1.
    """

    def __init__(self, input_size: int, hidden_size: int, num_layers: int, num_classes: int, dropout: float = 0.3):
        super().__init__()
        # Only pass a non-zero dropout when there is more than one layer.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits computed from the top layer's final hidden state."""
        # x: (batch, seq_len, input_size)
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])  # use last layer hidden state
# Synthetic time-series example
model_lstm = LSTMClassifier(
    input_size=10,
    hidden_size=64,
    num_layers=2,
    num_classes=3,
)
model_lstm = model_lstm.to(DEVICE)
# Random input batch: batch=32, seq=20, features=10
x_seq = torch.randn(32, 20, 10)
x_seq = x_seq.to(DEVICE)
out = model_lstm(x_seq) # (32, 3)
Model Serialization
# Preferred: save/load state dict only
torch.save(model.state_dict(), "model_weights.pt")
model.load_state_dict(torch.load("model_weights.pt", weights_only=True))
# Full checkpoint (for resuming training)
torch.save({
    "epoch": epoch,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "val_loss": best_val_loss,
}, "checkpoint.pt")
# Resume
# The checkpoint holds only tensors, numbers and plain containers, so the
# restricted weights_only unpickler is enough. It avoids running arbitrary
# pickled code if the file has been tampered with.
ckpt = torch.load("checkpoint.pt", weights_only=True)
model.load_state_dict(ckpt["model_state_dict"])
optimizer.load_state_dict(ckpt["optimizer_state_dict"])
start_epoch = ckpt["epoch"] + 1
Key Configuration Choices
| Concern | Recommendation |
|---|---|
| Optimizer | AdamW (decoupled weight decay via weight_decay) for most tasks; SGD+momentum for large-batch CV |
| Learning rate | 1e-3 for Adam; 0.1 for SGD; always use a scheduler |
| Scheduler | CosineAnnealingLR for fixed-budget; ReduceLROnPlateau for dynamic |
| Gradient clipping | clip_grad_norm_(params, 1.0) — prevents exploding gradients |
| Batch norm | Use after Linear/Conv, before activation; call model.eval() at inference so running statistics are used instead of per-batch statistics |
| Dropout | 0.1–0.3 in hidden layers; 0.4–0.5 before final classifier |
| Early stopping | patience=5–10 epochs on val loss; restore best checkpoint |
References
Links
Foundations
- Backpropagation — gradient flow through computation graph
- Batch Normalization — normalising layer activations
- Dropout — regularisation via random unit masking
- Weight Initialization — Xavier / Kaiming init
- Adaptive Optimizers — Adam, AdamW, RMSProp
Modeling
- Multi-Layer Perceptron — MLP architecture theory
- CNN Architecture — convolutional feature learning
- Recurrent Networks — LSTM / GRU theory
Applications
- Deep Learning Training Workflow — end-to-end PyTorch + Accelerate + MLflow pipeline
- Distributed Training with Accelerate — multi-GPU training wrapper