Data Design
Overview
Principles for designing data structures, schemas, and data flows that are efficient, maintainable, and scalable.
Data Modeling
Entity-Relationship Diagrams
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ User │ │ Order │ │ Product │
├─────────────┤ ├─────────────┤ ├─────────────┤
│ id (PK) │──┐ │ id (PK) │ ┌──│ id (PK) │
│ email │ │ │ user_id(FK) │←───┘ │ name │
│ name │ └───→│ status │ │ price │
│ created_at │ │ total │ │ stock │
└─────────────┘ │ created_at │ └─────────────┘
└─────────────┘ │
│ │
┌──────┴──────┐ │
↓ ↓ │
┌─────────────┐ │
│ OrderItem │ │
├─────────────┤ │
│ id (PK) │ │
│ order_id(FK)│ │
│ product_id │─────────────────────┘
│ quantity │
│ price │
└─────────────┘
Relationship Types
| Type |
Description |
Example |
| 1:1 |
One to one |
User ↔ Profile |
| 1:N |
One to many |
User → Orders |
| M:N |
Many to many |
Students ↔ Courses |
-- 1:1 (profile extends user)
CREATE TABLE users (
id SERIAL PRIMARY KEY,
email VARCHAR(255) UNIQUE
);
CREATE TABLE profiles (
user_id INTEGER PRIMARY KEY REFERENCES users(id),
bio TEXT,
avatar_url VARCHAR(255)
);
-- 1:N (user has many orders)
CREATE TABLE orders (
id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES users(id),
total DECIMAL(10,2)
);
-- M:N (students ↔ courses via junction table)
CREATE TABLE enrollments (
student_id INTEGER REFERENCES students(id),
course_id INTEGER REFERENCES courses(id),
enrolled_at TIMESTAMP DEFAULT NOW(),
PRIMARY KEY (student_id, course_id)
);
Normalization
Normal Forms
| Form |
Rule |
Example Violation |
| 1NF |
Atomic values, no repeating groups |
tags: "a,b,c" |
| 2NF |
1NF + no partial dependencies |
Non-key depends on part of composite key |
| 3NF |
2NF + no transitive dependencies |
zip → city in orders table |
| BCNF |
Every determinant is a candidate key |
Rare edge cases |
-- ❌ Violates 1NF (non-atomic)
CREATE TABLE products (
id SERIAL PRIMARY KEY,
name VARCHAR(255),
tags VARCHAR(255) -- "electronics,sale,featured"
);
-- ✅ 1NF compliant
CREATE TABLE products (
id SERIAL PRIMARY KEY,
name VARCHAR(255)
);
CREATE TABLE product_tags (
product_id INTEGER REFERENCES products(id),
tag VARCHAR(50),
PRIMARY KEY (product_id, tag)
);
-- ❌ Violates 3NF (transitive dependency)
CREATE TABLE orders (
id SERIAL PRIMARY KEY,
customer_zip VARCHAR(10),
customer_city VARCHAR(100) -- Depends on zip, not order
);
-- ✅ 3NF compliant
CREATE TABLE customers (
id SERIAL PRIMARY KEY,
zip VARCHAR(10),
city VARCHAR(100)
);
CREATE TABLE orders (
id SERIAL PRIMARY KEY,
customer_id INTEGER REFERENCES customers(id)
);
Denormalization
When to Denormalize
Normalize for:
✅ Write-heavy workloads
✅ Data integrity requirements
✅ Storage efficiency
✅ Flexibility in queries
Denormalize for:
✅ Read-heavy workloads
✅ Complex joins hurting performance
✅ Reporting/analytics
✅ Known access patterns
Denormalization Patterns
-- Computed columns
CREATE TABLE orders (
id SERIAL PRIMARY KEY,
items JSONB,
item_count INTEGER GENERATED ALWAYS AS (jsonb_array_length(items)) STORED,
total DECIMAL(10,2)
);
-- Duplicated data for read performance
CREATE TABLE posts (
id SERIAL PRIMARY KEY,
author_id INTEGER REFERENCES users(id),
author_name VARCHAR(100), -- Duplicated from users
author_avatar VARCHAR(255), -- Duplicated from users
content TEXT
);
-- Materialized view for complex queries
CREATE MATERIALIZED VIEW monthly_sales AS
SELECT
DATE_TRUNC('month', created_at) as month,
product_id,
SUM(quantity) as units_sold,
SUM(total) as revenue
FROM order_items
GROUP BY 1, 2;
-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY monthly_sales;
Schema Design Patterns
Soft Deletes
CREATE TABLE users (
id SERIAL PRIMARY KEY,
email VARCHAR(255),
deleted_at TIMESTAMP NULL,
-- Partial unique index
CONSTRAINT unique_active_email UNIQUE (email) WHERE deleted_at IS NULL
);
-- Query active users only
SELECT * FROM users WHERE deleted_at IS NULL;
Audit Trail
CREATE TABLE audit_log (
id SERIAL PRIMARY KEY,
table_name VARCHAR(100),
record_id INTEGER,
action VARCHAR(10), -- INSERT, UPDATE, DELETE
old_data JSONB,
new_data JSONB,
changed_by INTEGER REFERENCES users(id),
changed_at TIMESTAMP DEFAULT NOW()
);
-- Trigger for automatic auditing
CREATE OR REPLACE FUNCTION audit_trigger()
RETURNS TRIGGER AS $$
BEGIN
INSERT INTO audit_log (table_name, record_id, action, old_data, new_data, changed_by)
VALUES (
TG_TABLE_NAME,
COALESCE(NEW.id, OLD.id),
TG_OP,
CASE WHEN TG_OP != 'INSERT' THEN to_jsonb(OLD) END,
CASE WHEN TG_OP != 'DELETE' THEN to_jsonb(NEW) END,
current_setting('app.user_id', true)::INTEGER
);
RETURN COALESCE(NEW, OLD);
END;
$$ LANGUAGE plpgsql;
Multi-Tenancy
-- Row-level security
CREATE TABLE organizations (
id SERIAL PRIMARY KEY,
name VARCHAR(255)
);
CREATE TABLE projects (
id SERIAL PRIMARY KEY,
org_id INTEGER REFERENCES organizations(id),
name VARCHAR(255)
);
-- Enable RLS
ALTER TABLE projects ENABLE ROW LEVEL SECURITY;
CREATE POLICY org_isolation ON projects
USING (org_id = current_setting('app.org_id')::INTEGER);
-- Set org context per request
SET app.org_id = 123;
SELECT * FROM projects; -- Only sees org 123's projects
Versioning / History
-- Version table pattern
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
current_version_id INTEGER
);
CREATE TABLE document_versions (
id SERIAL PRIMARY KEY,
document_id INTEGER REFERENCES documents(id),
version INTEGER,
content TEXT,
created_at TIMESTAMP DEFAULT NOW(),
created_by INTEGER REFERENCES users(id),
UNIQUE (document_id, version)
);
-- Temporal tables (PostgreSQL)
CREATE TABLE products (
id SERIAL PRIMARY KEY,
name VARCHAR(255),
price DECIMAL(10,2),
valid_from TIMESTAMP DEFAULT NOW(),
valid_to TIMESTAMP DEFAULT 'infinity'
);
-- Query historical state
SELECT * FROM products
WHERE valid_from <= '2024-01-01' AND valid_to > '2024-01-01';
NoSQL Schema Design
Document Store (MongoDB)
// Embedded vs Referenced
// ✅ Embed when: data is accessed together, 1:few relationship
{
_id: ObjectId("..."),
title: "Blog Post",
author: {
name: "John",
email: "john@example.com"
},
comments: [
{ user: "Jane", text: "Great post!", date: ISODate("...") }
]
}
// ✅ Reference when: data is accessed independently, 1:many or M:N
{
_id: ObjectId("..."),
title: "Blog Post",
authorId: ObjectId("..."), // Reference to users collection
commentIds: [ObjectId("..."), ObjectId("...")]
}
// ❌ Anti-pattern: Unbounded arrays
{
_id: ObjectId("..."),
logs: [...] // Can grow to millions, hits 16MB limit
}
// ✅ Better: Bucket pattern
{
_id: ObjectId("..."),
sensorId: "sensor-123",
date: ISODate("2024-01-15"),
readings: [...] // Max ~1000 per document
}
Key-Value Store (Redis)
# Naming conventions
user:123 # User object
user:123:sessions # User's sessions (set)
user:123:orders # User's orders (list)
order:456 # Order object
orders:pending # Queue of pending orders (list)
products:category:electronics # Products in category (set)
# Expiration patterns
session:{token} # Expires after 30 min
rate_limit:ip:1.2.3.4 # Expires after 1 min
cache:api:/users/123 # Expires after 5 min
Data Pipeline Design
ETL vs ELT
ETL (Extract, Transform, Load):
Source → Transform (external) → Data Warehouse
Use: Traditional, when transformation is complex
ELT (Extract, Load, Transform):
Source → Data Lake/Warehouse → Transform (in-place)
Use: Modern, leverages warehouse compute power
Event Sourcing
// Events are the source of truth
interface Event {
id: string;
aggregateId: string;
type: string;
payload: unknown;
timestamp: Date;
version: number;
}
// Event store
class EventStore {
async append(aggregateId: string, events: Event[]) {
await db.events.insertMany(events);
}
async getEvents(aggregateId: string): Promise<Event[]> {
return db.events
.find({ aggregateId })
.sort({ version: 1 })
.toArray();
}
}
// Rebuild state from events
function rebuildAccount(events: Event[]): Account {
return events.reduce((account, event) => {
switch (event.type) {
case 'AccountOpened':
return { balance: 0, ...event.payload };
case 'MoneyDeposited':
return { ...account, balance: account.balance + event.payload.amount };
case 'MoneyWithdrawn':
return { ...account, balance: account.balance - event.payload.amount };
default:
return account;
}
}, {} as Account);
}
Data Governance
Data Quality Dimensions
| Dimension |
Description |
Example Check |
| Accuracy |
Correct values |
Email format validation |
| Completeness |
No missing data |
Required fields present |
| Consistency |
Same across systems |
User ID matches in all tables |
| Timeliness |
Up to date |
Last updated within 24h |
| Uniqueness |
No duplicates |
Unique email per user |
Schema Evolution
-- Safe migrations
-- ✅ Adding nullable column
ALTER TABLE users ADD COLUMN phone VARCHAR(20) NULL;
-- ✅ Adding column with default
ALTER TABLE users ADD COLUMN status VARCHAR(20) DEFAULT 'active';
-- ⚠️ Making column non-null (multi-step)
-- Step 1: Add with default
ALTER TABLE users ADD COLUMN verified BOOLEAN DEFAULT false;
-- Step 2: Backfill data
UPDATE users SET verified = true WHERE email_verified_at IS NOT NULL;
-- Step 3: Add constraint
ALTER TABLE users ALTER COLUMN verified SET NOT NULL;
-- ❌ Dangerous: Renaming column
-- Instead: Add new, migrate data, remove old (over multiple deploys)
Related Skills
- [[database]] - Database implementation
- [[architecture-patterns]] - Data architecture patterns
- [[api-design]] - Data in APIs