Skip to content

PostgreSQL — Practical

-- active queries
SELECT pid, now()-xact_start AS dur, state, query
FROM pg_stat_activity WHERE state <> 'idle'
ORDER BY dur DESC;
-- blocking pairs
SELECT a.pid blocked, a.query blocked_q, b.pid blocking, b.query blocking_q
FROM pg_stat_activity a
JOIN pg_locks bl ON bl.pid=a.pid AND NOT bl.granted
JOIN pg_locks bg ON bg.locktype=bl.locktype AND bg.relation=bl.relation AND bg.granted
JOIN pg_stat_activity b ON b.pid=bg.pid;
-- top by total time
SELECT calls, total_exec_time::int total_ms, mean_exec_time::int mean_ms,
substring(query,1,100) q
FROM pg_stat_statements ORDER BY total_exec_time DESC LIMIT 10;
-- table sizes
SELECT relname, pg_size_pretty(pg_total_relation_size(c.oid)) total
FROM pg_class c WHERE relkind='r'
ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;
-- unused indexes
SELECT relname, indexrelname, idx_scan
FROM pg_stat_user_indexes WHERE idx_scan=0
ORDER BY pg_relation_size(indexrelid) DESC;
-- bloat estimate
SELECT relname, n_dead_tup, round(n_dead_tup::numeric/nullif(n_live_tup,0)*100,1) dead_pct
FROM pg_stat_user_tables ORDER BY n_dead_tup DESC LIMIT 20;
EXPLAIN (ANALYZE, BUFFERS) SELECT ...;

Watch: Seq Scan on big tables, large actual rows vs estimated, Rows Removed by Filter, Nested Loop over many rows.

INSERT INTO users (email, name) VALUES ('a@b','A')
ON CONFLICT (email) DO UPDATE
SET name=EXCLUDED.name, updated_at=now()
RETURNING *;
SELECT id, created_at, title FROM posts
WHERE (created_at, id) < ($1, $2)
ORDER BY created_at DESC, id DESC
LIMIT 50;
-- top 3 per user
SELECT * FROM (
SELECT *, row_number() OVER (PARTITION BY user_id ORDER BY total DESC) rn
FROM orders
) t WHERE rn <= 3;
-- running total
SELECT date, amount, sum(amount) OVER (ORDER BY date) running FROM payments;
WITH RECURSIVE tree AS (
SELECT id, parent_id, name, 0 depth FROM categories WHERE parent_id IS NULL
UNION ALL
SELECT c.id, c.parent_id, c.name, t.depth+1
FROM categories c JOIN tree t ON c.parent_id=t.id
) SELECT * FROM tree;
CREATE INDEX CONCURRENTLY idx_users_email ON users (email);
DROP INDEX CONCURRENTLY idx_old;
CREATE INDEX idx_data ON events USING gin (data);
CREATE INDEX idx_data_op ON events USING gin (data jsonb_path_ops); -- @> only
SELECT * FROM events WHERE data @> '{"type":"click"}';
SELECT data->>'user_id' FROM events;
CREATE TABLE events (id bigint, ts timestamptz, payload jsonb)
PARTITION BY RANGE (ts);
CREATE TABLE events_2026_05 PARTITION OF events
FOR VALUES FROM ('2026-05-01') TO ('2026-06-01');
WITH next AS (
SELECT id FROM jobs WHERE status='pending'
ORDER BY created_at FOR UPDATE SKIP LOCKED LIMIT 1
)
UPDATE jobs SET status='running'
WHERE id = (SELECT id FROM next) RETURNING *;
ALTER TABLE big_t ADD COLUMN status text; -- 1
-- 2: backfill in batches
UPDATE big_t SET status='ok' WHERE id BETWEEN 1 AND 1000;
-- 3
ALTER TABLE big_t ALTER COLUMN status SET NOT NULL;

Avoid ADD COLUMN ... NOT NULL DEFAULT 'x' on huge tables (rewrites). PG 11+ optimized for constant defaults.

shared_buffers = 25% of RAM
effective_cache_size = 50-75% of RAM
work_mem = 16-64MB
maintenance_work_mem = 1GB
random_page_cost = 1.1 # SSD
Terminal window
pg_dump -Fc dbname > db.dump
pg_restore -d dbname db.dump
pg_dump -Fd -j 4 -f dump_dir dbname # parallel
pg_basebackup -D /backup -Ft -z -X stream # physical

pg_stat_statements (always), pg_partman, pg_repack, postgis, pgvector, hypopg, pgaudit, timescaledb.

psql, DBeaver, pgcli. Pooler: PgBouncer. Migrations: Flyway, Alembic, knex, Atlas.


-- Multicolumn: tenant first because every query filters on it
CREATE INDEX orders_tenant_status_created
ON orders (tenant_id, status, created_at DESC);
-- Partial index: only "open" orders, much smaller
CREATE INDEX orders_open
ON orders (created_at DESC)
WHERE status = 'open';
-- Expression index: case-insensitive lookup
CREATE INDEX users_email_lower ON users (LOWER(email));
-- Covering index: enables index-only scan returning total
CREATE INDEX orders_tenant_total
ON orders (tenant_id, created_at) INCLUDE (total_cents);
-- BRIN for a 5-billion-row append-only event table
CREATE INDEX events_ts_brin ON events USING BRIN (ts);
-- Reading EXPLAIN
EXPLAIN (ANALYZE, BUFFERS) SELECT id FROM orders
WHERE tenant_id = 42 AND status='open' ORDER BY created_at DESC LIMIT 20;
-- Look for: Index Scan / Index Only Scan, Buffers: shared hit=...,
-- and that Rows Removed by Filter is small.
-- Find the worst aggregate offenders
SELECT query, calls, total_exec_time, mean_exec_time,
rows, shared_blks_read, shared_blks_hit
FROM pg_stat_statements
ORDER BY total_exec_time DESC LIMIT 20;
-- Enable plan logging for slow queries
LOAD 'auto_explain';
SET auto_explain.log_min_duration = '500ms';
SET auto_explain.log_analyze = on;
SET auto_explain.log_buffers = on;
-- Anti-pattern: OFFSET pagination on 50M rows (each page rescans)
SELECT id, ts FROM events ORDER BY ts DESC OFFSET 1000000 LIMIT 20;
-- Fix: keyset pagination (constant time per page)
SELECT id, ts FROM events
WHERE (ts, id) < ($1, $2)
ORDER BY ts DESC, id DESC
LIMIT 20;
-- Anti-pattern: ORDER BY random()
SELECT * FROM products ORDER BY random() LIMIT 1; -- O(N) sort
-- Fix on indexed PK with mostly contiguous IDs:
SELECT * FROM products
WHERE id >= (random() * (SELECT max(id) FROM products))::bigint
ORDER BY id LIMIT 1;
CREATE TABLE products (
id bigserial PRIMARY KEY,
sku text UNIQUE NOT NULL,
price numeric(12,2) NOT NULL,
attrs jsonb NOT NULL DEFAULT '{}'::jsonb
);
-- GIN index supporting @> and key-existence
CREATE INDEX products_attrs_gin ON products USING GIN (attrs);
-- Smaller, faster index for containment-only workloads
CREATE INDEX products_attrs_path_gin ON products USING GIN (attrs jsonb_path_ops);
-- Containment query (uses GIN)
SELECT id, sku FROM products WHERE attrs @> '{"color":"red","size":"L"}';
-- jsonpath query
SELECT id FROM products WHERE attrs @@ '$.tags[*] == "clearance"';
-- Expression index for a hot nested key
CREATE INDEX products_brand ON products ((attrs->>'brand'));
DO $$
DECLARE attempts int := 0;
BEGIN
LOOP
BEGIN
START TRANSACTION ISOLATION LEVEL SERIALIZABLE;
UPDATE accounts SET bal = bal - 100 WHERE id = 'A';
UPDATE accounts SET bal = bal + 100 WHERE id = 'B';
COMMIT;
EXIT;
EXCEPTION WHEN serialization_failure THEN
attempts := attempts + 1;
IF attempts > 3 THEN RAISE; END IF;
ROLLBACK;
END;
END LOOP;
END $$;

Deep — partitioning + logical replication

Section titled “Deep — partitioning + logical replication”
-- Postgres declarative partitioning by month, then index
CREATE TABLE events (id bigserial, ts timestamptz NOT NULL, payload jsonb)
PARTITION BY RANGE (ts);
CREATE TABLE events_2026_05 PARTITION OF events
FOR VALUES FROM ('2026-05-01') TO ('2026-06-01');
CREATE INDEX ON events (ts); -- propagates to all (future) partitions
-- Logical replication
CREATE PUBLICATION orders_pub FOR TABLE orders; -- on primary
CREATE SUBSCRIPTION orders_sub -- on subscriber
CONNECTION 'host=primary dbname=app user=repl'
PUBLICATION orders_pub;
-- Anti-pattern: Jaywalking
ALTER TABLE posts ADD COLUMN tags text; -- "red,green,blue" ❌
-- Fix: junction table
CREATE TABLE tags (
id serial PRIMARY KEY,
name text UNIQUE NOT NULL
);
CREATE TABLE post_tags (
post_id bigint REFERENCES posts(id) ON DELETE CASCADE,
tag_id int REFERENCES tags(id),
PRIMARY KEY (post_id, tag_id)
);
CREATE INDEX ON post_tags (tag_id);
-- Postgres-idiomatic alternative for small unjoined lists
ALTER TABLE posts ADD COLUMN tags text[] DEFAULT '{}';
CREATE INDEX posts_tags_gin ON posts USING GIN (tags);
SELECT * FROM posts WHERE tags @> ARRAY['featured'];
-- Closure table for hierarchies (Karwin's preferred pattern)
CREATE TABLE org (id int PRIMARY KEY, name text);
CREATE TABLE org_paths (
ancestor int NOT NULL REFERENCES org(id),
descendant int NOT NULL REFERENCES org(id),
depth int NOT NULL,
PRIMARY KEY (ancestor, descendant)
);
-- "All descendants of node 5"
SELECT o.* FROM org o JOIN org_paths p ON p.descendant = o.id
WHERE p.ancestor = 5 AND p.depth > 0;
-- Safe online migration
ALTER TABLE users ADD COLUMN email_verified_at timestamptz; -- instant in PG 11+
CREATE INDEX CONCURRENTLY users_email_lower ON users (LOWER(email));

Deep — polyglot persistence (Redis + Timescale)

Section titled “Deep — polyglot persistence (Redis + Timescale)”

Redis use cases pair naturally with Postgres as the system of record:

// Redis: rate limiter (token bucket) with atomic Lua
const luaScript = `
local key = KEYS[1]
local capacity = tonumber(ARGV[1])
local refillPerSec = tonumber(ARGV[2])
local now = tonumber(ARGV[3])
local bucket = redis.call('HMGET', key, 'tokens', 'ts')
local tokens = tonumber(bucket[1]) or capacity
local ts = tonumber(bucket[2]) or now
local delta = math.max(0, now - ts) * refillPerSec
tokens = math.min(capacity, tokens + delta)
if tokens < 1 then return 0 end
redis.call('HMSET', key, 'tokens', tokens - 1, 'ts', now)
redis.call('EXPIRE', key, 60)
return 1
`;
const allowed = await redis.eval(luaScript, 1, `rl:${userId}`, 60, 1, Date.now()/1000);
// Redis sorted set: leaderboard
await redis.zadd('lb:season-1', score, userId);
const top10 = await redis.zrevrange('lb:season-1', 0, 9, 'WITHSCORES');
// Redis HyperLogLog: unique visitors (12 KB, ~0.81% error)
await redis.pfadd('uv:2026-05-17', userIdA, userIdB);
const uniques = await redis.pfcount('uv:2026-05-17');
-- TimescaleDB hypertable: stays inside Postgres
CREATE TABLE metrics (
ts timestamptz NOT NULL,
device_id bigint NOT NULL,
value double precision
);
SELECT create_hypertable('metrics', 'ts', chunk_time_interval => INTERVAL '1 day');
CREATE INDEX ON metrics (device_id, ts DESC);
-- Continuous aggregate (incremental materialized view)
CREATE MATERIALIZED VIEW metrics_1h
WITH (timescaledb.continuous) AS
SELECT time_bucket('1 hour', ts) AS hour, device_id, avg(value)
FROM metrics GROUP BY hour, device_id;
-- Postgres bloom-index extension for multi-column equality
CREATE EXTENSION bloom;
CREATE INDEX events_bloom
ON events USING bloom (tenant_id, source, type, severity)
WITH (length=80, col1=2, col2=2, col3=4, col4=2);
-- Now WHERE source='x' AND severity='y' can probe the bloom index
// Redis HyperLogLog vs naïve SET for unique counting
await redis.sadd('uv:naive:2026-05-17', userId); // O(N) memory
await redis.scard('uv:naive:2026-05-17'); // exact
await redis.pfadd('uv:hll:2026-05-17', userId); // ~12 KB total
await redis.pfcount('uv:hll:2026-05-17'); // ~0.81% error
// Merge a week of HLLs into one count
await redis.pfmerge('uv:hll:week-20', 'uv:hll:2026-05-11', /* ... */ 'uv:hll:2026-05-17');