RAG / LLM — Practical
RAG / LLM — Practical patterns
Section titled “RAG / LLM — Practical patterns”
Minimal RAG (TS, OpenAI + pgvector)
Section titled “Minimal RAG (TS, OpenAI + pgvector)”import OpenAI from 'openai';import { Pool } from 'pg';
// Shared OpenAI client used by the embedding and chat-completion snippets below.
// NOTE(review): constructed with no options — presumably the SDK reads its API key from the environment; confirm.
const ai = new OpenAI();
// Postgres connection pool; the connection string is taken from the DATABASE_URL environment variable.
const db = new Pool({ connectionString: process.env.DATABASE_URL });
// schema:
// CREATE EXTENSION vector;
// CREATE TABLE docs (id BIGSERIAL PRIMARY KEY, source TEXT, chunk TEXT,
//   embedding VECTOR(1536), metadata JSONB);
// CREATE INDEX ON docs USING ivfflat (embedding vector_cosine_ops);
/**
 * Requests an embedding for `text` from the `text-embedding-3-small` model.
 *
 * @param text - The string to embed.
 * @returns The `embedding` of the first entry in the response's `data` array.
 */
async function embed(text: string) {
  const { data } = await ai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text,
  });
  const [first] = data;
  return first.embedding;
}
/**
 * Splits `text` into chunks (800 characters, 100 overlap), embeds each chunk,
 * and inserts one `docs` row per chunk.
 *
 * Chunks are embedded and inserted strictly one after another, in order.
 *
 * @param source - Value stored in the `source` column of every inserted row.
 * @param text - The document text to chunk and store.
 * @param metadata - Value stored in the `metadata` column of every inserted row.
 */
async function ingest(source: string, text: string, metadata = {}) {
  const insertSql = `INSERT INTO docs(source, chunk, embedding, metadata) VALUES ($1, $2, $3, $4)`;
  const pieces = chunkText(text, 800, 100);

  for (const piece of pieces) {
    const vector = await embed(piece);
    // The embedding is sent as the JSON text of the number array.
    const params = [source, piece, JSON.stringify(vector), metadata];
    await db.query(insertSql, params);
  }
}
/**
 * Embeds `query` and returns the `k` rows of `docs` closest to it.
 *
 * Rows are ordered by `embedding <=> query vector` ascending and carry
 * `chunk`, `source`, `metadata`, and `sim` (1 minus that distance).
 *
 * @param query - The search text.
 * @param k - Maximum number of rows to return.
 * @returns The raw result rows from the database driver.
 */
async function retrieve(query: string, k = 5) {
  const nearestSql = `SELECT chunk, source, metadata, 1 - (embedding <=> $1::vector) AS sim FROM docs ORDER BY embedding <=> $1::vector LIMIT $2`;
  const queryVector = JSON.stringify(await embed(query));
  const { rows } = await db.query(nearestSql, [queryVector, k]);
  return rows;
}
/**
 * Answers `question` using only retrieved context.
 *
 * Retrieves the top 5 chunks, formats each as `[source] chunk`, and asks
 * `gpt-4.1-mini` (temperature 0) to answer from that context alone.
 *
 * @param question - The user's question.
 * @returns The `content` of the first choice's message.
 */
async function answer(question: string) {
  const hits = await retrieve(question, 5);
  const contextBlock = hits.map(hit => `[${hit.source}] ${hit.chunk}`).join('\n\n');

  const systemPrompt = `Answer ONLY from the provided context. If unknown, say "I don't know". Cite sources by [source].`;
  const userPrompt = `CONTEXT:\n${contextBlock}\n\nQUESTION: ${question}`;

  const completion = await ai.chat.completions.create({
    model: 'gpt-4.1-mini',
    temperature: 0,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
  });
  const [firstChoice] = completion.choices;
  return firstChoice.message.content;
}
// Recursive chunking
// Section titled “Recursive chunking”

/**
 * Fallback split rules for a paragraph that is longer than the chunk size,
 * tried in order: sentence boundaries first, then individual words.
 * Pieces produced by a rule are re-joined with that rule's `joiner`.
 */
const CHUNK_FALLBACK_SPLITS: readonly { pattern: RegExp; joiner: string }[] = [
  { pattern: /(?<=[.!?])\s+/, joiner: ' ' },
  { pattern: /\s+/, joiner: ' ' },
];

/** Greedily joins consecutive pieces with `joiner` while the joined length stays within `limit`. */
function packPieces(pieces: readonly string[], limit: number, joiner: string): string[] {
  const packed: string[] = [];
  let current: string | undefined;
  for (const piece of pieces) {
    if (current === undefined) {
      current = piece;
    } else if (current.length + joiner.length + piece.length <= limit) {
      // The separator counts against the budget, so a packed chunk never exceeds `limit`.
      current += joiner + piece;
    } else {
      packed.push(current);
      current = piece;
    }
  }
  if (current !== undefined) packed.push(current);
  return packed;
}

/** Breaks `text` into pieces of at most `limit` characters: sentences, then words, then a hard cut. */
function splitOversized(text: string, limit: number, level = 0): string[] {
  if (text.length <= limit) return [text];

  const rule = CHUNK_FALLBACK_SPLITS[level];
  if (rule === undefined) {
    // No softer boundary left: cut every `limit` characters.
    const slices: string[] = [];
    let start = 0;
    while (start < text.length) {
      let end = Math.min(start + limit, text.length);
      const lastUnit = text.charCodeAt(end - 1);
      // Avoid cutting between the two halves of a surrogate pair when there is room to back up.
      if (end < text.length && end - start > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      slices.push(text.slice(start, end));
      start = end;
    }
    return slices;
  }

  const parts = text.split(rule.pattern).filter(part => part.length > 0);
  const bounded = parts.flatMap(part => splitOversized(part, limit, level + 1));
  return packPieces(bounded, limit, rule.joiner);
}

/**
 * Splits `text` into chunks of at most `size` characters, then prefixes every
 * chunk after the first with the last `overlap` characters of the previous
 * chunk (followed by a newline).
 *
 * Paragraphs (separated by blank lines) are packed together, joined by
 * `'\n\n'`. A paragraph longer than `size` is split on sentence boundaries,
 * then on whitespace, then cut at `size` characters. Inside such a split
 * paragraph, the original whitespace between pieces is normalised to one
 * space. Whitespace-only paragraphs are dropped.
 *
 * `size` bounds the chunk before the overlap prefix is added, so a returned
 * chunk can be up to `size + overlap + 1` characters long.
 *
 * @param text - The text to split.
 * @param size - Maximum chunk length before overlap; fractions are floored.
 * @param overlap - Number of trailing characters of the previous chunk to
 *   prepend. Zero, negative, or NaN disables the overlap.
 * @returns The chunks in document order; empty for empty or blank input.
 * @throws RangeError if `size` is not a number >= 1.
 */
function chunkText(text: string, size = 800, overlap = 100): string[] {
  if (!(size >= 1)) {
    throw new RangeError(`chunkText: size must be >= 1, got ${size}`);
  }
  const limit = Math.floor(size);
  // `slice(-0)` returns the whole string, so a zero overlap must skip the prefix step entirely.
  const tailLength = overlap > 0 ? Math.floor(overlap) : 0;

  const paragraphs = text.split(/\n\n+/).filter(p => p.trim().length > 0);
  const pieces = paragraphs.flatMap(p => splitOversized(p, limit));
  const chunks = packPieces(pieces, limit, '\n\n');

  if (tailLength === 0) return chunks;

  return chunks.map((chunk, i) => {
    const previous = i > 0 ? chunks[i - 1] : undefined;
    if (previous === undefined) return chunk;
    return previous.slice(-tailLength) + '\n' + chunk;
  });
}
// Hybrid retrieval (vector + BM25 via RRF)
Section titled “Hybrid retrieval (vector + BM25 via RRF)”
-- assume tsvector column maintained:
-- ALTER TABLE docs ADD COLUMN tsv tsvector
--   GENERATED ALWAYS AS (to_tsvector('english', chunk)) STORED;
-- CREATE INDEX ON docs USING gin(tsv);
-- Hybrid retrieval: fuse a vector ranking and a keyword ranking with
-- Reciprocal Rank Fusion (score = sum of 1 / (60 + rank) over both lists).
--   $1 = query embedding (cast to vector), $2 = raw user query text.
WITH vec AS (
  -- Top 50 rows by vector distance; rk is the 1-based rank in that ordering.
  SELECT id, source, chunk,
         ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) AS rk
  FROM docs
  ORDER BY embedding <=> $1::vector
  LIMIT 50
), kw AS (
  -- Top 50 rows by full-text rank. The 'english' config matches the one the
  -- tsv column is generated with, so query and document terms are normalised
  -- the same way regardless of default_text_search_config.
  SELECT id, source, chunk,
         ROW_NUMBER() OVER (
           ORDER BY ts_rank(tsv, websearch_to_tsquery('english', $2)) DESC
         ) AS rk
  FROM docs
  WHERE tsv @@ websearch_to_tsquery('english', $2)
  -- Without an explicit ORDER BY, LIMIT may keep an arbitrary 50 matches
  -- rather than the 50 best-ranked ones.
  ORDER BY rk
  LIMIT 50
), fused AS (
  -- A row present in both lists gets both contributions added together.
  SELECT id, source, chunk, SUM(1.0 / (60 + rk)) AS score
  FROM (SELECT * FROM vec UNION ALL SELECT * FROM kw) u
  GROUP BY id, source, chunk
)
SELECT * FROM fused ORDER BY score DESC LIMIT 10;
-- Reranker (Cohere Rerank example)
Section titled “Reranker (Cohere Rerank example)”import { CohereClient } from 'cohere-ai';const co = new CohereClient({ token: process.env.COHERE_KEY });
/**
 * Reorders candidate documents by relevance to `query` using the
 * `rerank-multilingual-v3.0` model and returns the best `top` of them.
 *
 * @param query - The search query the documents are scored against.
 * @param docs - Candidates; only each entry's `text` is sent to the reranker.
 * @param top - Maximum number of documents to return (passed as `topN`).
 * @returns The selected entries of `docs`, in the order the reranker returned
 *   them; an empty array when `docs` is empty.
 */
async function rerank(query: string, docs: { id: string, text: string }[], top = 5) {
  // Nothing to rank: skip the API call instead of sending an empty document list.
  if (docs.length === 0) return [];

  const response = await co.rerank({
    model: 'rerank-multilingual-v3.0',
    query,
    documents: docs.map(d => d.text),
    topN: top,
  });
  // Each result refers back to its input document by position.
  return response.results.map(result => docs[result.index]);
}
// Streaming response with citations
Section titled “Streaming response with citations”
// Request a chat completion with `stream: true` so the reply arrives incrementally.
// NOTE: `messages: [...]` is a placeholder in this snippet; supply the real message list.
const stream = await ai.chat.completions.create({ model: 'gpt-4.1-mini', stream: true, temperature: 0, messages: [...],});
// Write each streamed part's delta content to stdout as it arrives;
// a part with no choice or no content contributes an empty string.
for await (const part of stream) { process.stdout.write(part.choices[0]?.delta?.content ?? '');}
Multi-tenant filter
Section titled “Multi-tenant filter”
SELECT * FROM docs
WHERE metadata->>'tenant' = $1
ORDER BY embedding <=> $2::vector
LIMIT 10;
Always filter by tenant before the similarity search for hard isolation. In Pinecone/Qdrant: namespace = tenant.
RAGAS evaluation (Python)
Section titled “RAGAS evaluation (Python)”
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset

ds = Dataset.from_list([
    { "question": "...", "answer": "...", "contexts": [...], "ground_truth": "..." },
    ...
])
result = evaluate(ds, metrics=[faithfulness, answer_relevancy, context_precision, context_recall])
print(result)

Run on a golden set per PR; fail if any score regresses.
LangChain quick-start (for rapid prototyping)
Section titled “LangChain quick-start (for rapid prototyping)”
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import PGVector
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
docs = splitter.split_documents(loaded)

emb = OpenAIEmbeddings(model="text-embedding-3-small")
store = PGVector.from_documents(docs, emb, connection_string=DSN)

retriever = store.as_retriever(search_kwargs={"k": 5})
chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(model="gpt-4.1-mini"), retriever=retriever)

print(chain.invoke({"query": "..."}))

For production, prefer thinner custom code over LangChain’s abstractions.
Self-hosted (Ollama / vLLM)
Section titled “Self-hosted (Ollama / vLLM)”
# Ollama for laptops
ollama pull llama3.1
curl http://localhost:11434/api/chat -d '{"model":"llama3.1","messages":[...]}'

# vLLM in prod
pip install vllm
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct
# OpenAI-compatible API at :8000
Cost / latency optimizations
Section titled “Cost / latency optimizations”
- Cache by `hash(question)` for FAQ-like queries.
- Pre-compute embeddings once; don’t re-embed.
- Smaller models (gpt-4.1-mini, Haiku) for simple queries; route to flagship for hard.
- Stream output to user.
- Set max output tokens.
- Reduce retrieved context if possible.
- Batch embedding calls.
- Vector DBs: pgvector, Qdrant, Pinecone, Weaviate, Milvus.
- Embedders: OpenAI, Cohere, Voyage, BGE, instructor-xl.
- Rerankers: Cohere Rerank, BGE reranker, Jina.
- Frameworks: LlamaIndex, Haystack, DSPy, LangChain (use carefully).
- Eval: RAGAS, DeepEval, Phoenix, Langfuse.
- Observability: Langfuse, Helicone, Phoenix, OpenLLMetry.