Skip to content

RAG / LLM — Practical

import OpenAI from 'openai';
import { Pool } from 'pg';
// OpenAI client used for the embedding and chat-completion calls in this file.
// Constructed with default options (presumably the API key is picked up from the
// environment — confirm against the SDK configuration in use).
const ai = new OpenAI();
// Postgres connection pool; the connection string is read from the DATABASE_URL
// environment variable.
const db = new Pool({ connectionString: process.env.DATABASE_URL });
// Expected database schema (pgvector). The VECTOR(1536) dimension must match the
// length of the vectors stored in the embedding column.
// schema:
// CREATE EXTENSION vector;
// CREATE TABLE docs (id BIGSERIAL PRIMARY KEY, source TEXT, chunk TEXT,
// embedding VECTOR(1536), metadata JSONB);
// CREATE INDEX ON docs USING ivfflat (embedding vector_cosine_ops);
/**
 * Embeds one piece of text with the `text-embedding-3-small` model.
 *
 * @param text - The text to embed.
 * @returns The embedding vector of the first entry in the API response.
 */
async function embed(text: string) {
  const { data } = await ai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text,
  });
  // A single input is sent, so only the first result is read.
  const [first] = data;
  return first.embedding;
}
/**
 * Chunks `text`, embeds every chunk, and inserts one `docs` row per chunk.
 *
 * Chunks are handled strictly one after another: each chunk is embedded and
 * inserted before the next one is started.
 *
 * @param source - Value stored in the `source` column of every inserted row.
 * @param text - Full document text; split with `chunkText(text, 800, 100)`.
 * @param metadata - Value stored in the `metadata` column of every inserted row.
 */
async function ingest(source: string, text: string, metadata = {}) {
  const insertSql =
    'INSERT INTO docs(source, chunk, embedding, metadata)' +
    '\n' +
    'VALUES ($1, $2, $3, $4)';
  const pieces = chunkText(text, 800, 100);
  for (const piece of pieces) {
    const vector = await embed(piece);
    // The vector is sent as its JSON text form (e.g. "[0.1,0.2]").
    const params = [source, piece, JSON.stringify(vector), metadata];
    await db.query(insertSql, params);
  }
}
/**
 * Returns the `k` rows of `docs` closest to the query under the `<=>` distance.
 *
 * @param query - Natural-language query; embedded with `embed` before searching.
 * @param k - Maximum number of rows to return.
 * @returns Rows with `chunk`, `source`, `metadata` and `sim`
 *   (computed as `1 - distance`), ordered by ascending distance.
 */
async function retrieve(query: string, k = 5) {
  const sql = [
    'SELECT chunk, source, metadata, 1 - (embedding <=> $1::vector) AS sim',
    'FROM docs',
    'ORDER BY embedding <=> $1::vector',
    'LIMIT $2',
  ].join('\n');
  // The embedding is passed as JSON text and cast to `vector` inside the query.
  const queryVector = JSON.stringify(await embed(query));
  const { rows } = await db.query(sql, [queryVector, k]);
  return rows;
}
/**
 * Answers a question using only retrieved context.
 *
 * Fetches the top 5 chunks for the question, formats them as
 * `[source] chunk` blocks separated by blank lines, and asks the chat model
 * to answer from that context with source citations.
 *
 * @param question - The user's question.
 * @returns The message content of the first completion choice.
 */
async function answer(question: string) {
  const passages = await retrieve(question, 5);
  const contextBlock = passages
    .map((row) => `[${row.source}] ${row.chunk}`)
    .join('\n\n');

  const systemPrompt =
    `Answer ONLY from the provided context. If unknown, say "I don't know". Cite sources by [source].`;
  const userPrompt = `CONTEXT:\n${contextBlock}\n\nQUESTION: ${question}`;

  const completion = await ai.chat.completions.create({
    model: 'gpt-4.1-mini',
    temperature: 0,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
  });
  const [firstChoice] = completion.choices;
  return firstChoice.message.content;
}
/**
 * Splits text into chunks along paragraph boundaries and optionally prefixes
 * each chunk (except the first) with the tail of the previous chunk.
 *
 * Paragraphs (separated by one or more blank lines) are packed greedily: a
 * paragraph is appended to the current chunk while the chunk text plus the
 * paragraph stays shorter than `size` characters (the joining separator is not
 * counted). Splitting happens ONLY at paragraph boundaries, so a single
 * paragraph of `size` characters or more becomes one oversized chunk.
 *
 * @param text - The text to split.
 * @param size - Target upper bound on chunk length, in characters.
 * @param overlap - Number of trailing characters of the previous chunk to
 *   prepend (followed by a newline) to each later chunk. Zero, negative or NaN
 *   values disable overlap.
 * @returns The chunks in document order; empty when `text` has no content.
 */
function chunkText(text: string, size = 800, overlap = 100): string[] {
  const paragraphs = text.split(/\n\n+/);
  const chunks: string[] = [];
  let buf = '';
  for (const p of paragraphs) {
    if ((buf + p).length < size) {
      buf += (buf ? '\n\n' : '') + p;
    } else {
      if (buf) chunks.push(buf);
      buf = p;
    }
  }
  if (buf) chunks.push(buf);

  // `slice(-0)` is `slice(0)` and would copy the ENTIRE previous chunk, and a
  // negative overlap would slice from the front; treat both as "no overlap".
  if (!(overlap > 0)) return chunks;

  const withOverlap: string[] = [];
  let previous: string | undefined;
  for (const chunk of chunks) {
    // The tail is taken from the raw previous chunk, not its overlapped form.
    withOverlap.push(
      previous === undefined ? chunk : previous.slice(-overlap) + '\n' + chunk,
    );
    previous = chunk;
  }
  return withOverlap;
}
-- Hybrid retrieval: rank documents by vector distance and by keyword match,
-- then fuse the two rankings with Reciprocal Rank Fusion (RRF).
--   $1 = query embedding (text castable to vector)
--   $2 = raw user query string
-- assume tsvector column maintained:
-- ALTER TABLE docs ADD COLUMN tsv tsvector
-- GENERATED ALWAYS AS (to_tsvector('english', chunk)) STORED;
-- CREATE INDEX ON docs USING gin(tsv);
WITH vec AS (
  -- Top 50 rows by vector distance; rk = 1 is the closest.
  SELECT id, source, chunk,
         ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) AS rk
  FROM docs
  ORDER BY embedding <=> $1::vector
  LIMIT 50
), kw AS (
  -- Top 50 rows by full-text rank. The query is parsed with the same 'english'
  -- configuration used to build tsv, and the explicit ORDER BY guarantees that
  -- LIMIT keeps the best-ranked matches rather than an arbitrary 50.
  SELECT id, source, chunk,
         ROW_NUMBER() OVER (
           ORDER BY ts_rank(tsv, websearch_to_tsquery('english', $2)) DESC
         ) AS rk
  FROM docs
  WHERE tsv @@ websearch_to_tsquery('english', $2)
  ORDER BY rk
  LIMIT 50
), fused AS (
  -- RRF: each list contributes 1 / (60 + rank); rows found by both lists sum.
  SELECT id, source, chunk, SUM(1.0 / (60 + rk)) AS score
  FROM (SELECT * FROM vec UNION ALL SELECT * FROM kw) u
  GROUP BY id, source, chunk
)
SELECT * FROM fused ORDER BY score DESC LIMIT 10;
import { CohereClient } from 'cohere-ai';
// Cohere client used by rerank(); the API token is read from the COHERE_KEY
// environment variable.
const co = new CohereClient({ token: process.env.COHERE_KEY });
/**
 * Reorders candidate documents by relevance to the query using Cohere's
 * `rerank-multilingual-v3.0` model.
 *
 * @param query - The search query.
 * @param docs - Candidate documents; only `text` is sent to the reranker.
 * @param top - Number of results requested from the reranker (`topN`).
 * @returns The original `docs` entries, in the order the reranker returned them.
 */
async function rerank(query: string, docs: { id: string, text: string }[], top = 5) {
  const texts = docs.map(({ text }) => text);
  const { results } = await co.rerank({
    model: 'rerank-multilingual-v3.0',
    query,
    documents: texts,
    topN: top,
  });
  // Each result refers to its input by position; map it back to the full doc.
  return results.map(({ index }) => docs[index]);
}
// Streaming variant of the chat call: with `stream: true` the result is consumed
// as an async iterable of parts instead of a single completion.
const stream = await ai.chat.completions.create({
model: 'gpt-4.1-mini',
stream: true,
temperature: 0,
messages: [...], // placeholder: supply the chat messages here
});
// Write each part's content delta to stdout as it arrives; a part without
// content writes an empty string.
for await (const part of stream) {
process.stdout.write(part.choices[0]?.delta?.content ?? '');
}
-- Tenant-scoped nearest-neighbour search.
--   $1 = tenant identifier, compared with the 'tenant' key of metadata
--   $2 = query embedding (text castable to vector)
SELECT d.*
FROM docs AS d
WHERE d.metadata ->> 'tenant' = $1
ORDER BY d.embedding <=> $2::vector
LIMIT 10;

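Always apply the tenant filter before similarity ranking to get hard isolation. In Pinecone, use one namespace per tenant; in Qdrant, filter on a tenant payload field (or use a separate collection per tenant).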

# Evaluate RAG output with RAGAS over a dataset of question/answer records.
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset
# Each record carries the question, the generated answer, the contexts, and the
# ground truth. The "..." strings, `[...]`, and the trailing `...` entry are
# placeholders to be replaced with real evaluation data.
ds = Dataset.from_list([
{ "question": "...", "answer": "...", "contexts": [...], "ground_truth": "..." },
...
])
# Score the dataset on the four imported metrics and print the result.
result = evaluate(ds, metrics=[faithfulness, answer_relevancy, context_precision, context_recall])
print(result)

Run this on a golden set for every PR; fail the build if any score regresses.

LangChain quick-start (for rapid prototyping)

Section titled “LangChain quick-start (for rapid prototyping)”
# Minimal LangChain RAG pipeline: split -> embed -> store in PGVector -> retrieve -> answer.
# NOTE(review): `loaded` (the documents to split) and `DSN` (the database
# connection string) are not defined in this snippet and must be supplied.
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import PGVector
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
# Split with chunk_size=800 and chunk_overlap=100.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
docs = splitter.split_documents(loaded)
emb = OpenAIEmbeddings(model="text-embedding-3-small")
# Build the vector store from the split documents using the embedding model.
store = PGVector.from_documents(docs, emb, connection_string=DSN)
# Retriever configured with k=5.
retriever = store.as_retriever(search_kwargs={"k": 5})
chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(model="gpt-4.1-mini"), retriever=retriever)
# The "..." query is a placeholder for the real question.
print(chain.invoke({"query": "..."}))

For production, prefer thinner custom code over LangChain’s abstractions.

Terminal window
# Ollama for laptops
# Pull the llama3.1 model, then POST a chat request to the API on localhost:11434.
# The `[...]` inside the JSON body is a placeholder for the real messages array.
ollama pull llama3.1
curl http://localhost:11434/api/chat -d '{"model":"llama3.1","messages":[...]}'
# vLLM in prod
# Install vLLM and start its OpenAI-style API server for Llama-3.1-8B-Instruct.
pip install vllm
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct
# OpenAI-compatible API at :8000
  • Cache answers by hash(question) for FAQ-like workloads.
  • Pre-compute embeddings once; don’t re-embed.
  • Use smaller models (gpt-4.1-mini, Haiku) for simple queries; route hard ones to a flagship model.
  • Stream output to user.
  • Set max output tokens.
  • Reduce retrieved context if possible.
  • Batch embedding calls.
  • Vector DBs: pgvector, Qdrant, Pinecone, Weaviate, Milvus.
  • Embedders: OpenAI, Cohere, Voyage, BGE, instructor-xl.
  • Rerankers: Cohere Rerank, BGE reranker, Jina.
  • Frameworks: LlamaIndex, Haystack, DSPy, LangChain (use carefully).
  • Eval: RAGAS, DeepEval, Phoenix, Langfuse.
  • Observability: Langfuse, Helicone, Phoenix, OpenLLMetry.