33.4 Caching: what to cache and what not to
Semantic Caching
Exact caching (returning a stored answer only when the incoming query string is identical to a previous one) rarely works in chat, because users phrase the same intent differently ("Hello" vs "Hi there").
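To make the limitation concrete, here is a minimal sketch of an exact-match cache; the dictionary and sample strings are illustrative, not from any particular library:

```python
# A naive exact-match cache: it only helps when the query string repeats
# verbatim (after trivial normalization).
exact_cache: dict[str, str] = {}

def lookup_exact(query: str) -> str | None:
    return exact_cache.get(query.strip().lower())

exact_cache["hello"] = "Hi! How can I help you today?"
print(lookup_exact("Hello"))     # hit, thanks to lowercasing
print(lookup_exact("Hi there"))  # miss (None), despite identical intent
```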
Semantic caching uses embeddings instead:

1. A user asks "How do I reset my password?"
2. You embed this query.
3. You search your cache (a vector DB) for similar previous queries.
4. You find "How can I change my password?", which has a cached answer.
5. If the similarity is above a threshold (e.g. > 0.95), you return the cached answer immediately, with near-zero latency and no LLM call.
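A minimal sketch of that lookup, assuming a 0.95 threshold and an in-memory list standing in for the vector DB. The `embed` function here is a toy hashed bag-of-words so the snippet runs on its own; in production you would call your embedding provider's API instead:

```python
import hashlib

import numpy as np

def embed(text: str) -> np.ndarray:
    """Toy embedding: hashed bag-of-words, L2-normalized. Stand-in only;
    replace with a real embedding model in production."""
    vec = np.zeros(256)
    for token in text.lower().split():
        idx = int(hashlib.md5(token.encode()).hexdigest(), 16) % 256
        vec[idx] += 1.0
    norm = np.linalg.norm(vec)
    return vec / norm if norm else vec

# In-memory stand-in for a vector DB: (embedding, cached answer) pairs.
semantic_cache: list[tuple[np.ndarray, str]] = []

def store(query: str, answer: str) -> None:
    semantic_cache.append((embed(query), answer))

def lookup(query: str, threshold: float = 0.95) -> str | None:
    """Return a cached answer if a previous query is similar enough."""
    q = embed(query)
    best_score, best_answer = 0.0, None
    for vec, answer in semantic_cache:
        score = float(np.dot(q, vec))  # cosine similarity (unit-norm vectors)
        if score > best_score:
            best_score, best_answer = score, answer
    return best_answer if best_score >= threshold else None

store("How can I change my password?", "Go to Settings > Security > Reset.")
# With a real embedding model, this paraphrase should score above the
# threshold and hit the cache; the toy hash embedding here will not.
print(lookup("How do I reset my password?"))
```

A real vector DB would replace the linear scan with an approximate nearest-neighbor search, but the lookup logic is the same.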
Context Caching (The Feature)
Google AI Studio and the Gemini API support Context Caching. This is different: instead of caching answers, it caches the prefix of your prompt (system instructions, huge files, few-shot examples) on the provider's servers.
- Scenario: You are building a "Chat with Book" app. You upload "War and Peace" (500k tokens).
- Without cache: you pay to process all 500k tokens on every single question.
- With cache: you pay to cache "War and Peace" once; subsequent queries pay only for the new question plus the answer (and a storage fee while the cache is alive).
This dramatically reduces cost and prefill latency: over 100 questions, you prefill the 500k-token prefix once instead of 50M tokens in total.
Use it when you have a static context of more than ~32k tokens that is reused across many requests. Cached content usually has a minimum lifetime you pay for (e.g. 1 hour of storage), so it's not worth it for one-off scripts.
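As a sketch of how this looks in practice, here is the flow using the `google-generativeai` Python SDK's caching module as documented for Gemini 1.5; the model name, file path, and API key are placeholders, and the SDK surface may have changed since:

```python
import datetime

import google.generativeai as genai
from google.generativeai import caching

genai.configure(api_key="YOUR_API_KEY")

# Send the static prefix once; the provider caches it server-side.
book_text = open("war_and_peace.txt").read()  # ~500k tokens of static context

cache = caching.CachedContent.create(
    model="models/gemini-1.5-flash-001",
    display_name="war-and-peace",
    system_instruction="Answer questions using only the provided book.",
    contents=[book_text],
    ttl=datetime.timedelta(hours=1),  # storage is billed for the cache's lifetime
)

# Each subsequent request pays only for the new question + answer tokens.
model = genai.GenerativeModel.from_cached_content(cached_content=cache)
response = model.generate_content("Who is Pierre Bezukhov?")
print(response.text)
```

The `ttl` controls how long you pay to keep the prefix cached; every `generate_content` call against the cached model reuses the 500k-token prefix without re-sending or re-processing it.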