33.3 Parallelizing retrieval and preprocessing

Most LLM pipelines fetch context from several independent sources before the model call can even start: a database, a vector index, the user's history. Because those fetches don't depend on each other, running them concurrently is one of the cheapest latency wins available.

The Serial Problem

// Slow: Sequential execution
const context = await fetchFromDB();      // 200ms
const docs = await searchVectorDB();      // 300ms
const userHistory = await getHistory();   // 100ms
const prompt = buildPrompt(context, docs, userHistory);
const result = await model.generate(prompt);  // 2000ms
// Total: 2600ms

Parallel Execution

// Fast: Parallel execution
const [context, docs, userHistory] = await Promise.all([
  fetchFromDB(),        // 200ms ─┐
  searchVectorDB(),     // 300ms ─┼─ Max: 300ms (parallel)
  getHistory()          // 100ms ─┘
]);

const prompt = buildPrompt(context, docs, userHistory);
const result = await model.generate(prompt);  // 2000ms
// Total: 2300ms (saved 300ms)
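
One thing to keep in mind: Promise.all rejects as soon as any input rejects, so a single flaky source fails the whole request. If partial context is acceptable, the standard Promise.allSettled lets the pipeline degrade gracefully. A minimal sketch, reusing the fetchers above:

// Tolerate individual failures: keep whatever context arrived
async function gatherContext() {
  const results = await Promise.allSettled([
    fetchFromDB(),
    searchVectorDB(),
    getHistory()
  ]);

  // Keep fulfilled values; substitute null for failed sources
  const [context, docs, userHistory] = results.map(r =>
    r.status === 'fulfilled' ? r.value : null
  );
  return { context, docs, userHistory };
}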

Patterns

// Pattern 1: Overlap retrieval with other preparation work
async function optimizedPipeline(query: string) {
  // Start retrieval
  const docsPromise = searchDocuments(query);
  
  // While waiting, prepare other context
  const userContext = await getUserContext();
  
  // Now wait for docs
  const docs = await docsPromise;
  
  // Build and send prompt
  return model.generate(buildPrompt(docs, userContext));
}
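
One caveat with manually staged awaits: if getUserContext() throws while docsPromise is still pending, the eventual rejection of docsPromise has no handler attached, which Node surfaces as an unhandled rejection. Pre-attaching a no-op handler (or simply using Promise.all, as above) avoids this. A minimal sketch:

// Variant of Pattern 1 that never leaves a rejection unobserved
async function optimizedPipelineSafe(query: string) {
  const docsPromise = searchDocuments(query);
  // Attach a handler now, in case the next await throws first
  docsPromise.catch(() => { /* rethrown at the await below */ });

  const userContext = await getUserContext();
  const docs = await docsPromise;  // rethrows here if retrieval failed
  return model.generate(buildPrompt(docs, userContext));
}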

// Pattern 2: Speculative execution
async function speculativeExec(query: string) {
  // Start multiple retrievals and use whichever settles first
  const result = await Promise.race([
    searchVectorDB(query),
    searchKeywordDB(query)
  ]);
  
  return result;
}
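
A caveat on Promise.race: it settles with the first promise to settle either way, so a source that fails quickly "wins" and rejects the whole call even if the other source would have succeeded. Promise.any instead resolves with the first fulfilled result and only rejects once every source has failed. A sketch using the same hypothetical search functions:

// Variant of Pattern 2 that ignores early failures
async function speculativeExecAny(query: string) {
  try {
    return await Promise.any([
      searchVectorDB(query),
      searchKeywordDB(query)
    ]);
  } catch (err) {
    // Promise.any throws an AggregateError when all sources fail
    throw new Error(`All retrieval sources failed for query: ${query}`);
  }
}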

// Pattern 3: Background prefetch
// Kick off loading the user's recent data on page load
// so it's usually ready by the time they submit a query
let prefetchedContext: Promise<Context> | null = null;

onPageLoad((userId) => {
  // Store the in-flight promise rather than awaiting it here,
  // so a query that arrives mid-prefetch can await it too
  prefetchedContext = loadUserContext(userId);
});

onQuery(async (query) => {
  // Resolves immediately if the prefetch has already finished
  const context = prefetchedContext ? await prefetchedContext : null;
  return model.generate(buildPrompt(context, query));
});
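
Storing the promise rather than the resolved value avoids a race: a query submitted before the prefetch finishes doesn't see empty context, it simply waits out the remaining prefetch time instead of re-fetching from scratch.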
