36.2 Ask questions in natural language; answer in charts/tables

On this page

The Code Generation Approach
The Analysis Prompt
Full Implementation
Handling Different Outputs
Example Queries
Where to go next

The Code Generation Approach

The key insight: Don't ask the LLM to analyze the data. Ask it to write code that analyzes the data.

Why this matters:

Accuracy: Python calculates correctly. LLMs hallucinate arithmetic.
Reproducibility: Code can be audited, saved, and re-run.
Scalability: Same code works on 100 rows or 10 million rows.
Explainability: Users can see exactly what was done.

The Analysis Prompt

// analysis-prompt.ts
export function buildAnalysisPrompt(
  schema: string,
  question: string
): string {
  return `You are a data analyst. Write Python code to answer the user's question.

## Dataset Schema
A pandas DataFrame named \`df\` is already loaded with the following structure:
${schema}

## User Question
"${question}"

## Instructions
1. Write Python code using pandas, numpy, and matplotlib/seaborn
2. The DataFrame \`df\` is already loaded - do NOT load it yourself
3. For charts: save to 'output.png' using plt.savefig('output.png', dpi=150, bbox_inches='tight')
4. For tables: print as JSON using print(result.to_json(orient='records'))
5. For single values: print as JSON using print(json.dumps({"result": value}))
6. Add brief comments explaining your logic
7. Handle edge cases (nulls, empty data)

## Output Format
Return ONLY the Python code, no markdown formatting:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

# Your analysis code here
`;
}

// For follow-up questions that build on previous analysis
export function buildFollowUpPrompt(
  schema: string,
  conversationHistory: Array<{question: string, code: string}>,
  newQuestion: string
): string {
  const historyContext = conversationHistory.map((turn, i) => 
    `Question ${i + 1}: "${turn.question}"\nCode:\n${turn.code}`
  ).join('\n\n');

  return `You are a data analyst continuing an analysis session.

## Dataset Schema
${schema}

## Previous Analysis
${historyContext}

## New Question
"${newQuestion}"

## Instructions
Build on the previous analysis if relevant. Write Python code to answer the new question.
The DataFrame \`df\` is already loaded. Previous code has already run, so any variables created are available.

Return ONLY the Python code:`;
}

Full Implementation

// analyst.ts
import { GoogleGenerativeAI } from '@google/generative-ai';

interface AnalysisResult {
  code: string;
  output: {
    type: 'chart' | 'table' | 'value' | 'error';
    data: any;
  };
  executionTime: number;
}

export class DataAnalyst {
  private model: any;
  private sandbox: CodeSandbox;
  
  constructor(apiKey: string, sandbox: CodeSandbox) {
    const genAI = new GoogleGenerativeAI(apiKey);
    this.model = genAI.getGenerativeModel({
      model: 'gemini-1.5-pro',
      generationConfig: {
        temperature: 0,  // Deterministic code generation
        maxOutputTokens: 4096,
      }
    });
    this.sandbox = sandbox;
  }
  
  async analyze(
    schema: string,
    question: string,
    dataPath: string
  ): Promise {
    // Step 1: Generate the code
    const prompt = buildAnalysisPrompt(schema, question);
    const result = await this.model.generateContent(prompt);
    const code = this.extractCode(result.response.text());
    
    // Step 2: Validate the code
    const validation = validateCode(code);
    if (!validation.valid) {
      throw new Error(`Code validation failed: ${validation.errors.join(', ')}`);
    }
    
    // Step 3: Execute in sandbox
    const startTime = Date.now();
    const output = await this.sandbox.execute(code, { dataPath });
    const executionTime = Date.now() - startTime;
    
    // Step 4: Parse and return results
    return {
      code,
      output: this.parseOutput(output),
      executionTime
    };
  }
  
  private extractCode(response: string): string {
    // Remove markdown code blocks if present
    let code = response;
    
    const codeBlockMatch = response.match(/```python\n([\s\S]*?)```/);
    if (codeBlockMatch) {
      code = codeBlockMatch[1];
    }
    
    // Remove any leading/trailing backticks
    code = code.replace(/^`+|`+$/g, '').trim();
    
    return code;
  }
  
  private parseOutput(sandboxOutput: SandboxOutput): AnalysisResult['output'] {
    // Check for chart file
    if (sandboxOutput.files['output.png']) {
      return {
        type: 'chart',
        data: sandboxOutput.files['output.png']  // Base64 or URL
      };
    }
    
    // Check stdout for JSON
    const stdout = sandboxOutput.stdout.trim();
    if (stdout) {
      try {
        const parsed = JSON.parse(stdout);
        
        // Array = table data
        if (Array.isArray(parsed)) {
          return { type: 'table', data: parsed };
        }
        
        // Object with 'result' = single value
        if (parsed.result !== undefined) {
          return { type: 'value', data: parsed.result };
        }
        
        // Other object = treat as table
        return { type: 'table', data: [parsed] };
      } catch {
        // Not JSON, return as plain text
        return { type: 'value', data: stdout };
      }
    }
    
    // Check for errors
    if (sandboxOutput.stderr) {
      return { type: 'error', data: sandboxOutput.stderr };
    }
    
    return { type: 'error', data: 'No output generated' };
  }
}

// Usage
const analyst = new DataAnalyst(process.env.GEMINI_API_KEY!, sandbox);

const result = await analyst.analyze(
  schemaDescription,
  "Show me total sales by region as a bar chart",
  "/tmp/data/uploaded.csv"
);

if (result.output.type === 'chart') {
  // Display the image
  displayImage(result.output.data);
} else if (result.output.type === 'table') {
  // Render as a table
  renderTable(result.output.data);
}

// Show the code for transparency
console.log("Generated code:", result.code);

Handling Different Outputs

Users ask different types of questions requiring different output formats:

Question Type	Output Format	Example
Visualization	PNG image	"Plot sales over time"
Aggregation	Table (JSON)	"Show top 10 products by revenue"
Single Metric	Value (JSON)	"What's the average order value?"
Comparison	Chart or Table	"Compare Q1 vs Q2 sales"

// Output rendering based on type
function renderOutput(result: AnalysisResult['output']): React.ReactNode {
  switch (result.type) {
    case 'chart':
      return ;
    
    case 'table':
      return (
        
              {Object.keys(result.data[0] || {}).map(col => 
                
              )}
            
            {result.data.map((row, i) => (
              
                {Object.values(row).map((val, j) => 
                  
                )}
              
            ))}
          
          
            {col}
          
          {formatValue(val)}
        
      );
    
    case 'value':
      return (
        
          {formatValue(result.data)}
        
      );
    
    case 'error':
      return (
        
          Error: {result.data}
        
      );
  }
}

{col}
{formatValue(val)}

Example Queries

// Example 1: Simple aggregation
Question: "What are the total sales by region?"
Generated Code:
  sales_by_region = df.groupby('region')['sales'].sum().reset_index()
  sales_by_region.columns = ['Region', 'Total Sales']
  print(sales_by_region.to_json(orient='records'))

Output: [{"Region": "North", "Total Sales": 125000}, ...]

// Example 2: Time series chart
Question: "Plot monthly revenue trend for 2023"
Generated Code:
  df['date'] = pd.to_datetime(df['date'])
  monthly = df[df['date'].dt.year == 2023].resample('M', on='date')['revenue'].sum()
  
  plt.figure(figsize=(12, 6))
  plt.plot(monthly.index, monthly.values, marker='o')
  plt.title('Monthly Revenue Trend - 2023')
  plt.xlabel('Month')
  plt.ylabel('Revenue ($)')
  plt.xticks(rotation=45)
  plt.grid(True, alpha=0.3)
  plt.savefig('output.png', dpi=150, bbox_inches='tight')

Output: [Chart image]

// Example 3: Complex analysis
Question: "Which product category has the highest growth rate month-over-month?"
Generated Code:
  df['date'] = pd.to_datetime(df['date'])
  df['month'] = df['date'].dt.to_period('M')
  
  monthly_category = df.groupby(['month', 'category'])['sales'].sum().unstack()
  growth_rates = monthly_category.pct_change().mean()
  
  result = growth_rates.idxmax()
  avg_growth = growth_rates.max() * 100
  
  print(json.dumps({
    "result": f"{result} with {avg_growth:.1f}% average monthly growth"
  }))

Show the Code

Always display the generated code to users. It builds trust, enables verification, and teaches them about data analysis. Consider a "Code" tab next to the results.

Where to go next

36.3 Schema inference and validation