36.3 Schema inference and validation

Overview and links for this section of the guide.

Resolving Ambiguity

User questions are often ambiguous. The model needs to detect this and ask for clarification:

User Question Ambiguity Clarification Needed
"Show me sales" Which sales column? sales_jan, sales_feb, or total?
"Last month's data" Which month? December 2023? Or relative to today?
"Top products" Top by what? Revenue? Units sold? Profit?
// schema-inference.ts
export async function inferSchemaMapping(
  schema: string,
  question: string,
  model: GenerativeModel
): Promise {
  const prompt = `Determine if the question can be answered unambiguously.

## Dataset Schema
${schema}

## User Question
"${question}"

Output JSON:
{
  "canProceed": true|false,
  "ambiguities": [
    {
      "description": "Brief explanation",
      "options": ["option1", "option2"],
      "suggestedQuestion": "Would you like X or Y?"
    }
  ],
  "suggestedInterpretation": "How you'll interpret if confident"
}`;

  const result = await model.generateContent(prompt);
  return JSON.parse(result.response.text());
}

Code Validation

Before executing any generated code, validate it for safety:

// code-validator.ts
const ALLOWED_IMPORTS = new Set([
  'pandas', 'pd', 'numpy', 'np',
  'matplotlib', 'matplotlib.pyplot', 'plt',
  'seaborn', 'sns', 'json', 'datetime', 'math'
]);

const BANNED_PATTERNS = [
  /\bimport\s+os\b/,
  /\bimport\s+sys\b/,
  /\bimport\s+subprocess\b/,
  /\bimport\s+requests\b/,
  /\bexec\s*\(/,
  /\beval\s*\(/,
  /\bopen\s*\(/,
];

export function validateCode(code: string): ValidationResult {
  const errors: string[] = [];
  
  // Check for banned patterns
  for (const pattern of BANNED_PATTERNS) {
    if (pattern.test(code)) {
      errors.push(`Blocked: ${pattern.toString()}`);
    }
  }
  
  // Check imports
  const importMatches = code.matchAll(/import\s+([\w.]+)|from\s+([\w.]+)\s+import/g);
  for (const match of importMatches) {
    const moduleName = (match[1] || match[2]).split('.')[0];
    if (!ALLOWED_IMPORTS.has(moduleName)) {
      errors.push(`Disallowed import: ${moduleName}`);
    }
  }
  
  return { valid: errors.length === 0, errors };
}

Full Implementation

// schema-validator.ts
export class SchemaValidator {
  validateCodeAgainstSchema(code: string, schema: DatasetMetadata): string[] {
    const errors: string[] = [];
    const columnNames = schema.columns.map(c => c.name);
    
    // Look for df['column'] patterns
    const columnRefs = [
      ...code.matchAll(/df\[['"](\w+)['"]\]/g),
      ...code.matchAll(/df\.(\w+)(?!\s*\()/g),
    ];
    
    for (const match of columnRefs) {
      const colName = match[1];
      if (!columnNames.includes(colName) && 
          !['index', 'values', 'columns'].includes(colName)) {
        errors.push(`Column "${colName}" not found in schema`);
      }
    }
    
    return errors;
  }
}

// Sandbox execution with Docker
export class DockerSandbox {
  async execute(code: string, dataPath: string): Promise {
    const container = await this.docker.createContainer({
      Image: 'python-analyst:3.10',
      Cmd: ['python', '-c', code],
      NetworkDisabled: true,
      HostConfig: {
        Memory: 512 * 1024 * 1024,  // 512MB
        ReadonlyRootfs: true,
        Binds: [`${dataPath}:/data/input.csv:ro`],
      },
    });
    
    await container.start();
    const result = await container.wait();
    const logs = await container.logs({ stdout: true, stderr: true });
    await container.remove();
    
    return { stdout: logs.stdout, stderr: logs.stderr };
  }
}
Defense in Depth

Use ALL validation layers: static code analysis → import checking → sandbox isolation. No single layer is sufficient alone.

Where to go next