36.3 Schema inference and validation

On this page

Resolving Ambiguity
Code Validation
Full Implementation
Where to go next

Resolving Ambiguity

User questions are often ambiguous. The model needs to detect this and ask for clarification:

User Question	Ambiguity	Clarification Needed
"Show me sales"	Which sales column?	`sales_jan`, `sales_feb`, or total?
"Last month's data"	Which month?	December 2023? Or relative to today?
"Top products"	Top by what?	Revenue? Units sold? Profit?

// schema-inference.ts

/** One point of ambiguity the model reported for the user's question. */
export interface SchemaAmbiguity {
  description: string;
  options: string[];
  suggestedQuestion: string;
}

/** Shape of the JSON verdict that the prompt in `inferSchemaMapping` asks for. */
export interface SchemaInferenceResult {
  canProceed: boolean;
  ambiguities: SchemaAmbiguity[];
  suggestedInterpretation?: string;
}

/**
 * Extracts the JSON payload from a model reply.
 *
 * If the reply contains a Markdown code fence (with or without a `json` tag),
 * the content of the first fence is returned; otherwise the whole reply is
 * used. The result is trimmed either way.
 */
function extractJsonPayload(reply: string): string {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(reply);
  return (fenced?.[1] ?? reply).trim();
}

/**
 * Asks the model whether `question` can be answered unambiguously against
 * `schema`, and returns its parsed verdict.
 *
 * @param schema   Textual description of the dataset schema, embedded verbatim in the prompt.
 * @param question The user's question, embedded verbatim (in double quotes) in the prompt.
 * @param model    Model used to generate the verdict; only `generateContent(prompt)` and
 *                 `response.text()` on its result are used.
 * @returns The parsed verdict. `ambiguities` is always an array (empty when the
 *          model omitted it); any extra fields the model returned are preserved.
 * @throws SyntaxError if the reply (after stripping a code fence) is not valid JSON.
 * @throws TypeError if the parsed reply is not an object with a boolean `canProceed`.
 */
export async function inferSchemaMapping(
  schema: string,
  question: string,
  model: GenerativeModel
): Promise<SchemaInferenceResult> {
  const prompt = `Determine if the question can be answered unambiguously.

## Dataset Schema
${schema}

## User Question
"${question}"

Output JSON:
{
  "canProceed": true|false,
  "ambiguities": [
    {
      "description": "Brief explanation",
      "options": ["option1", "option2"],
      "suggestedQuestion": "Would you like X or Y?"
    }
  ],
  "suggestedInterpretation": "How you'll interpret if confident"
}`;

  const result = await model.generateContent(prompt);
  const reply: string = result.response.text();

  // Model replies are free text: tolerate a Markdown fence around the JSON and
  // report unparseable output with a snippet instead of a bare parser message.
  let parsed: unknown;
  try {
    parsed = JSON.parse(extractJsonPayload(reply));
  } catch {
    throw new SyntaxError(`Model reply is not valid JSON: ${reply.slice(0, 200)}`);
  }

  // Callers branch on `canProceed`, so refuse anything that lacks it rather
  // than letting a malformed verdict be treated as "do not proceed".
  if (
    typeof parsed !== 'object' ||
    parsed === null ||
    typeof (parsed as { canProceed?: unknown }).canProceed !== 'boolean'
  ) {
    throw new TypeError('Model reply is missing a boolean "canProceed" field');
  }

  const verdict = parsed as Partial<SchemaInferenceResult> & { canProceed: boolean };
  return {
    ...verdict,
    ambiguities: Array.isArray(verdict.ambiguities) ? verdict.ambiguities : [],
  };
}

Code Validation

Before executing any generated code, validate it for safety:

// code-validator.ts

// Root packages that generated Python code may import. Only the first dotted
// segment of an imported module is looked up here, so `matplotlib.pyplot` is
// accepted via `matplotlib`.
const ALLOWED_IMPORTS = new Set([
  'pandas', 'pd', 'numpy', 'np',
  'matplotlib', 'matplotlib.pyplot', 'plt',
  'seaborn', 'sns', 'json', 'datetime', 'math'
]);

// Constructs rejected outright, wherever they appear in the text.
const BANNED_PATTERNS = [
  /\bimport\s+os\b/,
  /\bimport\s+sys\b/,
  /\bimport\s+subprocess\b/,
  /\bimport\s+requests\b/,
  /\bexec\s*\(/,
  /\beval\s*\(/,
  /\bopen\s*\(/,
  // Dynamic import that never goes through an `import` statement.
  /\b__import__\s*\(/,
];

// Matches `from <module> import` (group 1) or `import <targets>` (group 2),
// where <targets> runs to the end of the statement (newline, `;` or `#`).
const IMPORT_STATEMENT = /from\s+([\w.]+)\s+import|import\s+([^\s;#][^\r\n;#]*)/g;

/** Returns the first dotted segment of a module path (`a.b.c` -> `a`). */
function rootModule(dotted: string): string {
  return dotted.split('.')[0] ?? '';
}

/**
 * Lists the root module of every import found in `code`, in source order.
 *
 * Every comma-separated target of an `import a, b as c` statement is reported.
 * A target that does not start with a module name (for example the backslash
 * of a line continuation) is reported verbatim so that it fails the allowlist.
 */
function importedRoots(code: string): string[] {
  const roots: string[] = [];
  for (const match of code.matchAll(IMPORT_STATEMENT)) {
    const fromModule = match[1];
    if (fromModule !== undefined) {
      roots.push(rootModule(fromModule));
      continue;
    }
    for (const part of (match[2] ?? '').split(',')) {
      const target = part.trim();
      if (target === '') continue;
      // Leading name only: this drops any trailing ` as alias`.
      const dotted = /^[\w.]+/.exec(target)?.[0];
      roots.push(dotted === undefined ? target : rootModule(dotted));
    }
  }
  return roots;
}

/**
 * Statically screens generated Python code before it is executed.
 *
 * Two checks run over the raw text: every entry of `BANNED_PATTERNS` is tested,
 * and every imported module must have its root package in `ALLOWED_IMPORTS`.
 * The checks are purely textual, so matches inside comments or string literals
 * are reported too.
 *
 * @param code Python source to screen.
 * @returns `valid` is true when no check produced an error; `errors` lists one
 *          `Blocked: <pattern>` or `Disallowed import: <module>` entry per finding.
 */
export function validateCode(code: string): ValidationResult {
  const errors: string[] = [];

  // Check for banned patterns
  for (const pattern of BANNED_PATTERNS) {
    if (pattern.test(code)) {
      errors.push(`Blocked: ${pattern.toString()}`);
    }
  }

  // Check imports
  for (const moduleName of importedRoots(code)) {
    if (!ALLOWED_IMPORTS.has(moduleName)) {
      errors.push(`Disallowed import: ${moduleName}`);
    }
  }

  return { valid: errors.length === 0, errors };
}

Full Implementation

// schema-validator.ts
/**
 * Checks that generated pandas code only refers to columns that exist.
 */
export class SchemaValidator {
  /**
   * Scans `code` for column references on a variable named `df` and reports
   * the ones that are neither in `schema` nor created by the code itself.
   *
   * Recognised references are `df['name']` / `df["name"]` and attribute access
   * `df.name` that is not a method call. A `df['name'] = ...` assignment
   * counts as defining `name`. Common non-column DataFrame attributes (for
   * example `shape`, `loc`, `columns`) are ignored for attribute access only;
   * bracket access is always treated as a column lookup.
   *
   * @param code   Python source to scan.
   * @param schema Dataset metadata; only `schema.columns[].name` is read.
   * @returns One `Column "<name>" not found in schema` message per distinct
   *          unknown column, bracket references first, in order of appearance.
   */
  validateCodeAgainstSchema(code: string, schema: DatasetMetadata): string[] {
    // DataFrame members that are read without parentheses and are not columns.
    const nonColumnAttributes = new Set([
      'index', 'values', 'columns',
      'shape', 'dtypes', 'size', 'ndim', 'empty', 'T', 'axes', 'attrs', 'flags',
      'loc', 'iloc', 'at', 'iat', 'plot', 'style',
    ]);

    const known = new Set<string>(schema.columns.map(c => c.name));

    // Columns the code creates itself (`df['total'] = ...`, but not `==`).
    for (const match of code.matchAll(/\bdf\[(['"])([^'"\]]+)\1\]\s*=(?!=)/g)) {
      const created = match[2];
      if (created !== undefined) known.add(created);
    }

    const missing = new Set<string>();

    // Look for df['column'] patterns; names may contain spaces or punctuation.
    for (const match of code.matchAll(/\bdf\[(['"])([^'"\]]+)\1\]/g)) {
      const colName = match[2];
      if (colName !== undefined && !known.has(colName)) {
        missing.add(colName);
      }
    }

    // Look for df.column patterns. The `\b` after the name stops the engine
    // from backtracking into a prefix of a method name (`df.head()` -> "hea").
    for (const match of code.matchAll(/\bdf\.(\w+)\b(?!\s*\()/g)) {
      const colName = match[1];
      if (
        colName !== undefined &&
        !known.has(colName) &&
        !nonColumnAttributes.has(colName)
      ) {
        missing.add(colName);
      }
    }

    return [...missing].map(colName => `Column "${colName}" not found in schema`);
  }
}

// Sandbox execution with Docker
/**
 * Runs generated Python code in a throwaway container.
 *
 * NOTE(review): `this.docker` is not declared or initialised in this class as
 * shown; a client exposing `createContainer()` must be attached before
 * `execute()` is called — confirm how it is provided.
 */
export class DockerSandbox {
  /**
   * Executes `code` with `python -c` in a new container and returns its logs.
   *
   * The container is created with networking disabled, a 512MB memory limit,
   * a read-only root filesystem, and `dataPath` bind-mounted read-only at
   * `/data/input.csv`. It is removed afterwards, including when starting,
   * waiting or log collection fails.
   *
   * NOTE(review): no execution time limit is applied here, and the exit
   * status resolved by `wait()` is not inspected.
   *
   * @param code     Python source passed as a single `-c` argument (no shell involved).
   * @param dataPath Host path of the input file to mount.
   * @returns The `stdout` and `stderr` fields of the value resolved by
   *          `container.logs()` — assumed to be strings; TODO confirm against
   *          the Docker client in use.
   * @throws Whatever the client rejects with; on failure the container is
   *         force-removed first and the original error is rethrown.
   */
  async execute(code: string, dataPath: string): Promise<{ stdout: string; stderr: string }> {
    const container = await this.docker.createContainer({
      Image: 'python-analyst:3.10',
      Cmd: ['python', '-c', code],
      NetworkDisabled: true,
      HostConfig: {
        Memory: 512 * 1024 * 1024,  // 512MB
        ReadonlyRootfs: true,
        Binds: [`${dataPath}:/data/input.csv:ro`],
      },
    });

    let output: { stdout: string; stderr: string };
    try {
      await container.start();
      await container.wait();
      const logs = await container.logs({ stdout: true, stderr: true });
      output = { stdout: logs.stdout, stderr: logs.stderr };
    } catch (err) {
      // Do not leak the container. It may still be running, hence `force`
      // (Docker Engine "remove container" option — confirm the client forwards it).
      try {
        await container.remove({ force: true });
      } catch {
        // Deliberately ignored so the original failure is the one reported.
      }
      throw err;
    }

    await container.remove();
    return output;
  }
}

Defense in Depth

Use ALL validation layers: static code analysis → import checking → schema validation → sandbox isolation. No single layer is sufficient alone.

Where to go next

36.4 Reproducible "analysis notebooks" from AI output