36.1 Upload/ingest flow for CSV-like data

Overview and links for this section of the guide.

The Context Window Problem

You cannot send a 1GB CSV to the model. Even Gemini's 2M token context would only fit ~6MB of raw text. But here's the key insight: you don't need to.

To understand the data and write correct code, the model only needs:

  1. The column names
  2. The data types (int, float, string, date)
  3. A few sample rows (typically 3-5)
  4. Basic statistics (row count, null counts, unique values)

This "metadata fingerprint" is usually under 500 tokens, even for a 1 million row dataset.

The simplest approach: show the model the first few rows, just like a data scientist would use df.head().

// head-extraction.ts
import * as Papa from 'papaparse';

/** Compact description of a dataset — sent to the LLM instead of raw rows. */
interface DatasetMetadata {
  columns: Column[];
  sampleRows: any[];                   // first N parsed rows, shown verbatim to the model
  totalRows: number;
  // Fix: bare `Record` is a compile error — it requires two type arguments.
  nullCounts: Record<string, number>;  // column name -> count of null/empty cells
}

/** Per-column summary used to describe the schema to the LLM. */
interface Column {
  name: string;
  dtype: 'string' | 'integer' | 'float' | 'date' | 'boolean';
  sampleValues: any[];
  uniqueCount: number;
  nullCount: number;
  originalName?: string;  // pre-cleaning header, kept so results can show names users recognize
}

async function extractMetadata(
  file: File, 
  sampleSize: number = 5
): Promise {
  return new Promise((resolve, reject) => {
    let allRows: any[] = [];
    let columns: string[] = [];
    
    Papa.parse(file, {
      header: true,
      dynamicTyping: true,
      step: (results, parser) => {
        if (allRows.length === 0) {
          columns = results.meta.fields || [];
        }
        allRows.push(results.data);
        
        // For very large files, we might stop early
        // But we need full scan for accurate counts
      },
      complete: () => {
        const metadata = analyzeColumns(columns, allRows);
        resolve({
          columns: metadata,
          sampleRows: allRows.slice(0, sampleSize),
          totalRows: allRows.length,
          nullCounts: calculateNullCounts(columns, allRows)
        });
      },
      error: reject
    });
  });
}

/**
 * Guesses a column's dtype from its values.
 *
 * Inspects at most the first 100 non-null, non-empty values. Checks run
 * in the order date -> boolean -> number so that boolean-looking strings
 * are not misread; anything unmatched falls back to 'string'.
 */
function inferDtype(values: any[]): Column['dtype'] {
  const present = values.filter(v => v != null && v !== '');
  if (present.length === 0) return 'string';

  const probe = present.slice(0, 100);

  // ISO (YYYY-MM-DD) or US (MM/DD/YYYY) date prefixes.
  const dateLike = /^\d{4}-\d{2}-\d{2}|^\d{2}\/\d{2}\/\d{4}/;
  const allDates = probe.every(
    v => typeof v === 'string' && dateLike.test(v)
  );
  if (allDates) return 'date';

  // Common boolean encodings (case-insensitive).
  const boolEncodings = ['true', 'false', '0', '1', 'yes', 'no'];
  const allBools = probe.every(v =>
    boolEncodings.includes(String(v).toLowerCase())
  );
  if (allBools) return 'boolean';

  // Numbers: 'integer' only when every value is a whole number.
  if (probe.every(v => typeof v === 'number')) {
    return probe.every(v => Number.isInteger(v)) ? 'integer' : 'float';
  }

  return 'string';
}

Full Implementation

// data-ingester.ts
export class DataIngester {
  /**
   * Parses an uploaded file and produces both the full data (kept
   * server-side) and the compact schema description destined for the LLM.
   */
  async ingest(file: File): Promise<{
    df: any;  // The actual data (kept server-side)
    schema: DatasetMetadata;  // What we send to the LLM
  }> {
    // Parse the file
    const parsed = await this.parseFile(file);
    
    // Extract metadata for LLM
    const schema = await extractMetadata(file);
    
    // Clean column names, keeping the original for user-facing display
    schema.columns = schema.columns.map(col => ({
      ...col,
      name: this.cleanColumnName(col.name),
      originalName: col.name
    }));
    
    return {
      df: parsed,
      schema
    };
  }
  
  /**
   * Dispatches to the right parser based on file extension.
   * NOTE(review): parseCSV/parseExcel/parseJSON are defined elsewhere in
   * the full implementation — not shown in this excerpt.
   */
  private async parseFile(file: File): Promise<any> {  // fix: bare `Promise` requires a type argument
    const extension = file.name.split('.').pop()?.toLowerCase();
    
    switch (extension) {
      case 'csv':
        return this.parseCSV(file);
      case 'xlsx':
      case 'xls':
        return this.parseExcel(file);
      case 'json':
        return this.parseJSON(file);
      default:
        throw new Error(`Unsupported file type: ${extension}`);
    }
  }
  
  /**
   * Normalizes a header into a safe Python identifier:
   * lowercase -> strip punctuation -> trim -> spaces to underscores.
   *
   * Fix: `.trim()` must run BEFORE whitespace is replaced with
   * underscores. The original trimmed last, which was a no-op because
   * leading/trailing spaces had already become underscores — e.g.
   * "Revenue ($)" produced "revenue_" instead of "revenue".
   */
  private cleanColumnName(name: string): string {
    let clean = name
      .toLowerCase()
      .replace(/[^\w\s]/g, '')
      .trim()
      .replace(/\s+/g, '_');
    
    // Handle common problematic cases
    if (clean.startsWith('unnamed')) {
      return '';  // Will be filtered out
    }
    
    // Ensure valid Python identifier
    if (/^\d/.test(clean)) {
      clean = 'col_' + clean;
    }
    
    return clean;
  }
  
  /** Renders the schema as the plain-text prompt fragment sent to the LLM. */
  buildSchemaForLLM(schema: DatasetMetadata): string {
    const lines = [
      `Dataset: ${schema.totalRows.toLocaleString()} rows, ${schema.columns.length} columns`,
      '',
      'Columns:',
      ...schema.columns.map(col => 
        `  - ${col.name} (${col.dtype}): ${col.uniqueCount} unique values, ${col.nullCount} nulls`
      ),
      '',
      'Sample rows (first 5):',
      JSON.stringify(schema.sampleRows, null, 2)
    ];
    
    return lines.join('\n');
  }
}

// Usage — `uploadedFile` is the File received from the upload endpoint.
const ingester = new DataIngester();
const { df, schema } = await ingester.ingest(uploadedFile);

// Only the compact schema description goes to the LLM; `df` stays server-side.
const schemaDescription = ingester.buildSchemaForLLM(schema);
console.log(schemaDescription);

// Output:
// Dataset: 10,000 rows, 8 columns
// 
// Columns:
//   - date (date): 365 unique values, 0 nulls
//   - sales (float): 8,432 unique values, 12 nulls
//   - region (string): 4 unique values, 0 nulls
//   - product (string): 25 unique values, 3 nulls
//   ...
//
// Sample rows (first 5):
// [{"date": "2023-01-01", "sales": 1234.56, "region": "North"}, ...]

Column Cleaning

Real-world CSV files have messy column names. Clean them before sending to the LLM:

| Original     | Cleaned        | Reason                               |
|--------------|----------------|--------------------------------------|
| Sales (2023) | sales_2023     | Remove parentheses, lowercase        |
| Unnamed: 0   | (dropped)      | Index column from pandas export      |
| First Name   | first_name     | Replace space with underscore        |
| 123_column   | col_123_column | Can't start with a number in Python  |
| Revenue ($)  | revenue        | Remove special characters            |
Keep Original Names

Store a mapping from cleaned names to originals. When displaying results to users, show the original column names they recognize.

Handling Multiple Formats

// format-handlers.ts
import * as XLSX from 'xlsx';

/**
 * One async parser per supported input format; each resolves to an
 * array of row objects. (Fix: all four handlers declared a bare
 * `Promise` return type, which is a compile error — `Promise<T>`
 * requires a type argument.)
 */
export const formatHandlers = {
  csv: async (file: File): Promise<any[]> => {
    return new Promise((resolve, reject) => {
      Papa.parse(file, {
        header: true,
        dynamicTyping: true,
        complete: (results) => resolve(results.data),
        error: reject
      });
    });
  },
  
  xlsx: async (file: File): Promise<any[]> => {
    const buffer = await file.arrayBuffer();
    const workbook = XLSX.read(buffer, { type: 'array' });
    
    // Use first sheet by default
    const firstSheet = workbook.SheetNames[0];
    const worksheet = workbook.Sheets[firstSheet];
    
    return XLSX.utils.sheet_to_json(worksheet);
  },
  
  json: async (file: File): Promise<any[]> => {
    const text = await file.text();
    const data = JSON.parse(text);
    
    // Handle both a bare array and an object wrapping the rows in `data`
    return Array.isArray(data) ? data : data.data || [];
  },
  
  // For database connections
  sql: async (connection: DatabaseConnection, query: string): Promise<any[]> => {
    const result = await connection.query(query);
    return result.rows;
  }
};

// Detect format from file
// Map a file's extension to the matching format-handler key.
// TSV shares the CSV path (PapaParse auto-detects the delimiter);
// anything unrecognized raises.
function detectFormat(file: File): keyof typeof formatHandlers {
  const extension = file.name.split('.').pop()?.toLowerCase();

  if (extension === 'csv' || extension === 'tsv') return 'csv';
  if (extension === 'xlsx' || extension === 'xls') return 'xlsx';
  if (extension === 'json') return 'json';

  throw new Error(`Unknown file format: ${extension}`);
}

Where to go next