35.2 Structured extraction (order IDs, dates, issue types)

Overview and links for this section of the guide.

Why Regex Is Not Enough

Traditional regex works great for structured formats:

// Rigidly formatted tokens are a good fit for plain regular expressions.
// Both patterns are global and case-insensitive so String.prototype.match
// returns every occurrence in a message.

// "ORDER-" followed by at least five digits, e.g. ORDER-12345.
const ORDER_ID_REGEX = /ORDER-\d{5,}/gi;

// Loose local@domain.tld shape; intentionally not a full RFC 5322 matcher.
const EMAIL_REGEX = /[\w.-]+@[\w.-]+\.\w+/gi;

"My order ORDER-12345 hasn't arrived" → ["ORDER-12345"] ✓

But customer emails are messy:

// Regex fails on these
"I ordered something last Tuesday"           → What date?
"the order I placed for my mom's birthday"   → Which order?
"my orders 555 and maybe 556?"               → How many? Are both valid?
"I think it was around $200"                 → Exact amount?

LLMs excel at normalization—converting natural language to structured data.

The Extraction Schema

Define exactly what you want to extract:

// extraction-schema.ts
/** Structured fields pulled out of a single customer support email. */
export interface ExtractedTicketData {
  // --- Order information ---

  /** Normalized order IDs. */
  order_ids: string[];
  /** Amounts in cents. */
  order_amounts: number[];

  // --- Temporal information ---

  /** Dates in ISO8601 format. */
  dates_mentioned: string[];
  /** Original date wording, kept for debugging. */
  dates_original: string[];

  // --- Customer information ---

  customer_name: string | null;
  customer_email: string | null;

  // --- Sentiment and urgency ---

  sentiment: 'angry' | 'frustrated' | 'neutral' | 'happy';
  urgency: 'low' | 'medium' | 'high' | 'urgent';

  // --- Product information ---

  product_names: string[];

  // --- Issue specifics ---

  /** One-sentence summary of the issue. */
  issue_description: string;
}

// Response schema for Gemini structured output.
//
// Every field of ExtractedTicketData has an entry here; a field missing from
// the schema is never produced by the model. Optional scalars use
// `nullable: true` (the OpenAPI-style form the Gemini `responseSchema` option
// understands) rather than a JSON Schema type array such as ["string", "null"].
export const EXTRACTION_SCHEMA = {
  type: "object",
  properties: {
    order_ids: {
      type: "array",
      items: { type: "string" },
      description: "All order/reference numbers mentioned"
    },
    order_amounts: {
      type: "array",
      items: { type: "integer" },
      description: "Monetary amounts mentioned, converted to cents"
    },
    dates_mentioned: {
      type: "array",
      items: { type: "string", format: "date" },
      description: "Dates in ISO8601 format (YYYY-MM-DD)"
    },
    dates_original: {
      type: "array",
      items: { type: "string" },
      description: "Original wording of each date, in the same order as dates_mentioned"
    },
    customer_name: {
      type: "string",
      nullable: true,
      description: "Customer's name if mentioned"
    },
    customer_email: {
      type: "string",
      nullable: true,
      description: "Customer's email address if mentioned"
    },
    sentiment: {
      type: "string",
      enum: ["angry", "frustrated", "neutral", "happy"]
    },
    urgency: {
      type: "string",
      enum: ["low", "medium", "high", "urgent"]
    },
    product_names: {
      type: "array",
      items: { type: "string" },
      description: "Names of products mentioned"
    },
    issue_description: {
      type: "string",
      description: "One-sentence summary of the issue"
    }
  },
  // Only the fields every ticket must have; the rest may be omitted by the model.
  required: ["order_ids", "sentiment", "urgency", "issue_description"]
};

Implementation

// extractor.ts
import { GoogleGenerativeAI } from '@google/generative-ai';

/**
 * Extracts structured ticket fields from a free-text support email by asking a
 * Gemini model for schema-constrained JSON (temperature 0).
 */
export class TicketDataExtractor {
  // Structural type covering only the single SDK call this class makes.
  private readonly model: {
    generateContent(prompt: string): Promise<{ response: { text(): string } }>;
  };
  
  /**
   * @param apiKey API key passed to the GoogleGenerativeAI client.
   */
  constructor(apiKey: string) {
    const genAI = new GoogleGenerativeAI(apiKey);
    this.model = genAI.getGenerativeModel({
      model: 'gemini-1.5-flash',
      generationConfig: {
        temperature: 0,
        responseMimeType: 'application/json',
        responseSchema: EXTRACTION_SCHEMA,
      }
    });
  }
  
  /**
   * Runs extraction for one email.
   *
   * @param email Raw email text; it is interpolated into the prompt verbatim.
   * @param currentDate Reference date for resolving relative dates such as
   *   "yesterday". Defaults to the current time; pass the email's received date
   *   for reproducible results.
   * @returns The normalized extraction result.
   * @throws SyntaxError if the model response is not valid JSON.
   * @throws TypeError if the parsed response is not a JSON object.
   */
  async extract(email: string, currentDate: Date = new Date()): Promise<ExtractedTicketData> {
    const prompt = this.buildPrompt(email, currentDate);
    const result = await this.model.generateContent(prompt);
    const text = result.response.text();
    
    return this.parseAndValidate(JSON.parse(text));
  }
  
  /** Builds the extraction prompt; the date is rendered as the UTC calendar day. */
  private buildPrompt(email: string, currentDate: Date): string {
    return `Extract structured data from this customer support email.

## Today's Date
${currentDate.toISOString().split('T')[0]}
(Use this to resolve relative dates like "yesterday" or "last week")

## Instructions
1. Extract ALL order IDs, normalizing formats (e.g., "order 12345" → "12345")
2. Convert relative dates to ISO8601 (e.g., "last Tuesday" → "2024-01-09")
3. Determine sentiment from tone, not just words
4. Assess urgency based on keywords and context
5. Summarize the core issue in one sentence

## Email
${email}

## Output
Return a JSON object matching the schema.`;
  }
  
  /**
   * Normalizes the parsed model output into an ExtractedTicketData.
   *
   * Optional fields the model left out are filled with empty arrays / null so
   * a missing key never causes a crash or an `undefined` where the interface
   * promises a value. The remaining fields (sentiment, urgency,
   * issue_description) are passed through unchanged.
   */
  private parseAndValidate(data: unknown): ExtractedTicketData {
    if (typeof data !== 'object' || data === null || Array.isArray(data)) {
      throw new TypeError('Extraction result must be a JSON object');
    }
    const raw = data as Record<string, unknown>;
    
    const stringList = (value: unknown): string[] =>
      Array.isArray(value)
        ? value.filter((item): item is string => typeof item === 'string')
        : [];
    const numberList = (value: unknown): number[] =>
      Array.isArray(value)
        ? value.filter((item): item is number => typeof item === 'number' && Number.isFinite(item))
        : [];
    const stringOrNull = (value: unknown): string | null =>
      typeof value === 'string' ? value : null;
    
    // Normalize order IDs: keep letters, digits and dashes, upper-case the rest;
    // IDs that end up empty carry no information and are dropped.
    const orderIds = stringList(raw.order_ids)
      .map((id) => id.replace(/[^a-zA-Z0-9-]/g, '').toUpperCase())
      .filter((id) => id.length > 0);
    
    // Validate dates: keep only strings that Date can parse.
    const datesMentioned = stringList(raw.dates_mentioned).filter(
      (d) => !Number.isNaN(new Date(d).getTime())
    );
    
    return {
      order_ids: orderIds,
      order_amounts: numberList(raw.order_amounts),
      dates_mentioned: datesMentioned,
      dates_original: stringList(raw.dates_original),
      customer_name: stringOrNull(raw.customer_name),
      customer_email: stringOrNull(raw.customer_email),
      sentiment: raw.sentiment as ExtractedTicketData['sentiment'],
      urgency: raw.urgency as ExtractedTicketData['urgency'],
      product_names: stringList(raw.product_names),
      issue_description: raw.issue_description as string,
    };
  }
}

// Usage example
const extractor = new TicketDataExtractor(process.env.GEMINI_API_KEY!);

// The second argument pins the "received" date so the relative dates in the
// email ("last Tuesday", "this Saturday", "in 3 days") resolve to the values
// shown below; without it the result depends on the day the example is run.
const data = await extractor.extract(`
  Hi,
  
  I'm really frustrated. I placed two orders last Tuesday - order numbers
  555 and 556 - for my daughter's birthday party this Saturday.
  
  Neither has shipped yet and the party is in 3 days!
  
  Please help ASAP.
  
  - Sarah Johnson
  sarah.johnson@example.com
`, new Date('2024-01-10'));

console.log(data);
// Expected shape of the output (exact model wording may vary):
// {
//   order_ids: ['555', '556'],
//   dates_mentioned: ['2024-01-09', '2024-01-13'],  // last Tuesday, this Saturday
//   customer_name: 'Sarah Johnson',
//   customer_email: 'sarah.johnson@example.com',
//   sentiment: 'frustrated',
//   urgency: 'high',
//   issue_description: 'Two orders not shipped with birthday deadline approaching'
// }

Date Normalization

Relative dates are tricky. The same email means different things depending on when it was sent:

| Original Text | If email received Monday | If email received Friday |
| --- | --- | --- |
| "last week" | Jan 1-7 | Jan 1-7 |
| "yesterday" | Jan 7 (Sunday) | Jan 11 (Thursday) |
| "this Friday" | Jan 12 | Jan 12 |
| "next week" | Jan 15-21 | Jan 15-21 |

Always pass the email received date to the model:

// Include timestamp context.
// Both values are derived in UTC: toISOString() always reports the UTC calendar
// date, so the weekday must come from getUTCDay() rather than the local-time
// getDay(), otherwise the date and the day name can disagree near midnight.
const prompt = `
Today's date: ${emailReceivedDate.toISOString().split('T')[0]}
Day of week: ${['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday'][emailReceivedDate.getUTCDay()]}

When the email says "yesterday" or "last Tuesday", calculate the actual date.
`;
Keep Original + Normalized

Store both the original text ("last Tuesday") and the normalized date ("2024-01-09"). This helps with debugging and auditing when normalization goes wrong.

Validating Extracted Data

Never trust extracted data blindly. Validate against your database:

// validation.ts
/** Outcome of checking one extracted value. */
interface ValidationResult {
  /** Which kind of field was checked (e.g. 'order_id', 'date', 'customer_email'). */
  field: string;
  /** Whether the value passed its check. */
  valid: boolean;
  /** The extracted value that was checked. */
  value: any;
  /** Human-readable reason; set only when the check failed. */
  error?: string;
}

/**
 * Checks extracted ticket data against the database and basic sanity rules.
 *
 * Produces one result per order ID, one per mentioned date, and one for the
 * customer email when it is present. Nothing is thrown for invalid values;
 * failures are reported through `valid: false` and `error`.
 *
 * NOTE: dates in the future are reported as invalid, so an email that mentions
 * an upcoming deadline will fail validation.
 *
 * @param data Extraction result to check.
 * @param db Database handle; `db.orders.findById` is used to look up orders.
 * @returns Validation results in the order: order IDs, dates, email.
 */
async function validateExtractedData(
  data: ExtractedTicketData,
  db: Database
): Promise<ValidationResult[]> {
  const results: ValidationResult[] = [];
  
  // Validate order IDs exist
  for (const orderId of data.order_ids) {
    const order = await db.orders.findById(orderId);
    results.push({
      field: 'order_id',
      valid: !!order,
      value: orderId,
      error: order ? undefined : `Order ${orderId} not found in database`
    });
  }
  
  // Validate dates are reasonable (parseable, not in future, not too old)
  const now = new Date();
  const oneYearAgo = new Date(now.getTime() - 365 * 24 * 60 * 60 * 1000);
  
  for (const dateStr of data.dates_mentioned) {
    const timestamp = new Date(dateStr).getTime();
    
    // Comparisons against NaN are always false, so an unparseable string must
    // be rejected explicitly or it would slip through as "valid".
    let error: string | undefined;
    if (Number.isNaN(timestamp)) {
      error = 'Date could not be parsed';
    } else if (timestamp > now.getTime()) {
      error = 'Date is in the future';
    } else if (timestamp < oneYearAgo.getTime()) {
      error = 'Date is more than 1 year ago';
    }
    
    results.push({
      field: 'date',
      valid: error === undefined,
      value: dateStr,
      error
    });
  }
  
  // Validate email format
  if (data.customer_email) {
    const emailRegex = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
    const emailOk = emailRegex.test(data.customer_email);
    results.push({
      field: 'customer_email',
      valid: emailOk,
      value: data.customer_email,
      error: emailOk ? undefined : 'Invalid email format'
    });
  }
  
  return results;
}

// If validation fails, fall back to regex + human review
/**
 * Extracts ticket data with the LLM and, when validation reports problems,
 * patches the failing critical fields with regex matches from the raw email.
 *
 * Only fields that actually failed validation are replaced: a bad date must
 * not cause valid LLM order IDs to be thrown away in favour of the stricter
 * regex (which only recognises 5-10 digit numbers). A missing customer email
 * is still filled from the raw text when one can be found.
 *
 * @param email Raw email text.
 * @param db Database handle forwarded to validateExtractedData.
 * @returns `method: 'llm'` with the untouched LLM data when everything
 *   validated; otherwise `method: 'hybrid'` with patched data and the list of
 *   validation errors for human review.
 */
async function extractWithFallback(email: string, db: Database) {
  // Try LLM extraction first
  const llmData = await extractor.extract(email);
  const validation = await validateExtractedData(llmData, db);
  const validationErrors = validation.filter(v => !v.valid);
  
  if (validationErrors.length === 0) {
    return { data: llmData, method: 'llm', confidence: 'high' };
  }
  
  const failedFields = new Set(validationErrors.map(v => v.field));
  
  // Fallback: regex extraction for critical fields that failed
  const orderIds = failedFields.has('order_id')
    ? [...new Set(email.match(/\b\d{5,10}\b/g) ?? [])]  // de-duplicated
    : llmData.order_ids;
  
  const keepLlmEmail = !!llmData.customer_email && !failedFields.has('customer_email');
  const customerEmail = keepLlmEmail
    ? llmData.customer_email
    : (email.match(/[\w.-]+@[\w.-]+\.\w+/i)?.[0] ?? null);
  
  return {
    data: {
      ...llmData,
      order_ids: orderIds,
      customer_email: customerEmail,
    },
    method: 'hybrid',
    confidence: 'medium',
    validationErrors
  };
}

Where to go next