The target shape
Embeddings are grouped by target UUID — scene, element, or character —
and stored under analysis.embeddings[<uuid>]:
{
"analysis": {
"embeddings": {
"<scene-uuid>": [{
"id": "<embedding-uuid>",
"model": "text-embedding-3-large",
"dimensions": 1536,
"values": [0.023, -0.041, …],
"source": "text",
"lang": "en",
"tokens": 420,
"created": "2026-01-14T10:30:00Z"
}]
},
"settings": {
"model": "text-embedding-3-large",
"size": 512,
"overlap": 64,
"tokeniser": "cl100k"
}
}
}
Pipeline (Node.js)
import fs from 'node:fs/promises';
import { randomUUID } from 'node:crypto';
import OpenAI from 'openai';
// Pipeline: read screenplay.json, embed the English text of every scene, and
// write the result to screenplay-embedded.json with the vectors stored under
// analysis.embeddings[<scene id>].

// Single source of truth for the model name, so the value sent to the API and
// the value recorded next to each vector (and in settings) cannot drift apart.
const EMBEDDING_MODEL = 'text-embedding-3-large';

const client = new OpenAI();
const doc = JSON.parse(await fs.readFile('screenplay.json', 'utf8'));

// Initialise each level on its own: an input that already carries an
// `analysis` object but no `embeddings` map would otherwise make the
// per-scene assignment below throw a TypeError.
doc.analysis ??= {};
doc.analysis.embeddings ??= {};
doc.analysis.settings ??= {
  model: EMBEDDING_MODEL,
  size: 512,
  overlap: 64,
  tokeniser: 'cl100k',
};

// Scenes are processed one at a time (one API request per scene), so the
// token usage reported by each response belongs to exactly one scene.
for (const scene of doc.document.scenes) {
  // Concatenate the English text of every body element that has one.
  const text = scene.body
    .filter((el) => el.text?.en)
    .map((el) => el.text.en)
    .join('\n');

  // Nothing to embed for scenes without English text.
  if (!text) continue;

  const res = await client.embeddings.create({
    model: EMBEDDING_MODEL,
    input: text,
  });
  const { embedding } = res.data[0];

  // Replaces any embeddings previously stored for this scene.
  doc.analysis.embeddings[scene.id] = [{
    id: randomUUID(),
    model: EMBEDDING_MODEL,
    // Record the length actually returned rather than an assumed constant.
    dimensions: embedding.length,
    values: embedding,
    source: 'text',
    lang: 'en',
    tokens: res.usage.total_tokens,
    created: new Date().toISOString(),
  }];
}

await fs.writeFile('screenplay-embedded.json', JSON.stringify(doc, null, 2));
With retrieval in a vector database
Write the embeddings to your vector DB keyed by embedding UUID, with scene UUID and document UUID as metadata. At query time, retrieve top-k, look up each hit in the canonical document for provenance, and send the LLM both the passage text and its scene heading.