1. Optional LangFuse tracing that just goes to the project's logging framework by default 2. Experiment tracking for the translations in a local directory 3. Vercel AI SDK to interface with the LLM --- ### Project Structure ``` /datasets /source page-1.html page-2.html /reference page-1.html page-2.html /results main-a3f9c12.json /src tracing.ts # optional Langfuse / logging wrapper eval.ts # experiment runner pipeline.ts # translation pipeline scoring.ts # eval metrics ``` --- ### 1. `src/tracing.ts` — Optional Langfuse / Log Wrapper ```typescript import { Langfuse } from "langfuse"; const IS_LANGFUSE_ENABLED = process.env.LANGFUSE_ENABLED === "true"; // Minimal span interface both implementations share interface Span { id: string; end(opts?: { output?: unknown; metadata?: unknown }): void; } interface Trace { id: string; span(opts: { name: string; input?: unknown }): Span; update(opts: { output?: unknown; metadata?: unknown }): void; } interface Tracer { trace(opts: { name: string; input?: unknown; metadata?: unknown }): Trace; score(opts: { traceId: string; observationId?: string; name: string; value: number; comment?: string }): void; flushAsync(): Promise<void>; } // Logging implementation function makeLoggingTracer(): Tracer { return { trace(opts) { console.log(JSON.stringify({ event: "trace_start", ...opts })); const traceId = `local-${Date.now()}`; return { id: traceId, span(spanOpts) { const start = performance.now(); console.log(JSON.stringify({ event: "span_start", traceId, ...spanOpts })); const spanId = `local-${Date.now()}`; return { id: spanId, end(endOpts) { console.log(JSON.stringify({ event: "span_end", traceId, spanId, name: spanOpts.name, durationMs: (performance.now() - start).toFixed(2), ...endOpts })); } }; }, update(updateOpts) { console.log(JSON.stringify({ event: "trace_update", traceId, ...updateOpts })); } }; }, score(opts) { console.log(JSON.stringify({ event: "score", ...opts })); }, async flushAsync() {} }; } // Langfuse implementation function makeLangfuseTracer(): Tracer { const langfuse = new Langfuse(); return { trace(opts) { const trace = langfuse.trace(opts); return { id: trace.id, span(spanOpts) { const span = trace.span(spanOpts); return { id: span.id, end(endOpts) { span.end(endOpts); } }; }, update(updateOpts) { trace.update(updateOpts); } }; }, score(opts) { langfuse.score(opts); }, async flushAsync() { await langfuse.flushAsync(); } }; } export const tracer: Tracer = IS_LANGFUSE_ENABLED ? makeLangfuseTracer() : makeLoggingTracer(); // Generic step wrapper — works for LLM and non-LLM steps equally export async function traceStep<T>( trace: Trace, name: string, input: unknown, fn: () => Promise<T>, getMetadata?: (output: T) => unknown ): Promise<T> { const span = trace.span({ name, input }); try { const output = await fn(); span.end({ output, metadata: getMetadata?.(output) }); return output; } catch (err) { span.end({ metadata: { error: String(err) } }); throw err; } } ``` --- ### 2. `src/scoring.ts` — Eval Metrics ```typescript import { JSDOM } from "jsdom"; export function tagPreservationScore(source: string, translated: string): number { const sourceTags = new JSDOM(source).window.document .querySelectorAll("*"); const translatedTags = new JSDOM(translated).window.document .querySelectorAll("*"); const sourceTagNames = Array.from(sourceTags).map(el => el.tagName); const translatedTagNames = Array.from(translatedTags).map(el => el.tagName); const preserved = sourceTagNames.filter((tag, i) => translatedTagNames[i] === tag).length; return preserved / sourceTagNames.length; } export function exactMatchScore(output: string, reference: string): number { return output.trim() === reference.trim() ? 1 : 0; } export function characterErrorRate(output: string, reference: string): number { // Simple CER — edit distance / reference length const dp = Array.from({ length: reference.length + 1 }, (_, i) => i); for (const char of output) { let prev = dp[0]++; for (let j = 1; j <= reference.length; j++) { const temp = dp[j]; dp[j] = char === reference[j - 1] ? prev : Math.min(prev, dp[j], dp[j - 1]) + 1; prev = temp; } } return dp[reference.length] / reference.length; } ``` --- ### 3. `src/pipeline.ts` — Translation Pipeline ```typescript import { JSDOM } from "jsdom"; import { generateText } from "ai"; import { createOpenAI } from "@ai-sdk/openai"; import { tracer, traceStep } from "./tracing"; const model = createOpenAI({ baseURL: process.env.VLLM_BASE_URL ?? "http://localhost:8000/v1", apiKey: "not-needed" }); export async function translateHtml( html: string, filename: string, commitSha: string ): Promise<{ final: string; traceId: string }> { const trace = tracer.trace({ name: "html-translation-pipeline", input: { filename }, metadata: { commitSha, filename } }); // Step 1: pure DOM — no LLM const extracted = await traceStep( trace, "extract-text", { filename }, async () => { const dom = new JSDOM(html); dom.window.document.querySelectorAll("script, style").forEach(el => el.remove()); return dom.window.document.body.textContent?.trim() ?? ""; }, (output) => ({ charCount: output.length }) ); // Step 2: LLM translation const translated = await traceStep( trace, "translate-text", { extracted }, async () => { const { text, usage } = await generateText({ model: model("Qwen/Qwen2.5-7B-Instruct"), messages: [ { role: "system", content: "Translate the following text to Spanish. Preserve formatting." }, { role: "user", content: extracted } ] }); return { text, usage }; }, ({ usage }) => ({ tokens: usage }) ); // Step 3: pure DOM reinject — no LLM const final = await traceStep( trace, "reinject-html", { filename }, async () => { const dom = new JSDOM(html); const body = dom.window.document.body; // Simple strategy: replace text nodes, preserve tags body.childNodes.forEach(node => { if (node.nodeType === dom.window.Node.TEXT_NODE) { node.textContent = translated.text; } }); return dom.serialize(); }, (output) => ({ outputSize: output.length }) ); trace.update({ output: { filename, finalSize: final.length } }); return { final, traceId: trace.id }; } ``` --- ### 4. `src/eval.ts` — Experiment Runner ```typescript import fs from "fs"; import path from "path"; import { execSync } from "child_process"; import { translateHtml } from "./pipeline"; import { tracer } from "./tracing"; import { tagPreservationScore, exactMatchScore, characterErrorRate } from "./scoring"; const SOURCE_DIR = "./datasets/source"; const REFERENCE_DIR = "./datasets/reference"; const RESULTS_DIR = "./datasets/results"; function getGitMetadata() { return { commitSha: execSync("git rev-parse HEAD").toString().trim(), branch: execSync("git rev-parse --abbrev-ref HEAD").toString().trim(), isDirty: execSync("git status --porcelain").toString().trim() !== "" }; } async function runEval() { const git = getGitMetadata(); if (git.isDirty) { console.warn("⚠️ Uncommitted changes — results may not be reproducible"); } const runName = `${git.branch}-${git.commitSha.slice(0, 7)}`; const files = fs.readdirSync(SOURCE_DIR).filter(f => f.endsWith(".html")); const results = []; for (const filename of files) { const source = fs.readFileSync(path.join(SOURCE_DIR, filename), "utf-8"); const reference = fs.readFileSync(path.join(REFERENCE_DIR, filename), "utf-8"); const { final, traceId } = await translateHtml(source, filename, git.commitSha); const scores = { tagPreservation: tagPreservationScore(source, final), exactMatch: exactMatchScore(final, reference), characterErrorRate: characterErrorRate(final, reference) }; // Push scores to Langfuse (or logs if Langfuse disabled) for (const [name, value] of Object.entries(scores)) { tracer.score({ traceId, name, value }); } results.push({ filename, scores, traceId, output: final }); console.log(`✅ ${filename}`, scores); } // Aggregate const avg = (key: keyof typeof results[0]["scores"]) => results.reduce((sum, r) => sum + r.scores[key], 0) / results.length; const summary = { runName, git, timestamp: new Date().toISOString(), model: "Qwen/Qwen2.5-7B-Instruct", aggregates: { avgTagPreservation: avg("tagPreservation"), avgExactMatch: avg("exactMatch"), avgCharacterErrorRate: avg("characterErrorRate") }, results }; // Store results locally in Git-friendly format fs.mkdirSync(RESULTS_DIR, { recursive: true }); fs.writeFileSync( path.join(RESULTS_DIR, `${runName}.json`), JSON.stringify(summary, null, 2) ); await tracer.flushAsync(); console.log(`\n📊 Run: ${runName}`); console.table(summary.aggregates); } runEval().catch(console.error); ``` --- ### Running It ```bash # Default — traces go to logs only, results saved locally npx ts-node src/eval.ts # With Langfuse — traces go to Langfuse + results saved locally LANGFUSE_ENABLED=true \ LANGFUSE_PUBLIC_KEY=pk-... \ LANGFUSE_SECRET_KEY=sk-... \ VLLM_BASE_URL=http://your-vllm:8000/v1 \ npx ts-node src/eval.ts ``` --- ### What You Get - **Local results** — every run saved as `datasets/results/main-a3f9c12.json`, committable to Git - **Tracing** — structured JSON logs by default, full Langfuse spans when enabled - **Per-step visibility** — DOM steps and LLM steps traced identically - **Reproducibility** — Git SHA baked into every run, dirty state warned - **No mandatory infrastructure** — works fully offline, Langfuse is opt-in