1. Optional Langfuse tracing that falls back to the project's logging framework by default
2. Experiment tracking for the translations in a local directory
3. Vercel AI SDK to interface with the LLM
---
### Project Structure
```
/datasets
  /source
    page-1.html
    page-2.html
  /reference
    page-1.html
    page-2.html
  /results
    main-a3f9c12.json
/src
  tracing.ts    # optional Langfuse / logging wrapper
  eval.ts       # experiment runner
  pipeline.ts   # translation pipeline
  scoring.ts    # eval metrics
```
---
### 1. `src/tracing.ts` — Optional Langfuse / Log Wrapper
```typescript
import { Langfuse } from "langfuse";
// Langfuse is opt-in: this is true only when the LANGFUSE_ENABLED environment
// variable is exactly the string "true"; any other value (or none) is false.
const IS_LANGFUSE_ENABLED = process.env.LANGFUSE_ENABLED === "true";
// Minimal tracing interfaces that both the logging and the Langfuse
// implementations in this file satisfy.
/** One recorded step inside a trace. */
interface Span {
/** Identifier of this span. */
id: string;
/** Finishes the span, optionally attaching the step's output and metadata. */
end(opts?: { output?: unknown; metadata?: unknown }): void;
}
/** One traced operation, made up of zero or more spans. */
interface Trace {
/** Identifier of this trace; passed as `traceId` when attaching scores. */
id: string;
/** Opens a new named span under this trace. */
span(opts: { name: string; input?: unknown }): Span;
/** Attaches output and/or metadata to the trace itself. */
update(opts: { output?: unknown; metadata?: unknown }): void;
}
/** Entry point for creating traces, recording scores and flushing pending data. */
interface Tracer {
/** Starts a new trace. */
trace(opts: { name: string; input?: unknown; metadata?: unknown }): Trace;
/** Records a named numeric score against a trace (and optionally one observation in it). */
score(opts: { traceId: string; observationId?: string; name: string; value: number; comment?: string }): void;
/** Resolves once the implementation has finished sending any buffered data. */
flushAsync(): Promise<void>;
}
// Logging implementation
/**
 * Builds a Tracer that writes every event to `console.log` as one JSON line.
 *
 * Ids have the form `local-<epoch ms>-<sequence>`. The per-tracer sequence
 * number keeps ids unique even when several traces or spans are created within
 * the same millisecond, which a timestamp alone does not guarantee.
 *
 * Every event carries the id(s) it belongs to, so `*_start`, `span_end`,
 * `trace_update` and `score` lines can be correlated after the fact.
 *
 * @returns A Tracer whose `flushAsync` is a no-op (nothing is buffered).
 */
function makeLoggingTracer(): Tracer {
  let sequence = 0;
  const nextId = (): string => `local-${Date.now()}-${++sequence}`;

  return {
    trace(opts) {
      // Allocate the id before logging so the start event can be correlated.
      const traceId = nextId();
      console.log(JSON.stringify({ event: "trace_start", traceId, ...opts }));
      return {
        id: traceId,
        span(spanOpts) {
          const start = performance.now();
          const spanId = nextId();
          console.log(JSON.stringify({ event: "span_start", traceId, spanId, ...spanOpts }));
          return {
            id: spanId,
            end(endOpts) {
              console.log(JSON.stringify({
                event: "span_end",
                traceId,
                spanId,
                name: spanOpts.name,
                // Numeric milliseconds, rounded to two decimals, so log
                // consumers can aggregate without parsing a string.
                durationMs: Math.round((performance.now() - start) * 100) / 100,
                ...endOpts
              }));
            }
          };
        },
        update(updateOpts) {
          console.log(JSON.stringify({ event: "trace_update", traceId, ...updateOpts }));
        }
      };
    },
    score(opts) {
      console.log(JSON.stringify({ event: "score", ...opts }));
    },
    async flushAsync() {}
  };
}
// Langfuse implementation
/**
 * Builds a Tracer that forwards every call to a Langfuse client.
 *
 * The client is constructed with no arguments; each Tracer/Trace/Span method
 * delegates to the Langfuse method of the same name and exposes the ids
 * reported by the Langfuse trace and span objects.
 */
function makeLangfuseTracer(): Tracer {
  const client = new Langfuse();

  return {
    trace: (traceOpts) => {
      const lfTrace = client.trace(traceOpts);

      return {
        id: lfTrace.id,
        span: (spanOpts) => {
          const lfSpan = lfTrace.span(spanOpts);
          return {
            id: lfSpan.id,
            end: (endOpts) => {
              lfSpan.end(endOpts);
            }
          };
        },
        update: (updateOpts) => {
          lfTrace.update(updateOpts);
        }
      };
    },
    score: (scoreOpts) => {
      client.score(scoreOpts);
    },
    flushAsync: async () => {
      await client.flushAsync();
    }
  };
}
/**
 * Module-wide tracer, created once when this module is loaded: the Langfuse
 * tracer when IS_LANGFUSE_ENABLED is true, the logging tracer otherwise.
 */
export const tracer: Tracer = (IS_LANGFUSE_ENABLED ? makeLangfuseTracer : makeLoggingTracer)();
// Generic step wrapper — works for LLM and non-LLM steps equally
/**
 * Runs `fn` inside a span of `trace` and ends that span exactly once.
 *
 * @param trace - Trace the span is opened under.
 * @param name - Span name.
 * @param input - Value recorded as the span's input.
 * @param fn - The step to run; its resolved value is the step output.
 * @param getMetadata - Optional callback deriving span metadata from the output.
 * @returns The value `fn` resolved with.
 * @throws Whatever `fn` throws; the span is ended with `{ error }` metadata first.
 */
export async function traceStep<T>(
  trace: Trace,
  name: string,
  input: unknown,
  fn: () => Promise<T>,
  getMetadata?: (output: T) => unknown
): Promise<T> {
  const span = trace.span({ name, input });

  let output: T;
  try {
    output = await fn();
  } catch (err) {
    span.end({ metadata: { error: String(err) } });
    throw err;
  }

  // Metadata is observability only: a failing metadata callback must not turn
  // a successful step into a failure, so its error is recorded on the span
  // instead of being thrown.
  let metadata: unknown;
  try {
    metadata = getMetadata?.(output);
  } catch (err) {
    metadata = { metadataError: String(err) };
  }

  // Ended outside any try/catch so a throwing span.end cannot end the span twice.
  span.end({ output, metadata });
  return output;
}
```
---
### 2. `src/scoring.ts` — Eval Metrics
```typescript
import { JSDOM } from "jsdom";
/**
 * Fraction of source elements whose tag name is found at the same position in
 * the translated document.
 *
 * Both strings are parsed with JSDOM and their elements are listed with
 * `querySelectorAll("*")`. The comparison is positional: the i-th source tag
 * counts as preserved only if the i-th translated tag has the same name.
 *
 * @param source - Original HTML.
 * @param translated - Translated HTML.
 * @returns Preserved tag count divided by the number of source tags.
 */
export function tagPreservationScore(source: string, translated: string): number {
  const tagNamesOf = (html: string): string[] =>
    Array.from(new JSDOM(html).window.document.querySelectorAll("*"), (el) => el.tagName);

  const expected = tagNamesOf(source);
  const actual = tagNamesOf(translated);

  let matches = 0;
  expected.forEach((tag, index) => {
    if (actual[index] === tag) {
      matches += 1;
    }
  });

  return matches / expected.length;
}
/**
 * Binary exact-match metric.
 *
 * @param output - Produced text.
 * @param reference - Expected text.
 * @returns 1 when both strings are identical after trimming surrounding
 *   whitespace, otherwise 0.
 */
export function exactMatchScore(output: string, reference: string): number {
  const produced = output.trim();
  const expected = reference.trim();
  if (produced === expected) {
    return 1;
  }
  return 0;
}
/**
 * Character error rate: Levenshtein edit distance between `output` and
 * `reference`, divided by the reference length.
 *
 * Both strings are compared by Unicode code point (not UTF-16 code unit), so
 * characters outside the Basic Multilingual Plane such as emoji count as one
 * character on both sides.
 *
 * @param output - Produced text.
 * @param reference - Expected text.
 * @returns Edit distance / reference length. For an empty reference the ratio
 *   is undefined, so this returns 0 when `output` is also empty and 1 otherwise.
 */
export function characterErrorRate(output: string, reference: string): number {
  const outChars = Array.from(output);
  const refChars = Array.from(reference);
  const refLen = refChars.length;

  if (refLen === 0) {
    return outChars.length === 0 ? 0 : 1;
  }

  // Single-row Levenshtein: row[j] holds the distance between the output
  // prefix processed so far and the first j reference characters.
  const row = Array.from({ length: refLen + 1 }, (_, i) => i);

  for (let i = 0; i < outChars.length; i++) {
    // `diagonal` is the previous row's value at column j - 1.
    let diagonal = row[0];
    row[0] = i + 1;
    for (let j = 1; j <= refLen; j++) {
      const above = row[j];
      row[j] = outChars[i] === refChars[j - 1]
        ? diagonal
        : Math.min(diagonal, above, row[j - 1]) + 1;
      diagonal = above;
    }
  }

  return row[refLen] / refLen;
}
```
---
### 3. `src/pipeline.ts` — Translation Pipeline
```typescript
import { JSDOM } from "jsdom";
import { generateText } from "ai";
import { createOpenAI } from "@ai-sdk/openai";
import { tracer, traceStep } from "./tracing";
// OpenAI-compatible provider factory. The endpoint comes from VLLM_BASE_URL and
// falls back to a server on localhost:8000; the API key is a fixed placeholder.
const vllmBaseUrl = process.env.VLLM_BASE_URL ?? "http://localhost:8000/v1";
const model = createOpenAI({ apiKey: "not-needed", baseURL: vllmBaseUrl });
/**
 * Translates the text of an HTML document's body to Spanish while leaving the
 * markup untouched, tracing each pipeline step.
 *
 * The document is parsed once. Every non-blank text node under `<body>`
 * (excluding anything inside `<script>` or `<style>`) becomes one segment,
 * each segment is translated on its own, and each translation is written back
 * into the text node it came from. This keeps element structure and
 * attributes exactly as in the input.
 *
 * Known trade-off: segments are translated independently, so a sentence split
 * across inline elements is translated piecewise.
 *
 * @param html - Source HTML document.
 * @param filename - Name recorded on the trace; not used to read any file.
 * @param commitSha - Commit identifier recorded in the trace metadata.
 * @returns The serialized translated document and the id of its trace.
 * @throws Whatever the LLM call or DOM handling throws; the failing step's
 *   span is ended with the error before it propagates.
 */
export async function translateHtml(
  html: string,
  filename: string,
  commitSha: string
): Promise<{ final: string; traceId: string }> {
  const trace = tracer.trace({
    name: "html-translation-pipeline",
    input: { filename },
    metadata: { commitSha, filename }
  });

  // One DOM is shared by extraction and reinjection so each translated segment
  // can be written back to the exact node it was read from. The nodes are kept
  // out of the step outputs so spans only record plain strings.
  const dom = new JSDOM(html);
  const document = dom.window.document;
  const textNodes: Node[] = [];

  // Step 1: pure DOM — no LLM
  const extracted = await traceStep(
    trace,
    "extract-text",
    { filename },
    async () => {
      const walker = document.createTreeWalker(document.body, dom.window.NodeFilter.SHOW_TEXT);
      for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) {
        // Script and style contents are code, not translatable text.
        if (node.parentElement?.closest("script, style")) continue;
        // Whitespace-only nodes are layout; leave them as they are.
        if ((node.textContent ?? "").trim() === "") continue;
        textNodes.push(node);
      }
      return textNodes.map(node => (node.textContent ?? "").trim());
    },
    (output) => ({
      segmentCount: output.length,
      charCount: output.reduce((sum, segment) => sum + segment.length, 0)
    })
  );

  // Step 2: LLM translation
  const translated = await traceStep(
    trace,
    "translate-text",
    { extracted },
    async () => {
      const texts: string[] = [];
      const usages: unknown[] = [];
      // Sequential on purpose: keeps the load on a single inference server
      // bounded regardless of how many text nodes a page has.
      for (const segment of extracted) {
        const { text, usage } = await generateText({
          model: model("Qwen/Qwen2.5-7B-Instruct"),
          messages: [
            { role: "system", content: "Translate the following text to Spanish. Preserve formatting." },
            { role: "user", content: segment }
          ]
        });
        texts.push(text);
        usages.push(usage);
      }
      return { texts, usages };
    },
    ({ usages }) => ({ tokens: usages })
  );

  // Step 3: pure DOM reinject — no LLM
  const final = await traceStep(
    trace,
    "reinject-html",
    { filename },
    async () => {
      textNodes.forEach((node, index) => {
        const replacement = translated.texts[index];
        if (replacement === undefined) return;
        // Keep the node's original leading/trailing whitespace so spacing
        // between inline elements survives the round trip.
        const original = node.textContent ?? "";
        const leading = /^\s*/.exec(original)?.[0] ?? "";
        const trailing = /\s*$/.exec(original)?.[0] ?? "";
        node.textContent = leading + replacement.trim() + trailing;
      });
      return dom.serialize();
    },
    (output) => ({ outputSize: output.length })
  );

  trace.update({ output: { filename, finalSize: final.length } });
  return { final, traceId: trace.id };
}
```
---
### 4. `src/eval.ts` — Experiment Runner
```typescript
import fs from "fs";
import path from "path";
import { execSync } from "child_process";
import { translateHtml } from "./pipeline";
import { tracer } from "./tracing";
import { tagPreservationScore, exactMatchScore, characterErrorRate } from "./scoring";
// Directory scanned for the .html pages to translate.
const SOURCE_DIR = "./datasets/source";
// Directory holding the reference translation for each page, looked up by the
// same filename as the source page.
const REFERENCE_DIR = "./datasets/reference";
// Directory the per-run JSON summary is written to (created if missing).
const RESULTS_DIR = "./datasets/results";
/**
 * Reads the current repository state by shelling out to `git`.
 *
 * @returns `commitSha` (output of `git rev-parse HEAD`), `branch` (output of
 *   `git rev-parse --abbrev-ref HEAD`) and `isDirty`, which is true when
 *   `git status --porcelain` prints anything.
 * @throws If any of the git commands fails.
 */
function getGitMetadata() {
  const git = (args: string): string => execSync(`git ${args}`).toString().trim();

  const commitSha = git("rev-parse HEAD");
  const branch = git("rev-parse --abbrev-ref HEAD");
  const porcelain = git("status --porcelain");

  return { commitSha, branch, isDirty: porcelain.length > 0 };
}
/**
 * Runs one evaluation pass: translates every .html file in SOURCE_DIR, scores
 * it against the same-named file in REFERENCE_DIR, reports the scores to the
 * tracer, and writes a JSON summary to RESULTS_DIR.
 *
 * The run is named `<branch>-<7-char sha>`. The result filename is derived
 * from that name with every character outside `[A-Za-z0-9_.-]` replaced by
 * `-`, so branch names containing `/` still produce a single file.
 *
 * @throws If SOURCE_DIR contains no .html files, if a source or reference file
 *   cannot be read, or if translation fails. Pending traces are flushed before
 *   the error propagates.
 */
async function runEval() {
  type Scores = { tagPreservation: number; exactMatch: number; characterErrorRate: number };
  type FileResult = { filename: string; scores: Scores; traceId: string; output: string };

  const git = getGitMetadata();
  if (git.isDirty) {
    console.warn("⚠️ Uncommitted changes — results may not be reproducible");
  }
  const runName = `${git.branch}-${git.commitSha.slice(0, 7)}`;

  // Sorted so the order of `results` does not depend on the filesystem.
  const files = fs.readdirSync(SOURCE_DIR).filter(f => f.endsWith(".html")).sort();
  if (files.length === 0) {
    // Averages over zero files would be NaN; fail loudly instead.
    throw new Error(`No .html files found in ${SOURCE_DIR}`);
  }

  const results: FileResult[] = [];
  try {
    for (const filename of files) {
      const source = fs.readFileSync(path.join(SOURCE_DIR, filename), "utf-8");
      const reference = fs.readFileSync(path.join(REFERENCE_DIR, filename), "utf-8");
      const { final, traceId } = await translateHtml(source, filename, git.commitSha);
      const scores: Scores = {
        tagPreservation: tagPreservationScore(source, final),
        exactMatch: exactMatchScore(final, reference),
        characterErrorRate: characterErrorRate(final, reference)
      };
      // Push scores to Langfuse (or logs if Langfuse disabled)
      for (const [name, value] of Object.entries(scores)) {
        tracer.score({ traceId, name, value });
      }
      results.push({ filename, scores, traceId, output: final });
      console.log(`✅ ${filename}`, scores);
    }
  } finally {
    // Flush even when a file fails so the traces explaining the failure are kept.
    await tracer.flushAsync();
  }

  // Aggregate
  const avg = (key: keyof Scores): number =>
    results.reduce((sum, r) => sum + r.scores[key], 0) / results.length;
  const summary = {
    runName,
    git,
    timestamp: new Date().toISOString(),
    model: "Qwen/Qwen2.5-7B-Instruct",
    aggregates: {
      avgTagPreservation: avg("tagPreservation"),
      avgExactMatch: avg("exactMatch"),
      avgCharacterErrorRate: avg("characterErrorRate")
    },
    results
  };

  // Store results locally in Git-friendly format
  fs.mkdirSync(RESULTS_DIR, { recursive: true });
  // Branch names may contain "/" (e.g. feature/x), which would otherwise be
  // read as a subdirectory that does not exist.
  const resultFile = `${runName.replace(/[^\w.-]+/g, "-")}.json`;
  fs.writeFileSync(
    path.join(RESULTS_DIR, resultFile),
    JSON.stringify(summary, null, 2)
  );

  console.log(`\n📊 Run: ${runName}`);
  console.table(summary.aggregates);
}
// Script entry point. A failed run is logged and reported through a non-zero
// exit code so shells and CI do not treat it as a success.
runEval().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
```
---
### Running It
```bash
# Default — traces go to logs only, results saved locally
npx ts-node src/eval.ts
# With Langfuse — traces go to Langfuse + results saved locally
LANGFUSE_ENABLED=true \
LANGFUSE_PUBLIC_KEY=pk-... \
LANGFUSE_SECRET_KEY=sk-... \
VLLM_BASE_URL=http://your-vllm:8000/v1 \
npx ts-node src/eval.ts
```
---
### What You Get
- **Local results** — every run saved as `datasets/results/<branch>-<short-sha>.json` (e.g. `main-a3f9c12.json`), committable to Git
- **Tracing** — structured JSON logs by default, full Langfuse spans when enabled
- **Per-step visibility** — DOM steps and LLM steps traced identically
- **Reproducibility** — Git SHA baked into every run, dirty state warned
- **No mandatory infrastructure** — needs no hosted service (the LLM endpoint defaults to a local vLLM server); Langfuse is opt-in