Fix multimodal token handling

This commit is contained in:
JOJO 2026-02-28 19:58:58 +08:00
parent 934a916842
commit aa4f323809

View File

@ -18,9 +18,11 @@ const { renderBanner } = require('../ui/banner');
const { buildStartLine, buildFinalLine, startToolDisplay, formatResultLines, printResultLines } = require('../ui/tool_display'); const { buildStartLine, buildFinalLine, startToolDisplay, formatResultLines, printResultLines } = require('../ui/tool_display');
const { createConversation, updateConversation } = require('../storage/conversation_store'); const { createConversation, updateConversation } = require('../storage/conversation_store');
const { applyUsage, normalizeTokenUsage } = require('../utils/token_usage'); const { applyUsage, normalizeTokenUsage } = require('../utils/token_usage');
const { gray, cyan, green, red } = require('../utils/colors'); const { gray, cyan, green, red, blue } = require('../utils/colors');
const { createIndentedWriter } = require('../ui/indented_writer'); const { createIndentedWriter } = require('../ui/indented_writer');
const { createStatusBar } = require('../ui/status_bar'); const { createStatusBar } = require('../ui/status_bar');
const { visibleWidth } = require('../utils/text_width');
const { readMediafileTool } = require('../tools/read_mediafile');
const WORKSPACE = process.cwd(); const WORKSPACE = process.cwd();
const WORKSPACE_NAME = path.basename(WORKSPACE); const WORKSPACE_NAME = path.basename(WORKSPACE);
@ -28,6 +30,8 @@ const USERNAME = os.userInfo().username || 'user';
const PROMPT = `${USERNAME}@${WORKSPACE_NAME} % `; const PROMPT = `${USERNAME}@${WORKSPACE_NAME} % `;
const MENU_PAGE_SIZE = 6; const MENU_PAGE_SIZE = 6;
const SLASH_COMMANDS = new Set(['/new', '/resume', '/allow', '/model', '/status', '/compact', '/config', '/help', '/exit']); const SLASH_COMMANDS = new Set(['/new', '/resume', '/allow', '/model', '/status', '/compact', '/config', '/help', '/exit']);
const IMAGE_EXTS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.heic']);
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v']);
const config = ensureConfig(); const config = ensureConfig();
const state = createState(config, WORKSPACE); const state = createState(config, WORKSPACE);
@ -57,6 +61,7 @@ let isRunning = false;
let escPendingCancel = false; let escPendingCancel = false;
let activeStreamController = null; let activeStreamController = null;
let activeToolController = null; let activeToolController = null;
let currentMedia = { tokens: [], text: '' };
let commandMenuActive = false; let commandMenuActive = false;
let menuSearchTerm = ''; let menuSearchTerm = '';
let menuLastSearchTerm = ''; let menuLastSearchTerm = '';
@ -64,6 +69,8 @@ let menuJustClosedAt = 0;
let menuInjectedCommand = null; let menuInjectedCommand = null;
let menuAbortController = null; let menuAbortController = null;
let menuJustClosedInjected = false; let menuJustClosedInjected = false;
let suppressSlashMenuUntil = 0;
let pendingSlashTimer = null;
function printNotice(message) { function printNotice(message) {
console.log(''); console.log('');
@ -75,6 +82,141 @@ function printNoticeInline(message) {
console.log(message); console.log(message);
} }
/**
 * Return the lower-cased extension (including the leading dot) of a path.
 *
 * Only the final path segment is inspected, so a dot inside a directory
 * name (e.g. "/archive.v2/clip") is not mistaken for an extension.
 *
 * @param {string} p - File path; null/undefined are treated as an empty string.
 * @returns {string} Extension such as ".png", or "" when the last segment has no dot.
 */
function getPathExt(p) {
  const text = p == null ? '' : String(p);
  // Accept both separator styles: pasted paths may come from POSIX or Windows shells.
  const lastSeparator = Math.max(text.lastIndexOf('/'), text.lastIndexOf('\\'));
  const lastDot = text.lastIndexOf('.');
  // A dot located before the last separator belongs to a directory name.
  if (lastDot === -1 || lastDot < lastSeparator) return '';
  return text.slice(lastDot).toLowerCase();
}
/**
 * Turn a pasted or dragged path into a plain filesystem path string.
 *
 * Steps, in order: drop a leading "file://" scheme, percent-decode the
 * text when it is valid URI encoding, then unescape backslash-space and
 * doubled backslashes.
 *
 * @param {string} p - Raw path text; falsy values become "".
 * @returns {string} The decoded path.
 */
function decodeEscapedPath(p) {
  const scheme = 'file://';
  const input = String(p || '');
  const withoutScheme = input.startsWith(scheme) ? input.slice(scheme.length) : input;

  let decoded = withoutScheme;
  try {
    decoded = decodeURIComponent(withoutScheme);
  } catch (_) {
    // Malformed percent sequences (e.g. a literal "%" in a file name):
    // keep the text exactly as it was.
  }

  const unescapedSpaces = decoded.replace(/\\ /g, ' ');
  return unescapedSpaces.replace(/\\\\/g, '\\');
}
/**
 * Locate candidate media paths inside an input line.
 *
 * Two kinds of candidates are collected:
 *  - any single-quoted span ('...'), whose inner text is the path;
 *  - an unquoted run that starts with "/" and ends with one of the known
 *    image/video extensions (backslash-escaped spaces are allowed inside).
 *
 * Candidates are not checked against the filesystem here and may overlap
 * (a quoted path also contains an unquoted candidate); callers decide
 * which ones to use.
 *
 * @param {string} line - Current input text.
 * @returns {Array<{raw: string, path: string, index: number}>} Candidates
 *   sorted by start index. `raw` is the exact matched text, `path` the
 *   text to resolve, `index` the offset of `raw` in `line`.
 */
function findMediaMatches(line) {
  const text = line == null ? '' : String(line);
  const matches = [];
  let m;

  const quoted = /'([^']+)'/g;
  while ((m = quoted.exec(text)) !== null) {
    matches.push({ raw: m[0], path: m[1], index: m.index });
  }

  // The extension sets store entries with a leading dot (".png"). The
  // pattern below already contains the literal dot, so strip it here;
  // keeping it would require two consecutive dots ("a..png") to match.
  const extGroup = Array.from(new Set([...IMAGE_EXTS, ...VIDEO_EXTS]))
    .map((ext) => (ext.startsWith('.') ? ext.slice(1) : ext))
    .filter((ext) => ext.length > 0)
    .map((ext) => ext.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|');

  if (extGroup) {
    // Case-insensitive so "/x/PHOTO.PNG" is found as well.
    const unquoted = new RegExp(`/((?:\\\\ |[^\\s])+?)\\.(${extGroup})`, 'gi');
    while ((m = unquoted.exec(text)) !== null) {
      matches.push({ raw: m[0], path: m[0], index: m.index });
    }
  }

  matches.sort((a, b) => a.index - b.index);
  return matches;
}
/**
 * Replace media file paths in an input line with short placeholder tokens.
 *
 * Each candidate from findMediaMatches() is accepted only if the decoded
 * path exists on disk, its extension is a known image or video type, and
 * the per-message limit for that type has not been reached. Accepted
 * paths are replaced by "[图片 #n]" / "[视频 #n]"; everything else is left
 * in the text untouched.
 *
 * A rejected candidate does not consume any text, so a valid path nested
 * inside a rejected quoted span (for example text between two
 * apostrophes) can still be tokenised.
 *
 * @param {string} line - Current input text.
 * @returns {{line: string, tokens: Array<{token: string, path: string, type: string}>}}
 *   The rewritten line and one entry per inserted token, in order.
 */
function applyMediaTokens(line) {
  if (!line) return { line: '', tokens: [] };
  const matches = findMediaMatches(line);
  if (!matches.length) return { line, tokens: [] };

  // Per-message attachment limits and the label used in each placeholder.
  const limits = { image: 9, video: 1 };
  const labels = { image: '图片', video: '视频' };
  const counts = { image: 0, video: 0 };

  const tokens = [];
  let cursor = 0;
  let out = '';
  for (const match of matches) {
    // Overlaps text that an accepted match already replaced.
    if (match.index < cursor) continue;

    const decoded = decodeEscapedPath(match.path);
    if (!fs.existsSync(decoded)) continue;

    const ext = getPathExt(decoded);
    let type = null;
    if (IMAGE_EXTS.has(ext)) {
      type = 'image';
    } else if (VIDEO_EXTS.has(ext)) {
      type = 'video';
    }
    if (type === null || counts[type] >= limits[type]) continue;

    counts[type] += 1;
    const token = `[${labels[type]} #${counts[type]}]`;
    tokens.push({ token, path: decoded, type });
    out += line.slice(cursor, match.index) + token;
    cursor = match.index + match.raw.length;
  }
  out += line.slice(cursor);
  return { line: out, tokens };
}
/**
 * Replace the first occurrence of one specific path in `line` with a
 * media placeholder token, continuing the numbering of `tokens`.
 *
 * Nothing changes (the inputs are returned as-is) when the path is not
 * present in the line, does not exist on disk, is not a known image or
 * video extension, or the per-type limit (9 images, 1 video) is reached.
 *
 * @param {string} line - Current input text.
 * @param {string} rawPath - Exact path text to look for in `line`.
 * @param {Array<{token: string, path: string, type: string}>} tokens - Tokens already applied.
 * @returns {{line: string, tokens: Array}} Updated line and a new token array, or the inputs unchanged.
 */
function applySingleMediaPath(line, rawPath, tokens) {
  const unchanged = { line, tokens };
  if (!line || !rawPath) return unchanged;

  const at = line.indexOf(rawPath);
  if (at === -1) return unchanged;

  const resolved = decodeEscapedPath(rawPath);
  if (!fs.existsSync(resolved)) return unchanged;

  const ext = getPathExt(resolved);
  const isImage = IMAGE_EXTS.has(ext);
  const isVideo = VIDEO_EXTS.has(ext);
  if (!(isImage || isVideo)) return unchanged;

  const countOf = (kind) => tokens.filter((entry) => entry.type === kind).length;
  const imageCount = countOf('image');
  const videoCount = countOf('video');
  if ((isImage && imageCount >= 9) || (isVideo && videoCount >= 1)) return unchanged;

  let token;
  if (isImage) {
    token = `[图片 #${imageCount + 1}]`;
  } else {
    token = `[视频 #${videoCount + 1}]`;
  }
  const head = line.slice(0, at);
  const tail = line.slice(at + rawPath.length);
  const entry = { token, path: resolved, type: isImage ? 'image' : 'video' };
  return { line: head + token + tail, tokens: tokens.concat([entry]) };
}
/**
 * Wrap every media placeholder ("[图片 #n]" / "[视频 #n]") in `line`
 * with the blue() colour helper, leaving all other text unchanged.
 *
 * @param {string} line - Text that may contain placeholder tokens.
 * @returns {string} The text with each placeholder passed through blue().
 */
function colorizeTokens(line) {
  const placeholderPattern = /\[(图片|视频) #\d+\]/g;
  // Forward only the matched text; replace() also passes groups and offsets.
  const paint = (placeholder) => blue(placeholder);
  return line.replace(placeholderPattern, paint);
}
/**
 * Redraw the prompt and the current readline buffer on the terminal,
 * with media placeholders colourised, then put the cursor back at the
 * column matching rl.cursor.
 *
 * Does nothing when there is no readline instance, stdout is not a TTY,
 * or the command menu is active.
 */
function refreshInputLine() {
  const canRedraw = Boolean(rl) && process.stdout.isTTY && !commandMenuActive;
  if (!canRedraw) return;

  const buffer = rl.line || '';
  const painted = colorizeTokens(buffer);

  readline.clearLine(process.stdout, 0);
  readline.cursorTo(process.stdout, 0);
  process.stdout.write(PROMPT + painted);

  // Column is measured on the uncoloured text so escape codes do not count.
  const textBeforeCursor = buffer.slice(0, rl.cursor || 0);
  const column = visibleWidth(PROMPT) + visibleWidth(textBeforeCursor);
  readline.cursorTo(process.stdout, column);
}
/**
 * Remove the first media placeholder that the cursor touches.
 *
 * A placeholder spanning [start, end) is touched when
 * start < cursor <= end, or start <= cursor <= end when `allowStart`
 * is true.
 * NOTE(review): with allowStart the end boundary is still inclusive, so
 * a cursor sitting right after a placeholder also removes it — confirm
 * this is intended for the Delete key.
 *
 * @param {string} line - Current input text.
 * @param {number} cursor - Cursor offset within `line`.
 * @param {boolean} [allowStart=false] - Also match when the cursor is exactly at the placeholder start.
 * @returns {{line: string, cursor: number, removed: boolean}} The text
 *   without the placeholder and the cursor at its former start, or the
 *   inputs with removed=false when nothing was touched.
 */
function removeTokenAtCursor(line, cursor, allowStart = false) {
  const placeholderPattern = /\[(图片|视频) #\d+\]/g;
  for (
    let hit = placeholderPattern.exec(line);
    hit !== null;
    hit = placeholderPattern.exec(line)
  ) {
    const from = hit.index;
    const to = from + hit[0].length;
    const reachesStart = allowStart ? cursor >= from : cursor > from;
    if (!reachesStart || cursor > to) continue;
    return {
      line: `${line.slice(0, from)}${line.slice(to)}`,
      cursor: from,
      removed: true,
    };
  }
  return { line, cursor, removed: false };
}
function isSlashCommand(line) { function isSlashCommand(line) {
if (!line || line[0] !== '/') return false; if (!line || line[0] !== '/') return false;
const cmd = line.split(/\s+/)[0]; const cmd = line.split(/\s+/)[0];
@ -84,6 +226,34 @@ function isSlashCommand(line) {
readline.emitKeypressEvents(process.stdin); readline.emitKeypressEvents(process.stdin);
if (process.stdin.isTTY) process.stdin.setRawMode(true); if (process.stdin.isTTY) process.stdin.setRawMode(true);
// Raw stdin listener for pasted or dropped media paths. When a chunk looks
// like a media path, the slash menu is suppressed briefly and, on the next
// tick, the readline buffer is rewritten so the path becomes a placeholder.
process.stdin.on('data', (chunk) => {
  if (isRunning || commandMenuActive || !rl) return;

  const text = chunk ? chunk.toString() : '';
  if (!text) return;

  const hasFileUrl = text.includes('file://');
  const hasMediaExt = /\.(png|jpe?g|gif|webp|bmp|tiff|heic|mp4|mov|avi|mkv|webm|m4v)\b/i.test(text);
  if (!hasFileUrl && !hasMediaExt) return;

  // A pasted path usually starts with "/"; stop it from opening the slash menu.
  suppressSlashMenuUntil = Date.now() + 200;

  // Deferred with setImmediate; rl.line is read inside the callback.
  setImmediate(() => {
    const buffer = rl.line || '';
    const pasted = text.replace(/\r?\n/g, '').trim();
    const pastedDecoded = pasted ? decodeEscapedPath(pasted) : '';

    let result = applyMediaTokens(buffer);

    // Fallback: if the pasted text itself is still present in the original
    // buffer (raw or decoded), try to tokenise that exact text.
    let fallbackPath = '';
    if (pasted && buffer.includes(pasted)) {
      fallbackPath = pasted;
    } else if (pasted && pastedDecoded && buffer.includes(pastedDecoded)) {
      fallbackPath = pastedDecoded;
    }
    if (fallbackPath) {
      result = applySingleMediaPath(result.line, fallbackPath, result.tokens);
    }

    if (!result.tokens.length) return;
    rl.line = result.line;
    rl.cursor = rl.line.length;
    currentMedia = { tokens: result.tokens, text: result.line };
    refreshInputLine();
  });
});
initReadline(); initReadline();
statusBar = createStatusBar({ statusBar = createStatusBar({
@ -110,7 +280,17 @@ process.stdin.on('keypress', (str, key) => {
} }
return; return;
} }
if (!isRunning && key && key.name === 'backspace') { if (!isRunning && key && (key.name === 'backspace' || key.name === 'delete')) {
const line = rl.line || '';
const cursor = rl.cursor || 0;
const updated = removeTokenAtCursor(line, cursor, key.name === 'delete');
if (updated.removed) {
rl.line = updated.line;
rl.cursor = updated.cursor;
refreshInputLine();
if (statusBar) statusBar.render();
return;
}
if (statusBar) statusBar.render(); if (statusBar) statusBar.render();
} }
if (key && key.name === 'escape' && isRunning) { if (key && key.name === 'escape' && isRunning) {
@ -123,7 +303,45 @@ process.stdin.on('keypress', (str, key) => {
} }
return; return;
} }
if (str === '/' && Date.now() < suppressSlashMenuUntil) {
return;
}
if (pendingSlashTimer && (str !== '/' || (rl.line && rl.line !== '/'))) {
clearTimeout(pendingSlashTimer);
pendingSlashTimer = null;
}
if (!isRunning) {
const rawLine = rl.line || '';
let applied = { line: rawLine, tokens: [] };
if (!isSlashCommand(rawLine)) {
const hasToken = /\[(图片|视频) #\d+\]/.test(rawLine);
if (hasToken && currentMedia.tokens.length && currentMedia.text === rawLine) {
applied = { line: rawLine, tokens: currentMedia.tokens };
} else {
applied = applyMediaTokens(rawLine);
}
if (hasToken && !applied.tokens.length && currentMedia.tokens.length) {
applied = { line: rawLine, tokens: currentMedia.tokens };
}
}
if (applied.line !== (rl.line || '')) {
rl.line = applied.line;
rl.cursor = rl.line.length;
currentMedia = { tokens: applied.tokens, text: applied.line };
} else if (currentMedia.text !== applied.line) {
currentMedia = { tokens: applied.tokens, text: applied.line };
}
refreshInputLine();
}
if (statusBar) statusBar.render();
if (str === '/' && (rl.line === '' || rl.line === '/')) { if (str === '/' && (rl.line === '' || rl.line === '/')) {
if (pendingSlashTimer) {
clearTimeout(pendingSlashTimer);
pendingSlashTimer = null;
}
pendingSlashTimer = setTimeout(() => {
pendingSlashTimer = null;
if (rl.line === '' || rl.line === '/') {
commandMenuActive = true; commandMenuActive = true;
if (rl) { if (rl) {
rl.pause(); rl.pause();
@ -182,6 +400,9 @@ process.stdin.on('keypress', (str, key) => {
} }
}); });
} }
}, 80);
return;
}
}); });
function initReadline() { function initReadline() {
@ -208,7 +429,21 @@ function initReadline() {
if (commandMenuActive) { if (commandMenuActive) {
return; return;
} }
const input = line.trim(); let applied = { line, tokens: [] };
if (!isSlashCommand(line)) {
const hasToken = /\[(图片|视频) #\d+\]/.test(line);
if (hasToken && currentMedia.tokens.length && currentMedia.text === line) {
applied = { line, tokens: currentMedia.tokens };
} else {
applied = applyMediaTokens(line);
}
if (hasToken && !applied.tokens.length && currentMedia.tokens.length) {
applied = { line, tokens: currentMedia.tokens };
}
}
const normalizedLine = applied.line;
currentMedia = { tokens: applied.tokens, text: normalizedLine };
const input = normalizedLine.trim();
if (menuJustClosedAt) { if (menuJustClosedAt) {
if (!menuJustClosedInjected) { if (!menuJustClosedInjected) {
const tooOld = Date.now() - menuJustClosedAt > 800; const tooOld = Date.now() - menuJustClosedAt > 800;
@ -249,8 +484,10 @@ function initReadline() {
console.log(''); console.log('');
const userWriter = createIndentedWriter(' '); const userWriter = createIndentedWriter(' ');
userWriter.writeLine(`${cyan('用户:')}${line}`); const displayLine = colorizeTokens(normalizedLine);
state.messages.push({ role: 'user', content: line }); userWriter.writeLine(`${cyan('用户:')}${displayLine}`);
const content = buildUserContent(normalizedLine, currentMedia.tokens);
state.messages.push({ role: 'user', content });
persistConversation(); persistConversation();
await runAssistantLoop(); await runAssistantLoop();
promptWithStatus(); promptWithStatus();
@ -328,6 +565,22 @@ function buildApiMessages() {
return messages; return messages;
} }
/**
 * Build the `content` value of a user message.
 *
 * Without media the plain text is returned. With media, an array is
 * returned whose first part is the text and whose following parts are
 * data-URL attachments ("image_url" / "video_url"), one per token whose
 * file could be read.
 *
 * Tokens whose placeholder no longer appears in `line` are skipped, so an
 * attachment the user deleted from the input is not sent along anyway.
 *
 * @param {string} line - Final user text, with placeholders in place of paths.
 * @param {Array<{token: string, path: string, type: string}>} tokens - Media behind the placeholders.
 * @returns {string|Array<object>} Plain text, or a multi-part content array.
 */
function buildUserContent(line, tokens) {
  if (!Array.isArray(tokens) || !tokens.length) return line;

  const text = String(line);
  // Entries without a `token` label cannot be checked and are kept as-is.
  const liveTokens = tokens.filter((info) => !info.token || text.includes(info.token));
  if (!liveTokens.length) return line;

  const parts = [{ type: 'text', text: line }];
  for (const info of liveTokens) {
    const media = readMediafileTool(WORKSPACE, { path: info.path });
    // Best effort: an unreadable file is skipped and the text is still sent.
    if (!media || !media.success) continue;
    const key = media.type === 'image' ? 'image_url' : 'video_url';
    const url = `data:${media.mime};base64,${media.b64}`;
    parts.push({ type: key, [key]: { url } });
  }
  return parts;
}
function printCancelLine() { function printCancelLine() {
console.log(''); console.log('');
process.stdout.write(` ${red('已取消本次响应')}\n\n`); process.stdout.write(` ${red('已取消本次响应')}\n\n`);