Fix multimodal token handling
This commit is contained in:
parent
934a916842
commit
aa4f323809
359
src/cli/index.js
359
src/cli/index.js
@ -18,9 +18,11 @@ const { renderBanner } = require('../ui/banner');
|
||||
const { buildStartLine, buildFinalLine, startToolDisplay, formatResultLines, printResultLines } = require('../ui/tool_display');
|
||||
const { createConversation, updateConversation } = require('../storage/conversation_store');
|
||||
const { applyUsage, normalizeTokenUsage } = require('../utils/token_usage');
|
||||
const { gray, cyan, green, red } = require('../utils/colors');
|
||||
const { gray, cyan, green, red, blue } = require('../utils/colors');
|
||||
const { createIndentedWriter } = require('../ui/indented_writer');
|
||||
const { createStatusBar } = require('../ui/status_bar');
|
||||
const { visibleWidth } = require('../utils/text_width');
|
||||
const { readMediafileTool } = require('../tools/read_mediafile');
|
||||
|
||||
const WORKSPACE = process.cwd();
|
||||
const WORKSPACE_NAME = path.basename(WORKSPACE);
|
||||
@ -28,6 +30,8 @@ const USERNAME = os.userInfo().username || 'user';
|
||||
const PROMPT = `${USERNAME}@${WORKSPACE_NAME} % `;
|
||||
const MENU_PAGE_SIZE = 6;
|
||||
const SLASH_COMMANDS = new Set(['/new', '/resume', '/allow', '/model', '/status', '/compact', '/config', '/help', '/exit']);
|
||||
const IMAGE_EXTS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.heic']);
|
||||
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v']);
|
||||
|
||||
const config = ensureConfig();
|
||||
const state = createState(config, WORKSPACE);
|
||||
@ -57,6 +61,7 @@ let isRunning = false;
|
||||
let escPendingCancel = false;
|
||||
let activeStreamController = null;
|
||||
let activeToolController = null;
|
||||
let currentMedia = { tokens: [], text: '' };
|
||||
let commandMenuActive = false;
|
||||
let menuSearchTerm = '';
|
||||
let menuLastSearchTerm = '';
|
||||
@ -64,6 +69,8 @@ let menuJustClosedAt = 0;
|
||||
let menuInjectedCommand = null;
|
||||
let menuAbortController = null;
|
||||
let menuJustClosedInjected = false;
|
||||
let suppressSlashMenuUntil = 0;
|
||||
let pendingSlashTimer = null;
|
||||
|
||||
function printNotice(message) {
|
||||
console.log('');
|
||||
@ -75,6 +82,141 @@ function printNoticeInline(message) {
|
||||
console.log(message);
|
||||
}
|
||||
|
||||
/**
 * Return the lower-cased extension of a path, including the leading dot.
 *
 * Only the final path segment is inspected, so a dot inside a directory
 * name (e.g. `/a.b/file`) is not mistaken for an extension.
 *
 * @param {string} p - File path; `/` and `\` are both treated as separators.
 * @returns {string} Extension such as `.png`, or `''` when the last segment
 *   contains no dot.
 */
function getPathExt(p) {
  const text = p == null ? '' : String(p);
  // Cut everything up to the last separator before looking for the dot.
  const lastSep = Math.max(text.lastIndexOf('/'), text.lastIndexOf('\\'));
  const base = text.slice(lastSep + 1);
  const idx = base.lastIndexOf('.');
  if (idx === -1) return '';
  return base.slice(idx).toLowerCase();
}
|
||||
|
||||
/**
 * Normalise a pasted or dragged path into a plain filesystem path.
 *
 * Steps, in order:
 *   1. strip a leading `file://` scheme, together with an optional
 *      `localhost` authority (`file://localhost/tmp/a.png` -> `/tmp/a.png`);
 *   2. percent-decode the remainder (best effort);
 *   3. undo shell-style escaping of spaces (`\ `) and backslashes (`\\`).
 *
 * @param {string} p - Raw path text; falsy values are treated as `''`.
 * @returns {string} The decoded path.
 */
function decodeEscapedPath(p) {
  let text = String(p || '');
  // `localhost` is only dropped when it is the whole authority (followed by `/`).
  text = text.replace(/^file:\/\/(?:localhost(?=\/))?/, '');
  try {
    text = decodeURIComponent(text);
  } catch (_) {
    // Malformed percent sequence (e.g. a literal `%` in a file name):
    // keep the text exactly as typed instead of failing the whole input.
  }
  return text.replace(/\\ /g, ' ').replace(/\\\\/g, '\\');
}
|
||||
|
||||
/**
 * Locate candidate media paths inside an input line.
 *
 * Two kinds of candidates are collected:
 *   - any single-quoted span `'...'` (the path is the text between the quotes);
 *   - an unquoted run starting at `/` and ending in one of the known media
 *     extensions (IMAGE_EXTS / VIDEO_EXTS), where `\ ` is accepted as an
 *     escaped space inside the path.
 *
 * Candidates are not checked against the filesystem here.
 *
 * @param {string} line - The current input line.
 * @returns {Array<{raw: string, path: string, index: number}>} Candidates
 *   sorted by start index. `raw` is the exact matched text (quotes included
 *   for quoted spans) and `path` is the text to resolve.
 */
function findMediaMatches(line) {
  const matches = [];
  const quoted = /'([^']+)'/g;
  let m;
  while ((m = quoted.exec(line)) !== null) {
    matches.push({ raw: m[0], path: m[1], index: m.index });
  }

  // The extension sets store entries with a leading dot (".png"). The dot is
  // matched once by the pattern below, so strip it here; leaving it in
  // produced `\.(\.png|...)`, which only matched paths like "file..png".
  const extGroup = Array.from(new Set([...IMAGE_EXTS, ...VIDEO_EXTS]))
    .map((e) => e.replace(/^\./, '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|');

  if (extGroup) {
    // Case-insensitive, and the extension must not run into further
    // alphanumerics (so "/a/b.pngx" is not reported as "/a/b.png").
    const unquoted = new RegExp(`/(?:\\\\ |\\S)+?\\.(?:${extGroup})(?![A-Za-z0-9])`, 'gi');
    while ((m = unquoted.exec(line)) !== null) {
      matches.push({ raw: m[0], path: m[0], index: m.index });
    }
  }

  matches.sort((a, b) => a.index - b.index);
  return matches;
}
|
||||
|
||||
/**
 * Replace media file paths in an input line with placeholder tokens.
 *
 * Every candidate reported by findMediaMatches() is decoded and accepted only
 * when the file exists, its extension is in IMAGE_EXTS or VIDEO_EXTS, and the
 * per-message limit (9 images, 1 video) has not been reached. Accepted paths
 * are replaced by `[图片 #N]` / `[视频 #N]`; everything else is left verbatim.
 *
 * A rejected candidate does not consume any input, so a real path that sits
 * inside a bogus quoted span (for example between two apostrophes in ordinary
 * prose) is still recognised.
 *
 * @param {string} line - The raw input line.
 * @returns {{line: string, tokens: Array<{token: string, path: string, type: string}>}}
 *   The rewritten line and the media it references, in order of appearance.
 */
function applyMediaTokens(line) {
  if (!line) return { line: '', tokens: [] };
  const matches = findMediaMatches(line);
  if (!matches.length) return { line, tokens: [] };

  const MAX_IMAGES = 9;
  const MAX_VIDEOS = 1;
  let images = 0;
  let videos = 0;
  let cursor = 0;
  let out = '';
  const tokens = [];

  for (const match of matches) {
    // Skip candidates that overlap text already replaced by a token.
    if (match.index < cursor) continue;

    const decoded = decodeEscapedPath(match.path);
    if (!fs.existsSync(decoded)) continue;

    const ext = getPathExt(decoded);
    const isImage = IMAGE_EXTS.has(ext);
    const isVideo = VIDEO_EXTS.has(ext);
    if (!isImage && !isVideo) continue;
    if (isImage && images >= MAX_IMAGES) continue;
    if (isVideo && videos >= MAX_VIDEOS) continue;

    const token = isImage ? `[图片 #${images + 1}]` : `[视频 #${videos + 1}]`;
    tokens.push({ token, path: decoded, type: isImage ? 'image' : 'video' });

    // Copy the untouched text since the last accepted match (this includes
    // any rejected candidates), then the token in place of the path.
    out += line.slice(cursor, match.index) + token;
    cursor = match.index + match.raw.length;

    if (isImage) images += 1;
    if (isVideo) videos += 1;
  }

  out += line.slice(cursor);
  return { line: out, tokens };
}
|
||||
|
||||
/**
 * Replace the first occurrence of one specific path in `line` with a media
 * token, continuing the numbering of the tokens already collected.
 *
 * Nothing changes when the path is not present in the line, the decoded path
 * does not exist on disk, its extension is in neither IMAGE_EXTS nor
 * VIDEO_EXTS, or the limit (9 images / 1 video) is already reached.
 *
 * @param {string} line - Current input line.
 * @param {string} rawPath - Path text exactly as it appears in `line`.
 * @param {Array<{token: string, path: string, type: string}>} tokens -
 *   Tokens collected so far; this array is not mutated.
 * @returns {{line: string, tokens: Array}} Updated line and token list, or the
 *   inputs unchanged when no replacement was made.
 */
function applySingleMediaPath(line, rawPath, tokens) {
  if (!line || !rawPath) return { line, tokens };

  const position = line.indexOf(rawPath);
  if (position === -1) return { line, tokens };

  const resolved = decodeEscapedPath(rawPath);
  if (!fs.existsSync(resolved)) return { line, tokens };

  const extension = getPathExt(resolved);
  const isImage = IMAGE_EXTS.has(extension);
  const isVideo = VIDEO_EXTS.has(extension);
  if (!(isImage || isVideo)) return { line, tokens };

  // Count what is already attached so the new token gets the next number.
  let imageCount = 0;
  let videoCount = 0;
  for (const entry of tokens) {
    if (entry.type === 'image') imageCount += 1;
    if (entry.type === 'video') videoCount += 1;
  }
  if (isImage && imageCount >= 9) return { line, tokens };
  if (isVideo && videoCount >= 1) return { line, tokens };

  let token;
  if (isImage) {
    token = `[图片 #${imageCount + 1}]`;
  } else {
    token = `[视频 #${videoCount + 1}]`;
  }

  const head = line.slice(0, position);
  const tail = line.slice(position + rawPath.length);
  const added = { token, path: resolved, type: isImage ? 'image' : 'video' };
  return { line: head + token + tail, tokens: tokens.concat([added]) };
}
|
||||
|
||||
/**
 * Wrap every media token (`[图片 #N]` / `[视频 #N]`) in `line` with blue().
 *
 * @param {string} line - Text that may contain media tokens.
 * @returns {string} The same text with each token passed through blue().
 */
function colorizeTokens(line) {
  const tokenPattern = /\[(图片|视频) #\d+\]/g;
  return line.replace(tokenPattern, (matched) => blue(matched));
}
|
||||
|
||||
/**
 * Repaint the prompt line so media tokens appear colourised.
 *
 * Does nothing when there is no readline instance, stdout is not a TTY, or
 * the command menu is currently active. Otherwise the current terminal line
 * is cleared, the prompt plus the colourised input is written, and the
 * cursor is moved to the column computed from visibleWidth() of the prompt
 * and of the text left of `rl.cursor`.
 */
function refreshInputLine() {
  const canRepaint = rl && process.stdout.isTTY && !commandMenuActive;
  if (!canRepaint) return;

  const typed = rl.line || '';
  const painted = colorizeTokens(typed);

  readline.clearLine(process.stdout, 0);
  readline.cursorTo(process.stdout, 0);
  process.stdout.write(PROMPT + painted);

  // Width is measured on the uncoloured text, not on `painted`.
  const leftOfCursor = typed.slice(0, rl.cursor || 0);
  const column = visibleWidth(PROMPT) + visibleWidth(leftOfCursor);
  readline.cursorTo(process.stdout, column);
}
|
||||
|
||||
/**
 * Remove, as a single unit, the media token the cursor is touching.
 *
 * A token spanning [start, end) is removed when `cursor <= end` and
 * `cursor > start`; with `allowStart` the lower bound becomes
 * `cursor >= start`, so a cursor sitting exactly on the token's first
 * character also counts. Only the first qualifying token is removed.
 *
 * @param {string} line - Current input line.
 * @param {number} cursor - Cursor offset within `line`.
 * @param {boolean} [allowStart=false] - Also match when the cursor is at the
 *   token's start offset.
 * @returns {{line: string, cursor: number, removed: boolean}} The new line
 *   and cursor (placed where the token began), or the inputs unchanged with
 *   `removed: false`.
 */
function removeTokenAtCursor(line, cursor, allowStart = false) {
  const pattern = /\[(图片|视频) #\d+\]/g;
  for (let hit = pattern.exec(line); hit !== null; hit = pattern.exec(line)) {
    const from = hit.index;
    const to = from + hit[0].length;
    const pastStart = allowStart ? cursor >= from : cursor > from;
    if (!pastStart || cursor > to) continue;
    return {
      line: line.slice(0, from) + line.slice(to),
      cursor: from,
      removed: true,
    };
  }
  return { line, cursor, removed: false };
}
|
||||
|
||||
function isSlashCommand(line) {
|
||||
if (!line || line[0] !== '/') return false;
|
||||
const cmd = line.split(/\s+/)[0];
|
||||
@ -84,6 +226,34 @@ function isSlashCommand(line) {
|
||||
readline.emitKeypressEvents(process.stdin);
|
||||
if (process.stdin.isTTY) process.stdin.setRawMode(true);
|
||||
|
||||
// Raw stdin listener: when an incoming chunk looks like a media path (a
// `file://` URL or a known image/video extension), rewrite the readline
// buffer so the path is shown as a media token.
process.stdin.on('data', (chunk) => {
  const inputBlocked = isRunning || commandMenuActive || !rl;
  if (inputBlocked) return;

  const text = chunk ? chunk.toString() : '';
  if (!text) return;

  const mediaExtPattern = /\.(png|jpe?g|gif|webp|bmp|tiff|heic|mp4|mov|avi|mkv|webm|m4v)\b/i;
  if (!text.includes('file://') && !mediaExtPattern.test(text)) return;

  // Pasted paths contain "/"; hold off the slash-command menu briefly.
  suppressSlashMenuUntil = Date.now() + 200;

  // Deferred with setImmediate; presumably so readline has already appended
  // the chunk to rl.line by the time it is read here — TODO confirm.
  setImmediate(() => {
    const line = rl.line || '';
    const raw = text.replace(/\r?\n/g, '').trim();
    const decoded = raw ? decodeEscapedPath(raw) : '';

    let applied = applyMediaTokens(line);

    // Second pass for the pasted text itself, tried first in its raw form
    // and then in its decoded form.
    let needle = '';
    if (raw) {
      if (line.includes(raw)) {
        needle = raw;
      } else if (decoded && line.includes(decoded)) {
        needle = decoded;
      }
    }
    if (needle) {
      applied = applySingleMediaPath(applied.line, needle, applied.tokens);
    }

    if (!applied.tokens.length) return;

    rl.line = applied.line;
    rl.cursor = rl.line.length;
    currentMedia = { tokens: applied.tokens, text: applied.line };
    refreshInputLine();
  });
});
|
||||
|
||||
initReadline();
|
||||
|
||||
statusBar = createStatusBar({
|
||||
@ -110,7 +280,17 @@ process.stdin.on('keypress', (str, key) => {
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (!isRunning && key && key.name === 'backspace') {
|
||||
if (!isRunning && key && (key.name === 'backspace' || key.name === 'delete')) {
|
||||
const line = rl.line || '';
|
||||
const cursor = rl.cursor || 0;
|
||||
const updated = removeTokenAtCursor(line, cursor, key.name === 'delete');
|
||||
if (updated.removed) {
|
||||
rl.line = updated.line;
|
||||
rl.cursor = updated.cursor;
|
||||
refreshInputLine();
|
||||
if (statusBar) statusBar.render();
|
||||
return;
|
||||
}
|
||||
if (statusBar) statusBar.render();
|
||||
}
|
||||
if (key && key.name === 'escape' && isRunning) {
|
||||
@ -123,7 +303,45 @@ process.stdin.on('keypress', (str, key) => {
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (str === '/' && Date.now() < suppressSlashMenuUntil) {
|
||||
return;
|
||||
}
|
||||
if (pendingSlashTimer && (str !== '/' || (rl.line && rl.line !== '/'))) {
|
||||
clearTimeout(pendingSlashTimer);
|
||||
pendingSlashTimer = null;
|
||||
}
|
||||
if (!isRunning) {
|
||||
const rawLine = rl.line || '';
|
||||
let applied = { line: rawLine, tokens: [] };
|
||||
if (!isSlashCommand(rawLine)) {
|
||||
const hasToken = /\[(图片|视频) #\d+\]/.test(rawLine);
|
||||
if (hasToken && currentMedia.tokens.length && currentMedia.text === rawLine) {
|
||||
applied = { line: rawLine, tokens: currentMedia.tokens };
|
||||
} else {
|
||||
applied = applyMediaTokens(rawLine);
|
||||
}
|
||||
if (hasToken && !applied.tokens.length && currentMedia.tokens.length) {
|
||||
applied = { line: rawLine, tokens: currentMedia.tokens };
|
||||
}
|
||||
}
|
||||
if (applied.line !== (rl.line || '')) {
|
||||
rl.line = applied.line;
|
||||
rl.cursor = rl.line.length;
|
||||
currentMedia = { tokens: applied.tokens, text: applied.line };
|
||||
} else if (currentMedia.text !== applied.line) {
|
||||
currentMedia = { tokens: applied.tokens, text: applied.line };
|
||||
}
|
||||
refreshInputLine();
|
||||
}
|
||||
if (statusBar) statusBar.render();
|
||||
if (str === '/' && (rl.line === '' || rl.line === '/')) {
|
||||
if (pendingSlashTimer) {
|
||||
clearTimeout(pendingSlashTimer);
|
||||
pendingSlashTimer = null;
|
||||
}
|
||||
pendingSlashTimer = setTimeout(() => {
|
||||
pendingSlashTimer = null;
|
||||
if (rl.line === '' || rl.line === '/') {
|
||||
commandMenuActive = true;
|
||||
if (rl) {
|
||||
rl.pause();
|
||||
@ -132,55 +350,58 @@ process.stdin.on('keypress', (str, key) => {
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
}
|
||||
menuSearchTerm = '';
|
||||
menuAbortController = new AbortController();
|
||||
void openCommandMenu({
|
||||
rl,
|
||||
prompt: PROMPT,
|
||||
pageSize: MENU_PAGE_SIZE,
|
||||
colorEnabled: process.stdout.isTTY,
|
||||
resetAnsi: '\x1b[0m',
|
||||
onInput: (input) => {
|
||||
menuSearchTerm = input || '';
|
||||
},
|
||||
abortSignal: menuAbortController.signal,
|
||||
})
|
||||
.then((result) => {
|
||||
if (result && result.chosen && !result.cancelled) {
|
||||
menuInjectedCommand = result.chosen;
|
||||
}
|
||||
})
|
||||
.finally(() => {
|
||||
commandMenuActive = false;
|
||||
menuAbortController = null;
|
||||
menuJustClosedAt = menuInjectedCommand ? Date.now() : 0;
|
||||
menuJustClosedInjected = !!menuInjectedCommand;
|
||||
menuLastSearchTerm = menuSearchTerm;
|
||||
drainStdin();
|
||||
rl.line = '';
|
||||
rl.cursor = 0;
|
||||
menuSearchTerm = '';
|
||||
if (process.stdout.isTTY) {
|
||||
readline.clearScreenDown(process.stdout);
|
||||
}
|
||||
// Clear possible echoes from the base readline line (current + previous line).
|
||||
if (process.stdout.isTTY) {
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
readline.moveCursor(process.stdout, 0, -1);
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
} else {
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
}
|
||||
promptWithStatus(true);
|
||||
if (menuInjectedCommand) {
|
||||
const injected = menuInjectedCommand;
|
||||
menuInjectedCommand = null;
|
||||
setImmediate(() => injectLine(injected));
|
||||
}
|
||||
});
|
||||
menuAbortController = new AbortController();
|
||||
void openCommandMenu({
|
||||
rl,
|
||||
prompt: PROMPT,
|
||||
pageSize: MENU_PAGE_SIZE,
|
||||
colorEnabled: process.stdout.isTTY,
|
||||
resetAnsi: '\x1b[0m',
|
||||
onInput: (input) => {
|
||||
menuSearchTerm = input || '';
|
||||
},
|
||||
abortSignal: menuAbortController.signal,
|
||||
})
|
||||
.then((result) => {
|
||||
if (result && result.chosen && !result.cancelled) {
|
||||
menuInjectedCommand = result.chosen;
|
||||
}
|
||||
})
|
||||
.finally(() => {
|
||||
commandMenuActive = false;
|
||||
menuAbortController = null;
|
||||
menuJustClosedAt = menuInjectedCommand ? Date.now() : 0;
|
||||
menuJustClosedInjected = !!menuInjectedCommand;
|
||||
menuLastSearchTerm = menuSearchTerm;
|
||||
drainStdin();
|
||||
rl.line = '';
|
||||
rl.cursor = 0;
|
||||
menuSearchTerm = '';
|
||||
if (process.stdout.isTTY) {
|
||||
readline.clearScreenDown(process.stdout);
|
||||
}
|
||||
// Clear possible echoes from the base readline line (current + previous line).
|
||||
if (process.stdout.isTTY) {
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
readline.moveCursor(process.stdout, 0, -1);
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
} else {
|
||||
readline.clearLine(process.stdout, 0);
|
||||
readline.cursorTo(process.stdout, 0);
|
||||
}
|
||||
promptWithStatus(true);
|
||||
if (menuInjectedCommand) {
|
||||
const injected = menuInjectedCommand;
|
||||
menuInjectedCommand = null;
|
||||
setImmediate(() => injectLine(injected));
|
||||
}
|
||||
});
|
||||
}
|
||||
}, 80);
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
@ -208,7 +429,21 @@ function initReadline() {
|
||||
if (commandMenuActive) {
|
||||
return;
|
||||
}
|
||||
const input = line.trim();
|
||||
let applied = { line, tokens: [] };
|
||||
if (!isSlashCommand(line)) {
|
||||
const hasToken = /\[(图片|视频) #\d+\]/.test(line);
|
||||
if (hasToken && currentMedia.tokens.length && currentMedia.text === line) {
|
||||
applied = { line, tokens: currentMedia.tokens };
|
||||
} else {
|
||||
applied = applyMediaTokens(line);
|
||||
}
|
||||
if (hasToken && !applied.tokens.length && currentMedia.tokens.length) {
|
||||
applied = { line, tokens: currentMedia.tokens };
|
||||
}
|
||||
}
|
||||
const normalizedLine = applied.line;
|
||||
currentMedia = { tokens: applied.tokens, text: normalizedLine };
|
||||
const input = normalizedLine.trim();
|
||||
if (menuJustClosedAt) {
|
||||
if (!menuJustClosedInjected) {
|
||||
const tooOld = Date.now() - menuJustClosedAt > 800;
|
||||
@ -249,8 +484,10 @@ function initReadline() {
|
||||
|
||||
console.log('');
|
||||
const userWriter = createIndentedWriter(' ');
|
||||
userWriter.writeLine(`${cyan('用户:')}${line}`);
|
||||
state.messages.push({ role: 'user', content: line });
|
||||
const displayLine = colorizeTokens(normalizedLine);
|
||||
userWriter.writeLine(`${cyan('用户:')}${displayLine}`);
|
||||
const content = buildUserContent(normalizedLine, currentMedia.tokens);
|
||||
state.messages.push({ role: 'user', content });
|
||||
persistConversation();
|
||||
await runAssistantLoop();
|
||||
promptWithStatus();
|
||||
@ -328,6 +565,22 @@ function buildApiMessages() {
|
||||
return messages;
|
||||
}
|
||||
|
||||
/**
 * Build the `content` value of a user message.
 *
 * Without media the content is the plain line. With media it is an array
 * whose first part is the text, followed by one `image_url` / `video_url`
 * part (a base64 data URL) per media file that readMediafileTool() loaded
 * successfully. Files that fail to load are skipped.
 *
 * Only tokens whose placeholder text is still present in `line` are
 * attached. Callers may pass a remembered token list that still holds
 * entries the user has since deleted from the input; those must not be sent.
 *
 * @param {string} line - Final input line, with media tokens in place.
 * @param {Array<{token: string, path: string, type: string}>} tokens -
 *   Candidate media for this line.
 * @returns {string|Array<object>} The plain line, or the multi-part content.
 */
function buildUserContent(line, tokens) {
  const haystack = line == null ? '' : String(line);
  const attached = Array.isArray(tokens)
    ? tokens.filter((info) => info && haystack.includes(info.token))
    : [];
  if (!attached.length) return line;

  const parts = [{ type: 'text', text: line }];
  for (const info of attached) {
    const media = readMediafileTool(WORKSPACE, { path: info.path });
    if (!media || !media.success) continue;
    // The part's `type` and its payload key share the same name.
    const kind = media.type === 'image' ? 'image_url' : 'video_url';
    parts.push({
      type: kind,
      [kind]: { url: `data:${media.mime};base64,${media.b64}` },
    });
  }
  return parts;
}
|
||||
|
||||
function printCancelLine() {
|
||||
console.log('');
|
||||
process.stdout.write(` ${red('已取消本次响应')}\n\n`);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user