" LLM-based text completion using llama.cpp | |
" | |
" requires: | |
" | |
" - neovim or vim | |
" - curl | |
" - llama.cpp server instance | |
" - FIM-compatible model | |
" | |
" sample config: | |
" | |
" - Tab - accept the current suggestion | |
" - Shift+Tab - accept just the first line of the suggestion | |
" - Ctrl+F - toggle FIM completion manually | |
" | |
" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim | |
" | |
" start the llama.cpp server with a FIM-compatible model. for example: | |
" | |
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 | |
" | |
" --batch-size [512, model max context] | |
" | |
" adjust the batch size to control how much of the provided local context will be used during the inference | |
" lower values will use smaller part of the context around the cursor, which will result in faster processing | |
" | |
" --ubatch-size [64, 2048] | |
" | |
" chunks the batch into smaller chunks for faster processing | |
" depends on the specific hardware. use llama-bench to profile and determine the best size | |
" | |
" --cache-reuse (ge:llama_config.n_predict, 1024] | |
" | |
" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict | |
" using non-zero value enables context reuse on the server side which dramatically improves the performance at | |
" large contexts. a value of 256 should be good for all cases | |
" | |
" run this once to initialise llama.vim: | |
" | |
" :call llama#init() | |
" | |
" more info: https://github.com/ggerganov/llama.cpp/pull/9787 | |
" | |
" colors (adjust to your liking) | |
highlight llama_hl_hint guifg=#ff772f ctermfg=202 | |
highlight llama_hl_info guifg=#77ff2f ctermfg=119 | |
" general parameters: | |
" | |
" endpoint: llama.cpp server endpoint | |
" n_prefix: number of lines before the cursor location to include in the local prefix | |
" n_suffix: number of lines after the cursor location to include in the local suffix | |
" n_predict: max number of tokens to predict | |
" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) | |
" t_max_predict_ms: max alloted time for the prediction | |
" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) | |
" auto_fim: trigger FIM completion automatically on cursor movement | |
" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor | |
" | |
" ring buffer of chunks, accumulated with time upon: | |
" | |
" - completion request | |
" - yank | |
" - entering a buffer | |
" - leaving a buffer | |
" - writing a file | |
" | |
" parameters for the ring-buffer with extra context: | |
" | |
" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) | |
" ring_chunk_size: max size of the chunks (in number of lines) | |
" note: adjust these numbers so that you don't overrun your context | |
" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context | |
" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM | |
" ring_update_ms: how often to process queued chunks in normal mode | |
" | |

let s:default_config = {
    \ 'endpoint':         'http://127.0.0.1:8012/infill',
    \ 'n_prefix':         256,
    \ 'n_suffix':         64,
    \ 'n_predict':        128,
    \ 't_max_prompt_ms':  500,
    \ 't_max_predict_ms': 3000,
    \ 'show_info':        2,
    \ 'auto_fim':         v:true,
    \ 'max_line_suffix':  8,
    \ 'ring_n_chunks':    64,
    \ 'ring_chunk_size':  64,
    \ 'ring_scope':       1024,
    \ 'ring_update_ms':   1000,
    \ }

let g:llama_config = get(g:, 'llama_config', s:default_config)
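
" note: when g:llama_config is already defined, it is used as-is and is not merged with s:default_config,
" while the plugin reads every key directly (g:llama_config.n_predict, etc.), so a user override should
" provide all of the keys. a sketch for a vimrc, with a couple of values changed purely as an example:
"
"   let g:llama_config = {
"       \ 'endpoint':         'http://127.0.0.1:8012/infill',
"       \ 'n_prefix':         256,
"       \ 'n_suffix':         64,
"       \ 'n_predict':        64,
"       \ 't_max_prompt_ms':  500,
"       \ 't_max_predict_ms': 1000,
"       \ 'show_info':        1,
"       \ 'auto_fim':         v:false,
"       \ 'max_line_suffix':  8,
"       \ 'ring_n_chunks':    64,
"       \ 'ring_chunk_size':  64,
"       \ 'ring_scope':       1024,
"       \ 'ring_update_ms':   1000,
"       \ }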

" returns the extra display width of the leading tabs in a:str
" (each leading tab counts as &tabstop - 1 additional columns)
function! s:get_indent(str)
    let l:count = 0
    for i in range(len(a:str))
        if a:str[i] == "\t"
            let l:count += &tabstop - 1
        else
            break
        endif
    endfor
    return l:count
endfunction
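
" worked example for s:get_indent above (assuming &tabstop is 4):
"   s:get_indent("\t\tfoo") == 2 * (4 - 1) == 6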

" returns a pseudo-random integer in the inclusive range [a:i0, a:i1]
function! s:rand(i0, i1) abort
    return a:i0 + rand() % (a:i1 - a:i0 + 1)
endfunction
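
" for example, s:rand(0, 9) yields one of 0..9 - used below to pick a random chunk offset in s:pick_chunk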

function! llama#init()
    if !executable('curl')
        echohl WarningMsg
        echo 'llama.vim requires the "curl" command to be available'
        echohl None
        return
    endif

    let s:pos_x = 0 " cursor position upon start of completion
    let s:pos_y = 0

    let s:line_cur = ''

    let s:line_cur_prefix = ''
    let s:line_cur_suffix = ''

    let s:ring_chunks  = [] " current set of chunks used as extra context
    let s:ring_queued  = [] " chunks that are queued to be sent for processing
    let s:ring_n_evict = 0

    let s:hint_shown = v:false
    let s:pos_y_pick = -9999 " last y where we picked a chunk
    let s:pos_dx = 0
    let s:content = []
    let s:can_accept = v:false

    let s:timer_fim   = -1
    let s:t_fim_start = reltime() " used to measure total FIM time

    let s:t_last_move = reltime() " last time the cursor moved

    let s:current_job = v:null

    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
    let s:ghost_text_vim  = has('textprop')

    if s:ghost_text_vim
        let s:hlgroup_hint = 'llama_hl_hint'
        let s:hlgroup_info = 'llama_hl_info'

        if empty(prop_type_get(s:hlgroup_hint))
            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
        endif
        if empty(prop_type_get(s:hlgroup_info))
            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
        endif
    endif

    augroup llama
        autocmd!
        autocmd InsertEnter    * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
        autocmd InsertLeavePre * call llama#fim_cancel()

        autocmd CursorMoved     * call s:on_move()
        autocmd CursorMovedI    * call s:on_move()
        autocmd CompleteChanged * call llama#fim_cancel()

        if g:llama_config.auto_fim
            autocmd CursorMovedI * call llama#fim(v:true)
        endif

        " gather chunks upon yanking
        autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif

        " gather chunks upon entering/leaving a buffer
        autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
        autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)

        " gather chunk upon saving the file
        autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
    augroup END

    silent! call llama#fim_cancel()

    " init background update of the ring buffer
    if g:llama_config.ring_n_chunks > 0
        call s:ring_update()
    endif
endfunction

" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
function! s:chunk_sim(c0, c1)
    let l:lines0 = len(a:c0)
    let l:lines1 = len(a:c1)

    let l:common = 0

    for l:line0 in a:c0
        for l:line1 in a:c1
            if l:line0 == l:line1
                let l:common += 1
                break
            endif
        endfor
    endfor

    return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction
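
" worked example of the similarity score above (illustrative values):
"   s:chunk_sim(['a', 'b', 'c'], ['b', 'c', 'd']) == 2.0 * 2 / (3 + 3) ~ 0.67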
" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing | |
" | |
" no_mod - do not pick chunks from buffers with pending changes | |
" do_evict - evict chunks that are very similar to the new one | |
" | |
function! s:pick_chunk(text, no_mod, do_evict) | |
" do not pick chunks from buffers with pending changes or buffers that are not files | |
if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) | |
return | |
endif | |
" if the extra context option is disabled - do nothing | |
if g:llama_config.ring_n_chunks <= 0 | |
return | |
endif | |
" don't pick very small chunks | |
if len(a:text) < 3 | |
return | |
endif | |
if len(a:text) + 1 < g:llama_config.ring_chunk_size | |
let l:chunk = a:text | |
else | |
let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2])) | |
let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)]) | |
let l:chunk = a:text[l:l0:l:l1] | |
endif | |
let l:chunk_str = join(l:chunk, "\n") . "\n" | |
" check if this chunk is already added | |
let l:exist = v:false | |
for i in range(len(s:ring_chunks)) | |
if s:ring_chunks[i].data == l:chunk | |
let l:exist = v:true | |
break | |
endif | |
endfor | |
for i in range(len(s:ring_queued)) | |
if s:ring_queued[i].data == l:chunk | |
let l:exist = v:true | |
break | |
endif | |
endfor | |
if l:exist | |
return | |
endif | |
" evict queued chunks that are very similar to the new one | |
for i in range(len(s:ring_queued) - 1, 0, -1) | |
if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9 | |
if a:do_evict | |
call remove(s:ring_queued, i) | |
let s:ring_n_evict += 1 | |
else | |
return | |
endif | |
endif | |
endfor | |
" also from s:ring_chunks | |
for i in range(len(s:ring_chunks) - 1, 0, -1) | |
if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 | |
if a:do_evict | |
call remove(s:ring_chunks, i) | |
let s:ring_n_evict += 1 | |
else | |
return | |
endif | |
endif | |
endfor | |
" TODO: become parameter ? | |
if len(s:ring_queued) == 16 | |
call remove(s:ring_queued, 0) | |
endif | |
call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) | |
"let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) | |
endfunction | |
" picks a queued chunk, sends it for processing and adds it to s:ring_chunks | |
" called every g:llama_config.ring_update_ms | |
function! s:ring_update() | |
call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) | |
" update only if in normal mode or if the cursor hasn't moved for a while | |
if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0 | |
return | |
endif | |
if len(s:ring_queued) == 0 | |
return | |
endif | |
" move the first queued chunk to the ring buffer | |
if len(s:ring_chunks) == g:llama_config.ring_n_chunks | |
call remove(s:ring_chunks, 0) | |
endif | |
call add(s:ring_chunks, remove(s:ring_queued, 0)) | |
"let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) | |
" send asynchronous job with the new extra context so that it is ready for the next FIM | |
let l:extra_context = [] | |
for l:chunk in s:ring_chunks | |
call add(l:extra_context, { | |
\ 'text': l:chunk.str, | |
\ 'time': l:chunk.time, | |
\ 'filename': l:chunk.filename | |
\ }) | |
endfor | |
" no samplers needed here | |
let l:request = json_encode({ | |
\ 'input_prefix': "", | |
\ 'input_suffix': "", | |
\ 'input_extra': l:extra_context, | |
\ 'prompt': "", | |
\ 'n_predict': 1, | |
\ 'temperature': 0.0, | |
\ 'stream': v:false, | |
\ 'samplers': ["temperature"], | |
\ 'cache_prompt': v:true, | |
\ 't_max_prompt_ms': 1, | |
\ 't_max_predict_ms': 1 | |
\ }) | |
let l:curl_command = [ | |
\ "curl", | |
\ "--silent", | |
\ "--no-buffer", | |
\ "--request", "POST", | |
\ "--url", g:llama_config.endpoint, | |
\ "--header", "Content-Type: application/json", | |
\ "--data", l:request | |
\ ] | |
" no callbacks because we don't need to process the response | |
if s:ghost_text_nvim | |
call jobstart(l:curl_command, {}) | |
elseif s:ghost_text_vim | |
call job_start(l:curl_command, {}) | |
endif | |
endfunction | |
" necessary for 'inoremap <expr>' | |
function! llama#fim_inline(is_auto) abort | |
call llama#fim(a:is_auto) | |
return '' | |
endfunction | |
" the main FIM call | |
" takes local context around the cursor and sends it together with the extra context to the server for completion | |
function! llama#fim(is_auto) abort | |
" we already have a suggestion for the current cursor position | |
if s:hint_shown && !a:is_auto | |
call llama#fim_cancel() | |
return | |
endif | |
call llama#fim_cancel() | |
" avoid sending repeated requests too fast | |
if reltimefloat(reltime(s:t_fim_start)) < 0.6 | |
if s:timer_fim != -1 | |
call timer_stop(s:timer_fim) | |
let s:timer_fim = -1 | |
endif | |
let s:t_fim_start = reltime() | |
let s:timer_fim = timer_start(600, {-> llama#fim(v:true)}) | |
return | |
endif | |
let s:t_fim_start = reltime() | |
let s:content = [] | |
let s:can_accept = v:false | |
let s:pos_x = col('.') - 1 | |
let s:pos_y = line('.') | |
let l:max_y = line('$') | |
let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1) | |
let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix])) | |
let s:line_cur = getline('.') | |
let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x) | |
let s:line_cur_suffix = strpart(s:line_cur, s:pos_x) | |
if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix | |
return | |
endif | |
let l:prefix = "" | |
\ . join(l:lines_prefix, "\n") | |
\ . "\n" | |
let l:prompt = "" | |
\ . s:line_cur_prefix | |
let l:suffix = "" | |
\ . s:line_cur_suffix | |
\ . "\n" | |
\ . join(l:lines_suffix, "\n") | |
\ . "\n" | |
" prepare the extra context data | |
let l:extra_context = [] | |
for l:chunk in s:ring_chunks | |
call add(l:extra_context, { | |
\ 'text': l:chunk.str, | |
\ 'time': l:chunk.time, | |
\ 'filename': l:chunk.filename | |
\ }) | |
endfor | |
" the indentation of the current line | |
let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) | |
let l:request = json_encode({ | |
\ 'input_prefix': l:prefix, | |
\ 'input_suffix': l:suffix, | |
\ 'input_extra': l:extra_context, | |
\ 'prompt': l:prompt, | |
\ 'n_predict': g:llama_config.n_predict, | |
\ 'n_indent': l:indent, | |
\ 'top_k': 40, | |
\ 'top_p': 0.99, | |
\ 'stream': v:false, | |
\ 'samplers': ["top_k", "top_p", "infill"], | |
\ 'cache_prompt': v:true, | |
\ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, | |
\ 't_max_predict_ms': g:llama_config.t_max_predict_ms | |
\ }) | |
let l:curl_command = [ | |
\ "curl", | |
\ "--silent", | |
\ "--no-buffer", | |
\ "--request", "POST", | |
\ "--url", g:llama_config.endpoint, | |
\ "--header", "Content-Type: application/json", | |
\ "--data", l:request | |
\ ] | |
if s:current_job != v:null | |
if s:ghost_text_nvim | |
call jobstop(s:current_job) | |
elseif s:ghost_text_vim | |
call job_stop(s:current_job) | |
endif | |
endif | |
" send the request asynchronously | |
if s:ghost_text_nvim | |
let s:current_job = jobstart(l:curl_command, { | |
\ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]), | |
\ 'on_exit': function('s:fim_on_exit'), | |
\ 'stdout_buffered': v:true | |
\ }) | |
elseif s:ghost_text_vim | |
let s:current_job = job_start(l:curl_command, { | |
\ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]), | |
\ 'exit_cb': function('s:fim_on_exit') | |
\ }) | |
endif | |
" TODO: per-file location | |
let l:delta_y = abs(s:pos_y - s:pos_y_pick) | |
" gather some extra context nearby and process it in the background | |
" only gather chunks if the cursor has moved a lot | |
" TODO: something more clever? reranking? | |
if a:is_auto && l:delta_y > 32 | |
" expand the prefix even further | |
call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) | |
" pick a suffix chunk | |
call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) | |
let s:pos_y_pick = s:pos_y | |
endif | |
endfunction | |
" if first_line == v:true accept only the first line of the response | |
function! llama#fim_accept(first_line) | |
" insert the suggestion at the cursor location | |
if s:can_accept && len(s:content) > 0 | |
call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0]) | |
if len(s:content) > 1 | |
if !a:first_line | |
call append(s:pos_y, s:content[1:-1]) | |
endif | |
endif | |
" move the cursor to the end of the accepted text | |
if !a:first_line && len(s:content) > 1 | |
call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1) | |
else | |
call cursor(s:pos_y, s:pos_x + len(s:content[0])) | |
endif | |
endif | |
call llama#fim_cancel() | |
endfunction | |
function! llama#fim_cancel() | |
let s:hint_shown = v:false | |
" clear the virtual text | |
let l:bufnr = bufnr('%') | |
if s:ghost_text_nvim | |
let l:id_vt_fim = nvim_create_namespace('vt_fim') | |
call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) | |
elseif s:ghost_text_vim | |
call prop_remove({'type': s:hlgroup_hint, 'all': v:true}) | |
call prop_remove({'type': s:hlgroup_info, 'all': v:true}) | |
endif | |
" remove the mappings | |
silent! iunmap <buffer> <Tab> | |
silent! iunmap <buffer> <S-Tab> | |
silent! iunmap <buffer> <Esc> | |
endfunction | |
function! s:on_move() | |
let s:t_last_move = reltime() | |
call llama#fim_cancel() | |
endfunction | |
" callback that processes the FIM result from the server and displays the suggestion | |
function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null) | |
if s:ghost_text_nvim | |
let l:raw = join(a:data, "\n") | |
elseif s:ghost_text_vim | |
let l:raw = a:data | |
endif | |
if len(l:raw) == 0 | |
return | |
endif | |
if a:pos_x != col('.') - 1 || a:pos_y != line('.') | |
return | |
endif | |
" show the suggestion only in insert mode | |
if mode() !=# 'i' | |
return | |
endif | |
let s:pos_x = a:pos_x | |
let s:pos_y = a:pos_y | |
let s:can_accept = v:true | |
let l:has_info = v:false | |
if s:can_accept && v:shell_error | |
if !a:is_auto | |
call add(s:content, "<| curl error: is the server on? |>") | |
endif | |
let s:can_accept = v:false | |
endif | |
let l:n_prompt = 0 | |
let l:t_prompt_ms = 1.0 | |
let l:s_prompt = 0 | |
let l:n_predict = 0 | |
let l:t_predict_ms = 1.0 | |
let l:s_predict = 0 | |
" get the generated suggestion | |
if s:can_accept | |
let l:response = json_decode(l:raw) | |
for l:part in split(get(l:response, 'content', ''), "\n", 1) | |
call add(s:content, l:part) | |
endfor | |
" remove trailing new lines | |
while len(s:content) > 0 && s:content[-1] == "" | |
call remove(s:content, -1) | |
endwhile | |
let l:generation_settings = get(l:response, 'generation_settings', {}) | |
let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) | |
let l:n_cached = get(l:response, 'tokens_cached', 0) | |
let l:truncated = get(l:response, 'truncated', v:false) | |
" if response.timings is available | |
if len(get(l:response, 'timings', {})) > 0 | |
let l:has_info = v:true | |
let l:timings = get(l:response, 'timings', {}) | |
let l:n_prompt = get(l:timings, 'prompt_n', 0) | |
let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) | |
let l:s_prompt = get(l:timings, 'prompt_per_second', 0) | |
let l:n_predict = get(l:timings, 'predicted_n', 0) | |
let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) | |
let l:s_predict = get(l:timings, 'predicted_per_second', 0) | |
endif | |
endif | |
if len(s:content) == 0 | |
call add(s:content, "") | |
let s:can_accept = v:false | |
endif | |
if len(s:content) == 0 | |
return | |
endif | |
" NOTE: the following is logic for discarding predictions that repeat existing text | |
" the code is quite ugly and there is very likely a simpler and more canonical way to implement this | |
" | |
" still, I wonder if there is some better way that avoids having to do these special hacks? | |
" on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would | |
" start generating whatever we have given it via the extra context. but on the other hand, it's not very | |
" helpful to re-generate the same code that is already there | |
" truncate the suggestion if the first line is empty | |
if len(s:content) == 1 && s:content[0] == "" | |
let s:content = [""] | |
endif | |
" ... and the next lines are repeated | |
if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1) | |
let s:content = [""] | |
endif | |
" truncate the suggestion if it repeats the suffix | |
if len(s:content) == 1 && s:content[0] == s:line_cur_suffix | |
let s:content = [""] | |
endif | |
" find the first non-empty line (strip whitespace) | |
let l:cmp_y = s:pos_y + 1 | |
while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$' | |
let l:cmp_y += 1 | |
endwhile | |
if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y) | |
" truncate the suggestion if it repeats the next line | |
if len(s:content) == 1 | |
let s:content = [""] | |
endif | |
" ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1 | |
if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1] | |
let s:content = [""] | |
endif | |
" ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1) | |
if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n") | |
let s:content = [""] | |
endif | |
endif | |
" keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix | |
"let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) | |
"for i in range(1, len(s:content) - 1) | |
" if strlen(matchstr(s:content[i], '^\s*')) < l:indent | |
" let s:content = s:content[:i - 1] | |
" break | |
" endif | |
"endfor | |
let s:pos_dx = len(s:content[-1]) | |
let s:content[-1] .= s:line_cur_suffix | |
call llama#fim_cancel() | |
" display virtual text with the suggestion | |
let l:bufnr = bufnr('%') | |
if s:ghost_text_nvim | |
let l:id_vt_fim = nvim_create_namespace('vt_fim') | |
endif | |
" construct the info message | |
    if g:llama_config.show_info > 0 && l:has_info
        let l:prefix = ' '

        if l:truncated
            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx
                \ )
        else
            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
                \ l:n_predict, l:t_predict_ms, l:s_predict,
                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
                \ )
        endif

        if g:llama_config.show_info == 1
            " display the info in the statusline
            let &statusline = l:info
            let l:info = ''
        endif
    endif

    " display the suggestion and append the info to the end of the first line
    if s:ghost_text_nvim
        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
            \ 'virt_text_win_col': virtcol('.') - 1
            \ })

        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
            \ 'virt_text_win_col': virtcol('.')
            \ })
    elseif s:ghost_text_vim
        let l:new_suffix = s:content[0]

        if !empty(l:new_suffix)
            call prop_add(s:pos_y, s:pos_x + 1, {
                \ 'type': s:hlgroup_hint,
                \ 'text': l:new_suffix
                \ })
        endif

        for line in s:content[1:]
            call prop_add(s:pos_y, 0, {
                \ 'type': s:hlgroup_hint,
                \ 'text': line,
                \ 'text_padding_left': s:get_indent(line),
                \ 'text_align': 'below'
                \ })
        endfor

        if !empty(l:info)
            call prop_add(s:pos_y, 0, {
                \ 'type': s:hlgroup_info,
                \ 'text': l:info,
                \ 'text_padding_left': col('$'),
                \ 'text_wrap': 'truncate'
                \ })
        endif
    endif

    " setup accept shortcuts
    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>

    let s:hint_shown = v:true
endfunction

function! s:fim_on_exit(job_id, exit_code, event = v:null)
    if a:exit_code != 0
        echom "Job failed with exit code: " . a:exit_code
    endif

    let s:current_job = v:null
endfunction