llama.cpp is a lightweight, open-source framework, written in C++, for running AIGC-style large models. It can deploy and run large models locally on ordinary consumer-grade hardware, and it can also be integrated into applications as a library to provide GPT-like functionality.
The demo below is built on the llama.cpp source and uses its C++ API to load a local model file and perform GPT-style text generation.
Project structure
llamacpp_starter
|- llama.cpp-b1547
|- src
|  |- main.cpp
|- CMakeLists.txt
CMakeLists.txt
cmake_minimum_required(VERSION 3.15)
project(llamacpp_starter)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_subdirectory(llama.cpp-b1547)
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547/common
)
file(GLOB SRC
src/*.h
src/*.cpp
)
add_executable(${PROJECT_NAME} ${SRC})
target_link_libraries(${PROJECT_NAME}
common
llama
)
main.cpp
#include <iostream>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

int main(int argc, char** argv) {
    bool numa_support = false;
    const std::string model_file_path = "./llama-ggml.gguf";
    const std::string prompt = "once upon a time";  // input words
    const int n_len = 32;  // total length of the sequence including the prompt

    // set gpt params
    gpt_params params;
    params.model  = model_file_path;
    params.prompt = prompt;

    // init LLM
    llama_backend_init(numa_support);

    // load model
    llama_model_params model_params = llama_model_default_params();
    // model_params.n_gpu_layers = 99; // offload all layers to the GPU

    llama_model* model = llama_load_model_from_file(model_file_path.c_str(), model_params);
    if (model == NULL) {
        std::cerr << __func__ << " load model file error" << std::endl;
        return 1;
    }

    // init context
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.seed = 1234;
    ctx_params.n_ctx = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context* ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        std::cerr << __func__ << " failed to create the llama_context" << std::endl;
        return 1;
    }

    // tokenize the prompt
    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

    const int n_ctx    = llama_n_ctx(ctx);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
        std::cerr << __func__ << " error: n_kv_req > n_ctx, the required KV cache size is not big enough" << std::endl;
        std::cerr << __func__ << " either reduce n_parallel or increase n_ctx" << std::endl;
        return 1;
    }

    // print the prompt token-by-token
    for (auto id : tokens_list)
        std::cout << llama_token_to_piece(ctx, id) << " ";
    std::cout << std::endl;

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); i++)
        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0) {
        std::cerr << __func__ << " llama_decode failed" << std::endl;
        return 1;
    }

    // main loop to generate words
    int n_cur    = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len) {
        // sample the next token
        auto n_vocab = llama_n_vocab(model);
        auto* logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // sample the most likely token
        const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream?
        if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
            std::cout << std::endl;
            break;
        }

        std::cout << llama_token_to_piece(ctx, new_token_id) << " ";

        // prepare the next batch
        llama_batch_clear(batch);

        // push this new token for next evaluation
        llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

        n_decode += 1;
        n_cur    += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
            std::cerr << __func__ << " failed to eval" << std::endl;
            return 1;
        }
    }

    std::cout << std::endl;

    const auto t_main_end = ggml_time_us();

    std::cout << __func__ << " decoded " << n_decode << " tokens in "
              << (t_main_end - t_main_start) / 1000000.0f << " s, speed: "
              << n_decode / ((t_main_end - t_main_start) / 1000000.0f) << " t / s" << std::endl;

    llama_print_timings(ctx);

    llama_batch_free(batch);

    // free context
    llama_free(ctx);
    llama_free_model(model);

    // free LLM
    llama_backend_free();

    return 0;
}
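The demo samples with llama_sample_token_greedy, so the output is deterministic. For more varied text, the same candidates_p array can be run through llama.cpp's top-k / top-p / temperature samplers before a token is drawn. The helper below is only a sketch against the b1547-era sampling API (llama_sample_top_k, llama_sample_top_p, llama_sample_temp, llama_sample_token); the function name and the parameter defaults are illustrative and not part of the original demo.

#include "llama.h"

// Sketch: top-k / top-p / temperature sampling as an alternative to greedy decoding.
// The candidates array is filtered in place, then a token is drawn from what remains.
static llama_token sample_top_k_top_p(llama_context* ctx, llama_token_data_array* candidates,
                                      int top_k = 40, float top_p = 0.95f, float temp = 0.8f) {
    llama_sample_top_k(ctx, candidates, top_k, 1); // keep only the k most likely tokens
    llama_sample_top_p(ctx, candidates, top_p, 1); // keep the smallest set with cumulative prob >= top_p
    llama_sample_temp(ctx, candidates, temp);      // rescale the remaining logits by 1/temp
    return llama_sample_token(ctx, candidates);    // draw a token from the filtered distribution
}

In the main loop, the call to llama_sample_token_greedy(ctx, &candidates_p) would then be replaced by sample_top_k_top_p(ctx, &candidates_p).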
Notes:
- You need to download a model file supported by llama.cpp yourself; pre-converted GGUF files from the Hugging Face hub are recommended.
- The llama.cpp build can be configured with various optional enhancements, such as CPU/GPU acceleration and accelerated math libraries.
Source code
llamacpp_starter