java-llama.cpp/src/test/java/examples/OpenAiServerExample.java at main · bernardladenthin/java-llama.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
// SPDX-FileCopyrightText: 2023-2025 Konstantin Herud
//
// SPDX-License-Identifier: MIT

package examples;

import java.io.IOException;
import net.ladenthin.llama.LlamaModel;
import net.ladenthin.llama.parameters.ModelParameters;
import net.ladenthin.llama.server.OpenAiCompatServer;
import net.ladenthin.llama.server.OpenAiServerConfig;
import org.junit.jupiter.api.Disabled;

// Standalone demo started through main() rather than as a test (the class declares no @Test
// methods): it loads a GGUF model and serves it over a local OpenAI-compatible HTTP endpoint so an
// editor such as VS Code Copilot (Custom Endpoint) can drive it. Select the model file with
// -Dnet.ladenthin.llama.server.model=... and the port with -Dnet.ladenthin.llama.server.port=... ;
// the @Disabled annotation is there to keep the class out of `mvn test`.
@Disabled
public class OpenAiServerExample {

    // System property naming the GGUF file to load, and the path used when it is not set.
    private static final String MODEL_PROPERTY = "net.ladenthin.llama.server.model";
    private static final String DEFAULT_MODEL_PATH = "models/codellama-7b.Q2_K.gguf";

    // System property naming the port handed to the server config, and its fallback value.
    private static final String PORT_PROPERTY = "net.ladenthin.llama.server.port";
    private static final int DEFAULT_PORT = 8080;

    // Entry point: resolves model path and port from system properties, opens the model and the
    // server as try-with-resources resources, prints connection hints, then blocks.
    public static void main(String... args) throws IOException, InterruptedException {
        final String ggufPath = System.getProperty(MODEL_PROPERTY, DEFAULT_MODEL_PATH);
        final int listenPort = Integer.getInteger(PORT_PROPERTY, DEFAULT_PORT);

        final ModelParameters llamaParams = modelParametersFor(ggufPath);
        final OpenAiServerConfig serverConfig = serverConfigFor(listenPort);

        try (LlamaModel llama = new LlamaModel(llamaParams);
                OpenAiCompatServer endpoint = new OpenAiCompatServer(llama, serverConfig).start()) {
            // The URL uses the port reported by the started server, not the requested one.
            announce("http://127.0.0.1:" + endpoint.getPort() + OpenAiCompatServer.PATH_CHAT_COMPLETIONS);
            // Joining the current thread on itself can never complete normally, so main parks here
            // until it is interrupted or the process is terminated (e.g. Ctrl+C).
            Thread.currentThread().join();
        }
    }

    // Builds the model parameters for the given GGUF path. Two parallel slots let the editor's
    // chat and its background title/summary requests run concurrently instead of queueing behind
    // one another.
    // NOTE(review): if the native side splits the 8192-token context evenly across the two slots,
    // each slot would hold fewer tokens than the 6144 + 2048 limits configured for the server —
    // confirm against the server's slot sizing.
    private static ModelParameters modelParametersFor(String ggufPath) {
        return new ModelParameters().setModel(ggufPath).setCtxSize(8192).setParallel(2);
    }

    // Builds the server configuration: requested port, the model id "local-model", and the
    // input/output token limits passed to the builder.
    private static OpenAiServerConfig serverConfigFor(int listenPort) {
        return OpenAiServerConfig.builder()
                .port(listenPort)
                .modelId("local-model")
                .maxInputTokens(6144)
                .maxOutputTokens(2048)
                .build();
    }

    // Prints the endpoint URL plus the VS Code setup steps to stdout, one message per line.
    private static void announce(String endpointUrl) {
        final String[] banner = {
            "OpenAI-compatible endpoint ready: " + endpointUrl,
            "In VS Code: Chat: Manage Language Models -> Add Models -> Custom Endpoint ->",
            "  API type 'Chat Completions', then set the model 'url' to: " + endpointUrl,
            "Press Ctrl+C to stop.",
        };
        for (String line : banner) {
            System.out.println(line);
        }
    }
}