
Commit d1d9f32

Merge branch 'concedo' into api-generate-with-grammar

2 parents: 79393c4 + cae6a84

55 files changed, +6105 −2816 lines. (This is a large commit; only a subset of the changed files is shown below.)

.gitignore

Lines changed: 2 additions & 2 deletions

@@ -81,12 +81,12 @@ tests/test-tokenizer-0
 koboldcpp.so
 koboldcpp_failsafe.so
 koboldcpp_openblas.so
-koboldcpp_openblas_noavx2.so
+koboldcpp_noavx2.so
 koboldcpp_clblast.so
 koboldcpp.dll
 koboldcpp_failsafe.dll
 koboldcpp_openblas.dll
-koboldcpp_openblas_noavx2.dll
+koboldcpp_noavx2.dll
 koboldcpp_clblast.dll
 koboldcpp_cublas.dll
 cublas64_11.dll

CMakeLists.txt

Lines changed: 17 additions & 7 deletions

@@ -43,11 +43,12 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
+option(LLAMA_CUBLAS "llama: use CUDA" ON)
+set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)

@@ -79,13 +80,15 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

     add_compile_definitions(GGML_USE_CUBLAS)
+    #add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
+    add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
     #add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me

     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-    if (LLAMA_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_DMMV_F16)
+    if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
     endif()
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

@@ -96,10 +99,14 @@ if (LLAMA_CUBLAS)
     endif()

     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "37;52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -120,6 +127,7 @@ if (LLAMA_ALL_WARNINGS)
         -Wshadow
         -Wstrict-prototypes
         -Wpointer-arith
+        -Wmissing-prototypes
     )
     set(cxx_flags
         -Wall
@@ -259,6 +267,8 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
+            ggml-alloc.c
+            ggml-alloc.h
             k_quants.h
             k_quants.c
             ${GGML_SOURCES_CUDA})
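The options added above are ordinary CMake cache variables, so they can be set at configure time. The following is only a rough sketch of a CUDA-enabled configure-and-build run, assuming a CMake and CUDA toolchain is installed; the build directory name is illustrative, and the values shown are simply the defaults declared in this CMakeLists.txt:

    # Configure with CUDA (cuBLAS) enabled and the new tuning knobs set explicitly
    cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_MMQ_Y=64 -DLLAMA_CUDA_F16=OFF -DLLAMA_CUDA_DMMV_X=32
    # Build everything in Release mode
    cmake --build build --config Release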

Makefile

Lines changed: 48 additions & 28 deletions

@@ -1,4 +1,4 @@
-default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas
+default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas
 tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
 dev: koboldcpp_openblas
 dev2: koboldcpp_clblast
@@ -42,7 +42,7 @@ endif

 # keep standard at C11 and C++11
 CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
 LDFLAGS =

 # these are used on windows, to build some libraries with extra old device compatibility
@@ -165,20 +165,34 @@ else ifdef LLAMA_CUDA_DMMV_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
+endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_MMQ_Y
+	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
+else
+	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
+endif # LLAMA_CUDA_MMQ_Y
+#ifdef LLAMA_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_CCBIN
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_METAL
@@ -213,15 +227,15 @@ endif
 DEFAULT_BUILD =
 FAILSAFE_BUILD =
 OPENBLAS_BUILD =
-OPENBLAS_NOAVX2_BUILD =
+NOAVX2_BUILD =
 CLBLAST_BUILD =
 CUBLAS_BUILD =

 ifeq ($(OS),Windows_NT)
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
-	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
+	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)

 	ifdef LLAMA_CUBLAS
@@ -233,7 +247,7 @@ else
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
 	ifdef LLAMA_OPENBLAS
 		OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-		OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+		NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 	endif
 	ifdef LLAMA_CLBLAST
 		ifeq ($(UNAME_S),Darwin)
@@ -283,8 +297,8 @@ ggml_openblas.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
 ggml_failsafe.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
-ggml_openblas_noavx2.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_noavx2.o: ggml.c ggml.h
+	$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
 ggml_clblast.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 ggml_cublas.o: ggml.c ggml.h
@@ -298,15 +312,19 @@ k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@

+#there's no intrinsics or special gpu ops used here, so we can have a universal object
+ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
 #version 2 libs
 ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
 ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
 ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
-ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
-	$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
 ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -327,10 +345,12 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
 	$(CC) $(CFLAGS) -c $< -o $@

 # intermediate objects
-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+console.o: examples/console.cpp examples/console.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 expose.o: expose.cpp expose.h
@@ -348,37 +368,37 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@

 clean:
-	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
+	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so

-main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo

 #generated libraries
-koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o grammar-parser.o $(OBJS)
+koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(DEFAULT_BUILD)
-koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o grammar-parser.o $(OBJS)
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(OPENBLAS_BUILD)
-koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o grammar-parser.o $(OBJS)
+koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(FAILSAFE_BUILD)
-koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o grammar-parser.o $(OBJS)
-	$(OPENBLAS_NOAVX2_BUILD)
-koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o grammar-parser.o $(OBJS)
+koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS)
+	$(NOAVX2_BUILD)
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(CLBLAST_BUILD)
-koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
 	$(CUBLAS_BUILD)

-quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
+quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o k_quants.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o k_quants.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o k_quants.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o k_quants.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
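The new Makefile switches are plain make variables, so a CUDA build can be requested directly from the command line. A minimal sketch, assuming an NVCC toolchain is available; the job count and the choice of target are illustrative, not mandated by the commit:

    # Build the cuBLAS-accelerated library with f16 kernels and the default mmq tile size
    make LLAMA_CUBLAS=1 LLAMA_CUDA_F16=1 LLAMA_CUDA_MMQ_Y=64 koboldcpp_cublas -j8
    # The renamed no-AVX2 target no longer pulls in OpenBLAS
    make koboldcpp_noavx2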

README.md

Lines changed: 14 additions & 12 deletions

@@ -1,8 +1,6 @@
 # koboldcpp

-A self contained distributable from Concedo that exposes llama.cpp function bindings, allowing it to be used via a simulated Kobold API endpoint.
-
-What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. In a tiny package around 20 MB in size, excluding model weights.
+KoboldCpp is an easy-to-use AI text-generation software for GGML models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer.

 ![Preview](media/preview.png)

@@ -49,9 +47,13 @@ For more information, be sure to run the program with the `--help` flag.
 ## Android (Termux) Alternative method
 - See https://github.com/ggerganov/llama.cpp/pull/1828/files

-## CuBLAS?
+## Using CuBLAS
 - If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag, make sure you select the correct .exe with CUDA support.
-- You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. Note that support for CuBLAS is limited.
+- You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC.
+
+## Questions and Help
+- **First, please check out [The KoboldCpp FAQ and Knowledgebase](https://github.com/LostRuins/koboldcpp/wiki) which may already have answers to your questions! Also please search through past issues and discussions.**
+- If you cannot find an answer, open an issue on this github, or find us on the [KoboldAI Discord](https://koboldai.org/discord).

 ## Considerations
 - For Windows: No installation, single file executable, (It Just Works)
@@ -68,11 +70,11 @@ For more information, be sure to run the program with the `--help` flag.
 ## Notes
 - Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` more so.
 - I have heard of someone claiming a false AV positive report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat`
-- Supported GGML models:
-  - LLAMA (All versions including ggml, ggmf, ggjt v1,v2,v3, openllama, gpt4all). Supports CLBlast and OpenBLAS acceleration for all versions.
-  - GPT-2 (All versions, including legacy f16, newer format + quanitzed, cerebras, starcoder) Supports CLBlast and OpenBLAS acceleration for newer formats, no GPU layer offload.
-  - GPT-J (All versions including legacy f16, newer format + quantized, pyg.cpp, new pygmalion, janeway etc.) Supports CLBlast and OpenBLAS acceleration for newer formats, no GPU layer offload.
-  - RWKV (all formats except Q4_1_O).
+- Supported GGML models (Includes backward compatibility for older versions/legacy GGML models, though some newer features might be unavailable):
+  - LLAMA and LLAMA2 (LLaMA / Alpaca / GPT4All / Vicuna / Koala / Pygmalion 7B / Metharme 7B / WizardLM and many more)
+  - GPT-2 / Cerebras
+  - GPT-J
+  - RWKV
   - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama
-  - MPT models (ggjt v3)
-  - Basically every single current and historical GGML format that has ever existed should be supported, except for bloomz.cpp due to lack of demand.
+  - MPT models
+
80+
