crow/models.json at main · kh0pper/crow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
{
  "$comment": "Provider registry for Crow's multi-agent orchestrator. See docs/plans/cached-nibbling-clover.md.",
  "$schemaVersion": "1.0",
  "providers": {
    "crow-voice": {
      "$description": "Fast voice/dispatch model for the AI Companion. Qwen3.5-4B (text-only) BF16 via vLLM-ROCm on Strix Halo gfx1151, :8011. Text-only: the model is natively vision-language but its ViT encoder OOMs (256 GiB) during vLLM-ROCm multimodal profiling on gfx1151, so image/video input is disabled (--limit-mm-per-prompt). Vision-bearing turns escalate to crow-chat (35B, multimodal) or grackle-vision. It has no mutexGroup and is alwaysResident, so it co-resides with the warm 35B and never evicts it.",
      "baseUrl": "http://100.118.41.122:8011/v1",
      "apiKey": "none",
      "host": "local",
      "bundleId": "vllm-rocm-qwen35-4b",
      "alwaysResident": true,
      "models": [
        {
          "id": "qwen3.5-4b",
          "contextLen": 8192,
          "warm": true,
          "priority": "voice"
        }
      ]
    },
    "crow-llm": {
      "$description": "Gateway LLM-router endpoint (servers/gateway/routes/llm-router.js, POST /llm/v1). A virtual provider: the AI Companion and Meta Glasses point their LLM base_url here, and the route picks fast (crow-voice/4B) vs escalate (crow-chat/35B) per turn, warming the cold model first. NOT backed by a bundle (no bundleId) so the gpu-orchestrator never manages it; nothing auto-routes to it (smart-router/presets resolve by provider_id, never crow-llm), so there is no loop — the route resolves to crow-voice/crow-chat, never back to crow-llm. Advertises both model ids for client model-selector probes.",
      "baseUrl": "http://localhost:3001/llm/v1",
      "apiKey": "none",
      "host": "local",
      "models": [
        {
          "id": "qwen3.5-4b",
          "contextLen": 8192,
          "warm": true,
          "priority": "voice"
        },
        {
          "id": "qwen3.6-35b-a3b",
          "contextLen": 262144,
          "warm": false,
          "priority": "interactive"
        }
      ]
    },
    "crow-dispatch": {
      "$description": "Fast tool-dispatch + Maker Lab classroom endpoint. Strix Halo gfx1151 via vLLM-ROCm.",
      "baseUrl": "http://100.118.41.122:8001/v1",
      "apiKey": "none",
      "host": "local",
      "bundleId": "vllm-rocm-qwen3",
      "mutexGroup": "crow-strix-vram",
      "models": [
        {
          "id": "qwen3-4b",
          "contextLen": 32768,
          "warm": true,
          "priority": "maker_lab"
        }
      ]
    },
    "crow-chat": {
      "$description": "Daily-driver MoE (3B active) + vision. Qwen3.6-35B-A3B UD-Q6_K + MTP (--spec-type draft-mtp --spec-draft-n-max 2) + mmproj-F16 via llama.cpp Vulkan RADV (vulkan-radv-mtp toolbox) on Strix Halo. Benchmarked 2026-05-23: ~66 tok/s single-stream (MTP forces -np 1), KLD 0.012 vs BF16 (near-lossless); 35B is faster on Vulkan than ROCm. Vision + MTP co-load (verified).",
      "baseUrl": "http://100.118.41.122:8003/v1",
      "apiKey": "none",
      "host": "local",
      "bundleId": "llamacpp-vulkan-qwen36-35b-a3b",
      "mutexGroup": "crow-strix-vram",
      "defaultMember": true,
      "models": [
        {
          "id": "qwen3.6-35b-a3b",
          "contextLen": 262144,
          "warm": true,
          "priority": "interactive"
        }
      ]
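    },
    "crow-swap-coder": {
      "$description": "On-demand coder specialist. Qwen3-Coder-30B-A3B MoE via llama.cpp Vulkan AMDVLK on Strix Halo. Mutexed with the other crow-strix-vram members (crow-dispatch, crow-chat, crow-swap-deep) on Strix Halo's unified 124 GB pool (crow-voice and crow-llm are not in the group).",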
      "baseUrl": "http://100.118.41.122:8003/v1",
      "apiKey": "none",
      "host": "local",
      "bundleId": "llamacpp-vulkan-qwen3-coder",
      "mutexGroup": "crow-strix-vram",
      "models": [
        {
          "id": "qwen3-coder-30b-a3b",
          "contextLen": 32768,
          "warm": false,
          "onDemand": true,
          "priority": "interactive"
        }
      ]
    },
    "crow-swap-deep": {
      "$description": "On-demand deep-reasoning specialist. GLM-4.5-Air 106B/12B MoE via llama.cpp Vulkan AMDVLK. Mutexed with the other crow-strix-vram members (crow-dispatch, crow-chat, crow-swap-coder) on Strix Halo's unified 124 GB pool (crow-voice and crow-llm are not in the group).",
      "baseUrl": "http://100.118.41.122:8003/v1",
      "apiKey": "none",
      "host": "local",
      "bundleId": "llamacpp-vulkan-glm-45-air",
      "mutexGroup": "crow-strix-vram",
      "models": [
        {
          "id": "glm-4.5-air",
          "contextLen": 16384,
          "warm": false,
          "onDemand": true,
          "conflictsWith": ["crow-chat"],
          "priority": "batch"
        }
      ]
    },
    "grackle-embed": {
      "$description": "Text embeddings for semantic memory/research/blog search. Qwen3-Embedding-0.6B BF16 via vLLM-CUDA on grackle. Always-resident (kept up by gpu-orchestrator) because embed outages silently break memory stores.",
      "baseUrl": "http://100.121.254.89:9100/v1",
      "apiKey": "none",
      "host": "grackle-5fc01ac74463b6f4",
      "bundleId": "vllm-cuda-embed",
      "alwaysResident": true,
      "models": [
        {
          "id": "qwen3-embedding-0.6b",
          "task": "embed",
          "dim": 1024,
          "matryoshkaDims": [1024, 768, 512, 256],
          "warm": true,
          "priority": "interactive"
        }
      ]
    },
    "grackle-rerank": {
      "$description": "Cross-encoder reranker for top-K reordering after hybrid FTS+vector retrieval. Qwen3-Reranker-0.6B BF16 via vLLM-CUDA on grackle. Shares mutex group grackle-specialists-swap with grackle-vision on the 16 GB card.",
      "baseUrl": "http://100.121.254.89:9101/v1",
      "apiKey": "none",
      "host": "grackle-5fc01ac74463b6f4",
      "bundleId": "vllm-cuda-rerank",
      "mutexGroup": "grackle-specialists-swap",
      "defaultMember": true,
      "models": [
        {
          "id": "qwen3-reranker-0.6b",
          "task": "score",
          "warm": true,
          "priority": "interactive"
        }
      ]
    },
    "grackle-vision": {
      "$description": "Vision-language model for image understanding + OCR + structured output. Qwen3-VL-4B-Instruct-FP8 via vLLM-CUDA on grackle. Swapped in on demand by gpu-orchestrator; mutually exclusive with grackle-rerank (shared mutexGroup grackle-specialists-swap).",
      "baseUrl": "http://100.121.254.89:9102/v1",
      "apiKey": "none",
      "host": "grackle-5fc01ac74463b6f4",
      "bundleId": "vllm-cuda-vision",
      "mutexGroup": "grackle-specialists-swap",
      "models": [
        {
          "id": "qwen3-vl-4b-instruct-fp8",
          "contextLen": 16384,
          "warm": false,
          "onDemand": true,
          "priority": "interactive"
        }
      ]
    }
  }
}