Vllm 转 Ollama 接口-北京尧图网络科技有限公司

发布时间：2026/6/24 2:59:45

最新版本VisualStudio已经可以接入其他大语言模型了通过管理模型的接口进入设置但是他不支持其他的vllm、llama.cpp接口而且ollama接口也只支持本地lhttp://localhost:11434,其他不支持好像到以上位置这个点击添加按钮没啥用改地址也不行直接用大模型帮忙写一个脚本直接做接口转换把VS请求的localhost接口转到已经部署的vllm接口其他接口类似就行但是需要注意的是要实现完整的ollama接口#!/usr/bin/env python3 Ollama to vLLM 简单接口转发 import json import os import sys from datetime import datetime from fastapi import FastAPI, Request, HTTPException from fastapi.responses import StreamingResponse, JSONResponse import httpx # 配置 VLLM_URL http://172.16.1.15:8000 VLLM_API_KEY # 如果有 API key 填这里 OLLAMA_NAME qwen3.6-27b # 本地 Ollama 接口显示的模型名 VLLM_MODEL /data/ai/models/Qwen3.6-27B # vLLM 上的模型名 # app FastAPI() client httpx.AsyncClient(timeout300.0) def log_request(method, url, bodyNone, headersNone): 打印请求日志 print(f\n{*60}, flushTrue) print(f[{datetime.now().strftime(%H:%M:%S)}] {method} {url}, flushTrue) if headers: print(fHeaders: {json.dumps(headers, ensure_asciiFalse)}, flushTrue) if body: print(fBody: {json.dumps(body, ensure_asciiFalse, indent2)}, flushTrue) print(f{*60}\n, flushTrue) def log_response(status_code, dataNone): 打印响应日志 print(f[{datetime.now().strftime(%H:%M:%S)}] Response: {status_code}, flushTrue) if data: print(fData: {json.dumps(data, ensure_asciiFalse, indent2)}, flushTrue) print(f{*60}\n, flushTrue) app.get(/api/tags) async def get_models(): 获取模型列表 - 直接调用 vLLM url f{VLLM_URL}/v1/models log_request(GET, url) resp await client.get(url) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: return {models: [{name: OLLAMA_NAME, model: OLLAMA_NAME}]} data resp.json() models [] for m in data.get(data, []): max_len m.get(max_model_len, 131072) models.append({ name: OLLAMA_NAME, model: OLLAMA_NAME, modified_at: datetime.utcfromtimestamp(m.get(created, 0)).strftime(%Y-%m-%dT%H:%M:%S) 00:00, size: 0, digest: , details: { parent_model: m.get(parent, ) or , format: vllm, family: qwen3, families: [qwen3], parameter_size: 27B, quantization_level: FP16, context_length: max_len, embedding_length: 5120 }, capabilities: [vision, completion, tools] }) return {models: models if models else [{name: OLLAMA_NAME, model: OLLAMA_NAME}]} app.get(/api/ps) async def running_models(): 列出正在运行的模型 return {models: [{name: OLLAMA_NAME, model: OLLAMA_NAME}]} app.post(/api/show) async def show_model(request: Request): Ollama 模型信息接口 - 直接调用 vLLM url f{VLLM_URL}/v1/models log_request(GET, url) resp await client.get(url) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: return {license: , modelfile: , parameters: } data resp.json() model_info data.get(data, [{}])[0] max_len model_info.get(max_model_len, 131072) # 构建参数字符串 params [ fmax_model_len: {max_len}, fcreated: {model_info.get(created, )}, fowned_by: {model_info.get(owned_by, )} ] return { license: , modelfile: fFROM {VLLM_MODEL}\n\nPARAMETER max_model_len {max_len}\n\nSYSTEM \\\You are a helpful assistant.\\\, parameters: \n.join(params), details: { parent_model: model_info.get(parent, ) or , format: vllm, family: qwen3, families: [qwen3], parameter_size: 27B, quantization_level: FP16, context_length: max_len, embedding_length: 5120 }, capabilities: [vision, completion, tools] } app.post(/api/chat) async def chat(request: Request): 聊天接口 - Ollama 格式转 vLLM 格式支持工具调用和图像理解 body await request.json() messages body.get(messages, []) stream body.get(stream, False) vllm_request { model: VLLM_MODEL, messages: messages, temperature: body.get(temperature, 1.0), top_p: body.get(top_p, 1.0), stream: stream } # 支持工具调用 if body.get(tools): vllm_request[tools] body[tools] # 支持工具结果 if body.get(tool_choice): vllm_request[tool_choice] body[tool_choice] # 支持响应格式 if body.get(response_format): vllm_request[response_format] body[response_format] if body.get(max_tokens): vllm_request[max_tokens] body[max_tokens] headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} if stream: log_request(POST, f{VLLM_URL}/v1/chat/completions, vllm_request, headers) return StreamingResponse( stream_response(vllm_request, headers), media_typeapplication/x-ndjson ) else: log_request(POST, f{VLLM_URL}/v1/chat/completions, vllm_request, headers) resp await client.post(f{VLLM_URL}/v1/chat/completions, jsonvllm_request, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() choice data[choices][0] result { model: OLLAMA_NAME, message: {role: assistant, content: choice[message][content]}, done: True } # 支持工具调用返回 if choice[message].get(tool_calls): result[message][tool_calls] choice[message][tool_calls] return result async def stream_response(vllm_request, headers): 流式响应 log_request(POST, f{VLLM_URL}/v1/chat/completions, vllm_request, headers) async with client.stream(POST, f{VLLM_URL}/v1/chat/completions, jsonvllm_request, headersheaders) as resp: print(f[{datetime.now().strftime(%H:%M:%S)}] 流式响应状态: {resp.status_code}, flushTrue) async for line in resp.aiter_lines(): if line.startswith(data: ): data line[6:] if data [DONE]: yield {done:true}\n break try: chunk json.loads(data) content chunk[choices][0].get(delta, {}).get(content, ) if content: yield json.dumps({model: OLLAMA_NAME, message: {role: assistant, content: content}, done: False}) \n except (json.JSONDecodeError, KeyError): continue app.post(/api/generate) async def generate(request: Request): 文本生成接口 body await request.json() vllm_request { model: VLLM_MODEL, prompt: body.get(prompt, ), temperature: body.get(temperature, 1.0), top_p: body.get(top_p, 1.0), stream: body.get(stream, False) } headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} if body.get(stream): log_request(POST, f{VLLM_URL}/v1/completions, vllm_request, headers) return StreamingResponse( stream_generate(vllm_request, headers), media_typeapplication/x-ndjson ) else: log_request(POST, f{VLLM_URL}/v1/completions, vllm_request, headers) resp await client.post(f{VLLM_URL}/v1/completions, jsonvllm_request, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() return { model: OLLAMA_NAME, response: data[choices][0][text], done: True } async def stream_generate(vllm_request, headers): 流式生成 log_request(POST, f{VLLM_URL}/v1/completions, vllm_request, headers) async with client.stream(POST, f{VLLM_URL}/v1/completions, jsonvllm_request, headersheaders) as resp: print(f[{datetime.now().strftime(%H:%M:%S)}] 流式生成状态: {resp.status_code}, flushTrue) async for line in resp.aiter_lines(): if line.startswith(data: ): data line[6:] if data [DONE]: yield {done:true}\n break try: chunk json.loads(data) text chunk[choices][0].get(text, ) if text: yield json.dumps({model: OLLAMA_NAME, response: text, done: False}) \n except (json.JSONDecodeError, KeyError): continue app.post(/api/embed) async def embed(request: Request): 生成文本嵌入向量 - 调用 vLLM body await request.json() inputs body.get(input, []) if not isinstance(inputs, list): inputs [inputs] headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} vllm_body {model: VLLM_MODEL, input: inputs} log_request(POST, f{VLLM_URL}/v1/embeddings, vllm_body, headers) resp await client.post( f{VLLM_URL}/v1/embeddings, jsonvllm_body, headersheaders ) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() embeddings [item.get(embedding, []) for item in data.get(data, [])] return {embeddings: embeddings} app.post(/api/embeddings) async def embeddings(request: Request): 旧版嵌入接口 - 调用 vLLM body await request.json() prompt body.get(prompt, ) headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} vllm_body {model: VLLM_MODEL, input: prompt} log_request(POST, f{VLLM_URL}/v1/embeddings, vllm_body, headers) resp await client.post( f{VLLM_URL}/v1/embeddings, jsonvllm_body, headersheaders ) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() embedding data.get(data, [{}])[0].get(embedding, []) return {embedding: embedding} app.post(/v1/chat/completions) async def v1_chat(request: Request): OpenAI 格式接口 - 直接透传到 vLLM body await request.json() body[model] VLLM_MODEL headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} if body.get(stream, False): log_request(POST, f{VLLM_URL}/v1/chat/completions, body, headers) return StreamingResponse( stream_v1_chat(body, headers), media_typetext/event-stream ) else: log_request(POST, f{VLLM_URL}/v1/chat/completions, body, headers) resp await client.post(f{VLLM_URL}/v1/chat/completions, jsonbody, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) return resp.json() async def stream_v1_chat(body, headers): OpenAI 格式流式响应 log_request(POST, f{VLLM_URL}/v1/chat/completions, body, headers) async with client.stream(POST, f{VLLM_URL}/v1/chat/completions, jsonbody, headersheaders) as resp: print(f[{datetime.now().strftime(%H:%M:%S)}] v1 流式响应状态: {resp.status_code}, flushTrue) async for line in resp.aiter_lines(): yield line \n app.post(/v1/completions) async def v1_completions(request: Request): OpenAI 格式补全接口 - 直接透传到 vLLM body await request.json() body[model] VLLM_MODEL headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} log_request(POST, f{VLLM_URL}/v1/completions, body, headers) resp await client.post(f{VLLM_URL}/v1/completions, jsonbody, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) return resp.json() app.get(/v1/models) async def v1_models(): OpenAI 格式模型列表 url f{VLLM_URL}/v1/models log_request(GET, url) resp await client.get(url) log_response(resp.status_code, resp.json()) return resp.json() if __name__ __main__: import uvicorn print(启动 Ollama 代理: http://127.0.0.1:11434) print(f转发到 vLLM: {VLLM_URL}) uvicorn.run(app, host0.0.0.0, port11434)运行以上代码 python main.py然后回到VS自带模型点击添加出现实现的模型接口勾选模型点击保存就可以通过模型选择进行指定刚才设置的模型了

相关新闻

2026/6/24 1:59:45

BitCloud ZigBee数据分片与节点参数配置实战指南

1. 项目概述：为什么需要关注数据分片与节点参数？如果你正在用BitCloud这个ZigBee协议栈做开发，尤其是涉及到需要传输的数据量稍微大一点，或者网络里节点一多起来，大概率会遇到两个让人头疼的问题：一是数据包…

相关新闻

BitCloud ZigBee数据分片与节点参数配置实战指南

ZigBee 3.0开发实战：BDB、ZCL与ZGP核心组件详解

嵌入式固件升级实战：基于FLIP工具与Atmel芯片的加密认证与传感器调试

微电网控制柜选型5大误区，90%项目都踩过

从 OpenRouter 到 Vapeur AI：多模型统一接入为什么会成为开发标配？

Minecraft世界转换终极指南：如何用Chunker实现跨平台存档共享

多智能体推演：香港黄金清算系统7月上线，银行备货400盎司金条的底层逻辑——因果推断模型

Vllm 转 Ollama 接口

BitCloud ZigBee数据分片与节点参数配置实战指南

嵌入式语音编解码实战：G.726 ADPCM库集成与优化指南

ITU656格式化器寄存器配置实战：VBI数据处理与VCR特技播放兼容性

嵌入式GUI开发实战：emWin环境搭建、配置优化与性能调优指南

TaskJuggler脚本编程入门：用代码实现自动化项目管理

BitCloud SDK实战：SAMR21与ATmegaRFR2 Zigbee节点固件烧录与配置指南

2026年GEO信源媒体发稿平台全盘点：三种模式、代表玩家与适用场景

GIT修改用户名

Win11Debloat：让你的Windows系统重获新生的终极优化工具

技术深度解析：m4s-converter实现原理与B站缓存视频转换最佳实践