fix: correct linewrap think display in generic openai api (#13260)

Signed-off-by: xhe <xw897002528@gmail.com>
xhe 2025-02-06 10:53:08 +08:00 committed by GitHub
parent 7673c36af3
commit da2ee04fce
4 changed files with 26 additions and 440 deletions

View File

@@ -1,13 +1,10 @@
-import json
 from collections.abc import Generator
 from typing import Optional, Union
 
-import requests
 from yarl import URL
 
-from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
+from core.model_runtime.entities.llm_entities import LLMMode, LLMResult
 from core.model_runtime.entities.message_entities import (
-    AssistantPromptMessage,
     PromptMessage,
     PromptMessageTool,
 )
@@ -39,208 +36,3 @@ class DeepseekLargeLanguageModel(OAIAPICompatLargeLanguageModel):
         credentials["mode"] = LLMMode.CHAT.value
         credentials["function_calling_type"] = "tool_call"
         credentials["stream_function_calling"] = "support"
-
-    def _handle_generate_stream_response(
-        self, model: str, credentials: dict, response: requests.Response, prompt_messages: list[PromptMessage]
-    ) -> Generator:
-        """
-        Handle llm stream response
-
-        :param model: model name
-        :param credentials: model credentials
-        :param response: streamed response
-        :param prompt_messages: prompt messages
-        :return: llm response chunk generator
-        """
-        full_assistant_content = ""
-        chunk_index = 0
-        is_reasoning_started = False  # Add flag to track reasoning state
-
-        def create_final_llm_result_chunk(
-            id: Optional[str], index: int, message: AssistantPromptMessage, finish_reason: str, usage: dict
-        ) -> LLMResultChunk:
-            # calculate num tokens
-            prompt_tokens = usage and usage.get("prompt_tokens")
-            if prompt_tokens is None:
-                prompt_tokens = self._num_tokens_from_string(model, prompt_messages[0].content)
-            completion_tokens = usage and usage.get("completion_tokens")
-            if completion_tokens is None:
-                completion_tokens = self._num_tokens_from_string(model, full_assistant_content)
-
-            # transform usage
-            usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
-
-            return LLMResultChunk(
-                id=id,
-                model=model,
-                prompt_messages=prompt_messages,
-                delta=LLMResultChunkDelta(index=index, message=message, finish_reason=finish_reason, usage=usage),
-            )
-
-        # delimiter for stream response, need unicode_escape
-        import codecs
-
-        delimiter = credentials.get("stream_mode_delimiter", "\n\n")
-        delimiter = codecs.decode(delimiter, "unicode_escape")
-
-        tools_calls: list[AssistantPromptMessage.ToolCall] = []
-
-        def increase_tool_call(new_tool_calls: list[AssistantPromptMessage.ToolCall]):
-            def get_tool_call(tool_call_id: str):
-                if not tool_call_id:
-                    return tools_calls[-1]
-
-                tool_call = next((tool_call for tool_call in tools_calls if tool_call.id == tool_call_id), None)
-                if tool_call is None:
-                    tool_call = AssistantPromptMessage.ToolCall(
-                        id=tool_call_id,
-                        type="function",
-                        function=AssistantPromptMessage.ToolCall.ToolCallFunction(name="", arguments=""),
-                    )
-                    tools_calls.append(tool_call)
-
-                return tool_call
-
-            for new_tool_call in new_tool_calls:
-                # get tool call
-                tool_call = get_tool_call(new_tool_call.function.name)
-                # update tool call
-                if new_tool_call.id:
-                    tool_call.id = new_tool_call.id
-                if new_tool_call.type:
-                    tool_call.type = new_tool_call.type
-                if new_tool_call.function.name:
-                    tool_call.function.name = new_tool_call.function.name
-                if new_tool_call.function.arguments:
-                    tool_call.function.arguments += new_tool_call.function.arguments
-
-        finish_reason = None  # The default value of finish_reason is None
-        message_id, usage = None, None
-        for chunk in response.iter_lines(decode_unicode=True, delimiter=delimiter):
-            chunk = chunk.strip()
-            if chunk:
-                # ignore sse comments
-                if chunk.startswith(":"):
-                    continue
-                decoded_chunk = chunk.strip().removeprefix("data:").lstrip()
-                if decoded_chunk == "[DONE]":  # Some provider returns "data: [DONE]"
-                    continue
-
-                try:
-                    chunk_json: dict = json.loads(decoded_chunk)
-                # stream ended
-                except json.JSONDecodeError as e:
-                    yield create_final_llm_result_chunk(
-                        id=message_id,
-                        index=chunk_index + 1,
-                        message=AssistantPromptMessage(content=""),
-                        finish_reason="Non-JSON encountered.",
-                        usage=usage,
-                    )
-                    break
-                # handle the error here. for issue #11629
-                if chunk_json.get("error") and chunk_json.get("choices") is None:
-                    raise ValueError(chunk_json.get("error"))
-
-                if chunk_json:
-                    if u := chunk_json.get("usage"):
-                        usage = u
-                if not chunk_json or len(chunk_json["choices"]) == 0:
-                    continue
-
-                choice = chunk_json["choices"][0]
-                finish_reason = chunk_json["choices"][0].get("finish_reason")
-                message_id = chunk_json.get("id")
-                chunk_index += 1
-
-                if "delta" in choice:
-                    delta = choice["delta"]
-                    is_reasoning = delta.get("reasoning_content")
-                    delta_content = delta.get("content") or delta.get("reasoning_content")
-
-                    assistant_message_tool_calls = None
-
-                    if "tool_calls" in delta and credentials.get("function_calling_type", "no_call") == "tool_call":
-                        assistant_message_tool_calls = delta.get("tool_calls", None)
-                    elif (
-                        "function_call" in delta
-                        and credentials.get("function_calling_type", "no_call") == "function_call"
-                    ):
-                        assistant_message_tool_calls = [
-                            {"id": "tool_call_id", "type": "function", "function": delta.get("function_call", {})}
-                        ]
-
-                    # assistant_message_function_call = delta.delta.function_call
-
-                    # extract tool calls from response
-                    if assistant_message_tool_calls:
-                        tool_calls = self._extract_response_tool_calls(assistant_message_tool_calls)
-                        increase_tool_call(tool_calls)
-
-                    if delta_content is None or delta_content == "":
-                        continue
-
-                    # Add markdown quote markers for reasoning content
-                    if is_reasoning:
-                        if not is_reasoning_started:
-                            delta_content = "> 💭 " + delta_content
-                            is_reasoning_started = True
-                        elif "\n\n" in delta_content:
-                            delta_content = delta_content.replace("\n\n", "\n> ")
-                        elif "\n" in delta_content:
-                            delta_content = delta_content.replace("\n", "\n> ")
-                    elif is_reasoning_started:
-                        # If we were in reasoning mode but now getting regular content,
-                        # add \n\n to close the reasoning block
-                        delta_content = "\n\n" + delta_content
-                        is_reasoning_started = False
-
-                    # transform assistant message to prompt message
-                    assistant_prompt_message = AssistantPromptMessage(
-                        content=delta_content,
-                    )
-
-                    # reset tool calls
-                    tool_calls = []
-                    full_assistant_content += delta_content
-                elif "text" in choice:
-                    choice_text = choice.get("text", "")
-                    if choice_text == "":
-                        continue
-
-                    # transform assistant message to prompt message
-                    assistant_prompt_message = AssistantPromptMessage(content=choice_text)
-                    full_assistant_content += choice_text
-                else:
-                    continue
-
-                yield LLMResultChunk(
-                    id=message_id,
-                    model=model,
-                    prompt_messages=prompt_messages,
-                    delta=LLMResultChunkDelta(
-                        index=chunk_index,
-                        message=assistant_prompt_message,
-                    ),
-                )
-
-            chunk_index += 1
-
-        if tools_calls:
-            yield LLMResultChunk(
-                id=message_id,
-                model=model,
-                prompt_messages=prompt_messages,
-                delta=LLMResultChunkDelta(
-                    index=chunk_index,
-                    message=AssistantPromptMessage(tool_calls=tools_calls, content=""),
-                ),
-            )
-
-        yield create_final_llm_result_chunk(
-            id=message_id,
-            index=chunk_index,
-            message=AssistantPromptMessage(content=""),
-            finish_reason=finish_reason,
-            usage=usage,
-        )

View File

@@ -1,5 +1,6 @@
 import json
 import logging
+import re
 from collections.abc import Generator
 from decimal import Decimal
 from typing import Optional, Union, cast
@@ -515,6 +516,8 @@ class OAIAPICompatLargeLanguageModel(_CommonOaiApiCompat, LargeLanguageModel):
                 if "delta" in choice:
                     delta = choice["delta"]
                     delta_content = delta.get("content")
+                    if not delta_content:
+                        delta_content = ""
 
                     if not is_reasoning_started_tag and "<think>" in delta_content:
                         is_reasoning_started_tag = True
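The None-guard added above matters because some providers send reasoning-only chunks whose delta carries `reasoning_content` but a null `content`; without the guard, the following `"<think>" in delta_content` membership test would raise a TypeError on None. A minimal illustration (the dict literal is hypothetical test data, not part of the diff):

    # Hypothetical reasoning-only chunk: "content" is None, only reasoning_content is set.
    delta = {"reasoning_content": "thinking...", "content": None}
    delta_content = delta.get("content")
    if not delta_content:
        delta_content = ""  # guard added in this commit
    print("<think>" in delta_content)  # False, instead of TypeError on None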
@@ -523,20 +526,21 @@ class OAIAPICompatLargeLanguageModel(_CommonOaiApiCompat, LargeLanguageModel):
                         delta_content = delta_content.replace("</think>", "") + "\n\n"
                         is_reasoning_started_tag = False
                     elif is_reasoning_started_tag:
-                        if "\n\n" in delta_content:
-                            delta_content = delta_content.replace("\n\n", "\n> ")
-                        elif "\n" in delta_content:
-                            delta_content = delta_content.replace("\n", "\n> ")
+                        if "\n" in delta_content:
+                            delta_content = re.sub(r"\n(?!(>|\n))", "\n> ", delta_content)
 
                     reasoning_content = delta.get("reasoning_content")
-                    if reasoning_content:
+                    if is_reasoning_started and not reasoning_content and not delta_content:
+                        delta_content = ""
+                    elif reasoning_content:
                         if not is_reasoning_started:
                             delta_content = "> 💭 " + reasoning_content
                             is_reasoning_started = True
-                        elif "\n\n" in delta_content:
-                            delta_content = reasoning_content.replace("\n\n", "\n> ")
-                        elif "\n" in delta_content:
-                            delta_content = reasoning_content.replace("\n", "\n> ")
+                        else:
+                            delta_content = reasoning_content
+
+                        if "\n" in delta_content:
+                            delta_content = re.sub(r"\n(?!(>|\n))", "\n> ", delta_content)
                     elif is_reasoning_started:
                         # If we were in reasoning mode but now getting regular content,
                         # add \n\n to close the reasoning block
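For reference, a minimal sketch of what the new `re.sub`-based quoting does to a multi-line reasoning fragment; the helper name below is made up for illustration and is not part of the codebase. The pattern `\n(?!(>|\n))` only rewrites newlines that are not already followed by `>` or another newline, so wrapped reasoning lines are quoted exactly once instead of being mangled by the old `"\n\n"`/`"\n"` replace chain:

    import re

    def quote_reasoning_delta(delta_content: str) -> str:
        # Prefix "> " after any newline not already followed by ">" or a blank line.
        if "\n" in delta_content:
            delta_content = re.sub(r"\n(?!(>|\n))", "\n> ", delta_content)
        return delta_content

    print(quote_reasoning_delta("first thought\nsecond thought\n> already quoted"))
    # first thought
    # > second thought
    # > already quoted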

View File

@@ -1,13 +1,9 @@
-import json
 from collections.abc import Generator
 from typing import Optional, Union
 
-import requests
-
 from core.model_runtime.entities.common_entities import I18nObject
-from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
+from core.model_runtime.entities.llm_entities import LLMMode, LLMResult
 from core.model_runtime.entities.message_entities import (
-    AssistantPromptMessage,
     PromptMessage,
     PromptMessageTool,
 )
@@ -96,208 +92,3 @@ class SiliconflowLargeLanguageModel(OAIAPICompatLargeLanguageModel):
                 ),
             ],
         )
-
-    def _handle_generate_stream_response(
-        self, model: str, credentials: dict, response: requests.Response, prompt_messages: list[PromptMessage]
-    ) -> Generator:
-        """
-        Handle llm stream response
-
-        :param model: model name
-        :param credentials: model credentials
-        :param response: streamed response
-        :param prompt_messages: prompt messages
-        :return: llm response chunk generator
-        """
-        full_assistant_content = ""
-        chunk_index = 0
-        is_reasoning_started = False  # Add flag to track reasoning state
-
-        def create_final_llm_result_chunk(
-            id: Optional[str], index: int, message: AssistantPromptMessage, finish_reason: str, usage: dict
-        ) -> LLMResultChunk:
-            # calculate num tokens
-            prompt_tokens = usage and usage.get("prompt_tokens")
-            if prompt_tokens is None:
-                prompt_tokens = self._num_tokens_from_string(model, prompt_messages[0].content)
-            completion_tokens = usage and usage.get("completion_tokens")
-            if completion_tokens is None:
-                completion_tokens = self._num_tokens_from_string(model, full_assistant_content)
-
-            # transform usage
-            usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
-
-            return LLMResultChunk(
-                id=id,
-                model=model,
-                prompt_messages=prompt_messages,
-                delta=LLMResultChunkDelta(index=index, message=message, finish_reason=finish_reason, usage=usage),
-            )
-
-        # delimiter for stream response, need unicode_escape
-        import codecs
-
-        delimiter = credentials.get("stream_mode_delimiter", "\n\n")
-        delimiter = codecs.decode(delimiter, "unicode_escape")
-
-        tools_calls: list[AssistantPromptMessage.ToolCall] = []
-
-        def increase_tool_call(new_tool_calls: list[AssistantPromptMessage.ToolCall]):
-            def get_tool_call(tool_call_id: str):
-                if not tool_call_id:
-                    return tools_calls[-1]
-
-                tool_call = next((tool_call for tool_call in tools_calls if tool_call.id == tool_call_id), None)
-                if tool_call is None:
-                    tool_call = AssistantPromptMessage.ToolCall(
-                        id=tool_call_id,
-                        type="function",
-                        function=AssistantPromptMessage.ToolCall.ToolCallFunction(name="", arguments=""),
-                    )
-                    tools_calls.append(tool_call)
-
-                return tool_call
-
-            for new_tool_call in new_tool_calls:
-                # get tool call
-                tool_call = get_tool_call(new_tool_call.function.name)
-                # update tool call
-                if new_tool_call.id:
-                    tool_call.id = new_tool_call.id
-                if new_tool_call.type:
-                    tool_call.type = new_tool_call.type
-                if new_tool_call.function.name:
-                    tool_call.function.name = new_tool_call.function.name
-                if new_tool_call.function.arguments:
-                    tool_call.function.arguments += new_tool_call.function.arguments
-
-        finish_reason = None  # The default value of finish_reason is None
-        message_id, usage = None, None
-        for chunk in response.iter_lines(decode_unicode=True, delimiter=delimiter):
-            chunk = chunk.strip()
-            if chunk:
-                # ignore sse comments
-                if chunk.startswith(":"):
-                    continue
-                decoded_chunk = chunk.strip().removeprefix("data:").lstrip()
-                if decoded_chunk == "[DONE]":  # Some provider returns "data: [DONE]"
-                    continue
-
-                try:
-                    chunk_json: dict = json.loads(decoded_chunk)
-                # stream ended
-                except json.JSONDecodeError as e:
-                    yield create_final_llm_result_chunk(
-                        id=message_id,
-                        index=chunk_index + 1,
-                        message=AssistantPromptMessage(content=""),
-                        finish_reason="Non-JSON encountered.",
-                        usage=usage,
-                    )
-                    break
-                # handle the error here. for issue #11629
-                if chunk_json.get("error") and chunk_json.get("choices") is None:
-                    raise ValueError(chunk_json.get("error"))
-
-                if chunk_json:
-                    if u := chunk_json.get("usage"):
-                        usage = u
-                if not chunk_json or len(chunk_json["choices"]) == 0:
-                    continue
-
-                choice = chunk_json["choices"][0]
-                finish_reason = chunk_json["choices"][0].get("finish_reason")
-                message_id = chunk_json.get("id")
-                chunk_index += 1
-
-                if "delta" in choice:
-                    delta = choice["delta"]
-                    delta_content = delta.get("content")
-
-                    assistant_message_tool_calls = None
-
-                    if "tool_calls" in delta and credentials.get("function_calling_type", "no_call") == "tool_call":
-                        assistant_message_tool_calls = delta.get("tool_calls", None)
-                    elif (
-                        "function_call" in delta
-                        and credentials.get("function_calling_type", "no_call") == "function_call"
-                    ):
-                        assistant_message_tool_calls = [
-                            {"id": "tool_call_id", "type": "function", "function": delta.get("function_call", {})}
-                        ]
-
-                    # assistant_message_function_call = delta.delta.function_call
-
-                    # extract tool calls from response
-                    if assistant_message_tool_calls:
-                        tool_calls = self._extract_response_tool_calls(assistant_message_tool_calls)
-                        increase_tool_call(tool_calls)
-
-                    if delta_content is None or delta_content == "":
-                        continue
-
-                    # Check for think tags
-                    if "<think>" in delta_content:
-                        is_reasoning_started = True
-                        # Remove <think> tag and add markdown quote
-                        delta_content = "> 💭 " + delta_content.replace("<think>", "")
-                    elif "</think>" in delta_content:
-                        # Remove </think> tag and add newlines to end quote block
-                        delta_content = delta_content.replace("</think>", "") + "\n\n"
-                        is_reasoning_started = False
-                    elif is_reasoning_started:
-                        # Add quote markers for content within thinking block
-                        if "\n\n" in delta_content:
-                            delta_content = delta_content.replace("\n\n", "\n> ")
-                        elif "\n" in delta_content:
-                            delta_content = delta_content.replace("\n", "\n> ")
-
-                    # transform assistant message to prompt message
-                    assistant_prompt_message = AssistantPromptMessage(
-                        content=delta_content,
-                    )
-
-                    # reset tool calls
-                    tool_calls = []
-                    full_assistant_content += delta_content
-                elif "text" in choice:
-                    choice_text = choice.get("text", "")
-                    if choice_text == "":
-                        continue
-
-                    # transform assistant message to prompt message
-                    assistant_prompt_message = AssistantPromptMessage(content=choice_text)
-                    full_assistant_content += choice_text
-                else:
-                    continue
-
-                yield LLMResultChunk(
-                    id=message_id,
-                    model=model,
-                    prompt_messages=prompt_messages,
-                    delta=LLMResultChunkDelta(
-                        index=chunk_index,
-                        message=assistant_prompt_message,
-                    ),
-                )
-
-            chunk_index += 1
-
-        if tools_calls:
-            yield LLMResultChunk(
-                id=message_id,
-                model=model,
-                prompt_messages=prompt_messages,
-                delta=LLMResultChunkDelta(
-                    index=chunk_index,
-                    message=AssistantPromptMessage(tool_calls=tools_calls, content=""),
-                ),
-            )
-
-        yield create_final_llm_result_chunk(
-            id=message_id,
-            index=chunk_index,
-            message=AssistantPromptMessage(content=""),
-            finish_reason=finish_reason,
-            usage=usage,
-        )

View File

@@ -1,4 +1,5 @@
 import logging
+import re
 from collections.abc import Generator
 from typing import Optional
 
@@ -251,25 +252,23 @@ class VolcengineMaaSLargeLanguageModel(LargeLanguageModel):
         for chunk in chunks:
             content = ""
             if chunk.choices:
-                if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                    delta_content = ""
+                delta = chunk.choices[0].delta
+                if is_reasoning_started and not hasattr(delta, "reasoning_content") and not delta.content:
+                    content = ""
+                elif hasattr(delta, "reasoning_content"):
                     if not is_reasoning_started:
                         is_reasoning_started = True
-                        delta_content = "> 💭 " + chunk.choices[0].delta.reasoning_content
+                        content = "> 💭 " + delta.reasoning_content
                     else:
-                        delta_content = chunk.choices[0].delta.reasoning_content
+                        content = delta.reasoning_content
 
-                    if "\n\n" in delta_content:
-                        delta_content = delta_content.replace("\n\n", "\n> ")
-                    elif "\n" in delta_content:
-                        delta_content = delta_content.replace("\n", "\n> ")
-                    content = delta_content
+                    if "\n" in content:
+                        content = re.sub(r"\n(?!(>|\n))", "\n> ", content)
                 elif is_reasoning_started:
-                    content = "\n\n" + chunk.choices[0].delta.content
+                    content = "\n\n" + delta.content
                     is_reasoning_started = False
                 else:
-                    content = chunk.choices[0].delta.content
+                    content = delta.content
 
             yield LLMResultChunk(
                 model=model,
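Taken together, the touched providers now share one display convention for streamed reasoning: open a `> 💭` blockquote on the first reasoning chunk, re-quote wrapped lines with the regex above, and close the quote with a blank line once regular content resumes. A self-contained sketch of that state machine, using a simplified `(reasoning, content)` delta stream that is only illustrative and not part of the diff:

    import re

    def render_stream(deltas):
        # deltas: hypothetical list of (reasoning_content, content) pairs.
        is_reasoning_started = False
        out = ""
        for reasoning, content in deltas:
            if reasoning:
                piece = ("> 💭 " + reasoning) if not is_reasoning_started else reasoning
                is_reasoning_started = True
                piece = re.sub(r"\n(?!(>|\n))", "\n> ", piece)
            elif is_reasoning_started:
                piece = "\n\n" + content  # blank line closes the reasoning quote
                is_reasoning_started = False
            else:
                piece = content
            out += piece
        return out

    print(render_stream([("Let me check.\nLooks fine.", None), (None, "The answer is 42.")]))
    # > 💭 Let me check.
    # > Looks fine.
    #
    # The answer is 42.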