diff --git a/api/core/model_runtime/model_providers/deepseek/llm/llm.py b/api/core/model_runtime/model_providers/deepseek/llm/llm.py
index b280856c05..610dc7b458 100644
--- a/api/core/model_runtime/model_providers/deepseek/llm/llm.py
+++ b/api/core/model_runtime/model_providers/deepseek/llm/llm.py
@@ -1,13 +1,10 @@
-import json
from collections.abc import Generator
from typing import Optional, Union

-import requests
from yarl import URL

-from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
+from core.model_runtime.entities.llm_entities import LLMMode, LLMResult
from core.model_runtime.entities.message_entities import (
- AssistantPromptMessage,
PromptMessage,
PromptMessageTool,
)
@@ -39,208 +36,3 @@ class DeepseekLargeLanguageModel(OAIAPICompatLargeLanguageModel):
credentials["mode"] = LLMMode.CHAT.value
credentials["function_calling_type"] = "tool_call"
credentials["stream_function_calling"] = "support"
-
- def _handle_generate_stream_response(
- self, model: str, credentials: dict, response: requests.Response, prompt_messages: list[PromptMessage]
- ) -> Generator:
- """
- Handle llm stream response
-
- :param model: model name
- :param credentials: model credentials
- :param response: streamed response
- :param prompt_messages: prompt messages
- :return: llm response chunk generator
- """
- full_assistant_content = ""
- chunk_index = 0
- is_reasoning_started = False # Add flag to track reasoning state
-
- def create_final_llm_result_chunk(
- id: Optional[str], index: int, message: AssistantPromptMessage, finish_reason: str, usage: dict
- ) -> LLMResultChunk:
- # calculate num tokens
- prompt_tokens = usage and usage.get("prompt_tokens")
- if prompt_tokens is None:
- prompt_tokens = self._num_tokens_from_string(model, prompt_messages[0].content)
- completion_tokens = usage and usage.get("completion_tokens")
- if completion_tokens is None:
- completion_tokens = self._num_tokens_from_string(model, full_assistant_content)
-
- # transform usage
- usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
-
- return LLMResultChunk(
- id=id,
- model=model,
- prompt_messages=prompt_messages,
- delta=LLMResultChunkDelta(index=index, message=message, finish_reason=finish_reason, usage=usage),
- )
-
- # delimiter for stream response, need unicode_escape
- import codecs
-
- delimiter = credentials.get("stream_mode_delimiter", "\n\n")
- delimiter = codecs.decode(delimiter, "unicode_escape")
-
- tools_calls: list[AssistantPromptMessage.ToolCall] = []
-
- def increase_tool_call(new_tool_calls: list[AssistantPromptMessage.ToolCall]):
- def get_tool_call(tool_call_id: str):
- if not tool_call_id:
- return tools_calls[-1]
-
- tool_call = next((tool_call for tool_call in tools_calls if tool_call.id == tool_call_id), None)
- if tool_call is None:
- tool_call = AssistantPromptMessage.ToolCall(
- id=tool_call_id,
- type="function",
- function=AssistantPromptMessage.ToolCall.ToolCallFunction(name="", arguments=""),
- )
- tools_calls.append(tool_call)
-
- return tool_call
-
- for new_tool_call in new_tool_calls:
- # get tool call
- tool_call = get_tool_call(new_tool_call.function.name)
- # update tool call
- if new_tool_call.id:
- tool_call.id = new_tool_call.id
- if new_tool_call.type:
- tool_call.type = new_tool_call.type
- if new_tool_call.function.name:
- tool_call.function.name = new_tool_call.function.name
- if new_tool_call.function.arguments:
- tool_call.function.arguments += new_tool_call.function.arguments
-
- finish_reason = None # The default value of finish_reason is None
- message_id, usage = None, None
- for chunk in response.iter_lines(decode_unicode=True, delimiter=delimiter):
- chunk = chunk.strip()
- if chunk:
- # ignore sse comments
- if chunk.startswith(":"):
- continue
- decoded_chunk = chunk.strip().removeprefix("data:").lstrip()
- if decoded_chunk == "[DONE]": # Some provider returns "data: [DONE]"
- continue
-
- try:
- chunk_json: dict = json.loads(decoded_chunk)
- # stream ended
- except json.JSONDecodeError as e:
- yield create_final_llm_result_chunk(
- id=message_id,
- index=chunk_index + 1,
- message=AssistantPromptMessage(content=""),
- finish_reason="Non-JSON encountered.",
- usage=usage,
- )
- break
- # handle the error here. for issue #11629
- if chunk_json.get("error") and chunk_json.get("choices") is None:
- raise ValueError(chunk_json.get("error"))
-
- if chunk_json:
- if u := chunk_json.get("usage"):
- usage = u
- if not chunk_json or len(chunk_json["choices"]) == 0:
- continue
-
- choice = chunk_json["choices"][0]
- finish_reason = chunk_json["choices"][0].get("finish_reason")
- message_id = chunk_json.get("id")
- chunk_index += 1
-
- if "delta" in choice:
- delta = choice["delta"]
- is_reasoning = delta.get("reasoning_content")
- delta_content = delta.get("content") or delta.get("reasoning_content")
-
- assistant_message_tool_calls = None
-
- if "tool_calls" in delta and credentials.get("function_calling_type", "no_call") == "tool_call":
- assistant_message_tool_calls = delta.get("tool_calls", None)
- elif (
- "function_call" in delta
- and credentials.get("function_calling_type", "no_call") == "function_call"
- ):
- assistant_message_tool_calls = [
- {"id": "tool_call_id", "type": "function", "function": delta.get("function_call", {})}
- ]
-
- # assistant_message_function_call = delta.delta.function_call
-
- # extract tool calls from response
- if assistant_message_tool_calls:
- tool_calls = self._extract_response_tool_calls(assistant_message_tool_calls)
- increase_tool_call(tool_calls)
-
- if delta_content is None or delta_content == "":
- continue
-
- # Add markdown quote markers for reasoning content
- if is_reasoning:
- if not is_reasoning_started:
- delta_content = "> 💠" + delta_content
- is_reasoning_started = True
- elif "\n\n" in delta_content:
- delta_content = delta_content.replace("\n\n", "\n> ")
- elif "\n" in delta_content:
- delta_content = delta_content.replace("\n", "\n> ")
- elif is_reasoning_started:
- # If we were in reasoning mode but now getting regular content,
- # add \n\n to close the reasoning block
- delta_content = "\n\n" + delta_content
- is_reasoning_started = False
-
- # transform assistant message to prompt message
- assistant_prompt_message = AssistantPromptMessage(
- content=delta_content,
- )
-
- # reset tool calls
- tool_calls = []
- full_assistant_content += delta_content
- elif "text" in choice:
- choice_text = choice.get("text", "")
- if choice_text == "":
- continue
-
- # transform assistant message to prompt message
- assistant_prompt_message = AssistantPromptMessage(content=choice_text)
- full_assistant_content += choice_text
- else:
- continue
-
- yield LLMResultChunk(
- id=message_id,
- model=model,
- prompt_messages=prompt_messages,
- delta=LLMResultChunkDelta(
- index=chunk_index,
- message=assistant_prompt_message,
- ),
- )
-
- chunk_index += 1
-
- if tools_calls:
- yield LLMResultChunk(
- id=message_id,
- model=model,
- prompt_messages=prompt_messages,
- delta=LLMResultChunkDelta(
- index=chunk_index,
- message=AssistantPromptMessage(tool_calls=tools_calls, content=""),
- ),
- )
-
- yield create_final_llm_result_chunk(
- id=message_id,
- index=chunk_index,
- message=AssistantPromptMessage(content=""),
- finish_reason=finish_reason,
- usage=usage,
- )
diff --git a/api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py b/api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py
index a0d9c450d5..17aefc7efc 100644
--- a/api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py
+++ b/api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py
@@ -1,5 +1,6 @@
import json
import logging
+import re
from collections.abc import Generator
from decimal import Decimal
from typing import Optional, Union, cast
@@ -515,6 +516,8 @@ class OAIAPICompatLargeLanguageModel(_CommonOaiApiCompat, LargeLanguageModel):
if "delta" in choice:
delta = choice["delta"]
delta_content = delta.get("content")
+ if not delta_content:
+ delta_content = ""
if not is_reasoning_started_tag and "<think>" in delta_content:
is_reasoning_started_tag = True
@@ -523,20 +526,21 @@ class OAIAPICompatLargeLanguageModel(_CommonOaiApiCompat, LargeLanguageModel):
delta_content = delta_content.replace("</think>", "") + "\n\n"
is_reasoning_started_tag = False
elif is_reasoning_started_tag:
- if "\n\n" in delta_content:
- delta_content = delta_content.replace("\n\n", "\n> ")
- elif "\n" in delta_content:
- delta_content = delta_content.replace("\n", "\n> ")
+ if "\n" in delta_content:
+ delta_content = re.sub(r"\n(?!(>|\n))", "\n> ", delta_content)
reasoning_content = delta.get("reasoning_content")
- if reasoning_content:
+ if is_reasoning_started and not reasoning_content and not delta_content:
+ delta_content = ""
+ elif reasoning_content:
if not is_reasoning_started:
delta_content = "> 💠" + reasoning_content
is_reasoning_started = True
- elif "\n\n" in delta_content:
- delta_content = reasoning_content.replace("\n\n", "\n> ")
- elif "\n" in delta_content:
- delta_content = reasoning_content.replace("\n", "\n> ")
+ else:
+ delta_content = reasoning_content
+
+ if "\n" in delta_content:
+ delta_content = re.sub(r"\n(?!(>|\n))", "\n> ", delta_content)
elif is_reasoning_started:
# If we were in reasoning mode but now getting regular content,
# add \n\n to close the reasoning block
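This hunk swaps the old two-branch `str.replace` quoting for a single `re.sub` whose negative lookahead skips newlines that are already followed by ">" or another newline, so quoted lines are not prefixed twice. A minimal check of that rule (`quote_reasoning` is an illustrative name; only the pattern and replacement come from the diff):

```python
import re


def quote_reasoning(text: str) -> str:
    # Rule from this hunk: add "> " after a newline unless the next character
    # is already ">" (line is quoted) or another newline.
    return re.sub(r"\n(?!(>|\n))", "\n> ", text)


assert quote_reasoning("step 1\nstep 2") == "step 1\n> step 2"
assert quote_reasoning("already\n> quoted") == "already\n> quoted"   # no "> > " double prefix
assert quote_reasoning("para 1\n\npara 2") == "para 1\n\n> para 2"   # blank line preserved
```

The previous `replace("\n\n", "\n> ")` branch would have collapsed the blank line in the last case.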
diff --git a/api/core/model_runtime/model_providers/siliconflow/llm/llm.py b/api/core/model_runtime/model_providers/siliconflow/llm/llm.py
index 4109fafab9..dffe321496 100644
--- a/api/core/model_runtime/model_providers/siliconflow/llm/llm.py
+++ b/api/core/model_runtime/model_providers/siliconflow/llm/llm.py
@@ -1,13 +1,9 @@
-import json
from collections.abc import Generator
from typing import Optional, Union

-import requests
-
from core.model_runtime.entities.common_entities import I18nObject
-from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
+from core.model_runtime.entities.llm_entities import LLMMode, LLMResult
from core.model_runtime.entities.message_entities import (
- AssistantPromptMessage,
PromptMessage,
PromptMessageTool,
)
@@ -96,208 +92,3 @@ class SiliconflowLargeLanguageModel(OAIAPICompatLargeLanguageModel):
),
],
)
-
- def _handle_generate_stream_response(
- self, model: str, credentials: dict, response: requests.Response, prompt_messages: list[PromptMessage]
- ) -> Generator:
- """
- Handle llm stream response
-
- :param model: model name
- :param credentials: model credentials
- :param response: streamed response
- :param prompt_messages: prompt messages
- :return: llm response chunk generator
- """
- full_assistant_content = ""
- chunk_index = 0
- is_reasoning_started = False # Add flag to track reasoning state
-
- def create_final_llm_result_chunk(
- id: Optional[str], index: int, message: AssistantPromptMessage, finish_reason: str, usage: dict
- ) -> LLMResultChunk:
- # calculate num tokens
- prompt_tokens = usage and usage.get("prompt_tokens")
- if prompt_tokens is None:
- prompt_tokens = self._num_tokens_from_string(model, prompt_messages[0].content)
- completion_tokens = usage and usage.get("completion_tokens")
- if completion_tokens is None:
- completion_tokens = self._num_tokens_from_string(model, full_assistant_content)
-
- # transform usage
- usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
-
- return LLMResultChunk(
- id=id,
- model=model,
- prompt_messages=prompt_messages,
- delta=LLMResultChunkDelta(index=index, message=message, finish_reason=finish_reason, usage=usage),
- )
-
- # delimiter for stream response, need unicode_escape
- import codecs
-
- delimiter = credentials.get("stream_mode_delimiter", "\n\n")
- delimiter = codecs.decode(delimiter, "unicode_escape")
-
- tools_calls: list[AssistantPromptMessage.ToolCall] = []
-
- def increase_tool_call(new_tool_calls: list[AssistantPromptMessage.ToolCall]):
- def get_tool_call(tool_call_id: str):
- if not tool_call_id:
- return tools_calls[-1]
-
- tool_call = next((tool_call for tool_call in tools_calls if tool_call.id == tool_call_id), None)
- if tool_call is None:
- tool_call = AssistantPromptMessage.ToolCall(
- id=tool_call_id,
- type="function",
- function=AssistantPromptMessage.ToolCall.ToolCallFunction(name="", arguments=""),
- )
- tools_calls.append(tool_call)
-
- return tool_call
-
- for new_tool_call in new_tool_calls:
- # get tool call
- tool_call = get_tool_call(new_tool_call.function.name)
- # update tool call
- if new_tool_call.id:
- tool_call.id = new_tool_call.id
- if new_tool_call.type:
- tool_call.type = new_tool_call.type
- if new_tool_call.function.name:
- tool_call.function.name = new_tool_call.function.name
- if new_tool_call.function.arguments:
- tool_call.function.arguments += new_tool_call.function.arguments
-
- finish_reason = None # The default value of finish_reason is None
- message_id, usage = None, None
- for chunk in response.iter_lines(decode_unicode=True, delimiter=delimiter):
- chunk = chunk.strip()
- if chunk:
- # ignore sse comments
- if chunk.startswith(":"):
- continue
- decoded_chunk = chunk.strip().removeprefix("data:").lstrip()
- if decoded_chunk == "[DONE]": # Some provider returns "data: [DONE]"
- continue
-
- try:
- chunk_json: dict = json.loads(decoded_chunk)
- # stream ended
- except json.JSONDecodeError as e:
- yield create_final_llm_result_chunk(
- id=message_id,
- index=chunk_index + 1,
- message=AssistantPromptMessage(content=""),
- finish_reason="Non-JSON encountered.",
- usage=usage,
- )
- break
- # handle the error here. for issue #11629
- if chunk_json.get("error") and chunk_json.get("choices") is None:
- raise ValueError(chunk_json.get("error"))
-
- if chunk_json:
- if u := chunk_json.get("usage"):
- usage = u
- if not chunk_json or len(chunk_json["choices"]) == 0:
- continue
-
- choice = chunk_json["choices"][0]
- finish_reason = chunk_json["choices"][0].get("finish_reason")
- message_id = chunk_json.get("id")
- chunk_index += 1
-
- if "delta" in choice:
- delta = choice["delta"]
- delta_content = delta.get("content")
-
- assistant_message_tool_calls = None
-
- if "tool_calls" in delta and credentials.get("function_calling_type", "no_call") == "tool_call":
- assistant_message_tool_calls = delta.get("tool_calls", None)
- elif (
- "function_call" in delta
- and credentials.get("function_calling_type", "no_call") == "function_call"
- ):
- assistant_message_tool_calls = [
- {"id": "tool_call_id", "type": "function", "function": delta.get("function_call", {})}
- ]
-
- # assistant_message_function_call = delta.delta.function_call
-
- # extract tool calls from response
- if assistant_message_tool_calls:
- tool_calls = self._extract_response_tool_calls(assistant_message_tool_calls)
- increase_tool_call(tool_calls)
-
- if delta_content is None or delta_content == "":
- continue
-
- # Check for think tags
- if "<think>" in delta_content:
- is_reasoning_started = True
- # Remove tag and add markdown quote
- delta_content = "> 💠" + delta_content.replace("<think>", "")
- elif "</think>" in delta_content:
- # Remove tag and add newlines to end quote block
- delta_content = delta_content.replace("</think>", "") + "\n\n"
- is_reasoning_started = False
- elif is_reasoning_started:
- # Add quote markers for content within thinking block
- if "\n\n" in delta_content:
- delta_content = delta_content.replace("\n\n", "\n> ")
- elif "\n" in delta_content:
- delta_content = delta_content.replace("\n", "\n> ")
-
- # transform assistant message to prompt message
- assistant_prompt_message = AssistantPromptMessage(
- content=delta_content,
- )
-
- # reset tool calls
- tool_calls = []
- full_assistant_content += delta_content
- elif "text" in choice:
- choice_text = choice.get("text", "")
- if choice_text == "":
- continue
-
- # transform assistant message to prompt message
- assistant_prompt_message = AssistantPromptMessage(content=choice_text)
- full_assistant_content += choice_text
- else:
- continue
-
- yield LLMResultChunk(
- id=message_id,
- model=model,
- prompt_messages=prompt_messages,
- delta=LLMResultChunkDelta(
- index=chunk_index,
- message=assistant_prompt_message,
- ),
- )
-
- chunk_index += 1
-
- if tools_calls:
- yield LLMResultChunk(
- id=message_id,
- model=model,
- prompt_messages=prompt_messages,
- delta=LLMResultChunkDelta(
- index=chunk_index,
- message=AssistantPromptMessage(tool_calls=tools_calls, content=""),
- ),
- )
-
- yield create_final_llm_result_chunk(
- id=message_id,
- index=chunk_index,
- message=AssistantPromptMessage(content=""),
- finish_reason=finish_reason,
- usage=usage,
- )
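For reference, the `<think>`-tag rewriting that this removed override performed per streamed delta boils down to a small state machine. A self-contained sketch, with `format_think_delta` as a hypothetical helper rather than anything in the codebase:

```python
def format_think_delta(delta: str, reasoning_started: bool) -> tuple[str, bool]:
    """Rewrite one streamed text delta, returning (text to emit, updated reasoning state)."""
    if "<think>" in delta:
        # Opening tag: drop it and start a markdown quote block.
        return "> 💠" + delta.replace("<think>", ""), True
    if "</think>" in delta:
        # Closing tag: drop it and terminate the quote block with a blank line.
        return delta.replace("</think>", "") + "\n\n", False
    if reasoning_started and "\n" in delta:
        # Inside the block, keep subsequent lines quoted.
        return delta.replace("\n", "\n> "), True
    return delta, reasoning_started


pieces = ["<think>plan the answer", "\ncheck edge cases", "</think>", "Final answer."]
state, out = False, ""
for piece in pieces:
    text, state = format_think_delta(piece, state)
    out += text
print(out)
# > 💠plan the answer
# > check edge cases
#
# Final answer.
```

As in the removed branch ordering, text that arrives in the same delta as the closing tag is emitted unquoted.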
diff --git a/api/core/model_runtime/model_providers/volcengine_maas/llm/llm.py b/api/core/model_runtime/model_providers/volcengine_maas/llm/llm.py
index 40c3777c5c..83c534add8 100644
--- a/api/core/model_runtime/model_providers/volcengine_maas/llm/llm.py
+++ b/api/core/model_runtime/model_providers/volcengine_maas/llm/llm.py
@@ -1,4 +1,5 @@
import logging
+import re
from collections.abc import Generator
from typing import Optional
@@ -251,25 +252,23 @@ class VolcengineMaaSLargeLanguageModel(LargeLanguageModel):
for chunk in chunks:
content = ""
if chunk.choices:
- if hasattr(chunk.choices[0].delta, "reasoning_content"):
- delta_content = ""
+ delta = chunk.choices[0].delta
+ if is_reasoning_started and not hasattr(delta, "reasoning_content") and not delta.content:
+ content = ""
+ elif hasattr(delta, "reasoning_content"):
if not is_reasoning_started:
is_reasoning_started = True
- delta_content = "> 💠" + chunk.choices[0].delta.reasoning_content
+ content = "> 💠" + delta.reasoning_content
else:
- delta_content = chunk.choices[0].delta.reasoning_content
+ content = delta.reasoning_content
- if "\n\n" in delta_content:
- delta_content = delta_content.replace("\n\n", "\n> ")
- elif "\n" in delta_content:
- delta_content = delta_content.replace("\n", "\n> ")
-
- content = delta_content
+ if "\n" in content:
+ content = re.sub(r"\n(?!(>|\n))", "\n> ", content)
elif is_reasoning_started:
- content = "\n\n" + chunk.choices[0].delta.content
+ content = "\n\n" + delta.content
is_reasoning_started = False
else:
- content = chunk.choices[0].delta.content
+ content = delta.content
yield LLMResultChunk(
model=model,
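Taken together, the new Volcengine branches can be exercised end to end with fake deltas. A hypothetical simulation (`render_delta` and the `SimpleNamespace` chunks are stand-ins, not the real SDK types):

```python
import re
from types import SimpleNamespace


def render_delta(delta, is_reasoning_started: bool) -> tuple[str, bool]:
    """Mirror the branch logic of this hunk for a single streamed delta."""
    if is_reasoning_started and not hasattr(delta, "reasoning_content") and not delta.content:
        return "", is_reasoning_started
    if hasattr(delta, "reasoning_content"):
        if not is_reasoning_started:
            content = "> 💠" + delta.reasoning_content
            is_reasoning_started = True
        else:
            content = delta.reasoning_content
        if "\n" in content:
            content = re.sub(r"\n(?!(>|\n))", "\n> ", content)
        return content, is_reasoning_started
    if is_reasoning_started:
        # First regular-content delta closes the reasoning quote block.
        return "\n\n" + delta.content, False
    return delta.content, is_reasoning_started


deltas = [
    SimpleNamespace(reasoning_content="weigh the\noptions", content=""),
    SimpleNamespace(reasoning_content="\npick one", content=""),
    SimpleNamespace(content="Answer: option B"),
]
state, rendered = False, ""
for d in deltas:
    text, state = render_delta(d, state)
    rendered += text
print(rendered)
# > 💠weigh the
# > options
# > pick one
#
# Answer: option B
```

The first reasoning delta opens the quote block with "> 💠", later newlines are quoted by the same negative-lookahead regex introduced above, and the first ordinary content delta closes the block with a blank line.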