smolagents is an interesting agents library by Hugging Face. I like how simple and small it is, so this is a deep dive into how smolagents works.
Hello world - Ollama
Starting with Ollama: here is a small example that talks to a local model through LiteLLM.
pip install smolagents[litellm]
from smolagents import CodeAgent, LiteLLMModel
model = LiteLLMModel(
model_id="ollama_chat/llama3.2",
api_base="http://localhost:11434",
api_key="YOUR_API_KEY",
num_ctx=8192
)
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.run(
"Could you give me the 118th number in the Fibonacci sequence?",
)
Hello world - OpenAI
Next, a minimal example using the OpenAI API.
import os
from smolagents import OpenAIServerModel, ToolCallingAgent, DuckDuckGoSearchTool
api_base = "https://api.openai.com/v1"
model_id = "gpt-4o-mini"
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")
model = OpenAIServerModel(
model_id=model_id,
api_key=api_key,
api_base=api_base,
)
agent = ToolCallingAgent(tools=[DuckDuckGoSearchTool()], model=model)
agent.run("What is the distance between Detroit and Chicago?")
LiteLLMModel and OpenAIServerModel
Let’s start with the two model classes used above. OpenAIServerModel and LiteLLMModel both extend Model, and each implements __call__ to invoke the respective completion function and return a ChatMessage.
class LiteLLMModel(Model):
"""This model connects to [LiteLLM](https://www.litellm.ai/) as a gateway to hundreds of LLMs.
Parameters:
model_id (`str`):
The model identifier to use on the server (e.g. "gpt-3.5-turbo").
api_base (`str`, *optional*):
The base URL of the OpenAI-compatible API server.
api_key (`str`, *optional*):
The API key to use for authentication.
custom_role_conversions (`dict[str, str]`, *optional*):
Custom role conversion mapping to convert message roles in others.
Useful for specific models that do not support specific message roles like "system".
**kwargs:
Additional keyword arguments to pass to the OpenAI API.
"""
def __init__(
self,
model_id: str = "anthropic/claude-3-5-sonnet-20240620",
api_base=None,
api_key=None,
custom_role_conversions: Optional[Dict[str, str]] = None,
**kwargs,
):
...
def __call__(
self,
messages: List[Dict[str, str]],
stop_sequences: Optional[List[str]] = None,
grammar: Optional[str] = None,
tools_to_call_from: Optional[List[Tool]] = None,
**kwargs,
) -> ChatMessage:
import litellm
completion_kwargs = self._prepare_completion_kwargs(
messages=messages,
stop_sequences=stop_sequences,
grammar=grammar,
tools_to_call_from=tools_to_call_from,
model=self.model_id,
api_base=self.api_base,
api_key=self.api_key,
convert_images_to_image_urls=True,
flatten_messages_as_text=self.model_id.startswith("ollama"),
custom_role_conversions=self.custom_role_conversions,
**kwargs,
)
response = litellm.completion(**completion_kwargs)
class OpenAIServerModel(Model):
"""This model connects to an OpenAI-compatible API server.
Parameters:
model_id (`str`):
The model identifier to use on the server (e.g. "gpt-3.5-turbo").
api_base (`str`, *optional*):
The base URL of the OpenAI-compatible API server.
api_key (`str`, *optional*):
The API key to use for authentication.
organization (`str`, *optional*):
The organization to use for the API request.
project (`str`, *optional*):
The project to use for the API request.
client_kwargs (`dict[str, Any]`, *optional*):
Additional keyword arguments to pass to the OpenAI client (like organization, project, max_retries etc.).
custom_role_conversions (`dict[str, str]`, *optional*):
Custom role conversion mapping to convert message roles in others.
Useful for specific models that do not support specific message roles like "system".
**kwargs:
Additional keyword arguments to pass to the OpenAI API.
"""
def __init__(
self,
model_id: str,
api_base: Optional[str] = None,
api_key: Optional[str] = None,
organization: Optional[str] | None = None,
project: Optional[str] | None = None,
client_kwargs: Optional[Dict[str, Any]] = None,
custom_role_conversions: Optional[Dict[str, str]] = None,
**kwargs,
):
try:
import openai
except ModuleNotFoundError:
raise ModuleNotFoundError(
"Please install 'openai' extra to use OpenAIServerModel: `pip install 'smolagents[openai]'`"
) from None
super().__init__(**kwargs)
self.model_id = model_id
self.client = openai.OpenAI(
base_url=api_base,
api_key=api_key,
organization=organization,
project=project,
**(client_kwargs or {}),
)
self.custom_role_conversions = custom_role_conversions
def __call__(
self,
messages: List[Dict[str, str]],
stop_sequences: Optional[List[str]] = None,
grammar: Optional[str] = None,
tools_to_call_from: Optional[List[Tool]] = None,
**kwargs,
) -> ChatMessage:
completion_kwargs = self._prepare_completion_kwargs(
messages=messages,
stop_sequences=stop_sequences,
grammar=grammar,
tools_to_call_from=tools_to_call_from,
model=self.model_id,
custom_role_conversions=self.custom_role_conversions,
convert_images_to_image_urls=True,
**kwargs,
)
response = self.client.chat.completions.create(**completion_kwargs)
self.last_input_token_count = response.usage.prompt_tokens
self.last_output_token_count = response.usage.completion_tokens
message = ChatMessage.from_dict(
response.choices[0].message.model_dump(include={"role", "content", "tool_calls"})
)
message.raw = response
if tools_to_call_from is not None:
return parse_tool_args_if_needed(message)
return message
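Since a Model is just a callable over chat-style messages, you can exercise one in isolation. A minimal sketch (the prompt here is my own, but the message shape matches the messages: List[Dict[str, str]] signature above):
import os
from smolagents import OpenAIServerModel

model = OpenAIServerModel(model_id="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
# __call__ takes a chat-style message list and returns a ChatMessage.
reply = model([{"role": "user", "content": "Reply with one word: ping"}])
print(reply.content)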
MultiStepAgent
Now we get to the brain of the agent: MultiStepAgent. It implements run, which eventually calls step(). The child agents extend MultiStepAgent and implement step(). So run creates the prompt and calls _run:
def run(
self,
task: str,
stream: bool = False,
reset: bool = True,
images: Optional[List[str]] = None,
additional_args: Optional[Dict] = None,
max_steps: Optional[int] = None,
):
self.system_prompt = self.initialize_system_prompt()
self.memory.system_prompt = SystemPromptStep(system_prompt=self.system_prompt)
if reset:
self.memory.reset()
self.monitor.reset()
self.logger.log_task(
content=self.task.strip(),
subtitle=f"{type(self.model).__name__} - {(self.model.model_id if hasattr(self.model, 'model_id') else '')}",
level=LogLevel.INFO,
title=self.name if hasattr(self, "name") else None,
)
self.memory.steps.append(TaskStep(task=self.task, task_images=images))
if getattr(self, "python_executor", None):
self.python_executor.send_variables(variables=self.state)
self.python_executor.send_tools({**self.tools, **self.managed_agents})
if stream:
# The steps are returned as they are executed through a generator to iterate on.
return self._run(task=self.task, max_steps=max_steps, images=images)
# Outputs are returned only at the end. We only look at the last step.
return deque(self._run(task=self.task, max_steps=max_steps, images=images), maxlen=1)[0]
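The deque(..., maxlen=1)[0] idiom simply keeps only the last yielded step. With stream=True you can consume the steps yourself; a small sketch, reusing the agent from the hello-world example:
# Each memory step is yielded as soon as it finishes executing.
for step in agent.run("Count from 1 to 3.", stream=True):
    print(type(step).__name__)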
_run calls _execute_step, which in turn calls step:
def _run(
self, task: str, max_steps: int, images: List[str] | None = None
) -> Generator[ActionStep | AgentType, None, None]:
final_answer = None
self.step_number = 1
while final_answer is None and self.step_number <= max_steps:
step_start_time = time.time()
memory_step = self._create_memory_step(step_start_time, images)
try:
final_answer = self._execute_step(task, memory_step)
except AgentError as e:
memory_step.error = e
finally:
self._finalize_step(memory_step, step_start_time)
yield memory_step
self.step_number += 1
def _execute_step(self, task: str, memory_step: ActionStep) -> Union[None, Any]:
if self.planning_interval is not None and self.step_number % self.planning_interval == 1:
self.planning_step(task, is_first_step=(self.step_number == 1), step=self.step_number)
self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO)
final_answer = self.step(memory_step)
if final_answer is not None and self.final_answer_checks:
self._validate_final_answer(final_answer)
return final_answer
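To make the contract concrete, here is a toy subclass (hypothetical, not from the library): MultiStepAgent drives the loop, and step ends the run by returning a non-None value.
class EchoAgent(MultiStepAgent):
    """Hypothetical agent that answers in a single step."""

    def initialize_system_prompt(self):
        return "You echo the task back."

    def step(self, memory_step):
        # Returning non-None stops the while loop in _run, and the
        # returned value becomes the final answer of the whole run.
        return f"echo: {self.task}"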
ToolCallingAgent
Looking deeper at ToolCallingAgent, which takes a list of tools, a model, and other kwargs.
class ToolCallingAgent(MultiStepAgent):
"""
This agent uses JSON-like tool calls, using method `model.get_tool_call` to leverage the LLM engine's tool calling capabilities.
Args:
tools (`list[Tool]`): [`Tool`]s that the agent can use.
model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates.
planning_interval (`int`, *optional*): Interval at which the agent will run a planning step.
**kwargs: Additional keyword arguments.
"""
step takes the tool call the model formatted, extracts the tool name and arguments, and records them as a ToolCall before executing the tool. There is a special tool called final_answer that returns the final result of the run.
def step(self, memory_step: ActionStep) -> Union[None, Any]:
"""
Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
Returns None if the step is not final.
"""
memory_messages = self.write_memory_to_messages()
self.input_messages = memory_messages
# Add new step in logs
memory_step.model_input_messages = memory_messages.copy()
try:
model_message: ChatMessage = self.model(
memory_messages,
tools_to_call_from=list(self.tools.values()),
stop_sequences=["Observation:"],
)
memory_step.model_output_message = model_message
if model_message.tool_calls is None or len(model_message.tool_calls) == 0:
raise Exception("Model did not call any tools. Call `final_answer` tool to return a final answer.")
tool_call = model_message.tool_calls[0]
tool_name, tool_call_id = tool_call.function.name, tool_call.id
tool_arguments = tool_call.function.arguments
except Exception as e:
raise AgentGenerationError(f"Error in generating tool call with model:\n{e}", self.logger) from e
memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)]
# Execute
self.logger.log(
Panel(Text(f"Calling tool: '{tool_name}' with arguments: {tool_arguments}")),
level=LogLevel.INFO,
)
if tool_name == "final_answer":
if isinstance(tool_arguments, dict):
if "answer" in tool_arguments:
answer = tool_arguments["answer"]
else:
answer = tool_arguments
else:
answer = tool_arguments
if (
isinstance(answer, str) and answer in self.state.keys()
): # if the answer is a state variable, return the value
final_answer = self.state[answer]
self.logger.log(
f"[bold {YELLOW_HEX}]Final answer:[/bold {YELLOW_HEX}] Extracting key '{answer}' from state to return value '{final_answer}'.",
level=LogLevel.INFO,
)
else:
final_answer = answer
self.logger.log(
Text(f"Final answer: {final_answer}", style=f"bold {YELLOW_HEX}"),
level=LogLevel.INFO,
)
memory_step.action_output = final_answer
return final_answer
else:
if tool_arguments is None:
tool_arguments = {}
observation = self.execute_tool_call(tool_name, tool_arguments)
observation_type = type(observation)
if observation_type in [AgentImage, AgentAudio]:
if observation_type == AgentImage:
observation_name = "image.png"
elif observation_type == AgentAudio:
observation_name = "audio.mp3"
# TODO: observation naming could allow for different names of same type
self.state[observation_name] = observation
updated_information = f"Stored '{observation_name}' in memory."
else:
updated_information = str(observation).strip()
self.logger.log(
f"Observations: {updated_information.replace('[', '|')}", # escape potential rich-tag-like components
level=LogLevel.INFO,
)
memory_step.observations = updated_information
return None
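In day-to-day use you never see this machinery; you just hand the agent tools. For example, with the @tool decorator and the model from the OpenAI example:
from smolagents import ToolCallingAgent, tool

@tool
def add(a: int, b: int) -> int:
    """Adds two integers.

    Args:
        a: The first number.
        b: The second number.
    """
    return a + b

agent = ToolCallingAgent(tools=[add], model=model)
agent.run("Use the add tool to compute 2 + 40.")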
CodeAgent
CodeAgent is interesting because it does not use the JSON tool calling that, say, the LlamaIndex ReAct agent uses. That said, smolagents still provides ToolCallingAgent, which works exactly that way.
class CodeAgent(MultiStepAgent):
"""
In this agent, the tool calls will be formulated by the LLM in code format, then parsed and executed.
Args:
tools (`list[Tool]`): [`Tool`]s that the agent can use.
model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
prompt_templates (`dict`, *optional*): Prompt templates.
grammar (`dict[str, str]`, *optional*): Grammar used to parse the LLM output.
additional_authorized_imports (`list[str]`, *optional*): Additional authorized imports for the agent.
planning_interval (`int`, *optional*): Interval at which the agent will run a planning step.
use_e2b_executor (`bool`, default `False`): Whether to use the E2B executor for remote code execution.
max_print_outputs_length (`int`, *optional*): Maximum length of the print outputs.
**kwargs: Additional keyword arguments.
"""
def step(self, memory_step: ActionStep) -> Union[None, Any]:
"""
Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
Returns None if the step is not final.
"""
memory_messages = self.write_memory_to_messages()
self.input_messages = memory_messages.copy()
# Add new step in logs
memory_step.model_input_messages = memory_messages.copy()
try:
additional_args = {"grammar": self.grammar} if self.grammar is not None else {}
chat_message: ChatMessage = self.model(
self.input_messages,
stop_sequences=["<end_code>", "Observation:"],
**additional_args,
)
memory_step.model_output_message = chat_message
model_output = chat_message.content
memory_step.model_output = model_output
except Exception as e:
raise AgentGenerationError(f"Error in generating model output:\n{e}", self.logger) from e
self.logger.log_markdown(
content=model_output,
title="Output message of the LLM:",
level=LogLevel.DEBUG,
)
# Parse
try:
code_action = fix_final_answer_code(parse_code_blobs(model_output))
except Exception as e:
error_msg = f"Error in code parsing:\n{e}\nMake sure to provide correct code blobs."
raise AgentParsingError(error_msg, self.logger)
memory_step.tool_calls = [
ToolCall(
name="python_interpreter",
arguments=code_action,
id=f"call_{len(self.memory.steps)}",
)
]
# Execute
self.logger.log_code(title="Executing parsed code:", content=code_action, level=LogLevel.INFO)
is_final_answer = False
try:
output, execution_logs, is_final_answer = self.python_executor(code_action)
execution_outputs_console = []
if len(execution_logs) > 0:
execution_outputs_console += [
Text("Execution logs:", style="bold"),
Text(execution_logs),
]
observation = "Execution logs:\n" + execution_logs
except Exception as e:
if hasattr(self.python_executor, "state") and "_print_outputs" in self.python_executor.state:
execution_logs = str(self.python_executor.state["_print_outputs"])
if len(execution_logs) > 0:
execution_outputs_console = [
Text("Execution logs:", style="bold"),
Text(execution_logs),
]
memory_step.observations = "Execution logs:\n" + execution_logs
self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO)
error_msg = str(e)
if "Import of " in error_msg and " is not allowed" in error_msg:
self.logger.log(
"[bold red]Warning to user: Code execution failed due to an unauthorized import - Consider passing said import under `additional_authorized_imports` when initializing your CodeAgent.",
level=LogLevel.INFO,
)
raise AgentExecutionError(error_msg, self.logger)
truncated_output = truncate_content(str(output))
observation += "Last output from code snippet:\n" + truncated_output
memory_step.observations = observation
execution_outputs_console += [
Text(
f"{('Out - Final answer' if is_final_answer else 'Out')}: {truncated_output}",
style=(f"bold {YELLOW_HEX}" if is_final_answer else ""),
),
]
self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO)
memory_step.action_output = output
return output if is_final_answer else None
Finally, in CodeAgent’s constructor, the Python executor is picked based on use_e2b_executor:
if use_e2b_executor:
self.python_executor = E2BExecutor(
self.additional_authorized_imports,
list(all_tools.values()),
self.logger,
)
else:
self.python_executor = LocalPythonInterpreter(
self.additional_authorized_imports,
all_tools,
max_print_outputs_length=max_print_outputs_length,
)
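The additional_authorized_imports parameter feeds into that executor. A sketch, reusing the model from above: allowing datetime lets the generated code import it, while anything off the list triggers the unauthorized-import warning we saw in step.
from smolagents import CodeAgent

agent = CodeAgent(
    tools=[],
    model=model,
    additional_authorized_imports=["datetime"],
)
# Generated code may now "import datetime"; importing, say, "os" would
# fail with "Import of os is not allowed".
agent.run("What weekday was 2024-01-01?")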
System prompt
smolagents has very verbose and well-written prompts for generating code. You can find the full prompt in src/smolagents/prompts/code_agent.yaml. The interesting part is the worked example that shows the LLM how to move through Task, Thought, Code, and Observation sequences.
You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
During each intermediate step, you can use 'print()' to save whatever important information you will then need.
These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
In the end you have to return a final answer using the `final_answer` tool.
Here are a few examples using notional tools:
---
Task: "Generate an image of the oldest person in this document."
Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
Code:
```py
answer = document_qa(document=document, question="Who is the oldest person mentioned?")
print(answer)
```<end_code>
Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
Thought: I will now generate an image showcasing the oldest person.
Code:
```py
image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
final_answer(image)
```<end_code>
---
And at the end, there are some rules for the LLM to follow when generating code:
Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
2. Use only variables that you have defined!
3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
LocalPythonInterpreter
For code generation and execution, it is always a good idea to sandbox so malicious code cannot run unchecked. CodeAgent can use LocalPythonInterpreter to restrict what the generated code may do (imports and calls). To enforce that, it evaluates the code through its AST instead of exec-ing it directly: __call__ invokes evaluate_python_code, which processes the AST node by node.
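Conceptually, the import check is just an AST walk. Here is a simplified standalone sketch of the idea (illustrative only, not the library’s actual implementation):
import ast

def check_imports(code: str, authorized: set) -> None:
    """Illustrative only: reject imports of modules outside the allowlist."""
    for node in ast.walk(ast.parse(code)):
        if isinstance(node, ast.Import):
            modules = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            modules = [node.module or ""]
        else:
            continue
        for module in modules:
            if module.split(".")[0] not in authorized:
                raise ImportError(f"Import of {module} is not allowed")

check_imports("import math", {"math"})  # passes silently
try:
    check_imports("import os", {"math"})
except ImportError as e:
    print(e)  # Import of os is not allowed
The real class generalizes this walk into full evaluation: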
class LocalPythonInterpreter:
def __init__(
self,
additional_authorized_imports: List[str],
tools: Dict,
max_print_outputs_length: Optional[int] = None,
):
self.custom_tools = {}
self.state = {}
self.max_print_outputs_length = max_print_outputs_length
if max_print_outputs_length is None:
self.max_print_outputs_length = DEFAULT_MAX_LEN_OUTPUT
self.additional_authorized_imports = additional_authorized_imports
self.authorized_imports = list(set(BASE_BUILTIN_MODULES) | set(self.additional_authorized_imports))
# Add base trusted tools to list
self.static_tools = {
**tools,
**BASE_PYTHON_TOOLS.copy(),
}
# TODO: assert self.authorized imports are all installed locally
def __call__(self, code_action: str, additional_variables: Dict) -> Tuple[Any, str, bool]:
self.state.update(additional_variables)
output, is_final_answer = evaluate_python_code(
code_action,
static_tools=self.static_tools,
custom_tools=self.custom_tools,
state=self.state,
authorized_imports=self.authorized_imports,
max_print_outputs_length=self.max_print_outputs_length,
)
logs = str(self.state["_print_outputs"])
return output, logs, is_final_answer
def evaluate_python_code(
code: str,
static_tools: Optional[Dict[str, Callable]] = None,
custom_tools: Optional[Dict[str, Callable]] = None,
state: Optional[Dict[str, Any]] = None,
authorized_imports: List[str] = BASE_BUILTIN_MODULES,
max_print_outputs_length: int = DEFAULT_MAX_LEN_OUTPUT,
):
"""
Evaluate a python expression using the content of the variables stored in a state and only evaluating a given set
of functions.
This function will recurse through the nodes of the tree provided.
Args:
code (`str`):
The code to evaluate.
static_tools (`Dict[str, Callable]`):
The functions that may be called during the evaluation. These can also be agents in a multiagent setting.
These tools cannot be overwritten in the code: any assignment to their name will raise an error.
custom_tools (`Dict[str, Callable]`):
The functions that may be called during the evaluation.
These tools can be overwritten in the code: any assignment to their name will overwrite them.
state (`Dict[str, Any]`):
A dictionary mapping variable names to values. The `state` should contain the initial inputs but will be
updated by this function to contain all variables as they are evaluated.
The print outputs will be stored in the state under the key "_print_outputs".
"""
try:
expression = ast.parse(code)
except SyntaxError as e:
raise InterpreterError(
f"Code parsing failed on line {e.lineno} due to: {type(e).__name__}\n"
f"{e.text}"
f"{' ' * (e.offset or 0)}^\n"
f"Error: {str(e)}"
)
if state is None:
state = {}
static_tools = static_tools.copy() if static_tools is not None else {}
custom_tools = custom_tools if custom_tools is not None else {}
result = None
state["_print_outputs"] = PrintContainer()
state["_operations_count"] = 0
def final_answer(value):
raise FinalAnswerException(value)
static_tools["final_answer"] = final_answer
try:
for node in expression.body:
result = evaluate_ast(node, state, static_tools, custom_tools, authorized_imports)
state["_print_outputs"].value = truncate_content(
str(state["_print_outputs"]), max_length=max_print_outputs_length
)
is_final_answer = False
return result, is_final_answer
except FinalAnswerException as e:
state["_print_outputs"].value = truncate_content(
str(state["_print_outputs"]), max_length=max_print_outputs_length
)
is_final_answer = True
return e.value, is_final_answer
except Exception as e:
exception_type = type(e).__name__
state["_print_outputs"].value = truncate_content(
str(state["_print_outputs"]), max_length=max_print_outputs_length
)
raise InterpreterError(
f"Code execution failed at line '{ast.get_source_segment(code, node)}' due to: {exception_type}:{str(e)}"
)
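You can poke at the sandbox directly. A small sketch, assuming these internals are importable from smolagents.local_python_executor (an internal module, so the path may change between versions):
from smolagents.local_python_executor import InterpreterError, evaluate_python_code

# Plain arithmetic against the default builtin allowlist.
result, is_final_answer = evaluate_python_code("x = 21 * 2\nx", state={})
print(result, is_final_answer)  # 42 False

# An unauthorized import surfaces as an InterpreterError.
try:
    evaluate_python_code("import os", state={}, authorized_imports=["math"])
except InterpreterError as e:
    print(e)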