我们强烈建议通过 Python 或 TypeScript SDK 运行评估,这些 SDK 已针对性能和可靠性做了大量优化。但如果你所在环境无法使用 SDK(例如使用其他语言,或运行环境受限),仍然可以直接调用 REST API。本文以 Python 的 requests 库为例,演示如何通过 REST API 运行评估,同样的思路也适用于其他语言。在开始之前,建议先熟悉评估相关的基本概念。
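本文所有请求都使用同一个基础 URL,并通过 x-api-key 请求头携带 API Key(自托管部署或欧盟区域请替换为对应的域名)。下面是一个最小的准备工作示意,假设 API Key 已保存在环境变量 LANGSMITH_API_KEY 中;其中用列出数据集的请求做连通性检查仅作演示,limit 等查询参数请以 API 文档为准:
import os
import requests

# 本文示例共用的基础 URL 与鉴权请求头(自托管或欧盟区域请替换域名)
LANGSMITH_BASE_URL = "https://api.smith.langchain.com/api/v1"
LANGSMITH_HEADERS = {"x-api-key": os.environ["LANGSMITH_API_KEY"]}

# 连通性检查:列出一个数据集(仅作示意,limit 参数为假设,具体以 API 文档为准)
resp = requests.get(f"{LANGSMITH_BASE_URL}/datasets", params={"limit": 1}, headers=LANGSMITH_HEADERS)
resp.raise_for_status()
为了让每段示例都能独立复制运行,后文仍会在每个请求里显式写出完整 URL 与请求头。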

创建数据集

下方示例为了方便仍使用 Python SDK 创建数据集和示例。你也可以通过 UI 创建,或直接调用 REST API(下方 SDK 代码之后给出了一个简要示意),详见相关指南。
import os
import requests

from datetime import datetime
from langsmith import Client
from openai import OpenAI
from uuid import uuid4

client = Client()
oa_client = OpenAI()

#  Create a dataset
examples = [
    {
        "inputs": {"text": "Shut up, idiot"},
        "outputs": {"label": "Toxic"},
    },
    {
        "inputs": {"text": "You're a wonderful person"},
        "outputs": {"label": "Not toxic"},
    },
    {
        "inputs": {"text": "This is the worst thing ever"},
        "outputs": {"label": "Toxic"},
    },
    {
        "inputs": {"text": "I had a great day today"},
        "outputs": {"label": "Not toxic"},
    },
    {
        "inputs": {"text": "Nobody likes you"},
        "outputs": {"label": "Toxic"},
    },
    {
        "inputs": {"text": "This is unacceptable. I want to speak to the manager."},
        "outputs": {"label": "Not toxic"},
    },
]

dataset_name = "Toxic Queries - API Example"
dataset = client.create_dataset(dataset_name=dataset_name)
client.create_examples(dataset_id=dataset.id, examples=examples)
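如果完全不依赖 SDK,也可以直接通过 REST API 创建数据集和示例。下面是一个简要示意,与上面的 SDK 调用二选一即可;端点路径与字段名请以官方 API 文档为准,这里假设分别为 /datasets 与 /examples:
# 直接通过 REST API 创建数据集与示例(示意:字段名以 API 文档为准)
resp = requests.post(
    "https://api.smith.langchain.com/api/v1/datasets",
    json={"name": dataset_name, "description": "Toxic query classification examples"},
    headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]},
)
resp.raise_for_status()
dataset_id = resp.json()["id"]

# 逐条创建示例
for ex in examples:
    resp = requests.post(
        "https://api.smith.langchain.com/api/v1/examples",
        json={"dataset_id": dataset_id, "inputs": ex["inputs"], "outputs": ex["outputs"]},
        headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]},
    )
    resp.raise_for_status()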

运行单次实验

首先获取实验所需的全部示例。
#  Pick a dataset id. In this case, we are using the dataset we created above.
#  Spec: https://api.smith.langchain.com/redoc#tag/examples
dataset_id = dataset.id
params = { "dataset": dataset_id }

resp = requests.get(
    "https://api.smith.langchain.com/api/v1/examples",
    params=params,
    headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}
)

examples = resp.json()
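注意列表接口通常是分页返回的。如果数据集中的示例很多,可以分页拉取;下面的写法假设该接口支持 limit 与 offset 查询参数,具体以 API 文档为准:
# 分页拉取全部示例(示意:limit/offset 参数为假设,具体以 API 文档为准)
all_examples = []
offset = 0
limit = 100
while True:
    resp = requests.get(
        "https://api.smith.langchain.com/api/v1/examples",
        params={"dataset": str(dataset_id), "offset": offset, "limit": limit},
        headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]},
    )
    resp.raise_for_status()
    page = resp.json()
    all_examples.extend(page)
    if len(page) < limit:
        break
    offset += limit

examples = all_examples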
接下来定义一个函数,为单个示例创建运行(run)记录。
os.environ["OPENAI_API_KEY"] = "sk-..."

def run_completion_on_example(example, model_name, experiment_id):
    """Run completions on a list of examples."""
    # We are using the OpenAI API here, but you can use any model you like

    def _post_run(run_id, name, run_type, inputs, parent_id=None):
        """Function to post a new run to the API."""
        data = {
            "id": run_id.hex,
            "name": name,
            "run_type": run_type,
            "inputs": inputs,
            "start_time": datetime.utcnow().isoformat(),
            "reference_example_id": example["id"],
            "session_id": experiment_id,
        }
        if parent_id:
            data["parent_run_id"] = parent_id.hex
        resp = requests.post(
            "https://api.smith.langchain.com/api/v1/runs", # Update appropriately for self-hosted installations or the EU region
            json=data,
            headers=headers
        )
        resp.raise_for_status()

    def _patch_run(run_id, outputs):
        """Function to patch a run with outputs."""
        resp = requests.patch(
            f"https://api.smith.langchain.com/api/v1/runs/{run_id}",
            json={
                "outputs": outputs,
                "end_time": datetime.utcnow().isoformat(),
            },
            headers=headers,
        )
        resp.raise_for_status()

    # Send your API Key in the request headers
    headers = {"x-api-key": os.environ["LANGSMITH_API_KEY"]}

    text = example["inputs"]["text"]

    messages = [
        {
            "role": "system",
            "content": "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't.",
        },
        {"role": "user", "content": text},
    ]


    # Create parent run
    parent_run_id = uuid4()
    _post_run(parent_run_id, "LLM Pipeline", "chain", {"text": text})

    # Create child run
    child_run_id = uuid4()
    _post_run(child_run_id, "OpenAI Call", "llm", {"messages": messages}, parent_run_id)

    # Generate completion
    chat_completion = oa_client.chat.completions.create(model=model_name, messages=messages)
    output_text = chat_completion.choices[0].message.content

    # End run
    _patch_run(child_run_id, {
        "messages": messages,
        "output": output_text,
        "model": model_name
    })

    _patch_run(parent_run_id, {"label": output_text})
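直接调用 REST API 时,偶发的网络错误或限流需要自己处理,这部分正是官方 SDK 替你做掉的工作。下面是一个简单的指数退避重试封装,属于纯客户端逻辑,可以在 _post_run 和 _patch_run 中用它替换直接的 requests 调用:
import time

def request_with_retry(method, url, *, max_retries=3, **kwargs):
    """对 requests 调用做简单的指数退避重试(仅作示意)。"""
    last_exc = None
    for attempt in range(max_retries):
        try:
            resp = requests.request(method, url, **kwargs)
            if resp.status_code == 429 or resp.status_code >= 500:
                # 限流或服务端错误:记录后重试
                last_exc = requests.HTTPError(f"{resp.status_code} from {url}")
            else:
                resp.raise_for_status()  # 其余 4xx 直接抛出
                return resp
        except requests.ConnectionError as exc:
            last_exc = exc
        time.sleep(2 ** attempt)  # 指数退避
    raise last_exc
例如在 _post_run 中,可以把 requests.post(...) 换成 request_with_retry("POST", ...),其余参数保持不变。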
下面将使用 gpt-3.5-turbo 与 gpt-4o-mini 两个模型对所有示例执行推理。
#  Create a new experiment using the /sessions endpoint
#  An experiment is a collection of runs with a reference to the dataset used
#  Spec: https://api.smith.langchain.com/redoc#tag/tracer-sessions/operation/create_tracer_session_api_v1_sessions_post

model_names = ("gpt-3.5-turbo", "gpt-4o-mini")
experiment_ids = []
for model_name in model_names:
    resp = requests.post(
        "https://api.smith.langchain.com/api/v1/sessions",
        json={
            "start_time": datetime.utcnow().isoformat(),
            "reference_dataset_id": str(dataset_id),
            "description": "An optional description for the experiment",
            "name": f"Toxicity detection - API Example - {model_name} - {str(uuid4())[0:8]}",  # A name for the experiment
            "extra": {
                "metadata": {"foo": "bar"},  # Optional metadata
            },
        },
        headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}
    )

    experiment = resp.json()
    experiment_ids.append(experiment["id"])

    # Run completions on all examples
    for example in examples:
        run_completion_on_example(example, model_name, experiment["id"])

    # Issue a patch request to "end" the experiment by updating the end_time
    requests.patch(
        f"https://api.smith.langchain.com/api/v1/sessions/{experiment['id']}",
        json={"end_time": datetime.utcnow().isoformat()},
        headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}
    )
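到这里两个实验都已创建并结束,可以在 LangSmith UI 的数据集页面查看。也可以把实验信息读回来做个确认;下面的示意假设可以通过 /sessions/{id} 读取单个实验,返回字段以 API 文档为准:
# 读取刚创建的实验,确认名称等信息(示意:端点与字段以 API 文档为准)
for experiment_id in experiment_ids:
    resp = requests.get(
        f"https://api.smith.langchain.com/api/v1/sessions/{experiment_id}",
        headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]},
    )
    resp.raise_for_status()
    print(resp.json()["name"])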

运行成对比较实验

接下来演示如何创建成对比较(comparative)实验,即对两个或多个实验的输出进行偏好排序式的对比。更多细节请参考成对评估相关指南。
#  A comparative experiment allows you to provide a preferential ranking on the outputs of two or more experiments
#  Spec: https://api.smith.langchain.com/redoc#tag/datasets/operation/create_comparative_experiment_api_v1_datasets_comparative_post
resp = requests.post(
    "https://api.smith.langchain.com/api/v1/datasets/comparative",
    json={
        "experiment_ids": experiment_ids,
        "name": "Toxicity detection - API Example - Comparative - " + str(uuid4())[0:8],
        "description": "An optional description for the comparative experiment",
        "extra": {
            "metadata": {"foo": "bar"},  # Optional metadata
        },
        "reference_dataset_id": str(dataset_id),
    },
    headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}
)

comparative_experiment = resp.json()
comparative_experiment_id = comparative_experiment["id"]

#  You can iterate over the runs in the experiments belonging to the comparative experiment and preferentially rank the outputs

#  Fetch the comparative experiment
resp = requests.get(
    f"https://api.smith.langchain.com/api/v1/datasets/{str(dataset_id)}/comparative",
    params={"id": comparative_experiment_id},
    headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}
)

comparative_experiment = resp.json()[0]
experiment_ids = [info["id"] for info in comparative_experiment["experiments_info"]]

from collections import defaultdict
example_id_to_runs_map = defaultdict(list)

#  Spec: https://api.smith.langchain.com/redoc#tag/run/operation/query_runs_api_v1_runs_query_post
runs = requests.post(
    f"https://api.smith.langchain.com/api/v1/runs/query",
    headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]},
    json={
        "session": experiment_ids,
        "is_root": True, # Only fetch root runs (spans) which contain the end outputs
        "select": ["id", "reference_example_id", "outputs"],
    }
).json()
runs = runs["runs"]
for run in runs:
    example_id = run["reference_example_id"]
    example_id_to_runs_map[example_id].append(run)

for example_id, runs in example_id_to_runs_map.items():
    print(f"Example ID: {example_id}")
    # Preferentially rank the outputs, in this case we will always prefer the first output
    # In reality, you can use an LLM to rank the outputs (a sketch follows after this block)
    feedback_group_id = uuid4()

    # Post a feedback score for each run, with the first run being the preferred one
    # Spec: https://api.smith.langchain.com/redoc#tag/feedback/operation/create_feedback_api_v1_feedback_post
    # We'll use the feedback group ID to associate the feedback scores with the same group
    for i, run in enumerate(runs):
        print(f"Run ID: {run['id']}")
        feedback = {
            "score": 1 if i == 0 else 0,
            "run_id": str(run["id"]),
            "key": "ranked_preference",
            "feedback_group_id": str(feedback_group_id),
            "comparative_experiment_id": comparative_experiment_id,
        }
        resp = requests.post(
            "https://api.smith.langchain.com/api/v1/feedback",
            json=feedback,
            headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}
        )
        resp.raise_for_status()
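上面的循环为了演示总是偏好第一个输出。实际使用时可以让 LLM 来做偏好判断,下面是一个简要示意,沿用前面创建的 oa_client;模型名、提示词和结果解析方式都只是示例假设:
# 用 LLM 对同一示例下的两个输出做偏好判断(示意:提示词与解析方式为示例假设)
def rank_outputs_with_llm(text, output_a, output_b):
    """返回 0 表示偏好 output_a,返回 1 表示偏好 output_b。"""
    prompt = (
        "You are judging two toxicity classifications for the same text.\n"
        f"Text: {text}\n"
        f"Answer A: {output_a}\n"
        f"Answer B: {output_b}\n"
        "Reply with exactly 'A' or 'B' to indicate the better answer."
    )
    completion = oa_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    preferred = completion.choices[0].message.content.strip().upper()
    return 0 if preferred.startswith("A") else 1
使用时可以先根据 example_id 在前面拉取的 examples 里找到原始文本,再把两条 run 的 outputs 中的 label 传给这个函数,按返回结果给偏好的那条 run 打 1 分、另一条打 0 分。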
