Gemini Enterprise for Customer Experience API: projects.locations.apps.evaluationDatasets

Instance Methods

close()

Close httplib2 connections.

create(parent, body=None, evaluationDatasetId=None, x__xgafv=None)

Creates an evaluation dataset.

delete(name, etag=None, x__xgafv=None)

Deletes an evaluation dataset.

get(name, x__xgafv=None)

Gets details of the specified evaluation dataset.

list(parent, filter=None, orderBy=None, pageSize=None, pageToken=None, x__xgafv=None)

Lists all evaluation datasets in the given app.

list_next(previous_request, previous_response)

Retrieves the next page of results.

patch(name, body=None, updateMask=None, x__xgafv=None)

Updates an evaluation dataset.

Method Details

close()
Close httplib2 connections.
create(parent, body=None, evaluationDatasetId=None, x__xgafv=None)
Creates an evaluation dataset.

Args:
  parent: string, Required. The app to create the evaluation dataset for. Format: `projects/{project}/locations/{location}/apps/{app}` (required)
  body: object, The request body.
    The object takes the form of:

{ # An evaluation dataset represents a set of evaluations that are grouped together based on shared tags.
  "aggregatedMetrics": { # Aggregated metrics for an evaluation or evaluation dataset. # Output only. The aggregated metrics for this evaluation dataset across all runs.
    "metricsByAppVersion": [ # Output only. Aggregated metrics, grouped by app version ID.
      { # Metrics aggregated per app version.
        "appVersionId": "A String", # Output only. The app version ID.
        "failCount": 42, # Output only. The number of times the evaluation failed.
        "hallucinationMetrics": [ # Output only. Metrics for hallucination within this app version.
          { # Metrics for hallucination results.
            "score": 3.14, # Output only. The average hallucination score (0 to 1).
          },
        ],
        "metricsByTurn": [ # Output only. Metrics aggregated per turn within this app version.
          { # Metrics aggregated per turn.
            "hallucinationMetrics": [ # Output only. Metrics for hallucination within this turn.
              { # Metrics for hallucination results.
                "score": 3.14, # Output only. The average hallucination score (0 to 1).
              },
            ],
            "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this turn.
              { # Metrics for semantic similarity results.
                "score": 3.14, # Output only. The average semantic similarity score (0-4).
              },
            ],
            "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this turn.
              { # Metrics for tool call latency.
                "averageLatency": "A String", # Output only. The average latency of the tool calls.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "toolMetrics": [ # Output only. Metrics for each tool within this turn.
              { # Metrics for a single tool.
                "failCount": 42, # Output only. The number of times the tool failed.
                "passCount": 42, # Output only. The number of times the tool passed.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "turnIndex": 42, # Output only. The turn index (0-based).
            "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this turn.
              { # Metrics for turn latency.
                "averageLatency": "A String", # Output only. The average latency of the turns.
              },
            ],
          },
        ],
        "passCount": 42, # Output only. The number of times the evaluation passed.
        "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this app version.
          { # Metrics for semantic similarity results.
            "score": 3.14, # Output only. The average semantic similarity score (0-4).
          },
        ],
        "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this app version.
          { # Metrics for tool call latency.
            "averageLatency": "A String", # Output only. The average latency of the tool calls.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "toolMetrics": [ # Output only. Metrics for each tool within this app version.
          { # Metrics for a single tool.
            "failCount": 42, # Output only. The number of times the tool failed.
            "passCount": 42, # Output only. The number of times the tool passed.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this app version.
          { # Metrics for turn latency.
            "averageLatency": "A String", # Output only. The average latency of the turns.
          },
        ],
      },
    ],
  },
  "createTime": "A String", # Output only. Timestamp when the evaluation dataset was created.
  "createdBy": "A String", # Output only. The user who created the evaluation dataset.
  "displayName": "A String", # Required. User-defined display name of the evaluation dataset. Unique within an App.
  "etag": "A String", # Output only. Etag used to ensure the object hasn't changed during a read-modify-write operation. If the etag is empty, the update will overwrite any concurrent changes.
  "evaluations": [ # Optional. Evaluations that are included in this dataset.
    "A String",
  ],
  "lastUpdatedBy": "A String", # Output only. The user who last updated the evaluation dataset.
  "name": "A String", # Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}`
  "updateTime": "A String", # Output only. Timestamp when the evaluation dataset was last updated.
}

  evaluationDatasetId: string, Optional. The ID to use for the evaluation dataset, which will become the final component of the evaluation dataset's resource name. If not provided, a unique ID will be automatically assigned for the evaluation dataset.
  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # An evaluation dataset represents a set of evaluations that are grouped together based on shared tags.
  "aggregatedMetrics": { # Aggregated metrics for an evaluation or evaluation dataset. # Output only. The aggregated metrics for this evaluation dataset across all runs.
    "metricsByAppVersion": [ # Output only. Aggregated metrics, grouped by app version ID.
      { # Metrics aggregated per app version.
        "appVersionId": "A String", # Output only. The app version ID.
        "failCount": 42, # Output only. The number of times the evaluation failed.
        "hallucinationMetrics": [ # Output only. Metrics for hallucination within this app version.
          { # Metrics for hallucination results.
            "score": 3.14, # Output only. The average hallucination score (0 to 1).
          },
        ],
        "metricsByTurn": [ # Output only. Metrics aggregated per turn within this app version.
          { # Metrics aggregated per turn.
            "hallucinationMetrics": [ # Output only. Metrics for hallucination within this turn.
              { # Metrics for hallucination results.
                "score": 3.14, # Output only. The average hallucination score (0 to 1).
              },
            ],
            "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this turn.
              { # Metrics for semantic similarity results.
                "score": 3.14, # Output only. The average semantic similarity score (0-4).
              },
            ],
            "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this turn.
              { # Metrics for tool call latency.
                "averageLatency": "A String", # Output only. The average latency of the tool calls.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "toolMetrics": [ # Output only. Metrics for each tool within this turn.
              { # Metrics for a single tool.
                "failCount": 42, # Output only. The number of times the tool failed.
                "passCount": 42, # Output only. The number of times the tool passed.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "turnIndex": 42, # Output only. The turn index (0-based).
            "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this turn.
              { # Metrics for turn latency.
                "averageLatency": "A String", # Output only. The average latency of the turns.
              },
            ],
          },
        ],
        "passCount": 42, # Output only. The number of times the evaluation passed.
        "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this app version.
          { # Metrics for semantic similarity results.
            "score": 3.14, # Output only. The average semantic similarity score (0-4).
          },
        ],
        "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this app version.
          { # Metrics for tool call latency.
            "averageLatency": "A String", # Output only. The average latency of the tool calls.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "toolMetrics": [ # Output only. Metrics for each tool within this app version.
          { # Metrics for a single tool.
            "failCount": 42, # Output only. The number of times the tool failed.
            "passCount": 42, # Output only. The number of times the tool passed.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this app version.
          { # Metrics for turn latency.
            "averageLatency": "A String", # Output only. The average latency of the turns.
          },
        ],
      },
    ],
  },
  "createTime": "A String", # Output only. Timestamp when the evaluation dataset was created.
  "createdBy": "A String", # Output only. The user who created the evaluation dataset.
  "displayName": "A String", # Required. User-defined display name of the evaluation dataset. Unique within an App.
  "etag": "A String", # Output only. Etag used to ensure the object hasn't changed during a read-modify-write operation. If the etag is empty, the update will overwrite any concurrent changes.
  "evaluations": [ # Optional. Evaluations that are included in this dataset.
    "A String",
  ],
  "lastUpdatedBy": "A String", # Output only. The user who last updated the evaluation dataset.
  "name": "A String", # Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}`
  "updateTime": "A String", # Output only. Timestamp when the evaluation dataset was last updated.
}
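
A minimal create() sketch. The service name and version passed to build(), and the project, location, app, and dataset IDs, are assumptions for illustration, not values confirmed by this page. Output-only fields (aggregatedMetrics, timestamps, and so on) are omitted from the request body.

from googleapiclient import discovery

# Service name/version are assumed; substitute the values from your discovery document.
datasets = (
    discovery.build("geminienterprisecustomerexperience", "v1")
    .projects().locations().apps().evaluationDatasets()
)

parent = "projects/my-project/locations/global/apps/my-app"  # hypothetical IDs
created = datasets.create(
    parent=parent,
    evaluationDatasetId="regression-suite",  # optional; omit to auto-assign an ID
    body={
        "displayName": "Regression suite",  # required; unique within the app
        "evaluations": [  # optional; evaluation resource names (assumed format)
            parent + "/evaluations/eval-1",  # hypothetical
        ],
    },
).execute()
print(created["name"])
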
delete(name, etag=None, x__xgafv=None)
Deletes an evaluation dataset.

Args:
  name: string, Required. The resource name of the evaluation dataset to delete. (required)
  etag: string, Optional. The current etag of the evaluation dataset. If an etag is not provided, the deletion will proceed regardless of concurrent changes. If an etag is provided and does not match the current etag of the evaluation dataset, deletion will be blocked and an ABORTED error will be returned.
  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # A generic empty message that you can re-use to avoid defining duplicated empty messages in your APIs. A typical example is to use it as the request or the response type of an API method. For instance: service Foo { rpc Bar(google.protobuf.Empty) returns (google.protobuf.Empty); }
}
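
A conditional-delete sketch under the same assumptions as above (service name, version, and resource IDs are hypothetical). Reading the current etag first makes the delete conditional, so a concurrent update returns ABORTED instead of being silently discarded.

from googleapiclient import discovery

# Service name/version are assumed; substitute the values from your discovery document.
datasets = (
    discovery.build("geminienterprisecustomerexperience", "v1")
    .projects().locations().apps().evaluationDatasets()
)

name = ("projects/my-project/locations/global/apps/my-app"
        "/evaluationDatasets/regression-suite")  # hypothetical
current = datasets.get(name=name).execute()
# With the etag, a mismatch aborts the delete; omit it to delete unconditionally.
datasets.delete(name=name, etag=current.get("etag")).execute()
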
get(name, x__xgafv=None)
Gets details of the specified evaluation dataset.

Args:
  name: string, Required. The resource name of the evaluation dataset to retrieve. (required)
  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # An evaluation dataset represents a set of evaluations that are grouped together based on shared tags.
  "aggregatedMetrics": { # Aggregated metrics for an evaluation or evaluation dataset. # Output only. The aggregated metrics for this evaluation dataset across all runs.
    "metricsByAppVersion": [ # Output only. Aggregated metrics, grouped by app version ID.
      { # Metrics aggregated per app version.
        "appVersionId": "A String", # Output only. The app version ID.
        "failCount": 42, # Output only. The number of times the evaluation failed.
        "hallucinationMetrics": [ # Output only. Metrics for hallucination within this app version.
          { # Metrics for hallucination results.
            "score": 3.14, # Output only. The average hallucination score (0 to 1).
          },
        ],
        "metricsByTurn": [ # Output only. Metrics aggregated per turn within this app version.
          { # Metrics aggregated per turn.
            "hallucinationMetrics": [ # Output only. Metrics for hallucination within this turn.
              { # Metrics for hallucination results.
                "score": 3.14, # Output only. The average hallucination score (0 to 1).
              },
            ],
            "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this turn.
              { # Metrics for semantic similarity results.
                "score": 3.14, # Output only. The average semantic similarity score (0-4).
              },
            ],
            "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this turn.
              { # Metrics for tool call latency.
                "averageLatency": "A String", # Output only. The average latency of the tool calls.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "toolMetrics": [ # Output only. Metrics for each tool within this turn.
              { # Metrics for a single tool.
                "failCount": 42, # Output only. The number of times the tool failed.
                "passCount": 42, # Output only. The number of times the tool passed.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "turnIndex": 42, # Output only. The turn index (0-based).
            "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this turn.
              { # Metrics for turn latency.
                "averageLatency": "A String", # Output only. The average latency of the turns.
              },
            ],
          },
        ],
        "passCount": 42, # Output only. The number of times the evaluation passed.
        "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this app version.
          { # Metrics for semantic similarity results.
            "score": 3.14, # Output only. The average semantic similarity score (0-4).
          },
        ],
        "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this app version.
          { # Metrics for tool call latency.
            "averageLatency": "A String", # Output only. The average latency of the tool calls.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "toolMetrics": [ # Output only. Metrics for each tool within this app version.
          { # Metrics for a single tool.
            "failCount": 42, # Output only. The number of times the tool failed.
            "passCount": 42, # Output only. The number of times the tool passed.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this app version.
          { # Metrics for turn latency.
            "averageLatency": "A String", # Output only. The average latency of the turns.
          },
        ],
      },
    ],
  },
  "createTime": "A String", # Output only. Timestamp when the evaluation dataset was created.
  "createdBy": "A String", # Output only. The user who created the evaluation dataset.
  "displayName": "A String", # Required. User-defined display name of the evaluation dataset. Unique within an App.
  "etag": "A String", # Output only. Etag used to ensure the object hasn't changed during a read-modify-write operation. If the etag is empty, the update will overwrite any concurrent changes.
  "evaluations": [ # Optional. Evaluations that are included in this dataset.
    "A String",
  ],
  "lastUpdatedBy": "A String", # Output only. The user who last updated the evaluation dataset.
  "name": "A String", # Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}`
  "updateTime": "A String", # Output only. Timestamp when the evaluation dataset was last updated.
}
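
A read sketch, with the same hypothetical service name and IDs, that walks the output-only aggregatedMetrics to summarize pass/fail counts per app version.

from googleapiclient import discovery

# Service name/version are assumed; substitute the values from your discovery document.
datasets = (
    discovery.build("geminienterprisecustomerexperience", "v1")
    .projects().locations().apps().evaluationDatasets()
)

name = ("projects/my-project/locations/global/apps/my-app"
        "/evaluationDatasets/regression-suite")  # hypothetical
dataset = datasets.get(name=name).execute()
for per_version in dataset.get("aggregatedMetrics", {}).get("metricsByAppVersion", []):
    print(per_version.get("appVersionId"),
          "pass:", per_version.get("passCount", 0),
          "fail:", per_version.get("failCount", 0))
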
list(parent, filter=None, orderBy=None, pageSize=None, pageToken=None, x__xgafv=None)
Lists all evaluation datasets in the given app.

Args:
  parent: string, Required. The resource name of the app to list evaluation datasets from. (required)
  filter: string, Optional. Filter to be applied when listing the evaluation datasets. See https://google.aip.dev/160 for more details.
  orderBy: string, Optional. Field to sort by. Only "name", "create_time", and "update_time" are supported. Time fields are ordered in descending order, and the name field is ordered in ascending order. If not included, "update_time" will be the default. See https://google.aip.dev/132#ordering for more details.
  pageSize: integer, Optional. Requested page size. Server may return fewer items than requested. If unspecified, server will pick an appropriate default.
  pageToken: string, Optional. The next_page_token value returned from a previous list EvaluationService.ListEvaluationDatasets call.
  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # Response message for EvaluationService.ListEvaluationDatasets.
  "evaluationDatasets": [ # The list of evaluation datasets.
    { # An evaluation dataset represents a set of evaluations that are grouped together based on shared tags.
      "aggregatedMetrics": { # Aggregated metrics for an evaluation or evaluation dataset. # Output only. The aggregated metrics for this evaluation dataset across all runs.
        "metricsByAppVersion": [ # Output only. Aggregated metrics, grouped by app version ID.
          { # Metrics aggregated per app version.
            "appVersionId": "A String", # Output only. The app version ID.
            "failCount": 42, # Output only. The number of times the evaluation failed.
            "hallucinationMetrics": [ # Output only. Metrics for hallucination within this app version.
              { # Metrics for hallucination results.
                "score": 3.14, # Output only. The average hallucination score (0 to 1).
              },
            ],
            "metricsByTurn": [ # Output only. Metrics aggregated per turn within this app version.
              { # Metrics aggregated per turn.
                "hallucinationMetrics": [ # Output only. Metrics for hallucination within this turn.
                  { # Metrics for hallucination results.
                    "score": 3.14, # Output only. The average hallucination score (0 to 1).
                  },
                ],
                "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this turn.
                  { # Metrics for semantic similarity results.
                    "score": 3.14, # Output only. The average semantic similarity score (0-4).
                  },
                ],
                "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this turn.
                  { # Metrics for tool call latency.
                    "averageLatency": "A String", # Output only. The average latency of the tool calls.
                    "tool": "A String", # Output only. The name of the tool.
                  },
                ],
                "toolMetrics": [ # Output only. Metrics for each tool within this turn.
                  { # Metrics for a single tool.
                    "failCount": 42, # Output only. The number of times the tool failed.
                    "passCount": 42, # Output only. The number of times the tool passed.
                    "tool": "A String", # Output only. The name of the tool.
                  },
                ],
                "turnIndex": 42, # Output only. The turn index (0-based).
                "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this turn.
                  { # Metrics for turn latency.
                    "averageLatency": "A String", # Output only. The average latency of the turns.
                  },
                ],
              },
            ],
            "passCount": 42, # Output only. The number of times the evaluation passed.
            "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this app version.
              { # Metrics for semantic similarity results.
                "score": 3.14, # Output only. The average semantic similarity score (0-4).
              },
            ],
            "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this app version.
              { # Metrics for tool call latency.
                "averageLatency": "A String", # Output only. The average latency of the tool calls.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "toolMetrics": [ # Output only. Metrics for each tool within this app version.
              { # Metrics for a single tool.
                "failCount": 42, # Output only. The number of times the tool failed.
                "passCount": 42, # Output only. The number of times the tool passed.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this app version.
              { # Metrics for turn latency.
                "averageLatency": "A String", # Output only. The average latency of the turns.
              },
            ],
          },
        ],
      },
      "createTime": "A String", # Output only. Timestamp when the evaluation dataset was created.
      "createdBy": "A String", # Output only. The user who created the evaluation dataset.
      "displayName": "A String", # Required. User-defined display name of the evaluation dataset. Unique within an App.
      "etag": "A String", # Output only. Etag used to ensure the object hasn't changed during a read-modify-write operation. If the etag is empty, the update will overwrite any concurrent changes.
      "evaluations": [ # Optional. Evaluations that are included in this dataset.
        "A String",
      ],
      "lastUpdatedBy": "A String", # Output only. The user who last updated the evaluation dataset.
      "name": "A String", # Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}`
      "updateTime": "A String", # Output only. Timestamp when the evaluation dataset was last updated.
    },
  ],
  "nextPageToken": "A String", # A token that can be sent as ListEvaluationDatasetsRequest.page_token to retrieve the next page. Absence of this field indicates there are no subsequent pages.
}
list_next(previous_request, previous_response)
Retrieves the next page of results.

Args:
  previous_request: The request for the previous page. (required)
  previous_response: The response from the request for the previous page. (required)

Returns:
  A request object that you can call 'execute()' on to request the next
  page. Returns None if there are no more items in the collection.
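
A pagination sketch combining list() and list_next(), with the same hypothetical service name and IDs. list_next() returns None once the response carries no nextPageToken, which ends the loop.

from googleapiclient import discovery

# Service name/version are assumed; substitute the values from your discovery document.
datasets = (
    discovery.build("geminienterprisecustomerexperience", "v1")
    .projects().locations().apps().evaluationDatasets()
)

parent = "projects/my-project/locations/global/apps/my-app"  # hypothetical
request = datasets.list(parent=parent, orderBy="update_time", pageSize=50)
while request is not None:
    response = request.execute()
    for ds in response.get("evaluationDatasets", []):
        print(ds["name"], ds.get("displayName"))
    request = datasets.list_next(request, response)  # None on the last page
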
        
patch(name, body=None, updateMask=None, x__xgafv=None)
Updates an evaluation dataset.

Args:
  name: string, Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}` (required)
  body: object, The request body.
    The object takes the form of:

{ # An evaluation dataset represents a set of evaluations that are grouped together based on shared tags.
  "aggregatedMetrics": { # Aggregated metrics for an evaluation or evaluation dataset. # Output only. The aggregated metrics for this evaluation dataset across all runs.
    "metricsByAppVersion": [ # Output only. Aggregated metrics, grouped by app version ID.
      { # Metrics aggregated per app version.
        "appVersionId": "A String", # Output only. The app version ID.
        "failCount": 42, # Output only. The number of times the evaluation failed.
        "hallucinationMetrics": [ # Output only. Metrics for hallucination within this app version.
          { # Metrics for hallucination results.
            "score": 3.14, # Output only. The average hallucination score (0 to 1).
          },
        ],
        "metricsByTurn": [ # Output only. Metrics aggregated per turn within this app version.
          { # Metrics aggregated per turn.
            "hallucinationMetrics": [ # Output only. Metrics for hallucination within this turn.
              { # Metrics for hallucination results.
                "score": 3.14, # Output only. The average hallucination score (0 to 1).
              },
            ],
            "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this turn.
              { # Metrics for semantic similarity results.
                "score": 3.14, # Output only. The average semantic similarity score (0-4).
              },
            ],
            "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this turn.
              { # Metrics for tool call latency.
                "averageLatency": "A String", # Output only. The average latency of the tool calls.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "toolMetrics": [ # Output only. Metrics for each tool within this turn.
              { # Metrics for a single tool.
                "failCount": 42, # Output only. The number of times the tool failed.
                "passCount": 42, # Output only. The number of times the tool passed.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "turnIndex": 42, # Output only. The turn index (0-based).
            "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this turn.
              { # Metrics for turn latency.
                "averageLatency": "A String", # Output only. The average latency of the turns.
              },
            ],
          },
        ],
        "passCount": 42, # Output only. The number of times the evaluation passed.
        "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this app version.
          { # Metrics for semantic similarity results.
            "score": 3.14, # Output only. The average semantic similarity score (0-4).
          },
        ],
        "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this app version.
          { # Metrics for tool call latency.
            "averageLatency": "A String", # Output only. The average latency of the tool calls.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "toolMetrics": [ # Output only. Metrics for each tool within this app version.
          { # Metrics for a single tool.
            "failCount": 42, # Output only. The number of times the tool failed.
            "passCount": 42, # Output only. The number of times the tool passed.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this app version.
          { # Metrics for turn latency.
            "averageLatency": "A String", # Output only. The average latency of the turns.
          },
        ],
      },
    ],
  },
  "createTime": "A String", # Output only. Timestamp when the evaluation dataset was created.
  "createdBy": "A String", # Output only. The user who created the evaluation dataset.
  "displayName": "A String", # Required. User-defined display name of the evaluation dataset. Unique within an App.
  "etag": "A String", # Output only. Etag used to ensure the object hasn't changed during a read-modify-write operation. If the etag is empty, the update will overwrite any concurrent changes.
  "evaluations": [ # Optional. Evaluations that are included in this dataset.
    "A String",
  ],
  "lastUpdatedBy": "A String", # Output only. The user who last updated the evaluation dataset.
  "name": "A String", # Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}`
  "updateTime": "A String", # Output only. Timestamp when the evaluation dataset was last updated.
}

  updateMask: string, Optional. Field mask is used to control which fields get updated. If the mask is not present, all fields will be updated.
  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # An evaluation dataset represents a set of evaluations that are grouped together based on shared tags.
  "aggregatedMetrics": { # Aggregated metrics for an evaluation or evaluation dataset. # Output only. The aggregated metrics for this evaluation dataset across all runs.
    "metricsByAppVersion": [ # Output only. Aggregated metrics, grouped by app version ID.
      { # Metrics aggregated per app version.
        "appVersionId": "A String", # Output only. The app version ID.
        "failCount": 42, # Output only. The number of times the evaluation failed.
        "hallucinationMetrics": [ # Output only. Metrics for hallucination within this app version.
          { # Metrics for hallucination results.
            "score": 3.14, # Output only. The average hallucination score (0 to 1).
          },
        ],
        "metricsByTurn": [ # Output only. Metrics aggregated per turn within this app version.
          { # Metrics aggregated per turn.
            "hallucinationMetrics": [ # Output only. Metrics for hallucination within this turn.
              { # Metrics for hallucination results.
                "score": 3.14, # Output only. The average hallucination score (0 to 1).
              },
            ],
            "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this turn.
              { # Metrics for semantic similarity results.
                "score": 3.14, # Output only. The average semantic similarity score (0-4).
              },
            ],
            "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this turn.
              { # Metrics for tool call latency.
                "averageLatency": "A String", # Output only. The average latency of the tool calls.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "toolMetrics": [ # Output only. Metrics for each tool within this turn.
              { # Metrics for a single tool.
                "failCount": 42, # Output only. The number of times the tool failed.
                "passCount": 42, # Output only. The number of times the tool passed.
                "tool": "A String", # Output only. The name of the tool.
              },
            ],
            "turnIndex": 42, # Output only. The turn index (0-based).
            "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this turn.
              { # Metrics for turn latency.
                "averageLatency": "A String", # Output only. The average latency of the turns.
              },
            ],
          },
        ],
        "passCount": 42, # Output only. The number of times the evaluation passed.
        "semanticSimilarityMetrics": [ # Output only. Metrics for semantic similarity within this app version.
          { # Metrics for semantic similarity results.
            "score": 3.14, # Output only. The average semantic similarity score (0-4).
          },
        ],
        "toolCallLatencyMetrics": [ # Output only. Metrics for tool call latency within this app version.
          { # Metrics for tool call latency.
            "averageLatency": "A String", # Output only. The average latency of the tool calls.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "toolMetrics": [ # Output only. Metrics for each tool within this app version.
          { # Metrics for a single tool.
            "failCount": 42, # Output only. The number of times the tool failed.
            "passCount": 42, # Output only. The number of times the tool passed.
            "tool": "A String", # Output only. The name of the tool.
          },
        ],
        "turnLatencyMetrics": [ # Output only. Metrics for turn latency within this app version.
          { # Metrics for turn latency.
            "averageLatency": "A String", # Output only. The average latency of the turns.
          },
        ],
      },
    ],
  },
  "createTime": "A String", # Output only. Timestamp when the evaluation dataset was created.
  "createdBy": "A String", # Output only. The user who created the evaluation dataset.
  "displayName": "A String", # Required. User-defined display name of the evaluation dataset. Unique within an App.
  "etag": "A String", # Output only. Etag used to ensure the object hasn't changed during a read-modify-write operation. If the etag is empty, the update will overwrite any concurrent changes.
  "evaluations": [ # Optional. Evaluations that are included in this dataset.
    "A String",
  ],
  "lastUpdatedBy": "A String", # Output only. The user who last updated the evaluation dataset.
  "name": "A String", # Identifier. The unique identifier of this evaluation dataset. Format: `projects/{project}/locations/{location}/apps/{app}/evaluationDatasets/{evaluationDataset}`
  "updateTime": "A String", # Output only. Timestamp when the evaluation dataset was last updated.
}
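
An update sketch, hypothetical service name and IDs as before. updateMask limits the write to the named fields, so output-only fields and anything not listed are left untouched.

from googleapiclient import discovery

# Service name/version are assumed; substitute the values from your discovery document.
datasets = (
    discovery.build("geminienterprisecustomerexperience", "v1")
    .projects().locations().apps().evaluationDatasets()
)

name = ("projects/my-project/locations/global/apps/my-app"
        "/evaluationDatasets/regression-suite")  # hypothetical
updated = datasets.patch(
    name=name,
    updateMask="displayName",  # only this field is written
    body={"displayName": "Regression suite v2"},
).execute()
print(updated["updateTime"])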