Spaces:

kdamevski
/

conversational_uav_explorer

Runtime error

App Files Files Community

conversational_uav_explorer / myenv /Lib /site-packages /argilla /client /api.py

kdamevski

Upload folder using huggingface_hub

1c60c6e almost 2 years ago

raw

history blame contribute delete

26.7 kB

	# coding=utf-8
	# Copyright 2021-present, the Recognai S.L. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import asyncio
	import logging
	import os
	import re
	import warnings
	from asyncio import Future
	from functools import wraps
	from inspect import signature
	from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

	from tqdm.auto import tqdm

	from argilla._constants import (
	_OLD_WORKSPACE_HEADER_NAME,
	DATASET_NAME_REGEX_PATTERN,
	DEFAULT_API_KEY,
	WORKSPACE_HEADER_NAME,
	)
	from argilla.client.apis.datasets import Datasets
	from argilla.client.apis.metrics import MetricsAPI
	from argilla.client.apis.searches import Searches
	from argilla.client.datasets import (
	Dataset,
	DatasetForText2Text,
	DatasetForTextClassification,
	DatasetForTokenClassification,
	)
	from argilla.client.metrics.models import MetricResults
	from argilla.client.models import (
	BulkResponse,
	Record,
	Text2TextRecord,
	TextClassificationRecord,
	TextGenerationRecord,
	TokenClassificationRecord,
	)
	from argilla.client.sdk.client import AuthenticatedClient
	from argilla.client.sdk.commons.api import async_bulk
	from argilla.client.sdk.commons.errors import BaseClientError
	from argilla.client.sdk.datasets import api as datasets_api
	from argilla.client.sdk.datasets.models import CopyDatasetRequest, TaskType
	from argilla.client.sdk.metrics import api as metrics_api
	from argilla.client.sdk.metrics.models import MetricInfo
	from argilla.client.sdk.text2text import api as text2text_api
	from argilla.client.sdk.text2text.models import (
	CreationText2TextRecord,
	Text2TextBulkData,
	Text2TextQuery,
	)
	from argilla.client.sdk.text_classification import api as text_classification_api
	from argilla.client.sdk.text_classification.models import (
	CreationTextClassificationRecord,
	LabelingRule,
	LabelingRuleMetricsSummary,
	TextClassificationBulkData,
	TextClassificationQuery,
	)
	from argilla.client.sdk.token_classification import api as token_classification_api
	from argilla.client.sdk.token_classification.models import (
	CreationTokenClassificationRecord,
	TokenClassificationBulkData,
	TokenClassificationQuery,
	)
	from argilla.client.sdk.users import api as users_api
	from argilla.client.sdk.users.models import User
	from argilla.utils import setup_loop_in_thread

	# Module-level logger, named after this module via ``__name__``.
	_LOGGER = logging.getLogger(__name__)


	class _ArgillaLogAgent:
	def __init__(self, api: "Api"):
	self.__api__ = api
	self.__loop__, self.__thread__ = setup_loop_in_thread()

	@staticmethod
	async def __log_internal__(api: "Api", args, *kwargs):
	try:
	return await api.log_async(args, *kwargs)
	except Exception as ex:
	_LOGGER.error(
	f"Cannot log data {args, kwargs}\n"
	f"Error of type {type(ex)}\n: {ex}. ({ex.args})"
	)
	raise ex

	def log(self, args, *kwargs) -> Future:
	return asyncio.run_coroutine_threadsafe(
	self.__log_internal__(self.__api__, args, *kwargs), self.__loop__
	)


	class Api:
	    """Python client for an argilla server.

	    Wraps an ``AuthenticatedClient`` and exposes the dataset operations used
	    by the module-level helpers: logging, loading, copying and deleting
	    datasets/records, metrics, and labeling rules.
	    """

	    # Larger sizes will trigger a warning
	    _MAX_CHUNK_SIZE = 5000

	    def __init__(
	        self,
	        api_url: Optional[str] = None,
	        api_key: Optional[str] = None,
	        workspace: Optional[str] = None,
	        timeout: int = 60,
	        extra_headers: Optional[Dict[str, str]] = None,
	    ):
	        """Init the Python client.

	        We will automatically init a default client for you when calling other client methods.
	        The arguments provided here will overwrite your corresponding environment variables.

	        Args:
	            api_url: Address of the REST API. If `None` (default) and the env variable ``ARGILLA_API_URL`` is not set,
	                it will default to `http://localhost:6900`.
	            api_key: Authentication key for the REST API. If `None` (default) and the env variable ``ARGILLA_API_KEY``
	                is not set, it will default to `argilla.apikey`.
	            workspace: The workspace to which records will be logged/loaded. If `None` (default) and the
	                env variable ``ARGILLA_WORKSPACE`` is not set, it will default to the private user workspace.
	            timeout: Wait `timeout` seconds for the connection to timeout. Default: 60.
	            extra_headers: Extra HTTP headers sent to the server. You can use this to customize
	                the headers of argilla client requests, like additional security restrictions. Default: `None`.

	        Examples:
	            >>> import argilla as rg
	            >>> rg.init(api_url="http://localhost:9090", api_key="4AkeAPIk3Y")
	            >>> # Customizing request headers
	            >>> headers = {"X-Client-id":"id","X-Secret":"secret"}
	            >>> rg.init(api_url="http://localhost:9090", api_key="4AkeAPIk3Y", extra_headers=headers)

	        """
	        api_url = api_url or os.getenv("ARGILLA_API_URL", "http://localhost:6900")
	        # Checking that the api_url does not end in '/'
	        api_url = re.sub(r"\/$", "", api_url)
	        api_key = api_key or os.getenv("ARGILLA_API_KEY", DEFAULT_API_KEY)
	        workspace = workspace or os.getenv("ARGILLA_WORKSPACE")
	        headers = extra_headers or {}

	        self._client: AuthenticatedClient = AuthenticatedClient(
	            base_url=api_url,
	            token=api_key,
	            timeout=timeout,
	            # Copy so later header changes never touch the caller's dict.
	            headers=headers.copy(),
	        )

	        # Resolve the current user through the users API (``whoami``).
	        self._user: User = users_api.whoami(client=self._client)
	        if workspace is not None:
	            self.set_workspace(workspace)

	        self._agent = _ArgillaLogAgent(self)

	    def __del__(self):
	        # ``__init__`` may have failed half-way, so only drop what exists.
	        if hasattr(self, "_client"):
	            del self._client
	        if hasattr(self, "_agent"):
	            del self._agent

	    @property
	    def client(self):
	        """The underlying authenticated client"""
	        return self._client

	    @property
	    def datasets(self) -> Datasets:
	        """A ``Datasets`` API helper bound to the underlying client."""
	        return Datasets(client=self._client)

	    @property
	    def searches(self):
	        """A ``Searches`` API helper bound to the underlying client."""
	        return Searches(client=self._client)

	    @property
	    def metrics(self):
	        """A ``MetricsAPI`` helper bound to the underlying client."""
	        return MetricsAPI(client=self.client)

	    def set_workspace(self, workspace: str):
	        """Sets the active workspace.

	        Args:
	            workspace: The new workspace

	        Raises:
	            Exception: If ``workspace`` is `None`, or if the user has a list
	                of workspaces and ``workspace`` is not part of it.
	        """
	        if workspace is None:
	            raise Exception("Must provide a workspace")

	        if workspace != self.get_workspace():
	            if workspace == self._user.username:
	                # NOTE(review): this pop is immediately overwritten by the
	                # assignment below; kept as-is to preserve behavior.
	                self._client.headers.pop(WORKSPACE_HEADER_NAME, workspace)
	            elif (
	                self._user.workspaces is not None
	                and workspace not in self._user.workspaces
	            ):
	                raise Exception(f"Wrong provided workspace {workspace}")
	            self._client.headers[WORKSPACE_HEADER_NAME] = workspace
	            self._client.headers[_OLD_WORKSPACE_HEADER_NAME] = workspace

	    def get_workspace(self) -> str:
	        """Returns the name of the active workspace.

	        Returns:
	            The name of the active workspace as a string. Falls back to the
	            username when no workspace header is set on the client.
	        """
	        return self._client.headers.get(WORKSPACE_HEADER_NAME, self._user.username)

	    def copy(self, dataset: str, name_of_copy: str, workspace: Optional[str] = None):
	        """Creates a copy of a dataset including its tags and metadata

	        Args:
	            dataset: Name of the source dataset
	            name_of_copy: Name of the copied dataset
	            workspace: If provided, dataset will be copied to that workspace

	        Examples:
	            >>> import argilla as rg
	            >>> rg.copy("my_dataset", name_of_copy="new_dataset")
	            >>> rg.load("new_dataset")
	        """
	        datasets_api.copy_dataset(
	            client=self._client,
	            name=dataset,
	            json_body=CopyDatasetRequest(name=name_of_copy, target_workspace=workspace),
	        )

	    def delete(self, name: str) -> None:
	        """Deletes a dataset.

	        Args:
	            name: The dataset name.

	        Examples:
	            >>> import argilla as rg
	            >>> rg.delete(name="example-dataset")
	        """
	        datasets_api.delete_dataset(client=self._client, name=name)

	    def log(
	        self,
	        records: Union[Record, Iterable[Record], Dataset],
	        name: str,
	        tags: Optional[Dict[str, str]] = None,
	        metadata: Optional[Dict[str, Any]] = None,
	        chunk_size: int = 500,
	        verbose: bool = True,
	        background: bool = False,
	    ) -> Union[BulkResponse, Future]:
	        """Logs Records to argilla.

	        The logging happens asynchronously in a background thread.

	        Args:
	            records: The record, an iterable of records, or a dataset to log.
	            name: The dataset name.
	            tags: A dictionary of tags related to the dataset.
	            metadata: A dictionary of extra info for the dataset.
	            chunk_size: The chunk size for a data bulk.
	            verbose: If True, shows a progress bar and prints out a quick summary at the end.
	            background: If True, we will NOT wait for the logging process to finish and return an ``asyncio.Future``
	                object. You probably want to set ``verbose`` to False in that case.

	        Returns:
	            Summary of the response from the REST API.
	            If the ``background`` argument is set to True, an ``asyncio.Future`` will be returned instead.

	        Examples:
	            >>> import argilla as rg
	            >>> record = rg.TextClassificationRecord(
	            ...     text="my first argilla example",
	            ...     prediction=[('spam', 0.8), ('ham', 0.2)]
	            ... )
	            >>> rg.log(record, name="example-dataset")
	            1 records logged to http://localhost:6900/datasets/argilla/example-dataset
	            BulkResponse(dataset='example-dataset', processed=1, failed=0)
	            >>>
	            >>> # Logging records in the background
	            >>> rg.log(record, name="example-dataset", background=True, verbose=False)
	            <Future at 0x7f675a1fffa0 state=pending>
	        """
	        future = self._agent.log(
	            records=records,
	            name=name,
	            tags=tags,
	            metadata=metadata,
	            chunk_size=chunk_size,
	            verbose=verbose,
	        )
	        if background:
	            return future

	        try:
	            # Block until the background coroutine finishes (or raises).
	            return future.result()
	        finally:
	            future.cancel()

	    async def log_async(
	        self,
	        records: Union[Record, Iterable[Record], Dataset],
	        name: str,
	        tags: Optional[Dict[str, str]] = None,
	        metadata: Optional[Dict[str, Any]] = None,
	        chunk_size: int = 500,
	        verbose: bool = True,
	    ) -> BulkResponse:
	        """Logs Records to argilla with asyncio.

	        Args:
	            records: The record, an iterable of records, or a dataset to log.
	            name: The dataset name.
	            tags: A dictionary of tags related to the dataset.
	            metadata: A dictionary of extra info for the dataset.
	            chunk_size: The chunk size for a data bulk.
	            verbose: If True, shows a progress bar and prints out a quick summary at the end.

	        Returns:
	            Summary of the response from the REST API

	        Raises:
	            InputValueError: If the dataset name is empty or does not match
	                ``DATASET_NAME_REGEX_PATTERN``, if no records are given, or
	                if the type of the first record is not supported.

	        Examples:
	            >>> # Log asynchronously from your notebook
	            >>> import asyncio
	            >>> import argilla as rg
	            >>> from argilla.utils import setup_loop_in_thread
	            >>> loop, _ = setup_loop_in_thread()
	            >>> future_response = asyncio.run_coroutine_threadsafe(
	            ...     rg.log_async(my_records, dataset_name), loop
	            ... )
	        """
	        tags = tags or {}
	        metadata = metadata or {}

	        if not name:
	            raise InputValueError("Empty dataset name has been passed as argument.")

	        if not re.match(DATASET_NAME_REGEX_PATTERN, name):
	            raise InputValueError(
	                f"Provided dataset name {name} does not match the pattern"
	                f" {DATASET_NAME_REGEX_PATTERN}. Please, use a valid name for your"
	                " dataset. This limitation is caused by naming conventions for indexes"
	                " in Elasticsearch."
	                " https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html"
	            )

	        if chunk_size > self._MAX_CHUNK_SIZE:
	            # Explicit newline instead of a triple-quoted literal, so that
	            # source indentation does not leak into the log message.
	            _LOGGER.warning(
	                "The introduced chunk size is noticeably large, timeout errors may occur.\n"
	                "Consider a chunk size smaller than %s",
	                self._MAX_CHUNK_SIZE,
	            )

	        # A single record is wrapped so the rest of the code sees a list.
	        if isinstance(records, Record.__args__):
	            records = [records]
	        records = list(records)

	        if not records:
	            raise InputValueError("Empty record list has been passed as argument.")

	        # The first record decides the bulk/creation classes for the batch.
	        record_type = type(records[0])

	        if record_type is TextClassificationRecord:
	            bulk_class = TextClassificationBulkData
	            creation_class = CreationTextClassificationRecord
	        elif record_type is TokenClassificationRecord:
	            bulk_class = TokenClassificationBulkData
	            creation_class = CreationTokenClassificationRecord
	        elif (record_type is Text2TextRecord) or (record_type is TextGenerationRecord):
	            bulk_class = Text2TextBulkData
	            creation_class = CreationText2TextRecord
	        else:
	            raise InputValueError(
	                f"Unknown record type {record_type}. Available values are"
	                f" {Record.__args__}"
	            )

	        processed, failed = 0, 0
	        # The context manager closes the bar even if a bulk request raises.
	        with tqdm(total=len(records), disable=not verbose) as progress_bar:
	            for i in range(0, len(records), chunk_size):
	                chunk = records[i : i + chunk_size]

	                response = await async_bulk(
	                    client=self._client,
	                    name=name,
	                    json_body=bulk_class(
	                        tags=tags,
	                        metadata=metadata,
	                        records=[creation_class.from_client(r) for r in chunk],
	                    ),
	                )

	                processed += response.parsed.processed
	                failed += response.parsed.failed

	                progress_bar.update(len(chunk))

	        # TODO: improve logging policy in library
	        if verbose:
	            _LOGGER.info(
	                f"Processed {processed} records in dataset {name}. Failed: {failed}"
	            )
	            workspace = self.get_workspace()
	            # Just for backward comp. with datasets with no workspaces
	            if not workspace:
	                workspace = "-"
	            print(
	                f"{processed} records logged to"
	                f" {self._client.base_url}/datasets/{workspace}/{name}"
	            )

	        # Creating a composite BulkResponse with the total processed and failed
	        return BulkResponse(dataset=name, processed=processed, failed=failed)

	    def delete_records(
	        self,
	        name: str,
	        query: Optional[str] = None,
	        ids: Optional[List[Union[str, int]]] = None,
	        discard_only: bool = False,
	        discard_when_forbidden: bool = True,
	    ) -> Tuple[int, int]:
	        """Delete records from an argilla dataset.

	        Args:
	            name: The dataset name.
	            query: An ElasticSearch query with the `query string syntax
	                <https://rubrix.readthedocs.io/en/stable/guides/queries.html>`_
	            ids: If provided, deletes dataset records with given ids.
	            discard_only: If `True`, matched records won't be deleted. Instead, they will be marked as `Discarded`
	            discard_when_forbidden: Only super-user or dataset creator can delete records from a dataset.
	                So, running "hard" deletion for other users will raise a `ForbiddenApiError` error.
	                If this parameter is `True`, the client API will automatically try to mark as ``Discarded``
	                records instead. Default, `True`

	        Returns:
	            The total of matched records and real number of processed errors. These numbers may
	            differ if some data conflicts are found during operations (some matched records change during
	            deletion).

	        Examples:
	            >>> ## Delete by id
	            >>> import argilla as rg
	            >>> rg.delete_records(name="example-dataset", ids=[1,3,5])
	            >>> ## Discard records by query
	            >>> import argilla as rg
	            >>> rg.delete_records(name="example-dataset", query="metadata.code=33", discard_only=True)
	        """
	        return self.datasets.delete_records(
	            name=name,
	            query=query,
	            ids=ids,
	            mark_as_discarded=discard_only,
	            discard_when_forbidden=discard_when_forbidden,
	        )

	    def load(
	        self,
	        name: str,
	        query: Optional[str] = None,
	        ids: Optional[List[Union[str, int]]] = None,
	        limit: Optional[int] = None,
	        id_from: Optional[str] = None,
	        as_pandas=None,
	    ) -> Dataset:
	        """Loads an argilla dataset.

	        Parameters:
	        -----------
	        name:
	            The dataset name.
	        query:
	            An ElasticSearch query with the
	            `query string syntax <https://argilla.readthedocs.io/en/stable/guides/queries.html>`_
	        ids:
	            If provided, load dataset records with given ids.
	        limit:
	            The number of records to retrieve.
	        id_from:
	            If provided, starts gathering the records starting from that Record. As the Records returned with the
	            load method are sorted by ID, `id_from` can be used to load using batches.
	        as_pandas:
	            DEPRECATED! To get a pandas DataFrame do ``rg.load('my_dataset').to_pandas()``.

	        Returns:
	        --------
	            An argilla dataset.

	        Raises:
	        -------
	            ValueError: If ``as_pandas`` is `True`, or if the task of the
	            dataset is not one of the supported tasks.

	        Examples:
	            Basic Loading: load the samples sorted by their ID
	            >>> import argilla as rg
	            >>> dataset = rg.load(name="example-dataset")

	            Iterate over a large dataset:
	            When dealing with a large dataset you might want to load it in batches to optimize memory consumption
	            and avoid network timeouts. To that end, a simple batch-iteration over the whole database can be done
	            employing the `id_from` parameter. This parameter will act as a delimiter, retrieving the N items after
	            the given id, where N is determined by the `limit` parameter. NOTE If
	            no `limit` is given the whole dataset after that ID will be retrieved.

	            >>> import argilla as rg
	            >>> dataset_batch_1 = rg.load(name="example-dataset", limit=1000)
	            >>> dataset_batch_2 = rg.load(name="example-dataset", limit=1000, id_from=dataset_batch_1[-1].id)

	        """
	        if as_pandas is False:
	            warnings.warn(
	                "The argument `as_pandas` is deprecated and will be removed in a future"
	                " version. Please adapt your code accordingly. ",
	                FutureWarning,
	            )
	        elif as_pandas is True:
	            # One single message: a stray comma previously turned this into a
	            # two-argument ValueError that rendered as a tuple.
	            raise ValueError(
	                "The argument `as_pandas` is deprecated and will be removed in a future"
	                " version. Please adapt your code accordingly. "
	                "If you want a pandas DataFrame do"
	                " `rg.load('my_dataset').to_pandas()`."
	            )

	        response = datasets_api.get_dataset(client=self._client, name=name)
	        task = response.parsed.task

	        # task -> (data endpoint, query model, client dataset class)
	        task_config = {
	            TaskType.text_classification: (
	                text_classification_api.data,
	                TextClassificationQuery,
	                DatasetForTextClassification,
	            ),
	            TaskType.token_classification: (
	                token_classification_api.data,
	                TokenClassificationQuery,
	                DatasetForTokenClassification,
	            ),
	            TaskType.text2text: (
	                text2text_api.data,
	                Text2TextQuery,
	                DatasetForText2Text,
	            ),
	        }

	        try:
	            get_dataset_data, request_class, dataset_class = task_config[task]
	        except KeyError:
	            raise ValueError(
	                f"Load method not supported for the '{task}' task. Supported Tasks: "
	                f"{[TaskType.text_classification, TaskType.token_classification, TaskType.text2text]}"
	            )
	        response = get_dataset_data(
	            client=self._client,
	            name=name,
	            request=request_class(ids=ids, query_text=query),
	            limit=limit,
	            id_from=id_from,
	        )

	        records = [sdk_record.to_client() for sdk_record in response.parsed]
	        try:
	            records_sorted_by_id = sorted(records, key=lambda x: x.id)
	        # record ids can be a mix of int/str -> sort all as str type
	        except TypeError:
	            records_sorted_by_id = sorted(records, key=lambda x: str(x.id))

	        return dataset_class(records_sorted_by_id)

	    def dataset_metrics(self, name: str) -> List[MetricInfo]:
	        """Returns the metrics reported by the metrics API for a dataset.

	        The dataset is fetched first so its task can be passed along.

	        Args:
	            name: The dataset name.
	        """
	        response = datasets_api.get_dataset(self._client, name)
	        response = metrics_api.get_dataset_metrics(
	            self._client, name=name, task=response.parsed.task
	        )

	        return response.parsed

	    def get_metric(self, name: str, metric: str) -> Optional[MetricInfo]:
	        """Returns the dataset metric whose id equals ``metric``, if any.

	        Args:
	            name: The dataset name.
	            metric: The metric id to look for.

	        Returns:
	            The matching metric, or `None` when no metric has that id.
	        """
	        metrics = self.dataset_metrics(name)
	        for metric_ in metrics:
	            if metric_.id == metric:
	                return metric_
	        return None

	    def compute_metric(
	        self,
	        name: str,
	        metric: str,
	        query: Optional[str] = None,
	        interval: Optional[float] = None,
	        size: Optional[int] = None,
	    ) -> MetricResults:
	        """Computes a metric for a dataset.

	        Args:
	            name: The dataset name.
	            metric: The id of the metric to compute.
	            query: Forwarded to the metrics API as ``query``.
	            interval: Forwarded to the metrics API as ``interval``.
	            size: Forwarded to the metrics API as ``size``.

	        Returns:
	            A ``MetricResults`` built from the metric info plus the computed
	            results.
	        """
	        response = datasets_api.get_dataset(self._client, name)

	        metric_ = self.get_metric(name, metric=metric)
	        # NOTE(review): this check disappears under ``python -O``; kept as an
	        # assert so callers still see an ``AssertionError``.
	        assert metric_ is not None, f"Metric {metric} not found !!!"

	        response = metrics_api.compute_metric(
	            self._client,
	            name=name,
	            task=response.parsed.task,
	            metric=metric,
	            query=query,
	            interval=interval,
	            size=size,
	        )

	        return MetricResults(**metric_.dict(), results=response.parsed)

	    def add_dataset_labeling_rules(self, dataset: str, rules: List[LabelingRule]):
	        """Adds the dataset labeling rules"""
	        for rule in rules:
	            text_classification_api.add_dataset_labeling_rule(
	                self._client,
	                name=dataset,
	                rule=rule,
	            )

	    def update_dataset_labeling_rules(self, dataset: str, rules: List[LabelingRule]):
	        """Updates the dataset labeling rules"""
	        for rule in rules:
	            text_classification_api.update_dataset_labeling_rule(
	                self._client, name=dataset, rule=rule
	            )

	    def delete_dataset_labeling_rules(self, dataset: str, rules: List[LabelingRule]):
	        """Deletes the dataset labeling rules"""
	        for rule in rules:
	            text_classification_api.delete_dataset_labeling_rule(
	                self._client, name=dataset, rule=rule
	            )

	    def fetch_dataset_labeling_rules(self, dataset: str) -> List[LabelingRule]:
	        """Fetches the dataset labeling rules as ``LabelingRule`` objects"""
	        response = text_classification_api.fetch_dataset_labeling_rules(
	            self._client, name=dataset
	        )

	        return [LabelingRule.parse_obj(data) for data in response.parsed]

	    def rule_metrics_for_dataset(
	        self, dataset: str, rule: LabelingRule
	    ) -> LabelingRuleMetricsSummary:
	        """Fetches the metrics summary of one labeling rule on a dataset.

	        The rule is identified by its ``query`` and ``label``.
	        """
	        response = text_classification_api.dataset_rule_metrics(
	            self._client, name=dataset, query=rule.query, label=rule.label
	        )

	        return LabelingRuleMetricsSummary.parse_obj(response.parsed)


	__ACTIVE_API__: Optional[Api] = None


	def active_api() -> Api:
	    """Return the module-wide active ``Api``.

	    When no API has been set yet, a default ``Api()`` is created, stored as
	    the active one, and returned.
	    """
	    global __ACTIVE_API__
	    current = __ACTIVE_API__
	    if current is None:
	        current = Api()
	        __ACTIVE_API__ = current
	    return current


	def api_wrapper(api_method: Callable):
	    """Decorator to wrap the API methods in module functions.

	    Propagates the docstrings and adapts the signature of the methods.

	    Args:
	        api_method: The method whose metadata (name, docstring, signature
	            without ``self``) is copied onto the decorated function.

	    Returns:
	        A decorator. The function it produces forwards all positional and
	        keyword arguments to the decorated function, and is a coroutine
	        function whenever ``api_method`` is one.
	    """

	    def decorator(func):
	        if asyncio.iscoroutinefunction(api_method):

	            @wraps(api_method)
	            async def wrapped_func(*args, **kwargs):
	                return await func(*args, **kwargs)

	        else:

	            @wraps(api_method)
	            def wrapped_func(*args, **kwargs):
	                return func(*args, **kwargs)

	        # Advertise the method's signature minus ``self``, since the module
	        # function is called without an instance.
	        sign = signature(api_method)
	        wrapped_func.__signature__ = sign.replace(
	            parameters=[val for key, val in sign.parameters.items() if key != "self"]
	        )
	        return wrapped_func

	    return decorator


	# Replaces the module-level active API with a new ``Api`` built from the
	# given arguments; docstring and signature come from ``Api.__init__``.
	@api_wrapper(Api.__init__)
	def init(*args, **kwargs):
	    global __ACTIVE_API__
	    __ACTIVE_API__ = Api(*args, **kwargs)


	# Module-level shortcut: forwards to ``set_workspace`` on the active API.
	@api_wrapper(Api.set_workspace)
	def set_workspace(*args, **kwargs):
	    return active_api().set_workspace(*args, **kwargs)


	# Module-level shortcut: forwards to ``get_workspace`` on the active API.
	@api_wrapper(Api.get_workspace)
	def get_workspace(*args, **kwargs):
	    return active_api().get_workspace(*args, **kwargs)


	# Module-level shortcut: forwards to ``copy`` on the active API.
	@api_wrapper(Api.copy)
	def copy(*args, **kwargs):
	    return active_api().copy(*args, **kwargs)


	# Module-level shortcut: forwards to ``delete`` on the active API.
	@api_wrapper(Api.delete)
	def delete(*args, **kwargs):
	    return active_api().delete(*args, **kwargs)


	# Module-level shortcut: forwards to ``log`` on the active API.
	@api_wrapper(Api.log)
	def log(*args, **kwargs):
	    return active_api().log(*args, **kwargs)


	# Module-level shortcut: returns the coroutine created by ``log_async`` on
	# the active API; the ``api_wrapper`` decorator awaits it.
	@api_wrapper(Api.log_async)
	def log_async(*args, **kwargs):
	    return active_api().log_async(*args, **kwargs)


	# Module-level shortcut: forwards to ``load`` on the active API.
	@api_wrapper(Api.load)
	def load(*args, **kwargs):
	    return active_api().load(*args, **kwargs)


	# Module-level shortcut: forwards to ``delete_records`` on the active API.
	@api_wrapper(Api.delete_records)
	def delete_records(*args, **kwargs):
	    return active_api().delete_records(*args, **kwargs)


	class InputValueError(BaseClientError):
	    """Client error raised when an argument passed to the API is invalid.

	    ``Api.log_async`` raises it for an empty or malformed dataset name, an
	    empty record list, or an unknown record type.
	    """