Models

TransformerTokenizer

Bases: Tokenizer

Represents a tokenizer for models in the transformers library.

Source code in outlines/models/transformers.py
class TransformerTokenizer(Tokenizer):
    """Represents a tokenizer for models in the `transformers` library."""

    def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token

        if not self.tokenizer.pad_token_id:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple["torch.LongTensor", "torch.LongTensor"]:
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: "torch.LongTensor") -> List[str]:
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        if self.is_llama:
            # A hack to handle missing spaces in HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

        return string

    def __eq__(self, other):
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __hash__(self):
        return hash(Hasher.hash(self.tokenizer))

    def __getstate__(self):
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        self.__init__(state["tokenizer"])
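
A minimal usage sketch (not part of the library source), assuming the transformers library is installed and using "gpt2" purely as an illustrative checkpoint; in practice the wrapper is created for you by the transformers() loader documented below.

from transformers import AutoTokenizer

from outlines.models.transformers import TransformerTokenizer

# Wrap a Hugging Face tokenizer; the pad token falls back to EOS when missing.
hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TransformerTokenizer(hf_tokenizer)

# encode() always pads and returns PyTorch tensors.
input_ids, attention_mask = tokenizer.encode(["Hello world", "Hi"])

# decode() maps token ids back to strings, skipping special tokens.
print(tokenizer.decode(input_ids))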

Transformers

Represents a transformers model.

Source code in outlines/models/transformers.py
class Transformers:
    """Represents a `transformers` model."""

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
    ):
        self.model = model
        self.tokenizer = TransformerTokenizer(tokenizer)

    def forward(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
        """Compute a forward pass through the transformer model.

        Parameters
        ----------
        input_ids
            The input token ids.  Must be one or two dimensional.
        attention_mask
            The attention mask.  Must be one or two dimensional.
        past_key_values
            A tuple of tuples containing the cached key and value tensors for each
            attention head.

        Returns
        -------
        The computed logits and the new cached key and value tensors.

        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "The `torch` library needs to be installed to use `transformers` models."
            )
        assert 0 < input_ids.ndim < 3

        if past_key_values:
            input_ids = input_ids[..., -1].unsqueeze(-1)

        with torch.inference_mode():
            output = self.model(
                input_ids,
                attention_mask=attention_mask,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
                past_key_values=past_key_values,
            )

        return output.logits, output.past_key_values

    def __call__(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> "torch.FloatTensor":
        logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values)
        next_token_logits = logits[..., -1, :]

        return next_token_logits, kv_cache

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using `transformers`.

        Arguments
        ---------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text
        """
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)

        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        if (
            "attention_mask"
            not in inspect.signature(self.model.forward).parameters.keys()
        ):
            del inputs["attention_mask"]

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Iterator[Union[str, List[str]]]:
        """
        Temporary stream stand-in which implements stream() signature
        and equivalent behaviour but isn't yielded until generation completes.

        TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
        """
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)
        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        if (
            "attention_mask"
            not in inspect.signature(self.model.forward).parameters.keys()
        ):
            del inputs["attention_mask"]

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        for i in range(generated_ids.size(-1)):
            output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
            yield self._decode_generation(output_group_ids)

    def _get_generation_kwargs(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> dict:
        """
        Convert Outlines generation parameters into model.generate kwargs
        """
        from transformers import GenerationConfig, LogitsProcessorList, set_seed

        max_new_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )
        if max_new_tokens is None:
            max_new_tokens = int(2**30)

        # global seed, not desirable
        if seed is not None:
            set_seed(seed)

        if logits_processor is not None:
            logits_processor_list = LogitsProcessorList([logits_processor])
        else:
            logits_processor_list = None

        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            stop_strings=stop_at,
            num_return_sequences=(num_samples or 1),
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            do_sample=(sampler == "multinomial"),
            num_beams=(num_samples if sampler == "beam_search" else 1),
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        return dict(
            logits_processor=logits_processor_list,
            generation_config=generation_config,
            tokenizer=self.tokenizer.tokenizer,
        )

    def _generate_output_seq(
        self, prompts, inputs, generation_config, **generation_kwargs
    ):
        input_ids = inputs["input_ids"]
        output_ids = self.model.generate(
            **inputs, generation_config=generation_config, **generation_kwargs
        )

        # encoder-decoder returns output_ids only, decoder-only returns full seq ids
        if self.model.config.is_encoder_decoder:
            generated_ids = output_ids
        else:
            generated_ids = output_ids[:, input_ids.shape[1] :]

        # if batch list inputs AND multiple samples per input, convert generated_id to 3D view
        num_samples = generation_config.num_return_sequences or 1

        if num_samples > 1 and isinstance(prompts, list):
            batch_size = input_ids.size(0)
            num_return_sequences = generation_config.num_return_sequences or 1
            generated_ids = generated_ids.view(batch_size, num_return_sequences, -1)

        return generated_ids

    def _decode_generation(self, generated_ids: "torch.Tensor"):
        if len(generated_ids.shape) == 1:
            return self.tokenizer.decode([generated_ids])[0]
        elif len(generated_ids.shape) == 2:
            return self.tokenizer.decode(generated_ids)
        elif len(generated_ids.shape) == 3:
            return [
                self.tokenizer.decode(generated_ids[i])
                for i in range(len(generated_ids))
            ]
        else:
            raise TypeError(
                f"Generated outputs aren't 1D, 2D or 3D, but instead are {generated_ids.shape}"
            )

forward(input_ids, attention_mask, past_key_values=None)

Compute a forward pass through the transformer model.

Parameters

input_ids
    The input token ids. Must be one or two dimensional.
attention_mask
    The attention mask. Must be one or two dimensional.
past_key_values
    A tuple of tuples containing the cached key and value tensors for each attention head.

Returns

The computed logits and the new cached key and value tensors.

Source code in outlines/models/transformers.py
def forward(
    self,
    input_ids: "torch.LongTensor",
    attention_mask: "torch.LongTensor",
    past_key_values: Optional[Tuple] = None,
) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
    """Compute a forward pass through the transformer model.

    Parameters
    ----------
    input_ids
        The input token ids.  Must be one or two dimensional.
    attention_mask
        The attention mask.  Must be one or two dimensional.
    past_key_values
        A tuple of tuples containing the cached key and value tensors for each
        attention head.

    Returns
    -------
    The computed logits and the new cached key and value tensors.

    """
    try:
        import torch
    except ImportError:
        raise ImportError(
            "The `torch` library needs to be installed to use `transformers` models."
        )
    assert 0 < input_ids.ndim < 3

    if past_key_values:
        input_ids = input_ids[..., -1].unsqueeze(-1)

    with torch.inference_mode():
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
            past_key_values=past_key_values,
        )

    return output.logits, output.past_key_values
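
A hedged sketch of a single forward pass, assuming torch and transformers are installed and using "gpt2" as an illustrative model; the transformers() helper used here is documented further down.

import outlines

model = outlines.models.transformers("gpt2")

# Encode a prompt into 2D tensors and run one forward pass.
input_ids, attention_mask = model.tokenizer.encode(["The capital of France is"])
logits, kv_cache = model.forward(input_ids, attention_mask)

# logits has shape (batch, sequence_length, vocab_size); the last position
# holds the next-token distribution, which is what __call__ returns.
next_token_logits = logits[..., -1, :]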

generate(prompts, generation_parameters, logits_processor, sampling_parameters)

Generate text using transformers.

Arguments

prompts
    A prompt or list of prompts.
generation_parameters
    An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's __call__ method.
logits_processor
    The logits processor to use when generating text.
sampling_parameters
    An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

Returns

The generated text

Source code in outlines/models/transformers.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using `transformers`.

    Arguments
    ---------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text
    """
    if isinstance(prompts, str):
        # convert to 2d
        input_ids, attention_mask = self.tokenizer.encode([prompts])
    else:
        input_ids, attention_mask = self.tokenizer.encode(prompts)

    inputs = {
        "input_ids": input_ids.to(self.model.device),
        "attention_mask": attention_mask.to(self.model.device),
    }
    if (
        "attention_mask"
        not in inspect.signature(self.model.forward).parameters.keys()
    ):
        del inputs["attention_mask"]

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        generated_ids = generated_ids.squeeze(0)

    return self._decode_generation(generated_ids)
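
In practice this method is usually reached through Outlines' generator API rather than called directly. A minimal sketch, assuming the outlines.generate.text entry point of this Outlines version and "gpt2" as an illustrative model:

import outlines

model = outlines.models.transformers("gpt2")
generator = outlines.generate.text(model)

# The generator adapter builds GenerationParameters and SamplingParameters
# and eventually calls Transformers.generate under the hood.
answer = generator("What is 2 + 2?", max_tokens=16)
print(answer)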

stream(prompts, generation_parameters, logits_processor, sampling_parameters)

Temporary stand-in that implements the stream() signature and equivalent behaviour, but does not yield anything until generation completes.

TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

Source code in outlines/models/transformers.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Iterator[Union[str, List[str]]]:
    """
    Temporary stream stand-in which implements stream() signature
    and equivalent behaviour but isn't yielded until generation completes.

    TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
    """
    if isinstance(prompts, str):
        # convert to 2d
        input_ids, attention_mask = self.tokenizer.encode([prompts])
    else:
        input_ids, attention_mask = self.tokenizer.encode(prompts)
    inputs = {
        "input_ids": input_ids.to(self.model.device),
        "attention_mask": attention_mask.to(self.model.device),
    }
    if (
        "attention_mask"
        not in inspect.signature(self.model.forward).parameters.keys()
    ):
        del inputs["attention_mask"]

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        generated_ids = generated_ids.squeeze(0)

    for i in range(generated_ids.size(-1)):
        output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
        yield self._decode_generation(output_group_ids)
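
The same generator adapter exposes a stream() method; with this backend it currently buffers the whole completion and only then yields token groups. A hedged sketch, under the same assumptions as the example above:

import outlines

model = outlines.models.transformers("gpt2")
generator = outlines.generate.text(model)

# Token groups are yielded one decoded chunk at a time, but only after the
# underlying model.generate call has finished (see the TODO above).
for chunk in generator.stream("Write a haiku about rivers", max_tokens=32):
    print(chunk, end="")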

get_llama_tokenizer_types()

Get all the Llama tokenizer types/classes that need work-arounds.

When they can't be imported, a dummy class is created.

Source code in outlines/models/transformers.py
def get_llama_tokenizer_types():
    """Get all the Llama tokenizer types/classes that need work-arounds.

    When they can't be imported, a dummy class is created.

    """
    try:
        from transformers.models.llama import LlamaTokenizer
    except ImportError:

        class LlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.llama import LlamaTokenizerFast
    except ImportError:

        class LlamaTokenizerFast:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizer
    except ImportError:

        class CodeLlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizerFast
    except ImportError:

        class CodeLlamaTokenizerFast:  # type: ignore
            pass

    return (
        LlamaTokenizer,
        LlamaTokenizerFast,
        CodeLlamaTokenizer,
        CodeLlamaTokenizerFast,
    )

transformers(model_name, device=None, model_kwargs={}, tokenizer_kwargs={}, model_class=None, tokenizer_class=None)

Instantiate a model from the transformers library and its tokenizer.

Parameters

model_name
    The name of the model as listed on Hugging Face's model page.
device
    The device(s) on which the model should be loaded. This overrides the device_map entry in model_kwargs when provided.
model_kwargs
    A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the model.
tokenizer_kwargs
    A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the tokenizer.

Returns

A Transformers model instance.

Source code in outlines/models/transformers.py
def transformers(
    model_name: str,
    device: Optional[str] = None,
    model_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_class=None,
    tokenizer_class=None,
):
    """Instantiate a model from the `transformers` library and its tokenizer.

    Parameters
    ----------
    model_name
        The name of the model as listed on Hugging Face's model page.
    device
        The device(s) on which the model should be loaded. This overrides
        the `device_map` entry in `model_kwargs` when provided.
    model_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the model.
    tokenizer_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the tokenizer.

    Returns
    -------
    A `Transformers` model instance.

    """
    if model_class is None or tokenizer_class is None:
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library needs to be installed in order to use `transformers` models."
            )
    if model_class is None:
        model_class = AutoModelForCausalLM
    if tokenizer_class is None:
        tokenizer_class = AutoTokenizer

    if device is not None:
        model_kwargs["device_map"] = device

    model = model_class.from_pretrained(model_name, **model_kwargs)

    tokenizer_kwargs.setdefault("padding_side", "left")
    tokenizer = tokenizer_class.from_pretrained(model_name, **tokenizer_kwargs)

    return Transformers(model, tokenizer)
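
A minimal loading sketch (illustrative model name and keyword arguments, not a fixed recipe), assuming a CUDA device is available:

from outlines.models.transformers import transformers

# `device` overrides any `device_map` entry passed through `model_kwargs`.
model = transformers(
    "gpt2",
    device="cuda",
    model_kwargs={"torch_dtype": "auto"},
    tokenizer_kwargs={"use_fast": True},
)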

Integration with OpenAI's API.

OpenAI

An object that represents the OpenAI API.

Source code in outlines/models/openai.py
class OpenAI:
    """An object that represents the OpenAI API."""

    def __init__(
        self,
        client,
        config,
        tokenizer=None,
        system_prompt: Optional[str] = None,
    ):
        """Create an `OpenAI` instance.

        This class supports the standard OpenAI API, the Azure OpenAI API as
        well as compatible APIs that rely on the OpenAI client.

        Parameters
        ----------
        client
            An instance of the API's async client.
        config
            An instance of `OpenAIConfig`. Can be useful to specify some
            parameters that cannot be set by calling this class' methods.
        tokenizer
            The tokenizer associated with the model the client connects to.

        """

        self.client = client
        self.tokenizer = tokenizer
        self.config = config

        # We count the total number of prompt and generated tokens as returned
        # by the OpenAI API, summed over all the requests performed with this
        # model instance.
        self.prompt_tokens = 0
        self.completion_tokens = 0

    def __call__(
        self,
        prompt: Union[str, List[str]],
        max_tokens: Optional[int] = None,
        stop_at: Optional[Union[List[str], str]] = None,
        *,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        samples: Optional[int] = None,
    ) -> np.ndarray:
        """Call the OpenAI API to generate text.

        Parameters
        ----------
        prompt
            A string or list of strings that will be used to prompt the model
        max_tokens
            The maximum number of tokens to generate
        stop_at
            A string or list of strings such that the generation stops
            when they are generated.
        system_prompt
            The content of the system message that precedes the user's prompt.
        temperature
            The value of the temperature used to sample tokens
        samples
            The number of completions to generate for each prompt
        stop_at
            Up to 4 words where the API will stop the completion.

        """
        if max_tokens is None:
            max_tokens = self.config.max_tokens
        if stop_at is None:
            stop_at = self.config.stop
        if temperature is None:
            temperature = self.config.temperature
        if samples is None:
            samples = self.config.n

        config = replace(self.config, max_tokens=max_tokens, temperature=temperature, n=samples, stop=stop_at)  # type: ignore

        response, prompt_tokens, completion_tokens = generate_chat(
            prompt, system_prompt, self.client, config
        )
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens

        return response

    def stream(self, *args, **kwargs):
        raise NotImplementedError(
            "Streaming is currently not supported for the OpenAI API"
        )

    def generate_choice(
        self,
        prompt: str,
        choices: List[str],
        max_tokens: Optional[int] = None,
        system_prompt: Optional[str] = None,
    ) -> str:
        """Call the OpenAI API to generate one of several choices.

        Parameters
        ----------
        prompt
            A string or list of strings that will be used to prompt the model
        choices
            The list of strings between which we ask the model to choose
        max_tokens
            The maximum number of tokens to generate
        system_prompt
            The content of the system message that precedes the user's prompt.

        """
        if self.tokenizer is None:
            raise ValueError(
                "You must initialize the `OpenAI` class with a tokenizer to use `outlines.generate.choice`"
            )

        config = replace(self.config, max_tokens=max_tokens)

        greedy = False
        decoded: List[str] = []
        encoded_choices_left: List[List[int]] = [
            self.tokenizer.encode(word) for word in choices
        ]

        while len(encoded_choices_left) > 0:
            max_tokens_left = max([len(tokens) for tokens in encoded_choices_left])
            transposed_choices_left: List[Set] = [
                {item for item in subset if item is not None}
                for subset in zip_longest(*encoded_choices_left)
            ]

            if not greedy:
                mask = build_optimistic_mask(transposed_choices_left)
            else:
                mask = {}
                for token in transposed_choices_left[0]:  # build greedy mask
                    mask[token] = 100

            if len(mask) == 0:
                break

            config = replace(config, logit_bias=mask, max_tokens=max_tokens_left)

            response, prompt_tokens, completion_tokens = generate_chat(
                prompt, system_prompt, self.client, config
            )
            self.prompt_tokens += prompt_tokens
            self.completion_tokens += completion_tokens

            encoded_response = self.tokenizer.encode(response)

            if encoded_response in encoded_choices_left:
                decoded.append(response)
                break
            else:
                (
                    encoded_response,
                    encoded_choices_left,
                ) = find_response_choices_intersection(
                    encoded_response, encoded_choices_left
                )

                if len(encoded_response) == 0:
                    greedy = True  # next iteration will be "greedy"
                    continue
                else:
                    decoded.append("".join(self.tokenizer.decode(encoded_response)))

                    if len(encoded_choices_left) == 1:  # only one choice left
                        choice_left = self.tokenizer.decode(encoded_choices_left[0])
                        decoded.append(choice_left)
                        break

                    greedy = False  # after each success, stay with (or switch to) "optimistic" approach

                prompt = prompt + "".join(decoded)

        choice = "".join(decoded)

        return choice

    def generate_json(self):
        """Call the OpenAI API to generate a JSON object."""
        raise NotImplementedError

    def __str__(self):
        return self.__class__.__name__ + " API"

    def __repr__(self):
        return str(self.config)
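
A hedged construction sketch, assuming the official openai Python client (v1+) is installed, an OPENAI_API_KEY is set in the environment, and "gpt-4o-mini" is used purely as an illustrative model name:

from openai import AsyncOpenAI

from outlines.models.openai import OpenAI, OpenAIConfig

client = AsyncOpenAI()
config = OpenAIConfig(model="gpt-4o-mini", temperature=0.5)
model = OpenAI(client, config)

# __call__ hits the Chat Completions API and returns the completion(s).
result = model("Name a French cheese.", max_tokens=20)
print(result)

# Token usage is accumulated across all calls made with this instance.
print(model.prompt_tokens, model.completion_tokens)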

__call__(prompt, max_tokens=None, stop_at=None, *, system_prompt=None, temperature=None, samples=None)

Call the OpenAI API to generate text.

Parameters

prompt
    A string or list of strings that will be used to prompt the model
max_tokens
    The maximum number of tokens to generate
stop_at
    A string or list of strings such that the generation stops when they are generated. Up to 4 words where the API will stop the completion.
system_prompt
    The content of the system message that precedes the user's prompt.
temperature
    The value of the temperature used to sample tokens
samples
    The number of completions to generate for each prompt

Source code in outlines/models/openai.py
def __call__(
    self,
    prompt: Union[str, List[str]],
    max_tokens: Optional[int] = None,
    stop_at: Optional[Union[List[str], str]] = None,
    *,
    system_prompt: Optional[str] = None,
    temperature: Optional[float] = None,
    samples: Optional[int] = None,
) -> np.ndarray:
    """Call the OpenAI API to generate text.

    Parameters
    ----------
    prompt
        A string or list of strings that will be used to prompt the model
    max_tokens
        The maximum number of tokens to generate
    stop_at
        A string or list of strings such that the generation stops
        when they are generated.
    system_prompt
        The content of the system message that precedes the user's prompt.
    temperature
        The value of the temperature used to sample tokens
    samples
        The number of completions to generate for each prompt
    stop_at
        Up to 4 words where the API will stop the completion.

    """
    if max_tokens is None:
        max_tokens = self.config.max_tokens
    if stop_at is None:
        stop_at = self.config.stop
    if temperature is None:
        temperature = self.config.temperature
    if samples is None:
        samples = self.config.n

    config = replace(self.config, max_tokens=max_tokens, temperature=temperature, n=samples, stop=stop_at)  # type: ignore

    response, prompt_tokens, completion_tokens = generate_chat(
        prompt, system_prompt, self.client, config
    )
    self.prompt_tokens += prompt_tokens
    self.completion_tokens += completion_tokens

    return response

__init__(client, config, tokenizer=None, system_prompt=None)

Create an OpenAI instance.

This class supports the standard OpenAI API, the Azure OpenAI API as well as compatible APIs that rely on the OpenAI client.

Parameters

client
    An instance of the API's async client.
config
    An instance of OpenAIConfig. Can be useful to specify some parameters that cannot be set by calling this class' methods.
tokenizer
    The tokenizer associated with the model the client connects to.

Source code in outlines/models/openai.py
def __init__(
    self,
    client,
    config,
    tokenizer=None,
    system_prompt: Optional[str] = None,
):
    """Create an `OpenAI` instance.

    This class supports the standard OpenAI API, the Azure OpenAI API as
    well as compatible APIs that rely on the OpenAI client.

    Parameters
    ----------
    client
        An instance of the API's async client.
    config
        An instance of `OpenAIConfig`. Can be useful to specify some
        parameters that cannot be set by calling this class' methods.
    tokenizer
        The tokenizer associated with the model the client connects to.

    """

    self.client = client
    self.tokenizer = tokenizer
    self.config = config

    # We count the total number of prompt and generated tokens as returned
    # by the OpenAI API, summed over all the requests performed with this
    # model instance.
    self.prompt_tokens = 0
    self.completion_tokens = 0

generate_choice(prompt, choices, max_tokens=None, system_prompt=None)

Call the OpenAI API to generate one of several choices.

Parameters

prompt
    A string or list of strings that will be used to prompt the model
choices
    The list of strings between which we ask the model to choose
max_tokens
    The maximum number of tokens to generate
system_prompt
    The content of the system message that precedes the user's prompt.

Source code in outlines/models/openai.py
def generate_choice(
    self,
    prompt: str,
    choices: List[str],
    max_tokens: Optional[int] = None,
    system_prompt: Optional[str] = None,
) -> str:
    """Call the OpenAI API to generate one of several choices.

    Parameters
    ----------
    prompt
        A string or list of strings that will be used to prompt the model
    choices
        The list of strings between which we ask the model to choose
    max_tokens
        The maximum number of tokens to generate
    system_prompt
        The content of the system message that precedes the user's prompt.

    """
    if self.tokenizer is None:
        raise ValueError(
            "You must initialize the `OpenAI` class with a tokenizer to use `outlines.generate.choice`"
        )

    config = replace(self.config, max_tokens=max_tokens)

    greedy = False
    decoded: List[str] = []
    encoded_choices_left: List[List[int]] = [
        self.tokenizer.encode(word) for word in choices
    ]

    while len(encoded_choices_left) > 0:
        max_tokens_left = max([len(tokens) for tokens in encoded_choices_left])
        transposed_choices_left: List[Set] = [
            {item for item in subset if item is not None}
            for subset in zip_longest(*encoded_choices_left)
        ]

        if not greedy:
            mask = build_optimistic_mask(transposed_choices_left)
        else:
            mask = {}
            for token in transposed_choices_left[0]:  # build greedy mask
                mask[token] = 100

        if len(mask) == 0:
            break

        config = replace(config, logit_bias=mask, max_tokens=max_tokens_left)

        response, prompt_tokens, completion_tokens = generate_chat(
            prompt, system_prompt, self.client, config
        )
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens

        encoded_response = self.tokenizer.encode(response)

        if encoded_response in encoded_choices_left:
            decoded.append(response)
            break
        else:
            (
                encoded_response,
                encoded_choices_left,
            ) = find_response_choices_intersection(
                encoded_response, encoded_choices_left
            )

            if len(encoded_response) == 0:
                greedy = True  # next iteration will be "greedy"
                continue
            else:
                decoded.append("".join(self.tokenizer.decode(encoded_response)))

                if len(encoded_choices_left) == 1:  # only one choice left
                    choice_left = self.tokenizer.decode(encoded_choices_left[0])
                    decoded.append(choice_left)
                    break

                greedy = False  # after each success, stay with (or switch to) "optimistic" approach

            prompt = prompt + "".join(decoded)

    choice = "".join(decoded)

    return choice

generate_json()

Call the OpenAI API to generate a JSON object.

Source code in outlines/models/openai.py
def generate_json(self):
    """Call the OpenAI API to generate a JSON object."""
    raise NotImplementedError

OpenAIConfig dataclass

Represents the parameters of the OpenAI API.

The information was last fetched on 2023/11/20. We document below the properties that are specific to the OpenAI API. Not all these properties are supported by Outlines.

Properties

model
    The name of the model. Available models can be found on OpenAI's website.
frequency_penalty
    Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text.
logit_bias
    Modifies the likelihood of specified tokens appearing in the completion. Number between -100 (forbid) and +100 (always allow).
n
    The number of completions to return for each prompt.
presence_penalty
    Similar to frequency penalty.
response_format
    Specifies the format the model must output. {"type": "json_object"} enables JSON mode.
seed
    Two completions with the same seed value should return the same completion. This is however not guaranteed.
stop
    Up to 4 words where the API will stop the completion.
temperature
    Number between 0 and 2. Higher values make the output more random, while lower values make it more deterministic.
top_p
    Number between 0 and 1. Parameter for nucleus sampling.
user
    A unique identifier for the end-user.

Source code in outlines/models/openai.py
@dataclass(frozen=True)
class OpenAIConfig:
    """Represents the parameters of the OpenAI API.

    The information was last fetched on 2023/11/20. We document below the
    properties that are specific to the OpenAI API. Not all these properties are
    supported by Outlines.

    Properties
    ----------
    model
        The name of the model. Available models can be found on OpenAI's website.
    frequency_penalty
        Number between -2.0 and 2.0. Positive values penalize new tokens based on
        their existing frequency in the text.
    logit_bias
        Modifies the likelihood of specified tokens to appear in the completion.
        Number between -100 (forbid) and +100 (only allows).
    n
        The number of completions to return for each prompt.
    presence_penalty
        Similar to frequency penalty.
    response_format
        Specifies the format the model must output. `{"type": "json_object"}`
        enables JSON mode.
    seed
        Two completions with the same `seed` value should return the same
        completion. This is however not guaranteed.
    stop
        Up to 4 words where the API will stop the completion.
    temperature
        Number between 0 and 2. Higher values make the output more random, while
        lower values make it more deterministic.
    top_p
        Number between 0 and 1. Parameter for nucleus sampling.
    user
        A unique identifier for the end-user.

    """

    model: str = ""
    frequency_penalty: float = 0
    logit_bias: Dict[int, int] = field(default_factory=dict)
    max_tokens: Optional[int] = None
    n: int = 1
    presence_penalty: float = 0
    response_format: Optional[Dict[str, str]] = None
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    temperature: float = 1.0
    top_p: int = 1
    user: str = field(default_factory=str)
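
Since the config is a frozen dataclass, per-call overrides are made with dataclasses.replace, as __call__ does above. A short sketch with an illustrative model name:

from dataclasses import replace

from outlines.models.openai import OpenAIConfig

config = OpenAIConfig(model="gpt-4o-mini", temperature=1.0)

# Derive a stricter, JSON-mode variant without mutating the original config.
json_config = replace(config, temperature=0.0, response_format={"type": "json_object"})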

build_optimistic_mask(transposed, max_mask_size=300)

We build the largest mask possible.

Tokens are added from left to right, so if the encoded choices are e.g. [[1,2], [3,4]], 1 and 3 will be added before 2 and 4.

Parameters

transposed
    A list of lists that contain the nth token of each choice.

Source code in outlines/models/openai.py
def build_optimistic_mask(
    transposed: List[Set[int]], max_mask_size: int = 300
) -> Dict[int, int]:
    """We build the largest mask possible.

    Tokens are added from left to right, so if the encoded choices are e.g.
    `[[1,2], [3,4]]`, `1` and `3` will be added before `2` and `4`.

    Parameters
    ----------
    transposed
        A list of lists that contain the nth token of each choice.

    """
    mask: Dict[int, int] = {}
    for tokens in transposed:
        for token in tokens:
            if len(mask) == max_mask_size:
                return mask
            mask[token] = 100

    return mask
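
A worked example with toy token ids (not real vocabulary entries):

from outlines.models.openai import build_optimistic_mask

# Two choices encoded as [1, 2] and [3, 4], transposed by position:
transposed = [{1, 3}, {2, 4}]
mask = build_optimistic_mask(transposed)
# mask maps each token id to a bias of 100, first-position tokens first:
# {1: 100, 3: 100, 2: 100, 4: 100} (ordering within a position may vary).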

error_handler(api_call_fn)

Handle OpenAI API errors and missing API key.

Source code in outlines/models/openai.py
def error_handler(api_call_fn: Callable) -> Callable:
    """Handle OpenAI API errors and missing API key."""

    def call(*args, **kwargs):
        import openai

        try:
            return api_call_fn(*args, **kwargs)
        except (
            openai.APITimeoutError,
            openai.InternalServerError,
            openai.RateLimitError,
        ) as e:
            raise OSError(f"Could not connect to the OpenAI API: {e}")
        except (
            openai.AuthenticationError,
            openai.BadRequestError,
            openai.ConflictError,
            openai.PermissionDeniedError,
            openai.NotFoundError,
            openai.UnprocessableEntityError,
        ) as e:
            raise e

    return call

find_longest_intersection(response, choice)

Find the longest intersection between the response and the choice.

Source code in outlines/models/openai.py
def find_longest_intersection(response: List[int], choice: List[int]) -> List[int]:
    """Find the longest intersection between the response and the choice."""
    for i, (token_r, token_c) in enumerate(zip_longest(response, choice)):
        if token_r != token_c:
            return response[:i]

    return response

find_response_choices_intersection(response, choices)

Find the longest intersection between the response and the different choices.

Say the response is of the form [1, 2, 3, 4, 5] and we have the choices [[1, 2], [1, 2, 3], [6, 7, 8]], then the function will return [1, 2, 3] as the intersection, and [[]] as the list of choices left.

Parameters

response
    The model's response
choices
    The remaining possible choices

Returns

A tuple that contains the longest intersection between the response and the different choices, and the choices which start with this intersection, with the intersection removed.

Source code in outlines/models/openai.py
def find_response_choices_intersection(
    response: List[int], choices: List[List[int]]
) -> Tuple[List[int], List[List[int]]]:
    """Find the longest intersection between the response and the different
    choices.

    Say the response is of the form `[1, 2, 3, 4, 5]` and we have the choices
    `[[1, 2], [1, 2, 3], [6, 7, 8]]` then the function will return `[1, 2, 3]` as the
    intersection, and `[[]]` as the list of choices left.

    Parameters
    ----------
    response
        The model's response
    choices
        The remaining possible choices

    Returns
    -------
    A tuple that contains the longest intersection between the response and the
    different choices, and the choices which start with this intersection, with the
    intersection removed.

    """
    max_len_prefix = 0
    choices_left = []
    longest_prefix = []
    for i, choice in enumerate(choices):
        # Find the longest intersection between the response and the choice.
        prefix = find_longest_intersection(response, choice)

        if len(prefix) > max_len_prefix:
            max_len_prefix = len(prefix)
            choices_left = [choice[len(prefix) :]]
            longest_prefix = prefix

        elif len(prefix) == max_len_prefix:
            choices_left.append(choice[len(prefix) :])

    return longest_prefix, choices_left
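
The docstring example above, run directly:

from outlines.models.openai import find_response_choices_intersection

response = [1, 2, 3, 4, 5]
choices = [[1, 2], [1, 2, 3], [6, 7, 8]]

prefix, choices_left = find_response_choices_intersection(response, choices)
# prefix == [1, 2, 3]; choices_left == [[]] because the matching choice
# is fully consumed by the intersection.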

generate_chat(prompt, system_prompt, client, config) async

Call OpenAI's Chat Completion API.

Parameters

prompt
    The prompt we use to start the generation. Passed to the model with the "user" role.
system_prompt
    The system prompt, passed to the model with the "system" role before the prompt.
client
    The API client
config
    An OpenAIConfig instance.

Returns

A tuple that contains the model's response(s) and usage statistics.

Source code in outlines/models/openai.py
@functools.partial(vectorize, signature="(),(),(),()->(s),(),()")
async def generate_chat(
    prompt: str,
    system_prompt: Union[str, None],
    client,
    config: OpenAIConfig,
) -> Tuple[np.ndarray, int, int]:
    """Call OpenAI's Chat Completion API.

    Parameters
    ----------
    prompt
        The prompt we use to start the generation. Passed to the model
        with the "user" role.
    system_prompt
        The system prompt, passed to the model with the "system" role
        before the prompt.
    client
        The API client
    config
        An `OpenAIConfig` instance.

    Returns
    -------
    A tuple that contains the model's response(s) and usage statistics.

    """

    @error_handler
    @cache()
    async def call_api(prompt, system_prompt, config):
        responses = await client.chat.completions.create(
            messages=system_message + user_message,
            **asdict(config),  # type: ignore
        )
        return responses.model_dump()

    system_message = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    user_message = [{"role": "user", "content": prompt}]

    responses = await call_api(prompt, system_prompt, config)

    results = np.array(
        [responses["choices"][i]["message"]["content"] for i in range(config.n)]
    )
    usage = responses["usage"]

    return results, usage["prompt_tokens"], usage["completion_tokens"]