Evaluation Pipelines

Hemm evaluation pipelines for models built on `diffusers.DiffusionPipeline`.

BaseDiffusionModel

Bases: Model

Base weave.Model wrapping diffusers.DiffusionPipeline.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `diffusion_model_name_or_path` | `str` | The name or path of the diffusion model. | *required* |
| `enable_cpu_offfload` | `bool` | Enable CPU offload for the diffusion model. | `False` |
| `image_height` | `int` | The height of the generated image. | `512` |
| `image_width` | `int` | The width of the generated image. | `512` |
Source code in hemm/eval_pipelines/model.py
class BaseDiffusionModel(weave.Model):
    """Base `weave.Model` wrapping `diffusers.DiffusionPipeline`.

    Args:
        diffusion_model_name_or_path (str): The name or path of the diffusion model.
        enable_cpu_offfload (bool): Enable CPU offload for the diffusion model.
        image_height (int): The height of the generated image.
        image_width (int): The width of the generated image.
    """

    diffusion_model_name_or_path: str
    enable_cpu_offfload: bool = False
    image_height: int = 512
    image_width: int = 512
    _torch_dtype: torch.dtype = torch.float16
    _pipeline: DiffusionPipeline = None

    def initialize(self):
        self._pipeline = DiffusionPipeline.from_pretrained(
            self.diffusion_model_name_or_path, torch_dtype=self._torch_dtype
        )
        if self.enable_cpu_offfload:
            self._pipeline.enable_model_cpu_offload()
        else:
            self._pipeline = self._pipeline.to("cuda")
        self._pipeline.set_progress_bar_config(leave=False, desc="Generating Image")

    @weave.op()
    def predict(self, prompt: str, seed: int) -> Dict[str, str]:
        image = self._pipeline(
            prompt,
            num_images_per_prompt=1,
            height=self.image_height,
            width=self.image_width,
            generator=torch.Generator(device="cuda").manual_seed(seed),
        ).images[0]
        return {"image": base64_encode_image(image)}

EvaluationPipeline

Bases: ABC

Evaluation pipeline to evaluate a multi-modal generative model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model` | `BaseDiffusionModel` | The model to evaluate. | *required* |
| `seed` | `int` | Seed value for the random number generator. | `42` |
Source code in hemm/eval_pipelines/eval_pipeline.py
class EvaluationPipeline(ABC):
    """Evaluation pipeline to evaluate the a multi-modal generative model.

    Args:
        model (BaseDiffusionModel): The model to evaluate.
        seed (int): Seed value for the random number generator.
    """

    def __init__(self, model: BaseDiffusionModel, seed: int = 42) -> None:
        super().__init__()
        self.model = model
        self.model.initialize()

        self.image_size = (self.model.image_height, self.model.image_width)
        self.seed = seed

        self.inference_counter = 1
        self.table_columns = ["model", "prompt", "generated_image"]
        self.table_rows: List = []
        self.wandb_table: wandb.Table = None
        self.metric_functions: List[Callable] = []

        self.evaluation_configs = {
            "pretrained_model_name_or_path": self.model.diffusion_model_name_or_path,
            "torch_dtype": str(self.model._torch_dtype),
            "enable_cpu_offfload": self.model.enable_cpu_offfload,
            "image_size": {
                "height": self.image_size[0],
                "width": self.image_size[1],
            },
            "seed": seed,
            "diffusion_pipeline": dict(self.model._pipeline.config),
        }

    def add_metric(self, metric_fn: Callable):
        """Add a metric function to the evaluation pipeline.

        Args:
            metric_fn (Callable): Metric function to evaluate the generated images.
        """
        self.table_columns.append(metric_fn.__class__.__name__)
        self.evaluation_configs.update(metric_fn.config)
        self.metric_functions.append(metric_fn)

    @weave.op()
    async def infer(self, prompt: str) -> Dict[str, str]:
        """Inference function to generate images for the given prompt.

        Args:
            prompt (str): Prompt to generate the image.

        Returns:
            Dict[str, str]: Dictionary containing base64 encoded image to be logged as
                a Weave object.
        """
        if self.inference_counter == 1:
            self.wandb_table = wandb.Table(columns=self.table_columns)
        self.inference_counter += 1
        output = self.model.predict(prompt, seed=self.seed)
        self.table_rows.append(
            [
                self.model.diffusion_model_name_or_path,
                prompt,
                wandb.Image(
                    Image.open(
                        BytesIO(base64.b64decode(output["image"].split(";base64,")[-1]))
                    )
                ),
            ]
        )
        return output

    def log_summary(self):
        """Log the evaluation summary to the Weights & Biases dashboard."""
        config = wandb.config
        config.update(self.evaluation_configs)
        for row_idx, row in enumerate(self.table_rows):
            current_row = row
            for metric_fn in self.metric_functions:
                current_row.append(metric_fn.scores[row_idx])
            self.wandb_table.add_data(*current_row)
        wandb.log(
            {f"Evalution/{self.model.diffusion_model_name_or_path}": self.wandb_table}
        )

    def __call__(self, dataset: Union[List[Dict], str]) -> None:
        """Evaluate the Stable Diffusion model on the given dataset.

        Args:
            dataset (Union[List[Dict], str]): Dataset to evaluate the model on. If a string is
                passed, it is assumed to be a Weave dataset reference.
        """
        dataset = weave.ref(dataset).get() if isinstance(dataset, str) else dataset
        evaluation = Evaluation(
            dataset=dataset,
            scorers=[metric_fn.__call__ for metric_fn in self.metric_functions],
        )
        with weave.attributes(self.evaluation_configs):
            asyncio.run(evaluation.evaluate(self.infer))
        self.log_summary()
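
Putting the pieces together, a hedged end-to-end sketch. The model name and project names are placeholders, and `log_summary` requires an active W&B run:

```python
import wandb
import weave

weave.init(project_name="hemm-eval")  # hypothetical project name
wandb.init(project="hemm-eval")       # log_summary() writes to this run

model = BaseDiffusionModel(
    diffusion_model_name_or_path="stabilityai/stable-diffusion-2-1"
)
pipeline = EvaluationPipeline(model=model, seed=42)  # initializes the model internally
pipeline.add_metric(metric)  # a metric object; see the add_metric section below for a sketch
pipeline([{"prompt": "a corgi wearing a top hat"}])  # runs infer + scorers, then log_summary
```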

__call__(dataset)

Evaluate the Stable Diffusion model on the given dataset.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `dataset` | `Union[List[Dict], str]` | Dataset to evaluate the model on. If a string is passed, it is assumed to be a Weave dataset reference. | *required* |
Source code in hemm/eval_pipelines/eval_pipeline.py
def __call__(self, dataset: Union[List[Dict], str]) -> None:
    """Evaluate the Stable Diffusion model on the given dataset.

    Args:
        dataset (Union[List[Dict], str]): Dataset to evaluate the model on. If a string is
            passed, it is assumed to be a Weave dataset reference.
    """
    dataset = weave.ref(dataset).get() if isinstance(dataset, str) else dataset
    evaluation = Evaluation(
        dataset=dataset,
        scorers=[metric_fn.__call__ for metric_fn in self.metric_functions],
    )
    with weave.attributes(self.evaluation_configs):
        asyncio.run(evaluation.evaluate(self.infer))
    self.log_summary()
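
Both dataset forms, sketched. The keys of each row must match `infer`'s parameters, and the Weave reference URI below is a made-up placeholder:

```python
# In-memory dataset: a list of dicts whose keys match infer's parameters
pipeline([{"prompt": "a red cube balanced on a blue sphere"}])

# Weave dataset reference: resolved internally via weave.ref(dataset).get()
pipeline("weave:///my-entity/hemm-eval/object/prompts:latest")  # hypothetical ref
```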

add_metric(metric_fn)

Add a metric function to the evaluation pipeline.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `metric_fn` | `Callable` | Metric function to evaluate the generated images. | *required* |
Source code in hemm/eval_pipelines/eval_pipeline.py
def add_metric(self, metric_fn: Callable):
    """Add a metric function to the evaluation pipeline.

    Args:
        metric_fn (Callable): Metric function to evaluate the generated images.
    """
    self.table_columns.append(metric_fn.__class__.__name__)
    self.evaluation_configs.update(metric_fn.config)
    self.metric_functions.append(metric_fn)
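
The pipeline assumes three things about a metric: a `config` dict merged into the evaluation configs, a `scores` list that `log_summary` indexes per row, and a `__call__` usable as a Weave scorer. A hypothetical metric honoring that contract (the scorer keyword `model_output` reflects the Weave version this code targets; newer versions may name it differently):

```python
import base64
from io import BytesIO

import weave
from PIL import Image


class ImageSizeMetric:
    """Hypothetical metric: checks that the generated image has the expected size."""

    def __init__(self, expected_size=(512, 512)):
        self.expected_size = expected_size
        self.scores = []  # log_summary reads scores[row_idx] into the W&B table
        self.config = {"image_size_metric/expected_size": expected_size}

    @weave.op()
    def __call__(self, prompt: str, model_output: dict) -> dict:
        # Decode the base64 data URI returned by BaseDiffusionModel.predict
        image = Image.open(
            BytesIO(base64.b64decode(model_output["image"].split(";base64,")[-1]))
        )
        score = float(image.size == self.expected_size)
        self.scores.append(score)
        return {"size_matches": score}


pipeline.add_metric(ImageSizeMetric())
```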

infer(prompt) async

Inference function to generate images for the given prompt.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompt` | `str` | Prompt to generate the image. | *required* |

Returns:

| Type | Description |
|------|-------------|
| `Dict[str, str]` | Dictionary containing the base64-encoded image, to be logged as a Weave object. |
Source code in hemm/eval_pipelines/eval_pipeline.py
@weave.op()
async def infer(self, prompt: str) -> Dict[str, str]:
    """Inference function to generate images for the given prompt.

    Args:
        prompt (str): Prompt to generate the image.

    Returns:
        Dict[str, str]: Dictionary containing base64 encoded image to be logged as
            a Weave object.
    """
    if self.inference_counter == 1:
        self.wandb_table = wandb.Table(columns=self.table_columns)
    self.inference_counter += 1
    output = self.model.predict(prompt, seed=self.seed)
    self.table_rows.append(
        [
            self.model.diffusion_model_name_or_path,
            prompt,
            wandb.Image(
                Image.open(
                    BytesIO(base64.b64decode(output["image"].split(";base64,")[-1]))
                )
            ),
        ]
    )
    return output
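
Since `infer` is an async Weave op, calling it directly requires an event loop. A sketch of decoding its output back into a PIL image (assuming a Weave client is initialized; note that direct calls also advance the pipeline's internal table state):

```python
import asyncio
import base64
from io import BytesIO

from PIL import Image

result = asyncio.run(pipeline.infer("a watercolor painting of a lighthouse"))
image = Image.open(
    BytesIO(base64.b64decode(result["image"].split(";base64,")[-1]))
)
image.save("sample.png")
```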

log_summary()

Log the evaluation summary to the Weights & Biases dashboard.

Source code in hemm/eval_pipelines/eval_pipeline.py
def log_summary(self):
    """Log the evaluation summary to the Weights & Biases dashboard."""
    config = wandb.config
    config.update(self.evaluation_configs)
    for row_idx, row in enumerate(self.table_rows):
        current_row = row
        for metric_fn in self.metric_functions:
            current_row.append(metric_fn.scores[row_idx])
        self.wandb_table.add_data(*current_row)
    wandb.log(
        {f"Evalution/{self.model.diffusion_model_name_or_path}": self.wandb_table}
    )
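
`log_summary` reads `wandb.config` and calls `wandb.log`, so it needs an active run; `__call__` invokes it automatically at the end of an evaluation. A minimal run lifecycle, with hypothetical project and job names:

```python
import wandb

run = wandb.init(project="hemm-eval", job_type="evaluation")
pipeline(dataset)  # the evaluation ends with pipeline.log_summary()
run.finish()       # flush the logged table to the W&B dashboard
```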