Datasets


Module for generating tf.data.Dataset objects

BaseImageDatagen

A base for image data generators

Attributes:

shape : List[int]
    shape of the images to generate
name : str
    name of the dataset to use in references/logs

Methods

read_image : read a file from the given filepath and decode it as a JPEG
parse_sample : given a path, read the image and preprocess it for further usage
create_ds (abstract) : create a dataset with the given arguments

Source code in conftrainer/datasets/datagen.py
class BaseImageDatagen:
    """
    A base for image data generators

    Attributes
    ----------
    shape : List[int]
        shape of the images to generate
    name : str
        name of the dataset to use in references/logs

    Methods
    --------
    read_image : read a file from the given filepath and decode it as a JPEG
    parse_sample : given a path, read the image and preprocess it for further usage
    create_ds (abstract) : create a dataset with the given arguments
    """

    def __init__(self, shape: List[int], name: str = 'current', **kwargs):
        self.shape = shape
        self.name = name

    @staticmethod
    def read_image(path: str) -> tf.Tensor:
        """
        Read and decode an image from disk

        Parameters
        ----------
        path : str
            path to the image

        Returns
        -------
        out : tf.Tensor
            decoded image
        """

        im_bytes = tf.io.read_file(path)
        img = tf.io.decode_jpeg(im_bytes, channels=3)

        return img

    def parse_sample(self, path: str) -> tf.Tensor:
        """
        Read and resize an image

        Parameters
        ----------
        path : str
            path to the image

        Returns
        -------
        out : tf.Tensor
            resized image
        """
        img = self.read_image(path)
        img = tf.image.resize(img, self.shape[:2])

        return img

    @abstractmethod
    def create_ds(self, batch_size: int = 32, shape: Optional[List[int]] = None,
                  training: bool = False) -> tf.data.Dataset:
        """Create a dataset with given arguments"""

read_image(path) staticmethod

Read and decode an image from disk

Parameters:

path : str
    path to the image (required)

Returns:

out : tf.Tensor
    decoded image

Source code in conftrainer/datasets/datagen.py
@staticmethod
def read_image(path: str) -> tf.Tensor:
    """
    Read and decode an image from disk

    Parameters
    ----------
    path : str
        path to the image

    Returns
    -------
    out : tf.Tensor
        decoded image
    """

    im_bytes = tf.io.read_file(path)
    img = tf.io.decode_jpeg(im_bytes, channels=3)

    return img
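
As a quick eager sanity check (the path below is hypothetical):

img = BaseImageDatagen.read_image('samples/cat.jpg')
print(img.shape, img.dtype)  # e.g. (480, 640, 3) uint8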

parse_sample(path)

Read and resize an image

Parameters:

path : str
    path to the image (required)

Returns:

out : tf.Tensor
    resized image

Source code in conftrainer/datasets/datagen.py
def parse_sample(self, path: str) -> tf.Tensor:
    """
    Read and resize an image

    Parameters
    ----------
    path : str
        path to the image

    Returns
    -------
    out : tf.Tensor
        resized image
    """
    img = self.read_image(path)
    img = tf.image.resize(img, self.shape[:2])

    return img

create_ds(batch_size=32, shape=None, training=False) abstractmethod

Create a dataset with given arguments

Source code in conftrainer/datasets/datagen.py
@abstractmethod
def create_ds(self, batch_size: int = 32, shape: Optional[List[int]] = None,
              training: bool = False) -> tf.data.Dataset:
    """Create a dataset with given arguments"""

ImageDatagen

Bases: BaseImageDatagen

Generator class for creating image datasets via the tf.data.Dataset API.

Parameters:

filepaths : List[str]
    paths of the files to read samples from (required)
labels : Optional[np.ndarray]
    labels (default: None)
classes : Optional[List[str]]
    names of the classes (default: None)
name : str
    name of the object to use in references (default: 'current')
shape : List[int]
    shape of the images; may be overridden in the create_ds method (default: None)
Source code in conftrainer/datasets/datagen.py
class ImageDatagen(BaseImageDatagen):
    """
    Generator class for creating image datasets via the tf.data.Dataset API.

    Parameters
    ----------
    filepaths : List[str]
        paths of the files to read samples from
    labels : Optional[np.ndarray]
        labels
    classes : Optional[List[str]]
        names of the classes
    name : str
        name of the object to use in references
    shape : List[int]
        shape of the images; may be overridden in the create_ds method

    """

    def __init__(self, filepaths: List[str], labels: Optional[np.ndarray] = None,
                 classes: Optional[List[str]] = None, name: str = "current",
                 shape: Optional[List[int]] = None) -> None:
        super().__init__(shape=shape, name=name)
        self.filepaths = filepaths
        self.labels = labels
        self.classes = classes
        self.dataset = None
        if shape is not None:
            self.dataset = self.create_ds()

        self.probs_to_labels = lambda x: None
        if labels is not None:
            # multilabel if any row has more than one positive label
            is_multilabel = (labels.sum(axis=1) > 1.1).sum()
            self.probs_to_labels = np.round if is_multilabel else partial(np.argmax, axis=1)

    def create_ds(self, batch_size: int = 32, shape: Optional[List[int]] = None,
                  training: bool = False) -> tf.data.Dataset:
        """
        Create a dataset via the tf.data.Dataset API and assign it to the .dataset attribute
        of the datagen

        Parameters
        ----------
        batch_size : int = 32
            size of each batch
        training : bool
            whether the dataset will be used for training. If so, the dataset will be shuffled
            on each call
        shape : List[int], optional
            shape of the images to output. If not provided, defaults will be used instead

        Returns
        -------
        out : tf.data.Dataset
            batched and prefetched dataset ready to pass to a network
        """
        if shape is not None:
            self.shape = shape

        get_logger().info(f"{len(self.filepaths)} images in {self.name} dataset")
        if self.labels is not None:
            dataset = self.create_labeled_ds(batch_size=batch_size, training=training)

        else:
            dataset = self.create_unlabeled_ds(batch_size=batch_size, training=training)

        self.dataset = dataset
        return dataset

    def create_labeled_ds(self, training, batch_size):
        """Create a labeled tf.data.Dataset"""
        dataset = tf.data.Dataset.from_tensor_slices((self.filepaths, self.labels))

        if training:
            get_logger().info("Shuffling the dataset")
            dataset = dataset.shuffle(buffer_size=len(self.filepaths),
                                      reshuffle_each_iteration=True)

        dataset = dataset.map(
            lambda x, y: (self.parse_sample(x), y),
            num_parallel_calls=tf.data.AUTOTUNE).batch(
            batch_size,
            drop_remainder=False, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

        return dataset

    def create_unlabeled_ds(self, training, batch_size):
        """Create an unlabeled tf.data.Dataset"""
        dataset = tf.data.Dataset.from_tensor_slices(self.filepaths)
        if training:
            dataset = dataset.shuffle(len(self.filepaths)).map(self.parse_sample,
                                                               num_parallel_calls=tf.data.AUTOTUNE,
                                                               deterministic=False)
        else:  # keep the order of elements
            dataset = dataset.map(self.parse_sample,
                                  num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.batch(batch_size, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
        return dataset
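
A hedged end-to-end usage sketch; the paths, labels and class names below are made up for illustration:

import numpy as np
from conftrainer.datasets.datagen import ImageDatagen

paths = ['data/img_0.jpg', 'data/img_1.jpg']
labels = np.array([[1, 0], [0, 1]], dtype=np.float32)  # one-hot, 2 classes

datagen = ImageDatagen(filepaths=paths, labels=labels,
                       classes=['cat', 'dog'], name='train', shape=[224, 224, 3])
train_ds = datagen.create_ds(batch_size=2, training=True)
for images, y in train_ds.take(1):
    print(images.shape, y.shape)  # (2, 224, 224, 3) (2, 2)

Note that probs_to_labels is chosen in the constructor: np.round when any label row sums above 1 (multilabel), a row-wise np.argmax otherwise.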

create_ds(batch_size=32, shape=None, training=False)

Create a dataset via the tf.data.Dataset API and assign it to the datagen's .dataset attribute

Parameters:

batch_size : int
    size of each batch (default: 32)
training : bool
    whether the dataset will be used for training; if so, the dataset is shuffled on each call (default: False)
shape : List[int], optional
    shape of the images to output; if not provided, the defaults are used instead (default: None)

Returns:

out : tf.data.Dataset
    batched and prefetched dataset, ready to pass to a network

Source code in conftrainer/datasets/datagen.py
def create_ds(self, batch_size: int = 32, shape: Optional[List[int]] = None,
              training: bool = False) -> tf.data.Dataset:
    """
    Create a dataset via the tf.data.Dataset API and assign it to the .dataset attribute
    of the datagen

    Parameters
    ----------
    batch_size : int = 32
        size of each batch
    training : bool
        whether the dataset will be used for training. If so, the dataset will be shuffled
        on each call
    shape : List[int], optional
        shape of the images to output. If not provided, defaults will be used instead

    Returns
    -------
    out : tf.data.Dataset
        batched and prefetched dataset ready to pass to a network
    """
    if shape is not None:
        self.shape = shape

    get_logger().info(f"{len(self.filepaths)} images in {self.name} dataset")
    if self.labels is not None:
        dataset = self.create_labeled_ds(batch_size=batch_size, training=training)

    else:
        dataset = self.create_unlabeled_ds(batch_size=batch_size, training=training)

    self.dataset = dataset
    return dataset

create_labeled_ds(training, batch_size)

Create a labeled tf.data.Dataset

Source code in conftrainer/datasets/datagen.py
def create_labeled_ds(self, training, batch_size):
    """Create a labeled tf.data.Dataset"""
    dataset = tf.data.Dataset.from_tensor_slices((self.filepaths, self.labels))

    if training:
        get_logger().info("Shuffling the dataset")
        dataset = dataset.shuffle(buffer_size=len(self.filepaths),
                                  reshuffle_each_iteration=True)

    dataset = dataset.map(
        lambda x, y: (self.parse_sample(x), y),
        num_parallel_calls=tf.data.AUTOTUNE).batch(
        batch_size,
        drop_remainder=False, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

    return dataset

create_unlabeled_ds(training, batch_size)

Create an unlabeled tf.data.Dataset

Source code in conftrainer/datasets/datagen.py
def create_unlabeled_ds(self, training, batch_size):
    """Create an unlabeled tf.data.Dataset"""
    dataset = tf.data.Dataset.from_tensor_slices(self.filepaths)
    if training:
        dataset = dataset.shuffle(len(self.filepaths)).map(self.parse_sample,
                                                           num_parallel_calls=tf.data.AUTOTUNE,
                                                           deterministic=False)
    else:  # keep the order of elements
        dataset = dataset.map(self.parse_sample,
                              num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
    return dataset

MultiOutputDatagen

Bases: BaseImageDatagen

A data generator with multiple sets of labels for each image, for multibranch training

Source code in conftrainer/datasets/datagen.py
class MultiOutputDatagen(BaseImageDatagen):
    """A dataset with multiple set of labels for each image for multibranch training"""

    def __init__(self, config: MultiOutputDatagenConfig, shape: List[int]):
        super().__init__(shape=shape, name=config.name)
        self.filepaths = config.paths
        self.dataset = None
        self.labels, self.classes, self.task_names, self.task_types = \
            self.unpack_per_task_info(per_task_data=config.per_task_data)
        # DO NOT MOVE this part, strange behavior inside create_ds for unknown reason
        labels_tuple = tuple(tf.data.Dataset.from_tensor_slices(labels, name=name)
                             for labels, name in zip(self.labels, self.task_names))

        self.labels_tuple = tf.data.Dataset.zip(labels_tuple)

    @staticmethod
    def unpack_per_task_info(per_task_data: List[SingleTaskDataConfig]) -> Tuple:
        """Given list of info for each task, create lists of per task labels, class names and task names"""
        labels, classes, task_names, task_types = [], [], [], []
        for task in per_task_data:
            labels.append(task.labels)
            classes.append(task.classes)
            task_names.append(task.name)
            task_types.append(task.task_type)
        return labels, classes, task_names, task_types

    @property
    def proba_postprocessing_functions(self) -> List[Callable]:
        """
        Get the postprocessing function for each task's labels. If the task is multilabel, its postprocessing fn
        will be np.round, otherwise np.argmax

        Returns
        -------
        functions : List[Callable]
            functions to use when postprocessing labels
        """
        functions = []
        for task_labels, task_type in zip(self.labels, self.task_types):
            func = np.round if task_type == 'multilabel' else partial(np.argmax, axis=1)
            functions.append(func)
        return functions

    def probs_to_labels(self, probas: List[ArrayLike]) -> List[ArrayLike]:
        """Postprocess a list of per task probabilities to get labels"""
        processed_labels = []
        for func, task_labels in zip(self.proba_postprocessing_functions, probas):
            processed_labels.append(func(task_labels))
        return processed_labels

    def create_ds(self, batch_size: int = 32, shape: Optional[Iterable[int]] = None,
                  training: bool = False) -> tf.data.Dataset:
        """Create the tf.data.Dataset object to pass to network"""
        if shape is not None:
            self.shape = shape
        if self.shape is None:
            raise ValueError("Please provide a valid shape to create a dataset")

        filepaths = tf.data.Dataset.from_tensor_slices(self.filepaths, name='inputs')
        dataset = tf.data.Dataset.zip((filepaths, self.labels_tuple))

        if training:
            dataset = dataset.shuffle(buffer_size=len(self.filepaths),
                                      reshuffle_each_iteration=True)
            print(f'Shuffling {self.name} dataset')

        dataset = dataset.map(
            lambda x, y: (self.parse_sample(x), y), num_parallel_calls=tf.data.AUTOTUNE).batch(
            batch_size, drop_remainder=False, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
        self.dataset = dataset
        return dataset

proba_postprocessing_functions: List[Callable] property

Get the postprocessing function for each task's labels. If the task is multilabel, its postprocessing fn will be np.round, otherwise np.argmax

Returns:

functions : List[Callable]
    functions to use when postprocessing labels
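
A small self-contained sketch of the two behaviors on made-up probabilities:

import numpy as np
from functools import partial

multiclass_probs = np.array([[0.1, 0.7, 0.2],
                             [0.6, 0.3, 0.1]])
multilabel_probs = np.array([[0.9, 0.2, 0.8],
                             [0.1, 0.6, 0.4]])

print(partial(np.argmax, axis=1)(multiclass_probs))  # [1 0]
print(np.round(multilabel_probs))                    # [[1. 0. 1.] [0. 1. 0.]]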

unpack_per_task_info(per_task_data) staticmethod

Given the per-task info list, create lists of per-task labels, class names, task names and task types

Source code in conftrainer/datasets/datagen.py
@staticmethod
def unpack_per_task_info(per_task_data: List[SingleTaskDataConfig]) -> Tuple:
    """Given list of info for each task, create lists of per task labels, class names and task names"""
    labels, classes, task_names, task_types = [], [], [], []
    for task in per_task_data:
        labels.append(task.labels)
        classes.append(task.classes)
        task_names.append(task.name)
        task_types.append(task.task_type)
    return labels, classes, task_names, task_types

probs_to_labels(probas)

Postprocess a list of per task probabilities to get labels

Source code in conftrainer/datasets/datagen.py
def probs_to_labels(self, probas: List[ArrayLike]) -> List[ArrayLike]:
    """Postprocess a list of per task probabilities to get labels"""
    processed_labels = []
    for func, task_labels in zip(self.proba_postprocessing_functions, probas):
        processed_labels.append(func(task_labels))
    return processed_labels

create_ds(batch_size=32, shape=None, training=False)

Create the tf.data.Dataset object to pass to a network

Source code in conftrainer/datasets/datagen.py
def create_ds(self, batch_size: int = 32, shape: Optional[Iterable[int]] = None,
              training: bool = False) -> tf.data.Dataset:
    """Create the tf.data.Dataset object to pass to network"""
    if shape is not None:
        self.shape = shape
    if self.shape is None:
        raise ValueError("Please provide a valid shape to create a dataset")

    filepaths = tf.data.Dataset.from_tensor_slices(self.filepaths, name='inputs')
    dataset = tf.data.Dataset.zip((filepaths, self.labels_tuple))

    if training:
        dataset = dataset.shuffle(buffer_size=len(self.filepaths),
                                  reshuffle_each_iteration=True)
        print(f'Shuffling {self.name} dataset')

    dataset = dataset.map(
        lambda x, y: (self.parse_sample(x), y), num_parallel_calls=tf.data.AUTOTUNE).batch(
        batch_size, drop_remainder=False, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
    self.dataset = dataset
    return dataset
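
The config classes are project-specific; as a hedged sketch, any objects exposing the attributes the class actually reads (paths, per_task_data and name on the config; labels, classes, name and task_type per task) will do, e.g. via SimpleNamespace. All values below are made up:

import numpy as np
from types import SimpleNamespace
from conftrainer.datasets.datagen import MultiOutputDatagen

color = SimpleNamespace(name='color', task_type='multiclass',
                        classes=['red', 'blue'],
                        labels=np.array([[1, 0], [0, 1]], dtype=np.float32))
pattern = SimpleNamespace(name='pattern', task_type='multilabel',
                          classes=['dots', 'stripes'],
                          labels=np.array([[1, 1], [0, 1]], dtype=np.float32))
config = SimpleNamespace(name='train',
                         paths=['data/img_0.jpg', 'data/img_1.jpg'],
                         per_task_data=[color, pattern])

gen = MultiOutputDatagen(config=config, shape=[224, 224, 3])
ds = gen.create_ds(batch_size=2, training=True)  # yields (image, (color_y, pattern_y)) batches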

Utilities for loading data from given csv files

load_all_datagens(csvs, data_dir, name_col, shape, trainable_classes, clean_dataset)

Create Datagen objects for training, validation and testing

Parameters:

csvs : CSVsConfig
    path(s) to read the data from (required)
data_dir : str
    path to the directory where the images are stored (required)
name_col : str
    name of the column with filenames (required)
trainable_classes : List[str]
    names of label columns (required)
shape : List[int]
    shape of the images (required)
clean_dataset : bool
    whether to clean the dataset before using it (required)

Returns:

out : Dict[str, ImageDatagen]
    names as keys and Datagens as values

Source code in conftrainer/datasets/loader.py
def load_all_datagens(csvs: CSVsConfig, data_dir: str, name_col: str, shape: List[int],
                      trainable_classes: List[str], clean_dataset: bool) -> Dict[str, ImageDatagen]:
    """
    Create Datagen objects for training, validation and testing

    Parameters
    ----------
    csvs : CSVsConfig
        path(s) to read the data from
    data_dir : str
        path to directory where the images are stored
    name_col : str
        name of the column with filenames
    trainable_classes : List[str]
        names of label columns
    shape : List[int]
        shape of the images
    clean_dataset : bool
        whether to clean the dataset before using it

    Returns
    -------
    out : Dict[str, ImageDatagen]
        names as keys and Datagens as values
    """

    datagens = {}
    csv_dict = csvs.dict()
    for name, csv_path in csv_dict.items():
        filepaths, labels = read_preprocess_dataframe(csv_path=csv_path,
                                                      name_col=name_col,
                                                      classes=trainable_classes,
                                                      data_dir=data_dir,
                                                      clean_dataset=clean_dataset)
        datagens[name] = ImageDatagen(filepaths=filepaths, labels=labels,
                                      classes=trainable_classes, name=name, shape=shape)

    return datagens
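
A hedged usage sketch. CSVsConfig is project-specific; assume here only that it maps split names to csv paths via .dict(), as used above (its field names and import path are guesses):

from conftrainer.datasets.loader import load_all_datagens, create_datasets

csvs = CSVsConfig(train='data/train.csv', val='data/val.csv')  # hypothetical fields
datagens = load_all_datagens(csvs=csvs, data_dir='data/images', name_col='filename',
                             shape=[224, 224, 3], trainable_classes=['cat', 'dog'],
                             clean_dataset=False)

# create_datasets (documented below) rebuilds each .dataset; only the 'train' datagen is shuffled
create_datasets(list(datagens.values()), batch_size=64)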

create_datasets(datagens, **kwargs)

Update the .dataset attribute of each given ImageDatagen by calling its .create_ds method

Parameters:

datagens : List[ImageDatagen]
    datagen objects whose .dataset attributes will be (re)created (required)
**kwargs :
    keyword arguments for the .create_ds method of ImageDatagen; see its documentation for more details
Source code in conftrainer/datasets/loader.py
def create_datasets(datagens: List[ImageDatagen], **kwargs) -> None:
    """
    Update the .dataset attribute of each given ImageDatagen by calling its .create_ds method

    Parameters
    ----------
    datagens : List[ImageDatagen]
        datagen objects whose .dataset attributes will be (re)created
    **kwargs :
        keyword arguments for the .create_ds method of ImageDatagen; see its documentation
        for more details
    """

    for datagen in datagens:
        datagen.create_ds(training=datagen.name == "train", **kwargs)

create_multioutput_datagens(read_config, shape)

Given csv files and classes for each task, create multi-output data generators for training, validation (optional) and test (optional) datasets

Source code in conftrainer/datasets/loader.py
def create_multioutput_datagens(read_config: MultiOutputDataConfig,
                                shape: Iterable[int]) -> Dict[str, MultiOutputDatagen]:
    """Given csv files and classes for each task, create multi output data generators for training, validation (
    optional) and test (optional) datasets"""
    datagens = {}
    csv_dict = read_config.csvs.dict()
    for name, csv_path in csv_dict.items():
        datagen_config = read_multioutput_dataframe(csv_path=csv_path, name=name,
                                                    per_task_data=read_config.per_task_data,
                                                    data_dir=read_config.data_dir, name_col=read_config.name_col,
                                                    clean_dataset=read_config.clean_dataset)
        datagens[name] = MultiOutputDatagen(config=datagen_config, shape=shape)
    for datagen in datagens.values():
        datagen.create_ds(shape=shape)
    return datagens

filter_unlabeled_samples(dataframe, class_names)

Given a dataframe and a list of columns, filter out rows whose values in the given columns are all zero

Source code in conftrainer/datasets/utils.py
def filter_unlabeled_samples(dataframe: pd.DataFrame, class_names: List[str]) -> pd.DataFrame:
    """
    Given a dataframe and a list of columns, filter out rows whose values in the given columns are all zero
    """

    return dataframe[~(dataframe[class_names] == 0).all(axis=1)]
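
A toy example on a made-up dataframe:

import pandas as pd

df = pd.DataFrame({'filename': ['a.jpg', 'b.jpg', 'c.jpg'],
                   'cat': [1, 0, 0], 'dog': [0, 1, 0]})
kept = filter_unlabeled_samples(df, class_names=['cat', 'dog'])
print(kept['filename'].tolist())  # ['a.jpg', 'b.jpg']; c.jpg has no positive label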

stratify_split(dataframe, class_names, test_size=0.2, random_state=None)

Split given dataframe into train and test datasets, stratifying by given columns.

Parameters:

dataframe : pandas.DataFrame
    data to split (required)
class_names : List[str]
    names of label columns to stratify by (required)
test_size : float
    ratio of the test dataset (default: 0.2)
random_state : int, optional
    random state for reproducible results (default: None)

Returns:

out : (pandas.DataFrame, pandas.DataFrame)
    train and test dataframes

Source code in conftrainer/datasets/utils.py
def stratify_split(dataframe: pd.DataFrame, class_names: List[str], test_size: float = 0.2,
                   random_state: Optional[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split given dataframe into train and test datasets, stratifying by given columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        data to split
    class_names : List[str]
        names of label columns to stratify by
    test_size : float
        ratio of test dataset
    random_state : int, optional
        random state for reproducible results

    Returns
    -------
    out : (pandas.DataFrame, pandas.DataFrame)
        train and test dataframes
    """
    labels = dataframe[class_names].values.tolist()
    msss = MultilabelStratifiedShuffleSplit(n_splits=3,
                                            test_size=test_size,
                                            random_state=random_state)
    train_index, test_index = next(msss.split(labels, labels))
    train = dataframe.iloc[train_index].reset_index(drop=True)
    test = dataframe.iloc[test_index].reset_index(drop=True)

    return train, test
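
A hedged example on random labels; the exact split sizes can vary slightly because stratification works per label combination:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.integers(0, 2, size=(100, 3)), columns=['a', 'b', 'c'])
train, test = stratify_split(df, class_names=['a', 'b', 'c'],
                             test_size=0.2, random_state=42)
print(len(train), len(test))  # roughly 80 and 20, per-label ratios preserved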

clean_dataframe(dataframe, data_dir, col, clean_save_path=None)

Remove rows with broken/missing images from a dataframe. If a save path is provided, the cleaned dataframe will be saved to disk.

Parameters:

dataframe : pandas.DataFrame
    dataframe to clean (required)
col : str
    name of the column containing filenames (required)
data_dir : str
    path to the directory containing the images (required)
clean_save_path : str, optional
    a csv path to save the cleaned dataframe (default: None)

Returns:

out : pandas.DataFrame
    cleaned dataframe

Source code in conftrainer/datasets/utils.py
def clean_dataframe(dataframe: pd.DataFrame, data_dir: str, col: str,
                    clean_save_path: Optional[str] = None) -> pd.DataFrame:
    """
    Remove rows with broken/missing images from a dataframe. If a save path is provided,
    the cleaned dataframe will be saved to disk.

    Parameters
    ----------
    dataframe: pandas.DataFrame
        dataframe to clean
    col: str
        name of the column containing filenames
    data_dir: str
        path to directory containing the images
    clean_save_path: str, optional, default: None
        a csv path to save the cleaned dataframe

    Returns
    --------
    out : pandas.DataFrame
        cleaned dataframe
    """
    broken = []
    for name in dataframe[col]:
        filepath = os.path.join(data_dir, name)
        try:
            read_file(filepath)
        except errors.InvalidArgumentError:
            broken.append(name)
            os.remove(filepath)
        except errors.NotFoundError:
            broken.append(name)
    print(f"Found {len(broken)} broken images. Removing them from dataframe...")
    clean_df = dataframe[~dataframe[col].isin(broken)].reset_index(drop=True)
    if clean_save_path:
        print(f"Saving cleaned dataframe to {clean_save_path}")
        clean_df.to_csv(clean_save_path, index=False)
    return clean_df

read_preprocess_dataframe(csv_path, name_col, classes=None, clean_dataset=False, data_dir='./')

Read and preprocess a dataframe containing filepaths and their labels

Parameters:

csv_path : str
    path of the csv containing names & labels of samples (required)
name_col : str
    name of the column with filenames (required)
classes : List[str], optional
    names of the label columns (default: None)
data_dir : str
    root directory to read the images from (default: './')
clean_dataset : bool
    whether to drop non-existing/invalid images from the dataframe before proceeding (default: False)

Returns:

filenames : List[str]
    filenames of the samples
labels : np.ndarray, optional
    values of the label columns; None if no classes are given

Source code in conftrainer/datasets/utils.py
def read_preprocess_dataframe(csv_path: str, name_col: str,
                              classes: Optional[List[str]] = None, clean_dataset: bool = False,
                              data_dir: str = './') -> Tuple[List[str], Optional[np.ndarray]]:
    """
    Read and preprocess a dataframe containing filepaths and their labels

    Parameters
    ----------
    csv_path : str
        path of the csv containing names & labels of samples
    name_col : str
        name of the column with filenames
    classes : List[str], optional
        names of the label columns
    data_dir : str = './'
        root directory to read the images from
    clean_dataset: bool = False
        whether to clean the dataframe from non-existing/invalid images before proceeding

    Returns
    -------
    filenames : List[str]
        filenames of the samples
    labels : np.ndarray, optional
        values of the label columns; None if no classes are given
    """
    dataframe = pd.read_csv(csv_path)
    if clean_dataset:
        dataframe = clean_dataframe(dataframe=dataframe,
                                    data_dir=data_dir,
                                    col=name_col,
                                    clean_save_path=csv_path)

    labels = None

    if classes:
        dataframe = filter_unlabeled_samples(dataframe=dataframe,
                                             class_names=classes)
        validate_column_sum(dataframe=dataframe, columns=classes, name=csv_path)
        labels = dataframe[classes].values
    filenames = [os.path.join(data_dir, filename) for filename in dataframe[name_col]]
    return filenames, labels
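
A typical call; the csv path and column names are illustrative:

filenames, labels = read_preprocess_dataframe(csv_path='data/train.csv',
                                              name_col='filename',
                                              classes=['cat', 'dog'],
                                              data_dir='data/images')
print(filenames[:2], labels.shape)  # full image paths and an (n_samples, 2) label matrix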

validate_column_sum(dataframe, columns, name='')

Check if all provided columns of a one-hot encoded dataframe have at least one sample

Parameters:

dataframe : pd.DataFrame
    data to check (required)
columns : List[str]
    names of columns to check (required)
name : str
    name of the dataframe to display in the error message (default: '')

Raises:

ValueError
    if there is a column with no samples
Source code in conftrainer/datasets/utils.py
def validate_column_sum(dataframe: pd.DataFrame, columns: List[str], name: str = '') -> None:
    """
    Check if all provided columns of a one-hot encoded dataframe have at least one sample

    Parameters
    ----------
    dataframe : pd.DataFrame
        data to check
    columns : List[str]
        names of columns to check
    name : str = ''
        name of the dataframe to display in error message

    Raises
    -------
    ValueError: if there's a column with no samples
    """
    col_sum = dataframe[columns].sum(axis=0)
    zero_sum_cols = list(col_sum[col_sum == 0].index)
    if zero_sum_cols:
        raise ValueError(f"Please provide at least 1 example for following classes in {name} "
                         f"dataframe: {zero_sum_cols}")

read_multioutput_dataframe(csv_path, per_task_data, name, data_dir, name_col, clean_dataset=False)

Read a dataframe with possibly multiple sets of labels and generate a configuration to create a Datagen

Parameters:

csv_path : str
    path of the csv file to read (required)
per_task_data : List[SingleTaskDataConfig]
    configuration for each task; includes class names and will be filled with the actual labels (required)
name : str
    name of the datagen config (required)
data_dir : str
    root directory to read the images from (required)
name_col : str
    name of the column containing filenames (required)
clean_dataset : bool
    whether to check for broken images in the data (default: False)

Returns:

out : MultiOutputDatagenConfig
    a configuration to create an image data generator with the given filepaths and a separate set of labels per task

Source code in conftrainer/datasets/utils.py
def read_multioutput_dataframe(csv_path: str, per_task_data: List[SingleTaskDataConfig], name: str,
                               data_dir: str, name_col: str, clean_dataset: bool = False) -> MultiOutputDatagenConfig:
    """
    Read a dataframe with possibly multiple sets of labels and generate a configuration to create a Datagen

    Parameters
    ----------
    csv_path : str
        path of csv file to read
    per_task_data : List[SingleTaskDataConfig]
        configuration for each task. Includes class names, and will be filled with actual labels
    name : str
        name of the datagen config
    data_dir : str
        root directory to read the images from
    name_col : str
        name of the column containing filenames
    clean_dataset : bool = False
        whether to check if there are broken images in the data

    Returns
    -------
    out : MultiOutputDatagenConfig
        a configuration to create image data generator with given filepaths and a separate set of labels per task
    """
    dataframe = pd.read_csv(csv_path)
    if clean_dataset:
        dataframe = clean_dataframe(dataframe=dataframe,
                                    data_dir=data_dir,
                                    col=name_col,
                                    clean_save_path=csv_path)
    # Remove samples with no positive labels
    for task in per_task_data:
        dataframe = filter_unlabeled_samples(dataframe=dataframe, class_names=task.classes)

    for task in per_task_data:
        validate_column_sum(dataframe=dataframe, columns=task.classes, name=csv_path)
        task.labels = dataframe[task.classes].values
    filepaths = [os.path.join(data_dir, filename) for filename in dataframe[name_col].values]
    return MultiOutputDatagenConfig(paths=filepaths,
                                    per_task_data=per_task_data,
                                    name=name)