Skip to content

Input Validation

Data-based

Quality

pymdma.tabular.measures.input_val.CorrelationScore

Computes linear correlations between attributes in a dataset and returns the average percentage of attributes that are moderately or strongly correlated with each attribute.

Objective: Correlation

Parameters:

Name Type Description Default
column_names list of str

List of column names corresponding to the attributes in the dataset.

None
correlation_thresh float

The correlation threshold to consider an attribute as moderately or strongly correlated. Defaults to 0.5.

0.5
**kwargs dict

Additional keyword arguments for compatibility or future use.

{}
References

Shrestha, Detecting multicollinearity in regression analysis (2020). http://pubs.sciepub.com/ajams/8/2/1

Returns:

Type Description
MetricResult

A MetricResult object containing the percentage of columns correlated with each other, and global summary statistics.

Examples:

>>> # Example 1: Initializing and computing correlation on random data
>>> import numpy as np
>>> column_names = [f'col_{i}' for i in range(10)]
>>> data = np.random.rand(100, 10)
>>> correlation_score = CorrelationScore(column_names=column_names)
>>> result: MetricResult = correlation_score.compute(data)
>>> dataset_level, _ = result.value # Percentage of correlated attributes
>>> dataset_stats, _ = result.stats # Mean and std of correlation percentages
>>> # Example 2: Specifying a different correlation threshold
>>> correlation_score = CorrelationScore(column_names=column_names, correlation_thresh=0.7)
>>> result: MetricResult = correlation_score.compute(data)
>>> dataset_level, _ = result.value # Percentage of correlated attributes
>>> dataset_stats, _ = result.stats # Mean and std of correlation percentages
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class CorrelationScore(Metric):
    """Computes linear correlations between attributes in a dataset and returns
    the average percentage of attributes that are moderately or strongly
    correlated with each attribute.

    **Objective**: Correlation

    Parameters
    ----------
    column_names : list of str, optional, default=None
        List of column names corresponding to the attributes in the dataset.
        When it is not a list, names of the form ``att_<idx>`` are generated
        from the last dimension of the data passed to ``compute``.
    correlation_thresh : float, optional, default=0.5
        The correlation threshold to consider an attribute as moderately or strongly correlated.
    **kwargs : dict
        Additional keyword arguments for compatibility or future use.

    References
    ----------
    Shrestha, Detecting multicollinearity in regression analysis (2020).
    http://pubs.sciepub.com/ajams/8/2/1

    Returns
    -------
    MetricResult
        A MetricResult object containing the percentage of columns correlated with each other, and global summary statistics.

    Examples
    --------
    >>> # Example 1: Initializing and computing correlation on random data
    >>> import numpy as np
    >>> column_names = [f'col_{i}' for i in range(10)]
    >>> data = np.random.rand(100, 10)
    >>> correlation_score = CorrelationScore(column_names=column_names)
    >>> result: MetricResult = correlation_score.compute(data)
    >>> dataset_level, _ = result.value # Percentage of correlated attributes
    >>> dataset_stats, _ = result.stats # Mean and std of correlation percentages

    >>> # Example 2: Specifying a different correlation threshold
    >>> correlation_score = CorrelationScore(column_names=column_names, correlation_thresh=0.7)
    >>> result: MetricResult = correlation_score.compute(data)
    >>> dataset_level, _ = result.value # Percentage of correlated attributes
    >>> dataset_stats, _ = result.stats # Mean and std of correlation percentages
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    # NOTE(review): compute() reports percentages (0-100), not fractions;
    # confirm whether max_value should be 100.0.
    max_value: float = 1.0

    def __init__(
        self,
        column_names: Optional[List[str]] = None,
        correlation_thresh: Optional[float] = 0.5,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.c_thresh = correlation_thresh
        self.column_names = column_names

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Computes the correlation matrix for the input data and determines
        the percentage of attributes that are moderately or strongly correlated
        with each attribute.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which the correlation matrix will be computed.
        **kwargs : dict
            Additional keyword arguments, forwarded to both ``corr_matrix`` and
            ``corr_strong``.

        Returns
        -------
        MetricResult
            A MetricResult object containing the percentage of correlated attributes for each
            attribute and statistics (mean and standard deviation) of the correlations.
        """

        # correlation matrix
        corr_m = corr_matrix(
            data=data,
            **kwargs,
        )

        # columns
        cols = (
            self.column_names
            if isinstance(self.column_names, list)
            else [f"att_{idx}" for idx in range(data.shape[-1])]
        )

        # Moderate/highly correlated attributes per attribute
        stats_d = corr_strong(
            corr_matrix=corr_m,
            cols=cols,
            c_thresh=self.c_thresh,
            **kwargs,
        )

        # Each attribute is compared against the remaining len(cols) - 1 attributes.
        # Clamp to 1 so a single-column dataset does not divide by zero.
        n_others = max(len(cols) - 1, 1)

        # global score
        perc_corr = {col: round(100 * len(att) / n_others, 1) for col, att in stats_d.items()}

        # stats
        perc_vals = list(perc_corr.values())
        perc_stats = {
            "mean": np.mean(perc_vals),
            "std": np.std(perc_vals),
        }

        return MetricResult(
            dataset_level={
                "dtype": OutputsTypes.KEY_VAL,
                "subtype": "float",
                "value": perc_corr,
                "stats": perc_stats,
            },
        )

pymdma.tabular.measures.input_val.UniquenessScore

Computes the percentage of duplicate records in a dataset, providing a measure of the dataset's uniqueness.

The uniqueness score is calculated by determining the proportion of duplicate rows in the dataset. A higher percentage indicates more duplicates, while a lower percentage indicates higher uniqueness.

Objective: Uniqueness

Parameters:

Name Type Description Default
**kwargs dict

Additional keyword arguments for compatibility or future use.

{}
References

Sukhobok, Tabular data anomaly patterns (2017). https://ieeexplore.ieee.org/document/8316296

Returns:

Type Description
MetricResult

A MetricResult object containing the percentage of duplicate rows in the dataset.

Examples:

>>> # Example 1: Computing uniqueness score on a dataset with no duplicates
>>> import numpy as np
>>> data = np.random.rand(100, 5)  # Random dataset (no duplicates)
>>> uniqueness_score = UniquenessScore()
>>> result: MetricResult = uniqueness_score.compute(data)
>>> dataset_level, _ = result.value  # Output: 0.0 (no duplicates)
>>> # Example 2: Computing uniqueness score on a dataset with duplicates
>>> data_with_dupl = np.concatenate([data, data[:10]])  # Add 10 duplicate rows
>>> result: MetricResult = uniqueness_score.compute(data_with_dupl)
>>> dataset_level, _ = result.value  # Output: Percentage of duplicate rows
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
class UniquenessScore(Metric):
    """Computes the percentage of duplicate records in a dataset, providing a
    measure of the dataset's uniqueness.

    The uniqueness score is calculated by determining the proportion of duplicate rows in the dataset. A higher
    percentage indicates more duplicates, while a lower percentage indicates higher uniqueness.

    Every row that belongs to a group of identical rows is counted, including the first occurrence
    of that group.

    **Objective**: Uniqueness

    Parameters
    ----------
    **kwargs : dict
        Additional keyword arguments for compatibility or future use.

    References
    ----------
    Sukhobok, Tabular data anomaly patterns (2017).
    https://ieeexplore.ieee.org/document/8316296

    Returns
    -------
    MetricResult
        A MetricResult object containing the percentage of duplicate rows in the dataset.

    Examples
    --------
    >>> # Example 1: Computing uniqueness score on a dataset with no duplicates
    >>> import numpy as np
    >>> data = np.random.rand(100, 5)  # Random dataset (no duplicates)
    >>> uniqueness_score = UniquenessScore()
    >>> result: MetricResult = uniqueness_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: 0.0 (no duplicates)

    >>> # Example 2: Computing uniqueness score on a dataset with duplicates
    >>> data_with_dupl = np.concatenate([data, data[:10]])  # Add 10 duplicate rows
    >>> result: MetricResult = uniqueness_score.compute(data_with_dupl)
    >>> dataset_level, _ = result.value  # Output: Percentage of duplicate rows
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    # NOTE(review): compute() reports a percentage (0-100), not a fraction;
    # confirm whether max_value should be 100.0.
    max_value: float = 1.0

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Computes the percentage of duplicate records in a dataset, providing
        a measure of the dataset's uniqueness.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which the uniqueness score will be computed.
            Rows are compared along axis 0.
        **kwargs : dict
            Additional keyword arguments for controlling the computation (currently unused).

        Returns
        -------
        MetricResult
            A MetricResult object containing the percentage of duplicate rows in the dataset.
            An empty dataset yields 0.0.
        """

        # total number of samples
        n_total = len(data)

        if n_total == 0:
            # no rows means no duplicates; also avoids dividing by zero below
            p_dupl = 0.0
        else:
            # occurrences of each distinct row
            _, cnt = np.unique(
                data,
                axis=0,
                return_counts=True,
            )

            # number of rows that belong to a group seen more than once
            n_dupl = cnt[cnt > 1].sum()

            # percentage of duplicates
            p_dupl = 100 * n_dupl / n_total

        return MetricResult(
            dataset_level={
                "dtype": OutputsTypes.NUMERIC,
                "subtype": "float",
                "value": p_dupl,
            },
        )

pymdma.tabular.measures.input_val.UniformityScore

Computes a uniformity score for each attribute in the dataset, evaluating both discrete and continuous columns.

For each column, the score assesses its uniformity, entropy, and imbalance level, which can be aggregated to provide insights into the overall distribution of values. Discrete columns are scored based on categories, while continuous columns are assessed based on the spread of values.

Objective: Uniformity

Parameters:

Name Type Description Default
column_names list of str

List of column names in the dataset for which the uniformity score will be computed, by default None.

None
col_map dict of str

Dictionary mapping each column name to its data type information, including whether it's continuous or discrete, by default None.

None
**kwargs dict

Additional keyword arguments for compatibility or future use.

{}

Returns:

Type Description
MetricResult

A MetricResult object containing the uniformity score for each column and summary statistics.

Examples:

>>> # Example 1: Computing uniformity score on a dataset with random data
>>> import numpy as np
>>> column_names = ['A', 'B', 'C']
>>> col_map = {'A': {'type': {'tag': 'discrete', 'opt': [1, 2, 3]}},
...            'B': {'type': {'tag': 'discrete', 'opt': [0, 1]}},
...            'C': {'type': {'tag': 'continuous'}}}
>>> data = np.random.rand(100, 3)
>>> uniformity_score = UniformityScore(column_names=column_names, col_map=col_map)
>>> result: MetricResult = uniformity_score.compute(data)
>>> dataset_level, _ = result.value  # Output: Uniformity scores per column
>>> # Example 2: Computing uniformity score on a dataset with predefined categories
>>> data_with_categories = np.array([[1, 0, 3.2], [1, 1, 2.5], [2, 0, 4.1]])
>>> result: MetricResult = uniformity_score.compute(data_with_categories)
>>> dataset_level, _ = result.value  # Output: Uniformity scores per column
>>> dataset_stats, _ = result.stats  # Output: Mean and standard deviation of imbalance levels
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
class UniformityScore(Metric):
    """Computes a uniformity score for each attribute in the dataset,
    evaluating both discrete and continuous columns.

    For each column, the score assesses its uniformity, entropy, and imbalance level, which can be aggregated to
    provide insights into the overall distribution of values. Discrete columns are scored based on categories,
    while continuous columns are assessed based on the spread of values.

    **Objective**: Uniformity

    Parameters
    ----------
    column_names : list of str, optional, default=None
        List of column names in the dataset for which the uniformity score will be computed.
        When it is not a list, names of the form ``att_<idx>`` are generated from the last
        dimension of the data passed to ``compute``.
    col_map : dict, optional, default=None
        Dictionary mapping each column name to its data type information, in the form
        ``{"type": {"tag": "discrete" | "continuous", "opt": [...]}}``. ``opt`` lists the
        categories of a discrete column and may be omitted.
    **kwargs : dict
        Additional keyword arguments for compatibility or future use.

    Returns
    -------
    MetricResult
        A MetricResult object containing the uniformity score for each column and summary statistics.

    Examples
    --------
    >>> # Example 1: Computing uniformity score on a dataset with random data
    >>> import numpy as np
    >>> column_names = ['A', 'B', 'C']
    >>> col_map = {'A': {'type': {'tag': 'discrete', 'opt': [1, 2, 3]}},
    ...            'B': {'type': {'tag': 'discrete', 'opt': [0, 1]}},
    ...            'C': {'type': {'tag': 'continuous'}}}
    >>> data = np.random.rand(100, 3)
    >>> uniformity_score = UniformityScore(column_names=column_names, col_map=col_map)
    >>> result: MetricResult = uniformity_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: Uniformity scores per column

    >>> # Example 2: Computing uniformity score on a dataset with predefined categories
    >>> data_with_categories = np.array([[1, 0, 3.2], [1, 1, 2.5], [2, 0, 4.1]])
    >>> result: MetricResult = uniformity_score.compute(data_with_categories)
    >>> dataset_level, _ = result.value  # Output: Uniformity scores per column
    >>> dataset_stats, _ = result.stats  # Output: Mean and standard deviation of imbalance levels
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    max_value: float = 1.0

    def __init__(
        self,
        column_names: Optional[List[str]] = None,
        col_map: Optional[Dict[str, dict]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.column_names = column_names
        self.col_map = col_map

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Computes the uniformity score for each column in the dataset.

        For discrete columns, the function calculates how uniformly the categories are distributed.
        For continuous columns, it assesses the spread of values. The results are returned for each column,
        including the overall mean and standard deviation of imbalance levels.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which the uniformity score will be computed.
        **kwargs : dict
            Additional keyword arguments for controlling the computation (currently unused).

        Returns
        -------
        MetricResult
            A MetricResult object containing the imbalance level of each column and summary
            statistics. Columns whose type tag is neither ``"discrete"`` nor ``"continuous"``
            (for example, columns missing from ``col_map``) are reported as ``None`` and are
            left out of the mean and standard deviation.
        """

        # columns
        cols = (
            self.column_names
            if isinstance(self.column_names, list)
            else [f"att_{idx}" for idx in range(data.shape[-1])]
        )

        # column map
        col_map_exists = isinstance(self.col_map, dict)

        # imbalance level per column
        imb_d = {}

        # loop over columns
        for idx, col in enumerate(cols):
            # type and number of tag (discrete/continuous)
            if col_map_exists:
                # get column key
                vtype = self.col_map.get(col, {}).get("type", {})
                vtag = vtype.get("tag")
                # "opt" is optional (continuous columns do not define it)
                opt = vtype.get("opt")
                vnum = len(opt) if opt is not None else None
            else:
                # NOTE(review): the whole array is passed here rather than the
                # current column (data[:, idx]); confirm against is_categorical.
                vtag = "discrete" if is_categorical(data) else "continuous"
                vnum = None

            # compute scores (only the imbalance level is reported)
            if vtag == "discrete":
                _stat, _ent, imb = uniformity_score_per_column(
                    data_col=data[:, idx],
                    is_continuous=False,
                    n_categories=vnum,
                )
            elif vtag == "continuous":
                _stat, _ent, imb = uniformity_score_per_column(
                    data_col=data[:, idx],
                    is_continuous=True,
                )
            else:
                # unknown column type: no score available
                imb = None

            imb_d[col] = imb

        # aggregate once, after all columns are scored; skip unscored columns
        imb_v = [val for val in imb_d.values() if val is not None]

        # stats
        if imb_v:
            imb_stats = {
                "mean": np.mean(imb_v),
                "std": np.std(imb_v),
            }
        else:
            # nothing to aggregate (no columns, or none with a known type)
            imb_stats = {
                "mean": np.nan,
                "std": np.nan,
            }

        return MetricResult(
            dataset_level={
                "dtype": OutputsTypes.KEY_VAL,
                "subtype": "float",
                "value": imb_d,
                "stats": imb_stats,
            },
        )

pymdma.tabular.measures.input_val.OutlierScore

Computes the percentage of outliers in each column of a dataset.

For each column, the function detects outliers using both z-score and interquartile range (IQR) methods, calculates the percentage of outliers, and averages the results of both methods. It also computes summary statistics (mean and standard deviation) of the outlier percentages across all columns.

Objective: Out of Distribution Detection

Parameters:

Name Type Description Default
column_names list of str

List of column names in the dataset, by default None.

None
**kwargs dict

Additional keyword arguments passed to the parent class.

{}
References

Iglewicz, B. and Hoaglin, D. (1993) The ASQC Basic References in Quality Control: Statistical Techniques. In: Mykytka, E.F., Eds., How to Detect and Handle Outliers, ASQC Quality Press, Milwaukee, Vol. 16

Returns:

Type Description
MetricResult

A MetricResult object containing the outlier percentage for each column and summary statistics.

Examples:

>>> # Example 1: Computing outlier score on a random dataset
>>> import numpy as np
>>> column_names = ['A', 'B', 'C']
>>> data = np.random.rand(100, 3)  # Random dataset of 100 samples and 3 columns
>>> outlier_score = OutlierScore(column_names=column_names)
>>> result: MetricResult = outlier_score.compute(data)
>>> dataset_level, _ = result.value  # Output: Percentage of outliers per column
>>> # Example 2: Computing outlier score on a dataset with some extreme values
>>> data_with_outliers = np.array([[1, 2, 3], [4, 5, 1000], [6, 7, 8]])  # Column 'C' contains an outlier
>>> result: MetricResult = outlier_score.compute(data_with_outliers)
>>> dataset_level, _ = result.value  # Output: Percentage of outliers per column
>>> dataset_stats, _ = result.stats  # Output: Mean and standard deviation of outlier percentages
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
class OutlierScore(Metric):
    """Reports, per column, the percentage of values flagged as outliers.

    Two detectors are applied to every column, one based on the z-score and one
    based on the interquartile range (IQR). Each detector's outlier count is turned
    into a percentage of the retained rows, and the two percentages are averaged.
    The mean and standard deviation of these per-column values are returned as
    summary statistics.

    **Objective**: Out of Distribution Detection

    Parameters
    ----------
    column_names : list of str, optional, default=None
        Names of the dataset columns. When it is not a list, names of the form
        ``att_<idx>`` are generated from the last dimension of the data.
    **kwargs : dict
        Additional keyword arguments passed to the parent class.

    References
    ----------
    Iglewicz, B. and Hoaglin, D. (1993) The ASQC Basic References in Quality Control: Statistical Techniques.
    In: Mykytka, E.F., Eds., How to Detect and Handle Outliers, ASQC Quality Press, Milwaukee, Vol. 16

    Returns
    -------
    MetricResult
        A MetricResult object containing the outlier percentage for each column and summary statistics.

    Examples
    --------
    >>> # Example 1: Computing outlier score on a random dataset
    >>> import numpy as np
    >>> column_names = ['A', 'B', 'C']
    >>> data = np.random.rand(100, 3)  # Random dataset of 100 samples and 3 columns
    >>> outlier_score = OutlierScore(column_names=column_names)
    >>> result: MetricResult = outlier_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: Percentage of outliers per column

    >>> # Example 2: Computing outlier score on a dataset with some extreme values
    >>> data_with_outliers = np.array([[1, 2, 3], [4, 5, 1000], [6, 7, 8]])  # Column 'C' contains an outlier
    >>> result: MetricResult = outlier_score.compute(data_with_outliers)
    >>> dataset_level, _ = result.value  # Output: Percentage of outliers per column
    >>> dataset_stats, _ = result.stats  # Output: Mean and standard deviation of outlier percentages
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    max_value: float = 1.0

    def __init__(self, column_names: Optional[List[str]] = None, **kwargs):
        """Stores the column names used to label the per-column results.

        Parameters
        ----------
        column_names : list of str, optional
            Names of the columns on which outlier detection is run.
        **kwargs : dict
            Additional keyword arguments passed to the parent class.
        """

        super().__init__(**kwargs)
        self.column_names = column_names

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Calculates the outlier percentage of every column.

        Rows holding at least one NaN are discarded first. For each column the
        z-score and IQR outlier counts are converted to percentages of the
        remaining rows and averaged.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which the outlier percentage will be computed.
        **kwargs : dict
            Additional keyword arguments for controlling the computation (currently unused).

        Returns
        -------
        MetricResult
            A MetricResult object containing the outlier percentages for each column and
            their mean and standard deviation.
        """

        # keep only the rows that are free of NaN values
        rows_with_nan = np.isnan(data).any(axis=1)
        clean = data[~rows_with_nan]

        # every column shares the same number of retained rows
        n_rows = clean.shape[0]

        # resolve the labels used as keys of the result
        if isinstance(self.column_names, list):
            names = self.column_names
        else:
            names = [f"att_{idx}" for idx in range(data.shape[-1])]

        outlier_pct = {}
        for position, name in enumerate(names):
            column = clean[:, position]

            # outlier counts from both detectors
            n_z = z_score_outliers(column)
            n_iqr = iqr_outliers(column)

            # average of the two percentages
            outlier_pct[name] = np.mean([100 * n_z / n_rows, 100 * n_iqr / n_rows])

        # summary over all columns
        per_column = list(outlier_pct.values())
        summary = {
            "mean": np.mean(per_column),
            "std": np.std(per_column),
        }

        return MetricResult(
            dataset_level={
                "dtype": OutputsTypes.KEY_VAL,
                "subtype": "float",
                "value": outlier_pct,
                "stats": summary,
            },
        )

pymdma.tabular.measures.input_val.MissingScore

Computes the percentage of missing values per column in the dataset and provides summary statistics for missing rates across samples and columns.

Objective: Missing Values

Parameters:

Name Type Description Default
column_names list of str

List of column names in the dataset, by default None.

None
**kwargs dict

Additional keyword arguments passed to the parent class.

{}
References

Taleb et al., Big data quality: A quality dimensions evaluation (2016). https://ieeexplore.ieee.org/document/7816918

Returns:

Type Description
MetricResult

A MetricResult object containing the missing value percentage for each column and summary statistics.

Examples:

>>> # Example 1: Computing missing values score on a random dataset
>>> import numpy as np
>>> column_names = ['A', 'B', 'C']
>>> data = np.random.rand(100, 3)  # Random dataset of 100 samples and 3 columns
>>> data[0, 0] = np.nan  # Introducing a missing value
>>> missing_score = MissingScore(column_names=column_names)
>>> result: MetricResult = missing_score.compute(data)
>>> dataset_level, _ = result.value  # Output: Percentage of missing values per column
>>> # Example 2: Computing missing score on a dataset with significant missing values
>>> data_with_missing = np.array([[1, np.nan, 3], [np.nan, 5, np.nan], [6, 7, 8]])  # Missing values in multiple columns
>>> result: MetricResult = missing_score.compute(data_with_missing)
>>> dataset_level, _ = result.value  # Output: Percentage of missing values per column
>>> dataset_stats, _ = result.stats  # Output: Mean missing rate for samples and columns
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
class MissingScore(Metric):
    """Computes the percentage of missing values per column in the dataset and
    provides summary statistics for missing rates across samples and columns.

    Missing entries are detected with ``np.isnan``. All rates are expressed as
    percentages (0 to 100) rounded to three decimal places.

    **Objective**: Missing Values

    Parameters
    ----------
    column_names : list of str, optional, default=None
        List of column names in the dataset. When this is not a list, names of
        the form ``att_<index>`` are generated from the last dimension of the
        data passed to ``compute``.
    **kwargs : dict
        Additional keyword arguments passed to the parent class.

    References
    ----------
    Taleb et al., Big data quality: A quality dimensions evaluation (2016).
    https://ieeexplore.ieee.org/document/7816918

    Returns
    -------
    MetricResult
        A MetricResult object containing the missing value percentage for each column and summary statistics.

    Examples
    --------
    >>> # Example 1: Computing missing values score on a random dataset
    >>> import numpy as np
    >>> column_names = ['A', 'B', 'C']
    >>> data = np.random.rand(100, 3)  # Random dataset of 100 samples and 3 columns
    >>> data[0, 0] = np.nan  # Introducing a missing value
    >>> missing_score = MissingScore(column_names=column_names)
    >>> result: MetricResult = missing_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: Percentage of missing values per column

    >>> # Example 2: Computing missing score on a dataset with significant missing values
    >>> data_with_missing = np.array([[1, np.nan, 3], [np.nan, 5, np.nan], [6, 7, 8]])  # Missing values in multiple columns
    >>> result: MetricResult = missing_score.compute(data_with_missing)
    >>> dataset_level, _ = result.value  # Output: Percentage of missing values per column
    >>> dataset_stats, _ = result.stats  # Output: Mean missing rate for samples and columns
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    # ``compute`` scales every rate by 100, so the largest reachable value is
    # 100 (a fully missing column), not 1.
    max_value: float = 100.0

    def __init__(self, column_names: Optional[List[str]] = None, **kwargs):
        super().__init__(**kwargs)
        self.column_names = column_names

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Computes the percentage of missing values per column, together with
        the mean missing percentage over samples (rows) and over columns.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which the missing value percentage will be computed.
            The data should be a 2D array where NaN values represent missing entries.
        **kwargs : dict
            Additional keyword arguments; not used by this computation.

        Returns
        -------
        MetricResult
            A MetricResult object whose dataset-level ``value`` maps each column
            name to its missing percentage and whose ``stats`` holds the mean
            missing percentage under the keys ``"sample"`` and ``"column"``.
        """

        # Fall back to positional names when no explicit list was provided.
        cols = (
            self.column_names
            if isinstance(self.column_names, list)
            else [f"att_{idx}" for idx in range(data.shape[-1])]
        )

        # The NaN mask is needed along both axes, so build it only once.
        nan_mask = np.isnan(data)

        # Missing percentage per sample (row) and per column.
        miss_samp = np.round(100 * nan_mask.sum(axis=1) / data.shape[1], 3)
        miss_col = np.round(100 * nan_mask.sum(axis=0) / data.shape[0], 3)

        # Missing rates per column; zip stops at the shorter of names/rates.
        miss_col_d = dict(zip(cols, miss_col))

        # Aggregated missing rates.
        miss_agg_d = {
            "sample": miss_samp.mean(),
            "column": miss_col.mean(),
        }

        return MetricResult(
            dataset_level={
                "dtype": OutputsTypes.KEY_VAL,
                "subtype": "float",
                "value": miss_col_d,
                "stats": miss_agg_d,
            },
        )

pymdma.tabular.measures.input_val.DimCurseScore

Computes the ratio of the number of columns (features) to the number of samples (instances) in the dataset to evaluate the curse of dimensionality. A higher ratio indicates that the dataset may suffer from high dimensionality relative to the number of samples.

Objective: Dimensionality

Parameters:

Name Type Description Default
**kwargs dict

Additional keyword arguments passed to the parent class.

{}

Returns:

Type Description
MetricResult

A MetricResult object containing the ratio of columns to samples.

Examples:

>>> # Example 1: Evaluating dimensionality on a dataset with more samples than features
>>> import numpy as np
>>> data = np.random.rand(100, 10)  # 100 samples, 10 columns
>>> dim_curse_score = DimCurseScore()
>>> result: MetricResult = dim_curse_score.compute(data)
>>> dataset_level, _ = result.value  # Output: 0.1 (indicating more samples than features)
>>> # Example 2: Evaluating dimensionality on a dataset with more features than samples
>>> data = np.random.rand(10, 100)  # 10 samples, 100 columns
>>> result: MetricResult = dim_curse_score.compute(data)
>>> dataset_level, _ = result.value  # Output: 10.0 (indicating more features than samples)
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
class DimCurseScore(Metric):
    """Computes the ratio of the number of columns (features) to the number of
    samples (instances) in the dataset to evaluate the curse of dimensionality.
    A higher ratio indicates that the dataset may suffer from high
    dimensionality relative to the number of samples.

    **Objective**: Dimensionality

    Parameters
    ----------
    **kwargs : dict
        Additional keyword arguments passed to the parent class.

    Returns
    -------
    MetricResult
        A MetricResult object containing the ratio of columns to samples.

    Examples
    --------
    >>> # Example 1: Evaluating dimensionality on a dataset with more samples than features
    >>> import numpy as np
    >>> data = np.random.rand(100, 10)  # 100 samples, 10 columns
    >>> dim_curse_score = DimCurseScore()
    >>> result: MetricResult = dim_curse_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: 0.1 (indicating more samples than features)

    >>> # Example 2: Evaluating dimensionality on a dataset with more features than samples
    >>> data = np.random.rand(10, 100)  # 10 samples, 100 columns
    >>> result: MetricResult = dim_curse_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: 10.0 (indicating more features than samples)
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    max_value: float = np.inf

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Computes the ratio of the number of columns (features) to the number
        of rows (samples) to assess the dataset's susceptibility to the curse
        of dimensionality.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which the dimensionality ratio will be computed.
            The data should be a 2D array where the shape is (samples, columns).
        **kwargs : dict
            Additional keyword arguments; not used by this computation.

        Returns
        -------
        MetricResult
            A MetricResult object containing the computed ratio of columns to samples.

        Raises
        ------
        ValueError
            If ``data`` does not have exactly two dimensions.
        """

        # The shape is (samples, columns); unpacking enforces the 2D requirement.
        n_samples, n_columns = data.shape

        # Columns over samples: dividing the shape in its natural order would
        # yield samples / columns, the inverse of the documented metric.
        ratio = np.divide(n_columns, n_samples)

        return MetricResult(
            dataset_level={
                "dtype": OutputsTypes.NUMERIC,
                "subtype": "float",
                "value": ratio,
            },
        )

pymdma.tabular.measures.input_val.VIFactorScore

Calculates the Variance Inflation Factor (VIF) to assess the multicollinearity of each attribute (feature) in the dataset. VIF measures how much the variance of an estimated regression coefficient increases if your predictors are correlated.

Objective: Multicollinearity

Parameters:

Name Type Description Default
column_names list of str

List of the names of the columns (features) in the dataset.

None
**kwargs dict

Additional keyword arguments passed to the parent class.

{}
References

Marcoulides and Raykov, Evaluation of variance inflation factors in regression models using latent variable modeling methods (2019). https://pmc.ncbi.nlm.nih.gov/articles/PMC6713981/

Returns:

Type Description
MetricResult

A MetricResult object containing the variance inflation factor (VIF) for each attribute.

Examples:

>>> # Example 1: Evaluating VIF on a dataset with low multicollinearity
>>> import numpy as np
>>> data = np.random.rand(100, 5)  # 100 samples, 5 features
>>> vif_score = VIFactorScore(column_names=["col1", "col2", "col3", "col4", "col5"])
>>> result: MetricResult = vif_score.compute(data)
>>> dataset_level, _ = result.value  # Output: VIF scores per column
>>> # Example 2: Evaluating VIF on a dataset with high multicollinearity
>>> data = np.array([[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]])  # Multicollinear features
>>> vif_score = VIFactorScore(column_names=["A", "B", "C"])
>>> result: MetricResult = vif_score.compute(data)
>>> dataset_level, _ = result.value  # Output: Very high VIF scores for multicollinear columns
Source code in src/pymdma/tabular/measures/input_val/data/quality.py
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
class VIFactorScore(Metric):
    """Multicollinearity metric based on the Variance Inflation Factor (VIF).

    The VIF quantifies how much the variance of an estimated regression
    coefficient grows when the predictors are correlated with one another.
    The actual computation is delegated to ``compute_vif``.

    **Objective**: Multicollinearity

    Parameters
    ----------
    column_names : list of str, optional, default=None
        Names of the columns (features) in the dataset. When this is not a
        list, names of the form ``att_<index>`` are generated instead.
    **kwargs : dict
        Additional keyword arguments forwarded to the parent class.

    References
    ----------
    Marcoulides and Raykov, Evaluation of variance inflation factors in regression models using latent variable modeling methods (2019).
    https://pmc.ncbi.nlm.nih.gov/articles/PMC6713981/

    Returns
    -------
    MetricResult
        A MetricResult object containing the variance inflation factor (VIF) for each attribute.

    Examples
    --------
    >>> # Example 1: Evaluating VIF on a dataset with low multicollinearity
    >>> import numpy as np
    >>> data = np.random.rand(100, 5)  # 100 samples, 5 features
    >>> vif_score = VIFactorScore(column_names=["col1", "col2", "col3", "col4", "col5"])
    >>> result: MetricResult = vif_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: VIF scores per column

    >>> # Example 2: Evaluating VIF on a dataset with high multicollinearity
    >>> data = np.array([[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]])  # Multicollinear features
    >>> vif_score = VIFactorScore(column_names=["A", "B", "C"])
    >>> result: MetricResult = vif_score.compute(data)
    >>> dataset_level, _ = result.value  # Output: Very high VIF scores for multicollinear columns
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.QUALITY

    higher_is_better: bool = False
    min_value: float = 0.0
    max_value: float = np.inf

    def __init__(self, column_names: Optional[List[str]] = None, **kwargs):
        super().__init__(**kwargs)
        self.column_names = column_names

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Runs ``compute_vif`` on the dataset and wraps the first of its two
        return values in a MetricResult.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which VIF will be computed. The dataset should be in
            array-like format with the shape (samples, features).
        **kwargs : dict
            Additional keyword arguments, forwarded unchanged to ``compute_vif``.

        Returns
        -------
        MetricResult
            A MetricResult object whose dataset-level value is the first element
            returned by ``compute_vif``.
        """

        # Use the configured names when available, otherwise positional ones.
        if isinstance(self.column_names, list):
            feature_names = self.column_names
        else:
            feature_names = [f"att_{idx}" for idx in range(data.shape[-1])]

        # Only the first of the two returned values is reported.
        vif_value, _ = compute_vif(data=data, column_names=feature_names, **kwargs)

        dataset_level = {
            "dtype": OutputsTypes.NUMERIC,
            "subtype": "int",
            "value": vif_value,
        }
        return MetricResult(dataset_level=dataset_level)

Privacy

pymdma.tabular.measures.input_val.KAnonymityScore

Calculates the k for k-anonymity. A higher k value indicates that each record is less unique, meaning it is more difficult to re-identify individuals within the dataset.

Objective: Privacy

Parameters:

Name Type Description Default
column_names list

List of the names of the columns (features) in the dataset.

None
qi_names list

List of the quasi-identifier column names.

None
**kwargs dict

Additional keyword arguments passed to the parent class.

{}
References

Díaz and García, A python library to check the level of anonymity of a dataset. (2022). http://dx.doi.org/10.1038/s41597-022-01894-2

Returns:

Type Description
MetricResult

A MetricResult object containing the k-anonymity score.

Examples:

>>> # Example 1: Evaluating k-anonymity on a dataset with sufficient quasi-identifiers
>>> import numpy as np
>>> data = np.array([
...     ['Alice', 'Smith', 'NY'],
...     ['Alice', 'Smith', 'NY'],
...     ['Bob', 'Jones', 'CA'],
...     ['Bob', 'Jones', 'CA']
... ])
>>> k_anonymity = KAnonymityScore(
...     column_names=['first_name', 'last_name', 'state'],
...     qi_names=['first_name', 'last_name']
... )
>>> result = k_anonymity.compute(data)
>>> dataset_level, _ = result.value # Output: k-anonymity score
>>> # Example 2: Evaluating k-anonymity on a dataset with low uniqueness
>>> data = np.array([
...     ['Alice', 'Smith', 'NY'],
...     ['Alice', 'Smith', 'CA'],
...     ['Bob', 'Jones', 'NY']
... ])
>>> k_anonymity = KAnonymityScore(
...     column_names=['first_name', 'last_name', 'state'],
...     qi_names=['first_name', 'last_name']
... )
>>> result: MetricResult = k_anonymity.compute(data)
>>> dataset_level, _ = result.value # Output: k-anonymity score
Source code in src/pymdma/tabular/measures/input_val/data/privacy.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class KAnonymityScore(Metric):
    """Privacy metric reporting the k of k-anonymity for a dataset.

    A higher k value indicates that each record is less unique, meaning it is
    more difficult to re-identify individuals within the dataset. The actual
    computation is delegated to ``compute_k_anonymity``.

    **Objective**: Privacy

    Parameters
    ----------
    column_names : list, optional, default=None
        Names of the columns (features) in the dataset. When this is not a
        list, names of the form ``att_<index>`` are generated instead.
    qi_names : list, optional, default=None
        Names of the quasi-identifier columns. When this is not a list, the
        last column name alone is used as the quasi-identifier.
    **kwargs : dict
        Additional keyword arguments forwarded to the parent class.

    References
    ----------
    Díaz and García, A python library to check the level of anonymity of a dataset. (2022).
    http://dx.doi.org/10.1038/s41597-022-01894-2

    Returns
    -------
    MetricResult
        A MetricResult object containing the k-anonymity score.

    Examples
    --------
    >>> # Example 1: Evaluating k-anonymity on a dataset with sufficient quasi-identifiers
    >>> import numpy as np
    >>> data = np.array([
    ...     ['Alice', 'Smith', 'NY'],
    ...     ['Alice', 'Smith', 'NY'],
    ...     ['Bob', 'Jones', 'CA'],
    ...     ['Bob', 'Jones', 'CA']
    ... ])
    >>> k_anonymity = KAnonymityScore(
    ...     column_names=['first_name', 'last_name', 'state'],
    ...     qi_names=['first_name', 'last_name']
    ... )
    >>> result = k_anonymity.compute(data)
    >>> dataset_level, _ = result.value # Output: k-anonymity score

    >>> # Example 2: Evaluating k-anonymity on a dataset with low uniqueness
    >>> data = np.array([
    ...     ['Alice', 'Smith', 'NY'],
    ...     ['Alice', 'Smith', 'CA'],
    ...     ['Bob', 'Jones', 'NY']
    ... ])
    >>> k_anonymity = KAnonymityScore(
    ...     column_names=['first_name', 'last_name', 'state'],
    ...     qi_names=['first_name', 'last_name']
    ... )
    >>> result: MetricResult = k_anonymity.compute(data)
    >>> dataset_level, _ = result.value # Output: k-anonymity score
    """

    reference_type = ReferenceType.NONE
    evaluation_level = EvaluationLevel.DATASET
    metric_group = MetricGroup.PRIVACY

    higher_is_better: bool = True
    min_value: float = 0.0
    max_value: float = 100.0

    def __init__(self, column_names: Optional[List[str]] = None, qi_names: Optional[List[str]] = None, **kwargs):
        super().__init__(**kwargs)
        self.column_names = column_names
        self.qi_names = qi_names

    def compute(self, data: np.ndarray, **kwargs) -> MetricResult:
        """Runs ``compute_k_anonymity`` on the dataset for the configured
        quasi-identifiers and wraps its result in a MetricResult.

        Parameters
        ----------
        data : np.ndarray
            The input dataset for which k-anonymity will be computed. The dataset
            should be in array-like format with the shape (samples, features).
        **kwargs : dict
            Additional keyword arguments; not used by this computation.

        Returns
        -------
        MetricResult
            A MetricResult object containing the k-anonymity score.
        """

        # Use the configured names when available, otherwise positional ones.
        if isinstance(self.column_names, list):
            feature_names = self.column_names
        else:
            feature_names = [f"att_{idx}" for idx in range(data.shape[-1])]

        # Without explicit quasi-identifiers, only the last column is used.
        if isinstance(self.qi_names, list):
            quasi_identifiers = self.qi_names
        else:
            quasi_identifiers = feature_names[-1:]

        # k-anonymity
        k_value = compute_k_anonymity(
            data=data,
            column_names=feature_names,
            qi_names=quasi_identifiers,
        )

        dataset_level = {
            "dtype": OutputsTypes.NUMERIC,
            "subtype": "float",
            "value": k_value,
        }
        return MetricResult(dataset_level=dataset_level)