Skip to content

copairs.compute

copairs.compute

Functions to compute distances and ranks using numpy operations.

ap_contiguous(rel_k_list, counts)

Compute Average Precision (AP) scores from relevance labels.

This function calculates Average Precision (AP) scores for each profile based on relevance labels and their associated counts. It also returns configurations indicating the number of positive and total pairs for each profile.

Parameters:

  • rel_k_list (ndarray) –

    Array of relevance labels (1 for positive pairs, 0 for negative pairs), sorted by descending similarity within profiles.

  • counts (ndarray) –

    Array indicating how many times each profile appears in the rank list.

Returns:

  • ap_scores ( ndarray ) –

    Array of Average Precision scores for each profile.

  • null_confs ( ndarray ) –

    Array of configurations, where each row corresponds to: - Number of positive pairs (num_pos). - Total number of pairs (counts).

Source code in src/copairs/compute.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
def ap_contiguous(
    rel_k_list: np.ndarray, counts: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute Average Precision (AP) scores from relevance labels.

    Calculates one AP score per profile from a flat array of relevance labels,
    segmented by `counts`. Also returns, per profile, the number of positive
    pairs and the total number of pairs (used later for null distributions).

    Parameters
    ----------
    rel_k_list : np.ndarray
        Relevance labels (1 = positive pair, 0 = negative pair), sorted by
        descending similarity within each profile's segment.
    counts : np.ndarray
        Number of entries belonging to each profile in `rel_k_list`.

    Returns
    -------
    ap_scores : np.ndarray
        Average Precision score for each profile.
    null_confs : np.ndarray
        One row per profile: (number of positive pairs, total pairs).
    """
    # Start offset of each profile's segment (inlined equivalent of to_cutoffs)
    starts = np.empty_like(counts)
    starts[0] = 0
    starts[1:] = counts.cumsum()[:-1]

    # Positives within each segment
    num_pos = np.add.reduceat(rel_k_list, starts, dtype=np.uint32)

    # Positives accumulated in all earlier segments
    prev_pos = np.empty_like(num_pos)
    prev_pos[0] = 0
    prev_pos[1:] = num_pos[:-1]

    # Segment-local cumulative true positives: global cumsum minus the
    # positives that belong to preceding segments
    tp = rel_k_list.cumsum() - np.repeat(prev_pos.cumsum(), counts)

    # 1-based rank within each segment
    ranks = np.arange(1, len(rel_k_list) + 1) - np.repeat(starts, counts)

    # Precision at each rank (TP / rank)
    precision_at_k = tp / ranks

    # AP = mean of precision values at the positive positions; profiles with
    # zero positives yield NaN (0/0), which is deliberately silenced
    with np.errstate(divide="ignore", invalid="ignore"):
        ap_scores = np.add.reduceat(precision_at_k * rel_k_list, starts) / num_pos

    # Pair-count configuration for each profile
    null_confs = np.stack([num_pos, counts], axis=1)

    return ap_scores, null_confs

average_precision(rel_k)

Compute average precision based on binary list indices.

Source code in src/copairs/compute.py
367
368
369
370
371
372
def average_precision(rel_k) -> np.ndarray:
    """Compute average precision based on binary list indices.

    Each row of `rel_k` holds the 0-based rank positions of the positives
    (as produced by `random_binary_matrix` — TODO confirm for other callers).
    """
    n_positive = rel_k.shape[1]
    # Convert 0-based positions to 1-based ranks
    hit_ranks = rel_k + 1
    # Precision at the i-th positive is i / rank_of_that_positive
    precisions = np.arange(1, n_positive + 1, dtype=np.float32) / hit_ranks
    return (precisions.sum(axis=1) / n_positive).astype(np.float32)

batch_processing(pairwise_op, progress_bar=True)

Add batch processing support to pairwise operations.

This decorator wraps a pairwise operation to process data in batches, enabling efficient computation and multithreading when working with large datasets.

Parameters:

  • pairwise_op (Callable) –

    A function that computes pairwise operations (e.g., similarity or distance) between two arrays of features.

  • progress_bar (bool, default: True ) –

    Whether or not to show tqdm's progress bar.

Returns:

  • Callable

    A wrapped function that processes pairwise operations in batches.

Source code in src/copairs/compute.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def batch_processing(
    pairwise_op: Callable[[np.ndarray, np.ndarray], np.ndarray],
    progress_bar: bool = True,
):
    """
    Add batch processing support to pairwise operations.

    Wraps a pairwise operation so it is applied over index pairs in batches,
    with the batches dispatched through `parallel_map` (multithreaded).

    Parameters
    ----------
    pairwise_op : Callable
        A function computing a pairwise operation (e.g., similarity or
        distance) between two arrays of features.
    progress_bar : bool
        Whether or not to show tqdm's progress bar.

    Returns
    -------
    Callable
        A function `(feats, pair_ix, batch_size) -> np.ndarray` that applies
        `pairwise_op` over the given index pairs in batches.

    """

    def batched_fn(feats: np.ndarray, pair_ix: np.ndarray, batch_size: int):
        total = len(pair_ix)

        # Preallocated output, one score per index pair
        scores = np.empty(total, dtype=np.float32)

        def process_batch(offset):
            # Slice out one batch of (left, right) index pairs
            batch = pair_ix[offset : offset + batch_size]
            left = feats[batch[:, 0]]
            right = feats[batch[:, 1]]

            # Write this batch's results into its slot of the output array
            scores[offset : offset + len(left)] = pairwise_op(left, right)

        # Dispatch one task per batch start offset across threads
        parallel_map(
            process_batch, np.arange(0, total, batch_size), progress_bar=progress_bar
        )

        return scores

    return batched_fn

concat_ranges(start, end)

Create a 1D array by concatenating multiple integer ranges.

This function generates a single concatenated array from multiple ranges defined by the start and end arrays. Each range is inclusive of start and exclusive of end.

Parameters:

  • start (ndarray) –

    A 1D array of start indices for the ranges.

  • end (ndarray) –

    A 1D array of end indices for the ranges. Must have the same shape as start.

Returns:

  • ndarray

    A 1D array containing the concatenated ranges.

Source code in src/copairs/compute.py
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
def concat_ranges(start: np.ndarray, end: np.ndarray) -> np.ndarray:
    """Create a 1D array by concatenating multiple integer ranges.

    Builds one flat array from the ranges `[start[i], end[i])` taken in order.
    Each range includes its `start` value and excludes its `end` value.

    Parameters
    ----------
    start : np.ndarray
        A 1D array of start indices for the ranges.
    end : np.ndarray
        A 1D array of end indices for the ranges. Must have the same shape as `start`.

    Returns
    -------
    np.ndarray
        A 1D array containing the concatenated ranges.
    """
    # Exact output length, known up front so fromiter can preallocate
    total = (end - start).sum()

    # Lazily chain the per-pair ranges into one flat iterator
    flat_iter = itertools.chain.from_iterable(map(range, start, end))

    return np.fromiter(flat_iter, dtype=np.int32, count=total)

get_null_dists(confs, null_size, seed, cache_dir=None, progress_bar=True)

Generate null distributions for each configuration of positive and total pairs.

Parameters:

  • confs (ndarray) –

    Array where each row contains the number of positive pairs (num_pos) and total pairs (total) for a specific configuration.

  • null_size (int) –

    Number of samples to generate in the null distribution.

  • seed (int) –

    Random seed for reproducibility.

  • progress_bar (bool, default: True ) –

    Whether or not to show tqdm's progress bar.

Returns:

  • ndarray

    A 2D array where each row corresponds to a null distribution for a specific configuration.

Source code in src/copairs/compute.py
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
def get_null_dists(
    confs: np.ndarray,
    null_size: int,
    seed: int,
    cache_dir: Optional[Union[str, Path]] = None,
    progress_bar: bool = True,
) -> np.ndarray:
    """Generate null distributions for each configuration of positive and total pairs.

    Parameters
    ----------
    confs : np.ndarray
        Array where each row contains the number of positive pairs (`num_pos`)
        and total pairs (`total`) for a specific configuration.
    null_size : int
        Number of samples to generate in the null distribution.
    seed : int
        Random seed for reproducibility.
    cache_dir : str or Path, optional
        Base directory for the on-disk cache of null distributions.
        Defaults to ``~/.copairs`` when not provided.
    progress_bar : bool
        Whether or not to show tqdm's progress bar.

    Returns
    -------
    np.ndarray
        A 2D array where each row corresponds to a null distribution for a specific
        configuration.
    """
    # Define the directory for caching null distributions; seed and null_size
    # are encoded in the path so different settings never share cache entries
    cache_dir = Path.home() / ".copairs" if cache_dir is None else Path(cache_dir)
    cache_dir = cache_dir / f"seed{seed}" / f"ns{null_size}"

    # Number of configurations and random seeds for each configuration
    # NOTE(review): per-config seeds are drawn from [0, 8096), so distinct
    # configurations may share a seed — confirm this is acceptable
    num_confs = len(confs)
    rng = np.random.default_rng(seed)
    seeds = rng.integers(8096, size=num_confs)

    # Initialize an array to store null distributions
    null_dists = np.empty([len(confs), null_size], dtype=np.float32)

    # Open one shared cache for all threads
    with diskcache.Cache(str(cache_dir)) as cache:

        def par_func(i):
            # Cache key depends only on the (num_pos, total) configuration
            num_pos, total = confs[i]
            key = f"n{total}_k{num_pos}"
            null_dist = cache.get(key)
            if null_dist is None:
                # Cache miss: sample the distribution and store it for reuse
                null_dist = random_ap(null_size, num_pos, total, seeds[i])
                cache.set(key, null_dist)
            null_dists[i] = null_dist

        # Parallelize the generation of null distributions
        parallel_map(par_func, np.arange(num_confs), progress_bar)

    return null_dists

get_similarity_fn(distance, progress_bar=True)

Retrieve a similarity function based on a distance string identifier or custom callable.

This function provides flexibility in specifying the distance function to be used for pairwise similarity computations. Users can choose a metric from a predefined set or from the scipy.spatial.distance submodule, or provide a custom callable.

Parameters:

  • distance (str or callable) –

    The name of the distance function or a custom callable function. Supported string identifiers for predefined metrics are: - "cosine": Cosine similarity. - "abs_cosine": Absolute cosine similarity. - "correlation": Pearson correlation coefficient. - "euclidean": Inverse Euclidean distance (scaled to range 0-1). - "manhattan": Inverse Manhattan distance (scaled to range 0-1). - "chebyshev": Inverse Chebyshev distance (scaled to range 0-1).

    Additionally, any distance metric supported by scipy.spatial.distance.cdist can be used by providing the metric name as a string.

    If a callable is provided, it must accept the parameters associated with each callable function.

  • progress_bar (bool, default: True ) –

    Whether or not to show tqdm's progress bar.

Returns:

  • callable

    A function implementing the specified similarity function.

Raises:

  • ValueError:

    If the provided distance is not a recognized string identifier or a valid callable.

Example:

distance_fn = get_similarity_fn("cosine") similarity_scores = distance_fn(x_sample, y_sample)

Source code in src/copairs/compute.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
def get_similarity_fn(
    distance: Union[str, Callable], progress_bar: bool = True
) -> Callable:
    """Retrieve a similarity function based on a distance string identifier or custom callable.

    This function provides flexibility in specifying the distance function to be used
    for pairwise similarity computations. Users can choose a metric from a predefined
    set, from the scipy.spatial.distance submodule, or provide a custom callable.

    Parameters
    ----------
    distance : str or callable
        The name of the distance function or a custom callable function. Supported
        string identifiers for predefined metrics are:
        - "cosine": Cosine similarity.
        - "abs_cosine": Absolute cosine similarity.
        - "correlation": Pearson correlation coefficient.
        - "euclidean": Inverse Euclidean distance (scaled to range 0-1).
        - "manhattan": Inverse Manhattan distance (scaled to range 0-1).
        - "chebyshev": Inverse Chebyshev distance (scaled to range 0-1).

        Additionally, any distance metric supported by `scipy.spatial.distance.cdist`
        can be used by providing the metric name as a string.

        If a callable is provided, it must accept two feature arrays
        (`x_sample`, `y_sample`) and return one score per row pair.
    progress_bar : bool
        Whether or not to show tqdm's progress bar.

    Returns
    -------
    callable
        A function implementing the specified similarity function, wrapped for
        batched execution.

    Raises
    ------
    ValueError:
        If the provided `distance` is not a recognized string identifier or a valid callable.

    Example
    -------
    >>> similarity_fn = get_similarity_fn("cosine")
    >>> similarity_scores = similarity_fn(x_sample, y_sample)
    """
    # Dictionary of supported similarity functions
    similarity_functions = {
        "abs_cosine": pairwise_abs_cosine,
        "cosine": pairwise_cosine,
        "correlation": pairwise_corr,
        "euclidean": pairwise_euclidean,
        "manhattan": pairwise_manhattan,
        "chebyshev": pairwise_chebyshev,
    }

    # If a string is provided, look up the corresponding function
    if isinstance(distance, str):
        if distance in similarity_functions:
            similarity_fn = similarity_functions[distance]
        elif distance in SCIPY_METRICS_NAMES:
            # Named closure instead of an assigned lambda (PEP 8 E731);
            # `distance` is bound once and never reassigned, so capture is safe
            def similarity_fn(x_sample, y_sample):
                return _cdist_diag_sim(x_sample, y_sample, distance)

        else:
            raise ValueError(
                f"Unsupported distance function: {distance}. Supported functions are: {set(similarity_functions.keys()) | set(SCIPY_METRICS_NAMES)}"
            )
    elif callable(distance):
        # If a callable is provided, use it directly
        similarity_fn = distance
    else:
        # Raise an error if neither a string nor a callable is provided
        raise ValueError("Distance must be either a string or a callable object.")

    # Wrap the distance function for efficient batch processing
    return batch_processing(similarity_fn, progress_bar=progress_bar)

null_dist_cached(num_pos, total, seed, null_size, cache_dir)

Generate or retrieve a cached null distribution for a given configuration.

This function calculates a null distribution for a specified number of positive pairs (num_pos) and total pairs (total). It uses diskcache (SQLite-backed) for process-safe concurrent caching.

Parameters:

  • num_pos (int) –

    Number of positive pairs in the configuration.

  • total (int) –

    Total number of pairs (positive + negative) in the configuration.

  • seed (int) –

    Random seed for reproducibility.

  • null_size (int) –

    Number of samples to generate in the null distribution.

  • cache_dir (Path) –

    Directory to store or retrieve cached null distributions.

Returns:

  • ndarray

    Null distribution for the specified configuration.

Source code in src/copairs/compute.py
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
def null_dist_cached(
    num_pos: int, total: int, seed: int, null_size: int, cache_dir: Path
) -> np.ndarray:
    """Generate or retrieve a cached null distribution for a given configuration.

    This function calculates a null distribution for a specified number of positive
    pairs (`num_pos`) and total pairs (`total`). It uses diskcache (SQLite-backed)
    for process-safe concurrent caching.

    Parameters
    ----------
    num_pos : int
        Number of positive pairs in the configuration.
    total : int
        Total number of pairs (positive + negative) in the configuration.
    seed : int
        Random seed for reproducibility. If None, the result is computed
        directly and never cached.
    null_size : int
        Number of samples to generate in the null distribution.
    cache_dir : Path
        Directory to store or retrieve cached null distributions.

    Returns
    -------
    np.ndarray
        Null distribution for the specified configuration.
    """
    if seed is None:
        # Without a seed the draw is not reproducible, so storing it under a
        # seed-independent key would poison the cache; compute directly instead.
        return random_ap(null_size, num_pos, total, seed)

    # The key encodes only the configuration; seed and null_size are expected
    # to be encoded in `cache_dir` by the caller (see `get_null_dists`).
    key = f"n{total}_k{num_pos}"
    with diskcache.Cache(str(cache_dir)) as cache:
        null_dist = cache.get(key)
        if null_dist is None:
            # Cache miss: sample the distribution and store it for reuse
            null_dist = random_ap(null_size, num_pos, total, seed)
            cache.set(key, null_dist)
    return null_dist

p_values(ap_scores, null_confs, null_size, seed, progress_bar=True)

Calculate p-values for an array of Average Precision (AP) scores using a null distribution.

Parameters:

  • ap_scores (ndarray) –

    Array of observed AP scores for which to calculate p-values.

  • null_confs (ndarray) –

    Configuration array indicating the relevance or context of each AP score. Used to generate corresponding null distributions.

  • null_size (int) –

    Number of samples to generate in the null distribution for each configuration.

  • seed (int) –

    Seed for the random number generator to ensure reproducibility of the null distribution.

  • progress_bar (bool, default: True ) –

    Whether or not to show tqdm's progress bar.

Returns:

  • ndarray

    An array of p-values corresponding to the input AP scores.

Source code in src/copairs/compute.py
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
def p_values(
    ap_scores: np.ndarray,
    null_confs: np.ndarray,
    null_size: int,
    seed: int,
    progress_bar: bool = True,
):
    """Calculate p-values for an array of Average Precision (AP) scores using a null distribution.

    Parameters
    ----------
    ap_scores : np.ndarray
        Array of observed AP scores for which to calculate p-values.
    null_confs : np.ndarray
        Configuration array indicating the relevance or context of each AP score. Used
        to generate corresponding null distributions.
    null_size : int
        Number of samples to generate in the null distribution for each configuration.
    seed : int
        Seed for the random number generator to ensure reproducibility of the null
        distribution.
    progress_bar : bool
        Whether or not to show tqdm's progress bar.

    Returns
    -------
    np.ndarray
        An array of p-values corresponding to the input AP scores.
    """
    # Collapse duplicate configurations so each null distribution is built once
    confs, rev_ix = np.unique(null_confs, axis=0, return_inverse=True)

    # One null distribution per unique configuration
    null_dists = get_null_dists(confs, null_size, seed, progress_bar=progress_bar)

    # Sorting enables binary search when ranking each observed score
    null_dists.sort(axis=1)

    pvals = np.empty(len(ap_scores), dtype=np.float32)

    for idx, conf_ix in enumerate(rev_ix):
        # Number of null samples >= the observed score, via sorted-rank lookup
        rank = np.searchsorted(null_dists[conf_ix], ap_scores[idx])
        num_greater = null_size - rank

        # Add-one smoothing keeps the p-value strictly positive
        pvals[idx] = (num_greater + 1) / (null_size + 1)

    return pvals

pairwise_abs_cosine(x_sample, y_sample)

Compute the absolute cosine similarity for paired rows of two matrices.

Parameters:

  • x_sample (ndarray) –

    A 2D array where each row represents a profile.

  • y_sample (ndarray) –

    A 2D array of the same shape as x_sample.

Returns:

  • ndarray

    Absolute values of cosine similarity scores.

Source code in src/copairs/compute.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def pairwise_abs_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    """Compute the absolute cosine similarity for paired rows of two matrices.

    Parameters
    ----------
    x_sample : np.ndarray
        A 2D array where each row represents a profile.
    y_sample : np.ndarray
        A 2D array of the same shape as `x_sample`.

    Returns
    -------
    np.ndarray
        Absolute values of cosine similarity scores.
    """
    # Delegate to the signed cosine similarity, then drop the sign
    cos_scores = pairwise_cosine(x_sample, y_sample)
    return np.abs(cos_scores)

pairwise_chebyshev(x_sample, y_sample)

Compute the inverse Chebyshev distance for paired rows of two matrices.

Parameters:

  • x_sample (ndarray) –

    A 2D array where each row represents a profile.

  • y_sample (ndarray) –

    A 2D array of the same shape as x_sample.

Returns:

  • ndarray

    A 1D array of inverse Chebyshev distance scores (scaled to range 0-1).

Source code in src/copairs/compute.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def pairwise_chebyshev(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    """Compute the inverse Chebyshev distance for paired rows of two matrices.

    Parameters
    ----------
    x_sample : np.ndarray
        A 2D array where each row represents a profile.
    y_sample : np.ndarray
        A 2D array of the same shape as `x_sample`.

    Returns
    -------
    np.ndarray
        A 1D array of inverse Chebyshev distance scores (scaled to range 0-1).
    """
    # Chebyshev distance is the largest absolute coordinate difference per row
    abs_diff = np.abs(x_sample - y_sample)
    cheb = abs_diff.max(axis=1)

    # Map [0, inf) distances to (0, 1] similarities
    return 1 / (1 + cheb)

pairwise_corr(x_sample, y_sample)

Compute the Pearson correlation coefficient for paired rows of two matrices.

Parameters:

  • x_sample (ndarray) –

    A 2D array where each row represents a profile

  • y_sample (ndarray) –

    A 2D array of the same shape as x_sample.

Returns:

  • ndarray

    A 1D array of Pearson correlation coefficients for each row pair in x_sample and y_sample.

Source code in src/copairs/compute.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def pairwise_corr(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    """Compute the Pearson correlation coefficient for paired rows of two matrices.

    Parameters
    ----------
    x_sample : np.ndarray
        A 2D array where each row represents a profile
    y_sample : np.ndarray
        A 2D array of the same shape as `x_sample`.

    Returns
    -------
    np.ndarray
        A 1D array of Pearson correlation coefficients for each row pair in
        `x_sample` and `y_sample`.
    """
    # Center each row around its own mean
    x_centered = x_sample - x_sample.mean(axis=1, keepdims=True)
    y_centered = y_sample - y_sample.mean(axis=1, keepdims=True)

    # Covariance term: dot product of the centered rows
    covariance = (x_centered * y_centered).sum(axis=1)

    # Normalization term: product of the centered rows' magnitudes
    x_ss = (x_centered**2).sum(axis=1)
    y_ss = (y_centered**2).sum(axis=1)
    scale = np.sqrt(x_ss * y_ss)

    return covariance / scale

pairwise_cosine(x_sample, y_sample)

Compute cosine similarity for paired rows of two matrices.

Parameters:

  • x_sample (ndarray) –

    A 2D array where each row represents a profile.

  • y_sample (ndarray) –

    A 2D array of the same shape as x_sample.

Returns:

  • ndarray

    A 1D array of cosine similarity scores for each row pair in x_sample and y_sample.

Source code in src/copairs/compute.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def pairwise_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    """Compute cosine similarity for paired rows of two matrices.

    Parameters
    ----------
    x_sample : np.ndarray
        A 2D array where each row represents a profile.
    y_sample : np.ndarray
        A 2D array of the same shape as `x_sample`.

    Returns
    -------
    np.ndarray
        A 1D array of cosine similarity scores for each row pair in `x_sample` and `y_sample`.
    """
    # Scale each row to unit length
    x_unit = x_sample / np.linalg.norm(x_sample, axis=1, keepdims=True)
    y_unit = y_sample / np.linalg.norm(y_sample, axis=1, keepdims=True)

    # Cosine similarity is the dot product of the unit vectors
    return (x_unit * y_unit).sum(axis=1)

pairwise_euclidean(x_sample, y_sample)

Compute the inverse Euclidean distance for paired rows of two matrices.

Parameters:

  • x_sample (ndarray) –

    A 2D array where each row represents a profile.

  • y_sample (ndarray) –

    A 2D array of the same shape as x_sample.

Returns:

  • ndarray

    A 1D array of inverse Euclidean distance scores (scaled to range 0-1).

Source code in src/copairs/compute.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def pairwise_euclidean(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    """
    Compute the inverse Euclidean distance for paired rows of two matrices.

    Parameters
    ----------
    x_sample : np.ndarray
        A 2D array where each row represents a profile.
    y_sample : np.ndarray
        A 2D array of the same shape as `x_sample`.

    Returns
    -------
    np.ndarray
        A 1D array of inverse Euclidean distance scores (scaled to range 0-1).
    """
    # Row-wise Euclidean distance: sqrt of the summed squared differences
    squared = np.square(x_sample - y_sample).sum(axis=1)
    e_dist = np.sqrt(squared)

    # Map [0, inf) distances to (0, 1] similarities
    return 1 / (1 + e_dist)

pairwise_manhattan(x_sample, y_sample)

Compute the inverse Manhattan distance for paired rows of two matrices.

Parameters:

  • x_sample (ndarray) –

    A 2D array where each row represents a profile.

  • y_sample (ndarray) –

    A 2D array of the same shape as x_sample.

Returns:

  • ndarray

    A 1D array of inverse Manhattan distance scores (scaled to range 0-1).

Source code in src/copairs/compute.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def pairwise_manhattan(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    """Compute the inverse Manhattan distance for paired rows of two matrices.

    Parameters
    ----------
    x_sample : np.ndarray
        A 2D array where each row represents a profile.
    y_sample : np.ndarray
        A 2D array of the same shape as `x_sample`.

    Returns
    -------
    np.ndarray
        A 1D array of inverse Manhattan distance scores (scaled to range 0-1).
    """
    # Manhattan distance: sum of absolute coordinate differences per row
    abs_diff = np.abs(x_sample - y_sample)
    m_dist = abs_diff.sum(axis=1)

    # Map [0, inf) distances to (0, 1] similarities
    return 1 / (1 + m_dist)

parallel_map(par_func, items, progress_bar=True)

Execute a function in parallel over a list of items.

This function uses a thread pool to process items in parallel, with progress tracking via tqdm. It is particularly useful for batch operations that benefit from multithreading.

Parameters:

  • par_func (Callable) –

    A function to execute for each item. It should accept a single argument (an item index or value).

  • items (ndarray) –

    An array or list of items to process.

Source code in src/copairs/compute.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def parallel_map(
    par_func: Callable[[int], None],
    items: np.ndarray,
    progress_bar: bool = True,
) -> None:
    """Execute a function in parallel over a list of items.

    This function uses a thread pool to process items in parallel, with progress
    tracking via `tqdm`. It is particularly useful for batch operations that benefit
    from multithreading.

    Parameters
    ----------
    par_func : Callable
        A function to execute for each item. It should accept a single argument
        (an item index or value).
    items : np.ndarray
        An array or list of items to process.
    progress_bar : bool
        Whether or not to show tqdm's progress bar.
    """
    # Total number of items to process
    num_items = len(items)

    # Nothing to do: avoids ThreadPool(0) (ValueError) and the division by
    # zero that computing the chunk size would otherwise hit
    if num_items == 0:
        return

    # Determine the number of threads to use, limited by CPU count.
    # os.cpu_count() can return None; fall back to a single worker.
    pool_size = min(num_items, os.cpu_count() or 1)

    # Calculate chunk size for dividing work among threads (at least 1)
    chunksize = max(1, num_items // pool_size)

    # Use a thread pool to execute the function in parallel
    with ThreadPool(pool_size) as pool:
        # Map the function to items with unordered execution for better efficiency
        tasks = pool.imap_unordered(par_func, items, chunksize=chunksize)

        if progress_bar:
            # Display progress using tqdm
            from tqdm.autonotebook import tqdm

            tasks = tqdm(tasks, total=num_items, leave=False)

        # Drain the iterator to force execution of all tasks
        for _ in tasks:
            pass

random_ap(num_perm, num_pos, total, seed)

Generate random Average Precision (AP) scores to create a null distribution.

This function computes multiple Average Precision (AP) scores based on randomly generated binary relevance lists. It is useful for generating a null distribution to assess the significance of observed AP scores.

Parameters:

  • num_perm (int) –

    Number of random permutations (i.e., how many random relevance lists to generate).

  • num_pos (int) –

    Number of positive samples (1's) in each relevance list.

  • total (int) –

    Total number of samples (columns) in each relevance list.

  • seed (int) –

    Seed for the random number generator to ensure reproducibility.

Returns:

  • ndarray

    A 1D array containing the Average Precision scores for each randomly generated relevance list.

Source code in src/copairs/compute.py
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
def random_ap(num_perm: int, num_pos: int, total: int, seed: int):
    """Generate random Average Precision (AP) scores to create a null distribution.

    Draws `num_perm` random relevance configurations — each with exactly
    `num_pos` positives among `total` entries — and scores each one with
    Average Precision. The resulting scores form a null distribution against
    which observed AP values can be compared.

    Parameters
    ----------
    num_perm : int
        Number of random permutations (i.e., how many random relevance lists to generate).
    num_pos : int
        Number of positive samples (1's) in each relevance list.
    total : int
        Total number of samples (columns) in each relevance list.
    seed : int
        Seed for the random number generator to ensure reproducibility.

    Returns
    -------
    np.ndarray
        A 1D array containing the Average Precision scores for each randomly
        generated relevance list.
    """
    # Seeded generator so the null distribution is reproducible
    rng = np.random.default_rng(seed)

    # Sorted positions of the positives for each of the num_perm draws
    positive_ranks = random_binary_matrix(num_perm, total, num_pos, rng)

    # One AP score per random draw
    return average_precision(positive_ranks)

random_binary_matrix(n, m, k, rng)

Generate, for each row of a random binary n*m matrix, the sorted column indices of its k ones.

Args: n: Number of rows. m: Number of columns. k: Number of 1's per row.

Returns:

  • ndarray

    A binary matrix of shape (n, m) with exactly k ones per row.

Source code in src/copairs/compute.py
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def random_binary_matrix(n, m, k, rng):
    """Sample, for each of `n` rows, the sorted column indices of `k` ones
    placed uniformly at random in a binary row of length `m`.

    Note: despite the name, this returns the *indices* of the ones
    (shape `(n, k)`), not the dense `n x m` binary matrix itself.

    Parameters
    ----------
    n : int
        Number of rows.
    m : int
        Number of columns.
    k : int
        Number of 1's per row.
    rng : np.random.Generator
        Random generator used to shuffle the column indices.

    Returns
    -------
    np.ndarray
        Array of shape `(n, k)` where each row holds the sorted column indices
        of that row's ones (a uniform sample without replacement from 0..m-1).
    """
    # Smallest unsigned dtype able to index all m columns
    dtype = np.uint16 if m < 2**16 else np.uint32
    indices = np.tile(np.arange(m, dtype=dtype), (n, 1))
    # In-place row-wise shuffle; the first k entries of each row are then a
    # uniform random sample of columns without replacement
    rng.permuted(indices, axis=1, out=indices)
    return np.sort(indices[:, :k], axis=1)

to_cutoffs(counts)

Convert counts into cumulative cutoff indices.

This function generates a 1D array of indices that mark the start of each segment in a cumulative list. The first index is always 0, and subsequent indices correspond to the cumulative sum of counts up to the previous entry.

Parameters:

  • counts (ndarray) –

    A 1D array of counts representing the size of each segment.

Returns:

  • ndarray

    A 1D array of cutoff indices where each value indicates the starting index for the corresponding segment.

Source code in src/copairs/compute.py
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
def to_cutoffs(counts: np.ndarray) -> np.ndarray:
    """Convert counts into cumulative cutoff indices.

    This function generates a 1D array of indices that mark the start of each segment
    in a cumulative list. The first index is always `0`, and subsequent indices
    correspond to the cumulative sum of counts up to the previous entry.

    Parameters
    ----------
    counts : np.ndarray
        A 1D array of counts representing the size of each segment. May be empty.

    Returns
    -------
    np.ndarray
        A 1D array of cutoff indices where each value indicates the starting index
        for the corresponding segment. Empty when `counts` is empty.
    """
    # Initialize an empty array for cutoff indices
    cutoffs = np.empty_like(counts)

    # No segments at all: return the (empty) array instead of raising
    # IndexError on `cutoffs[0]` as the previous implementation did
    if counts.size == 0:
        return cutoffs

    # Set the first cutoff to 0 (start of the first segment)
    cutoffs[0] = 0

    # Compute subsequent cutoffs using cumulative sums, excluding the last element
    cutoffs[1:] = counts.cumsum()[:-1]

    return cutoffs