copairs.replicating¶

`copairs.replicating` ¶

Classes and functions for computing the percent replicating metric.

`CorrelationTestResult` ¶

Class representing the percent replicating score. It stores distributions.

Source code in src/copairs/replicating.py

class CorrelationTestResult:
    """Class representing the percent replicating score. It stores distributions.

    Attributes
    ----------
    corr_df : pd.DataFrame
        Correlation statistics as passed to the constructor; it must contain a
        ``"median"`` column.
    corr_dist : pd.Series
        The ``"median"`` column of ``corr_df``.
    null_dist : pd.Series
        Null distribution the medians are compared against.
    """

    def __init__(self, corr_df: pd.DataFrame, null_dist: pd.Series):
        """Store the correlation statistics and the null distribution."""
        self.corr_df = corr_df
        self.corr_dist = corr_df["median"]
        self.null_dist = null_dist

    def percent_score_left(self):
        """Calculate the percent score using the 5th percentile threshold.

        NaNs are ignored in both distributions.

        :return: proportion of the correlation distribution strictly below the
            threshold, and the threshold itself.
        """
        perc_5 = np.nanpercentile(self.null_dist, 5)
        below_threshold = self.corr_dist.dropna() < perc_5
        return np.nanmean(below_threshold.astype(float)), perc_5

    def percent_score_right(self):
        """Calculate the percent score using the 95th percentile threshold.

        NaNs are ignored in both distributions.

        :return: proportion of the correlation distribution strictly above the
            threshold, and the threshold itself.
        """
        perc_95 = np.nanpercentile(self.null_dist, 95)
        above_threshold = self.corr_dist.dropna() > perc_95
        return np.nanmean(above_threshold.astype(float)), perc_95

    def percent_score_both(self):
        """Calculate the percent score using the 5th and 95th percentiles as thresholds.

        :return: proportion of the correlation distribution beyond either
            threshold, then the 5th and the 95th percentile thresholds.
        """
        # Reuse the one-sided scores so the three methods cannot drift apart.
        above_score, perc_95 = self.percent_score_right()
        below_score, perc_5 = self.percent_score_left()
        return above_score + below_score, perc_5, perc_95

    def percent_score(self, how: Literal["left", "right", "both"]):
        """Calculate percent score given the `how` criteria.

        :param how: which tail(s) of the null distribution to test against.
        :return: ``(percent_score, left_threshold, right_threshold)``; a
            threshold that is not used by ``how`` is ``None``.
        :raises ValueError: if ``how`` is not ``"left"``, ``"right"`` or ``"both"``.
        """
        left_th, right_th = None, None
        if how == "right":
            percent_score, right_th = self.percent_score_right()
        elif how == "left":
            percent_score, left_th = self.percent_score_left()
        elif how == "both":
            percent_score, left_th, right_th = self.percent_score_both()
        else:
            raise ValueError(f"Invalid value: {how} for how param")

        return percent_score, left_th, right_th

    def wasserstein_distance(self):
        """Compute the Wasserstein distance between null and corr distributions.

        NaNs are dropped from both distributions first, consistent with the
        ``percent_score_*`` methods; otherwise a single NaN would turn the
        whole distance into NaN.

        :return: the distance, or NaN when either distribution has no non-NaN
            value left.
        """
        from scipy.stats import wasserstein_distance

        null_values = self.null_dist.dropna().values
        corr_values = self.corr_dist.dropna().values
        if null_values.size == 0 or corr_values.size == 0:
            return float("nan")
        return wasserstein_distance(null_values, corr_values)

`__init__(corr_df, null_dist)` ¶

Initialize object.

Source code in src/copairs/replicating.py

def __init__(self, corr_df: pd.DataFrame, null_dist: pd.Series):
    """Keep the correlation statistics and the null distribution.

    ``corr_df`` is stored as given, its ``"median"`` column is exposed as
    ``corr_dist``, and ``null_dist`` is stored as given.
    """
    self.corr_df = corr_df
    # The "median" column is what the scoring methods compare to the null.
    self.corr_dist = self.corr_df["median"]
    self.null_dist = null_dist

`percent_score(how)` ¶

Calculate percent score given the how criteria.

Source code in src/copairs/replicating.py

def percent_score(self, how: Literal["left", "right", "both"]):
    """Dispatch to the one- or two-sided percent score selected by `how`.

    Always returns ``(score, left_threshold, right_threshold)``; a threshold
    that the chosen mode does not use is ``None``. Any other value of `how`
    raises ``ValueError``.
    """
    if how == "right":
        score, upper = self.percent_score_right()
        return score, None, upper

    if how == "left":
        score, lower = self.percent_score_left()
        return score, lower, None

    if how == "both":
        score, lower, upper = self.percent_score_both()
        return score, lower, upper

    raise ValueError(f"Invalid value: {how} for how param")

`percent_score_both()` ¶

Calculate the percent score using the 5th and 95th percentiles as thresholds.

:return: proportion of correlation distribution beyond the thresholds and the thresholds.

Source code in src/copairs/replicating.py

def percent_score_both(self):
    """
    Calculate the two-sided percent score from the 5th and 95th null percentiles.

    :return: fraction of non-NaN correlations above the 95th percentile plus the
        fraction below the 5th percentile, followed by the 5th and the 95th
        percentile thresholds.
    """
    upper = np.nanpercentile(self.null_dist, 95)
    lower = np.nanpercentile(self.null_dist, 5)

    observed = self.corr_dist.dropna()
    frac_above = np.nanmean((observed > upper).astype(float))
    frac_below = np.nanmean((observed < lower).astype(float))

    return frac_above + frac_below, lower, upper

`percent_score_left()` ¶

Calculate the percent score using the 5th percentile threshold.

:return: proportion of correlation distribution beyond the threshold and the threshold.

Source code in src/copairs/replicating.py

def percent_score_left(self):
    """Calculate the left-tail percent score from the 5th null percentile.

    :return: fraction of non-NaN correlations strictly below the threshold, and the threshold.
    """
    threshold = np.nanpercentile(self.null_dist, 5)
    observed = self.corr_dist.dropna()
    is_below = (observed < threshold).astype(float)
    return np.nanmean(is_below), threshold

`percent_score_right()` ¶

Calculate the percent score using the 95th percentile threshold.

:return: proportion of correlation distribution beyond the threshold and the threshold.

Source code in src/copairs/replicating.py

def percent_score_right(self):
    """
    Calculate the right-tail percent score from the 95th null percentile.

    :return: fraction of non-NaN correlations strictly above the threshold, and the threshold.
    """
    threshold = np.nanpercentile(self.null_dist, 95)
    observed = self.corr_dist.dropna()
    is_above = (observed > threshold).astype(float)
    return np.nanmean(is_above), threshold

`wasserstein_distance()` ¶

Compute the Wasserstein distance between null and corr distributions.

Source code in src/copairs/replicating.py

def wasserstein_distance(self):
    """Compute the Wasserstein distance between null and corr distributions.

    NaNs are dropped from both distributions first; otherwise a single NaN
    would turn the whole distance into NaN.

    :return: the distance, or NaN when either distribution has no non-NaN
        value left.
    """
    from scipy.stats import wasserstein_distance

    null_values = self.null_dist.dropna().values
    corr_values = self.corr_dist.dropna().values
    if null_values.size == 0 or corr_values.size == 0:
        return float("nan")
    return wasserstein_distance(null_values, corr_values)

`corr_between_non_replicates(X, meta, n_samples, n_replicates, diffby, progress_bar=True)` ¶

Null distribution between random "replicates".

Parameters:

X (ndarray) –

Feature matrix.
meta (DataFrame) –

Metadata dataframe.
n_samples (int) –

Number of samples to generate.
n_replicates (int) –

Number of replicates per sample.
diffby (List[str]) –

List of columns that should be different.
progress_bar (bool, default: True ) –

Whether to show progress bar [default: True].

Returns:

Series –

Correlation values, with a length of n_samples.

Source code in src/copairs/replicating.py

def corr_between_non_replicates(
    X: np.ndarray,
    meta: pd.DataFrame,
    n_samples: int,
    n_replicates: int,
    diffby: List[str],
    progress_bar: bool = True,
):
    """
    Build a null distribution of correlations from randomly drawn non-replicate pairs.

    ``n_samples * n_replicates`` null pairs are drawn one at a time from a
    ``Matcher`` seeded with 0 and handed to ``corr_from_null_pairs``.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    meta : pandas.DataFrame
        Metadata dataframe.
    n_samples : int
        Number of samples to generate.
    n_replicates : int
        Number of replicates per sample.
    diffby : List[str]
        List of columns that should be different.
    progress_bar : bool, optional
        Whether to show progress bar [default: True].

    Returns
    -------
    pd.Series
        Correlation values, with a length of `n_samples`.
    """
    null_matcher = Matcher(meta, diffby, seed=0)

    sampled_pairs = []
    for _ in range(n_replicates * n_samples):
        sampled_pairs.append(null_matcher.sample_null_pair(diffby))

    return corr_from_null_pairs(
        X, sampled_pairs, n_replicates, progress_bar=progress_bar
    )

`corr_between_replicates(X, meta, sameby, diffby, progress_bar=True)` ¶

Correlation between replicates.

Parameters:

X (ndarray) –

Feature matrix.
meta (DataFrame) –

Metadata dataframe.
sameby (List[str]) –

Feature names to group the data frame by.
diffby (List[str]) –

Feature names to force different values.
progress_bar (bool, default: True ) –

Whether to show progress bar [default: True].

Returns:

tuple –

(DataFrame with correlation statistics, median number of replicates).

Source code in src/copairs/replicating.py

def corr_between_replicates(
    X: np.ndarray,
    meta: pd.DataFrame,
    sameby: List[str],
    diffby: List[str],
    progress_bar: bool = True,
):
    """
    Compute correlation statistics over all replicate pairs.

    Pairs are obtained from ``Matcher.get_all_pairs(sameby, diffby)`` on a
    ``Matcher`` seeded with 0, then passed to ``corr_from_pairs``.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    meta : pd.DataFrame
        Metadata dataframe.
    sameby : List[str]
        Feature names to group the data frame by.
    diffby : List[str]
        Feature names to force different values.
    progress_bar : bool, optional
        Whether to show progress bar [default: True].

    Returns
    -------
    tuple
        (DataFrame with correlation statistics, median number of replicates).
    """
    matched_columns = sameby + diffby
    replicate_pairs = Matcher(meta, matched_columns, seed=0).get_all_pairs(
        sameby, diffby
    )
    return corr_from_pairs(X, replicate_pairs, sameby, progress_bar=progress_bar)

`corr_from_null_pairs(X, null_pairs, n_replicates, progress_bar=True)` ¶

Correlation from a given list of unnamed pairs.

Source code in src/copairs/replicating.py

def corr_from_null_pairs(
    X: np.ndarray, null_pairs, n_replicates, progress_bar: bool = True
):
    """Compute a null distribution of median correlations from unnamed pairs.

    The pair correlations are reshaped into rows of `n_replicates` values and
    each row is reduced to its NaN-ignoring median, returned as a ``pd.Series``.
    """
    pair_ix = np.asarray(null_pairs, int)
    similarity = get_similarity_fn("correlation", progress_bar=progress_bar)
    # One row per null sample, one column per pseudo-replicate pair.
    per_sample = similarity(X, pair_ix, batch_size=20000).reshape(-1, n_replicates)
    return pd.Series(np.nanmedian(per_sample, axis=1))

`corr_from_pairs(X, pairs, sameby, progress_bar=True)` ¶

Correlation from a list of named pairs. Generated by Matcher.get_all_pairs.

Parameters:

X (ndarray) –
pairs (dict) –

Returns:

DataFrame of per-group correlation statistics (`median`, `count`) and the median number of replicates –

Source code in src/copairs/replicating.py

def corr_from_pairs(
    X: np.ndarray, pairs: dict, sameby: List[str], progress_bar: bool = True
):
    """
    Correlation from a list of named pairs. Generated by Matcher.get_all_pairs.

    Parameters
    ----------
    X : np.ndarray
        Matrix containing samples in rows.
    pairs : dict
        Dictionary mapping each group key to its list of index pairs.
    sameby : List[str]
        Column names that define a group. With more than one column, each
        key in `pairs` is treated as a sequence whose items are joined with
        ``"_"`` to form the group label.
    progress_bar : bool, optional
        Whether to show progress bar [default: True].

    Returns
    -------
    tuple
        ``(corr_dist, median_num_repl)``. ``corr_dist`` is a DataFrame indexed
        by group label with the columns ``"median"`` and ``"count"`` of the
        pair correlations. ``median_num_repl`` is the median across groups of
        the number of distinct ``row_x`` indices per group, converted to int.
    """
    pair_ix = np.vstack(list(pairs.values()))
    corr_fn = get_similarity_fn("correlation", progress_bar=progress_bar)
    pair_corrs = corr_fn(X, pair_ix, batch_size=20000)
    counts = [len(v) for v in pairs.values()]

    if len(sameby) == 1:
        labels = list(pairs.keys())
    else:
        # Key components may be non-string metadata values (e.g. numbers);
        # str.join raises TypeError on those, so stringify each part first.
        labels = ["_".join(map(str, key)) for key in pairs.keys()]
    sameby_vals = np.repeat(labels, counts)

    sameby_col = "_".join(sameby)

    corrs = pd.DataFrame(
        {
            sameby_col: sameby_vals,
            "corr": pair_corrs,
            "row_x": pair_ix[:, 0],
            "row_y": pair_ix[:, 1],
        }
    )
    corrs = corrs.groupby(sameby_col).agg(
        {
            "corr": ["median", "count"],
            "row_x": "nunique",
        }
    )

    median_num_repl = int(corrs["row_x", "nunique"].median())
    corr_dist = corrs["corr"]

    return corr_dist, median_num_repl

`correlation_test(X, meta, sameby, diffby, n_samples=1000, progress_bar=True)` ¶

Generate Null and replicate distribution for replicate correlation analysis.

Source code in src/copairs/replicating.py

def correlation_test(
    X: np.ndarray,
    meta: pd.DataFrame,
    sameby: List[str],
    diffby: List[str],
    n_samples: int = 1000,
    progress_bar: bool = True,
) -> CorrelationTestResult:
    """Run the replicate correlation analysis and bundle both distributions.

    The replicate statistics come from ``corr_between_replicates``. The null
    distribution comes from ``corr_between_non_replicates``, called with
    ``sameby + diffby`` as its ``diffby`` and with the median number of
    replicates capped at 50 as its ``n_replicates``.
    """
    corr_df, typical_num_repl = corr_between_replicates(
        X, meta, sameby, diffby, progress_bar=progress_bar
    )

    null_dist = corr_between_non_replicates(
        X,
        meta,
        n_samples=n_samples,
        n_replicates=min(typical_num_repl, 50),
        diffby=sameby + diffby,
        progress_bar=progress_bar,
    )

    return CorrelationTestResult(corr_df, null_dist)

`correlation_test_from_pairs(X, pairs, null_pairs, sameby, progress_bar=True)` ¶

Generate Null and replicate distribution for replicate correlation analysis.

Source code in src/copairs/replicating.py

def correlation_test_from_pairs(
    X: np.ndarray,
    pairs: dict,
    null_pairs: list,
    sameby: list,
    progress_bar: bool = True,
) -> CorrelationTestResult:
    """Run the replicate correlation analysis on precomputed pairs.

    The replicate statistics come from ``corr_from_pairs`` and the null
    distribution from ``corr_from_null_pairs``, using the median number of
    replicates capped at 50 as the replicate count.
    """
    corr_df, typical_num_repl = corr_from_pairs(
        X, pairs, sameby, progress_bar=progress_bar
    )
    null_dist = corr_from_null_pairs(
        X, null_pairs, min(typical_num_repl, 50), progress_bar=progress_bar
    )
    return CorrelationTestResult(corr_df, null_dist)

copairs.replicating¶