Skip to content

copairs.replicating

copairs.replicating

Functions and classes for computing the percent replicating metric.

CorrelationTestResult

Class representing the percent replicating score. It stores distributions.

Source code in src/copairs/replicating.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class CorrelationTestResult:
    """Result of a replicate-correlation (percent replicating) test.

    Stores the per-group replicate correlation statistics together with the
    null distribution, and derives percent scores and a distance from them.
    """

    def __init__(self, corr_df: pd.DataFrame, null_dist: pd.Series):
        """Store the replicate and null distributions.

        Parameters
        ----------
        corr_df : pd.DataFrame
            Correlation statistics; its ``"median"`` column is used as the
            replicate correlation distribution.
        null_dist : pd.Series
            Null distribution of correlation values.
        """
        self.corr_df = corr_df
        self.corr_dist = corr_df["median"]
        self.null_dist = null_dist

    def percent_score_left(self):
        """Calculate the percent score using the 5th percentile threshold.

        Returns
        -------
        tuple
            (proportion of non-NaN replicate correlations strictly below the
            5th percentile of the null distribution, that threshold).
        """
        perc_5 = np.nanpercentile(self.null_dist, 5)
        below_threshold = self.corr_dist.dropna() < perc_5
        return np.nanmean(below_threshold.astype(float)), perc_5

    def percent_score_right(self):
        """Calculate the percent score using the 95th percentile threshold.

        Returns
        -------
        tuple
            (proportion of non-NaN replicate correlations strictly above the
            95th percentile of the null distribution, that threshold).
        """
        perc_95 = np.nanpercentile(self.null_dist, 95)
        above_threshold = self.corr_dist.dropna() > perc_95
        return np.nanmean(above_threshold.astype(float)), perc_95

    def percent_score_both(self):
        """Calculate the percent score using the 5th and 95th percentile thresholds.

        Returns
        -------
        tuple
            (sum of the right-tail and left-tail proportions, 5th percentile
            threshold, 95th percentile threshold).
        """
        # Reuse the one-sided scores so the threshold logic lives in one place.
        right_score, perc_95 = self.percent_score_right()
        left_score, perc_5 = self.percent_score_left()
        return right_score + left_score, perc_5, perc_95

    def percent_score(self, how: Literal["left", "right", "both"]):
        """Calculate the percent score given the `how` criteria.

        Parameters
        ----------
        how : {"left", "right", "both"}
            Which tail(s) of the null distribution to test against.

        Returns
        -------
        tuple
            (percent score, left threshold, right threshold); a threshold that
            was not used is ``None``.

        Raises
        ------
        ValueError
            If `how` is not one of the accepted values.
        """
        left_th, right_th = None, None
        if how == "right":
            percent_score, right_th = self.percent_score_right()
        elif how == "left":
            percent_score, left_th = self.percent_score_left()
        elif how == "both":
            percent_score, left_th, right_th = self.percent_score_both()
        else:
            raise ValueError(f"Invalid value: {how} for how param")

        return percent_score, left_th, right_th

    def wasserstein_distance(self):
        """Compute the Wasserstein distance between null and corr distributions.

        NaN entries are dropped from both distributions first, matching how
        the percent-score methods treat missing correlations; otherwise a
        single NaN would poison the distance.
        """
        from scipy.stats import wasserstein_distance

        null_values = self.null_dist.dropna().values
        corr_values = self.corr_dist.dropna().values
        return wasserstein_distance(null_values, corr_values)

__init__(corr_df, null_dist)

Initialize object.

Source code in src/copairs/replicating.py
147
148
149
150
151
def __init__(self, corr_df: pd.DataFrame, null_dist: pd.Series):
    """Keep the correlation table, its ``"median"`` column, and the null distribution."""
    self.corr_df = corr_df
    # The replicate distribution is the per-group median correlation.
    median_corrs = corr_df["median"]
    self.corr_dist = median_corrs
    self.null_dist = null_dist

percent_score(how)

Calculate percent score given the how criteria.

Source code in src/copairs/replicating.py
191
192
193
194
195
196
197
198
199
200
201
202
203
def percent_score(self, how: Literal["left", "right", "both"]):
    """Dispatch to the one- or two-sided percent score selected by `how`.

    Returns ``(score, left_threshold, right_threshold)``; a threshold that the
    chosen mode does not use is ``None``. Raises ``ValueError`` for any other
    value of `how`.
    """
    if how == "both":
        score, lower, upper = self.percent_score_both()
        return score, lower, upper
    if how == "left":
        score, lower = self.percent_score_left()
        return score, lower, None
    if how == "right":
        score, upper = self.percent_score_right()
        return score, None, upper
    raise ValueError(f"Invalid value: {how} for how param")

percent_score_both()

Calculate the percent score using the 5th and 95th percentile thresholds.

:return: proportion of correlation distribution beyond the thresholds and the thresholds.

Source code in src/copairs/replicating.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def percent_score_both(self):
    """
    Calculate the two-sided percent score from the 5th and 95th percentile thresholds.

    :return: summed proportion of the correlation distribution above the upper
        and below the lower threshold, followed by the lower and upper thresholds.
    """
    observed = self.corr_dist.dropna()
    upper = np.nanpercentile(self.null_dist, 95)
    lower = np.nanpercentile(self.null_dist, 5)

    frac_above = np.nanmean((observed > upper).astype(float))
    frac_below = np.nanmean((observed < lower).astype(float))
    return frac_above + frac_below, lower, upper

percent_score_left()

Calculate the percent score using the 5th percentile threshold.

:return: proportion of correlation distribution beyond the threshold and the threshold.

Source code in src/copairs/replicating.py
153
154
155
156
157
158
159
160
def percent_score_left(self):
    """Calculate the left-tail percent score against the 5th percentile of the null.

    :return: proportion of the non-NaN correlation distribution strictly below
        the threshold, and the threshold itself.
    """
    threshold = np.nanpercentile(self.null_dist, 5)
    observed = self.corr_dist.dropna()
    is_below = (observed < threshold).astype(float)
    return np.nanmean(is_below), threshold

percent_score_right()

Calculate the percent score using the 95th percentile threshold.

:return: proportion of correlation distribution beyond the threshold and the threshold.

Source code in src/copairs/replicating.py
162
163
164
165
166
167
168
169
170
def percent_score_right(self):
    """
    Calculate the right-tail percent score against the 95th percentile of the null.

    :return: proportion of the non-NaN correlation distribution strictly above
        the threshold, and the threshold itself.
    """
    threshold = np.nanpercentile(self.null_dist, 95)
    observed = self.corr_dist.dropna()
    is_above = (observed > threshold).astype(float)
    return np.nanmean(is_above), threshold

wasserstein_distance()

Compute the Wasserstein distance between null and corr distributions.

Source code in src/copairs/replicating.py
205
206
207
208
209
def wasserstein_distance(self):
    """Compute the Wasserstein distance between null and corr distributions.

    NaN entries are dropped from both distributions before the distance is
    computed, so a missing correlation does not poison the result.
    """
    from scipy.stats import wasserstein_distance as _wasserstein

    null_values = self.null_dist.dropna().values
    corr_values = self.corr_dist.dropna().values
    return _wasserstein(null_values, corr_values)

corr_between_non_replicates(X, meta, n_samples, n_replicates, diffby, progress_bar=True)

Null distribution between random "replicates".

Parameters:

  • X (ndarray) –

    Feature matrix.

  • meta (DataFrame) –

    Metadata dataframe.

  • n_samples (int) –

    Number of samples to generate.

  • n_replicates (int) –

    Number of replicates per sample.

  • diffby (List[str]) –

    List of columns that should be different.

  • progress_bar (bool, default: True ) –

    Whether to show progress bar [default: True].

Returns:

  • Series

    Correlation values, with a length of n_samples.

Source code in src/copairs/replicating.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def corr_between_non_replicates(
    X: np.ndarray,
    meta: pd.DataFrame,
    n_samples: int,
    n_replicates: int,
    diffby: List[str],
    progress_bar: bool = True,
):
    """
    Build a null distribution from randomly sampled non-replicate pairs.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    meta : pandas.DataFrame
        Metadata dataframe.
    n_samples : int
        Number of null samples to generate.
    n_replicates : int
        Number of sampled pairs summarized into each null sample.
    diffby : List[str]
        Metadata columns passed to the matcher as the "must differ" columns.
    progress_bar : bool, optional
        Whether to show progress bar [default: True].

    Returns
    -------
    pd.Series
        One correlation value per null sample (length `n_samples`).
    """
    # Fixed seed keeps the sampled null pairs reproducible across calls.
    null_sampler = Matcher(meta, diffby, seed=0)
    total_pairs = n_samples * n_replicates
    sampled_pairs = [
        null_sampler.sample_null_pair(diffby) for _ in range(total_pairs)
    ]
    return corr_from_null_pairs(
        X, sampled_pairs, n_replicates, progress_bar=progress_bar
    )

corr_between_replicates(X, meta, sameby, diffby, progress_bar=True)

Correlation between replicates.

Parameters:

  • X (ndarray) –

    Feature matrix.

  • meta (DataFrame) –

    Metadata dataframe.

  • sameby (List[str]) –

    Feature names to group the data frame by.

  • diffby (List[str]) –

    Feature names to force different values.

  • progress_bar (bool, default: True ) –

    Whether to show progress bar [default: True].

Returns:

  • tuple

    (DataFrame with correlation statistics, median number of replicates).

Source code in src/copairs/replicating.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def corr_between_replicates(
    X: np.ndarray,
    meta: pd.DataFrame,
    sameby: List[str],
    diffby: List[str],
    progress_bar: bool = True,
):
    """
    Compute correlations between replicate pairs.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    meta : pd.DataFrame
        Metadata dataframe.
    sameby : List[str]
        Metadata columns whose values must match within a pair.
    diffby : List[str]
        Metadata columns whose values must differ within a pair.
    progress_bar : bool, optional
        Whether to show progress bar [default: True].

    Returns
    -------
    tuple
        Whatever `corr_from_pairs` returns for the matched pairs:
        (DataFrame with correlation statistics, median number of replicates).
    """
    matched_columns = sameby + diffby
    replicate_pairs = Matcher(meta, matched_columns, seed=0).get_all_pairs(
        sameby, diffby
    )
    return corr_from_pairs(X, replicate_pairs, sameby, progress_bar=progress_bar)

corr_from_null_pairs(X, null_pairs, n_replicates, progress_bar=True)

Correlation from a given list of unnamed pairs.

Source code in src/copairs/replicating.py
13
14
15
16
17
18
19
20
21
22
def corr_from_null_pairs(
    X: np.ndarray, null_pairs, n_replicates, progress_bar: bool = True
):
    """Summarize correlations of unnamed pairs into a null distribution.

    The pair correlations are reshaped into rows of `n_replicates` values and
    each row is reduced to its NaN-ignoring median.
    """
    pair_ix = np.asarray(null_pairs, int)
    similarity = get_similarity_fn("correlation", progress_bar=progress_bar)
    per_pair = similarity(X, pair_ix, batch_size=20000)
    # One row per null sample, one column per pair within that sample.
    per_sample = per_pair.reshape(-1, n_replicates)
    return pd.Series(np.nanmedian(per_sample, axis=1))

corr_from_pairs(X, pairs, sameby, progress_bar=True)

Correlation from a list of named pairs. Generated by Matcher.get_all_pairs.

Parameters:

  • X (ndarray) –
  • pairs (dict) –

Returns:

  • DataFrame of per-group correlation statistics (median and count) and the median number of replicates
Source code in src/copairs/replicating.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def corr_from_pairs(
    X: np.ndarray, pairs: dict, sameby: List[str], progress_bar: bool = True
):
    """
    Correlation from a dict of named pairs, as generated by Matcher.get_all_pairs.

    Parameters
    ----------
    X : np.ndarray
        Matrix containing samples in rows.
    pairs : dict
        Maps each group key to its list of index pairs.
    sameby : List[str]
        Column names that define a group; joined with ``"_"`` to name the
        grouping column.
    progress_bar : bool, optional
        Whether to show progress bar [default: True].

    Returns
    -------
    tuple
        (DataFrame indexed by group with ``"median"`` and ``"count"`` columns
        of the pair correlations, median across groups of the number of
        distinct ``row_x`` indices as an int).
    """
    pair_ix = np.vstack(list(pairs.values()))
    corr_fn = get_similarity_fn("correlation", progress_bar=progress_bar)
    corrs = corr_fn(X, pair_ix, batch_size=20000)
    counts = [len(v) for v in pairs.values()]

    if len(sameby) == 1:
        group_labels = list(pairs.keys())
    else:
        # Convert each key component to str first: metadata values may be
        # ints/floats, and str.join raises TypeError on non-string items.
        group_labels = ["_".join(map(str, key)) for key in pairs.keys()]
    # Repeat each group label once per pair so it lines up with `corrs`.
    sameby_vals = np.repeat(group_labels, counts)

    sameby_col = "_".join(sameby)

    corrs = pd.DataFrame(
        {
            sameby_col: sameby_vals,
            "corr": corrs,
            "row_x": pair_ix[:, 0],
            "row_y": pair_ix[:, 1],
        }
    )
    corrs = corrs.groupby(sameby_col).agg(
        {
            "corr": ["median", "count"],
            "row_x": "nunique",
        }
    )

    median_num_repl = int(corrs["row_x", "nunique"].median())
    corr_dist = corrs["corr"]

    return corr_dist, median_num_repl

correlation_test(X, meta, sameby, diffby, n_samples=1000, progress_bar=True)

Generate Null and replicate distribution for replicate correlation analysis.

Source code in src/copairs/replicating.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
def correlation_test(
    X: np.ndarray,
    meta: pd.DataFrame,
    sameby: List[str],
    diffby: List[str],
    n_samples: int = 1000,
    progress_bar: bool = True,
) -> CorrelationTestResult:
    """Build the replicate and null correlation distributions from metadata.

    Replicate correlations come from `corr_between_replicates`; the null
    distribution comes from `corr_between_non_replicates`, sampling pairs that
    differ on both the `sameby` and `diffby` columns.
    """
    replicate_stats, median_num_repl = corr_between_replicates(
        X, meta, sameby, diffby, progress_bar=progress_bar
    )

    # Null samples use the median replicate count, capped at 50.
    null_group_size = min(median_num_repl, 50)
    null_columns = sameby + diffby
    null_distribution = corr_between_non_replicates(
        X,
        meta,
        n_samples=n_samples,
        n_replicates=null_group_size,
        diffby=null_columns,
        progress_bar=progress_bar,
    )

    return CorrelationTestResult(replicate_stats, null_distribution)

correlation_test_from_pairs(X, pairs, null_pairs, sameby, progress_bar=True)

Generate Null and replicate distribution for replicate correlation analysis.

Source code in src/copairs/replicating.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def correlation_test_from_pairs(
    X: np.ndarray,
    pairs: dict,
    null_pairs: list,
    sameby: list,
    progress_bar: bool = True,
) -> CorrelationTestResult:
    """Build the replicate and null correlation distributions from precomputed pairs.

    `pairs` feeds `corr_from_pairs` and `null_pairs` feeds
    `corr_from_null_pairs`; the results are wrapped in a
    `CorrelationTestResult`.
    """
    replicate_stats, median_num_repl = corr_from_pairs(
        X, pairs, sameby, progress_bar=progress_bar
    )
    # Null pairs are grouped by the median replicate count, capped at 50.
    null_group_size = min(median_num_repl, 50)
    null_distribution = corr_from_null_pairs(
        X, null_pairs, null_group_size, progress_bar=progress_bar
    )
    return CorrelationTestResult(replicate_stats, null_distribution)