IDPForge/mk_ldr_template.py at main · THGLab/IDPForge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# Auxilary file for preparing the LDR inputs
# created by OZ, 1/8/25
# modified by SDC, 3/12/26

import numpy as np
import mdtraj as md
import argparse
import sys

try:
    from idpforge.utils.np_utils import (
        process_pdb, assign_rama,
        get_chi_angles
    )
except ImportError:
    print("Error: Could not import from 'idpforge.utils.np_utils'.")
    print("       Ensure IDPForge is installed.")
    sys.exit(1)


def sample_fix_distance(batch_size, fixed_distance,
                        exclude_center=[], exclude_radius=[], tol=0.8): # Default: 0.8 Rg
    """
    Generates points on a sphere, vectorized to efficiently exclude multiple
    regions.
    """
    if batch_size == 0:
        return np.empty((0, 3))

    # Generate random angles
    theta = np.random.uniform(0, 2 * np.pi, batch_size)
    phi = np.random.uniform(0, np.pi, batch_size)

    # Convert to Cartesian coordinates
    x = fixed_distance * np.sin(phi) * np.cos(theta)
    y = fixed_distance * np.sin(phi) * np.sin(theta)
    z = fixed_distance * np.cos(phi)
    vectors = np.stack([x, y, z], axis=1)

    # If there's nothing to exclude, return all vectors
    if not exclude_center:
        return vectors

    # Convert inputs to numpy arrays for broadcasting
    centers = np.atleast_2d(np.array(exclude_center))

    # Calculate squared radii for comparison
    radii_sq = (np.array(exclude_radius) * tol)**2

    # Calculate differences between each vector and each center
    diffs = vectors[:, np.newaxis, :] - centers[np.newaxis, :, :]

    # Calculate squared distances
    dists_sq = np.einsum('nmj,nmj->nm', diffs, diffs)

    # Check if distances are less than squared radii
    is_too_close = dists_sq < radii_sq[np.newaxis, :]

    # Identify points that are too close to any exclusion zone
    is_bad_point = np.any(is_too_close, axis=1)

    # Return only the 'good' points
    return vectors[~is_bad_point]


def est_distance(n):
    """Estimate Rg-based distance for a disordered tail/loop.

    Hofmann et al. 2012 (PNAS 109:16155), Eq. 3:
    Rg = sqrt(2*lp*b / ((2v+1)(2v+2))) * N^v
    With lp=4.0 A, b=3.8 A, v=0.588: Rg ≈ 2.10 * N^0.588
    """
    return 2.10 * n ** 0.588


def calc_rg(crd):
    """Optimized Rg calculation."""
    # Filter out 0,0,0 atoms before calculating Rg
    valid_atoms = crd[np.any(crd != 0, axis=1)]
    if valid_atoms.shape[0] == 0:
        return 1.0 # Return a small default if no valid atoms

    # Calculate deviations from the mean
    mean_coord = valid_atoms.mean(axis=0)
    dev = valid_atoms - mean_coord

    # Calculate Rg^2
    rg_sq = np.einsum('ij,ij->', dev, dev) / valid_atoms.shape[0]
    return np.sqrt(rg_sq)


def main(pdb, disorder_idx, nsample):
    crd, seq = process_pdb(pdb)
    torsion = get_chi_angles(crd, seq)[0]
    torsion_vec = np.stack((np.sin(torsion), np.cos(torsion)), axis=-1)

    traj = md.load(pdb)
    dssp = md.compute_dssp(traj, simplified=True)[0]
    phis = md.compute_phi(traj)[1][0]
    psis = md.compute_psi(traj)[1][0]
    phis = np.concatenate(([-180], np.degrees(phis)))
    psis = np.concatenate((np.degrees(psis), [180]))
    rama = assign_rama(np.stack([phis, psis], axis=-1))
    encode = "".join([dssp[i] if dssp[i] in ["H", "E"] else rama[i] for i in range(len(dssp))])
    atom_mask = crd.sum(axis=-1) == 0

    disorder_idx_list = list(disorder_idx)
    if not disorder_idx_list:
         raise ValueError("Disorder index list is empty.")

    start_idx = disorder_idx_list[0]
    end_idx = disorder_idx_list[-1]

    # --- Define anchor point and exclusion zones ---
    if start_idx == 0: # N-terminal tail
        folded_center = crd[end_idx+1, 1]
        exclude_coords = [crd[end_idx+1:, 1]]
    elif end_idx == crd.shape[0] - 1: # C-terminal tail
        folded_center = crd[start_idx-1, 1]
        exclude_coords = [crd[:start_idx, 1]]
    else: # Loop
        folded_center = crd[[start_idx-1, end_idx+1], 1].mean(axis=0)
        exclude_coords = [crd[:start_idx, 1], crd[end_idx+1:, 1]]

    # Calculate COM and Rg for exclusion zones
    exclude_center = [c.mean(axis=0) for c in exclude_coords if c.shape[0] > 0]
    exclude_radius = [calc_rg(c) for c in exclude_coords if c.shape[0] > 0]

    # 1. Shift main coordinates so anchor point is at (0,0,0)
    crd -= folded_center
    # 2. Shift exclusion centers relative to the new origin
    shifted_exclude_center = [c - folded_center for c in exclude_center]

    d = est_distance(len(disorder_idx_list))

    batch_multiplier = 10 # Start by sampling 10x more than needed
    max_tries = 10
    tries = 0

    # Initial attempt
    initial_batch_size = nsample * batch_multiplier
    noise_init_list = [sample_fix_distance(initial_batch_size, d,
                                           shifted_exclude_center, exclude_radius)]

    points_found = len(noise_init_list[0])

    while points_found < nsample and tries < max_tries:
        print(f"     ... {points_found}/{nsample} points found. Sampling more (Try {tries + 1}/{max_tries})", flush=True)

        needed = nsample - points_found

        # Request a new batch, scaled by the multiplier
        batch_size = needed * batch_multiplier

        new_points = sample_fix_distance(batch_size, d,
                                          shifted_exclude_center, exclude_radius)

        if len(new_points) == 0:
            # If we get nothing, the region is tough to sample.
            # Increase multiplier for the next attempt.
            batch_multiplier *= 2
        else:
            noise_init_list.append(new_points)
            points_found += len(new_points)

        tries += 1

    # --- Fallback and Finalizing ---
    if points_found == 0:
        print(f"     Warning: Could not find any non-clashing points. Using a default vector. It is recommended to review this template.", flush=True)
        noise_init = np.array([[d, 0.0, 0.0]]) # A simple default point
    else:
        noise_init = np.concatenate(noise_init_list, axis=0)

    # Ensure the final array has the correct shape
    if len(noise_init) < nsample:
        print(f"     Warning: Only found {len(noise_init)}/{nsample} points. Duplicating to fill. It is recommended to review this template.", flush=True)
        repeat_times = int(np.ceil(nsample / len(noise_init)))
        noise_init = np.tile(noise_init, (repeat_times, 1))

    # Safely take the first nsample points and add noise
    permuted = noise_init[:nsample] + np.random.normal(0, 3, size=(nsample, 3))

    # Zero-out missing atoms
    i, j = np.where(atom_mask)
    crd[i, j] = 0

    # Create template
    template = {"coord": crd, "torsion": torsion_vec, "sec": encode, "seq": seq}

    mask = np.ones(len(crd), dtype=bool)
    mask[disorder_idx_list] = False
    template["mask"] = mask
    template["coord_offset"] = permuted
    return template


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Prepare LDR inputs from a PDB file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('input', help="Input PDB file.")
    parser.add_argument('disorder_domain', help="Specify residue number for disordered region (1-index, both ends inclusive). Example: 1-15 or 38-129")
    parser.add_argument('output', help="Output .npz file.")
    parser.add_argument('--nconf', default=200, type=int, help="Number of conformations (starting points) to generate.")
    args = parser.parse_args()

    try:
        i, j = args.disorder_domain.split("-")
        # Convert from 1-based (inclusive) to 0-based (exclusive end)
        disorder_range = range(int(i)-1, int(j))
        out = main(args.input, disorder_range, args.nconf)
        np.savez(args.output, **out)
        print(f"Successfully generated {args.output} with {args.nconf} conformations.")
    except Exception as e:
        print(f"Error processing {args.input} with {args.disorder_domain}: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)