proteusPy.DisulfideLoader

This module is part of the proteusPy package, a Python package for the analysis and modeling of protein structures, with an emphasis on disulfide bonds. This work is based on the original C/C++ implementation by Eric G. Suchanek.

Author: Eric G. Suchanek, PhD. Last revision: 2/9/2024

  1"""
  2This module is part of the proteusPy package, a Python package for 
  3the analysis and modeling of protein structures, with an emphasis on disulfide bonds.
  4This work is based on the original C/C++ implementation by Eric G. Suchanek. \n
  5
  6Author: Eric G. Suchanek, PhD
  7Last revision: 2/9/2024
  8"""
  9
 10import copy
 11import pickle
 12import sys
 13import time
 14
 15import pandas as pd
 16
 17import proteusPy
 18from proteusPy.atoms import *
 19from proteusPy.data import *
 20from proteusPy.data import (
 21    DATA_DIR,
 22    LOADER_FNAME,
 23    LOADER_SUBSET_FNAME,
 24    SS_DICT_PICKLE_FILE,
 25    SS_PICKLE_FILE,
 26    SS_TORSIONS_FILE,
 27)
 28from proteusPy.Disulfide import Disulfide
 29from proteusPy.DisulfideClass_Constructor import DisulfideClass_Constructor
 30from proteusPy.DisulfideExceptions import *
 31from proteusPy.DisulfideList import DisulfideList
 32from proteusPy.ProteusGlobals import MODEL_DIR, PDB_DIR, REPO_DATA_DIR
 33
try:
    # Check if running in Jupyter. ``get_ipython`` is not explicitly imported
    # in this module; when the name is undefined the lookup raises NameError
    # and the except branch below selects the standard progress bar.
    shell = get_ipython().__class__.__name__
    if shell == "ZMQInteractiveShell":
        # Presumably the Jupyter kernel shell class: use the notebook widget bar.
        from tqdm.notebook import tqdm
    else:
        # Any other shell class name: use the standard progress bar.
        from tqdm import tqdm
except NameError:
    # ``get_ipython`` is not defined here: use the standard progress bar.
    from tqdm import tqdm
 43
 44# Now use tqdm as normal, depending on your environment
 45
 46
 47class DisulfideLoader:
 48    """
 49    This class represents the disulfide database itself and is its primary means of accession.
 50    The entirety of the RCSB disulfide database is stored within the class via a
 51    proteusPy.DisulfideList, a ```Pandas``` .csv file, and a ```dict``` of
 52    indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
 53    simple, direct and flexible access to the disulfide structures contained herein.
 54    This makes it possible to access the disulfides by array index, PDB structure ID or disulfide name.
 55
 56    The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the
 57    [display_overlay()](#DisulfideLoader.display_overlay) method. See below for examples.\n
 58
    Important note: For typical usage one will access the database via the `Load_PDB_SS()` function.
    The difference is that `Load_PDB_SS()` loads the compressed database from its single
    source file, whereas instantiating this class directly loads the individual torsions and
    disulfide .pkl files and builds the classlist structures.
 63
 64    *Developer's Notes:*
 65    The .pkl files needed to instantiate this class and save it into its final .pkl file are
 66    defined in the proteusPy.data class and should not be changed. Upon initialization the class
 67    will load them and initialize itself.
 68
 69    """
 70
 71    def __init__(
 72        self,
 73        verbose: bool = True,
 74        datadir: str = REPO_DATA_DIR,
 75        picklefile: str = SS_PICKLE_FILE,
 76        pickle_dict_file: str = SS_DICT_PICKLE_FILE,
 77        torsion_file: str = SS_TORSIONS_FILE,
 78        quiet: bool = True,
 79        subset: bool = False,
 80        cutoff: float = -1.0,
 81    ) -> None:
 82        """
 83        Initializing the class initiates loading either the entire Disulfide dataset,
 84        or the 'subset', which consists of the first 1000 PDB structures. The subset
 85        is useful for testing and debugging since it doesn't require nearly as much
 86        memory or time. The name for the subset file is hard-coded. One can pass a
 87        different data directory and file names for the pickle files. These different
 88        directories are normally established with the proteusPy.Extract_Disulfides
 89        function.
 90        """
 91
 92        self.ModelDir = datadir
 93        self.PickleFile = f"{datadir}{picklefile}"
 94        self.PickleDictFile = f"{datadir}{pickle_dict_file}"
 95        self.PickleClassFile = f"{datadir}{SS_CLASS_DICT_FILE}"
 96        self.TorsionFile = f"{datadir}{torsion_file}"
 97        self.SSList = DisulfideList([], "ALL_PDB_SS")
 98        self.SSDict = {}
 99        self.TorsionDF = pd.DataFrame()
100        self.TotalDisulfides = 0
101        self.IDList = []
102        self.QUIET = quiet
103
104        self.tclass = None  # disulfideClass_constructor to manage classes
105        self.cutoff = cutoff  # distance cutoff used to bulid the database
106        self.verbose = verbose
107        self.timestamp = time.time()
108        self.version = proteusPy.__version__
109
110        idlist = []
111
112        if subset:
113            self.PickleFile = f"{datadir}{SS_SUBSET_PICKLE_FILE}"
114            self.PickleDictFile = f"{datadir}{SS_SUBSET_DICT_PICKLE_FILE}"
115            self.TorsionFile = f"{datadir}{SS_SUBSET_TORSIONS_FILE}"
116
117        if self.verbose:
118            print(
119                f"-> DisulfideLoader(): Reading disulfides from: {self.PickleFile}... ",
120                end="",
121            )
122
123        with open(self.PickleFile, "rb") as f:
124            # sslist = pd.compat.pickle_compat.load(f)
125            sslist = pickle.load(f)
126            self.SSList = sslist
127            self.TotalDisulfides = len(self.SSList)
128
129        if self.verbose:
130            print(
131                f"done.",
132            )
133
134        if self.verbose:
135            print(
136                f"-> DisulfideLoader(): Reading disulfide dict from: {self.PickleDictFile}...",
137                end="",
138            )
139
140        with open(self.PickleDictFile, "rb") as f:
141
142            self.SSDict = pickle.load(f)
143            # self.SSDict = pd.compat.pickle_compat.load(f)
144
145            for key in self.SSDict:
146                idlist.append(key)
147            self.IDList = idlist.copy()
148            totalSS_dict = len(self.IDList)
149
150        if self.verbose:
151            print(f"done.")
152
153        if self.verbose:
154            print(
155                f"-> DisulfideLoader(): Reading Torsion DF from: {self.TorsionFile}...",
156                end="",
157            )
158
159        tmpDF = pd.read_csv(self.TorsionFile)
160        tmpDF.drop(tmpDF.columns[[0]], axis=1, inplace=True)
161
162        self.TorsionDF = tmpDF.copy()
163        self.TotalDisulfides = len(self.SSList)
164
165        if self.verbose:
166            print(f" done.")
167
168        self.tclass = DisulfideClass_Constructor(self, self.verbose)
169
170        if self.verbose:
171            print(f"-> DisulfideLoader(): Loading complete.")
172            self.describe()
173        return
174
175    # overload __getitem__ to handle slicing and indexing, and access by name
176
177    def __getitem__(self, item):
178        """
179        Implements indexing and slicing to retrieve DisulfideList objects from the
180        DisulfideLoader. Supports:
181
182        - Integer indexing to retrieve a single DisulfideList
183        - Slicing to retrieve a subset as a DisulfideList
184        - Lookup by PDB ID to retrieve all Disulfides for that structure
185        - Lookup by full disulfide name
186
187        Raises DisulfideException on invalid indices or names.
188        """
189
190        res = DisulfideList([], "none")
191
192        if isinstance(item, slice):
193            indices = range(*item.indices(len(self.SSList)))
194            name = self.SSList[0].pdb_id
195            resolution = self.SSList[0].resolution
196            sublist = [self.SSList[i] for i in indices]
197            return DisulfideList(sublist, name, resolution)
198
199        if isinstance(item, int):
200            if item < 0 or item >= self.TotalDisulfides:
201                mess = f"DisulfideLoader(): Index {item} out of range 0-{self.TotalDisulfides - 1}"
202                raise DisulfideException(mess)
203            else:
204                return self.SSList[item]
205
206        try:
207            # PDB_SS['4yys'] return a list of SS
208            indices = self.SSDict[item]
209            res = DisulfideList([], item)
210            sslist = self.SSList
211            for ind in indices:
212                res.append(sslist[ind])
213            res.resolution = res[0].resolution
214
215        except KeyError:
216            try:
217                res = self.SSList.get_by_name(item)  # full disulfide name
218            except:
219                mess = f"DisulfideLoader(): Cannot find key {item} in SSBond dict!"
220                raise DisulfideException(mess)
221        return res
222
223    def __setitem__(self, index, item):
224        self.SSList[index] = self._validate_ss(item)
225
226    def _validate_ss(self, value):
227        if isinstance(value, (Disulfide)):
228            return value
229        raise TypeError(f"Disulfide object expected, got {type(value).__name__}")
230
231    @property
232    def Average_Resolution(self) -> float:
233        """
234        Compute and return the average structure resolution for the given list.
235
236        :return: Average resolution (A)
237        """
238        res = 0.0
239        cnt = 1
240        sslist = self.SSList
241
242        for ss in sslist:
243            _res = ss.resolution
244            if _res is not None and res != -1.0:
245                res += _res
246                cnt += 1
247        return res / cnt
248
249    def build_ss_from_idlist(self, idlist):
250        """
251        Given a list of PDBid, return a DisulfideList of Disulfides
252
253        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
254        :return: DisulfideList
255        """
256        res = DisulfideList([], "tmp")
257
258        for id in idlist:
259            for ss in self.SSList:
260                if ss.pdb_id == id:
261                    res.append(ss)
262                    break
263        return res
264
265    def copy(self):
266        """
267        Return a copy of self.
268
269        :return: Copy of self
270        """
271        return copy.deepcopy(self)
272
273    def extract_class(self, clsid) -> DisulfideList:
274        """
275        Return the list of disulfides corresponding to the input `clsid`.
276
277        :param clsid: The class name to extract.
278        :return: The list of disulfide bonds from the class.
279        """
280
281        # from tqdm import tqdm
282        six = self.tclass.sixclass_df
283        tot_classes = six.shape[0]
284        class_disulfides = DisulfideList([], clsid, quiet=True)
285        _pbar = tqdm(six.iterrows(), total=tot_classes, leave=True)
286        for idx, row in _pbar:
287            _cls = row["class_id"]
288            if _cls == clsid:
289                ss_list = row["ss_id"]
290                pbar = tqdm(ss_list, leave=True)
291                for ssid in pbar:
292                    class_disulfides.append(self[ssid])
293                pbar.set_postfix({"Done": ""})
294                break
295
296            _pbar.set_postfix({"Cnt": idx})
297
298        return class_disulfides
299
300    def getlist(self) -> DisulfideList:
301        """
302        Return the list of Disulfides contained in the class.
303
304        :return: DisulfideList
305        :rtype: DisulfideList
306        """
307        return copy.deepcopy(self.SSList)
308
309    def get_by_name(self, name) -> Disulfide:
310        """
311        Returns the Disulfide with the given name from the list.
312        """
313        for ss in self.SSList.data:
314            if ss.name == name:
315                return ss  # or ss.copy() !!!
316        return None
317
318    def describe(self) -> None:
319        """
320        Provides information about the Disulfide database contained in `self`.
321
322        Example:<br>
323
324        ```python
325        from proteusPy import Load_PDB_SS
326        PDB_SS = Load_PDB_SS(verbose=False, subset=False)
327        PDB_SS.describe()
328             =========== RCSB Disulfide Database Summary ==============
329                 =========== Built: 2024-02-12 17:48:13 ==============
330        PDB IDs present:                    35818
331        Disulfides loaded:                  120494
332        Average structure resolution:       2.34 Ã…
333        Lowest Energy Disulfide:            2q7q_75D_140D
334        Highest Energy Disulfide:           1toz_456A_467A
335        Cα distance cutoff:                 8.00 Å
336        Total RAM Used:                     30.72 GB.
337            ================= proteusPy: 0.91 =======================
338        ```
339        """
340        vers = self.version
341        tot = self.TotalDisulfides
342        pdbs = len(self.SSDict)
343        ram = (
344            sys.getsizeof(self.SSList)
345            + sys.getsizeof(self.SSDict)
346            + sys.getsizeof(self.TorsionDF)
347        ) / (1024 * 1024)
348        res = self.Average_Resolution
349        cutoff = self.cutoff
350        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
351        ssMin, ssMax = self.SSList.minmax_energy
352
353        print(f"    =========== RCSB Disulfide Database Summary ==============")
354        print(f"       =========== Built: {timestr} ==============")
355        print(f"PDB IDs present:                    {pdbs}")
356        print(f"Disulfides loaded:                  {tot}")
357        print(f"Average structure resolution:       {res:.2f} Ã…")
358        print(f"Lowest Energy Disulfide:            {ssMin.name}")
359        print(f"Highest Energy Disulfide:           {ssMax.name}")
360        print(f"Cα distance cutoff:                 {cutoff:.2f} Å")
361        print(f"Total RAM Used:                     {ram:.2f} GB.")
362        print(f"    ================= proteusPy: {vers} =======================")
363
364    def display_overlay(self, pdbid) -> None:
365        """
366        Display all disulfides for a given PDB ID overlaid in stick mode against
367        a common coordinate frame. This allows us to see all of the disulfides
368        at one time in a single view. Colors vary smoothy between bonds.
369
370        :param self: DisulfideLoader object initialized with the database.
371        :param pdbid: the PDB id string, e.g. 4yys
372        :return: None
373
374        Example:
375        >>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList
376
377        Instantiate the Loader with the SS database subset.
378
379        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
380
381        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
382        a common reference (the proximal disulfides).
383
384        >>> PDB_SS.display_overlay('4yys')
385
386        You can also slice the loader and display as an overly.
387        >>> PDB_SS[:8].display_overlay()
388
389        """
390
391        ssbonds = self[pdbid]
392        ssbonds.display_overlay()
393        return
394
395    def getTorsions(self, pdbID=None) -> pd.DataFrame:
396        """
397        Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols
398
399        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
400        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
401            then return the entire dataset.
402        :raises DisulfideParseWarning: Raised if not found
403        :return: Torsions Dataframe
404        :rtype: pd.DataFrame
405
406        Example:
407        >>> from proteusPy import Load_PDB_SS
408        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
409        >>> Tor_DF = PDB_SS.getTorsions()
410        """
411        res_df = pd.DataFrame()
412
413        if pdbID:
414            try:
415                res = self.SSDict[pdbID]
416                sel = self.TorsionDF["source"] == pdbID
417                res_df = self.TorsionDF[sel]
418                return res_df.copy()
419            except KeyError:
420                mess = f"! Cannot find key {pdbID} in SSBond DB"
421                raise DisulfideParseWarning(mess)
422        else:
423            return copy.deepcopy(self.TorsionDF)
424
425    def list_binary_classes(self):
426        for k, v in enumerate(self.classdict):
427            print(f"Class: |{k}|, |{v}|")
428
429    @property
430    def quiet(self) -> bool:
431        """
432        The loader quiet state
433
434        :return: quiet parameter
435        :rtype: bool
436        """
437        return self.QUIET
438
439    @quiet.setter
440    def quiet(self, perm: bool) -> None:
441        """
442        Sets the quiet attribute for the loader. This silences many of the BIO.PDB warnings.
443
444        :param perm: True or False
445        :type perm: bool
446        """
447        self.QUIET = perm
448
449    def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
450        """
451        Plot the total percentage and number of members for each class against the cutoff value.
452
453        :param cutoff: Percent cutoff value for filtering the classes.
454        :return: None
455        """
456
457        import matplotlib.pyplot as plt
458        import numpy as np
459
460        _cutoff = np.linspace(0, cutoff, steps)
461        tot_list = []
462        members_list = []
463
464        for c in _cutoff:
465            class_df = self.tclass.filter_sixclass_by_percentage(c)
466            tot = class_df["percentage"].sum()
467            tot_list.append(tot)
468            members_list.append(class_df.shape[0])
469            print(
470                f"Cutoff: {c:5.3} accounts for {tot:7.2f}% and is {class_df.shape[0]:5} members long."
471            )
472
473        fig, ax1 = plt.subplots()
474
475        ax2 = ax1.twinx()
476        ax1.plot(_cutoff, tot_list, label="Total percentage", color="blue")
477        ax2.plot(_cutoff, members_list, label="Number of members", color="red")
478
479        ax1.set_xlabel("Cutoff")
480        ax1.set_ylabel("Total percentage", color="blue")
481        ax2.set_ylabel("Number of members", color="red")
482
483        plt.show()
484
485    def plot_binary_to_sixclass_incidence(
486        self, light=True, save=False, savedir="."
487    ) -> None:
488        """
489        Plot the incidence of all sextant Disulfide classes for a given binary class.
490
491        :param loader: `proteusPy.DisulfideLoader` object
492        """
493
494        from proteusPy.DisulfideClasses import plot_count_vs_class_df
495
496        def _enumerate_sixclass_fromlist(sslist):
497            x = []
498            y = []
499
500            for sixcls in sslist:
501                if sixcls is not None:
502                    _y = self.tclass.sslist_from_classid(sixcls)
503                    # it's possible to have 0 SS in a class
504                    if _y is not None:
505                        # only append if we have both.
506                        x.append(sixcls)
507                        y.append(len(_y))
508
509            sslist_df = pd.DataFrame(columns=["class_id", "count"])
510            sslist_df["class_id"] = x
511            sslist_df["count"] = y
512            return sslist_df
513
514        clslist = self.tclass.classdf["class_id"]
515        for cls in clslist:
516            sixcls = self.tclass.binary_to_six_class(cls)
517            df = _enumerate_sixclass_fromlist(sixcls)
518            plot_count_vs_class_df(df, cls, theme="light", save=save, savedir=savedir)
519        return
520
521    def enumerate_sixclass_fromlist(self, sslist) -> pd.DataFrame:
522        x = []
523        y = []
524
525        for sixcls in sslist:
526            if sixcls is not None:
527                _y = self.tclass.sslist_from_classid(sixcls)
528                # it's possible to have 0 SS in a class
529                if _y is not None:
530                    # only append if we have both.
531                    x.append(sixcls)
532                    y.append(len(_y))
533
534        sslist_df = pd.DataFrame(columns=["class_id", "count"])
535        sslist_df["class_id"] = x
536        sslist_df["count"] = y
537        return sslist_df
538
539    def save(self, savepath=DATA_DIR, subset=False, cutoff=-1.0):
540        """
541        Save a copy of the fully instantiated Loader to the specified file.
542
543        :param savepath: Path to save the file, defaults to DATA_DIR
544        :param fname: Filename, defaults to LOADER_FNAME
545        :param verbose: Verbosity, defaults to False
546        :param cutoff: Distance cutoff used to build the database, -1 means no cutoff.
547        """
548        self.version = proteusPy.__version__
549        self.cutoff = cutoff
550
551        if subset:
552            fname = LOADER_SUBSET_FNAME
553        else:
554            fname = LOADER_FNAME
555
556        _fname = f"{savepath}{fname}"
557
558        if self.verbose:
559            print(f"-> DisulfideLoader.save(): Writing {_fname}... ")
560
561        with open(_fname, "wb+") as f:
562            pickle.dump(self, f)
563
564        if self.verbose:
565            print(f"-> DisulfideLoader.save(): Done.")
566
567
568# class ends
569
570
def Download_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False):
    """
    Download the loader database files from Google Drive with ``gdown``.

    The full database is always downloaded. The subset database is downloaded
    as well when ``subset`` is True.

    :param loadpath: Directory prefix the files are written to, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to False
    :param subset: Also download the subset database when True.
    """

    import gdown

    full_target = f"{loadpath}{LOADER_FNAME}"
    subset_target = f"{loadpath}{LOADER_SUBSET_FNAME}"

    if verbose:
        print("--> DisulfideLoader: Downloading Disulfide Database from Drive...")
    gdown.download(LOADER_ALL_URL, full_target, quiet=False)

    if not subset:
        return

    if verbose:
        print(
            "--> DisulfideLoader: Downloading Disulfide Subset Database from Drive..."
        )
    gdown.download(LOADER_SUBSET_URL, subset_target, quiet=False)
598
599
def _retrieve_and_check(url, destination, expected_length):
    """
    Download ``url`` to ``destination`` and check the size reported by the server.

    :param url: Address to download.
    :param destination: Local file name to write.
    :param expected_length: Expected size in bytes.
    :return: True when the Content-Length header equals ``expected_length``.
    """
    # Importing the submodule explicitly: a plain ``import urllib`` does not
    # guarantee that ``urllib.request`` is available as an attribute.
    import urllib.request

    _, headers = urllib.request.urlretrieve(url, destination)
    reported = headers.get("content-length")
    try:
        # Header values are strings; comparing the raw value against an int
        # could never succeed.
        num_bytes = int(reported)
    except (TypeError, ValueError):
        # Header missing or not numeric: treated as a size mismatch.
        num_bytes = None

    if num_bytes == expected_length:
        return True
    print(f"--> Read: {reported}, expecting: {expected_length}")
    return False


def Download_PDB_SS_GitHub(loadpath=DATA_DIR, verbose=True, subset=False):
    """
    Download the databases from Github. Note: if you change the database these sizes will
    need to be changed!

    The full database is always downloaded; the subset database is downloaded
    as well when ``subset`` is True. Each download is compared against a
    hard-coded expected size using the server's Content-Length header, and a
    mismatch is printed regardless of ``verbose``.

    :param loadpath: Directory prefix the files are written to, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to True
    :param subset: Also download the subset database when True.
    :return: Number of downloads whose reported size matched (0, 1 or 2).
    """
    _good1 = 0  # all data
    _good2 = 0  # subset data

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"

    _all_length = 340371775
    _subset_length = 9636086

    if verbose:
        print("--> DisulfideLoader: Downloading Disulfide Database from GitHub...")

    if _retrieve_and_check(
        "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_ALL_LOADER.pkl",
        _fname_all,
        _all_length,
    ):
        _good1 = 1

    if subset:
        if verbose:
            print(
                "--> DisulfideLoader: Downloading Disulfide Subset Database from GitHub..."
            )

        if _retrieve_and_check(
            "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_SUBSET_LOADER.pkl",
            _fname_sub,
            _subset_length,
        ):
            _good2 = 1
    return _good1 + _good2
649
650
def Load_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False) -> DisulfideLoader:
    """
    Load the fully instantiated Disulfide database from the specified file. Use the
    defaults unless you are building the database by hand. *This is the function
    used to load the built database.*

    If either loader file is missing from ``loadpath`` it is downloaded first
    with ``Download_PDB_SS``, whichever database was requested.

    :param loadpath: Path from which to load, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to False
    :param subset: If True, load the subset DB, otherwise load the full database
    :return: The unpickled loader object.
    """
    # Imported locally so the function does not depend on ``os`` reaching this
    # module through one of its star imports.
    import os

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"
    _fname = _fname_sub if subset else _fname_all

    # Normally the .pkl files are local, EXCEPT for the first run from a
    # newly-installed proteusPy distribution. In that case both the full and
    # the subset loader files are fetched via Download_PDB_SS.
    if not os.path.exists(_fname_sub):
        Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=True)

    if not os.path.exists(_fname_all):
        Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=False)

    if verbose:
        print(f"-> load_PDB_SS(): Reading {_fname}... ")

    # SECURITY NOTE: pickle.load can execute arbitrary code. The file may have
    # just been downloaded, so it is only as trustworthy as its source.
    with open(_fname, "rb") as f:
        res = pickle.load(f)

    if verbose:
        print(f"-> load_PDB_SS(): Done reading {_fname}... ")
    return res
693
694
if __name__ == "__main__":
    # When executed as a script, run the doctest examples found in this
    # module's docstrings.
    import doctest

    doctest.testmod()
699
700# End of file
class DisulfideLoader:
 48class DisulfideLoader:
 49    """
 50    This class represents the disulfide database itself and is its primary means of accession.
 51    The entirety of the RCSB disulfide database is stored within the class via a
 52    proteusPy.DisulfideList, a ```Pandas``` .csv file, and a ```dict``` of
 53    indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
 54    simple, direct and flexible access to the disulfide structures contained herein.
 55    This makes it possible to access the disulfides by array index, PDB structure ID or disulfide name.
 56
 57    The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the
 58    [display_overlay()](#DisulfideLoader.display_overlay) method. See below for examples.\n
 59
 60    Important note: For typical usage one will access the database via the `Load_PDB_SS()` function.
 61    The difference is that the latter function loads the compressed database from its single
 62    source. the `Load_PDB_SS()` function will load the individual torsions and disulfide .pkl,
 63    builds the classlist structures.
 64
 65    *Developer's Notes:*
 66    The .pkl files needed to instantiate this class and save it into its final .pkl file are
 67    defined in the proteusPy.data class and should not be changed. Upon initialization the class
 68    will load them and initialize itself.
 69
 70    """
 71
 72    def __init__(
 73        self,
 74        verbose: bool = True,
 75        datadir: str = REPO_DATA_DIR,
 76        picklefile: str = SS_PICKLE_FILE,
 77        pickle_dict_file: str = SS_DICT_PICKLE_FILE,
 78        torsion_file: str = SS_TORSIONS_FILE,
 79        quiet: bool = True,
 80        subset: bool = False,
 81        cutoff: float = -1.0,
 82    ) -> None:
 83        """
 84        Initializing the class initiates loading either the entire Disulfide dataset,
 85        or the 'subset', which consists of the first 1000 PDB structures. The subset
 86        is useful for testing and debugging since it doesn't require nearly as much
 87        memory or time. The name for the subset file is hard-coded. One can pass a
 88        different data directory and file names for the pickle files. These different
 89        directories are normally established with the proteusPy.Extract_Disulfides
 90        function.
 91        """
 92
 93        self.ModelDir = datadir
 94        self.PickleFile = f"{datadir}{picklefile}"
 95        self.PickleDictFile = f"{datadir}{pickle_dict_file}"
 96        self.PickleClassFile = f"{datadir}{SS_CLASS_DICT_FILE}"
 97        self.TorsionFile = f"{datadir}{torsion_file}"
 98        self.SSList = DisulfideList([], "ALL_PDB_SS")
 99        self.SSDict = {}
100        self.TorsionDF = pd.DataFrame()
101        self.TotalDisulfides = 0
102        self.IDList = []
103        self.QUIET = quiet
104
105        self.tclass = None  # disulfideClass_constructor to manage classes
106        self.cutoff = cutoff  # distance cutoff used to bulid the database
107        self.verbose = verbose
108        self.timestamp = time.time()
109        self.version = proteusPy.__version__
110
111        idlist = []
112
113        if subset:
114            self.PickleFile = f"{datadir}{SS_SUBSET_PICKLE_FILE}"
115            self.PickleDictFile = f"{datadir}{SS_SUBSET_DICT_PICKLE_FILE}"
116            self.TorsionFile = f"{datadir}{SS_SUBSET_TORSIONS_FILE}"
117
118        if self.verbose:
119            print(
120                f"-> DisulfideLoader(): Reading disulfides from: {self.PickleFile}... ",
121                end="",
122            )
123
124        with open(self.PickleFile, "rb") as f:
125            # sslist = pd.compat.pickle_compat.load(f)
126            sslist = pickle.load(f)
127            self.SSList = sslist
128            self.TotalDisulfides = len(self.SSList)
129
130        if self.verbose:
131            print(
132                f"done.",
133            )
134
135        if self.verbose:
136            print(
137                f"-> DisulfideLoader(): Reading disulfide dict from: {self.PickleDictFile}...",
138                end="",
139            )
140
141        with open(self.PickleDictFile, "rb") as f:
142
143            self.SSDict = pickle.load(f)
144            # self.SSDict = pd.compat.pickle_compat.load(f)
145
146            for key in self.SSDict:
147                idlist.append(key)
148            self.IDList = idlist.copy()
149            totalSS_dict = len(self.IDList)
150
151        if self.verbose:
152            print(f"done.")
153
154        if self.verbose:
155            print(
156                f"-> DisulfideLoader(): Reading Torsion DF from: {self.TorsionFile}...",
157                end="",
158            )
159
160        tmpDF = pd.read_csv(self.TorsionFile)
161        tmpDF.drop(tmpDF.columns[[0]], axis=1, inplace=True)
162
163        self.TorsionDF = tmpDF.copy()
164        self.TotalDisulfides = len(self.SSList)
165
166        if self.verbose:
167            print(f" done.")
168
169        self.tclass = DisulfideClass_Constructor(self, self.verbose)
170
171        if self.verbose:
172            print(f"-> DisulfideLoader(): Loading complete.")
173            self.describe()
174        return
175
176    # overload __getitem__ to handle slicing and indexing, and access by name
177
178    def __getitem__(self, item):
179        """
180        Implements indexing and slicing to retrieve DisulfideList objects from the
181        DisulfideLoader. Supports:
182
183        - Integer indexing to retrieve a single DisulfideList
184        - Slicing to retrieve a subset as a DisulfideList
185        - Lookup by PDB ID to retrieve all Disulfides for that structure
186        - Lookup by full disulfide name
187
188        Raises DisulfideException on invalid indices or names.
189        """
190
191        res = DisulfideList([], "none")
192
193        if isinstance(item, slice):
194            indices = range(*item.indices(len(self.SSList)))
195            name = self.SSList[0].pdb_id
196            resolution = self.SSList[0].resolution
197            sublist = [self.SSList[i] for i in indices]
198            return DisulfideList(sublist, name, resolution)
199
200        if isinstance(item, int):
201            if item < 0 or item >= self.TotalDisulfides:
202                mess = f"DisulfideLoader(): Index {item} out of range 0-{self.TotalDisulfides - 1}"
203                raise DisulfideException(mess)
204            else:
205                return self.SSList[item]
206
207        try:
208            # PDB_SS['4yys'] return a list of SS
209            indices = self.SSDict[item]
210            res = DisulfideList([], item)
211            sslist = self.SSList
212            for ind in indices:
213                res.append(sslist[ind])
214            res.resolution = res[0].resolution
215
216        except KeyError:
217            try:
218                res = self.SSList.get_by_name(item)  # full disulfide name
219            except:
220                mess = f"DisulfideLoader(): Cannot find key {item} in SSBond dict!"
221                raise DisulfideException(mess)
222        return res
223
224    def __setitem__(self, index, item):
225        self.SSList[index] = self._validate_ss(item)
226
227    def _validate_ss(self, value):
228        if isinstance(value, (Disulfide)):
229            return value
230        raise TypeError(f"Disulfide object expected, got {type(value).__name__}")
231
232    @property
233    def Average_Resolution(self) -> float:
234        """
235        Compute and return the average structure resolution for the given list.
236
237        :return: Average resolution (A)
238        """
239        res = 0.0
240        cnt = 1
241        sslist = self.SSList
242
243        for ss in sslist:
244            _res = ss.resolution
245            if _res is not None and res != -1.0:
246                res += _res
247                cnt += 1
248        return res / cnt
249
250    def build_ss_from_idlist(self, idlist):
251        """
252        Given a list of PDBid, return a DisulfideList of Disulfides
253
254        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
255        :return: DisulfideList
256        """
257        res = DisulfideList([], "tmp")
258
259        for id in idlist:
260            for ss in self.SSList:
261                if ss.pdb_id == id:
262                    res.append(ss)
263                    break
264        return res
265
266    def copy(self):
267        """
268        Return a copy of self.
269
270        :return: Copy of self
271        """
272        return copy.deepcopy(self)
273
274    def extract_class(self, clsid) -> DisulfideList:
275        """
276        Return the list of disulfides corresponding to the input `clsid`.
277
278        :param clsid: The class name to extract.
279        :return: The list of disulfide bonds from the class.
280        """
281
282        # from tqdm import tqdm
283        six = self.tclass.sixclass_df
284        tot_classes = six.shape[0]
285        class_disulfides = DisulfideList([], clsid, quiet=True)
286        _pbar = tqdm(six.iterrows(), total=tot_classes, leave=True)
287        for idx, row in _pbar:
288            _cls = row["class_id"]
289            if _cls == clsid:
290                ss_list = row["ss_id"]
291                pbar = tqdm(ss_list, leave=True)
292                for ssid in pbar:
293                    class_disulfides.append(self[ssid])
294                pbar.set_postfix({"Done": ""})
295                break
296
297            _pbar.set_postfix({"Cnt": idx})
298
299        return class_disulfides
300
301    def getlist(self) -> DisulfideList:
302        """
303        Return the list of Disulfides contained in the class.
304
305        :return: DisulfideList
306        :rtype: DisulfideList
307        """
308        return copy.deepcopy(self.SSList)
309
310    def get_by_name(self, name) -> Disulfide:
311        """
312        Returns the Disulfide with the given name from the list.
313        """
314        for ss in self.SSList.data:
315            if ss.name == name:
316                return ss  # or ss.copy() !!!
317        return None
318
319    def describe(self) -> None:
320        """
321        Provides information about the Disulfide database contained in `self`.
322
323        Example:<br>
324
325        ```python
326        from proteusPy import Load_PDB_SS
327        PDB_SS = Load_PDB_SS(verbose=False, subset=False)
328        PDB_SS.describe()
329             =========== RCSB Disulfide Database Summary ==============
330                 =========== Built: 2024-02-12 17:48:13 ==============
331        PDB IDs present:                    35818
332        Disulfides loaded:                  120494
333        Average structure resolution:       2.34 Ã…
334        Lowest Energy Disulfide:            2q7q_75D_140D
335        Highest Energy Disulfide:           1toz_456A_467A
336        Cα distance cutoff:                 8.00 Å
337        Total RAM Used:                     30.72 GB.
338            ================= proteusPy: 0.91 =======================
339        ```
340        """
341        vers = self.version
342        tot = self.TotalDisulfides
343        pdbs = len(self.SSDict)
344        ram = (
345            sys.getsizeof(self.SSList)
346            + sys.getsizeof(self.SSDict)
347            + sys.getsizeof(self.TorsionDF)
348        ) / (1024 * 1024)
349        res = self.Average_Resolution
350        cutoff = self.cutoff
351        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
352        ssMin, ssMax = self.SSList.minmax_energy
353
354        print(f"    =========== RCSB Disulfide Database Summary ==============")
355        print(f"       =========== Built: {timestr} ==============")
356        print(f"PDB IDs present:                    {pdbs}")
357        print(f"Disulfides loaded:                  {tot}")
358        print(f"Average structure resolution:       {res:.2f} Ã…")
359        print(f"Lowest Energy Disulfide:            {ssMin.name}")
360        print(f"Highest Energy Disulfide:           {ssMax.name}")
361        print(f"Cα distance cutoff:                 {cutoff:.2f} Å")
362        print(f"Total RAM Used:                     {ram:.2f} GB.")
363        print(f"    ================= proteusPy: {vers} =======================")
364
365    def display_overlay(self, pdbid) -> None:
366        """
367        Display all disulfides for a given PDB ID overlaid in stick mode against
368        a common coordinate frame. This allows us to see all of the disulfides
369        at one time in a single view. Colors vary smoothy between bonds.
370
371        :param self: DisulfideLoader object initialized with the database.
372        :param pdbid: the PDB id string, e.g. 4yys
373        :return: None
374
375        Example:
376        >>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList
377
378        Instantiate the Loader with the SS database subset.
379
380        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
381
382        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
383        a common reference (the proximal disulfides).
384
385        >>> PDB_SS.display_overlay('4yys')
386
387        You can also slice the loader and display as an overly.
388        >>> PDB_SS[:8].display_overlay()
389
390        """
391
392        ssbonds = self[pdbid]
393        ssbonds.display_overlay()
394        return
395
396    def getTorsions(self, pdbID=None) -> pd.DataFrame:
397        """
398        Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols
399
400        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
401        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
402            then return the entire dataset.
403        :raises DisulfideParseWarning: Raised if not found
404        :return: Torsions Dataframe
405        :rtype: pd.DataFrame
406
407        Example:
408        >>> from proteusPy import Load_PDB_SS
409        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
410        >>> Tor_DF = PDB_SS.getTorsions()
411        """
412        res_df = pd.DataFrame()
413
414        if pdbID:
415            try:
416                res = self.SSDict[pdbID]
417                sel = self.TorsionDF["source"] == pdbID
418                res_df = self.TorsionDF[sel]
419                return res_df.copy()
420            except KeyError:
421                mess = f"! Cannot find key {pdbID} in SSBond DB"
422                raise DisulfideParseWarning(mess)
423        else:
424            return copy.deepcopy(self.TorsionDF)
425
426    def list_binary_classes(self):
427        for k, v in enumerate(self.classdict):
428            print(f"Class: |{k}|, |{v}|")
429
430    @property
431    def quiet(self) -> bool:
432        """
433        The loader quiet state
434
435        :return: quiet parameter
436        :rtype: bool
437        """
438        return self.QUIET
439
440    @quiet.setter
441    def quiet(self, perm: bool) -> None:
442        """
443        Sets the quiet attribute for the loader. This silences many of the BIO.PDB warnings.
444
445        :param perm: True or False
446        :type perm: bool
447        """
448        self.QUIET = perm
449
450    def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
451        """
452        Plot the total percentage and number of members for each class against the cutoff value.
453
454        :param cutoff: Percent cutoff value for filtering the classes.
455        :return: None
456        """
457
458        import matplotlib.pyplot as plt
459        import numpy as np
460
461        _cutoff = np.linspace(0, cutoff, steps)
462        tot_list = []
463        members_list = []
464
465        for c in _cutoff:
466            class_df = self.tclass.filter_sixclass_by_percentage(c)
467            tot = class_df["percentage"].sum()
468            tot_list.append(tot)
469            members_list.append(class_df.shape[0])
470            print(
471                f"Cutoff: {c:5.3} accounts for {tot:7.2f}% and is {class_df.shape[0]:5} members long."
472            )
473
474        fig, ax1 = plt.subplots()
475
476        ax2 = ax1.twinx()
477        ax1.plot(_cutoff, tot_list, label="Total percentage", color="blue")
478        ax2.plot(_cutoff, members_list, label="Number of members", color="red")
479
480        ax1.set_xlabel("Cutoff")
481        ax1.set_ylabel("Total percentage", color="blue")
482        ax2.set_ylabel("Number of members", color="red")
483
484        plt.show()
485
486    def plot_binary_to_sixclass_incidence(
487        self, light=True, save=False, savedir="."
488    ) -> None:
489        """
490        Plot the incidence of all sextant Disulfide classes for a given binary class.
491
492        :param loader: `proteusPy.DisulfideLoader` object
493        """
494
495        from proteusPy.DisulfideClasses import plot_count_vs_class_df
496
497        def _enumerate_sixclass_fromlist(sslist):
498            x = []
499            y = []
500
501            for sixcls in sslist:
502                if sixcls is not None:
503                    _y = self.tclass.sslist_from_classid(sixcls)
504                    # it's possible to have 0 SS in a class
505                    if _y is not None:
506                        # only append if we have both.
507                        x.append(sixcls)
508                        y.append(len(_y))
509
510            sslist_df = pd.DataFrame(columns=["class_id", "count"])
511            sslist_df["class_id"] = x
512            sslist_df["count"] = y
513            return sslist_df
514
515        clslist = self.tclass.classdf["class_id"]
516        for cls in clslist:
517            sixcls = self.tclass.binary_to_six_class(cls)
518            df = _enumerate_sixclass_fromlist(sixcls)
519            plot_count_vs_class_df(df, cls, theme="light", save=save, savedir=savedir)
520        return
521
522    def enumerate_sixclass_fromlist(self, sslist) -> pd.DataFrame:
523        x = []
524        y = []
525
526        for sixcls in sslist:
527            if sixcls is not None:
528                _y = self.tclass.sslist_from_classid(sixcls)
529                # it's possible to have 0 SS in a class
530                if _y is not None:
531                    # only append if we have both.
532                    x.append(sixcls)
533                    y.append(len(_y))
534
535        sslist_df = pd.DataFrame(columns=["class_id", "count"])
536        sslist_df["class_id"] = x
537        sslist_df["count"] = y
538        return sslist_df
539
540    def save(self, savepath=DATA_DIR, subset=False, cutoff=-1.0):
541        """
542        Save a copy of the fully instantiated Loader to the specified file.
543
544        :param savepath: Path to save the file, defaults to DATA_DIR
545        :param fname: Filename, defaults to LOADER_FNAME
546        :param verbose: Verbosity, defaults to False
547        :param cutoff: Distance cutoff used to build the database, -1 means no cutoff.
548        """
549        self.version = proteusPy.__version__
550        self.cutoff = cutoff
551
552        if subset:
553            fname = LOADER_SUBSET_FNAME
554        else:
555            fname = LOADER_FNAME
556
557        _fname = f"{savepath}{fname}"
558
559        if self.verbose:
560            print(f"-> DisulfideLoader.save(): Writing {_fname}... ")
561
562        with open(_fname, "wb+") as f:
563            pickle.dump(self, f)
564
565        if self.verbose:
566            print(f"-> DisulfideLoader.save(): Done.")

This class represents the disulfide database itself and is its primary means of accession. The entirety of the RCSB disulfide database is stored within the class via a proteusPy.DisulfideList, a Pandas .csv file, and a dict of indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow simple, direct and flexible access to the disulfide structures contained herein. This makes it possible to access the disulfides by array index, PDB structure ID or disulfide name.

The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the display_overlay() method. See below for examples.

Important note: For typical usage one will access the database via the Load_PDB_SS() function, which loads the compressed database from its single source file. Instantiating DisulfideLoader directly instead loads the individual disulfide .pkl files and the torsions .csv file, and then builds the classlist structures.

Developer's Notes: The .pkl files needed to instantiate this class and save it into its final .pkl file are defined in the proteusPy.data class and should not be changed. Upon initialization the class will load them and initialize itself.

DisulfideLoader( verbose: bool = True, datadir: str = '/Users/egs/repos/proteusPy/proteusPy/data/', picklefile: str = 'PDB_all_ss.pkl', pickle_dict_file: str = 'PDB_all_ss_dict.pkl', torsion_file: str = 'PDB_all_ss_torsions.csv', quiet: bool = True, subset: bool = False, cutoff: float = -1.0)
 72    def __init__(
 73        self,
 74        verbose: bool = True,
 75        datadir: str = REPO_DATA_DIR,
 76        picklefile: str = SS_PICKLE_FILE,
 77        pickle_dict_file: str = SS_DICT_PICKLE_FILE,
 78        torsion_file: str = SS_TORSIONS_FILE,
 79        quiet: bool = True,
 80        subset: bool = False,
 81        cutoff: float = -1.0,
 82    ) -> None:
 83        """
 84        Initializing the class initiates loading either the entire Disulfide dataset,
 85        or the 'subset', which consists of the first 1000 PDB structures. The subset
 86        is useful for testing and debugging since it doesn't require nearly as much
 87        memory or time. The name for the subset file is hard-coded. One can pass a
 88        different data directory and file names for the pickle files. These different
 89        directories are normally established with the proteusPy.Extract_Disulfides
 90        function.
 91        """
 92
 93        self.ModelDir = datadir
 94        self.PickleFile = f"{datadir}{picklefile}"
 95        self.PickleDictFile = f"{datadir}{pickle_dict_file}"
 96        self.PickleClassFile = f"{datadir}{SS_CLASS_DICT_FILE}"
 97        self.TorsionFile = f"{datadir}{torsion_file}"
 98        self.SSList = DisulfideList([], "ALL_PDB_SS")
 99        self.SSDict = {}
100        self.TorsionDF = pd.DataFrame()
101        self.TotalDisulfides = 0
102        self.IDList = []
103        self.QUIET = quiet
104
105        self.tclass = None  # disulfideClass_constructor to manage classes
106        self.cutoff = cutoff  # distance cutoff used to bulid the database
107        self.verbose = verbose
108        self.timestamp = time.time()
109        self.version = proteusPy.__version__
110
111        idlist = []
112
113        if subset:
114            self.PickleFile = f"{datadir}{SS_SUBSET_PICKLE_FILE}"
115            self.PickleDictFile = f"{datadir}{SS_SUBSET_DICT_PICKLE_FILE}"
116            self.TorsionFile = f"{datadir}{SS_SUBSET_TORSIONS_FILE}"
117
118        if self.verbose:
119            print(
120                f"-> DisulfideLoader(): Reading disulfides from: {self.PickleFile}... ",
121                end="",
122            )
123
124        with open(self.PickleFile, "rb") as f:
125            # sslist = pd.compat.pickle_compat.load(f)
126            sslist = pickle.load(f)
127            self.SSList = sslist
128            self.TotalDisulfides = len(self.SSList)
129
130        if self.verbose:
131            print(
132                f"done.",
133            )
134
135        if self.verbose:
136            print(
137                f"-> DisulfideLoader(): Reading disulfide dict from: {self.PickleDictFile}...",
138                end="",
139            )
140
141        with open(self.PickleDictFile, "rb") as f:
142
143            self.SSDict = pickle.load(f)
144            # self.SSDict = pd.compat.pickle_compat.load(f)
145
146            for key in self.SSDict:
147                idlist.append(key)
148            self.IDList = idlist.copy()
149            totalSS_dict = len(self.IDList)
150
151        if self.verbose:
152            print(f"done.")
153
154        if self.verbose:
155            print(
156                f"-> DisulfideLoader(): Reading Torsion DF from: {self.TorsionFile}...",
157                end="",
158            )
159
160        tmpDF = pd.read_csv(self.TorsionFile)
161        tmpDF.drop(tmpDF.columns[[0]], axis=1, inplace=True)
162
163        self.TorsionDF = tmpDF.copy()
164        self.TotalDisulfides = len(self.SSList)
165
166        if self.verbose:
167            print(f" done.")
168
169        self.tclass = DisulfideClass_Constructor(self, self.verbose)
170
171        if self.verbose:
172            print(f"-> DisulfideLoader(): Loading complete.")
173            self.describe()
174        return

Initializing the class initiates loading either the entire Disulfide dataset, or the 'subset', which consists of the first 1000 PDB structures. The subset is useful for testing and debugging since it doesn't require nearly as much memory or time. The name for the subset file is hard-coded. One can pass a different data directory and file names for the pickle files. These different directories are normally established with the proteusPy.Extract_Disulfides function.

ModelDir
PickleFile
PickleDictFile
PickleClassFile
TorsionFile
SSList
SSDict
TorsionDF
TotalDisulfides
IDList
QUIET
tclass
cutoff
verbose
timestamp
version
Average_Resolution: float
232    @property
233    def Average_Resolution(self) -> float:
234        """
235        Compute and return the average structure resolution for the given list.
236
237        :return: Average resolution (A)
238        """
239        res = 0.0
240        cnt = 1
241        sslist = self.SSList
242
243        for ss in sslist:
244            _res = ss.resolution
245            if _res is not None and res != -1.0:
246                res += _res
247                cnt += 1
248        return res / cnt

Compute and return the average structure resolution for the given list.

Returns

Average resolution (A)

def build_ss_from_idlist(self, idlist):
250    def build_ss_from_idlist(self, idlist):
251        """
252        Given a list of PDBid, return a DisulfideList of Disulfides
253
254        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
255        :return: DisulfideList
256        """
257        res = DisulfideList([], "tmp")
258
259        for id in idlist:
260            for ss in self.SSList:
261                if ss.pdb_id == id:
262                    res.append(ss)
263                    break
264        return res

Given a list of PDBid, return a DisulfideList of Disulfides

Parameters
  • idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
Returns

DisulfideList

def copy(self):
266    def copy(self):
267        """
268        Return a copy of self.
269
270        :return: Copy of self
271        """
272        return copy.deepcopy(self)

Return a copy of self.

Returns

Copy of self

def extract_class(self, clsid) -> proteusPy.DisulfideList.DisulfideList:
274    def extract_class(self, clsid) -> DisulfideList:
275        """
276        Return the list of disulfides corresponding to the input `clsid`.
277
278        :param clsid: The class name to extract.
279        :return: The list of disulfide bonds from the class.
280        """
281
282        # from tqdm import tqdm
283        six = self.tclass.sixclass_df
284        tot_classes = six.shape[0]
285        class_disulfides = DisulfideList([], clsid, quiet=True)
286        _pbar = tqdm(six.iterrows(), total=tot_classes, leave=True)
287        for idx, row in _pbar:
288            _cls = row["class_id"]
289            if _cls == clsid:
290                ss_list = row["ss_id"]
291                pbar = tqdm(ss_list, leave=True)
292                for ssid in pbar:
293                    class_disulfides.append(self[ssid])
294                pbar.set_postfix({"Done": ""})
295                break
296
297            _pbar.set_postfix({"Cnt": idx})
298
299        return class_disulfides

Return the list of disulfides corresponding to the input clsid.

Parameters
  • clsid: The class name to extract.
Returns

The list of disulfide bonds from the class.

def getlist(self) -> proteusPy.DisulfideList.DisulfideList:
301    def getlist(self) -> DisulfideList:
302        """
303        Return the list of Disulfides contained in the class.
304
305        :return: DisulfideList
306        :rtype: DisulfideList
307        """
308        return copy.deepcopy(self.SSList)

Return the list of Disulfides contained in the class.

Returns

DisulfideList

def get_by_name(self, name) -> proteusPy.Disulfide.Disulfide:
310    def get_by_name(self, name) -> Disulfide:
311        """
312        Returns the Disulfide with the given name from the list.
313        """
314        for ss in self.SSList.data:
315            if ss.name == name:
316                return ss  # or ss.copy() !!!
317        return None

Returns the Disulfide with the given name from the list.

def describe(self) -> None:
319    def describe(self) -> None:
320        """
321        Provides information about the Disulfide database contained in `self`.
322
323        Example:<br>
324
325        ```python
326        from proteusPy import Load_PDB_SS
327        PDB_SS = Load_PDB_SS(verbose=False, subset=False)
328        PDB_SS.describe()
329             =========== RCSB Disulfide Database Summary ==============
330                 =========== Built: 2024-02-12 17:48:13 ==============
331        PDB IDs present:                    35818
332        Disulfides loaded:                  120494
333        Average structure resolution:       2.34 Ã…
334        Lowest Energy Disulfide:            2q7q_75D_140D
335        Highest Energy Disulfide:           1toz_456A_467A
336        Cα distance cutoff:                 8.00 Å
337        Total RAM Used:                     30.72 GB.
338            ================= proteusPy: 0.91 =======================
339        ```
340        """
341        vers = self.version
342        tot = self.TotalDisulfides
343        pdbs = len(self.SSDict)
344        ram = (
345            sys.getsizeof(self.SSList)
346            + sys.getsizeof(self.SSDict)
347            + sys.getsizeof(self.TorsionDF)
348        ) / (1024 * 1024)
349        res = self.Average_Resolution
350        cutoff = self.cutoff
351        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
352        ssMin, ssMax = self.SSList.minmax_energy
353
354        print(f"    =========== RCSB Disulfide Database Summary ==============")
355        print(f"       =========== Built: {timestr} ==============")
356        print(f"PDB IDs present:                    {pdbs}")
357        print(f"Disulfides loaded:                  {tot}")
358        print(f"Average structure resolution:       {res:.2f} Ã…")
359        print(f"Lowest Energy Disulfide:            {ssMin.name}")
360        print(f"Highest Energy Disulfide:           {ssMax.name}")
361        print(f"Cα distance cutoff:                 {cutoff:.2f} Å")
362        print(f"Total RAM Used:                     {ram:.2f} GB.")
363        print(f"    ================= proteusPy: {vers} =======================")

Provides information about the Disulfide database contained in self.

Example:

from proteusPy import Load_PDB_SS
PDB_SS = Load_PDB_SS(verbose=False, subset=False)
PDB_SS.describe()
     =========== RCSB Disulfide Database Summary ==============
         =========== Built: 2024-02-12 17:48:13 ==============
PDB IDs present:                    35818
Disulfides loaded:                  120494
Average structure resolution:       2.34 Å
Lowest Energy Disulfide:            2q7q_75D_140D
Highest Energy Disulfide:           1toz_456A_467A
Cα distance cutoff:                 8.00 Å
Total RAM Used:                     30.72 GB.
    ================= proteusPy: 0.91 =======================
def display_overlay(self, pdbid) -> None:
365    def display_overlay(self, pdbid) -> None:
366        """
367        Display all disulfides for a given PDB ID overlaid in stick mode against
368        a common coordinate frame. This allows us to see all of the disulfides
369        at one time in a single view. Colors vary smoothy between bonds.
370
371        :param self: DisulfideLoader object initialized with the database.
372        :param pdbid: the PDB id string, e.g. 4yys
373        :return: None
374
375        Example:
376        >>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList
377
378        Instantiate the Loader with the SS database subset.
379
380        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
381
382        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
383        a common reference (the proximal disulfides).
384
385        >>> PDB_SS.display_overlay('4yys')
386
387        You can also slice the loader and display as an overly.
388        >>> PDB_SS[:8].display_overlay()
389
390        """
391
392        ssbonds = self[pdbid]
393        ssbonds.display_overlay()
394        return

Display all disulfides for a given PDB ID overlaid in stick mode against a common coordinate frame. This allows us to see all of the disulfides at one time in a single view. Colors vary smoothly between bonds.

Parameters
  • self: DisulfideLoader object initialized with the database.
  • pdbid: the PDB id string, e.g. 4yys
Returns

None

Example:

>>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList

Instantiate the Loader with the SS database subset.

>>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)

Display the Disulfides from the PDB ID 4yys, overlaid onto a common reference (the proximal disulfides).

>>> PDB_SS.display_overlay('4yys')

You can also slice the loader and display as an overlay.

>>> PDB_SS[:8].display_overlay()
def getTorsions(self, pdbID=None) -> pandas.core.frame.DataFrame:
396    def getTorsions(self, pdbID=None) -> pd.DataFrame:
397        """
398        Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols
399
400        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
401        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
402            then return the entire dataset.
403        :raises DisulfideParseWarning: Raised if not found
404        :return: Torsions Dataframe
405        :rtype: pd.DataFrame
406
407        Example:
408        >>> from proteusPy import Load_PDB_SS
409        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
410        >>> Tor_DF = PDB_SS.getTorsions()
411        """
412        res_df = pd.DataFrame()
413
414        if pdbID:
415            try:
416                res = self.SSDict[pdbID]
417                sel = self.TorsionDF["source"] == pdbID
418                res_df = self.TorsionDF[sel]
419                return res_df.copy()
420            except KeyError:
421                mess = f"! Cannot find key {pdbID} in SSBond DB"
422                raise DisulfideParseWarning(mess)
423        else:
424            return copy.deepcopy(self.TorsionDF)

Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols

Parameters
  • pdbID: pdbID, defaults to None, meaning return entire dataset.
Raises
  • DisulfideParseWarning: Raised if not found
Returns

Torsions Dataframe

Example:

>>> from proteusPy import Load_PDB_SS
>>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
>>> Tor_DF = PDB_SS.getTorsions()
def list_binary_classes(self):
426    def list_binary_classes(self):
427        for k, v in enumerate(self.classdict):
428            print(f"Class: |{k}|, |{v}|")
quiet: bool
430    @property
431    def quiet(self) -> bool:
432        """
433        The loader quiet state
434
435        :return: quiet parameter
436        :rtype: bool
437        """
438        return self.QUIET

The loader quiet state

Returns

quiet parameter

def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
450    def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
451        """
452        Plot the total percentage and number of members for each class against the cutoff value.
453
454        :param cutoff: Percent cutoff value for filtering the classes.
455        :return: None
456        """
457
458        import matplotlib.pyplot as plt
459        import numpy as np
460
461        _cutoff = np.linspace(0, cutoff, steps)
462        tot_list = []
463        members_list = []
464
465        for c in _cutoff:
466            class_df = self.tclass.filter_sixclass_by_percentage(c)
467            tot = class_df["percentage"].sum()
468            tot_list.append(tot)
469            members_list.append(class_df.shape[0])
470            print(
471                f"Cutoff: {c:5.3} accounts for {tot:7.2f}% and is {class_df.shape[0]:5} members long."
472            )
473
474        fig, ax1 = plt.subplots()
475
476        ax2 = ax1.twinx()
477        ax1.plot(_cutoff, tot_list, label="Total percentage", color="blue")
478        ax2.plot(_cutoff, members_list, label="Number of members", color="red")
479
480        ax1.set_xlabel("Cutoff")
481        ax1.set_ylabel("Total percentage", color="blue")
482        ax2.set_ylabel("Number of members", color="red")
483
484        plt.show()

Plot the total percentage and number of members for each class against the cutoff value.

Parameters
  • cutoff: Percent cutoff value for filtering the classes.
Returns

None

def plot_binary_to_sixclass_incidence(self, light=True, save=False, savedir='.') -> None:
486    def plot_binary_to_sixclass_incidence(
487        self, light=True, save=False, savedir="."
488    ) -> None:
489        """
490        Plot the incidence of all sextant Disulfide classes for a given binary class.
491
492        :param loader: `proteusPy.DisulfideLoader` object
493        """
494
495        from proteusPy.DisulfideClasses import plot_count_vs_class_df
496
497        def _enumerate_sixclass_fromlist(sslist):
498            x = []
499            y = []
500
501            for sixcls in sslist:
502                if sixcls is not None:
503                    _y = self.tclass.sslist_from_classid(sixcls)
504                    # it's possible to have 0 SS in a class
505                    if _y is not None:
506                        # only append if we have both.
507                        x.append(sixcls)
508                        y.append(len(_y))
509
510            sslist_df = pd.DataFrame(columns=["class_id", "count"])
511            sslist_df["class_id"] = x
512            sslist_df["count"] = y
513            return sslist_df
514
515        clslist = self.tclass.classdf["class_id"]
516        for cls in clslist:
517            sixcls = self.tclass.binary_to_six_class(cls)
518            df = _enumerate_sixclass_fromlist(sixcls)
519            plot_count_vs_class_df(df, cls, theme="light", save=save, savedir=savedir)
520        return

Plot the incidence of all sextant Disulfide classes for a given binary class.

Parameters
def enumerate_sixclass_fromlist(self, sslist) -> pandas.core.frame.DataFrame:
522    def enumerate_sixclass_fromlist(self, sslist) -> pd.DataFrame:
523        x = []
524        y = []
525
526        for sixcls in sslist:
527            if sixcls is not None:
528                _y = self.tclass.sslist_from_classid(sixcls)
529                # it's possible to have 0 SS in a class
530                if _y is not None:
531                    # only append if we have both.
532                    x.append(sixcls)
533                    y.append(len(_y))
534
535        sslist_df = pd.DataFrame(columns=["class_id", "count"])
536        sslist_df["class_id"] = x
537        sslist_df["count"] = y
538        return sslist_df
def save( self, savepath='/Users/egs/repos/proteusPy/proteusPy/data/', subset=False, cutoff=-1.0):
540    def save(self, savepath=DATA_DIR, subset=False, cutoff=-1.0):
541        """
542        Save a copy of the fully instantiated Loader to the specified file.
543
544        :param savepath: Path to save the file, defaults to DATA_DIR
545        :param fname: Filename, defaults to LOADER_FNAME
546        :param verbose: Verbosity, defaults to False
547        :param cutoff: Distance cutoff used to build the database, -1 means no cutoff.
548        """
549        self.version = proteusPy.__version__
550        self.cutoff = cutoff
551
552        if subset:
553            fname = LOADER_SUBSET_FNAME
554        else:
555            fname = LOADER_FNAME
556
557        _fname = f"{savepath}{fname}"
558
559        if self.verbose:
560            print(f"-> DisulfideLoader.save(): Writing {_fname}... ")
561
562        with open(_fname, "wb+") as f:
563            pickle.dump(self, f)
564
565        if self.verbose:
566            print(f"-> DisulfideLoader.save(): Done.")

Save a copy of the fully instantiated Loader to the specified file.

Parameters
  • savepath: Path to save the file, defaults to DATA_DIR
  • fname: Filename, defaults to LOADER_FNAME
  • verbose: Verbosity, defaults to False
  • cutoff: Distance cutoff used to build the database, -1 means no cutoff.
def Download_PDB_SS( loadpath='/Users/egs/repos/proteusPy/proteusPy/data/', verbose=False, subset=False):
def Download_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False):
    """
    Download the databases from my Google Drive.

    The full database (LOADER_ALL_URL) is always fetched. The subset database
    (LOADER_SUBSET_URL) is fetched in addition when ``subset`` is True.

    :param loadpath: Directory to download into, defaults to DATA_DIR. File names
        are appended to it verbatim, so it must end with a path separator.
    :param verbose: Verbosity, defaults to False
    :param subset: If True also download the subset database, defaults to False
    :return: None
    """

    import gdown

    subset_target = f"{loadpath}{LOADER_SUBSET_FNAME}"
    full_target = f"{loadpath}{LOADER_FNAME}"

    if verbose:
        print("--> DisulfideLoader: Downloading Disulfide Database from Drive...")

    gdown.download(LOADER_ALL_URL, full_target, quiet=False)

    if not subset:
        return

    if verbose:
        print(
            "--> DisulfideLoader: Downloading Disulfide Subset Database from Drive..."
        )

    gdown.download(LOADER_SUBSET_URL, subset_target, quiet=False)

Download the databases from my Google Drive.

Parameters
  • loadpath: Path from which to load, defaults to DATA_DIR
  • verbose: Verbosity, defaults to False
def Download_PDB_SS_GitHub( loadpath='/Users/egs/repos/proteusPy/proteusPy/data/', verbose=True, subset=False):
def Download_PDB_SS_GitHub(loadpath=DATA_DIR, verbose=True, subset=False):
    """
    Download the databases from GitHub. Note: if you change the database these sizes will
    need to be changed!

    The full database is always downloaded; the subset database is downloaded
    as well when ``subset`` is True. After each download the ``Content-Length``
    response header is compared with the expected file size, and a message is
    printed when they differ or the header is absent.

    :param loadpath: Directory to download into, defaults to DATA_DIR. File names
        are appended to it verbatim, so it must end with a path separator.
    :param verbose: Verbosity, defaults to True
    :param subset: If True also download the subset database, defaults to False
    :return: Number of downloads whose reported size matched the expected size
        (0, 1 or 2).
    """

    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    import urllib.request

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"

    _all_length = 340371775
    _subset_length = 9636086

    def _fetch(url, dest, expected_length):
        """Download url to dest; return 1 if the reported size matches, else 0."""
        _, headers = urllib.request.urlretrieve(url, dest)
        num_bytes = headers.get("content-length")
        # Header values are strings (or None when missing); convert before
        # comparing, otherwise the check against an int can never succeed.
        try:
            matched = int(num_bytes) == expected_length
        except (TypeError, ValueError):
            matched = False

        if matched:
            return 1
        print(f"--> Read: {num_bytes}, expecting: {expected_length}")
        return 0

    if verbose:
        print("--> DisulfideLoader: Downloading Disulfide Database from GitHub...")

    good = _fetch(
        "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_ALL_LOADER.pkl",
        _fname_all,
        _all_length,
    )

    if subset:
        if verbose:
            print(
                "--> DisulfideLoader: Downloading Disulfide Subset Database from GitHub..."
            )

        good += _fetch(
            "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_SUBSET_LOADER.pkl",
            _fname_sub,
            _subset_length,
        )

    return good

Download the databases from GitHub. Note: if you change the database these sizes will need to be changed!

Parameters
  • loadpath: Path from which to load, defaults to DATA_DIR
  • verbose: Verbosity, defaults to True
def Load_PDB_SS( loadpath='/Users/egs/repos/proteusPy/proteusPy/data/', verbose=False, subset=False) -> DisulfideLoader:
def Load_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False) -> DisulfideLoader:
    """
    Load the fully instantiated Disulfide database from the specified file. Use the
    defaults unless you are building the database by hand. *This is the function
    used to load the built database.*

    If either the subset or the full loader file is missing under ``loadpath``
    it is first fetched with ``Download_PDB_SS``; the requested file is then
    unpickled and returned.

    :param loadpath: Directory from which to load, defaults to DATA_DIR. File names
        are appended to it verbatim, so it must end with a path separator.
    :param verbose: Verbosity, defaults to False
    :param subset: If True, load the subset DB, otherwise load the full database
    :return: The unpickled loader object.
    """
    subset_path = f"{loadpath}{LOADER_SUBSET_FNAME}"
    full_path = f"{loadpath}{LOADER_FNAME}"
    wanted_path = subset_path if subset else full_path

    # Normally the .pkl files are local, EXCEPT for the first run from a
    # newly-installed proteusPy distribution. In that case the missing files
    # are downloaded. The subset file is checked first, then the full one.
    for path, want_subset in ((subset_path, True), (full_path, False)):
        if not os.path.exists(path):
            Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=want_subset)

    if verbose:
        print(f"-> load_PDB_SS(): Reading {wanted_path}... ")

    with open(wanted_path, "rb") as infile:
        # NOTE: unpickling executes code embedded in the file; only load
        # database files obtained from a trusted source.
        loader = pickle.load(infile)

    if verbose:
        print(f"-> load_PDB_SS(): Done reading {wanted_path}... ")
    return loader

Load the fully instantiated Disulfide database from the specified file. Use the defaults unless you are building the database by hand. This is the function used to load the built database.

Parameters
  • loadpath: Path from which to load, defaults to DATA_DIR
  • verbose: Verbosity, defaults to False
  • subset: If True, load the subset DB, otherwise load the full database