proteusPy.DisulfideLoader

This module is part of the proteusPy package, a Python package for the analysis and modeling of protein structures, with an emphasis on disulfide bonds. This work is based on the original C/C++ implementation by Eric G. Suchanek.

Last revision: 2025-03-26 13:14:09 -egs-

   1"""
   2This module is part of the proteusPy package, a Python package for
   3the analysis and modeling of protein structures, with an emphasis on disulfide bonds.
   4This work is based on the original C/C++ implementation by Eric G. Suchanek. \n
   5
   6Last revision: 2025-03-26 13:14:09 -egs-
   7"""
   8
   9# Cα N, Cα, Cβ, C', Sγ Å ° ρ
  10
  11# pylint: disable=C0301
  12# pylint: disable=C0302
  13# pylint: disable=W1203
  14# pylint: disable=C0103
  15# pylint: disable=W0612
  16# pylint: disable=R1702
  17
  18# Cα N, Cα, Cβ, C', Sγ Å ° ρ
  19
  20import copy
  21import pickle
  22import time
  23from dataclasses import dataclass, field
  24from pathlib import Path
  25from typing import Any, Dict, List, Optional, Tuple
  26
  27import gdown
  28import pandas as pd
  29import plotly_express as px
  30from pympler import asizeof
  31
  32from proteusPy import __version__
  33from proteusPy.DisulfideBase import Disulfide, DisulfideList
  34from proteusPy.DisulfideClassGenerator import DisulfideClassGenerator
  35from proteusPy.DisulfideClassManager import DisulfideClassManager
  36from proteusPy.DisulfideExceptions import DisulfideParseWarning
  37from proteusPy.DisulfideStats import DisulfideStats
  38from proteusPy.DisulfideVisualization import DisulfideVisualization
  39from proteusPy.logger_config import create_logger
  40from proteusPy.ProteusGlobals import (
  41    DATA_DIR,
  42    LOADER_FNAME,
  43    LOADER_SUBSET_FNAME,
  44    SS_LIST_URL,
  45    SS_PICKLE_FILE,
  46)
  47
  48_logger = create_logger(__name__)
  49
# Pick the tqdm flavour that matches the running front end.
try:
    # Check if running in Jupyter
    # NOTE(review): get_ipython is presumably the helper IPython injects into
    # the interpreter namespace; outside IPython the name is undefined and the
    # lookup raises NameError, which is handled below.
    shell = get_ipython().__class__.__name__  # type: ignore
    if shell == "ZMQInteractiveShell":
        # Shell class reported by notebook kernels: use the notebook progress bar.
        from tqdm.notebook import tqdm
    else:
        # Any other IPython shell: use the standard progress bar.
        from tqdm import tqdm
except NameError:
    # get_ipython is not defined: use the standard progress bar.
    from tqdm import tqdm
  59
  60
  61@dataclass
  62class DisulfideLoader:
  63    """
  64    This class represents the disulfide database itself and is its primary means of accession.
  65    The entirety of the RCSB disulfide database is stored within the class via a
  66    proteusPy.DisulfideList, a ```Pandas``` .csv file, and a ```dict``` of
  67    indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
  68    simple, direct and flexible access to the disulfide structures contained herein.
  69    This makes it possible to access the disulfides by array index, PDB structure ID, disulfide
  70    name and class ID.
  71
  72    The class also provides methods for plotting distance and angle deviations
  73    as well as torsion statistics for the disulfides in the database.
  74
  75    The class can also render Disulfides overlaid on a common coordinate system to a pyVista
  76    window using the [display_overlay()](#DisulfideLoader.display_overlay) method. See below for examples.
  77
  78    Important note: For typical usage one will access the database via the `Load_PDB_SS()` function.
  79    The difference is that the latter function loads the compressed database from its single
  80    source. The `DisulfideLoader` class is used to build the Disulfide database with a
  81    specific cutoff, or for saving the database to a file.
  82
  83    Cutoff values of -1.0 indicate imposing no cutoffs on the data.
  84
  85    :param verbose: Flag to control output verbosity
  86    :type verbose: bool
  87    :param datadir: Directory containing data files
  88    :type datadir: str
  89    :param picklefile: Name of the pickle file containing disulfide data
  90    :type picklefile: str
  91    :param quiet: Flag to suppress warnings
  92    :type quiet: bool
  93    :param subset: Flag to load only a subset of data
  94    :type subset: bool
  95    :param cutoff: Distance cutoff, (A) for filtering disulfides. Defaults to -1.0.
  96    :type cutoff: float
  97    :param sg_cutoff: SG distance cutoff, (A) for filtering disulfides. Defaults to -1.0.
  98    :type sg_cutoff: float
  99    :param percentile: Percentile cutoff for filtering disulfides. Must be between 0 and 100.
 100    Filters based on statistical cutoffs derived from the data.
 101    :type percentile: float
 102    :param minimum: Minimum atom distance for filtering disulfides. -1 is no filtering.
 103    :type minimum: float
 104    :param saveit: Flag to save the Loader to a file
 105    :type save: bool
 106    """
 107
 108    # Fields that serve as both instance attributes and initialization parameters
 109    datadir: str = field(default=DATA_DIR)
 110    picklefile: str = field(default=SS_PICKLE_FILE)
 111    subset: bool = field(default=False)
 112    cutoff: float = field(default=-1.0)
 113    sg_cutoff: float = field(default=-1.0)
 114    verbose: bool = field(default=False)
 115    percentile: float = field(default=-1.0)
 116    quiet: bool = field(default=False)
 117    minimum: float = field(default=-1.0)
 118    saveit: bool = field(default=False)
 119
 120    # Fields that are only used internally and don't need to be initialization parameters
 121    SSList: DisulfideList = field(
 122        default_factory=lambda: DisulfideList([], "ALL_PDB_SS"), init=False
 123    )
 124    SSDict: Dict = field(default_factory=dict, init=False)
 125    TorsionDF: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)
 126    TotalDisulfides: int = field(default=0, init=False)
 127    IDList: List = field(default_factory=list, init=False)
 128    tclass: Optional[DisulfideClassManager] = field(default=None, init=False)
 129    class_generator: Optional[DisulfideClassGenerator] = field(default=None, init=False)
 130    timestamp: float = field(default_factory=time.time, init=False)
 131    version: str = field(default=__version__, init=False)
 132
    def __post_init__(self) -> None:
        """
        Initialize the DisulfideLoader after dataclass initialization.
        This method handles loading and processing of the disulfide data.

        Steps, in order:

        1. If ``datadir/picklefile`` does not exist and the master list
           ``DATA_DIR/SS_PICKLE_FILE`` is also missing, download the master
           list from ``SS_LIST_URL`` with ``gdown``.
        2. Unpickle the disulfide list from ``datadir/picklefile``.
        3. If ``percentile`` is positive, overwrite ``cutoff`` and
           ``sg_cutoff`` with the values returned by
           ``DisulfideStats.calculate_cutoff_from_percentile``.
        4. Filter the list by the Cα cutoff, then by the Sγ cutoff.
        5. Populate ``SSList`` (first 5000 entries only when ``subset`` is
           set), ``SSDict``, ``IDList``, ``TorsionDF``, ``TotalDisulfides``,
           ``tclass`` and ``class_generator``.
        6. When ``verbose`` is set, log completion and call ``describe()``.
        7. When ``saveit`` is set, call ``save()`` with ``DATA_DIR``.

        :raises ValueError: If ``percentile`` is greater than 100.
        :raises FileNotFoundError: If the pickle file cannot be opened.
        :raises Exception: Any other error raised while loading is logged and
            re-raised.
        """

        cutoffs = {}
        old_length = new_length = 0
        full_path = Path(self.datadir) / self.picklefile

        if self.verbose and not self.quiet:
            _logger.info(
                f"Reading disulfides from: {full_path}... ",
            )

        try:
            # Check if the file exists before attempting to open it
            if not full_path.exists():
                fname = SS_PICKLE_FILE
                url = SS_LIST_URL

                # NOTE(review): the download target is DATA_DIR/SS_PICKLE_FILE,
                # while the open() below reads datadir/picklefile. The two
                # paths coincide only when the default field values are used.
                _fname = Path(DATA_DIR) / fname

                if not _fname.exists():
                    if self.verbose:
                        _logger.info(
                            "Master SS list unavailable. Downloading Disulfide Database from Drive..."
                        )
                    gdown.download(url, str(_fname), quiet=False)

            with open(full_path, "rb") as f:
                # NOTE(review): pickle.load can execute arbitrary code; the
                # file (possibly just downloaded) must come from a trusted
                # source.
                sslist = pickle.load(f)

                # A positive percentile overrides any explicit cutoff values.
                if self.percentile > 0.0:
                    # NOTE(review): this ValueError is raised inside the try
                    # block, so the generic handler below logs it as a file
                    # loading error before re-raising it.
                    if self.percentile > 100.0:
                        raise ValueError("Percentile must be between 0 and 100.")

                    cutoffs = DisulfideStats.calculate_cutoff_from_percentile(
                        sslist, percentile=self.percentile, verbose=self.verbose
                    )

                    ca_cutoff = cutoffs["ca_cutoff_percentile"]
                    sg_cutoff = cutoffs["sg_cutoff_percentile"]
                    self.cutoff = ca_cutoff
                    self.sg_cutoff = sg_cutoff

                    if self.verbose:
                        _logger.info(
                            f"Using percentile cutoffs: {ca_cutoff:.2f}, {sg_cutoff:.2f}"
                        )

                # First pass: Cα distance filter.
                # NOTE(review): minimum is hard-coded to -1.0 here and in the
                # Sγ pass below; the ``minimum`` field is never read in this
                # method. Confirm whether self.minimum was intended.
                old_length = len(sslist)
                filt = sslist.filter_by_distance(
                    distance=self.cutoff, distance_type="ca", minimum=-1.0
                )
                filt = DisulfideList(
                    filt,
                    f"filtered by Ca cutoff:{self.cutoff:.2f}, Sg cutoff: {self.sg_cutoff:.2f}",
                )

                new_length = len(filt)

                if self.verbose:
                    _logger.info(
                        "Filtered with Cα cutoff %.2f: old: %d, new: %d",
                        self.cutoff,
                        old_length,
                        new_length,
                    )

                # Second pass: Sγ distance filter, applied to the Cα survivors.
                old_length = new_length
                filt = filt.filter_by_distance(
                    distance=self.sg_cutoff, distance_type="sg", minimum=-1.0
                )
                new_length = len(filt)

                if self.verbose:
                    _logger.info(
                        "Filtered with Sγ: cutoff %.2f: old: %d, new: %d",
                        self.sg_cutoff,
                        old_length,
                        new_length,
                    )
                # The subset keeps only the first 5000 filtered disulfides.
                if self.subset:
                    self.SSList = DisulfideList(filt[:5000], "SUBSET_PDB_SS")
                else:
                    self.SSList = DisulfideList(filt, "ALL_PDB_SS")

                # The index structures below are derived from SSList, so they
                # must be built after it has been assigned.
                self.SSDict = self._create_disulfide_dict()
                self.IDList = list(self.SSDict.keys())

                self.TorsionDF = self.SSList.torsion_df
                self.TotalDisulfides = len(self.SSList)
                self.tclass = DisulfideClassManager(self, self.verbose)
                self.class_generator = DisulfideClassGenerator(verbose=self.verbose)

            if self.verbose:
                _logger.info("Loader initialization complete.")
                self.describe()

        except FileNotFoundError as e:
            _logger.error("File not found: %s", full_path)
            raise e

        except Exception as e:
            # Catch-all: log the path being loaded, then propagate the error.
            _logger.error("An error occurred while loading the file: %s", full_path)
            raise e
        if self.saveit:
            self.save(
                savepath=DATA_DIR,
                verbose=self.verbose,
            )
 245
 246    # overload __getitem__ to handle slicing and indexing, and access by name or classid
 247    def __getitem__(self, item: int | slice | str) -> DisulfideList | Disulfide:
 248        """
 249        Implements indexing and slicing to retrieve DisulfideList objects from the
 250        DisulfideLoader. Supports:
 251
 252        - Integer indexing to retrieve a single DisulfideList
 253        - Slicing to retrieve a subset as a DisulfideList
 254        - Lookup by PDB ID to retrieve all Disulfides for that structure
 255        - Lookup by full disulfide name
 256        - Lookup by classid in the format 11111b or 11111o. The last char is the class type.
 257        - Lookup by classid in the format 11111. The base is 8 by default.
 258
 259        :param index: The index or key to retrieve the DisulfideList.
 260        :type index: int, slice, str
 261        :return: A DisulfideList object or a subset of it.
 262        :rtype: DisulfideList
 263        :raises DisulfideException: If the index or name is invalid.
 264        """
 265
 266        res = DisulfideList([], "none")
 267        ind_list = []
 268
 269        if isinstance(item, slice):
 270            indices = range(*item.indices(len(self.SSList)))
 271            ind_list = list(indices)
 272            name = f"pdb_slice[{ind_list[0]}:{ind_list[-1]+1}]"
 273            resolution = self.SSList[0].resolution
 274            sublist = [self.SSList[i] for i in indices]
 275            return DisulfideList(sublist, name, resolution)
 276
 277        if isinstance(item, int):
 278            if item < 0 or item >= self.TotalDisulfides:
 279                _logger.error(
 280                    "DisulfideLoader(): Index %d out of range 0-%d",
 281                    item,
 282                    self.TotalDisulfides - 1,
 283                )
 284                return res
 285
 286            res = self.SSList[item]
 287            return res
 288
 289        # if the item is a string, it could be a PDB ID or a full disulfide name
 290        # or a classid in the format 11111b or 11111o. the last char is the class type
 291
 292        if isinstance(item, str) and len(item) == 6 or len(item) == 5:  # classid
 293            res = self.extract_class(item, verbose=self.verbose)
 294            return res
 295
 296        # PDB_SS['4yys'] return a list of SS
 297        try:
 298            indices = self.SSDict[item]
 299            if indices:
 300                res = DisulfideList([], item)
 301                sslist = self.SSList
 302                for ind in indices:
 303                    res.append(sslist[ind])
 304            else:
 305                # try to find the full disulfide name
 306                res = self.SSList.get_by_name(item)  # full disulfide name
 307
 308        except KeyError as e:
 309            res = self.SSList.get_by_name(item)  # full disulfide name
 310
 311        if not res:
 312            _logger.error("DisulfideLoader(): Cannot find key %s in SSBond DB", item)
 313        return res
 314
 315    def __setitem__(self, index: int, item: Disulfide) -> None:
 316        self.SSList[index] = self._validate_ss(item)
 317
 318    def _validate_ss(self, value: Any) -> Disulfide:
 319        if isinstance(value, Disulfide):
 320            return value
 321        raise TypeError(f"Disulfide object expected, got {type(value).__name__}")
 322
 323    @property
 324    def average_resolution(self) -> float:
 325        """
 326        Return the average structure resolution for the given list.
 327        Result is cached since resolution values don't change after loading.
 328
 329        :return: Average resolution (A)
 330        """
 331        sslist = self.SSList
 332        valid_resolutions = [
 333            ss.resolution
 334            for ss in sslist
 335            if ss.resolution is not None and ss.resolution != -1.0
 336        ]
 337
 338        if not valid_resolutions:
 339            return -1.0
 340
 341        return sum(valid_resolutions) / len(valid_resolutions)
 342
 343    def binary_to_class(self, binary_class: str, base: int = 8) -> list[str]:
 344        """
 345        Convert a binary class string to an octant class string.
 346
 347        :param binary_class: The binary class string to convert.
 348        :param base: The base class to use, 2 or 8.
 349        :return: The octant class list.
 350        """
 351        return self.tclass.binary_to_class(binary_class, base)
 352
 353    def build_ss_from_idlist(self, idlist: List[str]) -> DisulfideList:
 354        """
 355        Return a DisulfideList of Disulfides for a given list of PDBIDs
 356
 357        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
 358        :return: DisulfideList
 359        """
 360        res = DisulfideList([], "RCSB_list")
 361        for pdbid, sslist in self.SSDict.items():
 362            if pdbid in idlist:
 363                for ssid in sslist:
 364                    res.append(self.SSList[ssid])
 365        return res
 366
 367    def _class_indices_from_tors_df(self, class_string: str, base: int = 8) -> pd.Index:
 368        """
 369        Return the row indices of the torsion dataframe that match the class string.
 370
 371        This method is used internally to find the indices of rows in the torsion dataframe
 372        that match the specified class string based on the given base.
 373
 374        :param class_string: The class string to match in the torsion dataframe.
 375        :type class_string: str
 376        :param base: The base class to use for matching, either 2 or 8. Defaults to 8.
 377        :type base: int
 378        :return: The row indices of the torsion dataframe that match the class string.
 379        :rtype: pd.Index
 380        :raises ValueError: If the base is not 2 or 8.
 381        """
 382        tors_df = self.TorsionDF
 383        match base:
 384            case 8:
 385                column = "octant_class_string"
 386            case 2:
 387                column = "binary_class_string"
 388            case _:
 389                raise ValueError(f"Base must be 2 or 8, not {base}")
 390
 391        return tors_df[tors_df[column] == class_string].index
 392
 393    def copy(self) -> "DisulfideLoader":
 394        """
 395        Return a copy of self.
 396
 397        :return: Copy of self
 398        """
 399        return copy.deepcopy(self)
 400
 401    def _create_disulfide_dict(self) -> Dict[str, List[int]]:
 402        """
 403        Create a dictionary from a list of disulfide objects where the key is the pdb_id
 404        and the value is a list of indices of the disulfide objects in the list.
 405
 406        This is an internal method used during initialization.
 407
 408        :param disulfide_list: List of disulfide objects.
 409        :type disulfide_list: list
 410        :return: Dictionary with pdb_id as keys and lists of indices as values.
 411        :rtype: dict
 412        """
 413        disulfide_list = self.SSList
 414
 415        disulfide_dict = {}
 416        for index, disulfide in enumerate(disulfide_list):
 417            if disulfide.pdb_id not in disulfide_dict:
 418                disulfide_dict[disulfide.pdb_id] = []
 419            disulfide_dict[disulfide.pdb_id].append(index)
 420        return disulfide_dict
 421
 422    def get_class_df(self, base: int = 8) -> pd.DataFrame:
 423        """
 424        Return the class incidence dataframe for the input base.
 425        Result is cached since class distributions don't change after loading.
 426
 427        :param base: The base class to use, 2 or 8.
 428        :return: pd.DataFrame
 429        """
 430        return self.tclass.get_class_df(base)
 431
 432    def extract_class(self, clsid: str, verbose: bool = False) -> DisulfideList:
 433        """
 434        Return the list of disulfides corresponding to the input `clsid`.
 435
 436        :param clsid: The class name to extract.
 437        :param verbose: If True, display progress bars, by default False
 438        :return: The list of disulfide bonds from the class.
 439        """
 440
 441        # cls = clsid[:5]
 442        cls = clsid
 443        ss_ids = []
 444        class_disulfides = None
 445
 446        try:
 447            ss_ids = self.tclass[clsid]
 448
 449        except KeyError:
 450            _logger.error("Cannot find key %s in SSBond DB", clsid)
 451            return DisulfideList([], cls, quiet=True)
 452
 453        tot_ss = len(ss_ids)
 454        class_disulfides = DisulfideList([], cls, quiet=True)
 455
 456        _pbar = (
 457            tqdm(range(tot_ss), total=tot_ss, leave=True) if verbose else range(tot_ss)
 458        )
 459
 460        for idx in _pbar:
 461            ssid = ss_ids[idx]
 462            class_disulfides.append(self[ssid])
 463
 464        return class_disulfides
 465
 466    def getlist(self) -> DisulfideList:
 467        """
 468        Return the list of Disulfides contained in the class.
 469
 470        :return: DisulfideList
 471        :rtype: DisulfideList
 472        """
 473        return copy.deepcopy(self.SSList)
 474
 475    def get_by_name(self, name: str = None) -> Optional[Disulfide]:
 476        """
 477        Return the Disulfide with the given name from the list.
 478        Result is cached since disulfide data doesn't change after loading.
 479        """
 480        for ss in self.SSList.data:
 481            if ss.name == name:
 482                return ss  # or ss.copy() !!!
 483        return None
 484
 485    def describe(self, memusg: bool = False) -> None:
 486        """
 487        Reveal key details about the Disulfide database stored in `self`. If `memusg` is True,
 488        the total RAM usage of the object is calculated and displayed — note that this process
 489        may take around 30 seconds on a 2024 MacBook Pro, M3 Max.
 490
 491        :param memusg: Set to True to include the RAM usage of the `DisulfideLoader` object.
 492        :return: None — just the facts!
 493        """
 494        # pylint: disable=E1101
 495        vers = self.version
 496        tot = self.TotalDisulfides
 497        pdbs = len(self.SSDict)
 498        ram = 0
 499        if memusg:
 500            ram = asizeof.asizeof(self) / (1024 * 1024 * 1024)
 501
 502        res = self.average_resolution
 503        cutoff = self.cutoff
 504        sg_cutoff = self.sg_cutoff
 505        percentile = self.percentile
 506        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
 507        ssMinMax = self.SSList.minmax_energy
 508        ssMin_name: Disulfide = ssMinMax[0].name
 509        ssMax_name: Disulfide = ssMinMax[1].name
 510
 511        print("")
 512        print("    🌟 RCSB Disulfide Database Summary 🌟")
 513        print(f"       🕒 Constructed: {timestr} 🕒")
 514        print(f"PDB IDs Present:               {pdbs}")
 515        print(f"Disulfides Loaded:             {tot}")
 516        print(f"Average Resolution:            {res:.2f} Å")
 517        print(f"Lowest Energy Disulfide:       {ssMin_name}")
 518        print(f"Highest Energy Disulfide:      {ssMax_name}")
 519        print(f"Cα Distance Cutoff:            {cutoff:.2f} Å")
 520        print(f"Sγ Distance Cutoff:            {sg_cutoff:.2f} Å")
 521        print(f"Percentile Cutoff:             {percentile:.2f} %")
 522        if memusg:
 523            print(f"Total RAM Usage:            {ram:.2f} GB")
 524        print(f"     ⚡ proteusPy Version: {vers} ⚡")
 525        print("")
 526
 527        return
 528
 529    def display_overlay(
 530        self, pdbid: str = "", verbose: bool = False, spin: bool = False
 531    ) -> None:
 532        """
 533        Display all disulfides for a given PDB ID overlaid in stick mode against
 534        a common coordinate frame. This allows us to see all of the disulfides
 535        at one time in a single view. Colors vary smoothy between bonds.
 536
 537        :param self: DisulfideLoader object initialized with the database.
 538        :param pdbid: the PDB id string, e.g. 4yys
 539        :param verbose: If True, display progress bars, by default False
 540        :type verbose: bool
 541        :param spin: If True, spin the display, by default False
 542        :type spin: bool
 543        :return: None
 544
 545        Example:
 546        >>> import proteusPy as pp
 547
 548        Instantiate the Loader with the SS database subset.
 549
 550        >>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
 551
 552        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
 553        a common reference (the proximal disulfides).
 554
 555        >>> PDB_SS.display_overlay('4yys', verbose=False)
 556
 557        You can also slice the loader and display as an overly.
 558        >>> PDB_SS[:8].display_overlay(verbose=False)
 559
 560        """
 561
 562        try:
 563            ssbonds = self[pdbid]
 564        except KeyError:
 565            _logger.error("Cannot find key %s in SSBond DB", pdbid)
 566            return
 567
 568        ssbonds.display_overlay(verbose=verbose, spin=spin)
 569        return
 570
 571    def getTorsions(self, pdbID: Optional[str] = None) -> pd.DataFrame:
 572        """
 573        Return the torsions, distances and energies defined by Torsion_DF_cols
 574
 575        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
 576        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
 577            then return the entire dataset.
 578        :raises DisulfideParseWarning: Raised if not found
 579        :return: Torsions Dataframe
 580        :rtype: pd.DataFrame
 581
 582        Example:
 583        >>> import proteusPy as pp
 584        >>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
 585        >>> Tor_DF = PDB_SS.getTorsions()
 586        """
 587        res_df = pd.DataFrame()
 588
 589        if pdbID:
 590            try:
 591                res = self.SSDict[pdbID]
 592                sel = self.TorsionDF["source"] == pdbID
 593                res_df = self.TorsionDF[sel]
 594                return res_df.copy()
 595            except KeyError as e:
 596                mess = f"Cannot find key {pdbID} in SSBond DB"
 597                _logger.error(mess)
 598                raise DisulfideParseWarning(mess) from e
 599        else:
 600            return copy.deepcopy(self.TorsionDF)
 601
 602    def list_binary_classes(self) -> None:
 603        """Enumerate the binary classes"""
 604        for k, v in enumerate(self.tclass.binaryclass_dict):
 605            print(f"Class: |{k}|, |{v}|")
 606
 607    def plot_classes(
 608        self,
 609        base: int = 8,
 610        class_string: Optional[str] = None,
 611        theme: str = "auto",
 612        log: bool = False,
 613        paginated: bool = False,
 614        page_size: int = 200,
 615    ) -> None:
 616        """
 617        Plot the classes for the given base.
 618
 619        :param base: The base class to use, 2 or 8.
 620        :param class_string: The class string to plot.
 621        :param theme: The theme to use for the plot ('auto', 'light', or 'dark').
 622        :param log: Whether to use a log scale for the y-axis.
 623        :param paginated: Whether to paginate the plot.
 624        :param page_size: Number of items per page.
 625        """
 626        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 627
 628        DisulfideVisualization.plot_classes(
 629            self.tclass,
 630            class_string=class_string,
 631            base=base,
 632            theme=theme,
 633            log=log,
 634            page_size=page_size,
 635            paginated=paginated,
 636        )
 637
 638    def plot_classes_vs_cutoff(
 639        self,
 640        cutoff: float,
 641        steps: int = 50,
 642        base: int = 8,
 643        theme: str = "auto",
 644        verbose: bool = False,
 645    ) -> None:
 646        """
 647        Plot the total percentage and number of members for each octant class against the cutoff value.
 648
 649        :param cutoff: Percent cutoff value for filtering the classes.
 650        :type cutoff: float
 651        :param steps: Number of steps to take in the cutoff.
 652        :type steps: int
 653        :param base: The base class to use, 6 or 8.
 654        :type base: int
 655        :param theme: The theme to use for the plot ('auto', 'light', or 'dark'), defaults to 'auto'.
 656        :type theme: str
 657        :param verbose: Whether to display verbose output, defaults to False.
 658        :type verbose: bool
 659        :return: None
 660        :rtype: None
 661        """
 662        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 663
 664        DisulfideVisualization.plot_classes_vs_cutoff(
 665            self.tclass, cutoff, steps, base, theme, verbose
 666        )
 667
 668    def plot_binary_to_eightclass_incidence(
 669        self,
 670        theme: str = "light",
 671        save: bool = False,
 672        savedir: str = ".",
 673        verbose: bool = False,
 674        log: bool = False,
 675    ) -> None:
 676        """Plot the incidence of all octant Disulfide classes for a given binary class.
 677
 678        :param theme: The theme to use for the plot
 679        :type theme: str
 680        :param save: Whether to save the plots
 681        :type save: bool
 682        :param savedir: Directory to save plots to
 683        :type savedir: str
 684        :param verbose: Whether to display verbose output
 685        :type verbose: bool
 686        :param log: Whether to use a log scale for the y-axis
 687        :type log: bool
 688        :return: None
 689        :rtype: None
 690        """
 691
 692        DisulfideVisualization.plot_binary_to_eightclass_incidence(
 693            self.tclass,
 694            theme=theme,
 695            save=save,
 696            savedir=savedir,
 697            verbose=verbose,
 698            log=log,
 699        )
 700
 701    def plot_count_vs_class_df(
 702        self,
 703        class_string: str,
 704        title: str = "title",
 705        theme: str = "auto",
 706        save: bool = False,
 707        savedir: str = ".",
 708        base: int = 8,
 709        verbose: bool = False,
 710        log: bool = False,
 711        sample_size: Optional[int] = None,
 712        page_size: Optional[int] = None,
 713    ) -> None:
 714        """
 715        Plot a line graph of count vs class ID using Plotly for the given disulfide class. The
 716        base selects the class type to plot: 2, 6, or 8, for binary, sextant, or octant classes.
 717
 718        :param class_string: The binary class string to be plotted.
 719        :param title: A string representing the title of the plot (default is 'title').
 720        :param theme: Theme to use for the plot
 721        :param save: Whether to save the plot
 722        :param savedir: Directory to save the plot to
 723        :param base: Base for class IDs (2 or 8)
 724        :param verbose: Whether to display verbose output
 725        :param log: Whether to use log scale for y-axis
 726        :param sample_size: Number of items to sample
 727        :param page_size: Number of items per page
 728        """
 729        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 730        class_list = self.tclass.binary_to_class(class_string, base)
 731        df = self._enumerate_class_fromlist(class_list, base=base)
 732
 733        if sample_size:
 734            DisulfideVisualization.plot_count_vs_class_df_sampled(
 735                df,
 736                title,
 737                theme,
 738                save,
 739                savedir,
 740                base,
 741                verbose,
 742                log,
 743                sample_size,
 744            )
 745        elif page_size:
 746            DisulfideVisualization.plot_count_vs_class_df_paginated(
 747                df, title, theme, save, savedir, base, verbose, log, page_size
 748            )
 749        else:
 750            DisulfideVisualization.plot_count_vs_class_df(
 751                df, title, theme, save, savedir, base, verbose, log
 752            )
 753
 754    def plot_count_vs_classid(
 755        self,
 756        cls: Optional[str] = None,
 757        theme: str = "auto",
 758        base: int = 8,
 759        log: bool = True,
 760    ) -> None:
 761        """
 762        Plot a line graph of count vs class ID using Plotly.
 763
 764        :param cls: Specific class to plot (optional)
 765        :param theme: Theme to use for the plot
 766        :param base: Base for class IDs (2 or 8)
 767        :param log: Whether to use log scale for y-axis
 768        """
 769        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 770
 771        DisulfideVisualization.plot_count_vs_classid(self.tclass, cls, theme, base, log)
 772
 773    def _enumerate_class_fromlist(
 774        self, sslist: List[str], base: int = 8
 775    ) -> pd.DataFrame:
 776        """
 777        Enumerate the classes from a list of class IDs and return a DataFrame with class IDs and their corresponding counts.
 778        Results are cached for improved performance on repeated calls.
 779
 780        :param sslist: A list of class IDs to enumerate.
 781        :param base: The base value for the enumeration, by default 8.
 782        :return: A DataFrame with columns "class_id" and "count" representing the class IDs and their corresponding counts.
 783        """
 784        x = []
 785        y = []
 786
 787        for cls in sslist:
 788            if cls is not None:
 789                _y = self.tclass.sslist_from_classid(cls, base=base)
 790                # it's possible to have 0 SS in a class
 791                if _y is not None:
 792                    # only append if we have both.
 793                    x.append(cls)
 794                    y.append(len(_y))
 795
 796        sslist_df = pd.DataFrame(columns=["class_id", "count"])
 797        sslist_df["class_id"] = x
 798        sslist_df["count"] = y
 799        return sslist_df
 800
 801    def save(
 802        self,
 803        savepath: str = DATA_DIR,
 804        verbose: bool = False,
 805        fname: Optional[str] = None,
 806    ) -> None:
 807        """
 808        Save a copy of the fully instantiated Loader to the specified file.
 809
 810        :param savepath: Path to save the file, defaults to DATA_DIR
 811        :param fname: Filename, defaults to LOADER_FNAME
 812        :param verbose: Verbosity, defaults to False
 813        """
 814        self.version = __version__
 815
 816        fname = None
 817        if self.subset:
 818            fname = LOADER_SUBSET_FNAME
 819        else:
 820            fname = LOADER_FNAME
 821
 822        _fname = Path(savepath) / fname
 823
 824        if verbose:
 825            _logger.info("Writing Disulfide Loader to: %s...", _fname)
 826
 827        with open(str(_fname), "wb+") as f:
 828            pickle.dump(self, f)
 829
 830        if verbose:
 831            _logger.info("Done saving loader.")
 832
 833    def plot_disulfides_vs_pdbid(self, cutoff: int = 1) -> Tuple[List[str], List[int]]:
 834        """
 835        Plots the number of disulfides versus pdbid.
 836
 837        :param cutoff: The minimum number of disulfides a PDB ID must have to be included in the plot.
 838        :type cutoff: int
 839        :return: A tuple containing the list of PDB IDs and the corresponding number of disulfides.
 840        :rtype: tuple
 841        """
 842        pdbids = []
 843        num_disulfides = []
 844
 845        for pdbid, disulfides in self.SSDict.items():
 846            if len(disulfides) > cutoff:
 847                pdbids.append(pdbid)
 848                num_disulfides.append(len(disulfides))
 849
 850        # Create a DataFrame
 851        df = pd.DataFrame({"PDB ID": pdbids, "Number of Disulfides": num_disulfides})
 852        fig = px.bar(
 853            df,
 854            x="PDB ID",
 855            y="Number of Disulfides",
 856            title=f"Disulfides vs PDB ID with cutoff: {cutoff}, {len(pdbids)} PDB IDs",
 857        )
 858        fig.update_layout(
 859            xaxis_title="PDB ID",
 860            yaxis_title="Number of Disulfides",
 861            xaxis_tickangle=-90,
 862        )
 863        fig.show()
 864
 865        return pdbids, num_disulfides
 866
 867    def plot_distances(
 868        self,
 869        distance_type: str = "ca",
 870        cutoff: float = -1,
 871        comparison: str = "less",
 872        theme: str = "auto",
 873        log: bool = True,
 874    ) -> None:
 875        """
 876        Plot the distances for the disulfides in the loader.
 877
 878        :param distance_type: The type of distance to plot ('ca' for Cα-Cα distance, 'sg' for Sγ-Sγ distance)
 879        :param cutoff: The cutoff value for the distance, defaults to -1 (no cutoff)
 880        :param comparison: if 'less' then plot distances less than the cutoff, if 'greater' then plot distances greater than the cutoff
 881        :param theme: The theme to use for the plot ('auto', 'light', or 'dark')
 882        :param log: Whether to use a log scale for the y-axis
 883        """
 884        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 885
 886        # sslist = self.SSList
 887        # distances = sslist.extract_distances(distance_type, comparison, cutoff)
 888
 889        self.SSList.plot_distances(
 890            distance_type=distance_type,
 891            cutoff=cutoff,
 892            comparison=comparison,
 893            theme=theme,
 894            log=log,
 895        )
 896
 897    def plot_deviation_scatterplots(
 898        self, verbose: bool = False, theme: str = "auto"
 899    ) -> None:
 900        """
 901        Plot scatter plots for Bondlength_Deviation, Angle_Deviation Ca_Distance
 902        and SG_Distance.
 903
 904        :param verbose: Whether to display the plot in the notebook. Default is False.
 905        :type verbose: bool
 906        :param theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
 907        :type light: str
 908        :return: None
 909        """
 910        self.SSList.plot_deviation_scatterplots(verbose=verbose, theme=theme)
 911
 912    def plot_deviation_histograms(
 913        self, theme: str = "auto", verbose: bool = True
 914    ) -> None:
 915        """
 916        Plot histograms for Bondlength_Deviation, Angle_Deviation, and Ca_Distance.
 917        """
 918        self.SSList.plot_deviation_histograms(theme=theme, verbose=verbose)
 919
 920    def sslist_from_class(
 921        self, class_string: str, base: int = 8, cutoff: float = 0.0
 922    ) -> DisulfideList:
 923        """
 924        Return a DisulfideList containing Disulfides with the given class_string.
 925
 926        :param class_string: The class string to search for.
 927        :param base: The base of the class string. Default is 8.
 928        :param cutoff: The % cutoff value for the class. Default is 0.0.
 929        :return: DisulfideList containing Disulfides with the given class_string.
 930        """
 931        sslist_name = f"{class_string}_{base}_{cutoff:.2f}"
 932        sslist = DisulfideList([], sslist_name)
 933
 934        indices = self._class_indices_from_tors_df(class_string, base=base)
 935
 936        for i in indices:
 937            sslist.append(self[i])
 938
 939        return sslist
 940
 941    def display_torsion_statistics(
 942        self,
 943        class_id: Optional[str] = None,
 944        display: bool = True,
 945        save: bool = False,
 946        fname: str = "ss_torsions.png",
 947        theme: str = "auto",
 948        verbose: bool = False,
 949        dpi: int = 300,
 950        figure_size: tuple[int, int] = (4, 3),
 951    ) -> None:
 952        """
 953        Display torsion and distance statistics for all Disulfides in the loader.
 954        If a class ID is provided, display statistics for that class only.
 955
 956        :param class_id: The class ID to display statistics for. Default is None.
 957        :type class_id: str
 958        :param display: Whether to display the plot in the notebook. Default is True.
 959        :type display: bool
 960        :param save: Whether to save the plot as an image file. Default is False.
 961        :type save: bool
 962        :param fname: The name of the image file to save. Default is 'ss_torsions.png'.
 963        :type fname: str
 964        :param theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
 965        :type theme: str
 966        :param verbose: Whether to display verbose output. Default is False.
 967        :type verbose: bool
 968        :param dpi: Dots per inch for the plot. Default is 300.
 969        :type dpi: int
 970        :param figure_size: Size of the figure as a tuple (width, height). Default is (4, 3).
 971        :type figure_size: tuple
 972        :return: None
 973        """
 974        if class_id:
 975            DisulfideVisualization.display_torsion_class_df(
 976                self.TorsionDF,
 977                class_id,
 978                display=display,
 979                save=save,
 980                fname=fname,
 981                theme=theme,
 982                dpi=dpi,
 983                figure_size=figure_size,
 984            )
 985        else:
 986            self.SSList.display_torsion_statistics(
 987                display=display,
 988                save=save,
 989                fname=fname,
 990                theme=theme,
 991                verbose=verbose,
 992                dpi=dpi,
 993                figure_size=figure_size,
 994            )
 995
 996    def classes_vs_cutoff(self, cutoff: float, base: int = 8) -> int:
 997        """
 998        Return number of members for the octant class for a given cutoff value.
 999
1000        :param cutoff: Percent cutoff value for filtering the classes.
1001        :return: None
1002        """
1003
1004        class_df = self.tclass.filter_class_by_percentage(cutoff, base=base)
1005        return class_df.shape[0]
1006
1007    def display_torsion_class_df(
1008        self,
1009        class_id: str,
1010        display: bool = True,
1011        save: bool = False,
1012        fname: str = "ss_torsions.png",
1013        theme: str = "auto",
1014        dpi: int = 300,
1015        figure_size: tuple[int, int] = (4, 3),
1016    ) -> None:
1017        """
1018        Display torsion and distance statistics for a given class ID using the TorsionDF dataframe.
1019
1020        :param class_id: The class ID to display statistics for (e.g. '11111b' for binary or '11111o' for octant)
1021        :param display: Whether to display the plot in the notebook
1022        :param save: Whether to save the plot as an image file
1023        :param fname: The name of the image file to save
1024        :param theme: The theme to use for the plot ('auto', 'light', or 'dark')
1025        :param dpi: DPI (dots per inch) for the saved image, controls the resolution
1026        :param figure_size: Tuple of (width, height) in inches for the figure size
1027        """
1028
1029        DisulfideVisualization.display_torsion_class_df(
1030            self.TorsionDF,
1031            class_id,
1032            display=display,
1033            save=save,
1034            fname=fname,
1035            theme=theme,
1036            dpi=dpi,
1037            figure_size=figure_size,
1038        )
1039
1040    def plot_3d_hexbin_leftright(
1041        self,
1042        width: int = 800,
1043        height: int = 600,
1044        gridsize: int = 80,
1045        tormin: float = -180.0,
1046        tormax: float = 180.0,
1047        scaling: str = "sqrt",
1048        column1: str = "chi2",
1049        column2: str = "chi4",
1050        title: Optional[str] = None,
1051    ) -> None:
1052        """
1053        Create 3D hexbin plots for left and right-handed chi2-chi4 correlations with customizable z-scaling.
1054
1055        :param loader: Loader object to retrieve torsion data
1056        :type loader: proteusPy.PDB_SS
1057        :param width: Window width in pixels
1058        :type width: int, optional
1059        :default width: 800
1060        :param height: Window height in pixels
1061        :type height: int, optional
1062        :default height: 600
1063        :param gridsize: Number of bins for hexbin
1064        :type gridsize: int, optional
1065        :default gridsize: 30
1066        :param tormin: Minimum torsion angle
1067        :type tormin: float, optional
1068        :default tormin: -180.0
1069        :param tormax: Maximum torsion angle
1070        :type tormax: float, optional
1071        :default tormax: 180.0
1072        :param scaling: Scaling method for z-values ('linear', 'sqrt', 'log', 'power')
1073        :type scaling: str, optional
1074        :default scaling: 'sqrt'
1075        :param column1: Name of the first column (x-axis)
1076        :type column1: str, optional
1077        :default column1: 'chi2'
1078        :param column2: Name of the second column (y-axis)
1079        :type column2: str, optional
1080        :default column2: 'chi4'
1081        :param title: Title of the plot
1082        :type title: str, optional
1083        """
1084
1085        DisulfideVisualization.plot_3d_hexbin_leftright(
1086            self,
1087            width=width,
1088            height=height,
1089            gridsize=gridsize,
1090            tormin=tormin,
1091            tormax=tormax,
1092            scaling=scaling,
1093            column1=column1,
1094            column2=column2,
1095            title=title,
1096        )
1097
1098
1099# class ends
1100
1101
def Load_PDB_SS(
    loadpath: str = DATA_DIR,
    verbose: bool = False,
    subset: bool = False,
    percentile: float = -1.0,
    force: bool = False,
) -> DisulfideLoader:
    """
    Load the fully instantiated Disulfide database from the specified file. This function
    will load the pre-built database if available, or bootstrap a new loader by downloading
    the data from Google Drive if needed. Use the provided parameters to control the loading
    behavior and verbosity.

    When the loader file is missing (or ``force`` is set) a new loader is built with
    ``Bootstrap_PDB_SS`` and saved into ``loadpath`` before being returned.

    :param loadpath: Path from which to load the database; defaults to DATA_DIR.
    :type loadpath: str
    :param verbose: If True, enables verbose logging; defaults to False.
    :type verbose: bool
    :param subset: If True, loads the subset database; otherwise loads the full database.
    :type subset: bool
    :param percentile: Percentile (0-100) to compute cutoffs dynamically; if set to -1.0, the
        percentile method is not used. Only used when a new loader is bootstrapped.
    :type percentile: float
    :param force: If True, rebuilds the loader even if the file exists; defaults to False.
    :type force: bool
    :return: An instance of DisulfideLoader containing the loaded disulfide database.
    :rtype: DisulfideLoader
    :raises RuntimeError: If a new loader had to be bootstrapped and bootstrapping produced none.

    Example:
        >>> from proteusPy import Load_PDB_SS, create_logger
        >>> import logging
        >>> _logger = create_logger("testing")
        >>> _logger.setLevel(logging.WARNING)
        >>> loader = Load_PDB_SS(verbose=False, subset=True)
        >>> print(loader[0])
        <Disulfide 6dmb_203A_226A, Source: 6dmb, Resolution: 3.0 Å>
    """

    # normally the .pkl files are local, EXCEPT for the first run from a newly-installed proteusPy
    # distribution. In that case we need to download the files for all disulfides and the subset
    # from Google Drive. This is a one-time operation.

    loader_name = LOADER_SUBSET_FNAME if subset else LOADER_FNAME
    _fpath = Path(loadpath) / loader_name

    if force or not _fpath.exists():
        if verbose:
            _logger.info("Bootstrapping new loader: %s... ", _fpath)

        loader = Bootstrap_PDB_SS(
            loadpath=loadpath,
            verbose=verbose,
            subset=subset,
            force=force,
            percentile=percentile,
        )

        # Bootstrap_PDB_SS returns None on failure; fail with a clear message
        # instead of an AttributeError on the save() call below.
        if loader is None:
            raise RuntimeError(
                f"Unable to bootstrap a DisulfideLoader for {_fpath}"
            )

        loader.save(
            savepath=loadpath,
            verbose=verbose,
        )
        return loader

    if verbose:
        _logger.info("Reading disulfides from: %s...", _fpath)

    # NOTE: pickle.load can execute arbitrary code; only load loader files
    # that come from a trusted source.
    with open(_fpath, "rb") as f:
        loader = pickle.load(f)

    if verbose:
        _logger.info("Done reading disulfides from: %s...", _fpath)
        loader.describe()

    return loader
1179
1180
def Bootstrap_PDB_SS(
    loadpath: str = DATA_DIR,
    verbose: bool = True,
    subset: bool = False,
    force: bool = False,
    fake: bool = False,
    percentile: float = -1.0,
) -> Optional[DisulfideLoader]:
    """
    Download and instantiate the disulfide databases from Google Drive.

    This function downloads the disulfide master SS list from Google Drive if it doesn't
    already exist in the specified load path or if the force flag is set to True.
    It then loads the disulfide data from the downloaded file and initializes a
    DisulfideLoader instance.

    :param loadpath: Path from which to load the data, defaults to DATA_DIR
    :type loadpath: str
    :param verbose: Flag to enable verbose logging, defaults to True
    :type verbose: bool
    :param subset: Flag to indicate whether to load a subset of the data, defaults to False
    :type subset: bool
    :param force: Flag to force download even if the file exists, defaults to False
    :type force: bool
    :param fake: If True and a download would be needed, skip it and return None,
        defaults to False
    :type fake: bool
    :param percentile: Percentile passed to DisulfideLoader for computing cutoffs,
        defaults to -1.0
    :type percentile: float
    :return: An instance of DisulfideLoader initialized with the loaded data, or None when
        the download was faked or no disulfides were loaded
    :rtype: Optional[DisulfideLoader]
    """

    full_path = Path(loadpath) / SS_PICKLE_FILE
    already_present = full_path.exists()

    if force or not already_present:
        if verbose:
            if already_present:
                _logger.warning("Forcing download of %s from Drive...", full_path)
            else:
                _logger.warning("Can't find %s. Downloading from Drive...", full_path)

        if fake:
            # A faked download never builds a loader, regardless of verbosity.
            # The early return used to sit inside the verbose check.
            if verbose:
                _logger.warning("Fake download: %s", full_path)
            return None

        gdown.download(SS_LIST_URL, str(full_path), quiet=False)

    if verbose:
        _logger.info(
            "Building loader from: %s with percentile cutoff %s...",
            full_path,
            percentile,
        )

    # Build from the same directory the master list was checked for and
    # downloaded into, not unconditionally from DATA_DIR.
    loader = DisulfideLoader(
        datadir=loadpath,
        subset=subset,
        verbose=verbose,
        percentile=percentile,
    )

    if loader.TotalDisulfides == 0:
        _logger.error("No disulfides loaded!")
        return None

    if verbose:
        _logger.info("Done building loader.")

    return loader
1251
1252
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    import doctest

    doctest.testmod()
1257
1258# End of file
@dataclass
class DisulfideLoader:
  62@dataclass
  63class DisulfideLoader:
  64    """
  65    This class represents the disulfide database itself and is its primary means of accession.
  66    The entirety of the RCSB disulfide database is stored within the class via a
  67    proteusPy.DisulfideList, a ```Pandas``` .csv file, and a ```dict``` of
  68    indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
  69    simple, direct and flexible access to the disulfide structures contained herein.
  70    This makes it possible to access the disulfides by array index, PDB structure ID, disulfide
  71    name and class ID.
  72
  73    The class also provides methods for plotting distance and angle deviations
  74    as well as torsion statistics for the disulfides in the database.
  75
  76    The class can also render Disulfides overlaid on a common coordinate system to a pyVista
  77    window using the [display_overlay()](#DisulfideLoader.display_overlay) method. See below for examples.
  78
  79    Important note: For typical usage one will access the database via the `Load_PDB_SS()` function.
  80    The difference is that the latter function loads the compressed database from its single
    source. The `DisulfideLoader` class is used to build the Disulfide database with a
  82    specific cutoff, or for saving the database to a file.
  83
  84    Cutoff values of -1.0 indicate imposing no cutoffs on the data.
  85
  86    :param verbose: Flag to control output verbosity
  87    :type verbose: bool
    :param datadir: Directory containing data files
  89    :type datadir: str
  90    :param picklefile: Name of the pickle file containing disulfide data
  91    :type picklefile: str
  92    :param quiet: Flag to suppress warnings
  93    :type quiet: bool
  94    :param subset: Flag to load only a subset of data
  95    :type subset: bool
  96    :param cutoff: Distance cutoff, (A) for filtering disulfides. Defaults to -1.0.
  97    :type cutoff: float
  98    :param sg_cutoff: SG distance cutoff, (A) for filtering disulfides. Defaults to -1.0.
  99    :type sg_cutoff: float
 100    :param percentile: Percentile cutoff for filtering disulfides. Must be between 0 and 100.
 101    Filters based on statistical cutoffs derived from the data.
 102    :type percentile: float
 103    :param minimum: Minimum atom distance for filtering disulfides. -1 is no filtering.
 104    :type minimum: float
    :param saveit: Flag to save the Loader to a file
    :type saveit: bool
 107    """
 108
 109    # Fields that serve as both instance attributes and initialization parameters
 110    datadir: str = field(default=DATA_DIR)
 111    picklefile: str = field(default=SS_PICKLE_FILE)
 112    subset: bool = field(default=False)
 113    cutoff: float = field(default=-1.0)
 114    sg_cutoff: float = field(default=-1.0)
 115    verbose: bool = field(default=False)
 116    percentile: float = field(default=-1.0)
 117    quiet: bool = field(default=False)
 118    minimum: float = field(default=-1.0)
 119    saveit: bool = field(default=False)
 120
 121    # Fields that are only used internally and don't need to be initialization parameters
 122    SSList: DisulfideList = field(
 123        default_factory=lambda: DisulfideList([], "ALL_PDB_SS"), init=False
 124    )
 125    SSDict: Dict = field(default_factory=dict, init=False)
 126    TorsionDF: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)
 127    TotalDisulfides: int = field(default=0, init=False)
 128    IDList: List = field(default_factory=list, init=False)
 129    tclass: Optional[DisulfideClassManager] = field(default=None, init=False)
 130    class_generator: Optional[DisulfideClassGenerator] = field(default=None, init=False)
 131    timestamp: float = field(default_factory=time.time, init=False)
 132    version: str = field(default=__version__, init=False)
 133
    def __post_init__(self) -> None:
        """
        Initialize the DisulfideLoader after dataclass initialization.
        This method handles loading and processing of the disulfide data.

        Steps, in order:

        1. Unpickle the master disulfide list from ``datadir / picklefile``.
        2. If ``percentile`` is greater than 0, derive the Cα and Sγ cutoffs from
           it and overwrite ``self.cutoff`` and ``self.sg_cutoff``.
        3. Filter the list by the Cα cutoff, then by the Sγ cutoff.
        4. Keep the first 5000 entries when ``subset`` is set, otherwise all.
        5. Populate ``SSList``, ``SSDict``, ``IDList``, ``TorsionDF``,
           ``TotalDisulfides``, ``tclass`` and ``class_generator``.
        6. If ``saveit`` is set, save the loader into DATA_DIR.

        :raises ValueError: If ``percentile`` is greater than 100.
        :raises FileNotFoundError: If the pickle file cannot be opened.
        :raises Exception: Any other error raised while loading is logged and re-raised.
        """

        cutoffs = {}
        old_length = new_length = 0
        full_path = Path(self.datadir) / self.picklefile

        if self.verbose and not self.quiet:
            _logger.info(
                f"Reading disulfides from: {full_path}... ",
            )

        try:
            # If the requested pickle is missing, make sure the master list
            # exists in DATA_DIR, downloading it when necessary.
            # NOTE(review): the download target is DATA_DIR / SS_PICKLE_FILE,
            # while the file opened below is datadir / picklefile. These only
            # coincide for the default field values -- confirm this is intended.
            if not full_path.exists():
                fname = SS_PICKLE_FILE
                url = SS_LIST_URL

                _fname = Path(DATA_DIR) / fname

                if not _fname.exists():
                    if self.verbose:
                        _logger.info(
                            "Master SS list unavailable. Downloading Disulfide Database from Drive..."
                        )
                    gdown.download(url, str(_fname), quiet=False)

            # NOTE: pickle.load can execute arbitrary code; the file must come
            # from a trusted source.
            with open(full_path, "rb") as f:
                sslist = pickle.load(f)

                # A positive percentile replaces any explicitly supplied
                # cutoff / sg_cutoff values with computed ones.
                if self.percentile > 0.0:
                    if self.percentile > 100.0:
                        raise ValueError("Percentile must be between 0 and 100.")

                    cutoffs = DisulfideStats.calculate_cutoff_from_percentile(
                        sslist, percentile=self.percentile, verbose=self.verbose
                    )

                    ca_cutoff = cutoffs["ca_cutoff_percentile"]
                    sg_cutoff = cutoffs["sg_cutoff_percentile"]
                    self.cutoff = ca_cutoff
                    self.sg_cutoff = sg_cutoff

                    if self.verbose:
                        _logger.info(
                            f"Using percentile cutoffs: {ca_cutoff:.2f}, {sg_cutoff:.2f}"
                        )

                # First pass: filter on the Cα distance.
                # NOTE(review): minimum is hard-coded to -1.0 in both filter
                # calls; the ``minimum`` field is not used here -- confirm.
                old_length = len(sslist)
                filt = sslist.filter_by_distance(
                    distance=self.cutoff, distance_type="ca", minimum=-1.0
                )
                filt = DisulfideList(
                    filt,
                    f"filtered by Ca cutoff:{self.cutoff:.2f}, Sg cutoff: {self.sg_cutoff:.2f}",
                )

                new_length = len(filt)

                if self.verbose:
                    _logger.info(
                        "Filtered with Cα cutoff %.2f: old: %d, new: %d",
                        self.cutoff,
                        old_length,
                        new_length,
                    )

                # Second pass: filter the Cα-filtered list on the Sγ distance.
                old_length = new_length
                filt = filt.filter_by_distance(
                    distance=self.sg_cutoff, distance_type="sg", minimum=-1.0
                )
                new_length = len(filt)

                if self.verbose:
                    _logger.info(
                        "Filtered with Sγ: cutoff %.2f: old: %d, new: %d",
                        self.sg_cutoff,
                        old_length,
                        new_length,
                    )
                # The subset database is simply the first 5000 filtered entries.
                if self.subset:
                    self.SSList = DisulfideList(filt[:5000], "SUBSET_PDB_SS")
                else:
                    self.SSList = DisulfideList(filt, "ALL_PDB_SS")

                # Derived lookup structures; these depend on SSList being set.
                self.SSDict = self._create_disulfide_dict()
                self.IDList = list(self.SSDict.keys())

                self.TorsionDF = self.SSList.torsion_df
                self.TotalDisulfides = len(self.SSList)
                self.tclass = DisulfideClassManager(self, self.verbose)
                self.class_generator = DisulfideClassGenerator(verbose=self.verbose)

            if self.verbose:
                _logger.info("Loader initialization complete.")
                self.describe()

        except FileNotFoundError as e:
            _logger.error("File not found: %s", full_path)
            raise e

        # Any other failure (including the ValueError raised above) is logged
        # with the file path and re-raised unchanged.
        except Exception as e:
            _logger.error("An error occurred while loading the file: %s", full_path)
            raise e
        # Optional persistence; always writes into DATA_DIR.
        if self.saveit:
            self.save(
                savepath=DATA_DIR,
                verbose=self.verbose,
            )
 246
 247    # overload __getitem__ to handle slicing and indexing, and access by name or classid
 248    def __getitem__(self, item: int | slice | str) -> DisulfideList | Disulfide:
 249        """
 250        Implements indexing and slicing to retrieve DisulfideList objects from the
 251        DisulfideLoader. Supports:
 252
 253        - Integer indexing to retrieve a single DisulfideList
 254        - Slicing to retrieve a subset as a DisulfideList
 255        - Lookup by PDB ID to retrieve all Disulfides for that structure
 256        - Lookup by full disulfide name
 257        - Lookup by classid in the format 11111b or 11111o. The last char is the class type.
 258        - Lookup by classid in the format 11111. The base is 8 by default.
 259
 260        :param index: The index or key to retrieve the DisulfideList.
 261        :type index: int, slice, str
 262        :return: A DisulfideList object or a subset of it.
 263        :rtype: DisulfideList
 264        :raises DisulfideException: If the index or name is invalid.
 265        """
 266
 267        res = DisulfideList([], "none")
 268        ind_list = []
 269
 270        if isinstance(item, slice):
 271            indices = range(*item.indices(len(self.SSList)))
 272            ind_list = list(indices)
 273            name = f"pdb_slice[{ind_list[0]}:{ind_list[-1]+1}]"
 274            resolution = self.SSList[0].resolution
 275            sublist = [self.SSList[i] for i in indices]
 276            return DisulfideList(sublist, name, resolution)
 277
 278        if isinstance(item, int):
 279            if item < 0 or item >= self.TotalDisulfides:
 280                _logger.error(
 281                    "DisulfideLoader(): Index %d out of range 0-%d",
 282                    item,
 283                    self.TotalDisulfides - 1,
 284                )
 285                return res
 286
 287            res = self.SSList[item]
 288            return res
 289
 290        # if the item is a string, it could be a PDB ID or a full disulfide name
 291        # or a classid in the format 11111b or 11111o. the last char is the class type
 292
 293        if isinstance(item, str) and len(item) == 6 or len(item) == 5:  # classid
 294            res = self.extract_class(item, verbose=self.verbose)
 295            return res
 296
 297        # PDB_SS['4yys'] return a list of SS
 298        try:
 299            indices = self.SSDict[item]
 300            if indices:
 301                res = DisulfideList([], item)
 302                sslist = self.SSList
 303                for ind in indices:
 304                    res.append(sslist[ind])
 305            else:
 306                # try to find the full disulfide name
 307                res = self.SSList.get_by_name(item)  # full disulfide name
 308
 309        except KeyError as e:
 310            res = self.SSList.get_by_name(item)  # full disulfide name
 311
 312        if not res:
 313            _logger.error("DisulfideLoader(): Cannot find key %s in SSBond DB", item)
 314        return res
 315
 316    def __setitem__(self, index: int, item: Disulfide) -> None:
 317        self.SSList[index] = self._validate_ss(item)
 318
 319    def _validate_ss(self, value: Any) -> Disulfide:
 320        if isinstance(value, Disulfide):
 321            return value
 322        raise TypeError(f"Disulfide object expected, got {type(value).__name__}")
 323
 324    @property
 325    def average_resolution(self) -> float:
 326        """
 327        Return the average structure resolution for the given list.
 328        Result is cached since resolution values don't change after loading.
 329
 330        :return: Average resolution (A)
 331        """
 332        sslist = self.SSList
 333        valid_resolutions = [
 334            ss.resolution
 335            for ss in sslist
 336            if ss.resolution is not None and ss.resolution != -1.0
 337        ]
 338
 339        if not valid_resolutions:
 340            return -1.0
 341
 342        return sum(valid_resolutions) / len(valid_resolutions)
 343
 344    def binary_to_class(self, binary_class: str, base: int = 8) -> list[str]:
 345        """
 346        Convert a binary class string to an octant class string.
 347
 348        :param binary_class: The binary class string to convert.
 349        :param base: The base class to use, 2 or 8.
 350        :return: The octant class list.
 351        """
 352        return self.tclass.binary_to_class(binary_class, base)
 353
 354    def build_ss_from_idlist(self, idlist: List[str]) -> DisulfideList:
 355        """
 356        Return a DisulfideList of Disulfides for a given list of PDBIDs
 357
 358        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
 359        :return: DisulfideList
 360        """
 361        res = DisulfideList([], "RCSB_list")
 362        for pdbid, sslist in self.SSDict.items():
 363            if pdbid in idlist:
 364                for ssid in sslist:
 365                    res.append(self.SSList[ssid])
 366        return res
 367
 368    def _class_indices_from_tors_df(self, class_string: str, base: int = 8) -> pd.Index:
 369        """
 370        Return the row indices of the torsion dataframe that match the class string.
 371
 372        This method is used internally to find the indices of rows in the torsion dataframe
 373        that match the specified class string based on the given base.
 374
 375        :param class_string: The class string to match in the torsion dataframe.
 376        :type class_string: str
 377        :param base: The base class to use for matching, either 2 or 8. Defaults to 8.
 378        :type base: int
 379        :return: The row indices of the torsion dataframe that match the class string.
 380        :rtype: pd.Index
 381        :raises ValueError: If the base is not 2 or 8.
 382        """
 383        tors_df = self.TorsionDF
 384        match base:
 385            case 8:
 386                column = "octant_class_string"
 387            case 2:
 388                column = "binary_class_string"
 389            case _:
 390                raise ValueError(f"Base must be 2 or 8, not {base}")
 391
 392        return tors_df[tors_df[column] == class_string].index
 393
 394    def copy(self) -> "DisulfideLoader":
 395        """
 396        Return a copy of self.
 397
 398        :return: Copy of self
 399        """
 400        return copy.deepcopy(self)
 401
 402    def _create_disulfide_dict(self) -> Dict[str, List[int]]:
 403        """
 404        Create a dictionary from a list of disulfide objects where the key is the pdb_id
 405        and the value is a list of indices of the disulfide objects in the list.
 406
 407        This is an internal method used during initialization.
 408
 409        :param disulfide_list: List of disulfide objects.
 410        :type disulfide_list: list
 411        :return: Dictionary with pdb_id as keys and lists of indices as values.
 412        :rtype: dict
 413        """
 414        disulfide_list = self.SSList
 415
 416        disulfide_dict = {}
 417        for index, disulfide in enumerate(disulfide_list):
 418            if disulfide.pdb_id not in disulfide_dict:
 419                disulfide_dict[disulfide.pdb_id] = []
 420            disulfide_dict[disulfide.pdb_id].append(index)
 421        return disulfide_dict
 422
 423    def get_class_df(self, base: int = 8) -> pd.DataFrame:
 424        """
 425        Return the class incidence dataframe for the input base.
 426        Result is cached since class distributions don't change after loading.
 427
 428        :param base: The base class to use, 2 or 8.
 429        :return: pd.DataFrame
 430        """
 431        return self.tclass.get_class_df(base)
 432
 433    def extract_class(self, clsid: str, verbose: bool = False) -> DisulfideList:
 434        """
 435        Return the list of disulfides corresponding to the input `clsid`.
 436
 437        :param clsid: The class name to extract.
 438        :param verbose: If True, display progress bars, by default False
 439        :return: The list of disulfide bonds from the class.
 440        """
 441
 442        # cls = clsid[:5]
 443        cls = clsid
 444        ss_ids = []
 445        class_disulfides = None
 446
 447        try:
 448            ss_ids = self.tclass[clsid]
 449
 450        except KeyError:
 451            _logger.error("Cannot find key %s in SSBond DB", clsid)
 452            return DisulfideList([], cls, quiet=True)
 453
 454        tot_ss = len(ss_ids)
 455        class_disulfides = DisulfideList([], cls, quiet=True)
 456
 457        _pbar = (
 458            tqdm(range(tot_ss), total=tot_ss, leave=True) if verbose else range(tot_ss)
 459        )
 460
 461        for idx in _pbar:
 462            ssid = ss_ids[idx]
 463            class_disulfides.append(self[ssid])
 464
 465        return class_disulfides
 466
 467    def getlist(self) -> DisulfideList:
 468        """
 469        Return the list of Disulfides contained in the class.
 470
 471        :return: DisulfideList
 472        :rtype: DisulfideList
 473        """
 474        return copy.deepcopy(self.SSList)
 475
 476    def get_by_name(self, name: str = None) -> Optional[Disulfide]:
 477        """
 478        Return the Disulfide with the given name from the list.
 479        Result is cached since disulfide data doesn't change after loading.
 480        """
 481        for ss in self.SSList.data:
 482            if ss.name == name:
 483                return ss  # or ss.copy() !!!
 484        return None
 485
 486    def describe(self, memusg: bool = False) -> None:
 487        """
 488        Reveal key details about the Disulfide database stored in `self`. If `memusg` is True,
 489        the total RAM usage of the object is calculated and displayed — note that this process
 490        may take around 30 seconds on a 2024 MacBook Pro, M3 Max.
 491
 492        :param memusg: Set to True to include the RAM usage of the `DisulfideLoader` object.
 493        :return: None — just the facts!
 494        """
 495        # pylint: disable=E1101
 496        vers = self.version
 497        tot = self.TotalDisulfides
 498        pdbs = len(self.SSDict)
 499        ram = 0
 500        if memusg:
 501            ram = asizeof.asizeof(self) / (1024 * 1024 * 1024)
 502
 503        res = self.average_resolution
 504        cutoff = self.cutoff
 505        sg_cutoff = self.sg_cutoff
 506        percentile = self.percentile
 507        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
 508        ssMinMax = self.SSList.minmax_energy
 509        ssMin_name: Disulfide = ssMinMax[0].name
 510        ssMax_name: Disulfide = ssMinMax[1].name
 511
 512        print("")
 513        print("    🌟 RCSB Disulfide Database Summary 🌟")
 514        print(f"       🕒 Constructed: {timestr} 🕒")
 515        print(f"PDB IDs Present:               {pdbs}")
 516        print(f"Disulfides Loaded:             {tot}")
 517        print(f"Average Resolution:            {res:.2f} Å")
 518        print(f"Lowest Energy Disulfide:       {ssMin_name}")
 519        print(f"Highest Energy Disulfide:      {ssMax_name}")
 520        print(f"Cα Distance Cutoff:            {cutoff:.2f} Å")
 521        print(f"Sγ Distance Cutoff:            {sg_cutoff:.2f} Å")
 522        print(f"Percentile Cutoff:             {percentile:.2f} %")
 523        if memusg:
 524            print(f"Total RAM Usage:            {ram:.2f} GB")
 525        print(f"     ⚡ proteusPy Version: {vers} ⚡")
 526        print("")
 527
 528        return
 529
 530    def display_overlay(
 531        self, pdbid: str = "", verbose: bool = False, spin: bool = False
 532    ) -> None:
 533        """
 534        Display all disulfides for a given PDB ID overlaid in stick mode against
 535        a common coordinate frame. This allows us to see all of the disulfides
 536        at one time in a single view. Colors vary smoothy between bonds.
 537
 538        :param self: DisulfideLoader object initialized with the database.
 539        :param pdbid: the PDB id string, e.g. 4yys
 540        :param verbose: If True, display progress bars, by default False
 541        :type verbose: bool
 542        :param spin: If True, spin the display, by default False
 543        :type spin: bool
 544        :return: None
 545
 546        Example:
 547        >>> import proteusPy as pp
 548
 549        Instantiate the Loader with the SS database subset.
 550
 551        >>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
 552
 553        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
 554        a common reference (the proximal disulfides).
 555
 556        >>> PDB_SS.display_overlay('4yys', verbose=False)
 557
 558        You can also slice the loader and display as an overly.
 559        >>> PDB_SS[:8].display_overlay(verbose=False)
 560
 561        """
 562
 563        try:
 564            ssbonds = self[pdbid]
 565        except KeyError:
 566            _logger.error("Cannot find key %s in SSBond DB", pdbid)
 567            return
 568
 569        ssbonds.display_overlay(verbose=verbose, spin=spin)
 570        return
 571
 572    def getTorsions(self, pdbID: Optional[str] = None) -> pd.DataFrame:
 573        """
 574        Return the torsions, distances and energies defined by Torsion_DF_cols
 575
 576        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
 577        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
 578            then return the entire dataset.
 579        :raises DisulfideParseWarning: Raised if not found
 580        :return: Torsions Dataframe
 581        :rtype: pd.DataFrame
 582
 583        Example:
 584        >>> import proteusPy as pp
 585        >>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
 586        >>> Tor_DF = PDB_SS.getTorsions()
 587        """
 588        res_df = pd.DataFrame()
 589
 590        if pdbID:
 591            try:
 592                res = self.SSDict[pdbID]
 593                sel = self.TorsionDF["source"] == pdbID
 594                res_df = self.TorsionDF[sel]
 595                return res_df.copy()
 596            except KeyError as e:
 597                mess = f"Cannot find key {pdbID} in SSBond DB"
 598                _logger.error(mess)
 599                raise DisulfideParseWarning(mess) from e
 600        else:
 601            return copy.deepcopy(self.TorsionDF)
 602
 603    def list_binary_classes(self) -> None:
 604        """Enumerate the binary classes"""
 605        for k, v in enumerate(self.tclass.binaryclass_dict):
 606            print(f"Class: |{k}|, |{v}|")
 607
 608    def plot_classes(
 609        self,
 610        base: int = 8,
 611        class_string: Optional[str] = None,
 612        theme: str = "auto",
 613        log: bool = False,
 614        paginated: bool = False,
 615        page_size: int = 200,
 616    ) -> None:
 617        """
 618        Plot the classes for the given base.
 619
 620        :param base: The base class to use, 2 or 8.
 621        :param class_string: The class string to plot.
 622        :param theme: The theme to use for the plot ('auto', 'light', or 'dark').
 623        :param log: Whether to use a log scale for the y-axis.
 624        :param paginated: Whether to paginate the plot.
 625        :param page_size: Number of items per page.
 626        """
 627        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 628
 629        DisulfideVisualization.plot_classes(
 630            self.tclass,
 631            class_string=class_string,
 632            base=base,
 633            theme=theme,
 634            log=log,
 635            page_size=page_size,
 636            paginated=paginated,
 637        )
 638
 639    def plot_classes_vs_cutoff(
 640        self,
 641        cutoff: float,
 642        steps: int = 50,
 643        base: int = 8,
 644        theme: str = "auto",
 645        verbose: bool = False,
 646    ) -> None:
 647        """
 648        Plot the total percentage and number of members for each octant class against the cutoff value.
 649
 650        :param cutoff: Percent cutoff value for filtering the classes.
 651        :type cutoff: float
 652        :param steps: Number of steps to take in the cutoff.
 653        :type steps: int
 654        :param base: The base class to use, 6 or 8.
 655        :type base: int
 656        :param theme: The theme to use for the plot ('auto', 'light', or 'dark'), defaults to 'auto'.
 657        :type theme: str
 658        :param verbose: Whether to display verbose output, defaults to False.
 659        :type verbose: bool
 660        :return: None
 661        :rtype: None
 662        """
 663        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 664
 665        DisulfideVisualization.plot_classes_vs_cutoff(
 666            self.tclass, cutoff, steps, base, theme, verbose
 667        )
 668
 669    def plot_binary_to_eightclass_incidence(
 670        self,
 671        theme: str = "light",
 672        save: bool = False,
 673        savedir: str = ".",
 674        verbose: bool = False,
 675        log: bool = False,
 676    ) -> None:
 677        """Plot the incidence of all octant Disulfide classes for a given binary class.
 678
 679        :param theme: The theme to use for the plot
 680        :type theme: str
 681        :param save: Whether to save the plots
 682        :type save: bool
 683        :param savedir: Directory to save plots to
 684        :type savedir: str
 685        :param verbose: Whether to display verbose output
 686        :type verbose: bool
 687        :param log: Whether to use a log scale for the y-axis
 688        :type log: bool
 689        :return: None
 690        :rtype: None
 691        """
 692
 693        DisulfideVisualization.plot_binary_to_eightclass_incidence(
 694            self.tclass,
 695            theme=theme,
 696            save=save,
 697            savedir=savedir,
 698            verbose=verbose,
 699            log=log,
 700        )
 701
 702    def plot_count_vs_class_df(
 703        self,
 704        class_string: str,
 705        title: str = "title",
 706        theme: str = "auto",
 707        save: bool = False,
 708        savedir: str = ".",
 709        base: int = 8,
 710        verbose: bool = False,
 711        log: bool = False,
 712        sample_size: Optional[int] = None,
 713        page_size: Optional[int] = None,
 714    ) -> None:
 715        """
 716        Plot a line graph of count vs class ID using Plotly for the given disulfide class. The
 717        base selects the class type to plot: 2, 6, or 8, for binary, sextant, or octant classes.
 718
 719        :param class_string: The binary class string to be plotted.
 720        :param title: A string representing the title of the plot (default is 'title').
 721        :param theme: Theme to use for the plot
 722        :param save: Whether to save the plot
 723        :param savedir: Directory to save the plot to
 724        :param base: Base for class IDs (2 or 8)
 725        :param verbose: Whether to display verbose output
 726        :param log: Whether to use log scale for y-axis
 727        :param sample_size: Number of items to sample
 728        :param page_size: Number of items per page
 729        """
 730        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 731        class_list = self.tclass.binary_to_class(class_string, base)
 732        df = self._enumerate_class_fromlist(class_list, base=base)
 733
 734        if sample_size:
 735            DisulfideVisualization.plot_count_vs_class_df_sampled(
 736                df,
 737                title,
 738                theme,
 739                save,
 740                savedir,
 741                base,
 742                verbose,
 743                log,
 744                sample_size,
 745            )
 746        elif page_size:
 747            DisulfideVisualization.plot_count_vs_class_df_paginated(
 748                df, title, theme, save, savedir, base, verbose, log, page_size
 749            )
 750        else:
 751            DisulfideVisualization.plot_count_vs_class_df(
 752                df, title, theme, save, savedir, base, verbose, log
 753            )
 754
 755    def plot_count_vs_classid(
 756        self,
 757        cls: Optional[str] = None,
 758        theme: str = "auto",
 759        base: int = 8,
 760        log: bool = True,
 761    ) -> None:
 762        """
 763        Plot a line graph of count vs class ID using Plotly.
 764
 765        :param cls: Specific class to plot (optional)
 766        :param theme: Theme to use for the plot
 767        :param base: Base for class IDs (2 or 8)
 768        :param log: Whether to use log scale for y-axis
 769        """
 770        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 771
 772        DisulfideVisualization.plot_count_vs_classid(self.tclass, cls, theme, base, log)
 773
 774    def _enumerate_class_fromlist(
 775        self, sslist: List[str], base: int = 8
 776    ) -> pd.DataFrame:
 777        """
 778        Enumerate the classes from a list of class IDs and return a DataFrame with class IDs and their corresponding counts.
 779        Results are cached for improved performance on repeated calls.
 780
 781        :param sslist: A list of class IDs to enumerate.
 782        :param base: The base value for the enumeration, by default 8.
 783        :return: A DataFrame with columns "class_id" and "count" representing the class IDs and their corresponding counts.
 784        """
 785        x = []
 786        y = []
 787
 788        for cls in sslist:
 789            if cls is not None:
 790                _y = self.tclass.sslist_from_classid(cls, base=base)
 791                # it's possible to have 0 SS in a class
 792                if _y is not None:
 793                    # only append if we have both.
 794                    x.append(cls)
 795                    y.append(len(_y))
 796
 797        sslist_df = pd.DataFrame(columns=["class_id", "count"])
 798        sslist_df["class_id"] = x
 799        sslist_df["count"] = y
 800        return sslist_df
 801
 802    def save(
 803        self,
 804        savepath: str = DATA_DIR,
 805        verbose: bool = False,
 806        fname: Optional[str] = None,
 807    ) -> None:
 808        """
 809        Save a copy of the fully instantiated Loader to the specified file.
 810
 811        :param savepath: Path to save the file, defaults to DATA_DIR
 812        :param fname: Filename, defaults to LOADER_FNAME
 813        :param verbose: Verbosity, defaults to False
 814        """
 815        self.version = __version__
 816
 817        fname = None
 818        if self.subset:
 819            fname = LOADER_SUBSET_FNAME
 820        else:
 821            fname = LOADER_FNAME
 822
 823        _fname = Path(savepath) / fname
 824
 825        if verbose:
 826            _logger.info("Writing Disulfide Loader to: %s...", _fname)
 827
 828        with open(str(_fname), "wb+") as f:
 829            pickle.dump(self, f)
 830
 831        if verbose:
 832            _logger.info("Done saving loader.")
 833
 834    def plot_disulfides_vs_pdbid(self, cutoff: int = 1) -> Tuple[List[str], List[int]]:
 835        """
 836        Plots the number of disulfides versus pdbid.
 837
 838        :param cutoff: The minimum number of disulfides a PDB ID must have to be included in the plot.
 839        :type cutoff: int
 840        :return: A tuple containing the list of PDB IDs and the corresponding number of disulfides.
 841        :rtype: tuple
 842        """
 843        pdbids = []
 844        num_disulfides = []
 845
 846        for pdbid, disulfides in self.SSDict.items():
 847            if len(disulfides) > cutoff:
 848                pdbids.append(pdbid)
 849                num_disulfides.append(len(disulfides))
 850
 851        # Create a DataFrame
 852        df = pd.DataFrame({"PDB ID": pdbids, "Number of Disulfides": num_disulfides})
 853        fig = px.bar(
 854            df,
 855            x="PDB ID",
 856            y="Number of Disulfides",
 857            title=f"Disulfides vs PDB ID with cutoff: {cutoff}, {len(pdbids)} PDB IDs",
 858        )
 859        fig.update_layout(
 860            xaxis_title="PDB ID",
 861            yaxis_title="Number of Disulfides",
 862            xaxis_tickangle=-90,
 863        )
 864        fig.show()
 865
 866        return pdbids, num_disulfides
 867
 868    def plot_distances(
 869        self,
 870        distance_type: str = "ca",
 871        cutoff: float = -1,
 872        comparison: str = "less",
 873        theme: str = "auto",
 874        log: bool = True,
 875    ) -> None:
 876        """
 877        Plot the distances for the disulfides in the loader.
 878
 879        :param distance_type: The type of distance to plot ('ca' for Cα-Cα distance, 'sg' for Sγ-Sγ distance)
 880        :param cutoff: The cutoff value for the distance, defaults to -1 (no cutoff)
 881        :param comparison: if 'less' then plot distances less than the cutoff, if 'greater' then plot distances greater than the cutoff
 882        :param theme: The theme to use for the plot ('auto', 'light', or 'dark')
 883        :param log: Whether to use a log scale for the y-axis
 884        """
 885        # from proteusPy.DisulfideVisualization import DisulfideVisualization
 886
 887        # sslist = self.SSList
 888        # distances = sslist.extract_distances(distance_type, comparison, cutoff)
 889
 890        self.SSList.plot_distances(
 891            distance_type=distance_type,
 892            cutoff=cutoff,
 893            comparison=comparison,
 894            theme=theme,
 895            log=log,
 896        )
 897
 898    def plot_deviation_scatterplots(
 899        self, verbose: bool = False, theme: str = "auto"
 900    ) -> None:
 901        """
 902        Plot scatter plots for Bondlength_Deviation, Angle_Deviation Ca_Distance
 903        and SG_Distance.
 904
 905        :param verbose: Whether to display the plot in the notebook. Default is False.
 906        :type verbose: bool
 907        :param theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
 908        :type light: str
 909        :return: None
 910        """
 911        self.SSList.plot_deviation_scatterplots(verbose=verbose, theme=theme)
 912
 913    def plot_deviation_histograms(
 914        self, theme: str = "auto", verbose: bool = True
 915    ) -> None:
 916        """
 917        Plot histograms for Bondlength_Deviation, Angle_Deviation, and Ca_Distance.
 918        """
 919        self.SSList.plot_deviation_histograms(theme=theme, verbose=verbose)
 920
 921    def sslist_from_class(
 922        self, class_string: str, base: int = 8, cutoff: float = 0.0
 923    ) -> DisulfideList:
 924        """
 925        Return a DisulfideList containing Disulfides with the given class_string.
 926
 927        :param class_string: The class string to search for.
 928        :param base: The base of the class string. Default is 8.
 929        :param cutoff: The % cutoff value for the class. Default is 0.0.
 930        :return: DisulfideList containing Disulfides with the given class_string.
 931        """
 932        sslist_name = f"{class_string}_{base}_{cutoff:.2f}"
 933        sslist = DisulfideList([], sslist_name)
 934
 935        indices = self._class_indices_from_tors_df(class_string, base=base)
 936
 937        for i in indices:
 938            sslist.append(self[i])
 939
 940        return sslist
 941
 942    def display_torsion_statistics(
 943        self,
 944        class_id: Optional[str] = None,
 945        display: bool = True,
 946        save: bool = False,
 947        fname: str = "ss_torsions.png",
 948        theme: str = "auto",
 949        verbose: bool = False,
 950        dpi: int = 300,
 951        figure_size: tuple[int, int] = (4, 3),
 952    ) -> None:
 953        """
 954        Display torsion and distance statistics for all Disulfides in the loader.
 955        If a class ID is provided, display statistics for that class only.
 956
 957        :param class_id: The class ID to display statistics for. Default is None.
 958        :type class_id: str
 959        :param display: Whether to display the plot in the notebook. Default is True.
 960        :type display: bool
 961        :param save: Whether to save the plot as an image file. Default is False.
 962        :type save: bool
 963        :param fname: The name of the image file to save. Default is 'ss_torsions.png'.
 964        :type fname: str
 965        :param theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
 966        :type theme: str
 967        :param verbose: Whether to display verbose output. Default is False.
 968        :type verbose: bool
 969        :param dpi: Dots per inch for the plot. Default is 300.
 970        :type dpi: int
 971        :param figure_size: Size of the figure as a tuple (width, height). Default is (4, 3).
 972        :type figure_size: tuple
 973        :return: None
 974        """
 975        if class_id:
 976            DisulfideVisualization.display_torsion_class_df(
 977                self.TorsionDF,
 978                class_id,
 979                display=display,
 980                save=save,
 981                fname=fname,
 982                theme=theme,
 983                dpi=dpi,
 984                figure_size=figure_size,
 985            )
 986        else:
 987            self.SSList.display_torsion_statistics(
 988                display=display,
 989                save=save,
 990                fname=fname,
 991                theme=theme,
 992                verbose=verbose,
 993                dpi=dpi,
 994                figure_size=figure_size,
 995            )
 996
 997    def classes_vs_cutoff(self, cutoff: float, base: int = 8) -> int:
 998        """
 999        Return number of members for the octant class for a given cutoff value.
1000
1001        :param cutoff: Percent cutoff value for filtering the classes.
1002        :return: None
1003        """
1004
1005        class_df = self.tclass.filter_class_by_percentage(cutoff, base=base)
1006        return class_df.shape[0]
1007
1008    def display_torsion_class_df(
1009        self,
1010        class_id: str,
1011        display: bool = True,
1012        save: bool = False,
1013        fname: str = "ss_torsions.png",
1014        theme: str = "auto",
1015        dpi: int = 300,
1016        figure_size: tuple[int, int] = (4, 3),
1017    ) -> None:
1018        """
1019        Display torsion and distance statistics for a given class ID using the TorsionDF dataframe.
1020
1021        :param class_id: The class ID to display statistics for (e.g. '11111b' for binary or '11111o' for octant)
1022        :param display: Whether to display the plot in the notebook
1023        :param save: Whether to save the plot as an image file
1024        :param fname: The name of the image file to save
1025        :param theme: The theme to use for the plot ('auto', 'light', or 'dark')
1026        :param dpi: DPI (dots per inch) for the saved image, controls the resolution
1027        :param figure_size: Tuple of (width, height) in inches for the figure size
1028        """
1029
1030        DisulfideVisualization.display_torsion_class_df(
1031            self.TorsionDF,
1032            class_id,
1033            display=display,
1034            save=save,
1035            fname=fname,
1036            theme=theme,
1037            dpi=dpi,
1038            figure_size=figure_size,
1039        )
1040
1041    def plot_3d_hexbin_leftright(
1042        self,
1043        width: int = 800,
1044        height: int = 600,
1045        gridsize: int = 80,
1046        tormin: float = -180.0,
1047        tormax: float = 180.0,
1048        scaling: str = "sqrt",
1049        column1: str = "chi2",
1050        column2: str = "chi4",
1051        title: Optional[str] = None,
1052    ) -> None:
1053        """
1054        Create 3D hexbin plots for left and right-handed chi2-chi4 correlations with customizable z-scaling.
1055
1056        :param loader: Loader object to retrieve torsion data
1057        :type loader: proteusPy.PDB_SS
1058        :param width: Window width in pixels
1059        :type width: int, optional
1060        :default width: 800
1061        :param height: Window height in pixels
1062        :type height: int, optional
1063        :default height: 600
1064        :param gridsize: Number of bins for hexbin
1065        :type gridsize: int, optional
1066        :default gridsize: 30
1067        :param tormin: Minimum torsion angle
1068        :type tormin: float, optional
1069        :default tormin: -180.0
1070        :param tormax: Maximum torsion angle
1071        :type tormax: float, optional
1072        :default tormax: 180.0
1073        :param scaling: Scaling method for z-values ('linear', 'sqrt', 'log', 'power')
1074        :type scaling: str, optional
1075        :default scaling: 'sqrt'
1076        :param column1: Name of the first column (x-axis)
1077        :type column1: str, optional
1078        :default column1: 'chi2'
1079        :param column2: Name of the second column (y-axis)
1080        :type column2: str, optional
1081        :default column2: 'chi4'
1082        :param title: Title of the plot
1083        :type title: str, optional
1084        """
1085
1086        DisulfideVisualization.plot_3d_hexbin_leftright(
1087            self,
1088            width=width,
1089            height=height,
1090            gridsize=gridsize,
1091            tormin=tormin,
1092            tormax=tormax,
1093            scaling=scaling,
1094            column1=column1,
1095            column2=column2,
1096            title=title,
1097        )

This class represents the disulfide database itself and is its primary means of accession. The entirety of the RCSB disulfide database is stored within the class via a proteusPy.DisulfideList, a Pandas .csv file, and a dict of indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow simple, direct and flexible access to the disulfide structures contained herein. This makes it possible to access the disulfides by array index, PDB structure ID, disulfide name and class ID.

The class also provides methods for plotting distance and angle deviations as well as torsion statistics for the disulfides in the database.

The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the display_overlay() method. See below for examples.

Important note: For typical usage one will access the database via the Load_PDB_SS() function. The difference is that the latter function loads the compressed database from its single source. The DisulfideLoader class is used to build the Disulfide database with a specific cutoff, or for saving the database to a file.

Cutoff values of -1.0 indicate imposing no cutoffs on the data.

Parameters
  • verbose: Flag to control output verbosity
  • datadir: Directory containing data files
  • picklefile: Name of the pickle file containing disulfide data
  • quiet: Flag to suppress warnings
  • subset: Flag to load only a subset of data
  • cutoff: Distance cutoff, (A) for filtering disulfides. Defaults to -1.0.
  • sg_cutoff: SG distance cutoff, (A) for filtering disulfides. Defaults to -1.0.
  • percentile: Percentile cutoff for filtering disulfides. Must be between 0 and 100. Filters based on statistical cutoffs derived from the data.
  • minimum: Minimum atom distance for filtering disulfides. -1 is no filtering.
  • saveit: Flag to save the Loader to a file
DisulfideLoader( datadir: str = '/Users/egs/repos/proteusPy/proteusPy/data', picklefile: str = 'PDB_all_ss.pkl', subset: bool = False, cutoff: float = -1.0, sg_cutoff: float = -1.0, verbose: bool = False, percentile: float = -1.0, quiet: bool = False, minimum: float = -1.0, saveit: bool = False)
datadir: str = '/Users/egs/repos/proteusPy/proteusPy/data'
picklefile: str = 'PDB_all_ss.pkl'
subset: bool = False
cutoff: float = -1.0
sg_cutoff: float = -1.0
verbose: bool = False
percentile: float = -1.0
quiet: bool = False
minimum: float = -1.0
saveit: bool = False
SSDict: Dict
TorsionDF: pandas.core.frame.DataFrame
TotalDisulfides: int = 0
IDList: List
timestamp: float
version: str = '0.99.35.dev0'
average_resolution: float
324    @property
325    def average_resolution(self) -> float:
326        """
327        Return the average structure resolution for the given list.
328        Result is cached since resolution values don't change after loading.
329
330        :return: Average resolution (A)
331        """
332        sslist = self.SSList
333        valid_resolutions = [
334            ss.resolution
335            for ss in sslist
336            if ss.resolution is not None and ss.resolution != -1.0
337        ]
338
339        if not valid_resolutions:
340            return -1.0
341
342        return sum(valid_resolutions) / len(valid_resolutions)

Return the average structure resolution for the given list. Entries without a usable resolution (None or -1.0) are ignored, and the value is recomputed on each access.

Returns

Average resolution (A)

def binary_to_class(self, binary_class: str, base: int = 8) -> list[str]:
344    def binary_to_class(self, binary_class: str, base: int = 8) -> list[str]:
345        """
346        Convert a binary class string to an octant class string.
347
348        :param binary_class: The binary class string to convert.
349        :param base: The base class to use, 2 or 8.
350        :return: The octant class list.
351        """
352        return self.tclass.binary_to_class(binary_class, base)

Convert a binary class string to an octant class string.

Parameters
  • binary_class: The binary class string to convert.
  • base: The base class to use, 2 or 8.
Returns

The octant class list.

def build_ss_from_idlist(self, idlist: List[str]) -> proteusPy.DisulfideBase.DisulfideList:
354    def build_ss_from_idlist(self, idlist: List[str]) -> DisulfideList:
355        """
356        Return a DisulfideList of Disulfides for a given list of PDBIDs
357
358        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
359        :return: DisulfideList
360        """
361        res = DisulfideList([], "RCSB_list")
362        for pdbid, sslist in self.SSDict.items():
363            if pdbid in idlist:
364                for ssid in sslist:
365                    res.append(self.SSList[ssid])
366        return res

Return a DisulfideList of Disulfides for a given list of PDBIDs

Parameters
  • idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
Returns

DisulfideList

def copy(self) -> DisulfideLoader:
    def copy(self) -> "DisulfideLoader":
        """
        Return a deep copy of self.

        The copy is produced with ``copy.deepcopy``, so the returned loader
        shares no mutable state with the original.

        :return: Deep copy of self
        """
        # ``copy`` here resolves to the module-level ``copy`` import, not this method.
        return copy.deepcopy(self)

Return a copy of self.

Returns

Copy of self

def get_class_df(self, base: int = 8) -> pandas.core.frame.DataFrame:
423    def get_class_df(self, base: int = 8) -> pd.DataFrame:
424        """
425        Return the class incidence dataframe for the input base.
426        Result is cached since class distributions don't change after loading.
427
428        :param base: The base class to use, 2 or 8.
429        :return: pd.DataFrame
430        """
431        return self.tclass.get_class_df(base)

Return the class incidence dataframe for the input base. Result is cached since class distributions don't change after loading.

Parameters
  • base: The base class to use, 2 or 8.
Returns

pd.DataFrame

def extract_class( self, clsid: str, verbose: bool = False) -> proteusPy.DisulfideBase.DisulfideList:
433    def extract_class(self, clsid: str, verbose: bool = False) -> DisulfideList:
434        """
435        Return the list of disulfides corresponding to the input `clsid`.
436
437        :param clsid: The class name to extract.
438        :param verbose: If True, display progress bars, by default False
439        :return: The list of disulfide bonds from the class.
440        """
441
442        # cls = clsid[:5]
443        cls = clsid
444        ss_ids = []
445        class_disulfides = None
446
447        try:
448            ss_ids = self.tclass[clsid]
449
450        except KeyError:
451            _logger.error("Cannot find key %s in SSBond DB", clsid)
452            return DisulfideList([], cls, quiet=True)
453
454        tot_ss = len(ss_ids)
455        class_disulfides = DisulfideList([], cls, quiet=True)
456
457        _pbar = (
458            tqdm(range(tot_ss), total=tot_ss, leave=True) if verbose else range(tot_ss)
459        )
460
461        for idx in _pbar:
462            ssid = ss_ids[idx]
463            class_disulfides.append(self[ssid])
464
465        return class_disulfides

Return the list of disulfides corresponding to the input clsid.

Parameters
  • clsid: The class name to extract.
  • verbose: If True, display progress bars, by default False
Returns

The list of disulfide bonds from the class.

def getlist(self) -> proteusPy.DisulfideBase.DisulfideList:
467    def getlist(self) -> DisulfideList:
468        """
469        Return the list of Disulfides contained in the class.
470
471        :return: DisulfideList
472        :rtype: DisulfideList
473        """
474        return copy.deepcopy(self.SSList)

Return the list of Disulfides contained in the class.

Returns

DisulfideList

def get_by_name(self, name: str = None) -> Optional[proteusPy.DisulfideBase.Disulfide]:
476    def get_by_name(self, name: str = None) -> Optional[Disulfide]:
477        """
478        Return the Disulfide with the given name from the list.
479        Result is cached since disulfide data doesn't change after loading.
480        """
481        for ss in self.SSList.data:
482            if ss.name == name:
483                return ss  # or ss.copy() !!!
484        return None

Return the Disulfide with the given name from the list, or None if no Disulfide has that name.

def describe(self, memusg: bool = False) -> None:
486    def describe(self, memusg: bool = False) -> None:
487        """
488        Reveal key details about the Disulfide database stored in `self`. If `memusg` is True,
489        the total RAM usage of the object is calculated and displayed — note that this process
490        may take around 30 seconds on a 2024 MacBook Pro, M3 Max.
491
492        :param memusg: Set to True to include the RAM usage of the `DisulfideLoader` object.
493        :return: None — just the facts!
494        """
495        # pylint: disable=E1101
496        vers = self.version
497        tot = self.TotalDisulfides
498        pdbs = len(self.SSDict)
499        ram = 0
500        if memusg:
501            ram = asizeof.asizeof(self) / (1024 * 1024 * 1024)
502
503        res = self.average_resolution
504        cutoff = self.cutoff
505        sg_cutoff = self.sg_cutoff
506        percentile = self.percentile
507        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
508        ssMinMax = self.SSList.minmax_energy
509        ssMin_name: Disulfide = ssMinMax[0].name
510        ssMax_name: Disulfide = ssMinMax[1].name
511
512        print("")
513        print("    🌟 RCSB Disulfide Database Summary 🌟")
514        print(f"       🕒 Constructed: {timestr} 🕒")
515        print(f"PDB IDs Present:               {pdbs}")
516        print(f"Disulfides Loaded:             {tot}")
517        print(f"Average Resolution:            {res:.2f} Å")
518        print(f"Lowest Energy Disulfide:       {ssMin_name}")
519        print(f"Highest Energy Disulfide:      {ssMax_name}")
520        print(f"Cα Distance Cutoff:            {cutoff:.2f} Å")
521        print(f"Sγ Distance Cutoff:            {sg_cutoff:.2f} Å")
522        print(f"Percentile Cutoff:             {percentile:.2f} %")
523        if memusg:
524            print(f"Total RAM Usage:            {ram:.2f} GB")
525        print(f"     ⚡ proteusPy Version: {vers} ⚡")
526        print("")
527
528        return

Reveal key details about the Disulfide database stored in self. If memusg is True, the total RAM usage of the object is calculated and displayed — note that this process may take around 30 seconds on a 2024 MacBook Pro, M3 Max.

Parameters
  • memusg: Set to True to include the RAM usage of the DisulfideLoader object.
Returns

None — just the facts!

def display_overlay(self, pdbid: str = '', verbose: bool = False, spin: bool = False) -> None:
530    def display_overlay(
531        self, pdbid: str = "", verbose: bool = False, spin: bool = False
532    ) -> None:
533        """
534        Display all disulfides for a given PDB ID overlaid in stick mode against
535        a common coordinate frame. This allows us to see all of the disulfides
536        at one time in a single view. Colors vary smoothy between bonds.
537
538        :param self: DisulfideLoader object initialized with the database.
539        :param pdbid: the PDB id string, e.g. 4yys
540        :param verbose: If True, display progress bars, by default False
541        :type verbose: bool
542        :param spin: If True, spin the display, by default False
543        :type spin: bool
544        :return: None
545
546        Example:
547        >>> import proteusPy as pp
548
549        Instantiate the Loader with the SS database subset.
550
551        >>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
552
553        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
554        a common reference (the proximal disulfides).
555
556        >>> PDB_SS.display_overlay('4yys', verbose=False)
557
558        You can also slice the loader and display as an overly.
559        >>> PDB_SS[:8].display_overlay(verbose=False)
560
561        """
562
563        try:
564            ssbonds = self[pdbid]
565        except KeyError:
566            _logger.error("Cannot find key %s in SSBond DB", pdbid)
567            return
568
569        ssbonds.display_overlay(verbose=verbose, spin=spin)
570        return

Display all disulfides for a given PDB ID overlaid in stick mode against a common coordinate frame. This allows us to see all of the disulfides at one time in a single view. Colors vary smoothly between bonds.

Parameters
  • self: DisulfideLoader object initialized with the database.
  • pdbid: the PDB id string, e.g. 4yys
  • verbose: If True, display progress bars, by default False
  • spin: If True, spin the display, by default False
Returns

None

Example:

>>> import proteusPy as pp

Instantiate the Loader with the SS database subset.

>>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)

Display the Disulfides from the PDB ID 4yys, overlaid onto a common reference (the proximal disulfides).

>>> PDB_SS.display_overlay('4yys', verbose=False)

You can also slice the loader and display the slice as an overlay.

>>> PDB_SS[:8].display_overlay(verbose=False)
def getTorsions(self, pdbID: Optional[str] = None) -> pandas.core.frame.DataFrame:
572    def getTorsions(self, pdbID: Optional[str] = None) -> pd.DataFrame:
573        """
574        Return the torsions, distances and energies defined by Torsion_DF_cols
575
576        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
577        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
578            then return the entire dataset.
579        :raises DisulfideParseWarning: Raised if not found
580        :return: Torsions Dataframe
581        :rtype: pd.DataFrame
582
583        Example:
584        >>> import proteusPy as pp
585        >>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
586        >>> Tor_DF = PDB_SS.getTorsions()
587        """
588        res_df = pd.DataFrame()
589
590        if pdbID:
591            try:
592                res = self.SSDict[pdbID]
593                sel = self.TorsionDF["source"] == pdbID
594                res_df = self.TorsionDF[sel]
595                return res_df.copy()
596            except KeyError as e:
597                mess = f"Cannot find key {pdbID} in SSBond DB"
598                _logger.error(mess)
599                raise DisulfideParseWarning(mess) from e
600        else:
601            return copy.deepcopy(self.TorsionDF)

Return the torsions, distances and energies defined by Torsion_DF_cols

Parameters
  • pdbID: pdbID, defaults to None, meaning return entire dataset.
Raises
  • DisulfideParseWarning: Raised if not found
Returns

Torsions Dataframe

Example:

>>> import proteusPy as pp
>>> PDB_SS = pp.Load_PDB_SS(verbose=False, subset=True)
>>> Tor_DF = PDB_SS.getTorsions()
def list_binary_classes(self) -> None:
603    def list_binary_classes(self) -> None:
604        """Enumerate the binary classes"""
605        for k, v in enumerate(self.tclass.binaryclass_dict):
606            print(f"Class: |{k}|, |{v}|")

Enumerate the binary classes

def plot_classes( self, base: int = 8, class_string: Optional[str] = None, theme: str = 'auto', log: bool = False, paginated: bool = False, page_size: int = 200) -> None:
608    def plot_classes(
609        self,
610        base: int = 8,
611        class_string: Optional[str] = None,
612        theme: str = "auto",
613        log: bool = False,
614        paginated: bool = False,
615        page_size: int = 200,
616    ) -> None:
617        """
618        Plot the classes for the given base.
619
620        :param base: The base class to use, 2 or 8.
621        :param class_string: The class string to plot.
622        :param theme: The theme to use for the plot ('auto', 'light', or 'dark').
623        :param log: Whether to use a log scale for the y-axis.
624        :param paginated: Whether to paginate the plot.
625        :param page_size: Number of items per page.
626        """
627        # from proteusPy.DisulfideVisualization import DisulfideVisualization
628
629        DisulfideVisualization.plot_classes(
630            self.tclass,
631            class_string=class_string,
632            base=base,
633            theme=theme,
634            log=log,
635            page_size=page_size,
636            paginated=paginated,
637        )

Plot the classes for the given base.

Parameters
  • base: The base class to use, 2 or 8.
  • class_string: The class string to plot.
  • theme: The theme to use for the plot ('auto', 'light', or 'dark').
  • log: Whether to use a log scale for the y-axis.
  • paginated: Whether to paginate the plot.
  • page_size: Number of items per page.
def plot_classes_vs_cutoff( self, cutoff: float, steps: int = 50, base: int = 8, theme: str = 'auto', verbose: bool = False) -> None:
639    def plot_classes_vs_cutoff(
640        self,
641        cutoff: float,
642        steps: int = 50,
643        base: int = 8,
644        theme: str = "auto",
645        verbose: bool = False,
646    ) -> None:
647        """
648        Plot the total percentage and number of members for each octant class against the cutoff value.
649
650        :param cutoff: Percent cutoff value for filtering the classes.
651        :type cutoff: float
652        :param steps: Number of steps to take in the cutoff.
653        :type steps: int
654        :param base: The base class to use, 6 or 8.
655        :type base: int
656        :param theme: The theme to use for the plot ('auto', 'light', or 'dark'), defaults to 'auto'.
657        :type theme: str
658        :param verbose: Whether to display verbose output, defaults to False.
659        :type verbose: bool
660        :return: None
661        :rtype: None
662        """
663        # from proteusPy.DisulfideVisualization import DisulfideVisualization
664
665        DisulfideVisualization.plot_classes_vs_cutoff(
666            self.tclass, cutoff, steps, base, theme, verbose
667        )

Plot the total percentage and number of members for each octant class against the cutoff value.

Parameters
  • cutoff: Percent cutoff value for filtering the classes.
  • steps: Number of steps to take in the cutoff.
  • base: The base class to use, 6 or 8.
  • theme: The theme to use for the plot ('auto', 'light', or 'dark'), defaults to 'auto'.
  • verbose: Whether to display verbose output, defaults to False.
Returns

None

def plot_binary_to_eightclass_incidence( self, theme: str = 'light', save: bool = False, savedir: str = '.', verbose: bool = False, log: bool = False) -> None:
669    def plot_binary_to_eightclass_incidence(
670        self,
671        theme: str = "light",
672        save: bool = False,
673        savedir: str = ".",
674        verbose: bool = False,
675        log: bool = False,
676    ) -> None:
677        """Plot the incidence of all octant Disulfide classes for a given binary class.
678
679        :param theme: The theme to use for the plot
680        :type theme: str
681        :param save: Whether to save the plots
682        :type save: bool
683        :param savedir: Directory to save plots to
684        :type savedir: str
685        :param verbose: Whether to display verbose output
686        :type verbose: bool
687        :param log: Whether to use a log scale for the y-axis
688        :type log: bool
689        :return: None
690        :rtype: None
691        """
692
693        DisulfideVisualization.plot_binary_to_eightclass_incidence(
694            self.tclass,
695            theme=theme,
696            save=save,
697            savedir=savedir,
698            verbose=verbose,
699            log=log,
700        )

Plot the incidence of all octant Disulfide classes for a given binary class.

Parameters
  • theme: The theme to use for the plot
  • save: Whether to save the plots
  • savedir: Directory to save plots to
  • verbose: Whether to display verbose output
  • log: Whether to use a log scale for the y-axis
Returns

None

def plot_count_vs_class_df( self, class_string: str, title: str = 'title', theme: str = 'auto', save: bool = False, savedir: str = '.', base: int = 8, verbose: bool = False, log: bool = False, sample_size: Optional[int] = None, page_size: Optional[int] = None) -> None:
702    def plot_count_vs_class_df(
703        self,
704        class_string: str,
705        title: str = "title",
706        theme: str = "auto",
707        save: bool = False,
708        savedir: str = ".",
709        base: int = 8,
710        verbose: bool = False,
711        log: bool = False,
712        sample_size: Optional[int] = None,
713        page_size: Optional[int] = None,
714    ) -> None:
715        """
716        Plot a line graph of count vs class ID using Plotly for the given disulfide class. The
717        base selects the class type to plot: 2, 6, or 8, for binary, sextant, or octant classes.
718
719        :param class_string: The binary class string to be plotted.
720        :param title: A string representing the title of the plot (default is 'title').
721        :param theme: Theme to use for the plot
722        :param save: Whether to save the plot
723        :param savedir: Directory to save the plot to
724        :param base: Base for class IDs (2 or 8)
725        :param verbose: Whether to display verbose output
726        :param log: Whether to use log scale for y-axis
727        :param sample_size: Number of items to sample
728        :param page_size: Number of items per page
729        """
730        # from proteusPy.DisulfideVisualization import DisulfideVisualization
731        class_list = self.tclass.binary_to_class(class_string, base)
732        df = self._enumerate_class_fromlist(class_list, base=base)
733
734        if sample_size:
735            DisulfideVisualization.plot_count_vs_class_df_sampled(
736                df,
737                title,
738                theme,
739                save,
740                savedir,
741                base,
742                verbose,
743                log,
744                sample_size,
745            )
746        elif page_size:
747            DisulfideVisualization.plot_count_vs_class_df_paginated(
748                df, title, theme, save, savedir, base, verbose, log, page_size
749            )
750        else:
751            DisulfideVisualization.plot_count_vs_class_df(
752                df, title, theme, save, savedir, base, verbose, log
753            )

Plot a line graph of count vs class ID using Plotly for the given disulfide class. The base selects the class type to plot: 2, 6, or 8, for binary, sextant, or octant classes.

Parameters
  • class_string: The binary class string to be plotted.
  • title: A string representing the title of the plot (default is 'title').
  • theme: Theme to use for the plot
  • save: Whether to save the plot
  • savedir: Directory to save the plot to
  • base: Base for class IDs (2 or 8)
  • verbose: Whether to display verbose output
  • log: Whether to use log scale for y-axis
  • sample_size: Number of items to sample
  • page_size: Number of items per page
def plot_count_vs_classid( self, cls: Optional[str] = None, theme: str = 'auto', base: int = 8, log: bool = True) -> None:
755    def plot_count_vs_classid(
756        self,
757        cls: Optional[str] = None,
758        theme: str = "auto",
759        base: int = 8,
760        log: bool = True,
761    ) -> None:
762        """
763        Plot a line graph of count vs class ID using Plotly.
764
765        :param cls: Specific class to plot (optional)
766        :param theme: Theme to use for the plot
767        :param base: Base for class IDs (2 or 8)
768        :param log: Whether to use log scale for y-axis
769        """
770        # from proteusPy.DisulfideVisualization import DisulfideVisualization
771
772        DisulfideVisualization.plot_count_vs_classid(self.tclass, cls, theme, base, log)

Plot a line graph of count vs class ID using Plotly.

Parameters
  • cls: Specific class to plot (optional)
  • theme: Theme to use for the plot
  • base: Base for class IDs (2 or 8)
  • log: Whether to use log scale for y-axis
def save( self, savepath: str = '/Users/egs/repos/proteusPy/proteusPy/data', verbose: bool = False, fname: Optional[str] = None) -> None:
802    def save(
803        self,
804        savepath: str = DATA_DIR,
805        verbose: bool = False,
806        fname: Optional[str] = None,
807    ) -> None:
808        """
809        Save a copy of the fully instantiated Loader to the specified file.
810
811        :param savepath: Path to save the file, defaults to DATA_DIR
812        :param fname: Filename, defaults to LOADER_FNAME
813        :param verbose: Verbosity, defaults to False
814        """
815        self.version = __version__
816
817        fname = None
818        if self.subset:
819            fname = LOADER_SUBSET_FNAME
820        else:
821            fname = LOADER_FNAME
822
823        _fname = Path(savepath) / fname
824
825        if verbose:
826            _logger.info("Writing Disulfide Loader to: %s...", _fname)
827
828        with open(str(_fname), "wb+") as f:
829            pickle.dump(self, f)
830
831        if verbose:
832            _logger.info("Done saving loader.")

Save a copy of the fully instantiated Loader to the specified file.

Parameters
  • savepath: Path to save the file, defaults to DATA_DIR
  • fname: Filename, defaults to LOADER_FNAME
  • verbose: Verbosity, defaults to False
def plot_disulfides_vs_pdbid(self, cutoff: int = 1) -> Tuple[List[str], List[int]]:
834    def plot_disulfides_vs_pdbid(self, cutoff: int = 1) -> Tuple[List[str], List[int]]:
835        """
836        Plots the number of disulfides versus pdbid.
837
838        :param cutoff: The minimum number of disulfides a PDB ID must have to be included in the plot.
839        :type cutoff: int
840        :return: A tuple containing the list of PDB IDs and the corresponding number of disulfides.
841        :rtype: tuple
842        """
843        pdbids = []
844        num_disulfides = []
845
846        for pdbid, disulfides in self.SSDict.items():
847            if len(disulfides) > cutoff:
848                pdbids.append(pdbid)
849                num_disulfides.append(len(disulfides))
850
851        # Create a DataFrame
852        df = pd.DataFrame({"PDB ID": pdbids, "Number of Disulfides": num_disulfides})
853        fig = px.bar(
854            df,
855            x="PDB ID",
856            y="Number of Disulfides",
857            title=f"Disulfides vs PDB ID with cutoff: {cutoff}, {len(pdbids)} PDB IDs",
858        )
859        fig.update_layout(
860            xaxis_title="PDB ID",
861            yaxis_title="Number of Disulfides",
862            xaxis_tickangle=-90,
863        )
864        fig.show()
865
866        return pdbids, num_disulfides

Plots the number of disulfides versus pdbid.

Parameters
  • cutoff: A PDB ID is included in the plot only if it has more than this number of disulfides.
Returns

A tuple containing the list of PDB IDs and the corresponding number of disulfides.

def plot_distances( self, distance_type: str = 'ca', cutoff: float = -1, comparison: str = 'less', theme: str = 'auto', log: bool = True) -> None:
868    def plot_distances(
869        self,
870        distance_type: str = "ca",
871        cutoff: float = -1,
872        comparison: str = "less",
873        theme: str = "auto",
874        log: bool = True,
875    ) -> None:
876        """
877        Plot the distances for the disulfides in the loader.
878
879        :param distance_type: The type of distance to plot ('ca' for Cα-Cα distance, 'sg' for Sγ-Sγ distance)
880        :param cutoff: The cutoff value for the distance, defaults to -1 (no cutoff)
881        :param comparison: if 'less' then plot distances less than the cutoff, if 'greater' then plot distances greater than the cutoff
882        :param theme: The theme to use for the plot ('auto', 'light', or 'dark')
883        :param log: Whether to use a log scale for the y-axis
884        """
885        # from proteusPy.DisulfideVisualization import DisulfideVisualization
886
887        # sslist = self.SSList
888        # distances = sslist.extract_distances(distance_type, comparison, cutoff)
889
890        self.SSList.plot_distances(
891            distance_type=distance_type,
892            cutoff=cutoff,
893            comparison=comparison,
894            theme=theme,
895            log=log,
896        )

Plot the distances for the disulfides in the loader.

Parameters
  • distance_type: The type of distance to plot ('ca' for Cα-Cα distance, 'sg' for Sγ-Sγ distance)
  • cutoff: The cutoff value for the distance, defaults to -1 (no cutoff)
  • comparison: if 'less' then plot distances less than the cutoff, if 'greater' then plot distances greater than the cutoff
  • theme: The theme to use for the plot ('auto', 'light', or 'dark')
  • log: Whether to use a log scale for the y-axis
def plot_deviation_scatterplots(self, verbose: bool = False, theme: str = 'auto') -> None:
898    def plot_deviation_scatterplots(
899        self, verbose: bool = False, theme: str = "auto"
900    ) -> None:
901        """
902        Plot scatter plots for Bondlength_Deviation, Angle_Deviation Ca_Distance
903        and SG_Distance.
904
905        :param verbose: Whether to display the plot in the notebook. Default is False.
906        :type verbose: bool
907        :param theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
908        :type light: str
909        :return: None
910        """
911        self.SSList.plot_deviation_scatterplots(verbose=verbose, theme=theme)

Plot scatter plots for Bondlength_Deviation, Angle_Deviation Ca_Distance and SG_Distance.

Parameters
  • verbose: Whether to display the plot in the notebook. Default is False.
  • theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
Returns

None

def plot_deviation_histograms(self, theme: str = 'auto', verbose: bool = True) -> None:
913    def plot_deviation_histograms(
914        self, theme: str = "auto", verbose: bool = True
915    ) -> None:
916        """
917        Plot histograms for Bondlength_Deviation, Angle_Deviation, and Ca_Distance.
918        """
919        self.SSList.plot_deviation_histograms(theme=theme, verbose=verbose)

Plot histograms for Bondlength_Deviation, Angle_Deviation, and Ca_Distance.

def sslist_from_class( self, class_string: str, base: int = 8, cutoff: float = 0.0) -> proteusPy.DisulfideBase.DisulfideList:
921    def sslist_from_class(
922        self, class_string: str, base: int = 8, cutoff: float = 0.0
923    ) -> DisulfideList:
924        """
925        Return a DisulfideList containing Disulfides with the given class_string.
926
927        :param class_string: The class string to search for.
928        :param base: The base of the class string. Default is 8.
929        :param cutoff: The % cutoff value for the class. Default is 0.0.
930        :return: DisulfideList containing Disulfides with the given class_string.
931        """
932        sslist_name = f"{class_string}_{base}_{cutoff:.2f}"
933        sslist = DisulfideList([], sslist_name)
934
935        indices = self._class_indices_from_tors_df(class_string, base=base)
936
937        for i in indices:
938            sslist.append(self[i])
939
940        return sslist

Return a DisulfideList containing Disulfides with the given class_string.

Parameters
  • class_string: The class string to search for.
  • base: The base of the class string. Default is 8.
  • cutoff: The % cutoff value for the class. Default is 0.0.
Returns

DisulfideList containing Disulfides with the given class_string.

def display_torsion_statistics( self, class_id: Optional[str] = None, display: bool = True, save: bool = False, fname: str = 'ss_torsions.png', theme: str = 'auto', verbose: bool = False, dpi: int = 300, figure_size: tuple[int, int] = (4, 3)) -> None:
942    def display_torsion_statistics(
943        self,
944        class_id: Optional[str] = None,
945        display: bool = True,
946        save: bool = False,
947        fname: str = "ss_torsions.png",
948        theme: str = "auto",
949        verbose: bool = False,
950        dpi: int = 300,
951        figure_size: tuple[int, int] = (4, 3),
952    ) -> None:
953        """
954        Display torsion and distance statistics for all Disulfides in the loader.
955        If a class ID is provided, display statistics for that class only.
956
957        :param class_id: The class ID to display statistics for. Default is None.
958        :type class_id: str
959        :param display: Whether to display the plot in the notebook. Default is True.
960        :type display: bool
961        :param save: Whether to save the plot as an image file. Default is False.
962        :type save: bool
963        :param fname: The name of the image file to save. Default is 'ss_torsions.png'.
964        :type fname: str
965        :param theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
966        :type theme: str
967        :param verbose: Whether to display verbose output. Default is False.
968        :type verbose: bool
969        :param dpi: Dots per inch for the plot. Default is 300.
970        :type dpi: int
971        :param figure_size: Size of the figure as a tuple (width, height). Default is (4, 3).
972        :type figure_size: tuple
973        :return: None
974        """
975        if class_id:
976            DisulfideVisualization.display_torsion_class_df(
977                self.TorsionDF,
978                class_id,
979                display=display,
980                save=save,
981                fname=fname,
982                theme=theme,
983                dpi=dpi,
984                figure_size=figure_size,
985            )
986        else:
987            self.SSList.display_torsion_statistics(
988                display=display,
989                save=save,
990                fname=fname,
991                theme=theme,
992                verbose=verbose,
993                dpi=dpi,
994                figure_size=figure_size,
995            )

Display torsion and distance statistics for all Disulfides in the loader. If a class ID is provided, display statistics for that class only.

Parameters
  • class_id: The class ID to display statistics for. Default is None.
  • display: Whether to display the plot in the notebook. Default is True.
  • save: Whether to save the plot as an image file. Default is False.
  • fname: The name of the image file to save. Default is 'ss_torsions.png'.
  • theme: One of 'Auto', 'Light', or 'Dark'. Default is 'Auto'.
  • verbose: Whether to display verbose output. Default is False.
  • dpi: Dots per inch for the plot. Default is 300.
  • figure_size: Size of the figure as a tuple (width, height). Default is (4, 3).
Returns

None

def classes_vs_cutoff(self, cutoff: float, base: int = 8) -> int:
 997    def classes_vs_cutoff(self, cutoff: float, base: int = 8) -> int:
 998        """
 999        Return number of members for the octant class for a given cutoff value.
1000
1001        :param cutoff: Percent cutoff value for filtering the classes.
1002        :return: None
1003        """
1004
1005        class_df = self.tclass.filter_class_by_percentage(cutoff, base=base)
1006        return class_df.shape[0]

Return number of members for the octant class for a given cutoff value.

Parameters
  • cutoff: Percent cutoff value for filtering the classes.
Returns

The number of classes remaining after the percentage cutoff is applied (int).

def display_torsion_class_df( self, class_id: str, display: bool = True, save: bool = False, fname: str = 'ss_torsions.png', theme: str = 'auto', dpi: int = 300, figure_size: tuple[int, int] = (4, 3)) -> None:
1008    def display_torsion_class_df(
1009        self,
1010        class_id: str,
1011        display: bool = True,
1012        save: bool = False,
1013        fname: str = "ss_torsions.png",
1014        theme: str = "auto",
1015        dpi: int = 300,
1016        figure_size: tuple[int, int] = (4, 3),
1017    ) -> None:
1018        """
1019        Display torsion and distance statistics for a given class ID using the TorsionDF dataframe.
1020
1021        :param class_id: The class ID to display statistics for (e.g. '11111b' for binary or '11111o' for octant)
1022        :param display: Whether to display the plot in the notebook
1023        :param save: Whether to save the plot as an image file
1024        :param fname: The name of the image file to save
1025        :param theme: The theme to use for the plot ('auto', 'light', or 'dark')
1026        :param dpi: DPI (dots per inch) for the saved image, controls the resolution
1027        :param figure_size: Tuple of (width, height) in inches for the figure size
1028        """
1029
1030        DisulfideVisualization.display_torsion_class_df(
1031            self.TorsionDF,
1032            class_id,
1033            display=display,
1034            save=save,
1035            fname=fname,
1036            theme=theme,
1037            dpi=dpi,
1038            figure_size=figure_size,
1039        )

Display torsion and distance statistics for a given class ID using the TorsionDF dataframe.

Parameters
  • class_id: The class ID to display statistics for (e.g. '11111b' for binary or '11111o' for octant)
  • display: Whether to display the plot in the notebook
  • save: Whether to save the plot as an image file
  • fname: The name of the image file to save
  • theme: The theme to use for the plot ('auto', 'light', or 'dark')
  • dpi: DPI (dots per inch) for the saved image, controls the resolution
  • figure_size: Tuple of (width, height) in inches for the figure size
def plot_3d_hexbin_leftright( self, width: int = 800, height: int = 600, gridsize: int = 80, tormin: float = -180.0, tormax: float = 180.0, scaling: str = 'sqrt', column1: str = 'chi2', column2: str = 'chi4', title: Optional[str] = None) -> None:
1041    def plot_3d_hexbin_leftright(
1042        self,
1043        width: int = 800,
1044        height: int = 600,
1045        gridsize: int = 80,
1046        tormin: float = -180.0,
1047        tormax: float = 180.0,
1048        scaling: str = "sqrt",
1049        column1: str = "chi2",
1050        column2: str = "chi4",
1051        title: Optional[str] = None,
1052    ) -> None:
1053        """
1054        Create 3D hexbin plots for left and right-handed chi2-chi4 correlations with customizable z-scaling.
1055
1056        :param loader: Loader object to retrieve torsion data
1057        :type loader: proteusPy.PDB_SS
1058        :param width: Window width in pixels
1059        :type width: int, optional
1060        :default width: 800
1061        :param height: Window height in pixels
1062        :type height: int, optional
1063        :default height: 600
1064        :param gridsize: Number of bins for hexbin
1065        :type gridsize: int, optional
1066        :default gridsize: 30
1067        :param tormin: Minimum torsion angle
1068        :type tormin: float, optional
1069        :default tormin: -180.0
1070        :param tormax: Maximum torsion angle
1071        :type tormax: float, optional
1072        :default tormax: 180.0
1073        :param scaling: Scaling method for z-values ('linear', 'sqrt', 'log', 'power')
1074        :type scaling: str, optional
1075        :default scaling: 'sqrt'
1076        :param column1: Name of the first column (x-axis)
1077        :type column1: str, optional
1078        :default column1: 'chi2'
1079        :param column2: Name of the second column (y-axis)
1080        :type column2: str, optional
1081        :default column2: 'chi4'
1082        :param title: Title of the plot
1083        :type title: str, optional
1084        """
1085
1086        DisulfideVisualization.plot_3d_hexbin_leftright(
1087            self,
1088            width=width,
1089            height=height,
1090            gridsize=gridsize,
1091            tormin=tormin,
1092            tormax=tormax,
1093            scaling=scaling,
1094            column1=column1,
1095            column2=column2,
1096            title=title,
1097        )

Create 3D hexbin plots for left and right-handed chi2-chi4 correlations with customizable z-scaling.

Parameters
  • loader: Not an argument of this method; the loader instance itself (self) is passed on as the object from which torsion data is retrieved.
  • width: Window width in pixels :default width: 800
  • height: Window height in pixels :default height: 600
  • gridsize: Number of bins for hexbin :default gridsize: 80
  • tormin: Minimum torsion angle :default tormin: -180.0
  • tormax: Maximum torsion angle :default tormax: 180.0
  • scaling: Scaling method for z-values ('linear', 'sqrt', 'log', 'power') :default scaling: 'sqrt'
  • column1: Name of the first column (x-axis) :default column1: 'chi2'
  • column2: Name of the second column (y-axis) :default column2: 'chi4'
  • title: Title of the plot
def Load_PDB_SS( loadpath: str = '/Users/egs/repos/proteusPy/proteusPy/data', verbose: bool = False, subset: bool = False, percentile: float = -1.0, force: bool = False) -> DisulfideLoader:
def Load_PDB_SS(
    loadpath: str = DATA_DIR,
    verbose: bool = False,
    subset: bool = False,
    percentile: float = -1.0,
    force: bool = False,
) -> DisulfideLoader:
    """
    Load the fully instantiated Disulfide database from the specified directory.

    If the pickled loader file (subset or full, depending on ``subset``) exists in
    ``loadpath`` and ``force`` is not set, it is unpickled and returned. Otherwise a
    new loader is built with :func:`Bootstrap_PDB_SS`, saved back to ``loadpath``
    and returned.

    :param loadpath: Directory from which to load the database; defaults to DATA_DIR.
    :type loadpath: str
    :param verbose: If True, enables verbose logging; defaults to False.
    :type verbose: bool
    :param subset: If True, loads the subset database; otherwise loads the full database.
    :type subset: bool
    :param percentile: Percentile (0-100) to compute cutoffs dynamically; if set to -1.0,
        the percentile method is not used. Only forwarded when a new loader is bootstrapped.
    :type percentile: float
    :param force: If True, rebuilds the loader via Bootstrap_PDB_SS even if the file exists;
        defaults to False.
    :type force: bool
    :return: An instance of DisulfideLoader containing the loaded disulfide database.
    :rtype: DisulfideLoader
    :raises RuntimeError: If a new loader had to be bootstrapped and bootstrapping failed.

    Example:
        >>> from proteusPy import Load_PDB_SS, create_logger
        >>> import logging
        >>> _logger = create_logger("testing")
        >>> _logger.setLevel(logging.WARNING)
        >>> loader = Load_PDB_SS(verbose=False, subset=True)
        >>> print(loader[0])
        <Disulfide 6dmb_203A_226A, Source: 6dmb, Resolution: 3.0 Å>
    """

    # normally the .pkl files are local, EXCEPT for the first run from a newly-installed proteusPy
    # distribution. In that case we need to download the files for all disulfides and the subset
    # from my Google Drive. This is a one-time operation.

    _fpath = Path(loadpath) / (LOADER_SUBSET_FNAME if subset else LOADER_FNAME)

    if force or not _fpath.exists():
        if verbose:
            _logger.info("Bootstrapping new loader: %s... ", _fpath)

        loader = Bootstrap_PDB_SS(
            loadpath=loadpath,
            verbose=verbose,
            subset=subset,
            force=force,
            percentile=percentile,
        )
        # Bootstrap_PDB_SS returns None on failure; fail with a clear message
        # instead of an AttributeError from calling .save() on None.
        if loader is None:
            raise RuntimeError(
                f"Unable to bootstrap the disulfide loader for {_fpath}"
            )

        loader.save(
            savepath=loadpath,
            verbose=verbose,
        )
        return loader

    if verbose:
        _logger.info("Reading disulfides from: %s...", _fpath)

    # NOTE: unpickling can execute arbitrary code; only load loader files that
    # come from a trusted source.
    with open(_fpath, "rb") as f:
        loader = pickle.load(f)

    if verbose:
        _logger.info("Done reading disulfides from: %s...", _fpath)
        loader.describe()

    return loader

Load the fully instantiated Disulfide database from the specified file. This function will load the pre-built database if available, or bootstrap a new loader by downloading the data from Google Drive if needed. Use the provided parameters to control the loading behavior, filtering cutoffs, and verbosity.

Parameters
  • loadpath: Path from which to load the database; defaults to DATA_DIR.
  • verbose: If True, enables verbose logging; defaults to False.
  • subset: If True, loads the subset database; otherwise loads the full database.
  • cutoff: Cα distance cutoff used to filter disulfides. Note: this is not accepted by the current signature; see percentile.
  • sg_cutoff: Sγ distance cutoff used to filter disulfides. Note: this is not accepted by the current signature; see percentile.
  • force: If True, forces re-loading from Google Drive even if the file exists; defaults to False.
  • percentile: Percentile (0-100) to compute cutoffs dynamically; if set to -1.0, the percentile method is not used.
Returns

An instance of DisulfideLoader containing the loaded disulfide database.

Example:

from proteusPy import Load_PDB_SS, create_logger import logging _logger = create_logger("testing") _logger.setLevel(logging.WARNING) loader = Load_PDB_SS(verbose=False, subset=True) print(loader[0])

def Bootstrap_PDB_SS( loadpath: str = '/Users/egs/repos/proteusPy/proteusPy/data', verbose: bool = True, subset: bool = False, force: bool = False, fake: bool = False, percentile: float = -1.0) -> Optional[DisulfideLoader]:
def Bootstrap_PDB_SS(
    loadpath: str = DATA_DIR,
    verbose: bool = True,
    subset: bool = False,
    force: bool = False,
    fake: bool = False,
    percentile: float = -1.0,
) -> Optional[DisulfideLoader]:
    """
    Download and instantiate the disulfide databases from Google Drive.

    This function downloads the disulfide master SS list from Google Drive if it doesn't
    already exist in the specified load path or if the force flag is set to True.
    It then builds a DisulfideLoader instance that reads its data from ``loadpath``.

    :param loadpath: Directory holding (or receiving) the master SS list, defaults to DATA_DIR
    :type loadpath: str
    :param verbose: Flag to enable verbose logging, defaults to True
    :type verbose: bool
    :param subset: Flag to indicate whether to load a subset of the data, defaults to False
    :type subset: bool
    :param force: Flag to force download even if the file exists, defaults to False
    :type force: bool
    :param fake: If True, skip the actual download and return None whenever a download
        would have been performed, defaults to False
    :type fake: bool
    :param percentile: Percentile value forwarded to DisulfideLoader, defaults to -1.0
    :type percentile: float
    :return: An instance of DisulfideLoader initialized with the loaded data, or None
        if the download was faked or no disulfides were loaded
    :rtype: Optional[DisulfideLoader]
    """

    full_path = Path(loadpath) / SS_PICKLE_FILE
    file_missing = not full_path.exists()

    if file_missing or force:
        if verbose:
            if file_missing:
                _logger.warning("Can't find %s. Downloading from Drive...", full_path)
            else:
                _logger.warning("Forcing download of %s from Drive...", full_path)

        if fake:
            # A faked download never builds a loader, regardless of verbosity.
            if verbose:
                _logger.warning("Fake download: %s", full_path)
            return None

        gdown.download(SS_LIST_URL, str(full_path), quiet=False)

    if verbose:
        _logger.info(
            "Building loader from: %s with percentile cutoff %s...",
            full_path,
            percentile,
        )

    # Read from the same directory the file was checked for / downloaded into.
    loader = DisulfideLoader(
        datadir=loadpath,
        subset=subset,
        verbose=verbose,
        percentile=percentile,
    )

    if loader.TotalDisulfides == 0:
        _logger.error("No disulfides loaded!")
        return None

    if verbose:
        _logger.info("Done building loader.")

    return loader

Download and instantiate the disulfide databases from Google Drive.

This function downloads the disulfide master SS list from Google Drive if it doesn't already exist in the specified load path or if the force flag is set to True. It then loads the disulfide data from the downloaded file and initializes a DisulfideLoader instance.

Parameters
  • loadpath: Path from which to load the data, defaults to DATA_DIR
  • cutoff: Cutoff value for disulfide loading. Note: this is not accepted by the current signature; see percentile.
  • sg_cutoff: Cutoff value for disulfide loading. Note: this is not accepted by the current signature; see percentile.
  • verbose: Flag to enable verbose logging, defaults to True
  • subset: Flag to indicate whether to load a subset of the data, defaults to False
  • force: Flag to force download even if the file exists, defaults to False
Returns

An instance of DisulfideLoader initialized with the loaded data