proteusPy.DisulfideLoader
This module is part of the proteusPy package, a Python package for the analysis and modeling of protein structures, with an emphasis on disulfide bonds. This work is based on the original C/C++ implementation by Eric G. Suchanek.
Author: Eric G. Suchanek, PhD. Last revision: 2/9/2024
"""
This module is part of the proteusPy package, a Python package for
the analysis and modeling of protein structures, with an emphasis on disulfide bonds.
This work is based on the original C/C++ implementation by Eric G. Suchanek. \n

Author: Eric G. Suchanek, PhD
Last revision: 2/9/2024
"""

import copy
import os
import pickle
import sys
import time
import urllib.request

import pandas as pd

import proteusPy
from proteusPy.atoms import *
from proteusPy.data import *
from proteusPy.data import (
    DATA_DIR,
    LOADER_FNAME,
    LOADER_SUBSET_FNAME,
    SS_DICT_PICKLE_FILE,
    SS_PICKLE_FILE,
    SS_TORSIONS_FILE,
)
from proteusPy.Disulfide import Disulfide
from proteusPy.DisulfideClass_Constructor import DisulfideClass_Constructor
from proteusPy.DisulfideExceptions import *
from proteusPy.DisulfideList import DisulfideList
from proteusPy.ProteusGlobals import MODEL_DIR, PDB_DIR, REPO_DATA_DIR

try:
    # Check if running in Jupyter
    shell = get_ipython().__class__.__name__
    if shell == "ZMQInteractiveShell":
        from tqdm.notebook import tqdm
    else:
        from tqdm import tqdm
except NameError:
    # get_ipython() is only defined inside IPython/Jupyter sessions.
    from tqdm import tqdm

# Now use tqdm as normal, depending on your environment


class DisulfideLoader:
    """
    This class represents the disulfide database itself and is its primary means of accession.
    The entirety of the RCSB disulfide database is stored within the class via a
    proteusPy.DisulfideList, a ```Pandas``` .csv file, and a ```dict``` of
    indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
    simple, direct and flexible access to the disulfide structures contained herein.
    This makes it possible to access the disulfides by array index, PDB structure ID or disulfide name.

    The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the
    [display_overlay()](#DisulfideLoader.display_overlay) method. See below for examples.\n

    Important note: For typical usage one will access the database via the `Load_PDB_SS()` function.
    The difference is that `Load_PDB_SS()` reads the fully built loader from its single
    pickle file, whereas instantiating this class directly loads the individual torsion .csv
    and disulfide .pkl files and then builds the class structures.

    *Developer's Notes:*
    The .pkl files needed to instantiate this class and save it into its final .pkl file are
    defined in the proteusPy.data class and should not be changed. Upon initialization the class
    will load them and initialize itself.

    """

    def __init__(
        self,
        verbose: bool = True,
        datadir: str = REPO_DATA_DIR,
        picklefile: str = SS_PICKLE_FILE,
        pickle_dict_file: str = SS_DICT_PICKLE_FILE,
        torsion_file: str = SS_TORSIONS_FILE,
        quiet: bool = True,
        subset: bool = False,
        cutoff: float = -1.0,
    ) -> None:
        """
        Initializing the class initiates loading either the entire Disulfide dataset,
        or the 'subset', which consists of the first 1000 PDB structures. The subset
        is useful for testing and debugging since it doesn't require nearly as much
        memory or time. The name for the subset file is hard-coded. One can pass a
        different data directory and file names for the pickle files. These different
        directories are normally established with the proteusPy.Extract_Disulfides
        function.

        :param verbose: Print progress messages and a summary while loading.
        :param datadir: Directory prefix for all input files. It is concatenated
            directly with the file names, so it must end with a path separator.
        :param picklefile: File name of the pickled disulfide list.
        :param pickle_dict_file: File name of the pickled PDB-ID -> indices dict.
        :param torsion_file: File name of the torsions .csv file.
        :param quiet: Initial value of the `quiet` property.
        :param subset: If True, load the hard-coded subset files instead.
        :param cutoff: Distance cutoff used to build the database.
        """

        self.ModelDir = datadir
        self.PickleFile = f"{datadir}{picklefile}"
        self.PickleDictFile = f"{datadir}{pickle_dict_file}"
        self.PickleClassFile = f"{datadir}{SS_CLASS_DICT_FILE}"
        self.TorsionFile = f"{datadir}{torsion_file}"
        self.SSList = DisulfideList([], "ALL_PDB_SS")
        self.SSDict = {}
        self.TorsionDF = pd.DataFrame()
        self.TotalDisulfides = 0
        self.IDList = []
        self.QUIET = quiet

        self.tclass = None  # DisulfideClass_Constructor to manage classes
        self.cutoff = cutoff  # distance cutoff used to build the database
        self.verbose = verbose
        self.timestamp = time.time()
        self.version = proteusPy.__version__

        if subset:
            # The subset file names override whatever was passed in.
            self.PickleFile = f"{datadir}{SS_SUBSET_PICKLE_FILE}"
            self.PickleDictFile = f"{datadir}{SS_SUBSET_DICT_PICKLE_FILE}"
            self.TorsionFile = f"{datadir}{SS_SUBSET_TORSIONS_FILE}"

        if self.verbose:
            print(
                f"-> DisulfideLoader(): Reading disulfides from: {self.PickleFile}... ",
                end="",
            )

        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(self.PickleFile, "rb") as f:
            self.SSList = pickle.load(f)
        self.TotalDisulfides = len(self.SSList)

        if self.verbose:
            print(
                f"done.",
            )

        if self.verbose:
            print(
                f"-> DisulfideLoader(): Reading disulfide dict from: {self.PickleDictFile}...",
                end="",
            )

        with open(self.PickleDictFile, "rb") as f:
            self.SSDict = pickle.load(f)

        # The dict keys are the PDB IDs present in the database.
        self.IDList = list(self.SSDict)

        if self.verbose:
            print(f"done.")

        if self.verbose:
            print(
                f"-> DisulfideLoader(): Reading Torsion DF from: {self.TorsionFile}...",
                end="",
            )

        tmpDF = pd.read_csv(self.TorsionFile)
        # Drop the first column of the .csv (presumably the saved index -- TODO confirm).
        tmpDF.drop(tmpDF.columns[[0]], axis=1, inplace=True)
        self.TorsionDF = tmpDF.copy()

        if self.verbose:
            print(f" done.")

        self.tclass = DisulfideClass_Constructor(self, self.verbose)

        if self.verbose:
            print(f"-> DisulfideLoader(): Loading complete.")
            self.describe()
        return

    # overload __getitem__ to handle slicing and indexing, and access by name

    def __getitem__(self, item):
        """
        Implements indexing and slicing to retrieve Disulfides from the
        DisulfideLoader. Supports:

        - Integer indexing to retrieve a single Disulfide
        - Slicing to retrieve a subset as a DisulfideList
        - Lookup by PDB ID to retrieve all Disulfides for that structure
        - Lookup by full disulfide name

        Raises DisulfideException on an out-of-range integer index (negative
        indices are rejected) or when the name lookup itself fails.
        """

        if isinstance(item, slice):
            indices = range(*item.indices(len(self.SSList)))
            # The resulting list is labelled with the first entry of the database.
            name = self.SSList[0].pdb_id
            resolution = self.SSList[0].resolution
            sublist = [self.SSList[i] for i in indices]
            return DisulfideList(sublist, name, resolution)

        if isinstance(item, int):
            if item < 0 or item >= self.TotalDisulfides:
                mess = f"DisulfideLoader(): Index {item} out of range 0-{self.TotalDisulfides - 1}"
                raise DisulfideException(mess)
            return self.SSList[item]

        try:
            # PDB_SS['4yys'] return a list of SS
            indices = self.SSDict[item]
        except KeyError:
            # Not a PDB ID: fall back to a full disulfide name.
            try:
                return self.SSList.get_by_name(item)
            except Exception as err:
                mess = f"DisulfideLoader(): Cannot find key {item} in SSBond dict!"
                raise DisulfideException(mess) from err

        res = DisulfideList([], item)
        for ind in indices:
            res.append(self.SSList[ind])
        if len(res) > 0:
            # Guard against an ID that maps to an empty index list.
            res.resolution = res[0].resolution
        return res

    def __setitem__(self, index, item):
        """Replace the entry at `index`; `item` must be a Disulfide."""
        self.SSList[index] = self._validate_ss(item)

    def _validate_ss(self, value):
        """Return `value` if it is a Disulfide, otherwise raise TypeError."""
        if isinstance(value, Disulfide):
            return value
        raise TypeError(f"Disulfide object expected, got {type(value).__name__}")

    @property
    def Average_Resolution(self) -> float:
        """
        Compute and return the average structure resolution for the given list.

        Entries whose resolution is None or -1.0 are skipped.

        :return: Average resolution (A), or 0.0 if no entry has a usable resolution.
        """
        total = 0.0
        count = 0

        for ss in self.SSList:
            _res = ss.resolution
            # Test the entry's own value, not the running total.
            if _res is not None and _res != -1.0:
                total += _res
                count += 1

        return total / count if count else 0.0

    def build_ss_from_idlist(self, idlist):
        """
        Given a list of PDBid, return a DisulfideList of Disulfides

        All disulfides of every listed ID are returned, in `idlist` order.
        IDs that are not in the database are ignored.

        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
        :return: DisulfideList
        """
        res = DisulfideList([], "tmp")

        for pdbid in idlist:
            # SSDict maps a PDB ID to the indices of its disulfides in SSList.
            for ind in self.SSDict.get(pdbid, ()):
                res.append(self.SSList[ind])
        return res

    def copy(self):
        """
        Return a copy of self.

        :return: Deep copy of self
        """
        return copy.deepcopy(self)

    def extract_class(self, clsid) -> DisulfideList:
        """
        Return the list of disulfides corresponding to the input `clsid`.

        :param clsid: The class name to extract.
        :return: The list of disulfide bonds from the class.
        """

        six = self.tclass.sixclass_df
        tot_classes = six.shape[0]
        class_disulfides = DisulfideList([], clsid, quiet=True)
        _pbar = tqdm(six.iterrows(), total=tot_classes, leave=True)
        for idx, row in _pbar:
            _cls = row["class_id"]
            if _cls == clsid:
                ss_list = row["ss_id"]
                pbar = tqdm(ss_list, leave=True)
                for ssid in pbar:
                    class_disulfides.append(self[ssid])
                pbar.set_postfix({"Done": ""})
                # Only the first matching row is used.
                break

            _pbar.set_postfix({"Cnt": idx})

        return class_disulfides

    def getlist(self) -> DisulfideList:
        """
        Return the list of Disulfides contained in the class.

        :return: Deep copy of the internal DisulfideList
        :rtype: DisulfideList
        """
        return copy.deepcopy(self.SSList)

    def get_by_name(self, name) -> Disulfide:
        """
        Returns the Disulfide with the given name from the list.

        :param name: Full disulfide name.
        :return: The stored Disulfide (not a copy), or None if no entry matches.
        """
        for ss in self.SSList.data:
            if ss.name == name:
                return ss  # or ss.copy() !!!
        return None

    def describe(self) -> None:
        """
        Provides information about the Disulfide database contained in `self`.

        Example:<br>

        ```python
        from proteusPy import Load_PDB_SS
        PDB_SS = Load_PDB_SS(verbose=False, subset=False)
        PDB_SS.describe()
         =========== RCSB Disulfide Database Summary ==============
         =========== Built: 2024-02-12 17:48:13 ==============
        PDB IDs present: 35818
        Disulfides loaded: 120494
        Average structure resolution: 2.34 Å
        Lowest Energy Disulfide: 2q7q_75D_140D
        Highest Energy Disulfide: 1toz_456A_467A
        Cα distance cutoff: 8.00 Å
        Total RAM Used: 30.72 MB.
         ================= proteusPy: 0.91 =======================
        ```
        """
        vers = self.version
        tot = self.TotalDisulfides
        pdbs = len(self.SSDict)
        # bytes / 1024**2 is megabytes. sys.getsizeof is shallow, so this is a
        # lower bound rather than the true footprint.
        ram = (
            sys.getsizeof(self.SSList)
            + sys.getsizeof(self.SSDict)
            + sys.getsizeof(self.TorsionDF)
        ) / (1024 * 1024)
        res = self.Average_Resolution
        cutoff = self.cutoff
        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
        ssMin, ssMax = self.SSList.minmax_energy

        print(f" =========== RCSB Disulfide Database Summary ==============")
        print(f" =========== Built: {timestr} ==============")
        print(f"PDB IDs present: {pdbs}")
        print(f"Disulfides loaded: {tot}")
        print(f"Average structure resolution: {res:.2f} Å")
        print(f"Lowest Energy Disulfide: {ssMin.name}")
        print(f"Highest Energy Disulfide: {ssMax.name}")
        print(f"Cα distance cutoff: {cutoff:.2f} Å")
        print(f"Total RAM Used: {ram:.2f} MB.")
        print(f" ================= proteusPy: {vers} =======================")

    def display_overlay(self, pdbid) -> None:
        """
        Display all disulfides for a given PDB ID overlaid in stick mode against
        a common coordinate frame. This allows us to see all of the disulfides
        at one time in a single view. Colors vary smoothly between bonds.

        :param self: DisulfideLoader object initialized with the database.
        :param pdbid: the PDB id string, e.g. 4yys
        :return: None

        Example:
        >>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList

        Instantiate the Loader with the SS database subset.

        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)

        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
        a common reference (the proximal disulfides).

        >>> PDB_SS.display_overlay('4yys')

        You can also slice the loader and display as an overlay.
        >>> PDB_SS[:8].display_overlay()

        """

        ssbonds = self[pdbid]
        ssbonds.display_overlay()
        return

    def getTorsions(self, pdbID=None) -> pd.DataFrame:
        """
        Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols

        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
            then return the entire dataset.
        :raises DisulfideParseWarning: Raised if not found
        :return: Torsions Dataframe (a copy)
        :rtype: pd.DataFrame

        Example:
        >>> from proteusPy import Load_PDB_SS
        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
        >>> Tor_DF = PDB_SS.getTorsions()
        """
        if not pdbID:
            return copy.deepcopy(self.TorsionDF)

        try:
            # The dict lookup validates the ID; a missing "source" column also
            # surfaces as KeyError, as in the original.
            self.SSDict[pdbID]
            sel = self.TorsionDF["source"] == pdbID
            return self.TorsionDF[sel].copy()
        except KeyError as err:
            mess = f"! Cannot find key {pdbID} in SSBond DB"
            raise DisulfideParseWarning(mess) from err

    def list_binary_classes(self):
        """
        Print the index and key of every entry in the binary class dictionary.
        """
        # NOTE(review): the original read self.classdict, which __init__ never
        # sets. The class data lives on self.tclass (classdf and sixclass_df are
        # read from it elsewhere); confirm that it exposes `classdict`.
        for k, v in enumerate(self.tclass.classdict):
            print(f"Class: |{k}|, |{v}|")

    @property
    def quiet(self) -> bool:
        """
        The loader quiet state

        :return: quiet parameter
        :rtype: bool
        """
        return self.QUIET

    @quiet.setter
    def quiet(self, perm: bool) -> None:
        """
        Sets the quiet attribute for the loader. This silences many of the BIO.PDB warnings.

        :param perm: True or False
        :type perm: bool
        """
        self.QUIET = perm

    def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
        """
        Plot the total percentage and number of members for each class against the cutoff value.

        :param cutoff: Percent cutoff value for filtering the classes.
        :param steps: Number of evenly spaced cutoff values between 0 and `cutoff`.
        :return: None
        """

        import matplotlib.pyplot as plt
        import numpy as np

        _cutoff = np.linspace(0, cutoff, steps)
        tot_list = []
        members_list = []

        for c in _cutoff:
            class_df = self.tclass.filter_sixclass_by_percentage(c)
            tot = class_df["percentage"].sum()
            tot_list.append(tot)
            members_list.append(class_df.shape[0])
            print(
                f"Cutoff: {c:5.3} accounts for {tot:7.2f}% and is {class_df.shape[0]:5} members long."
            )

        _, ax1 = plt.subplots()

        # Second y-axis sharing the same x-axis for the member counts.
        ax2 = ax1.twinx()
        ax1.plot(_cutoff, tot_list, label="Total percentage", color="blue")
        ax2.plot(_cutoff, members_list, label="Number of members", color="red")

        ax1.set_xlabel("Cutoff")
        ax1.set_ylabel("Total percentage", color="blue")
        ax2.set_ylabel("Number of members", color="red")

        plt.show()

    def plot_binary_to_sixclass_incidence(
        self, light=True, save=False, savedir="."
    ) -> None:
        """
        Plot the incidence of all sextant Disulfide classes for a given binary class.

        :param light: Use the light plot theme if True, otherwise the dark one.
        :param save: Passed through to the plotting function.
        :param savedir: Passed through to the plotting function.
        """

        from proteusPy.DisulfideClasses import plot_count_vs_class_df

        # The original hard-coded theme="light" and ignored `light`.
        # NOTE(review): assumes plot_count_vs_class_df accepts theme="dark" -- confirm.
        theme = "light" if light else "dark"

        clslist = self.tclass.classdf["class_id"]
        for cls in clslist:
            sixcls = self.tclass.binary_to_six_class(cls)
            df = self.enumerate_sixclass_fromlist(sixcls)
            plot_count_vs_class_df(df, cls, theme=theme, save=save, savedir=savedir)
        return

    def enumerate_sixclass_fromlist(self, sslist) -> pd.DataFrame:
        """
        Count the disulfides in each of the given sextant classes.

        :param sslist: Iterable of sextant class IDs; None entries are skipped.
        :return: DataFrame with columns `class_id` and `count`.
        """
        x = []
        y = []

        for sixcls in sslist:
            if sixcls is not None:
                _y = self.tclass.sslist_from_classid(sixcls)
                # it's possible to have 0 SS in a class
                if _y is not None:
                    # only append if we have both.
                    x.append(sixcls)
                    y.append(len(_y))

        sslist_df = pd.DataFrame(columns=["class_id", "count"])
        sslist_df["class_id"] = x
        sslist_df["count"] = y
        return sslist_df

    def save(self, savepath=DATA_DIR, subset=False, cutoff=-1.0):
        """
        Save a copy of the fully instantiated Loader to the specified file.

        The file name is LOADER_SUBSET_FNAME if `subset` is True, otherwise
        LOADER_FNAME. Note that `self.cutoff` is overwritten with `cutoff`.

        :param savepath: Path to save the file, defaults to DATA_DIR. It is
            concatenated directly with the file name, so it must end with a
            path separator.
        :param subset: If True, save under the subset file name.
        :param cutoff: Distance cutoff used to build the database, -1 means no cutoff.
        """
        self.version = proteusPy.__version__
        self.cutoff = cutoff

        fname = LOADER_SUBSET_FNAME if subset else LOADER_FNAME
        _fname = f"{savepath}{fname}"

        if self.verbose:
            print(f"-> DisulfideLoader.save(): Writing {_fname}... ")

        with open(_fname, "wb+") as f:
            pickle.dump(self, f)

        if self.verbose:
            print(f"-> DisulfideLoader.save(): Done.")


# class ends


def Download_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False):
    """
    Download the databases from my Google Drive.

    The full database is always downloaded; the subset database is downloaded
    as well when `subset` is True.

    :param loadpath: Path from which to load, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to False
    :param subset: If True, also download the subset database
    """

    import gdown

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"

    if verbose:
        print(f"--> DisulfideLoader: Downloading Disulfide Database from Drive...")

    gdown.download(LOADER_ALL_URL, _fname_all, quiet=False)

    if subset:
        if verbose:
            print(
                f"--> DisulfideLoader: Downloading Disulfide Subset Database from Drive..."
            )

        gdown.download(LOADER_SUBSET_URL, _fname_sub, quiet=False)

    return


def _content_length(headers) -> int:
    """Return the Content-Length header as an int, or -1 if missing or malformed."""
    try:
        return int(headers.get("content-length"))
    except (TypeError, ValueError):
        return -1


def Download_PDB_SS_GitHub(loadpath=DATA_DIR, verbose=True, subset=False):
    """
    Download the databases from Github. Note: if you change the database these sizes will
    need to be changed!

    :param loadpath: Path from which to load, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to True
    :param subset: If True, also download the subset database
    :return: Number of downloads whose reported size matched the expected size (0-2)
    """

    _good1 = 0  # all data
    _good2 = 0  # subset data

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"

    _all_length = 340371775
    _subset_length = 9636086

    if verbose:
        print(f"--> DisulfideLoader: Downloading Disulfide Database from GitHub...")

    _, headers = urllib.request.urlretrieve(
        "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_ALL_LOADER.pkl",
        _fname_all,
    )
    # Header values are strings, so convert before comparing with the expected size.
    num_bytes = _content_length(headers)
    if num_bytes == _all_length:
        _good1 = 1
    else:
        print(f"--> Read: {num_bytes}, expecting: {_all_length}")

    if subset:
        if verbose:
            print(
                f"--> DisulfideLoader: Downloading Disulfide Subset Database from GitHub..."
            )

        _, headers = urllib.request.urlretrieve(
            "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_SUBSET_LOADER.pkl",
            _fname_sub,
        )
        num_bytes = _content_length(headers)
        if num_bytes == _subset_length:
            _good2 = 1
        else:
            print(f"--> Read: {num_bytes}, expecting: {_subset_length}")
    return _good1 + _good2


def Load_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False) -> DisulfideLoader:
    """
    Load the fully instantiated Disulfide database from the specified file. Use the
    defaults unless you are building the database by hand. *This is the function
    used to load the built database.*

    :param loadpath: Path from which to load, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to False
    :param subset: If True, load the subset DB, otherwise load the full database
    :return: The unpickled DisulfideLoader
    """
    # normally the .pkl files are local, EXCEPT for the first run from a newly-installed proteusPy
    # distribution. In that case we need to download the files for all disulfides and the subset.

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"

    _fname = _fname_sub if subset else _fname_all

    if not os.path.exists(_fname_sub):
        Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=True)

    if not os.path.exists(_fname_all):
        Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=False)

    # first attempt to read the local copy of the loader
    if verbose:
        print(f"-> load_PDB_SS(): Reading {_fname}... ")

    # NOTE: pickle.load executes arbitrary code; the file comes from the
    # project's own download location and must be trusted.
    with open(_fname, "rb") as f:
        res = pickle.load(f)

    if verbose:
        print(f"-> load_PDB_SS(): Done reading {_fname}... ")
    return res


if __name__ == "__main__":
    import doctest

    doctest.testmod()

# End of file
class DisulfideLoader:
    """
    This class represents the disulfide database itself and is its primary means of accession.
    The entirety of the RCSB disulfide database is stored within the class via a
    proteusPy.DisulfideList, a ```Pandas``` .csv file, and a ```dict``` of
    indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
    simple, direct and flexible access to the disulfide structures contained herein.
    This makes it possible to access the disulfides by array index, PDB structure ID or disulfide name.

    The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the
    [display_overlay()](#DisulfideLoader.display_overlay) method. See below for examples.\n

    Important note: For typical usage one will access the database via the `Load_PDB_SS()` function.
    The difference is that `Load_PDB_SS()` reads the fully built loader from its single
    pickle file, whereas instantiating this class directly loads the individual torsion .csv
    and disulfide .pkl files and then builds the class structures.

    *Developer's Notes:*
    The .pkl files needed to instantiate this class and save it into its final .pkl file are
    defined in the proteusPy.data class and should not be changed. Upon initialization the class
    will load them and initialize itself.

    """

    def __init__(
        self,
        verbose: bool = True,
        datadir: str = REPO_DATA_DIR,
        picklefile: str = SS_PICKLE_FILE,
        pickle_dict_file: str = SS_DICT_PICKLE_FILE,
        torsion_file: str = SS_TORSIONS_FILE,
        quiet: bool = True,
        subset: bool = False,
        cutoff: float = -1.0,
    ) -> None:
        """
        Initializing the class initiates loading either the entire Disulfide dataset,
        or the 'subset', which consists of the first 1000 PDB structures. The subset
        is useful for testing and debugging since it doesn't require nearly as much
        memory or time. The name for the subset file is hard-coded. One can pass a
        different data directory and file names for the pickle files. These different
        directories are normally established with the proteusPy.Extract_Disulfides
        function.

        :param verbose: Print progress messages and a summary while loading.
        :param datadir: Directory prefix for all input files. It is concatenated
            directly with the file names, so it must end with a path separator.
        :param picklefile: File name of the pickled disulfide list.
        :param pickle_dict_file: File name of the pickled PDB-ID -> indices dict.
        :param torsion_file: File name of the torsions .csv file.
        :param quiet: Initial value of the `quiet` property.
        :param subset: If True, load the hard-coded subset files instead.
        :param cutoff: Distance cutoff used to build the database.
        """

        self.ModelDir = datadir
        self.PickleFile = f"{datadir}{picklefile}"
        self.PickleDictFile = f"{datadir}{pickle_dict_file}"
        self.PickleClassFile = f"{datadir}{SS_CLASS_DICT_FILE}"
        self.TorsionFile = f"{datadir}{torsion_file}"
        self.SSList = DisulfideList([], "ALL_PDB_SS")
        self.SSDict = {}
        self.TorsionDF = pd.DataFrame()
        self.TotalDisulfides = 0
        self.IDList = []
        self.QUIET = quiet

        self.tclass = None  # DisulfideClass_Constructor to manage classes
        self.cutoff = cutoff  # distance cutoff used to build the database
        self.verbose = verbose
        self.timestamp = time.time()
        self.version = proteusPy.__version__

        if subset:
            # The subset file names override whatever was passed in.
            self.PickleFile = f"{datadir}{SS_SUBSET_PICKLE_FILE}"
            self.PickleDictFile = f"{datadir}{SS_SUBSET_DICT_PICKLE_FILE}"
            self.TorsionFile = f"{datadir}{SS_SUBSET_TORSIONS_FILE}"

        if self.verbose:
            print(
                f"-> DisulfideLoader(): Reading disulfides from: {self.PickleFile}... ",
                end="",
            )

        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(self.PickleFile, "rb") as f:
            self.SSList = pickle.load(f)
        self.TotalDisulfides = len(self.SSList)

        if self.verbose:
            print(
                f"done.",
            )

        if self.verbose:
            print(
                f"-> DisulfideLoader(): Reading disulfide dict from: {self.PickleDictFile}...",
                end="",
            )

        with open(self.PickleDictFile, "rb") as f:
            self.SSDict = pickle.load(f)

        # The dict keys are the PDB IDs present in the database.
        self.IDList = list(self.SSDict)

        if self.verbose:
            print(f"done.")

        if self.verbose:
            print(
                f"-> DisulfideLoader(): Reading Torsion DF from: {self.TorsionFile}...",
                end="",
            )

        tmpDF = pd.read_csv(self.TorsionFile)
        # Drop the first column of the .csv (presumably the saved index -- TODO confirm).
        tmpDF.drop(tmpDF.columns[[0]], axis=1, inplace=True)
        self.TorsionDF = tmpDF.copy()

        if self.verbose:
            print(f" done.")

        self.tclass = DisulfideClass_Constructor(self, self.verbose)

        if self.verbose:
            print(f"-> DisulfideLoader(): Loading complete.")
            self.describe()
        return

    # overload __getitem__ to handle slicing and indexing, and access by name

    def __getitem__(self, item):
        """
        Implements indexing and slicing to retrieve Disulfides from the
        DisulfideLoader. Supports:

        - Integer indexing to retrieve a single Disulfide
        - Slicing to retrieve a subset as a DisulfideList
        - Lookup by PDB ID to retrieve all Disulfides for that structure
        - Lookup by full disulfide name

        Raises DisulfideException on an out-of-range integer index (negative
        indices are rejected) or when the name lookup itself fails.
        """

        if isinstance(item, slice):
            indices = range(*item.indices(len(self.SSList)))
            # The resulting list is labelled with the first entry of the database.
            name = self.SSList[0].pdb_id
            resolution = self.SSList[0].resolution
            sublist = [self.SSList[i] for i in indices]
            return DisulfideList(sublist, name, resolution)

        if isinstance(item, int):
            if item < 0 or item >= self.TotalDisulfides:
                mess = f"DisulfideLoader(): Index {item} out of range 0-{self.TotalDisulfides - 1}"
                raise DisulfideException(mess)
            return self.SSList[item]

        try:
            # PDB_SS['4yys'] return a list of SS
            indices = self.SSDict[item]
        except KeyError:
            # Not a PDB ID: fall back to a full disulfide name.
            try:
                return self.SSList.get_by_name(item)
            except Exception as err:
                mess = f"DisulfideLoader(): Cannot find key {item} in SSBond dict!"
                raise DisulfideException(mess) from err

        res = DisulfideList([], item)
        for ind in indices:
            res.append(self.SSList[ind])
        if len(res) > 0:
            # Guard against an ID that maps to an empty index list.
            res.resolution = res[0].resolution
        return res

    def __setitem__(self, index, item):
        """Replace the entry at `index`; `item` must be a Disulfide."""
        self.SSList[index] = self._validate_ss(item)

    def _validate_ss(self, value):
        """Return `value` if it is a Disulfide, otherwise raise TypeError."""
        if isinstance(value, Disulfide):
            return value
        raise TypeError(f"Disulfide object expected, got {type(value).__name__}")

    @property
    def Average_Resolution(self) -> float:
        """
        Compute and return the average structure resolution for the given list.

        Entries whose resolution is None or -1.0 are skipped.

        :return: Average resolution (A), or 0.0 if no entry has a usable resolution.
        """
        total = 0.0
        count = 0

        for ss in self.SSList:
            _res = ss.resolution
            # Test the entry's own value, not the running total.
            if _res is not None and _res != -1.0:
                total += _res
                count += 1

        return total / count if count else 0.0

    def build_ss_from_idlist(self, idlist):
        """
        Given a list of PDBid, return a DisulfideList of Disulfides

        All disulfides of every listed ID are returned, in `idlist` order.
        IDs that are not in the database are ignored.

        :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
        :return: DisulfideList
        """
        res = DisulfideList([], "tmp")

        for pdbid in idlist:
            # SSDict maps a PDB ID to the indices of its disulfides in SSList.
            for ind in self.SSDict.get(pdbid, ()):
                res.append(self.SSList[ind])
        return res

    def copy(self):
        """
        Return a copy of self.

        :return: Deep copy of self
        """
        return copy.deepcopy(self)

    def extract_class(self, clsid) -> DisulfideList:
        """
        Return the list of disulfides corresponding to the input `clsid`.

        :param clsid: The class name to extract.
        :return: The list of disulfide bonds from the class.
        """

        six = self.tclass.sixclass_df
        tot_classes = six.shape[0]
        class_disulfides = DisulfideList([], clsid, quiet=True)
        _pbar = tqdm(six.iterrows(), total=tot_classes, leave=True)
        for idx, row in _pbar:
            _cls = row["class_id"]
            if _cls == clsid:
                ss_list = row["ss_id"]
                pbar = tqdm(ss_list, leave=True)
                for ssid in pbar:
                    class_disulfides.append(self[ssid])
                pbar.set_postfix({"Done": ""})
                # Only the first matching row is used.
                break

            _pbar.set_postfix({"Cnt": idx})

        return class_disulfides

    def getlist(self) -> DisulfideList:
        """
        Return the list of Disulfides contained in the class.

        :return: Deep copy of the internal DisulfideList
        :rtype: DisulfideList
        """
        return copy.deepcopy(self.SSList)

    def get_by_name(self, name) -> Disulfide:
        """
        Returns the Disulfide with the given name from the list.

        :param name: Full disulfide name.
        :return: The stored Disulfide (not a copy), or None if no entry matches.
        """
        for ss in self.SSList.data:
            if ss.name == name:
                return ss  # or ss.copy() !!!
        return None

    def describe(self) -> None:
        """
        Provides information about the Disulfide database contained in `self`.

        Example:<br>

        ```python
        from proteusPy import Load_PDB_SS
        PDB_SS = Load_PDB_SS(verbose=False, subset=False)
        PDB_SS.describe()
         =========== RCSB Disulfide Database Summary ==============
         =========== Built: 2024-02-12 17:48:13 ==============
        PDB IDs present: 35818
        Disulfides loaded: 120494
        Average structure resolution: 2.34 Å
        Lowest Energy Disulfide: 2q7q_75D_140D
        Highest Energy Disulfide: 1toz_456A_467A
        Cα distance cutoff: 8.00 Å
        Total RAM Used: 30.72 MB.
         ================= proteusPy: 0.91 =======================
        ```
        """
        vers = self.version
        tot = self.TotalDisulfides
        pdbs = len(self.SSDict)
        # bytes / 1024**2 is megabytes. sys.getsizeof is shallow, so this is a
        # lower bound rather than the true footprint.
        ram = (
            sys.getsizeof(self.SSList)
            + sys.getsizeof(self.SSDict)
            + sys.getsizeof(self.TorsionDF)
        ) / (1024 * 1024)
        res = self.Average_Resolution
        cutoff = self.cutoff
        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
        ssMin, ssMax = self.SSList.minmax_energy

        print(f" =========== RCSB Disulfide Database Summary ==============")
        print(f" =========== Built: {timestr} ==============")
        print(f"PDB IDs present: {pdbs}")
        print(f"Disulfides loaded: {tot}")
        print(f"Average structure resolution: {res:.2f} Å")
        print(f"Lowest Energy Disulfide: {ssMin.name}")
        print(f"Highest Energy Disulfide: {ssMax.name}")
        print(f"Cα distance cutoff: {cutoff:.2f} Å")
        print(f"Total RAM Used: {ram:.2f} MB.")
        print(f" ================= proteusPy: {vers} =======================")

    def display_overlay(self, pdbid) -> None:
        """
        Display all disulfides for a given PDB ID overlaid in stick mode against
        a common coordinate frame. This allows us to see all of the disulfides
        at one time in a single view. Colors vary smoothly between bonds.

        :param self: DisulfideLoader object initialized with the database.
        :param pdbid: the PDB id string, e.g. 4yys
        :return: None

        Example:
        >>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList

        Instantiate the Loader with the SS database subset.

        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)

        Display the Disulfides from the PDB ID ```4yys```, overlaid onto
        a common reference (the proximal disulfides).

        >>> PDB_SS.display_overlay('4yys')

        You can also slice the loader and display as an overlay.
        >>> PDB_SS[:8].display_overlay()

        """

        ssbonds = self[pdbid]
        ssbonds.display_overlay()
        return

    def getTorsions(self, pdbID=None) -> pd.DataFrame:
        """
        Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols

        :param pdbID: pdbID, defaults to None, meaning return entire dataset.
        :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
            then return the entire dataset.
        :raises DisulfideParseWarning: Raised if not found
        :return: Torsions Dataframe (a copy)
        :rtype: pd.DataFrame

        Example:
        >>> from proteusPy import Load_PDB_SS
        >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
        >>> Tor_DF = PDB_SS.getTorsions()
        """
        if not pdbID:
            return copy.deepcopy(self.TorsionDF)

        try:
            # The dict lookup validates the ID; a missing "source" column also
            # surfaces as KeyError, as in the original.
            self.SSDict[pdbID]
            sel = self.TorsionDF["source"] == pdbID
            return self.TorsionDF[sel].copy()
        except KeyError as err:
            mess = f"! Cannot find key {pdbID} in SSBond DB"
            raise DisulfideParseWarning(mess) from err

    def list_binary_classes(self):
        """
        Print the index and key of every entry in the binary class dictionary.
        """
        # NOTE(review): the original read self.classdict, which __init__ never
        # sets. The class data lives on self.tclass (classdf and sixclass_df are
        # read from it elsewhere); confirm that it exposes `classdict`.
        for k, v in enumerate(self.tclass.classdict):
            print(f"Class: |{k}|, |{v}|")

    @property
    def quiet(self) -> bool:
        """
        The loader quiet state

        :return: quiet parameter
        :rtype: bool
        """
        return self.QUIET

    @quiet.setter
    def quiet(self, perm: bool) -> None:
        """
        Sets the quiet attribute for the loader. This silences many of the BIO.PDB warnings.

        :param perm: True or False
        :type perm: bool
        """
        self.QUIET = perm

    def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
        """
        Plot the total percentage and number of members for each class against the cutoff value.

        :param cutoff: Percent cutoff value for filtering the classes.
        :param steps: Number of evenly spaced cutoff values between 0 and `cutoff`.
        :return: None
        """

        import matplotlib.pyplot as plt
        import numpy as np

        _cutoff = np.linspace(0, cutoff, steps)
        tot_list = []
        members_list = []

        for c in _cutoff:
            class_df = self.tclass.filter_sixclass_by_percentage(c)
            tot = class_df["percentage"].sum()
            tot_list.append(tot)
            members_list.append(class_df.shape[0])
            print(
                f"Cutoff: {c:5.3} accounts for {tot:7.2f}% and is {class_df.shape[0]:5} members long."
            )

        _, ax1 = plt.subplots()

        # Second y-axis sharing the same x-axis for the member counts.
        ax2 = ax1.twinx()
        ax1.plot(_cutoff, tot_list, label="Total percentage", color="blue")
        ax2.plot(_cutoff, members_list, label="Number of members", color="red")

        ax1.set_xlabel("Cutoff")
        ax1.set_ylabel("Total percentage", color="blue")
        ax2.set_ylabel("Number of members", color="red")

        plt.show()

    def plot_binary_to_sixclass_incidence(
        self, light=True, save=False, savedir="."
    ) -> None:
        """
        Plot the incidence of all sextant Disulfide classes for a given binary class.

        :param light: Use the light plot theme if True, otherwise the dark one.
        :param save: Passed through to the plotting function.
        :param savedir: Passed through to the plotting function.
        """

        from proteusPy.DisulfideClasses import plot_count_vs_class_df

        # The original hard-coded theme="light" and ignored `light`.
        # NOTE(review): assumes plot_count_vs_class_df accepts theme="dark" -- confirm.
        theme = "light" if light else "dark"

        clslist = self.tclass.classdf["class_id"]
        for cls in clslist:
            sixcls = self.tclass.binary_to_six_class(cls)
            df = self.enumerate_sixclass_fromlist(sixcls)
            plot_count_vs_class_df(df, cls, theme=theme, save=save, savedir=savedir)
        return

    def enumerate_sixclass_fromlist(self, sslist) -> pd.DataFrame:
        """
        Count the disulfides in each of the given sextant classes.

        :param sslist: Iterable of sextant class IDs; None entries are skipped.
        :return: DataFrame with columns `class_id` and `count`.
        """
        x = []
        y = []

        for sixcls in sslist:
            if sixcls is not None:
                _y = self.tclass.sslist_from_classid(sixcls)
                # it's possible to have 0 SS in a class
                if _y is not None:
                    # only append if we have both.
                    x.append(sixcls)
                    y.append(len(_y))

        sslist_df = pd.DataFrame(columns=["class_id", "count"])
        sslist_df["class_id"] = x
        sslist_df["count"] = y
        return sslist_df

    def save(self, savepath=DATA_DIR, subset=False, cutoff=-1.0):
        """
        Save a copy of the fully instantiated Loader to the specified file.

        The file name is LOADER_SUBSET_FNAME if `subset` is True, otherwise
        LOADER_FNAME. Note that `self.cutoff` is overwritten with `cutoff`.

        :param savepath: Path to save the file, defaults to DATA_DIR. It is
            concatenated directly with the file name, so it must end with a
            path separator.
        :param subset: If True, save under the subset file name.
        :param cutoff: Distance cutoff used to build the database, -1 means no cutoff.
        """
        self.version = proteusPy.__version__
        self.cutoff = cutoff

        fname = LOADER_SUBSET_FNAME if subset else LOADER_FNAME
        _fname = f"{savepath}{fname}"

        if self.verbose:
            print(f"-> DisulfideLoader.save(): Writing {_fname}... ")

        with open(_fname, "wb+") as f:
            pickle.dump(self, f)

        if self.verbose:
            print(f"-> DisulfideLoader.save(): Done.")
This class represents the disulfide database itself and is its primary means of accession.
The entirety of the RCSB disulfide database is stored within the class via a
proteusPy.DisulfideList, a Pandas
.csv file, and a dict
of
indices mapping the PDB IDs into their respective list of disulfides. The datastructures allow
simple, direct and flexible access to the disulfide structures contained herein.
This makes it possible to access the disulfides by array index, PDB structure ID or disulfide name.
The class can also render Disulfides overlaid on a common coordinate system to a pyVista window using the display_overlay() method. See below for examples.
Important note: For typical usage one will access the database via the Load_PDB_SS()
function.
The difference is that the latter function loads the compressed database from its single
source. Instantiating this class directly, by contrast,
loads the individual torsions file and disulfide .pkl files,
and builds the classlist structures.
Developer's Notes: The .pkl files needed to instantiate this class and save it into its final .pkl file are defined in the proteusPy.data class and should not be changed. Upon initialization the class will load them and initialize itself.
def __init__(
    self,
    verbose: bool = True,
    datadir: str = REPO_DATA_DIR,
    picklefile: str = SS_PICKLE_FILE,
    pickle_dict_file: str = SS_DICT_PICKLE_FILE,
    torsion_file: str = SS_TORSIONS_FILE,
    quiet: bool = True,
    subset: bool = False,
    cutoff: float = -1.0,
) -> None:
    """
    Initializing the class initiates loading either the entire Disulfide dataset,
    or the 'subset', which consists of the first 1000 PDB structures. The subset
    is useful for testing and debugging since it doesn't require nearly as much
    memory or time. The name for the subset file is hard-coded. One can pass a
    different data directory and file names for the pickle files. These different
    directories are normally established with the proteusPy.Extract_Disulfides
    function.

    :param verbose: Print progress messages and a final summary, defaults to True
    :param datadir: Directory prefix prepended verbatim to every file name
    :param picklefile: File holding the pickled disulfide list
    :param pickle_dict_file: File holding the pickled PDB-ID dictionary
    :param torsion_file: CSV file holding the torsion table
    :param quiet: Stored as ``self.QUIET``, defaults to True
    :param subset: If True, read the hard-coded subset files instead of the
        file names passed in, defaults to False
    :param cutoff: Distance cutoff recorded on the loader, defaults to -1.0
    """

    self.ModelDir = datadir
    self.PickleFile = f"{datadir}{picklefile}"
    self.PickleDictFile = f"{datadir}{pickle_dict_file}"
    self.PickleClassFile = f"{datadir}{SS_CLASS_DICT_FILE}"
    self.TorsionFile = f"{datadir}{torsion_file}"
    self.SSList = DisulfideList([], "ALL_PDB_SS")
    self.SSDict = {}
    self.TorsionDF = pd.DataFrame()
    self.TotalDisulfides = 0
    self.IDList = []
    self.QUIET = quiet

    self.tclass = None  # disulfideClass_constructor to manage classes
    self.cutoff = cutoff  # distance cutoff used to build the database
    self.verbose = verbose
    self.timestamp = time.time()
    self.version = proteusPy.__version__

    # The subset file names are fixed and override whatever was passed in.
    if subset:
        self.PickleFile = f"{datadir}{SS_SUBSET_PICKLE_FILE}"
        self.PickleDictFile = f"{datadir}{SS_SUBSET_DICT_PICKLE_FILE}"
        self.TorsionFile = f"{datadir}{SS_SUBSET_TORSIONS_FILE}"

    # NOTE: pickle.load() can execute arbitrary code; only point the loader
    # at database files that come from a trusted build.
    if self.verbose:
        print(
            f"-> DisulfideLoader(): Reading disulfides from: {self.PickleFile}... ",
            end="",
        )

    with open(self.PickleFile, "rb") as f:
        self.SSList = pickle.load(f)
    self.TotalDisulfides = len(self.SSList)

    if self.verbose:
        print("done.")
        print(
            f"-> DisulfideLoader(): Reading disulfide dict from: {self.PickleDictFile}...",
            end="",
        )

    with open(self.PickleDictFile, "rb") as f:
        self.SSDict = pickle.load(f)
    # The keys of the dictionary are the IDs present in the database.
    self.IDList = list(self.SSDict)

    if self.verbose:
        print("done.")
        print(
            f"-> DisulfideLoader(): Reading Torsion DF from: {self.TorsionFile}...",
            end="",
        )

    # The first CSV column is discarded (presumably the index column written
    # when the file was saved -- confirm against the file's producer).
    tmpDF = pd.read_csv(self.TorsionFile)
    self.TorsionDF = tmpDF.drop(tmpDF.columns[[0]], axis=1)

    if self.verbose:
        print(" done.")

    self.tclass = DisulfideClass_Constructor(self, self.verbose)

    if self.verbose:
        print("-> DisulfideLoader(): Loading complete.")
        self.describe()
    return
Initializing the class initiates loading either the entire Disulfide dataset, or the 'subset', which consists of the first 1000 PDB structures. The subset is useful for testing and debugging since it doesn't require nearly as much memory or time. The name for the subset file is hard-coded. One can pass a different data directory and file names for the pickle files. These different directories are normally established with the proteusPy.Extract_Disulfides function.
@property
def Average_Resolution(self) -> float:
    """
    Compute and return the average structure resolution over ``self.SSList``.

    Entries whose ``resolution`` is ``None`` or ``-1.0`` are left out of the
    average (``-1.0`` is presumably the "resolution unknown" marker -- confirm
    against the code that populates ``resolution``).

    :return: Average resolution (A); 0.0 when no entry has a usable resolution.
    """
    total = 0.0
    used = 0

    for ss in self.SSList:
        value = ss.resolution
        # Test the entry itself, not the running total, and count only
        # the entries that are actually summed.
        if value is not None and value != -1.0:
            total += value
            used += 1

    if used == 0:
        return 0.0
    return total / used
Compute and return the average structure resolution for the given list.
Returns
Average resolution (A)
def build_ss_from_idlist(self, idlist):
    """
    Given a list of PDB IDs, return a DisulfideList built from ``self.SSList``.

    For each requested ID the list is scanned in order and only the FIRST
    disulfide whose ``pdb_id`` matches is collected; IDs with no match
    contribute nothing.

    :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
    :return: DisulfideList
    """
    # NOTE(review): only one disulfide per ID is returned; confirm whether
    # callers expect every disulfide of each structure instead.
    picked = DisulfideList([], "tmp")

    for wanted in idlist:
        first_hit = next(
            (ss for ss in self.SSList if ss.pdb_id == wanted),
            None,
        )
        if first_hit is not None:
            picked.append(first_hit)

    return picked
Given a list of PDBid, return a DisulfideList of Disulfides
Parameters
- idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
Returns
DisulfideList
def copy(self):
    """
    Return an independent deep copy of this loader.

    :return: Copy of self
    """
    duplicate = copy.deepcopy(self)
    return duplicate
Return a copy of self.
Returns
Copy of self
def extract_class(self, clsid) -> DisulfideList:
    """
    Return the list of disulfides corresponding to the input `clsid`.

    The rows of ``self.tclass.sixclass_df`` are scanned until one whose
    ``class_id`` equals `clsid` is found; each entry of that row's ``ss_id``
    is then looked up through ``self[...]`` and appended to the result.
    Progress bars are shown for both the scan and the lookups.

    :param clsid: The class name to extract.
    :return: The list of disulfide bonds from the class (empty if no row matches).
    """
    class_table = self.tclass.sixclass_df
    found = DisulfideList([], clsid, quiet=True)
    row_bar = tqdm(class_table.iterrows(), total=class_table.shape[0], leave=True)

    for idx, row in row_bar:
        if row["class_id"] != clsid:
            # Not the requested class: update the scan counter and move on.
            row_bar.set_postfix({"Cnt": idx})
            continue

        member_bar = tqdm(row["ss_id"], leave=True)
        for ssid in member_bar:
            found.append(self[ssid])
        member_bar.set_postfix({"Done": ""})
        break

    return found
Return the list of disulfides corresponding to the input clsid
.
Parameters
- clsid: The class name to extract.
Returns
The list of disulfide bonds from the class.
def getlist(self) -> DisulfideList:
    """
    Return a deep copy of the list of Disulfides held by the loader, so the
    caller can modify the result without affecting ``self.SSList``.

    :return: DisulfideList
    :rtype: DisulfideList
    """
    snapshot = copy.deepcopy(self.SSList)
    return snapshot
Return the list of Disulfides contained in the class.
Returns
DisulfideList
def get_by_name(self, name) -> Disulfide:
    """
    Return the first Disulfide in ``self.SSList.data`` whose ``name`` equals
    `name`, or ``None`` when there is no such entry.

    The stored object itself is returned, not a copy.

    :param name: Disulfide name to look for
    :return: The matching Disulfide or None
    """
    matches = (ss for ss in self.SSList.data if ss.name == name)
    return next(matches, None)
Returns the Disulfide with the given name from the list.
def describe(self) -> None:
    """
    Provides information about the Disulfide database contained in `self`.

    The "Total RAM Used" figure is the sum of ``sys.getsizeof()`` for the
    disulfide list, the ID dictionary and the torsion table, converted from
    bytes to GB (1024**3 bytes). ``sys.getsizeof`` is shallow, so objects
    referenced from inside those containers are not counted.

    Example:<br>

    ```python
    from proteusPy import Load_PDB_SS
    PDB_SS = Load_PDB_SS(verbose=False, subset=False)
    PDB_SS.describe()
    =========== RCSB Disulfide Database Summary ==============
    =========== Built: 2024-02-12 17:48:13 ==============
    PDB IDs present: 35818
    Disulfides loaded: 120494
    Average structure resolution: 2.34 Å
    Lowest Energy Disulfide: 2q7q_75D_140D
    Highest Energy Disulfide: 1toz_456A_467A
    Cα distance cutoff: 8.00 Å
    Total RAM Used: 30.72 GB.
    ================= proteusPy: 0.91 =======================
    ```
    """
    bytes_per_gb = 1024 * 1024 * 1024

    vers = self.version
    tot = self.TotalDisulfides
    pdbs = len(self.SSDict)
    # The value is labelled "GB" below, so convert bytes -> GB (not MB).
    ram = (
        sys.getsizeof(self.SSList)
        + sys.getsizeof(self.SSDict)
        + sys.getsizeof(self.TorsionDF)
    ) / bytes_per_gb
    res = self.Average_Resolution
    cutoff = self.cutoff
    timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.timestamp))
    ssMin, ssMax = self.SSList.minmax_energy

    print(" =========== RCSB Disulfide Database Summary ==============")
    print(f" =========== Built: {timestr} ==============")
    print(f"PDB IDs present: {pdbs}")
    print(f"Disulfides loaded: {tot}")
    print(f"Average structure resolution: {res:.2f} Å")
    print(f"Lowest Energy Disulfide: {ssMin.name}")
    print(f"Highest Energy Disulfide: {ssMax.name}")
    print(f"Cα distance cutoff: {cutoff:.2f} Å")
    print(f"Total RAM Used: {ram:.2f} GB.")
    print(f" ================= proteusPy: {vers} =======================")
Provides information about the Disulfide database contained in self
.
Example:
from proteusPy import Load_PDB_SS
PDB_SS = Load_PDB_SS(verbose=False, subset=False)
PDB_SS.describe()
=========== RCSB Disulfide Database Summary ==============
=========== Built: 2024-02-12 17:48:13 ==============
PDB IDs present: 35818
Disulfides loaded: 120494
Average structure resolution: 2.34 Å
Lowest Energy Disulfide: 2q7q_75D_140D
Highest Energy Disulfide: 1toz_456A_467A
Cα distance cutoff: 8.00 Å
Total RAM Used: 30.72 GB.
================= proteusPy: 0.91 =======================
def display_overlay(self, pdbid) -> None:
    """
    Display all disulfides for a given PDB ID overlaid in stick mode against
    a common coordinate frame. This allows us to see all of the disulfides
    at one time in a single view. Colors vary smoothly between bonds.

    The disulfides are obtained by indexing the loader with `pdbid`; the
    rendering itself is delegated to the ``display_overlay()`` method of the
    object that lookup returns.

    :param self: DisulfideLoader object initialized with the database.
    :param pdbid: the PDB id string, e.g. 4yys
    :return: None

    Example:
    >>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList

    Instantiate the Loader with the SS database subset.

    >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)

    Display the Disulfides from the PDB ID ```4yys```, overlaid onto
    a common reference (the proximal disulfides).

    >>> PDB_SS.display_overlay('4yys')

    You can also slice the loader and display as an overlay.
    >>> PDB_SS[:8].display_overlay()

    """
    self[pdbid].display_overlay()
Display all disulfides for a given PDB ID overlaid in stick mode against a common coordinate frame. This allows us to see all of the disulfides at one time in a single view. Colors vary smoothly between bonds.
Parameters
- self: DisulfideLoader object initialized with the database.
- pdbid: the PDB id string, e.g. 4yys
Returns
None
Example:
>>> from proteusPy import Disulfide, Load_PDB_SS, DisulfideList
Instantiate the Loader with the SS database subset.
>>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
Display the Disulfides from the PDB ID 4yys
, overlaid onto
a common reference (the proximal disulfides).
>>> PDB_SS.display_overlay('4yys')
You can also slice the loader and display as an overlay.
>>> PDB_SS[:8].display_overlay()
def getTorsions(self, pdbID=None) -> pd.DataFrame:
    """
    Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols

    With no `pdbID` (or any falsy value) a deep copy of the whole torsion
    table is returned. Otherwise the ID must be a key of ``self.SSDict`` and
    a copy of the rows whose ``source`` column equals `pdbID` is returned.

    :param pdbID: pdbID, defaults to None, meaning return entire dataset.
    :type pdbID: str, optional used to extract for a specific PDB structure. If not specified
        then return the entire dataset.
    :raises DisulfideParseWarning: Raised if not found
    :return: Torsions Dataframe
    :rtype: pd.DataFrame

    Example:
    >>> from proteusPy import Load_PDB_SS
    >>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
    >>> Tor_DF = PDB_SS.getTorsions()
    """
    if not pdbID:
        return copy.deepcopy(self.TorsionDF)

    try:
        # Lookup is done only for its KeyError: it validates the ID.
        self.SSDict[pdbID]
        torsions = self.TorsionDF
        return torsions[torsions["source"] == pdbID].copy()
    except KeyError:
        raise DisulfideParseWarning(f"! Cannot find key {pdbID} in SSBond DB")
Return the torsions, distances and energies defined by Disulfide.Torsion_DF_cols
Parameters
- pdbID: pdbID, defaults to None, meaning return entire dataset.
Raises
- DisulfideParseWarning: Raised if not found
Returns
Torsions Dataframe
Example:
>>> from proteusPy import Load_PDB_SS
>>> PDB_SS = Load_PDB_SS(verbose=False, subset=True)
>>> Tor_DF = PDB_SS.getTorsions()
@property
def quiet(self) -> bool:
    """
    Current value of the loader's quiet flag (stored in ``self.QUIET``).

    :return: quiet parameter
    :rtype: bool
    """
    flag = self.QUIET
    return flag
The loader quiet state
Returns
quiet parameter
def plot_classes_vs_cutoff(self, cutoff, steps) -> None:
    """
    Plot the total percentage and number of members for each class against the cutoff value.

    `steps` evenly spaced cutoffs between 0 and `cutoff` are evaluated. For
    each one the six-class table is filtered through
    ``self.tclass.filter_sixclass_by_percentage`` and a summary line is
    printed; the two resulting curves share an x axis on a twin-axis figure.

    :param cutoff: Percent cutoff value for filtering the classes.
    :param steps: Number of cutoff values sampled between 0 and `cutoff`.
    :return: None
    """

    import matplotlib.pyplot as plt
    import numpy as np

    cutoffs = np.linspace(0, cutoff, steps)
    percent_totals = []
    member_counts = []

    for c in cutoffs:
        filtered = self.tclass.filter_sixclass_by_percentage(c)
        tot = filtered["percentage"].sum()
        n_members = filtered.shape[0]
        percent_totals.append(tot)
        member_counts.append(n_members)
        print(
            f"Cutoff: {c:5.3} accounts for {tot:7.2f}% and is {n_members:5} members long."
        )

    _, pct_axis = plt.subplots()
    # Second y axis so percentage and member count can use their own scales.
    count_axis = pct_axis.twinx()

    pct_axis.plot(cutoffs, percent_totals, label="Total percentage", color="blue")
    count_axis.plot(cutoffs, member_counts, label="Number of members", color="red")

    pct_axis.set_xlabel("Cutoff")
    pct_axis.set_ylabel("Total percentage", color="blue")
    count_axis.set_ylabel("Number of members", color="red")

    plt.show()
Plot the total percentage and number of members for each class against the cutoff value.
Parameters
- cutoff: Percent cutoff value for filtering the classes.
Returns
None
def plot_binary_to_sixclass_incidence(
    self, light=True, save=False, savedir="."
) -> None:
    """
    Plot the incidence of all sextant Disulfide classes for each binary class.

    Every ``class_id`` in ``self.tclass.classdf`` is expanded to its six-fold
    classes with ``self.tclass.binary_to_six_class``, the members of each are
    counted, and the counts are handed to
    ``proteusPy.DisulfideClasses.plot_count_vs_class_df``.

    :param light: If True use the "light" plot theme, otherwise "dark",
        defaults to True
    :param save: Passed through to the plotting function, defaults to False
    :param savedir: Passed through to the plotting function, defaults to "."
    """

    from proteusPy.DisulfideClasses import plot_count_vs_class_df

    # Honour the `light` argument, which was previously ignored.
    # NOTE(review): "dark" is assumed to be the theme name the plotting
    # function expects for the non-light case -- confirm.
    theme = "light" if light else "dark"

    for cls in self.tclass.classdf["class_id"]:
        sixcls = self.tclass.binary_to_six_class(cls)
        # Use the existing method rather than a nested duplicate of it.
        df = self.enumerate_sixclass_fromlist(sixcls)
        plot_count_vs_class_df(df, cls, theme=theme, save=save, savedir=savedir)
    return
Plot the incidence of all sextant Disulfide classes for a given binary class.
Parameters
- loader:
proteusPy.DisulfideLoader
object
def enumerate_sixclass_fromlist(self, sslist) -> pd.DataFrame:
    """
    Count the members of each six-fold class ID in `sslist`.

    ``None`` entries are skipped, as are class IDs for which
    ``self.tclass.sslist_from_classid`` returns ``None``. A class whose
    member list is empty is kept with a count of 0.

    :param sslist: Iterable of six-fold class IDs (may contain None)
    :return: DataFrame with columns ``class_id`` and ``count``
    """
    class_ids = []
    counts = []

    for class_id in sslist:
        if class_id is None:
            continue
        members = self.tclass.sslist_from_classid(class_id)
        if members is None:
            # No entry for this class at all; distinct from "zero members".
            continue
        class_ids.append(class_id)
        counts.append(len(members))

    table = pd.DataFrame(columns=["class_id", "count"])
    table["class_id"] = class_ids
    table["count"] = counts
    return table
def save(self, savepath=DATA_DIR, subset=False, cutoff=-1.0):
    """
    Pickle the fully instantiated Loader to a file under `savepath`.

    Before writing, the loader's ``version`` is refreshed from
    ``proteusPy.__version__`` and its ``cutoff`` is overwritten with the
    `cutoff` argument. The file name is fixed: ``LOADER_SUBSET_FNAME`` when
    `subset` is True, ``LOADER_FNAME`` otherwise. It is appended to
    `savepath` verbatim, so `savepath` must end with a path separator.

    :param savepath: Path to save the file, defaults to DATA_DIR
    :param subset: If True, write the subset loader file, defaults to False
    :param cutoff: Distance cutoff used to build the database, -1 means no cutoff.
    """
    self.version = proteusPy.__version__
    self.cutoff = cutoff

    fname = LOADER_SUBSET_FNAME if subset else LOADER_FNAME
    _fname = f"{savepath}{fname}"

    if self.verbose:
        print(f"-> DisulfideLoader.save(): Writing {_fname}... ")

    with open(_fname, "wb+") as out_file:
        pickle.dump(self, out_file)

    if self.verbose:
        print(f"-> DisulfideLoader.save(): Done.")
Save a copy of the fully instantiated Loader to the specified file.
Parameters
- savepath: Path to save the file, defaults to DATA_DIR
- fname: Filename, defaults to LOADER_FNAME
- verbose: Verbosity, defaults to False
- cutoff: Distance cutoff used to build the database, -1 means no cutoff.
def Download_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False):
    """
    Download the databases from my Google Drive.

    The full loader file is always downloaded; the subset loader file is
    downloaded in addition when `subset` is True. File names are appended to
    `loadpath` verbatim.

    :param loadpath: Directory prefix the files are written to, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to False
    :param subset: If True, also download the subset database, defaults to False
    """

    import gdown

    full_target = f"{loadpath}{LOADER_FNAME}"
    subset_target = f"{loadpath}{LOADER_SUBSET_FNAME}"

    if verbose:
        print(f"--> DisulfideLoader: Downloading Disulfide Database from Drive...")
    gdown.download(LOADER_ALL_URL, full_target, quiet=False)

    if not subset:
        return

    if verbose:
        print(
            f"--> DisulfideLoader: Downloading Disulfide Subset Database from Drive..."
        )
    gdown.download(LOADER_SUBSET_URL, subset_target, quiet=False)

    return
Download the databases from my Google Drive.
Parameters
- loadpath: Path from which to load, defaults to DATA_DIR
- verbose: Verbosity, defaults to False
def Download_PDB_SS_GitHub(loadpath=DATA_DIR, verbose=True, subset=False):
    """
    Download the databases from Github. Note: if you change the database these sizes will
    need to be changed!

    The full loader file is always downloaded; the subset loader file is
    downloaded as well when `subset` is True. After each download the
    ``Content-Length`` reported by the server is compared with the expected
    size and a message is printed on mismatch.

    :param loadpath: Directory prefix the files are written to, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to True
    :param subset: If True, also download the subset database, defaults to False
    :return: Number of downloads whose reported size matched (0, 1 or 2)
    """

    # ``import urllib`` alone does not guarantee the ``request`` submodule
    # is loaded, so import it explicitly.
    import urllib.request

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"

    _all_length = 340371775
    _subset_length = 9636086

    def _fetch(url, dest, expected_length):
        """Download `url` to `dest`; return 1 if the reported size matches, else 0."""
        _, headers = urllib.request.urlretrieve(url, dest)
        raw_length = headers.get("content-length")
        # Header values are strings (or missing); convert before comparing
        # with the integer size, otherwise the check can never succeed.
        try:
            num_bytes = int(raw_length)
        except (TypeError, ValueError):
            num_bytes = None
        if num_bytes == expected_length:
            return 1
        print(f"--> Read: {num_bytes}, expecting: {expected_length}")
        return 0

    if verbose:
        print(f"--> DisulfideLoader: Downloading Disulfide Database from GitHub...")

    _good1 = _fetch(
        "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_ALL_LOADER.pkl",
        _fname_all,
        _all_length,
    )

    _good2 = 0
    if subset:
        if verbose:
            print(
                f"--> DisulfideLoader: Downloading Disulfide Subset Database from GitHub..."
            )

        _good2 = _fetch(
            "https://github.com/suchanek/proteusPy/raw/master/data/PDB_SS_SUBSET_LOADER.pkl",
            _fname_sub,
            _subset_length,
        )

    return _good1 + _good2
Download the databases from Github. Note: if you change the database these sizes will need to be changed!
Parameters
- loadpath: Path from which to load, defaults to DATA_DIR
- verbose: Verbosity, defaults to True
def Load_PDB_SS(loadpath=DATA_DIR, verbose=False, subset=False) -> DisulfideLoader:
    """
    Load the fully instantiated Disulfide database from the specified file. Use the
    defaults unless you are building the database by hand. *This is the function
    used to load the built database.*

    If either the subset or the full loader file is missing under `loadpath`,
    it is fetched with ``Download_PDB_SS`` before the requested file is read.

    :param loadpath: Path from which to load, defaults to DATA_DIR
    :param verbose: Verbosity, defaults to False
    :param subset: If True, load the subset DB, otherwise load the full database
    :return: The unpickled loader object
    """
    # normally the .pkl files are local, EXCEPT for the first run from a newly-installed proteusPy
    # distribution. In that case we need to download the files for all disulfides and the subset.

    _fname_sub = f"{loadpath}{LOADER_SUBSET_FNAME}"
    _fname_all = f"{loadpath}{LOADER_FNAME}"
    _fname = _fname_sub if subset else _fname_all

    if not os.path.exists(_fname_sub):
        Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=True)

    if not os.path.exists(_fname_all):
        Download_PDB_SS(loadpath=loadpath, verbose=verbose, subset=False)

    if verbose:
        print(f"-> load_PDB_SS(): Reading {_fname}... ")

    # NOTE(security): pickle.load() can execute arbitrary code, and this file
    # may have just been downloaded. Only use trusted download sources.
    with open(_fname, "rb") as f:
        res = pickle.load(f)

    if verbose:
        print(f"-> load_PDB_SS(): Done reading {_fname}... ")
    return res
Load the fully instantiated Disulfide database from the specified file. Use the defaults unless you are building the database by hand. This is the function used to load the built database.
Parameters
- loadpath: Path from which to load, defaults to DATA_DIR
- verbose: Verbosity, defaults to False
- subset: If True, load the subset DB, otherwise load the full database