Source code for protflow.utils.metrics

"""
This module provides a comprehensive suite of tools for calculating various metrics related to proteins and their sequences, specifically designed to facilitate detailed analysis and comparisons. The functionalities in this module allow users to determine mutations between protein sequences, calculate structural metrics such as the radius of gyration, and compute sequence identities, among other tasks.

Overview:
The module encompasses a range of utilities aimed at analyzing protein structures and sequences. Users can compare protein sequences to identify mutations, calculate the radius of gyration from PDB files, and assess sequence identity both pairwise and across multiple sequences. Additionally, it provides methods for evaluating protein entropy, ligand interactions, and structural consistency metrics such as self-consistency TM-score and Baker in silico success scores.

Key functionalities include:
- **Mutation Analysis**: Tools to list and count mutations between wild-type and variant sequences, and to identify mutation indices.
- **Structural Metrics**: Calculation of the radius of gyration for protein structures and evaluation of ligand clashes and contacts.
- **Sequence Analysis**: Computation of sequence identity between two sequences and across a list of sequences.
- **Entropy Calculation**: Calculation of entropy based on a given probability distribution.
- **Self-Consistency and Success Scores**: Methods to compute self-consistency TM-scores and Baker in silico success scores within dataframes.
- **Ligand Interaction Analysis**: Evaluation of ligand clashes and contacts within a protein structure.

Examples:
Here are some examples of how to use the functions provided in this module:

1. Counting mutations between two protein sequences:
    ```python
    from metrics import count_mutations
    mutation_count, mutations = count_mutations("ACDEFG", "ACDQFG")
    print(mutation_count, mutations)
    # Output: 1, ['E4Q']
    ```

2. Calculating radius of gyration from a PDB file:
    ```python
    from metrics import calc_rog_of_pdb
    rog = calc_rog_of_pdb("example.pdb")
    print(rog)
    ```

3. Finding mutation indices between two sequences:
    ```python
    from metrics import get_mutation_indeces
    indices = get_mutation_indeces("ACGTAGCT", "ACCTAGCT")
    print(indices)
    # Output: [3]
    ```

4. Calculating sequence identity between two sequences:
    ```python
    from metrics import calc_sequence_identity
    identity = calc_sequence_identity("ACDEFG", "ACDQFG")
    print(identity)
    # Output: 0.8333333333333334
    ```

5. Computing all-against-all sequence identity for a list of sequences:
    ```python
    from metrics import all_against_all_sequence_identity
    identities = all_against_all_sequence_identity(["ACDEFG", "ACDFGG", "ACDEFG"])
    print(identities)
    # Output: [0.8333333333333334, 0.8333333333333334, 1.0]
    ```

6. Calculating entropy from a probability distribution:
    ```python
    from metrics import entropy
    prob_dist = np.array([0.1, 0.2, 0.7])
    ent = entropy(prob_dist)
    print(ent)
    # Output: 1.1567796494470395
    ```

7. Calculating self-consistency TM-score in a dataframe:
    ```python
    from metrics import calc_sc_tm
    df = pd.DataFrame({"ref_col": ["A", "B"], "tm_col": [0.9, 0.85]})
    updated_df = calc_sc_tm(df, "sc_tm_score", "ref_col", "tm_col")
    print(updated_df)
    ```

These examples illustrate the primary capabilities of the module, showcasing how it can be utilized to streamline the process of analyzing protein structures and sequences.
"""
# Imports

# dependencies
import numpy as np
from Bio.PDB.Structure import Structure
import pandas as pd

# customs
from .biopython_tools import get_atoms, load_structure_from_pdbfile
from .utils import vdw_radii


[docs]
def get_mutations_list(wt: str, variant:str) -> None:
    '''Not implemented.'''
    raise NotImplementedError



[docs]
def count_mutations(wt: str, variant:str) -> tuple[int, list[str]]:
    """
    Compares two protein sequences and counts the number of mutations, 
    returning both the count and a detailed list of mutations.

    Each mutation is represented in the format: 
    '[original amino acid][position][mutated amino acid]'.
    
    Parameters:
    seq1 (str): The first protein sequence (e.g., wild type).
    seq2 (str): The second protein sequence (e.g., variant).

    Returns:
    tuple[int, list[str]]: A tuple where the first element is an integer 
    representing the number of mutations, and the second element is a list of 
    strings detailing each mutation.

    Raises:
    ValueError: If the input sequences are not of the same length.

    Example:
    >>> count_mutations("ACDEFG", "ACDQFG")
    (1, ['E4Q'])
    """
    # Check if the lengths of the sequences are the same
    if len(wt) != len(variant):
        raise ValueError("Sequences must be of the same length")

    mutations = []
    mutation_count = 0

    for i, (a, b) in enumerate(zip(wt, variant)):
        if a != b:
            mutation_count += 1
            mutation = f"{a}{i+1}{b}"
            mutations.append(mutation)

    return mutation_count, mutations



[docs]
def get_mutation_indeces(wt: str, variant:str) -> list[int]:
    '''
    Find the indices of mutations between two sequences.
    Can be protein, or nucleic acid sequences.

    Parameters:
    - wt (str): The wild-type sequence.
    - variant (str): The variant sequence.

    Returns:
    - list[int]: A list of indices where mutations occur (1-based index).

    Raises:
    - ValueError: If the lengths of 'wt' and 'variant' sequences are not the same.

    Description:
    This function takes two sequences, 'wt' (wild-type) and 'variant', and returns a list of indices where mutations occur (i.e., where the two sequences differ). The indices are 1-based. If the lengths of 'wt' and 'variant' are not the same, a ValueError is raised.

    Example:
    >>> wt_sequence = "ACGTAGCT"
    >>> variant_sequence = "ACCTAGCT"
    >>> mutations = get_mutation_indeces(wt_sequence, variant_sequence)
    >>> print(mutations)
    [3]
    '''
    # sanity
    if len(wt) != len(variant): raise ValueError(f"wt and variant must be of same length! lengths: wt: {len(wt)} variant: {len(variant)}")

    # convert sequences into arrays
    wt_arr = np.array(list(wt))
    variant_arr = np.array(list(variant))

    # Find indices where mutations occur (1-based index)
    return list(np.where(wt_arr != variant_arr)[0] + 1)



[docs]
def calc_rog_of_pdb(pdb_path: str, min_dist: float = 0, chain: str = None) -> float:
    """
    Calculate the radius of gyration of a protein from a PDB file.

    This function loads a protein structure from a PDB file and computes the
    radius of gyration for the alpha carbon atoms (Cα).

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file containing the protein structure.
    min_dist : float, optional
        Minimum distance to consider between atoms, by default 0.

    Returns
    -------
    float
        The calculated radius of gyration of the protein.

    Example
    -------
    >>> from metrics import calc_rog_of_pdb
    >>> rog = calc_rog_of_pdb("example.pdb")
    >>> print(rog)
    """
    return calc_rog(load_structure_from_pdbfile(pdb_path), min_dist=min_dist, chain = chain)



[docs]
def calc_rog(pose: Structure, min_dist: float = 0, chain: str = None) -> float:
    """
    Calculate the radius of gyration of a protein's alpha carbons.

    This function computes the radius of gyration for the alpha carbon atoms (Cα)
    in a given protein structure.

    Parameters
    ----------
    pose : Bio.PDB.Structure.Structure
        A Bio.PDB.Structure.Structure object representing the protein structure.
    min_dist : float, optional
        Minimum distance to consider between atoms, by default 0.

    Returns
    -------
    float
        The calculated radius of gyration of the protein.

    Raises
    ------
    ValueError
        If the `pose` parameter is not of type Bio.PDB.Structure.Structure.

    Example
    -------
    >>> from metrics import calc_rog
    >>> from Bio.PDB import PDBParser
    >>> parser = PDBParser()
    >>> structure = parser.get_structure("example", "example.pdb")
    >>> rog = calc_rog(structure)
    >>> print(rog)
    """
    # get CA coordinates and calculate centroid
    if chain:
        ca_coords = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.id == "CA" and atom.get_parent().get_parent().id == chain])
    else:
        ca_coords = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.id == "CA"])
    centroid = np.mean(ca_coords, axis=0)

    # calculate average distance of CA atoms to centroid
    dgram = np.maximum(min_dist, np.linalg.norm(ca_coords - centroid, axis=-1))

    # take root over squared sum of distances and return (rog):
    return np.sqrt(np.sum(dgram**2) / ca_coords.shape[0])



[docs]
def calc_sequence_identity(seq1: str, seq2: str) -> float:
    """
    Calculate sequence identity between two protein sequences.

    This function computes the sequence identity by comparing two protein sequences
    of the same length and determining the proportion of matching amino acids.

    Parameters
    ----------
    seq1 : str
        The first protein sequence.
    seq2 : str
        The second protein sequence.

    Returns
    -------
    float
        The sequence identity as a fraction of matching amino acids.

    Raises
    ------
    ValueError
        If the input sequences are not of the same length.

    Example
    -------
    >>> from metrics import calc_sequence_identity
    >>> identity = calc_sequence_identity("ACDEFG", "ACDQFG")
    >>> print(identity)
    # Output: 0.8333333333333334
    """
    if len(seq1) != len(seq2):
        raise ValueError(f"Sequences must be of the same length. Length of seq1: {len(seq1)}, length of seq2: {len(seq2)}")
    matching = sum(1 for a, b in zip(seq1, seq2) if a == b)
    return matching / len(seq1)



[docs]
def all_against_all_sequence_identity(input_seqs: list[str]) -> list:
    """
    Calculate the maximum sequence identity for all sequences against each other.

    This function takes a list of protein sequences and computes the maximum sequence
    identity for each sequence against all others in the list.

    Parameters
    ----------
    input_seqs : list of str
        A list of protein sequences.

    Returns
    -------
    list of float
        A list of maximum sequence identities for each sequence against all others.

    Example
    -------
    >>> from metrics import all_against_all_sequence_identity
    >>> identities = all_against_all_sequence_identity(["ACDEFG", "ACDFGG", "ACDEFG"])
    >>> print(identities)
    # Output: [0.8333333333333334, 0.8333333333333334, 1.0]
    """
    # create a mapping for quick calculation with numpy
    aa_mapping = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}
    mapped_seqs = np.array([[aa_mapping[s] for s in seq] for seq in input_seqs])

    # create all-against-all array and calculate sequence identity
    expanded_a = mapped_seqs[:, np.newaxis]
    expanded_b = mapped_seqs[np.newaxis, :]
    similarity_matrix = np.mean(expanded_a == expanded_b, axis=2)

    # convert diagonal from 1.0 to -inf
    np.fill_diagonal(similarity_matrix, -np.inf)

    return list(np.max(similarity_matrix, axis=1))



[docs]
def entropy(prob_distribution: np.ndarray, axis: int = -1) -> float:
    """
    Compute element-wise Shannon entropy H(p) = –∑ p·log₂ p along the given axis,
    safely ignoring any p == 0 terms.

    Parameters
    ----------
    prob_distribution : np.array
        An array representing the probability distribution.

    Returns
    -------
    np.ndarray
        The calculated entropies of the probability distribution.

    Example
    -------
    >>> from metrics import entropy
    >>> prob_dist = np.array([0.1, 0.2, 0.7])
    >>> ent = entropy(prob_dist)
    >>> print(ent)
    # Output: 1.1567796494470395
    """
    # sanity
    prob_distribution = np.asarray(prob_distribution)

    # Suppress log(0) warnings; log₂(0)→–inf, but we’ll zero it out next.
    with np.errstate(divide='ignore', invalid='ignore'):
        plogp = prob_distribution * np.log2(prob_distribution)

    # Wherever p was zero, force p·log₂p → 0
    plogp = np.where(prob_distribution > 0, plogp, 0.0)

    # calculate entropy element wise
    H = -np.sum(plogp, axis=axis)
    return H



[docs]
def calc_sc_tm(input_df: pd.DataFrame, name: str, ref_col: str, tm_col: str) -> pd.DataFrame:
    """
    Calculate self-consistency TM-score in a dataframe.

    This function computes the self-consistency TM-score for protein structures and
    integrates the results into the input dataframe.

    Parameters
    ----------
    input_df : pd.DataFrame
        A dataframe containing protein structure data.
    name : str
        The name of the new column that should hold the self-consistency TM-score.
    ref_col : str
        The column in `input_df` pointing to the reference description or location.
    tm_col : str
        The column in `input_df` pointing to the TM-scores from TMAlign runner.

    Returns
    -------
    pd.DataFrame
        The input dataframe with the integrated self-consistency TM-score column.

    Raises
    ------
    KeyError
        If the `name` column already exists in `input_df` or if `tm_col` does not exist in `input_df`.
    ValueError
        If `ref_col` does not point to a description or location column in `input_df`.

    Example
    -------
    >>> from metrics import calc_sc_tm
    >>> df = pd.DataFrame({"ref_col": ["A", "B"], "tm_col": [0.9, 0.85]})
    >>> updated_df = calc_sc_tm(df, "sc_tm_score", "ref_col", "tm_col")
    >>> print(updated_df)
    """
    # check if name exists in poses
    if name in input_df.columns:
        raise KeyError(f"Column {name} already present in DataFrame. Choose different name!")
    if tm_col not in input_df.columns:
        raise KeyError(f"Column {tm_col} does not exist in DataFrame. Did you mean any of {[x for x in input_df.columns if tm_col.split('_')[0] in x]}")

    # get descriptions of poses and tm_col_description
    if not ref_col.endswith("description") and not ref_col.endswith("location"):
        raise ValueError(f"Parameter :ref_col: does not point to description or location column in :input_df:. ref_col: {ref_col}")

    # group df by ref_col description and get maximum tm_score in each group.
    grouped_max = input_df.groupby(ref_col)[tm_col].max().reset_index()
    grouped_max.columns = [ref_col, name]

    # merge max tm-scores into input_df[name]
    input_df = input_df.merge(grouped_max, on=ref_col, how='left')
    return input_df



[docs]
def calc_ligand_clashes(pose: str|Structure, ligand_chain: str, dist: float = 3, atoms: list[str] = None, exclude_ligand_hydrogens: bool = False) -> float:
    """
    Calculate ligand clashes for a PDB file given a ligand chain.

    This method calculates the number of clashes between a specified ligand chain and the rest of the structure in a PDB file or a Bio.PDB Structure object. A clash is defined as any pair of atoms (one from the ligand, one from the rest of the structure) that are within a specified distance of each other.

    Parameters:
        pose (str | Bio.PDB.Structure.Structure): The pose representing the structure, which can be a path to a PDB file (str) or a Bio.PDB Structure object.
        ligand_chain (str): The chain identifier for the ligand within the structure.
        dist (float, optional): The distance threshold for defining a clash. Default is 3.0.
        atoms (list[str], optional): A list of atom names to consider for clash calculations. If None, all atoms are considered. If specified, only these atoms will be included in the clash calculation.

    Returns:
        float: The number of clashes found between the ligand and the rest of the structure.

    Examples:
        Here is an example of how to use the `calc_ligand_clashes` method:

        .. code-block:: python

            from Bio.PDB import PDBParser

            # Load structure from a PDB file
            parser = PDBParser()
            structure = parser.get_structure("example", "example.pdb")

            # Calculate clashes
            clashes = calc_ligand_clashes(structure, ligand_chain="A", dist=3.0, atoms=["N", "CA", "C"])
            # clashes will be a float representing the number of clashes

    Further Details:
        - **Clash Calculation:** The method calculates the Euclidean distance between all specified atoms of the ligand chain and the rest of the structure. A clash is counted if the distance is less than the specified threshold.
        - **Usage:** This function is useful for evaluating potential steric clashes in molecular docking studies or for validating the positioning of ligands in structural models.

    This method is designed to facilitate the detection of steric clashes between ligands and the surrounding structure, providing a quantitative measure of potential conflicts.
    """
    # verify inputs
    if isinstance(pose, str):
        pose = load_structure_from_pdbfile(pose)
    elif not isinstance(pose, Structure):
        raise ValueError(f"Parameter :pose: has to be of type str or Bio.PDB.Structure.Structure. type(pose) = {type(pose)}")

    # check for ligand chain
    pose_chains = list(chain.id for chain in pose.get_chains())
    if ligand_chain not in pose_chains:
        raise KeyError(f"Chain {ligand_chain} not found in pose. Available Chains: {pose_chains}")

    # get atoms
    if not atoms or atoms == "all":
        pose_atoms = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain])
    elif isinstance(atoms, list) and all(isinstance(atom, str) for atom in atoms):
        pose_atoms = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain and atom.id in atoms])
    else:
        raise ValueError(f"Invalid Value for parameter :atoms:. For all atoms set to {{None, False, 'all'}} or specify list of atoms e.g. ['N', 'CA', 'CO']")
    if exclude_ligand_hydrogens: 
        ligand_atoms = np.array([atom.get_coord() for atom in pose[ligand_chain].get_atoms() if not atom.element == "H"])
    else: 
        ligand_atoms = np.array([atom.get_coord() for atom in pose[ligand_chain].get_atoms()])

    # calculate clashes
    dgram = np.linalg.norm(pose_atoms[:, np.newaxis] - ligand_atoms[np.newaxis, :], axis=-1)

    return np.sum((dgram < dist))



[docs]
def calc_ligand_clashes_vdw(pose: str|Structure, ligand_chain: str, factor: float = 1, atoms: list[str] = None, exclude_ligand_elements: list[str] = None) -> int:
    """
    Calculate ligand clashes for a PDB file given a ligand chain.

    This method calculates the number of clashes between a specified ligand chain and the rest of the structure in a PDB file or a Bio.PDB Structure object. A clash is defined as any pair of atoms (one from the ligand, one from the rest of the structure) that are within the sum of their Van der Waals radii multiplied by a factor.

    Parameters:
        pose (str | Bio.PDB.Structure.Structure): The pose representing the structure, which can be a path to a PDB file (str) or a Bio.PDB Structure object.
        ligand_chain (str): The chain identifier for the ligand within the structure.
        factor (float, optional): The multiplier for the VdW clash threshold for defining a clash. Lower numbers result in less stringent clash detection. Default is 1.0.
        atoms (list[str], optional): A list of atom names to consider for clash calculations. If None, all atoms are considered. If specified, only these atoms will be included in the clash calculation.
        exclude_ligand_elements (list[str], optional): A list of elements that should not be considered during clash detection (e.g. ['H']). Default is None

    Returns:
        float: The number of clashes found between the ligand and the rest of the structure.

    Examples:
        Here is an example of how to use the `calc_ligand_clashes` method:

        .. code-block:: python

            from Bio.PDB import PDBParser

            # Load structure from a PDB file
            parser = PDBParser()
            structure = parser.get_structure("example", "example.pdb")

            # Calculate clashes
            clashes = calc_ligand_clashes_vdw(structure, ligand_chain="A", factor=0.8, atoms=["N", "CA", "C"], exclude_ligand_atoms=["H"])
            # clashes will be a float representing the number of clashes

    Further Details:
        - **Clash Calculation:** The method calculates the Euclidean distance between all specified atoms of the ligand chain and the rest of the structure. A clash is detected if the distance is less than the sum of their Van der Waals radii multiplied by a set factor.
        - **Usage:** This function is useful for evaluating potential steric clashes in molecular docking studies or for validating the positioning of ligands in structural models.

    This method is designed to facilitate the detection of steric clashes between ligands and the surrounding structure, providing a quantitative measure of potential conflicts.
    """
    # verify inputs
    if isinstance(pose, str):
        pose = load_structure_from_pdbfile(pose)
    elif not isinstance(pose, Structure):
        raise ValueError(f"Parameter :pose: has to be of type str or Bio.PDB.Structure.Structure. type(pose) = {type(pose)}")

    if exclude_ligand_elements:
        if not isinstance(exclude_ligand_elements, list):
            raise ValueError(f"Parameter:exclude_ligand_atoms: has to be a list of str, not {type(exclude_ligand_elements)}!")
        exclude_ligand_elements = [element.lower() for element in exclude_ligand_elements]

    # import VdW radii
    vdw_dict = vdw_radii()

    # check for ligand chain
    pose_chains = list(chain.id for chain in pose.get_chains())
    if ligand_chain not in pose_chains:
        raise KeyError(f"Chain {ligand_chain} not found in pose. Available Chains: {pose_chains}")

    # get atoms
    if not atoms or atoms == "all":
        pose_atoms = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain])
        pose_vdw = np.array([vdw_dict[atom.element.lower()] for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain])
    elif isinstance(atoms, list) and all(isinstance(atom, str) for atom in atoms):
        pose_atoms = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain and atom.id in atoms])
        pose_vdw = np.array([vdw_dict[atom.element.lower()] for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain and atom.id in atoms])
    else:
        raise ValueError(f"Invalid Value for parameter :atoms:. For all atoms set to {{None, False, 'all'}} or specify list of atoms e.g. ['N', 'CA', 'CO']")

    # get ligand atoms
    if exclude_ligand_elements:
        ligand_atoms = np.array([atom.get_coord() for atom in pose[ligand_chain].get_atoms() if not atom.element.lower() in exclude_ligand_elements])
        ligand_vdw = np.array([vdw_dict[atom.element.lower()] for atom in pose[ligand_chain].get_atoms() if not atom.element.lower() in exclude_ligand_elements])
    else:
        ligand_atoms = np.array([atom.get_coord() for atom in pose[ligand_chain].get_atoms()])
        ligand_vdw = np.array([vdw_dict[atom.element.lower()] for atom in pose[ligand_chain].get_atoms()])

    if np.any(np.isnan(ligand_vdw)):
        raise RuntimeError("Could not find Van der Waals radii for all elements in ligand. Check protflow.utils.vdw_radii and add it, if applicable!")

    # calculate distances between all atoms of ligand and protein
    dgram = np.linalg.norm(pose_atoms[:, np.newaxis] - ligand_atoms[np.newaxis, :], axis=-1)

    # calculate distance cutoff for each atom pair, considering VdW radii 
    distance_cutoff = pose_vdw[:, np.newaxis] + ligand_vdw[np.newaxis, :]

    # multiply distance cutoffs with set parameter
    distance_cutoff = distance_cutoff * factor

    # compare distances to distance_cutoff
    check = dgram - distance_cutoff

    # count number of clashes (where distances are below distance cutoff)
    clashes = np.sum((check < 0))

    return clashes



[docs]
def calc_ligand_contacts(pose: str, ligand_chain: str, min_dist: float = 3, max_dist: float = 5, atoms: list[str] = None, excluded_elements: list[str] = None) -> float:
    """
    Calculate contacts of a ligand within a structure.

    This method calculates the number of contacts between a specified ligand chain and the rest of the structure within a specified distance range. Contacts are defined as any pair of atoms (one from the ligand, one from the rest of the structure) where the distance falls between the minimum and maximum specified distances.

    Parameters:
        pose (str | Bio.PDB.Structure.Structure): The pose representing the structure, which can be a path to a PDB file (str) or a Bio.PDB Structure object.
        ligand_chain (str): The chain identifier for the ligand within the structure.
        min_dist (float, optional): The minimum distance threshold for defining a contact. Default is 3.0.
        max_dist (float, optional): The maximum distance threshold for defining a contact. Default is 5.0.
        atoms (list[str], optional): A list of atom names to consider for contact calculations. If None, all atoms are considered. If specified, only these atoms will be included in the contact calculation.
        excluded_elements (list[str], optional): A list of element symbols to exclude from the contact calculations. Default is ["H"].

    Returns:
        float: The number of contacts normalized by the number of ligand atoms.

    Examples:
        Here is an example of how to use the `calc_ligand_contacts` method:

        .. code-block:: python

            from Bio.PDB import PDBParser

            # Load structure from a PDB file
            parser = PDBParser()
            structure = parser.get_structure("example", "example.pdb")

            # Calculate contacts
            contacts = calc_ligand_contacts(structure, ligand_chain="A", min_dist=3.0, max_dist=5.0, atoms=["N", "CA", "C"], excluded_elements=["H", "O"])
            # contacts will be a float representing the number of contacts normalized by the number of ligand atoms

    Further Details:
        - **Contact Calculation:** The method calculates the Euclidean distance between all specified atoms of the ligand chain and the rest of the structure. A contact is counted if the distance is within the specified range (min_dist to max_dist).
        - **Usage:** This function is useful for evaluating potential interactions between ligands and the surrounding structure, particularly in drug design and molecular docking studies.

    This method is designed to facilitate the detection of relevant contacts between ligands and the surrounding structure, providing a quantitative measure of potential interactions.
    """

    # verify inputs
    if isinstance(pose, str):
        pose = load_structure_from_pdbfile(pose)
    elif not isinstance(pose, Structure):
        raise ValueError(f"Parameter :pose: has to be of type str or Bio.PDB.Structure.Structure. type(pose) = {type(pose)}")

    # check for ligand chain
    pose_chains = list(chain.id for chain in pose.get_chains())
    if ligand_chain not in pose_chains:
        raise KeyError(f"Chain {ligand_chain} not found in pose. Available Chains: {pose_chains}")

    # get pose atoms
    if not atoms or atoms == "all":
        pose_atoms = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain and atom.element not in excluded_elements])
    elif isinstance(atoms, list) and all(isinstance(atom, str) for atom in atoms):
        pose_atoms = np.array([atom.get_coord() for atom in pose.get_atoms() if atom.full_id[2] != ligand_chain and atom.id in atoms and atom.element not in excluded_elements])
    else:
        raise ValueError(f"Invalid Value for parameter :atoms:. For all atoms set to {{None, False, 'all'}} or specify list of atoms e.g. ['N', 'CA', 'CO']")
    ligand_atoms = np.array([atom.get_coord() for atom in pose[ligand_chain].get_atoms() if atom.element not in excluded_elements])

    # calculate complete dgram
    dgram = np.linalg.norm(pose_atoms[:, np.newaxis] - ligand_atoms[np.newaxis, :], axis=-1)

    # return number of contacts
    return np.sum((dgram > min_dist) & (dgram < max_dist)) / len(ligand_atoms)



[docs]
def residue_contacts(pose:str, max_distance:float, target_chain:str, partner_chain:str, target_resnum: int, target_atom_names:list[str]=None, partner_atom_names:list[str]=None, min_distance:float=0, ):
    # TODO: Write proper docstrings!
    # calculates number of atoms on partner_chain that are between max_distance and min_distance from target_atom_names on target_resnum of chain target_chain.
    pose = load_structure_from_pdbfile(pose)
    target = pose[target_chain][target_resnum] #[res for res in pose[target_chain].get_residues() if res.get_segid() == target_resnum][0]
    partner = pose[partner_chain]
    if target_atom_names:
        target_coords = np.array([atom.get_coord() for atom in target.get_atoms() if atom.id in target_atom_names])
    else:
        target_coords = np.array([atom.get_coord() for atom in target.get_atoms()])

    if partner_atom_names:
        partner_coords = np.array([atom.get_coord() for atom in partner.get_atoms() if atom.id in partner_atom_names])
    else:
        partner_coords = np.array([atom.get_coord() for atom in partner.get_atoms()])

    # calculate complete dgram
    dgram = np.linalg.norm(target_coords[:, np.newaxis] - partner_coords[np.newaxis, :], axis=-1)

    # return number of contacts
    return np.sum((dgram < max_distance) & (dgram > min_distance))



[docs]
def calc_interchain_contacts(pose: Structure, chains: list[str,str], contact_bounds: tuple[float,float] = (4,8), atoms: list[str] = None) -> float:
    '''Calculates contacts between chains in pose'''
    # get atoms of chains
    chain_a_atoms = np.array([atom.coord for atom in get_atoms(pose, chains=list(chains[0]), atoms=atoms)])
    chain_b_atoms = np.array([atom.coord for atom in get_atoms(pose, chains=list(chains[1]), atoms=atoms)])

    # calc dgram
    dgram = np.linalg.norm(chain_a_atoms[:, np.newaxis] - chain_b_atoms[np.newaxis, :], axis=-1)

    # return number of contacts according to dgram.
    min_dist, max_dist = contact_bounds
    return np.sum((dgram > min_dist) & (dgram < max_dist))



[docs]
def calc_interchain_contacts_pdb(pdb_path: str, chains: list[str,str], contact_bounds: tuple[float,float] = (4,8), atoms: list[str] = None):
    '''Calculates interchain contacts in pose for .pdb file'''
    pose = load_structure_from_pdbfile(pdb_path)
    return calc_interchain_contacts(pose, chains, contact_bounds, atoms)