Source code for kripodb.canned

# Copyright 2016 Netherlands eScience Center
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module with functions which use pandas DataFrame as input and output.

For using Kripo data files inside KNIME (http://www.knime.org)
"""

from __future__ import absolute_import

import numpy as np
import pandas as pd
from requests import HTTPError

from .db import FragmentsDb
from .pairs import similar, open_similarity_matrix
from .pharmacophores import PharmacophoresDb, as_phar
from .webservice.client import WebserviceClient, IncompleteFragments, IncompletePharmacophores


class IncompleteHits(Exception):
    """Raised when some query fragment identifiers could not be found.

    The exception keeps both the identifiers that were missing and the hits
    that were passed in, so a caller can still use the partial result.
    """

    def __init__(self, absent_identifiers, hits):
        """Store the partial hits and the identifiers without information.

        Args:
            absent_identifiers (List[str]): List of identifiers for which no information could be found
            hits (pandas.DataFrame): Data frame with query_frag_id, hit_frag_id and score columns
        """
        self.hits = hits
        self.absent_identifiers = absent_identifiers
        super(IncompleteHits, self).__init__(
            'Some query fragment identifiers could not be found'
        )


def similarities(queries, similarity_matrix_filename_or_url, cutoff, limit=1000):
    """Find similar fragments to queries based on similarity matrix.

    Args:
        queries (List[str]): Query fragment identifiers
        similarity_matrix_filename_or_url (str): Filename of similarity matrix file or base url of kripodb webservice
        cutoff (float): Cutoff, similarity scores below cutoff are discarded.
        limit (int): Maximum number of hits for each query.
            Default is 1000. Use None for no limit.

    Examples:
        Fragments similar to '3j7u_NDP_frag24' fragment.

        >>> import pandas as pd
        >>> from kripodb.canned import similarities
        >>> queries = pd.Series(['3j7u_NDP_frag24'])
        >>> hits = similarities(queries, 'data/similarities.h5', 0.55)
        >>> len(hits)
        11

        Retrieved from web service instead of local similarity matrix file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> hits = similarities(queries, 'http://localhost:8084/kripo', 0.55)
        >>> len(hits)
        11

    Returns:
        pandas.DataFrame: Data frame with query_frag_id, hit_frag_id and score columns

    Raises:
        IncompleteHits: When one or more of the identifiers could not be found.
            The exception carries the hits of the identifiers that were found.
        requests.HTTPError: When the web service fails with anything other than
            a 404 (not found) response.
    """
    columns = ['query_frag_id', 'hit_frag_id', 'score']
    hits = []
    absent_identifiers = []
    if similarity_matrix_filename_or_url.startswith('http'):
        client = WebserviceClient(similarity_matrix_filename_or_url)
        for query in queries:
            try:
                hits.extend(client.similar_fragments(query, cutoff, limit))
            except HTTPError as e:
                # Only "not found" means the identifier is absent; any other
                # HTTP failure must surface instead of silently dropping the
                # hits of this query.
                response = getattr(e, 'response', None)
                if response is None or response.status_code != 404:
                    raise
                absent_identifiers.append(query)
    else:
        similarity_matrix = open_similarity_matrix(similarity_matrix_filename_or_url)
        try:
            for query in queries:
                try:
                    for query_id, hit_id, score in similar(query, similarity_matrix, cutoff, limit):
                        hits.append({'query_frag_id': query_id,
                                     'hit_frag_id': hit_id,
                                     'score': score,
                                     })
                except KeyError:
                    absent_identifiers.append(query)
        finally:
            # Release the matrix file even when an unexpected error escapes.
            similarity_matrix.close()

    if hits:
        df = pd.DataFrame(hits, columns=columns)
    else:
        # An empty list of hits carries no type information, so declare the
        # column dtypes explicitly to keep the frame schema stable.
        df = pd.DataFrame({'query_frag_id': pd.Series(dtype=str),
                           'hit_frag_id': pd.Series(dtype=str),
                           'score': pd.Series(dtype=np.double),
                           }, columns=columns)

    if absent_identifiers:
        raise IncompleteHits(absent_identifiers, df)
    return df


def fragments_by_pdb_codes(pdb_codes, fragments_db_filename_or_url, prefix=''):
    """Look up all fragments belonging to the given PDB codes.

    See http://www.rcsb.org/pdb/ for PDB structures.

    Args:
        pdb_codes (List[str]): List of PDB codes
        fragments_db_filename_or_url (str): Filename of fragments db or base url of kripodb webservice
        prefix (str): Prefix for output columns

    Examples:
        Fetch fragments of '2n2k' PDB code

        >>> import pandas as pd
        >>> from kripodb.canned import fragments_by_pdb_codes
        >>> pdb_codes = pd.Series(['2n2k'])
        >>> fragments = fragments_by_pdb_codes(pdb_codes, 'data/fragments.sqlite')
        >>> len(fragments)
        3

        Retrieved from web service instead of local fragments db file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> fragments = fragments_by_pdb_codes(pdb_codes, 'http://localhost:8084/kripo')
        >>> len(fragments)
        3

    Returns:
        pandas.DataFrame: Data frame with fragment information

    Raises:
        IncompleteFragments: When one or more of the identifiers could not be found.
            The exception carries a data frame of the fragments that were found.
    """
    def as_prefixed_frame(records):
        # Build the output frame and put the prefix in front of every column name.
        frame = pd.DataFrame(records)
        frame.rename(columns=lambda name: prefix + name, inplace=True)
        return frame

    if not fragments_db_filename_or_url.startswith('http'):
        # Local fragments database file.
        db = FragmentsDb(fragments_db_filename_or_url)
        found = []
        missing = []
        for code in pdb_codes:
            try:
                found.extend(db.by_pdb_code(code))
            except LookupError:
                missing.append(code)
        if missing:
            raise IncompleteFragments(missing, as_prefixed_frame(found))
        return as_prefixed_frame(found)

    # Remote kripodb web service.
    client = WebserviceClient(fragments_db_filename_or_url)
    try:
        found = client.fragments_by_pdb_codes(pdb_codes)
    except IncompleteFragments as partial:
        # Re-raise with the partial result converted to a prefixed data frame.
        partial_frame = as_prefixed_frame(partial.fragments)
        raise IncompleteFragments(partial.absent_identifiers, partial_frame)
    return as_prefixed_frame(found)


def fragments_by_id(fragment_ids, fragments_db_filename_or_url, prefix=''):
    """Retrieve fragments based on fragment identifier.

    Args:
        fragment_ids (List[str]): List of fragment identifiers
        fragments_db_filename_or_url (str): Filename of fragments db or base url of kripodb webservice
        prefix (str): Prefix for output columns

    Examples:
        Fetch fragments of '2n2k_MTN_frag1' fragment identifier

        >>> import pandas as pd
        >>> from kripodb.canned import fragments_by_id
        >>> fragment_ids = pd.Series(['2n2k_MTN_frag1'])
        >>> fragments = fragments_by_id(fragment_ids, 'data/fragments.sqlite')
        >>> len(fragments)
        1

        Retrieved from web service instead of local fragments db file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> fragments = fragments_by_id(fragment_ids, 'http://localhost:8084/kripo')
        >>> len(fragments)
        1

    Returns:
        pandas.DataFrame: Data frame with fragment information

    Raises:
        IncompleteFragments: When one or more of the identifiers could not be found.
            The exception carries a data frame of the fragments that were found.
    """
    def to_frame(records):
        # Single place that builds the output frame and applies the column prefix.
        frame = pd.DataFrame(records)
        frame.rename(columns=lambda name: prefix + name, inplace=True)
        return frame

    if fragments_db_filename_or_url.startswith('http'):
        client = WebserviceClient(fragments_db_filename_or_url)
        try:
            fragments = client.fragments_by_id(fragment_ids)
        except IncompleteFragments as e:
            # Re-raise with the partial result converted to a prefixed data frame.
            raise IncompleteFragments(e.absent_identifiers, to_frame(e.fragments))
        return to_frame(fragments)

    fragmentsdb = FragmentsDb(fragments_db_filename_or_url)
    fragments = []
    absent_identifiers = []
    for frag_id in fragment_ids:
        try:
            fragments.append(fragmentsdb[frag_id])
        except KeyError:
            absent_identifiers.append(frag_id)

    df = to_frame(fragments)
    if absent_identifiers:
        raise IncompleteFragments(absent_identifiers, df)
    return df


def pharmacophores_by_id(fragment_ids, pharmacophores_db_filename_or_url):
    """Fetch pharmacophore points by fragment identifiers

    Args:
        fragment_ids (pd.Series): List of fragment identifiers
        pharmacophores_db_filename_or_url: Filename of pharmacophores db or base url of kripodb webservice

    Returns:
        pandas.Series: Pandas series with pharmacophores as string in phar format.
                       Fragment without pharmacophore will return None

    Raises:
        IncompletePharmacophores: When one or more of the identifiers could not be found.
            The exception carries the series of pharmacophores built so far.

    Examples:
        Pharmacophore of '2n2k_MTN_frag1' fragment.

        >>> import pandas as pd
        >>> from kripodb.canned import pharmacophores_by_id
        >>> fragment_ids = pd.Series(['2n2k_MTN_frag1'], ['Row0'])
        >>> pharmacophores = pharmacophores_by_id(fragment_ids, 'data/pharmacophores.h5')
        >>> len(pharmacophores)
        1

        Retrieved from web service instead of local pharmacophores db file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> pharmacophores = pharmacophores_by_id(fragment_ids, 'http://localhost:8084/kripo')
        >>> len(pharmacophores)
        1
    """
    if pharmacophores_db_filename_or_url.startswith('http'):
        client = WebserviceClient(pharmacophores_db_filename_or_url)
        try:
            pphorsarray = client.pharmacophores(fragment_ids)
        except IncompletePharmacophores as e:
            # Re-raise with the partial result aligned to the input index.
            pphors = pd.Series(e.pharmacophores, fragment_ids.index, dtype=str)
            raise IncompletePharmacophores(e.absent_identifiers, pphors)
        return pd.Series(pphorsarray, fragment_ids.index, dtype=str)

    pphors = pd.Series([], dtype=str)
    with PharmacophoresDb(pharmacophores_db_filename_or_url) as pharmacophoresdb:
        absent_identifiers = []
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0; items() is also available in older pandas releases.
        for row_id, frag_id in fragment_ids.items():
            try:
                pphors[row_id] = as_phar(frag_id, pharmacophoresdb[frag_id])
            except KeyError:
                # Keep a placeholder so the row of the missing fragment stays in the output.
                pphors[row_id] = None
                absent_identifiers.append(frag_id)
        if absent_identifiers:
            raise IncompletePharmacophores(absent_identifiers, pphors)
    return pphors