Source code for kripodb.canned

# Copyright 2016 Netherlands eScience Center
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module with functions which use pandas DataFrame as input and output.

For using Kripo data files inside KNIME (http://www.knime.org)
"""

from __future__ import absolute_import

import numpy as np
import pandas as pd
from requests import HTTPError

from .db import FragmentsDb
from .pairs import similar, open_similarity_matrix
from .pharmacophores import PharmacophoresDb, as_phar
from .webservice.client import WebserviceClient, IncompleteFragments, IncompletePharmacophores


class IncompleteHits(Exception):
    """Signals that some query fragment identifiers could not be found.

    The hits that were found are carried along in ``hits`` so callers
    can still use the partial result.
    """

    def __init__(self, absent_identifiers, hits):
        """List of hits and list of identifiers for which no information could be found

        Args:
            absent_identifiers (List[str]): List of identifiers for which no information could be found
            hits (pandas.DataFrame): Data frame with query_frag_id, hit_frag_id and score columns
        """
        super(IncompleteHits, self).__init__('Some query fragment identifiers could not be found')
        # Partial result and the identifiers that are missing from it
        self.hits = hits
        self.absent_identifiers = absent_identifiers
def similarities(queries, similarity_matrix_filename_or_url, cutoff, limit=1000):
    """Find similar fragments to queries based on similarity matrix.

    Args:
        queries (List[str]): Query fragment identifiers
        similarity_matrix_filename_or_url (str): Filename of similarity matrix file or base url of kripodb webservice
        cutoff (float): Cutoff, similarity scores below cutoff are discarded.
        limit (int): Maximum number of hits for each query.
            Default is 1000. Use None for no limit.

    Examples:
        Fragments similar to '3j7u_NDP_frag24' fragment.

        >>> import pandas as pd
        >>> from kripodb.canned import similarities
        >>> queries = pd.Series(['3j7u_NDP_frag24'])
        >>> hits = similarities(queries, 'data/similarities.h5', 0.55)
        >>> len(hits)
        11

        Retrieved from web service instead of local similarity matrix file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> hits = similarities(queries, 'http://localhost:8084/kripo', 0.55)
        >>> len(hits)
        11

    Returns:
        pandas.DataFrame: Data frame with query_frag_id, hit_frag_id and score columns

    Raises:
        IncompleteHits: When one or more of the identifiers could not be found.
            The exception carries the hits of the identifiers that were found.
        requests.HTTPError: When the web service responds with an error other than 404 (not found).
    """
    columns = ['query_frag_id', 'hit_frag_id', 'score']
    hits = []
    absent_identifiers = []
    if similarity_matrix_filename_or_url.startswith('http'):
        client = WebserviceClient(similarity_matrix_filename_or_url)
        for query in queries:
            try:
                hits.extend(client.similar_fragments(query, cutoff, limit))
            except HTTPError as e:
                # Only "not found" means the identifier is absent; any other
                # HTTP failure must surface instead of yielding silently incomplete hits.
                if getattr(e.response, 'status_code', None) != 404:
                    raise
                absent_identifiers.append(query)
    else:
        similarity_matrix = open_similarity_matrix(similarity_matrix_filename_or_url)
        try:
            for query in queries:
                try:
                    for query_id, hit_id, score in similar(query, similarity_matrix, cutoff, limit):
                        hits.append({'query_frag_id': query_id,
                                     'hit_frag_id': hit_id,
                                     'score': score,
                                     })
                except KeyError:
                    absent_identifiers.append(query)
        finally:
            # Close the matrix even when an unexpected error escapes the loop
            similarity_matrix.close()

    if not absent_identifiers:
        return pd.DataFrame(hits, columns=columns)

    if hits:
        df = pd.DataFrame(hits, columns=columns)
    else:
        # empty hits array will give dataframe without columns
        df = pd.DataFrame({'query_frag_id': pd.Series(dtype=str),
                           'hit_frag_id': pd.Series(dtype=str),
                           'score': pd.Series(dtype=np.double)
                           }, columns=columns)
    raise IncompleteHits(absent_identifiers, df)
def fragments_by_pdb_codes(pdb_codes, fragments_db_filename_or_url, prefix=''):
    """Retrieve fragments based on PDB codes.

    See http://www.rcsb.org/pdb/ for PDB structures.

    Args:
        pdb_codes (List[str]): List of PDB codes
        fragments_db_filename_or_url (str): Filename of fragments db or base url of kripodb webservice
        prefix (str): Prefix for output columns

    Examples:
        Fetch fragments of '2n2k' PDB code

        >>> import pandas as pd
        >>> from kripodb.canned import fragments_by_pdb_codes
        >>> pdb_codes = pd.Series(['2n2k'])
        >>> fragments = fragments_by_pdb_codes(pdb_codes, 'data/fragments.sqlite')
        >>> len(fragments)
        3

        Retrieved from web service instead of local fragments db file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> fragments = fragments_by_pdb_codes(pdb_codes, 'http://localhost:8084/kripo')
        >>> len(fragments)
        3

    Returns:
        pandas.DataFrame: Data frame with fragment information

    Raises:
        IncompleteFragments: When one or more of the identifiers could not be found.
            The exception carries a data frame of the fragments that were found.
    """
    def to_frame(rows):
        # Build the output frame and put the prefix in front of every column name
        frame = pd.DataFrame(rows)
        frame.rename(columns=lambda name: prefix + name, inplace=True)
        return frame

    if fragments_db_filename_or_url.startswith('http'):
        client = WebserviceClient(fragments_db_filename_or_url)
        try:
            found = client.fragments_by_pdb_codes(pdb_codes)
        except IncompleteFragments as e:
            # Re-raise with the partial result converted to a prefixed data frame
            partial = to_frame(e.fragments)
            raise IncompleteFragments(e.absent_identifiers, partial)
        return to_frame(found)

    fragmentsdb = FragmentsDb(fragments_db_filename_or_url)
    found = []
    missing = []
    for pdb_code in pdb_codes:
        try:
            found.extend(fragmentsdb.by_pdb_code(pdb_code))
        except LookupError:
            missing.append(pdb_code)

    if missing:
        raise IncompleteFragments(missing, to_frame(found))
    return to_frame(found)
def fragments_by_id(fragment_ids, fragments_db_filename_or_url, prefix=''):
    """Retrieve fragments based on fragment identifier.

    Args:
        fragment_ids (List[str]): List of fragment identifiers
        fragments_db_filename_or_url (str): Filename of fragments db or base url of kripodb webservice
        prefix (str): Prefix for output columns

    Examples:
        Fetch fragments of '2n2k_MTN_frag1' fragment identifier

        >>> import pandas as pd
        >>> from kripodb.canned import fragments_by_id
        >>> fragment_ids = pd.Series(['2n2k_MTN_frag1'])
        >>> fragments = fragments_by_id(fragment_ids, 'data/fragments.sqlite')
        >>> len(fragments)
        1

        Retrieved from web service instead of local fragments db file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> fragments = fragments_by_id(fragment_ids, 'http://localhost:8084/kripo')
        >>> len(fragments)
        1

    Returns:
        pandas.DataFrame: Data frame with fragment information

    Raises:
        IncompleteFragments: When one or more of the identifiers could not be found.
            The exception carries a data frame of the fragments that were found.
    """
    def to_frame(rows):
        # Build the output frame and put the prefix in front of every column name
        frame = pd.DataFrame(rows)
        frame.rename(columns=lambda name: prefix + name, inplace=True)
        return frame

    if fragments_db_filename_or_url.startswith('http'):
        client = WebserviceClient(fragments_db_filename_or_url)
        try:
            found = client.fragments_by_id(fragment_ids)
        except IncompleteFragments as e:
            # Re-raise with the partial result converted to a prefixed data frame
            partial = to_frame(e.fragments)
            raise IncompleteFragments(e.absent_identifiers, partial)
        return to_frame(found)

    fragmentsdb = FragmentsDb(fragments_db_filename_or_url)
    found = []
    missing = []
    for frag_id in fragment_ids:
        try:
            fragment = fragmentsdb[frag_id]
        except KeyError:
            missing.append(frag_id)
        else:
            found.append(fragment)

    if missing:
        raise IncompleteFragments(missing, to_frame(found))
    return to_frame(found)
def pharmacophores_by_id(fragment_ids, pharmacophores_db_filename_or_url):
    """Fetch pharmacophore points by fragment identifiers

    Args:
        fragment_ids (pd.Series): List of fragment identifiers
        pharmacophores_db_filename_or_url: Filename of pharmacophores db or base url of kripodb webservice

    Returns:
        pandas.Series: Pandas series with pharmacophores as string in phar format,
            indexed like `fragment_ids`.

    Raises:
        IncompletePharmacophores: When one or more of the identifiers could not be found.
            The exception carries the series of pharmacophores; when reading from a local file
            the entries of absent fragments are set to None.

    Examples:
        Pharmacophore of '2n2k_MTN_frag1' fragment.

        >>> import pandas as pd
        >>> from kripodb.canned import pharmacophores_by_id
        >>> fragment_ids = pd.Series(['2n2k_MTN_frag1'], ['Row0'])
        >>> pharmacophores = pharmacophores_by_id(fragment_ids, 'data/pharmacophores.h5')
        >>> len(pharmacophores)
        1

        Retrieved from web service instead of local pharmacophores db file.
        Make sure the web service is running,
        for example by `kripodb serve data/similarities.h5 data/fragments.sqlite data/pharmacophores.h5`.

        >>> pharmacophores = pharmacophores_by_id(fragment_ids, 'http://localhost:8084/kripo')
        >>> len(pharmacophores)
        1

    """
    pphors = pd.Series([], dtype=str)
    if pharmacophores_db_filename_or_url.startswith('http'):
        client = WebserviceClient(pharmacophores_db_filename_or_url)
        try:
            pphorsarray = client.pharmacophores(fragment_ids)
            pphors = pd.Series(pphorsarray, fragment_ids.index, dtype=str)
        except IncompletePharmacophores as e:
            # Re-raise with the partial result aligned to the index of the input series
            pphors = pd.Series(e.pharmacophores, fragment_ids.index, dtype=str)
            raise IncompletePharmacophores(e.absent_identifiers, pphors)
    else:
        with PharmacophoresDb(pharmacophores_db_filename_or_url) as pharmacophoresdb:
            absent_identifiers = []
            # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
            for row_id, frag_id in fragment_ids.items():
                try:
                    phar = as_phar(frag_id, pharmacophoresdb[frag_id])
                    pphors[row_id] = phar
                except KeyError:
                    # Keep a placeholder so the result stays aligned with the input rows
                    pphors[row_id] = None
                    absent_identifiers.append(frag_id)
            if absent_identifiers:
                raise IncompletePharmacophores(absent_identifiers, pphors)
    return pphors