Source code for kripodb.pairs

# Copyright 2016 Netherlands eScience Center
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module handling generation and retrieval of similarity of fingerprint pairs"""

from __future__ import absolute_import

import logging

import tables

from kripodb.frozen import FrozenSimilarityMatrix

from .hdf5 import SimilarityMatrix
from .modifiedtanimoto import similarities, corrections
from .webservice.client import WebserviceClient


def dump_pairs(bitsets1, bitsets2, out_format, out_file, out, number_of_bits,
               mean_onbit_density, cutoff, label2id, nomemory,
               ignore_upper_triangle=False):
    """Dump pairs of a bitset collection.

    Pairs are rows with the identifiers of both bitsets and a similarity score.

    Args:
        bitsets1 (Dict{str, pyroaring.BitMap}): First dict of fingerprints with
            fingerprint label as key and pyroaring.BitMap as value
        bitsets2 (Dict{str, pyroaring.BitMap}): Second dict of fingerprints with
            fingerprint label as key and pyroaring.BitMap as value
        out_format (str): 'tsv' or 'hdf5'
        out_file (str): Filename of output file the 'hdf5' format is written to.
        out (File): File object the 'tsv' format is written to.
        number_of_bits (int): Number of bits for all bitsets
        mean_onbit_density (float): Mean on bit density
        cutoff (float): Cutoff, similarity scores below cutoff are discarded.
        label2id (dict): Dict to translate label to id (string to int)
        nomemory (bool): If True, bitsets2 is not loaded into memory
        ignore_upper_triangle (bool): When True, only returns similarities where
            label1 > label2; when False, returns all similarities

    """
    if out_file == '-' and out_format.startswith('hdf5'):
        raise Exception("hdf5 formats can't be outputted to stdout")

    if not nomemory:
        # load whole dict in memory so it can be reused for each bitset1
        # deserialization of bitsets2 is only done one time
        bitsets2 = bitsets2.materialize()

    expectedrows = len(bitsets1) * len(bitsets2) * cutoff * 0.025

    (corr_st, corr_sto) = corrections(mean_onbit_density)

    logging.warning('Generating pairs')

    similarities_iter = similarities(bitsets1, bitsets2,
                                     number_of_bits,
                                     corr_st, corr_sto,
                                     cutoff,
                                     ignore_upper_triangle)

    if out_format == 'tsv':
        dump_pairs_tsv(similarities_iter, out)
    elif out_format == 'hdf5':
        dump_pairs_hdf5(similarities_iter, label2id, expectedrows, out_file)
    else:
        raise LookupError('Invalid output format')


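# A minimal usage sketch (not part of the original module): dump the pairwise
# similarities of two tiny, hand-made fingerprint collections as tab delimited
# text on stdout. The labels, bit positions, number_of_bits, mean_onbit_density
# and cutoff below are made-up illustration values; nomemory=True is passed so
# the plain dicts are used as-is and materialize() is never called on them.
def _example_dump_pairs():
    import sys
    from pyroaring import BitMap
    bitsets = {
        'frag_a': BitMap([1, 5, 9]),
        'frag_b': BitMap([1, 5, 8]),
    }
    dump_pairs(bitsets, bitsets,
               out_format='tsv', out_file='-', out=sys.stdout,
               number_of_bits=16, mean_onbit_density=0.2,
               cutoff=0.3, label2id={}, nomemory=True,
               ignore_upper_triangle=True)

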
def dump_pairs_tsv(similarities_iter, out):
    """Dump pairs in tab delimited file

    Pro:

    * when stored in sqlite can be used outside of Python

    Con:

    * big, unless output is compressed

    Args:
        similarities_iter (Iterator): Iterator with tuples of fingerprint 1 label,
            fingerprint 2 label and similarity as members
        out (File): Writeable file

    """
    for label1, label2, similarity in similarities_iter:
        out.write('{0}\t{1}\t{2:.5}\n'.format(label1, label2, similarity))


def dump_pairs_hdf5(similarities_iter, label2id, expectedrows, out_file):
    """Dump pairs in hdf5 file

    Pro:

    * very small, 10 bytes for each pair + compression

    Con:

    * requires hdf5 library to access

    Args:
        similarities_iter (Iterator): Iterator with tuples of fingerprint 1 label,
            fingerprint 2 label and similarity as members
        label2id (dict): Dict to translate label to id (string to int)
        expectedrows (int): Expected number of pairs, used to size the pairs table
        out_file (str): Filename of hdf5 output file

    """
    matrix = SimilarityMatrix(out_file, 'w',
                              expectedpairrows=expectedrows,
                              expectedlabelrows=len(label2id))
    matrix.update(similarities_iter, label2id)
    matrix.close()


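# A minimal sketch (not part of the original module) of writing a few
# hard-coded pairs to an hdf5 similarity matrix; 'example_pairs.h5', the
# fragment labels and the scores are placeholders chosen for illustration.
def _example_dump_pairs_hdf5():
    label2id = {'frag_a': 1, 'frag_b': 2, 'frag_c': 3}
    pairs = [
        ('frag_a', 'frag_b', 0.87),
        ('frag_a', 'frag_c', 0.55),
    ]
    dump_pairs_hdf5(iter(pairs), label2id,
                    expectedrows=len(pairs),
                    out_file='example_pairs.h5')

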
def similarity2query(bitsets2, query, out, mean_onbit_density, cutoff, memory):
    """Calculate similarity of query against all fingerprints in bitsets2 and write to tab delimited file.

    Args:
        bitsets2 (kripodb.db.IntbitsetDict): Dict of fingerprints with
            fingerprint label as key and fingerprint as value
        query (str): Query identifier or beginning of it
        out (File): File object to write output to
        mean_onbit_density (float): Mean on bit density
        cutoff (float): Cutoff, similarity scores below cutoff are discarded.
        memory (Optional[bool]): When True, bitsets2 is loaded into memory;
            when False, it is not

    """
    number_of_bits = bitsets2.number_of_bits
    if query in bitsets2:
        # exact match
        query_bitset = bitsets2[query]
        bitsets1 = {
            query: query_bitset
        }
    else:
        # all bitsets which have a key that starts with query
        bitsets1 = {k: v for k, v in bitsets2.iteritems_startswith(query)}

    if memory:
        # load whole dict in memory so it can be reused for each bitset1
        # deserialization of bitsets2 is only done one time
        bitsets2 = bitsets2.materialize()

    (corr_st, corr_sto) = corrections(mean_onbit_density)

    similarities_iter = similarities(bitsets1, bitsets2,
                                     number_of_bits,
                                     corr_st, corr_sto,
                                     cutoff,
                                     True)

    sorted_similarities = sorted(similarities_iter, key=lambda row: row[2], reverse=True)

    dump_pairs_tsv(sorted_similarities, out)


def similar_run(query, pairsdbfn, cutoff, out):
    """Find similar fragments to query based on similarity matrix and write to tab delimited file.

    Args:
        query (str): Query fragment identifier
        pairsdbfn (str): Filename of similarity matrix file or url of kripodb webservice
        cutoff (float): Cutoff, similarity scores below cutoff are discarded.
        out (File): File object to write output to

    """
    if pairsdbfn.startswith('http'):
        client = WebserviceClient(pairsdbfn)
        hits = client.similar_fragments(query, cutoff)
        hits = [(h['query_frag_id'], h['hit_frag_id'], h['score']) for h in hits]
        dump_pairs_tsv(hits, out)
    else:
        matrix = open_similarity_matrix(pairsdbfn)
        hits = similar(query, matrix, cutoff)
        dump_pairs_tsv(hits, out)
        matrix.close()


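# A short, hedged sketch of calling similar_run() against a local similarity
# matrix file and writing the hits to stdout. 'similarities.h5' and the
# fragment identifier are placeholders, not files or ids shipped with kripodb;
# passing a http(s) url instead would route the query to the webservice client.
def _example_similar_run():
    import sys
    similar_run('3wtj_CEL_frag1', 'similarities.h5', 0.45, sys.stdout)

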
def open_similarity_matrix(fn):
    """Open read-only similarity matrix file.

    Args:
        fn (str): Filename of similarity matrix

    Returns:
        SimilarityMatrix | FrozenSimilarityMatrix: A read-only similarity matrix object

    """
    # peek in file to detect format
    f = tables.open_file(fn, 'r')
    is_frozen = 'scores' in f.root
    f.close()
    if is_frozen:
        matrix = FrozenSimilarityMatrix(fn)
    else:
        matrix = SimilarityMatrix(fn, cache_labels=True)
    return matrix


def similar(query, similarity_matrix, cutoff, limit=None):
    """Find similar fragments to query based on similarity matrix.

    Args:
        query (str): Query fragment identifier
        similarity_matrix (kripodb.hdf5.SimilarityMatrix): Similarity matrix
        cutoff (float): Cutoff, similarity scores below cutoff are discarded.
        limit (int): Maximum number of hits. Default is None for no limit.

    Yields:
        Tuple[str, str, float]: Query fragment identifier, hit fragment identifier
            and similarity score, sorted on similarity score

    """
    raw_hits = similarity_matrix.find(query, cutoff, limit)
    # add query column
    for hit_id, score in raw_hits:
        yield query, hit_id, score


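# A hedged end-to-end sketch: open a similarity matrix file (frozen vs. regular
# format is auto-detected by open_similarity_matrix) and iterate over the hits
# that similar() yields. 'similarities.h5', the fragment id, cutoff and limit
# are placeholder values for illustration only.
def _example_similar():
    matrix = open_similarity_matrix('similarities.h5')
    try:
        for query_id, hit_id, score in similar('3wtj_CEL_frag1', matrix, cutoff=0.55, limit=10):
            print('{0}\t{1}\t{2:.5}'.format(query_id, hit_id, score))
    finally:
        matrix.close()

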
def total_number_of_pairs(fingerprint_filenames):
    """Count number of pairs in similarity matrix files

    Args:
        fingerprint_filenames (list[str]): List of file names of similarity matrices

    Returns:
        int: Total number of pairs

    """
    sizes = []
    for filename in fingerprint_filenames:
        matrix = SimilarityMatrix(filename)
        pairs = matrix.pairs
        sizes.append(len(pairs))
        matrix.close()
    return sum(sizes)


def merge(ins, out):
    """Concatenate similarity matrix files into a single one.

    Args:
        ins (list[str]): List of input similarity matrix filenames
        out (str): Output similarity matrix filename

    Raises:
        AssertionError: When the number of labels in the input files is not the same

    """
    expectedrows = total_number_of_pairs(ins)
    out_matrix = SimilarityMatrix(out, 'w', expectedpairrows=expectedrows)

    # copy pairs
    for in_filename in ins:
        in_matrix = SimilarityMatrix(in_filename)
        out_matrix.append(in_matrix)
        in_matrix.close()

    out_matrix.close()


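# A small sketch (not part of the original module) combining
# total_number_of_pairs() and merge(): report how many pairs the per-chunk
# matrices hold, then concatenate them into one file. The chunk and output
# filenames are placeholders.
def _example_merge():
    chunks = ['chunk_01.h5', 'chunk_02.h5']
    logging.warning('Merging %d pairs', total_number_of_pairs(chunks))
    merge(chunks, 'merged_similarities.h5')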