Source code for kripodb.script.similarities

import argparse
import csv

from tables import parameters
from .. import pairs
from ..db import FragmentsDb
from ..frozen import FrozenSimilarityMatrix
from ..hdf5 import SimilarityMatrix


def make_similarities_parser(subparsers):
    """Creates a parser for similarities sub commands

    Args:
        subparsers (argparse.ArgumentParser): Parser to which to add sub commands
    """
    sc = subparsers.add_parser('similarities', help='Similarity matrix').add_subparsers()
    similar_sc(sc)
    merge_pairs_sc(sc)
    simmatrix_export_sc(sc)
    simmatrix_import_sc(sc)
    simmatrix_filter_sc(sc)
    similarity_freeze_sc(sc)
    similarity_thaw_sc(sc)
    fpneigh2tsv_sc(sc)
    histogram_sc(sc)
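# --- Illustrative sketch (not part of the original module) -----------------
# A minimal example of how make_similarities_parser() could be attached to a
# top-level argparse parser. The parser description and the dispatch pattern
# shown in the comment are assumptions for illustration; the real kripodb
# entry point lives elsewhere in the package.
def _example_build_cli():
    parser = argparse.ArgumentParser(description='Hypothetical KripoDB-style CLI')
    subparsers = parser.add_subparsers()
    make_similarities_parser(subparsers)
    # Each sub command registers a `func` default, so a typical dispatch would be:
    #     args = parser.parse_args()
    #     fargs = vars(args)
    #     func = fargs.pop('func')
    #     func(**fargs)
    return parser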
def similar_sc(subparsers):
    sc_help = 'Find the fragments closest to query based on similarity matrix'
    sc = subparsers.add_parser('similar', help=sc_help)
    sc.add_argument('pairsdbfn', type=str,
                    help='hdf5 similarity matrix file or base url of kripodb webservice')
    sc.add_argument('query', type=str, help='Query fragment identifier')
    sc.add_argument('--out', type=argparse.FileType('w'), default='-',
                    help='Output file tab delimited (query, hit, similarity score)')
    sc.add_argument('--cutoff', type=float, default=0.55,
                    help='Similarity cutoff (default: %(default)s)')
    sc.set_defaults(func=pairs.similar_run)


def merge_pairs_sc(subparsers):
    sc = subparsers.add_parser('merge', help='Combine pairs files into a new file')
    sc.add_argument('ins', help='Input pair file in hdf5_compact format', nargs='+')
    sc.add_argument('out', help='Output pair file in hdf5_compact format')
    sc.set_defaults(func=pairs.merge)


def simmatrix_export_sc(subparsers):
    sc = subparsers.add_parser('export', help='Export similarity matrix to tab delimited file')
    sc.add_argument('simmatrixfn', type=str, help='Compact hdf5 similarity matrix filename')
    sc.add_argument('outputfile', type=argparse.FileType('w'),
                    help='Tab delimited output file, use - for stdout')
    sc.add_argument('--no_header', action='store_true',
                    help='Output no header (default: %(default)s)')
    sc.add_argument('--frag1', action='store_true',
                    help='Only output *frag1 fragments (default: %(default)s)')
    pdbhelp = 'Only output fragments which are from pdb code in file, one pdb code per line (default: %(default)s)'
    sc.add_argument('--pdb', type=argparse.FileType('r'), help=pdbhelp)
    sc.set_defaults(func=simmatrix_export_run)


def load_pdb_filter_file(pdbs_file):
    pdbs = set()
    for line in pdbs_file:
        pdbs.add(line.strip().lower())
    return pdbs


def pdb_filter(rows, pdbs):
    for row in rows:
        if row[0][:4] in pdbs and row[1][:4] in pdbs:
            yield row


def frag1_filter(rows):
    for row in rows:
        if row[0].endswith('frag1') and row[1].endswith('frag1'):
            yield row
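# --- Illustrative sketch (not part of the original module) -----------------
# The row filters above operate on plain (frag_id1, frag_id2, score) tuples,
# so they can be exercised without an hdf5 matrix. The fragment identifiers
# and PDB codes below are fabricated for illustration only.
def _example_filters():
    rows = [
        ('1abc_LIG_frag1', '2xyz_LIG_frag1', 0.87),
        ('1abc_LIG_frag2', '3def_LIG_frag1', 0.61),
    ]
    pdbs = {'1abc', '2xyz'}
    only_frag1 = list(frag1_filter(rows))     # keeps only the pair where both ids end in 'frag1'
    only_pdbs = list(pdb_filter(rows, pdbs))  # keeps only the pair whose pdb codes are both in `pdbs`
    return only_frag1, only_pdbs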
def simmatrix_export_run(simmatrixfn, outputfile, no_header, frag1, pdb):
    """Export similarity matrix to tab delimited file

    Args:
        simmatrixfn (str): (Compact) hdf5 similarity matrix filename
        outputfile (file): Tab delimited output file
        no_header (bool): Output no header
        frag1 (bool): Only output \*frag1
        pdb (str): Filename with pdb codes inside
    """
    simmatrix = pairs.open_similarity_matrix(simmatrixfn)
    if pdb:
        pdbs = load_pdb_filter_file(pdb)
    else:
        pdbs = None
    writer = csv.writer(outputfile, delimiter="\t", lineterminator='\n')
    with_header = not no_header
    if with_header:
        writer.writerow(['frag_id1', 'frag_id2', 'score'])
    if frag1 and pdb:
        writer.writerows(pdb_filter(frag1_filter(simmatrix), pdbs))
    elif frag1:
        writer.writerows(frag1_filter(simmatrix))
    elif pdb:
        writer.writerows(pdb_filter(simmatrix, pdbs))
    else:
        writer.writerows(simmatrix)
    simmatrix.close()
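# --- Illustrative sketch (not part of the original module) -----------------
# simmatrix_export_run() only needs a writable file-like object, so the export
# can be captured in memory. The matrix filename 'similarities.h5' is a made-up
# placeholder; the resulting text is header plus frag_id1<TAB>frag_id2<TAB>score rows.
def _example_export_to_string():
    import io
    buf = io.StringIO()
    simmatrix_export_run('similarities.h5', buf, no_header=False, frag1=True, pdb=None)
    return buf.getvalue()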
def simmatrix_import_sc(subparsers):
    sc = subparsers.add_parser('import',
                               help='Import similarity matrix from tab delimited file',
                               description='''When input has been split into chunks,
                               use `--ignore_upper_triangle` flag for similarities between same chunk.
                               This prevents storing pair a->b also as b->a.''')
    sc.add_argument('inputfile', type=argparse.FileType('r'),
                    help='Input file, use - for stdin')
    sc.add_argument('fragmentsdb', default='fragments.db',
                    help='Name of fragments db file (default: %(default)s)')
    sc.add_argument('simmatrixfn', type=str,
                    help='Compact hdf5 similarity matrix file, will overwrite file if it exists')
    sc.add_argument('--inputformat', choices=['tsv', 'fpneigh'], default='fpneigh',
                    help='tab delimited (tsv) or fpneigh formatted input (default: %(default)s)')
    # Have to ask, because inputfile can be stdin so can't do 2 passes through file
    sc.add_argument('--nrrows', type=int, default=2**16,
                    help='Number of rows in inputfile (default: %(default)s)')
    sc.add_argument('--ignore_upper_triangle', action='store_true',
                    help='Ignore upper triangle (default: %(default)s)')
    sc.set_defaults(func=simmatrix_import_run)


def simmatrix_import_run(inputfile, fragmentsdb, simmatrixfn, inputformat, nrrows, ignore_upper_triangle=False):
    if inputformat == 'tsv':
        simmatrix_import_tsv(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle)
    elif inputformat == 'fpneigh':
        simmatrix_importfpneigh_run(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle)


def simmatrix_import_tsv(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle=False):
    frags = FragmentsDb(fragmentsdb)
    label2id = frags.label2id().materialize()
    simmatrix = SimilarityMatrix(simmatrixfn, 'w',
                                 expectedlabelrows=len(label2id),
                                 expectedpairrows=nrrows)

    reader = csv.reader(inputfile, delimiter="\t")
    # ignore header
    next(reader)

    # simmatrix wants score as float instead of str
    def csv_iter(rows):
        for row in rows:
            if row[0] == row[1]:
                continue
            if ignore_upper_triangle and row[0] > row[1]:
                continue
            row[2] = float(row[2])
            yield row

    simmatrix.update(csv_iter(reader), label2id)
    simmatrix.close()


def simmatrix_importfpneigh_run(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle=False):
    frags = FragmentsDb(fragmentsdb)
    label2id = frags.label2id().materialize()
    simmatrix = SimilarityMatrix(simmatrixfn, 'w',
                                 expectedlabelrows=len(label2id),
                                 expectedpairrows=nrrows)
    simmatrix.update(read_fpneighpairs_file(inputfile, ignore_upper_triangle), label2id)
    simmatrix.close()


def simmatrix_filter_sc(subparsers):
    sc = subparsers.add_parser('filter', help='Filter similarity matrix')
    sc.add_argument('input', type=str, help='Input hdf5 similarity matrix file')
    sc.add_argument('output', type=str,
                    help='Output hdf5 similarity matrix file, will overwrite file if it exists')
    group = sc.add_mutually_exclusive_group()
    group.add_argument('--fragmentsdb',
                       help='Name of fragments db file, '
                            'fragments in it will be kept as well as their pair counterparts.')
    group.add_argument('--skip', type=argparse.FileType('r'),
                       help='File with fragment identifiers on each line to skip')
    sc.set_defaults(func=simmatrix_filter)


def simmatrix_filter(input, output, fragmentsdb, skip):
    simmatrix_in = SimilarityMatrix(input)
    if fragmentsdb:
        frags = FragmentsDb(fragmentsdb)
        expectedlabelrows = len(frags)
        labelsin = len(simmatrix_in.labels)
        expectedpairrows = int(len(simmatrix_in.pairs) * (float(expectedlabelrows) / labelsin))

        simmatrix_out = SimilarityMatrix(output, 'w',
                                         expectedlabelrows=expectedlabelrows,
                                         expectedpairrows=expectedpairrows,
                                         )

        frag_labels2keep = set(frags.id2label().values())
        simmatrix_in.keep(simmatrix_out, frag_labels2keep)
    if skip:
        labels2skip = set()
        for line in skip:
            labels2skip.add(line.strip())

        labelsin = len(simmatrix_in.labels)
        expectedlabelrows = labelsin - len(labels2skip)
        expectedpairrows = int(len(simmatrix_in.pairs) * (float(expectedlabelrows) / labelsin))

        simmatrix_out = SimilarityMatrix(output, 'w',
                                         expectedlabelrows=expectedlabelrows,
                                         expectedpairrows=expectedpairrows,
                                         )
        simmatrix_in.skip(simmatrix_out, labels2skip)
    simmatrix_in.close()
    simmatrix_out.close()


def similarity_freeze_sc(subparsers):
    sc = subparsers.add_parser('freeze', help='Optimize similarity matrix for reading')
    sc.add_argument('in_fn', type=str, help='Input pairs file')
    sc.add_argument('out_fn', type=str, help='Output array file, file is overwritten')
    sc.add_argument('-f', '--frame_size', type=int, default=10**8,
                    help='Size of frame (default: %(default)s)')
    sc.add_argument('-m', '--memory', type=int, default=1,
                    help='Memory cache in Gigabytes (default: %(default)s)')
    sc.add_argument('-l', '--limit', type=int,
                    help='Number of pairs to copy, None for no limit (default: %(default)s)')
    sc.add_argument('-s', '--single_sided', action='store_true',
                    help='Store half matrix (default: %(default)s)')
    sc.set_defaults(func=similarity_freeze_run)


def similarity_freeze_run(in_fn, out_fn, frame_size, memory, limit, single_sided):
    dm = SimilarityMatrix(in_fn, 'r')
    parameters.CHUNK_CACHE_SIZE = memory * 1024 ** 3
    parameters.CHUNK_CACHE_NELMTS = 2 ** 14
    dfm = FrozenSimilarityMatrix(out_fn, 'w')
    dfm.from_pairs(dm, frame_size, limit, single_sided)
    dm.close()
    dfm.close()


def similarity_thaw_sc(subparsers):
    sc = subparsers.add_parser('thaw', help='Optimize similarity matrix for writing')
    sc.add_argument('in_fn', type=str, help='Input packed frozen matrix file')
    sc.add_argument('out_fn', type=str, help='Output pairs file, file is overwritten')
    sc.add_argument('--nonzero_fraction', type=float, default=0.012,
                    help='Fraction of pairs which have score above threshold (default: %(default)s)')
    sc.set_defaults(func=similarity_thaw_run)


def similarity_thaw_run(in_fn, out_fn, nonzero_fraction):
    fsm = FrozenSimilarityMatrix(in_fn, 'r')
    nr_scores = int(fsm.scores.shape[0] * fsm.scores.shape[1] * nonzero_fraction)
    nr_labels = fsm.labels.shape[0]
    sm = SimilarityMatrix(out_fn, 'w', expectedpairrows=nr_scores, expectedlabelrows=nr_labels)
    fsm.to_pairs(sm)
    sm.close()
    fsm.close()
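# --- Illustrative sketch (not part of the original module) -----------------
# A hypothetical round trip between the pairs layout and the frozen layout,
# calling the run functions above directly instead of going through the CLI.
# The filenames are made-up placeholders.
def _example_freeze_then_thaw():
    # Pack a pairs matrix into the read-optimised frozen layout
    # (1 GB chunk cache, full matrix, no pair limit).
    similarity_freeze_run('similarities.h5', 'similarities_frozen.h5',
                          frame_size=10**8, memory=1, limit=None, single_sided=False)
    # Unpack it again into the write-optimised pairs layout.
    similarity_thaw_run('similarities_frozen.h5', 'similarities_thawed.h5',
                        nonzero_fraction=0.012)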
def read_fpneighpairs_file(inputfile, ignore_upper_triangle=False):
    """Read fpneigh formatted similarity matrix file.

    Args:
        inputfile (File): File object to read
        ignore_upper_triangle (bool): Ignore upper triangle of input

    Yields:
        Tuple((Str,Str,Float)): Tuple of (query fragment identifier, hit fragment identifier, similarity score)
    """
    current_query = None
    reader = csv.reader(inputfile, delimiter=' ', skipinitialspace=True)
    for row in reader:
        if len(row) == 2 and current_query != row[0]:
            if ignore_upper_triangle and current_query > row[0]:
                continue
            yield (current_query, row[0], float(row[1]))
        elif len(row) == 4:
            current_query = row[3][:-1]
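# --- Illustrative sketch (not part of the original module) -----------------
# The parser above only requires that a header row has four space separated
# tokens, the last being the query identifier followed by ':', and that data
# rows have two tokens (hit identifier, score). The input below is fabricated
# to match that shape and is fed in through io.StringIO.
def _example_read_fpneigh():
    import io
    text = (
        'Compounds similar to 1abc_LIG_frag1:\n'
        '1abc_LIG_frag1   1.0000\n'
        '2xyz_LIG_frag2   0.5877\n'
    )
    # Yields [('1abc_LIG_frag1', '2xyz_LIG_frag2', 0.5877)];
    # the self pair is skipped because current_query == row[0].
    return list(read_fpneighpairs_file(io.StringIO(text)))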
def fpneigh2tsv_sc(subparsers):
    sc = subparsers.add_parser('fpneigh2tsv', help='Convert fpneigh formatted file to tab delimited file')
    sc.add_argument('inputfile', type=argparse.FileType('r'),
                    help='Input file, use - for stdin')
    sc.add_argument('outputfile', type=argparse.FileType('w'),
                    help='Tab delimited output file, use - for stdout')
    sc.set_defaults(func=fpneigh2tsv_run)


def fpneigh2tsv_run(inputfile, outputfile):
    reader = read_fpneighpairs_file(inputfile)
    writer = csv.writer(outputfile, delimiter="\t", lineterminator='\n')
    writer.writerow(['frag_id1', 'frag_id2', 'score'])
    writer.writerows(reader)


def histogram_sc(subparsers):
    sc = subparsers.add_parser('histogram', help='Distribution of similarity scores')
    sc.add_argument('inputfile', type=str,
                    help='Filename of similarity matrix hdf5 file')
    sc.add_argument('outputfile', type=argparse.FileType('w'),
                    help='Tab delimited output file, use - for stdout')
    sc.add_argument('-f', '--frame_size', type=int, default=10**8,
                    help='Size of frame (default: %(default)s)')
    sc.add_argument('-r', '--raw_score', action='store_true',
                    help='Return raw score (16 bit integer) instead of fraction score')
    sc.add_argument('-l', '--lower_triangle', action='store_true',
                    help='Return scores from lower triangle else return scores from upper triangle')
    sc.set_defaults(func=histogram)


def histogram(inputfile, outputfile, frame_size, raw_score, lower_triangle):
    matrix = pairs.open_similarity_matrix(inputfile)
    counts = matrix.count(frame_size=frame_size, raw_score=raw_score, lower_triangle=lower_triangle)
    writer = csv.writer(outputfile, delimiter="\t", lineterminator='\n')
    writer.writerow(['score', 'count'])
    writer.writerows(counts)
    matrix.close()
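# --- Illustrative sketch (not part of the original module) -----------------
# fpneigh2tsv_run() only needs file-like objects, so the conversion can be
# demonstrated entirely in memory. The fpneigh text is the same fabricated
# sample used in the read_fpneighpairs_file example above.
def _example_fpneigh2tsv():
    import io
    fpneigh = io.StringIO(
        'Compounds similar to 1abc_LIG_frag1:\n'
        '2xyz_LIG_frag2   0.5877\n'
    )
    tsv = io.StringIO()
    fpneigh2tsv_run(fpneigh, tsv)
    # tsv.getvalue() now holds a header line 'frag_id1\tfrag_id2\tscore'
    # followed by '1abc_LIG_frag1\t2xyz_LIG_frag2\t0.5877'.
    return tsv.getvalue()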