import argparse
import gzip
import sys
import tarfile
from .. import pairs, makebits
from ..db import FragmentsDb, FingerprintsDb
from ..modifiedtanimoto import calc_mean_onbit_density
def make_fingerprints_parser(subparsers):
    """Creates a parser for fingerprints sub commands

    Adds a ``fingerprints`` command to ``subparsers`` and registers every
    fingerprints sub command (import, export, meanbitdensity, similar,
    similarities and merge) below it.

    Args:
        subparsers: Sub-parsers object (anything offering ``add_parser``, such
            as the result of ``argparse.ArgumentParser.add_subparsers``) to
            which the ``fingerprints`` command is added
    """
    fp_sc = subparsers.add_parser('fingerprints', help='Fingerprints').add_subparsers()
    # Registration order determines the order in which argparse lists the
    # sub commands in the generated help.
    makebits2fingerprintsdb_sc(fp_sc)
    fingerprintsdb2makebits_sc(fp_sc)
    meanbitdensity_sc(fp_sc)
    similarity2query_sc(fp_sc)
    pairs_sc(fp_sc)
    merge_fingerprintsdb_sc(fp_sc)
def pairs_sc(subparsers):
    """Register the ``similarities`` sub command.

    The sub command takes two fingerprints db files and an output file and
    dispatches to ``pairs_run``.

    Args:
        subparsers: Sub-parsers object to which the command is added
    """
    help_text = 'Calculate modified tanimoto similarity between fingerprints'
    # Built from pieces so the text stays readable inside the function body;
    # the resulting string starts and ends with a newline.
    description = (
        '\n'
        'Output formats:\n'
        '* tsv, tab separated id1,id2, similarity\n'
        '* hdf5, hdf5 file constructed with pytables with a, b and score, but but a and b have been replaced\n'
        'by numbers and similarity has been converted to scaled int\n'
        'When input has been split into chunks,\n'
        'use `--ignore_upper_triangle` flag for computing similarities between same chunk.\n'
        'This prevents storing pair a->b also as b->a.\n'
    )
    parser = subparsers.add_parser('similarities', description=description, help=help_text)

    # Positional arguments
    parser.add_argument('fingerprintsfn1', help='Name of reference fingerprints db file')
    parser.add_argument('fingerprintsfn2', help='Name of query fingerprints db file')
    parser.add_argument('out_file', help='Name of output file (use - for stdout)')

    # Options
    parser.add_argument(
        '--out_format',
        default='hdf5',
        choices=['tsv', 'hdf5'],
        help='Format of output (default: %(default)s)',
    )
    parser.add_argument(
        '--fragmentsdbfn',
        help='Name of fragments db file (only required for hdf5 format)',
    )
    parser.add_argument(
        '--mean_onbit_density',
        default=0.01,
        type=float,
        help='Mean on bit density (default: %(default)s)',
    )
    parser.add_argument(
        '--cutoff',
        default=0.45,
        type=float,
        help='Set Tanimoto cutoff (default: %(default)s)',
    )
    parser.add_argument(
        '--nomemory',
        action='store_true',
        help='Do not store query fingerprints in memory (default: %(default)s)',
    )
    parser.add_argument(
        '--ignore_upper_triangle',
        action='store_true',
        help='Ignore upper triangle (default: %(default)s)',
    )

    parser.set_defaults(func=pairs_run)
def pairs_run(fingerprintsfn1, fingerprintsfn2,
              out_format, out_file,
              mean_onbit_density,
              cutoff,
              fragmentsdbfn,
              nomemory,
              ignore_upper_triangle):
    """Compute similarities between two fingerprints dbs and dump them.

    Args:
        fingerprintsfn1 (str): Name of reference fingerprints db file
        fingerprintsfn2 (str): Name of query fingerprints db file
        out_format (str): Output format, 'tsv' or 'hdf5'
        out_file (str): Name of output file, '-' for stdout
        mean_onbit_density (float): Mean on bit density
        cutoff (float): Similarity cutoff passed on to ``pairs.dump_pairs``
        fragmentsdbfn (str): Name of fragments db file, required for hdf5
        nomemory (bool): Passed on to ``pairs.dump_pairs``
        ignore_upper_triangle (bool): Passed on to ``pairs.dump_pairs``;
            forced to True when both fingerprints files are the same

    Raises:
        Exception: When hdf5 output is requested without a fragments db, or
            when both fingerprints dbs have a different number of bits
    """
    if 'hdf5' in out_format and fragmentsdbfn is None:
        raise Exception('Hdf5 format requires fragments db')

    label2id = {}
    if fragmentsdbfn is not None:
        label2id = FragmentsDb(fragmentsdbfn).label2id().materialize()

    bitsets1 = FingerprintsDb(fingerprintsfn1).as_dict()
    if fingerprintsfn1 == fingerprintsfn2:
        # Same input on both sides: reuse the bitsets and skip mirrored pairs.
        bitsets2 = bitsets1
        ignore_upper_triangle = True
    else:
        bitsets2 = FingerprintsDb(fingerprintsfn2).as_dict()

    if bitsets1.number_of_bits != bitsets2.number_of_bits:
        raise Exception('Number of bits is not the same')

    out = sys.stdout
    opened_here = False
    if out_file != '-' and out_format.startswith('tsv'):
        if out_file.endswith('gz'):
            # NOTE(review): on Python 3 gzip.open(..., 'w') is a binary
            # stream; confirm dump_pairs writes bytes for tsv or switch to
            # a text mode.
            out = gzip.open(out_file, 'w')
        else:
            out = open(out_file, 'w')
        opened_here = True

    try:
        pairs.dump_pairs(bitsets1,
                         bitsets2,
                         out_format,
                         out_file,
                         out,
                         bitsets1.number_of_bits,
                         mean_onbit_density,
                         cutoff,
                         label2id,
                         nomemory,
                         ignore_upper_triangle)
    finally:
        # Close only what was opened here so the data is flushed to disk;
        # sys.stdout must stay open for the rest of the program.
        if opened_here:
            out.close()
def makebits2fingerprintsdb_sc(subparsers):
    """Register the ``import`` sub command.

    The sub command takes one or more makebits files plus the fingerprints db
    file name and dispatches to ``makebits2fingerprintsdb``.

    Args:
        subparsers: Sub-parsers object to which the command is added
    """
    parser = subparsers.add_parser('import', help='Add Makebits file to fingerprints db')
    parser.add_argument(
        'infiles',
        metavar='infile',
        nargs='+',
        type=argparse.FileType('r'),
        help='Name of makebits formatted fingerprint file (.tar.gz or not packed or - for stdin)',
    )
    # NOTE(review): 'outfile' is a required positional, so argparse never
    # falls back to this default.
    parser.add_argument(
        'outfile',
        default='fingerprints.db',
        help='Name of fingerprints db file',
    )
    parser.set_defaults(func=makebits2fingerprintsdb)
def makebits2fingerprintsdb_single(infile, bitsets):
    """Load one makebits file into ``bitsets``.

    The first item produced by ``makebits.iter_file`` is treated as the header
    from which the fingerprint size is read; all remaining items are handed to
    ``bitsets.update``.

    Args:
        infile: File object with makebits formatted fingerprints
        bitsets: Fingerprints container; its ``number_of_bits`` is overwritten
    """
    records = makebits.iter_file(infile)
    # Consume the header first so only fingerprint records are left in the
    # iterator passed to update().
    bitsets.number_of_bits = makebits.read_fp_size(next(records))
    bitsets.update(records)
def makebits2fingerprintsdb(infiles, outfile):
    """Add makebits files to a fingerprints db.

    Inputs whose name ends with ``tar.gz`` are treated as tar archives and
    every regular file inside is imported; any other input is imported
    directly.

    Args:
        infiles: Opened file objects (each needs a ``name`` attribute)
        outfile (str): Name of fingerprints db file
    """
    bitsets = FingerprintsDb(outfile).as_dict()
    for infile in infiles:
        if not infile.name.endswith('tar.gz'):
            makebits2fingerprintsdb_single(infile, bitsets)
            continue

        # NOTE(review): tarfile reads bytes from fileobj; confirm the caller
        # opens archives in binary mode when running on Python 3.
        with tarfile.open(fileobj=infile) as tar:
            for tarinfo in tar:
                if not tarinfo.isfile():
                    continue
                member = tar.extractfile(tarinfo)
                try:
                    makebits2fingerprintsdb_single(member, bitsets)
                finally:
                    # Release the member even when the import fails.
                    member.close()
def fingerprintsdb2makebits_sc(subparsers):
    """Register the ``export`` sub command.

    The sub command takes a fingerprints db file name and an output file and
    dispatches to ``fingerprintsdb2makebits``.

    Args:
        subparsers: Sub-parsers object to which the command is added
    """
    parser = subparsers.add_parser(
        'export',
        help='Dump bitsets in fingerprints db to makebits file',
    )
    # NOTE(review): 'infile' is a required positional, so argparse never
    # falls back to this default.
    parser.add_argument('infile', help='Name of fingerprints db file', default='fingerprints.db')
    parser.add_argument(
        'outfile',
        help='Name of makebits formatted fingerprint file (or - for stdout)',
        type=argparse.FileType('w'),
    )
    parser.set_defaults(func=fingerprintsdb2makebits)
def fingerprintsdb2makebits(infile, outfile):
    """Dump all bitsets of a fingerprints db as a makebits file.

    Args:
        infile (str): Name of fingerprints db file
        outfile: Writable file object receiving the makebits output
    """
    fingerprints = FingerprintsDb(infile).as_dict()
    fp_size = fingerprints.number_of_bits
    makebits.write_file(fp_size, fingerprints, outfile)
def similarity2query_sc(subparsers):
    """Register the ``similar`` sub command.

    The sub command takes a fingerprints db file name, a query identifier and
    an output file and dispatches to ``similarity2query_run``.

    Args:
        subparsers: Sub-parsers object to which the command is added
    """
    sc_help = 'Find the fragments closests to query based on fingerprints'
    sc = subparsers.add_parser('similar', help=sc_help)
    sc.add_argument('fingerprintsdb',
                    default='fingerprints.db',
                    help='Name of fingerprints db file')
    sc.add_argument('query', type=str, help='Query identifier or beginning of it')
    sc.add_argument('out', type=argparse.FileType('w'), help='Output file tabdelimited (query, hit, score)')
    sc.add_argument('--mean_onbit_density',
                    help='Mean on bit density (default: %(default)s)',
                    type=float,
                    default=0.01)
    sc.add_argument('--cutoff',
                    type=float,
                    default=0.55,
                    help='Set Tanimoto cutoff (default: %(default)s)')
    sc.add_argument('--memory',
                    action='store_true',
                    help='Store bitsets in memory (default: %(default)s)')
    # Dispatch to the local wrapper: it accepts the 'fingerprintsdb' file name
    # parsed above and opens the db, whereas pairs.similarity2query expects
    # already loaded bitsets as its first argument.
    sc.set_defaults(func=similarity2query_run)
def similarity2query_run(fingerprintsdb, query, out, mean_onbit_density, cutoff, memory):
    """Open a fingerprints db and delegate to ``pairs.similarity2query``.

    Args:
        fingerprintsdb (str): Name of fingerprints db file
        query (str): Query identifier or beginning of it
        out: Writable file object for the results
        mean_onbit_density (float): Mean on bit density
        cutoff (float): Similarity cutoff
        memory (bool): Passed through unchanged to ``pairs.similarity2query``
    """
    pairs.similarity2query(
        FingerprintsDb(fingerprintsdb).as_dict(),
        query,
        out,
        mean_onbit_density,
        cutoff,
        memory,
    )
def meanbitdensity_sc(subparsers):
    """Register the ``meanbitdensity`` sub command.

    The sub command takes a fingerprints db file name and an optional output
    file and dispatches to ``meanbitdensity_run``.

    Args:
        subparsers: Sub-parsers object to which the command is added
    """
    parser = subparsers.add_parser(
        'meanbitdensity',
        help='Compute mean bit density of fingerprints',
    )
    parser.add_argument(
        'fingerprintsdb',
        help='Name of fingerprints db file (default: %(default)s)',
        default='fingerprints.db',
    )
    parser.add_argument(
        '--out',
        default='-',
        type=argparse.FileType('w'),
        help='Output file, default is stdout (default: %(default)s)',
    )
    parser.set_defaults(func=meanbitdensity_run)
def meanbitdensity_run(fingerprintsdb, out):
    """Write the mean on bit density of a fingerprints db to ``out``.

    Args:
        fingerprintsdb (str): Name of fingerprints db file
        out: Writable file object; receives a single line with the density
    """
    fingerprints = FingerprintsDb(fingerprintsdb).as_dict()
    line = "{0:.5}\n".format(
        calc_mean_onbit_density(fingerprints.values(), fingerprints.number_of_bits)
    )
    out.write(line)
def merge_fingerprintsdb_sc(subparsers):
    """Register the ``merge`` sub command.

    The sub command takes one or more input db files followed by the output
    db file and dispatches to ``merge_fingerprintsdb``.

    Args:
        subparsers: Sub-parsers object to which the command is added
    """
    parser = subparsers.add_parser(
        'merge',
        help='Combine fingerprints databases into a single new one',
    )
    parser.add_argument('ins', help='Input fingerprints database files', nargs='+')
    parser.add_argument('out', help='Output fingerprints database file')
    parser.set_defaults(func=merge_fingerprintsdb)
def merge_fingerprintsdb(ins, out):
    """Copy the rows of several fingerprints dbs into one output db.

    Each input file is attached to the output database in turn and, for every
    table found in the output database, all rows of the same-named table in
    the input are inserted.

    Args:
        ins (list[str]): Names of input fingerprints database files
        out (str): Name of output fingerprints database file
    """
    with FingerprintsDb(out) as output_db:
        c = output_db.cursor
        # Single quotes make 'table' a standard SQL string literal; the
        # double-quoted form only works through a SQLite legacy quirk.
        c.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in c.fetchall()]
        for input_fn in ins:
            c.execute('ATTACH DATABASE ? AS other', (input_fn,))
            for table in tables:
                # Table names come from the output db's own schema, not from
                # user input, so formatting them into the statement is safe.
                c.execute('INSERT INTO {0} SELECT * FROM other.{0}'.format(table))
            c.execute('DETACH DATABASE other')