Source code for kripodb.dive

import csv
import json
import logging
import math
from os.path import basename

from progressbar import ProgressBar
from rdkit.Chem.Descriptors import HeavyAtomMolWt
import six

from .db import FragmentsDb
from .frozen import FrozenSimilarityMatrix
from .pdb import PdbReport


def dive_sphere(inputfile, outputfile, onlyfrag1):
    r"""Export fragments as DiVE formatted sphere

    Every selected fragment gets a position on the unit sphere and is written
    to ``outputfile`` as part of a single JSON object keyed by fragment
    identifier. Each value holds ``Path``, ``Coordinates`` (x, y, z),
    ``Categories`` (pdb code, het code) and ``Properties``.

    An empty selection writes an empty JSON object.

    Args:
        inputfile (str): fragments db input file
        outputfile (file): fragments dive output file
        onlyfrag1 (bool): Only \*_frag1

    """
    frags_db = FragmentsDb(inputfile)

    sql = 'SELECT frag_id, pdb_code, het_code FROM fragments'
    if onlyfrag1:
        # Single quotes: SQLite only accepts double-quoted string literals as
        # a legacy fallback that can be disabled at compile time.
        only_frag1 = " WHERE frag_id LIKE '%_frag1'"
        sql += only_frag1
        frags_db.cursor.execute('SELECT count(*) FROM fragments' + only_frag1)
        samples = frags_db.cursor.fetchone()[0]
    else:
        samples = len(frags_db)

    nodes = {}
    # Guard against dividing by zero when there is nothing to place
    if samples:
        # distribute fragments evenly on sphere using Fibonacci sphere algorithm
        # from http://stackoverflow.com/questions/9600801/evenly-distributing-n-points-on-a-sphere
        rnd = 1.
        offset = 2. / samples
        increment = math.pi * (3. - math.sqrt(5.))

        for i, frag in enumerate(frags_db.cursor.execute(sql)):
            # y runs from just above -1 to just below 1 in equal steps
            y = ((i * offset) - 1) + (offset / 2)
            # radius of the circle at height y
            r = math.sqrt(1 - pow(y, 2))

            phi = ((i + rnd) % samples) * increment

            x = math.cos(phi) * r
            z = math.sin(phi) * r

            nodes[frag[0]] = {
                'Path': [],
                'Coordinates': [x, y, z],
                'Categories': [frag[1], frag[2]],
                'Properties': []
            }

    json.dump(nodes, outputfile)


def dive_export(fragmentsdb, uniprot_annot, pdbtags, propnames, props):
    """Writes metadata props for DiVE visualization

    Collects the fragments from the database, merges in the uniprot
    annotations, the pdb source organism and, when given, the pdb tags, and
    then writes the property names and the per fragment properties.

    Args:
        fragmentsdb (str): Filename of fragments db file
        uniprot_annot (file): Readable file object with uniprot gene and family mapping as tsv
        pdbtags (list): List of readable file objects to tag pdb by filename,
            or None to skip tagging
        propnames (file): Writable file object to write prop names to
        props (file): Writeable file object to write props to
    """
    db = FragmentsDb(fragmentsdb)
    has_pdbtags = pdbtags is not None

    data = {}
    dive_get_fragments(db, data)
    dive_merge_uniprot(uniprot_annot, data)
    dive_merge_pdb(data)
    # None means no tag files were supplied; iterating over it would fail
    if has_pdbtags:
        dive_merge_pdb_tag(pdbtags, data)

    dump_propnames(propnames, has_pdbtags)
    dump_props(data, props)


def dive_get_fragments(db, data):
    """Fill data with one record per fragment found in the database

    The record of a fragment holds every selected column except the first
    (the fragment identifier, used as key in data) and the last (the
    molecule). When a molecule is present its heavy atom molecular weight is
    stored under ``weight``.

    Args:
        db: Object whose ``cursor.execute`` yields rows which support
            ``keys()`` and lookup by position and by column name
        data (dict): Filled in place, keyed by fragment identifier
    """
    # TODO add organism column to pdb data table
    query = '''SELECT
            frag_id,
            pdb_code as pdb,
            het_code as het,
            frag_nr as fragment,
            pdb_title as title,
            uniprot_acc as uniprot,
            uniprot_name as protein,
            smiles,
            mol
          FROM
            fragments
            JOIN pdbs USING (pdb_code)
            LEFT JOIN molecules USING (frag_id)
            '''
    for fragment in db.cursor.execute(query):
        names = fragment.keys()
        record = {}
        data[fragment[0]] = record
        molecule = fragment[-1]
        if molecule:
            record['weight'] = HeavyAtomMolWt(molecule)
            # TODO add other Lipinski parameters aswell http://www.rdkit.org/Python_Docs/rdkit.Chem.Lipinski-module.html
        # Skip the identifier (first) and the molecule (last)
        record.update((name, fragment[name]) for name in names[1:-1])


def dive_merge_uniprot(uniprot_annot_fn, data):
    """Merge uniprot accession, gene and families into the fragment records

    The annotation file is tab separated with the columns accession, gene,
    families (separated by ``', '``) and pdb codes (separated by ``';'``).
    The first line is treated as a header and skipped. Blank lines are
    ignored and missing trailing columns are treated as empty.

    For every record whose ``pdb`` code is listed, ``uniprot`` is set to the
    mapped accession and ``gene`` and ``families`` are added when known.

    Args:
        uniprot_annot_fn (file): Readable file object with the annotations
        data (dict): Fragment records keyed by fragment identifier, updated in place
    """
    pdb2uniprot_accs = {}
    uniprot_acc2gene = {}
    uniprot_acc2family = {}
    logging.warning('Loading uniprot')
    reader = csv.reader(uniprot_annot_fn, delimiter='\t')
    # Skip the header; the default keeps an empty file from raising StopIteration
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue
        # Pad so rows with omitted trailing columns can still be unpacked
        uniprot_acc, gene, families, pdbs = (row + [''] * 3)[:4]
        if gene:
            uniprot_acc2gene[uniprot_acc] = gene
        if families:
            uniprot_acc2family[uniprot_acc] = families.split(', ')
        for pdb in pdbs.split(';'):
            # A trailing ';' gives an empty code, which is not a pdb entry
            if pdb:
                # Kripo uses lowercase pdb code, while rest of world uses uppercase
                pdb2uniprot_accs[pdb.lower()] = uniprot_acc

    for record in data.values():
        pdb_code = record['pdb']
        if pdb_code not in pdb2uniprot_accs:
            continue
        uniprot_acc = pdb2uniprot_accs[pdb_code]
        record['uniprot'] = uniprot_acc
        if uniprot_acc in uniprot_acc2gene:
            record['gene'] = uniprot_acc2gene[uniprot_acc]
        if uniprot_acc in uniprot_acc2family:
            record['families'] = uniprot_acc2family[uniprot_acc]


def dive_merge_pdb(data):
    """Add the source organism of the pdb entry to the fragment records

    Fetches a pdb report with the ``source`` field and stores the source
    under ``organism`` for every record whose ``pdb`` code has a non-empty
    source in the report.

    Args:
        data (dict): Fragment records keyed by fragment identifier, updated in place
    """
    logging.warning('Loading pdb from internet')
    report = PdbReport(fields=['source'])
    organism_of = {}
    for entry in report.fetch():
        if entry['source']:
            # The report is matched against lowercase pdb codes
            organism_of[entry['structureId'].lower()] = entry['source']
    for fragment in data.values():
        code = fragment['pdb']
        if code in organism_of:
            fragment['organism'] = organism_of[code]


def dive_merge_pdb_tag(pdbtags, data):
    """Tag fragment records with the name of the file listing their pdb code

    Each tag file holds one pdb code per line; the base name of the file is
    the tag. Codes are compared in lowercase. When a code occurs in several
    files the last file wins.

    Args:
        pdbtags (list): List of readable file objects with a ``name``
            attribute, or None when there is nothing to tag
        data (dict): Fragment records keyed by fragment identifier, updated in place
    """
    if pdbtags is None:
        # No tag files supplied, leave the records untouched
        return
    logging.warning('Loading pdb tags')
    tags = {}
    for pdbtagfile in pdbtags:
        tagname = basename(pdbtagfile.name)
        for line in pdbtagfile:
            pdb_code = line.strip().lower()
            # Blank lines do not name a pdb entry
            if pdb_code:
                tags[pdb_code] = tagname
    for record in data.values():
        pdb_code = record['pdb']
        if pdb_code in tags:
            record['pdbtag'] = tags[pdb_code]


def dump_propnames(propnamesfn, has_pdbtag):
    """Write the property names as a JSON list

    The list starts with the fixed names, followed by ``pdbtag`` when
    has_pdbtag is true and ends with ``family0`` up to and including
    ``family4``.

    Args:
        propnamesfn (file): Writable file object to write the names to
        has_pdbtag (bool): Whether to include the ``pdbtag`` name
    """
    names = ['pdb', 'het', 'fragment', 'title', 'smiles']
    names += ['weight', 'uniprot', 'protein', 'organism', 'gene']
    if has_pdbtag:
        names += ['pdbtag']
    # Room for five families per fragment
    names += ['family' + str(nr) for nr in range(5)]
    json.dump(names, propnamesfn)


def dump_props(props, propsfn):
    """Write one line of space separated properties per fragment

    A line starts with the fragment identifier followed by the fields pdb,
    het, fragment, title, smiles, weight, uniprot, protein, organism, gene
    and pdbtag, in that order, and then one field per family. An optional
    field which is absent is written as an empty field so the positions of
    the fields after it do not shift.

    Args:
        props (dict): Fragment records keyed by fragment identifier. Each
            record must have ``pdb``, ``het``, ``fragment`` and ``title``.
        propsfn (file): Writable file object to write the lines to
    """
    for frag_id, v in props.items():
        fields = [
            'pdb:' + v['pdb'],
            'het:' + v['het'],
            'fragment:' + str(v['fragment']),
            '"title:' + v['title'] + '"',
        ]

        smiles = v.get('smiles')
        fields.append('smiles:' + smiles if smiles else '')

        # Membership test instead of truthiness so a weight of zero is kept.
        # The weight is the only field written without a name prefix.
        fields.append('{0:.2f}'.format(v['weight']) if 'weight' in v else '')

        # Optional like the other annotations, so a missing key is not an error
        uniprot = v.get('uniprot')
        # Only the part in front of the first '#' is written
        fields.append('uniprot:' + uniprot.split('#')[0] if uniprot else '')

        protein = v.get('protein')
        fields.append('"protein:' + protein + '"' if protein else '')

        fields.append('"organism:' + v['organism'] + '"' if 'organism' in v else '')
        fields.append('"gene:' + v['gene'] + '"' if 'gene' in v else '')
        fields.append('pdbtag:' + v['pdbtag'] if 'pdbtag' in v else '')

        for idx, fam in enumerate(v.get('families', ())):
            fields.append('"family' + str(idx) + ':' + fam + '"')

        # One write per record instead of four
        propsfn.write(frag_id + ' ' + ' '.join(fields) + '\n')


def dense_dump(inputfile, outputfile, frag1only):
    """Dump dense similarity matrix as tab separated fragment pairs

    Writes a ``frag_id1``, ``frag_id2``, ``score`` header followed by one
    line for each pair yielded by :func:`dense_dump_iter`. The matrix is
    closed afterwards, also when writing fails.

    Args:
        inputfile (str): Filename of dense similarity matrix
        outputfile (file): Writeable file object
        frag1only (bool): Only dump frag1 fragments

    """
    matrix = FrozenSimilarityMatrix(inputfile)
    try:
        writer = csv.writer(outputfile, delimiter='\t', lineterminator='\n')
        writer.writerow(['frag_id1', 'frag_id2', 'score'])
        writer.writerows(dense_dump_iter(matrix, frag1only))
    finally:
        # Release the matrix even if writing raised
        matrix.close()


def dense_dump_iter(matrix, frag1only):
    r"""Iterate over the fragment pairs of a dense matrix

    Rows are visited in the order of the matrix labels. A pair is skipped
    when its column label belongs to a row which has already been visited,
    the current row included, or when its score is falsy (for example zero).

    NOTE(review): this summary used to read "with zeros", yet pairs with a
    zero score are skipped by the code; confirm which of the two is intended.

    Args:
        matrix (FrozenSimilarityMatrix): Dense similarity matrix
        frag1only (bool): True to iterate over \*frag1 only

    Yields:
        (str, str, float): Fragment label pair and score
    """
    completed_frags = set()
    bar = ProgressBar()
    # Labels are decoded once so they can be compared as text below
    labels = [v.decode() for v in matrix.labels]
    for row_label in bar(labels):
        if frag1only and not row_label.endswith('frag1'):
            continue
        # Added before the columns are walked, which also drops the self pair
        completed_frags.add(row_label)
        cols = matrix[row_label]
        for (col_label, score) in cols:
            if frag1only and not col_label.endswith('frag1'):
                continue
            if col_label in completed_frags:
                continue
            if not score:
                continue
            yield (row_label, col_label, score)