Source code for kripodb.dive

import csv
import json
import logging
import math
from os.path import basename

from progressbar import ProgressBar
from rdkit.Chem.Descriptors import HeavyAtomMolWt
import six

from .db import FragmentsDb
from .frozen import FrozenSimilarityMatrix
from .pdb import PdbReport


[docs]def dive_sphere(inputfile, outputfile, onlyfrag1): """Export fragments as DiVE formatted sphere Args: inputfile (str): fragments db input file outputfile (file): fragments dive output file onlyfrag1 (bool): Only \*_frag1 """ frags_db = FragmentsDb(inputfile) nodes = {} # distribute fragments evenly on sphere using Fibonacci sphere algorithm # from http://stackoverflow.com/questions/9600801/evenly-distributing-n-points-on-a-sphere samples = len(frags_db) sql = 'SELECT frag_id, pdb_code, het_code FROM fragments' if onlyfrag1: sql += ' WHERE frag_id LIKE "%_frag1"' frags_db.cursor.execute('SELECT count(*) FROM fragments WHERE frag_id LIKE "%_frag1"') samples = frags_db.cursor.fetchone()[0] rnd = 1. offset = 2. / samples increment = math.pi * (3. - math.sqrt(5.)); frag_ids = frags_db.cursor.execute(sql) for i, frag in enumerate(frag_ids): y = ((i * offset) - 1) + (offset / 2); r = math.sqrt(1 - pow(y, 2)) phi = ((i + rnd) % samples) * increment x = math.cos(phi) * r z = math.sin(phi) * r node_info = { 'Path': [], 'Coordinates': [x, y, z], 'Categories': [frag[1], frag[2]], 'Properties': [] } nodes[frag[0]] = node_info json.dump(nodes, outputfile)
[docs]def dive_export(fragmentsdb, uniprot_annot, pdbtags, propnames, props): """Writes metdata props for DiVE visualization Args: fragmentsdb (str): Filename fo fragments db file uniprot_annot (file): Readable file object with uniprot gene and family mapping as tsv pdbtags (list): List of readable file objects to tag pdb by filename propnames (file): Writable file object to write prop names to props (file): Writeable file object to write props to """ db = FragmentsDb(fragmentsdb) data = {} dive_get_fragments(db, data) dive_merge_uniprot(uniprot_annot, data) dive_merge_pdb(data) dive_merge_pdb_tag(pdbtags, data) dump_propnames(propnames, pdbtags is not None) dump_props(data, props)
def dive_get_fragments(db, data): # TODO add organism column to pdb data table sql = '''SELECT frag_id, pdb_code as pdb, het_code as het, frag_nr as fragment, pdb_title as title, uniprot_acc as uniprot, uniprot_name as protein, smiles, mol FROM fragments JOIN pdbs USING (pdb_code) LEFT JOIN molecules USING (frag_id) ''' for row in db.cursor.execute(sql): cols = row.keys() frag_id = row[0] data[frag_id] = {} mol = row[-1] if mol: data[frag_id]['weight'] = HeavyAtomMolWt(mol) # TODO add other Lipinski parameters aswell http://www.rdkit.org/Python_Docs/rdkit.Chem.Lipinski-module.html for col in cols[1:-1]: data[frag_id][col] = row[col] def dive_merge_uniprot(uniprot_annot_fn, data): pdb2uniprot_accs = {} uniprot_acc2gene = {} uniprot_acc2family = {} logging.warning('Loading uniprot') reader = csv.reader(uniprot_annot_fn, delimiter='\t') next(reader) for row in reader: if row[1]: uniprot_acc2gene[row[0]] = row[1] if row[2]: uniprot_acc2family[row[0]] = row[2].split(', ') if row[3]: for pdb in row[3].split(';'): # Kripo uses lowercase pdb code, while rest of world uses uppercase pdb2uniprot_accs[pdb.lower()] = row[0] for frag_id in data: record = data[frag_id] pdb_code = record['pdb'] if pdb_code in pdb2uniprot_accs: uniprot_acc = pdb2uniprot_accs[pdb_code] if uniprot_acc != record['uniprot']: record['uniprot'] = uniprot_acc if uniprot_acc in uniprot_acc2gene: record['gene'] = uniprot_acc2gene[uniprot_acc] if uniprot_acc in uniprot_acc2family: record['families'] = uniprot_acc2family[uniprot_acc] def dive_merge_pdb(data): logging.warning('Loading pdb from internet') pdb_report = PdbReport(fields=['source']) pdb2organism = {pdb['structureId'].lower(): pdb['source'] for pdb in pdb_report.fetch() if pdb['source']} for frag_id in data: record = data[frag_id] pdb_code = record['pdb'] if pdb_code in pdb2organism: organism = pdb2organism[pdb_code] record['organism'] = organism def dive_merge_pdb_tag(pdbtags, data): logging.warning('Loading pdb tags') tags = {} for pdbtagfile in pdbtags: tagname = basename(pdbtagfile.name) for line in pdbtagfile: tags[line.strip().lower()] = tagname for frag_id in data: record = data[frag_id] pdb_code = record['pdb'] if pdb_code in tags: record['pdbtag'] = tags[pdb_code] def dump_propnames(propnamesfn, has_pdbtag): propnames = [ 'pdb', 'het', 'fragment', 'title', 'smiles', 'weight', 'uniprot', 'protein', 'organism', 'gene', ] if has_pdbtag: propnames.append('pdbtag') propnames.extend([ 'family0', 'family1', 'family2', 'family3', 'family4', ]) json.dump(propnames, propnamesfn) def dump_props(props, propsfn): for frag_id, v in six.iteritems(props): propsfn.write(frag_id) propsfn.write(' ') fields = [ 'pdb:' + v['pdb'], 'het:' + v['het'], 'fragment:' + str(v['fragment']), '"title:' + v['title'] + '"', ] if 'smiles' in v and v['smiles']: fields.append('smiles:' + v['smiles']) else: fields.append('') if 'weight' in v: fields.append('{0:.2f}'.format(v['weight'])) else: fields.append('') if v['uniprot']: fields.append('uniprot:' + v['uniprot'].split('#')[0]) else: fields.append('') if 'protein' in v and v['protein']: fields.append('"protein:' + v['protein'] + '"') else: fields.append('') if 'organism' in v: fields.append('"organism:' + v['organism'] + '"') else: fields.append('') if 'gene' in v: fields.append('"gene:' + v['gene'] + '"') else: fields.append('') if 'pdbtag' in v: fields.append('pdbtag:' + v['pdbtag']) else: fields.append('') if 'families' in v: for idx, fam in enumerate(v['families']): fields.append('"family' + str(idx) + ':' + fam + '"') propsfn.write(' '.join(fields)) propsfn.write("\n")
[docs]def dense_dump(inputfile, outputfile, frag1only): """Dump dense matrix with zeros included Args: inputfile (str): Filename of dense similarity matrix outputfile (file): Writeable file object frag1only (bool): Only dump frag1 fragments Returns: """ matrix = FrozenSimilarityMatrix(inputfile) writer = csv.writer(outputfile, delimiter='\t', lineterminator='\n') writer.writerow(['frag_id1', 'frag_id2', 'score']) writer.writerows(dense_dump_iter(matrix, frag1only)) matrix.close()
[docs]def dense_dump_iter(matrix, frag1only): """Iterate dense matrix with zeros Args: matrix (FrozenSimilarityMatrix): Dense similarity matrix frag1only (bool): True to iterate over \*frag1 only Yields: (str, str, float): Fragment label pair and score """ completed_frags = set() bar = ProgressBar() labels = [v.decode() for v in matrix.labels] for row_label in bar(labels): if frag1only and not row_label.endswith('frag1'): continue completed_frags.add(row_label) cols = matrix[row_label] for (col_label, score) in cols: if frag1only and not col_label.endswith('frag1'): continue if col_label in completed_frags: continue if not score: continue yield (row_label, col_label, score)