# Source code for kripodb.makebits

# Copyright 2016 Netherlands eScience Center
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module to read/write fingerprints in Makebits file format"""

from __future__ import absolute_import
from pyroaring import BitMap
import six


def read_header(line):
    """Parse the header line of a Makebits formatted file

    The header is split on single spaces; the first three fields are the
    format name, format version and number of bits, everything after that
    is joined back together as the label.

    Args:
        line (str): First line of a Makebits formatted file

    Returns:
        tuple: format name, format version, number of bits (int) and label
    """
    fields = line.strip().split(' ')
    format_name = fields[0]
    format_version = fields[1]
    fp_size = int(fields[2])
    # the label may itself contain spaces, so glue the remaining fields back
    label = ' '.join(fields[3:])
    return format_name, format_version, fp_size, label


def read_fp_size(header):
    """Return the number of bits stored in a parsed header

    Args:
        header (tuple): Header as returned by read_header

    Returns:
        The third item of the header, which read_header fills with the
        number of bits
    """
    fp_size = header[2]
    return fp_size


def read_bitset(line, fp_size):
    """Parse one fingerprint line of a Makebits formatted file

    A line consists of the fingerprint identifier, the positions of the
    on bits, a ``0`` separator and the number of on bits, all separated
    by single spaces.

    Args:
        line (str): Line to parse
        fp_size (int): Number of bits of the fingerprint, used to cap the
            number of fields the line is split into

    Returns:
        tuple: Fingerprint identifier and BitMap of the on bits

    Raises:
        ValueError: When a field is not an integer or the number of parsed
            on bits differs from the count stored at the end of the line
        IndexError: When the line has too few fields
    """
    row = line.split(' ', fp_size + 3)
    fid = row.pop(0)
    # last field is the number of on bits, it acts as a checksum
    nr_onbits = int(row.pop())
    # ignore 0, is separator between metadata and data
    row.pop()
    bitset = BitMap([int(d) for d in row])
    if len(bitset) != nr_onbits:
        # ValueError is still an Exception, so existing handlers keep working
        raise ValueError('On bit checksum incorrect for {}'.format(fid))
    return fid, bitset


def read_file(infile):
    """Read a whole Makebits formatted file into memory

    Args:
        infile (File): File object of Makebits formatted file to read

    Returns:
        tuple: Dict with fingerprint identifier as key and the parsed
        bitset as value, and the number of bits taken from the header
    """
    fp_size = read_fp_size(read_header(infile.readline()))
    # remaining lines each hold one fingerprint; a repeated identifier
    # keeps the bitset of its last occurrence
    bitsets = dict(read_bitset(row, fp_size) for row in infile)
    return bitsets, fp_size


def iter_file(infile):
    """Reads Makebits formatted file

    Yields header first then tuples of identifier and BitMap object

    Args:
        infile (File): File object of Makebits formatted file to read

    Yields:
        first header (format name, format version, number of bits, description),
        then tuples of the fingerprint identifier and a BitMap object

    Examples:
        Read a file

        >>> f = iter_file(open('fingerprints01.fp'))
        >>> read_fp_size(next(f))
        4
        >>> {frag_id: fp for frag_id, fp in f}
        {'id1': BitMap([1, 2, 3, 4])}

    """
    header = read_header(infile.readline())
    fp_size = read_fp_size(header)
    # the header goes out first so the consumer can learn the number of bits
    yield header
    for line in infile:
        (fid, bitset) = read_bitset(line, fp_size)
        yield fid, bitset


def write_header(fp_size):
    """Format the header line of a Makebits formatted file

    Args:
        fp_size (int): Number of bits

    Returns:
        str: Header line including trailing newline
    """
    return "MAKEBITS 1.0 {} BigGrid\n".format(fp_size)


def write_bitset(fid, bitset):
    """Format one fingerprint as a line of a Makebits formatted file

    The line holds the identifier, the on bits, a ``0`` separator and the
    number of on bits, separated by single spaces.

    Args:
        fid (str): Fingerprint identifier
        bitset: Sized iterable of on bit positions, for example a BitMap

    Returns:
        str: Fingerprint line including trailing newline
    """
    fields = [fid]
    fields.extend(str(bit) for bit in bitset)
    # 0 separates the on bits from the trailing on bit count
    fields.extend(("0", str(len(bitset))))
    return " ".join(fields) + "\n"


def write_file(fp_size, bitsets, fn):
    """Write makebits formatted file

    Args:
        fp_size (int): Number of bits
        bitsets (dict): Dict with fingerprint identifier as key and BitMap object as value
        fn (File): File object to write to

    Examples:
        Write a file

        >>> with open('fingerprints01.fp', 'w') as f:
        ...     write_file(4, {'id1': BitMap([1, 2, 3, 4])}, f)

    """
    fn.write(write_header(fp_size))
    # one line per fingerprint, in the iteration order of the dict
    for fid, bitset in six.iteritems(bitsets):
        fn.write(write_bitset(fid, bitset))