26 An introduction to RLEs
Rles are run lengths - a data structure that can compactly describe the coverage of a genome (or some other score that is associated with each nucleotide). They are used for efficient genome-wide arithmetic operations on these scores.
from pyrle import Rle
runs = [10, 10, 10, 10]
values = [0, 1, 0, 0]
r1 = Rle(runs, values)
print(r1)
## +--------+------+------+------+
## | Runs | 10 | 10 | 20 |
## |--------+------+------+------|
## | Values | 0.0 | 1.0 | 0.0 |
## +--------+------+------+------+
## Rle of length 40 containing 3 elements (avg. length 13.333)
runs2 = [11, 9, 20]
values2 = [100, 0, 100]
r2 = Rle(runs2, values2)
print(r2)
## +--------+-------+-----+-------+
## | Runs | 11 | 9 | 20 |
## |--------+-------+-----+-------|
## | Values | 100.0 | 0.0 | 100.0 |
## +--------+-------+-----+-------+
## Rle of length 40 containing 3 elements (avg. length 13.333)
print(r1 + r2)
## +--------+-------+-------+-----+-------+
## | Runs | 10 | 1 | 9 | 20 |
## |--------+-------+-------+-----+-------|
## | Values | 100.0 | 101.0 | 1.0 | 100.0 |
## +--------+-------+-------+-----+-------+
## Rle of length 40 containing 4 elements (avg. length 10.0)
print(r1 * r2)
## +--------+------+-------+------+
## | Runs | 10 | 1 | 29 |
## |--------+------+-------+------|
## | Values | 0.0 | 100.0 | 0.0 |
## +--------+------+-------+------+
## Rle of length 40 containing 3 elements (avg. length 13.333)
print(r1.runs)
## [10 10 20]
print(r1.values)
## [0. 1. 0.]
r1 = r1 + 5
print(r1)
## +--------+------+------+------+
## | Runs | 10 | 10 | 20 |
## |--------+------+------+------|
## | Values | 5.0 | 6.0 | 5.0 |
## +--------+------+------+------+
## Rle of length 40 containing 3 elements (avg. length 13.333)
print(r2 / r1)
## +--------+------+--------------------+-----+------+
## | Runs | 10 | 1 | 9 | 20 |
## |--------+------+--------------------+-----+------|
## | Values | 20.0 | 16.666666666666668 | 0.0 | 20.0 |
## +--------+------+--------------------+-----+------+
## Rle of length 40 containing 4 elements (avg. length 10.0)
Rles have two helper methods, `numbers_only` and `defragment`. The former
replaces NaN with zero and infinity with large finite numbers, while the latter
merges consecutive runs of duplicate values in the Rle (which might occur when
manually manipulating the values vector).
import numpy as np
from pyrle import Rle
runs2 = [11, 9, 20]
values2 = [100, 0, 100]
r2 = Rle(runs2, values2)
r2.values = np.array([1, 1, 1], dtype=np.double)
print(r2)
## +--------+------+-----+------+
## | Runs | 11 | 9 | 20 |
## |--------+------+-----+------|
## | Values | 1.0 | 1.0 | 1.0 |
## +--------+------+-----+------+
## Rle of length 40 containing 3 elements (avg. length 13.333)
print(r2.defragment())
## +--------+------+
## | Runs | 40 |
## |--------+------|
## | Values | 1.0 |
## +--------+------+
## Rle of length 40 containing 1 elements (avg. length 40.0)