
This notebook creates the figures for the paper titled “OptiMask: Efficiently Finding the Largest NaN-Free Submatrix” intended for the SciPy 2025 proceedings. Throughout the cells, several PNG images are generated. At the conclusion, these computed PNGs are losslessly compressed using oxipng.
Cyril Joly
# ! pip install optimask pyoxipng
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from optimask import OptiMask
from optimask.utils import generate_mar, plot
from oxipng import optimize[p.unlink() for p in Path('.').glob('*.png')]DPI = 2001The problem at hand¶
Illustrating the trade-off between removing cells or columns in a toy example:
x = np.zeros((12, 5))
x[7, [1, 3]] = np.nan
plt.figure()
plt.subplot(1, 3, 1)
plot(x, show=False, title="original data")
plt.subplot(1, 3, 2)
plot(x, rows_to_remove=[7], show=False, title="remove row")
plt.subplot(1, 3, 3)
plot(x, cols_to_remove=[1, 3], show=False, title="remove columns")
plt.savefig('at_hand_two.png', bbox_inches='tight', dpi=DPI)
plt.show()2Algorithm Steps¶
def heights(x, axis=0):
return (x.shape[axis] - np.argmax(np.flip(x, axis=axis), axis=axis)).astype(np.uint32)
def to_plot(x):
xp = x.astype(float)
xp[xp > 0] = np.nan
return xpGeneration of an example matrix with missing values, with missing at random values plus a contiguous set of 7 missing values.
m, n = 40, 30
x = generate_mar(m=m, n=n, ratio=0.04, rng=1)
x[12:19, 10] = np.nan
plot(x, show=False)
plt.savefig('algo_data.png', bbox_inches='tight', dpi=DPI)
plt.show()
nan_rows, nan_cols = np.isnan(x).nonzero()
nan_rows, nan_cols = np.unique(nan_rows), np.unique(nan_cols)
xp = np.isnan(x[nan_rows][:, nan_cols])
hx, hy = heights(xp, axis=0), heights(xp, axis=1)
plot(to_plot(xp), xticks=hx, yticks=hy, show=False)
plt.savefig('algo_0.png', bbox_inches='tight', dpi=DPI)
plt.show()hx, hy = heights(xp, axis=0).astype(np.uint32), heights(xp, axis=1).astype(np.uint32)
k = 0
plt.figure(figsize=(12, 5))
while not (OptiMask.is_decreasing(hx) and OptiMask.is_decreasing(hy)):
axis = k % 2
p_step = np.argsort(-heights(xp, axis=axis))
if axis == 0:
xp = xp[:, p_step]
if axis == 1:
xp = xp[p_step]
hx, hy = heights(xp, axis=0).astype(np.uint32), heights(xp, axis=1).astype(
np.uint32
)
plt.subplot(1, 3, k+1)
plot(to_plot(xp), xticks=hx, yticks=hy, title=f"Iteration {k+1}", show=False)
k += 1
plt.tight_layout()
plt.savefig('algo_iterations.png', bbox_inches='tight', dpi=DPI)
plt.show()def rectangle(xmin, ymin, xmax, ymax, c='k', lw=0.4):
x = [xmin, xmax, xmax, xmin, xmin]
y = [ymin, ymin, ymax, ymax, ymin]
plt.plot(x, y, c=c, linestyle='--', lw=lw)opt_area = 0
plot(np.zeros_like(x), show=False)
plot(to_plot(xp), xticks=hx, yticks=hy, show=False)
for i0 in range(len(hx)):
if (i0 == 0) or (i0 > 0 and (hx[i0] < hx[i0-1])):
rectangle(i0, hx[i0], n, m)
if (n-i0)*(m-hx[i0]) > opt_area:
opt_area, opt_index = (n-i0)*(m-hx[i0]), i0
rectangle(len(hx), 0, n, m)
rectangle(opt_index, hx[opt_index], n, m, c='r', lw=1)
plt.savefig('algo_result_permuted_space.png', bbox_inches='tight', dpi=DPI)
plt.show()rows, cols = OptiMask(n_tries=10000).solve(x)plot(x, rows, cols, show=False)
plt.savefig('algo_result.png', bbox_inches='tight', dpi=DPI)
plt.show()[optimize(path) for path in Path('.').glob('*.png')]Copyright © 2025 Joly. This is an open-access article distributed under the terms of the Creative Commons Attribution 4.0 International license, which enables reusers to distribute, remix, adapt, and build upon the material in any medium or format, so long as attribution is given to the creator.
- NaN
- Not a Number