python bioformats is fast enough? #49
python-bioformats hangs unexpectedly and outputs no error message. The dataset class in question:

```python
import xml.etree.ElementTree as ET
from itertools import product

import bioformats
import javabridge
from torch.utils.data import Dataset

class DatasetWithBioformats(Dataset):
    def __init__(self, wsi, tile_size, transform):
        javabridge.start_vm(class_path=bioformats.JARS)
        self.wsi = wsi
        self.tile_size = tile_size
        self.transform = transform
        metadata = bioformats.get_omexml_metadata(wsi)
        root = ET.fromstring(metadata)
        self.width = int(root.findall('.//*[@SizeX]')[0].attrib["SizeX"])
        self.height = int(root.findall('.//*[@SizeY]')[0].attrib["SizeY"])
        x_coords = list(range(0, self.width, tile_size))
        y_coords = list(range(0, self.height, tile_size))
        self.coords = list(product(x_coords, y_coords))
        self.reader = bioformats.ImageReader(wsi)

    def __del__(self):
        self.reader.close()
        javabridge.kill_vm()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        x, y = self.coords[idx]
        width = min(self.tile_size, self.width - x)
        height = min(self.tile_size, self.height - y)
        print(x, y, width, height)  # debug
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, width, height))
        return self.transform(tile)
```

---

aicsimageio wraps bioformats. Does this work?

---
aicsimageio does not provide APIs to stitch mosaics.
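For what it's worth, a minimal full-plane read with aicsimageio might look like the sketch below. This is my illustration, not code from the thread; the exact `get_image_data` signature varies between aicsimageio versions, so treat the call as an assumption, and note it reads a whole plane at once rather than tiles.

```python
from aicsimageio import AICSImage

# Hypothetical usage; reuses the example file from later in this thread.
img = AICSImage("examples/CMU-1.ndpi")
plane = img.get_image_data("YXC", T=0, Z=0)  # whole plane; no tiled/XYWH access
print(plane.shape)
```

---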
With jpype and loci_tools.jar, WSI data can be loaded through Bio-Formats:

```python
from copy import deepcopy
from itertools import product

import jpype
import torch
from jpype.types import JByte
from tqdm import tqdm

def speed_bioformats():
    tile_size = 512
    reader = jpype.JPackage('loci').formats.ImageReader()
    reader.setId(wsi)
    xs = range(0, reader.getSizeX(), tile_size)
    ys = range(0, reader.getSizeY(), tile_size)
    for x, y in tqdm(product(xs, ys), total=len(xs) * len(ys)):
        w = min(tile_size, reader.getSizeX() - x)
        h = min(tile_size, reader.getSizeY() - y)
        byte = reader.openBytes(0, x, y, w, h)  # java byte[]
        tensor = torch.frombuffer(deepcopy(bytearray(JByte[::](byte))),
                                  dtype=torch.uint8).reshape(3, h, w)
```

-> around 36.90 it/s
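As an aside (my note, not from the original comment): the `bytearray(JByte[::](byte))` round-trip copies the pixel data several times. Assuming a JPype version where primitive Java arrays expose the Python buffer protocol (1.0+, as far as I know), the conversion can be done with a single explicit copy; `byte`, `h`, and `w` are as in the snippet above:

```python
import numpy as np
import torch

# Assumption: `byte` (a Java byte[]) supports memoryview() under JPype >= 1.0.
arr = np.frombuffer(memoryview(byte), dtype=np.uint8).copy()  # one copy out of the JVM
tensor = torch.from_numpy(arr).reshape(3, h, w)
```

---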
```python
class DatasetWithBioformats(Dataset):
    def __init__(self, wsi, tile_size, transform):
        jpype.startJVM(jpype.getDefaultJVMPath(), "-ea",
                       "-Djava.class.path=./loci_tools.jar", "-Xmx1024m")
        self.wsi = wsi
        self.tile_size = tile_size
        self.transform = transform
        self.reader = jpype.JPackage('loci').formats.ImageReader()
        self.reader.setId(self.wsi)
        self.width = self.reader.getSizeX()
        self.height = self.reader.getSizeY()
        x_coords = list(range(0, self.width, tile_size))
        y_coords = list(range(0, self.height, tile_size))
        self.coords = list(product(x_coords, y_coords))

    def __del__(self):
        self.reader.close()
        jpype.shutdownJVM()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        x, y = self.coords[idx]
        w = min(self.tile_size, self.width - x)
        h = min(self.tile_size, self.height - y)
        byte = self.reader.openBytes(0, x, y, w, h)
        tensor = torch.frombuffer(bytearray(JByte[::](byte)),
                                  dtype=torch.uint8).reshape(3, h, w)
        if self.transform is not None:  # transform=None is passed below
            tensor = self.transform(tensor)
        return tensor

loader = DataLoader(DatasetWithBioformats(wsi, tile_size, transform=None),
                    batch_size=batch_size, num_workers=4)
for image in tqdm(loader):
    print(image.shape)
    import pdb
    pdb.set_trace()
```

->

---
It did not work with jpype, but it does with javabridge:

```python
from itertools import product

import javabridge
import torch
from tqdm import tqdm

tile_size = 512
batch_size = 16
wsi = "examples/CMU-1.ndpi"
bioformats_path = "loci_tools.jar"
javabridge.start_vm(class_path=bioformats_path)
ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
reader = ImageReader()
reader.setId(wsi)
width = reader.getSizeX()
height = reader.getSizeY()
coords = list(product(range(0, width, tile_size),
                      range(0, height, tile_size)))
for x, y in tqdm(coords, desc="Loading tiles"):
    w = min(tile_size, width - x)
    h = min(tile_size, height - y)
    byte = reader.openBytes(0, x, y, w, h)
    tensor = torch.tensor(byte, dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)
```

-> 136.19 it/s

---
The javabridge version works if start_vm is called after the dataset has been replicated into the forked worker processes (a running JVM cannot be inherited across a fork):

```python
from itertools import product

import bioformats
import javabridge
import torch
from torch.utils.data import Dataset
from torchvision import transforms

class DatasetWithBioformats(Dataset):
    def __init__(self, wsi, tile_size, coords_len):
        self.wsi = wsi
        self.tile_size = tile_size
        self.coords_len = coords_len
        self.transform = transforms.Resize((self.tile_size, self.tile_size))

    def lazy_load(self):
        # Called inside the worker process, after the fork.
        javabridge.start_vm(class_path=bioformats.JARS)
        self.reader = bioformats.ImageReader(self.wsi)
        self.set_params()

    def __del__(self):
        if hasattr(self, "reader"):
            self.reader.close()
            javabridge.kill_vm()

    def __len__(self):
        return self.coords_len

    def __getitem__(self, idx):
        if not hasattr(self, "reader"):
            self.lazy_load()
        x, y = self.coords[idx]
        w = min(self.tile_size, self.width - x)
        h = min(self.tile_size, self.height - y)
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, w, h))
        tensor = torch.tensor(tile, dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)
        return self.transform(tensor)

    def set_params(self):
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(self.wsi)
        self.width = reader.getSizeX()
        self.height = reader.getSizeY()
        self.coords = list(product(
            range(0, self.width, self.tile_size),
            range(0, self.height, self.tile_size)))
        reader.close()
```

In this script, coords_len could be computed after javabridge.start_vm, but once the VM has been started, something goes wrong in the replicated worker processes. So here I calculated coords_len beforehand in another script and saved it to a file.

---
A workaround: compute the coordinates in a separate process with concurrent.futures.ProcessPoolExecutor, so the parent process never starts a JVM itself:

```python
from concurrent.futures import ProcessPoolExecutor
from itertools import product

import bioformats
import javabridge
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm import tqdm

class DatasetWithBioformats(Dataset):
    def __init__(self, wsi, tile_size):
        self.wsi = wsi
        self.tile_size = tile_size
        self.coords = self.get_coords()
        self.transform = transforms.Resize((self.tile_size, self.tile_size))

    def read_coords(self):
        # Runs in a throwaway child process, so the parent never owns a JVM.
        javabridge.start_vm(class_path=bioformats.JARS)
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(self.wsi)
        coords = list(product(
            range(0, reader.getSizeX(), self.tile_size),
            range(0, reader.getSizeY(), self.tile_size)))
        reader.close()
        javabridge.kill_vm()
        return coords

    def get_coords(self):
        with ProcessPoolExecutor() as executor:
            future = executor.submit(self.read_coords)
            return future.result()

    def lazy_load(self):
        # Called inside each DataLoader worker, after the fork.
        javabridge.start_vm(class_path=bioformats.JARS)
        self.reader = bioformats.ImageReader(self.wsi)
        self.set_params()

    def __del__(self):
        if hasattr(self, "reader"):
            self.reader.close()
            javabridge.kill_vm()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        if not hasattr(self, "reader"):
            self.lazy_load()
        x, y = self.coords[idx]
        w = min(self.tile_size, self.width - x)
        h = min(self.tile_size, self.height - y)
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, w, h))
        tensor = torch.tensor(tile, dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)
        return self.transform(tensor)

    def set_params(self):
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(self.wsi)
        self.width = reader.getSizeX()
        self.height = reader.getSizeY()
        reader.close()

dataset = DatasetWithBioformats(wsi, tile_size)
loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
with tqdm(loader) as t:
    for image in t:
        image += 1
```

This worked.
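Another possible workaround (my suggestion, not tried in this thread): start one JVM per worker with DataLoader's `worker_init_fn`, which runs inside each worker after the fork. A sketch, assuming the dataset above; `start_jvm_in_worker` is a hypothetical helper:

```python
import bioformats
import javabridge
import torch
from torch.utils.data import DataLoader

def start_jvm_in_worker(worker_id):
    # Runs inside each freshly forked worker, so the JVM starts after the fork.
    javabridge.start_vm(class_path=bioformats.JARS)
    info = torch.utils.data.get_worker_info()
    info.dataset.reader = bioformats.ImageReader(info.dataset.wsi)

loader = DataLoader(dataset, batch_size=batch_size, num_workers=4,
                    worker_init_fn=start_jvm_in_worker)
```

---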
Hi, have you tried comparing the reading speed of javabridge's ImageReader against OpenSlide when reading WSIs? I tried reading images as large as possible, e.g. 20480x20480, and compared their speeds, but I found that javabridge is not faster. I presume this is due to the large amount of debug log output. Do you have a way to turn off these logs?
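For reference, a minimal OpenSlide loop for such a comparison could look like the sketch below (my code, reusing the example file from above; `read_region` returns an RGBA PIL image at the requested level):

```python
import time
from itertools import product

import openslide

slide = openslide.OpenSlide("examples/CMU-1.ndpi")
tile_size = 512
width, height = slide.dimensions  # level-0 size
coords = list(product(range(0, width, tile_size),
                      range(0, height, tile_size)))

start = time.perf_counter()
for x, y in coords:
    w = min(tile_size, width - x)
    h = min(tile_size, height - y)
    tile = slide.read_region((x, y), 0, (w, h))
elapsed = time.perf_counter() - start
print(f"{len(coords) / elapsed:.2f} tiles/s")
```

---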
@CDPDisk Hi, thank you for your comment.
I'm curious about your problem. If you find the answer, or anything that could be helpful, would you be kind enough to share the result with me?

---
In the new code I measured the time after the startup overhead, but that made no real difference in the results. At the moment I'm not doing parallel reads; both approaches use only one process and read a single 20480x20480 image. I'm interested in your issue because you mentioned achieving a read speed of 136.19 it/s, but I can't reach that with javabridge. With 512-size patches on an SSD, reading 1600 random patches took about 60 s, i.e. 26.67 it/s, which is nowhere near your result. So I suspect the log output is seriously affecting my speed.

---
@CDPDisk The log output can be silenced like this:

```python
from itertools import product

import bioformats
import javabridge
import torch
from tqdm import tqdm

def suppress_log():
    # https://github.com/CellProfiler/python-bioformats/issues/137#issuecomment-802313393
    # https://github.com/pskeshu/microscoper/blob/master/microscoper/io.py#L141-L162
    root_logger_name = javabridge.get_static_field(
        "org/slf4j/Logger", "ROOT_LOGGER_NAME", "Ljava/lang/String;")
    root_logger = javabridge.static_call(
        "org/slf4j/LoggerFactory", "getLogger",
        "(Ljava/lang/String;)Lorg/slf4j/Logger;", root_logger_name)
    log_level = javabridge.get_static_field(
        "ch/qos/logback/classic/Level", "WARN",
        "Lch/qos/logback/classic/Level;")
    javabridge.call(root_logger, "setLevel",
                    "(Lch/qos/logback/classic/Level;)V", log_level)

def speed_javabridge():
    javabridge.start_vm(class_path=bioformats.JARS)
    suppress_log()
    ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
    reader = ImageReader()
    reader.setId(wsi)
    width = reader.getSizeX()
    height = reader.getSizeY()
    xs = range(0, width, tile_size)
    ys = range(0, height, tile_size)
    for x, y in tqdm(list(product(xs, ys)), desc="javabridge"):
        w, h = min(tile_size, width - x), min(tile_size, height - y)
        byte = reader.openBytes(0, x, y, w, h)
        tensor = torch.tensor(byte, dtype=torch.uint8).reshape(
            h, w, 3).permute(2, 0, 1)
    javabridge.kill_vm()
```

For reference, I also tried a few other settings, but none of them gave good results.
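An alternative one-liner (my addition, not from the thread) is Bio-Formats' own `loci.common.DebugTools`, which sets the root log level in a single static call:

```python
import javabridge

# loci.common.DebugTools.enableLogging(String) returns a boolean.
javabridge.static_call("loci/common/DebugTools", "enableLogging",
                       "(Ljava/lang/String;)Z", "ERROR")
```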
---
Thanks for the code; it did stop the log output, but I found it still wasn't as fast as OpenSlide. So I tried different things, mainly with OpenSlide: reading different files, different tile sizes (512, 5120, and 10240, the larger ones then subdivided into 512), different kinds of disks (SSD and HDD), and different ways of reading ("dz" in the type column is DeepZoom from the wiki; "openslide" is openslide.read_region), measuring the read speed in it/s for 512-size patches. Here are some conclusions based on my understanding, though I am not sure they are correct:
This is my test code:
---
@CDPDisk
As you mentioned, reading speed depends on the data format, because the number of levels in a whole-slide image pyramid differs between formats. I'm not sure, but SVS (Aperio) has about 3 levels while NDPI (Hamamatsu) has almost 10 for the same 50000x50000 image. This makes SVS slower, since it has fewer indexed levels to load from. Please check OpenSlide's level_dimensions and set tile_size to a number close to one of the reported dimensions (see the sketch below). This might also be one of the reasons there is no significant difference between HDD and SSD.
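To inspect the pyramid, something like this (standard OpenSlide API) shows which tile sizes line up with the stored levels:

```python
import openslide

slide = openslide.OpenSlide("examples/CMU-1.ndpi")
print(slide.level_count)         # number of pyramid levels
print(slide.level_dimensions)    # (width, height) of each level
print(slide.level_downsamples)   # downsample factor of each level
```

The helper below compares per-file read speed between SSD and HDD (positive numbers mean SSD was faster):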
```python
import pandas as pd

# helper
df = pd.read_csv("record.csv")
files = pd.Series(sorted(df.file.unique()), name="file")

def show_diff(tile_size, type, num_workers, rename):
    df = pd.read_csv("record.csv")
    df = df[(df.type == type) & (df.tile_size == tile_size)
            & (df.num_workers == num_workers)].reset_index(drop=True)
    ssd = df[df.disk == "SSD"].sort_values(by="file").speed.reset_index(drop=True)
    hdd = df[df.disk == "HDD"].sort_values(by="file").speed.reset_index(drop=True)
    diff = (ssd - hdd).rename(rename)
    return diff

# pd.concat([
#     files,
#     show_diff(512, "openslide", 1, "t512-w1"),
#     show_diff(5120, "openslide", 1, "t5120-w1"),
#     show_diff(10240, "openslide", 1, "t10240-w1"),
#     show_diff(512, "openslide", 4, "t512-w4"),
#     show_diff(5120, "openslide", 4, "t5120-w4"),
#     show_diff(10240, "openslide", 4, "t10240-w4")
# ], axis=1)
```
| | files | t512-w1 | t5120-w1 | t10240-w1 | t512-w4 | t5120-w4 | t10240-w4 |
|--:|-----------------:|----------:|----------:|-----------:|-----------:|-----------:|----------:|
| 0 | 2016-27600.svs | -0.719293 | -0.040872 | -0.941112 | -10.437483 | -35.053484 | 9.573670 |
| 1 | 2017-16772.ndpi | 2.660345 | 0.411925 | -6.603403 | -3.760096 | -52.624435 | 23.775951 |
| 2 | 2018-56178.ndpi | -1.588323 | -2.843337 | 0.591185 | -8.121937 | -41.055469 | -1.777307 |
| 3 | 2018-57913.ndpi | 0.760055 | -4.158040 | -2.688662 | -1.053764 | -56.187543 | -8.596195 |
| 4 | 2019-31077.svs | 0.324398 | 0.410919 | 0.328029 | 3.259479 | -18.384691 | -1.016658 |
| 5 | D2020-13525.svs | 0.653814 | -0.387489 | -1.347961 | -4.567287 | 6.325705 | 0.492831 |
| 6 | D2021-15343.svs | -0.555875 | -0.341506 | -1.482710 | 6.321090 | 6.675624 | 2.414112 |
| 7 | T2018-05476.ndpi | -0.964013 | -1.661222 | -15.845805 | 21.999301 | -5.834994 | 1.447804 |
```python
# pd.concat([
#     files,
#     show_diff(512, "dz", 1, "t512-w1"),
#     show_diff(5120, "dz", 1, "t5120-w1"),
#     show_diff(10240, "dz", 1, "t10240-w1"),
#     show_diff(512, "dz", 4, "t512-w4"),
#     show_diff(5120, "dz", 4, "t5120-w4"),
#     show_diff(10240, "dz", 4, "t10240-w4")
# ], axis=1)
```
| | file | t512-w1 | t5120-w1 | t10240-w1 | t512-w4 | t5120-w4 | t10240-w4 |
|--:|-----------------:|-----------:|----------:|----------:|-----------:|-----------:|-----------:|
| 0 | 2016-27600.svs | 0.092700 | 14.292636 | 9.629455 | 80.693135 | 50.028105 | 61.569070 |
| 1 | 2017-16772.ndpi | 40.957503 | 2.934863 | -0.369446 | 9.006010 | 8.737644 | 31.673067 |
| 2 | 2018-56178.ndpi | 9.865274 | 43.074879 | 59.077770 | 20.562455 | 113.167572 | 250.569624 |
| 3 | 2018-57913.ndpi | 7.887157 | 4.709428 | 2.230740 | 18.212027 | 25.220513 | 27.763726 |
| 4 | 2019-31077.svs | 58.328462 | 26.117867 | 15.106393 | 298.646397 | 180.995066 | 143.793427 |
| 5 | D2020-13525.svs | 59.747630 | 25.610529 | 14.544780 | 293.596099 | 145.770164 | 137.956814 |
| 6 | D2021-15343.svs | 53.866016 | 19.862468 | 9.989578 | 273.043651 | 146.639732 | 102.914911 |
| 7 | T2018-05476.ndpi | -24.659575 | -0.636017 | -3.715397 | 19.266292 | -10.067838 | 5.670925 |

---
Thanks for your reply; it's very helpful!