Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

python bioformats is fast enough? #49

Open
tand826 opened this issue Apr 8, 2023 · 15 comments
Open

python bioformats is fast enough? #49

tand826 opened this issue Apr 8, 2023 · 15 comments

Comments

@tand826
Copy link
Owner

tand826 commented Apr 8, 2023

No description provided.

@tand826
Copy link
Owner Author

tand826 commented Apr 10, 2023

bioformats-python hangs unexpectedly and outputs no error message.

class DatasetWithBioformats(Dataset):
    """Tile dataset that reads a whole slide image through python-bioformats.

    Args:
        wsi: Path of the slide handed to bioformats.
        tile_size: Edge length of a tile in pixels; tiles touching the right
            or bottom border are clipped to the image bounds.
        transform: Callable applied to every tile before it is returned.
    """

    def __init__(self, wsi, tile_size, transform):
        javabridge.start_vm(class_path=bioformats.JARS)
        # Remember that this instance started the JVM so __del__ only kills
        # a VM that was actually started here.
        self._vm_started = True
        self.wsi = wsi
        self.tile_size = tile_size
        self.transform = transform
        metadata = bioformats.get_omexml_metadata(wsi)
        root = ET.fromstring(metadata)
        # The first element carrying SizeX / SizeY supplies the dimensions.
        self.width = int(root.findall('.//*[@SizeX]')[0].attrib["SizeX"])
        self.height = int(root.findall('.//*[@SizeY]')[0].attrib["SizeY"])
        # Top-left corner of every tile, x-major.
        self.coords = list(product(
            range(0, self.width, tile_size),
            range(0, self.height, tile_size)))
        self.reader = bioformats.ImageReader(wsi)

    def __del__(self):
        # __init__ may have failed before the reader existed.
        reader = getattr(self, "reader", None)
        if reader is not None:
            reader.close()
        if getattr(self, "_vm_started", False):
            javabridge.kill_vm()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        x, y = self.coords[idx]
        width = min(self.tile_size, self.width - x)
        height = min(self.tile_size, self.height - y)
        print(x, y, width, height)
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, width, height))
        # Fixed: the attribute is `transform`; `transfrom` raised AttributeError.
        return self.transform(tile)

aicsimageio wraps bioformats. does this work?

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

aicsimageio does not provide apis to stitch mosaic.

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

with jpype and loci_tools.jar, wsi data can be loaded with bioformats

import jpype
import numpy as np
import torch
from PIL import Image
from jpype.types import JByte
from copy import copy

jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=./loci_tools.jar", "-Xmx1024m")
loci = jpype.JPackage('loci')

reader = loci.formats.ImageReader()
reader.setId("examples/CMU-1.ndpi")

h = 100
w = 120
x = 10000
y = 23000
byte = reader.openBytes(0, x, y, w, h)  # byte[] in java
# Convert the Java byte[] to Python bytes once and reuse it below.
raw = bytes(JByte[::](byte))

# with PIL
Image.frombytes(mode="RGB", size=(w, h), data=raw)

# with numpy
np.frombuffer(raw, dtype=np.uint8).reshape(h, w, 3)

# with torch
# The PIL and numpy conversions above read the buffer as (h, w, 3); use the
# same layout here and only then move the channel axis to the front.
torch.frombuffer(bytearray(raw), dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

def speed_bioformats():
    """Benchmark tile reading through Bio-Formats (jpype) over a whole slide.

    Reads the module-level ``wsi`` in 512-pixel tiles and converts every tile
    to a ``torch.uint8`` tensor; tqdm reports the throughput.
    """
    tile_size = 512
    reader = jpype.JPackage('loci').formats.ImageReader()
    reader.setId(wsi)
    try:
        # Query the image size once instead of calling into the JVM per tile.
        width, height = reader.getSizeX(), reader.getSizeY()
        xs = range(0, width, tile_size)
        ys = range(0, height, tile_size)

        for x, y in tqdm(product(xs, ys), total=len(xs) * len(ys)):
            w, h = min(tile_size, width - x), min(tile_size, height - y)
            byte = reader.openBytes(0, x, y, w, h)  # byte[] in java
            # bytearray() already copies the data, so no deepcopy is needed.
            # NOTE(review): assumes openBytes returns interleaved RGB, as the
            # PIL / numpy conversions in this thread do - confirm with
            # reader.isInterleaved().
            tensor = torch.frombuffer(
                bytearray(JByte[::](byte)), dtype=torch.uint8
            ).reshape(h, w, 3).permute(2, 0, 1)
    finally:
        reader.close()

-> around 36.90it/s

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

class DatasetWithBioformats(Dataset):
    """Tile dataset that reads a whole slide image through Bio-Formats (jpype).

    Args:
        wsi: Path of the slide passed to ``ImageReader.setId``.
        tile_size: Edge length of a tile in pixels; tiles touching the right
            or bottom border are clipped to the image bounds.
        transform: Optional callable applied to every tile tensor; ``None``
            returns the tensor unchanged.
    """

    def __init__(self, wsi, tile_size, transform):
        # jpype refuses to start a second JVM, so start it only when needed
        # and remember whether this instance is the one that started it.
        self._jvm_started = False
        if not jpype.isJVMStarted():
            jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=./loci_tools.jar", "-Xmx1024m")
            self._jvm_started = True

        self.wsi = wsi
        self.tile_size = tile_size
        self.transform = transform
        self.reader = jpype.JPackage('loci').formats.ImageReader()
        self.reader.setId(self.wsi)

        self.width = self.reader.getSizeX()
        self.height = self.reader.getSizeY()
        # Top-left corner of every tile, x-major.
        self.coords = list(product(
            range(0, self.width, tile_size),
            range(0, self.height, tile_size)))

    def __del__(self):
        # __init__ may have failed before the reader existed.
        reader = getattr(self, "reader", None)
        if reader is not None:
            reader.close()
        if getattr(self, "_jvm_started", False):
            jpype.shutdownJVM()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        x, y = self.coords[idx]
        w = min(self.tile_size, self.width - x)
        h = min(self.tile_size, self.height - y)
        byte = self.reader.openBytes(0, x, y, w, h)
        # NOTE(review): assumes openBytes returns interleaved RGB, as the
        # PIL / numpy conversions in this thread do - confirm with
        # reader.isInterleaved().
        tensor = torch.frombuffer(
            bytearray(JByte[::](byte)), dtype=torch.uint8
        ).reshape(h, w, 3).permute(2, 0, 1)
        # Fixed: `self.transfrom` was a typo, and transform may be None.
        if self.transform is not None:
            tensor = self.transform(tensor)
        return tensor


# Iterate the dataset through a 4-worker DataLoader, print the shape of each
# batch and stop in the debugger after every batch.
loader = DataLoader(DatasetWithBioformats(wsi, tile_size, transform=None), batch_size=batch_size, num_workers=4)
for image in tqdm(loader):
    print(image.shape)
    import pdb
    pdb.set_trace()

-> _pickle.PicklingError: Can't pickle <java class 'java.lang.NoClassDefFoundError'>: import of module 'java.lang' failed

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

not with jpype, with javabridge

tile_size = 512
batch_size = 16
wsi = "examples/CMU-1.ndpi"
bioformats_path = "loci_tools.jar"
javabridge.start_vm(class_path=bioformats_path)
try:
    ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
    reader = ImageReader()
    reader.setId(wsi)
    # Read the image size once; it is reused for every tile below.
    width = reader.getSizeX()
    height = reader.getSizeY()
    x_coords = range(0, width, tile_size)
    y_coords = range(0, height, tile_size)

    coords = list(product(x_coords, y_coords))
    for x, y in tqdm(coords, desc="Loading tiles"):
        # Clip border tiles to the image bounds.
        w = min(tile_size, width - x)
        h = min(tile_size, height - y)
        byte = reader.openBytes(0, x, y, w, h)
        tensor = torch.tensor(byte, dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)
    reader.close()
finally:
    # Shut the JVM down even when reading fails part-way.
    javabridge.kill_vm()

-> 136.19it/s

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

bioformats-python hangs unexpectedly and outputs no error message.

class DatasetWithBioformats(Dataset):
    """Tile dataset that reads a whole slide image through python-bioformats.

    Args:
        wsi: Path of the slide handed to bioformats.
        tile_size: Edge length of a tile in pixels; tiles touching the right
            or bottom border are clipped to the image bounds.
        transform: Callable applied to every tile before it is returned.
    """

    def __init__(self, wsi, tile_size, transform):
        javabridge.start_vm(class_path=bioformats.JARS)
        # Remember that this instance started the JVM so __del__ only kills
        # a VM that was actually started here.
        self._vm_started = True
        self.wsi = wsi
        self.tile_size = tile_size
        self.transform = transform
        metadata = bioformats.get_omexml_metadata(wsi)
        root = ET.fromstring(metadata)
        # The first element carrying SizeX / SizeY supplies the dimensions.
        self.width = int(root.findall('.//*[@SizeX]')[0].attrib["SizeX"])
        self.height = int(root.findall('.//*[@SizeY]')[0].attrib["SizeY"])
        # Top-left corner of every tile, x-major.
        self.coords = list(product(
            range(0, self.width, tile_size),
            range(0, self.height, tile_size)))
        self.reader = bioformats.ImageReader(wsi)

    def __del__(self):
        # __init__ may have failed before the reader existed.
        reader = getattr(self, "reader", None)
        if reader is not None:
            reader.close()
        if getattr(self, "_vm_started", False):
            javabridge.kill_vm()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        x, y = self.coords[idx]
        width = min(self.tile_size, self.width - x)
        height = min(self.tile_size, self.height - y)
        print(x, y, width, height)
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, width, height))
        # Fixed: the attribute is `transform`; `transfrom` raised AttributeError.
        return self.transform(tile)

aicsimageio wraps bioformats. does this work?

The javabridge version works if start_vm is run after the dataset has been replicated (forked) into the worker processes.

class DatasetWithBioformats(Dataset):
    """Tile dataset that starts the JVM lazily, on the first ``__getitem__``.

    Nothing Java-related is created in ``__init__``; the VM and the reader are
    set up by ``lazy_load`` the first time an item is requested.

    Args:
        wsi: Path of the slide.
        tile_size: Edge length of a tile in pixels; also the output size of
            the ``Resize`` transform.
        coords_len: Number of tiles, supplied by the caller because the
            coordinates are only computed in ``set_params``.
    """

    def __init__(self, wsi, tile_size, coords_len):
        self.wsi = wsi
        self.tile_size = tile_size
        self.coords_len = coords_len
        self.transform = transforms.Resize((self.tile_size, self.tile_size))

    def lazy_load(self):
        """Start the JVM, open the reader and compute the tile coordinates."""
        javabridge.start_vm(class_path=bioformats.JARS)
        # Fixed: use the instance's path rather than a module-level `wsi`.
        self.reader = bioformats.ImageReader(self.wsi)
        self.set_params()

    def __del__(self):
        # The VM is only started together with the reader (see lazy_load), so
        # an instance that never loaded has nothing to shut down.
        if hasattr(self, "reader"):
            self.reader.close()
            javabridge.kill_vm()

    def __len__(self):
        return self.coords_len

    def __getitem__(self, idx):
        if not hasattr(self, "reader"):
            self.lazy_load()
        x, y = self.coords[idx]
        # Fixed: clip with the instance's tile size, not a module global.
        w = min(self.tile_size, self.width - x)
        h = min(self.tile_size, self.height - y)
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, w, h))
        tensor = torch.tensor(tile, dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)
        return self.transform(tensor)

    def set_params(self):
        """Read the image size and derive the top-left corner of every tile."""
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(self.wsi)
        self.width = reader.getSizeX()
        self.height = reader.getSizeY()
        self.coords = list(product(
            range(0, self.width, self.tile_size),
            range(0, self.height, self.tile_size)))
        reader.close()

In this script, coords_len could be read after javabridge.start_vm, but once the VM is started something goes wrong in the replicated processes. So here I calculated coords_len beforehand in another script and saved it to a file.

@tand826
Copy link
Owner Author

tand826 commented Apr 11, 2023

workaround by loading it in another process with concurrent.futures.ProcessPoolExecutor

class DatasetWithBioformats(Dataset):
    """Tile dataset that computes its coordinates in a separate process.

    The tile coordinates are needed in ``__init__`` (for ``__len__``), but
    they require a JVM. ``get_coords`` therefore runs ``read_coords`` in a
    child process, and the JVM used for reading tiles is started lazily on
    the first ``__getitem__``.

    Args:
        wsi: Path of the slide.
        tile_size: Edge length of a tile in pixels; also the output size of
            the ``Resize`` transform.
    """

    def __init__(self, wsi, tile_size):
        self.wsi = wsi
        self.tile_size = tile_size
        self.coords = self.get_coords()
        self.transform = transforms.Resize((self.tile_size, self.tile_size))

    def read_coords(self):
        """Start a JVM, read the image size and return all tile corners."""
        javabridge.start_vm(class_path=bioformats.JARS)
        try:
            ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
            reader = ImageReader()
            # Fixed: use the instance's path and tile size, not module globals.
            reader.setId(self.wsi)
            coords = list(product(
                range(0, reader.getSizeX(), self.tile_size),
                range(0, reader.getSizeY(), self.tile_size)))
            reader.close()
        finally:
            javabridge.kill_vm()
        return coords

    def get_coords(self):
        """Run ``read_coords`` in a child process and return its result."""
        # Only one task is submitted, so a single worker is enough.
        with ProcessPoolExecutor(max_workers=1) as executor:
            future = executor.submit(self.read_coords)
        return future.result()

    def lazy_load(self):
        """Start the JVM, open the reader and read the image size."""
        javabridge.start_vm(class_path=bioformats.JARS)
        self.reader = bioformats.ImageReader(self.wsi)
        self.set_params()

    def __del__(self):
        # The VM is only started together with the reader (see lazy_load), so
        # an instance that never loaded has nothing to shut down.
        if hasattr(self, "reader"):
            self.reader.close()
            javabridge.kill_vm()

    def __len__(self):
        return len(self.coords)

    def __getitem__(self, idx):
        if not hasattr(self, "reader"):
            self.lazy_load()
        x, y = self.coords[idx]
        # Fixed: clip with the instance's tile size, not a module global.
        w = min(self.tile_size, self.width - x)
        h = min(self.tile_size, self.height - y)
        tile = self.reader.read(c=0, rescale=False, XYWH=(x, y, w, h))
        tensor = torch.tensor(tile, dtype=torch.uint8).reshape(h, w, 3).permute(2, 0, 1)
        return self.transform(tensor)

    def set_params(self):
        """Read the image size and recompute the tile coordinates."""
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(self.wsi)
        self.width = reader.getSizeX()
        self.height = reader.getSizeY()
        self.coords = list(product(
            range(0, self.width, self.tile_size),
            range(0, self.height, self.tile_size)))
        reader.close()


# Build the dataset and iterate over it with a 4-worker DataLoader wrapped in
# a tqdm progress bar.
dataset = DatasetWithBioformats(wsi, tile_size)
loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
with tqdm(loader) as t:
    for image in t:
        # In-place increment of each batch; the result is not used afterwards.
        image += 1
        pass

this worked.

@CDPDisk
Copy link

CDPDisk commented Nov 28, 2024

Hi, have you tried to compare the speed between javabridge with ImageReader and Openslide when reading WSI? I try to read as big as possible images for example 20480*20480 images and compare their speed. But I found that Javabridge is not faster, which I presume is due to the large amount of debug logs being output. Do you have a way to turn off these logs please.
I have mention it: CellProfiler/python-bioformats#171 (comment)

@tand826
Copy link
Owner Author

tand826 commented Nov 29, 2024

@CDPDisk Hi, thank you for your comment.
It's been a while, and I don't remember how to control the log outputs, but there are a few ideas to find the answer.

  • Is the log output really the core of the problem? How about using %%time in jupyter or any measuring ways to compare the specific part of the codes?
  • There is a overhead of starting java vm with javabridge. Can you measure the time after the overhead?
  • If you want to read images in parallel, the limitation that the number of vms in 1 process is 1 might cause slower reading speed. Can you try to read only 1 20480x20480 image?

I'm curious about your problem. If you find the answer, or some could be helpful for you, can you be kind enough to share the result with me?

@CDPDisk
Copy link

CDPDisk commented Nov 29, 2024

@CDPDisk Hi, thank you for your comment. It's been a while, and I don't remember how to control the log outputs, but there are a few ideas to find the answer.

  • Is the log output really the core of the problem? How about using %%time in jupyter or any measuring ways to compare the specific part of the codes?
  • There is a overhead of starting java vm with javabridge. Can you measure the time after the overhead?
  • If you want to read images in parallel, the limitation that the number of vms in 1 process is 1 might cause slower reading speed. Can you try to read only 1 20480x20480 image?

I'm curious about your problem. If you find the answer, or some could be helpful for you, can you be kind enough to share the result with me?

I calculated the time after overhead in the new code, but that didn't make an absolute difference in the results. At the moment I'm not doing parallel reads, both are using only one process as well as reading only 1 20480 x 20480 image.

I'm interested in your issue because you've mentioned that you achieved a read speed of 136.19it/s. But I can't achieve this with javabridge. I found out that I don't have this speed when reading with 512 size patches, and reading 512 size on SSD took about 60s for 1600 random patches, or 26.67it/s, which is not at all the same as your result. So I suspect the log output is seriously affecting my speed.

@tand826
Copy link
Owner Author

tand826 commented Nov 30, 2024

@CDPDisk
I tried to reproduce the speed above by suppressing the log output and got 137.17it/s. Does this reproduce the speed in your environment?

def supress_log():
    """Set the level of the slf4j root logger to logback's ``WARN``.

    Looks up the root logger through ``org.slf4j.LoggerFactory`` and calls
    its ``setLevel`` with the static ``Level.WARN`` field. Must be called
    after ``javabridge.start_vm``.
    """
    # https://github.com/CellProfiler/python-bioformats/issues/137#issuecomment-802313393
    # https://github.com/pskeshu/microscoper/blob/master/microscoper/io.py#L141-L162
    root_name = javabridge.get_static_field(
        "org/slf4j/Logger", "ROOT_LOGGER_NAME", "Ljava/lang/String;")
    root_logger = javabridge.static_call(
        "org/slf4j/LoggerFactory",
        "getLogger",
        "(Ljava/lang/String;)Lorg/slf4j/Logger;",
        root_name,
    )
    warn_level = javabridge.get_static_field(
        "ch/qos/logback/classic/Level", "WARN", "Lch/qos/logback/classic/Level;")
    javabridge.call(
        root_logger, "setLevel", "(Lch/qos/logback/classic/Level;)V", warn_level)
                    
def speed_javabridge():
    """Benchmark tile reading through Bio-Formats via javabridge.

    Reads the module-level ``wsi`` in tiles of the module-level ``tile_size``
    and converts every tile to a channel-first ``torch.uint8`` tensor; tqdm
    reports the throughput.
    """
    javabridge.start_vm(class_path=bioformats.JARS)
    try:
        supress_log()
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(wsi)
        # Ask the JVM for the image size once and reuse it.
        width = reader.getSizeX()
        height = reader.getSizeY()
        xs = range(0, width, tile_size)
        ys = range(0, height, tile_size)

        for x, y in tqdm(list(product(xs, ys)), desc="javabridge"):
            w, h = min(tile_size, width - x), min(tile_size, height - y)
            byte = reader.openBytes(0, x, y, w, h)
            tensor = torch.tensor(byte, dtype=torch.uint8).reshape(
                h, w, 3).permute(2, 0, 1)
        reader.close()
    finally:
        # Shut the JVM down even when reading fails part-way.
        javabridge.kill_vm()

For reference, I also did below, but none of them showed good result.

  • download the javabridge with the latest version of Apr 2023
  • inserted some python-bioformats based codes
  • used torch.utils.data.Dataset to run in parallel ( -> 18.85it/s for batch_size=16, 4.24it/s for batch_size=64. both are too faster than 136it/s)
  • referenced a openslide based code in wiki, to reproduce almost the same speed.

@CDPDisk
Copy link

CDPDisk commented Dec 3, 2024

@CDPDisk I tried to reproduce the speed above by suppressing the log output and got 137.17it/s. Does this reproduce the speed in your environment?

def supress_log():
    """Set the level of the slf4j root logger to logback's ``WARN``.

    Looks up the root logger through ``org.slf4j.LoggerFactory`` and calls
    its ``setLevel`` with the static ``Level.WARN`` field. Must be called
    after ``javabridge.start_vm``.
    """
    # https://github.com/CellProfiler/python-bioformats/issues/137#issuecomment-802313393
    # https://github.com/pskeshu/microscoper/blob/master/microscoper/io.py#L141-L162
    root_name = javabridge.get_static_field(
        "org/slf4j/Logger", "ROOT_LOGGER_NAME", "Ljava/lang/String;")
    root_logger = javabridge.static_call(
        "org/slf4j/LoggerFactory",
        "getLogger",
        "(Ljava/lang/String;)Lorg/slf4j/Logger;",
        root_name,
    )
    warn_level = javabridge.get_static_field(
        "ch/qos/logback/classic/Level", "WARN", "Lch/qos/logback/classic/Level;")
    javabridge.call(
        root_logger, "setLevel", "(Lch/qos/logback/classic/Level;)V", warn_level)
                    
def speed_javabridge():
    """Benchmark tile reading through Bio-Formats via javabridge.

    Reads the module-level ``wsi`` in tiles of the module-level ``tile_size``
    and converts every tile to a channel-first ``torch.uint8`` tensor; tqdm
    reports the throughput.
    """
    javabridge.start_vm(class_path=bioformats.JARS)
    try:
        supress_log()
        ImageReader = javabridge.JClassWrapper("loci.formats.ImageReader")
        reader = ImageReader()
        reader.setId(wsi)
        # Ask the JVM for the image size once and reuse it.
        width = reader.getSizeX()
        height = reader.getSizeY()
        xs = range(0, width, tile_size)
        ys = range(0, height, tile_size)

        for x, y in tqdm(list(product(xs, ys)), desc="javabridge"):
            w, h = min(tile_size, width - x), min(tile_size, height - y)
            byte = reader.openBytes(0, x, y, w, h)
            tensor = torch.tensor(byte, dtype=torch.uint8).reshape(
                h, w, 3).permute(2, 0, 1)
        reader.close()
    finally:
        # Shut the JVM down even when reading fails part-way.
        javabridge.kill_vm()

For reference, I also did below, but none of them showed good result.

  • download the javabridge with the latest version of Apr 2023
  • inserted some python-bioformats based codes
  • used torch.utils.data.Dataset to run in parallel ( -> 18.85it/s for batch_size=16, 4.24it/s for batch_size=64. both are too faster than 136it/s)
  • referenced a openslide based code in wiki, to reproduce almost the same speed.

Thanks for the code, it did stop outputting the log, but I found it wasn't as fast as openslide. So I tried different things mainly with openslide, including reading different files, different sizes (512, 5120 and 10240, then subdivided into 512), different types of hard disks (SSD and HDD), different ways of reading (dz in the type column is deepzoom from the wiki, openslide is openslide.read_region), and measured the read speed in it/s for 512 size patches.
I will share the result file.
record.csv

These are some conclusions based on my understanding, but I don't know if they are correct:

  • Reading data is enough to read the patches in order, there is no need to read larger image chunks at once and then tile them
  • The data read speed is related to the data format. Data reading speed is related to the format. Standardized data is faster to read, some scanners get data that is not standardized, and the data is slower to read.
  • Multi-process is effective. However, the multi-process dataloader under windows when using openslide will generate errors, you need to manually set getstate and setstate to avoid the error
  • openslide use read_region function is faster than deepzoom read speed
  • (Confusing) There is no significant speed difference between HDD and SSD reads. I'm not sure what's causing this

This is my test code:

from openslide import OpenSlide
import openslide
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from openslide.deepzoom import DeepZoomGenerator
from tqdm import tqdm
from einops import rearrange
from pathlib import Path
import time
import pandas as pd

class DatasetWithOpenSlide(Dataset):
    """Benchmark dataset that reads random tiles of a slide with OpenSlide.

    Args:
        wsi: Path of the whole slide image.
        type: ``'dz'`` reads through ``DeepZoomGenerator.get_tile``,
            ``'openslide'`` through ``OpenSlide.read_region``.
        tile_size: Edge length of the region read per item. For sizes other
            than 512 the region is padded and split into 512x512 patches.
    """

    def __init__(self, wsi, type, tile_size):
        self.wsi_path = wsi
        self.wsi = openslide.OpenSlide(wsi)
        self.tile_size = tile_size
        self.dz = DeepZoomGenerator(
            self.wsi,
            tile_size=self.tile_size,
            overlap=0)
        self.tiles_x, self.tiles_y = self.dz.level_tiles[-1]
        self.deepest_layer = self.dz.level_count - 1
        self.type = type

    def __len__(self):
        return self.tiles_x * self.tiles_y

    def __getitem__(self, idx):
        # The requested index is replaced by a random tile index.
        idx = np.random.randint(0, self.tiles_x * self.tiles_y)
        x = idx % self.tiles_x
        y = idx // self.tiles_x
        if self.type == 'dz':
            try:
                img = np.array(self.dz.get_tile(self.deepest_layer, (x, y)))
            except Exception:
                # Fall back to an empty tile when the tile cannot be read.
                img = np.zeros((512, 512, 3))
        elif self.type == 'openslide':
            img = np.array(self.wsi.read_region((x*self.tile_size, y*self.tile_size), 0, (self.tile_size, self.tile_size)))
        else:
            raise ValueError('type should be dz or openslide')

        if self.tile_size != 512:
            try:
                img = torch.tensor(img)
                # Fixed: pad only up to the next multiple of 512. The former
                # `512 - n % 512` added a full extra row/column of patches
                # when the size was already a multiple of 512.
                pad_h = (-img.shape[0]) % 512
                pad_w = (-img.shape[1]) % 512
                # F.pad takes pairs from the last dimension backwards:
                # channels, then width, then height.
                padding = (0, 0, 0, pad_w, 0, pad_h)
                img = torch.nn.functional.pad(img, padding, mode='constant', value=0)
                img = rearrange(img.unfold(0, 512, 512).unfold(1, 512, 512), 'nh nw h w c -> (nh nw) h w c')
            except Exception:
                # Best effort: return whatever `img` is at this point.
                pass
            return img
        return torch.tensor(img).unsqueeze(0)

    def __getstate__(self):
        # Fixed: drop the unpicklable handles from a copy, so pickling the
        # dataset no longer removes `wsi` / `dz` from the live object.
        state = self.__dict__.copy()
        state.pop('wsi', None)
        state.pop('dz', None)
        return state

    def __setstate__(self, state):
        # Re-open the slide and rebuild the DeepZoom generator after unpickling.
        state['wsi'] = openslide.OpenSlide(state['wsi_path'])
        state['dz'] = DeepZoomGenerator(
            state['wsi'],
            tile_size=state['tile_size'],
            overlap=0)
        self.__dict__.update(state)

if __name__ == '__main__':
    # Benchmark every (tile_size, num_workers, disk, file, read type)
    # combination and write the measured speeds to record.csv.
    batch_size = 1
    # Directory holding the test slides on each kind of disk.
    test_dirs = {
        'HDD': Path(r'E:/FUSCC2_data/read_test/'),
        'SSD': Path(r'C:/Users/Administrator.DESKTOP-JADQ414/Desktop/data_example/swinmil/read_test'),
    }
    record = {'file': [],
              'disk': [],
              'tile_size': [],
              'num_workers': [],
              'speed': [],
              'type': []}

    def speed_DataLoaderWithOpenSlide(wsi_path, read_type, tile_size, num_worker, test_disk):
        """Time one pass over a slide and append the result to ``record``."""
        con = 0
        dataset = DatasetWithOpenSlide(wsi_path, type=read_type, tile_size=tile_size)
        start_time = time.time()
        data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_worker)
        for img in tqdm(data_loader):
            # Accumulate the second dimension of every batch.
            con += img.shape[1]

        # Compute the speed once so the printed and recorded values agree.
        speed = con / (time.time() - start_time)
        print(f'file:{wsi_path.name}, disk: {test_disk}, tile_size:{tile_size}, num_workers: {num_worker}, speed: {speed}')
        record['file'].append(wsi_path.name)
        record['disk'].append(test_disk)
        record['tile_size'].append(tile_size)
        record['num_workers'].append(num_worker)
        record['type'].append(read_type)
        record['speed'].append(speed)

    # Fixed: the file / type loops used to sit inside the SSD branch only and
    # the benchmark was called once after all loops; run it per combination.
    for tile_size in [512, 512*10, 512*20]:
        for num_worker in [1, 4]:
            for test_disk in ['HDD', 'SSD']:
                for wsi_path in test_dirs[test_disk].glob('*'):
                    for read_type in ['dz', 'openslide']:
                        speed_DataLoaderWithOpenSlide(
                            wsi_path, read_type, tile_size, num_worker, test_disk)

    pd.DataFrame(record).to_csv('record.csv', index=False)

@tand826
Copy link
Owner Author

tand826 commented Dec 4, 2024

@CDPDisk
Thank you for the records and insightful analyses. I also explored the data and found some points with the code below.

  • with num_workers==4, SSD is more likely to be faster than HDD comparing to num_workers=1.
  • with type=="dz", there are more difference compared to type=="openslide".

As you have mentioned, reading speed depends on the data format because the numbers of layers in a whole slide image data pyramid are different. I'm not sure, but SVS (Aperio) has 3 layers while NDPI (Hamamatsu) has almost 10 for a same image of 50000x50000. This leads slower speed for SVS because it has less indexed layer to load from. Please see level_dimensions of openslide and set the tile_size to some number near to the dimensions output. Also this might be one of the reasons of no significant difference between HDD and SSD.
Anyway, if you need to repeat a number of experiments without changing the coordinates of tiles, I recommend that you tile all the images first and load them from pre-extracted image file for 2x faster loading speed. wsiprocess can be good tool for this purpose :)

# helper

# Sorted unique file names, used as the first column of the comparison tables.
# Fixed: `df` only exists inside show_diff, so read the records here instead.
files = pd.Series(sorted(pd.read_csv("record.csv").file.unique()), name="file")
def show_diff(tile_size, type, num_workers, rename):
    """Return the SSD-minus-HDD speed difference for one configuration.

    Reads ``record.csv``, keeps the rows matching ``type``, ``tile_size`` and
    ``num_workers``, orders each disk's rows by file name and subtracts the
    HDD speeds from the SSD speeds position by position.

    Args:
        tile_size: Value of the ``tile_size`` column to select.
        type: Value of the ``type`` column to select.
        num_workers: Value of the ``num_workers`` column to select.
        rename: Name given to the returned Series.

    Returns:
        A Series of speed differences, indexed from zero.
    """
    records = pd.read_csv("record.csv")
    selected = (
        (records.type == type)
        & (records.tile_size == tile_size)
        & (records.num_workers == num_workers)
    )
    subset = records[selected].reset_index(drop=True)

    def speeds_on(disk):
        # Speeds of one disk ordered by file name and re-indexed from zero,
        # so the two disks line up positionally.
        rows = subset[subset.disk == disk].sort_values(by="file")
        return rows.speed.reset_index(drop=True)

    return (speeds_on("SSD") - speeds_on("HDD")).rename(rename)

# pd.concat([
#     files,
#     show_diff(512, "openslide", 1, "t512-w1"),
#     show_diff(5120, "openslide", 1, "t5120-w1"),
#     show_diff(10240, "openslide", 1, "t10240-w1"),
#     show_diff(512, "openslide", 4, "t512-w4"),
#     show_diff(5120, "openslide", 4, "t5120-w4"),
#     show_diff(10240, "openslide", 4, "t10240-w4")
# ], axis=1)

|   |            files |   t512-w1 |  t5120-w1 |  t10240-w1 |    t512-w4 |   t5120-w4 | t10240-w4 |
|--:|-----------------:|----------:|----------:|-----------:|-----------:|-----------:|----------:|
| 0 |   2016-27600.svs | -0.719293 | -0.040872 |  -0.941112 | -10.437483 | -35.053484 |  9.573670 |
| 1 |  2017-16772.ndpi |  2.660345 |  0.411925 |  -6.603403 |  -3.760096 | -52.624435 | 23.775951 |
| 2 |  2018-56178.ndpi | -1.588323 | -2.843337 |   0.591185 |  -8.121937 | -41.055469 | -1.777307 |
| 3 |  2018-57913.ndpi |  0.760055 | -4.158040 |  -2.688662 |  -1.053764 | -56.187543 | -8.596195 |
| 4 |   2019-31077.svs |  0.324398 |  0.410919 |   0.328029 |   3.259479 | -18.384691 | -1.016658 |
| 5 |  D2020-13525.svs |  0.653814 | -0.387489 |  -1.347961 |  -4.567287 |   6.325705 |  0.492831 |
| 6 |  D2021-15343.svs | -0.555875 | -0.341506 |  -1.482710 |   6.321090 |   6.675624 |  2.414112 |
| 7 | T2018-05476.ndpi | -0.964013 | -1.661222 | -15.845805 |  21.999301 |  -5.834994 |  1.447804 |

# pd.concat([
#     files,
#     show_diff(512, "dz", 1, "t512-w1"),
#     show_diff(5120, "dz", 1, "t5120-w1"),
#     show_diff(10240, "dz", 1, "t10240-w1"),
#     show_diff(512, "dz", 4, "t512-w4"),
#     show_diff(5120, "dz", 4, "t5120-w4"),
#     show_diff(10240, "dz", 4, "t10240-w4")
# ], axis=1)

|   |             file |    t512-w1 |  t5120-w1 | t10240-w1 |    t512-w4 |   t5120-w4 |  t10240-w4 |
|--:|-----------------:|-----------:|----------:|----------:|-----------:|-----------:|-----------:|
| 0 |   2016-27600.svs |   0.092700 | 14.292636 |  9.629455 |  80.693135 |  50.028105 |  61.569070 |
| 1 |  2017-16772.ndpi |  40.957503 |  2.934863 | -0.369446 |   9.006010 |   8.737644 |  31.673067 |
| 2 |  2018-56178.ndpi |   9.865274 | 43.074879 | 59.077770 |  20.562455 | 113.167572 | 250.569624 |
| 3 |  2018-57913.ndpi |   7.887157 |  4.709428 |  2.230740 |  18.212027 |  25.220513 |  27.763726 |
| 4 |   2019-31077.svs |  58.328462 | 26.117867 | 15.106393 | 298.646397 | 180.995066 | 143.793427 |
| 5 |  D2020-13525.svs |  59.747630 | 25.610529 | 14.544780 | 293.596099 | 145.770164 | 137.956814 |
| 6 |  D2021-15343.svs |  53.866016 | 19.862468 |  9.989578 | 273.043651 | 146.639732 | 102.914911 |
| 7 | T2018-05476.ndpi | -24.659575 | -0.636017 | -3.715397 |  19.266292 | -10.067838 |   5.670925 |

@CDPDisk
Copy link

CDPDisk commented Dec 4, 2024

@CDPDisk Thank you for the records and insightful analyses. I also explored the data and found some points with the code below.

  • with num_workers==4, SSD is more likely to be faster than HDD comparing to num_workers=1.
  • with type=="dz", there are more difference compared to type=="openslide".

As you have mentioned, reading speed depends on the data format because the numbers of layers in a whole slide image data pyramid are different. I'm not sure, but SVS (Aperio) has 3 layers while NDPI (Hamamatsu) has almost 10 for a same image of 50000x50000. This leads slower speed for SVS because it has less indexed layer to load from. Please see level_dimensions of openslide and set the tile_size to some number near to the dimensions output. Also this might be one of the reasons of no significant difference between HDD and SSD. Anyway, if you need to repeat a number of experiments without changing the coordinates of tiles, I recommend that you tile all the images first and load them from pre-extracted image file for 2x faster loading speed. wsiprocess can be good tool for this purpose :)

# helper

# Sorted unique file names, used as the first column of the comparison tables.
# Fixed: `df` only exists inside show_diff, so read the records here instead.
files = pd.Series(sorted(pd.read_csv("record.csv").file.unique()), name="file")
def show_diff(tile_size, type, num_workers, rename):
    """Return the SSD-minus-HDD speed difference for one configuration.

    Reads ``record.csv``, keeps the rows matching ``type``, ``tile_size`` and
    ``num_workers``, orders each disk's rows by file name and subtracts the
    HDD speeds from the SSD speeds position by position.

    Args:
        tile_size: Value of the ``tile_size`` column to select.
        type: Value of the ``type`` column to select.
        num_workers: Value of the ``num_workers`` column to select.
        rename: Name given to the returned Series.

    Returns:
        A Series of speed differences, indexed from zero.
    """
    records = pd.read_csv("record.csv")
    selected = (
        (records.type == type)
        & (records.tile_size == tile_size)
        & (records.num_workers == num_workers)
    )
    subset = records[selected].reset_index(drop=True)

    def speeds_on(disk):
        # Speeds of one disk ordered by file name and re-indexed from zero,
        # so the two disks line up positionally.
        rows = subset[subset.disk == disk].sort_values(by="file")
        return rows.speed.reset_index(drop=True)

    return (speeds_on("SSD") - speeds_on("HDD")).rename(rename)

# pd.concat([
#     files,
#     show_diff(512, "openslide", 1, "t512-w1"),
#     show_diff(5120, "openslide", 1, "t5120-w1"),
#     show_diff(10240, "openslide", 1, "t10240-w1"),
#     show_diff(512, "openslide", 4, "t512-w4"),
#     show_diff(5120, "openslide", 4, "t5120-w4"),
#     show_diff(10240, "openslide", 4, "t10240-w4")
# ], axis=1)

|   |            files |   t512-w1 |  t5120-w1 |  t10240-w1 |    t512-w4 |   t5120-w4 | t10240-w4 |
|--:|-----------------:|----------:|----------:|-----------:|-----------:|-----------:|----------:|
| 0 |   2016-27600.svs | -0.719293 | -0.040872 |  -0.941112 | -10.437483 | -35.053484 |  9.573670 |
| 1 |  2017-16772.ndpi |  2.660345 |  0.411925 |  -6.603403 |  -3.760096 | -52.624435 | 23.775951 |
| 2 |  2018-56178.ndpi | -1.588323 | -2.843337 |   0.591185 |  -8.121937 | -41.055469 | -1.777307 |
| 3 |  2018-57913.ndpi |  0.760055 | -4.158040 |  -2.688662 |  -1.053764 | -56.187543 | -8.596195 |
| 4 |   2019-31077.svs |  0.324398 |  0.410919 |   0.328029 |   3.259479 | -18.384691 | -1.016658 |
| 5 |  D2020-13525.svs |  0.653814 | -0.387489 |  -1.347961 |  -4.567287 |   6.325705 |  0.492831 |
| 6 |  D2021-15343.svs | -0.555875 | -0.341506 |  -1.482710 |   6.321090 |   6.675624 |  2.414112 |
| 7 | T2018-05476.ndpi | -0.964013 | -1.661222 | -15.845805 |  21.999301 |  -5.834994 |  1.447804 |

# pd.concat([
#     files,
#     show_diff(512, "dz", 1, "t512-w1"),
#     show_diff(5120, "dz", 1, "t5120-w1"),
#     show_diff(10240, "dz", 1, "t10240-w1"),
#     show_diff(512, "dz", 4, "t512-w4"),
#     show_diff(5120, "dz", 4, "t5120-w4"),
#     show_diff(10240, "dz", 4, "t10240-w4")
# ], axis=1)

|   |             file |    t512-w1 |  t5120-w1 | t10240-w1 |    t512-w4 |   t5120-w4 |  t10240-w4 |
|--:|-----------------:|-----------:|----------:|----------:|-----------:|-----------:|-----------:|
| 0 |   2016-27600.svs |   0.092700 | 14.292636 |  9.629455 |  80.693135 |  50.028105 |  61.569070 |
| 1 |  2017-16772.ndpi |  40.957503 |  2.934863 | -0.369446 |   9.006010 |   8.737644 |  31.673067 |
| 2 |  2018-56178.ndpi |   9.865274 | 43.074879 | 59.077770 |  20.562455 | 113.167572 | 250.569624 |
| 3 |  2018-57913.ndpi |   7.887157 |  4.709428 |  2.230740 |  18.212027 |  25.220513 |  27.763726 |
| 4 |   2019-31077.svs |  58.328462 | 26.117867 | 15.106393 | 298.646397 | 180.995066 | 143.793427 |
| 5 |  D2020-13525.svs |  59.747630 | 25.610529 | 14.544780 | 293.596099 | 145.770164 | 137.956814 |
| 6 |  D2021-15343.svs |  53.866016 | 19.862468 |  9.989578 | 273.043651 | 146.639732 | 102.914911 |
| 7 | T2018-05476.ndpi | -24.659575 | -0.636017 | -3.715397 |  19.266292 | -10.067838 |   5.670925 |

Thanks for your reply. They are helpful!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants