20
20
from __future__ import annotations
21
21
22
22
from abc import ABC , abstractmethod
23
+ from collections .abc import Iterable , Iterator , Sequence
24
+ from contextlib import ExitStack , contextmanager
25
+ import copy
23
26
from dataclasses import dataclass
27
+ from functools import cached_property
28
+ from itertools import zip_longest
24
29
import os
25
30
from pathlib import Path , PurePath
26
31
import re
27
32
import tarfile
33
+ import tempfile
28
34
import time
29
35
from types import TracebackType
30
- from typing import BinaryIO , Self
36
+ from typing import BinaryIO , Self , cast
31
37
import zipfile
32
38
33
39
34
40
@dataclass
class Member(ABC):
    """Base class for one entry of an archive.

    ``path`` is the entry's path inside the archive.  Its first component
    is treated as the archive base by ``relpath`` and ``with_base``.
    """

    path: PurePath

    @property
    def relpath(self) -> PurePath:
        """Return ``path`` with its first component dropped."""
        tail = self.path.parts[1:]
        return PurePath(*tail)

    def with_base(self, base: PurePath) -> Member:
        """Return a shallow copy of this member re-rooted under ``base``.

        The receiver is not modified; only the copy's ``path`` is replaced
        with ``base`` joined to the member's ``relpath``.
        """
        clone = copy.copy(self)
        relative = clone.relpath
        clone.path = base / relative
        return clone
52
+
38
53
39
54
@dataclass
40
55
class FileMember (Member ):
@@ -53,7 +68,7 @@ class SymlinkMember(Member):
53
68
54
69
class ArchiveWriter (ABC ):
55
70
def __init__(self, path: Path):
    """Prepare a writer for the archive at ``path``.

    ``base`` is derived from the archive's file name via ``_path_base``;
    ``_members`` starts out empty and is keyed by member path.
    """
    self._members: dict[PurePath, Member] = {}
    self.base = _path_base(path)
58
73
59
74
def __enter__ (self ) -> Self :
@@ -163,3 +178,101 @@ def close(self) -> None:
163
178
elif isinstance (member , SymlinkMember ):
164
179
raise Exception ('Symlinks not supported in Zip' )
165
180
self ._zip .close ()
181
+
182
+
183
+ class ArchiveReader (ABC ):
184
+ def __init__ (self , path : Path ):
185
+ self .base = _path_base (path )
186
+ self ._tempdir = tempfile .TemporaryDirectory (prefix = 'openslide-bin-' )
187
+ self ._dir = Path (self ._tempdir .name )
188
+
189
+ @classmethod
190
+ @contextmanager
191
+ def group (cls , fhs : Iterable [BinaryIO ]) -> Iterator [Iterator [MemberSet ]]:
192
+ with ExitStack () as stack :
193
+ readers = [
194
+ # mypy thinks we're initializing this ABC, not a subclass
195
+ stack .enter_context (cls (fh )) # type: ignore[arg-type]
196
+ for fh in fhs
197
+ ]
198
+ yield (MemberSet (members ) for members in zip_longest (* readers ))
199
+
200
+ def __enter__ (self ) -> Self :
201
+ return self
202
+
203
+ def __exit__ (
204
+ self ,
205
+ exc_type : type [BaseException ] | None ,
206
+ exc_val : BaseException | None ,
207
+ exc_tb : TracebackType | None ,
208
+ ) -> None :
209
+ self .close ()
210
+
211
+ @abstractmethod
212
+ def close (self ) -> None :
213
+ self ._tempdir .cleanup ()
214
+
215
+ @abstractmethod
216
+ def __iter__ (self ) -> Iterator [Member ]:
217
+ pass
218
+
219
+
220
class TarArchiveReader(ArchiveReader):
    """ArchiveReader for a tar archive supplied as an open binary file.

    Regular files are extracted into the reader's temporary directory and
    handed out as open file objects; directories and symlinks are reported
    from the tar metadata alone.
    """

    def __init__(self, fh: BinaryIO):
        """Open ``fh`` as a tar archive.

        ``fh.name`` is passed to the base class as the archive path.

        Raises:
            Whatever ``tarfile.open`` raises; in that case the temporary
            directory created by the base class is removed first.
        """
        super().__init__(Path(fh.name))
        try:
            self._tar = tarfile.open(fileobj=fh)
        except BaseException:
            # The base class already created the temporary directory;
            # remove it now rather than leaving it for garbage collection.
            self._tempdir.cleanup()
            raise
        if hasattr(tarfile, 'data_filter'):
            # Use the 'data' extraction filter on Pythons that provide it
            self._tar.extraction_filter = tarfile.data_filter

    def close(self) -> None:
        """Close the tar file and remove the temporary directory."""
        try:
            self._tar.close()
        finally:
            # Always remove the temporary directory, even if closing the
            # tar file fails.
            super().close()

    def __iter__(self) -> Iterator[Member]:
        """Yield a Member for each remaining entry of the tar stream.

        File handles opened for ``FileMember`` objects are handed to the
        consumer and are not closed by this reader.

        Raises:
            Exception: for entry types other than directory, regular file
                and symlink.
        """
        while (info := self._tar.next()) is not None:
            path = PurePath(info.name)
            if info.type == tarfile.DIRTYPE:
                yield DirMember(path)
            elif info.type == tarfile.REGTYPE:
                # Extract to disk so the member can expose a real file
                self._tar.extract(info, self._dir)
                yield FileMember(path, open(self._dir / path, 'rb'))
            elif info.type == tarfile.SYMTYPE:
                yield SymlinkMember(path, PurePath(info.linkname))
            else:
                raise Exception(
                    f'Unsupported member type: {info.type.decode()}'
                )
248
+
249
+
250
class MemberSet:
    """The corresponding members taken from each archive of a group.

    Holds one member per archive, in the order supplied, and cannot be
    constructed while any slot is empty.
    """

    def __init__(self, members: Sequence[Member | None]):
        """Store ``members`` after checking that every slot is filled.

        Raises:
            Exception: if any entry is falsy (e.g. ``None``).
        """
        if any(not member for member in members):
            raise Exception('Missing member in one or more archives')
        self.members = cast(Sequence[Member], members)

    def __getitem__(self, idx: int) -> Member:
        return self.members[idx]

    def __iter__(self) -> Iterator[Member]:
        return iter(self.members)

    @property
    def relpaths(self) -> Sequence[PurePath]:
        """Return each member's ``relpath``, in member order."""
        paths = []
        for entry in self:
            paths.append(entry.relpath)
        return paths

    @cached_property
    def datas(self) -> Sequence[bytes]:
        """Return the full contents of every member, in member order.

        Each file handle is read to the end and then rewound to offset 0.
        The result is computed once and cached on the instance.

        Raises:
            Exception: if a member is not a ``FileMember``.
        """
        blobs = []
        for entry in self:
            if not isinstance(entry, FileMember):
                raise Exception('Member is not a file')
            stream = entry.fh
            blobs.append(stream.read())
            # Rewind so the handle can be read again by later consumers
            stream.seek(0)
        return blobs
275
+
276
+
277
def _path_base(path: Path) -> PurePath:
    """Return the file name of ``path`` minus a trailing .tar.xz or .zip.

    Any other file name is returned unchanged, as a ``PurePath``.
    """
    stem = re.sub('\\.(tar\\.xz|zip)$', '', path.name)
    return PurePath(stem)
0 commit comments