
 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

-READ, WRITE = 1, 2
+READ = 'rb'
+WRITE = 'wb'

 _COMPRESS_LEVEL_FAST = 1
 _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9

+READ_BUFFER_SIZE = 128 * 1024
+_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE
+

 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
          encoding=None, errors=None, newline=None):
@@ -118,6 +122,21 @@ class BadGzipFile(OSError):
     """Exception raised in some cases for invalid gzip files."""


+class _WriteBufferStream(io.RawIOBase):
+    """Minimal object to pass WriteBuffer flushes into GzipFile"""
+    def __init__(self, gzip_file):
+        self.gzip_file = gzip_file
+
+    def write(self, data):
+        return self.gzip_file._write_raw(data)
+
+    def seekable(self):
+        return False
+
+    def writable(self):
+        return True
+
+
 class GzipFile(_compression.BaseStream):
     """The GzipFile class simulates most of the methods of a file object with
     the exception of the truncate() method.
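Note (not part of the diff): the new class works because io.BufferedWriter accepts any writable raw stream and hands it whole buffered blocks when it flushes. A minimal standalone sketch of the same forwarding pattern, using the hypothetical name _ForwardingStream and a plain callback as the sink:

    import io

    class _ForwardingStream(io.RawIOBase):
        """Toy raw stream: hands every flushed block to a callback."""
        def __init__(self, sink):
            self.sink = sink                # any callable accepting bytes

        def writable(self):
            return True

        def write(self, data):
            self.sink(bytes(data))          # BufferedWriter may pass a memoryview
            return len(data)

    blocks = []
    buffered = io.BufferedWriter(_ForwardingStream(blocks.append), buffer_size=16)
    buffered.write(b"a" * 10)               # coalesced in the buffer
    buffered.write(b"b" * 10)               # exceeds 16 bytes, so data reaches the sink
    buffered.flush()
    assert b"".join(blocks) == b"a" * 10 + b"b" * 10

In GzipFile the sink is _write_raw(), so many small write() calls are batched before they reach the compressor.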
@@ -160,9 +179,10 @@ def __init__(self, filename=None, mode=None,
         and 9 is slowest and produces the most compression. 0 is no compression
         at all. The default is 9.

-        The mtime argument is an optional numeric timestamp to be written
-        to the last modification time field in the stream when compressing.
-        If omitted or None, the current time is used.
+        The optional mtime argument is the timestamp requested by gzip. The time
+        is in Unix format, i.e., seconds since 00:00:00 UTC, January 1, 1970.
+        If mtime is omitted or None, the current time is used. Use mtime = 0
+        to generate a compressed stream that does not depend on creation time.

         """

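Note (not part of the diff): a short sketch of the mtime behaviour documented above. With mtime=0 the stream is byte-for-byte reproducible; with the default None the current time lands in the header:

    import gzip, io

    def pack(payload, mtime):
        buf = io.BytesIO()
        with gzip.GzipFile(fileobj=buf, mode="wb", mtime=mtime) as f:
            f.write(payload)
        return buf.getvalue()

    # Fixed timestamp: identical output on every run.
    assert pack(b"hello", mtime=0) == pack(b"hello", mtime=0)
    # mtime=None stores time.time(), so archives created in different seconds
    # differ in header bytes 4-7 even though the payload is identical.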
@@ -182,6 +202,7 @@ def __init__(self, filename=None, mode=None,
         if mode is None:
             mode = getattr(fileobj, 'mode', 'rb')

+
         if mode.startswith('r'):
             self.mode = READ
             raw = _GzipReader(fileobj)
@@ -204,6 +225,9 @@ def __init__(self, filename=None, mode=None,
                                              zlib.DEF_MEM_LEVEL,
                                              0)
             self._write_mtime = mtime
+            self._buffer_size = _WRITE_BUFFER_SIZE
+            self._buffer = io.BufferedWriter(_WriteBufferStream(self),
+                                             buffer_size=self._buffer_size)
         else:
             raise ValueError("Invalid mode: {!r}".format(mode))

@@ -212,14 +236,6 @@ def __init__(self, filename=None, mode=None,
         if self.mode == WRITE:
             self._write_gzip_header(compresslevel)

-    @property
-    def filename(self):
-        import warnings
-        warnings.warn("use the name attribute", DeprecationWarning, 2)
-        if self.mode == WRITE and self.name[-3:] != ".gz":
-            return self.name + ".gz"
-        return self.name
-
     @property
     def mtime(self):
         """Last modification time read from stream, or None"""
@@ -237,6 +253,11 @@ def _init_write(self, filename):
         self.bufsize = 0
         self.offset = 0  # Current file offset for seek(), tell(), etc

+    def tell(self):
+        self._check_not_closed()
+        self._buffer.flush()
+        return super().tell()
+
     def _write_gzip_header(self, compresslevel):
         self.fileobj.write(b'\037\213')             # magic header
         self.fileobj.write(b'\010')                 # compression method
@@ -278,6 +299,10 @@ def write(self,data):
         if self.fileobj is None:
             raise ValueError("write() on closed GzipFile object")

+        return self._buffer.write(data)
+
+    def _write_raw(self, data):
+        # Called by our self._buffer underlying WriteBufferStream.
         if isinstance(data, (bytes, bytearray)):
             length = len(data)
         else:
@@ -326,18 +351,19 @@ def closed(self):

     def close(self):
         fileobj = self.fileobj
-        if fileobj is None:
+        if fileobj is None or self._buffer.closed:
             return
-        self.fileobj = None
         try:
             if self.mode == WRITE:
+                self._buffer.flush()
                 fileobj.write(self.compress.flush())
                 write32u(fileobj, self.crc)
                 # self.size may exceed 2 GiB, or even 4 GiB
                 write32u(fileobj, self.size & 0xffffffff)
             elif self.mode == READ:
                 self._buffer.close()
         finally:
+            self.fileobj = None
             myfileobj = self.myfileobj
             if myfileobj:
                 self.myfileobj = None
@@ -346,6 +372,7 @@ def close(self):
     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
         self._check_not_closed()
         if self.mode == WRITE:
+            self._buffer.flush()
             # Ensure the compressor's buffer is flushed
             self.fileobj.write(self.compress.flush(zlib_mode))
             self.fileobj.flush()
@@ -376,6 +403,9 @@ def seekable(self):

     def seek(self, offset, whence=io.SEEK_SET):
         if self.mode == WRITE:
+            self._check_not_closed()
+            # Flush buffer to ensure validity of self.offset
+            self._buffer.flush()
             if whence != io.SEEK_SET:
                 if whence == io.SEEK_CUR:
                     offset = self.offset + offset
@@ -384,10 +414,10 @@ def seek(self, offset, whence=io.SEEK_SET):
             if offset < self.offset:
                 raise OSError('Negative seek in write mode')
             count = offset - self.offset
-            chunk = b'\0' * 1024
-            for i in range(count // 1024):
+            chunk = b'\0' * self._buffer_size
+            for i in range(count // self._buffer_size):
                 self.write(chunk)
-            self.write(b'\0' * (count % 1024))
+            self.write(b'\0' * (count % self._buffer_size))
         elif self.mode == READ:
             self._check_not_closed()
             return self._buffer.seek(offset, whence)
@@ -454,7 +484,7 @@ def _read_gzip_header(fp):

 class _GzipReader(_compression.DecompressReader):
     def __init__(self, fp):
-        super().__init__(_PaddedFile(fp), zlib.decompressobj,
+        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                          wbits=-zlib.MAX_WBITS)
         # Set flag indicating start of a new member
         self._new_member = True
@@ -502,12 +532,13 @@ def read(self, size=-1):
                 self._new_member = False

             # Read a chunk of data from the file
-            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
+            if self._decompressor.needs_input:
+                buf = self._fp.read(READ_BUFFER_SIZE)
+                uncompress = self._decompressor.decompress(buf, size)
+            else:
+                uncompress = self._decompressor.decompress(b"", size)

-            uncompress = self._decompressor.decompress(buf, size)
-            if self._decompressor.unconsumed_tail != b"":
-                self._fp.prepend(self._decompressor.unconsumed_tail)
-            elif self._decompressor.unused_data != b"":
+            if self._decompressor.unused_data != b"":
                 # Prepend the already read bytes to the fileobj so they can
                 # be seen by _read_eof() and _read_gzip_header()
                 self._fp.prepend(self._decompressor.unused_data)
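Note (not part of the diff): zlib._ZlibDecompressor is a private helper, but the needs_input/max_length loop above follows the same protocol that bz2.BZ2Decompressor and lzma.LZMADecompressor expose publicly. A rough standalone sketch of that read loop using the public bz2 API (stream_decompress is a hypothetical helper name):

    import bz2, io

    def stream_decompress(fp, chunk_size=128 * 1024):
        """Yield decompressed blocks, reading input only when it is needed."""
        d = bz2.BZ2Decompressor()
        buf = None
        while not d.eof:
            if d.needs_input:
                buf = fp.read(chunk_size)
                out = d.decompress(buf, max_length=64 * 1024)
            else:
                # Drain output buffered from earlier input before reading again.
                out = d.decompress(b"", max_length=64 * 1024)
            if out:
                yield out
            elif buf == b"":
                raise EOFError("compressed stream ended prematurely")

    blob = bz2.compress(b"x" * 1_000_000)
    assert b"".join(stream_decompress(io.BytesIO(blob))) == b"x" * 1_000_000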
@@ -518,14 +549,11 @@ def read(self, size=-1):
                 raise EOFError("Compressed file ended before the "
                                "end-of-stream marker was reached")

-        self._add_read_data( uncompress )
+        self._crc = zlib.crc32(uncompress, self._crc)
+        self._stream_size += len(uncompress)
         self._pos += len(uncompress)
         return uncompress

-    def _add_read_data(self, data):
-        self._crc = zlib.crc32(data, self._crc)
-        self._stream_size = self._stream_size + len(data)
-
     def _read_eof(self):
         # We've read to the end of the file
         # We check that the computed CRC and size of the
@@ -552,43 +580,21 @@ def _rewind(self):
         self._new_member = True


-def _create_simple_gzip_header(compresslevel: int,
-                               mtime = None) -> bytes:
-    """
-    Write a simple gzip header with no extra fields.
-    :param compresslevel: Compresslevel used to determine the xfl bytes.
-    :param mtime: The mtime (must support conversion to a 32-bit integer).
-    :return: A bytes object representing the gzip header.
-    """
-    if mtime is None:
-        mtime = time.time()
-    if compresslevel == _COMPRESS_LEVEL_BEST:
-        xfl = 2
-    elif compresslevel == _COMPRESS_LEVEL_FAST:
-        xfl = 4
-    else:
-        xfl = 0
-    # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
-    # fields added to header), mtime, xfl and os (255 for unknown OS).
-    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
-
-
-def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
+def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=0):
     """Compress data in one shot and return the compressed string.

     compresslevel sets the compression level in range of 0-9.
-    mtime can be used to set the modification time. The modification time is
-    set to the current time by default.
+    mtime can be used to set the modification time.
+    The modification time is set to 0 by default, for reproducibility.
     """
-    if mtime == 0:
-        # Use zlib as it creates the header with 0 mtime by default.
-        # This is faster and with less overhead.
-        return zlib.compress(data, level=compresslevel, wbits=31)
-    header = _create_simple_gzip_header(compresslevel, mtime)
-    trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
-    # Wbits=-15 creates a raw deflate block.
-    return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
-            trailer)
+    # Wbits=31 automatically includes a gzip header and trailer.
+    gzip_data = zlib.compress(data, level=compresslevel, wbits=31)
+    if mtime is None:
+        mtime = time.time()
+    # Reuse gzip header created by zlib, replace mtime and OS byte for
+    # consistency.
+    header = struct.pack("<4sLBB", gzip_data, int(mtime), gzip_data[8], 255)
+    return header + gzip_data[10:]


 def decompress(data):
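Note (not part of the diff): with the new default mtime=0, gzip.compress() output is reproducible, and the patched MTIME field is simply the little-endian 32-bit value at bytes 4-7 of the header (RFC 1952). A quick check, assuming a Python that already includes this change:

    import gzip, struct

    data = b"hello world"

    out = gzip.compress(data)                      # default mtime=0 here
    assert out == gzip.compress(data)              # no timestamp, so identical output
    assert struct.unpack("<L", out[4:8])[0] == 0   # MTIME field is zero
    assert gzip.decompress(out) == data            # still a valid gzip stream

    stamped = gzip.compress(data, mtime=1_600_000_000)
    assert struct.unpack("<L", stamped[4:8])[0] == 1_600_000_000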
@@ -655,7 +661,7 @@ def main():
             f = builtins.open(arg, "rb")
             g = open(arg + ".gz", "wb")
         while True:
-            chunk = f.read(io.DEFAULT_BUFFER_SIZE)
+            chunk = f.read(READ_BUFFER_SIZE)
             if not chunk:
                 break
             g.write(chunk)
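Note (not part of the diff): the same chunked-copy pattern written as a standalone sketch around the module-level gzip.open(); the helper name and paths are illustrative:

    import gzip

    CHUNK = 128 * 1024                     # mirrors READ_BUFFER_SIZE above

    def compress_file(src_path, dst_path):
        """Compress src_path into dst_path without loading it all into memory."""
        with open(src_path, "rb") as src, gzip.open(dst_path, "wb") as dst:
            while True:
                chunk = src.read(CHUNK)
                if not chunk:
                    break
                dst.write(chunk)

    # compress_file("data.txt", "data.txt.gz")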