Skip to content

Commit e71258a

Browse files
committed
Issue #15955: Add an option to limit the output size in bz2.decompress().
Patch by Nikolaus Rath.
1 parent 87f5015 commit e71258a

5 files changed

Lines changed: 360 additions & 78 deletions

File tree

Doc/library/bz2.rst

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,32 @@ Incremental (de)compression
162162
you need to decompress a multi-stream input with :class:`BZ2Decompressor`,
163163
you must use a new decompressor for each stream.
164164

165-
.. method:: decompress(data)
165+
.. method:: decompress(data, max_length=-1)
166166

167-
Provide data to the decompressor object. Returns a chunk of decompressed
168-
data if possible, or an empty byte string otherwise.
167+
Decompress *data* (a :term:`bytes-like object`), returning
168+
uncompressed data as bytes. Some of *data* may be buffered
169+
internally, for use in later calls to :meth:`decompress`. The
170+
returned data should be concatenated with the output of any
171+
previous calls to :meth:`decompress`.
169172

170-
Attempting to decompress data after the end of the current stream is
171-
reached raises an :exc:`EOFError`. If any data is found after the end of
172-
the stream, it is ignored and saved in the :attr:`unused_data` attribute.
173+
If *max_length* is nonnegative, returns at most *max_length*
174+
bytes of decompressed data. If this limit is reached and further
175+
output can be produced, the :attr:`~.needs_input` attribute will
176+
be set to ``False``. In this case, the next call to
177+
:meth:`~.decompress` may provide *data* as ``b''`` to obtain
178+
more of the output.
173179

180+
If all of the input data was decompressed and returned (either
181+
because this was less than *max_length* bytes, or because
182+
*max_length* was negative), the :attr:`~.needs_input` attribute
183+
will be set to ``True``.
184+
185+
Attempting to decompress data after the end of stream is reached
186+
raises an :exc:`EOFError`. Any data found after the end of the
187+
stream is ignored and saved in the :attr:`~.unused_data` attribute.
188+
189+
.. versionchanged:: 3.5
190+
Added the *max_length* parameter.
174191

175192
.. attribute:: eof
176193

@@ -186,6 +203,13 @@ Incremental (de)compression
186203
If this attribute is accessed before the end of the stream has been
187204
reached, its value will be ``b''``.
188205

206+
.. attribute:: needs_input
207+
208+
``False`` if the :meth:`.decompress` method can provide more
209+
decompressed data before requiring new compressed input.
210+
211+
.. versionadded:: 3.5
212+
189213

190214
One-shot (de)compression
191215
------------------------

Lib/test/test_bz2.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from io import BytesIO
66
import os
77
import pickle
8+
import glob
89
import random
910
import subprocess
1011
import sys
@@ -51,6 +52,19 @@ class BaseTest(unittest.TestCase):
5152
EMPTY_DATA = b'BZh9\x17rE8P\x90\x00\x00\x00\x00'
5253
BAD_DATA = b'this is not a valid bzip2 file'
5354

55+
# Some tests need more than one block of uncompressed data. Since one block
56+
# is at least 100 kB, we gather some data dynamically and compress it.
57+
# Note that this assumes that compression works correctly, so we cannot
58+
# simply use the bigger test data for all tests.
59+
test_size = 0
60+
BIG_TEXT = bytearray(128*1024)
61+
for fname in glob.glob(os.path.join(os.path.dirname(__file__), '*.py')):
62+
with open(fname, 'rb') as fh:
63+
test_size += fh.readinto(memoryview(BIG_TEXT)[test_size:])
64+
if test_size > 128*1024:
65+
break
66+
BIG_DATA = bz2.compress(BIG_TEXT, compresslevel=1)
67+
5468
def setUp(self):
5569
self.filename = support.TESTFN
5670

@@ -707,6 +721,95 @@ def testPickle(self):
707721
with self.assertRaises(TypeError):
708722
pickle.dumps(BZ2Decompressor(), proto)
709723

724+
def testDecompressorChunksMaxsize(self):
725+
bzd = BZ2Decompressor()
726+
max_length = 100
727+
out = []
728+
729+
# Feed some input
730+
len_ = len(self.BIG_DATA) - 64
731+
out.append(bzd.decompress(self.BIG_DATA[:len_],
732+
max_length=max_length))
733+
self.assertFalse(bzd.needs_input)
734+
self.assertEqual(len(out[-1]), max_length)
735+
736+
# Retrieve more data without providing more input
737+
out.append(bzd.decompress(b'', max_length=max_length))
738+
self.assertFalse(bzd.needs_input)
739+
self.assertEqual(len(out[-1]), max_length)
740+
741+
# Retrieve more data while providing more input
742+
out.append(bzd.decompress(self.BIG_DATA[len_:],
743+
max_length=max_length))
744+
self.assertLessEqual(len(out[-1]), max_length)
745+
746+
# Retrieve remaining uncompressed data
747+
while not bzd.eof:
748+
out.append(bzd.decompress(b'', max_length=max_length))
749+
self.assertLessEqual(len(out[-1]), max_length)
750+
751+
out = b"".join(out)
752+
self.assertEqual(out, self.BIG_TEXT)
753+
self.assertEqual(bzd.unused_data, b"")
754+
755+
def test_decompressor_inputbuf_1(self):
756+
# Test reusing input buffer after moving existing
757+
# contents to beginning
758+
bzd = BZ2Decompressor()
759+
out = []
760+
761+
# Create input buffer and fill it
762+
self.assertEqual(bzd.decompress(self.DATA[:100],
763+
max_length=0), b'')
764+
765+
# Retrieve some results, freeing capacity at beginning
766+
# of input buffer
767+
out.append(bzd.decompress(b'', 2))
768+
769+
# Add more data that fits into input buffer after
770+
# moving existing data to beginning
771+
out.append(bzd.decompress(self.DATA[100:105], 15))
772+
773+
# Decompress rest of data
774+
out.append(bzd.decompress(self.DATA[105:]))
775+
self.assertEqual(b''.join(out), self.TEXT)
776+
777+
def test_decompressor_inputbuf_2(self):
778+
# Test reusing input buffer by appending data at the
779+
# end right away
780+
bzd = BZ2Decompressor()
781+
out = []
782+
783+
# Create input buffer and empty it
784+
self.assertEqual(bzd.decompress(self.DATA[:200],
785+
max_length=0), b'')
786+
out.append(bzd.decompress(b''))
787+
788+
# Fill buffer with new data
789+
out.append(bzd.decompress(self.DATA[200:280], 2))
790+
791+
# Append some more data, not enough to require resize
792+
out.append(bzd.decompress(self.DATA[280:300], 2))
793+
794+
# Decompress rest of data
795+
out.append(bzd.decompress(self.DATA[300:]))
796+
self.assertEqual(b''.join(out), self.TEXT)
797+
798+
def test_decompressor_inputbuf_3(self):
799+
# Test reusing input buffer after extending it
800+
801+
bzd = BZ2Decompressor()
802+
out = []
803+
804+
# Create almost full input buffer
805+
out.append(bzd.decompress(self.DATA[:200], 5))
806+
807+
# Add even more data to it, requiring resize
808+
out.append(bzd.decompress(self.DATA[200:300], 5))
809+
810+
# Decompress rest of data
811+
out.append(bzd.decompress(self.DATA[300:]))
812+
self.assertEqual(b''.join(out), self.TEXT)
710813

711814
class CompressDecompressTest(BaseTest):
712815
def testCompress(self):

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ Core and Builtins
1313
Library
1414
-------
1515

16+
- Issue #15955: Add an option to limit the output size in BZ2Decompressor.decompress().
17+
Patch by Nikolaus Rath.
18+
1619
- Issue #6639: Module-level turtle functions no longer raise TclError after
1720
closing the window.
1821

0 commit comments

Comments
 (0)