import _compression import io from sys import maxsize try: from os import PathLike except ImportError: # For Python 3.5 class PathLike: pass try: # Import C implementation from .c.c_pyzstd import * from .c.c_pyzstd import _train_dict, _finalize_dict, \ _ZSTD_DStreamInSize except ImportError: try: # Import CFFI implementation from .cffi.cffi_pyzstd import * from .cffi.cffi_pyzstd import _train_dict, _finalize_dict, \ _ZSTD_DStreamInSize CFFI_PYZSTD = True except ImportError: raise ImportError( "pyzstd module: Neither C implementation nor CFFI implementation " "can be imported. If pyzstd module is dynamically linked to zstd " "library, make sure not to remove zstd library, and the run-time " "zstd library's version can't be lower than that at compile-time.") __version__ = '0.15.3' __doc__ = '''\ Python bindings to Zstandard (zstd) compression library, the API is similar to Python's bz2/lzma/zlib modules. Documentation: https://pyzstd.readthedocs.io GitHub: https://github.com/animalize/pyzstd PyPI: https://pypi.org/project/pyzstd''' __all__ = ('ZstdCompressor', 'RichMemZstdCompressor', 'ZstdDecompressor', 'EndlessZstdDecompressor', 'CParameter', 'DParameter', 'Strategy', 'ZstdError', 'compress', 'richmem_compress', 'decompress', 'compress_stream', 'decompress_stream', 'ZstdDict', 'train_dict', 'finalize_dict', 'get_frame_info', 'get_frame_size', 'ZstdFile', 'open', 'zstd_version', 'zstd_version_info', 'zstd_support_multithread', 'compressionLevel_values') def compress(data, level_or_option=None, zstd_dict=None): """Compress a block of data, return a bytes object. Compressing b'' will get an empty content frame (9 bytes or more). Parameters data: A bytes-like object, data to be compressed. level_or_option: When it's an int object, it represents compression level. When it's a dict object, it contains advanced compression parameters. zstd_dict: A ZstdDict object, pre-trained dictionary for compression. """ comp = ZstdCompressor(level_or_option, zstd_dict) return comp.compress(data, ZstdCompressor.FLUSH_FRAME) def richmem_compress(data, level_or_option=None, zstd_dict=None): """Compress a block of data, return a bytes object. Use rich memory mode, it's faster than compress() in some cases, but allocates more memory. Compressing b'' will get an empty content frame (9 bytes or more). Parameters data: A bytes-like object, data to be compressed. level_or_option: When it's an int object, it represents compression level. When it's a dict object, it contains advanced compression parameters. zstd_dict: A ZstdDict object, pre-trained dictionary for compression. """ comp = RichMemZstdCompressor(level_or_option, zstd_dict) return comp.compress(data) def _nbytes(dat): if isinstance(dat, (bytes, bytearray)): return len(dat) return memoryview(dat).nbytes def train_dict(samples, dict_size): """Train a zstd dictionary, return a ZstdDict object. Parameters samples: An iterable of samples, a sample is a bytes-like object represents a file. dict_size: The dictionary's maximum size, in bytes. """ # Check argument's type if not isinstance(dict_size, int): raise TypeError('dict_size argument should be an int object.') # Prepare data chunks = [] chunk_sizes = [] for chunk in samples: chunks.append(chunk) chunk_sizes.append(_nbytes(chunk)) chunks = b''.join(chunks) if not chunks: raise ValueError("The samples are empty content, can't train dictionary.") # samples_bytes: samples be stored concatenated in a single flat buffer. # samples_size_list: a list of each sample's size. # dict_size: size of the dictionary, in bytes. dict_content = _train_dict(chunks, chunk_sizes, dict_size) return ZstdDict(dict_content) def finalize_dict(zstd_dict, samples, dict_size, level): """Finalize a zstd dictionary, return a ZstdDict object. Given a custom content as a basis for dictionary, and a set of samples, finalize dictionary by adding headers and statistics according to the zstd dictionary format. You may compose an effective dictionary content by hand, which is used as basis dictionary, and use some samples to finalize a dictionary. The basis dictionary can be a "raw content" dictionary, see is_raw parameter in ZstdDict.__init__ method. Parameters zstd_dict: A ZstdDict object, basis dictionary. samples: An iterable of samples, a sample is a bytes-like object represents a file. dict_size: The dictionary's maximum size, in bytes. level: The compression level expected to use in production. The statistics for each compression level differ, so tuning the dictionary for the compression level can help quite a bit. """ if zstd_version_info < (1, 4, 5): msg = ("This function only available when the underlying zstd " "library's version is greater than or equal to v1.4.5, " "the current underlying zstd library's version is v%s.") % zstd_version raise NotImplementedError(msg) # Check arguments' type if not isinstance(zstd_dict, ZstdDict): raise TypeError('zstd_dict argument should be a ZstdDict object.') if not isinstance(dict_size, int): raise TypeError('dict_size argument should be an int object.') if not isinstance(level, int): raise TypeError('level argument should be an int object.') # Prepare data chunks = [] chunk_sizes = [] for chunk in samples: chunks.append(chunk) chunk_sizes.append(_nbytes(chunk)) chunks = b''.join(chunks) if not chunks: raise ValueError("The samples are empty content, can't finalize dictionary.") # custom_dict_bytes: existing dictionary. # samples_bytes: samples be stored concatenated in a single flat buffer. # samples_size_list: a list of each sample's size. # dict_size: maximal size of the dictionary, in bytes. # compression_level: compression level expected to use in production. dict_content = _finalize_dict(zstd_dict.dict_content, chunks, chunk_sizes, dict_size, level) return ZstdDict(dict_content) # In pyzstd module's blocks output buffer, the first block is 32 KiB. It has a # fast path for this size, if the output data is 32 KiB, it only allocates # memory and copies data once. _32_KiB = 32*1024 # Copied from Python stdlib (_compression.py), except: # 1, ZstdDecompressReader.read(): # Uses ZSTD_DStreamInSize() (131,075 in zstd v1.x) instead of # _compression.BUFFER_SIZE (default is 8 KiB) as read size. # 2, ZstdDecompressReader.seek(): # Uses 32 KiB instead of io.DEFAULT_BUFFER_SIZE (default is 8 KiB) as # max_length. See _32_KiB's comment for details. class ZstdDecompressReader(_compression.DecompressReader): # Add .readall() method for speedup # https://bugs.python.org/issue41486 def readall(self): chunks = [] while True: # sys.maxsize means the max length of output buffer is unlimited, # so that the whole input buffer can be decompressed within one # .decompress() call. data = self.read(maxsize) if not data: break chunks.append(data) return b''.join(chunks) # Compare to super().read(): # 1, Use ZSTD_DStreamInSize() instead of BUFFER_SIZE (default is 8 KiB) # 2, Use EndlessZstdDecompressor to simplify the code def read(self, size=-1): if size < 0: return self.readall() if not size or self._eof: return b"" # Depending on the input data, our call to the decompressor may not # return any data. In this case, try again after reading another block. while True: if self._decompressor.needs_input: rawblock = self._fp.read(_ZSTD_DStreamInSize) if not rawblock: if self._decompressor.at_frame_edge: self._eof = True self._size = self._pos return b"" else: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") else: rawblock = b"" data = self._decompressor.decompress(rawblock, size) if data: self._pos += len(data) return data # Copied from base class, except use 32 KiB instead of # io.DEFAULT_BUFFER_SIZE (default is 8 KiB) as max_length. def seek(self, offset, whence=io.SEEK_SET): # Recalculate offset as an absolute file position. if whence == io.SEEK_SET: pass elif whence == io.SEEK_CUR: offset = self._pos + offset elif whence == io.SEEK_END: # Seeking relative to EOF - we need to know the file's size. if self._size < 0: while self.read(_32_KiB): pass offset = self._size + offset else: raise ValueError("Invalid value for whence: {}".format(whence)) # Make it so that offset is the number of bytes to skip forward. if offset < self._pos: self._rewind() else: offset -= self._pos # Read and discard data until we reach the desired position. while offset > 0: data = self.read(min(_32_KiB, offset)) if not data: break offset -= len(data) return self._pos _MODE_CLOSED = 0 _MODE_READ = 1 _MODE_WRITE = 2 # Copied from Python stdlib (lzma.py), except: # 1, Add .readinto()/.readinto1() methods. # 2, Implement .flush() method. # 3, Remove BaseStream._check_*() overheads. # The implementation needs to ensure: # If in _MODE_READ mode, ._buffer is an io.BufferedReader object. # If in _MODE_WRITE mode, ._compressor is a ZstdCompressor object. # If in _MODE_CLOSED mode, they and ._fp don't exist or are None. # Then if not in a mode, perform the corresponding actions will raise # AttributeError, and ._check_mode() will raise a proper exception. # 4, ZstdFile.__init__(): # io.BufferedReader uses 32 KiB buffer size instead of default value # io.DEFAULT_BUFFER_SIZE (default is 8 KiB). # See _32_KiB's comment for details. # 5, ZstdFile.read1(): # Uses 32 KiB instead of io.DEFAULT_BUFFER_SIZE (default is 8 KiB). # Consistent with ZstdFile.__init__(). class ZstdFile(io.BufferedIOBase): """A file object providing transparent zstd (de)compression. A ZstdFile can act as a wrapper for an existing file object, or refer directly to a named file on disk. Note that ZstdFile provides a *binary* file interface - data read is returned as bytes, and data to be written must be given as bytes. """ def __init__(self, filename, mode="r", *, level_or_option=None, zstd_dict=None): """Open a zstd compressed file in binary mode. filename can be either an actual file name (given as a str, bytes, or PathLike object), in which case the named file is opened, or it can be an existing file object to read from or write to. mode can be "r" for reading (default), "w" for (over)writing, "x" for creating exclusively, or "a" for appending. These can equivalently be given as "rb", "wb", "xb" and "ab" respectively. Parameters level_or_option: When it's an int object, it represents compression level. When it's a dict object, it contains advanced compression parameters. Note, in read mode (decompression), it can only be a dict object, that represents decompression option. It doesn't support int type compression level in this case. zstd_dict: A ZstdDict object, pre-trained dictionary for compression / decompression. """ self._fp = None self._closefp = False self._mode = _MODE_CLOSED if not isinstance(zstd_dict, (type(None), ZstdDict)): raise TypeError("zstd_dict argument should be a ZstdDict object.") # Read or write mode if mode in ("r", "rb"): if not isinstance(level_or_option, (type(None), dict)): msg = ("In read mode (decompression), level_or_option argument " "should be a dict object, that represents decompression " "option. It doesn't support int type compression level " "in this case.") raise TypeError(msg) mode_code = _MODE_READ elif mode in ("w", "wb", "a", "ab", "x", "xb"): if not isinstance(level_or_option, (type(None), int, dict)): msg = "level_or_option argument should be int or dict object." raise TypeError(msg) mode_code = _MODE_WRITE self._compressor = ZstdCompressor(level_or_option, zstd_dict) self._pos = 0 else: raise ValueError("Invalid mode: {!r}".format(mode)) # File object if isinstance(filename, (str, bytes, PathLike)): if "b" not in mode: mode += "b" self._fp = io.open(filename, mode) self._closefp = True # Set ._mode here for ._closefp in .close(). If the following code # fails, IOBase's cleanup code will call .close(), so that ._fp can # be closed. self._mode = mode_code elif hasattr(filename, "read") or hasattr(filename, "write"): self._fp = filename self._mode = mode_code else: raise TypeError("filename must be a str, bytes, file or PathLike object") # ZstdDecompressReader if mode_code == _MODE_READ: raw = ZstdDecompressReader(self._fp, EndlessZstdDecompressor, trailing_error=ZstdError, zstd_dict=zstd_dict, option=level_or_option) self._buffer = io.BufferedReader(raw, _32_KiB) def close(self): """Flush and close the file. May be called more than once without error. Once the file is closed, any other operation on it will raise a ValueError. """ if self._mode == _MODE_CLOSED: return try: # In .__init__ method, if fails after setting ._mode to _MODE_READ, # ._buffer doesn't exist. if hasattr(self, "_buffer"): try: self._buffer.close() finally: # Set to None for ._check_mode() self._buffer = None elif self._mode == _MODE_WRITE: try: self._fp.write(self._compressor.flush()) finally: # Set to None for ._check_mode() self._compressor = None finally: try: if self._closefp: self._fp.close() finally: self._fp = None self._closefp = False self._mode = _MODE_CLOSED # None argument means the file should be closed def _check_mode(self, expected_mode=None): # If closed, raise ValueError. if self._mode == _MODE_CLOSED: raise ValueError("I/O operation on closed file") # Check _MODE_READ/_MODE_WRITE mode if expected_mode == _MODE_READ: if self._mode != _MODE_READ: raise io.UnsupportedOperation("File not open for reading") elif expected_mode == _MODE_WRITE: if self._mode != _MODE_WRITE: raise io.UnsupportedOperation("File not open for writing") # Re-raise other AttributeError exception raise def write(self, data): """Write a bytes-like object to the file. Returns the number of uncompressed bytes written, which is always the length of data in bytes. Note that due to buffering, the file on disk may not reflect the data written until close() is called. """ # Get the length of uncompressed data if isinstance(data, (bytes, bytearray)): length = len(data) else: # Accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes # Compress try: compressed = self._compressor.compress(data) except AttributeError: self._check_mode(_MODE_WRITE) # Write to file. If haven't gathered enough uncompressed data for one # zstd block (128 KiB at most), `compressed` is b''. if compressed: self._fp.write(compressed) self._pos += length return length def flush(self): """Flush remaining data to the underlying stream. It uses ZstdCompressor.FLUSH_BLOCK mode. Abuse of this method will reduce compression ratio, use it only when necessary. If the program is interrupted afterwards, all data can be recovered. To ensure saving to disk, also need to use os.fsync(fd). This method does nothing in reading mode. """ # Like IOBase.flush(), do nothing in reading mode. # TextIOWrapper.close() relies on this behavior. if self._mode == _MODE_READ: return # Flush zstd block try: compressed = self._compressor.flush(ZstdCompressor.FLUSH_BLOCK) except AttributeError: # Closed, raise ValueError. self._check_mode() # Write to file if compressed: self._fp.write(compressed) # Flush the file. Some file-like objects don't have .flush() method. if hasattr(self._fp, "flush"): self._fp.flush() def read(self, size=-1): """Read up to size uncompressed bytes from the file. If size is negative or omitted, read until EOF is reached. Returns b"" if the file is already at EOF. """ try: return self._buffer.read(size) except AttributeError: self._check_mode(_MODE_READ) def read1(self, size=-1): """Read up to size uncompressed bytes, while trying to avoid making multiple reads from the underlying stream. Reads up to a buffer's worth of data if size is negative. Returns b"" if the file is at EOF. """ if size < 0: size = _32_KiB try: return self._buffer.read1(size) except AttributeError: self._check_mode(_MODE_READ) def readinto(self, b): """Read bytes into b. Returns the number of bytes read (0 for EOF). """ try: return self._buffer.readinto(b) except AttributeError: self._check_mode(_MODE_READ) def readinto1(self, b): """Read bytes into b, while trying to avoid making multiple reads from the underlying stream. Returns the number of bytes read (0 for EOF). """ try: return self._buffer.readinto1(b) except AttributeError: self._check_mode(_MODE_READ) def readline(self, size=-1): """Read a line of uncompressed bytes from the file. The terminating newline (if present) is retained. If size is non-negative, no more than size bytes will be read (in which case the line may be incomplete). Returns b'' if already at EOF. """ try: return self._buffer.readline(size) except AttributeError: self._check_mode(_MODE_READ) def seek(self, offset, whence=io.SEEK_SET): """Change the file position. The new position is specified by offset, relative to the position indicated by whence. Possible values for whence are: 0: start of stream (default): offset must not be negative 1: current stream position 2: end of stream; offset must not be positive Returns the new file position. Note that seeking is emulated, so depending on the arguments, this operation may be extremely slow. """ try: # BufferedReader.seek() checks seekable return self._buffer.seek(offset, whence) except AttributeError: self._check_mode(_MODE_READ) def peek(self, size=-1): """Return buffered data without advancing the file position. Always returns at least one byte of data, unless at EOF. The exact number of bytes returned is unspecified. """ # Relies on the undocumented fact that BufferedReader.peek() always # returns at least one byte (except at EOF) try: return self._buffer.peek(size) except AttributeError: self._check_mode(_MODE_READ) def tell(self): """Return the current file position.""" if self._mode == _MODE_READ: return self._buffer.tell() elif self._mode == _MODE_WRITE: return self._pos # Closed, raise ValueError. self._check_mode() def fileno(self): """Return the file descriptor for the underlying file.""" try: return self._fp.fileno() except AttributeError: # Closed, raise ValueError. self._check_mode() @property def closed(self): """True if this file is closed.""" return self._mode == _MODE_CLOSED def writable(self): """Return whether the file was opened for writing.""" if self._mode == _MODE_WRITE: return True elif self._mode == _MODE_READ: return False # Closed, raise ValueError. self._check_mode() def readable(self): """Return whether the file was opened for reading.""" if self._mode == _MODE_READ: return True elif self._mode == _MODE_WRITE: return False # Closed, raise ValueError. self._check_mode() def seekable(self): """Return whether the file supports seeking.""" if self._mode == _MODE_READ: return self._buffer.seekable() elif self._mode == _MODE_WRITE: return False # Closed, raise ValueError. self._check_mode() # Copied from lzma module def open(filename, mode="rb", *, level_or_option=None, zstd_dict=None, encoding=None, errors=None, newline=None): """Open a zstd compressed file in binary or text mode. filename can be either an actual file name (given as a str, bytes, or PathLike object), in which case the named file is opened, or it can be an existing file object to read from or write to. The mode parameter can be "r", "rb" (default), "w", "wb", "x", "xb", "a", "ab" for binary mode, or "rt", "wt", "xt", "at" for text mode. The level_or_option and zstd_dict parameters specify the settings, as for ZstdCompressor, ZstdDecompressor and ZstdFile. When using read mode (decompression), the level_or_option parameter can only be a dict object, that represents decompression option. It doesn't support int type compression level in this case. For binary mode, this function is equivalent to the ZstdFile constructor: ZstdFile(filename, mode, ...). In this case, the encoding, errors and newline parameters must not be provided. For text mode, an ZstdFile object is created, and wrapped in an io.TextIOWrapper instance with the specified encoding, error handling behavior, and line ending(s). """ if "t" in mode: if "b" in mode: raise ValueError("Invalid mode: %r" % (mode,)) else: if encoding is not None: raise ValueError("Argument 'encoding' not supported in binary mode") if errors is not None: raise ValueError("Argument 'errors' not supported in binary mode") if newline is not None: raise ValueError("Argument 'newline' not supported in binary mode") zstd_mode = mode.replace("t", "") binary_file = ZstdFile(filename, zstd_mode, level_or_option=level_or_option, zstd_dict=zstd_dict) if "t" in mode: return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file