// *************************************************************************** // BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- // Last modified: 16 August 2010 (DB) // --------------------------------------------------------------------------- // BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. // --------------------------------------------------------------------------- // Provides the basic functionality for reading & writing BGZF files // *************************************************************************** #include #include "BGZF.h" using namespace BamTools; using std::string; using std::min; BgzfData::BgzfData(void) : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) , CompressedBlockSize(MAX_BLOCK_SIZE) , BlockLength(0) , BlockOffset(0) , BlockAddress(0) , IsOpen(false) , IsWriteOnly(false) , IsWriteUncompressed(false) , Stream(NULL) , UncompressedBlock(NULL) , CompressedBlock(NULL) { try { CompressedBlock = new char[CompressedBlockSize]; UncompressedBlock = new char[UncompressedBlockSize]; } catch( std::bad_alloc& ba ) { printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n"); exit(1); } } // destructor BgzfData::~BgzfData(void) { if( CompressedBlock ) delete[] CompressedBlock; if( UncompressedBlock ) delete[] UncompressedBlock; } // closes BGZF file void BgzfData::Close(void) { // skip if file not open, otherwise set flag if ( !IsOpen ) return; // if writing to file, flush the current BGZF block, // then write an empty block (as EOF marker) if ( IsWriteOnly ) { FlushBlock(); int blockLength = DeflateBlock(); fwrite(CompressedBlock, 1, blockLength, Stream); } // flush and close fflush(Stream); fclose(Stream); IsWriteUncompressed = false; IsOpen = false; } // compresses the current block int BgzfData::DeflateBlock(void) { // initialize the gzip header char* buffer = CompressedBlock; memset(buffer, 0, 18); buffer[0] = GZIP_ID1; buffer[1] = (char)GZIP_ID2; buffer[2] = CM_DEFLATE; buffer[3] = FLG_FEXTRA; buffer[9] = (char)OS_UNKNOWN; buffer[10] = BGZF_XLEN; buffer[12] = BGZF_ID1; buffer[13] = BGZF_ID2; buffer[14] = BGZF_LEN; // set compression level const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); // loop to retry for blocks that do not compress enough int inputLength = BlockOffset; int compressedLength = 0; unsigned int bufferSize = CompressedBlockSize; while ( true ) { // initialize zstream values z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = (Bytef*)UncompressedBlock; zs.avail_in = inputLength; zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; // initialize the zlib compression algorithm if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { printf("BGZF ERROR: zlib deflate initialization failed.\n"); exit(1); } // compress the data int status = deflate(&zs, Z_FINISH); if ( status != Z_STREAM_END ) { deflateEnd(&zs); // reduce the input length and try again if ( status == Z_OK ) { inputLength -= 1024; if( inputLength < 0 ) { printf("BGZF ERROR: input reduction failed.\n"); exit(1); } continue; } printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); exit(1); } // finalize the compression routine if ( deflateEnd(&zs) != Z_OK ) { printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); exit(1); } compressedLength = zs.total_out; compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; if ( compressedLength > MAX_BLOCK_SIZE ) { printf("BGZF ERROR: deflate overflow.\n"); exit(1); } break; } // store the compressed length BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); // store the CRC32 checksum unsigned int crc = crc32(0, NULL, 0); crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); // ensure that we have less than a block of data left int remaining = BlockOffset - inputLength; if ( remaining > 0 ) { if ( remaining > inputLength ) { printf("BGZF ERROR: after deflate, remainder too large.\n"); exit(1); } memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); } BlockOffset = remaining; return compressedLength; } // flushes the data in the BGZF block void BgzfData::FlushBlock(void) { // flush all of the remaining blocks while ( BlockOffset > 0 ) { // compress the data block int blockLength = DeflateBlock(); // flush the data to our output stream int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); if ( numBytesWritten != blockLength ) { printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); exit(1); } BlockAddress += blockLength; } } // de-compresses the current block int BgzfData::InflateBlock(const int& blockLength) { // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = (Bytef*)CompressedBlock + 18; zs.avail_in = blockLength - 16; zs.next_out = (Bytef*)UncompressedBlock; zs.avail_out = UncompressedBlockSize; int status = inflateInit2(&zs, GZIP_WINDOW_BITS); if ( status != Z_OK ) { printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); return -1; } status = inflate(&zs, Z_FINISH); if ( status != Z_STREAM_END ) { inflateEnd(&zs); printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); return -1; } status = inflateEnd(&zs); if ( status != Z_OK ) { printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); return -1; } return zs.total_out; } // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { // determine open mode if ( strcmp(mode, "rb") == 0 ) IsWriteOnly = false; else if ( strcmp(mode, "wb") == 0) IsWriteOnly = true; else { printf("BGZF ERROR: unknown file mode: %s\n", mode); return false; } // ---------------------------------------------------------------- // open Stream to read to/write from file, stdin, or stdout // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) // read/write BGZF data to/from a file if ( (filename != "stdin") && (filename != "stdout") ) Stream = fopen(filename.c_str(), mode); // read BGZF data from stdin else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) Stream = freopen(NULL, mode, stdin); // write BGZF data to stdout else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) Stream = freopen(NULL, mode, stdout); if ( !Stream ) { printf("BGZF ERROR: unable to open file %s\n", filename.c_str() ); return false; } // set flags, return success IsOpen = true; IsWriteUncompressed = isWriteUncompressed; return true; } // reads BGZF data into a byte buffer int BgzfData::Read(char* data, const unsigned int dataLength) { if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; char* output = data; unsigned int numBytesRead = 0; while ( numBytesRead < dataLength ) { int bytesAvailable = BlockLength - BlockOffset; if ( bytesAvailable <= 0 ) { if ( !ReadBlock() ) return -1; bytesAvailable = BlockLength - BlockOffset; if ( bytesAvailable <= 0 ) break; } char* buffer = UncompressedBlock; int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); memcpy(output, buffer + BlockOffset, copyLength); BlockOffset += copyLength; output += copyLength; numBytesRead += copyLength; } if ( BlockOffset == BlockLength ) { BlockAddress = ftell64(Stream); BlockOffset = 0; BlockLength = 0; } return numBytesRead; } // reads a BGZF block bool BgzfData::ReadBlock(void) { char header[BLOCK_HEADER_LENGTH]; int64_t blockAddress = ftell64(Stream); int count = fread(header, 1, sizeof(header), Stream); if ( count == 0 ) { BlockLength = 0; return true; } if ( count != sizeof(header) ) { printf("BGZF ERROR: read block failed - could not read block header\n"); return false; } if ( !BgzfData::CheckBlockHeader(header) ) { printf("BGZF ERROR: read block failed - invalid block header\n"); return false; } int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; char* compressedBlock = CompressedBlock; memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); int remaining = blockLength - BLOCK_HEADER_LENGTH; count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); if ( count != remaining ) { printf("BGZF ERROR: read block failed - could not read data from block\n"); return false; } count = InflateBlock(blockLength); if ( count < 0 ) { printf("BGZF ERROR: read block failed - could not decompress block data\n"); return false; } if ( BlockLength != 0 ) BlockOffset = 0; BlockAddress = blockAddress; BlockLength = count; return true; } // seek to position in BGZF file bool BgzfData::Seek(int64_t position) { if ( !IsOpen ) return false; int blockOffset = (position & 0xFFFF); int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { printf("BGZF ERROR: unable to seek in file\n"); return false; } BlockLength = 0; BlockAddress = blockAddress; BlockOffset = blockOffset; return true; } // get file position in BGZF file int64_t BgzfData::Tell(void) { if ( !IsOpen ) return false; else return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); } // writes the supplied data into the BGZF buffer unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { if ( !IsOpen || !IsWriteOnly ) return false; // initialize unsigned int numBytesWritten = 0; const char* input = data; unsigned int blockLength = UncompressedBlockSize; // copy the data to the buffer while ( numBytesWritten < dataLen ) { unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); char* buffer = UncompressedBlock; memcpy(buffer + BlockOffset, input, copyLength); BlockOffset += copyLength; input += copyLength; numBytesWritten += copyLength; if ( BlockOffset == blockLength ) FlushBlock(); } return numBytesWritten; }