
0.10.3.1 Source Code

As stated in the comments, this is a modified version of huff.c from Mark Nelson and Jean-Loup Gailly's The Data Compression Book. It relies on the bit-oriented I/O package (bitio) included with that book, but you should be able to implement your own replacement fairly easily; a rough sketch of what such a replacement might look like follows. See the References section for more information about The Data Compression Book.
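
For reference, here is a minimal sketch of what such a bitio replacement might look like. The structure and routine names below match what Huffman.c expects (a BIT_FILE with a file member, plus InputBit and OutputBits), but the rack/mask fields and the exact buffering scheme are my own assumptions, not the book's bitio.c. A real replacement also needs open and close routines that set ucMask to 0x80 and flush any partial final byte.

 /*-- bitio sketch (illustrative only, not the book's bitio.c) --*/
 
 #include <stdio.h>
 
 typedef struct _BIT_FILE
 {
   FILE *file;              // underlying stream; Huffman.c also reads and
                            //   writes the count header through this member
   unsigned char ucRack;    // byte currently being assembled or consumed
   unsigned char ucMask;    // single-bit mask, walks from 0x80 down to 0x01
 } BIT_FILE;
 
 //
 // Write the low iCount bits of ulCode, most significant bit first.
 //
 void OutputBits(BIT_FILE *pbfOutput, unsigned long ulCode, int iCount)
 {
   unsigned long ulMask = 1UL << (iCount - 1);
 
   while (ulMask != 0)
   {
     if (ulCode & ulMask)
       pbfOutput->ucRack |= pbfOutput->ucMask;
 
     pbfOutput->ucMask >>= 1;
     if (pbfOutput->ucMask == 0)          // a full byte is ready -- write it
     {
       putc(pbfOutput->ucRack, pbfOutput->file);
       pbfOutput->ucRack = 0;
       pbfOutput->ucMask = 0x80;
     }
     ulMask >>= 1;
   }
 }
 
 //
 // Read the next bit; returns 1 for a set bit, 0 otherwise.
 //
 int InputBit(BIT_FILE *pbfInput)
 {
   int iValue;
 
   if (pbfInput->ucMask == 0x80)          // need a fresh byte from the file
     pbfInput->ucRack = (unsigned char) getc(pbfInput->file);
 
   iValue = pbfInput->ucRack & pbfInput->ucMask;
   pbfInput->ucMask >>= 1;
   if (pbfInput->ucMask == 0)
     pbfInput->ucMask = 0x80;
 
   return (iValue ? 1 : 0);
 }
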



/*-- Huffman.c --*/



//
 // This is huffman.c based on huff.c from Mark Nelson and Jean-Loup Gailly's
 // "The Data Compression Book" and relies on the bitio library from the same.
 // To use this code, make a struct called BIT_FILE and link against another
 // object file with two routines:
 //
 //   int InputBit (BIT_FILE *)
 //   void OutputBits (BIT_FILE *, unsigned long ulCode, int iCount)
 //
 // Entrypoints to this module are CompressFile and ExpandFile.
 //
 // Or buy the book and use theirs.  One day I will get around to commenting,
 // modifying and converting bitio.c, but until then I do not want to put
 // someone else's copyrighted code up on the web.  If you can't do this yourself
 // then bother me and I may do it for you.  If you did it yourself, send me
 // the file and I'll put it up here so others can use it.
 //
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "bitio.h"
 #include "debug.h"
 #include "global.h"
 #include "main.h"
 
 //
 // The NODE structure is a node in the Huffman decoding tree.  It has
 // a count, which is its weight in the tree, and the node numbers of
 // its two children.  
 //
 typedef struct _NODE 
 {
   unsigned int uWeight;        // scaled byte count
   int iLeftChild;              // child zero
   int iRightChild;             // child one
 } NODE;
 
 //
 // Since walking the Huffman tree is a slow process we only want to do it
 // once for each symbol (not every time we see the symbol).  When encoding,
 // first walk the tree for each symbol and save the codes in a code table.
 // This is a code table entry.
 //
 typedef struct _CODE 
 {
   unsigned int uCode;
   int iCodeBits;
 } CODE;
 
 //
 // The special EOS symbol is 256, the first available symbol after all
 // of the possible bytes.  When decoding, reading this symbol
 // indicates that all of the data has been read in.
 //
 #define END_OF_STREAM  256
 
 //
 // Local function prototypes, defined with ANSI rules only.
 //
 void compress_data(FILE *pfInput, BIT_FILE *pbfOutput, CODE *pcCodes);
 
 BOOL output_counts(FILE *pfOutputFile, NODE *pnNodes);
 BOOL input_counts(FILE *pfInputFile, NODE *pnNodes);
 void count_bytes(FILE *pfInput, unsigned long *puCounts);
 void scale_counts(unsigned long *puCounts, NODE *pnNodes);
 int build_tree(NODE *pnNodes);
 void convert_tree_to_code(NODE *pnNodes, CODE *pcCodes, 
 			  unsigned int iCodeSoFar, int iBits, int iNode);
 void expand_data(BIT_FILE *pbfInput, FILE *pfOutput, NODE *pnNodes, 
 		 int iRootNode);
 
 
 
 
 //
 // This routine works in a fairly straightforward manner.  First, it
 // has to allocate storage for three different arrays of data.  Next,
 // it counts all the bytes in the input file.  The counts are all
 // stored in unsigned longs, so the next step is to scale them down to
 // single-byte counts in the NODE array.  After the counts are scaled, the
 // Huffman decoding tree is built on top of the NODE array.  Another
 // routine walks through the tree to build a table of codes, one per
 // symbol.  Finally, when the codes are all ready, compressing the
 // file is a simple matter.  After the file is compressed, the storage
 // is freed up, and the routine returns.
 //
 
 BOOL CompressFile(FILE *pfInputFile, BIT_FILE *pbfOutputFile, int argc, 
 		  char *argv[])
 {
   unsigned long *puCounts = NULL;
   NODE *pnNodes = NULL;
   CODE *pcCodes = NULL;
   int iRootNode;
   BOOL fRetVal = FALSE;
 
   //
   // Allocate memory:
   //
 
   //
   // This will keep track of the count for each of the 256 possible byte
   // values in the input file.
   //
   puCounts = (unsigned long *) malloc( 256 * sizeof(unsigned long));
   if (NULL == puCounts)
   {
     fprintf(stderr, "Out of memory.\n");
     goto CompressReturn;
   }
   memset(puCounts, 0, 256 * sizeof(unsigned long));
 
   //
   // These will be the nodes of the Huffman tree; 514 is the largest number
   // we need.  There are 257 possible symbols (256 byte values plus the
   // made-up EOS symbol) sitting at leaf nodes, and a tree with n leaf nodes
   // has at most (n - 1) internal nodes, so 257 + 256 = 513 nodes in total,
   // plus one extra node (index 513) used as a guaranteed-maximum sentinel
   // while building the tree.
   //
   pnNodes = (NODE *) malloc (514 * sizeof(NODE));
   if (NULL == pnNodes)
   {
     fprintf(stderr, "Out of memory.\n");
     goto CompressReturn;
   }
 
   //
   // These will make up the byte -> code mapping table.  We need 257 because
   // we need one for every byte value plus one for the EOS character we made
   // up.
   //
   pcCodes = (CODE *) malloc(257 * sizeof(CODE));
   if (NULL == pcCodes) 
   {
     fprintf(stderr, "Out of memory.\n");
     goto CompressReturn;
   }
 
   //
   // Run through the input stream and count the frequency of each byte.
   // Place the data in the counts table.
   //
   count_bytes(pfInputFile, puCounts);
 
   //
   // This scales the weights of characters counted in the count_bytes
   // routine down so as to limit the size of the Huffman codes (yet to
   // be generated) to at most 16 bits each.
   //
   scale_counts(puCounts, pnNodes);
 
   //
   // Write the counts to the output file header
   //
   if (!output_counts(pbfOutputFile->file, pnNodes ))
   {
     goto CompressReturn;
   }
 
   //
   // Build up the Huffman tree by combining the count trees
   // 
   if (!(iRootNode = build_tree( pnNodes )))
   {
     goto CompressReturn;
   }
 
   //
   // Walk the tree for each character symbol and get a code value for
   // each one
   //
   convert_tree_to_code(pnNodes, pcCodes, 0, 0, iRootNode);
 
   //
   // Do the actual compression
   //
   compress_data( pfInputFile, pbfOutputFile, pcCodes );
   
   //
   // We made it!
   //
   fRetVal = TRUE;
 
 CompressReturn:
 
   //
   // Cleanup
   //
   if (puCounts) free(puCounts);
   if (pnNodes)  free(pnNodes);
   if (pcCodes)  free(pcCodes);
 
   return(fRetVal);
 }
 
 
 
 BOOL ExpandFile(BIT_FILE *pbfInputFile, FILE *pfOutput, int argc, char *argv[])
 {
   NODE *pnNodes;
   int iRootNode;
 
   //
   // Allocate memory for the tree -- 514 nodes is the maximum size because
   // there are at most 257 leaf nodes and therefore at most 256 internal
   // nodes, plus one node position used as a guaranteed-huge sentinel while
   // building the tree.
   //
   if ((pnNodes = (NODE *) malloc(514 * sizeof(NODE))) == NULL) 
   {
     fprintf(stderr, "Out of memory.\n");
     return(FALSE);
   }
 
   //
   // This routine reads the header count table from the compressed image
   //
   if (!input_counts(pbfInputFile->file, pnNodes))
   {
     free(pnNodes);
     return(FALSE);
   }
 
   //
   // Build the Huffman tree from the table we read in the last step.
   //
   iRootNode = build_tree(pnNodes);
 
   //
   // Do the actual expansion of the compressed file.
   //
   expand_data(pbfInputFile, pfOutput, pnNodes, iRootNode);
 
   // 
   // Cleanup and return
   //
   free(pnNodes);
   return(TRUE);
 }
 
 
 //
 // In order for the expander to build the same model the compressor used,
 // the symbol counts have to be stored in the compressed file so the
 // expander can read them back in.  To save space, all 256 symbols are not
 // stored unconditionally.  Instead, only characters with a non-zero count
 // are stored, in the format:
 //
 // char, count, char, count, ... 0, 0 (beginning of compressed data)
 //
 // The code from "The Data Compression Book" stores them in a different
 // (more efficient) format.  Unfortunately, I don't think the extra space
 // savings are worth complicating the code.  See the book's source for a
 // more compact way of doing this.
 //
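 //
 // Illustrative example (not from the original): for an input containing
 // only the bytes 'a', 'a', 'b', the scaled weights are 'a' = 2 and
 // 'b' = 1, so the header written here would be the six bytes
 //
 //   0x61 0x02 0x62 0x01 0x00 0x00
 //
 // followed immediately by the compressed bit stream.
 //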
 BOOL output_counts(FILE *pfOutputFile, NODE *pnNodes )
 {
   int iFirst;
 
   //
   // Write each non-zero count
   //
   for (iFirst = 0; iFirst < 256; iFirst++)
   {
     if (pnNodes[iFirst].uWeight != 0)
     {
       if (putc(iFirst, pfOutputFile) != iFirst)
       {
 	perror("putc");
 	return(FALSE);
       }
 
       if (putc(pnNodes[iFirst].uWeight, pfOutputFile) != 
 	  pnNodes[iFirst].uWeight)
       {
 	perror("putc");
 	return(FALSE);
       }
 
     }
   }
 
   //
   // Write the 0 0 terminator part
   //
   for (iFirst = 0; iFirst < 2; iFirst++) 
   {
     if (putc(0, pfOutputFile) != 0)
     {
       perror("putc");
       return(FALSE);
     }
   }
 
   return(TRUE);
 }
 
 
 
 
 //
 // Read the counts from a compressed file header
 //
 BOOL input_counts(FILE *pfInputFile, NODE *pnNodes)
 {
   int i;
   int iChar;
   int iCount;
 
   //
   // Initialize all counts to zero
   //
   for (i = 0; i < 256; i++)
   {
     pnNodes[i].uWeight = 0;
   }
 
   while(1)
   {
 
     //
     // Read the character
     //
     if ((iChar = getc(pfInputFile)) == EOF) 
     {
       fprintf(stderr, "Error reading byte counts\n");
       return(FALSE);
     }
     
     //
     // Read the count (a single scaled byte)
     //
     if ((iCount = getc(pfInputFile)) == EOF)
     {
       fprintf(stderr, "Error reading byte counts\n");
       return(FALSE);
     }
     
     if (iCount) 
     {
       pnNodes[iChar].uWeight = (unsigned) iCount;
     }
     else
     {
       break;
     }
   }
   pnNodes[END_OF_STREAM].uWeight = 1;
   return(TRUE);
 }
 
 
 //
 // This routine counts the frequency of occurrence of every byte in
 // the input file.  It marks the place in the input stream where it
 // started, counts up all the bytes, then returns to the place where
 // it started.  In most C implementations, the length of a file
 // cannot exceed an unsigned long, so this routine should always
 // work.
 //
 // This routine could possibly be sped up by buffering the input file
 // and counting characters in the buffer.  Hopefully your operating
 // system is doing some buffering behind your back, though...
 //
 void count_bytes(FILE *pfInput, unsigned long *puCounts)
 {
   long lInputMarker;
   int c;
 
   //
   // Preconditions
   //
   ASSERT(pfInput);
   ASSERT(puCounts);
 
   //
   // Remember where the file pointer is now.
   //
   lInputMarker = ftell(pfInput);
   ASSERT(lInputMarker >= 0);
   
   //
   // Tally the characters from here to EOF
   //
   while ((c = getc(pfInput)) != EOF)
     puCounts[c]++;
   
   //
   // Put the pointer back where it was.
   //
   fseek(pfInput, lInputMarker, SEEK_SET );
   ASSERT(ftell(pfInput) == lInputMarker);
 }
 
 
 //
 // In order to limit the size of my Huffman codes to 16 bits, I scale
 // my counts down so they fit in an unsigned char, and then store them
 // all as initial weights in my NODE array.  The only thing to be
 // careful of is to make sure that a node with a non-zero count doesn't
 // get scaled down to 0.  Nodes with values of 0 don't get codes.
 //
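 //
 // Illustrative example (not from the original): if the most common byte
 // appears 1000 times, uMaxCount becomes 1000 / 255 + 1 = 4, so a byte
 // counted 600 times gets a weight of 600 / 4 = 150, while a byte counted
 // 3 times scales to 0 and is bumped back up to a weight of 1.
 //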
 void scale_counts(unsigned long *puCounts, NODE *pnNodes)
 {
   unsigned long uMaxCount;
   int i;
 
   //
   // Preconditions
   //
   ASSERT(puCounts);
   ASSERT(pnNodes);
 
   //
   // Run through the counts and look for the one with the greatest weight.
   //
   uMaxCount = 0;
   for (i = 0; i < 256; i++)
   {
     if (puCounts[i] > uMaxCount)
       uMaxCount = puCounts[i];
   }
 
   //
   // If there were no characters in the count table (i.e. we are trying
   // to compress an empty file) make the max count one so the scaling
   // formula works right.
   //
   if (uMaxCount == 0)
   {
     puCounts[0] = 1;
     uMaxCount = 1;
   }
 
   //
   // Now create the node weights for each symbol in the input stream --
   // use the counts scaled down by 1 / uMaxCount
   //
   uMaxCount /= 255;
   uMaxCount += 1;
   for (i = 0; i < 256; i++) 
   {
     pnNodes[i].uWeight = (unsigned int) (puCounts[i] / uMaxCount);
 
     // 
     // As the comment stated, make sure we never scale too much such that
     // a non-zero count byte achieves a scaled weight of zero.
     //
     if ((pnNodes[i].uWeight == 0) && (puCounts[i] != 0))
     {
       pnNodes[i].uWeight = 1;
     }
   }
 
   //
   // Special end of stream symbol
   //
   pnNodes[END_OF_STREAM].uWeight = 1;
 }
 
 
 
 //
 // Building the Huffman tree is fairly simple.  All of the active nodes
 // are scanned in order to locate the two nodes with the minimum
 // weights.  These two weights are added together and assigned to a new
 // node.  The new node makes the two minimum nodes into its 0 child
 // and 1 child.  The two minimum nodes are then marked as inactive.
 // This process repeats until there is only one node left, which is the
 // root node.  The tree is done, and the root node is passed back
 // to the calling routine.
 //
 // Node 513 is used here to arbitrarily provide a node with a guaranteed
 // maximum value.  It starts off as both min_1 and min_2, and then we look
 // for two other nodes that have smaller values.  After all 
 // active (i.e. non-zero weight) nodes have been scanned, we can tell if
 // there is really only one active node left in the pool by checking to
 // see if one of the min pointers is still set to 513 (the huge node).
 //
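 //
 // Illustrative example (not from the original): with active weights
 // A = 3, B = 1, C = 2 and EOS = 1, the two smallest (B and EOS) combine
 // into an internal node of weight 2; that node and C (both weight 2)
 // combine into a node of weight 4; finally that node and A combine into
 // the root node of weight 7, and the loop stops.
 //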
 int build_tree(NODE *pnNodes)
 {
   int iNextFree;
   int i;
   int iMin1;
   int iMin2;
 
   ASSERT(pnNodes);
 
   //
   // This node is guaranteed max value.
   //
   pnNodes[513].uWeight = 0xffff;
 
   //
   // iNextFree will be used to store positions of internal nodes.  Start
   // at 257 (0..255 are the possible byte leaves and 256 is the end of
   // stream symbol) and work up.
   //
   for (iNextFree = END_OF_STREAM + 1 ; ; iNextFree++)
   {
     ASSERT(iNextFree < 513);
 
     //
     // Initialize both minimum pointers to the sentinel node, which has the
     // maximum possible weight.
     //
     iMin1 = 513;
     iMin2 = 513;
 
     //
     // This could be done more efficiently by sorting
     //
     for (i = 0; i < iNextFree; i++)
     {
       if (pnNodes[i].uWeight != 0)
       {
 	if (pnNodes[i].uWeight < pnNodes[iMin1].uWeight) 
         {
 	  iMin2 = iMin1;
 	  iMin1 = i;
 	} 
 	else if (pnNodes[i].uWeight < pnNodes[iMin2].uWeight) 
 	{
 	  iMin2 = i;
 	}
       }
     }
 
     //
     // If iMin2 still points at the sentinel node (513), only one active
     // node is left -- the Huffman tree is complete.
     //
     if (iMin2 == 513)
     {
       break;
     }
 	
     //
     // Otherwise combine trees
     //
     pnNodes[iNextFree].uWeight = pnNodes[iMin1].uWeight;
     pnNodes[iNextFree].uWeight += pnNodes[iMin2].uWeight;
 
     //
     // These two nodes no longer in consideration for smallest...
     //
     pnNodes[iMin1].uWeight = 0;
     pnNodes[iMin2].uWeight = 0;
 
     pnNodes[iNextFree].iLeftChild = iMin1;
     pnNodes[iNextFree].iRightChild = iMin2;
   }
 
   //
   // Subtract one to get last used -- the root node
   //
   iNextFree--;
   return(iNextFree);
 }
 
 
 
 //
 // Since the Huffman tree is built as a decoding tree, there is
 // no simple way to get the encoding values for each symbol out of
 // it.  This routine recursively walks through the tree, adding the
 // child bits to each code until it gets to a leaf.  When it gets
 // to a leaf, it stores the code value in the CODE element, and
 // returns.
 //
 // This is a really cool routine... check out how it works.
 //
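 //
 // Illustrative example (not from the original): a leaf hanging one level
 // below the root on the zero branch gets the 1-bit code 0, while a leaf
 // reached by going one, one, zero from the root gets the 3-bit code 110.
 //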
 void convert_tree_to_code(NODE *pnNodes, CODE *pcCodes, 
 			  unsigned int iCodeSoFar, int iBits, int iNode)
 {
   //
   // Preconditions
   //
   ASSERT(pnNodes);
   ASSERT(pcCodes);
 
   //
   // If this is a leaf node we are done recursing, assign code and pop stack
   //
   if (iNode <= END_OF_STREAM) 
   {
     //
     // Note: iCodeSoFar may legitimately be 0 here (the leaf reached by the
     // all-zeros path), so only the code length is checked.
     //
     ASSERT(iBits > 0);
 
     //
     // Code
     //
     pcCodes[iNode].uCode = iCodeSoFar;
 
     //
     // Length of code
     //
     pcCodes[iNode].iCodeBits = iBits;
     return;
   }
 
   //
   // Otherwise we are on an internal node and need to keep going
   //
   iCodeSoFar <<= 1;
  ASSERT(iBits < 16);    // the scaled counts keep every code within 16 bits
 
   //
   // One more bit about to be added to the length
   //
   iBits++;
 
   //
  // Going left (child zero): the shifted code already ends in a 0..
   //
   convert_tree_to_code(pnNodes, pcCodes, iCodeSoFar, iBits,
 		       pnNodes[iNode].iLeftChild);
 
   //
  // Going right (child one): set the low bit to make it a 1..
   //
   convert_tree_to_code(pnNodes, pcCodes, iCodeSoFar | 1, iBits, 
 		       pnNodes[iNode].iRightChild);
 }
 
 
 //
 // Once the tree gets built, and the CODE table is built, compressing
 // the data is a breeze.  Each byte is read in, and its corresponding
 // Huffman code is sent out.
 //
 void compress_data(FILE *pfInput, BIT_FILE *pbfOutput, CODE *pcCodes)
 {
   int ch;               // used for reading data byte by byte from input
 
   //
   // Preconditions
   //
   ASSERT(pfInput);
   ASSERT(pbfOutput);
   ASSERT(pcCodes);
 
   //
   // For each character in input, output the equivalent code
   //
   while ((ch = getc(pfInput)) != EOF)
   {
     OutputBits(pbfOutput, (unsigned long) pcCodes[ch].uCode,
 	       pcCodes[ch].iCodeBits);
   }
 
   //
   // EOS mark -- needed in the expansion process
   //
   OutputBits(pbfOutput, (unsigned long) pcCodes[END_OF_STREAM].uCode,
 	     pcCodes[END_OF_STREAM].iCodeBits);
 }
 
 
 
 //
 // Expanding compressed data is a little harder than the compression
 // phase.  As each new symbol is decoded, the tree is traversed,
 // starting at the root node, reading a bit in, and taking either the
 // iLeftChild or iRightChild path.  Eventually, the tree winds down to a
 // leaf node, and the corresponding symbol is output.  If the symbol
 // is the END_OF_STREAM symbol, it doesn't get written out, and
 // instead the whole process terminates.
 //
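 //
 // Illustrative example (not from the original): if 'A' was assigned the
 // code 0 and 'B' the code 110, then the bit stream 0 110 0 followed by
 // the EOS code decodes to "ABA", after which the loop terminates without
 // writing the EOS symbol.
 //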
 
 void expand_data(BIT_FILE *pbfInput, FILE *pfOutput, NODE *pnNodes, 
 		 int iRootNode) 
 {
   int iNode;                 // the index of the node in the huffman tree for
                              // the current input symbol.
 
   //
   // Preconditions
   //
   ASSERT(pbfInput);
   ASSERT(pfOutput);
   ASSERT(pnNodes);
 
   while (1)
   {
     iNode = iRootNode;
 
     //
     // We will read the input file bit by bit and move the node pointer one
     // level in the tree for each bit read.  We will stop when we get to a
     // leaf node, write the uncompressed character to output, reset the node
     // pointer to the root, and repeat the process.
     //
 
     do 
     {
 
       //
      // If we read a set bit (a "1"), traverse to the right child;
      // otherwise it is a "0", so traverse to the left child.
       //
       if (InputBit(pbfInput))
 	iNode = pnNodes[iNode].iRightChild;
       else
 	iNode = pnNodes[iNode].iLeftChild;
      
 
     } 
     while (iNode > END_OF_STREAM);
     //
     // i.e. while we're pointing to an internal node.. stop at a leaf.
     //
 
     //
     // We are now at a leaf node.  Determine if this is a real symbol or
     // the end of file (represented by a made up EOS character).
     //
     if (iNode == END_OF_STREAM)
       break;
 
     //
     // It's a real symbol -- write it to the output file.
     //
     if ((putc(iNode, pfOutput)) != iNode)
       fprintf(stderr, "Error writing to output file!\n");
   }
 }
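
For completeness, here is a hypothetical driver showing how CompressFile might be called using the BIT_FILE sketch given earlier. The function name, the file-name arguments, and the manual flush of the final partial byte are all assumptions for illustration (a real bitio open/close pair would normally handle initialization and flushing), not part of the original main.c.

 /*-- hypothetical driver sketch --*/
 
 //
 // Compress szInName into szOutName using CompressFile and the BIT_FILE
 // sketch above.  Error handling is minimal; this is illustrative only.
 //
 int compress_one_file(const char *szInName, const char *szOutName)
 {
   FILE *pfIn;
   BIT_FILE bfOut;
 
   if ((pfIn = fopen(szInName, "rb")) == NULL)
     return 0;
 
   if ((bfOut.file = fopen(szOutName, "wb")) == NULL)
   {
     fclose(pfIn);
     return 0;
   }
   bfOut.ucRack = 0;
   bfOut.ucMask = 0x80;
 
   if (!CompressFile(pfIn, &bfOut, 0, NULL))
   {
     fclose(pfIn);
     fclose(bfOut.file);
     return 0;
   }
 
   //
   // Flush any partial final byte still sitting in the rack.
   //
   if (bfOut.ucMask != 0x80)
     putc(bfOut.ucRack, bfOut.file);
 
   fclose(pfIn);
   fclose(bfOut.file);
   return 1;
 }
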



Scott Gasch
1999-07-09