
0.10.3.1 Source Code

As stated in the comments, this is a modified version of huff.c from Mark Nelson and Jean-Loup Gailly's The Data Compression Book. It relies on the bit-oriented I/O package (bitio) included with that book, but you should be able to implement your own replacement fairly easily; a rough sketch of what such a replacement might look like follows. See the References section for more information about The Data Compression Book.
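
For reference, here is a minimal sketch of what such a bitio replacement might look like. The structure and routine names below match what Huffman.c expects (a BIT_FILE with a file member, plus InputBit and OutputBits), but the rack/mask fields and the exact buffering scheme are my own assumptions, not the book's bitio.c. A real replacement also needs open and close routines that set ucMask to 0x80 and flush any partial final byte.

 /*-- bitio sketch (illustrative only, not the book's bitio.c) --*/
 
 #include <stdio.h>
 
 typedef struct _BIT_FILE
 {
   FILE *file;              // underlying stream; Huffman.c also reads and
                            //   writes the count header through this member
   unsigned char ucRack;    // byte currently being assembled or consumed
   unsigned char ucMask;    // single-bit mask, walks from 0x80 down to 0x01
 } BIT_FILE;
 
 //
 // Write the low iCount bits of ulCode, most significant bit first.
 //
 void OutputBits(BIT_FILE *pbfOutput, unsigned long ulCode, int iCount)
 {
   unsigned long ulMask = 1UL << (iCount - 1);
 
   while (ulMask != 0)
   {
     if (ulCode & ulMask)
       pbfOutput->ucRack |= pbfOutput->ucMask;
 
     pbfOutput->ucMask >>= 1;
     if (pbfOutput->ucMask == 0)          // a full byte is ready -- write it
     {
       putc(pbfOutput->ucRack, pbfOutput->file);
       pbfOutput->ucRack = 0;
       pbfOutput->ucMask = 0x80;
     }
     ulMask >>= 1;
   }
 }
 
 //
 // Read the next bit; returns 1 for a set bit, 0 otherwise.
 //
 int InputBit(BIT_FILE *pbfInput)
 {
   int iValue;
 
   if (pbfInput->ucMask == 0x80)          // need a fresh byte from the file
     pbfInput->ucRack = (unsigned char) getc(pbfInput->file);
 
   iValue = pbfInput->ucRack & pbfInput->ucMask;
   pbfInput->ucMask >>= 1;
   if (pbfInput->ucMask == 0)
     pbfInput->ucMask = 0x80;
 
   return (iValue ? 1 : 0);
 }
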



/*-- Huffman.c --*/



//
 // This is huffman.c based on huff.c from Mark Nelson and Jean-Loup Gailly's
 // "The Data Compression Book" and relies on the bitio library from the same.
 // To use this code, make a struct called BIT_FILE and link against another
 // object file with two routines:
 //
 //   int InputBit (BIT_FILE *)
 //   void OutputBits (BIT_FILE *, unsigned long ulCode, int iCount)
 //
 // Entrypoints to this module are CompressFile and ExpandFile.
 //
 // Or buy the book and use theirs.  One day I will get around to commenting,
 // modifying and converting bitio.c, but until then I do not want to put
 // someone else's copyrighted code up on the web.  If you can't do this yourself
 // then bother me and I may do it for you.  If you did it yourself, send me
 // the file and I'll put it up here so others can use it.
 //
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "bitio.h"
 #include "debug.h"
 #include "global.h"
 #include "main.h"
 
 //
 // The NODE structure is a node in the Huffman decoding tree.  It has
 // a count, which is its weight in the tree, and the node numbers of
 // its two children.  
 //
 typedef struct _NODE 
 {
   unsigned int uWeight;        // scaled byte count
   int iLeftChild;              // child zero
   int iRightChild;             // child one
 } NODE;
 
 //
 // Since walking the Huffman tree is a slow process we only want to do it
 // once for each symbol (not every time we see the symbol).  When encoding,
 // first walk the tree for each symbol and save the codes in a code table.
 // This is a code table entry.
 //
 typedef struct _CODE 
 {
   unsigned int uCode;
   int iCodeBits;
 } CODE;
 
 //
 // The special EOS symbol is 256, the first available symbol after all
 // of the possible bytes.  When decoding, reading this symbol
 // indicates that all of the data has been read in.
 //
 #define END_OF_STREAM  256
 
 //
 // Local function prototypes, defined with ANSI rules only.
 //
 void compress_data(FILE *pfInput, BIT_FILE *pbfOutput, CODE *pcCodes);
 
 BOOL output_counts(FILE *pfOutputFile, NODE *pnNodes);
 BOOL input_counts(FILE *pfInputFile, NODE *pnNodes);
 void count_bytes(FILE *pfInput, unsigned long *puCounts);
 void scale_counts(unsigned long *puCounts, NODE *pnNodes);
 int build_tree(NODE *pnNodes);
 void convert_tree_to_code(NODE *pnNodes, CODE *pcCodes, 
 			  unsigned int iCodeSoFar, int iBits, int iNode);
 void expand_data(BIT_FILE *pbfInput, FILE *pfOutput, NODE *pnNodes, 
 		 int iRootNode);
 
 
 
 
 //
 // This routine works in a fairly straightforward manner.  First, it
 // has to allocate storage for three different arrays of data.  Next,
 // it counts all the bytes in the input file.  The counts are all
 // stored in unsigned longs, so the next step is to scale them down to
 // single-byte counts in the NODE array.  After the counts are scaled, the
 // Huffman decoding tree is built on top of the NODE array.  Another
 // routine walks through the tree to build a table of codes, one per
 // symbol.  Finally, when the codes are all ready, compressing the
 // file is a simple matter.  After the file is compressed, the storage
 // is freed up, and the routine returns.
 //
 
 BOOL CompressFile(FILE *pfInputFile, BIT_FILE *pbfOutputFile, int argc, 
 		  char *argv[])
 {
   unsigned long *puCounts = NULL;
   NODE *pnNodes = NULL;
   CODE *pcCodes = NULL;
   int iRootNode;
   BOOL fRetVal = FALSE;
 
   //
   // Allocate memory:
   //
 
   //
   // This will keep track of the count for each of the 256 possible byte
   // values in the input file.
   //
   puCounts = (unsigned long *) malloc( 256 * sizeof(unsigned long));
   if (NULL == puCounts)
   {
     fprintf(stderr, "Out of memory.\n");
     goto CompressReturn;
   }
   memset(puCounts, 0, 256 * sizeof(unsigned long));
 
   //
   // These will be the nodes of the Huffman tree; 514 is the largest number
   // we need.  There are 257 possible symbols (256 byte values plus the
   // made-up EOS symbol) sitting at leaf nodes, and a tree with n leaf nodes
   // has at most (n - 1) internal nodes, so 257 + 256 = 513 nodes in total,
   // plus one extra node (index 513) used as a guaranteed-maximum sentinel
   // while building the tree.
   //
   pnNodes = (NODE *) malloc (514 * sizeof(NODE));
   if (NULL == pnNodes)
   {
     fprintf(stderr, "Out of memory.\n");
     goto CompressReturn;
   }
 
   //
   // These will make up the byte -> code mapping table.  We need 257 because
   // we need one for every byte value plus one for the EOS character we made
   // up.
   //
   pcCodes = (CODE *) malloc(257 * sizeof(CODE));
   if (NULL == pcCodes) 
   {
     fprintf(stderr, "Out of memory.\n");
     goto CompressReturn;
   }
 
   //
   // Run through the input stream and count the frequency of each byte.
   // Place the data in the counts table.
   //
   count_bytes(pfInputFile, puCounts);
 
   //
   // This scales the weights of characters counted in the count_bytes
   // routine down so as to limit the size of the Huffman codes (yet to
   // be generated) to at most 16 bits each.
   //
   scale_counts(puCounts, pnNodes);
 
   //
   // Write the counts to the output file header
   //
   if (!output_counts(pbfOutputFile->file, pnNodes ))
   {
     goto CompressReturn;
   }
 
   //
   // Build up the Huffman tree by combining the count trees
   // 
   if (!(iRootNode = build_tree( pnNodes )))
   {
     goto CompressReturn;
   }
 
   //
   // Walk the tree for each character symbol and get a code value for
   // each one
   //
   convert_tree_to_code(pnNodes, pcCodes, 0, 0, iRootNode);
 
   //
   // Do the actual compression
   //
   compress_data( pfInputFile, pbfOutputFile, pcCodes );
   
   //
   // We made it!
   //
   fRetVal = TRUE;
 
 CompressReturn:
 
   //
   // Cleanup
   //
   if (puCounts) free(puCounts);
   if (pnNodes)  free(pnNodes);
   if (pcCodes)  free(pcCodes);
 
   return(fRetVal);
 }
 
 
 
 BOOL ExpandFile(BIT_FILE *pbfInputFile, FILE *pfOutput, int argc, char *argv[])
 {
   NODE *pnNodes;
   int iRootNode;
 
   //
   // Allocate memory for the tree -- 514 nodes is the maximum size because
   // there are at most 257 leaf nodes and therefore at most 256 internal
   // nodes, plus one node position used as a guaranteed-huge sentinel while
   // building the tree.
   //
   if ((pnNodes = (NODE *) malloc(514 * sizeof(NODE))) == NULL) 
   {
     fprintf(stderr, "Out of memory.\n");
     return(FALSE);
   }
 
   //
   // This routine reads the header count table from the compressed image
   //
   if (!input_counts(pbfInputFile->file, pnNodes))
   {
     free(pnNodes);
     return(FALSE);
   }
 
   //
   // Build the Huffman tree from the table we read in the last step.
   //
   iRootNode = build_tree(pnNodes);
 
   //
   // Do the actual expansion of the compressed file.
   //
   expand_data(pbfInputFile, pfOutput, pnNodes, iRootNode);
 
   // 
   // Cleanup and return
   //
   free(pnNodes);
   return(TRUE);
 }
 
 
 //
 // In order for the expander to build the same model the compressor used,
 // the symbol counts have to be stored in the compressed file so the
 // expander can read them back in.  To save space, all 256 symbols are not
 // stored unconditionally.  Instead, only characters with a non-zero count
 // are stored, in the format:
 //
 // char, count, char, count, ... 0, 0 (beginning of compressed data)
 //
 // The code from "The Data Compression Book" stores them in a different
 // (more efficient) format.  Unfortunately, I don't think the extra space
 // savings are worth complicating the code.  See the book's source for a
 // more compact way of doing this.
 //
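 //
 // Illustrative example (not from the original): for an input containing
 // only the bytes 'a', 'a', 'b', the scaled weights are 'a' = 2 and
 // 'b' = 1, so the header written here would be the six bytes
 //
 //   0x61 0x02 0x62 0x01 0x00 0x00
 //
 // followed immediately by the compressed bit stream.
 //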
 BOOL output_counts(FILE *pfOutputFile, NODE *pnNodes )
 {
   int iFirst;
 
   //
   // Write each non-zero count
   //
   for (iFirst = 0; iFirst < 256; iFirst++)
   {
     if (pnNodes[iFirst].uWeight != 0)
     {
       if (putc(iFirst, pfOutputFile) != iFirst)
       {
 	perror("putc");
 	return(FALSE);
       }
 
       if (putc(pnNodes[iFirst].uWeight, pfOutputFile) != 
 	  pnNodes[iFirst].uWeight)
       {
 	perror("putc");
 	return(FALSE);
       }
 
     }
   }
 
   //
   // Write the 0 0 terminator part
   //
   for (iFirst = 0; iFirst < 2; iFirst++) 
   {
     if (putc(0, pfOutputFile) != 0)
     {
       perror("putc");
       return(FALSE);
     }
   }
 
   return(TRUE);
 }
 
 
 
 
 //
 // Read the counts from a compressed file header
 //
 BOOL input_counts(FILE *pfInputFile, NODE *pnNodes)
 {
   int i;
   int iChar;
   int iCount;
 
   //
   // Initialize all counts to zero
   //
   for (i = 0; i < 256; i++)
   {
     pnNodes[i].uWeight = 0;
   }
 
   while(1)
   {
 
     //
     // Read the character
     //
     if ((iChar = getc(pfInputFile)) == EOF) 
     {
       fprintf(stderr, "Error reading byte counts\n");
       return(FALSE);
     }
     
     //
     // Read the count (a single scaled byte)
     //
     if ((iCount = getc(pfInputFile)) == EOF)
     {
       fprintf(stderr, "Error reading byte counts\n");
       return(FALSE);
     }
     
     if (iCount) 
     {
       pnNodes[iChar].uWeight = (unsigned) iCount;
     }
     else
     {
       break;
     }
   }
   pnNodes[END_OF_STREAM].uWeight = 1;
   return(TRUE);
 }
 
 
 //
 // This routine counts the frequency of occurrence of every byte in
 // the input file.  It marks the place in the input stream where it
 // started, counts up all the bytes, then returns to the place where
 // it started.  In most C implementations, the length of a file
 // cannot exceed an unsigned long, so this routine should always
 // work.
 //
 // This routine could possibly be sped up by buffering the input file
 // and counting characters in the buffer.  Hopefully your operating
 // system is doing some buffering behind your back, though...
 //
 void count_bytes(FILE *pfInput, unsigned long *puCounts)
 {
   long lInputMarker;
   int c;
 
   //
   // Preconditions
   //
   ASSERT(pfInput);
   ASSERT(puCounts);
 
   //
   // Remember where the file pointer is now.
   //
   lInputMarker = ftell(pfInput);
   ASSERT(lInputMarker >= 0);
   
   //
   // Tally the characters from here to EOF
   //
   while ((c = getc(pfInput)) != EOF)
     puCounts[c]++;
   
   //
   // Put the pointer back where it was.
   //
   fseek(pfInput, lInputMarker, SEEK_SET );
   ASSERT(ftell(pfInput) == lInputMarker);
 }
 
 
 //
 // In order to limit the size of my Huffman codes to 16 bits, I scale
 // my counts down so they fit in an unsigned char, and then store them
 // all as initial weights in my NODE array.  The only thing to be
 // careful of is to make sure that a node with a non-zero count doesn't
 // get scaled down to 0.  Nodes with values of 0 don't get codes.
 //
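 //
 // Illustrative example (not from the original): if the most common byte
 // appears 1000 times, uMaxCount becomes 1000 / 255 + 1 = 4, so a byte
 // counted 600 times gets a weight of 600 / 4 = 150, while a byte counted
 // 3 times scales to 0 and is bumped back up to a weight of 1.
 //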
 void scale_counts(unsigned long *puCounts, NODE *pnNodes)
 {
   unsigned long uMaxCount;
   int i;
 
   //
   // Preconditions
   //
   ASSERT(puCounts);
   ASSERT(pnNodes);
 
   //
   // Run through the counts and look for the one with the greatest weight.
   //
   uMaxCount = 0;
   for (i = 0; i < 256; i++)
   {
     if (puCounts[i] > uMaxCount)
       uMaxCount = puCounts[i];
   }
 
   //
   // If there were no characters in the count table (i.e. we are trying
   // to compress an empty file) make the max count one so the scaling
   // formula works right.
   //
   if (uMaxCount == 0)
   {
     puCounts[0] = 1;
     uMaxCount = 1;
   }
 
   //
   // Now create the node weights for each symbol in the input stream --
   // use the counts scaled down by 1 / uMaxCount
   //
   uMaxCount /= 255;
   uMaxCount += 1;
   for (i = 0; i < 256; i++) 
   {
     pnNodes[i].uWeight = (unsigned int) (puCounts[i] / uMaxCount);
 
     // 
     // As the comment stated, make sure we never scale too much such that
     // a non-zero count byte achieves a scaled weight of zero.
     //
     if ((pnNodes[i].uWeight == 0) && (puCounts[i] != 0))
     {
       pnNodes[i].uWeight = 1;
     }
   }
 
   //
   // Special end of stream symbol
   //
   pnNodes[END_OF_STREAM].uWeight = 1;
 }
 
 
 
 //
 // Building the Huffman tree is fairly simple.  All of the active nodes
 // are scanned in order to locate the two nodes with the minimum
 // weights.  These two weights are added together and assigned to a new
 // node.  The new node makes the two minimum nodes into its 0 child
 // and 1 child.  The two minimum nodes are then marked as inactive.
 // This process repeats until there is only one node left, which is the
 // root node.  The tree is done, and the root node is passed back
 // to the calling routine.
 //
 // Node 513 is used here to arbitrarily provide a node with a guaranteed
 // maximum value.  It starts off as both min_1 and min_2, and then we look
 // for two other nodes that have smaller values.  After all 
 // active (i.e. non-zero weight) nodes have been scanned, we can tell if
 // there is really only one active node left in the pool by checking to
 // see if one of the min pointers is still set to 513 (the huge node).
 //
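 //
 // Illustrative example (not from the original): with active weights
 // A = 3, B = 1, C = 2 and EOS = 1, the two smallest (B and EOS) combine
 // into an internal node of weight 2; that node and C (both weight 2)
 // combine into a node of weight 4; finally that node and A combine into
 // the root node of weight 7, and the loop stops.
 //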
 int build_tree(NODE *pnNodes)
 {
   int iNextFree;
   int i;
   int iMin1;
   int iMin2;
 
   ASSERT(pnNodes);
 
   //
   // This node is guaranteed max value.
   //
   pnNodes[513].uWeight = 0xffff;
 
   //
   // iNextFree will be used to store positions of internal nodes.  Start
   // at 257 (0..255 are the possible byte leaves and 256 is the end of
   // stream symbol) and work up.
   //
   for (iNextFree = END_OF_STREAM + 1 ; ; iNextFree++)
   {
     ASSERT(iNextFree < 513);
 
     //
     // Initialize both minimum pointers to the sentinel node, which has the
     // maximum possible weight.
     //
     iMin1 = 513;
     iMin2 = 513;
 
     //
     // This could be done more efficiently by sorting
     //
     for (i = 0; i < iNextFree; i++)
     {
       if (pnNodes[i].uWeight != 0)
       {
 	if (pnNodes[i].uWeight < pnNodes[iMin1].uWeight) 
         {
 	  iMin2 = iMin1;
 	  iMin1 = i;
 	} 
 	else if (pnNodes[i].uWeight < pnNodes[iMin2].uWeight) 
 	{
 	  iMin2 = i;
 	}
       }
     }
 
     //
     // If iMin2 still points at the sentinel node (513), only one active
     // node is left -- the Huffman tree is complete.
     //
     if (iMin2 == 513)
     {
       break;
     }
 	
     //
     // Otherwise combine trees
     //
     pnNodes[iNextFree].uWeight = pnNodes[iMin1].uWeight;
     pnNodes[iNextFree].uWeight += pnNodes[iMin2].uWeight;
 
     //
     // These two nodes no longer in consideration for smallest...
     //
     pnNodes[iMin1].uWeight = 0;
     pnNodes[iMin2].uWeight = 0;
 
     pnNodes[iNextFree].iLeftChild = iMin1;
     pnNodes[iNextFree].iRightChild = iMin2;
   }
 
   //
   // Subtract one to get last used -- the root node
   //
   iNextFree--;
   return(iNextFree);
 }
 
 
 
 //
 // Since the Huffman tree is built as a decoding tree, there is
 // no simple way to get the encoding values for each symbol out of
 // it.  This routine recursively walks through the tree, adding the
 // child bits to each code until it gets to a leaf.  When it gets
 // to a leaf, it stores the code value in the CODE element, and
 // returns.
 //
 // This is a really cool routine... check out how it works.
 //
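 //
 // Illustrative example (not from the original): a leaf hanging one level
 // below the root on the zero branch gets the 1-bit code 0, while a leaf
 // reached by going one, one, zero from the root gets the 3-bit code 110.
 //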
 void convert_tree_to_code(NODE *pnNodes, CODE *pcCodes, 
 			  unsigned int iCodeSoFar, int iBits, int iNode)
 {
   //
   // Preconditions
   //
   ASSERT(pnNodes);
   ASSERT(pcCodes);
 
   //
   // If this is a leaf node we are done recursing, assign code and pop stack
   //
   if (iNode <= END_OF_STREAM) 
   {
     //
     // Note: iCodeSoFar may legitimately be 0 here (the leaf reached by the
     // all-zeros path), so only the code length is checked.
     //
     ASSERT(iBits > 0);
 
     //
     // Code
     //
     pcCodes[iNode].uCode = iCodeSoFar;
 
     //
     // Length of code
     //
     pcCodes[iNode].iCodeBits = iBits;
     return;
   }
 
   //
   // Otherwise we are on an internal node and need to keep going
   //
   iCodeSoFar <<= 1;
  ASSERT(iBits < 16);    // the scaled counts keep every code within 16 bits
 
   //
   // One more bit about to be added to the length
   //
   iBits++;
 
   //
  // Going left (child zero): the shifted code already ends in a 0..
   //
   convert_tree_to_code(pnNodes, pcCodes, iCodeSoFar, iBits,
 		       pnNodes[iNode].iLeftChild);
 
   //
  // Going right (child one): set the low bit to make it a 1..
   //
   convert_tree_to_code(pnNodes, pcCodes, iCodeSoFar | 1, iBits, 
 		       pnNodes[iNode].iRightChild);
 }
 
 
 //
 // Once the tree gets built, and the CODE table is built, compressing
 // the data is a breeze.  Each byte is read in, and its corresponding
 // Huffman code is sent out.
 //
 void compress_data(FILE *pfInput, BIT_FILE *pbfOutput, CODE *pcCodes)
 {
   int ch;               // used for reading data byte by byte from input
 
   //
   // Preconditions
   //
   ASSERT(pfInput);
   ASSERT(pbfOutput);
   ASSERT(pcCodes);
 
   //
   // For each character in input, output the equivalent code
   //
   while ((ch = getc(pfInput)) != EOF)
   {
     OutputBits(pbfOutput, (unsigned long) pcCodes[ch].uCode,
 	       pcCodes[ch].iCodeBits);
   }
 
   //
   // EOS mark -- needed in the expansion process
   //
   OutputBits(pbfOutput, (unsigned long) pcCodes[END_OF_STREAM].uCode,
 	     pcCodes[END_OF_STREAM].iCodeBits);
 }
 
 
 
 //
 // Expanding compressed data is a little harder than the compression
 // phase.  As each new symbol is decoded, the tree is traversed,
 // starting at the root node, reading a bit in, and taking either the
 // iLeftChild or iRightChild path.  Eventually, the tree winds down to a
 // leaf node, and the corresponding symbol is output.  If the symbol
 // is the END_OF_STREAM symbol, it doesn't get written out, and
 // instead the whole process terminates.
 //
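 //
 // Illustrative example (not from the original): if 'A' was assigned the
 // code 0 and 'B' the code 110, then the bit stream 0 110 0 followed by
 // the EOS code decodes to "ABA", after which the loop terminates without
 // writing the EOS symbol.
 //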
 
 void expand_data(BIT_FILE *pbfInput, FILE *pfOutput, NODE *pnNodes, 
 		 int iRootNode) 
 {
   int iNode;                 // the index of the node in the huffman tree for
                              // the current input symbol.
 
   //
   // Preconditions
   //
   ASSERT(pbfInput);
   ASSERT(pfOutput);
   ASSERT(pnNodes);
 
   while (1)
   {
     iNode = iRootNode;
 
     //
     // We will read the input file bit by bit and move the node pointer one
     // level in the tree for each bit read.  We will stop when we get to a
     // leaf node, write the uncompressed character to output, reset the node
     // pointer to the root, and repeat the process.
     //
 
     do 
     {
 
       //
      // If we read a set bit (a "1"), traverse to the right child;
      // otherwise it is a "0", so traverse to the left child.
       //
       if (InputBit(pbfInput))
 	iNode = pnNodes[iNode].iRightChild;
       else
 	iNode = pnNodes[iNode].iLeftChild;
      
 
     } 
     while (iNode > END_OF_STREAM);
     //
     // i.e. while we're pointing to an internal node.. stop at a leaf.
     //
 
     //
     // We are now at a leaf node.  Determine if this is a real symbol or
     // the end of file (represented by a made up EOS character).
     //
     if (iNode == END_OF_STREAM)
       break;
 
     //
     // It's a real symbol -- write it to the output file.
     //
     if ((putc(iNode, pfOutput)) != iNode)
       fprintf(stderr, "Error writing to output file!\n");
   }
 }
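
For completeness, here is a hypothetical driver showing how CompressFile might be called using the BIT_FILE sketch given earlier. The function name, the file-name arguments, and the manual flush of the final partial byte are all assumptions for illustration (a real bitio open/close pair would normally handle initialization and flushing), not part of the original main.c.

 /*-- hypothetical driver sketch --*/
 
 //
 // Compress szInName into szOutName using CompressFile and the BIT_FILE
 // sketch above.  Error handling is minimal; this is illustrative only.
 //
 int compress_one_file(const char *szInName, const char *szOutName)
 {
   FILE *pfIn;
   BIT_FILE bfOut;
 
   if ((pfIn = fopen(szInName, "rb")) == NULL)
     return 0;
 
   if ((bfOut.file = fopen(szOutName, "wb")) == NULL)
   {
     fclose(pfIn);
     return 0;
   }
   bfOut.ucRack = 0;
   bfOut.ucMask = 0x80;
 
   if (!CompressFile(pfIn, &bfOut, 0, NULL))
   {
     fclose(pfIn);
     fclose(bfOut.file);
     return 0;
   }
 
   //
   // Flush any partial final byte still sitting in the rack.
   //
   if (bfOut.ucMask != 0x80)
     putc(bfOut.ucRack, bfOut.file);
 
   fclose(pfIn);
   fclose(bfOut.file);
   return 1;
 }
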



Scott Gasch
1999-07-09