哈夫曼字符串编码c语言实现,基于哈夫曼(haffuman)算法的文件压缩的实现(C语言)(原创)...
本文首先簡(jiǎn)要闡述哈夫曼算法的基本思想,然后介紹了使用哈夫曼算法進(jìn)行文件壓縮和解壓縮的
處理步驟,最后給出了C語言實(shí)現(xiàn)的文件壓縮和解壓縮的源代碼。
哈夫曼算法的主要思想是:
①首先遍歷要處理的字符串,得到每個(gè)字符的出現(xiàn)的次數(shù);
②將每個(gè)字符(以其出現(xiàn)次數(shù)為權(quán)值)分別構(gòu)造為二叉樹(注意此時(shí)的二叉樹只有一個(gè)節(jié)點(diǎn));
③取所有二叉樹種種字符出現(xiàn)次數(shù)最小的二叉樹合并為一顆新的二叉樹,新二叉樹根節(jié)點(diǎn)的權(quán)值等于兩個(gè)子節(jié)點(diǎn)的權(quán)值之和,新節(jié)點(diǎn)中的字符忽略;
④重復(fù)過程③直到所有樹被合并為同一棵二叉樹
⑤遍歷最后得到的二叉樹,自頂向下按路徑編號(hào),指向左節(jié)點(diǎn)的邊編號(hào)0,指向右節(jié)點(diǎn)的邊編號(hào)1,從根到葉節(jié)點(diǎn)的所有邊上的0和1鏈接起來,就是葉子節(jié)點(diǎn)中字符的哈夫曼編碼。
下圖展示了哈夫曼編碼的基本思想。
基于哈夫曼算法的文件壓縮和解壓縮過程分別說明如下:
一、文件壓縮:
①統(tǒng)計(jì)詞頻:讀取文件的每個(gè)字節(jié),使用整數(shù)數(shù)組int statistic[MAX_CHARS]統(tǒng)計(jì)每個(gè)字符出現(xiàn)的次數(shù),
由于一個(gè)字節(jié)最多表示2^8-1個(gè)字符,所以MAX_CHARS=256就足夠了。在統(tǒng)計(jì)字符數(shù)的時(shí)候,
對(duì)于每一個(gè)byte, 有statistic[(unsigned char)byte]++。
②構(gòu)造哈夫曼樹:根據(jù)statistic數(shù)組,基于哈夫曼樹算法造哈夫曼樹,由于構(gòu)造的過程中每次都要取最小權(quán)值的字符,
所以需要用優(yōu)先隊(duì)列來維護(hù)每棵樹的根節(jié)點(diǎn)。
③生成編碼:深度優(yōu)先遍歷哈弗曼樹,得到每個(gè)葉子節(jié)點(diǎn)中的字符的編碼并存入字符串?dāng)?shù)組char *dictionary[MAX_CHARS];
④存儲(chǔ)詞頻:新建存儲(chǔ)壓縮數(shù)據(jù)的文件,首先寫入不同字符的個(gè)數(shù),然后將每個(gè)字符及其對(duì)應(yīng)的詞頻寫入文件。
⑤存儲(chǔ)壓縮數(shù)據(jù):再次讀取待壓縮文件的每個(gè)字節(jié)byte,由dictionary[(unsigned int)byte]得到對(duì)應(yīng)的編碼(注意每個(gè)字符
編碼的長(zhǎng)度不一),使用位運(yùn)算一次將編碼中的每個(gè)位(BIT)設(shè)置到一個(gè)char類型的位緩沖中,可能多個(gè)編碼才能填滿一個(gè)
位緩沖,每填滿一次,將位緩沖區(qū)以單個(gè)字節(jié)的形式寫入文件。當(dāng)文件遍歷完成的時(shí)候,文件的壓縮也就完成了。
二、文件解壓:
①讀取詞頻:讀取壓縮文件,將每個(gè)字符的出現(xiàn)次數(shù)存入數(shù)組statistic
②構(gòu)造哈夫曼編碼樹:根據(jù)statistic數(shù)組構(gòu)造哈夫曼編碼樹
③繼續(xù)讀取壓縮文件,對(duì)于每個(gè)字節(jié),使用位運(yùn)算得到每個(gè)位(BIT)。對(duì)于每個(gè)BIT,根據(jù)BIT從根開始遍歷哈夫曼樹,如果BIT是0
就走左分支,如果BIT是1就走有分支,走到葉子節(jié)點(diǎn)的時(shí)候,輸出對(duì)應(yīng)的字符。走到葉子節(jié)點(diǎn)后,重新從哈夫曼樹根節(jié)點(diǎn)開始匹配
每個(gè)位。當(dāng)整個(gè)壓縮文件讀取完畢時(shí),文件解壓縮也完成了。
上文介紹了基于哈夫曼算法的文件壓縮和解壓縮,下面給出基于上述思想的C語言源代碼,一共有5個(gè)文件,其中pq.h和pq.c
是優(yōu)先隊(duì)列,compress.h和compress.c是壓縮和解壓縮的實(shí)現(xiàn),main.c是測(cè)試文件。
pq.h和pq.c請(qǐng)參見另外一篇文章《優(yōu)先隊(duì)列(priority_queue)的C語言實(shí)現(xiàn)》。
另外三個(gè)文件內(nèi)容如下:
/*
* File: compress.h
* Purpose: To compress file using the Haffman algorithm
* Author: puresky
* Date: 2011/05/01
*/
#ifndef _FILE_COMPRESSION_H
#define _FILE_COMPRESSION_H
//Haffuman Tree Node
typedef struct HaffumanTreeNode HTN;
struct HaffumanTreeNode
{
char _ch; //character
int _count; //frequency
struct HaffumanTreeNode *_left; //left child
struct HaffumanTreeNode *_right;//rigth child
};
//FileCompress Struct
#define BITS_PER_CHAR 8 //the number of bits in a char
#define MAX_CHARS 256 //the max number of chars
#define FILE_BUF_SIZE 8192 //the size of Buffer for FILE I/O
typedef struct FileCompressStruct FCS;
struct FileCompressStruct
{
HTN *_haffuman; //A pointer to the root of hafumman tree
unsigned int _charsCount; //To store the number of chars
unsigned int _total; //Total bytes in a file.
char *_dictionary[MAX_CHARS]; //to store the encoding of each character
int _statistic[MAX_CHARS]; //To store the number of each character
};
FCS *fcs_new();
void fcs_compress(FCS *fcs, const char *inFileName, const char *outFileName);
void fcs_decompress(FCS *fcs, const char *inFileName, const char *outFileName);
void fcs_free(FCS *fcs);
#endif
/*
* File: compress.c
* Purpose: To compress file using the Haffman algorithm
* Author: puresky
* Date: 2011/05/01
*/
#include
#include
#include
#include "compress.h"
#include "pq.h"
static const unsigned char mask[8] =
{
0x80, /* 10000000 */
0x40, /* 01000000 */
0x20, /* 00100000 */
0x10, /* 00010000 */
0x08, /* 00001000 */
0x04, /* 00000100 */
0x02, /* 00000010 */
0x01 /* 00000001 */
};
//static functions of HTN
static HTN *htn_new(char ch, int count)
{
HTN *htn = (HTN *)malloc(sizeof(HTN));
htn->_left = NULL;
htn->_right = NULL;
htn->_ch = ch;
htn->_count = count;
return htn;
}
static void htn_print_recursive(HTN *htn, int depth)
{
int i;
if(htn)
{
for(i = 0; i < depth; ++i)
printf(" ");
printf("%d:%d\n", htn->_ch, htn->_count);
htn_print_recursive(htn->_left, depth + 1);
htn_print_recursive(htn->_right, depth + 1);
}
}
static void htn_print(HTN *htn)
{
htn_print_recursive(htn, 0);
}
static void htn_free(HTN *htn)
{
if(htn)
{
htn_free(htn->_left);
htn_free(htn->_right);
free(htn);
}
}
//static functions of FCS
static void fcs_generate_statistic(FCS *fcs, const char *inFileName)
{
int ret, i;
unsigned char buf[FILE_BUF_SIZE];
FILE *pf = fopen(inFileName, "rb");
if(!pf)
{
fprintf(stderr, "can't open file:%s\n", inFileName);
return;
}
while((ret = fread(buf, 1, FILE_BUF_SIZE, pf)) > 0)
{
fcs->_total += ret;
for(i = 0; i < ret; ++i)
{
if(fcs->_statistic[buf[i]] == 0)
fcs->_charsCount++;
fcs->_statistic[buf[i]]++;
}
}
fclose(pf);
}
static void fcs_create_haffuman_tree(FCS *fcs)
{
int i, count;
HTN *htn, *parent, *left, *right;
KeyValue *kv, *kv1, *kv2;
PriorityQueue *pq;
pq = priority_queue_new(PRIORITY_MIN);
for(i = 0; i < MAX_CHARS; ++i)
{
if(fcs->_statistic[i])
{
htn = htn_new((char)i, fcs->_statistic[i]);
kv = key_value_new(fcs->_statistic[i], htn);
priority_queue_enqueue(pq, kv);
}
}
//fprintf(stdout, "the number of haffuman leaf is %d\n", priority_queue_size(pq));
while(!priority_queue_empty(pq))
{
//fprintf(stdout, "priority queue size:%d\n", priority_queue_size(pq));
kv1 = priority_queue_dequeue(pq);
kv2 = priority_queue_dequeue(pq);
if(kv2 == NULL)
{
fcs->_haffuman = kv1->_value;
key_value_free(kv1, NULL);
}
else
{
left = (HTN *)kv1->_value;
right = (HTN *)kv2->_value;
count = left->_count + right->_count;
key_value_free(kv1, NULL);
key_value_free(kv2, NULL);
parent = htn_new(0, count);
parent->_left = left;
parent->_right = right;
kv = key_value_new(count, parent);
priority_queue_enqueue(pq, kv);
}
}
priority_queue_free(pq, NULL);
//htn_print(fcs->_haffuman);
}
static void fcs_generate_dictionary_recursively(HTN *htn, char *dictionary[], char path[], int depth)
{
char *code = NULL;
if(htn)
{
if(htn->_left == NULL && htn->_right == NULL)
{
code = (char *)malloc(sizeof(char) * (depth + 1));
memset(code, 0, sizeof(char) * (depth + 1));
memcpy(code, path, depth);
dictionary[(unsigned char)htn->_ch] = code;
}
if(htn->_left)
{
path[depth] = '0';
fcs_generate_dictionary_recursively(htn->_left, dictionary, path, depth + 1);
}
if(htn->_right)
{
path[depth] = '1';
fcs_generate_dictionary_recursively(htn->_right, dictionary, path, depth + 1);
}
}
}
static void fcs_generate_dictionary(FCS *fcs)
{
char path[32];
fcs_generate_dictionary_recursively(fcs->_haffuman, fcs->_dictionary, path, 0);
//fcs_print_dictionary(fcs);
}
static void fcs_print_dictionary(FCS *fcs)
{
int i;
for(i = 0; i < MAX_CHARS; ++i)
if(fcs->_dictionary[i] != NULL)
fprintf(stdout, "%d:%s\n", i, fcs->_dictionary[i]);
}
static void fcs_write_statistic(FCS *fcs, FILE *pf)
{
int i;
fprintf(pf, "%d\n", fcs->_charsCount);
for(i = 0; i < MAX_CHARS; ++i)
if(fcs->_statistic[i] != 0)
fprintf(pf, "%d %d\n", i, fcs->_statistic[i]);
}
static void fcs_do_compress(FCS *fcs, const char *inFileName, const char* outFileName)
{
int i, j, ret;
char *dictEntry, len;
unsigned int bytes;
char bitBuf;
int bitPos;
unsigned char inBuf[FILE_BUF_SIZE];
FILE *pfIn, *pfOut;
pfIn = fopen(inFileName, "rb");
if(!pfIn)
{
fprintf(stderr, "can't open file:%s\n", inFileName);
return;
}
pfOut = fopen(outFileName, "wb");
if(!pfOut)
{
fclose(pfIn);
fprintf(stderr, "can't open file:%s\n", outFileName);
return;
}
fcs_write_statistic(fcs, pfOut);
bitBuf = 0x00;
bitPos = 0;
bytes = 0;
while((ret = fread(inBuf, 1, FILE_BUF_SIZE, pfIn)) > 0)
{
for(i = 0; i < ret; ++i)
{
len = strlen(fcs->_dictionary[inBuf[i]]);
dictEntry = fcs->_dictionary[inBuf[i]];
//printf("%s\n", dictEntry);
for(j = 0; j < len; ++j)
{
if(dictEntry[j] == '1')
{
bitBuf |= mask[bitPos++];
}
else
{
bitPos++;
}
if(bitPos == BITS_PER_CHAR)
{
fwrite(&bitBuf, 1, sizeof(bitBuf), pfOut);
bitBuf = 0x00;
bitPos = 0;
bytes++;
}
}
}
}
if(bitPos != 0)
{
fwrite(&bitBuf, 1, sizeof(bitBuf), pfOut);
bytes++;
}
fclose(pfIn);
fclose(pfOut);
printf("The compression ratio is:%f%%\n",
(fcs->_total - bytes) * 100.0 / fcs->_total);
}
static void fcs_read_statistic(FCS *fcs, FILE *pf)
{
int i, charsCount = 0;
int ch;
int num;
fscanf(pf, "%d\n", &charsCount);
fcs->_charsCount = charsCount;
for(i = 0; i < charsCount; ++i)
{
fscanf(pf, "%d %d\n", &ch, &num);
fcs->_statistic[(unsigned int)ch] = num;
fcs->_total += num;
}
}
static void fcs_do_decompress(FCS *fcs, FILE *pfIn, const char *outFileName)
{
int i, j, ret;
unsigned char ch;
HTN *htn;
unsigned char buf[FILE_BUF_SIZE];
unsigned char bitCode;
int bitPos;
FILE *pfOut;
pfOut = fopen(outFileName, "wb");
if(!pfOut)
{
fprintf(stderr, "can't open file:%s\n", outFileName);
return;
}
htn = fcs->_haffuman;
bitCode = 0x00;
bitPos = 0;
while((ret = fread(buf, 1, FILE_BUF_SIZE, pfIn)) > 0)
{
for(i = 0; i < ret; ++i)
{
ch = buf[i];
for(j = 0; j < BITS_PER_CHAR; ++j)
{
if(ch & mask[j])
{
htn = htn->_right;
}
else
{
htn = htn->_left;
}
if(htn->_left == NULL && htn->_right == NULL) //leaf
{
if(fcs->_total > 0)
{
fwrite(&htn->_ch, 1, sizeof(char), pfOut);
fcs->_total--;
}
htn = fcs->_haffuman;
}
}
}
}
fclose(pfOut);
}
//FCS functions
FCS *fcs_new()
{
FCS *fcs = (FCS *)malloc(sizeof(FCS));
fcs->_charsCount = 0;
fcs->_total = 0;
memset(fcs->_statistic, 0, sizeof(fcs->_statistic));
memset(fcs->_dictionary, 0, sizeof(fcs->_dictionary));
fcs->_haffuman = NULL;
return fcs;
}
void fcs_free(FCS *fcs)
{
int i;
if(fcs)
{
if(fcs->_haffuman)
htn_free(fcs->_haffuman);
for(i = 0; i < MAX_CHARS; ++i)
free(fcs->_dictionary[i]);
free(fcs);
}
}
void fcs_compress(FCS *fcs, const char *inFileName, const char *outFileName)
{
fprintf(stdout, "To compress file: %s ...\n", inFileName);
fcs_generate_statistic(fcs, inFileName);
fcs_create_haffuman_tree(fcs);
fcs_generate_dictionary(fcs);
fcs_do_compress(fcs, inFileName, outFileName);
fprintf(stdout, "The compressed data of file: %s stored at %s!\n",
inFileName, outFileName);
}
void fcs_decompress(FCS *fcs, const char *inFileName, const char *outFileName)
{
FILE *pfIn;
fprintf(stdout, "To decompress file: %s ...\n", inFileName);
pfIn= fopen(inFileName, "rb");
if(!pfIn)
{
fprintf(stderr, "can't open file: %s\n", inFileName);
return ;
}
fcs_read_statistic(fcs, pfIn);
fcs_create_haffuman_tree(fcs);
fcs_generate_dictionary(fcs);
fcs_do_decompress(fcs, pfIn, outFileName);
fclose(pfIn);
fprintf(stdout, "The decompressed data of file: %s stored at %s\n",
inFileName, outFileName);
}
/*
* File: main.c
* Purpose: testing File Compression
* Author:puresky
* Date: 2011/05/01
*/
#include
#include "compress.h"
const int DO_COMPRESS = 1;
const int DO_DECOMPRESS = 1;
const char *InFile = "data.txt"; //The file to compress.
const char *CompressedFile = "data.hfm"; //Compressed data of the file.
const char *OutFile = "data2.txt"; //The decompressed file of the data.
int main(int argc, char **argv)
{
//1. compress file
if(DO_COMPRESS)
{
FCS *fcs1;
fcs1 = fcs_new();
fcs_compress(fcs1, InFile, CompressedFile);
fcs_free(fcs1);
}
//2. decompress file
if(DO_DECOMPRESS)
{
FCS *fcs2;
fcs2 = fcs_new();
fcs_decompress(fcs2, CompressedFile, OutFile);
fcs_free(fcs2);
}
system("pause");
return 0;
}
總結(jié)
以上是生活随笔為你收集整理的哈夫曼字符串编码c语言实现,基于哈夫曼(haffuman)算法的文件压缩的实现(C语言)(原创)...的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: 后端:REST API URI 设计的七
- 下一篇: Python Tkinter 音乐播放器
