/* -*- mode: C -*- * * File: pdf-stm-f-lzw.c * Date: Wed Aug 15 14:41:18 2007 * * GNU PDF Library - LZW encoder/decoder filter * */ /* Copyright (C) 2007-2011 Free Software Foundation, Inc. */ /* This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include /* Define LZW encoder */ PDF_STM_FILTER_DEFINE (pdf_stm_f_lzwenc_get, stm_f_lzwenc_init, stm_f_lzwenc_apply, stm_f_lzwenc_deinit); /* Define LZW decoder */ PDF_STM_FILTER_DEFINE (pdf_stm_f_lzwdec_get, stm_f_lzwdec_init, stm_f_lzwdec_apply, stm_f_lzwdec_deinit); #define LZW_PARAM_EARLY_CHANGE "EarlyChange" /* -- LZW helper definitions -- */ typedef unsigned lzw_code_t; #define LZW_DEFAULT_EARLY_CHANGE 1 #define LZW_CODE_SIZE (sizeof (lzw_code_t) << 3) #define LZW_CACHE_SIZE (sizeof (lzw_code_t) << 3) #define LZW_MIN_BITSIZE 9 #define LZW_MAX_BITSIZE 12 #define LZW_MAX_DICTSIZE (1 << LZW_MAX_BITSIZE) #define LZW_NULL_INDEX ~0U enum lzw_special_codes_e { LZW_RESET_CODE = 256, LZW_EOD_CODE, LZW_FIRST_CODE }; /* -- LZW code output/input -- */ /* * Object to read and write codes of variable bitsize in a buffer. * Warning: using both get and put functions may break the buffer. */ struct lzw_buffer_s { pdf_buffer_t *buf; pdf_uchar_t cache [LZW_CACHE_SIZE]; pdf_size_t cache_rp; pdf_size_t cache_wp; lzw_code_t valbuf; lzw_code_t maxval; int bitsize; int valbits; }; static void lzw_buffer_init (struct lzw_buffer_s *b, int bitsize) { b->buf = NULL; b->valbuf = 0; b->valbits = 0; b->bitsize = bitsize; b->maxval = (1 << bitsize) - 1; b->cache_wp = 0; b->cache_rp = 0; } static enum pdf_stm_filter_apply_status_e lzw_buffer_get_code (struct lzw_buffer_s *b, pdf_bool_t finish, lzw_code_t *code) /* out */ { lzw_code_t r; while (b->valbits <= LZW_CODE_SIZE - 8 && !finish) { if (!pdf_buffer_eob_p (b->buf)) { b->valbuf |= (lzw_code_t) b->buf->data [b->buf->rp++] << (LZW_CODE_SIZE - 8 - b->valbits); b->valbits += 8; } else return PDF_STM_FILTER_APPLY_STATUS_NO_INPUT; } if (b->valbits < b->bitsize) return PDF_STM_FILTER_APPLY_STATUS_EOF; r = b->valbuf >> (LZW_CODE_SIZE - b->bitsize); b->valbuf <<= b->bitsize; b->valbits -= b->bitsize; *code = r; return PDF_STM_FILTER_APPLY_STATUS_OK; } /* Once finished, call with 0 as code value to flush the buffer. */ static void lzw_buffer_put_code (struct lzw_buffer_s *b, lzw_code_t code) { b->valbuf |= (lzw_code_t) code << (LZW_CODE_SIZE - b->bitsize - b->valbits); b->valbits += b->bitsize; while (b->valbits >= 8) { if (pdf_buffer_full_p (b->buf)) { b->cache [b->cache_wp++] = (pdf_uchar_t) (b->valbuf >> (LZW_CODE_SIZE - 8)); } else { b->buf->data [b->buf->wp++] = (pdf_uchar_t) (b->valbuf >> (LZW_CODE_SIZE - 8)); } b->valbuf <<= 8; b->valbits -= 8; } } static pdf_bool_t lzw_buffer_flush (struct lzw_buffer_s *b) { while (b->cache_rp != b->cache_wp && !pdf_buffer_full_p (b->buf)) { b->buf->data [b->buf->wp++] = b->cache [b->cache_rp++]; } if (pdf_buffer_full_p (b->buf)) return PDF_FALSE; b->cache_wp = 0; b->cache_rp = 0; return PDF_TRUE; } static pdf_bool_t lzw_buffer_inc_bitsize (struct lzw_buffer_s *b) { if (b->bitsize == LZW_MAX_BITSIZE) return PDF_FALSE; ++b->bitsize; b->maxval = (1 << b->bitsize) - 1; return PDF_TRUE; } static void lzw_buffer_set_bitsize (struct lzw_buffer_s *b, int newsize) { b->bitsize = newsize; b->maxval = (1 << newsize) - 1; } /* -- LZW dictionary handler -- */ /* * The strings are stored in a non balanced ordered binary tree. */ struct lzw_string_s { lzw_code_t prefix; /* Prefix string code */ pdf_uchar_t suffix; /* Appended character */ lzw_code_t first; /* First string with the same prefix. */ lzw_code_t left; /* Next string with smaller suffix and same prefix. */ lzw_code_t right; /* Next string with greater suffix and same prefix. */ }; struct lzw_dict_s { struct lzw_string_s table [LZW_MAX_DICTSIZE]; pdf_size_t size; }; static void lzw_string_init (struct lzw_string_s *s) { memset (s, 0xFF, sizeof (struct lzw_string_s)); } static void lzw_dict_init (struct lzw_dict_s *d) { int i; memset (d->table, 0xFF, sizeof (struct lzw_string_s) * LZW_MAX_DICTSIZE); for (i = 0; i < LZW_FIRST_CODE; i++) { d->table[i].suffix = (pdf_uchar_t) (i & 0xFF); } d->size = LZW_FIRST_CODE; } static pdf_bool_t lzw_dict_add (struct lzw_dict_s *d, struct lzw_string_s *s) { lzw_code_t index; pdf_bool_t must_add; if (s->prefix == LZW_NULL_INDEX) { s->prefix = s->suffix; return PDF_FALSE; /* The string is a basic character, found! */ } index = d->table[s->prefix].first; if (index == LZW_NULL_INDEX) { d->table[s->prefix].first = d->size; } else { must_add = PDF_FALSE; while (!must_add) { if (s->suffix == d->table[index].suffix) { s->prefix = index; return PDF_FALSE; /* The string is already in the table, found! */ } else if (s->suffix < d->table[index].suffix) { if (d->table[index].left == LZW_NULL_INDEX) { d->table[index].left = d->size; must_add = PDF_TRUE; } else { index = d->table[index].left; } } else { if (d->table[index].right == LZW_NULL_INDEX) { d->table[index].right = d->size; must_add = PDF_TRUE; } else { index = d->table[index].right; } } } } d->table[d->size++] = *s; return PDF_TRUE; } static void lzw_dict_reset (struct lzw_dict_s *dict) { lzw_dict_init (dict); } static void lzw_dict_fast_add (struct lzw_dict_s *d, lzw_code_t prefix, pdf_uchar_t suffix) { d->table[d->size].prefix = prefix; d->table[d->size].suffix = suffix; d->size++; } static void lzw_dict_decode (struct lzw_dict_s *d, lzw_code_t code, pdf_uchar_t **decode, pdf_size_t *size) { *size = 0; do { *(*decode)-- = d->table[code].suffix; ++(*size); code = d->table[code].prefix; } while (code != LZW_NULL_INDEX); (*decode)++; } /* -- THE ENCODER -- */ struct lzwenc_state_s { /* cached params */ pdf_i32_t early_change; /* encoding state */ pdf_bool_t must_reset; struct lzw_buffer_s buffer; struct lzw_dict_s dict; struct lzw_string_s string; pdf_bool_t really_finish; }; static pdf_bool_t stm_f_lzwenc_init (const pdf_hash_t *params, void **state, pdf_error_t **error) { struct lzwenc_state_s *filter_state; filter_state = pdf_alloc (sizeof (struct lzwenc_state_s)); if (!filter_state) { pdf_set_error (error, PDF_EDOMAIN_BASE_STM, PDF_ENOMEM, "cannot create LZW encoder internal state: " "couldn't allocate %lu bytes", (unsigned long)sizeof (struct lzwenc_state_s)); return PDF_FALSE; } filter_state->early_change = LZW_DEFAULT_EARLY_CHANGE; /* set default */ /* EarlyChange is optional! */ if (params && pdf_hash_key_p (params, LZW_PARAM_EARLY_CHANGE)) { filter_state->early_change = pdf_hash_get_bool (params, LZW_PARAM_EARLY_CHANGE); } lzw_buffer_init (&filter_state->buffer, LZW_MIN_BITSIZE); lzw_dict_init (&filter_state->dict); lzw_string_init (&filter_state->string); filter_state->must_reset = PDF_TRUE; filter_state->really_finish = PDF_FALSE; *state = filter_state; return PDF_TRUE; } static enum pdf_stm_filter_apply_status_e stm_f_lzwenc_apply (void *state, pdf_buffer_t *in, pdf_buffer_t *out, pdf_bool_t finish, pdf_error_t **error) { struct lzwenc_state_s *st; st = state; st->buffer.buf = out; if (!lzw_buffer_flush (&st->buffer)) return PDF_STM_FILTER_APPLY_STATUS_NO_OUTPUT; if (st->really_finish) return PDF_STM_FILTER_APPLY_STATUS_EOF; if (st->must_reset) { lzw_buffer_put_code (&st->buffer, LZW_RESET_CODE); st->must_reset = PDF_FALSE; } while (!pdf_buffer_eob_p (in) && !pdf_buffer_full_p (out)) { st->string.suffix = in->data [in->rp++]; /* FIXME: only test if value is present (use lzw_dict_find) * * lzw_dict_add tests for the longest matching input and if not * matching it adds a new entry into dict. Adding to dict at this point * makes trouble with bitsize increase. * As stated in ISO32000 bitsize increase shall happen after inserting * table entry 511. This is what we do. But the algorithm steps in * ISO32000 are: * 1) Accumulate longest input sequence * 2) Emmit Code * 3) Create new table entry * * We actually do: * 1,3) inside of lzw_dict_add * 2) */ if (lzw_dict_add (&st->dict, &st->string)) { lzw_buffer_put_code (&st->buffer, st->string.prefix); /* FIXME: insert new dict entry here */ st->string.prefix = st->string.suffix; /* FIXME: first bitsize increase (EarlyChange 0) will happen if * (511 == 511 - 0) but this is one code to early. * dict.size is 511 after adding entry 510. ISO32000 specify bitsize * increase after emitting entry 511 (dict.size is 512 at this point) * We should test with (st->dict.size - 1 == ...) */ if (st->dict.size == st->buffer.maxval - st->early_change) { PDF_DEBUG_BASE ("[LZW encoder] Increasing bitsize... " "(dictsize[%d] == maxval[%d] - earlychange[%d])", st->dict.size, st->buffer.maxval, st->early_change); if (!lzw_buffer_inc_bitsize (&st->buffer)) { lzw_buffer_put_code (&st->buffer, LZW_RESET_CODE); lzw_buffer_set_bitsize (&st->buffer, LZW_MIN_BITSIZE); lzw_dict_reset (&st->dict); } } } } if (finish) { lzw_buffer_put_code (&st->buffer, st->string.prefix); if (st->dict.size == st->buffer.maxval - st->early_change) { PDF_DEBUG_BASE ("[LZW encoder] Increasing bitsize (FINISHING)... " "(dictsize[%d] == maxval[%d] - earlychange[%d])", st->dict.size, st->buffer.maxval, st->early_change); lzw_buffer_inc_bitsize (&st->buffer); } lzw_buffer_put_code (&st->buffer, LZW_EOD_CODE); lzw_buffer_put_code (&st->buffer, 0); /* flush */ /* Delete 0 byte if buffer was already aligned before flush. */ if (out->data[out->wp - 1] == 0x0) out->wp--; st->really_finish = PDF_TRUE; return (pdf_buffer_full_p (out) ? PDF_STM_FILTER_APPLY_STATUS_NO_OUTPUT : PDF_STM_FILTER_APPLY_STATUS_EOF); } if (pdf_buffer_full_p (out)) return PDF_STM_FILTER_APPLY_STATUS_NO_OUTPUT; /* Default, request more input */ return PDF_STM_FILTER_APPLY_STATUS_NO_INPUT; } static void stm_f_lzwenc_deinit (void *state) { pdf_dealloc (state); } /* -- THE DECODER -- */ enum lzwdec_state { LZWDEC_STATE_START, LZWDEC_STATE_CLEAN, LZWDEC_STATE_WRITE, LZWDEC_STATE_READ, LZWDEC_STATE_LOOP_WRITE, LZWDEC_STATE_LOOP_READ }; struct lzwdec_state_s { /* cached params */ pdf_i32_t early_change; /* state */ pdf_uchar_t dec_buf [LZW_MAX_DICTSIZE]; pdf_uchar_t *decoded; pdf_size_t dec_size; lzw_code_t new_code; lzw_code_t old_code; /* flow managment */ enum lzwdec_state state_pos; pdf_error_t *tmp_error; struct lzw_buffer_s buffer; struct lzw_dict_s dict; }; static pdf_bool_t stm_f_lzwdec_init (const pdf_hash_t *params, void **state, pdf_error_t **error) { struct lzwdec_state_s *filter_state; filter_state = pdf_alloc (sizeof (struct lzwdec_state_s)); if (!filter_state) { pdf_set_error (error, PDF_EDOMAIN_BASE_STM, PDF_ENOMEM, "cannot create LZW decoder internal state: " "couldn't allocate %lu bytes", (unsigned long)sizeof (struct lzwdec_state_s)); return PDF_FALSE; } filter_state->early_change = LZW_DEFAULT_EARLY_CHANGE; /* set default */ /* EarlyChange is optional! */ if (params && pdf_hash_key_p (params, LZW_PARAM_EARLY_CHANGE)) { filter_state->early_change = pdf_hash_get_bool (params, LZW_PARAM_EARLY_CHANGE); } lzw_buffer_init (&filter_state->buffer, LZW_MIN_BITSIZE); lzw_dict_init (&filter_state->dict); filter_state->old_code = LZW_NULL_INDEX; filter_state->decoded = filter_state->dec_buf + (LZW_MAX_DICTSIZE-2); filter_state->dec_size = 0; filter_state->state_pos = LZWDEC_STATE_START; filter_state->tmp_error = NULL; *state = filter_state; return PDF_TRUE; } static void stm_f_lzwdec_deinit (void *state) { pdf_dealloc (state); } /* Returns PDF_FALSE if no more output */ static pdf_bool_t lzwdec_put_decoded (struct lzwdec_state_s *st, pdf_buffer_t *out) { pdf_bool_t no_output = PDF_FALSE; if (st->dec_size) { pdf_size_t to_write; PDF_ASSERT (out->size >= out->wp); /* output the decoded string */ to_write = st->dec_size; if (st->dec_size > (out->size - out->wp)) { to_write = out->size - out->wp; no_output = PDF_TRUE; } memcpy (out->data + out->wp, st->decoded, to_write); out->wp += to_write; st->decoded += to_write; st->dec_size -= to_write; } return !no_output; } /* Returns PDF_FALSE if no more output */ static pdf_bool_t lzwdec_put_code (struct lzwdec_state_s *st, pdf_buffer_t *out, unsigned long code) { if (pdf_buffer_full_p (out)) return PDF_FALSE; out->data [out->wp++] = (pdf_uchar_t) (code & 0xFF); return PDF_TRUE; } #define LZWDEC_CHECK(st, pos, what) \ do { \ enum pdf_stm_filter_apply_status_e ret; \ \ (st)->state_pos = (pos); \ if ((ret = (what)) != PDF_STM_FILTER_APPLY_STATUS_OK) \ { return ret; } \ } while (0); static enum pdf_stm_filter_apply_status_e stm_f_lzwdec_apply (void *state, pdf_buffer_t *in, pdf_buffer_t *out, pdf_bool_t finish, pdf_error_t **error) { struct lzwdec_state_s *st; st = state; st->buffer.buf = in; switch (st->state_pos) { case LZWDEC_STATE_START: break; case LZWDEC_STATE_CLEAN: goto lzwdec_state_clean; case LZWDEC_STATE_WRITE: goto lzwdec_state_write; case LZWDEC_STATE_READ: goto lzwdec_state_read; case LZWDEC_STATE_LOOP_WRITE: goto lzwdec_state_loop_write; case LZWDEC_STATE_LOOP_READ: goto lzwdec_state_loop_read; default: break; } do { /* need a reset */ lzw_buffer_set_bitsize (&st->buffer, LZW_MIN_BITSIZE); lzw_dict_reset (&st->dict); do { lzwdec_state_clean: LZWDEC_CHECK (st, LZWDEC_STATE_CLEAN, lzw_buffer_get_code (&st->buffer, finish, &st->new_code)); } while (st->new_code == LZW_RESET_CODE); if (st->new_code != LZW_EOD_CODE) { lzwdec_state_write: st->state_pos = LZWDEC_STATE_WRITE; if (!lzwdec_put_code (st, out, st->new_code)) return PDF_STM_FILTER_APPLY_STATUS_NO_OUTPUT; /* No more output */ st->old_code = st->new_code; lzwdec_state_read: LZWDEC_CHECK (st, LZWDEC_STATE_READ, lzw_buffer_get_code (&st->buffer, finish, &st->new_code)); } while (st->new_code != LZW_EOD_CODE && st->new_code != LZW_RESET_CODE) { st->decoded = st->dec_buf + (LZW_MAX_DICTSIZE - 2); /* Is new code in the dict? */ if (st->new_code < st->dict.size) { lzw_dict_decode (&st->dict, st->new_code, &st->decoded, &st->dec_size); lzw_dict_fast_add (&st->dict, st->old_code, st->decoded[0]); } else { lzw_dict_decode (&st->dict, st->old_code, &st->decoded, &st->dec_size); lzw_dict_fast_add (&st->dict, st->old_code, st->decoded[0]); st->decoded [st->dec_size++] = st->decoded [0]; } /* output the decoded string */ lzwdec_state_loop_write: st->state_pos = LZWDEC_STATE_LOOP_WRITE; if (!lzwdec_put_decoded (st, out)) return PDF_STM_FILTER_APPLY_STATUS_NO_OUTPUT; /* No more output */ if (st->dict.size == st->buffer.maxval - 1 - st->early_change) { lzw_buffer_inc_bitsize (&st->buffer); /* break; We must wait for the reset code, don't reset yet. */ } /* get next code */ st->old_code = st->new_code; lzwdec_state_loop_read: LZWDEC_CHECK (st, LZWDEC_STATE_LOOP_READ, lzw_buffer_get_code (&st->buffer, finish, &st->new_code)); } } while (st->new_code != LZW_EOD_CODE); st->state_pos = LZWDEC_STATE_START; return PDF_STM_FILTER_APPLY_STATUS_EOF; } /* End of pdf_stm_f_lzw.c */