### Download

## jcs

/## subtext

/## puff.c

### (View History)

jcs
puff: Add small inflate module from zlib |
Latest amendment: 310 on 2023-02-23 |

1 | /* |

2 | * puff.c |

3 | * Copyright (C) 2002-2013 Mark Adler |

4 | * For conditions of distribution and use, see copyright notice in puff.h |

5 | * version 2.3, 21 Jan 2013 |

6 | * |

7 | * puff.c is a simple inflate written to be an unambiguous way to specify the |

8 | * deflate format. It is not written for speed but rather simplicity. As a |

9 | * side benefit, this code might actually be useful when small code is more |

10 | * important than speed, such as bootstrap applications. For typical deflate |

11 | * data, zlib's inflate() is about four times as fast as puff(). zlib's |

12 | * inflate compiles to around 20K on my machine, whereas puff.c compiles to |

13 | * around 4K on my machine (a PowerPC using GNU cc). If the faster decode() |

14 | * function here is used, then puff() is only twice as slow as zlib's |

15 | * inflate(). |

16 | * |

17 | * All dynamically allocated memory comes from the stack. The stack required |

18 | * is less than 2K bytes. This code is compatible with 16-bit int's and |

19 | * assumes that long's are at least 32 bits. puff.c uses the short data type, |

20 | * assumed to be 16 bits, for arrays in order to conserve memory. The code |

21 | * works whether integers are stored big endian or little endian. |

22 | * |

23 | * In the comments below are "Format notes" that describe the inflate process |

24 | * and document some of the less obvious aspects of the format. This source |

25 | * code is meant to supplement RFC 1951, which formally describes the deflate |

26 | * format: |

27 | * |

28 | * http://www.zlib.org/rfc-deflate.html |

29 | */ |

30 | |

31 | /* |

32 | * Change history: |

33 | * |

34 | * 1.0 10 Feb 2002 - First version |

35 | * 1.1 17 Feb 2002 - Clarifications of some comments and notes |

36 | * - Update puff() dest and source pointers on negative |

37 | * errors to facilitate debugging deflators |

38 | * - Remove longest from struct huffman -- not needed |

39 | * - Simplify offs[] index in construct() |

40 | * - Add input size and checking, using longjmp() to |

41 | * maintain easy readability |

42 | * - Use short data type for large arrays |

43 | * - Use pointers instead of long to specify source and |

44 | * destination sizes to avoid arbitrary 4 GB limits |

45 | * 1.2 17 Mar 2002 - Add faster version of decode(), doubles speed (!), |

46 | * but leave simple version for readability |

47 | * - Make sure invalid distances detected if pointers |

48 | * are 16 bits |

49 | * - Fix fixed codes table error |

50 | * - Provide a scanning mode for determining size of |

51 | * uncompressed data |

52 | * 1.3 20 Mar 2002 - Go back to lengths for puff() parameters [Gailly] |

53 | * - Add a puff.h file for the interface |

54 | * - Add braces in puff() for else do [Gailly] |

55 | * - Use indexes instead of pointers for readability |

56 | * 1.4 31 Mar 2002 - Simplify construct() code set check |

57 | * - Fix some comments |

58 | * - Add FIXLCODES #define |

59 | * 1.5 6 Apr 2002 - Minor comment fixes |

60 | * 1.6 7 Aug 2002 - Minor format changes |

61 | * 1.7 3 Mar 2003 - Added test code for distribution |

62 | * - Added zlib-like license |

63 | * 1.8 9 Jan 2004 - Added some comments on no distance codes case |

64 | * 1.9 21 Feb 2008 - Fix bug on 16-bit integer architectures [Pohland] |

65 | * - Catch missing end-of-block symbol error |

66 | * 2.0 25 Jul 2008 - Add #define to permit distance too far back |

67 | * - Add option in TEST code for puff to write the data |

68 | * - Add option in TEST code to skip input bytes |

69 | * - Allow TEST code to read from piped stdin |

70 | * 2.1 4 Apr 2010 - Avoid variable initialization for happier compilers |

71 | * - Avoid unsigned comparisons for even happier compilers |

72 | * 2.2 25 Apr 2010 - Fix bug in variable initializations [Oberhumer] |

73 | * - Add const where appropriate [Oberhumer] |

74 | * - Split if's and ?'s for coverage testing |

75 | * - Break out test code to separate file |

76 | * - Move NIL to puff.h |

77 | * - Allow incomplete code only if single code length is 1 |

78 | * - Add full code coverage test to Makefile |

79 | * 2.3 21 Jan 2013 - Check for invalid code length codes in dynamic blocks |

80 | */ |

81 | |

82 | #include <setjmp.h> /* for setjmp(), longjmp(), and jmp_buf */ |

83 | #include "puff.h" /* prototype for puff() */ |

84 | |

85 | #define local static /* for local function definitions */ |

86 | |

87 | /* |

88 | * Maximums for allocations and loops. It is not useful to change these -- |

89 | * they are fixed by the deflate format. |

90 | */ |

91 | #define MAXBITS 15 /* maximum bits in a code */ |

92 | #define MAXLCODES 286 /* maximum number of literal/length codes */ |

93 | #define MAXDCODES 30 /* maximum number of distance codes */ |

94 | #define MAXCODES (MAXLCODES+MAXDCODES) /* maximum codes lengths to read */ |

95 | #define FIXLCODES 288 /* number of fixed literal/length codes */ |

96 | |

97 | /* input and output state */ |

98 | struct state { |

99 | /* output state */ |

100 | unsigned char *out; /* output buffer */ |

101 | unsigned long outlen; /* available space at out */ |

102 | unsigned long outcnt; /* bytes written to out so far */ |

103 | |

104 | /* input state */ |

105 | const unsigned char *in; /* input buffer */ |

106 | unsigned long inlen; /* available input at in */ |

107 | unsigned long incnt; /* bytes read so far */ |

108 | int bitbuf; /* bit buffer */ |

109 | int bitcnt; /* number of bits in bit buffer */ |

110 | |

111 | /* input limit error return state for bits() and decode() */ |

112 | jmp_buf env; |

113 | }; |

114 | |

115 | /* |

116 | * Return need bits from the input stream. This always leaves less than |

117 | * eight bits in the buffer. bits() works properly for need == 0. |

118 | * |

119 | * Format notes: |

120 | * |

121 | * - Bits are stored in bytes from the least significant bit to the most |

122 | * significant bit. Therefore bits are dropped from the bottom of the bit |

123 | * buffer, using shift right, and new bytes are appended to the top of the |

124 | * bit buffer, using shift left. |

125 | */ |

126 | local int bits(struct state *s, int need) |

127 | { |

128 | long val; /* bit accumulator (can use up to 20 bits) */ |

129 | |

130 | /* load at least need bits into val */ |

131 | val = s->bitbuf; |

132 | while (s->bitcnt < need) { |

133 | if (s->incnt == s->inlen) |

134 | longjmp(s->env, 1); /* out of input */ |

135 | val |= (long)(s->in[s->incnt++]) << s->bitcnt; /* load eight bits */ |

136 | s->bitcnt += 8; |

137 | } |

138 | |

139 | /* drop need bits and update buffer, always zero to seven bits left */ |

140 | s->bitbuf = (int)(val >> need); |

141 | s->bitcnt -= need; |

142 | |

143 | /* return need bits, zeroing the bits above that */ |

144 | return (int)(val & ((1L << need) - 1)); |

145 | } |

146 | |

147 | /* |

148 | * Process a stored block. |

149 | * |

150 | * Format notes: |

151 | * |

152 | * - After the two-bit stored block type (00), the stored block length and |

153 | * stored bytes are byte-aligned for fast copying. Therefore any leftover |

154 | * bits in the byte that has the last bit of the type, as many as seven, are |

155 | * discarded. The value of the discarded bits are not defined and should not |

156 | * be checked against any expectation. |

157 | * |

158 | * - The second inverted copy of the stored block length does not have to be |

159 | * checked, but it's probably a good idea to do so anyway. |

160 | * |

161 | * - A stored block can have zero length. This is sometimes used to byte-align |

162 | * subsets of the compressed data for random access or partial recovery. |

163 | */ |

164 | local int stored(struct state *s) |

165 | { |

166 | unsigned len; /* length of stored block */ |

167 | |

168 | /* discard leftover bits from current byte (assumes s->bitcnt < 8) */ |

169 | s->bitbuf = 0; |

170 | s->bitcnt = 0; |

171 | |

172 | /* get length and check against its one's complement */ |

173 | if (s->incnt + 4 > s->inlen) |

174 | return 2; /* not enough input */ |

175 | len = s->in[s->incnt++]; |

176 | len |= s->in[s->incnt++] << 8; |

177 | if (s->in[s->incnt++] != (~len & 0xff) || |

178 | s->in[s->incnt++] != ((~len >> 8) & 0xff)) |

179 | return -2; /* didn't match complement! */ |

180 | |

181 | /* copy len bytes from in to out */ |

182 | if (s->incnt + len > s->inlen) |

183 | return 2; /* not enough input */ |

184 | if (s->out != NIL) { |

185 | if (s->outcnt + len > s->outlen) |

186 | return 1; /* not enough output space */ |

187 | while (len--) |

188 | s->out[s->outcnt++] = s->in[s->incnt++]; |

189 | } |

190 | else { /* just scanning */ |

191 | s->outcnt += len; |

192 | s->incnt += len; |

193 | } |

194 | |

195 | /* done with a valid stored block */ |

196 | return 0; |

197 | } |

198 | |

199 | /* |

200 | * Huffman code decoding tables. count[1..MAXBITS] is the number of symbols of |

201 | * each length, which for a canonical code are stepped through in order. |

202 | * symbol[] are the symbol values in canonical order, where the number of |

203 | * entries is the sum of the counts in count[]. The decoding process can be |

204 | * seen in the function decode() below. |

205 | */ |

206 | struct huffman { |

207 | short *count; /* number of symbols of each length */ |

208 | short *symbol; /* canonically ordered symbols */ |

209 | }; |

210 | |

211 | /* |

212 | * Decode a code from the stream s using huffman table h. Return the symbol or |

213 | * a negative value if there is an error. If all of the lengths are zero, i.e. |

214 | * an empty code, or if the code is incomplete and an invalid code is received, |

215 | * then -10 is returned after reading MAXBITS bits. |

216 | * |

217 | * Format notes: |

218 | * |

219 | * - The codes as stored in the compressed data are bit-reversed relative to |

220 | * a simple integer ordering of codes of the same lengths. Hence below the |

221 | * bits are pulled from the compressed data one at a time and used to |

222 | * build the code value reversed from what is in the stream in order to |

223 | * permit simple integer comparisons for decoding. A table-based decoding |

224 | * scheme (as used in zlib) does not need to do this reversal. |

225 | * |

226 | * - The first code for the shortest length is all zeros. Subsequent codes of |

227 | * the same length are simply integer increments of the previous code. When |

228 | * moving up a length, a zero bit is appended to the code. For a complete |

229 | * code, the last code of the longest length will be all ones. |

230 | * |

231 | * - Incomplete codes are handled by this decoder, since they are permitted |

232 | * in the deflate format. See the format notes for fixed() and dynamic(). |

233 | */ |

234 | #ifdef SLOW |

235 | local int decode(struct state *s, const struct huffman *h) |

236 | { |

237 | int len; /* current number of bits in code */ |

238 | int code; /* len bits being decoded */ |

239 | int first; /* first code of length len */ |

240 | int count; /* number of codes of length len */ |

241 | int index; /* index of first code of length len in symbol table */ |

242 | |

243 | code = first = index = 0; |

244 | for (len = 1; len <= MAXBITS; len++) { |

245 | code |= bits(s, 1); /* get next bit */ |

246 | count = h->count[len]; |

247 | if (code - count < first) /* if length len, return symbol */ |

248 | return h->symbol[index + (code - first)]; |

249 | index += count; /* else update for next length */ |

250 | first += count; |

251 | first <<= 1; |

252 | code <<= 1; |

253 | } |

254 | return -10; /* ran out of codes */ |

255 | } |

256 | |

257 | /* |

258 | * A faster version of decode() for real applications of this code. It's not |

259 | * as readable, but it makes puff() twice as fast. And it only makes the code |

260 | * a few percent larger. |

261 | */ |

262 | #else /* !SLOW */ |

263 | local int decode(struct state *s, const struct huffman *h) |

264 | { |

265 | int len; /* current number of bits in code */ |

266 | int code; /* len bits being decoded */ |

267 | int first; /* first code of length len */ |

268 | int count; /* number of codes of length len */ |

269 | int index; /* index of first code of length len in symbol table */ |

270 | int bitbuf; /* bits from stream */ |

271 | int left; /* bits left in next or left to process */ |

272 | short *next; /* next number of codes */ |

273 | |

274 | bitbuf = s->bitbuf; |

275 | left = s->bitcnt; |

276 | code = first = index = 0; |

277 | len = 1; |

278 | next = h->count + 1; |

279 | while (1) { |

280 | while (left--) { |

281 | code |= bitbuf & 1; |

282 | bitbuf >>= 1; |

283 | count = *next++; |

284 | if (code - count < first) { /* if length len, return symbol */ |

285 | s->bitbuf = bitbuf; |

286 | s->bitcnt = (s->bitcnt - len) & 7; |

287 | return h->symbol[index + (code - first)]; |

288 | } |

289 | index += count; /* else update for next length */ |

290 | first += count; |

291 | first <<= 1; |

292 | code <<= 1; |

293 | len++; |

294 | } |

295 | left = (MAXBITS+1) - len; |

296 | if (left == 0) |

297 | break; |

298 | if (s->incnt == s->inlen) |

299 | longjmp(s->env, 1); /* out of input */ |

300 | bitbuf = s->in[s->incnt++]; |

301 | if (left > 8) |

302 | left = 8; |

303 | } |

304 | return -10; /* ran out of codes */ |

305 | } |

306 | #endif /* SLOW */ |

307 | |

308 | /* |

309 | * Given the list of code lengths length[0..n-1] representing a canonical |

310 | * Huffman code for n symbols, construct the tables required to decode those |

311 | * codes. Those tables are the number of codes of each length, and the symbols |

312 | * sorted by length, retaining their original order within each length. The |

313 | * return value is zero for a complete code set, negative for an over- |

314 | * subscribed code set, and positive for an incomplete code set. The tables |

315 | * can be used if the return value is zero or positive, but they cannot be used |

316 | * if the return value is negative. If the return value is zero, it is not |

317 | * possible for decode() using that table to return an error--any stream of |

318 | * enough bits will resolve to a symbol. If the return value is positive, then |

319 | * it is possible for decode() using that table to return an error for received |

320 | * codes past the end of the incomplete lengths. |

321 | * |

322 | * Not used by decode(), but used for error checking, h->count[0] is the number |

323 | * of the n symbols not in the code. So n - h->count[0] is the number of |

324 | * codes. This is useful for checking for incomplete codes that have more than |

325 | * one symbol, which is an error in a dynamic block. |

326 | * |

327 | * Assumption: for all i in 0..n-1, 0 <= length[i] <= MAXBITS |

328 | * This is assured by the construction of the length arrays in dynamic() and |

329 | * fixed() and is not verified by construct(). |

330 | * |

331 | * Format notes: |

332 | * |

333 | * - Permitted and expected examples of incomplete codes are one of the fixed |

334 | * codes and any code with a single symbol which in deflate is coded as one |

335 | * bit instead of zero bits. See the format notes for fixed() and dynamic(). |

336 | * |

337 | * - Within a given code length, the symbols are kept in ascending order for |

338 | * the code bits definition. |

339 | */ |

340 | local int construct(struct huffman *h, const short *length, int n) |

341 | { |

342 | int symbol; /* current symbol when stepping through length[] */ |

343 | int len; /* current length when stepping through h->count[] */ |

344 | int left; /* number of possible codes left of current length */ |

345 | short offs[MAXBITS+1]; /* offsets in symbol table for each length */ |

346 | |

347 | /* count number of codes of each length */ |

348 | for (len = 0; len <= MAXBITS; len++) |

349 | h->count[len] = 0; |

350 | for (symbol = 0; symbol < n; symbol++) |

351 | (h->count[length[symbol]])++; /* assumes lengths are within bounds */ |

352 | if (h->count[0] == n) /* no codes! */ |

353 | return 0; /* complete, but decode() will fail */ |

354 | |

355 | /* check for an over-subscribed or incomplete set of lengths */ |

356 | left = 1; /* one possible code of zero length */ |

357 | for (len = 1; len <= MAXBITS; len++) { |

358 | left <<= 1; /* one more bit, double codes left */ |

359 | left -= h->count[len]; /* deduct count from possible codes */ |

360 | if (left < 0) |

361 | return left; /* over-subscribed--return negative */ |

362 | } /* left > 0 means incomplete */ |

363 | |

364 | /* generate offsets into symbol table for each length for sorting */ |

365 | offs[1] = 0; |

366 | for (len = 1; len < MAXBITS; len++) |

367 | offs[len + 1] = offs[len] + h->count[len]; |

368 | |

369 | /* |

370 | * put symbols in table sorted by length, by symbol order within each |

371 | * length |

372 | */ |

373 | for (symbol = 0; symbol < n; symbol++) |

374 | if (length[symbol] != 0) |

375 | h->symbol[offs[length[symbol]]++] = symbol; |

376 | |

377 | /* return zero for complete set, positive for incomplete set */ |

378 | return left; |

379 | } |

380 | |

381 | /* |

382 | * Decode literal/length and distance codes until an end-of-block code. |

383 | * |

384 | * Format notes: |

385 | * |

386 | * - Compressed data that is after the block type if fixed or after the code |

387 | * description if dynamic is a combination of literals and length/distance |

388 | * pairs terminated by and end-of-block code. Literals are simply Huffman |

389 | * coded bytes. A length/distance pair is a coded length followed by a |

390 | * coded distance to represent a string that occurs earlier in the |

391 | * uncompressed data that occurs again at the current location. |

392 | * |

393 | * - Literals, lengths, and the end-of-block code are combined into a single |

394 | * code of up to 286 symbols. They are 256 literals (0..255), 29 length |

395 | * symbols (257..285), and the end-of-block symbol (256). |

396 | * |

397 | * - There are 256 possible lengths (3..258), and so 29 symbols are not enough |

398 | * to represent all of those. Lengths 3..10 and 258 are in fact represented |

399 | * by just a length symbol. Lengths 11..257 are represented as a symbol and |

400 | * some number of extra bits that are added as an integer to the base length |

401 | * of the length symbol. The number of extra bits is determined by the base |

402 | * length symbol. These are in the static arrays below, lens[] for the base |

403 | * lengths and lext[] for the corresponding number of extra bits. |

404 | * |

405 | * - The reason that 258 gets its own symbol is that the longest length is used |

406 | * often in highly redundant files. Note that 258 can also be coded as the |

407 | * base value 227 plus the maximum extra value of 31. While a good deflate |

408 | * should never do this, it is not an error, and should be decoded properly. |

409 | * |

410 | * - If a length is decoded, including its extra bits if any, then it is |

411 | * followed a distance code. There are up to 30 distance symbols. Again |

412 | * there are many more possible distances (1..32768), so extra bits are added |

413 | * to a base value represented by the symbol. The distances 1..4 get their |

414 | * own symbol, but the rest require extra bits. The base distances and |

415 | * corresponding number of extra bits are below in the static arrays dist[] |

416 | * and dext[]. |

417 | * |

418 | * - Literal bytes are simply written to the output. A length/distance pair is |

419 | * an instruction to copy previously uncompressed bytes to the output. The |

420 | * copy is from distance bytes back in the output stream, copying for length |

421 | * bytes. |

422 | * |

423 | * - Distances pointing before the beginning of the output data are not |

424 | * permitted. |

425 | * |

426 | * - Overlapped copies, where the length is greater than the distance, are |

427 | * allowed and common. For example, a distance of one and a length of 258 |

428 | * simply copies the last byte 258 times. A distance of four and a length of |

429 | * twelve copies the last four bytes three times. A simple forward copy |

430 | * ignoring whether the length is greater than the distance or not implements |

431 | * this correctly. You should not use memcpy() since its behavior is not |

432 | * defined for overlapped arrays. You should not use memmove() or bcopy() |

433 | * since though their behavior -is- defined for overlapping arrays, it is |

434 | * defined to do the wrong thing in this case. |

435 | */ |

436 | local int codes(struct state *s, |

437 | const struct huffman *lencode, |

438 | const struct huffman *distcode) |

439 | { |

440 | int symbol; /* decoded symbol */ |

441 | int len; /* length for copy */ |

442 | unsigned dist; /* distance for copy */ |

443 | static const short lens[29] = { /* Size base for length codes 257..285 */ |

444 | 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, |

445 | 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258}; |

446 | static const short lext[29] = { /* Extra bits for length codes 257..285 */ |

447 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, |

448 | 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}; |

449 | static const short dists[30] = { /* Offset base for distance codes 0..29 */ |

450 | 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, |

451 | 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, |

452 | 8193, 12289, 16385, 24577}; |

453 | static const short dext[30] = { /* Extra bits for distance codes 0..29 */ |

454 | 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, |

455 | 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, |

456 | 12, 12, 13, 13}; |

457 | |

458 | /* decode literals and length/distance pairs */ |

459 | do { |

460 | symbol = decode(s, lencode); |

461 | if (symbol < 0) |

462 | return symbol; /* invalid symbol */ |

463 | if (symbol < 256) { /* literal: symbol is the byte */ |

464 | /* write out the literal */ |

465 | if (s->out != NIL) { |

466 | if (s->outcnt == s->outlen) |

467 | return 1; |

468 | s->out[s->outcnt] = symbol; |

469 | } |

470 | s->outcnt++; |

471 | } |

472 | else if (symbol > 256) { /* length */ |

473 | /* get and compute length */ |

474 | symbol -= 257; |

475 | if (symbol >= 29) |

476 | return -10; /* invalid fixed code */ |

477 | len = lens[symbol] + bits(s, lext[symbol]); |

478 | |

479 | /* get and check distance */ |

480 | symbol = decode(s, distcode); |

481 | if (symbol < 0) |

482 | return symbol; /* invalid symbol */ |

483 | dist = dists[symbol] + bits(s, dext[symbol]); |

484 | #ifndef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR |

485 | if (dist > s->outcnt) |

486 | return -11; /* distance too far back */ |

487 | #endif |

488 | |

489 | /* copy length bytes from distance bytes back */ |

490 | if (s->out != NIL) { |

491 | if (s->outcnt + len > s->outlen) |

492 | return 1; |

493 | while (len--) { |

494 | s->out[s->outcnt] = |

495 | #ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR |

496 | dist > s->outcnt ? |

497 | 0 : |

498 | #endif |

499 | s->out[s->outcnt - dist]; |

500 | s->outcnt++; |

501 | } |

502 | } |

503 | else |

504 | s->outcnt += len; |

505 | } |

506 | } while (symbol != 256); /* end of block symbol */ |

507 | |

508 | /* done with a valid fixed or dynamic block */ |

509 | return 0; |

510 | } |

511 | |

512 | /* |

513 | * Process a fixed codes block. |

514 | * |

515 | * Format notes: |

516 | * |

517 | * - This block type can be useful for compressing small amounts of data for |

518 | * which the size of the code descriptions in a dynamic block exceeds the |

519 | * benefit of custom codes for that block. For fixed codes, no bits are |

520 | * spent on code descriptions. Instead the code lengths for literal/length |

521 | * codes and distance codes are fixed. The specific lengths for each symbol |

522 | * can be seen in the "for" loops below. |

523 | * |

524 | * - The literal/length code is complete, but has two symbols that are invalid |

525 | * and should result in an error if received. This cannot be implemented |

526 | * simply as an incomplete code since those two symbols are in the "middle" |

527 | * of the code. They are eight bits long and the longest literal/length\ |

528 | * code is nine bits. Therefore the code must be constructed with those |

529 | * symbols, and the invalid symbols must be detected after decoding. |

530 | * |

531 | * - The fixed distance codes also have two invalid symbols that should result |

532 | * in an error if received. Since all of the distance codes are the same |

533 | * length, this can be implemented as an incomplete code. Then the invalid |

534 | * codes are detected while decoding. |

535 | */ |

536 | local int fixed(struct state *s) |

537 | { |

538 | static int virgin = 1; |

539 | static short lencnt[MAXBITS+1], lensym[FIXLCODES]; |

540 | static short distcnt[MAXBITS+1], distsym[MAXDCODES]; |

541 | static struct huffman lencode, distcode; |

542 | |

543 | /* build fixed huffman tables if first call (may not be thread safe) */ |

544 | if (virgin) { |

545 | int symbol; |

546 | short lengths[FIXLCODES]; |

547 | |

548 | /* construct lencode and distcode */ |

549 | lencode.count = lencnt; |

550 | lencode.symbol = lensym; |

551 | distcode.count = distcnt; |

552 | distcode.symbol = distsym; |

553 | |

554 | /* literal/length table */ |

555 | for (symbol = 0; symbol < 144; symbol++) |

556 | lengths[symbol] = 8; |

557 | for (; symbol < 256; symbol++) |

558 | lengths[symbol] = 9; |

559 | for (; symbol < 280; symbol++) |

560 | lengths[symbol] = 7; |

561 | for (; symbol < FIXLCODES; symbol++) |

562 | lengths[symbol] = 8; |

563 | construct(&lencode, lengths, FIXLCODES); |

564 | |

565 | /* distance table */ |

566 | for (symbol = 0; symbol < MAXDCODES; symbol++) |

567 | lengths[symbol] = 5; |

568 | construct(&distcode, lengths, MAXDCODES); |

569 | |

570 | /* do this just once */ |

571 | virgin = 0; |

572 | } |

573 | |

574 | /* decode data until end-of-block code */ |

575 | return codes(s, &lencode, &distcode); |

576 | } |

577 | |

578 | /* |

579 | * Process a dynamic codes block. |

580 | * |

581 | * Format notes: |

582 | * |

583 | * - A dynamic block starts with a description of the literal/length and |

584 | * distance codes for that block. New dynamic blocks allow the compressor to |

585 | * rapidly adapt to changing data with new codes optimized for that data. |

586 | * |

587 | * - The codes used by the deflate format are "canonical", which means that |

588 | * the actual bits of the codes are generated in an unambiguous way simply |

589 | * from the number of bits in each code. Therefore the code descriptions |

590 | * are simply a list of code lengths for each symbol. |

591 | * |

592 | * - The code lengths are stored in order for the symbols, so lengths are |

593 | * provided for each of the literal/length symbols, and for each of the |

594 | * distance symbols. |

595 | * |

596 | * - If a symbol is not used in the block, this is represented by a zero as |

597 | * as the code length. This does not mean a zero-length code, but rather |

598 | * that no code should be created for this symbol. There is no way in the |

599 | * deflate format to represent a zero-length code. |

600 | * |

601 | * - The maximum number of bits in a code is 15, so the possible lengths for |

602 | * any code are 1..15. |

603 | * |

604 | * - The fact that a length of zero is not permitted for a code has an |

605 | * interesting consequence. Normally if only one symbol is used for a given |

606 | * code, then in fact that code could be represented with zero bits. However |

607 | * in deflate, that code has to be at least one bit. So for example, if |

608 | * only a single distance base symbol appears in a block, then it will be |

609 | * represented by a single code of length one, in particular one 0 bit. This |

610 | * is an incomplete code, since if a 1 bit is received, it has no meaning, |

611 | * and should result in an error. So incomplete distance codes of one symbol |

612 | * should be permitted, and the receipt of invalid codes should be handled. |

613 | * |

614 | * - It is also possible to have a single literal/length code, but that code |

615 | * must be the end-of-block code, since every dynamic block has one. This |

616 | * is not the most efficient way to create an empty block (an empty fixed |

617 | * block is fewer bits), but it is allowed by the format. So incomplete |

618 | * literal/length codes of one symbol should also be permitted. |

619 | * |

620 | * - If there are only literal codes and no lengths, then there are no distance |

621 | * codes. This is represented by one distance code with zero bits. |

622 | * |

623 | * - The list of up to 286 length/literal lengths and up to 30 distance lengths |

624 | * are themselves compressed using Huffman codes and run-length encoding. In |

625 | * the list of code lengths, a 0 symbol means no code, a 1..15 symbol means |

626 | * that length, and the symbols 16, 17, and 18 are run-length instructions. |

627 | * Each of 16, 17, and 18 are followed by extra bits to define the length of |

628 | * the run. 16 copies the last length 3 to 6 times. 17 represents 3 to 10 |

629 | * zero lengths, and 18 represents 11 to 138 zero lengths. Unused symbols |

630 | * are common, hence the special coding for zero lengths. |

631 | * |

632 | * - The symbols for 0..18 are Huffman coded, and so that code must be |

633 | * described first. This is simply a sequence of up to 19 three-bit values |

634 | * representing no code (0) or the code length for that symbol (1..7). |

635 | * |

636 | * - A dynamic block starts with three fixed-size counts from which is computed |

637 | * the number of literal/length code lengths, the number of distance code |

638 | * lengths, and the number of code length code lengths (ok, you come up with |

639 | * a better name!) in the code descriptions. For the literal/length and |

640 | * distance codes, lengths after those provided are considered zero, i.e. no |

641 | * code. The code length code lengths are received in a permuted order (see |

642 | * the order[] array below) to make a short code length code length list more |

643 | * likely. As it turns out, very short and very long codes are less likely |

644 | * to be seen in a dynamic code description, hence what may appear initially |

645 | * to be a peculiar ordering. |

646 | * |

647 | * - Given the number of literal/length code lengths (nlen) and distance code |

648 | * lengths (ndist), then they are treated as one long list of nlen + ndist |

649 | * code lengths. Therefore run-length coding can and often does cross the |

650 | * boundary between the two sets of lengths. |

651 | * |

652 | * - So to summarize, the code description at the start of a dynamic block is |

653 | * three counts for the number of code lengths for the literal/length codes, |

654 | * the distance codes, and the code length codes. This is followed by the |

655 | * code length code lengths, three bits each. This is used to construct the |

656 | * code length code which is used to read the remainder of the lengths. Then |

657 | * the literal/length code lengths and distance lengths are read as a single |

658 | * set of lengths using the code length codes. Codes are constructed from |

659 | * the resulting two sets of lengths, and then finally you can start |

660 | * decoding actual compressed data in the block. |

661 | * |

662 | * - For reference, a "typical" size for the code description in a dynamic |

663 | * block is around 80 bytes. |

664 | */ |

665 | local int dynamic(struct state *s) |

666 | { |

667 | int nlen, ndist, ncode; /* number of lengths in descriptor */ |

668 | int index; /* index of lengths[] */ |

669 | int err; /* construct() return value */ |

670 | short lengths[MAXCODES]; /* descriptor code lengths */ |

671 | short lencnt[MAXBITS+1], lensym[MAXLCODES]; /* lencode memory */ |

672 | short distcnt[MAXBITS+1], distsym[MAXDCODES]; /* distcode memory */ |

673 | struct huffman lencode, distcode; /* length and distance codes */ |

674 | static const short order[19] = /* permutation of code length codes */ |

675 | {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; |

676 | |

677 | /* construct lencode and distcode */ |

678 | lencode.count = lencnt; |

679 | lencode.symbol = lensym; |

680 | distcode.count = distcnt; |

681 | distcode.symbol = distsym; |

682 | |

683 | /* get number of lengths in each table, check lengths */ |

684 | nlen = bits(s, 5) + 257; |

685 | ndist = bits(s, 5) + 1; |

686 | ncode = bits(s, 4) + 4; |

687 | if (nlen > MAXLCODES || ndist > MAXDCODES) |

688 | return -3; /* bad counts */ |

689 | |

690 | /* read code length code lengths (really), missing lengths are zero */ |

691 | for (index = 0; index < ncode; index++) |

692 | lengths[order[index]] = bits(s, 3); |

693 | for (; index < 19; index++) |

694 | lengths[order[index]] = 0; |

695 | |

696 | /* build huffman table for code lengths codes (use lencode temporarily) */ |

697 | err = construct(&lencode, lengths, 19); |

698 | if (err != 0) /* require complete code set here */ |

699 | return -4; |

700 | |

701 | /* read length/literal and distance code length tables */ |

702 | index = 0; |

703 | while (index < nlen + ndist) { |

704 | int symbol; /* decoded value */ |

705 | int len; /* last length to repeat */ |

706 | |

707 | symbol = decode(s, &lencode); |

708 | if (symbol < 0) |

709 | return symbol; /* invalid symbol */ |

710 | if (symbol < 16) /* length in 0..15 */ |

711 | lengths[index++] = symbol; |

712 | else { /* repeat instruction */ |

713 | len = 0; /* assume repeating zeros */ |

714 | if (symbol == 16) { /* repeat last length 3..6 times */ |

715 | if (index == 0) |

716 | return -5; /* no last length! */ |

717 | len = lengths[index - 1]; /* last length */ |

718 | symbol = 3 + bits(s, 2); |

719 | } |

720 | else if (symbol == 17) /* repeat zero 3..10 times */ |

721 | symbol = 3 + bits(s, 3); |

722 | else /* == 18, repeat zero 11..138 times */ |

723 | symbol = 11 + bits(s, 7); |

724 | if (index + symbol > nlen + ndist) |

725 | return -6; /* too many lengths! */ |

726 | while (symbol--) /* repeat last or zero symbol times */ |

727 | lengths[index++] = len; |

728 | } |

729 | } |

730 | |

731 | /* check for end-of-block code -- there better be one! */ |

732 | if (lengths[256] == 0) |

733 | return -9; |

734 | |

735 | /* build huffman table for literal/length codes */ |

736 | err = construct(&lencode, lengths, nlen); |

737 | if (err && (err < 0 || nlen != lencode.count[0] + lencode.count[1])) |

738 | return -7; /* incomplete code ok only for single length 1 code */ |

739 | |

740 | /* build huffman table for distance codes */ |

741 | err = construct(&distcode, lengths + nlen, ndist); |

742 | if (err && (err < 0 || ndist != distcode.count[0] + distcode.count[1])) |

743 | return -8; /* incomplete code ok only for single length 1 code */ |

744 | |

745 | /* decode data until end-of-block code */ |

746 | return codes(s, &lencode, &distcode); |

747 | } |

748 | |

749 | /* |

750 | * Inflate source to dest. On return, destlen and sourcelen are updated to the |

751 | * size of the uncompressed data and the size of the deflate data respectively. |

752 | * On success, the return value of puff() is zero. If there is an error in the |

753 | * source data, i.e. it is not in the deflate format, then a negative value is |

754 | * returned. If there is not enough input available or there is not enough |

755 | * output space, then a positive error is returned. In that case, destlen and |

756 | * sourcelen are not updated to facilitate retrying from the beginning with the |

757 | * provision of more input data or more output space. In the case of invalid |

758 | * inflate data (a negative error), the dest and source pointers are updated to |

759 | * facilitate the debugging of deflators. |

760 | * |

761 | * puff() also has a mode to determine the size of the uncompressed output with |

762 | * no output written. For this dest must be (unsigned char *)0. In this case, |

763 | * the input value of *destlen is ignored, and on return *destlen is set to the |

764 | * size of the uncompressed output. |

765 | * |

766 | * The return codes are: |

767 | * |

768 | * 2: available inflate data did not terminate |

769 | * 1: output space exhausted before completing inflate |

770 | * 0: successful inflate |

771 | * -1: invalid block type (type == 3) |

772 | * -2: stored block length did not match one's complement |

773 | * -3: dynamic block code description: too many length or distance codes |

774 | * -4: dynamic block code description: code lengths codes incomplete |

775 | * -5: dynamic block code description: repeat lengths with no first length |

776 | * -6: dynamic block code description: repeat more than specified lengths |

777 | * -7: dynamic block code description: invalid literal/length code lengths |

778 | * -8: dynamic block code description: invalid distance code lengths |

779 | * -9: dynamic block code description: missing end-of-block code |

780 | * -10: invalid literal/length or distance code in fixed or dynamic block |

781 | * -11: distance is too far back in fixed or dynamic block |

782 | * |

783 | * Format notes: |

784 | * |

785 | * - Three bits are read for each block to determine the kind of block and |

786 | * whether or not it is the last block. Then the block is decoded and the |

787 | * process repeated if it was not the last block. |

788 | * |

789 | * - The leftover bits in the last byte of the deflate data after the last |

790 | * block (if it was a fixed or dynamic block) are undefined and have no |

791 | * expected values to check. |

792 | */ |

793 | int puff(unsigned char *dest, /* pointer to destination pointer */ |

794 | unsigned long *destlen, /* amount of output space */ |

795 | const unsigned char *source, /* pointer to source data pointer */ |

796 | unsigned long *sourcelen) /* amount of input available */ |

797 | { |

798 | struct state s; /* input/output state */ |

799 | int last, type; /* block information */ |

800 | int err; /* return value */ |

801 | |

802 | /* initialize output state */ |

803 | s.out = dest; |

804 | s.outlen = *destlen; /* ignored if dest is NIL */ |

805 | s.outcnt = 0; |

806 | |

807 | /* initialize input state */ |

808 | s.in = source; |

809 | s.inlen = *sourcelen; |

810 | s.incnt = 0; |

811 | s.bitbuf = 0; |

812 | s.bitcnt = 0; |

813 | |

814 | /* return if bits() or decode() tries to read past available input */ |

815 | if (setjmp(s.env) != 0) /* if came back here via longjmp() */ |

816 | err = 2; /* then skip do-loop, return error */ |

817 | else { |

818 | /* process blocks until last block or error */ |

819 | do { |

820 | last = bits(&s, 1); /* one if last block */ |

821 | type = bits(&s, 2); /* block type 0..3 */ |

822 | err = type == 0 ? |

823 | stored(&s) : |

824 | (type == 1 ? |

825 | fixed(&s) : |

826 | (type == 2 ? |

827 | dynamic(&s) : |

828 | -1)); /* type == 3, invalid */ |

829 | if (err != 0) |

830 | break; /* return with error */ |

831 | } while (!last); |

832 | } |

833 | |

834 | /* update the lengths and return */ |

835 | if (err <= 0) { |

836 | *destlen = s.outcnt; |

837 | *sourcelen = s.incnt; |

838 | } |

839 | return err; |

840 | } |