Data, in particular image data, is compressed in the
Suikoden II data files by what appears to be a variant of LZ77 compression. Character portraits, sprite sheets, and other image data are all stored in compressed buffers within the BIN files. Most of the other data on the disc is stored, uncompressed, in the same files. This includes text, game and scenario scripts, NPC and enemy data, item data, and on an on.
Compression Details
After compressed data is located (how will not be discussed), it can be decoded using the following steps.
- Compression works on a rotating, 1,024 byte window. The first 990 bytes of it are initialized to zero, and the window is updated starting at the first, uninitialized byte.
- Within the BIN files, the first 32-bits of a compressed buffer is the size of the compressed data.
- The next 8-bits is a flag that can either be 0 or 1. If it is zero, then the data is compressed as described here. It is unknown as of this writing what a 1 represents.
- The following byte is the descriptor byte of the first, compressed unit. Compressed units could be exceedingly large, under some circumstances, but a typical unit will be between 9 and 17 bytes (including its descriptor).
- The descriptor is a bit-mask that describes how each of the packets within the unit was compressed. "Packets" are generally a single byte of raw, uncompressed data, or some sort of length-distance pair, i.e. a typical packet is either one or two bytes. However, it is possible for a packet to contain as many as 72 bytes.
- If the mask bit is 0, the packet it describes is a single, uncompressed byte to be copied directly into the uncompressed output.
- If the mask bit is 1, then the packet is somehow compressed. How is indicated by the uppermost two bits of the first byte in the packet.
- If the compression type is 0 or 1 (bits 00 or 01), then this byte (length) and the next (distance) contain a length-distance pair. The length is actually encoded in bits 2 through 7 (upper six) of the length byte. The two least-significant bits of the length are the most significant bits of the distance. So the final value of distance is given by (length AND 3) OR distance, and the final value of length is (length RIGHT-SHIFT 2) + 3. The threshold for this type of match is 3, so when the length is encoded, 3 is subtracted from it.
- If the compression type is 2 (bits 10) then the current byte is itself a length-distance pair. The distance is actually computed by subtracting it from the current write offset into the window. The offset can be a a minimum of -15 to maximum of 0, so the actual window for this compression is incredibly narrow. The threshold for this match is 2, since it can encode a 2-byte match for an actual gain. When it is encoded, 2 is subtracted from the length it is left-shifted 4, and ORed with 0x80. Then the distance/offset is ORed with that. 0x80 OR (length - 2 LEFT-SHIFT 4) OR distance
- If the compression type is 3 (bits 11) then the current byte (excluding the type bits) is a count of raw, uncompressed bytes to be copied to the output. The threshold for this type of non-match is 8, so when encoded, 8 is subtracted from the count. Between 8 and 71 bytes that follow are raw, uncompressed bytes. Note: I have never seen this in the compressed data. The images in Suikoden II are typically 4-bits per-pixel, and the scan-lines will tend to be repetitive. A run of more than 8 bytes matching nothing in the sliding window is fairly rare.
Decompression Algorithm
Below is the decompression algorithm. It's located in the game's main executable (at 0x80075F58 in SLUS_009.58 for the North American version). This is reverse-engineered from the assembly, with a few modifications. One thing that was not changed is the way loops are counted for the length. If you're comparing to the descriptions above, the values added to lengths in this code are 1 shy because the loops do post-fixed checks for greater-equal. The lengths are correct, even if they're potentially confusing.
//deviating a little bit here. Just for convenience, have it return the uncompressed size.
//Konami hard-coded sizes, and didn't store them with the compressed data in most cases.
unsigned int decompress (unsigned int size, unsigned char *pCompressed, unsigned char *pUncompressed) {
unsigned char window[0x400];
unsigned char *pCompEnd = pCompressed + size;
unsigned char *pSave = pUncompressed;
unsigned int offset = 0x3DE;
memset(window, 0, offset); //was a loop in the game.
while(pCompressed < pCompEnd) {
unsigned int proc = *pCompressed | 0xFF00;
pCompressed++;
do {
if(!(proc & 1)) { //first byte of each segment is a set of flags. 1 if compressed, 0 if not
*pUncompressed = *pCompressed; // not compressed, just emit the next byte in compressed buffer
window[offset] = *pCompressed;
offset++;
pUncompressed++;
pCompressed++;
if(pCompressed >= pCompEnd) {
return pUncompressed - pSave - 1;
}
if(offset >= 0x400) offset = 0;
} else { //compressed. next byte's upper two determine how.
char length = *pCompressed;
unsigned int distance;
int count;
pCompressed++;
if((unsigned char)length < 0x80) { //this byte and the next are a length-distance pair
distance = *pCompressed | ((length & 3) << 8);
pCompressed++;
length = (length >> 2) + 2;
count = 0;
if(length >= 0) {
do {
unsigned char *match_window = window + ((distance + count) & 0x3FF);
*pUncompressed = *match_window;
window[offset] = *match_window;
offset = (offset + 1) & 0x3FF;
pUncompressed++;
count++;
} while(length >= count);
}
} else if ((unsigned char)length < 0xC0) { //this byte alone stores length + offset
unsigned int tmpOff = length & 0xF;
length = ((length & 0x30) >> 4) + 1;
if(length >= 0) {
count = 0;
do {
unsigned char* tmpframe = window + ((offset - tmpOff) & 0x3FF);
*pUncompressed = *tmpframe;
window[offset] = *tmpframe;
offset = (offset + 1) & 0x3FF;
pUncompressed++;
count++;
} while(length >= count);
}
} else { //not actually compressed. run of 8 or more uncompressed bytes
length = (length & 0x3F) + 7;
if(length >= 0) {
count = 0;
do {
*pUncompressed = *pCompressed;
window[offset] = *pCompressed;
pCompressed++;
pUncompressed++;
offset = (offset + 1) & 0x3FF;
count++;
} while(length >= count);
}
}
}
proc >>= 1;
} while(proc & 0x100);
}
return (pUncompressed - pSave) - 1;
}
Compression Algorithm
Below is a possible compression algorithm. It does not provide for the multiple-byte non-match (compression type 3), mostly because it's a pain and doesn't seem strictly necessary. I believe Konami's original compression worked backwards from the current position of the window's write pointer. This would tend to bias the results toward the type 2 matches, which take up one less byte for any match between 3 and 5 bytes in length. I had already settled on working forward through the window when I realized this. I may experiment with this later on, as I believe it accounts for the small bit of relative inefficiency in this algorithm's compression.
When compressing an 8,192 byte file that occupies 2,346 bytes in the game data, this method produces a file 2,349 bytes in length. Decompressing yields a file byte-for-byte identical to the original, so it's a minor inefficiency, rather than error.
unsigned int compress(unsigned int size, unsigned char *pUncompressed, unsigned char *pCompressed) {
unsigned char window[WINDOW_SIZE], *pEnd = pUncompressed + size;
unsigned char *pStart = pUncompressed, *pFlag = pCompressed;
unsigned int offSearch = 0x3BC, offWrite = 0x3DE, lasthit, units, unit_bytes, comp_size = 1; //uncpos;
Location matchpos;
memset(window, 0, 0x3DE);
units = unit_bytes = *pFlag = 0;
comp_size = 1;
while(pUncompressed < pEnd) {
register int i, j, k;
matchpos.len = 1;
matchpos.pos = 0;
for(i = 0; i < 0x400; i++) {
j = (offSearch + i) & 0x3FF;
k = 0;
do {
if(j >= offWrite && j <= ((offWrite + 0x41) & 0x3FF))
break;
if( pUncompressed[k] != window[j] )
break;
if((++j) >= WINDOW_SIZE) j = 0;
k++;
} while (k < MAX_MATCH_SIZE && (pUncompressed + k) < pEnd);
if(k > matchpos.len) {
matchpos.len = k;
matchpos.pos = (offSearch + i) & 0x3FF;
if( k > MAX_MATCH_SIZE || (pUncompressed + k) >= pEnd)
break;
}
}
if(matchpos.len < THRESHOLD) {
if(matchpos.len > 1 && (offWrite - matchpos.pos) < 0xF && (offWrite - matchpos.pos) > 0) {
*pFlag |= 1 << units;
pCompressed[comp_size] = 0x80 | ((matchpos.len - 2) << 4) | (offWrite - matchpos.pos);
} else {
pCompressed[comp_size] = *pUncompressed;
matchpos.len = 1;
}
comp_size++;
unit_bytes++;
units++;
} else {
if(matchpos.len <= 5 && (offWrite - matchpos.pos) < 0xF && (offWrite - matchpos.pos) > 0) {
*pFlag |= 1 << units;
pCompressed[comp_size] = 0x80 | ((matchpos.len - 2) << 4) | (offWrite - matchpos.pos);
comp_size++;
unit_bytes++;
} else {
*pFlag |= 1 << units;
pCompressed[comp_size] = ((matchpos.len - 3) << 2) | (matchpos.pos >> 8);
pCompressed[comp_size + 1] = matchpos.pos & 0xFF;
comp_size += 2;
unit_bytes += 2;
}
units++;
lasthit = matchpos.pos;
offSearch = offWrite;
}
if(units == 8) {
pFlag = &pCompressed[comp_size];
*pFlag = 0;
comp_size++;
units = unit_bytes = 0;
}
for(i = 0; i < matchpos.len; i++) {
j = (offWrite + i) & 0x3FF;
window[j] = pUncompressed[i];
}
offWrite = (offWrite + matchpos.len) & 0x3FF;
pUncompressed += matchpos.len;
}
return comp_size;
}