tools/yaz0.py

#!/usr/bin/env python3
import os, sys, argparse

def read_file(name):
    file_data=[]

    try:
        with open(name, 'rb') as f:
            file_data = f.read()
    except IOError:
        print('failed to read file ' + name)
        sys.exit(2)
    return file_data


def write_file(name, file_data):
    try:
        with open(name, 'wb') as f:
            f.write(file_data)
    except IOError:
        print('failed to write file ' + name)
        sys.exit(2)


def yaz0_decompress(input):
    output = bytearray()

    return output


max_len = 0xFF + 0x12
def back_seach(input, size, start_pos):
    best_len = 1
    match_pos = 0
    search_pos = max(start_pos - 0x1000, 0)
    end_pos = min(size, start_pos + max_len)

    # Seach for substrings that are at least 3 bytes long (the smallest size resulting in a compressed chunk)
    token_end_pos = min(start_pos + 3, size)
    seatch_len = token_end_pos - start_pos
    token = input[start_pos:token_end_pos]

    while search_pos < start_pos:
        search_pos = input.find(token, search_pos, start_pos + seatch_len - 1)
        if search_pos == -1:
            break

        pos1 = search_pos + seatch_len
        pos2 = start_pos + seatch_len

        # Find how many more bytes match
        while pos2 < end_pos and input[pos1] == input[pos2]:
            pos1 += 1
            pos2 += 1

        found_len = pos2 - start_pos

        if found_len > best_len:
            best_len = found_len
            seatch_len = found_len
            match_pos = search_pos

            if best_len == max_len:
                break

            token_end_pos = start_pos + seatch_len
            token = input[start_pos:start_pos + seatch_len]

        search_pos += 1

    return best_len, match_pos


prev_flag = False
prev_len = 0
prev_pos = 0
def cached_encode(input, size, pos):
    global prev_flag
    global prev_len
    global prev_pos

    # If a previous search found that it was better to have an uncompressed byte, return the position and length that we already found
    if prev_flag:
        prev_flag = False
        return prev_len, prev_pos

    comp_len, comp_pos = back_seach(input, size, pos)

    # Check that it wouldn't be better to have an uncompressed byte then compressing the following data
    if comp_len >= 3:
        prev_len, prev_pos = back_seach(input, size, pos + 1)
        if prev_len >= comp_len + 2: # +2 to account for the uncompressed byte plus 1 more to see if it's better compression
            comp_len = 1
            prev_flag = True

    return comp_len, comp_pos


def write_yaz0_header(output, size):
    output += 'Yaz0'.encode()

    output.append((size & 0xFF000000) >> 24)
    output.append((size & 0x00FF0000) >> 16)
    output.append((size & 0x0000FF00) >> 8)
    output.append( size & 0x000000FF)

    output += '\0\0\0\0\0\0\0\0'.encode()


def yaz0_compress(input):
    output = bytearray()

    decompressed_size = len(input)

    write_yaz0_header(output, decompressed_size)

    curr_pos = 0
    chunk_bits = 0
    chunk_num_bits = 0
    chunk_data = bytearray()
    while curr_pos < decompressed_size:
        num_bytes, match_pos = cached_encode(input, decompressed_size, curr_pos)

        if num_bytes < 3:
            chunk_data.append(input[curr_pos])
            curr_pos += 1
            chunk_bits |= (0x80 >> chunk_num_bits)
        else:
            dist = curr_pos - match_pos - 1

            if num_bytes >= 0x12:
                chunk_data.append(dist >> 8)
                chunk_data.append(dist & 0xFF)
                chunk_data.append(num_bytes - 0x12)
            else:
                chunk_data.append(((num_bytes - 2) << 4) | (dist >> 8))
                chunk_data.append(dist & 0xFF)

            curr_pos += num_bytes

        chunk_num_bits += 1

        if chunk_num_bits == 8:
            output.append(chunk_bits)
            output += chunk_data

            chunk_bits = 0
            chunk_num_bits = 0
            chunk_data = bytearray()

    if chunk_num_bits > 0:
        output.append(chunk_bits)
        output += chunk_data

    output_size = len(output)
    output_padding_amount = ((output_size + 15) // 16) * 16 - output_size
    for i in range(output_padding_amount):
        output.append(0)

    return output


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='input file')
    parser.add_argument('output', help='output file')
    parser.add_argument('-d', '--decompress', help='decompress file, otherwise compress it', action='store_true', default=False)
    args = parser.parse_args()

    input_data = read_file(args.input)

    if args.decompress:
        output_data = yaz0_decompress(input_data)
    else:
        output_data = yaz0_compress(input_data)

    write_file(args.output, output_data)


if __name__ == "__main__":
    main(sys.argv[1:])