Skip to content

Support AppleDict Binary key texts (morphology) #406

@soshial

Description

@soshial

KeyText.data file

The KeyText.data file contains morphological word forms or spelling variants. We need this data to make dictionary articles searchable by their different word forms.

Solution

from struct import unpack
from typing import Dict

# Hard-coded location of the KeyText.data file that this script inspects.
filename = '/Users/soshial/Desktop/rus-pol.dictionary/Contents/KeyText.data'
# Opened in binary mode at module level; the `__main__` section below passes
# this handle `f` to the reader helpers.
f = open(filename, 'rb')


def read_int(f, address) -> int:
    """Return the signed 32-bit little-endian integer stored at ``address``.

    Args:
        f: Seekable binary file object.
        address: Absolute byte offset to seek to before reading.

    Returns:
        The decoded integer. The file position is left just past the 4 bytes.

    Raises:
        struct.error: If fewer than 4 bytes are available at ``address``.
    """
    f.seek(address)
    # '<' pins little-endian order and the standard 4-byte size, matching the
    # low-byte-first order used by the 2-byte reader in this file, instead of
    # depending on the byte order of the machine running the script.
    return unpack('<i', f.read(4))[0]


def read_int_here(f) -> int:
    """Return the signed 32-bit little-endian integer at the current position.

    Args:
        f: Binary file object positioned at the integer to read.

    Returns:
        The decoded integer. The file position advances by 4 bytes.

    Raises:
        struct.error: If fewer than 4 bytes remain in the file.
    """
    # '<' pins little-endian order and the standard 4-byte size, matching the
    # low-byte-first order used by the 2-byte reader in this file, instead of
    # depending on the byte order of the machine running the script.
    return unpack('<i', f.read(4))[0]


def read_x_bytes_as_word(f, x) -> str:
    """Read ``x`` bytes at the current position and decode them as UTF-16LE.

    Args:
        f: Binary file object positioned at the start of the text.
        x: Length of the text in bytes. Values of zero or below yield an
            empty string without touching the file.

    Returns:
        The decoded string. Surrogate pairs are combined into a single
        character; an unpaired surrogate is kept as-is.

    Raises:
        EOFError: If the file ends before the requested bytes are available.
    """
    if x <= 0:
        # Guard explicitly: a negative size would make f.read() consume the
        # rest of the file.
        return ''
    # Text is consumed in whole 2-byte code units, so an odd length is
    # rounded up to the next unit.
    byte_count = x + (x % 2)
    raw = f.read(byte_count)
    if len(raw) != byte_count:
        raise EOFError(
            f'expected {byte_count} bytes of text, got only {len(raw)}'
        )
    # 'surrogatepass' lets an unpaired surrogate code unit through instead of
    # failing the whole word, while valid pairs decode to one character.
    return raw.decode('utf-16-le', errors='surrogatepass')


def read_2_bytes(f, address) -> int:
    """Seek ``f`` to ``address`` and return the 2-byte value read there.

    The decoding itself is delegated to ``read_2_bytes_here``.
    """
    f.seek(address)
    value = read_2_bytes_here(f)
    return value


def read_2_bytes_here(f) -> int:
    """Return the unsigned 16-bit little-endian value at the current position.

    Args:
        f: Binary file object positioned at the value to read.

    Returns:
        An integer in the range 0..0xFFFF. The file position advances by
        2 bytes.

    Raises:
        EOFError: If fewer than 2 bytes remain in the file.
    """
    raw = f.read(2)
    if len(raw) != 2:
        # Report truncation explicitly rather than failing later on an
        # empty bytes object.
        raise EOFError(f'expected 2 bytes, got only {len(raw)}')
    # First byte is the low byte, second byte is the high byte.
    return int.from_bytes(raw, 'little')


def print_letter(f, address):
    """Print the 2-byte value found at ``address`` as hex and as a character.

    Each call emits one line holding the address, the value in hex, and
    ``chr`` of that value.
    """
    code_unit = read_2_bytes(f, address)
    as_hex = hex(code_unit)
    as_char = chr(code_unit)
    print(f'addr: {hex(address)} ', as_hex, '\t', as_char)


if __name__ == "__main__":
    # Walks the sections of KeyText.data, collects for every article id the
    # (searchable word form, displayed word form) pairs, and writes them to
    # ids.txt, one article id per line.
    file_header_len = 0x40
    word_by_id: Dict = {}

    # Debug aid: set to True to print the first 0x2f4 bytes as 2-byte values
    # and exit without parsing anything.
    dump_header_only = False

    try:
        if dump_header_only:
            for addr_start in range(0x0, 0x2f4, 2):
                print_letter(f, addr_start)
            raise SystemExit(0)

        # The 4 bytes right after the file header give the offset at which
        # the section walk stops.
        limit = read_int(f, file_header_len)
        print(f'limit {limit}')
        section_start = 0x44
        while f.tell() < limit:
            f.seek(section_start)
            # Distance to the next section: the stored value plus 4.
            jump = read_2_bytes_here(f) + 4
            zero1 = read_2_bytes_here(f)  # 0x00
            big_len = read_2_bytes_here(f)  # 0x2c
            zero2 = read_2_bytes_here(f)  # 0x00
            word_forms_number = read_2_bytes_here(f)  # 0x01
            # print(f'section_start {hex(section_start)} jump {hex(jump)}')
            # print('word_forms', word_forms_number)

            # Both fields were always zero in the inspected files; a non-zero
            # value means the layout assumption is wrong, so stop instead of
            # producing garbage.
            if zero1 != 0:
                raise SystemExit(
                    f'zero1 is {hex(zero1)} in section at {hex(section_start)}'
                )
            if zero2 != 0:
                raise SystemExit(
                    f'zero2 is {hex(zero2)} in section at {hex(section_start)}'
                )

            for word_form_n in range(word_forms_number):
                # EXAMPLE: <d:index d:value="made" d:title="made (make)"/>
                # If the entry for "make" contains these <d:index> definitions, the entry can be searched not only by "make" but also by "makes" or "made".
                # On the search result list, title value texts like "made" are displayed.
                # EXAMPLE: <d:index d:value="make it" d:title="make it" d:parental-control="1" d:anchor="xpointer(//*[@id='make_it'])"/>
                # EXAMPLE: <d:index d:value="工夫する" d:title="工夫する" d:yomi="くふうする" d:anchor="xpointer(//*[@id='kufuu-suru'])" />

                zero3 = read_2_bytes_here(f)  # 0x00
                small_len = read_2_bytes_here(f)  # 0x2c
                # read 4 bytes and then add file header length (0x40)
                # the resulting number must match with Contents/Body.data address of the entry
                word_form_id = read_int_here(f) + file_header_len
                priority_and_parental_control = read_2_bytes_here(f)  # 0x13
                # d:priority between 0x00..0x12, priority = [0..9];
                # integer division keeps it an int.
                priority = priority_and_parental_control // 2
                parental_control = priority_and_parental_control % 2  # d:parental-control
                form1_len = read_2_bytes_here(f)  # 0xxc

                # dictionary article will be searchable by this word form
                # for example: flavoured, colouring, behaviours
                wordform_search_by = read_x_bytes_as_word(f, form1_len)
                form2_len = read_2_bytes_here(f)  # 0xxc

                # this word form will be shown as correct spelling
                # for example if dictionary is for American English all British words will be shown in American spelling
                # flavoured -> flavored, colouring -> coloring, behaviours -> behaviors
                wordform_shown = read_x_bytes_as_word(f, form2_len)

                # NB! here also goes form3_len, form3, anchor_len, d:anchor and ends with 0x00
                # NOTE(review): the trailing 0x00 is only consumed when a
                # fourth form is present; confirm against real files. The
                # outer loop re-seeks to section_start, so only records
                # later in the same section could be affected.
                form3_len = read_2_bytes_here(f)  # 0x00 or byte length of the 3rd form
                if form3_len != 0:
                    form3 = read_x_bytes_as_word(f, form3_len)
                    form4_len = read_2_bytes_here(f)  # 0x00 or byte length of the 4th form
                    if form4_len != 0:
                        form4 = read_x_bytes_as_word(f, form4_len)  # d:anchor
                        zero5 = read_2_bytes_here(f)  # 0x00 should be zero

                word_by_id.setdefault(word_form_id, set()).add(
                    (wordform_search_by, wordform_shown)
                )

                # print(
                #     f"        word_form_id {hex(word_form_id)}",
                #     f"word1 {wordform_search_by}",
                #     f"word2 {wordform_shown}"
                # )

            section_start += jump
    finally:
        # The module-level handle is not needed past this point.
        f.close()

    # Word forms may contain non-ASCII text, so the encoding is fixed rather
    # than left to the platform default.
    with open("ids.txt", 'w', encoding='utf-8') as fid:
        for k, v in word_by_id.items():
            fid.write(f'{hex(k)}\t{v}\n')

EntryID.data file

EntryID.data is a database of all dictionary entries together with their address offsets in the Body.data file.

Solution

from struct import unpack
from typing import Dict, List

# Alternative input (currently disabled): EntryID.data of a custom dictionary.
# filename = '/Users/soshial/Desktop/rus-pol.dictionary/Contents/EntryID.data'
# Active input: EntryID.data of the system "Oxford Dictionary of English" asset.
filename = '/System/Library/AssetsV2/com_apple_MobileAsset_DictionaryServices_dictionaryOSX/7d335a455a4827b5e26f66e6ac5221bab87ab3a5.asset/AssetData/Oxford Dictionary of English.dictionary/Contents/Resources/EntryID.data'
# Opened in binary mode at module level; the `__main__` section below passes
# this handle `f` to the reader helpers.
f = open(filename, 'rb')


def read_int(f, address) -> int:
    """Return the signed 32-bit little-endian integer stored at ``address``.

    Args:
        f: Seekable binary file object.
        address: Absolute byte offset to seek to before reading.

    Returns:
        The decoded integer. The file position is left just past the 4 bytes.

    Raises:
        struct.error: If fewer than 4 bytes are available at ``address``.
    """
    f.seek(address)
    # '<' pins little-endian order and the standard 4-byte size, matching the
    # low-byte-first order used by the 2-byte reader in this file, instead of
    # depending on the byte order of the machine running the script.
    return unpack('<i', f.read(4))[0]


def read_int_here(f) -> int:
    """Return the signed 32-bit little-endian integer at the current position.

    Args:
        f: Binary file object positioned at the integer to read.

    Returns:
        The decoded integer. The file position advances by 4 bytes.

    Raises:
        struct.error: If fewer than 4 bytes remain in the file.
    """
    # '<' pins little-endian order and the standard 4-byte size, matching the
    # low-byte-first order used by the 2-byte reader in this file, instead of
    # depending on the byte order of the machine running the script.
    return unpack('<i', f.read(4))[0]


def read_x_bytes_as_word(f, x) -> str:
    """Read ``x`` bytes at the current position and decode them as UTF-16LE.

    Args:
        f: Binary file object positioned at the start of the text.
        x: Length of the text in bytes. Values of zero or below yield an
            empty string without touching the file.

    Returns:
        The decoded string. Surrogate pairs are combined into a single
        character; an unpaired surrogate is kept as-is.

    Raises:
        EOFError: If the file ends before the requested bytes are available.
    """
    if x <= 0:
        # Guard explicitly: a negative size would make f.read() consume the
        # rest of the file.
        return ''
    # Text is consumed in whole 2-byte code units, so an odd length is
    # rounded up to the next unit.
    byte_count = x + (x % 2)
    raw = f.read(byte_count)
    if len(raw) != byte_count:
        raise EOFError(
            f'expected {byte_count} bytes of text, got only {len(raw)}'
        )
    # 'surrogatepass' lets an unpaired surrogate code unit through instead of
    # failing the whole word, while valid pairs decode to one character.
    return raw.decode('utf-16-le', errors='surrogatepass')


def read_2_bytes(f, address) -> int:
    """Seek ``f`` to ``address`` and return the 2-byte value read there.

    The decoding itself is delegated to ``read_2_bytes_here``.
    """
    f.seek(address)
    value = read_2_bytes_here(f)
    return value


def read_2_bytes_here(f) -> int:
    """Return the unsigned 16-bit little-endian value at the current position.

    Args:
        f: Binary file object positioned at the value to read.

    Returns:
        An integer in the range 0..0xFFFF. The file position advances by
        2 bytes.

    Raises:
        EOFError: If fewer than 2 bytes remain in the file.
    """
    raw = f.read(2)
    if len(raw) != 2:
        # Report truncation explicitly rather than failing later on an
        # empty bytes object.
        raise EOFError(f'expected 2 bytes, got only {len(raw)}')
    # First byte is the low byte, second byte is the high byte.
    return int.from_bytes(raw, 'little')


def print_letter(f, address):
    """Print the 2-byte value found at ``address`` as hex and as a character.

    Each call emits one line holding the address, the value in hex, and
    ``chr`` of that value.
    """
    code_unit = read_2_bytes(f, address)
    as_hex = hex(code_unit)
    as_char = chr(code_unit)
    print(f'addr: {hex(address)} ', as_hex, '\t', as_char)


def pretty_print(cnt: int, dt: List[int]):
    """Print record number ``cnt`` followed by the values of ``dt`` in hex.

    The values are rendered with ``hex`` and separated by ", " inside a pair
    of square brackets.
    """
    hex_values = [hex(value) for value in dt]
    joined = ", ".join(hex_values)
    print(f'{cnt}: [', joined, ']')


if __name__ == "__main__":
    # Dumps the start of EntryID.data, then walks its records, derives one
    # Body.data address offset per record, and writes the sorted offsets to
    # entry_ids.txt.
    is_custom = False
    file_header_len = 0x44 if is_custom else 0x60

    cnt = 0
    all_ids = []
    try:
        # Debug aid: show the first 0xf4 bytes as 2-byte values.
        for addr_start in range(0x0, 0xf4, 2):
            print_letter(f, addr_start)

        f.seek(file_header_len)
        while True:
            addr = f.tell()
            raw_len = f.read(4)
            if len(raw_len) < 4:
                # End of file: stop reading so the collected ids still get
                # written out below.
                break
            first_int = unpack('<i', raw_len)[0]  # 1200 0000
            # The first integer, halved, is the number of 2-byte values that
            # follow it in the record.
            data_len = first_int // 2
            data = [first_int]
            data.extend(read_2_bytes_here(f) for _ in range(data_len))
            if len(data) < 3:
                break
            if len(data) < 8:
                # The offset below is built from data[6] and data[7].
                raise ValueError(
                    f'record at {hex(addr)} has only {len(data)} values, '
                    f'expected at least 8'
                )
            # article address offset in Body.data file
            # formatting type A: 8 values
            # no sections, each entry is compressed separately
            article_address_offset = data[6] + data[7] * 0x10000 + file_header_len
            # formatting type B: 10 values
            # Oxford English dictionary with 114151 entries is split into 777 sections with length ~ 0x8000 bytes
            # to get a specific entry we need to decompress the whole section;
            # the offset inside the uncompressed text is then equal to data[6]
            # article_address_offset = data[6] + data[8] + data[9] * 0x10000 + file_header_len
            if cnt > 19600:  # article_address_id == 0x40:
                pretty_print(cnt, data)
            # if article_address_id == 0x40: quit()
            # if cnt > 100: quit()
            all_ids.append(article_address_offset)
            cnt += 1
    finally:
        # The module-level handle is not needed past this point.
        f.close()

    with open('entry_ids.txt', 'w') as output_file:
        for i in sorted(all_ids):
            output_file.write(hex(i) + '\n')
    print(f'{cnt} processed')

Sample file(s)
https://mega.nz/folder/elwgBDzC#qLRuACGNkXqyiEq7oeG8Nw (KeyText.data)

Metadata

Metadata

Assignees

No one assigned

    Labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions