Support AppleDict Binary key texts (morphology)

### `KeyText.data` file
The `KeyText.data` file contains morphological word forms or spelling variants. We need this data to make dictionary articles searchable by their different word forms.

**Solution**

```python

from struct import unpack
from typing import Dict

# Path of the KeyText.data file to inspect (hard-coded; edit before running).
filename = '/Users/soshial/Desktop/rus-pol.dictionary/Contents/KeyText.data'
# Opened in binary mode at module level; the script body under
# `if __name__ == "__main__"` passes this handle to the read helpers as `f`.
f = open(filename, 'rb')


def read_int(f, address) -> int:
    """Return the signed 32-bit little-endian integer stored at ``address``.

    Seeks ``f`` to the absolute offset ``address`` and leaves the file
    position just past the four bytes that were read.

    The byte order is pinned to little-endian (``'<i'``) instead of the
    native order, so the result no longer depends on the machine running the
    script and agrees with how ``read_2_bytes_here`` decodes the same file.

    Raises:
        struct.error: if fewer than four bytes are available at ``address``.
    """
    f.seek(address)
    return unpack('<i', f.read(4))[0]


def read_int_here(f) -> int:
    """Return the signed 32-bit little-endian integer at the current position.

    Advances the file position of ``f`` by four bytes.

    The byte order is pinned to little-endian (``'<i'``) instead of the
    native order, so the result no longer depends on the machine running the
    script and agrees with how ``read_2_bytes_here`` decodes the same file.

    Raises:
        struct.error: if fewer than four bytes remain in ``f``.
    """
    return unpack('<i', f.read(4))[0]


def read_x_bytes_as_word(f, x) -> str:
    """Read ``x`` bytes at the current position and decode them as UTF-16-LE.

    The bytes are fetched with a single read and decoded in one step, so a
    surrogate pair becomes one character instead of two lone surrogates, and
    long words are not built by repeated string concatenation.

    Args:
        f: binary file object positioned at the first byte of the text.
        x: length of the text in bytes. An odd value is rounded up to a whole
            2-byte unit; zero or a negative value yields an empty string
            without reading anything.

    Returns:
        The decoded text.

    Raises:
        EOFError: if the file ends before the requested bytes are available.
    """
    if x <= 0:
        return ''
    byte_count = x + (x % 2)
    raw = f.read(byte_count)
    if len(raw) != byte_count:
        raise EOFError(
            f'expected {byte_count} bytes of UTF-16 text, got {len(raw)}'
        )
    # 'surrogatepass' keeps unpaired surrogates instead of failing on them.
    return raw.decode('utf-16-le', errors='surrogatepass')


def read_2_bytes(f, address) -> int:
    """Jump to the absolute offset ``address`` and return the 16-bit value there.

    The decoding itself is delegated to ``read_2_bytes_here``; afterwards the
    file position is two bytes past ``address``.
    """
    f.seek(address)
    value = read_2_bytes_here(f)
    return value


def read_2_bytes_here(f) -> int:
    """Return the unsigned 16-bit little-endian value at the current position.

    Two single-byte reads are issued; the first byte is the low half and the
    second byte the high half of the result.
    """
    first = f.read(1)
    second = f.read(1)
    return (ord(second) << 8) + ord(first)


def print_letter(f, address):
    """Print the 16-bit value stored at ``address`` as hex and as a character.

    One line is written to stdout: the address, the value in hexadecimal, and
    the character with that code point.
    """
    code_unit = read_2_bytes(f, address)
    location = hex(address)
    print(f'addr: {location} ', hex(code_unit), '\t', chr(code_unit))


if __name__ == "__main__":
    # Parse KeyText.data and write, for every entry address, the set of
    # (searchable word form, displayed word form) pairs to ids.txt.

    # Flip to True to dump the start of the file (one 16-bit unit per line)
    # and stop without parsing; handy while reverse-engineering the header.
    # This dump used to run unconditionally and was followed by quit(), which
    # made the parser below unreachable.
    dump_header_only = False
    file_header_len = 0x40
    # entry address -> set of (search form, displayed form) tuples
    word_by_id: Dict = {}

    with f:  # make sure the module-level handle is closed when we are done
        if dump_header_only:
            for addr_start in range(0x0, 0x2f4, 2):
                print_letter(f, addr_start)
            raise SystemExit(0)

        limit = read_int(f, file_header_len)
        print(f'limit {limit}')
        section_start = 0x44
        while f.tell() < limit:
            f.seek(section_start)
            # distance from this section's start to the next section
            jump = read_2_bytes_here(f) + 4
            zero1 = read_2_bytes_here(f)  # 0x00
            big_len = read_2_bytes_here(f)  # 0x2c
            zero2 = read_2_bytes_here(f)  # 0x00
            word_forms_number = read_2_bytes_here(f)  # 0x01
            # Both fields are expected to be zero; anything else means the
            # layout assumptions are wrong, so stop with a non-zero status.
            if zero1 != 0:
                raise SystemExit(
                    f'zero1: expected 0, got {hex(zero1)} '
                    f'in section at {hex(section_start)}'
                )
            if zero2 != 0:
                raise SystemExit(
                    f'zero2: expected 0, got {hex(zero2)} '
                    f'in section at {hex(section_start)}'
                )
            for word_form_n in range(word_forms_number):
                # EXAMPLE: <d:index d:value="made" d:title="made (make)"/>
                # If the entry for "make" contains these <d:index> definitions, the entry can be searched not only by "make" but also by "makes" or "made".
                # On the search result list, title value texts like "made" are displayed.
                # EXAMPLE: <d:index d:value="make it" d:title="make it" d:parental-control="1" d:anchor="xpointer(//*[@id='make_it'])"/>
                # EXAMPLE: <d:index d:value="工夫する" d:title="工夫する" d:yomi="くふうする" d:anchor="xpointer(//*[@id='kufuu-suru'])" />

                zero3 = read_2_bytes_here(f)  # 0x00
                small_len = read_2_bytes_here(f)  # 0x2c
                # read 4 bytes and then add file header length (0x40)
                # the resulting number must match with Contents/Body.data address of the entry
                word_form_id = read_int_here(f) + file_header_len
                priority_and_parental_control = read_2_bytes_here(f)  # 0x13
                # d:priority between 0x00..0x12, priority = [0..9]; integer
                # division keeps it an int (plain `/` produced a float)
                priority = priority_and_parental_control // 2
                parental_control = priority_and_parental_control % 2  # d:parental-control
                form1_len = read_2_bytes_here(f)  # 0xxc

                # dictionary article will be searchable by this word form
                # for example: flavoured, colouring, behaviours
                wordform_search_by = read_x_bytes_as_word(f, form1_len)
                form2_len = read_2_bytes_here(f)  # 0xxc

                # this word form will be shown as correct spelling
                # for example if dictionary is for American English all British words will be shown in American spelling
                # flavoured -> flavored, colouring -> coloring, behaviours -> behaviors
                wordform_shown = read_x_bytes_as_word(f, form2_len)

                # NB! here also goes form3_len, form3, anchor_len, d:anchor and ends with 0x00
                # These fields are read only to advance the file position to
                # the next word form; their values are not stored.
                form3_len = read_2_bytes_here(f)  # 0x00 or byte length of the 3rd form
                if form3_len != 0:
                    form3 = read_x_bytes_as_word(f, form3_len)
                    form4_len = read_2_bytes_here(f)  # 0x00 or byte length of the 4th form
                    if form4_len != 0:
                        form4 = read_x_bytes_as_word(f, form4_len)  # d:anchor
                        zero5 = read_2_bytes_here(f)  # 0x00 should be zero

                word_by_id.setdefault(word_form_id, set()).add(
                    (wordform_search_by, wordform_shown)
                )

            section_start += jump

    # Word forms are frequently non-ASCII, so the encoding is fixed rather
    # than left to the platform default. The ids are written as plain hex:
    # the former leading '$' was a stray character inside the f-string.
    with open("ids.txt", 'w', encoding='utf-8') as fid:
        for k, v in word_by_id.items():
            fid.write(f'{hex(k)}\t{v}\n')

```

### `EntryID.data` file
`EntryID.data` is a database of all dictionary entries together with their address offsets in the `Body.data` file.

**Solution**
```python

from struct import unpack
from typing import Dict, List

# Alternative input kept for reference: EntryID.data of another dictionary bundle.
# filename = '/Users/soshial/Desktop/rus-pol.dictionary/Contents/EntryID.data'
# Path of the EntryID.data file to inspect (hard-coded; edit before running).
filename = '/System/Library/AssetsV2/com_apple_MobileAsset_DictionaryServices_dictionaryOSX/7d335a455a4827b5e26f66e6ac5221bab87ab3a5.asset/AssetData/Oxford Dictionary of English.dictionary/Contents/Resources/EntryID.data'
# Opened in binary mode at module level; the script body under
# `if __name__ == "__main__"` passes this handle to the read helpers as `f`.
f = open(filename, 'rb')


def read_int(f, address) -> int:
    """Return the signed 32-bit little-endian integer stored at ``address``.

    Seeks ``f`` to the absolute offset ``address``; afterwards the file
    position is four bytes past ``address``.

    The format string is ``'<i'`` rather than the native-order ``'i'`` so the
    value is decoded the same way on every host, matching the little-endian
    decoding done by ``read_2_bytes_here``.

    Raises:
        struct.error: if fewer than four bytes are available at ``address``.
    """
    f.seek(address)
    return unpack('<i', f.read(4))[0]


def read_int_here(f) -> int:
    """Return the signed 32-bit little-endian integer at the current position.

    Consumes four bytes from ``f``.

    The format string is ``'<i'`` rather than the native-order ``'i'`` so the
    value is decoded the same way on every host, matching the little-endian
    decoding done by ``read_2_bytes_here``.

    Raises:
        struct.error: if fewer than four bytes remain in ``f``.
    """
    return unpack('<i', f.read(4))[0]


def read_x_bytes_as_word(f, x) -> str:
    """Read ``x`` bytes at the current position and decode them as UTF-16-LE.

    All bytes are fetched in one read and decoded together, so characters
    stored as a surrogate pair come back as a single character rather than
    two lone surrogates, and no per-character string concatenation is done.

    Args:
        f: binary file object positioned at the first byte of the text.
        x: length of the text in bytes. An odd value is rounded up to a whole
            2-byte unit; zero or a negative value yields an empty string
            without reading anything.

    Returns:
        The decoded text.

    Raises:
        EOFError: if the file ends before the requested bytes are available.
    """
    if x <= 0:
        return ''
    byte_count = x + (x % 2)
    raw = f.read(byte_count)
    if len(raw) != byte_count:
        raise EOFError(
            f'expected {byte_count} bytes of UTF-16 text, got {len(raw)}'
        )
    # 'surrogatepass' keeps unpaired surrogates instead of failing on them.
    return raw.decode('utf-16-le', errors='surrogatepass')


def read_2_bytes(f, address) -> int:
    """Return the 16-bit value found at the absolute offset ``address``.

    Positions ``f`` at ``address`` and lets ``read_2_bytes_here`` decode the
    two bytes, which leaves the file position at ``address + 2``.
    """
    f.seek(address)
    result = read_2_bytes_here(f)
    return result


def read_2_bytes_here(f) -> int:
    """Decode the next two bytes of ``f`` as an unsigned little-endian value.

    The bytes are read one at a time: the first is the low half of the
    result, the second is the high half.
    """
    low = f.read(1)
    high = f.read(1)
    return (ord(high) << 8) + ord(low)


def print_letter(f, address):
    """Show the 16-bit value at ``address`` both as hex and as a character.

    Writes a single line to stdout containing the address, the value in
    hexadecimal, and the character with that code point.
    """
    value = read_2_bytes(f, address)
    where = hex(address)
    print(f'addr: {where} ', hex(value), '\t', chr(value))


def pretty_print(cnt: int, dt: List[int]):
    """Print record number ``cnt`` followed by its values in hexadecimal.

    The values of ``dt`` are rendered with ``hex`` and separated by commas
    inside square brackets.
    """
    hex_values = ", ".join(map(hex, dt))
    print(f'{cnt}: [', hex_values, ']')


if __name__ == "__main__":
    # Walk the records of EntryID.data, collect one address offset per record
    # and write the sorted offsets to entry_ids.txt.

    # The header length differs between the two kinds of file handled here
    # (presumably user-built vs. system dictionaries — TODO confirm).
    is_custom = False
    file_header_len = 0x44 if is_custom else 0x60

    cnt = 0
    all_ids = []
    with f:  # make sure the module-level handle is closed when we are done
        # Dump the start of the file, one 16-bit unit per line.
        for addr_start in range(0x0, 0xf4, 2):
            print_letter(f, addr_start)

        f.seek(file_header_len)
        while True:
            addr = f.tell()
            # Stop cleanly at end of file instead of failing inside unpack().
            if len(f.read(4)) < 4:
                break
            first_int = read_int(f, addr)  # 1200 0000
            data_len = first_int // 2
            data = [first_int]
            data.extend(read_2_bytes_here(f) for _ in range(data_len))
            if len(data) < 3:
                break
            # article address offset in Body.data file
            # formatting type A: 8 values
            # no sections, each entry is compressed separately
            article_address_offset = data[6] + data[7] * 0x10000 + file_header_len
            # formatting type B: 10 values
            # Oxford English dictionary with 114151 entries is split into 777 sections with length ~ 0x8000 bytes
            # to get a specific entry we need to decompress the whole section;
            # the offset inside the uncompressed text is then equal to data[6]
            # article_address_offset = data[6] + data[8] + data[9] * 0x10000 + file_header_len
            if cnt > 19600:  # debug output for the records after this index
                pretty_print(cnt, data)
            all_ids.append(article_address_offset)
            cnt += 1

    with open('entry_ids.txt', 'w') as output_file:
        output_file.writelines(hex(i) + '\n' for i in sorted(all_ids))
    print(f'{cnt} processed')
```

**Sample file(s)**
https://mega.nz/folder/elwgBDzC#qLRuACGNkXqyiEq7oeG8Nw (KeyText.data)


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Support AppleDict Binary key texts (morphology) #406

`KeyText.data` file

`EntryID.data` file

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Support AppleDict Binary key texts (morphology) #406

Description

KeyText.data file

EntryID.data file

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions

`KeyText.data` file

`EntryID.data` file