KeyText.data file
KeyText.data file contains morphological word forms or spelling variants. We need this data to make dictionary articles searchable by different word forms.
Solution
from struct import unpack
from typing import Dict
# NOTE(review): hard-coded absolute path to one developer's machine — adjust
# before running.  The handle is opened at import time and never closed; the
# module-global `f` is shared by every helper and the main block below.
filename = '/Users/soshial/Desktop/rus-pol.dictionary/Contents/KeyText.data'
f = open(filename, 'rb')
def read_int(f, address) -> int:
    """Seek *f* to *address* and read one signed 32-bit little-endian integer.

    BUGFIX: the format was ``'i'`` (native byte order / alignment); the
    dictionary files are little-endian, so ``'<i'`` makes the read portable.
    """
    f.seek(address)
    return unpack('<i', f.read(4))[0]
def read_int_here(f) -> int:
    """Read one signed 32-bit little-endian integer at the current position.

    BUGFIX: explicit ``'<i'`` instead of native-order ``'i'`` — the
    dictionary files are little-endian regardless of host platform.
    """
    return unpack('<i', f.read(4))[0]
def read_x_bytes_as_word(f, x) -> str:
    """Read *x* bytes from *f* and decode them as UTF-16-LE code units.

    Each little-endian byte pair becomes one character (surrogates are kept
    as-is, matching the original byte-pair loop).

    Improvements over the original:
    - one ``read`` call and a ``join`` instead of a quadratic ``+=`` loop of
      2-byte reads;
    - an odd *x* no longer over-reads one extra byte (the old loop consumed
      ``x + 1`` bytes for odd *x*); the trailing odd byte is ignored;
    - a short read at EOF yields a shorter string instead of a TypeError.
    """
    data = f.read(x)
    return ''.join(chr(data[i] | (data[i + 1] << 8))
                   for i in range(0, len(data) - 1, 2))
def read_2_bytes(f, address) -> int:
    """Seek *f* to *address* and read one unsigned 16-bit little-endian value.

    Uses ``struct`` directly (consistent with ``read_int``) instead of the
    hand-rolled two-``read`` byte arithmetic.
    """
    f.seek(address)
    return unpack('<H', f.read(2))[0]
def read_2_bytes_here(f) -> int:
    """Read one unsigned 16-bit little-endian value at the current position.

    BUGFIX: a truncated read at EOF now raises a clear ``struct.error``
    instead of the obscure ``TypeError`` that ``ord(b'')`` produced.
    """
    return unpack('<H', f.read(2))[0]
def print_letter(f, address):
    """Debug helper: show the 16-bit value at *address* as hex and as a character."""
    code_unit = read_2_bytes(f, address)
    print(f'addr: {hex(address)} ', hex(code_unit), '\t', chr(code_unit))
if __name__ == "__main__":
    file_header_len = 0x40  # KeyText.data payload starts after a 0x40-byte header
    # Body.data entry address -> set of (searchable form, displayed form)
    word_by_id: Dict = {}

    # NOTE(review): the original script dumped the first 0x2f4 bytes as
    # UTF-16 code units and then called quit(), which made the whole parser
    # below unreachable.  The debug probe is kept here, disabled:
    # for addr_start in range(0x0, 0x2f4, 2):
    #     print_letter(f, addr_start)
    # quit()

    # the first int after the header is the end offset of the key sections
    limit = read_int(f, file_header_len)
    print(f'limit {limit}')
    section_start = 0x44  # first section begins right after the limit int
    while f.tell() < limit:
        f.seek(section_start)
        jump = read_2_bytes_here(f) + 4           # byte length of this section
        zero1 = read_2_bytes_here(f)              # always 0x00
        big_len = read_2_bytes_here(f)            # e.g. 0x2c
        zero2 = read_2_bytes_here(f)              # always 0x00
        word_forms_number = read_2_bytes_here(f)  # e.g. 0x01
        # print(f'section_start {hex(section_start)} jump {hex(jump)}')
        # print('word_forms', word_forms_number)
        # sanity checks: bail out as soon as the layout assumption breaks
        if zero1 != 0:
            print('zero1')
            quit()
        if zero2 != 0:
            print('zero2')
            quit()
        for word_form_n in range(word_forms_number):
            # EXAMPLE: <d:index d:value="made" d:title="made (make)"/>
            # If the entry for "make" contains these <d:index> definitions,
            # it can be found not only by "make" but also by "makes"/"made";
            # the search-result list shows the d:title text.
            # EXAMPLE: <d:index d:value="make it" d:title="make it"
            #          d:parental-control="1"
            #          d:anchor="xpointer(//*[@id='make_it'])"/>
            zero3 = read_2_bytes_here(f)      # 0x00
            small_len = read_2_bytes_here(f)  # e.g. 0x2c
            # read 4 bytes and add the file header length (0x40); the result
            # must match the entry address in Contents/Body.data
            word_form_id = read_int_here(f) + file_header_len
            priority_and_parental_control = read_2_bytes_here(f)  # e.g. 0x13
            # BUGFIX: was "/ 2" — float division in Python 3; d:priority is an
            # integer in 0..9 (raw value 0x00..0x12), parental control is bit 0
            priority = priority_and_parental_control // 2
            parental_control = priority_and_parental_control % 2  # d:parental-control
            form1_len = read_2_bytes_here(f)
            # dictionary article will be searchable by this word form,
            # e.g. flavoured, colouring, behaviours
            wordform_search_by = read_x_bytes_as_word(f, form1_len)
            form2_len = read_2_bytes_here(f)
            # this word form is shown as the correct spelling; for an American
            # English dictionary: flavoured -> flavored, colouring -> coloring
            wordform_shown = read_x_bytes_as_word(f, form2_len)
            # optionally form3 and the d:anchor string follow, ending with 0x00
            form3_len = read_2_bytes_here(f)  # 0x00 or byte length of 3rd form
            if form3_len != 0:
                form3 = read_x_bytes_as_word(f, form3_len)
                form4_len = read_2_bytes_here(f)  # 0x00 or byte length of 4th form
                if form4_len != 0:
                    form4 = read_x_bytes_as_word(f, form4_len)  # d:anchor
                # NOTE(review): nesting reconstructed from unindented source —
                # confirm the trailing 0x00 exists only when form3 is present
                zero5 = read_2_bytes_here(f)  # should be zero
            word_by_id.setdefault(word_form_id, set()).add(
                (wordform_search_by, wordform_shown))
            # print(
            #     f" word_form_id {hex(word_form_id)}",
            #     f"word1 {wordform_search_by}",
            #     f"word2 {wordform_shown}"
            # )
        section_start += jump
    # one line per entry: $<hex Body.data address> \t {set of form pairs}
    with open("ids.txt", 'w') as fid:
        for k, v in word_by_id.items():
            fid.write(f'${hex(k)}\t{v}\n')
EntryID.data file
EntryID.data is a database of all dictionary entries with address offsets in Body.data file.
Solution
from struct import unpack
from typing import Dict, List
# NOTE(review): hard-coded absolute paths — the first targets a custom-built
# dictionary, the active one targets the Apple-shipped Oxford Dictionary of
# English.  The handle is opened at import time and never closed; the
# module-global `f` is shared by every helper and the main block below.
# filename = '/Users/soshial/Desktop/rus-pol.dictionary/Contents/EntryID.data'
filename = '/System/Library/AssetsV2/com_apple_MobileAsset_DictionaryServices_dictionaryOSX/7d335a455a4827b5e26f66e6ac5221bab87ab3a5.asset/AssetData/Oxford Dictionary of English.dictionary/Contents/Resources/EntryID.data'
f = open(filename, 'rb')
def read_int(f, address) -> int:
    """Seek *f* to *address* and read one signed 32-bit little-endian integer.

    BUGFIX: the format was ``'i'`` (native byte order / alignment); the
    dictionary files are little-endian, so ``'<i'`` makes the read portable.
    """
    f.seek(address)
    return unpack('<i', f.read(4))[0]
def read_int_here(f) -> int:
    """Read one signed 32-bit little-endian integer at the current position.

    BUGFIX: explicit ``'<i'`` instead of native-order ``'i'`` — the
    dictionary files are little-endian regardless of host platform.
    """
    return unpack('<i', f.read(4))[0]
def read_x_bytes_as_word(f, x) -> str:
    """Read *x* bytes from *f* and decode them as UTF-16-LE code units.

    Each little-endian byte pair becomes one character (surrogates are kept
    as-is, matching the original byte-pair loop).

    Improvements over the original:
    - one ``read`` call and a ``join`` instead of a quadratic ``+=`` loop of
      2-byte reads;
    - an odd *x* no longer over-reads one extra byte (the old loop consumed
      ``x + 1`` bytes for odd *x*); the trailing odd byte is ignored;
    - a short read at EOF yields a shorter string instead of a TypeError.
    """
    data = f.read(x)
    return ''.join(chr(data[i] | (data[i + 1] << 8))
                   for i in range(0, len(data) - 1, 2))
def read_2_bytes(f, address) -> int:
    """Seek *f* to *address* and read one unsigned 16-bit little-endian value.

    Uses ``struct`` directly (consistent with ``read_int``) instead of the
    hand-rolled two-``read`` byte arithmetic.
    """
    f.seek(address)
    return unpack('<H', f.read(2))[0]
def read_2_bytes_here(f) -> int:
    """Read one unsigned 16-bit little-endian value at the current position.

    BUGFIX: a truncated read at EOF now raises a clear ``struct.error``
    instead of the obscure ``TypeError`` that ``ord(b'')`` produced.
    """
    return unpack('<H', f.read(2))[0]
def print_letter(f, address):
    """Debug helper: show the 16-bit value at *address* as hex and as a character."""
    code_unit = read_2_bytes(f, address)
    print(f'addr: {hex(address)} ', hex(code_unit), '\t', chr(code_unit))
def pretty_print(cnt: int, dt: List[int]):
    """Print record number *cnt* followed by its values rendered in hex."""
    hex_values = ", ".join(hex(value) for value in dt)
    print(f'{cnt}: [', hex_values, ']')
if __name__ == "__main__":
    # header length differs between a custom-built dictionary (0x44)
    # and the Apple-shipped one (0x60)
    is_custom = False
    file_header_len = 0x44 if is_custom else 0x60
    # debug: dump the first 0xf4 bytes as UTF-16 code units
    for addr_start in range(0x0, 0xf4, 2):
        print_letter(f, addr_start)
    cnt = 0        # number of records parsed so far
    all_ids = []   # collected Body.data address offsets
    f.seek(file_header_len)
    # NOTE(review): the loop relies on a short terminator record to break;
    # at a hard EOF read_int_here would raise instead — confirm the file
    # always ends with such a record
    while True:
        addr = f.tell()  # NOTE(review): unused, presumably kept for debugging
        first_int = read_int_here(f)  # e.g. bytes 12 00 00 00
        # the first int is the record's byte length; the rest of the record
        # is a sequence of 16-bit values
        data_len = first_int // 2
        data = [first_int]
        for l in range(data_len):
            read = read_2_bytes_here(f)
            data.append(read)
        # a record with fewer than 2 values marks the end of the table
        if len(data) < 3:
            break
        # article address offset in Body.data file
        # formatting type A: 8 values
        # no sections, each entry is compressed separately
        article_address_offset = data[6] + data[7] * 0x10000 + file_header_len
        # formatting type B: 10 values
        # the Oxford English dictionary with 114151 entries is split into 777
        # sections of length ~ 0x8000 bytes; to get a specific entry we need
        # to decompress the whole section, then the offset inside the
        # uncompressed text equals data[6]
        # article_address_offset = data[6] + data[8] + data[9] * 0x10000 + file_header_len
        if cnt > 19600:  # article_address_id == 0x40:
            pretty_print(cnt, data)
        # if article_address_id == 0x40: quit()
        # if cnt > 100: quit()
        all_ids.append(article_address_offset)
        cnt += 1
    # write all collected offsets, sorted, one hex value per line
    with open('entry_ids.txt', 'w') as output_file:
        sorted_ids = sorted(all_ids)
        for i in sorted_ids:
            output_file.write(hex(i) + '\n')
    print(f'{cnt} processed')
Sample file(s)
https://mega.nz/folder/elwgBDzC#qLRuACGNkXqyiEq7oeG8Nw (KeyText.data)
KeyText.data file — KeyText.data file contains morphological word forms or spelling variants. We need this data to make dictionary articles searchable by different word forms. Solution
EntryID.data file — EntryID.data is a database of all dictionary entries with address offsets in the Body.data file. Solution
Sample file(s)
https://mega.nz/folder/elwgBDzC#qLRuACGNkXqyiEq7oeG8Nw (KeyText.data)