-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathchars_gen
More file actions
executable file
·92 lines (67 loc) · 2.47 KB
/
chars_gen
File metadata and controls
executable file
·92 lines (67 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env php
<?php
// Generated output: one "<character>,<pinyin>" record per line.
$savePath = __DIR__.'/chars.dat';
// Generated report of characters that have more than one reading.
$polyphoneCharsPath = __DIR__.'/sources/multi_readings_chars.txt';
// Input: tab-separated Unihan readings data.
$unihanReadingsFile = __DIR__ . '/sources/Unihan_Readings.txt';
// Input: manual "<character>,<pinyin>" corrections, one per line.
$charPatchesFile = __DIR__ . '/patches/chars.txt';

// Refuse to run on interpreters older than PHP 7. The message goes to STDERR
// and the exit status is non-zero so that callers (shell scripts, CI) can
// detect the failure instead of seeing a "successful" run.
// NOTE(review): later parts of this script use the `??` operator, so PHP 5
// presumably fails at compile time before reaching this guard - confirm.
if (version_compare(PHP_VERSION, '7.0.0', '<')) {
    fwrite(STDERR, 'PHP7 required.'.PHP_EOL);
    exit(1);
}
/**
 * Convert a hexadecimal Unicode code point into the UTF-8 encoded Han character it denotes.
 *
 * A JSON "\uXXXX" escape only consumes four hex digits, so it cannot express code
 * points above U+FFFF: "20000" would decode as U+2000 followed by a literal "0".
 * Decoding a numeric character reference handles every plane.
 *
 * @param string|int $unicode_str Hex code point without the "U+" prefix, e.g. "4E2D" or "20000".
 *                                All-digit codes may arrive as integers when they were used as
 *                                array keys, so the value is cast to string first.
 *
 * @return string|false The single Han character, or false when the input is not a
 *                      hex code point or does not decode to exactly one Han character.
 */
function unicode_to_utf8($unicode_str) {
    $hex = (string) $unicode_str;

    // Only plain hex digits are accepted; anything else can never be a code point.
    if (!preg_match('/^[0-9A-Fa-f]{1,6}$/', $hex)) {
        return false;
    }

    // References that cannot be decoded (surrogates, out-of-range values) are
    // returned unchanged as "&#x...;" and are rejected by the check below.
    $char = html_entity_decode('&#x'.$hex.';', ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // Anchored on both ends so that exactly one Han character is accepted.
    return preg_match('/^\p{Han}$/u', $char) === 1 ? $char : false;
}
// Readings collected per code point, keyed by Unihan field name:
// ['4E50' => ['kMandarin' => pinyin, 'kHanyuPinyin' => pinyin, ...]]
$unihanChars = [];
// Code points that have more than one reading: ['4E50' => [pinyin1, pinyin2, ...]]
$polyphoneChars = [];
// Manual corrections read from the patches file: [character => pinyin]
$patches = [];

// file() returns false (after emitting its own warning) when the patch file
// cannot be read; continue without patches instead of iterating over false.
$patchLines = file($charPatchesFile);
if ($patchLines === false) {
    $patchLines = [];
}

foreach ($patchLines as $line) {
    $fields = explode(',', trim($line));

    // Blank lines and lines without a "<character>,<pinyin>" pair carry no
    // usable patch; skipping them avoids undefined-offset warnings and
    // meaningless entries in $patches.
    if (count($fields) < 2 || $fields[0] === '') {
        continue;
    }

    list($char, $pinyin) = $fields;
    $patches[$char] = $pinyin;
}
// Unihan fields whose values are used as pinyin sources.
$mandarinFields = ['kHanyuPinlu', 'kXHC1983', 'kHanyuPinyin', 'kMandarin'];

// Without the readings data there is nothing sensible to generate; stop here
// rather than letting the rest of the script write empty output files.
$unihanLines = file($unihanReadingsFile);
if ($unihanLines === false) {
    fwrite(STDERR, "Unable to read $unihanReadingsFile".PHP_EOL);
    exit(1);
}

foreach ($unihanLines as $line) {
    // Lines starting with '#' are comments.
    if (strncmp($line, '#', 1) === 0) {
        continue;
    }

    // Data lines are "<code>\t<field>\t<value>"; anything shorter is ignored.
    $parts = explode("\t", trim($line));
    if (count($parts) < 3) {
        continue;
    }

    list($code, $type, $reading) = $parts;
    if (!in_array($type, $mandarinFields, true)) {
        continue;
    }

    // Drop the two-character prefix of the code column (presumably "U+").
    $code = substr($code, 2);

    // Split the value on asterisks, commas, dots, colons, parentheses, digits
    // and whitespace; what remains are the reading syllables. Duplicates are
    // removed BEFORE counting so that a value repeating the same syllable is
    // not mistaken for a polyphone.
    $readings = array_values(array_unique(
        preg_split('/[\*,.:()0-9\s]+/', $reading, -1, PREG_SPLIT_NO_EMPTY)
    ));

    // A value consisting only of delimiters yields no reading at all.
    if (empty($readings)) {
        continue;
    }

    // Record polyphonic characters: [
    //     '4E50' => [pinyin1, pinyin2, ...]
    // ]
    if (count($readings) > 1) {
        $polyphoneChars[$code] = array_unique(array_merge($polyphoneChars[$code] ?? [], $readings));
    }

    // The first reading listed in the field is the one kept for that field.
    $unihanChars[$code][$type] = $readings[0];
}
// Lines for the main data file, each "<character>,<pinyin>".
$output = [];
// Lines for the polyphone report: the chosen reading comes first and the
// remaining, less common readings follow in parentheses.
$polyphones = [];

foreach ($unihanChars as $code => $group) {
    // Field priority when a character has readings from several Unihan fields.
    $pinyin = $group['kMandarin'] ?? $group['kHanyuPinlu'] ?? $group['kXHC1983'] ?? $group['kHanyuPinyin'] ?? null;
    $han = unicode_to_utf8($code);

    if (is_null($pinyin) || empty($han)) {
        echo "Skip non-chinese code: $code(".$han.")\n";
        continue;
    }

    // Override incorrect pinyin with the manual patch, if one exists.
    if (!empty($patches[$han])) {
        $pinyin = $patches[$han];
    }

    // If the character is a polyphone
    if (!empty($polyphoneChars[$code])) {
        // Keep the chosen reading in front and list only the other readings
        // after it. When no other reading is left, the character is not
        // reported, which avoids emitting an empty "()" group.
        $otherReadings = array_diff($polyphoneChars[$code], [$pinyin]);
        if (!empty($otherReadings)) {
            $polyphones[] = $han.' '.$pinyin.' ('.join(')|(', $otherReadings).')';
        }
    }

    $output[] = sprintf("%s,%s", $han, $pinyin);
}

// Check both writes so that the success messages below are only printed when
// the files were actually written.
if (file_put_contents($savePath, join("\n", $output)) === false) {
    fwrite(STDERR, "Failed to write $savePath".PHP_EOL);
    exit(1);
}
if (file_put_contents($polyphoneCharsPath, join("\n", $polyphones)) === false) {
    fwrite(STDERR, "Failed to write $polyphoneCharsPath".PHP_EOL);
    exit(1);
}

echo count($output)." mandarin readings saved in $savePath\n";
echo count($polyphones)." multi mandarin chars saved in $polyphoneCharsPath";