# format_check_script.py (forked from LeeSureman/Flat-Lattice-Transformer)
# coding=utf-8
import zipfile
import shutil
import os
from collections import defaultdict
class NER(object):
    """A single named-entity annotation: a typed character span plus its text.

    Values are normalized on construction: ``tid``, ``ttype`` and ``text`` are
    converted to stripped strings, ``start`` and ``end`` to ints (so numeric
    strings are accepted; non-numeric offsets raise ``ValueError``).
    """

    def __init__(self, tid, start, end, ttype, text=''):
        self.tid = str(tid).strip()
        self.start = int(start)
        self.end = int(end)
        self.text = str(text).strip()
        self.ttype = str(ttype).strip()

    def span_matches(self, other, mode='strict'):
        """Return True when this span matches ``other``'s span under ``mode``.

        ``'strict'`` requires identical ``start`` and ``end``.
        ``'lenient'`` only requires the two spans to overlap; spans that merely
        touch (one's ``end`` equals the other's ``start``) do not match.

        Any other ``mode`` trips the assertion below.
        """
        assert mode in ('strict', 'lenient')
        if mode == 'strict':
            return self.start == other.start and self.end == other.end
        # Overlap is symmetric, so a single test covers both orderings.
        return self.start < other.end and other.start < self.end

    def equals(self, other, mode='strict'):
        """Return True when ``other`` has the same type and a matching span."""
        assert mode in ('strict', 'lenient')
        return other.ttype == self.ttype and self.span_matches(other, mode)

    def __str__(self):
        return '{}\t{}\t({}:{})'.format(self.ttype, self.text, self.start, self.end)

    def __repr__(self):
        return '{}(tid={!r}, start={!r}, end={!r}, ttype={!r}, text={!r})'.format(
            type(self).__name__, self.tid, self.start, self.end, self.ttype, self.text)
class RecordTrack(object):
    """Entity annotations parsed from one brat-style ``.ann`` file.

    The file is read once, at construction time. ``path`` holds the absolute
    path of the file, ``basename`` its file name, and ``annotations`` the
    parsed result (see ``_get_annotations``).
    """

    def __init__(self, file_path):
        self.path = os.path.abspath(file_path)
        self.basename = os.path.basename(self.path)
        self.annotations = self._get_annotations()

    @property
    def tags(self):
        """Dict mapping tag id to its ``NER`` object (empty if none parsed)."""
        return self.annotations['tags']

    def _get_annotations(self):
        """Parse the ``T`` lines of ``self.path``.

        A line is considered when, after stripping, it starts with ``'T'``.
        It must consist of exactly three tab-separated fields:
        ``id``, ``type start ... end`` and ``text``. The middle field may have
        3, 4 or 5 space-separated parts (brat writes the extra parts for
        discontinuous spans); the second part is used as the start offset and
        the last part as the end offset.

        Malformed lines are printed and skipped rather than aborting the
        whole file or being recorded with values left over from an earlier
        line. Non-integer offsets still raise ``ValueError`` from ``NER``.

        Returns:
            A ``defaultdict(dict)`` whose ``'tags'`` entry maps tag id to
            ``NER``.
        """
        annotations = defaultdict(dict)
        # Annotation text is not ASCII-only, so do not rely on the locale default.
        with open(self.path, encoding='utf-8') as annotation_file:
            for line in annotation_file:
                stripped = line.strip()
                if not stripped.startswith('T'):
                    continue
                try:
                    tag_id, tag_m, tag_text = stripped.split('\t')
                except ValueError:
                    # Wrong number of tab-separated fields: report and skip.
                    print(self.path, line)
                    continue
                # adapt to Brat tool: "type start end" plus up to two extra
                # fragment-boundary parts in between.
                parts = tag_m.split(' ')
                if not 3 <= len(parts) <= 5:
                    print(self.path)
                    print(line)
                    continue
                tag_type, tag_start, tag_end = parts[0], parts[1], parts[-1]
                # NER converts the offsets to int itself.
                annotations['tags'][tag_id] = NER(tag_id, tag_start, tag_end, tag_type, tag_text)
        return annotations
def parse_ann_file(ann_file):
    """Parse the annotation file at ``ann_file`` and return its ``RecordTrack``."""
    record = RecordTrack(ann_file)
    return record
def get_answer_dir(extract_dir, zip_file):
    """Extract ``zip_file`` into ``extract_dir`` and return the answer directory.

    The answer directory is the sub-directory of ``extract_dir`` that is
    expected to hold the generated ``.ann`` files.

    Args:
        extract_dir: Directory to extract into. If it already exists it is
            deleted first, together with everything inside it.
        zip_file: The zip archive uploaded by the participant.

    Returns:
        The path of the first sub-directory of ``extract_dir`` (in sorted
        name order, ignoring the macOS ``__MACOSX`` metadata directory), or
        ``''`` when the archive contains no directory at all.
    """
    # Start from a clean directory so files from an earlier run cannot leak in.
    if os.path.isdir(extract_dir):
        shutil.rmtree(extract_dir)
    # NOTE(review): the archive is participant-supplied; it is extracted as-is.
    # Consider checking member names and sizes before extracting.
    with zipfile.ZipFile(zip_file, "r") as zip_data:
        zip_data.extractall(extract_dir)
    # Sorted so that the chosen directory does not depend on listing order.
    for item in sorted(os.listdir(extract_dir)):
        if item == '__MACOSX':
            continue
        candidate = os.path.join(extract_dir, item)
        if os.path.isdir(candidate):
            return candidate
    # Only plain files were extracted: there is no answer directory.
    return ''
if __name__ == "__main__":
    # Format checker: extracts a submission zip, reports the answer directory
    # found inside it, then parses one .ann file and reports how many entity
    # annotations it contains.
    import argparse

    # NOTE: the defaults are placeholders; pass the directories/files of your
    # own machine on the command line instead of editing this file.
    parser = argparse.ArgumentParser(description='format checker')
    parser.add_argument('--extract-dir', default='/Users/jason/Desktop/ann',
                        help='directory the zip file is extracted into '
                             '(deleted first if it already exists)')
    parser.add_argument('--zip-file', default='/Users/jason/Desktop/ann.zip',
                        help='zip file to check')
    parser.add_argument('--ann-file', default='/Users/jason/Desktop/ann/ann/1000.ann',
                        help='a single .ann file whose format is checked')
    args = parser.parse_args()

    # Check the zip file: the extracted answer_dir must contain the generated
    # ann files.
    answer_dir = get_answer_dir(args.extract_dir, args.zip_file)
    print('Answer dir: ', answer_dir)

    # Check the ann file format.
    record = parse_ann_file(args.ann_file)
    print('Total annotation number: ', len(record.tags))