-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpost_processing_module.py
More file actions
141 lines (122 loc) · 5.78 KB
/
post_processing_module.py
File metadata and controls
141 lines (122 loc) · 5.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
import os
import rules as rs
from ex_dictionary import ExDictionary as Ed
from openpyxl import Workbook
class PostProcessModule:
def __init__(self, _path, _output_type):
    """
    Set up a post-processor for one input file.

    :param _path: path of input file
    :param _output_type: type of output; the printing step of this class
        recognises 'text', 'excel' and 'console'
    """
    self.__path, self.__output_type = _path, _output_type
    # Rows of the sentence currently being buffered.
    self.__s = []
    # Exception-expression dictionary, handed to rule condition 05.
    self.__d = Ed.make_ex_dictionary()
    # Flipped to True once the rule check has run over a sentence.
    self.__is_process_finished = False
# method
def process(self):
"""
:return: result of post-processing Korean language dependency
"""
# read input file
with open(self.__path, 'r', encoding='utf-8-sig') as input_file:
before_sentence_id = 1
for each_line in input_file:
string = each_line.strip()
buffer = string.split('\t')
if int(buffer[1]) != before_sentence_id:
self.__checking_rule() # checking rules
self.__print_data() # print sentence data
self.__s = list() # initialize sentence buffer
before_sentence_id += 1
buffer.append('-')
buffer.append('-')
buffer.append(self.__is_same_gov(buffer[6], buffer[9]))
self.__s.append(buffer)
self.__checking_rule() # checking rules
self.__print_data() # print sentence data
@staticmethod
def __is_same_gov(n, m):
num_n = int(n)
num_m = int(m)
if num_n == num_m:
return '1'
return '0'
def __checking_rule(self):
for i in range(len(self.__s)):
# relation_name, evidence 가 이미 있으면 skip 으로 처리
if self.__s[i][10] != '-':
continue
try:
# =========================================================
# RULE_01 SF와 SP의 처리 punch
if rs.Rules.condition_01(self.__s, i):
rs.Rules.rule_01(self.__s, i)
# =========================================================
# RULE_02 관용어의 의존관계 '~수' fixed
# RULE_03 관용어의 의존관계 '~있(VA)', '~없' aux
elif rs.Rules.condition_02(self.__s, i):
rs.Rules.rule_02(self.__s, i)
# =========================================================
# RULE_04 보조용언(VX)의 본용언(VV) 지배소 찾기 aux
elif rs.Rules.condition_04(self.__s, i):
rs.Rules.rule_04(self.__s, i)
# =========================================================
# RULE_05 '~에', '~를' 등등 fixed
# EX_RULE_01 '대해', '위해' 등등 obl
elif rs.Rules.condition_05(self.__s, i, self.__d):
rs.Rules.rule_05(self.__s, i)
# =========================================================
# RULE_06 대등접속사 '및'의 처리 cc
# '등' 추가필요
elif rs.Rules.condition_06(self.__s, i):
rs.Rules.rule_06(self.__s, i)
# =========================================================
# RULE_07 관형절의 처리 acl
elif rs.Rules.condition_07(self.__s, i):
rs.Rules.rule_07(self.__s, i)
# =========================================================
except IndexError as error:
print(error)
self.__is_process_finished = True
def __print_data(self):
if self.__is_process_finished:
output_file_path = os.getcwd() + '\\output_data'
if self.__output_type == 'text':
self.__print_data_to_text(output_file_path)
elif self.__output_type == 'excel':
self.__print_data_to_excel(output_file_path)
elif self.__output_type == 'console':
self.__print_data_to_console()
else:
print('output type args error')
else:
print('process error')
self.__s = list()
def __print_data_to_text(self, _output_file_path):
with open(_output_file_path + '.txt', 'a', encoding='utf-8') as output_file:
print_list = [0, 1, 2, 3, 6, 10, 11, 12]
for i in range(len(self.__s)):
result = ''
for j in range(len(print_list)):
result += str(self.__s[i][print_list[j]]) + '\t'
result = result[:-1]
print(result, file=output_file)
def __print_data_to_excel(self, _output_file_path):
    """
    Append the buffered sentence to ``<_output_file_path>.xlsx``.

    Each row of the sentence becomes one worksheet row on the active
    sheet. If the workbook already exists it is loaded and extended, so
    sentences written by earlier calls are kept (matching the append
    behaviour of the text writer) instead of being overwritten by a new,
    empty workbook on every call.

    :param _output_file_path: output path without the '.xlsx' extension
    """
    # Local import: only ``Workbook`` is imported at file level.
    from openpyxl import load_workbook

    workbook_path = _output_file_path + '.xlsx'
    if os.path.exists(workbook_path):
        write_workbook = load_workbook(workbook_path)
    else:
        write_workbook = Workbook()
    write_worksheet = write_workbook.active
    for row in self.__s:
        write_worksheet.append(row)
    write_workbook.save(workbook_path)
def __print_data_to_console(self):
for i in range(len(self.__s)):
for j in range(len(self.__s[i])):
print(str(self.__s[i]) + ' ', end='')
print('')
@staticmethod
def to_conllu_format(_input_file_path, _conllu_output_file_path):
with open(_conllu_output_file_path, 'w', encoding='utf-8') as output_file:
with open(_input_file_path, 'r', encoding='utf-8-sig') as input_file:
for each_line in input_file.readlines():
string = each_line.strip()