ImageTextifier/ImageTextifier.py at master · Maetel/ImageTextifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#from skimage.metrics import structural_similarity as ssim
import cv2 as cv
import numpy as np
from typing import List
import multiprocessing
import time
from functools import partial

# Preprocessing selectors: the `algorithm` argument of preprocess_source()
# and ImageTextifier.textify().
# DERIVATIVE -> _preproc_derivative (also the fallback for any other value),
# BINARIZE   -> _preproc_binarize,
# AS_IS      -> the blurred grayscale image is used unchanged.
ITEX_ALGO_DERIVATIVE = 0
ITEX_ALGO_BINARIZE = 1
ITEX_ALGO_AS_IS = 2

# Presets for textify()'s `grid_size`. textify() trims the image to a multiple
# of grid_size and uses min(width // grid_size, height // grid_size) as the
# square block edge, so a larger value means smaller blocks and more
# characters in the output.
ITEX_RESOLUTION_VERY_HIGH = 250
ITEX_RESOLUTION_HIGH = 100
ITEX_RESOLUTION_MEDIUM = 60
ITEX_RESOLUTION_LOW = 30

########################################## helpers
def showimg(img):
    """Debug helper: show *img* in a window titled "temp" and wait for a key press."""
    window_title = "temp"
    cv.imshow(window_title, img)
    # A delay of 0 makes waitKey block until a key event arrives.
    cv.waitKey(0)


def array_to_2D(list: List, stride: int):
    """Split a flat sequence into consecutive rows of ``stride`` items.

    Args:
        list: Flat sequence supporting ``len()`` and slicing. The parameter
            name shadows the builtin but is kept so keyword callers still work.
        stride: Row length. Must be positive and divide ``len(list)`` evenly.

    Returns:
        A new list whose elements are the slices ``list[0:stride]``,
        ``list[stride:2*stride]``, ... (an empty list for empty input).

    Raises:
        ValueError: If ``stride`` is not positive, or the length of ``list``
            is not a multiple of ``stride``.
    """
    # A non-positive stride used to either raise ZeroDivisionError or silently
    # return an empty result; reject it explicitly instead.
    if stride <= 0:
        raise ValueError(f"stride({stride}) must be a positive integer")
    if len(list) % stride != 0:
        raise ValueError(
            f"list(length:{len(list)}) not divisible by stride({stride})")
    return [list[start:start + stride]
            for start in range(0, len(list), stride)]

class Timer:
    """Small stopwatch built on ``time.perf_counter``.

    Tracks the instance creation time, the time of the latest ``begin()``
    call, and the duration measured by the latest ``end()`` call.
    """

    def __init__(self):
        # Reference point for until_now().
        self._created = time.perf_counter()
        # Timestamp of the last begin() and result of the last end();
        # both stay 0 until the corresponding method is called.
        self._begin = self._last = 0

    def until_now(self):
        """Return the seconds elapsed since this instance was created."""
        now = time.perf_counter()
        return now - self._created

    def begin(self):
        """Start (or restart) a measurement and return its start timestamp."""
        started = time.perf_counter()
        self._begin = started
        return started

    def end(self):
        """Return the seconds since the last ``begin()`` and remember them."""
        elapsed = time.perf_counter() - self._begin
        self._last = elapsed
        return elapsed

    def last_result(self):
        """Return the duration stored by the most recent ``end()`` call."""
        return self._last

# !helpers
########################################## preprocessors
def _preproc_create_texts() -> str:
    alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    nums = "0123456789"
    chars = " ,./<>?`~!@#$%^&*()-_=+[{]}\|;:\'\""
    kor = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉ"
    retval = alphabets + alphabets.lower() + nums + chars + kor
    return retval, len(retval)


def _preproc_create_text_block_image(text: str, block_wid: int = 30, block_hi: int = 30):
    """Render the first character of *text* as a white-on-black glyph tile.

    The character is drawn with OpenCV's Hershey Duplex font on a fixed
    30 (wide) x 38 (high) single-channel uint8 canvas, which is then resized
    to ``block_wid`` x ``block_hi``. A falsy *text* is rendered as a space.

    Returns:
        The image returned by ``cv.resize`` for the requested tile size.
    """
    # One character per tile: a space for falsy input, else the first char.
    glyph = text[0] if text else ' '

    # Hand-tuned rendering settings. Hershey faces offered by OpenCV:
    #   FONT_HERSHEY_SIMPLEX        = 0  normal size sans-serif
    #   FONT_HERSHEY_PLAIN          = 1  small size sans-serif
    #   FONT_HERSHEY_DUPLEX         = 2  normal size sans-serif (more complex than SIMPLEX)
    #   FONT_HERSHEY_COMPLEX        = 3  normal size serif
    #   FONT_HERSHEY_TRIPLEX        = 4  normal size serif (more complex than COMPLEX)
    #   FONT_HERSHEY_COMPLEX_SMALL  = 5  smaller version of COMPLEX
    #   FONT_HERSHEY_SCRIPT_SIMPLEX = 6  hand-writing style
    #   FONT_HERSHEY_SCRIPT_COMPLEX
    font_face = cv.FONT_HERSHEY_DUPLEX
    font_scale = 1
    white = (255, 255, 255)
    # Passed positionally right after the colour, i.e. in putText's
    # thickness slot.
    stroke = 1
    cell = 30
    # Extra rows added below the square cell (presumably room for descenders).
    extra_rows = 8

    canvas = np.zeros((cell + extra_rows, cell, 1), np.uint8)
    margin = int(cell / 10)
    origin = (margin, cell - margin)  # (x, y) anchor handed to putText
    cv.putText(canvas, glyph, origin, font_face, font_scale, white, stroke)
    return cv.resize(canvas, (block_wid, block_hi))


def _preproc_create_images(block_wid: int = 30, block_hi: int = 30):
    """Build the glyph dataset.

    Returns:
        A dict mapping every character from ``_preproc_create_texts()`` to the
        tile rendered for it by ``_preproc_create_text_block_image`` at
        ``block_wid`` x ``block_hi``.
    """
    charset, _count = _preproc_create_texts()
    return {
        char: _preproc_create_text_block_image(char, block_wid, block_hi)
        for char in charset
    }

def _preproc_binarize(src):
    """Suppress the dark side of an image using Otsu's threshold.

    Pixels that the Otsu binarisation maps to 0 are set to 0; all other
    pixels keep their original intensity.

    Args:
        src: NumPy image accepted by ``cv.threshold`` with ``THRESH_OTSU``.
            It is not modified.

    Returns:
        A new array of the same shape and dtype as ``src``.
    """
    # ``src[:]`` on an ndarray is only a view, so the original code wrote into
    # the caller's image; take a real copy instead.
    img = src.copy()
    _, otsu = cv.threshold(img, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
    # Vectorised replacement for the per-pixel Python loop.
    img[otsu == 0] = 0
    return img

def _preproc_derivative(src):
    """Turn a grayscale image into an edge map.

    Runs the Canny detector (both thresholds 100), rescales the result so its
    maximum becomes 255, and passes it through ``_preproc_binarize``.

    Args:
        src: 2-D single-channel image suitable for ``cv.Canny``.

    Returns:
        The uint8 edge image returned by ``_preproc_binarize``.

    Raises:
        ValueError: If ``src`` is not two-dimensional.
    """
    if src.ndim != 2:
        raise ValueError(
            f"expected a 2-D grayscale image, got {src.ndim} dimension(s)")

    edges = cv.Canny(src, 100, 100)
    peak = int(np.amax(edges))
    if peak == 0:
        # No edge pixels at all: dividing by the maximum would be 0/0 and
        # yield NaNs, so keep the all-zero image as it is.
        scaled = np.zeros_like(edges)
    else:
        scaled = (edges.astype(np.float64) / peak * 255).astype(np.uint8)

    return _preproc_binarize(scaled)

def preprocess_source(src, algorithm, invert=True):
    """Convert a colour image into the single-channel image used for matching.

    Steps: BGR -> grayscale, optional inversion (``255 - gray``), 5x5 Gaussian
    blur, then the step selected by ``algorithm``:

    * ``ITEX_ALGO_BINARIZE`` -> ``_preproc_binarize``
    * ``ITEX_ALGO_AS_IS``    -> the blurred image unchanged
    * anything else          -> ``_preproc_derivative`` (the default)
    """
    grayscale = cv.cvtColor(src[:], cv.COLOR_BGR2GRAY)
    if invert:
        grayscale = 255 - grayscale
    smoothed = cv.GaussianBlur(grayscale, (5, 5), 0)

    if algorithm == ITEX_ALGO_BINARIZE:
        return _preproc_binarize(smoothed)
    if algorithm == ITEX_ALGO_AS_IS:
        return smoothed
    # ITEX_ALGO_DERIVATIVE and every unrecognised value end up here.
    return _preproc_derivative(smoothed)
#!preprocessors
########################################## Pure functions

def block_idx_to_img(img, block_hor, block_wid, block_hi, comparer, fill_blank, block_idx):
    """Return the character chosen by *comparer* for one block of *img*.

    ``block_idx`` counts blocks row by row with ``block_hor`` blocks per row.
    The matching ``block_hi`` x ``block_wid`` region is cut out of ``img`` and
    passed to ``comparer(block, fill_blank)``, which must return a
    ``(text, score)`` pair; only the text is returned.
    """
    grid_row, grid_col = divmod(block_idx, block_hor)
    top = grid_row * block_hi
    left = grid_col * block_wid
    region = img[top: top + block_hi, left: left + block_wid]
    matched_text, _score = comparer(region, fill_blank)
    return matched_text

# !Pure functions
########################################## Main class
class ImageTextifier:
    """Convert an image into a grid of characters by template matching.

    The instance keeps ``dataset``, a dict mapping each candidate character to
    its glyph image. ``textify`` cuts the preprocessed source image into
    square blocks and assigns every block the best-matching character.
    """

    def __init__(self, block_wid: int = 20, block_hi: int = 20):
        """Create the glyph dataset at ``block_wid`` x ``block_hi``."""
        self.dataset = None
        self.create_dataset(block_wid, block_hi)

    def create_dataset(self, block_wid: int = 20, block_hi: int = 20):
        """(Re)build ``self.dataset`` at the given tile size and return it."""
        self.dataset = _preproc_create_images(block_wid, block_hi)
        return self.dataset

    def update_dataset_size(self, block_hi, block_wid=None):
        """Resize every glyph in the dataset to the given block size.

        Accepts either ``(block_hi, block_wid)`` or, as a single argument, a
        source block whose ``shape`` supplies the target size. (The class used
        to define this method twice; the second definition silently replaced
        the first, so both call styles are merged here.)

        Glyphs are resized in place from their current size; nothing happens
        when the dataset is empty or already has the requested size.
        """
        if not self.dataset:
            return
        if block_wid is None:
            # Single-argument form: the argument is an image block.
            block_hi, block_wid = block_hi.shape[:2]

        sample = next(iter(self.dataset.values()))
        datum_hi, datum_wid = sample.shape[:2]
        if (block_wid, block_hi) == (datum_wid, datum_hi):
            return
        for char, glyph in self.dataset.items():
            # cv.resize takes the target size as (width, height).
            self.dataset[char] = cv.resize(glyph, (block_wid, block_hi))

    def compare_block(self, src_block, fill_blank=' '):
        """Pick the dataset character that best matches ``src_block``.

        The dataset is assumed to be already resized to the block size.

        Args:
            src_block: Image block to classify.
            fill_blank: Character returned for an all-zero block, for an empty
                dataset, or when no glyph scores above 0.

        Returns:
            ``(character, score)``. The score is 1 for an all-zero block, 0
            for an empty dataset, otherwise the best ``TM_CCOEFF_NORMED``
            value as a float (0 if no glyph scored above 0).
        """
        if not self.dataset:
            # Keep the (text, score) shape so callers can always unpack.
            return fill_blank, 0
        # An empty block is a perfect match for the blank character.
        if not np.any(src_block):
            return fill_blank, 1

        highest_score = 0
        highest_text = fill_blank
        for char, glyph in self.dataset.items():
            # Other matchTemplate methods: TM_CCOEFF, TM_CCORR,
            # TM_CCORR_NORMED, TM_SQDIFF, TM_SQDIFF_NORMED.
            result = cv.matchTemplate(glyph, src_block, cv.TM_CCOEFF_NORMED)
            # matchTemplate returns an array (1x1 when both images have the
            # same size); reduce it to a plain float before comparing.
            similarity = float(result.max())
            if highest_score < similarity:
                highest_score = similarity
                highest_text = char
        return highest_text, highest_score

    def textify(self, src, grid_size=ITEX_RESOLUTION_MEDIUM,
                algorithm=ITEX_ALGO_DERIVATIVE, speak_process=True,
                speak_result_as_text=True, return_text_image=True,
                invert_image=False, fill_blank=' '):
        """Convert ``src`` into rows of characters (main entry point).

        Args:
            src: Image array whose shape unpacks as (height, width, channels);
                it is converted with ``COLOR_BGR2GRAY`` during preprocessing.
            grid_size: Resolution of the character grid (see the
                ``ITEX_RESOLUTION_*`` presets). The image is trimmed to a
                multiple of it and the square block edge is
                ``min(width // grid_size, height // grid_size)``.
            algorithm: One of the ``ITEX_ALGO_*`` preprocessing selectors.
            speak_process: Print progress and timing messages.
            speak_result_as_text: Print the resulting text rows.
            return_text_image: Also render and return the result as an image.
            invert_image: Forwarded to ``preprocess_source`` as ``invert``.
            fill_blank: Character for empty/unmatched blocks; only its first
                character is used and a falsy value becomes a space.

        Returns:
            The list of rows (each a list of characters) when
            ``return_text_image`` is false, otherwise a tuple
            ``(rows, text_image)`` with the image resized to the source size.

        Raises:
            ValueError: If ``grid_size`` is not positive or larger than the
                image's width or height.

        Note:
            Blocks are matched in a ``multiprocessing.Pool``; see the
            multiprocessing documentation for the requirements on the calling
            script (e.g. the ``__main__`` guard with the spawn start method).
        """
        total_timer, process_timer = Timer(), Timer()
        total_timer.begin()
        if speak_process:
            print("Warming up...")

        # Grid geometry.
        hi_src, wid_src, _ = src.shape
        if grid_size <= 0:
            raise ValueError(f"grid_size({grid_size}) must be positive")
        wid_dst = wid_src - wid_src % grid_size
        hi_dst = hi_src - hi_src % grid_size
        smaller_block_size = min(wid_dst // grid_size, hi_dst // grid_size)
        if smaller_block_size < 1:
            # Previously surfaced as a ZeroDivisionError a few lines later.
            raise ValueError(
                f"grid_size({grid_size}) exceeds the source image size "
                f"({wid_src}x{hi_src})")
        block_wid, block_hi = smaller_block_size, smaller_block_size
        block_hor, block_ver = wid_dst // block_wid, hi_dst // block_hi
        block_count = block_hor * block_ver
        self.update_dataset_size(block_hi, block_wid)

        img = cv.resize(src, (wid_dst, hi_dst))
        img = preprocess_source(img, algorithm, invert_image)

        if not fill_blank:
            fill_blank = ' '
        elif len(fill_blank) > 1:
            fill_blank = fill_blank[0]

        # Match every block in parallel.
        if speak_process:
            print("Processing...")
        process_timer.begin()
        func = partial(block_idx_to_img, img, block_hor, block_wid, block_hi,
                       self.compare_block, fill_blank)
        pool = multiprocessing.Pool()
        try:
            retval = pool.map(func, range(block_count))
        finally:
            # Release the worker processes even if a task raised.
            pool.close()
            pool.join()
        if speak_process:
            print(f"Finished ({process_timer.end():0.4f})s")

        retval_2D = array_to_2D(retval, block_hor)

        if speak_result_as_text:
            # One printed line per grid row (rows are block_hor characters
            # long; the old code stepped by block_ver, which broke non-square
            # grids).
            for row_chars in retval_2D:
                print("".join(row_chars))

        if not return_text_image:
            return retval_2D

        # Paint the chosen glyphs onto a black canvas.
        text_image = np.zeros_like(img)
        for block_idx, char in enumerate(retval):
            glyph = self.dataset.get(char)
            if glyph is None:
                # e.g. a custom fill_blank that has no glyph: leave it black.
                continue
            row, col = divmod(block_idx, block_hor)
            text_image[row*block_hi: (row+1)*block_hi,
                       col*block_wid: (col+1)*block_wid] = glyph
        text_image = cv.resize(text_image, (wid_src, hi_src))
        if speak_process:
            print(f"Total elapsed ({total_timer.end():0.4f})s")
        return retval_2D, text_image

# !Main class