Skip to content

Commit ea244e6

Browse files
committed
Refactoring
Added support for duplicate picture names in Docx.merge(). Refactored Docx.replace(), Docx.search(), and merge_text().
1 parent 22b2eda commit ea244e6

File tree

1 file changed

+70
-58
lines changed

1 file changed

+70
-58
lines changed

oodocx/oodocx.py

Lines changed: 70 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -183,71 +183,75 @@ def get_body(self):
183183
namespaces=NSPREFIXES)[0]
184184

185185
def search(self, search, result_type='text', ignore_runs=True):
186-
'''Search for a regex, returns first matching element object or
187-
None if nothing found. Will return the first element if match
188-
spans multiple text elements.'''
186+
'''Search each paragraph for a regex, returns first matching
187+
element object or None if nothing found. Will return the
188+
first element if match spans multiple text elements.'''
189189
searchre = re.compile(search)
190190
result = None
191191
if ignore_runs:
192-
para_list = [child for child in
193-
self.document.iter('{' + NSPREFIXES['w'] + '}p')]
194-
text_positions = []
195-
raw_text = []
196-
start = 0
197-
for para in para_list:
198-
for element in para.iter('{' + NSPREFIXES['w'] + '}t'):
199-
if element.text:
200-
raw_text.append(element.text)
201-
text_positions.append((start,
202-
start + len(element.text) - 1, element))
203-
start += len(element.text)
204-
match = searchre.search(''.join(raw_text))
205-
if match:
206-
for value in text_positions:
207-
if match.start() in range(value[0], value[1] + 1):
208-
result = value[2]
209-
break
192+
for paragraph in self.document.iter('{' + NSPREFIXES['w'] + '}p'):
193+
text_positions = []
194+
start = 0
195+
paragraph_string = get_text(paragraph)
196+
for text_element in paragraph.iter('{' + NSPREFIXES['w'] +
197+
'}t'):
198+
if text_element.text:
199+
text_positions.append({'start': start,
200+
'end': start + len(text_element.text) - 1,
201+
'element': text_element})
202+
start += len(text_element.text)
203+
match = searchre.search(paragraph_string)
204+
if match:
205+
for position in text_positions:
206+
if match.start() in range(position['start'],
207+
position['end'] + 1):
208+
result = position['element']
209+
break
210+
break
210211
else:
211212
for element in self.document.iter('{' + NSPREFIXES['w'] + '}t'):
212213
if element.text and searchre.search(element.text):
213214
result = element
214215
break
215216
if result is not None:
216217
if result_type.lower() == 'paragraph':
217-
while not result.tag == '{' + NSPREFIXES['w'] + '}p':
218-
result = result.getparent()
218+
if (result.iterancestors('{' + NSPREFIXES['w'] + '}p')
219+
is not None):
220+
while not result.tag == '{' + NSPREFIXES['w'] + '}p':
221+
result = result.getparent()
222+
else:
223+
raise
219224
elif result_type.lower() == 'run':
220-
while not result.tag == '{' + NSPREFIXES['w'] + '}r':
221-
result = result.getparent()
225+
if (result.iterancestors('{' + NSPREFIXES['w'] + '}r')
226+
is not None):
227+
while not result.tag == '{' + NSPREFIXES['w'] + '}r':
228+
result = result.getparent()
229+
else:
230+
raise
222231
return result
223232

224233
def replace(self, search, replace, ignore_runs=True):
225234
'''Replace all occurrences of string with a different string.
226235
If ignore_runs is true, the function will ignore separate run
227-
and text elements and instead search each raw paragraph text
236+
and text elements and instead search each paragraph text
228237
content as a single string. Note that this will also ignore
229238
formatting elements within a paragraph such as tabs, which
230239
may cause unexpected results. Set ignore_runs to false if you
231240
want a more conservative search.'''
232241
searchre = re.compile(search)
233242
if ignore_runs:
234-
para_list = [child for child in
235-
self.document.iter('{' + NSPREFIXES['w'] + '}p')]
236-
for para in para_list:
237-
paratext = []
243+
for paragraph_element in self.document.iter('{' + NSPREFIXES['w'] +
244+
'}p'):
238245
rundict = collections.OrderedDict()
239246
start = 0
240-
for element in para.iter('{' + NSPREFIXES['w'] + '}r'):
241-
runtext = []
242-
for subelement in element.iter('{' + NSPREFIXES['w'] + '}t'):
243-
if subelement.text:
244-
paratext.append(subelement.text)
245-
runtext.append(subelement.text)
246-
rundict[element] = [start, start +
247-
len(subelement.text), ''.join(runtext)]
248-
start += len(subelement.text)
247+
for run_element in paragraph_element.iter('{' + NSPREFIXES['w']
248+
+ '}r'):
249+
run_string = get_text(run_element)
250+
rundict[run_element] = [start, start + len(run_string),
251+
run_string]
252+
start += len(run_string)
249253
match_slices = [match.span() for match in
250-
re.finditer(searchre, ''.join(paratext))]
254+
re.finditer(searchre, get_text(paragraph_element))]
251255
preliminary_runs = collections.OrderedDict()
252256
runs_to_exclude = set()
253257
for run, text_info in rundict.items():
@@ -275,7 +279,7 @@ def replace(self, search, replace, ignore_runs=True):
275279
previous_text.text += text_info[2]
276280
previous_text_info[1] += len(text_info[2])
277281
previous_text_info[2] += text_info[2]
278-
para.remove(run)
282+
paragraph_element.remove(run)
279283
overflow = 0
280284
for index, (run, text_info) in enumerate(
281285
runs_to_modify.items()):
@@ -325,7 +329,7 @@ def clean(self):
325329
element.getparent().remove(element)
326330

327331
def add_style(self, styleId, type, default=None, name=None):
328-
if default in (1, '1', True):
332+
if default:
329333
style = makeelement('style', attributes={'styleId': styleId,
330334
'type': type, 'default': default})
331335
else:
@@ -428,10 +432,16 @@ def get_document_text(self):
428432
return paratextlist
429433

430434
def merge(self, docpath, page_break=True):
435+
'''Appends a .docx to the end of this document. docpath can
436+
either be a Docx object or a file path. This method will likely
437+
break if both documents possess the same type of elements that
438+
require id mapping such as lists or comments. Pictures and other
439+
<w:drawing> elements, however, should work 100% of the time.'''
431440
if isinstance(docpath, Docx):
432441
fromdoc = docpath
433442
else:
434443
fromdoc = Docx(docpath)
444+
# Update relationship Ids
435445
for relationship in fromdoc.relationships:
436446
old_rId = relationship.values()[0]
437447
relationship_type = relationship.values()[1]
@@ -460,8 +470,13 @@ def merge(self, docpath, page_break=True):
460470
head, tail = os.path.split(dirpath)
461471
if tail == 'media':
462472
for file in filenames:
463-
shutil.copyfile(os.path.join(fromdoc.media_dir, file),
464-
os.path.join(self.media_dir, file))
473+
if os.path.join(self.media_dir, file) not in tofiles:
474+
shutil.copyfile(os.path.join(fromdoc.media_dir, file),
475+
os.path.join(self.media_dir, file))
476+
else: # Account for duplicate picture names
477+
shutil.copyfile(os.path.join(fromdoc.media_dir,
478+
'new_' + file),
479+
os.path.join(self.media_dir, file))
465480
else:
466481
for file in filenames:
467482
if not os.path.isdir(os.path.join(self.write_dir, relpath)):
@@ -498,6 +513,7 @@ def merge(self, docpath, page_break=True):
498513
self.body.extend(fromdoc.body.iterchildren())
499514

500515
def save(self, output):
516+
'''Saves the Docx to the output path provided.'''
501517
docxfile = zipfile.ZipFile(output, mode='w',
502518
compression=zipfile.ZIP_DEFLATED)
503519
# Move to the template data path
@@ -506,7 +522,8 @@ def save(self, output):
506522
# Write changes made to xml files in write directory between __init__()
507523
# and save()
508524
for xmlfile, relpath in self.xmlfiles.items():
509-
absolutepath = os.path.split(os.path.join(self.write_dir, relpath))[0]
525+
absolutepath = os.path.split(
526+
os.path.join(self.write_dir, relpath))[0]
510527
if not os.path.isdir(absolutepath):
511528
os.mkdir(absolutepath)
512529
newdoc = io.open(relpath, 'w')
@@ -529,19 +546,14 @@ def save(self, output):
529546
pass
530547

531548
def merge_text(run):
532-
runtext = ''
533-
first = True
534-
for child in run.iterchildren('{' + NSPREFIXES['w'] + '}t'):
535-
if child.text:
536-
runtext == ''
537-
runtext += child.text
538-
if first:
539-
first_text_element = child
540-
first = False
541-
else:
542-
run.remove(child)
543-
first_text_element.text = runtext
544-
549+
'''Combines the text of all text elements in a run into a single
550+
text element, removes the other text elements.'''
551+
for index, child in enumerate(
552+
run.iterchildren('{' + NSPREFIXES['w'] + '}t')):
553+
if index == 0:
554+
child.text = get_text(run)
555+
else:
556+
run.remove(child)
545557

546558
def modify_font(elements, name='default', size='default', underline='default',
547559
color='default', highlight='default', strikethrough='default', bold='default',

0 commit comments

Comments
 (0)