|
1 | 1 | import re |
2 | 2 | import logging |
| 3 | +import typing |
3 | 4 | from pprint import pformat |
4 | 5 | from roam2doc.tree import (Root, Branch, Section, Heading, Text, Paragraph, BlankLine, TargetText, |
5 | 6 | LinkTarget, BoldText, ItalicText, |
@@ -190,7 +191,19 @@ def get_section_parser(self): |
190 | 191 | while p is not None and not isinstance(p, SectionParse): |
191 | 192 | p.self.doc_parser.get_parser_parent(p) |
192 | 193 | return p |
193 | | - |
| 194 | + |
class ParagraphParse(ParseTool):
    """Parse tool bound to a ``start``..``end`` range with a moving cursor.

    NOTE(review): ``start`` and ``end`` are presumably line positions in the
    document being parsed, as with the section parser's cursor -- confirm
    against the caller.
    """

    def __init__(self, doc_parser, start, end):
        """Record the range, put the cursor at ``start``, then init the base.

        The attributes are assigned before ``ParseTool.__init__`` is called,
        so they already exist while the base class initialiser runs.
        """
        self.start, self.end = start, end
        self.cursor = start
        super().__init__(doc_parser)

    def parse(self):
        """Placeholder: paragraph parsing is not implemented yet.

        TODO: the original note here was cut off ("tell the caller where
        in ..."); presumably the result should report a position back to
        the caller -- confirm the intended contract.
        """
        return None
| 206 | + |
194 | 207 |
|
195 | 208 | class SectionParse(ParseTool): |
196 | 209 |
|
@@ -304,90 +317,6 @@ def __init__(self, doc_parser, name=None): |
304 | 317 | } |
305 | 318 |
|
306 | 319 |
|
307 | | - def old_parse(self): |
308 | | - matcher = MatchList() |
309 | | - heading_matcher = MatchHeading() |
310 | | - table_matcher = MatchTable() |
311 | | - sec_p = self.get_section_parser() |
312 | | - start_pos = sec_p.cursor |
313 | | - end = sec_p.end |
314 | | - short_id = f"List@{start_pos}" |
315 | | - blank_count = 0 |
316 | | - spaces_per_level = 0 |
317 | | - while start_pos < end: |
318 | | - for line in self.doc_parser.lines[start_pos:end]: |
319 | | - # look for end conditions |
320 | | - if (heading_matcher.match_line(line) |
321 | | - or table_matcher.match_line(line)): |
322 | | - return |
323 | | - if matcher.match_line(line): |
324 | | - res = self.old_list_line_get_type(line) |
325 | | - if self.list_type != res['list_type']: |
326 | | - raise Exception('nested other type lists not done yet') |
327 | | - ordinal = res['ordinal'] |
328 | | - if self.list_type != "def": |
329 | | - # calculate level |
330 | | - indent = len(line) - len(line.lstrip()) |
331 | | - if indent == self.margin: |
332 | | - level = 1 |
333 | | - elif spaces_per_level == 0: |
334 | | - # this must be the first indent |
335 | | - spaces_per_level = indent - self.margin |
336 | | - level = 2 |
337 | | - else: |
338 | | - level = int(indent / spaces_per_level) + 1 |
339 | | - # now figure out what parts are what |
340 | | - tmp = line.lstrip().split() |
341 | | - bullet = tmp.pop(0) |
342 | | - content = None |
343 | | - made_sense = True |
344 | | - if len(tmp) == 0: |
345 | | - content = "" |
346 | | - else: |
347 | | - while len(tmp) > 0 and made_sense: |
348 | | - token = tmp[0] |
349 | | - if token.startswith('[@'): |
350 | | - # is counter |
351 | | - discard = tmp.pop(0) |
352 | | - continue |
353 | | - if token in ('[ ]', '[X]', '[x]', '[+]'): |
354 | | - # is checkbox, maybe we shouldn't skip? |
355 | | - discard = tmp.pop(0) |
356 | | - continue |
357 | | - if self.list_type != "def": |
358 | | - content = ' '.join(tmp) |
359 | | - break |
360 | | - elif len(tmp) >= 2: |
361 | | - tag = tmp.pop(0) |
362 | | - if tmp[0] != "::": |
363 | | - made_sense = False |
364 | | - break |
365 | | - if len(tmp) > 0: |
366 | | - content = ' '.join(tmp) |
367 | | - break |
368 | | - if not made_sense: |
369 | | - self.logger.warning("could not parse list line %s", line) |
370 | | - elif self.list_type == "ordered": |
371 | | - content_list = [Text(the_list, content),] |
372 | | - item = OrderedListItem(the_list, level, ordinal, content_list) |
373 | | - elif self.list_type == "unordered": |
374 | | - content_list = [Text(the_list, content),] |
375 | | - item = UnorderedListItem(the_list, level, content_list) |
376 | | - else: |
377 | | - raise Exception('no code for dict lists yet') |
378 | | - self.logger.debug(self.match_log_format, short_id, str(matcher), line) |
379 | | - sec_p.cursor += 1 |
380 | | - else: |
381 | | - if len(line) == 0: |
382 | | - sec_p.cursor += 1 |
383 | | - blank_count += 1 |
384 | | - BlankLine(self.the_list.children[-1]) |
385 | | - if blank_count == 2: |
386 | | - return |
387 | | - continue |
388 | | - return |
389 | | - |
390 | | - |
391 | 320 | def parse(self): |
392 | 321 | sec_p = self.get_section_parser() |
393 | 322 | parent_parser = self.doc_parser.get_parser_parent(self) |
@@ -530,15 +459,20 @@ def parse_list_item(self, line, list_type='unordered'): |
530 | 459 |
|
531 | 460 |
|
class LineRegexMatch:
    """Match one line of text against an ordered list of compiled patterns.

    The structure of this class and its children may look a bit unusual:
    subclasses keep their compiled patterns as class attributes and pass
    them to this constructor, so the patterns are compiled once when the
    class is defined rather than every time an instance is created.
    """

    def __init__(self, patterns):
        """Store ``patterns``, an iterable of compiled regular expressions."""
        self.patterns = patterns

    def match_line(self, line):
        """Return details of the first pattern that matches ``line``.

        Patterns are tried in order with ``match`` (anchored at the start of
        the line).  On success the result is a dict with the keys ``start``,
        ``end``, ``groupdict`` and ``matched`` (the match object itself).
        When no pattern matches, ``False`` is returned.
        """
        # The loop variable is deliberately not called ``re`` so that it
        # does not shadow the imported ``re`` module inside this method.
        for pattern in self.patterns:
            found = pattern.match(line)
            if found:
                return dict(start=found.start(), end=found.end(),
                            groupdict=found.groupdict(),
                            matched=found)
        return False
543 | 477 |
|
544 | 478 | def get_parse_tool(self, doc_parser, name=None): |
@@ -582,3 +516,106 @@ def __init__(self): |
582 | 516 |
|
def get_parse_tool(self, doc_parser, name=None):
    """Return a new ``ListParse`` built from ``doc_parser`` and ``name``."""
    tool = ListParse(doc_parser, name)
    return tool
| 519 | + |
class LineRegexAndEndMatch(LineRegexMatch):
    """Line matcher for elements that have both a begin line and an end line.

    ``patterns`` are the begin-line patterns handled by ``LineRegexMatch``;
    ``end_pattern`` is a single compiled pattern recognising the closing
    line.
    """

    def __init__(self, patterns, end_pattern):
        """Store the begin patterns (via the base class) and the end pattern."""
        # Forward the argument itself.  Passing ``self.patterns`` here
        # ignored the caller's value and raised AttributeError unless a
        # subclass happened to define a class-level ``patterns``.
        super().__init__(patterns)
        self.end_pattern = end_pattern

    def match_end_line(self, line):
        """Return match details if ``line`` is the closing line, else ``False``.

        The result dict has the keys ``start``, ``end``, ``groupdict`` and
        ``matched`` (the match object).
        """
        found = self.end_pattern.match(line)
        if not found:
            return False
        return dict(start=found.start(), end=found.end(),
                    groupdict=found.groupdict(),
                    matched=found)
| 533 | + |
class MatchSrc(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_SRC`` / ``#+END_SRC`` lines (case-insensitive).

    The begin pattern exposes an optional ``language`` named group holding
    the first whitespace-delimited token after ``#+BEGIN_SRC``.
    """
    # Raw strings avoid invalid-escape warnings for \+ and \s.
    # ``\S+`` accepts one-character languages such as "C" and names like
    # "emacs-lisp" or "c++".  The previous ``\w+.`` needed at least two
    # characters and could swallow the space before any header arguments.
    patterns = [
        re.compile(r'^#\+BEGIN_SRC(?:\s+(?P<language>\S+))?', re.IGNORECASE),
    ]
    end_pattern = re.compile(r'^\#\+END_SRC', re.IGNORECASE)

    def __init__(self):
        """Hand the class-level compiled patterns to the base class."""
        super().__init__(self.patterns, self.end_pattern)
| 540 | + |
| 541 | + |
class MatchQuote(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_QUOTE`` / ``#+END_QUOTE`` lines (case-insensitive).

    The begin pattern exposes an optional ``cite`` named group: the rest of
    the line starting at the first word character after ``#+BEGIN_QUOTE``.
    """
    # Raw strings: the pattern text is unchanged, but the backslash
    # sequences no longer rely on Python's invalid-escape fallback.
    patterns = [
        re.compile(r'^\#\+BEGIN_QUOTE\s*(?P<cite>\w+.*)?', re.IGNORECASE),
    ]
    end_pattern = re.compile(r'^\#\+END_QUOTE', re.IGNORECASE)

    def __init__(self):
        """Hand the class-level compiled patterns to the base class."""
        super().__init__(self.patterns, self.end_pattern)
| 548 | + |
class MatchCenter(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_CENTER`` / ``#+END_CENTER`` lines (case-insensitive)."""
    # Raw strings: the pattern text is unchanged, but the backslash
    # sequences no longer rely on Python's invalid-escape fallback.
    patterns = [re.compile(r'^\#\+BEGIN_CENTER', re.IGNORECASE)]
    end_pattern = re.compile(r'^\#\+END_CENTER', re.IGNORECASE)

    def __init__(self):
        """Hand the class-level compiled patterns to the base class."""
        super().__init__(self.patterns, self.end_pattern)
| 555 | + |
class MatchExample(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_EXAMPLE`` / ``#+END_EXAMPLE`` lines (case-insensitive)."""
    # Raw strings: the pattern text is unchanged, but the backslash
    # sequences no longer rely on Python's invalid-escape fallback.
    patterns = [re.compile(r'^\#\+BEGIN_EXAMPLE', re.IGNORECASE)]
    end_pattern = re.compile(r'^\#\+END_EXAMPLE', re.IGNORECASE)

    def __init__(self):
        """Hand the class-level compiled patterns to the base class."""
        super().__init__(self.patterns, self.end_pattern)
| 562 | + |
class Detector:
    """Find the first line in a range that starts a recognised element.

    The matchers are class attributes, so they are created once when the
    class is defined and shared by every ``Detector`` instance.
    """
    heading_matcher = MatchHeading()
    table_matcher = MatchTable()
    list_matcher = MatchList()
    quote_matcher = MatchQuote()
    center_matcher = MatchCenter()
    # End of the matchers scanned by detect_greater_element; the src
    # matcher below is not consulted by either detect method yet.
    src_matcher = MatchSrc()

    def detect_greater_element(self, lines, start, end):
        """Return details of the first matching line in ``lines[start:end]``.

        See https://orgmode.org/worg/org-syntax.html#Elements.  Some things
        are covered elsewhere, such as the zeroth section, which is detected
        by the doc parser.

        Each line is tested against the heading, table, list, quote and
        center matchers, in that order.  On the first hit a dict is returned
        with the keys ``match_type``, ``pos`` (index of the line in
        ``lines``), ``string`` (the matched line), ``start`` / ``end`` (span
        of the match within the line) and ``group_dict``.  Returns ``None``
        when no line in the range matches.
        """
        # Built once per call rather than once per line; the order is the
        # priority in which matchers are tried on each line.
        candidates = (
            ('heading', self.heading_matcher),
            ('table', self.table_matcher),
            ('list', self.list_matcher),
            ('quote', self.quote_matcher),
            ('center', self.center_matcher),
        )
        for pos, line in enumerate(lines[start:end], start):
            for match_type, matcher in candidates:
                match_res = matcher.match_line(line)
                if match_res:
                    matched = match_res['matched']
                    return dict(match_type=match_type, pos=pos,
                                string=matched.string,
                                start=match_res['start'],
                                end=match_res['end'],
                                group_dict=matched.groupdict())
            # lesser elements are not detected here yet
        return None

    def detect_object(self, lines, start, end):
        """Return which matcher first hits a line in ``lines[start:end]``.

        Lines are tested against the heading, table and list matchers, in
        that order.  On the first hit a dict with the keys ``matched`` (one
        of ``'heading'``, ``'table'``, ``'list'``) and ``pos`` (index of the
        line in ``lines``) is returned; ``None`` when nothing matches.
        """
        # The matchers live on the class, so they must be reached through
        # ``self``; bare names here raised NameError on the first line.
        candidates = (
            ('heading', self.heading_matcher),
            ('table', self.table_matcher),
            ('list', self.list_matcher),
        )
        for pos, line in enumerate(lines[start:end], start):
            for kind, matcher in candidates:
                if matcher.match_line(line):
                    return dict(matched=kind, pos=pos)
        return None
| 610 | + |
| 611 | + |
if __name__=="__main__":
    # Ad-hoc smoke check: run the detector over a small center block and
    # print whatever detect_greater_element reports for it.
    lines = [
        '#+Begin_Center',
        'Stuff in the middle',
        'More Stuff in the middle',
        '#+END_CenTer',
    ]
    detector = Detector()
    elem = detector.detect_greater_element(lines, 0, len(lines))
    print(elem)
| 621 | + |
0 commit comments