Skip to content

Commit 545da0e

Browse files
author
Dennis Parker
committed
checkpoint
1 parent 57deea1 commit 545da0e

File tree

4 files changed

+141
-87
lines changed

4 files changed

+141
-87
lines changed

README.org

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
** Things it doesn't do and probably should
1111
- Produce LaTeX output, including any LaTeX features found in the org files
1212
- Parse and do something useful with the time management aspects of org files, this
13+
** Things it doesn't do and maybe never will
14+
- Inlinetasks are not parsed; they will be treated as headings and will make things ugly
15+
- Footnotes are not parsed, they will be treated as ordinary text
1316
** History, what I wanted and why it led to this.
1417
*** What
1518
I wanted to be able to take notes on a wide range of topics and relate them together
@@ -48,3 +51,13 @@
4851
The scale of the modifications needed to achieve my goals convinced me that I was going
4952
to contort the structure so badly that it would be difficult to maintain. So I decided
5053
to start over.
54+
55+
56+
#+begin_quote
57+
Stuff here
58+
#+end_quote
59+
60+
#+begin_src python
61+
def foo(s):
62+
pass
63+
#+end_src

dev_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pytest==8.3.5
22
pytest-cov==6.0.0
3+
ipdb==0.13.13

src/roam2doc/parse.py

Lines changed: 124 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
import logging
3+
import typing
34
from pprint import pformat
45
from roam2doc.tree import (Root, Branch, Section, Heading, Text, Paragraph, BlankLine, TargetText,
56
LinkTarget, BoldText, ItalicText,
@@ -190,7 +191,19 @@ def get_section_parser(self):
190191
while p is not None and not isinstance(p, SectionParse):
191192
p.self.doc_parser.get_parser_parent(p)
192193
return p
193-
194+
195+
class ParagraphParse(ParseTool):
    """Parse tool for a paragraph; currently only a skeleton.

    The constructor records the bounds it is given and a cursor; the
    actual parsing step has not been written yet.
    """

    def __init__(self, doc_parser, start, end):
        """Remember the paragraph bounds and set the cursor to ``start``.

        doc_parser: forwarded unchanged to the base-class constructor.
        start, end: bounds of the paragraph (presumably line indexes
            into the document -- TODO confirm against the caller).
        """
        # Bounds are set before the base initialiser runs, as in the
        # original ordering.
        self.start, self.end = start, end
        self.cursor = start
        super().__init__(doc_parser)

    def parse(self):
        """Placeholder; does nothing and returns None."""
        # TODO: the original note here was left unfinished ("tell the
        # caller where in ..."); presumably the method should report how
        # far parsing got -- confirm intent before implementing.
        pass
206+
194207

195208
class SectionParse(ParseTool):
196209

@@ -304,90 +317,6 @@ def __init__(self, doc_parser, name=None):
304317
}
305318

306319

307-
def old_parse(self):
308-
matcher = MatchList()
309-
heading_matcher = MatchHeading()
310-
table_matcher = MatchTable()
311-
sec_p = self.get_section_parser()
312-
start_pos = sec_p.cursor
313-
end = sec_p.end
314-
short_id = f"List@{start_pos}"
315-
blank_count = 0
316-
spaces_per_level = 0
317-
while start_pos < end:
318-
for line in self.doc_parser.lines[start_pos:end]:
319-
# look for end conditions
320-
if (heading_matcher.match_line(line)
321-
or table_matcher.match_line(line)):
322-
return
323-
if matcher.match_line(line):
324-
res = self.old_list_line_get_type(line)
325-
if self.list_type != res['list_type']:
326-
raise Exception('nested other type lists not done yet')
327-
ordinal = res['ordinal']
328-
if self.list_type != "def":
329-
# calculate level
330-
indent = len(line) - len(line.lstrip())
331-
if indent == self.margin:
332-
level = 1
333-
elif spaces_per_level == 0:
334-
# this must be the first indent
335-
spaces_per_level = indent - self.margin
336-
level = 2
337-
else:
338-
level = int(indent / spaces_per_level) + 1
339-
# now figure out what parts are what
340-
tmp = line.lstrip().split()
341-
bullet = tmp.pop(0)
342-
content = None
343-
made_sense = True
344-
if len(tmp) == 0:
345-
content = ""
346-
else:
347-
while len(tmp) > 0 and made_sense:
348-
token = tmp[0]
349-
if token.startswith('[@'):
350-
# is counter
351-
discard = tmp.pop(0)
352-
continue
353-
if token in ('[ ]', '[X]', '[x]', '[+]'):
354-
# is checkbox, maybe we shouldn't skip?
355-
discard = tmp.pop(0)
356-
continue
357-
if self.list_type != "def":
358-
content = ' '.join(tmp)
359-
break
360-
elif len(tmp) >= 2:
361-
tag = tmp.pop(0)
362-
if tmp[0] != "::":
363-
made_sense = False
364-
break
365-
if len(tmp) > 0:
366-
content = ' '.join(tmp)
367-
break
368-
if not made_sense:
369-
self.logger.warning("could not parse list line %s", line)
370-
elif self.list_type == "ordered":
371-
content_list = [Text(the_list, content),]
372-
item = OrderedListItem(the_list, level, ordinal, content_list)
373-
elif self.list_type == "unordered":
374-
content_list = [Text(the_list, content),]
375-
item = UnorderedListItem(the_list, level, content_list)
376-
else:
377-
raise Exception('no code for dict lists yet')
378-
self.logger.debug(self.match_log_format, short_id, str(matcher), line)
379-
sec_p.cursor += 1
380-
else:
381-
if len(line) == 0:
382-
sec_p.cursor += 1
383-
blank_count += 1
384-
BlankLine(self.the_list.children[-1])
385-
if blank_count == 2:
386-
return
387-
continue
388-
return
389-
390-
391320
def parse(self):
392321
sec_p = self.get_section_parser()
393322
parent_parser = self.doc_parser.get_parser_parent(self)
@@ -530,15 +459,20 @@ def parse_list_item(self, line, list_type='unordered'):
530459

531460

532461
class LineRegexMatch:
533-
462+
""" The structure of this class and its children might look a bit funny,
463+
but i am trying to ensure that the re patterns are compiled just once,
464+
not every time a class is instantiated"""
465+
534466
def __init__(self, patterns):
535467
self.patterns = patterns
536468

537469
def match_line(self, line):
538470
for re in self.patterns:
539471
sr = re.match(line)
540472
if sr:
541-
return dict(start=sr.start(), end=sr.end(), matched=sr)
473+
return dict(start=sr.start(), end=sr.end(),
474+
groupdict=sr.groupdict(),
475+
matched=sr)
542476
return False
543477

544478
def get_parse_tool(self, doc_parser, name=None):
@@ -582,3 +516,106 @@ def __init__(self):
582516

583517
def get_parse_tool(self, doc_parser, name=None):
584518
return ListParse(doc_parser, name)
519+
520+
class LineRegexAndEndMatch(LineRegexMatch):
    """Line matcher for elements that have an opening and a closing line.

    Opening lines are tested through the inherited ``match_line`` using
    the pattern list; closing lines are tested against one additional
    compiled pattern via ``match_end_line``.
    """

    def __init__(self, patterns, end_pattern):
        """Store the opening patterns and the closing pattern.

        patterns: compiled regexes handed to the base class.
        end_pattern: compiled regex that recognises the closing line.
        """
        # Forward the argument itself. Reading ``self.patterns`` here
        # only worked for subclasses that happen to define a class-level
        # ``patterns`` attribute and silently ignored the argument.
        super().__init__(patterns)
        self.end_pattern = end_pattern

    def match_end_line(self, line):
        """Test ``line`` against the closing pattern.

        Returns a dict with ``start``, ``end``, ``groupdict`` and
        ``matched`` (the match object) on success, otherwise False.
        """
        sr = self.end_pattern.match(line)
        if not sr:
            return False
        return dict(start=sr.start(), end=sr.end(),
                    groupdict=sr.groupdict(),
                    matched=sr)
533+
534+
class MatchSrc(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_SRC`` / ``#+END_SRC`` lines (case-insensitive).

    The optional ``language`` group captures the first run of
    non-whitespace characters after the keyword, so names such as
    ``emacs-lisp`` or ``c++`` are captured whole and no trailing
    character is swallowed.
    """
    # Compiled once when the class is defined and shared by all instances.
    # Raw strings keep the regex escapes from being read as (invalid)
    # string escape sequences.
    patterns = [re.compile(r'^\#\+BEGIN_SRC\s*(?P<language>\S+)?', re.IGNORECASE),]
    end_pattern = re.compile(r'^\#\+END_SRC', re.IGNORECASE)

    def __init__(self):
        """Initialise the base class with the class-level patterns."""
        super().__init__(self.patterns, self.end_pattern)
540+
541+
542+
class MatchQuote(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_QUOTE`` / ``#+END_QUOTE`` lines (case-insensitive).

    The optional ``cite`` group captures the rest of the opening line,
    starting at the first word character after the keyword.
    """
    # Compiled once when the class is defined and shared by all instances.
    # Raw strings: same compiled regex as before, without relying on
    # invalid string escapes such as "\#" and "\s".
    patterns = [re.compile(r'^\#\+BEGIN_QUOTE\s*(?P<cite>\w+.*)?', re.IGNORECASE),]
    end_pattern = re.compile(r'^\#\+END_QUOTE', re.IGNORECASE)

    def __init__(self):
        """Initialise the base class with the class-level patterns."""
        super().__init__(self.patterns, self.end_pattern)
548+
549+
class MatchCenter(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_CENTER`` / ``#+END_CENTER`` lines (case-insensitive)."""
    # Compiled once when the class is defined and shared by all instances.
    # Raw strings: same compiled regex as before, without relying on
    # invalid string escapes such as "\#" and "\+".
    patterns = [re.compile(r'^\#\+BEGIN_CENTER', re.IGNORECASE),]
    end_pattern = re.compile(r'^\#\+END_CENTER', re.IGNORECASE)

    def __init__(self):
        """Initialise the base class with the class-level patterns."""
        super().__init__(self.patterns, self.end_pattern)
555+
556+
class MatchExample(LineRegexAndEndMatch):
    """Matcher for ``#+BEGIN_EXAMPLE`` / ``#+END_EXAMPLE`` lines (case-insensitive)."""
    # Compiled once when the class is defined and shared by all instances.
    # Raw strings: same compiled regex as before, without relying on
    # invalid string escapes such as "\#" and "\+".
    patterns = [re.compile(r'^\#\+BEGIN_EXAMPLE', re.IGNORECASE),]
    end_pattern = re.compile(r'^\#\+END_EXAMPLE', re.IGNORECASE)

    def __init__(self):
        """Initialise the base class with the class-level patterns."""
        super().__init__(self.patterns, self.end_pattern)
562+
563+
class Detector:
    """Find the first line in a range that opens a recognised org element.

    The matchers are class attributes, so each one is constructed once
    and shared by every Detector instance.
    """
    heading_matcher = MatchHeading()
    table_matcher = MatchTable()
    list_matcher = MatchList()
    quote_matcher = MatchQuote()
    center_matcher = MatchCenter()
    # end of greater
    src_matcher = MatchSrc()

    def detect_greater_element(self, lines, start, end):
        """Return a description of the first greater element in lines[start:end].

        See https://orgmode.org/worg/org-syntax.html#Elements. Some
        things are covered elsewhere, such as the zeroth section, which
        is detected by the doc parser.

        Each line is offered to the heading, table, list, quote and
        center matchers in that order; the first hit wins.

        Returns a dict with keys ``match_type``, ``pos`` (index of the
        line within ``lines``), ``string``, ``start``, ``end`` and
        ``group_dict``, or None when nothing in the range matches.
        """
        # Built once per call rather than once per line; the order is
        # the priority order of the matchers.
        matchers = (
            ('heading', self.heading_matcher),
            ('table', self.table_matcher),
            ('list', self.list_matcher),
            ('quote', self.quote_matcher),
            ('center', self.center_matcher),
        )
        for pos, line in enumerate(lines[start:end], start):
            # greater elements
            for match_type, matcher in matchers:
                match_res = matcher.match_line(line)
                if not match_res:
                    continue
                matched = match_res['matched']
                return dict(match_type=match_type, pos=pos,
                            string=matched.string,
                            start=match_res['start'],
                            end=match_res['end'],
                            group_dict=matched.groupdict())
            # lesser elements
        return None

    def detect_object(self, lines, start, end):
        """Return the first heading, table or list line in lines[start:end].

        Returns ``dict(matched=<'heading'|'table'|'list'>, pos=<index of
        the line within lines>)``, or None when no line matches.
        """
        for pos, line in enumerate(lines[start:end], start):
            # The matchers live on the class, so they must be reached
            # through self; bare names here raised NameError.
            if self.heading_matcher.match_line(line):
                return dict(matched='heading', pos=pos)
            if self.table_matcher.match_line(line):
                return dict(matched='table', pos=pos)
            if self.list_matcher.match_line(line):
                return dict(matched='list', pos=pos)
        return None
610+
611+
612+
if __name__=="__main__":
    # Manual smoke check: run the detector over a small center block
    # and print whatever it reports.
    sample = [
        '#+Begin_Center',
        'Stuff in the middle',
        'More Stuff in the middle',
        '#+END_CenTer',
    ]
    elem = Detector().detect_greater_element(sample, 0, len(sample))
    print(elem)
621+

tests/org_files/examples/props_title_and_list.org

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
+ Another
1111
+ under
12+
A paragraph here
13+
14+
+ That works
1215

1316

1417

0 commit comments

Comments
 (0)