inline translation in book

neta-elad · neta-elad · commit c880781cc622 · 2022-08-20T16:24:22.000-07:00
diff --git a/README.md b/README.md
@@ -59,6 +59,7 @@ and [extended markup](https://python-markdown.github.io/extensions/).
 Sound changes are defined in `changes.lsc`, 
 and applied using Lexurgy.
 
+### Lexicon
 Pyconlang's lexicon is defined in `lexicon.txt`.
 You define basic entries using the `entry` declaration:
 ```
@@ -129,7 +130,7 @@ entry &noun <apple> *saka (n.) apple, any kind of tree-fruit
 You can see the diagrams for the lexicon syntax
 [here](https://htmlpreview.github.io/?https://github.com/neta-elad/pyconlang/blob/main/diagrams.html).
 
-### Complete Example
+#### Example
 Given the sound changes `changes.lsc`:
 ```
 Class vowel {a, e, i, o, u}
@@ -181,6 +182,15 @@ The following entries will appear in the book:
 >
 > ...
 
+### Markdown Extensions
+Inline translations (forms resolved through the lexicon and then evolved)
+can be inserted between a pair of hash signs (`#`):
+```
+**An example: #*aki@after-palatalization <stone>.PL#.**
+```
+is rendered as
+> **An example: agi abagigim.**
+
 
 
 ## TODO
@@ -202,4 +212,4 @@ The following entries will appear in the book:
   - [ ] Affixes list
   - [ ] Phonology tables
   - [ ] Conjugation tables
-  - [ ] Inline translation
+  - [x] Inline translation
diff --git a/pyconlang/book/__init__.py b/pyconlang/book/__init__.py
@@ -10,7 +10,8 @@
 from .. import PYCONLANG_PATH
 from .block import Boxed
 from .inline import InlineDelete, InlineInsert
-from .preprocess import LexiconInserter, SkipLine
+from .lexicon_inserter import LexiconInserter
+from .preprocess import SkipLine
 
 
 class Compiler:
diff --git a/pyconlang/book/lexicon_inserter.py b/pyconlang/book/lexicon_inserter.py
@@ -0,0 +1,112 @@
+import string
+from itertools import chain
+from typing import Any, Dict, List, Match, Tuple, Union
+from xml.etree.ElementTree import Element
+
+from markdown import Extension, Markdown
+from markdown.inlinepatterns import InlineProcessor
+from markdown.preprocessors import Preprocessor
+
+from ..lexicon import Lexicon
+from ..lexicon.parser import parse_lexicon_file, parse_sentence
+from ..lexurgy import evolve
+from ..types import AffixType, Entry, Form, ResolvedForm
+
+
+class LexiconPreprocessor(Preprocessor):
+    lexicon: Lexicon
+
+    def __init__(self, md: Markdown, lexicon: Lexicon) -> None:
+        super().__init__(md)
+        self.lexicon = lexicon
+
+    def run(self, lines: List[str]) -> List[str]:
+        new_lines = []
+        for line in lines:
+            if line.strip() == "!lexicon":
+                lexicon: Dict[str, List[Tuple[List[str], Entry]]] = {}
+                for entry in self.lexicon.entries:
+                    evolved = self.evolve_all(entry)
+                    letter = evolved[0][0]
+                    lexicon.setdefault(letter, [])
+                    lexicon[letter].append((evolved, entry))
+                for letter in string.ascii_lowercase:
+                    new_lines.append(f"## {letter.upper()}")
+
+                    if letter not in lexicon:
+                        continue
+
+                    lexicon[letter].sort()
+                    for evolved, entry in lexicon[letter]:
+                        protos = " + ".join(
+                            f"_\\*{proto}_" for proto in self.form_to_protos(entry.form)
+                        )
+                        all_evolved = ", ".join(f"**{each}**" for each in evolved)
+                        new_lines.append(
+                            f"""
+                        {all_evolved} {protos} ({entry.part_of_speech.name}.) {entry.definition}
+                        """.strip()
+                        )
+                        new_lines.append("")
+            else:
+                new_lines.append(line)
+        return new_lines
+
+    def form_to_protos(self, form: Form) -> List[str]:
+        return self.resolved_form_to_protos(self.lexicon.resolve(form))
+
+    def resolved_form_to_protos(self, form: ResolvedForm) -> List[str]:
+        protos = [[form.stem.form]]
+        for affix in form.affixes:
+            affix_protos = self.resolved_form_to_protos(affix.form)
+            if affix.affix.type is AffixType.PREFIX:
+                protos.insert(0, affix_protos)
+            else:
+                protos.append(affix_protos)
+
+        return list(chain(*protos))
+
+    def evolve_all(self, entry: Entry) -> List[str]:
+        return [
+            evolve(self.lexicon.substitute(var, entry.form))
+            for var in self.lexicon.get_vars(entry.template)
+        ]
+
+
+class LexiconInlineProcessor(InlineProcessor):
+    lexicon: Lexicon
+
+    def __init__(self, lexicon: Lexicon) -> None:
+        super().__init__(r"#(.*?)#")
+        self.lexicon = lexicon
+
+    # InlineProcessor and its parent Pattern
+    # have contradictory type annotations,
+    # so we have to ignore type.
+    def handleMatch(  # type: ignore
+        self, m: Match[str], data: Any
+    ) -> Union[Tuple[Element, int, int], Tuple[None, None, None]]:
+        element = Element("span")
+        element.text = self.evolve(m.group(1))
+        return element, m.start(), m.end()
+
+    def evolve(self, raw: str) -> str:
+        return " ".join(
+            evolve(self.lexicon.resolve(form)) for form in parse_sentence(raw)
+        )
+
+
+class LexiconInserter(Extension):
+    lexicon: Lexicon
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.lexicon = parse_lexicon_file()
+
+    def extendMarkdown(self, md: Markdown) -> None:
+        md.registerExtension(self)
+        md.preprocessors.register(LexiconPreprocessor(md, self.lexicon), "lexicon", 0)
+        md.inlinePatterns.register(
+            LexiconInlineProcessor(self.lexicon), "inline-lexicon", 200
+        )
diff --git a/pyconlang/book/preprocess.py b/pyconlang/book/preprocess.py
@@ -1,16 +1,9 @@
-import string
-from itertools import chain
-from typing import Dict, List, Tuple
+from typing import List
 
 from markdown import Markdown
 from markdown.extensions import Extension
 from markdown.preprocessors import Preprocessor
 
-from ..lexicon import Lexicon
-from ..lexicon.parser import parse_lexicon_file
-from ..lexurgy import evolve
-from ..types import AffixType, Entry, Form, ResolvedForm
-
 
 class SkipLinePreprocessor(Preprocessor):
     state: "SkipLine"
@@ -44,71 +37,3 @@ def extendMarkdown(self, md: Markdown) -> None:
 
     def reset(self) -> None:
         self.skipped = []
-
-
-class LexiconPreprocessor(Preprocessor):
-    lexicon: Lexicon
-
-    def __init__(self, md: Markdown, lexicon: Lexicon) -> None:
-        super().__init__(md)
-        self.lexicon = lexicon
-
-    def run(self, lines: List[str]) -> List[str]:
-        new_lines = []
-        for line in lines:
-            if line.strip() == "!lexicon":
-                lexicon: Dict[str, List[Tuple[List[str], Entry]]] = {}
-                for entry in self.lexicon.entries:
-                    evolved = self.evolve_all(entry)
-                    letter = evolved[0][0]
-                    lexicon.setdefault(letter, [])
-                    lexicon[letter].append((evolved, entry))
-                for letter in string.ascii_lowercase:
-                    new_lines.append(f"## {letter.upper()}")
-
-                    if letter not in lexicon:
-                        continue
-
-                    lexicon[letter].sort()
-                    for evolved, entry in lexicon[letter]:
-                        protos = " + ".join(
-                            f"_\\*{proto}_" for proto in self.form_to_protos(entry.form)
-                        )
-                        all_evolved = ", ".join(f"**{each}**" for each in evolved)
-                        new_lines.append(
-                            f"""
-                        {all_evolved} {protos} ({entry.part_of_speech.name}.) {entry.definition}
-                        """.strip()
-                        )
-                        new_lines.append("")
-            else:
-                new_lines.append(line)
-        return new_lines
-
-    def form_to_protos(self, form: Form) -> List[str]:
-        return self.resolved_form_to_protos(self.lexicon.resolve(form))
-
-    def resolved_form_to_protos(self, form: ResolvedForm) -> List[str]:
-        protos = [[form.stem.form]]
-        for affix in form.affixes:
-            affix_protos = self.resolved_form_to_protos(affix.form)
-            if affix.affix.type is AffixType.PREFIX:
-                protos.insert(0, affix_protos)
-            else:
-                protos.append(affix_protos)
-
-        return list(chain(*protos))
-
-    def evolve_all(self, entry: Entry) -> List[str]:
-        return [
-            evolve(self.lexicon.substitute(var, entry.form))
-            for var in self.lexicon.get_vars(entry.template)
-        ]
-
-
-class LexiconInserter(Extension):
-    def extendMarkdown(self, md: Markdown) -> None:
-        md.registerExtension(self)
-        md.preprocessors.register(
-            LexiconPreprocessor(md, parse_lexicon_file()), "lexicon", 0
-        )
diff --git a/pyconlang/lexicon/parser.py b/pyconlang/lexicon/parser.py
@@ -1,15 +1,13 @@
 from pathlib import Path
-from typing import Any, Callable, TypeVar, Union
+from typing import Any, Callable, List, TypeVar, Union, cast
 
 from pyparsing import (
     Group,
-    OneOrMore,
     Opt,
     ParserElement,
     ParseResults,
     Suppress,
     Word,
-    ZeroOrMore,
     alphanums,
     alphas,
     pyparsing_unicode,
@@ -23,6 +21,7 @@
     AffixType,
     Canonical,
     Entry,
+    Form,
     Fusion,
     PartOfSpeech,
     Proto,
@@ -60,7 +59,7 @@ def parse_lexicon(string: str) -> Lexicon:
     result = lexicon.parse_string(string, parse_all=True)[0]
 
     if not isinstance(result, Lexicon):
-        raise RuntimeError("Bad parsing")
+        raise RuntimeError(f"Could not parse {string}")
 
     return result
 
@@ -69,6 +68,10 @@ def parse_lexicon_file(filename: Path = Path("lexicon.txt")) -> Lexicon:
     return parse_lexicon(filename.read_text())
 
 
+def parse_sentence(string: str) -> List[Form]:
+    return cast(List[Form], list(sentence.parse_string(string, parse_all=True)))
+
+
 ident = Word(alphanums + "-").set_name("ident")
 rule = (Suppress("@") + ident).set_parse_action(token_map(Rule)).set_name("rule")
 canonical = (
@@ -107,23 +110,23 @@ def parse_lexicon_file(filename: Path = Path("lexicon.txt")) -> Lexicon:
 )
 affix = (prefix | suffix).set_name("affix")
 var = (
-    (ZeroOrMore(prefix) + Suppress("$") + ZeroOrMore(suffix))
+    (prefix[...] + Suppress("$") + suffix[...])
     .set_parse_action(Var.from_iterable)
     .set_name("var")
 )
 template = (
-    (Suppress("template") + template_name + OneOrMore(var))
+    (Suppress("template") + template_name + var[1, ...])
     .set_parse_action(tokens_map(Template.from_args))
     .set_name("template")
 )
 fusion = (
-    (Group(ZeroOrMore(prefix), True) + canonical + Group(ZeroOrMore(suffix), True))
+    (Group(prefix[...], True) + canonical + Group(suffix[...], True))
     .set_parse_action(tokens_map(Fusion.from_prefixes_and_suffixes))
     .set_name("fusion")
 )
 form = (proto | fusion).set_name("form")
 lexical_sources = (
-    Suppress("(") + OneOrMore(canonical).set_parse_action(tuple) + Suppress(")")
+    Suppress("(") + canonical[1, ...].set_parse_action(tuple) + Suppress(")")
 )
 affix_definition = (
     (
@@ -153,11 +156,12 @@ def parse_lexicon_file(filename: Path = Path("lexicon.txt")) -> Lexicon:
     .set_name("entry")
 )
 lexicon = (
-    ZeroOrMore(entry | affix_definition | template)
+    (entry | affix_definition | template)[...]
     .set_parse_action(Lexicon.from_iterable)
     .set_name("lexicon")
 )
 
+sentence = form[...]
 
 if __name__ == "__main__":
     make_diagrams()
diff --git a/tests/lexicon/test_parser.py b/tests/lexicon/test_parser.py
@@ -7,6 +7,7 @@
     form,
     fusion,
     lexical_sources,
+    parse_sentence,
     part_of_speech,
     proto,
     rule,
@@ -181,5 +182,14 @@ def test_var(sample_lexicon):
     )
 
 
+def test_sentence():
+    assert tuple(parse_sentence("*aka <strong> COL.<with space> *taka@start")) == (
+        Proto("aka", None),
+        Fusion(Canonical("strong"), ()),
+        Fusion(Canonical("with space"), (Affix("COL", AffixType.PREFIX),)),
+        Proto("taka", Rule("start")),
+    )
+
+
 def parse(parser, string):
-    return parser.parse_string(string)[0]
+    return parser.parse_string(string, parse_all=True)[0]
diff --git a/tests/test_book.py b/tests/test_book.py
@@ -3,12 +3,21 @@
 
 
 def test_book(simple_pyconlang):
+    (simple_pyconlang / "grammar.md").write_text(
+        "**This is an example: #*kika@era1 <stone>.PL#**"
+    )
+
     compile_book()
 
     html = (PYCONLANG_PATH / "output.html").read_text()
 
     assert "By Mr. Tester" in html
     assert "TestLang" in html
+
+    assert (
+        "<p><strong>This is an example: <span>kiga abagigi</span></strong></p>" in html
+    )
+
     assert (
         "<p><strong>abagigi</strong> <em>*apak</em> + <em>*iki</em> (n.) gravel</p>"
         in html