Files
MarkdownToIHKChemnits/markdown_parser.go
T

420 lines
15 KiB
Go

package main
import (
"bytes"
"fmt"
"log"
"os"
"strings"
"github.com/yuin/goldmark"
meta "github.com/yuin/goldmark-meta"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/extension"
extast "github.com/yuin/goldmark/extension/ast"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
"gopkg.in/yaml.v3"
)
// ParseMarkdown reads the Markdown file at mdPath, parses it with Goldmark
// (meta and table extensions enabled) and decodes the YAML front matter into
// a Config.
//
// It returns the decoded Config, the root node of the parsed document and the
// raw file bytes (needed later to resolve text segments of the AST). On any
// error the other three results are zero values.
//
// The front matter is obtained as a generic map from goldmark-meta and then
// round-tripped through YAML so that the struct tags on Config decide the
// mapping. A file without front matter yields a zero Config.
func ParseMarkdown(mdPath string) (Config, ast.Node, []byte, error) {
	content, err := os.ReadFile(mdPath)
	if err != nil {
		return Config{}, nil, nil, err
	}

	md := goldmark.New(
		goldmark.WithExtensions(meta.Meta, extension.Table),
	)
	ctx := parser.NewContext()
	doc := md.Parser().Parse(text.NewReader(content), parser.WithContext(ctx))

	// NOTE(review): meta.Get does not report front-matter parse errors, so
	// malformed YAML presumably ends up as an empty Config here. If the
	// vendored goldmark-meta version offers meta.TryGet, consider using it —
	// confirm against go.mod.
	metaData := meta.Get(ctx)

	// Re-encode the generic map so it can be decoded into the typed Config.
	// The encoding error used to be discarded; surface it instead of feeding
	// partial output to Unmarshal.
	raw, err := yaml.Marshal(metaData)
	if err != nil {
		return Config{}, nil, nil, fmt.Errorf("YAML front matter: %w", err)
	}

	var config Config
	if err := yaml.Unmarshal(raw, &config); err != nil {
		return Config{}, nil, nil, fmt.Errorf("YAML front matter: %w", err)
	}
	return config, doc, content, nil
}
// parserState carries the one-shot flags and pending captions that directive
// paragraphs arm and that the next matching block (diagram, code block or
// table) consumes during the AST walk, plus the stack used for lists.
type parserState struct {
	// Armed by @AnhangUML:, @AnhangUMLQuer: and @AnhangUMLGedreht: — the next
	// successfully rendered diagram goes to the appendix instead of inline.
	nextCodeIsAppendix bool
	// Armed by @AnhangUMLQuer: — landscape page for the diagram appendix.
	nextAppendixLandscape bool
	// Armed by @AnhangUMLGedreht: — portrait page, image rotated 90° CCW.
	nextAppendixRotated bool
	// Title given with the @AnhangUML* directive.
	appendixTitle string

	// Armed by @AnhangCode: — the next non-diagram code block goes to the
	// appendix.
	nextCodeBlockAppendix bool
	// Title given with @AnhangCode:.
	codeBlockAppendixTitle string

	// Caption for the next table; set by @Tabelle:, @TabelleAnhang: and
	// @TabelleAnhangQuer:.
	nextTableCaption string
	// Armed by @TabelleAnhang: and @TabelleAnhangQuer:.
	nextTableIsAppendix bool
	// Armed by @TabelleAnhangQuer:.
	nextTableIsLandscape bool

	// Armed by @DiagrammQuer: — the next diagram gets a landscape page.
	nextDiagramLandscape bool
	// Caption for the next diagram; set by @DiagrammQuer: and @Diagramm:.
	nextDiagramCaption string

	// One frame per currently open list; the innermost list is last.
	listStack []listFrame
}
// listFrame is the bookkeeping record for a single open list: whether it is
// an ordered list and how many of its items have been rendered so far.
type listFrame struct {
	// ordered is true for numbered lists, false for bullet lists.
	ordered bool
	// index is the running item counter; it is incremented before an item is
	// rendered, so the first item sees 1.
	index int
}
// RenderAST walks the Goldmark AST rooted at doc and dispatches every block
// it understands to the matching IHKRenderer method. content must be the
// source the document was parsed from; it is used to resolve text segments.
//
// Front-matter detection: rendering starts with StartFrontMatter(). While
// r.numType is still NumRoman, a level-1 heading whose title is accepted by
// isFrontMatterSection ("Vorwort", "Einleitung", "Abkürzungsverzeichnis")
// leaves that mode untouched; any other level-1 heading calls StartMainBody()
// before the heading itself is rendered.
//
// Directive paragraphs (see handleDirectives) arm one-shot flags in
// parserState which the next diagram, code block or table consumes.
//
// List items render their leading Paragraph/TextBlock as the item text and
// then let the walk continue into the remaining children, so nested lists and
// other blocks inside an item are rendered rather than dropped.
//
// A paragraph that consists of a single image is rendered through
// RenderImage; images mixed with other inline content contribute only their
// alt text to the paragraph.
//
// The walker never returns a non-nil error itself; the result is whatever
// ast.Walk reports.
func RenderAST(doc ast.Node, content []byte, r *IHKRenderer) error {
	r.StartFrontMatter()
	state := &parserState{}

	return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
		switch node := n.(type) {

		// ── Headings ──────────────────────────────────────────────────────────
		case *ast.Heading:
			if !entering {
				return ast.WalkContinue, nil
			}
			title := extractPlainText(node, content)
			if node.Level == 1 && r.numType == NumRoman && !isFrontMatterSection(title) {
				r.StartMainBody()
			}
			r.RenderHeader(node.Level, title)
			return ast.WalkSkipChildren, nil

		// ── Paragraphs ────────────────────────────────────────────────────────
		case *ast.Paragraph:
			if !entering {
				return ast.WalkContinue, nil
			}
			if isListItemLead(node) {
				// Already rendered as the item text by the ListItem case.
				return ast.WalkSkipChildren, nil
			}
			if isImageOnlyParagraph(node) {
				// Descend so the *ast.Image case below renders the picture;
				// skipping the children here would reduce it to its alt text.
				return ast.WalkContinue, nil
			}
			renderParagraphLike(node, content, state, r)
			return ast.WalkSkipChildren, nil

		// Tight lists carry their text in TextBlock nodes instead of
		// Paragraphs.
		case *ast.TextBlock:
			if !entering {
				return ast.WalkContinue, nil
			}
			if !isListItemLead(node) {
				renderParagraphLike(node, content, state, r)
			}
			return ast.WalkSkipChildren, nil

		// ── Fenced code blocks ────────────────────────────────────────────────
		case *ast.FencedCodeBlock:
			if !entering {
				return ast.WalkContinue, nil
			}
			lang := string(node.Language(content))
			code := extractCodeBlock(node, content)

			if lang == "mermaid" || lang == "plantuml" || lang == "puml" {
				imgPath, err := RenderDiagramViaKroki(lang, code)
				if err != nil {
					log.Printf("warning: diagram render failed (%s): %v — falling back to code block", lang, err)
					// Drop every pending diagram directive so it cannot leak
					// onto a later, unrelated diagram.
					state.nextDiagramLandscape = false
					state.nextDiagramCaption = ""
					state.nextCodeIsAppendix = false
					state.nextAppendixLandscape = false
					state.nextAppendixRotated = false
					// Fall through: render as plain code block below.
				} else {
					switch {
					case state.nextDiagramLandscape:
						r.RenderLandscapeDiagram(imgPath, state.nextDiagramCaption)
						state.nextDiagramLandscape = false
						state.nextDiagramCaption = ""
					case state.nextCodeIsAppendix:
						switch {
						case state.nextAppendixLandscape:
							r.AddLandscapeAppendix(state.appendixTitle + " | " + imgPath)
							state.nextAppendixLandscape = false
						case state.nextAppendixRotated:
							r.AddRotatedUMLAppendix(state.appendixTitle, imgPath)
							state.nextAppendixRotated = false
						default:
							r.AddAppendix(state.appendixTitle + " | " + imgPath)
						}
						state.nextCodeIsAppendix = false
					default:
						caption := state.nextDiagramCaption
						if caption == "" {
							caption = "Diagram (" + lang + ")"
						}
						state.nextDiagramCaption = ""
						r.RenderImage(imgPath, caption)
					}
					return ast.WalkSkipChildren, nil
				}
			}

			if state.nextCodeBlockAppendix {
				r.AddCodeAppendix(state.codeBlockAppendixTitle, lang, code)
				state.nextCodeBlockAppendix = false
				state.codeBlockAppendixTitle = ""
				return ast.WalkSkipChildren, nil
			}
			// Render as a numbered code block (gutter + monospace body).
			r.RenderCodeBlock(lang, code)
			return ast.WalkSkipChildren, nil

		// ── Images ────────────────────────────────────────────────────────────
		// Reached for image-only paragraphs (see the Paragraph case).
		case *ast.Image:
			if !entering {
				return ast.WalkContinue, nil
			}
			imgPath := string(node.Destination)
			caption := extractPlainText(node, content)
			if caption == "" {
				// No alt text: fall back to the optional link title.
				caption = string(node.Title)
			}
			r.RenderImage(imgPath, caption)
			return ast.WalkSkipChildren, nil

		// ── Block quotes (alternative @Quelle syntax) ─────────────────────────
		case *ast.Blockquote:
			if !entering {
				return ast.WalkContinue, nil
			}
			if para, ok := node.FirstChild().(*ast.Paragraph); ok {
				pText := extractPlainText(para, content)
				for _, prefix := range []string{"Quelle:", "Source:"} {
					if strings.HasPrefix(pText, prefix) {
						// Trim like the @Quelle: directive does, so both
						// syntaxes hand the same string to AddSource.
						r.AddSource(strings.TrimSpace(strings.TrimPrefix(pText, prefix)))
						return ast.WalkSkipChildren, nil
					}
				}
			}
			// Ordinary quote: let its children render as normal blocks.
			return ast.WalkContinue, nil

		// ── Lists ─────────────────────────────────────────────────────────────
		case *ast.List:
			if entering {
				state.listStack = append(state.listStack, listFrame{ordered: node.IsOrdered()})
				return ast.WalkContinue, nil
			}
			if len(state.listStack) > 0 {
				state.listStack = state.listStack[:len(state.listStack)-1]
			}
			// Add breathing room after the outermost list so the next
			// paragraph is not glued to the last bullet.
			if len(state.listStack) == 0 {
				r.pdf.Ln(dinSpaceAfterList)
			}
			return ast.WalkContinue, nil

		case *ast.ListItem:
			if !entering {
				return ast.WalkContinue, nil
			}
			// An item without an enclosing list is not expected from the
			// parser; use a throw-away frame instead of indexing an empty
			// stack.
			var orphan listFrame
			frame := &orphan
			depth := len(state.listStack) - 1
			if depth >= 0 {
				frame = &state.listStack[depth]
			} else {
				depth = 0
			}
			frame.index++

			// Only a leading Paragraph/TextBlock is the item's own text. Any
			// other first child (e.g. a nested list) is rendered by the
			// continuing walk and must not be flattened into this item.
			var spans []InlineSpan
			switch node.FirstChild().(type) {
			case *ast.Paragraph, *ast.TextBlock:
				spans = extractInlineSpansFromListItem(node, content)
			}
			r.RenderListItem(spans, frame.ordered, frame.index, depth)
			// Continue so nested lists and further blocks are visited.
			return ast.WalkContinue, nil

		// ── Tables ────────────────────────────────────────────────────────────
		case *extast.Table:
			if !entering {
				return ast.WalkContinue, nil
			}
			// tableData[row][cell] holds the inline spans of one cell.
			var tableData [][][]InlineSpan
			for row := node.FirstChild(); row != nil; row = row.NextSibling() {
				var rowData [][]InlineSpan
				for cell := row.FirstChild(); cell != nil; cell = cell.NextSibling() {
					rowData = append(rowData, extractInlineSpans(cell, content))
				}
				tableData = append(tableData, rowData)
			}

			caption := state.nextTableCaption
			state.nextTableCaption = ""
			switch {
			case state.nextTableIsAppendix && state.nextTableIsLandscape:
				r.AddTableAppendixLandscape(caption, tableData)
				state.nextTableIsLandscape = false
				state.nextTableIsAppendix = false
			case state.nextTableIsAppendix:
				r.AddTableAppendix(caption, tableData)
				state.nextTableIsAppendix = false
			default:
				r.RenderTable(tableData, caption)
			}
			return ast.WalkSkipChildren, nil
		}
		return ast.WalkContinue, nil
	})
}

// renderParagraphLike handles a Paragraph or TextBlock: it is consumed as a
// directive if handleDirectives recognises it, otherwise rendered as text.
func renderParagraphLike(n ast.Node, content []byte, state *parserState, r *IHKRenderer) {
	if handleDirectives(extractPlainText(n, content), state, r) {
		return
	}
	r.RenderParagraphSpans(extractInlineSpans(n, content))
}

// isListItemLead reports whether n is the first child of a list item, i.e.
// the block that the ListItem case already rendered as the item text.
func isListItemLead(n ast.Node) bool {
	item, ok := n.Parent().(*ast.ListItem)
	return ok && item.FirstChild() == n
}

// isImageOnlyParagraph reports whether p contains exactly one inline node and
// that node is an image.
func isImageOnlyParagraph(p *ast.Paragraph) bool {
	if p.ChildCount() != 1 {
		return false
	}
	_, ok := p.FirstChild().(*ast.Image)
	return ok
}
// isFrontMatterSection reports whether title, after trimming surrounding
// whitespace, is exactly one of the level-1 heading names that stay in the
// front matter. The comparison is case-sensitive; every other title
// (including numbered sections like "1. Problem Statement") yields false.
func isFrontMatterSection(title string) bool {
	name := strings.TrimSpace(title)
	return name == "Vorwort" ||
		name == "Einleitung" ||
		name == "Abkürzungsverzeichnis"
}
// handleDirectives scans the lines of a paragraph's plain text for
// @-prefixed control directives of the form "@Name: argument".
//
// Each line is trimmed, then split at its first colon: everything up to and
// including the colon is the directive keyword, the trimmed remainder is its
// argument. Recognised directives either call the renderer immediately
// (@Quelle, @Anhang, @AnhangBildQuer, @AnhangBildGedreht) or arm flags and
// captions in state for the next diagram, code block or table.
//
// It returns true as soon as at least one line was a recognised directive;
// the caller then treats the whole paragraph as consumed.
func handleDirectives(text string, state *parserState, r *IHKRenderer) bool {
	consumed := false
	for _, rawLine := range strings.Split(text, "\n") {
		trimmed := strings.TrimSpace(rawLine)
		colon := strings.Index(trimmed, ":")
		if colon < 0 {
			continue
		}
		// No directive name contains a colon, so matching the text up to the
		// first colon is the same as a prefix test on the whole line.
		keyword := trimmed[:colon+1]
		arg := strings.TrimSpace(trimmed[colon+1:])

		recognised := true
		switch keyword {
		case "@Quelle:":
			r.AddSource(arg)

		case "@Anhang:":
			r.AddAppendix(arg)
		case "@AnhangBildQuer:":
			r.AddLandscapeAppendix(arg)
		case "@AnhangBildGedreht:":
			r.AddRotatedImageAppendix(arg)

		case "@AnhangCode:":
			state.nextCodeBlockAppendix = true
			state.codeBlockAppendixTitle = arg

		case "@AnhangUML:":
			state.nextCodeIsAppendix = true
			state.appendixTitle = arg
		case "@AnhangUMLQuer:":
			state.nextCodeIsAppendix = true
			state.nextAppendixLandscape = true
			state.appendixTitle = arg
		case "@AnhangUMLGedreht:":
			// Portrait page, image rotated 90° CCW — long axis runs
			// top-to-bottom.
			state.nextCodeIsAppendix = true
			state.nextAppendixRotated = true
			state.appendixTitle = arg

		case "@Tabelle:":
			state.nextTableCaption = arg
		case "@TabelleAnhang:":
			state.nextTableIsAppendix = true
			state.nextTableCaption = arg
		case "@TabelleAnhangQuer:":
			state.nextTableIsAppendix = true
			state.nextTableIsLandscape = true
			state.nextTableCaption = arg

		case "@Diagramm:":
			// Portrait inline diagram — rendered at the current position via
			// RenderImage.
			state.nextDiagramCaption = arg
		case "@DiagrammQuer:":
			state.nextDiagramLandscape = true
			state.nextDiagramCaption = arg

		default:
			recognised = false
		}
		if recognised {
			consumed = true
		}
	}
	return consumed
}
// extractPlainText flattens the subtree below n into a plain string.
//
// Text leaves contribute their source segment, followed by a newline when the
// leaf ends in a soft or hard line break; every other child contributes
// whatever a recursive call yields for it. If nothing was collected this way,
// the node's own Text(content) is returned instead.
func extractPlainText(n ast.Node, content []byte) string {
	var acc strings.Builder
	child := n.FirstChild()
	for child != nil {
		switch leaf := child.(type) {
		case *ast.Text:
			acc.Write(leaf.Segment.Value(content))
			if leaf.SoftLineBreak() || leaf.HardLineBreak() {
				acc.WriteByte('\n')
			}
		default:
			acc.WriteString(extractPlainText(child, content))
		}
		child = child.NextSibling()
	}
	if acc.Len() > 0 {
		return acc.String()
	}
	return string(n.Text(content))
}
// extractInlineSpans collects the inline content below n as InlineSpan
// values, starting with no bold, italic or code formatting active. The
// result is nil when walkInline appends nothing.
func extractInlineSpans(n ast.Node, content []byte) []InlineSpan {
	var collected []InlineSpan
	const plain = false // initial state for all three formatting flags
	walkInline(n, content, plain, plain, plain, &collected)
	return collected
}
// extractInlineSpansFromListItem returns the inline spans of the first child
// of a list item, whatever its node type: a Paragraph in loose lists, a
// TextBlock in tight lists. Later children are not looked at. An item without
// children yields nil.
func extractInlineSpansFromListItem(item *ast.ListItem, content []byte) []InlineSpan {
	lead := item.FirstChild()
	if lead == nil {
		return nil
	}
	return extractInlineSpans(lead, content)
}
// walkInline appends one InlineSpan per inline leaf below n to *out, in
// document order.
//
// bold, italic and code describe the formatting inherited from the ancestors.
// Emphasis nodes of level 2 switch bold on and of level 1 switch italic on
// for their subtree. Code spans are always emitted with Code set, even when
// empty. Text leaves are emitted only when non-empty; a soft line break adds
// a trailing space and a hard line break a trailing newline.
func walkInline(n ast.Node, content []byte, bold, italic, code bool, out *[]InlineSpan) {
	emit := func(s string, asCode bool) {
		*out = append(*out, InlineSpan{Text: s, Bold: bold, Italic: italic, Code: asCode})
	}

	for cur := n.FirstChild(); cur != nil; cur = cur.NextSibling() {
		switch el := cur.(type) {
		case *ast.CodeSpan:
			emit(string(el.Text(content)), true)

		case *ast.Emphasis:
			walkInline(el, content, bold || el.Level == 2, italic || el.Level == 1, code, out)

		case *ast.Text:
			piece := string(el.Segment.Value(content))
			switch {
			case el.SoftLineBreak():
				piece += " "
			case el.HardLineBreak():
				piece += "\n"
			}
			if piece == "" {
				continue
			}
			emit(piece, code)

		default:
			// Any other container is transparent. This includes links: only
			// the link text is rendered, the href is not shown (no footnote
			// support yet).
			walkInline(cur, content, bold, italic, code, out)
		}
	}
}
// extractCodeBlock concatenates the source segments of all lines of a fenced
// code block, in order, and returns them as one string. Nothing is added
// between the segments.
func extractCodeBlock(n *ast.FencedCodeBlock, content []byte) string {
	segments := n.Lines()
	var out bytes.Buffer
	for idx, total := 0, segments.Len(); idx < total; idx++ {
		seg := segments.At(idx)
		out.Write(seg.Value(content))
	}
	return out.String()
}