MarkdownToIHKChemnits/markdown_parser.go

package main

import (
	"bytes"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/yuin/goldmark"
	meta "github.com/yuin/goldmark-meta"
	"github.com/yuin/goldmark/ast"
	"github.com/yuin/goldmark/extension"
	extast "github.com/yuin/goldmark/extension/ast"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/text"
	"gopkg.in/yaml.v3"
)

// ParseMarkdown reads the Markdown file at mdPath and parses it with Goldmark
// (meta and table extensions enabled).
//
// It returns the Config decoded from the YAML front matter, the root node of
// the parsed AST, and the raw file bytes that the AST's text segments refer
// to. A file without front matter yields a zero-value Config. On error the
// other return values are zero/nil.
func ParseMarkdown(mdPath string) (Config, ast.Node, []byte, error) {
	content, err := os.ReadFile(mdPath)
	if err != nil {
		return Config{}, nil, nil, err
	}

	md := goldmark.New(
		goldmark.WithExtensions(meta.Meta, extension.Table),
	)

	ctx := parser.NewContext()
	doc := md.Parser().Parse(text.NewReader(content), parser.WithContext(ctx))

	// TryGet, unlike Get, surfaces a front-matter block that failed to parse
	// instead of silently handing back no metadata at all.
	metaData, err := meta.TryGet(ctx)
	if err != nil {
		return Config{}, nil, nil, fmt.Errorf("YAML front matter: %w", err)
	}

	// The meta extension yields a generic map; round-trip it through YAML so
	// it can be decoded into the typed Config.
	raw, err := yaml.Marshal(metaData)
	if err != nil {
		return Config{}, nil, nil, fmt.Errorf("YAML front matter: %w", err)
	}

	var config Config
	if err := yaml.Unmarshal(raw, &config); err != nil {
		return Config{}, nil, nil, fmt.Errorf("YAML front matter: %w", err)
	}

	return config, doc, content, nil
}

// parserState carries the "applies to the next block" flags that directive
// paragraphs set and later blocks consume while the AST is being walked, plus
// the bookkeeping for the lists currently open.
type parserState struct {
	// nextCodeIsAppendix is set by @AnhangUML:, @AnhangUMLQuer: and
	// @AnhangUMLGedreht: — the next rendered diagram goes to the appendix.
	nextCodeIsAppendix bool
	// nextAppendixLandscape is set by @AnhangUMLQuer: — the diagram appendix
	// is added as a landscape appendix.
	nextAppendixLandscape bool
	// nextAppendixRotated is set by @AnhangUMLGedreht: — portrait page with
	// the image rotated 90° counter-clockwise.
	nextAppendixRotated bool
	// appendixTitle is the title given with the @AnhangUML* directive.
	appendixTitle string
	// nextCodeBlockAppendix is set by @AnhangCode: — the next code block that
	// is not rendered as a diagram goes to the appendix.
	nextCodeBlockAppendix bool
	// codeBlockAppendixTitle is the title given with @AnhangCode:.
	codeBlockAppendixTitle string
	// nextTableCaption is the caption given with @Tabelle:, @TabelleAnhang:
	// or @TabelleAnhangQuer:.
	nextTableCaption string
	// nextTableIsAppendix is set by @TabelleAnhang: and @TabelleAnhangQuer:.
	nextTableIsAppendix bool
	// nextTableIsLandscape is set by @TabelleAnhangQuer:.
	nextTableIsLandscape bool
	// nextDiagramLandscape is set by @DiagrammQuer:.
	nextDiagramLandscape bool
	// nextDiagramCaption is the caption given with @DiagrammQuer: or
	// @Diagramm:.
	nextDiagramCaption string
	// listStack holds one frame per list that is currently open, outermost
	// first.
	listStack []listFrame
}

// listFrame is the bookkeeping kept for a single list nesting level while the
// list is being rendered.
type listFrame struct {
	// ordered reports whether the list at this level is an ordered list.
	ordered bool
	// index is the number of the item most recently started at this level;
	// it is zero until the first item has been seen.
	index int
}

// RenderAST walks the Goldmark AST rooted at doc and dispatches each block
// node to the matching IHKRenderer method. content must be the source bytes
// the AST was parsed from. The returned error is whatever ast.Walk reports.
//
// Front-matter detection: rendering begins with StartFrontMatter(). While
// r.numType is still NumRoman, the first level-1 heading whose title is not
// accepted by isFrontMatterSection triggers StartMainBody(); level-1 headings
// that isFrontMatterSection accepts stay in the front matter.
//
// Directive paragraphs (see handleDirectives) are consumed without being
// rendered; the flags they set in parserState are applied to the next
// diagram, code block or table and then cleared.
//
// Lists: each list item is rendered from its leading child via
// RenderListItem. The walk then descends into the item so that nested lists
// (with an increased depth) and any further blocks inside the item are
// rendered as well.
//
// Images: a paragraph whose only child is an image is rendered through
// RenderImage. Images mixed with other inline content contribute only their
// alt text to the paragraph.
func RenderAST(doc ast.Node, content []byte, r *IHKRenderer) error {
	r.StartFrontMatter()
	state := &parserState{}

	return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
		// The leading child of a list item has already been emitted as the
		// item's own text by the ListItem case. Ignore it on both visits so
		// it is not rendered twice and, should it be a List, its exit does
		// not pop a frame that was never pushed.
		if leadsListItem(n) {
			if entering {
				return ast.WalkSkipChildren, nil
			}
			return ast.WalkContinue, nil
		}

		switch node := n.(type) {

		// ── Headings ──────────────────────────────────────────────────────────
		case *ast.Heading:
			if !entering {
				return ast.WalkContinue, nil
			}
			title := extractPlainText(node, content)
			if node.Level == 1 && r.numType == NumRoman {
				if !isFrontMatterSection(title) {
					r.StartMainBody()
				}
			}
			r.RenderHeader(node.Level, title)
			return ast.WalkSkipChildren, nil

		// ── Paragraphs ────────────────────────────────────────────────────────
		case *ast.Paragraph:
			if !entering {
				return ast.WalkContinue, nil
			}

			// Images are inline nodes, so they only ever appear below a
			// paragraph. Descend into an image-only paragraph so the Image
			// case below can render it; skipping the children here would
			// make that case unreachable.
			if node.ChildCount() == 1 {
				if _, ok := node.FirstChild().(*ast.Image); ok {
					return ast.WalkContinue, nil
				}
			}

			plain := extractPlainText(node, content)

			// Special directives embedded in paragraphs
			if handled := handleDirectives(plain, state, r); handled {
				return ast.WalkSkipChildren, nil
			}

			spans := extractInlineSpans(node, content)
			r.RenderParagraphSpans(spans)
			return ast.WalkSkipChildren, nil

		// ── Fenced code blocks ────────────────────────────────────────────────
		case *ast.FencedCodeBlock:
			if !entering {
				return ast.WalkContinue, nil
			}
			lang := string(node.Language(content))
			code := extractCodeBlock(node, content)

			if lang == "mermaid" || lang == "plantuml" || lang == "puml" {
				imgPath, err := RenderDiagramViaKroki(lang, code)
				if err == nil {
					switch {
					case state.nextDiagramLandscape:
						r.RenderLandscapeDiagram(imgPath, state.nextDiagramCaption)
						state.nextDiagramLandscape = false
						state.nextDiagramCaption = ""
					case state.nextCodeIsAppendix:
						switch {
						case state.nextAppendixLandscape:
							r.AddLandscapeAppendix(state.appendixTitle + " | " + imgPath)
							state.nextAppendixLandscape = false
						case state.nextAppendixRotated:
							r.AddRotatedUMLAppendix(state.appendixTitle, imgPath)
							state.nextAppendixRotated = false
						default:
							r.AddAppendix(state.appendixTitle + " | " + imgPath)
						}
						state.nextCodeIsAppendix = false
					default:
						caption := state.nextDiagramCaption
						if caption == "" {
							caption = "Diagram (" + lang + ")"
						}
						state.nextDiagramCaption = ""
						r.RenderImage(imgPath, caption)
					}
					return ast.WalkSkipChildren, nil
				}

				log.Printf("warning: diagram render failed (%s): %v — falling back to code block", lang, err)
				// Drop the pending diagram directives so they do not leak
				// onto a later, unrelated diagram.
				state.nextDiagramLandscape = false
				state.nextDiagramCaption = ""
				state.nextCodeIsAppendix = false
				state.nextAppendixLandscape = false
				state.nextAppendixRotated = false
				// Fall through: render as plain code block on error
			}
			if state.nextCodeBlockAppendix {
				r.AddCodeAppendix(state.codeBlockAppendixTitle, lang, code)
				state.nextCodeBlockAppendix = false
				state.codeBlockAppendixTitle = ""
				return ast.WalkSkipChildren, nil
			}
			// Render as a numbered code block (gutter + monospace body).
			r.RenderCodeBlock(lang, code)
			return ast.WalkSkipChildren, nil

		// ── Images ────────────────────────────────────────────────────────────
		// Reached only for image-only paragraphs (see the Paragraph case).
		case *ast.Image:
			if !entering {
				return ast.WalkContinue, nil
			}
			imgPath := string(node.Destination)
			caption := extractPlainText(node, content)
			if caption == "" {
				caption = string(node.Title)
			}
			r.RenderImage(imgPath, caption)
			return ast.WalkSkipChildren, nil

		// ── Block quotes (alternative @Quelle syntax) ─────────────────────────
		case *ast.Blockquote:
			if !entering {
				return ast.WalkContinue, nil
			}
			if para, ok := node.FirstChild().(*ast.Paragraph); ok {
				pText := extractPlainText(para, content)
				for _, prefix := range []string{"Quelle:", "Source:"} {
					if strings.HasPrefix(pText, prefix) {
						// Strip exactly the matched prefix and trim the
						// remainder, matching how @Quelle: is handled.
						r.AddSource(strings.TrimSpace(strings.TrimPrefix(pText, prefix)))
						return ast.WalkSkipChildren, nil
					}
				}
			}
			// Any other block quote: its children are rendered as ordinary
			// blocks.
			return ast.WalkContinue, nil

		// ── Lists ─────────────────────────────────────────────────────────────
		case *ast.List:
			if entering {
				state.listStack = append(state.listStack, listFrame{ordered: node.IsOrdered()})
			} else {
				if len(state.listStack) > 0 {
					state.listStack = state.listStack[:len(state.listStack)-1]
				}
				// Add breathing room after the outermost list so the next
				// paragraph is not glued to the last bullet.
				if len(state.listStack) == 0 {
					r.pdf.Ln(dinSpaceAfterList)
				}
			}
			return ast.WalkContinue, nil

		case *ast.ListItem:
			if !entering {
				return ast.WalkContinue, nil
			}
			spans := extractInlineSpansFromListItem(node, content)
			if len(state.listStack) == 0 {
				// Defensive: an item without an enclosing list frame is not
				// expected from the parser, but indexing an empty stack
				// would panic. Render it as a top-level bullet.
				r.RenderListItem(spans, false, 1, 0)
				return ast.WalkContinue, nil
			}
			depth := len(state.listStack) - 1
			frame := &state.listStack[depth]
			frame.index++
			r.RenderListItem(spans, frame.ordered, frame.index, depth)
			// Descend so nested lists and further blocks in this item are
			// rendered; the leading child is skipped at the top of the
			// walker because it was just rendered above.
			return ast.WalkContinue, nil

		// ── Tables ────────────────────────────────────────────────────────────
		case *extast.Table:
			if !entering {
				return ast.WalkContinue, nil
			}
			var tableData [][][]InlineSpan
			for row := node.FirstChild(); row != nil; row = row.NextSibling() {
				var rowData [][]InlineSpan
				for cell := row.FirstChild(); cell != nil; cell = cell.NextSibling() {
					rowData = append(rowData, extractInlineSpans(cell, content))
				}
				tableData = append(tableData, rowData)
			}
			if state.nextTableIsAppendix {
				if state.nextTableIsLandscape {
					r.AddTableAppendixLandscape(state.nextTableCaption, tableData)
					state.nextTableIsLandscape = false
				} else {
					r.AddTableAppendix(state.nextTableCaption, tableData)
				}
				state.nextTableIsAppendix = false
				state.nextTableCaption = ""
			} else {
				caption := state.nextTableCaption
				state.nextTableCaption = ""
				r.RenderTable(tableData, caption)
			}
			return ast.WalkSkipChildren, nil
		}

		return ast.WalkContinue, nil
	})
}

// leadsListItem reports whether n is the first child of a list item, i.e. the
// node whose inline content the ListItem case renders as the item text.
func leadsListItem(n ast.Node) bool {
	if n.PreviousSibling() != nil {
		return false
	}
	_, ok := n.Parent().(*ast.ListItem)
	return ok
}

// isFrontMatterSection reports whether a level-1 heading title names one of
// the sections that stay in the Roman-numbered front matter. Surrounding
// whitespace is ignored; the comparison is otherwise exact and
// case-sensitive. Any other title, including numbered ones such as
// "1. Problem Statement", belongs to the main body.
func isFrontMatterSection(title string) bool {
	name := strings.TrimSpace(title)
	return name == "Vorwort" ||
		name == "Einleitung" ||
		name == "Abkürzungsverzeichnis"
}

// handleDirectives scans the lines of a paragraph's plain text for
// @-prefixed control lines of the form "@Name: argument" and applies each one
// it recognises. Depending on the directive it either calls the renderer
// right away (@Quelle, @Anhang, @AnhangBildQuer, @AnhangBildGedreht) or
// records a flag/title in state for the next diagram, code block or table.
//
// It returns true when at least one line was a recognised directive. The
// caller then skips the whole paragraph, so any non-directive lines sharing
// the paragraph with a directive are not rendered.
func handleDirectives(text string, state *parserState, r *IHKRenderer) bool {
	matched := false
	for _, rawLine := range strings.Split(text, "\n") {
		entry := strings.TrimSpace(rawLine)

		// Every directive name is the text before the first colon; a line
		// without a colon cannot be a directive.
		colon := strings.IndexByte(entry, ':')
		if colon < 0 {
			continue
		}
		arg := strings.TrimSpace(entry[colon+1:])

		recognised := true
		switch entry[:colon] {
		case "@Quelle":
			r.AddSource(arg)
		case "@AnhangCode":
			state.nextCodeBlockAppendix = true
			state.codeBlockAppendixTitle = arg
		case "@Anhang":
			r.AddAppendix(arg)
		case "@AnhangBildQuer":
			r.AddLandscapeAppendix(arg)
		case "@AnhangUMLQuer":
			state.nextCodeIsAppendix = true
			state.nextAppendixLandscape = true
			state.appendixTitle = arg
		case "@AnhangUMLGedreht":
			// Portrait page, image rotated 90° CCW — long axis runs top-to-bottom.
			state.nextCodeIsAppendix = true
			state.nextAppendixRotated = true
			state.appendixTitle = arg
		case "@AnhangBildGedreht":
			r.AddRotatedImageAppendix(arg)
		case "@AnhangUML":
			state.nextCodeIsAppendix = true
			state.appendixTitle = arg
		case "@TabelleAnhangQuer":
			state.nextTableIsAppendix = true
			state.nextTableIsLandscape = true
			state.nextTableCaption = arg
		case "@TabelleAnhang":
			state.nextTableIsAppendix = true
			state.nextTableCaption = arg
		case "@Tabelle":
			state.nextTableCaption = arg
		case "@DiagrammQuer":
			state.nextDiagramLandscape = true
			state.nextDiagramCaption = arg
		case "@Diagramm":
			// Portrait inline diagram — rendered at current position via RenderImage.
			state.nextDiagramCaption = arg
		default:
			recognised = false
		}
		if recognised {
			matched = true
		}
	}
	return matched
}

// extractPlainText flattens the subtree below n into a plain string. Text
// nodes contribute their source segment, followed by a newline when the node
// carries a soft or hard line break; every other child is flattened
// recursively. If that produces nothing, the node's own Text(content) is
// returned instead.
func extractPlainText(n ast.Node, content []byte) string {
	var out strings.Builder
	child := n.FirstChild()
	for child != nil {
		switch leaf := child.(type) {
		case *ast.Text:
			out.Write(leaf.Segment.Value(content))
			if leaf.HardLineBreak() || leaf.SoftLineBreak() {
				out.WriteByte('\n')
			}
		default:
			out.WriteString(extractPlainText(child, content))
		}
		child = child.NextSibling()
	}
	if out.Len() > 0 {
		return out.String()
	}
	return string(n.Text(content))
}

// extractInlineSpans collects the inline content below n as InlineSpan
// values, starting with bold, italic and code all switched off. The result is
// nil when the subtree yields no spans.
func extractInlineSpans(n ast.Node, content []byte) []InlineSpan {
	collected := new([]InlineSpan)
	walkInline(n, content, false, false, false, collected)
	return *collected
}

// extractInlineSpansFromListItem returns the inline spans of the list item's
// first child, whatever its node type: a Paragraph in loose lists, a
// TextBlock in tight lists. An item without children yields nil.
func extractInlineSpansFromListItem(item *ast.ListItem, content []byte) []InlineSpan {
	first := item.FirstChild()
	if first == nil {
		return nil
	}
	return extractInlineSpans(first, content)
}

// walkInline appends one InlineSpan to *out for every piece of inline text
// below n, in document order.
//
// bold, italic and code are the formatting flags inherited from the
// ancestors. An Emphasis node of level 2 turns bold on for its subtree and
// one of level 1 turns italic on. A CodeSpan is emitted as a single span with
// Code set. A Text node gets a trailing space for a soft line break or a
// trailing newline for a hard one, and is dropped if it ends up empty. Every
// other node, links included, is transparent: only its children are visited,
// so a link contributes its text but not its destination.
func walkInline(n ast.Node, content []byte, bold, italic, code bool, out *[]InlineSpan) {
	for cur := n.FirstChild(); cur != nil; cur = cur.NextSibling() {
		switch node := cur.(type) {
		case *ast.CodeSpan:
			*out = append(*out, InlineSpan{
				Text:   string(node.Text(content)),
				Bold:   bold,
				Italic: italic,
				Code:   true,
			})

		case *ast.Emphasis:
			walkInline(node, content, bold || node.Level == 2, italic || node.Level == 1, code, out)

		case *ast.Text:
			piece := string(node.Segment.Value(content))
			switch {
			case node.SoftLineBreak():
				piece += " "
			case node.HardLineBreak():
				piece += "\n"
			}
			if piece == "" {
				continue
			}
			*out = append(*out, InlineSpan{
				Text:   piece,
				Bold:   bold,
				Italic: italic,
				Code:   code,
			})

		default:
			walkInline(cur, content, bold, italic, code, out)
		}
	}
}

// extractCodeBlock concatenates the source bytes of every line segment of the
// fenced code block n and returns them as one string.
func extractCodeBlock(n *ast.FencedCodeBlock, content []byte) string {
	segments := n.Lines()
	var src bytes.Buffer
	for i, count := 0, segments.Len(); i < count; i++ {
		seg := segments.At(i)
		src.Write(seg.Value(content))
	}
	return src.String()
}