79592291

Date: 2025-04-25 10:13:06
Score: 0.5
Natty:
Report link

As far as I could find (and judging by the lack of responses), there does not seem to be a way to do this with Lua filters, so I decided to solve the issue with Python and mark this as solved.

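The workaround I could find is to pre-process the Markdown file with Python before handing it to pandoc: replace every `::include{file=...}` directive with the content of the referenced file, replace `[[_TOC_]]` with a generated table of contents, write the result to a temporary file, and convert that file with pypandoc.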

The code I used is provided below. Maybe someone will find a way to do something like this within pandoc itself, but for now this effectively solves my problem :)

import os
import re

import pypandoc

# Pre-processes a Gitlab-flavored Markdown file such that
#   - ::include{file=...} directives are replaced by the content of the
#     referenced file
#   - [[_TOC_]] is replaced by a generated table of contents

# Requires pandoc!!!
# See https://pypi.org/project/pypandoc/

pandoc_location = r'<pandoc_folder>\pandoc.exe'
input_file = r'<path_to_your_file.md>'
to_format = 'html5'

print(f'Setting pandoc location to {pandoc_location}')
# setdefault leaves an already-set PYPANDOC_PANDOC environment variable alone.
os.environ.setdefault('PYPANDOC_PANDOC', pandoc_location)

# The pre-processed Markdown is written to tmp.md next to this script.
current_path = __file__
current_folder, current_filename = os.path.split(current_path)
tmp_file = os.path.join(current_folder, 'tmp.md')
print(f'Using tmp. file {tmp_file}')

# NOTE: the file is read with the platform default encoding; the temporary
# file is written the same way further down, so both sides stay consistent.
with open(input_file, 'r') as f:
    input_md = f.read()

print(f'Read {input_file}. Length={len(input_md)}')

# From here on input_file holds only the file name; its directory is kept in
# input_folder and is used to resolve relative include paths.
input_folder, input_file = os.path.split(input_file)
input_base, input_ext = os.path.splitext(input_file)

# An include directive must start its line. The path is everything up to the
# FIRST closing brace, so trailing text on the same line (even text that
# contains '}') is not swallowed into the file name.
include_pattern = re.compile(r'::include\{file=([^}]+)\}')
all_matches = [include_pattern.match(line) for line in input_md.splitlines()]
all_matches = [m for m in all_matches if m is not None]
for include_match in all_matches:
    include_path = include_match.group(1)
    abs_path = os.path.abspath(os.path.join(input_folder, include_path))
    print(f'Including {abs_path}')
    try:
        with open(abs_path, 'r') as f:
            include_file_content = f.read()
    except (OSError, UnicodeError) as e:
        # Best effort: a missing or unreadable include leaves the directive
        # in place and processing continues with the next one.
        print(f'Could not include file: {e}')
    else:
        input_md = input_md.replace(include_match.group(0), include_file_content)
# Process ToC
def slugify(text):
    """Turn heading text into an anchor slug.

    The text is trimmed and lower-cased, every character that is not a word
    character, whitespace or a hyphen is dropped, and each remaining run of
    whitespace becomes a single hyphen.
    """
    drop_punctuation = re.compile(r'[^\w\s-]')
    whitespace_run = re.compile(r'[\s]+')

    normalized = text.strip().lower()
    return whitespace_run.sub('-', drop_punctuation.sub('', normalized))

def strip_markdown_links(text):
    """Replace every markdown link [label](url) in *text* by its label."""
    link_pattern = re.compile(r'\[([^\]]+)\]\([^)]+\)')

    def keep_label(link):
        # Group 1 is the visible text between the square brackets.
        return link.group(1)

    return link_pattern.sub(keep_label, text)

def extract_headings(markdown):
    """Collect the ATX headings of a markdown document.

    Lines inside fenced code blocks are skipped, so a ``# comment`` in a code
    sample is not mistaken for a heading. Both backtick (```) and tilde (~~~)
    fences are recognised; a block is only closed by the same kind of fence
    that opened it.

    Args:
        markdown: The markdown document as a single string.

    Returns:
        A list of ``(level, text, slug)`` tuples in document order, where
        ``level`` is the number of leading ``#`` characters (1-6), ``text`` is
        the heading text with markdown links reduced to their label, and
        ``slug`` is ``slugify(text)``.
    """
    headings = []
    # Marker ('```' or '~~~') of the fenced block we are currently inside,
    # or None while outside of any fenced block.
    open_fence = None

    for line in markdown.splitlines():
        stripped = line.strip()

        if open_fence is not None:
            # Inside a code block: only the matching fence ends it, every
            # other line (including the other fence kind) is code.
            if stripped.startswith(open_fence):
                open_fence = None
            continue

        if stripped.startswith('```'):
            open_fence = '```'
            continue
        if stripped.startswith('~~~'):
            open_fence = '~~~'
            continue

        match = re.match(r'^(#{1,6})\s+(.*)', line)
        if match:
            level = len(match.group(1))
            raw_text = match.group(2).strip()
            clean_text = strip_markdown_links(raw_text)
            slug = slugify(clean_text)
            headings.append((level, clean_text, slug))

    return headings

def generate_toc(headings):
    """Render (level, text, slug) heading tuples as a nested markdown list.

    Each heading becomes one ``- [text](#slug)`` bullet, indented by two
    spaces per level below the first. The bullets are joined with newlines.
    """
    return '\n'.join(
        f"{'  ' * (depth - 1)}- [{title}](#{anchor})"
        for depth, title, anchor in headings
    )

# Replace Gitlab's [[_TOC_]] with the actual ToC
print('Generating ToC from [[_TOC_]]')
headings_input = extract_headings(input_md)
toc = generate_toc(headings_input)

# HACK: The HTML output seems NOT to like it if the anchor is "#3gppsa2".
# The number "3" is lost in the HTML conversion. This should remedy this.
# Please note that this "hack" results in the navigation of tmp.md being broken. But the output HTML is OK
toc = toc.replace('(#3gppsa2', '(#gppsa2')

input_md = input_md.replace('[[_TOC_]]', toc)

# Written with the platform default encoding, i.e. the same one the input
# file was read with.
with open(tmp_file, 'w') as f:
    f.write(input_md)
print(f'Wrote {tmp_file}')

print(f'Converting {tmp_file} to {to_format}')
# CSS from https://jez.io/pandoc-markdown-css-theme/#usage
# https://github.com/jez/pandoc-markdown-css-theme
# Fixed title with https://stackoverflow.com/questions/63928077/how-can-i-add-header-metadata-without-adding-the-h1
# Using markdown-smart to fix wrongly-displayed single-quotes
# Convert the file that was actually written above (tmp_file) instead of a
# bare 'tmp.md', which would be resolved against the current working
# directory and only works when the script is run from its own folder.
# NOTE: 'theme.css' is still a relative path - presumably it has to be
# reachable from the directory pandoc runs in; verify for your setup.
output = pypandoc.convert_file(
    source_file=tmp_file,
    to=to_format,
    extra_args=[
        '--from=markdown-smart',
        '--standalone',
        '--embed-resources=true',
        '--css=theme.css',
        '--html-q-tags=true',
        f'--metadata=title={input_base}',
        '--variable=title='
    ])

# Both HTML flavours share the .html extension; any other format uses its
# own name as the extension.
output_ext = 'html' if to_format in ('html', 'html5') else to_format

output_file = os.path.join(input_folder, f'{input_base}.{output_ext}')

# pandoc emits UTF-8 (and standalone HTML declares it), so write UTF-8
# explicitly instead of relying on the platform default encoding.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output)
print(f'PyPandoc output saved to: {output_file}')
Reasons:
  • Blacklisted phrase (1): stackoverflow
  • Long answer (-1):
  • Has code block (-0.5):
  • Self-answer (0.5):
  • Low reputation (0.5):
Posted by: Josep