As far as I could find (and judging by the lack of responses), there is no way to do this with Lua filters, so I decided to solve the issue with Python and mark this as solved.
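The workaround I found is to pre-process the Markdown file with a Python script (expanding the `::include` directives and replacing `[[_TOC_]]` with a generated table of contents) and then convert the result with pandoc via pypandoc.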
The code I used is provided below. Maybe someone will find a way to do something like this within pandoc itself, but for now this effectively solves my problem :)
import os
import re
import pypandoc
# Pre-processes a GitLab-flavored Markdown file such that
# - ::include{file=...} directives are replaced by the contents of the referenced file
# - [[_TOC_]] is replaced by a table of contents generated from the headings
# and then converts the result with pandoc.
# Requires pandoc!!!
# See https://pypi.org/project/pypandoc/
pandoc_location = r'<pandoc_folder>\pandoc.exe'
input_file = r'<path_to_your_file.md>'
to_format = 'html5'

print(f'Setting pandoc location to {pandoc_location}')
# setdefault keeps any PYPANDOC_PANDOC value that is already present in the
# environment; pandoc_location is only used as a fallback.
os.environ.setdefault('PYPANDOC_PANDOC', pandoc_location)

# The temporary (pre-processed) Markdown file is placed next to this script.
current_path = __file__
current_folder, current_filename = os.path.split(current_path)
tmp_file = os.path.join(current_folder, 'tmp.md')
print(f'Using tmp. file {tmp_file}')

# Read explicitly as UTF-8: without an encoding Python falls back to the
# locale codec (e.g. cp1252 on Windows), which garbles non-ASCII characters.
with open(input_file, 'r', encoding='utf-8') as f:
    input_md = f.read()
print(f'Read {input_file}. Length={len(input_md)}')

# From here on input_file holds only the file name; input_folder is the base
# directory against which relative include paths are resolved.
input_folder, input_file = os.path.split(input_file)
input_base, input_ext = os.path.splitext(input_file)

# A directive must start at the beginning of a line. [^}]+ stops at the first
# closing brace so that trailing text containing "}" is not swallowed into
# the file name.
include_pattern = re.compile(r'::include\{file=([^}]+)\}')
all_matches = [include_pattern.match(e) for e in input_md.splitlines()]
all_matches = [e for e in all_matches if e is not None]
for include_match in all_matches:
    include_path = include_match.group(1)
    abs_path = os.path.abspath(os.path.join(input_folder, include_path))
    print(f'Including {abs_path}')
    try:
        with open(abs_path, 'r', encoding='utf-8') as f:
            include_file_content = f.read()
    except (OSError, ValueError) as e:
        # Best effort: an unreadable or undecodable include is reported and
        # the directive is left in the document unchanged.
        print(f'Could not include file: {e}')
    else:
        # Replaces every occurrence of this exact directive text.
        input_md = input_md.replace(include_match.group(0), include_file_content)
# Process ToC
def slugify(text):
    """Build a GitHub-style anchor slug from heading text.

    The text is trimmed and lower-cased, every character that is not a word
    character, whitespace or a hyphen is dropped, and each remaining run of
    whitespace is replaced by a single hyphen.
    """
    normalized = text.strip().lower()
    without_punctuation = re.sub(r'[^\w\s-]', '', normalized)
    slug = re.sub(r'[\s]+', '-', without_punctuation)
    return slug
def strip_markdown_links(text):
    """Replace every inline markdown link ``[label](target)`` by its label.

    Links with an empty label or an empty target are left untouched.
    """
    link_pattern = re.compile(r'\[([^\]]+)\]\([^)]+\)')
    return link_pattern.sub(lambda link: link.group(1), text)
def extract_headings(markdown):
    """Collect the ATX headings (``#`` .. ``######``) of a markdown document.

    Lines inside fenced code blocks are skipped, so that e.g. ``# comment``
    lines in a code sample are not mistaken for headings. Both backtick
    (```) and tilde (~~~) fences are recognised; a block is only closed by
    the same kind of fence that opened it.

    Returns a list of ``(level, text, slug)`` tuples in document order, where
    ``level`` is the number of ``#`` characters, ``text`` is the heading with
    markdown links reduced to their visible text, and ``slug`` is the anchor
    produced by ``slugify`` for that text.
    """
    headings = []
    open_fence = None  # '```' or '~~~' while inside a fenced block, else None
    for line in markdown.splitlines():
        stripped = line.strip()
        if open_fence is None:
            if stripped.startswith(('```', '~~~')):
                open_fence = stripped[:3]
                continue
        else:
            # Inside a code block: only look for the matching closing fence.
            if stripped.startswith(open_fence):
                open_fence = None
            continue
        match = re.match(r'^(#{1,6})\s+(.*)', line)
        if match:
            level = len(match.group(1))
            raw_text = match.group(2).strip()
            clean_text = strip_markdown_links(raw_text)
            slug = slugify(clean_text)
            headings.append((level, clean_text, slug))
    return headings
def generate_toc(headings):
    """Render extracted headings as a nested markdown bullet list.

    ``headings`` is an iterable of ``(level, text, slug)`` tuples. Each entry
    becomes ``- [text](#slug)``, indented by two spaces per level below the
    shallowest heading level present. Two spaces line a child bullet up with
    the content of its parent ``- `` bullet, which is what markdown requires
    for the item to be nested; starting the shallowest level at column 0
    keeps the list from being read as an indented code block when the
    document has no level-1 heading.

    Returns the list as a single string (empty when there are no headings).
    """
    headings = list(headings)
    if not headings:
        return ''
    top_level = min(level for level, _, _ in headings)
    toc_lines = []
    for level, text, slug in headings:
        indent = '  ' * (level - top_level)
        toc_lines.append(f"{indent}- [{text}](#{slug})")
    return '\n'.join(toc_lines)
# Replace Gitlab's [[_TOC_]] with the actual ToC
print('Generating ToC from [[_TOC_]]')
headings_input = extract_headings(input_md)
toc = generate_toc(headings_input)

# pandoc's automatic heading identifiers never start with a digit or
# punctuation: everything before the first letter is dropped, so the heading
# id for "3gppsa2" becomes "gppsa2" in the HTML output. Strip the same leading
# characters from every ToC anchor (only when a letter follows) so the links
# match the generated ids.
# Please note that this "hack" results in the navigation of tmp.md being
# broken. But the output HTML is OK
toc = re.sub(
    r'\]\(#[\d_-]+(?=[^\W\d_][\w-]*\)$)',
    '](#',
    toc,
    flags=re.MULTILINE,
)
input_md = input_md.replace('[[_TOC_]]', toc)

# pandoc reads its input as UTF-8, so the temporary file must not be written
# with the platform's locale encoding.
with open(tmp_file, 'w', encoding='utf-8') as f:
    f.write(input_md)
print(f'Wrote {tmp_file}')

print(f'Converting {tmp_file} to {to_format}')
# CSS from https://jez.io/pandoc-markdown-css-theme/#usage
# https://github.com/jez/pandoc-markdown-css-theme
# Fixed title with https://stackoverflow.com/questions/63928077/how-can-i-add-header-metadata-without-adding-the-h1
# Using markdown-smart to fix wrongly-displayed single-quotes
output = pypandoc.convert_file(
    # Convert the file that was just written; a bare 'tmp.md' would be looked
    # up in the current working directory instead of next to this script.
    source_file=tmp_file,
    to=to_format,
    extra_args=[
        '--from=markdown-smart',
        '--standalone',
        '--embed-resources=true',
        '--css=theme.css',
        '--html-q-tags=true',
        f'--metadata=title={input_base}',
        '--variable=title=',
    ],
)

# HTML flavours share the ".html" extension; any other format name is used
# as the extension directly.
output_ext = 'html' if to_format in ('html', 'html5') else to_format
output_file = os.path.join(input_folder, f'{input_base}.{output_ext}')
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output)
print(f'PyPandoc output saved to: {output_file}')