#!/usr/bin/env python3
"""
Parse a markdown file and pretty-print to the terminal, including colors, glyphs, and text wrapping.
The supported Markdown syntax is that of the Python-Markdown package: https://github.com/Python-Markdown/markdown
"""
import os
import re
import sys
import subprocess
import markdown
import xml.etree.ElementTree as ET
from html import unescape
from textwrap import wrap
from typing import List, Dict, Union, Tuple, Optional, Iterator, TextIO
class AnsiUtils:
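    """
    Helpers for ANSI escape codes: zero-width-aware length calculation, wrapping, and coloring.
    """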
    _ansi_re: re.Pattern = re.compile('\x1b\\[[0-9]+m')  # matches only the single-parameter SGR sequences generated below
_ansi_codes: Dict[str, str] = {k: '\x1b[{}m'.format(v) for k, v in {
'reset': 0,
'bold': 1, 'dark': 2, 'italic': 3, 'underline': 4, 'reverse': 7, 'strikethrough': 9,
'grey': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 'magenta': 35, 'cyan': 36, 'white': 37,
}.items()}
@classmethod
def str_len(cls, text: Union[str, bytes]) -> int:
"""
Exclude zero-width ansi codes from string length calculation.
"""
        return len(cls._ansi_re.sub('', text)) if isinstance(text, str) else len(text)
@classmethod
def str_prefix(cls, rx_class: str, text: str) -> str:
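        """
        Return the leading run of characters in `text` that match the given regex character (class).
        """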
return re.match(r'^' + rx_class + '*', text).group(0) # type: ignore
@classmethod
def _line_continuation(cls, text: List[str], indent: int = 0) -> Iterator[str]:
"""
        Re-apply ANSI codes that are still active (for example an underline) at the start of the next
        line, inserting them after any leading space indent or prefix.
"""
active_codes: List[str] = []
for line in text:
if len(active_codes): # from previous line
indent_width: int = indent + len(cls.str_prefix(' ', line[indent:]))
line = line[:indent_width] + ''.join(active_codes) + line[indent_width:]
active_codes.clear() # will be re-found below
for match in cls._ansi_re.finditer(line):
code: str = match.group(0)
if code == cls._ansi_codes['reset']:
active_codes.clear()
else:
active_codes.append(code)
            yield (line + cls._ansi_codes['reset']) if len(active_codes) else line
@classmethod
def wrap(cls, *args, **kwargs) -> Iterator[str]:
"""
        Wrapper around textwrap.wrap() that ignores ANSI codes when computing string lengths and that
        ends/continues active codes across wrapped lines.
        Uses a hacky approach similar to the `ansiwrap` package, monkey-patching textwrap's len().
"""
from unittest.mock import patch
with patch('textwrap.len', cls.str_len):
lines: List[str] = wrap(*args, **kwargs)
yield from cls._line_continuation(lines, indent=min(len(kwargs.get("initial_indent", "")),
len(kwargs.get("subsequent_indent", ""))))
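    # Illustration for wrap() above: wrapping underlined text closes the code at the end of each broken
    # line and re-opens it on the continuation line, roughly:
    #   wrap("\x1b[4mfoo bar baz\x1b[0m", width=7)  ->  "\x1b[4mfoo bar\x1b[0m", "\x1b[4mbaz\x1b[0m"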
@classmethod
def colored(cls, text: Optional[str], modifiers: List[str], do_colors: bool = True) -> str:
"""
Surround the string with ansi codes that correspond to the given modifiers.
"""
if not text or not do_colors or not modifiers:
return text if text else ''
prefix_len: int = len(cls.str_prefix('[ \t\n\v\f\r]', text))
parts, text = [text[:prefix_len]], text[prefix_len:]
parts.extend([cls._ansi_codes[_] for _ in modifiers])
parts.extend([text, cls._ansi_codes['reset']])
return ''.join(parts)
class EtCliParser:
"""
HTML pretty-printer that only supports tags and structures that markdown could generate.
"""
def __init__(self, root: ET.Element, width: int, wrap_pre: bool) -> None:
self._root: ET.Element = root
self._width: int = width
self._wrap_pre: bool = wrap_pre
self._indent: int = 2
        self._placeholder: str = u"\uFFFD"  # replacement character for stripped or undecodable content
def _wrap(self, line: str, indent: int = 0, pre: bool = False, do_wrap: bool = True,
prefix: str = "", prefix_cont: str = "") -> Iterator[str]:
"""
        Wrap a string to the configured width and yield the resulting lines, using the given indentation
        depth and line prefixes.
"""
indentation: str = " " * indent * self._indent
line = line.replace("\t", " " * self._indent)
line = line.rstrip() if pre else re.sub(r"[ \t\n\v\f\r]+", " ", line).strip()
if not line or not do_wrap:
yield "".join([indentation, prefix, line])
return
if pre:
line_indent: str = AnsiUtils.str_prefix(" ", line)
prefix_cont += line_indent
yield from AnsiUtils.wrap(
line,
width=self._width,
initial_indent=indentation + prefix,
subsequent_indent=indentation + prefix_cont,
expand_tabs=True, tabsize=self._indent,
replace_whitespace=True, drop_whitespace=True,
fix_sentence_endings=False,
break_long_words=True, break_on_hyphens=not pre,
)
def parse(self) -> Iterator[str]:
"""
        Successively parse the root tree and yield formatted lines to be printed to the terminal.
"""
first: bool = True
for elem in self._root:
if not first:
yield ""
first = False
if elem.tag in ["h1", "h2", "h3", "h4"]:
yield from self._parse_h(elem)
elif elem.tag in ["p"]:
yield from self._parse_p(elem)
elif elem.tag in ["ul", "ol"]:
yield from self._parse_ul(elem)
elif elem.tag in ["pre"]:
yield from self._parse_code(elem)
elif elem.tag in ["hr"]:
yield from self._parse_hr(elem)
elif elem.tag in ["blockquote"]:
yield from self._parse_quote(elem)
else:
yield from self._parse_unknown(elem)
continue
if elem.tail is not None and elem.tail.strip():
yield from self._wrap(AnsiUtils.colored(elem.tail, ["reverse"]))
def _to_html(self, elem: ET.Element) -> str:
return ET.tostring(elem, encoding="unicode", method="xml")
def _to_text(self, elem: ET.Element, include_tail: bool) -> str:
return "".join(["".join(self._decode_text(_) for _ in elem.itertext()),
self._decode_text(elem.tail) if include_tail else ""])
def _decode_text(self, text: Optional[str]) -> str:
if not text:
return ""
text = markdown.util.HTML_PLACEHOLDER_RE.sub(self._placeholder, text)
text = unescape(text)
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]+', self._placeholder, text)  # control characters except \t\r\n
return text
def _single_child(self, elem: ET.Element) -> Optional[ET.Element]:
child: Optional[ET.Element] = list(elem)[0] if len(elem) == 1 else None
if child is not None and not elem.text and not child.tail:
return child # there is only one child, so we can "flatten" it in some cases
return None
def _parse_unknown(self, elem: ET.Element) -> Iterator[str]:
for line in self._to_html(elem).splitlines(keepends=False):
yield from self._wrap(AnsiUtils.colored(line, ["reverse"]), pre=True)
def _parse_p(self, elem: ET.Element) -> Iterator[str]:
child: Optional[ET.Element] = self._single_child(elem)
if child is not None and child.tag == "code":
yield from self._parse_code(child) # treat a p that only consists of code as pre, e.g. for ```-style blocks
else:
yield from self._wrap(self._parse_string_block(elem, include_tail=False))
def _parse_h(self, elem: ET.Element) -> Iterator[str]:
depth: int = int(elem.tag[-1])
yield from self._wrap(AnsiUtils.colored(self._to_text(elem, include_tail=False),
["bold", "underline"] if depth < 3 else ["underline"]))
def _parse_hr(self, _: ET.Element) -> Iterator[str]:
yield "─" * self._width
def _parse_code(self, elem: ET.Element) -> Iterator[str]:
for line in self._to_text(elem, include_tail=False).splitlines(keepends=False):
if self._wrap_pre:
yield from self._wrap(AnsiUtils.colored(line, ["dark"]), pre=True, prefix="┃ ", prefix_cont="┠ ")
else:
yield from self._wrap(AnsiUtils.colored(line, ["dark"]), pre=True, do_wrap=False)
def _parse_quote(self, elem: ET.Element, depth: int = 0) -> Iterator[str]:
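        """
        Render a blockquote; each nesting level adds one more '│' (or '>') prefix.
        """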
for child in elem:
if child.tag == "blockquote":
yield from self._parse_quote(child, depth + 1)
elif child.tag == "p":
for line in self._to_text(child, include_tail=True).splitlines(keepends=False):
if self._wrap_pre:
prefix: str = "│" * depth
yield from self._wrap(AnsiUtils.colored(line, ["dark"]), pre=True,
prefix=prefix + "│ ", prefix_cont=prefix + "├ ")
else:
prefix = (">" * (depth + 1)) + " "
yield from self._wrap(AnsiUtils.colored(line, ["dark"]), pre=True, do_wrap=False,
prefix=prefix, prefix_cont=prefix)
else:
yield from self._parse_unknown(child)
def _parse_ul(self, elem: ET.Element, depth: int = 0) -> Iterator[str]:
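        """
        Render a (possibly nested) list; ordered lists are rendered with bullets as well, and nested
        lists are emitted after their parent item's text.
        """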
for child in elem:
if child.tag in ["ul", "ol"]:
yield from self._parse_ul(child, depth + 1)
elif child.tag == "li":
uls: List[ET.Element] = [_ for _ in child if _.tag in ["ul", "ol"]]
for ul in uls:
child.remove(ul)
yield from self._wrap(self._parse_string_block(child, include_tail=True), indent=depth + 1,
prefix="• ", prefix_cont=" ")
for ul in uls:
yield from self._parse_ul(ul, depth + 1)
else:
yield from self._parse_unknown(child)
if elem.tail and elem.tail.strip():
yield from self._wrap(self._decode_text(elem.tail), indent=depth + 1,
prefix=self._placeholder + " ", prefix_cont=" ")
def _parse_string_blocks(self, elems: List[ET.Element], do_colors: bool) -> str:
return "".join(self._parse_string_block(_, include_tail=True, do_colors=do_colors) for _ in elems)
def _parse_string_block(self, elem: ET.Element, include_tail: bool, do_colors: bool = True) -> str:
"""
Helper for an inline text block, which gives a single line (to be wrapped).
        A bit messy and not cleanly structured; it does not yet support recursion into nested inline formats.
"""
if elem.tag in ["a", "img"]:
if elem.tag == "a":
title: str = self._decode_text(elem.attrib.get("title", ""))
src: str = self._decode_text(elem.attrib.get("href", "#")) or "#"
text: str = self._decode_text(elem.text) + self._parse_string_blocks(list(elem), do_colors=False)
else:
title, src, text = self._decode_text(elem.attrib.get("title")), \
self._decode_text(elem.attrib.get("src", "#")) or "#", \
self._decode_text(elem.attrib.get("alt"))
            if (title and not text) or title == text:
title, text = "", title
if src == text:
src = ""
return "".join([
AnsiUtils.colored(text, ["blue", "underline"], do_colors=do_colors),
"{}[{}{}]".format(" " if text else "", src, " – " + title if title else "") if src or title else "",
self._decode_text(elem.tail) if include_tail else "",
])
elif elem.tag in ["em", "strong", "code"]:
modifiers: Dict[str, List[str]] = {
"em": ["bold"],
"strong": ["bold", "underline"],
"code": ["bold", "dark"],
}
text = self._decode_text(elem.text) + self._parse_string_blocks(list(elem), do_colors=False)
return AnsiUtils.colored(text, modifiers[elem.tag], do_colors=do_colors) + (self._decode_text(elem.tail)
if include_tail else "")
elif elem.tag in ["br"]:
return "\n" + self._decode_text(elem.tail) if include_tail else ""
elif elem.tag in ["p", "li"]:
return "".join([
self._decode_text(elem.text),
self._parse_string_blocks(list(elem), do_colors=do_colors),
self._decode_text(elem.tail) if include_tail else "",
])
else:
return AnsiUtils.colored(self._to_html(elem), ["reverse"], do_colors=do_colors)
class MarkdownCommentPreProcessor(markdown.preprocessors.Preprocessor): # type: ignore
"""
The HTML parser preprocessor does not support HTML-style comments found in markdown sources.
So we strip them from the source beforehand (and even before whitespace/newline normalization).
"""
def run(self, lines: List[str]) -> List[str]:
source: str = "\n".join(lines)
pos: int = 0
while True:
start: int = source.find("<!--", pos)
if start < 0:
break
end: int = source.find("-->", start+4)
if end < 0:
break
source = source[:start] + source[end + 3:]
pos = start
return source.split("\n")
@classmethod
def register(cls, md: markdown.Markdown) -> None:
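        # priority 35: runs before the built-in whitespace/newline normalization and HTML block preprocessors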
md.preprocessors.register(cls(md), cls.__name__, 35)
class MarkdownCodeBlockProcessor(markdown.blockprocessors.BlockProcessor): # type: ignore
"""
Support ```-style code blocks that seem to be quite popular nowadays (the 'fenced_code' extension would be rather
strict and doesn't seem to reliably work anyway).
    Use the same <pre><code> approach so that other processors like EmptyBlockProcessor can recognize it.
    Blocks that are not properly started or ended are handled by splitting.
    Indented blocks, such as in list items, are not supported.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self._sep: str = "```"
def test(self, parent: ET.Element, block: str) -> bool:
return block.startswith(self._sep) or "\n" + self._sep in block
def _split_list(self, lines: List[str], sep: str, is_prefix: bool = False) -> Tuple[List[str], Optional[List[str]]]:
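        """
        Split `lines` at the first line that equals (or, with is_prefix, starts with) `sep`.
        Returns (lines_before, lines_from_separator_on), or (lines, None) if no separator was found.
        """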
        for i, line in enumerate(lines):
            if line == sep or (is_prefix and line.startswith(sep)):
                return lines[:i], lines[i:]
return lines, None
def run(self, parent: ET.Element, blocks: List[str]) -> bool:
if not blocks[0].startswith(self._sep): # treat blocks that are started within a block by splitting
pre_lines, post_lines = self._split_list(blocks.pop(0).splitlines(keepends=False),
self._sep, is_prefix=True)
            if post_lines is not None:  # post_lines is never None here, test() already matched the separator
blocks.insert(0, "\n".join(post_lines))
blocks.insert(0, "\n".join(pre_lines))
return False # continue with next processor on blocks[0]
lang: str = ""
code_blocks: List[str] = []
while len(blocks):
            # directly consume subsequent blocks, as we cannot tell them apart from e.g. classically indented code blocks
lines: List[str] = blocks.pop(0).splitlines(keepends=False)
if not len(code_blocks): # first block line
lang = lines.pop(0)[len(self._sep):]
if lang.endswith(self._sep): # and one line only...
code_blocks.append(lang[:-len(self._sep)])
lang = ""
if len(lines):
blocks.insert(0, "\n".join(lines))
break
lines, rest_lines = self._split_list(lines, self._sep, is_prefix=False)
code_blocks.append("\n".join(lines))
if rest_lines is not None: # treat blocks that are started within a block by splitting
blocks.insert(0, "\n".join(rest_lines[1:]))
break # end found in any case
pre: ET.Element = ET.SubElement(parent, "pre")
code: ET.Element = ET.SubElement(pre, "code")
code.text = markdown.util.AtomicString("\n\n".join(markdown.util.code_escape(block) for block in code_blocks))
if lang:
code.set("class", "language-" + lang.lower().replace(" ", "-"))
return True # re-start all processors on blocks[0]
@classmethod
def register(cls, md: markdown.Markdown) -> None:
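        # priority 75 slots this in among the built-in block processors (likely after indented code, before headers)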
md.parser.blockprocessors.register(cls(md.parser), cls.__name__, 75)
class MarkdownETProcessor(markdown.treeprocessors.Treeprocessor): # type: ignore
"""
Registered Markdown processor to capture the element tree before it gets serialized.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self._root_tag: str = 'div'
self._et: Optional[ET.Element] = None
def run(self, root: ET.Element) -> Optional[ET.Element]:
if root.tag == self._root_tag:
self._et = root
            return ET.Element(self._root_tag)  # hand back an empty tree: an optimization that also protects the captured tree from in-place modifications
return None
def _get_root(self) -> Optional[ET.Element]:
return self._et
@classmethod
def convert(cls, text: str) -> Optional[ET.Element]:
md: markdown.Markdown = markdown.Markdown()
MarkdownCommentPreProcessor.register(md)
MarkdownCodeBlockProcessor.register(md)
processor: MarkdownETProcessor = cls(md)
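        # priority 0: run last, after the built-in treeprocessors (inline patterns, prettify)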
md.treeprocessors.register(processor, processor.__class__.__name__, 0)
md.convert(text) # discard output, root replaced anyway
return processor._get_root()
def spawn_pager() -> Optional[subprocess.Popen]:
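    """
    Spawn the pager given by $PAGER (default: less -SR) with a pipe attached to its stdin.
    Returns None if the pager cannot be started.
    """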
pager: List[str] = os.getenv("PAGER", "/usr/bin/less -SR").split(" ")
try:
return subprocess.Popen(pager, shell=False, executable=None,
cwd=None, restore_signals=True, env=None,
bufsize=0, stdin=subprocess.PIPE, close_fds=True,
encoding="utf-8", errors="replace")
except (OSError, ValueError, subprocess.SubprocessError):
return None
def parse(text: str, columns: int, as_html: bool, wrap_pre: bool, pager: bool) -> bool:
"""
Pretty-print Markdown text to stdout.
"""
root: Optional[ET.Element] = MarkdownETProcessor.convert(text)
if root is None:
print("Cannot parse markdown", file=sys.stderr, flush=True)
return False
if as_html:
ET.dump(root)
return True
parser: EtCliParser = EtCliParser(root, columns, wrap_pre)
pager_proc: Optional[subprocess.Popen] = None
try:
if pager:
pager_proc = spawn_pager()
if pager_proc is None:
print("Cannot spawn pager", file=sys.stderr, flush=True)
return False
out_fp: TextIO = pager_proc.stdin if pager_proc is not None else sys.stdout # type: ignore
for line in parser.parse():
try:
print(line, file=out_fp, flush=True)
except BrokenPipeError:
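                # the pager exited early (e.g. 'q' in less); close stderr so that no secondary
                # broken-pipe message is printed during interpreter shutdown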
sys.stderr.close()
return False
out_fp.close()
if pager_proc is not None:
pager_proc.wait()
pager_proc = None
finally:
if pager_proc is not None:
            pager_proc.terminate()  # SIGTERM
pager_proc.wait()
return True
def get_term_width(default: int = 0) -> int:
"""
Try to detect the current terminal's width by $COLUMNS, ioctl(), or given default.
"""
    import fcntl
    import struct
    import termios
try:
return int(os.getenv("COLUMNS", None)) # type: ignore
except (ValueError, TypeError):
pass
for fd in [sys.stdout.fileno(), sys.stderr.fileno()]: # try stderr in case stdout is piped somewhere
try:
            ws_st: bytes = fcntl.ioctl(fd, termios.TIOCGWINSZ, b"\x00\x00" * 4)
return struct.unpack("hhhh", ws_st)[1] # ws_row, ws_col, ws_xpixel, ws_ypixel shorts
except (OSError, struct.error):
pass
return default
def get_input(filename: str) -> Optional[str]:
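    """
    Read the whole input from the given file, or from stdin if the filename is '-'.
    Returns None (after printing an error) if reading fails.
    """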
try:
if filename == "-":
text: str = sys.stdin.read()
else:
with open(filename, "r") as fp:
text = fp.read()
except OSError as e:
print(f"Cannot read from '{filename}': {str(e)}", file=sys.stderr, flush=True)
return None
try:
sys.stdin.close()
except OSError:
pass
return text
def main() -> int:
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter
class Formatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
pass
parser = ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument("--columns", "-c", type=int, default=None,
help="terminal width, autodetect from $COLUMNS or ioctl() if not given, fallback 80"
" – tip: export or set --columns=$COLUMNS to take the shell width when using a pager")
parser.add_argument("--html", action="store_const", const=True, default=False,
help="print parsed HTML output instead, mostly useful for debugging")
parser.add_argument("--no-wrap-pre", action="store_const", const=True, default=False,
help="don't prefix and wrap code/quote blocks"
" – useful if you want to copy/paste or are using a pager with horizontal scroll")
parser.add_argument("--pager", "-p", action="store_const", const=True, default=False,
help="don't write to stdout but spawn a pager instead, as given by $PAGER or 'less' by default")
parser.add_argument("file", metavar="file.md",
help="markdown file to print, '-' for stdin")
args = parser.parse_args()
try:
text: Optional[str] = get_input(args.file)
if text is None:
return 1
elif not text.strip():
return 0
else:
return 0 if parse(text,
columns=args.columns if args.columns is not None else get_term_width(80),
as_html=args.html,
wrap_pre=not args.no_wrap_pre,
pager=args.pager) else 1
except KeyboardInterrupt:
print("Interrupt", file=sys.stderr, flush=True)
return 1
if __name__ == "__main__":
sys.exit(main())