Module fpdf.html

HTML Renderer for FPDF.py

Expand source code
"HTML Renderer for FPDF.py"

__author__ = "Mariano Reingart <reingart@gmail.com>"
__copyright__ = "Copyright (C) 2010 Mariano Reingart"
__license__ = "LGPL 3.0"

# Inspired by tuto5.py and several examples from fpdf.org, html2fpdf, etc.

import logging, warnings
from html.parser import HTMLParser

from .enums import XPos, YPos

import re

# Module-level logger, named after this module:
LOGGER = logging.getLogger(__name__)
BULLET_WIN1252 = "\x95"  # BULLET character in Windows-1252 encoding
# Default font size of each heading tag;
# entries can be overridden through the `heading_sizes` argument of HTML2FPDF:
DEFAULT_HEADING_SIZES = dict(h1=24, h2=18, h3=14, h4=12, h5=10, h6=8)
# Run of whitespace characters at the very start of a string:
LEADING_SPACE = re.compile(r"^\s+")
# Run of whitespace characters, split in its first character (group 1) and the rest (group 2):
WHITESPACE = re.compile(r"(\s)(\s*)")
# Whitespace character at the end of a string:
TRAILING_SPACE = re.compile(r"\s$")

# Mapping of lowercase color names to their "#rrggbb" hexadecimal code.
# color_as_decimal() looks names up in this table after lowercasing them.
COLOR_DICT = {
    "black": "#000000",
    "navy": "#000080",
    "darkblue": "#00008b",
    "mediumblue": "#0000cd",
    "blue": "#0000ff",
    "darkgreen": "#006400",
    "green": "#008000",
    "teal": "#008080",
    "darkcyan": "#008b8b",
    "deepskyblue": "#00bfff",
    "darkturquoise": "#00ced1",
    "mediumspringgreen": "#00fa9a",
    "lime": "#00ff00",
    "springgreen": "#00ff7f",
    "aqua": "#00ffff",
    "cyan": "#00ffff",
    "midnightblue": "#191970",
    "dodgerblue": "#1e90ff",
    "lightseagreen": "#20b2aa",
    "forestgreen": "#228b22",
    "seagreen": "#2e8b57",
    "darkslategray": "#2f4f4f",
    "darkslategrey": "#2f4f4f",
    "limegreen": "#32cd32",
    "mediumseagreen": "#3cb371",
    "turquoise": "#40e0d0",
    "royalblue": "#4169e1",
    "steelblue": "#4682b4",
    "darkslateblue": "#483d8b",
    "mediumturquoise": "#48d1cc",
    "indigo": "#4b0082",
    "darkolivegreen": "#556b2f",
    "cadetblue": "#5f9ea0",
    "cornflowerblue": "#6495ed",
    "rebeccapurple": "#663399",
    "mediumaquamarine": "#66cdaa",
    "dimgray": "#696969",
    "dimgrey": "#696969",
    "slateblue": "#6a5acd",
    "olivedrab": "#6b8e23",
    "slategray": "#708090",
    "slategrey": "#708090",
    "lightslategray": "#778899",
    "lightslategrey": "#778899",
    "mediumslateblue": "#7b68ee",
    "lawngreen": "#7cfc00",
    "chartreuse": "#7fff00",
    "aquamarine": "#7fffd4",
    "maroon": "#800000",
    "purple": "#800080",
    "olive": "#808000",
    "gray": "#808080",
    "grey": "#808080",
    "skyblue": "#87ceeb",
    "lightskyblue": "#87cefa",
    "blueviolet": "#8a2be2",
    "darkred": "#8b0000",
    "darkmagenta": "#8b008b",
    "saddlebrown": "#8b4513",
    "darkseagreen": "#8fbc8f",
    "lightgreen": "#90ee90",
    "mediumpurple": "#9370db",
    "darkviolet": "#9400d3",
    "palegreen": "#98fb98",
    "darkorchid": "#9932cc",
    "yellowgreen": "#9acd32",
    "sienna": "#a0522d",
    "brown": "#a52a2a",
    "darkgray": "#a9a9a9",
    "darkgrey": "#a9a9a9",
    "lightblue": "#add8e6",
    "greenyellow": "#adff2f",
    "paleturquoise": "#afeeee",
    "lightsteelblue": "#b0c4de",
    "powderblue": "#b0e0e6",
    "firebrick": "#b22222",
    "darkgoldenrod": "#b8860b",
    "mediumorchid": "#ba55d3",
    "rosybrown": "#bc8f8f",
    "darkkhaki": "#bdb76b",
    "silver": "#c0c0c0",
    "mediumvioletred": "#c71585",
    "indianred": "#cd5c5c",
    "peru": "#cd853f",
    "chocolate": "#d2691e",
    "tan": "#d2b48c",
    "lightgray": "#d3d3d3",
    "lightgrey": "#d3d3d3",
    "thistle": "#d8bfd8",
    "orchid": "#da70d6",
    "goldenrod": "#daa520",
    "palevioletred": "#db7093",
    "crimson": "#dc143c",
    "gainsboro": "#dcdcdc",
    "plum": "#dda0dd",
    "burlywood": "#deb887",
    "lightcyan": "#e0ffff",
    "lavender": "#e6e6fa",
    "darksalmon": "#e9967a",
    "violet": "#ee82ee",
    "palegoldenrod": "#eee8aa",
    "lightcoral": "#f08080",
    "khaki": "#f0e68c",
    "aliceblue": "#f0f8ff",
    "honeydew": "#f0fff0",
    "azure": "#f0ffff",
    "sandybrown": "#f4a460",
    "wheat": "#f5deb3",
    "beige": "#f5f5dc",
    "whitesmoke": "#f5f5f5",
    "mintcream": "#f5fffa",
    "ghostwhite": "#f8f8ff",
    "salmon": "#fa8072",
    "antiquewhite": "#faebd7",
    "linen": "#faf0e6",
    "lightgoldenrodyellow": "#fafad2",
    "oldlace": "#fdf5e6",
    "red": "#ff0000",
    "fuchsia": "#ff00ff",
    "magenta": "#ff00ff",
    "deeppink": "#ff1493",
    "orangered": "#ff4500",
    "tomato": "#ff6347",
    "hotpink": "#ff69b4",
    "coral": "#ff7f50",
    "darkorange": "#ff8c00",
    "lightsalmon": "#ffa07a",
    "orange": "#ffa500",
    "lightpink": "#ffb6c1",
    "pink": "#ffc0cb",
    "gold": "#ffd700",
    "peachpuff": "#ffdab9",
    "navajowhite": "#ffdead",
    "moccasin": "#ffe4b5",
    "bisque": "#ffe4c4",
    "mistyrose": "#ffe4e1",
    "blanchedalmond": "#ffebcd",
    "papayawhip": "#ffefd5",
    "lavenderblush": "#fff0f5",
    "seashell": "#fff5ee",
    "cornsilk": "#fff8dc",
    "lemonchiffon": "#fffacd",
    "floralwhite": "#fffaf0",
    "snow": "#fffafa",
    "yellow": "#ffff00",
    "lightyellow": "#ffffe0",
    "ivory": "#fffff0",
    "white": "#ffffff",
}


def px2mm(px):
    "Convert a length expressed in 1/72 inch units into millimetres (25.4 mm per inch)"
    millimetres_times_72 = px * 25.4  # multiply first, then divide
    return millimetres_times_72 / 72


def color_as_decimal(color="#000000"):
    """
    Convert a color name or an hexadecimal color code into a `(r, g, b)` tuple.

    Args:
        color (str): a color name listed in `COLOR_DICT` (case-insensitive),
            or an hexadecimal code: "#rgb", "#rrggbb" or "#rrggbbaa"
            (the trailing alpha component is ignored).
            Surrounding whitespace is ignored and the leading "#" is optional.

    Returns:
        `None` if `color` is empty or blank,
        otherwise a tuple of 3 integers between 0 and 255.

    Raises:
        ValueError: if `color` is neither a known color name
            nor a well-formed hexadecimal code.
    """
    if not color:
        return None
    color = color.strip()
    if not color:
        return None

    if color.startswith("#"):
        hexcolor = color
    else:
        # Checks if color is a name and gets the hex value;
        # otherwise it is treated as an hex code missing its leading "#":
        hexcolor = COLOR_DICT.get(color.lower(), "#" + color)

    digits = hexcolor[1:]
    if len(digits) == 3:
        # Short form: every digit is doubled, "#1af" means "#11aaff"
        digits = "".join(digit * 2 for digit in digits)
    if len(digits) not in (6, 8) or any(
        digit not in "0123456789abcdefABCDEF" for digit in digits
    ):
        raise ValueError(f"Invalid color: {color!r}")
    # Only the first 3 pairs of digits are used: a 4th pair (alpha) is ignored
    return tuple(int(digits[i : i + 2], 16) for i in (0, 2, 4))


class HTML2FPDF(HTMLParser):
    "Render basic HTML to FPDF"

    HTML_UNCLOSED_TAGS = ("br", "dd", "dt", "hr", "img", "li", "td", "tr")

    def __init__(
        self,
        pdf,
        image_map=None,
        li_tag_indent=5,
        dd_tag_indent=10,
        table_line_separators=False,
        ul_bullet_char=BULLET_WIN1252,
        heading_sizes=None,
        warn_on_tags_not_matching=True,
        **_,
    ):
        """
        Args:
            pdf (FPDF): an instance of `fpdf.FPDF`
            image_map (function): an optional one-argument function that map <img> "src"
                to new image URLs
            li_tag_indent (int): numeric indentation of <li> elements
            dd_tag_indent (int): numeric indentation of <dd> elements
            table_line_separators (bool): enable horizontal line separators in <table>
            ul_bullet_char (str): bullet character for <ul> elements
            heading_sizes (dict): optional mapping of heading tag names ("h1" to "h6")
                to font sizes, overriding the entries of `DEFAULT_HEADING_SIZES`
            warn_on_tags_not_matching (bool): log a warning when an end tag does not match
                the last opened tag, or when a tag is still open once parsing is over

        Any other keyword argument is accepted and ignored.
        """
        super().__init__()
        self.pdf = pdf
        self.image_map = image_map or (lambda src: src)
        self.li_tag_indent = li_tag_indent
        self.dd_tag_indent = dd_tag_indent
        self.table_line_separators = table_line_separators
        self.ul_bullet_char = ul_bullet_char
        self.style = dict(b=False, i=False, u=False)
        self.pre_formatted = False
        self.follows_fmt_tag = False
        self.follows_trailing_space = False
        self.href = ""
        self.align = ""
        self.page_links = {}
        self.font_stack = []
        self.indent = 0
        self.bullet = []
        self.font_size = pdf.font_size_pt
        # set_font() reads self.style & self.pdf, and defines self.font_face & self.h:
        self.set_font(pdf.font_family or "times", size=self.font_size)
        self.font_color = 0, 0, 0  # initialize font color, r,g,b format
        self.table = None  # table attributes
        # Horizontal position of the table left side. It used to be defined only
        # when a <table> with a width in % was opened, so that any other table
        # triggered an AttributeError in the <tr> handler:
        self.table_offset = pdf.l_margin
        self.table_col_width = None  # column (header) widths
        self.table_col_index = None  # current column index
        self.td = None  # inside a <td>, attributes dict
        self.th = None  # inside a <th>, attributes dict
        self.tr = None  # inside a <tr>, attributes dict
        self.thead = None  # inside a <thead>, attributes dict
        self.tfoot = None  # inside a <tfoot>, attributes dict
        self.tbody = None  # reset by handle_endtag() on </tbody>
        self.tr_index = None  # row index
        self.theader = None  # table header cells
        self.tfooter = None  # table footer cells
        self.theader_out = self.tfooter_out = False
        self.table_row_height = 0
        self.heading_level = None
        self.heading_sizes = dict(**DEFAULT_HEADING_SIZES)
        self.heading_above = 0.2  # extra space above heading, relative to font size
        self.heading_below = 0.2  # extra space below heading, relative to font size
        if heading_sizes:
            self.heading_sizes.update(heading_sizes)
        self._only_imgs_in_td = False
        self.warn_on_tags_not_matching = warn_on_tags_not_matching
        self._tags_stack = []  # names of the tags currently open

    def width2unit(self, length):
        """
        Handle conversion of % measures into the measurement unit used

        Args:
            length (str): either a percentage ("50%", "33.3%") of the table width,
                or a plain number, returned as is.

        Returns:
            the width as a number: a float for percentages,
            an int for plain integers, a float for other plain numbers.

        Raises:
            ValueError: if `length` is not a number, optionally followed by "%"
        """
        length = length.strip()
        if length.endswith("%"):
            total = self.pdf.w - self.pdf.r_margin - self.pdf.l_margin
            table_width = self.table["width"].strip()
            if table_width.endswith("%"):
                total *= float(table_width[:-1]) / 100
            # float() accepts everything int() did, plus fractional percentages:
            return float(length[:-1]) * total / 100
        try:
            return int(length)
        except ValueError:
            return float(length)  # fractional width, e.g. "12.5"

    def handle_data(self, data):
        """
        Render a chunk of text found between tags, depending on the parser state:
        table cell content, aligned paragraph, preformatted block or regular text.

        Args:
            data (str): the text to render
        """
        if self.td is not None:  # drawing a table?
            self._insert_td(data)
            return
        if self.table is not None:
            # ignore anything else than td inside a table
            return
        if self.align:
            LOGGER.debug("align '%s'", data.replace("\n", "\\n"))
            self.pdf.multi_cell(
                0,
                self.h,
                data,
                border=0,
                new_x=XPos.LMARGIN,
                new_y=YPos.NEXT,
                align=self.align[0].upper(),
                link=self.href,
            )
            return
        if self.pre_formatted:  # for pre blocks
            self.pdf.write(self.h, data)
            return

        # Regular text: whitespace gets normalized.
        # This must be evaluated on the raw text, before any substitution:
        ends_with_space = bool(TRAILING_SPACE.search(data))
        # don't trim leading whitespace if following a format tag with no trailing whitespace
        if not (self.follows_fmt_tag and not self.follows_trailing_space):
            data = LEADING_SPACE.sub(leading_whitespace_repl, data)
        data = WHITESPACE.sub(whitespace_repl, data)
        self.follows_trailing_space = ends_with_space
        if self.href:
            self.put_link(data)
        else:
            if self.heading_level:
                self.pdf.start_section(data, self.heading_level - 1, strict=False)
            # data is already normalized: substituting whitespace again would not change it
            LOGGER.debug("write '%s' h=%d", data, self.h)
            self.pdf.write(self.h, data)
        self.follows_fmt_tag = False

    def _insert_td(self, data=""):
        """
        Handle the content of the current table cell (<td> or <th>).

        Cells met inside <thead> / <tfoot> are stored in `self.theader` / `self.tfooter`
        in order to be rendered later by `output_table_header()` / `output_table_footer()`.
        Other cells are rendered immediately, after inserting a page break
        (footer, new page, header) when the cell does not fit on the current page.

        Args:
            data (str): text of the cell
        """
        self._only_imgs_in_td = False
        width = self._td_width()
        # A quarter (integer division) of the "height" attribute,
        # or 1.3 times the line height when that is zero or missing:
        height = int(self.td.get("height", 0)) // 4 or self.h * 1.30
        # All the cells of a row share the height of the tallest one met so far:
        if not self.table_row_height:
            self.table_row_height = height
        elif self.table_row_height > height:
            height = self.table_row_height
        border = int(self.table.get("border", 0))
        if self.th:
            # Header cell: bold, centered by default,
            # with a "B" border when the table has no border
            self.set_style("B", True)
            border = border or "B"
            align = self.td.get("align", "C")[0].upper()
        else:
            # Regular cell: left-aligned by default,
            # with a "LR" border when the table has a border (0 otherwise)
            align = self.td.get("align", "L")[0].upper()
            border = border and "LR"
        # The cell background color takes precedence over the row one:
        bgcolor = color_as_decimal(self.td.get("bgcolor", self.tr.get("bgcolor", "")))
        # parsing table header/footer (drawn later):
        if self.thead is not None:
            self.theader.append(
                (
                    dict(
                        w=width,
                        h=height,
                        txt=data,
                        border=border,
                        new_x=XPos.RIGHT,
                        new_y=YPos.TOP,
                        align=align,
                    ),
                    bgcolor,
                )
            )
        if self.tfoot is not None:
            self.tfooter.append(
                (
                    dict(
                        w=width,
                        h=height,
                        txt=data,
                        border=border,
                        new_x=XPos.RIGHT,
                        new_y=YPos.TOP,
                        align=align,
                    ),
                    bgcolor,
                )
            )
        # check if reached end of page, add table footer and header:
        # NOTE(review): the footer height is added to `height` itself, and this
        # increased value is also the height of the cell drawn below,
        # whenever footer cells have already been collected - confirm this is intended.
        if self.tfooter:
            height += self.tfooter[0][0]["h"]
        if self.pdf.y + height > self.pdf.page_break_trigger and not self.th:
            self.output_table_footer()
            self.pdf.add_page(same=True)
            self.theader_out = self.tfooter_out = False
        # Only cells outside of <thead> & <tfoot> are drawn right away:
        if self.tfoot is None and self.thead is None:
            if not self.theader_out:
                self.output_table_header()
            self.box_shadow(width, height, bgcolor)
            # self.pdf.x may have shifted due to <img> inside <td>:
            self.pdf.set_x(self._td_x())
            LOGGER.debug(
                "td cell x=%d width=%d height=%d border=%s align=%s '%s'",
                self.pdf.x,
                width,
                height,
                border,
                align,
                data.replace("\n", "\\n"),
            )
            self.pdf.cell(
                width,
                height,
                data,
                border=border,
                align=align,
                new_x=XPos.RIGHT,
                new_y=YPos.TOP,
            )

    def _td_x(self):
        "Return the current table cell left side horizontal position"
        # Widths of the columns located before the current one:
        preceding_columns = self.table_col_width[: self.table_col_index]
        preceding_width = 0
        for column_width in preceding_columns:
            preceding_width += self.width2unit(column_width)
        return self.table_offset + preceding_width

    def _td_width(self):
        """
        Return the current table cell width

        The width comes, by order of precedence, from the cell "width" attribute,
        from the columns spanned according to its "colspan" attribute,
        or from the width recorded for the current column.

        Raises:
            ValueError: if no width is known for the current column
        """
        if "width" in self.td:
            column_widths = [self.td["width"]]
        elif "colspan" in self.td:
            i = self.table_col_index
            colspan = int(self.td["colspan"])
            column_widths = self.table_col_width[i : i + colspan]
        else:
            try:
                column_widths = [self.table_col_width[self.table_col_index]]
            except IndexError:
                # The IndexError brings nothing to the user: it is explicitly discarded
                raise ValueError(
                    f"Width not specified for table column {self.table_col_index},"
                    " unable to continue"
                ) from None
        return sum(self.width2unit(width) for width in column_widths)

    def box_shadow(self, w, h, bgcolor):
        """
        Draw a `w` x `h` rectangle (style "F") using `bgcolor` as fill color
        at the current position, then restore the previous fill color.
        Nothing is drawn if `bgcolor` is empty.
        """
        LOGGER.debug("box_shadow w=%d h=%d bgcolor=%s", w, h, bgcolor)
        if not bgcolor:
            return
        pdf = self.pdf
        previous_fill_color = pdf.fill_color
        pdf.set_fill_color(*bgcolor)
        pdf.rect(pdf.x, pdf.y, w, h, "F")
        pdf.set_fill_color(*previous_fill_color.colors)

    def output_table_header(self):
        "Render in bold the cells stored in `self.theader`, if any, then flag the header as rendered"
        header_cells = self.theader
        if header_cells:
            was_bold = self.style.get("b")
            left_side = self.table_offset
            self.pdf.set_x(left_side)
            self.set_style("b", True)
            for cell_kwargs, bgcolor in header_cells:
                self.box_shadow(cell_kwargs["w"], cell_kwargs["h"], bgcolor)
                self.pdf.cell(**cell_kwargs)  # includes the border
            self.set_style("b", was_bold)
            # Move below the header, using the height of its first cell:
            first_cell_kwargs = header_cells[0][0]
            self.pdf.ln(first_cell_kwargs["h"])
            self.pdf.set_x(left_side)
        self.theader_out = True

    def output_table_footer(self):
        """
        Render the cells stored in `self.tfooter`, if any,
        then a separator line if the table has a "border" attribute,
        and flag the footer as rendered.
        """
        footer_cells = self.tfooter
        if footer_cells:
            initial_x = self.pdf.x
            self.pdf.set_x(self.table_offset)
            for cell_kwargs, bgcolor in footer_cells:
                self.box_shadow(cell_kwargs["w"], cell_kwargs["h"], bgcolor)
                self.pdf.cell(**cell_kwargs)
            # Move below the footer, using the height of its first cell:
            first_cell_kwargs = footer_cells[0][0]
            self.pdf.ln(first_cell_kwargs["h"])
            self.pdf.set_x(initial_x)
        has_border = self.table.get("border")
        if has_border:
            self.output_table_sep()
        self.tfooter_out = True

    def output_table_sep(self):
        "Draw an horizontal line starting at the current position, as wide as all the table columns"
        pdf = self.pdf
        x_start, y = pdf.x, pdf.y
        table_width = 0
        for length in self.table_col_width:
            table_width += self.width2unit(length)
        pdf.line(x_start, y, x_start + table_width, y)

    def handle_starttag(self, tag, attrs):
        """
        Process an opening tag: update the parser state
        and render what the tag requires (line breaks, bullets, images...).

        Args:
            tag (str): name of the tag
            attrs (list): `(name, value)` pairs of the tag attributes
        """
        attrs = dict(attrs)
        LOGGER.debug("STARTTAG %s %s", tag, attrs)
        self._tags_stack.append(tag)
        # <dt>, <strong> & <em> are rendered as <b> / <i>:
        if tag == "dt":
            self.pdf.ln(self.h)
            tag = "b"
        if tag == "dd":
            self.pdf.ln(self.h)
            self.pdf.write(self.h, " " * self.dd_tag_indent)
        if tag == "strong":
            tag = "b"
        if tag == "em":
            tag = "i"
        if tag in ("b", "i", "u"):
            self.set_style(tag, True)
        if tag == "a":
            # An anchor without "href" (e.g. <a name="...">) is not a link:
            self.href = attrs.get("href", "")
        if tag == "br":
            self.pdf.ln(self.h)
        if tag == "p":
            self.pdf.ln(self.h)
            if "align" in attrs:
                self.align = attrs.get("align")
            if "line-height" in attrs:
                line_height = float(attrs.get("line-height"))
                self.h = px2mm(self.font_size) * line_height
        if tag in self.heading_sizes:
            # The font state is restored by handle_endtag():
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            self.heading_level = int(tag[1:])
            hsize = self.heading_sizes[tag]
            self.pdf.set_text_color(150, 0, 0)
            self.pdf.ln(self.h + self.heading_above * hsize)  # more space above heading
            self.set_font(size=hsize)
            if attrs:
                self.align = attrs.get("align")
        if tag == "hr":
            self.pdf.add_page(same=True)
        if tag == "code":
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            self.set_font("courier", 11)
        if tag == "pre":
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            self.set_font("courier", 11)
            self.pre_formatted = True
        if tag == "blockquote":
            self.pdf.set_text_color(100, 0, 45)
            self.indent += 1
            self.pdf.ln(3)
        if tag == "ul":
            self.indent += 1
            self.bullet.append(self.ul_bullet_char)
        if tag == "ol":
            self.indent += 1
            self.bullet.append(0)  # counter of the items rendered so far
        if tag == "li":
            self.pdf.ln(self.h + 2)
            self.pdf.set_text_color(190, 0, 0)
            if self.bullet:
                # The innermost list is always the last entry of self.bullet.
                # self.indent must not be used as an index there,
                # as it is also incremented by <blockquote>.
                bullet = self.bullet[-1]
                if not isinstance(bullet, str):
                    # Ordered list: increment & store the item counter
                    bullet += 1
                    self.bullet[-1] = bullet
                    bullet = f"{bullet}. "
            else:
                # <li> outside of any list: rendered as an unordered item
                bullet = self.ul_bullet_char
            self.pdf.write(self.h, f"{' ' * self.li_tag_indent * self.indent}{bullet} ")
            self.set_text_color(*self.font_color)
        if tag == "font":
            # save previous font state:
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            if "color" in attrs:
                color = color_as_decimal(attrs["color"])
                self.font_color = color
            if "face" in attrs:
                face = attrs.get("face").lower()
                try:
                    self.pdf.set_font(face)
                    self.font_face = face
                except RuntimeError:
                    pass  # font not found, ignore
            if "size" in attrs:
                self.font_size = int(attrs.get("size"))
            self.set_font()
            self.set_text_color(*self.font_color)
        if tag == "table":
            self.table = {k.lower(): v for k, v in attrs.items()}
            if "width" not in self.table:
                self.table["width"] = "100%"
            if self.table["width"][-1] == "%":
                w = self.pdf.w - self.pdf.r_margin - self.pdf.l_margin
                w *= int(self.table["width"][:-1]) / 100
                self.table_offset = (self.pdf.w - w) / 2
            else:
                # Width not in %: the table starts at the left margin.
                # Without this, table_offset was either undefined (AttributeError
                # in the <tr> handler) or inherited from a previous table.
                self.table_offset = self.pdf.l_margin
            self.table_col_width = []
            self.theader_out = self.tfooter_out = False
            self.theader = []
            self.tfooter = []
            self.thead = None
            self.tfoot = None
            self.pdf.ln()
        if tag == "tr":
            self.tr_index = 0 if self.tr_index is None else (self.tr_index + 1)
            self.tr = {k.lower(): v for k, v in attrs.items()}
            self.table_col_index = 0
            self.table_row_height = 0
            self.pdf.set_x(self.table_offset)
            # Adding an horizontal line separator between rows:
            if self.table_line_separators and self.tr_index > 0:
                self.output_table_sep()
        if tag == "td":
            self.td = {k.lower(): v for k, v in attrs.items()}
            # The first cell met in a column defines the column width:
            if "width" in self.td and self.table_col_index >= len(self.table_col_width):
                assert self.table_col_index == len(
                    self.table_col_width
                ), f"table_col_index={self.table_col_index} #table_col_width={len(self.table_col_width)}"
                self.table_col_width.append(self.td["width"])
            if attrs:
                self.align = attrs.get("align")
            self._only_imgs_in_td = False
        if tag == "th":
            self.td = {k.lower(): v for k, v in attrs.items()}
            self.th = True
            if "width" in self.td and self.table_col_index >= len(self.table_col_width):
                assert self.table_col_index == len(
                    self.table_col_width
                ), f"table_col_index={self.table_col_index} #table_col_width={len(self.table_col_width)}"
                self.table_col_width.append(self.td["width"])
        if tag == "thead":
            self.thead = {}
        if tag == "tfoot":
            self.tfoot = {}
        if tag == "img" and "src" in attrs:
            width = px2mm(int(attrs.get("width", 0)))
            height = px2mm(int(attrs.get("height", 0)))
            if self.pdf.y + height > self.pdf.page_break_trigger:
                self.pdf.add_page(same=True)
            y = self.pdf.get_y()
            if self.table_col_index is not None:
                self._only_imgs_in_td = True
                # <img> in a <td>: its width must not exceed the cell width:
                td_width = self._td_width()
                if not width or width > td_width:
                    if width:  # Preserving image aspect ratio:
                        height *= td_width / width
                    width = td_width
                x = self._td_x()
                if self.align and self.align[0].upper() == "C":
                    x += (td_width - width) / 2
            else:
                x = self.pdf.get_x()
                if self.align and self.align[0].upper() == "C":
                    x = self.pdf.w / 2 - width / 2
            LOGGER.debug(
                'image "%s" x=%d y=%d width=%d height=%d',
                attrs["src"],
                x,
                y,
                width,
                height,
            )
            image_info = self.pdf.image(
                self.image_map(attrs["src"]), x, y, width, height, link=self.href
            )
            width = image_info["rendered_width"]
            height = image_info["rendered_height"]
            self.pdf.set_x(x + width)
            if self.table_col_index is not None:
                # <img> in a <td>: we grow the cell height according to the image height:
                if height > self.table_row_height:
                    self.table_row_height = height
            else:
                self.pdf.set_y(y + height)
        # (b / i / u have already been handled above: no need to set the style twice)
        if tag == "center":
            self.align = "Center"
        if tag == "toc":
            self.pdf.insert_toc_placeholder(
                self.render_toc, pages=int(attrs.get("pages", 1))
            )
        if tag == "sup":
            self.pdf.char_vpos = "SUP"
        if tag == "sub":
            self.pdf.char_vpos = "SUB"

    def handle_endtag(self, tag):
        """
        Process a closing tag: check that it matches the last opened tag,
        then revert the state changes made when the tag was opened.

        Args:
            tag (str): name of the tag
        """
        LOGGER.debug("ENDTAG %s", tag)
        # Tags listed in HTML_UNCLOSED_TAGS may be left unclosed:
        # they are silently discarded from the top of the stack
        while (
            self._tags_stack
            and tag != self._tags_stack[-1]
            and self._tags_stack[-1] in self.HTML_UNCLOSED_TAGS
        ):
            self._tags_stack.pop()
        if not self._tags_stack:
            if self.warn_on_tags_not_matching:
                LOGGER.warning(
                    "Unexpected HTML end tag </%s>, start tag may be missing?", tag
                )
        elif tag == self._tags_stack[-1]:
            self._tags_stack.pop()
        elif self.warn_on_tags_not_matching:
            LOGGER.warning(
                "Unexpected HTML end tag </%s>, start tag was <%s>",
                tag,
                self._tags_stack[-1],
            )
        # NOTE(review): the end tag is processed below even when it did not match;
        # the .pop() calls on font_stack / bullet raise IndexError if those are empty.
        if tag in self.heading_sizes:
            self.heading_level = None
            face, size, color = self.font_stack.pop()
            # more space below heading:
            self.pdf.ln(self.h + self.h * self.heading_below)
            self.set_font(face, size)
            self.set_text_color(*color)
            self.align = None
        if tag == "code":
            face, size, color = self.font_stack.pop()
            self.set_font(face, size)
            self.set_text_color(*color)
        if tag == "pre":
            face, size, color = self.font_stack.pop()
            self.set_font(face, size)
            self.set_text_color(*color)
            self.pre_formatted = False
        if tag == "blockquote":
            self.set_text_color(*self.font_color)
            self.indent -= 1
            self.pdf.ln(3)
        # <strong>, <dt> & <em> have been rendered as <b> / <i>:
        if tag in ("strong", "dt"):
            tag = "b"
        if tag == "em":
            tag = "i"
        if tag in ("b", "i", "u"):
            self.set_style(tag, False)
            # read by handle_data() to decide whether leading whitespace is trimmed:
            self.follows_fmt_tag = True
        if tag == "a":
            self.href = ""
        if tag == "p":
            self.pdf.ln(self.h)
            self.align = ""
            # reverts the line height possibly set through the "line-height" attribute:
            self.h = px2mm(self.font_size)
        if tag in ("ul", "ol"):
            self.indent -= 1
            self.bullet.pop()
        if tag == "table":
            if not self.tfooter_out:
                self.output_table_footer()
            self.table = None
            self.th = False
            self.theader = None
            self.tfooter = None
            self.pdf.ln(self.h)
            self.tr_index = None
        if tag == "thead":
            self.thead = None
            self.tr_index = None
        if tag == "tfoot":
            self.tfoot = None
            self.tr_index = None
        if tag == "tbody":
            # NOTE(review): self.tbody is not read anywhere in the visible code
            self.tbody = None
            self.tr_index = None
        if tag == "tr":
            # no line break after a footer row: those are only stored at this stage
            if self.tfoot is None:
                self.pdf.ln(self.table_row_height)
            self.table_col_index = None
            self.tr = None
        if tag in ("td", "th"):
            if self.th:
                LOGGER.debug("revert style")
                self.set_style("b", False)  # revert style
            elif self._only_imgs_in_td:
                # the cell contained images but no text: it has not been drawn yet
                self._insert_td()
            # move on to the next column, skipping the spanned ones:
            self.table_col_index += int(self.td.get("colspan", "1"))
            self.td = None
            self.th = False
        if tag == "font":
            # recover last font state
            face, size, color = self.font_stack.pop()
            self.font_color = color
            self.set_font(face, size)
            self.set_text_color(*self.font_color)
        if tag == "center":
            self.align = None
        if tag == "sup":
            self.pdf.char_vpos = "LINE"
            self.follows_fmt_tag = True
        if tag == "sub":
            self.pdf.char_vpos = "LINE"
            self.follows_fmt_tag = True

    def feed(self, data):
        """
        Parse `data`, then log a warning (if enabled) about the innermost tag left open.
        Tags listed in `HTML_UNCLOSED_TAGS` are not reported.
        """
        super().feed(data)
        open_tags = self._tags_stack
        # Discard the trailing tags that are allowed to stay unclosed:
        while open_tags and open_tags[-1] in self.HTML_UNCLOSED_TAGS:
            open_tags.pop()
        if open_tags and self.warn_on_tags_not_matching:
            LOGGER.warning("Missing HTML end tag for <%s>", open_tags[-1])

    def set_font(self, face=None, size=None):
        """
        Record the font face and / or size, then apply them to the PDF
        along with the current bold / italic / underline style.

        Args:
            face (str): optional new font face
            size (float): optional new font size, in the same unit as `pdf.font_size_pt`.
                It also redefines the line height `self.h`.
        """
        if face:
            self.font_face = face
        if size:
            self.font_size = size
            self.h = px2mm(size)
            LOGGER.debug("H %s", self.h)
        style = "".join(s for s in ("b", "i", "u") if self.style.get(s)).upper()
        if (self.font_face, style) != (self.pdf.font_family, self.pdf.font_style):
            self.pdf.set_font(self.font_face, style, self.font_size)
        # self.font_size is initialised from pdf.font_size_pt:
        # it must be compared with that same attribute, not with pdf.font_size
        if self.font_size != self.pdf.font_size_pt:
            self.pdf.set_font_size(self.font_size)

    def set_style(self, tag=None, enable=False):
        """
        Switch the style named `tag` ("b", "i" or "u", case-insensitive) on or off,
        if a tag is given, then apply the resulting combined style to the PDF font.
        """
        if tag:
            self.style[tag.lower()] = enable
        enabled_letters = [letter for letter in ("b", "i", "u") if self.style.get(letter)]
        style = "".join(enabled_letters)
        LOGGER.debug("SET_FONT_STYLE %s", style)
        self.pdf.set_font(style=style)

    def set_text_color(self, r=None, g=0, b=0):
        "Forward the color components, unchanged, to `self.pdf.set_text_color()`"
        components = (r, g, b)
        self.pdf.set_text_color(*components)

    def put_link(self, txt):
        """
        Write `txt` as an hyperlink to `self.href`, with text color (0, 0, 255) and underlined,
        then switch underlining off and restore the current font color.
        """
        link_color = (0, 0, 255)
        self.set_text_color(*link_color)
        self.set_style("u", True)
        self.pdf.write(self.h, txt, self.href)
        self.set_style("u", False)
        self.set_text_color(*self.font_color)

    def render_toc(self, pdf, outline):
        """
        Render one line per section of `outline`: indented section name, dots, page number.

        This method can be overridden by subclasses to customize the Table of Contents style.
        """
        pdf.ln()
        for section in outline:
            link = pdf.add_link(page=section.page_number)
            indent = " " * section.level * 2
            # An empty string when the indent & name take 60 characters or more:
            dots = "." * (60 - section.level * 2 - len(section.name))
            text = f"{indent} {section.name} {dots} {section.page_number}"
            pdf.multi_cell(
                w=pdf.epw,
                h=pdf.font_size,
                txt=text,
                new_x=XPos.LMARGIN,
                new_y=YPos.NEXT,
                link=link,
            )

    # Subclasses of _markupbase.ParserBase must implement this:
    def error(self, message):
        "Raise a RuntimeError carrying `message`"
        raise RuntimeError(message)


def leading_whitespace_repl(matchobj):
    """
    Replacement function for `re.sub()`: drop all the matched characters,
    except no-break spaces (U+00A0) and narrow no-break spaces (U+202F).
    """
    preserved_chars = ("\u00a0", "\u202f")
    return "".join(char for char in matchobj.group(0) if char in preserved_chars)


def whitespace_repl(matchobj):
    """
    Replacement function for `re.sub()`, expecting 2 groups in the match.

    Every character of group 1 becomes a regular space,
    unless it is a narrow no-break space (U+202F), that is kept as is.
    From group 2, only no-break spaces (U+00A0) and narrow no-break spaces are kept.
    """
    first_chars, following_chars = matchobj.group(1), matchobj.group(2)
    # allow 1 whitespace char, check for narrow no-break space
    head = "".join("\u202f" if char == "\u202f" else " " for char in first_chars)
    # remove following whitespace char unless nbsp
    tail = "".join(char for char in following_chars if char in ("\u00a0", "\u202f"))
    return head + tail


class HTMLMixin:
    "Deprecated class: it does nothing but emit a DeprecationWarning on instantiation"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        message = (
            "The HTMLMixin class is deprecated. "
            "Simply use the FPDF class as a replacement."
        )
        # stacklevel=2: the warning is attributed to the code instantiating the class
        warnings.warn(message, DeprecationWarning, stacklevel=2)

Functions

def color_as_decimal(color='#000000')
Expand source code
def color_as_decimal(color="#000000"):
    """
    Convert a color name or an hexadecimal color code into a `(r, g, b)` tuple.

    Args:
        color (str): a color name listed in `COLOR_DICT` (case-insensitive),
            or an hexadecimal code: "#rgb", "#rrggbb" or "#rrggbbaa"
            (the trailing alpha component is ignored).
            Surrounding whitespace is ignored and the leading "#" is optional.

    Returns:
        `None` if `color` is empty or blank,
        otherwise a tuple of 3 integers between 0 and 255.

    Raises:
        ValueError: if `color` is neither a known color name
            nor a well-formed hexadecimal code.
    """
    if not color:
        return None
    color = color.strip()
    if not color:
        return None

    if color.startswith("#"):
        hexcolor = color
    else:
        # Checks if color is a name and gets the hex value;
        # otherwise it is treated as an hex code missing its leading "#":
        hexcolor = COLOR_DICT.get(color.lower(), "#" + color)

    digits = hexcolor[1:]
    if len(digits) == 3:
        # Short form: every digit is doubled, "#1af" means "#11aaff"
        digits = "".join(digit * 2 for digit in digits)
    if len(digits) not in (6, 8) or any(
        digit not in "0123456789abcdefABCDEF" for digit in digits
    ):
        raise ValueError(f"Invalid color: {color!r}")
    # Only the first 3 pairs of digits are used: a 4th pair (alpha) is ignored
    return tuple(int(digits[i : i + 2], 16) for i in (0, 2, 4))
def leading_whitespace_repl(matchobj)
Expand source code
def leading_whitespace_repl(matchobj):
    """
    Regex replacement callback applied to a run of leading whitespace.

    Every character of the whole match is dropped, except no-break spaces (U+00A0)
    and narrow no-break spaces (U+202F), which are kept in their original order.
    """
    preserved = ("\u00a0", "\u202f")
    return "".join(char for char in matchobj.group(0) if char in preserved)
def px2mm(px)
Expand source code
def px2mm(px):
    "Multiply `px` by 25.4 / 72, i.e. treat it as a number of 1/72 inch units and return millimeters"
    return px * 25.4 / 72
def whitespace_repl(matchobj)
Expand source code
def whitespace_repl(matchobj):
    """
    Regex replacement callback that collapses a run of whitespace.

    Group 1 (the first whitespace character) is turned into a plain space,
    unless it is a narrow no-break space (U+202F), which is kept as is.
    Group 2 (the remaining whitespace) is dropped, except for no-break spaces (U+00A0)
    and narrow no-break spaces (U+202F), which are kept in their original order.
    """
    # NOTE(review): a U+00A0 in first position becomes a plain space, whereas it is
    # preserved anywhere else in the run - confirm that this asymmetry is intended.
    first = "".join(
        "\u202f" if char == "\u202f" else " " for char in matchobj.group(1)
    )
    rest = "".join(char for char in matchobj.group(2) if char in ("\u00a0", "\u202f"))
    return first + rest

Classes

class HTML2FPDF (pdf, image_map=None, li_tag_indent=5, dd_tag_indent=10, table_line_separators=False, ul_bullet_char='\x95', heading_sizes=None, warn_on_tags_not_matching=True, **_)

Render basic HTML to FPDF

Args

pdf : FPDF
an instance of FPDF
image_map : function
an optional one-argument function that maps <img> "src" to new image URLs
li_tag_indent : int
numeric indentation of <li> elements
dd_tag_indent : int
numeric indentation of <dd> elements
table_line_separators : bool
enable horizontal line separators in <table>
ul_bullet_char : str
bullet character for <ul> elements
    Expand source code
    class HTML2FPDF(HTMLParser):
        "Render basic HTML to FPDF"
    
        HTML_UNCLOSED_TAGS = ("br", "dd", "dt", "hr", "img", "li", "td", "tr")
    
        def __init__(
            self,
            pdf,
            image_map=None,
            li_tag_indent=5,
            dd_tag_indent=10,
            table_line_separators=False,
            ul_bullet_char=BULLET_WIN1252,
            heading_sizes=None,
            warn_on_tags_not_matching=True,
            **_,
        ):
            """
            Args:
                pdf (FPDF): an instance of `fpdf.FPDF`
                image_map (function): an optional one-argument function that maps <img> "src"
                    to new image URLs
                li_tag_indent (int): numeric indentation of <li> elements
                dd_tag_indent (int): numeric indentation of <dd> elements
                table_line_separators (bool): enable horizontal line separators in <table>
                ul_bullet_char (str): bullet character for <ul> elements
                heading_sizes (dict): optional mapping from heading tag names ("h1" to "h6")
                    to font sizes, overriding the values of `DEFAULT_HEADING_SIZES`
                warn_on_tags_not_matching (bool): log a warning when an HTML end tag
                    does not match the last start tag, or when an end tag is missing

            Any other keyword argument is accepted and ignored.
            """
            super().__init__()
            self.pdf = pdf
            # Without an image_map, <img> "src" values are used unchanged:
            self.image_map = image_map or (lambda src: src)
            self.li_tag_indent = li_tag_indent
            self.dd_tag_indent = dd_tag_indent
            self.table_line_separators = table_line_separators
            self.ul_bullet_char = ul_bullet_char
            # Must be defined before calling set_font() below, that reads it:
            self.style = dict(b=False, i=False, u=False)
            self.pre_formatted = False  # inside a <pre>
            self.follows_fmt_tag = False  # set when a formatting end tag was just met
            self.follows_trailing_space = False  # the last text written ended with whitespace
            self.href = ""  # link target of the current <a>, if any
            self.align = ""
            self.page_links = {}
            self.font_stack = []  # saved (face, size, color) tuples
            self.indent = 0  # nesting level of lists & blockquotes
            self.bullet = []  # one entry per nested list: a bullet string or a counter
            self.font_size = pdf.font_size_pt
            self.set_font(pdf.font_family or "times", size=self.font_size)
            self.font_color = 0, 0, 0  # initialize font color, r,g,b format
            self.table = None  # table attributes
            self.table_col_width = None  # column (header) widths
            self.table_col_index = None  # current column index
            self.td = None  # inside a <td>, attributes dict
            self.th = None  # inside a <th>, attributes dict
            self.tr = None  # inside a <tr>, attributes dict
            self.thead = None  # inside a <thead>, attributes dict
            self.tfoot = None  # inside a <tfoot>, attributes dict
            self.tr_index = None  # row index
            self.theader = None  # table header cells
            self.tfooter = None  # table footer cells
            self.theader_out = self.tfooter_out = False
            self.table_row_height = 0
            self.heading_level = None
            # Copying the defaults, so that the module-level dict is never mutated:
            self.heading_sizes = dict(**DEFAULT_HEADING_SIZES)
            self.heading_above = 0.2  # extra space above heading, relative to font size
            self.heading_below = 0.2  # extra space below heading, relative to font size
            if heading_sizes:
                self.heading_sizes.update(heading_sizes)
            self._only_imgs_in_td = False
            self.warn_on_tags_not_matching = warn_on_tags_not_matching
            self._tags_stack = []  # start tags met so far and not closed yet
    
        def width2unit(self, length):
            """
            Handle conversion of % measures into the measurement unit used

            Args:
                length (str): either an integer (e.g. "120"), returned as an `int`,
                    or a percentage, that may have a fractional part (e.g. "50%" or "33.3%")

            A percentage is relative to the page width minus the left & right margins,
            itself scaled by the table "width" attribute when this one is a percentage.

            Raises:
                ValueError: if `length` cannot be parsed as a number
            """
            if length.endswith("%"):
                total = self.pdf.w - self.pdf.r_margin - self.pdf.l_margin
                table_width = self.table["width"]
                if table_width.endswith("%"):
                    total *= float(table_width[:-1]) / 100
                # float() is used instead of int() so that "33.3%" is supported:
                return float(length[:-1]) * total / 100
            return int(length)
    
        def handle_data(self, data):
            """
            Render a chunk of text found between HTML tags.

            Depending on the current state, the text is inserted in a table cell,
            discarded (inside a table but outside of a cell), rendered as an aligned block,
            written untouched (inside a <pre>), or written after collapsing its whitespace.
            """
            ends_with_space = TRAILING_SPACE.search(data)
            if self.td is not None:  # drawing a table?
                self._insert_td(data)
                return
            if self.table is not None:
                # ignore anything else than td inside a table
                return
            if self.align:
                LOGGER.debug("align '%s'", data.replace("\n", "\\n"))
                self.pdf.multi_cell(
                    0,
                    self.h,
                    data,
                    border=0,
                    new_x=XPos.LMARGIN,
                    new_y=YPos.NEXT,
                    align=self.align[0].upper(),
                    link=self.href,
                )
                return
            if self.pre_formatted:  # for pre blocks
                self.pdf.write(self.h, data)
                return

            # don't trim leading whitespace if following a format tag with no trailing whitespace
            keep_leading_space = self.follows_fmt_tag and not self.follows_trailing_space
            if not keep_leading_space:
                data = LEADING_SPACE.sub(leading_whitespace_repl, data)
            data = WHITESPACE.sub(whitespace_repl, data)
            if not keep_leading_space:
                self.follows_trailing_space = ends_with_space
            elif ends_with_space:
                self.follows_trailing_space = True

            if self.href:
                self.put_link(data)
            else:
                if self.heading_level:
                    # headings are also registered as document sections:
                    self.pdf.start_section(data, self.heading_level - 1, strict=False)
                LOGGER.debug(
                    "write '%s' h=%d", WHITESPACE.sub(whitespace_repl, data), self.h
                )
                self.pdf.write(self.h, data)
            self.follows_fmt_tag = False
    
        def _insert_td(self, data=""):
            """
            Process `data` as the text of the current table cell.

            Inside a <thead> / <tfoot>, the cell parameters are stored in
            `self.theader` / `self.tfooter`; otherwise the cell is drawn right away,
            after a page break (with table footer & header repeated) if it does not fit.
            """
            self._only_imgs_in_td = False
            width = self._td_width()
            # The "height" attribute is divided by 4; without it (or when the result is zero),
            # the cell height is 1.3 times the current line height:
            height = int(self.td.get("height", 0)) // 4 or self.h * 1.30
            # The first cell processed sets the row height, and a cell is never
            # drawn smaller than the current row height:
            if not self.table_row_height:
                self.table_row_height = height
            elif self.table_row_height > height:
                height = self.table_row_height
            border = int(self.table.get("border", 0))
            if self.th:
                self.set_style("B", True)
                # header cells get at least a bottom border, and are centered by default:
                border = border or "B"
                align = self.td.get("align", "C")[0].upper()
            else:
                align = self.td.get("align", "L")[0].upper()
                # regular cells: no border if zero, else only left & right borders
                border = border and "LR"
            # The cell background color takes precedence over the row one:
            bgcolor = color_as_decimal(self.td.get("bgcolor", self.tr.get("bgcolor", "")))
            # parsing table header/footer (drawn later):
            if self.thead is not None:
                self.theader.append(
                    (
                        dict(
                            w=width,
                            h=height,
                            txt=data,
                            border=border,
                            new_x=XPos.RIGHT,
                            new_y=YPos.TOP,
                            align=align,
                        ),
                        bgcolor,
                    )
                )
            if self.tfoot is not None:
                self.tfooter.append(
                    (
                        dict(
                            w=width,
                            h=height,
                            txt=data,
                            border=border,
                            new_x=XPos.RIGHT,
                            new_y=YPos.TOP,
                            align=align,
                        ),
                        bgcolor,
                    )
                )
            # check if reached end of page, add table footer and header:
            if self.tfooter:
                # room must also be left for the footer row:
                height += self.tfooter[0][0]["h"]
            if self.pdf.y + height > self.pdf.page_break_trigger and not self.th:
                self.output_table_footer()
                self.pdf.add_page(same=True)
                self.theader_out = self.tfooter_out = False
            if self.tfoot is None and self.thead is None:
                if not self.theader_out:
                    self.output_table_header()
                self.box_shadow(width, height, bgcolor)
                # self.pdf.x may have shifted due to <img> inside <td>:
                self.pdf.set_x(self._td_x())
                LOGGER.debug(
                    "td cell x=%d width=%d height=%d border=%s align=%s '%s'",
                    self.pdf.x,
                    width,
                    height,
                    border,
                    align,
                    data.replace("\n", "\\n"),
                )
                self.pdf.cell(
                    width,
                    height,
                    data,
                    border=border,
                    align=align,
                    new_x=XPos.RIGHT,
                    new_y=YPos.TOP,
                )
    
        def _td_x(self):
            "Return the current table cell left side horizontal position"
            prev_cells_total_width = sum(
                self.width2unit(width)
                for width in self.table_col_width[: self.table_col_index]
            )
            return self.table_offset + prev_cells_total_width
    
        def _td_width(self):
            "Return the current table cell width"
            # pylint: disable=raise-missing-from
            if "width" in self.td:
                column_widths = [self.td["width"]]
            elif "colspan" in self.td:
                i = self.table_col_index
                colspan = int(self.td["colspan"])
                column_widths = self.table_col_width[i : i + colspan]
            else:
                try:
                    column_widths = [self.table_col_width[self.table_col_index]]
                except IndexError:
                    raise ValueError(
                        f"Width not specified for table column {self.table_col_index},"
                        " unable to continue"
                    )
            return sum(self.width2unit(width) for width in column_widths)
    
        def box_shadow(self, w, h, bgcolor):
            "Fill a rectangle of size `w` x `h` at the current position with `bgcolor`, if any"
            LOGGER.debug("box_shadow w=%d h=%d bgcolor=%s", w, h, bgcolor)
            if not bgcolor:
                return
            pdf = self.pdf
            previous_fill_color = pdf.fill_color
            pdf.set_fill_color(*bgcolor)
            pdf.rect(pdf.x, pdf.y, w, h, "F")
            # Restoring the fill color that was active before this call:
            pdf.set_fill_color(*previous_fill_color.colors)
    
        def output_table_header(self):
            "Render the stored table header cells (if any) in bold, then flag the header as rendered"
            header_cells = self.theader
            if header_cells:
                was_bold = self.style.get("b")
                self.pdf.set_x(self.table_offset)
                self.set_style("b", True)
                for cell_kwargs, bgcolor in header_cells:
                    self.box_shadow(cell_kwargs["w"], cell_kwargs["h"], bgcolor)
                    self.pdf.cell(**cell_kwargs)  # includes the border
                self.set_style("b", was_bold)
                # Moving below the header row, using the height of its first cell:
                self.pdf.ln(header_cells[0][0]["h"])
                self.pdf.set_x(self.table_offset)
            self.theader_out = True
    
        def output_table_footer(self):
            """
            Render the stored table footer cells (if any), then a separator line
            if the table has a "border" attribute, and flag the footer as rendered.
            """
            footer_cells = self.tfooter
            if footer_cells:
                initial_x = self.pdf.x
                self.pdf.set_x(self.table_offset)
                for cell_kwargs, bgcolor in footer_cells:
                    self.box_shadow(cell_kwargs["w"], cell_kwargs["h"], bgcolor)
                    self.pdf.cell(**cell_kwargs)
                # Moving below the footer row, using the height of its first cell:
                self.pdf.ln(footer_cells[0][0]["h"])
                self.pdf.set_x(initial_x)
            if self.table.get("border"):
                self.output_table_sep()
            self.tfooter_out = True
    
        def output_table_sep(self):
            "Draw an horizontal line from the current position, as wide as all the table columns"
            pdf = self.pdf
            left, y = pdf.x, pdf.y
            right = left + sum(map(self.width2unit, self.table_col_width))
            pdf.line(left, y, right, y)
    
        def handle_starttag(self, tag, attrs):
            """
            Update the rendering state, and possibly the PDF, when a start tag is met.

            Args:
                tag (str): name of the tag
                attrs (list): (name, value) pairs of the tag attributes
            """
            attrs = dict(attrs)
            LOGGER.debug("STARTTAG %s %s", tag, attrs)
            self._tags_stack.append(tag)
            # <dt>, <strong> & <em> are rendered as their <b> / <i> equivalent:
            if tag == "dt":
                self.pdf.ln(self.h)
                tag = "b"
            if tag == "dd":
                self.pdf.ln(self.h)
                self.pdf.write(self.h, " " * self.dd_tag_indent)
            if tag == "strong":
                tag = "b"
            if tag == "em":
                tag = "i"
            if tag in ("b", "i", "u"):
                self.set_style(tag, True)
            if tag == "a":
                # An <a> without "href" (e.g. a named anchor) must not raise a KeyError:
                self.href = attrs.get("href") or ""
            if tag == "br":
                self.pdf.ln(self.h)
            if tag == "p":
                self.pdf.ln(self.h)
                if "align" in attrs:
                    self.align = attrs.get("align")
                if "line-height" in attrs:
                    line_height = float(attrs.get("line-height"))
                    self.h = px2mm(self.font_size) * line_height
            if tag in self.heading_sizes:
                # The font state is saved there, and restored by handle_endtag():
                self.font_stack.append((self.font_face, self.font_size, self.font_color))
                self.heading_level = int(tag[1:])
                hsize = self.heading_sizes[tag]
                self.pdf.set_text_color(150, 0, 0)
                self.pdf.ln(self.h + self.heading_above * hsize)  # more space above heading
                self.set_font(size=hsize)
                if attrs:
                    self.align = attrs.get("align")
            if tag == "hr":
                self.pdf.add_page(same=True)
            if tag == "code":
                self.font_stack.append((self.font_face, self.font_size, self.font_color))
                self.set_font("courier", 11)
            if tag == "pre":
                self.font_stack.append((self.font_face, self.font_size, self.font_color))
                self.set_font("courier", 11)
                self.pre_formatted = True
            if tag == "blockquote":
                self.pdf.set_text_color(100, 0, 45)
                self.indent += 1
                self.pdf.ln(3)
            if tag == "ul":
                self.indent += 1
                self.bullet.append(self.ul_bullet_char)
            if tag == "ol":
                self.indent += 1
                # For ordered lists, the "bullet" is a counter:
                self.bullet.append(0)
            if tag == "li":
                self.pdf.ln(self.h + 2)
                self.pdf.set_text_color(190, 0, 0)
                bullet = self.bullet[self.indent - 1]
                if not isinstance(bullet, str):
                    # Ordered list: incrementing the counter, and rendering it as "N. "
                    bullet += 1
                    self.bullet[self.indent - 1] = bullet
                    bullet = f"{bullet}. "
                self.pdf.write(self.h, f"{' ' * self.li_tag_indent * self.indent}{bullet} ")
                self.set_text_color(*self.font_color)
            if tag == "font":
                # save previous font state:
                self.font_stack.append((self.font_face, self.font_size, self.font_color))
                if "color" in attrs:
                    color = color_as_decimal(attrs["color"])
                    self.font_color = color
                if "face" in attrs:
                    face = attrs.get("face").lower()
                    try:
                        self.pdf.set_font(face)
                        self.font_face = face
                    except RuntimeError:
                        pass  # font not found, ignore
                if "size" in attrs:
                    self.font_size = int(attrs.get("size"))
                self.set_font()
                self.set_text_color(*self.font_color)
            if tag == "table":
                self.table = {k.lower(): v for k, v in attrs.items()}
                if "width" not in self.table:
                    self.table["width"] = "100%"
                # NOTE(review): table_offset is only assigned there for a width given as
                # a percentage, whereas it is read when processing rows - confirm how
                # tables with an absolute width are expected to be handled.
                if self.table["width"][-1] == "%":
                    w = self.pdf.w - self.pdf.r_margin - self.pdf.l_margin
                    w *= int(self.table["width"][:-1]) / 100
                    # The table is horizontally centered on the page:
                    self.table_offset = (self.pdf.w - w) / 2
                self.table_col_width = []
                self.theader_out = self.tfooter_out = False
                self.theader = []
                self.tfooter = []
                self.thead = None
                self.tfoot = None
                self.pdf.ln()
            if tag == "tr":
                self.tr_index = 0 if self.tr_index is None else (self.tr_index + 1)
                self.tr = {k.lower(): v for k, v in attrs.items()}
                self.table_col_index = 0
                self.table_row_height = 0
                self.pdf.set_x(self.table_offset)
                # Adding an horizontal line separator between rows:
                if self.table_line_separators and self.tr_index > 0:
                    self.output_table_sep()
            if tag == "td":
                self.td = {k.lower(): v for k, v in attrs.items()}
                # The first cell met in a column defines the width of that column:
                if "width" in self.td and self.table_col_index >= len(self.table_col_width):
                    assert self.table_col_index == len(
                        self.table_col_width
                    ), f"table_col_index={self.table_col_index} #table_col_width={len(self.table_col_width)}"
                    self.table_col_width.append(self.td["width"])
                if attrs:
                    self.align = attrs.get("align")
                self._only_imgs_in_td = False
            if tag == "th":
                self.td = {k.lower(): v for k, v in attrs.items()}
                self.th = True
                if "width" in self.td and self.table_col_index >= len(self.table_col_width):
                    assert self.table_col_index == len(
                        self.table_col_width
                    ), f"table_col_index={self.table_col_index} #table_col_width={len(self.table_col_width)}"
                    self.table_col_width.append(self.td["width"])
            if tag == "thead":
                self.thead = {}
            if tag == "tfoot":
                self.tfoot = {}
            if tag == "img" and "src" in attrs:
                width = px2mm(int(attrs.get("width", 0)))
                height = px2mm(int(attrs.get("height", 0)))
                if self.pdf.y + height > self.pdf.page_break_trigger:
                    self.pdf.add_page(same=True)
                y = self.pdf.get_y()
                if self.table_col_index is not None:
                    self._only_imgs_in_td = True
                    # <img> in a <td>: its width must not exceed the cell width:
                    td_width = self._td_width()
                    if not width or width > td_width:
                        if width:  # Preserving image aspect ratio:
                            height *= td_width / width
                        width = td_width
                    x = self._td_x()
                    if self.align and self.align[0].upper() == "C":
                        x += (td_width - width) / 2
                else:
                    x = self.pdf.get_x()
                    if self.align and self.align[0].upper() == "C":
                        x = self.pdf.w / 2 - width / 2
                LOGGER.debug(
                    'image "%s" x=%d y=%d width=%d height=%d',
                    attrs["src"],
                    x,
                    y,
                    width,
                    height,
                )
                image_info = self.pdf.image(
                    self.image_map(attrs["src"]), x, y, width, height, link=self.href
                )
                # The dimensions actually rendered are used from there:
                width = image_info["rendered_width"]
                height = image_info["rendered_height"]
                self.pdf.set_x(x + width)
                if self.table_col_index is not None:
                    # <img> in a <td>: we grow the cell height according to the image height:
                    if height > self.table_row_height:
                        self.table_row_height = height
                else:
                    self.pdf.set_y(y + height)
            if tag == "center":
                self.align = "Center"
            if tag == "toc":
                self.pdf.insert_toc_placeholder(
                    self.render_toc, pages=int(attrs.get("pages", 1))
                )
            if tag == "sup":
                self.pdf.char_vpos = "SUP"
            if tag == "sub":
                self.pdf.char_vpos = "SUB"
    
        def handle_endtag(self, tag):
            """
            Update the rendering state, and possibly the PDF, when an end tag is met.

            The stack of open tags is checked first, and a warning is logged in case of
            mismatch if `warn_on_tags_not_matching` is set; then the tag itself is processed.
            """
            LOGGER.debug("ENDTAG %s", tag)
            # Tags allowed to stay unclosed are silently removed from the top of the stack:
            while (
                self._tags_stack
                and tag != self._tags_stack[-1]
                and self._tags_stack[-1] in self.HTML_UNCLOSED_TAGS
            ):
                self._tags_stack.pop()
            if not self._tags_stack:
                if self.warn_on_tags_not_matching:
                    LOGGER.warning(
                        "Unexpected HTML end tag </%s>, start tag may be missing?", tag
                    )
            elif tag == self._tags_stack[-1]:
                self._tags_stack.pop()
            elif self.warn_on_tags_not_matching:
                LOGGER.warning(
                    "Unexpected HTML end tag </%s>, start tag was <%s>",
                    tag,
                    self._tags_stack[-1],
                )
            if tag in self.heading_sizes:
                self.heading_level = None
                # Restoring the font state saved by handle_starttag():
                face, size, color = self.font_stack.pop()
                # more space below heading:
                self.pdf.ln(self.h + self.h * self.heading_below)
                self.set_font(face, size)
                self.set_text_color(*color)
                self.align = None
            if tag == "code":
                face, size, color = self.font_stack.pop()
                self.set_font(face, size)
                self.set_text_color(*color)
            if tag == "pre":
                face, size, color = self.font_stack.pop()
                self.set_font(face, size)
                self.set_text_color(*color)
                self.pre_formatted = False
            if tag == "blockquote":
                self.set_text_color(*self.font_color)
                self.indent -= 1
                self.pdf.ln(3)
            # </strong>, </dt> & </em> are processed as </b> / </i> by the test that follows:
            if tag in ("strong", "dt"):
                tag = "b"
            if tag == "em":
                tag = "i"
            if tag in ("b", "i", "u"):
                self.set_style(tag, False)
                self.follows_fmt_tag = True
            if tag == "a":
                self.href = ""
            if tag == "p":
                self.pdf.ln(self.h)
                self.align = ""
                # Resetting the line height, that may have been altered by a "line-height" attribute:
                self.h = px2mm(self.font_size)
            if tag in ("ul", "ol"):
                self.indent -= 1
                self.bullet.pop()
            if tag == "table":
                if not self.tfooter_out:
                    self.output_table_footer()
                self.table = None
                self.th = False
                self.theader = None
                self.tfooter = None
                self.pdf.ln(self.h)
                self.tr_index = None
            if tag == "thead":
                self.thead = None
                self.tr_index = None
            if tag == "tfoot":
                self.tfoot = None
                self.tr_index = None
            if tag == "tbody":
                self.tbody = None
                self.tr_index = None
            if tag == "tr":
                # No line break after a footer row:
                if self.tfoot is None:
                    self.pdf.ln(self.table_row_height)
                self.table_col_index = None
                self.tr = None
            if tag in ("td", "th"):
                if self.th:
                    LOGGER.debug("revert style")
                    self.set_style("b", False)  # revert style
                elif self._only_imgs_in_td:
                    # The cell only contained image(s): it is drawn without any text
                    self._insert_td()
                # Moving to the next column, skipping the ones spanned by this cell:
                self.table_col_index += int(self.td.get("colspan", "1"))
                self.td = None
                self.th = False
            if tag == "font":
                # recover last font state
                face, size, color = self.font_stack.pop()
                self.font_color = color
                self.set_font(face, size)
                self.set_text_color(*self.font_color)
            if tag == "center":
                self.align = None
            if tag == "sup":
                self.pdf.char_vpos = "LINE"
                self.follows_fmt_tag = True
            if tag == "sub":
                self.pdf.char_vpos = "LINE"
                self.follows_fmt_tag = True
    
        def feed(self, data):
            "Parse `data`, then log a warning if a start tag is left without its end tag"
            super().feed(data)
            pending_tags = self._tags_stack
            # Tags allowed to stay unclosed are silently discarded:
            while pending_tags and pending_tags[-1] in self.HTML_UNCLOSED_TAGS:
                pending_tags.pop()
            if pending_tags and self.warn_on_tags_not_matching:
                LOGGER.warning("Missing HTML end tag for <%s>", pending_tags[-1])
    
        def set_font(self, face=None, size=None):
            """
            Record the font face and / or size provided, and apply the resulting font,
            combined with the current b/i/u style, to the PDF when it differs from its state.
            """
            if face:
                self.font_face = face
            if size:
                self.font_size = size
                self.h = px2mm(size)
                LOGGER.debug("H %s", self.h)
            enabled = [letter for letter in ("b", "i", "u") if self.style.get(letter)]
            style = "".join(enabled).upper()
            pdf = self.pdf
            if (self.font_face, style) != (pdf.font_family, pdf.font_style):
                pdf.set_font(self.font_face, style, self.font_size)
            # NOTE(review): __init__ reads the size from pdf.font_size_pt, whereas the
            # comparison below uses pdf.font_size - confirm both are in the same unit.
            if self.font_size != pdf.font_size:
                pdf.set_font_size(self.font_size)
    
        def set_style(self, tag=None, enable=False):
            "Enable or disable the style of `tag` (b, i or u), then select the corresponding font style"
            if tag:
                self.style[tag.lower()] = enable
            # Only the styles currently enabled are kept, always in the "biu" order:
            style = "".join(filter(self.style.get, ("b", "i", "u")))
            LOGGER.debug("SET_FONT_STYLE %s", style)
            self.pdf.set_font(style=style)
    
        def set_text_color(self, r=None, g=0, b=0):
            "Forward the `r`, `g`, `b` arguments to `self.pdf.set_text_color`"
            self.pdf.set_text_color(r, g, b)
    
        def put_link(self, txt):
            "Write `txt` underlined and in blue, as a link targeting `self.href`"
            # Put a hyperlink
            self.set_text_color(0, 0, 255)
            self.set_style("u", True)
            self.pdf.write(self.h, txt, self.href)
            # Restoring the style: underline is turned off, and the current font color is set back
            self.set_style("u", False)
            self.set_text_color(*self.font_color)
    
        def render_toc(self, pdf, outline):
            """
            Render the Table of Contents: one line per section of `outline`, made of
            the section name indented according to its level, dots, and its page number,
            linking to that page.
            This method can be overridden by subclasses to customize the Table of Contents style.
            """
            pdf.ln()
            for section in outline:
                link = pdf.add_link(page=section.page_number)
                indent = " " * section.level * 2
                # A negative count simply produces no dot at all:
                dots = "." * (60 - section.level * 2 - len(section.name))
                text = f"{indent} {section.name} {dots} {section.page_number}"
                pdf.multi_cell(
                    w=pdf.epw,
                    h=pdf.font_size,
                    txt=text,
                    new_x=XPos.LMARGIN,
                    new_y=YPos.NEXT,
                    link=link,
                )
    
        # Subclasses of _markupbase.ParserBase must implement this:
        def error(self, message):
            "Raise a `RuntimeError` built from `message`"
            raise RuntimeError(message)

    Ancestors

    • html.parser.HTMLParser
    • _markupbase.ParserBase

    Class variables

    var HTML_UNCLOSED_TAGS

    Methods

    def box_shadow(self, w, h, bgcolor)
    Expand source code
    def box_shadow(self, w, h, bgcolor):
        "Fill a rectangle of size `w` x `h` at the current position with `bgcolor`, if any"
        LOGGER.debug("box_shadow w=%d h=%d bgcolor=%s", w, h, bgcolor)
        if not bgcolor:
            return
        pdf = self.pdf
        previous_fill_color = pdf.fill_color
        pdf.set_fill_color(*bgcolor)
        pdf.rect(pdf.x, pdf.y, w, h, "F")
        # Restoring the fill color that was active before this call:
        pdf.set_fill_color(*previous_fill_color.colors)
    def error(self, message)
    Expand source code
    def error(self, message):
        "Raise a `RuntimeError` built from `message`"
        raise RuntimeError(message)
    def feed(self, data)

    Feed data to the parser.

    Call this as often as you want, with as little or as much text as you want (may include '\n').

    Expand source code
    def feed(self, data):
        "Parse `data`, then log a warning if a start tag is left without its end tag"
        super().feed(data)
        pending_tags = self._tags_stack
        # Tags allowed to stay unclosed are silently discarded:
        while pending_tags and pending_tags[-1] in self.HTML_UNCLOSED_TAGS:
            pending_tags.pop()
        if pending_tags and self.warn_on_tags_not_matching:
            LOGGER.warning("Missing HTML end tag for <%s>", pending_tags[-1])
    def handle_data(self, data)
    Expand source code
    def handle_data(self, data):
        """
        Render a chunk of text found between HTML tags.

        Depending on the current state, the text is inserted in a table cell,
        discarded (inside a table but outside of a cell), rendered as an aligned block,
        written untouched (inside a <pre>), or written after collapsing its whitespace.
        """
        ends_with_space = TRAILING_SPACE.search(data)
        if self.td is not None:  # drawing a table?
            self._insert_td(data)
            return
        if self.table is not None:
            # ignore anything else than td inside a table
            return
        if self.align:
            LOGGER.debug("align '%s'", data.replace("\n", "\\n"))
            self.pdf.multi_cell(
                0,
                self.h,
                data,
                border=0,
                new_x=XPos.LMARGIN,
                new_y=YPos.NEXT,
                align=self.align[0].upper(),
                link=self.href,
            )
            return
        if self.pre_formatted:  # for pre blocks
            self.pdf.write(self.h, data)
            return

        # don't trim leading whitespace if following a format tag with no trailing whitespace
        keep_leading_space = self.follows_fmt_tag and not self.follows_trailing_space
        if not keep_leading_space:
            data = LEADING_SPACE.sub(leading_whitespace_repl, data)
        data = WHITESPACE.sub(whitespace_repl, data)
        if not keep_leading_space:
            self.follows_trailing_space = ends_with_space
        elif ends_with_space:
            self.follows_trailing_space = True

        if self.href:
            self.put_link(data)
        else:
            if self.heading_level:
                # headings are also registered as document sections:
                self.pdf.start_section(data, self.heading_level - 1, strict=False)
            LOGGER.debug(
                "write '%s' h=%d", WHITESPACE.sub(whitespace_repl, data), self.h
            )
            self.pdf.write(self.h, data)
        self.follows_fmt_tag = False
    def handle_endtag(self, tag)
    Expand source code
    def handle_endtag(self, tag):
        """Undo the rendering state that the matching start tag established.

        First reconciles ``tag`` with ``self._tags_stack`` (optionally logging a
        warning on mismatch), then runs a sequence of independent ``if`` checks,
        one per supported tag. The order matters: ``strong``/``dt`` and ``em``
        are rewritten to ``b``/``i`` before the ``("b", "i", "u")`` check.
        """
        LOGGER.debug("ENDTAG %s", tag)
        # Drop stack entries listed in HTML_UNCLOSED_TAGS that sit above the tag
        # being closed (presumably tags that may omit their end tag - confirm
        # against the HTML_UNCLOSED_TAGS definition).
        while (
            self._tags_stack
            and tag != self._tags_stack[-1]
            and self._tags_stack[-1] in self.HTML_UNCLOSED_TAGS
        ):
            self._tags_stack.pop()
        if not self._tags_stack:
            # End tag with nothing left open.
            if self.warn_on_tags_not_matching:
                LOGGER.warning(
                    "Unexpected HTML end tag </%s>, start tag may be missing?", tag
                )
        elif tag == self._tags_stack[-1]:
            self._tags_stack.pop()
        elif self.warn_on_tags_not_matching:
            # Mismatch: the stack is left untouched, processing continues below.
            LOGGER.warning(
                "Unexpected HTML end tag </%s>, start tag was <%s>",
                tag,
                self._tags_stack[-1],
            )
        if tag in self.heading_sizes:
            # End of a heading: restore the font state saved on font_stack.
            self.heading_level = None
            face, size, color = self.font_stack.pop()
            # more space below heading:
            self.pdf.ln(self.h + self.h * self.heading_below)
            self.set_font(face, size)
            self.set_text_color(*color)
            self.align = None
        if tag == "code":
            face, size, color = self.font_stack.pop()
            self.set_font(face, size)
            self.set_text_color(*color)
        if tag == "pre":
            face, size, color = self.font_stack.pop()
            self.set_font(face, size)
            self.set_text_color(*color)
            self.pre_formatted = False
        if tag == "blockquote":
            self.set_text_color(*self.font_color)
            self.indent -= 1
            self.pdf.ln(3)
        # Map semantic tags onto the style letters handled just below.
        if tag in ("strong", "dt"):
            tag = "b"
        if tag == "em":
            tag = "i"
        if tag in ("b", "i", "u"):
            self.set_style(tag, False)
            # Read by handle_data to decide whether leading whitespace is kept.
            self.follows_fmt_tag = True
        if tag == "a":
            self.href = ""
        if tag == "p":
            self.pdf.ln(self.h)
            self.align = ""
            # Reset the line height to the one derived from the font size.
            self.h = px2mm(self.font_size)
        if tag in ("ul", "ol"):
            self.indent -= 1
            self.bullet.pop()
        if tag == "table":
            # Emit the footer if it has not been output yet, then clear table state.
            if not self.tfooter_out:
                self.output_table_footer()
            self.table = None
            self.th = False
            self.theader = None
            self.tfooter = None
            self.pdf.ln(self.h)
            self.tr_index = None
        if tag == "thead":
            self.thead = None
            self.tr_index = None
        if tag == "tfoot":
            self.tfoot = None
            self.tr_index = None
        if tag == "tbody":
            self.tbody = None
            self.tr_index = None
        if tag == "tr":
            # Rows inside <tfoot> do not advance the cursor here.
            if self.tfoot is None:
                self.pdf.ln(self.table_row_height)
            self.table_col_index = None
            self.tr = None
        if tag in ("td", "th"):
            if self.th:
                LOGGER.debug("revert style")
                self.set_style("b", False)  # revert style
            elif self._only_imgs_in_td:
                # Cell flagged as image-only by handle_starttag: _insert_td is
                # called without text.
                self._insert_td()
            # Advance the column index by the cell's colspan (default 1).
            self.table_col_index += int(self.td.get("colspan", "1"))
            self.td = None
            self.th = False
        if tag == "font":
            # recover last font state
            face, size, color = self.font_stack.pop()
            self.font_color = color
            self.set_font(face, size)
            self.set_text_color(*self.font_color)
        if tag == "center":
            self.align = None
        if tag == "sup":
            # Reset the vertical character position to "LINE".
            self.pdf.char_vpos = "LINE"
            self.follows_fmt_tag = True
        if tag == "sub":
            self.pdf.char_vpos = "LINE"
            self.follows_fmt_tag = True
    def handle_starttag(self, tag, attrs)
    Expand source code
    def handle_starttag(self, tag, attrs):
        """Update rendering state and emit output for an opening HTML tag.

        Args:
            tag: tag name as passed by ``HTMLParser``.
            attrs: list of ``(name, value)`` pairs; converted to a dict here.

        The body is a sequence of independent ``if`` checks, one per supported
        tag. ``dt``/``strong`` and ``em`` are rewritten to ``b``/``i`` before
        the ``("b", "i", "u")`` check, so the order of those checks matters.
        """
        attrs = dict(attrs)
        LOGGER.debug("STARTTAG %s %s", tag, attrs)
        self._tags_stack.append(tag)
        if tag == "dt":
            self.pdf.ln(self.h)
            tag = "b"
        if tag == "dd":
            self.pdf.ln(self.h)
            self.pdf.write(self.h, " " * self.dd_tag_indent)
        if tag == "strong":
            tag = "b"
        if tag == "em":
            tag = "i"
        if tag in ("b", "i", "u"):
            self.set_style(tag, True)
        if tag == "a":
            # An <a> without href (e.g. a named anchor) must not raise: fall
            # back to "", the same "no link" value handle_endtag resets to.
            self.href = attrs.get("href", "")
        if tag == "br":
            self.pdf.ln(self.h)
        if tag == "p":
            self.pdf.ln(self.h)
            if "align" in attrs:
                self.align = attrs.get("align")
            if "line-height" in attrs:
                # line-height is a multiplier applied to the font-derived height.
                line_height = float(attrs.get("line-height"))
                self.h = px2mm(self.font_size) * line_height
        if tag in self.heading_sizes:
            # Save the current font state; handle_endtag pops it back.
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            self.heading_level = int(tag[1:])
            hsize = self.heading_sizes[tag]
            self.pdf.set_text_color(150, 0, 0)
            self.pdf.ln(self.h + self.heading_above * hsize)  # more space above heading
            self.set_font(size=hsize)
            if attrs:
                self.align = attrs.get("align")
        if tag == "hr":
            # <hr> starts a new page.
            self.pdf.add_page(same=True)
        if tag == "code":
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            self.set_font("courier", 11)
        if tag == "pre":
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            self.set_font("courier", 11)
            self.pre_formatted = True
        if tag == "blockquote":
            self.pdf.set_text_color(100, 0, 45)
            self.indent += 1
            self.pdf.ln(3)
        if tag == "ul":
            self.indent += 1
            self.bullet.append(self.ul_bullet_char)
        if tag == "ol":
            self.indent += 1
            # Ordered lists keep an integer counter instead of a bullet string.
            self.bullet.append(0)
        if tag == "li":
            self.pdf.ln(self.h + 2)
            self.pdf.set_text_color(190, 0, 0)
            bullet = self.bullet[self.indent - 1]
            if not isinstance(bullet, str):
                # Numbered item: increment and store the counter for this level.
                bullet += 1
                self.bullet[self.indent - 1] = bullet
                bullet = f"{bullet}. "
            self.pdf.write(self.h, f"{' ' * self.li_tag_indent * self.indent}{bullet} ")
            self.set_text_color(*self.font_color)
        if tag == "font":
            # save previous font state:
            self.font_stack.append((self.font_face, self.font_size, self.font_color))
            if "color" in attrs:
                color = color_as_decimal(attrs["color"])
                self.font_color = color
            if "face" in attrs:
                face = attrs.get("face").lower()
                try:
                    self.pdf.set_font(face)
                    self.font_face = face
                except RuntimeError:
                    pass  # font not found, ignore
            if "size" in attrs:
                self.font_size = int(attrs.get("size"))
            self.set_font()
            self.set_text_color(*self.font_color)
        if tag == "table":
            self.table = {k.lower(): v for k, v in attrs.items()}
            if "width" not in self.table:
                self.table["width"] = "100%"
            if self.table["width"][-1] == "%":
                # Percentage width: the table is centred on the page.
                w = self.pdf.w - self.pdf.r_margin - self.pdf.l_margin
                w *= int(self.table["width"][:-1]) / 100
                self.table_offset = (self.pdf.w - w) / 2
            self.table_col_width = []
            self.theader_out = self.tfooter_out = False
            self.theader = []
            self.tfooter = []
            self.thead = None
            self.tfoot = None
            self.pdf.ln()
        if tag == "tr":
            self.tr_index = 0 if self.tr_index is None else (self.tr_index + 1)
            self.tr = {k.lower(): v for k, v in attrs.items()}
            self.table_col_index = 0
            self.table_row_height = 0
            self.pdf.set_x(self.table_offset)
            # Adding an horizontal line separator between rows:
            if self.table_line_separators and self.tr_index > 0:
                self.output_table_sep()
        if tag == "td":
            self.td = {k.lower(): v for k, v in attrs.items()}
            # Record the width of a column the first time it is seen.
            if "width" in self.td and self.table_col_index >= len(self.table_col_width):
                assert self.table_col_index == len(
                    self.table_col_width
                ), f"table_col_index={self.table_col_index} #table_col_width={len(self.table_col_width)}"
                self.table_col_width.append(self.td["width"])
            if attrs:
                self.align = attrs.get("align")
            self._only_imgs_in_td = False
        if tag == "th":
            self.td = {k.lower(): v for k, v in attrs.items()}
            self.th = True
            if "width" in self.td and self.table_col_index >= len(self.table_col_width):
                assert self.table_col_index == len(
                    self.table_col_width
                ), f"table_col_index={self.table_col_index} #table_col_width={len(self.table_col_width)}"
                self.table_col_width.append(self.td["width"])
        if tag == "thead":
            self.thead = {}
        if tag == "tfoot":
            self.tfoot = {}
        if tag == "img" and "src" in attrs:
            width = px2mm(int(attrs.get("width", 0)))
            height = px2mm(int(attrs.get("height", 0)))
            if self.pdf.y + height > self.pdf.page_break_trigger:
                self.pdf.add_page(same=True)
            y = self.pdf.get_y()
            if self.table_col_index is not None:
                self._only_imgs_in_td = True
                # <img> in a <td>: its width must not exceed the cell width:
                td_width = self._td_width()
                if not width or width > td_width:
                    if width:  # Preserving image aspect ratio:
                        height *= td_width / width
                    width = td_width
                x = self._td_x()
                if self.align and self.align[0].upper() == "C":
                    x += (td_width - width) / 2
            else:
                x = self.pdf.get_x()
                if self.align and self.align[0].upper() == "C":
                    x = self.pdf.w / 2 - width / 2
            LOGGER.debug(
                'image "%s" x=%d y=%d width=%d height=%d',
                attrs["src"],
                x,
                y,
                width,
                height,
            )
            image_info = self.pdf.image(
                self.image_map(attrs["src"]), x, y, width, height, link=self.href
            )
            # Use the dimensions actually rendered for cursor bookkeeping.
            width = image_info["rendered_width"]
            height = image_info["rendered_height"]
            self.pdf.set_x(x + width)
            if self.table_col_index is not None:
                # <img> in a <td>: we grow the cell height according to the image height:
                if height > self.table_row_height:
                    self.table_row_height = height
            else:
                self.pdf.set_y(y + height)
        if tag == "center":
            self.align = "Center"
        if tag == "toc":
            self.pdf.insert_toc_placeholder(
                self.render_toc, pages=int(attrs.get("pages", 1))
            )
        if tag == "sup":
            self.pdf.char_vpos = "SUP"
        if tag == "sub":
            self.pdf.char_vpos = "SUB"
    Expand source code
    def output_table_footer(self):
        """Draw the collected footer cells, then the closing border line if any.

        Always marks the footer as emitted by setting ``self.tfooter_out``.
        """
        footer_cells = self.tfooter
        if footer_cells:
            pdf = self.pdf
            # Remember the cursor abscissa so it can be restored afterwards.
            saved_x = pdf.x
            pdf.set_x(self.table_offset)
            for cell_kwargs, background in footer_cells:
                self.box_shadow(cell_kwargs["w"], cell_kwargs["h"], background)
                pdf.cell(**cell_kwargs)
            # Advance by the height of the first footer cell.
            first_cell_kwargs = footer_cells[0][0]
            pdf.ln(first_cell_kwargs["h"])
            pdf.set_x(saved_x)
        has_border = self.table.get("border")
        if has_border:
            self.output_table_sep()
        self.tfooter_out = True
    def output_table_header(self)
    Expand source code
    def output_table_header(self):
        """Draw the collected header cells in bold and mark the header as emitted.

        The bold flag in effect before the call is restored once the cells have
        been drawn.
        """
        header_cells = self.theader
        if header_cells:
            previous_bold = self.style.get("b")
            self.pdf.set_x(self.table_offset)
            self.set_style("b", True)
            for cell_kwargs, background in header_cells:
                self.box_shadow(cell_kwargs["w"], cell_kwargs["h"], background)
                # The cell keyword arguments include the border setting.
                self.pdf.cell(**cell_kwargs)
            self.set_style("b", previous_bold)
            # Move below the header row (height of its first cell) and back to
            # the left edge of the table.
            first_cell_kwargs = header_cells[0][0]
            self.pdf.ln(first_cell_kwargs["h"])
            self.pdf.set_x(self.table_offset)
        self.theader_out = True
    def output_table_sep(self)
    Expand source code
    def output_table_sep(self):
        """Draw a horizontal line at the cursor, as wide as all table columns."""
        pdf = self.pdf
        left = pdf.x
        top = pdf.y
        # Total width = sum of every recorded column width, converted to units.
        span = 0
        for column_width in self.table_col_width:
            span += self.width2unit(column_width)
        pdf.line(left, top, left + span, top)
    Expand source code
    def put_link(self, txt):
        """Write ``txt`` as a blue, underlined hyperlink pointing to ``self.href``.

        The underline flag that was active before the call is restored
        afterwards, so a link nested in underlined text (``<u><a>..</a></u>``)
        does not switch the underline off for the text that follows. The text
        colour is reset to ``self.font_color``.
        """
        # Remember the underline state so it can be restored after the link.
        was_underlined = bool(self.style.get("u"))
        self.set_text_color(0, 0, 255)
        self.set_style("u", True)
        self.pdf.write(self.h, txt, self.href)
        self.set_style("u", was_underlined)
        self.set_text_color(*self.font_color)
    def render_toc(self, pdf, outline)

    This method can be overridden by subclasses to customize the Table of Contents style.

    Expand source code
    def render_toc(self, pdf, outline):
        """Render the Table of Contents; subclasses can override this method to customize its style.

        Each outline entry becomes one line: indentation based on its level,
        its name, a dotted leader and its page number, linked to that page.
        """
        pdf.ln()
        for entry in outline:
            target = pdf.add_link(page=entry.page_number)
            indent = " " * entry.level * 2
            # Leader length shrinks with indentation and title length.
            leader = "." * (60 - entry.level*2 - len(entry.name))
            line = f"{indent} {entry.name} {leader} {entry.page_number}"
            pdf.multi_cell(
                w=pdf.epw,
                h=pdf.font_size,
                txt=line,
                new_x=XPos.LMARGIN,
                new_y=YPos.NEXT,
                link=target,
            )
    def set_font(self, face=None, size=None)
    Expand source code
    def set_font(self, face=None, size=None):
        """Update the tracked font face/size and sync them to the PDF object.

        Args:
            face: new font face; the current one is kept when falsy.
            size: new font size; the current one is kept when falsy. A new size
                also recomputes the line height ``self.h`` via ``px2mm``.
        """
        if face:
            self.font_face = face
        if size:
            self.font_size = size
            self.h = px2mm(size)
            LOGGER.debug("H %s", self.h)
        # Build the style string ("B", "I", "U" combination) from active flags.
        active_flags = [flag for flag in ("b", "i", "u") if self.style.get(flag)]
        style = "".join(active_flags).upper()
        pdf = self.pdf
        # Only touch the PDF font when the face or the style actually differs.
        if self.font_face != pdf.font_family or style != pdf.font_style:
            pdf.set_font(self.font_face, style, self.font_size)
        if self.font_size != pdf.font_size:
            pdf.set_font_size(self.font_size)
    def set_style(self, tag=None, enable=False)
    Expand source code
    def set_style(self, tag=None, enable=False):
        """Record a style flag and select the matching font style on the PDF.

        Args:
            tag: style letter (``b``, ``i`` or ``u``, case-insensitive); when
                falsy no flag is changed and the current flags are re-applied.
            enable: value stored for that flag.
        """
        if tag:
            self.style[tag.lower()] = enable
        # Concatenate the letters of every active flag, in b-i-u order.
        active_letters = [letter for letter in "biu" if self.style.get(letter)]
        style = "".join(active_letters)
        LOGGER.debug("SET_FONT_STYLE %s", style)
        self.pdf.set_font(style=style)
    def set_text_color(self, r=None, g=0, b=0)
    Expand source code
    def set_text_color(self, r=None, g=0, b=0):
        """Forward the given colour components unchanged to ``self.pdf.set_text_color``."""
        self.pdf.set_text_color(r, g, b)
    def width2unit(self, length)

    Handle conversion of % measures into the measurement unit used

    Expand source code
    def width2unit(self, length):
        """Handle conversion of % measures into the measurement unit used.

        Args:
            length: width as a string, either a plain number (``"30"``) or a
                percentage (``"50%"``). Fractional values such as ``"33.3%"``
                are accepted.

        Returns:
            The plain number itself, or for a percentage the matching share of
            the table width. The table width is the printable page width,
            scaled by the table's own ``width`` when that is a percentage too.

        Raises:
            ValueError: if the numeric part cannot be parsed.
        """

        def _to_number(text):
            # Keep integers as int so existing results are unchanged; fall back
            # to float for fractional widths.
            try:
                return int(text)
            except ValueError:
                return float(text)

        if length.endswith("%"):
            total = self.pdf.w - self.pdf.r_margin - self.pdf.l_margin
            table_width = self.table["width"]
            if table_width.endswith("%"):
                total *= _to_number(table_width[:-1]) / 100
            return _to_number(length[:-1]) * total / 100
        return _to_number(length)
    class HTMLMixin (*args, **kwargs)
    Expand source code
    class HTMLMixin:
        """Deprecated mixin: instantiating it only emits a ``DeprecationWarning``."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            message = (
                "The HTMLMixin class is deprecated. "
                "Simply use the FPDF class as a replacement."
            )
            # stacklevel=2 attributes the warning to the code instantiating the class.
            warnings.warn(message, category=DeprecationWarning, stacklevel=2)