# UnisMindMap/mineru/utils/draw_bbox.py

import json
from io import BytesIO

from loguru import logger
from pypdf import PdfReader, PdfWriter, PageObject
from reportlab.pdfgen import canvas

from .enum_class import BlockType, ContentType, SplitFlag


def cal_canvas_rect(page, bbox):
    """
    Translate a layout bounding box into ``rect`` arguments for the overlay canvas.

    The page's ``/Rotate`` entry (taken modulo 360) selects the mapping:

    * 90:  x0 and y0 are exchanged and width/height are swapped.
    * 180: x is mirrored against the page width; y0 is passed through as-is.
    * 270: x is ``page_width - y1``, y is ``page_height - x1`` and
      width/height are swapped.
    * any other value (including 0): only y is flipped (``page_height - y1``).

    Args:
        page: A pypdf page-like object exposing ``cropbox`` and ``get``.
        bbox: [x0, y0, x1, y1] representing the bounding box coordinates.

    Returns:
        rect: [x0, y0, width, height] representing the rectangle coordinates on the canvas.
    """
    # NOTE(review): the upper-right crop box corner is used directly as the
    # page size, which presumably assumes the crop box starts at (0, 0) - confirm.
    page_width = float(page.cropbox[2])
    page_height = float(page.cropbox[3])

    rotation_obj = page.get("/Rotate", 0)
    try:
        # int() is applied because the raw entry may not be a plain int
        rotation = int(rotation_obj) % 360
    except (ValueError, TypeError) as e:
        logger.warning(f"Invalid /Rotate value {rotation_obj!r} on page; defaulting to 0. Error: {e}")
        rotation = 0

    left, top, right, bottom = bbox
    width = abs(right - left)
    height = abs(bottom - top)

    if rotation == 90:
        # Quarter turn: axes exchanged, extents swapped
        return [top, left, height, width]
    if rotation == 180:
        return [page_width - right, top, width, height]
    if rotation == 270:
        return [page_width - bottom, page_height - right, height, width]
    # Unrotated page (or a non-right-angle value): flip the y axis only
    return [left, page_height - bottom, width, height]


def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
    """
    Draw every bounding box of page ``i`` onto canvas ``c`` in a single colour.

    Args:
        i: Index into ``bbox_list`` selecting the page's boxes.
        bbox_list: Per-page collections of [x0, y0, x1, y1] boxes.
        page: Page object handed to ``cal_canvas_rect`` for coordinate mapping.
        c: Canvas providing ``setFillColorRGB``, ``setStrokeColorRGB`` and ``rect``.
        rgb_config: Colour channels on a 0-255 scale.
        fill_config: Truthy draws filled rectangles at 0.3 alpha without an
            outline; falsy draws outlines only.

    Returns:
        The same canvas ``c`` that was passed in.
    """
    # Scale the 0-255 channels to the 0-1 range passed to the canvas colour setters
    channels = [float(value) / 255 for value in rgb_config]

    for box in bbox_list[i]:
        x, y, width, height = cal_canvas_rect(page, box)

        if not fill_config:
            # Outline only
            c.setStrokeColorRGB(channels[0], channels[1], channels[2])
            c.rect(x, y, width, height, stroke=1, fill=0)
            continue

        # Translucent fill, no outline
        c.setFillColorRGB(channels[0], channels[1], channels[2], 0.3)
        c.rect(x, y, width, height, stroke=0, fill=1)

    return c


def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
    """
    Label every bounding box of page ``i`` with its 1-based position, optionally
    drawing the box itself as well.

    Args:
        i: Index into ``bbox_list`` selecting the page's boxes.
        bbox_list: Per-page iterables of [x0, y0, x1, y1] boxes; the position of
            a box inside its page iterable becomes its label.
        page: Page object; its ``/Rotate`` entry is read here and it is also
            handed to ``cal_canvas_rect`` for coordinate mapping.
        c: Canvas to draw on.
        rgb_config: Colour channels on a 0-255 scale.
        fill_config: When ``draw_bbox`` is true, truthy draws a filled rectangle
            at 0.3 alpha and falsy draws an outline.
        draw_bbox: When false, only the number labels are drawn.

    Returns:
        The same canvas ``c`` that was passed in.
    """
    new_rgb = [float(color) / 255 for color in rgb_config]
    page_data = bbox_list[i]

    # The rotation is a property of the page, not of a box: resolve it once so
    # an invalid value is reported a single time instead of once per label.
    rotation_obj = page.get("/Rotate", 0)
    try:
        rotation = int(rotation_obj) % 360  # cast rotation to int to handle IndirectObject
    except (ValueError, TypeError):
        logger.warning(f"Invalid /Rotate value: {rotation_obj!r}, defaulting to 0")
        rotation = 0

    for j, bbox in enumerate(page_data):
        x, y, width, height = cal_canvas_rect(page, bbox)

        if draw_bbox:
            if fill_config:
                c.setFillColorRGB(*new_rgb, 0.3)
                c.rect(x, y, width, height, stroke=0, fill=1)
            else:
                c.setStrokeColorRGB(*new_rgb)
                c.rect(x, y, width, height, stroke=1, fill=0)

        # Labels are always opaque, so reset the alpha a translucent fill may have set
        c.setFillColorRGB(*new_rgb, 1.0)
        c.setFontSize(size=10)

        c.saveState()
        # Move the origin to a per-rotation anchor next to the box; other
        # rotation values leave the origin where it is.
        if rotation == 0:
            c.translate(x + width + 2, y + height - 10)
        elif rotation == 90:
            c.translate(x + 10, y + height + 2)
        elif rotation == 180:
            c.translate(x - 2, y + 10)
        elif rotation == 270:
            c.translate(x + width - 10, y - 2)

        c.rotate(rotation)
        c.drawString(0, 0, str(j + 1))
        c.restoreState()

    return c


def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """
    Write a copy of the PDF with colour-coded layout regions and numbered blocks.

    For each entry of ``pdf_info`` the bounding boxes of its ``discarded_blocks``
    and ``para_blocks`` are sorted into categories (table/image/code parts,
    titles, texts, interline equations, lists, list items, indexes). Every PDF
    page then gets an overlay in which each category is drawn in its own colour
    and the blocks are numbered in ``para_blocks`` order (table parts ordered
    caption, body, footnote). The result is written to ``{out_path}/{filename}``.

    Args:
        pdf_info: Sequence of per-page dicts with ``discarded_blocks`` and
            ``para_blocks`` entries; indexed by PDF page number when drawing.
        pdf_bytes: Raw bytes of the source PDF.
        out_path: Directory part of the output path.
        filename: File name part of the output path.
    """
    kinds = (
        "dropped",
        "table_body", "table_caption", "table_footnote",
        "image_body", "image_caption", "image_footnote",
        "code_body", "code_caption",
        "title", "text", "interline_equation",
        "list", "list_item", "index",
    )
    # kind -> one list of bboxes per pdf_info page
    pages_by_kind = {kind: [] for kind in kinds}

    for page_info in pdf_info:
        found = {kind: [] for kind in kinds}
        found["dropped"].extend(entry['bbox'] for entry in page_info['discarded_blocks'])

        for block in page_info["para_blocks"]:
            block_bbox = block["bbox"]
            block_type = block["type"]

            if block_type == BlockType.TABLE:
                for child in block["blocks"]:
                    child_bbox = child["bbox"]
                    child_type = child["type"]
                    if child_type == BlockType.TABLE_BODY:
                        found["table_body"].append(child_bbox)
                    elif child_type == BlockType.TABLE_CAPTION:
                        found["table_caption"].append(child_bbox)
                    elif child_type == BlockType.TABLE_FOOTNOTE and not child.get(SplitFlag.CROSS_PAGE, False):
                        # Footnotes flagged as cross-page are not drawn
                        found["table_footnote"].append(child_bbox)
            elif block_type == BlockType.IMAGE:
                for child in block["blocks"]:
                    child_bbox = child["bbox"]
                    child_type = child["type"]
                    if child_type == BlockType.IMAGE_BODY:
                        found["image_body"].append(child_bbox)
                    elif child_type == BlockType.IMAGE_CAPTION:
                        found["image_caption"].append(child_bbox)
                    elif child_type == BlockType.IMAGE_FOOTNOTE:
                        found["image_footnote"].append(child_bbox)
            elif block_type == BlockType.CODE:
                for child in block["blocks"]:
                    child_type = child["type"]
                    if child_type == BlockType.CODE_BODY:
                        found["code_body"].append(child["bbox"])
                    elif child_type == BlockType.CODE_CAPTION:
                        found["code_caption"].append(child["bbox"])
            elif block_type == BlockType.TITLE:
                found["title"].append(block_bbox)
            elif block_type in [BlockType.TEXT, BlockType.REF_TEXT]:
                found["text"].append(block_bbox)
            elif block_type == BlockType.INTERLINE_EQUATION:
                found["interline_equation"].append(block_bbox)
            elif block_type == BlockType.LIST:
                found["list"].append(block_bbox)
                if "blocks" in block:
                    found["list_item"].extend(item["bbox"] for item in block["blocks"])
            elif block_type == BlockType.INDEX:
                found["index"].append(block_bbox)

        for kind in kinds:
            pages_by_kind[kind].append(found[kind])

    # Second pass: per page, the boxes in the order their numbers are drawn
    layout_bbox_list = []
    table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
    for page_info in pdf_info:
        numbered = []
        for block in page_info["para_blocks"]:
            block_type = block["type"]
            if block_type in [
                BlockType.TEXT,
                BlockType.REF_TEXT,
                BlockType.TITLE,
                BlockType.INTERLINE_EQUATION,
                BlockType.LIST,
                BlockType.INDEX,
            ]:
                numbered.append(block["bbox"])
            elif block_type in [BlockType.IMAGE]:
                numbered.extend(child["bbox"] for child in block["blocks"])
            elif block_type in [BlockType.TABLE]:
                ordered_children = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
                numbered.extend(
                    child["bbox"]
                    for child in ordered_children
                    if not child.get(SplitFlag.CROSS_PAGE, False)
                )
            elif block_type in [BlockType.CODE]:
                numbered.extend(child["bbox"] for child in block["blocks"])
        layout_bbox_list.append(numbered)

    # (kind, RGB colour, filled?) in drawing order; later layers paint over earlier ones
    layers = [
        ("code_body", [102, 0, 204], True),
        ("code_caption", [204, 153, 255], True),
        ("dropped", [158, 158, 158], True),
        ("table_body", [204, 204, 0], True),
        ("table_caption", [255, 255, 102], True),
        ("table_footnote", [229, 255, 204], True),
        ("image_body", [153, 255, 51], True),
        ("image_caption", [102, 178, 255], True),
        ("image_footnote", [255, 178, 102], True),
        ("title", [102, 102, 255], True),
        ("text", [153, 0, 76], True),
        ("interline_equation", [0, 255, 0], True),
        ("list", [40, 169, 92], True),
        ("list_item", [40, 169, 92], False),
        ("index", [40, 169, 92], True),
    ]

    reader = PdfReader(BytesIO(pdf_bytes))
    writer = PdfWriter()

    for page_no, pdf_page in enumerate(reader.pages):
        # The overlay canvas is sized from the source page's crop box
        canvas_size = (float(pdf_page.cropbox[2]), float(pdf_page.cropbox[3]))
        buffer = BytesIO()
        c = canvas.Canvas(buffer, pagesize=canvas_size)

        for kind, rgb, filled in layers:
            c = draw_bbox_without_number(page_no, pages_by_kind[kind], pdf_page, c, rgb, filled)
        c = draw_bbox_with_number(page_no, layout_bbox_list, pdf_page, c, [255, 0, 0], False, draw_bbox=False)

        c.save()
        buffer.seek(0)
        overlay = PdfReader(buffer)

        # Merge only when the overlay actually contains a page; otherwise the
        # source page is emitted unchanged.
        if len(overlay.pages) > 0:
            merged = PageObject(pdf=None)
            merged.update(pdf_page)
            merged.merge_page(overlay.pages[0])
            pdf_page = merged

        writer.add_page(pdf_page)

    # Save the result
    with open(f"{out_path}/{filename}", "wb") as f:
        writer.write(f)


def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """
    Write a copy of the PDF with every span outlined in a colour that depends
    on its content type.

    Spans are taken from each page's ``preproc_blocks`` (text, inline equation,
    interline equation, image, table) and from the lines of its discarded
    blocks. The result is written to ``{out_path}/{filename}``.

    Args:
        pdf_info: Sequence of per-page dicts with ``discarded_blocks`` and
            ``preproc_blocks`` entries; indexed by PDF page number when drawing.
        pdf_bytes: Raw bytes of the source PDF.
        out_path: Directory part of the output path.
        filename: File name part of the output path.
    """
    span_kinds = ("text", "inline_equation", "interline_equation", "image", "table")
    # kind -> one list of span bboxes per pdf_info page
    pages_by_kind = {kind: [] for kind in span_kinds}
    dropped_list = []

    def file_span(span, buckets):
        """Append the span's bbox to the bucket for its type; other types are ignored."""
        span_type = span['type']
        if span_type == ContentType.TEXT:
            kind = "text"
        elif span_type == ContentType.INLINE_EQUATION:
            kind = "inline_equation"
        elif span_type == ContentType.INTERLINE_EQUATION:
            kind = "interline_equation"
        elif span_type == ContentType.IMAGE:
            kind = "image"
        elif span_type == ContentType.TABLE:
            kind = "table"
        else:
            return
        buckets[kind].append(span['bbox'])

    for page_info in pdf_info:
        buckets = {kind: [] for kind in span_kinds}

        # Spans of discarded blocks
        dropped = []
        for block in page_info['discarded_blocks']:
            if block['type'] == BlockType.DISCARDED:
                for line in block['lines']:
                    dropped.extend(span['bbox'] for span in line['spans'])
        dropped_list.append(dropped)

        # Remaining spans come from the pre-paragraph-merge blocks
        for block in page_info['preproc_blocks']:
            if block['type'] in [
                BlockType.TEXT,
                BlockType.TITLE,
                BlockType.INTERLINE_EQUATION,
                BlockType.LIST,
                BlockType.INDEX,
            ]:
                line_owners = [block]
            elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
                # Images and tables keep their lines one level down
                line_owners = block['blocks']
            else:
                continue
            for owner in line_owners:
                for line in owner['lines']:
                    for span in line['spans']:
                        file_span(span, buckets)

        for kind in span_kinds:
            pages_by_kind[kind].append(buckets[kind])

    # (per-page boxes, RGB colour) in drawing order; all layers are outlines
    layers = [
        (pages_by_kind["text"], [255, 0, 0]),
        (pages_by_kind["inline_equation"], [0, 255, 0]),
        (pages_by_kind["interline_equation"], [0, 0, 255]),
        (pages_by_kind["image"], [255, 204, 0]),
        (pages_by_kind["table"], [204, 0, 255]),
        (dropped_list, [158, 158, 158]),
    ]

    reader = PdfReader(BytesIO(pdf_bytes))
    writer = PdfWriter()

    for page_no, pdf_page in enumerate(reader.pages):
        # The overlay canvas is sized from the source page's crop box
        canvas_size = (float(pdf_page.cropbox[2]), float(pdf_page.cropbox[3]))
        buffer = BytesIO()
        c = canvas.Canvas(buffer, pagesize=canvas_size)

        for boxes_by_page, rgb in layers:
            draw_bbox_without_number(page_no, boxes_by_page, pdf_page, c, rgb, False)

        c.save()
        buffer.seek(0)
        overlay = PdfReader(buffer)

        # Merge only when the overlay actually contains a page; otherwise the
        # source page is emitted unchanged.
        if len(overlay.pages) > 0:
            merged = PageObject(pdf=None)
            merged.update(pdf_page)
            merged.merge_page(overlay.pages[0])
            pdf_page = merged

        writer.add_page(pdf_page)

    # Save the PDF
    with open(f"{out_path}/{filename}", "wb") as f:
        writer.write(f)


def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """
    Write a copy of the PDF in which every line is outlined and numbered
    according to its ``index`` value.

    Lines are collected from each page's ``preproc_blocks``:

    * text blocks contribute their ``lines``;
    * title / interline-equation blocks contribute ``virtual_lines`` when that
      key exists (and only if the first virtual line carries an index),
      otherwise their ``lines``;
    * image / table bodies contribute ``virtual_lines`` when the first one
      carries an index, otherwise their ``lines``; captions and footnotes
      contribute their ``lines``.

    The result is written to ``{out_path}/{filename}``.

    Args:
        pdf_info: Sequence of per-page dicts with a ``preproc_blocks`` entry;
            indexed by PDF page number when drawing.
        pdf_bytes: Raw bytes of the source PDF.
        out_path: Directory part of the output path.
        filename: File name part of the output path.
    """

    def indexed_entries(lines):
        """Reduce line dicts to the ``index``/``bbox`` pair used for sorting and drawing."""
        return [{'index': line['index'], 'bbox': line['bbox']} for line in lines]

    def has_indexed_virtual_lines(virtual_lines):
        """True when there is at least one virtual line and the first has a non-None index."""
        return len(virtual_lines) > 0 and virtual_lines[0].get('index', None) is not None

    layout_bbox_list = []

    for page_info in pdf_info:
        page_line_list = []
        for block in page_info['preproc_blocks']:
            block_type = block['type']
            if block_type in [BlockType.TEXT]:
                page_line_list.extend(indexed_entries(block['lines']))
            elif block_type in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
                if 'virtual_lines' not in block:
                    page_line_list.extend(indexed_entries(block['lines']))
                elif has_indexed_virtual_lines(block['virtual_lines']):
                    page_line_list.extend(indexed_entries(block['virtual_lines']))
                # virtual_lines present but unindexed: the block contributes nothing
            elif block_type in [BlockType.IMAGE, BlockType.TABLE]:
                for sub_block in block['blocks']:
                    sub_type = sub_block['type']
                    if sub_type in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
                        virtual_lines = sub_block['virtual_lines']
                        if has_indexed_virtual_lines(virtual_lines):
                            page_line_list.extend(indexed_entries(virtual_lines))
                        else:
                            page_line_list.extend(indexed_entries(sub_block['lines']))
                    elif sub_type in [
                        BlockType.IMAGE_CAPTION,
                        BlockType.TABLE_CAPTION,
                        BlockType.IMAGE_FOOTNOTE,
                        BlockType.TABLE_FOOTNOTE,
                    ]:
                        page_line_list.extend(indexed_entries(sub_block['lines']))

        page_line_list.sort(key=lambda entry: entry['index'])
        # Store a real list: a generator here could be consumed only once and
        # would not support len() or indexing.
        layout_bbox_list.append([entry['bbox'] for entry in page_line_list])

    reader = PdfReader(BytesIO(pdf_bytes))
    output_pdf = PdfWriter()

    for i, page in enumerate(reader.pages):
        # The overlay canvas is sized from the source page's crop box
        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
        packet = BytesIO()
        c = canvas.Canvas(packet, pagesize=(page_width, page_height))

        # Outline and number this page's lines in index order
        draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False)

        c.save()
        packet.seek(0)
        overlay_pdf = PdfReader(packet)

        # Merge only when the overlay actually contains a page; otherwise the
        # source page is emitted unchanged.
        if len(overlay_pdf.pages) > 0:
            new_page = PageObject(pdf=None)
            new_page.update(page)
            page = new_page
            page.merge_page(overlay_pdf.pages[0])

        output_pdf.add_page(page)

    # Save the PDF
    with open(f"{out_path}/{filename}", "wb") as f:
        output_pdf.write(f)


if __name__ == "__main__":
    # Demo: overlay the layout boxes of a parsed example document.

    # Raw bytes of the source PDF
    with open("examples/demo1.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # pdf_info is read from the "pdf_info" key of the middle JSON file
    with open("examples/demo1_1746005777.0863056_middle.json", "r", encoding="utf-8") as json_file:
        pdf_info = json.load(json_file)["pdf_info"]

    # Run the visualisation; the output goes to the examples directory
    draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")
feat(思维导图助手)：初始化提交 2026-03-20 03:07:54 +00:00			`import json`
			`from io import BytesIO`

			`from loguru import logger`
			`from pypdf import PdfReader, PdfWriter, PageObject`
			`from reportlab.pdfgen import canvas`

			`from .enum_class import BlockType, ContentType, SplitFlag`


			`def cal_canvas_rect(page, bbox):`
			`"""`
			`Calculate the rectangle coordinates on the canvas based on the original PDF page and bounding box.`

			`Args:`
			`page: A PyPDF2 Page object representing a single page in the PDF.`
			`bbox: [x0, y0, x1, y1] representing the bounding box coordinates.`

			`Returns:`
			`rect: [x0, y0, width, height] representing the rectangle coordinates on the canvas.`
			`"""`
			`page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])`

			`actual_width = page_width # The width of the final PDF display`
			`actual_height = page_height # The height of the final PDF display`

			`rotation_obj = page.get("/Rotate", 0)`
			`try:`
			`rotation = int(rotation_obj) % 360 # cast rotation to int to handle IndirectObject`
			`except (ValueError, TypeError) as e:`
			`logger.warning(f"Invalid /Rotate value {rotation_obj!r} on page; defaulting to 0. Error: {e}")`
			`rotation = 0`

			`if rotation in [90, 270]:`
			`# PDF is rotated 90 degrees or 270 degrees, and the width and height need to be swapped`
			`actual_width, actual_height = actual_height, actual_width`

			`x0, y0, x1, y1 = bbox`
			`rect_w = abs(x1 - x0)`
			`rect_h = abs(y1 - y0)`

			`if rotation == 270:`
			`rect_w, rect_h = rect_h, rect_w`
			`x0 = actual_height - y1`
			`y0 = actual_width - x1`
			`elif rotation == 180:`
			`x0 = page_width - x1`
			`# y0 stays the same`
			`elif rotation == 90:`
			`rect_w, rect_h = rect_h, rect_w`
			`x0, y0 = y0, x0`
			`else:`
			`# rotation == 0`
			`y0 = page_height - y1`

			`rect = [x0, y0, rect_w, rect_h]`
			`return rect`


			`def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):`
			`new_rgb = [float(color) / 255 for color in rgb_config]`
			`page_data = bbox_list[i]`

			`for bbox in page_data:`
			`rect = cal_canvas_rect(page, bbox) # Define the rectangle`

			`if fill_config: # filled rectangle`
			`c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3)`
			`c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)`
			`else: # bounding box`
			`c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2])`
			`c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)`
			`return c`


			`def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):`
			`new_rgb = [float(color) / 255 for color in rgb_config]`
			`page_data = bbox_list[i]`
			`# 强制转换为 float`
			`page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])`

			`for j, bbox in enumerate(page_data):`
			`# 确保bbox的每个元素都是float`
			`rect = cal_canvas_rect(page, bbox) # Define the rectangle`

			`if draw_bbox:`
			`if fill_config:`
			`c.setFillColorRGB(*new_rgb, 0.3)`
			`c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)`
			`else:`
			`c.setStrokeColorRGB(*new_rgb)`
			`c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)`
			`c.setFillColorRGB(*new_rgb, 1.0)`
			`c.setFontSize(size=10)`

			`c.saveState()`
			`rotation_obj = page.get("/Rotate", 0)`
			`try:`
			`rotation = int(rotation_obj) % 360 # cast rotation to int to handle IndirectObject`
			`except (ValueError, TypeError):`
			`logger.warning(f"Invalid /Rotate value: {rotation_obj!r}, defaulting to 0")`
			`rotation = 0`

			`if rotation == 0:`
			`c.translate(rect[0] + rect[2] + 2, rect[1] + rect[3] - 10)`
			`elif rotation == 90:`
			`c.translate(rect[0] + 10, rect[1] + rect[3] + 2)`
			`elif rotation == 180:`
			`c.translate(rect[0] - 2, rect[1] + 10)`
			`elif rotation == 270:`
			`c.translate(rect[0] + rect[2] - 10, rect[1] - 2)`

			`c.rotate(rotation)`
			`c.drawString(0, 0, str(j + 1))`
			`c.restoreState()`

			`return c`


			`def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):`
			`dropped_bbox_list = []`
			`tables_body_list, tables_caption_list, tables_footnote_list = [], [], []`
			`imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []`
			`codes_body_list, codes_caption_list = [], []`
			`titles_list = []`
			`texts_list = []`
			`interequations_list = []`
			`lists_list = []`
			`list_items_list = []`
			`indexs_list = []`

			`for page in pdf_info:`
			`page_dropped_list = []`
			`tables_body, tables_caption, tables_footnote = [], [], []`
			`imgs_body, imgs_caption, imgs_footnote = [], [], []`
			`codes_body, codes_caption = [], []`
			`titles = []`
			`texts = []`
			`interequations = []`
			`lists = []`
			`list_items = []`
			`indices = []`

			`for dropped_bbox in page['discarded_blocks']:`
			`page_dropped_list.append(dropped_bbox['bbox'])`
			`dropped_bbox_list.append(page_dropped_list)`
			`for block in page["para_blocks"]:`
			`bbox = block["bbox"]`
			`if block["type"] == BlockType.TABLE:`
			`for nested_block in block["blocks"]:`
			`bbox = nested_block["bbox"]`
			`if nested_block["type"] == BlockType.TABLE_BODY:`
			`tables_body.append(bbox)`
			`elif nested_block["type"] == BlockType.TABLE_CAPTION:`
			`tables_caption.append(bbox)`
			`elif nested_block["type"] == BlockType.TABLE_FOOTNOTE:`
			`if nested_block.get(SplitFlag.CROSS_PAGE, False):`
			`continue`
			`tables_footnote.append(bbox)`
			`elif block["type"] == BlockType.IMAGE:`
			`for nested_block in block["blocks"]:`
			`bbox = nested_block["bbox"]`
			`if nested_block["type"] == BlockType.IMAGE_BODY:`
			`imgs_body.append(bbox)`
			`elif nested_block["type"] == BlockType.IMAGE_CAPTION:`
			`imgs_caption.append(bbox)`
			`elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:`
			`imgs_footnote.append(bbox)`
			`elif block["type"] == BlockType.CODE:`
			`for nested_block in block["blocks"]:`
			`if nested_block["type"] == BlockType.CODE_BODY:`
			`bbox = nested_block["bbox"]`
			`codes_body.append(bbox)`
			`elif nested_block["type"] == BlockType.CODE_CAPTION:`
			`bbox = nested_block["bbox"]`
			`codes_caption.append(bbox)`
			`elif block["type"] == BlockType.TITLE:`
			`titles.append(bbox)`
			`elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT]:`
			`texts.append(bbox)`
			`elif block["type"] == BlockType.INTERLINE_EQUATION:`
			`interequations.append(bbox)`
			`elif block["type"] == BlockType.LIST:`
			`lists.append(bbox)`
			`if "blocks" in block:`
			`for sub_block in block["blocks"]:`
			`list_items.append(sub_block["bbox"])`
			`elif block["type"] == BlockType.INDEX:`
			`indices.append(bbox)`

			`tables_body_list.append(tables_body)`
			`tables_caption_list.append(tables_caption)`
			`tables_footnote_list.append(tables_footnote)`
			`imgs_body_list.append(imgs_body)`
			`imgs_caption_list.append(imgs_caption)`
			`imgs_footnote_list.append(imgs_footnote)`
			`titles_list.append(titles)`
			`texts_list.append(texts)`
			`interequations_list.append(interequations)`
			`lists_list.append(lists)`
			`list_items_list.append(list_items)`
			`indexs_list.append(indices)`
			`codes_body_list.append(codes_body)`
			`codes_caption_list.append(codes_caption)`

			`layout_bbox_list = []`

			`table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}`
			`for page in pdf_info:`
			`page_block_list = []`
			`for block in page["para_blocks"]:`
			`if block["type"] in [`
			`BlockType.TEXT,`
			`BlockType.REF_TEXT,`
			`BlockType.TITLE,`
			`BlockType.INTERLINE_EQUATION,`
			`BlockType.LIST,`
			`BlockType.INDEX,`
			`]:`
			`bbox = block["bbox"]`
			`page_block_list.append(bbox)`
			`elif block["type"] in [BlockType.IMAGE]:`
			`for sub_block in block["blocks"]:`
			`bbox = sub_block["bbox"]`
			`page_block_list.append(bbox)`
			`elif block["type"] in [BlockType.TABLE]:`
			`sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])`
			`for sub_block in sorted_blocks:`
			`if sub_block.get(SplitFlag.CROSS_PAGE, False):`
			`continue`
			`bbox = sub_block["bbox"]`
			`page_block_list.append(bbox)`
			`elif block["type"] in [BlockType.CODE]:`
			`for sub_block in block["blocks"]:`
			`bbox = sub_block["bbox"]`
			`page_block_list.append(bbox)`

			`layout_bbox_list.append(page_block_list)`

			`pdf_bytes_io = BytesIO(pdf_bytes)`
			`pdf_docs = PdfReader(pdf_bytes_io)`
			`output_pdf = PdfWriter()`

			`for i, page in enumerate(pdf_docs.pages):`
			`# 获取原始页面尺寸`
			`page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])`
			`custom_page_size = (page_width, page_height)`

			`packet = BytesIO()`
			`# 使用原始PDF的尺寸创建canvas`
			`c = canvas.Canvas(packet, pagesize=custom_page_size)`

			`c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True)`
			`c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True)`
			`c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)`
			`c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)`
			`c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)`
			`c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)`
			`c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)`
			`c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)`
			`c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)`
			`c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)`
			`c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)`
			`c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)`
			`c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)`
			`c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)`
			`c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)`
			`c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)`

			`c.save()`
			`packet.seek(0)`
			`overlay_pdf = PdfReader(packet)`

			`# 添加检查确保overlay_pdf.pages不为空`
			`if len(overlay_pdf.pages) > 0:`
			`new_page = PageObject(pdf=None)`
			`new_page.update(page)`
			`page = new_page`
			`page.merge_page(overlay_pdf.pages[0])`
			`else:`
			`# 记录日志并继续处理下一个页面`
			`# logger.warning(f"layout.pdf: 第{i + 1}页未能生成有效的overlay PDF")`
			`pass`

			`output_pdf.add_page(page)`

			`# 保存结果`
			`with open(f"{out_path}/{filename}", "wb") as f:`
			`output_pdf.write(f)`


			def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
			    """Write a copy of the PDF with every span's bounding box drawn on it.

			    Span boxes are gathered page by page from ``pdf_info`` and grouped by
			    span type. Each group is handed to ``draw_bbox_without_number`` with its
			    own ``rgb_config`` and ``fill_config=False``; the resulting overlay is
			    merged onto the matching page and the document is saved as
			    ``{out_path}/{filename}``.

			    Args:
			        pdf_info: Iterable of per-page dicts providing ``discarded_blocks``
			            and ``preproc_blocks``.
			        pdf_bytes: Raw bytes of the source PDF.
			        out_path: Directory the annotated PDF is written to.
			        filename: File name of the annotated PDF inside ``out_path``.
			    """
			    # Blocks that carry their lines directly vs. blocks that nest sub-blocks.
			    flat_block_types = [
			        BlockType.TEXT,
			        BlockType.TITLE,
			        BlockType.INTERLINE_EQUATION,
			        BlockType.LIST,
			        BlockType.INDEX,
			    ]
			    nested_block_types = [BlockType.IMAGE, BlockType.TABLE]

			    # One entry per page in each of these lists.
			    text_list, inline_equation_list, interline_equation_list = [], [], []
			    image_list, table_list, dropped_list = [], [], []

			    def route_spans(lines, routes):
			        # Put each span's bbox into the first bucket whose content type
			        # matches; spans of any other type are ignored.
			        for line in lines:
			            for span in line['spans']:
			                for content_type, bucket in routes:
			                    if span['type'] == content_type:
			                        bucket.append(span['bbox'])
			                        break

			    for page_info in pdf_info:
			        # Spans of discarded blocks form their own category.
			        dropped_list.append([
			            span['bbox']
			            for block in page_info['discarded_blocks']
			            if block['type'] == BlockType.DISCARDED
			            for line in block['lines']
			            for span in line['spans']
			        ])

			        page_text, page_inline, page_interline = [], [], []
			        page_image, page_table = [], []
			        routes = (
			            (ContentType.TEXT, page_text),
			            (ContentType.INLINE_EQUATION, page_inline),
			            (ContentType.INTERLINE_EQUATION, page_interline),
			            (ContentType.IMAGE, page_image),
			            (ContentType.TABLE, page_table),
			        )

			        # Spans come from the pre-merge blocks ('preproc_blocks'), not from
			        # the paragraph-merged ones.
			        for block in page_info['preproc_blocks']:
			            if block['type'] in flat_block_types:
			                route_spans(block['lines'], routes)
			            elif block['type'] in nested_block_types:
			                for sub_block in block['blocks']:
			                    route_spans(sub_block['lines'], routes)

			        text_list.append(page_text)
			        inline_equation_list.append(page_inline)
			        interline_equation_list.append(page_interline)
			        image_list.append(page_image)
			        table_list.append(page_table)

			    # (per-page bbox lists, rgb_config) in the order they are drawn.
			    layers = (
			        (text_list, (255, 0, 0)),
			        (inline_equation_list, (0, 255, 0)),
			        (interline_equation_list, (0, 0, 255)),
			        (image_list, (255, 204, 0)),
			        (table_list, (204, 0, 255)),
			        (dropped_list, (158, 158, 158)),
			    )

			    reader = PdfReader(BytesIO(pdf_bytes))
			    writer = PdfWriter()

			    for page_no, page in enumerate(reader.pages):
			        # The overlay canvas uses the size taken from the page's cropbox.
			        canvas_size = (float(page.cropbox[2]), float(page.cropbox[3]))
			        buffer = BytesIO()
			        c = canvas.Canvas(buffer, pagesize=canvas_size)

			        for bbox_lists, rgb in layers:
			            draw_bbox_without_number(page_no, bbox_lists, page, c, list(rgb), False)

			        c.save()
			        buffer.seek(0)
			        overlay = PdfReader(buffer)

			        # Merge only when the canvas produced a page; otherwise the
			        # source page is emitted unchanged.
			        if len(overlay.pages) > 0:
			            merged = PageObject(pdf=None)
			            merged.update(page)
			            merged.merge_page(overlay.pages[0])
			            page = merged

			        writer.add_page(page)

			    # Save the PDF
			    with open(f"{out_path}/{filename}", "wb") as f:
			        writer.write(f)


			def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
			    """Write a copy of the PDF with line boxes drawn in ``index`` order.

			    For every page in ``pdf_info`` the lines of its ``preproc_blocks`` are
			    collected, sorted by their ``index`` value and passed, as a list of
			    bboxes, to ``draw_bbox_with_number``. The overlay is merged onto the
			    matching page and the document is saved as ``{out_path}/{filename}``.

			    Args:
			        pdf_info: Iterable of per-page dicts providing ``preproc_blocks``.
			            Presumably ordered like the pages of the PDF, since the page
			            number from the PDF is passed alongside the collected list --
			            confirm against ``draw_bbox_with_number``.
			        pdf_bytes: Raw bytes of the source PDF.
			        out_path: Directory the annotated PDF is written to.
			        filename: File name of the annotated PDF inside ``out_path``.
			    """

			    def line_entries(lines):
			        # One {'index', 'bbox'} record per line, in input order.
			        return [{'index': line['index'], 'bbox': line['bbox']} for line in lines]

			    def has_indexed_virtual_lines(virtual_lines):
			        # True when there is at least one virtual line and the first one
			        # carries a non-None 'index'.
			        return len(virtual_lines) > 0 and virtual_lines[0].get('index', None) is not None

			    layout_bbox_list = []

			    for page in pdf_info:
			        page_line_list = []
			        for block in page['preproc_blocks']:
			            block_type = block['type']
			            if block_type in [BlockType.TEXT]:
			                page_line_list.extend(line_entries(block['lines']))
			            elif block_type in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
			                # NOTE(review): the original indentation was lost; the
			                # fallback to 'lines' is read as applying only when the
			                # block has no 'virtual_lines' key at all -- confirm.
			                if 'virtual_lines' in block:
			                    if has_indexed_virtual_lines(block['virtual_lines']):
			                        page_line_list.extend(line_entries(block['virtual_lines']))
			                else:
			                    page_line_list.extend(line_entries(block['lines']))
			            elif block_type in [BlockType.IMAGE, BlockType.TABLE]:
			                for sub_block in block['blocks']:
			                    sub_type = sub_block['type']
			                    if sub_type in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
			                        # Bodies use their indexed virtual lines when
			                        # available, otherwise their regular lines.
			                        if has_indexed_virtual_lines(sub_block['virtual_lines']):
			                            page_line_list.extend(line_entries(sub_block['virtual_lines']))
			                        else:
			                            page_line_list.extend(line_entries(sub_block['lines']))
			                    elif sub_type in [
			                        BlockType.IMAGE_CAPTION,
			                        BlockType.TABLE_CAPTION,
			                        BlockType.IMAGE_FOOTNOTE,
			                        BlockType.TABLE_FOOTNOTE,
			                    ]:
			                        page_line_list.extend(line_entries(sub_block['lines']))

			        sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
			        # Store a real list: a generator expression here would be
			        # single-use and could not be indexed, measured or re-iterated.
			        layout_bbox_list.append([entry['bbox'] for entry in sorted_bboxes])

			    reader = PdfReader(BytesIO(pdf_bytes))
			    writer = PdfWriter()

			    for i, page in enumerate(reader.pages):
			        # The overlay canvas uses the size taken from the page's cropbox.
			        canvas_size = (float(page.cropbox[2]), float(page.cropbox[3]))
			        packet = BytesIO()
			        c = canvas.Canvas(packet, pagesize=canvas_size)

			        draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False)

			        c.save()
			        packet.seek(0)
			        overlay_pdf = PdfReader(packet)

			        # Merge only when the canvas produced a page; otherwise the
			        # source page is emitted unchanged.
			        if len(overlay_pdf.pages) > 0:
			            new_page = PageObject(pdf=None)
			            new_page.update(page)
			            new_page.merge_page(overlay_pdf.pages[0])
			            page = new_page

			        writer.add_page(page)

			    # Save the PDF
			    with open(f"{out_path}/{filename}", "wb") as f:
			        writer.write(f)


			if __name__ == "__main__":
			    # Manual demo: render the layout boxes of the bundled example document.
			    # All paths below are relative to the current working directory.

			    # Load the source PDF as raw bytes.
			    with open("examples/demo1.pdf", "rb") as pdf_file:
			        pdf_bytes = pdf_file.read()

			    # Load the per-page annotations from the JSON file.
			    with open("examples/demo1_1746005777.0863056_middle.json", "r", encoding="utf-8") as json_file:
			        pdf_info = json.load(json_file)["pdf_info"]

			    # Run the visualisation and write the result into the examples directory.
			    draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")