# spider-ctext/初学记/test_extract2.py

import bs4
import re
import os

# Directory containing the XHTML files read by this script. The path is
# relative, so it is resolved against the current working directory.
# NOTE(review): presumably the OPS folder of an unpacked EPUB - confirm.
html_dir = "epub_extracted/OPS"


def _flush_line(events, parts):
    """Append the text collected in *parts* to *events* as a line event.

    The fragments are joined and stripped. Nothing is appended when the
    result is empty, so whitespace-only fragments (for example the newline
    between two ``<br/>`` tags) do not produce empty ``("line", "")`` events.
    """
    text = "".join(parts).strip()
    if text:
        events.append(("line", text))


def parse_html(filepath):
    """Extract an ordered list of events from the poem blocks of an HTML file.

    Every ``<p>`` inside every ``<div class="poem">`` is walked child by
    child, producing two kinds of events:

    * ``("section", title)`` for a ``<span>`` that has an ``id`` attribute and
      whose stripped text ends with "部".
    * ``("line", text)`` for each run of text terminated by a ``<br>`` or by
      the end of the paragraph. Lines that are empty after stripping are
      skipped.

    Text from ``<small>`` elements is wrapped in fullwidth parentheses unless
    it already starts with "〈" or "（".

    Args:
        filepath: Path of the UTF-8 encoded HTML/XHTML file to parse.

    Returns:
        A list of ``(kind, text)`` tuples in the order they were produced.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    events = []

    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                # Plain text nodes have ``name`` set to None, so they fall
                # through every tag-specific branch below.
                name = child.name

                if name == "br":
                    _flush_line(events, current_line)
                    current_line = []
                    continue

                text = child.get_text()

                if name == "span" and child.get("id"):
                    title = text.strip()
                    if title.endswith("部"):
                        # The heading is emitted immediately; text already
                        # collected for the current line is not flushed
                        # first, so it is emitted after the heading.
                        events.append(("section", title))
                        continue

                # get_text() includes any bracket characters carried by nested
                # elements, so text that already opens with a bracket is kept
                # as is instead of being wrapped a second time.
                if name == "small" and not text.startswith(("〈", "（")):
                    text = f"（{text}）"

                current_line.append(text)

            # Text after the last <br> (or a paragraph without any <br>).
            _flush_line(events, current_line)
    return events


# Parse one sample file and print the first 30 events for manual inspection.
sample_path = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
events = parse_html(sample_path)
for event in events[:30]:
    print(event)