# spider-ctext/初学记/test_extract.py

import bs4
import re
import sys

def parse_html(filepath) -> None:
    """Print every section heading found in the "poem" blocks of an XHTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``.
    Inside each ``<div class="poem">``, every ``<p>`` is scanned; a direct
    child ``<span>`` counts as a section heading when it has a non-empty
    ``id`` attribute and its stripped text ends with the character "部".
    Each heading is reported on stdout as ``Found Section: <text>``.

    Args:
        filepath: Path of the XHTML/HTML file to read.

    Returns:
        None. The only output is the printed headings.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            # Only direct children of <p> are inspected. Text nodes report
            # ``name`` as None, so they (and <br> or any other tag) are
            # skipped by the name check without needing a type test.
            for child in p.children:
                if child.name != "span" or not child.get("id"):
                    continue
                heading = child.text.strip()
                if heading.endswith("部"):
                    print("Found Section:", heading)

# Sample input for this extraction check. The path is relative, so it is
# resolved against the current working directory. The call sits at module
# level and therefore also runs whenever this file is imported.
SAMPLE_XHTML_PATH = "epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
parse_html(SAMPLE_XHTML_PATH)