import re
import sys

import bs4


def parse_html(filepath):
    """Scan an (X)HTML file and report section headings found in poem blocks.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
    Inside every ``<div class="poem">``, each ``<p>`` is inspected and its
    direct children are walked in document order. A direct child counts as a
    section heading when it is a ``<span>`` with a non-empty ``id`` attribute
    whose stripped text ends with the character "部". Each heading is printed
    as ``Found Section: <text>``.

    Args:
        filepath: Path of the file to parse.

    Returns:
        None. Results are only printed to stdout.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    # Most recently seen section heading; updated as headings are found.
    current_section = None

    for div in soup.find_all("div", class_="poem"):
        # find_all("p") is recursive, so nested <p> tags are visited too.
        for p in div.find_all("p"):
            for child in p.children:
                # Only <span> children can be headings. Text nodes (whose
                # .name is None), <br> tags and any other elements are
                # deliberately ignored for now.
                if child.name != "span":
                    continue

                text = child.text.strip()
                if child.get("id") and text.endswith("部"):
                    current_section = text
                    print("Found Section:", current_section)


parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")