import os
import re

import bs4

html_dir = "epub_extracted/OPS"

# Opening brackets that mark <small> text as already bracketed in the source.
_BRACKET_OPENERS = ("〈", "(")


def _flush_line(parts, events):
    """Append a ("line", text) event built from *parts*, unless it is blank.

    The fragments are joined and stripped first, so a buffer that holds only
    whitespace (such as the newline text node between two tags) does not
    produce an empty line event.
    """
    text = "".join(parts).strip()
    if text:
        events.append(("line", text))


def parse_html(filepath):
    """Extract line and section events from the poem blocks of an HTML file.

    Every ``<p>`` inside a ``<div class="poem">`` is walked child by child:

    * ``<br>`` ends the current line and emits ``("line", text)``.
    * ``<span>`` with an ``id`` attribute whose stripped text ends with "部"
      emits ``("section", text)`` immediately; text already buffered for the
      current line is left in the buffer and emitted afterwards.
    * ``<small>`` text is appended to the current line, wrapped in ``(...)``
      unless it already starts with "〈" or "(".
    * Any other child contributes its text to the current line.

    Text remaining at the end of a ``<p>`` is emitted as a final line.
    Lines that are empty after stripping are never emitted.

    Args:
        filepath: Path of the UTF-8 encoded HTML/XHTML file to read.

    Returns:
        A list of ``(kind, text)`` tuples in document order, where ``kind``
        is ``"line"`` or ``"section"``.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    events = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # A line break closes whatever has been buffered so far.
                    _flush_line(current_line, events)
                    current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("部")
                ):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    if small_text.startswith(_BRACKET_OPENERS):
                        # Already bracketed in the source; keep it verbatim.
                        current_line.append(small_text)
                    else:
                        current_line.append(f"({small_text})")
                else:
                    current_line.append(child.get_text())
            # Emit trailing text that was not terminated by a <br>.
            _flush_line(current_line, events)
    return events


# Ad-hoc driver: parse one extracted chapter and preview the first events.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for e in events[:30]:
    print(e)