import json
import os
import re

import bs4

# Directory holding the XHTML files to read.
html_dir = "epub_extracted/OPS"


def _split_paragraph(paragraph):
    """Yield the stripped text segments of one <p>, split at its <br> children.

    Only <br> tags that are direct children of the paragraph act as
    separators; every other child (text node or element) contributes its
    text to the segment currently being built.
    """
    pending = []
    for node in paragraph.children:
        if node.name != "br":
            pending.append(node.get_text())
            continue
        # A <br> closes the current segment, even when that segment is empty.
        yield "".join(pending).strip()
        pending = []
    # Emit whatever follows the last <br>; nothing is emitted when the
    # paragraph is empty or ends on a <br>.
    if pending:
        yield "".join(pending).strip()


def parse_html(filepath):
    """Return the text lines found in the ``div.poem`` blocks of a file.

    The file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``. Every <p> found anywhere below a <div class="poem">
    is split into lines at its direct <br> children, and each line is
    whitespace-stripped.

    Args:
        filepath: Path of the HTML/XHTML file to read.

    Returns:
        A flat list of strings in document order.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = bs4.BeautifulSoup(markup, "html.parser")

    lines = []
    for poem in document.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            lines.extend(_split_paragraph(paragraph))
    return lines


# Parse one file and print its first 50 extracted lines.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for i, t in enumerate(texts[:50]):
    print(f"{i}: {t}")