import json
import os
import re

import bs4

html_dir = "epub_extracted/OPS"


def parse_html(filepath):
    """Extract poem lines from an (X)HTML file.

    Every ``<p>`` found inside a ``<div class="poem">`` is walked child by
    child. The text of consecutive children is concatenated, and each
    ``<br>`` that is a direct child of the paragraph ends the current line.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A flat list of whitespace-stripped strings, in document order. A
        ``<br>`` with nothing collected before it contributes an empty
        string.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = bs4.BeautifulSoup(markup, "html.parser")

    lines = []
    for poem in document.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            # A <p> mixes bare text nodes with inline tags, so walk its
            # direct children and treat each <br> as a line separator.
            pending = []
            for node in paragraph.children:
                if node.name != "br":
                    pending.append(node.get_text())
                    continue
                lines.append("".join(pending).strip())
                pending = []
            # Flush whatever follows the last <br> (or the whole paragraph
            # when it contains no <br> at all).
            if pending:
                lines.append("".join(pending).strip())
    return lines
# Parse the first volume and preview its first 50 extracted lines.
sample_path = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
texts = parse_html(sample_path)
for index, line in enumerate(texts[:50]):
    print(f"{index}: {line}")